b.liu | e958203 | 2025-04-17 19:18:16 +0800 | [diff] [blame^] | 1 | From 3a1cc23a75abcd9cea585eb84846507363d58397 Mon Sep 17 00:00:00 2001 |
| 2 | From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal@milecki.pl> |
| 3 | Date: Tue, 25 Oct 2022 15:22:45 +0200 |
| 4 | Subject: [PATCH] net: broadcom: bcm4908_enet: use build_skb() |
| 5 | MIME-Version: 1.0 |
| 6 | Content-Type: text/plain; charset=UTF-8 |
| 7 | Content-Transfer-Encoding: 8bit |
| 8 | |
| 9 | RX code can be more efficient with build_skb(). Allocating the actual |
| 10 | SKB around the eth packet buffer - right before passing it up - results |
| 11 | in better cache usage. |
| 12 | |
| 13 | Without RPS (echo 0 > rps_cpus) BCM4908 NAT masq performance "jumps" |
| 14 | between two speeds: ~900 Mbps and 940 Mbps (it's a 4-CPU SoC). This |
| 15 | change bumps the lower speed from 905 Mb/s to 918 Mb/s (tested using |
| 16 | single stream iperf 2.0.5 traffic). |
| 17 | |
| 18 | There are more optimizations to consider. One obvious one to try is GRO; |
| 19 | however, as BCM4908 doesn't do hw csum, it may sometimes actually lower |
| 20 | performance. Some early testing: |
| 21 | |
| 22 | ┌─────────────────────────────────┬─────────────────────┬────────────────────┐ |
| 23 | │ │ netif_receive_skb() │ napi_gro_receive() │ |
| 24 | ├─────────────────────────────────┼─────────────────────┼────────────────────┤ |
| 25 | │ netdev_alloc_skb() │ 905 Mb/s │ 892 Mb/s │ |
| 26 | │ napi_alloc_frag() + build_skb() │ 918 Mb/s │ 917 Mb/s │ |
| 27 | └─────────────────────────────────┴─────────────────────┴────────────────────┘ |
| 28 | |
| 29 | Other ideas: |
| 30 | 1. napi_build_skb() |
| 31 | 2. skb_copy_from_linear_data() for small packets |
| 32 | |
| 33 | Those need proper testing first though. That can be done later. |
| 34 | |
| 35 | Signed-off-by: Rafał Miłecki <rafal@milecki.pl> |
| 36 | Link: https://lore.kernel.org/r/20221025132245.22871-1-zajec5@gmail.com |
| 37 | Signed-off-by: Paolo Abeni <pabeni@redhat.com> |
| 38 | --- |
| 39 | drivers/net/ethernet/broadcom/bcm4908_enet.c | 53 +++++++++++++------- |
| 40 | 1 file changed, 36 insertions(+), 17 deletions(-) |
| 41 | |
| 42 | --- a/drivers/net/ethernet/broadcom/bcm4908_enet.c |
| 43 | +++ b/drivers/net/ethernet/broadcom/bcm4908_enet.c |
| 44 | @@ -36,13 +36,24 @@ |
| 45 | #define ENET_MAX_ETH_OVERHEAD (ETH_HLEN + BRCM_MAX_TAG_LEN + VLAN_HLEN + \ |
| 46 | ETH_FCS_LEN + 4) /* 32 */ |
| 47 | |
| 48 | +#define ENET_RX_SKB_BUF_SIZE (NET_SKB_PAD + NET_IP_ALIGN + \ |
| 49 | + ETH_HLEN + BRCM_MAX_TAG_LEN + VLAN_HLEN + \ |
| 50 | + ENET_MTU_MAX + ETH_FCS_LEN + 4) |
| 51 | +#define ENET_RX_SKB_BUF_ALLOC_SIZE (SKB_DATA_ALIGN(ENET_RX_SKB_BUF_SIZE) + \ |
| 52 | + SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) |
| 53 | +#define ENET_RX_BUF_DMA_OFFSET (NET_SKB_PAD + NET_IP_ALIGN) |
| 54 | +#define ENET_RX_BUF_DMA_SIZE (ENET_RX_SKB_BUF_SIZE - ENET_RX_BUF_DMA_OFFSET) |
| 55 | + |
| 56 | struct bcm4908_enet_dma_ring_bd { |
| 57 | __le32 ctl; |
| 58 | __le32 addr; |
| 59 | } __packed; |
| 60 | |
| 61 | struct bcm4908_enet_dma_ring_slot { |
| 62 | - struct sk_buff *skb; |
| 63 | + union { |
| 64 | + void *buf; /* RX */ |
| 65 | + struct sk_buff *skb; /* TX */ |
| 66 | + }; |
| 67 | unsigned int len; |
| 68 | dma_addr_t dma_addr; |
| 69 | }; |
| 70 | @@ -259,22 +270,21 @@ static int bcm4908_enet_dma_alloc_rx_buf |
| 71 | u32 tmp; |
| 72 | int err; |
| 73 | |
| 74 | - slot->len = ENET_MTU_MAX + ENET_MAX_ETH_OVERHEAD; |
| 75 | - |
| 76 | - slot->skb = netdev_alloc_skb(enet->netdev, slot->len); |
| 77 | - if (!slot->skb) |
| 78 | + slot->buf = napi_alloc_frag(ENET_RX_SKB_BUF_ALLOC_SIZE); |
| 79 | + if (!slot->buf) |
| 80 | return -ENOMEM; |
| 81 | |
| 82 | - slot->dma_addr = dma_map_single(dev, slot->skb->data, slot->len, DMA_FROM_DEVICE); |
| 83 | + slot->dma_addr = dma_map_single(dev, slot->buf + ENET_RX_BUF_DMA_OFFSET, |
| 84 | + ENET_RX_BUF_DMA_SIZE, DMA_FROM_DEVICE); |
| 85 | err = dma_mapping_error(dev, slot->dma_addr); |
| 86 | if (err) { |
| 87 | dev_err(dev, "Failed to map DMA buffer: %d\n", err); |
| 88 | - kfree_skb(slot->skb); |
| 89 | - slot->skb = NULL; |
| 90 | + skb_free_frag(slot->buf); |
| 91 | + slot->buf = NULL; |
| 92 | return err; |
| 93 | } |
| 94 | |
| 95 | - tmp = slot->len << DMA_CTL_LEN_DESC_BUFLENGTH_SHIFT; |
| 96 | + tmp = ENET_RX_BUF_DMA_SIZE << DMA_CTL_LEN_DESC_BUFLENGTH_SHIFT; |
| 97 | tmp |= DMA_CTL_STATUS_OWN; |
| 98 | if (idx == enet->rx_ring.length - 1) |
| 99 | tmp |= DMA_CTL_STATUS_WRAP; |
| 100 | @@ -314,11 +324,11 @@ static void bcm4908_enet_dma_uninit(stru |
| 101 | |
| 102 | for (i = rx_ring->length - 1; i >= 0; i--) { |
| 103 | slot = &rx_ring->slots[i]; |
| 104 | - if (!slot->skb) |
| 105 | + if (!slot->buf) |
| 106 | continue; |
| 107 | dma_unmap_single(dev, slot->dma_addr, slot->len, DMA_FROM_DEVICE); |
| 108 | - kfree_skb(slot->skb); |
| 109 | - slot->skb = NULL; |
| 110 | + skb_free_frag(slot->buf); |
| 111 | + slot->buf = NULL; |
| 112 | } |
| 113 | } |
| 114 | |
| 115 | @@ -574,6 +584,7 @@ static int bcm4908_enet_poll_rx(struct n |
| 116 | while (handled < weight) { |
| 117 | struct bcm4908_enet_dma_ring_bd *buf_desc; |
| 118 | struct bcm4908_enet_dma_ring_slot slot; |
| 119 | + struct sk_buff *skb; |
| 120 | u32 ctl; |
| 121 | int len; |
| 122 | int err; |
| 123 | @@ -597,16 +608,24 @@ static int bcm4908_enet_poll_rx(struct n |
| 124 | |
| 125 | if (len < ETH_ZLEN || |
| 126 | (ctl & (DMA_CTL_STATUS_SOP | DMA_CTL_STATUS_EOP)) != (DMA_CTL_STATUS_SOP | DMA_CTL_STATUS_EOP)) { |
| 127 | - kfree_skb(slot.skb); |
| 128 | + skb_free_frag(slot.buf); |
| 129 | enet->netdev->stats.rx_dropped++; |
| 130 | break; |
| 131 | } |
| 132 | |
| 133 | - dma_unmap_single(dev, slot.dma_addr, slot.len, DMA_FROM_DEVICE); |
| 134 | + dma_unmap_single(dev, slot.dma_addr, ENET_RX_BUF_DMA_SIZE, DMA_FROM_DEVICE); |
| 135 | + |
| 136 | + skb = build_skb(slot.buf, ENET_RX_SKB_BUF_ALLOC_SIZE); |
| 137 | + if (unlikely(!skb)) { |
| 138 | + skb_free_frag(slot.buf); |
| 139 | + enet->netdev->stats.rx_dropped++; |
| 140 | + break; |
| 141 | + } |
| 142 | + skb_reserve(skb, ENET_RX_BUF_DMA_OFFSET); |
| 143 | + skb_put(skb, len - ETH_FCS_LEN); |
| 144 | + skb->protocol = eth_type_trans(skb, enet->netdev); |
| 145 | |
| 146 | - skb_put(slot.skb, len - ETH_FCS_LEN); |
| 147 | - slot.skb->protocol = eth_type_trans(slot.skb, enet->netdev); |
| 148 | - netif_receive_skb(slot.skb); |
| 149 | + netif_receive_skb(skb); |
| 150 | |
| 151 | enet->netdev->stats.rx_packets++; |
| 152 | enet->netdev->stats.rx_bytes += len; |