b.liu | e958203 | 2025-04-17 19:18:16 +0800 | [diff] [blame^] | 1 | From: Wei Wang <weiwan@google.com> |
| 2 | Date: Mon, 8 Feb 2021 11:34:09 -0800 |
| 3 | Subject: [PATCH] net: implement threaded-able napi poll loop support |
| 4 | |
| 5 | This patch allows running each napi poll loop inside its own |
| 6 | kernel thread. |
| 7 | The kthread is created during netif_napi_add() if dev->threaded |
| 8 | is set, and threaded mode is enabled in napi_enable(). We will |
| 9 | provide a way to set dev->threaded and enable threaded mode |
| 10 | without a device up/down cycle in the following patch. |
| 11 | |
| 12 | Once threaded mode is enabled and the kthread is |
| 13 | started, napi_schedule() will wake up that thread instead |
| 14 | of scheduling the softirq. |
| 15 | |
| 16 | The threaded poll loop behaves much like net_rx_action(), |
| 17 | but it does not have to manipulate local irqs and uses |
| 18 | an explicit scheduling point based on netdev_budget. |
| 19 | |
| 20 | Co-developed-by: Paolo Abeni <pabeni@redhat.com> |
| 21 | Signed-off-by: Paolo Abeni <pabeni@redhat.com> |
| 22 | Co-developed-by: Hannes Frederic Sowa <hannes@stressinduktion.org> |
| 23 | Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org> |
| 24 | Co-developed-by: Jakub Kicinski <kuba@kernel.org> |
| 25 | Signed-off-by: Jakub Kicinski <kuba@kernel.org> |
| 26 | Signed-off-by: Wei Wang <weiwan@google.com> |
| 27 | Reviewed-by: Alexander Duyck <alexanderduyck@fb.com> |
| 28 | Signed-off-by: David S. Miller <davem@davemloft.net> |
| 29 | --- |
| 30 | |
| 31 | --- a/include/linux/netdevice.h |
| 32 | +++ b/include/linux/netdevice.h |
| 33 | @@ -349,6 +349,7 @@ struct napi_struct { |
| 34 | struct list_head dev_list; |
| 35 | struct hlist_node napi_hash_node; |
| 36 | unsigned int napi_id; |
| 37 | + struct task_struct *thread; |
| 38 | }; |
| 39 | |
| 40 | enum { |
| 41 | @@ -359,6 +360,7 @@ enum { |
| 42 | NAPI_STATE_HASHED, /* In NAPI hash (busy polling possible) */ |
| 43 | NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */ |
| 44 | NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */ |
| 45 | + NAPI_STATE_THREADED, /* The poll is performed inside its own thread */ |
| 46 | }; |
| 47 | |
| 48 | enum { |
| 49 | @@ -369,6 +371,7 @@ enum { |
| 50 | NAPIF_STATE_HASHED = BIT(NAPI_STATE_HASHED), |
| 51 | NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL), |
| 52 | NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL), |
| 53 | + NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED), |
| 54 | }; |
| 55 | |
| 56 | enum gro_result { |
| 57 | @@ -513,20 +516,7 @@ bool napi_hash_del(struct napi_struct *n |
| 58 | */ |
| 59 | void napi_disable(struct napi_struct *n); |
| 60 | |
| 61 | -/** |
| 62 | - * napi_enable - enable NAPI scheduling |
| 63 | - * @n: NAPI context |
| 64 | - * |
| 65 | - * Resume NAPI from being scheduled on this context. |
| 66 | - * Must be paired with napi_disable. |
| 67 | - */ |
| 68 | -static inline void napi_enable(struct napi_struct *n) |
| 69 | -{ |
| 70 | - BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); |
| 71 | - smp_mb__before_atomic(); |
| 72 | - clear_bit(NAPI_STATE_SCHED, &n->state); |
| 73 | - clear_bit(NAPI_STATE_NPSVC, &n->state); |
| 74 | -} |
| 75 | +void napi_enable(struct napi_struct *n); |
| 76 | |
| 77 | /** |
| 78 | * napi_synchronize - wait until NAPI is not running |
| 79 | @@ -1792,6 +1782,8 @@ enum netdev_ml_priv_type { |
| 80 | * |
| 81 | * @wol_enabled: Wake-on-LAN is enabled |
| 82 | * |
| 83 | + * @threaded: napi threaded mode is enabled |
| 84 | + * |
| 85 | * FIXME: cleanup struct net_device such that network protocol info |
| 86 | * moves out. |
| 87 | */ |
| 88 | @@ -2084,6 +2076,7 @@ struct net_device { |
| 89 | struct lock_class_key addr_list_lock_key; |
| 90 | bool proto_down; |
| 91 | unsigned wol_enabled:1; |
| 92 | + unsigned threaded:1; |
| 93 | }; |
| 94 | #define to_net_dev(d) container_of(d, struct net_device, dev) |
| 95 | |
| 96 | --- a/net/core/dev.c |
| 97 | +++ b/net/core/dev.c |
| 98 | @@ -91,6 +91,7 @@ |
| 99 | #include <linux/etherdevice.h> |
| 100 | #include <linux/ethtool.h> |
| 101 | #include <linux/skbuff.h> |
| 102 | +#include <linux/kthread.h> |
| 103 | #include <linux/bpf.h> |
| 104 | #include <linux/bpf_trace.h> |
| 105 | #include <net/net_namespace.h> |
| 106 | @@ -1289,6 +1290,27 @@ void netdev_notify_peers(struct net_devi |
| 107 | } |
| 108 | EXPORT_SYMBOL(netdev_notify_peers); |
| 109 | |
| 110 | +static int napi_threaded_poll(void *data); |
| 111 | + |
| 112 | +static int napi_kthread_create(struct napi_struct *n) |
| 113 | +{ |
| 114 | + int err = 0; |
| 115 | + |
| 116 | + /* Create and wake up the kthread once to put it in |
| 117 | + * TASK_INTERRUPTIBLE mode to avoid the blocked task |
| 118 | + * warning and work with loadavg. |
| 119 | + */ |
| 120 | + n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d", |
| 121 | + n->dev->name, n->napi_id); |
| 122 | + if (IS_ERR(n->thread)) { |
| 123 | + err = PTR_ERR(n->thread); |
| 124 | + pr_err("kthread_run failed with err %d\n", err); |
| 125 | + n->thread = NULL; |
| 126 | + } |
| 127 | + |
| 128 | + return err; |
| 129 | +} |
| 130 | + |
| 131 | static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) |
| 132 | { |
| 133 | const struct net_device_ops *ops = dev->netdev_ops; |
| 134 | @@ -3891,6 +3913,21 @@ int gro_normal_batch __read_mostly = 8; |
| 135 | static inline void ____napi_schedule(struct softnet_data *sd, |
| 136 | struct napi_struct *napi) |
| 137 | { |
| 138 | + struct task_struct *thread; |
| 139 | + |
| 140 | + if (test_bit(NAPI_STATE_THREADED, &napi->state)) { |
| 141 | + /* Paired with smp_mb__before_atomic() in |
| 142 | + * napi_enable(). Use READ_ONCE() to guarantee |
| 143 | + * a complete read on napi->thread. Only call |
| 144 | + * wake_up_process() when it's not NULL. |
| 145 | + */ |
| 146 | + thread = READ_ONCE(napi->thread); |
| 147 | + if (thread) { |
| 148 | + wake_up_process(thread); |
| 149 | + return; |
| 150 | + } |
| 151 | + } |
| 152 | + |
| 153 | list_add_tail(&napi->poll_list, &sd->poll_list); |
| 154 | __raise_softirq_irqoff(NET_RX_SOFTIRQ); |
| 155 | } |
| 156 | @@ -6282,6 +6319,12 @@ void netif_napi_add(struct net_device *d |
| 157 | set_bit(NAPI_STATE_NPSVC, &napi->state); |
| 158 | list_add_rcu(&napi->dev_list, &dev->napi_list); |
| 159 | napi_hash_add(napi); |
| 160 | + /* Create kthread for this napi if dev->threaded is set. |
| 161 | + * Clear dev->threaded if kthread creation failed so that |
| 162 | + * threaded mode will not be enabled in napi_enable(). |
| 163 | + */ |
| 164 | + if (dev->threaded && napi_kthread_create(napi)) |
| 165 | + dev->threaded = 0; |
| 166 | } |
| 167 | EXPORT_SYMBOL(netif_napi_add); |
| 168 | |
| 169 | @@ -6298,9 +6341,28 @@ void napi_disable(struct napi_struct *n) |
| 170 | hrtimer_cancel(&n->timer); |
| 171 | |
| 172 | clear_bit(NAPI_STATE_DISABLE, &n->state); |
| 173 | + clear_bit(NAPI_STATE_THREADED, &n->state); |
| 174 | } |
| 175 | EXPORT_SYMBOL(napi_disable); |
| 176 | |
| 177 | +/** |
| 178 | + * napi_enable - enable NAPI scheduling |
| 179 | + * @n: NAPI context |
| 180 | + * |
| 181 | + * Resume NAPI from being scheduled on this context. |
| 182 | + * Must be paired with napi_disable. |
| 183 | + */ |
| 184 | +void napi_enable(struct napi_struct *n) |
| 185 | +{ |
| 186 | + BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); |
| 187 | + smp_mb__before_atomic(); |
| 188 | + clear_bit(NAPI_STATE_SCHED, &n->state); |
| 189 | + clear_bit(NAPI_STATE_NPSVC, &n->state); |
| 190 | + if (n->dev->threaded && n->thread) |
| 191 | + set_bit(NAPI_STATE_THREADED, &n->state); |
| 192 | +} |
| 193 | +EXPORT_SYMBOL(napi_enable); |
| 194 | + |
| 195 | static void flush_gro_hash(struct napi_struct *napi) |
| 196 | { |
| 197 | int i; |
| 198 | @@ -6325,6 +6387,11 @@ void netif_napi_del(struct napi_struct * |
| 199 | |
| 200 | flush_gro_hash(napi); |
| 201 | napi->gro_bitmask = 0; |
| 202 | + |
| 203 | + if (napi->thread) { |
| 204 | + kthread_stop(napi->thread); |
| 205 | + napi->thread = NULL; |
| 206 | + } |
| 207 | } |
| 208 | EXPORT_SYMBOL(netif_napi_del); |
| 209 | |
| 210 | @@ -6404,6 +6471,51 @@ static int napi_poll(struct napi_struct |
| 211 | return work; |
| 212 | } |
| 213 | |
| 214 | +static int napi_thread_wait(struct napi_struct *napi) |
| 215 | +{ |
| 216 | + set_current_state(TASK_INTERRUPTIBLE); |
| 217 | + |
| 218 | + while (!kthread_should_stop() && !napi_disable_pending(napi)) { |
| 219 | + if (test_bit(NAPI_STATE_SCHED, &napi->state)) { |
| 220 | + WARN_ON(!list_empty(&napi->poll_list)); |
| 221 | + __set_current_state(TASK_RUNNING); |
| 222 | + return 0; |
| 223 | + } |
| 224 | + |
| 225 | + schedule(); |
| 226 | + set_current_state(TASK_INTERRUPTIBLE); |
| 227 | + } |
| 228 | + __set_current_state(TASK_RUNNING); |
| 229 | + return -1; |
| 230 | +} |
| 231 | + |
| 232 | +static int napi_threaded_poll(void *data) |
| 233 | +{ |
| 234 | + struct napi_struct *napi = data; |
| 235 | + void *have; |
| 236 | + |
| 237 | + while (!napi_thread_wait(napi)) { |
| 238 | + for (;;) { |
| 239 | + bool repoll = false; |
| 240 | + |
| 241 | + local_bh_disable(); |
| 242 | + |
| 243 | + have = netpoll_poll_lock(napi); |
| 244 | + __napi_poll(napi, &repoll); |
| 245 | + netpoll_poll_unlock(have); |
| 246 | + |
| 247 | + __kfree_skb_flush(); |
| 248 | + local_bh_enable(); |
| 249 | + |
| 250 | + if (!repoll) |
| 251 | + break; |
| 252 | + |
| 253 | + cond_resched(); |
| 254 | + } |
| 255 | + } |
| 256 | + return 0; |
| 257 | +} |
| 258 | + |
| 259 | static __latent_entropy void net_rx_action(struct softirq_action *h) |
| 260 | { |
| 261 | struct softnet_data *sd = this_cpu_ptr(&softnet_data); |