From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Fri, 4 Jun 2021 17:17:36 +0200
Subject: [PATCH] wireguard: allowedips: remove nodes in O(1)

commit f634f418c227c912e7ea95a3299efdc9b10e4022 upstream.

Previously, deleting peers would require traversing the entire trie in
order to rebalance nodes and safely free them. This meant that removing
1000 peers from a trie with a half million nodes would take an extremely
long time, during which we're holding the rtnl lock. Large-scale users
were reporting 200ms latencies added to the networking stack as a whole
every time their userspace software would queue up significant removals.
That's a serious situation.

This commit fixes that by maintaining a double pointer to the parent's
bit pointer for each node, and then using the already existing node list
belonging to each peer to go directly to the node, fix up its pointers,
and free it with RCU. This means removal is O(1) instead of O(n), and we
don't use gobs of stack.
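
In concrete terms, every node now stores the address of the slot that
points at it (one of the parent's bit[] entries, or the trie root), so
unlinking becomes a constant-time splice. A simplified userspace sketch
of the idea, using plain pointers in place of the kernel's RCU
primitives (names mirror the kernel struct but are illustrative only):

	#include <stdlib.h>

	struct node {
		struct node *bit[2];      /* children */
		struct node **parent_bit; /* &parent->bit[b], or &root */
	};

	/* Splice out a node with at most one child in O(1): repoint
	 * whatever pointed at us to our only child (or NULL), then
	 * free. The kernel defers the free with kfree_rcu() so that
	 * concurrent readers can still walk through the node.
	 */
	static void remove_node(struct node *node)
	{
		struct node *child;

		if (node->bit[0] && node->bit[1])
			return; /* still needed as an interior node */
		child = node->bit[!node->bit[0]];
		if (child)
			child->parent_bit = node->parent_bit;
		*node->parent_bit = child;
		free(node);
	}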

The removal algorithm has the same downside as the code that it fixes:
it won't collapse needlessly long runs of fillers. We can enhance that
in the future if it ever becomes a problem. This commit documents that
limitation with a TODO comment in code, a small but meaningful
improvement over the prior situation.

Currently the biggest flaw, which the next commit addresses, is that
this increases the node size on 64-bit machines from 60 bytes to 68
bytes. 60 rounds up to 64, but 68 rounds up to 128. So we wind up
using twice as much memory per node, because of power-of-two
allocations, which is a big bummer. We'll need to figure something out
there.
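
For reference, those numbers come from summing the field sizes of
struct allowedips_node on a 64-bit machine, ignoring compiler padding
(pointers and list_head/rcu_head sizes are standard); a rough tally:

	struct wg_peer __rcu *peer;                 /*  8 bytes */
	struct allowedips_node __rcu *bit[2];       /* 16 bytes */
	u8 cidr, bit_at_a, bit_at_b, bitlen;        /*  4 bytes */
	u8 bits[16] __aligned(__alignof(u64));      /* 16 bytes */
	union { struct list_head peer_list;         /* 16 bytes */
		struct rcu_head rcu; };
	                                            /* total:  60 */
	struct allowedips_node *__rcu *parent_bit;  /* new:    +8 */
	                                            /* total:  68 */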

Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
Cc: stable@vger.kernel.org
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 drivers/net/wireguard/allowedips.c | 132 ++++++++++++-----------------
 drivers/net/wireguard/allowedips.h |   9 +-
 2 files changed, 57 insertions(+), 84 deletions(-)
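
One note for reading the hunks below: they lean heavily on the
CHOOSE_NODE() helper, which is defined earlier in allowedips.c and is
not visible in this diff. At the time of this patch it expands to
roughly the following lvalue, selecting the child slot for the key bit
that the node discriminates on:

	#define CHOOSE_NODE(parent, key) \
		parent->bit[(key[parent->bit_at_a] >> parent->bit_at_b) & 1]

Because the expansion is an lvalue, &CHOOSE_NODE(node, key) is the
address of the chosen child slot, which is exactly the value the new
parent_bit member stores.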

--- a/drivers/net/wireguard/allowedips.c
+++ b/drivers/net/wireguard/allowedips.c
@@ -66,60 +66,6 @@ static void root_remove_peer_lists(struc
 	}
 }
 
-static void walk_remove_by_peer(struct allowedips_node __rcu **top,
-				struct wg_peer *peer, struct mutex *lock)
-{
-#define REF(p) rcu_access_pointer(p)
-#define DEREF(p) rcu_dereference_protected(*(p), lockdep_is_held(lock))
-#define PUSH(p) ({ \
-		WARN_ON(IS_ENABLED(DEBUG) && len >= 128); \
-		stack[len++] = p; \
-	})
-
-	struct allowedips_node __rcu **stack[128], **nptr;
-	struct allowedips_node *node, *prev;
-	unsigned int len;
-
-	if (unlikely(!peer || !REF(*top)))
-		return;
-
-	for (prev = NULL, len = 0, PUSH(top); len > 0; prev = node) {
-		nptr = stack[len - 1];
-		node = DEREF(nptr);
-		if (!node) {
-			--len;
-			continue;
-		}
-		if (!prev || REF(prev->bit[0]) == node ||
-		    REF(prev->bit[1]) == node) {
-			if (REF(node->bit[0]))
-				PUSH(&node->bit[0]);
-			else if (REF(node->bit[1]))
-				PUSH(&node->bit[1]);
-		} else if (REF(node->bit[0]) == prev) {
-			if (REF(node->bit[1]))
-				PUSH(&node->bit[1]);
-		} else {
-			if (rcu_dereference_protected(node->peer,
-				lockdep_is_held(lock)) == peer) {
-				RCU_INIT_POINTER(node->peer, NULL);
-				list_del_init(&node->peer_list);
-				if (!node->bit[0] || !node->bit[1]) {
-					rcu_assign_pointer(*nptr, DEREF(
-					       &node->bit[!REF(node->bit[0])]));
-					kfree_rcu(node, rcu);
-					node = DEREF(nptr);
-				}
-			}
-			--len;
-		}
-	}
-
-#undef REF
-#undef DEREF
-#undef PUSH
-}
-
 static unsigned int fls128(u64 a, u64 b)
 {
 	return a ? fls64(a) + 64U : fls64(b);
@@ -224,6 +170,7 @@ static int add(struct allowedips_node __
 	RCU_INIT_POINTER(node->peer, peer);
 	list_add_tail(&node->peer_list, &peer->allowedips_list);
 	copy_and_assign_cidr(node, key, cidr, bits);
+	rcu_assign_pointer(node->parent_bit, trie);
 	rcu_assign_pointer(*trie, node);
 	return 0;
 }
@@ -243,9 +190,9 @@ static int add(struct allowedips_node __
 	if (!node) {
 		down = rcu_dereference_protected(*trie, lockdep_is_held(lock));
 	} else {
-		down = rcu_dereference_protected(CHOOSE_NODE(node, key),
-						 lockdep_is_held(lock));
+		down = rcu_dereference_protected(CHOOSE_NODE(node, key), lockdep_is_held(lock));
 		if (!down) {
+			rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(node, key));
 			rcu_assign_pointer(CHOOSE_NODE(node, key), newnode);
 			return 0;
 		}
@@ -254,29 +201,37 @@ static int add(struct allowedips_node __
 	parent = node;
 
 	if (newnode->cidr == cidr) {
+		rcu_assign_pointer(down->parent_bit, &CHOOSE_NODE(newnode, down->bits));
 		rcu_assign_pointer(CHOOSE_NODE(newnode, down->bits), down);
-		if (!parent)
+		if (!parent) {
+			rcu_assign_pointer(newnode->parent_bit, trie);
 			rcu_assign_pointer(*trie, newnode);
-		else
-			rcu_assign_pointer(CHOOSE_NODE(parent, newnode->bits),
-					   newnode);
-	} else {
-		node = kzalloc(sizeof(*node), GFP_KERNEL);
-		if (unlikely(!node)) {
-			list_del(&newnode->peer_list);
-			kfree(newnode);
-			return -ENOMEM;
+		} else {
+			rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(parent, newnode->bits));
+			rcu_assign_pointer(CHOOSE_NODE(parent, newnode->bits), newnode);
 		}
-		INIT_LIST_HEAD(&node->peer_list);
-		copy_and_assign_cidr(node, newnode->bits, cidr, bits);
+		return 0;
+	}
+
+	node = kzalloc(sizeof(*node), GFP_KERNEL);
+	if (unlikely(!node)) {
+		list_del(&newnode->peer_list);
+		kfree(newnode);
+		return -ENOMEM;
+	}
+	INIT_LIST_HEAD(&node->peer_list);
+	copy_and_assign_cidr(node, newnode->bits, cidr, bits);
 
-		rcu_assign_pointer(CHOOSE_NODE(node, down->bits), down);
-		rcu_assign_pointer(CHOOSE_NODE(node, newnode->bits), newnode);
-		if (!parent)
-			rcu_assign_pointer(*trie, node);
-		else
-			rcu_assign_pointer(CHOOSE_NODE(parent, node->bits),
-					   node);
+	rcu_assign_pointer(down->parent_bit, &CHOOSE_NODE(node, down->bits));
+	rcu_assign_pointer(CHOOSE_NODE(node, down->bits), down);
+	rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(node, newnode->bits));
+	rcu_assign_pointer(CHOOSE_NODE(node, newnode->bits), newnode);
+	if (!parent) {
+		rcu_assign_pointer(node->parent_bit, trie);
+		rcu_assign_pointer(*trie, node);
+	} else {
+		rcu_assign_pointer(node->parent_bit, &CHOOSE_NODE(parent, node->bits));
+		rcu_assign_pointer(CHOOSE_NODE(parent, node->bits), node);
 	}
 	return 0;
 }
@@ -335,9 +290,30 @@ int wg_allowedips_insert_v6(struct allow
 void wg_allowedips_remove_by_peer(struct allowedips *table,
 				  struct wg_peer *peer, struct mutex *lock)
 {
+	struct allowedips_node *node, *child, *tmp;
+
+	if (list_empty(&peer->allowedips_list))
+		return;
 	++table->seq;
-	walk_remove_by_peer(&table->root4, peer, lock);
-	walk_remove_by_peer(&table->root6, peer, lock);
+	list_for_each_entry_safe(node, tmp, &peer->allowedips_list, peer_list) {
+		list_del_init(&node->peer_list);
+		RCU_INIT_POINTER(node->peer, NULL);
+		if (node->bit[0] && node->bit[1])
+			continue;
+		child = rcu_dereference_protected(
+				node->bit[!rcu_access_pointer(node->bit[0])],
+				lockdep_is_held(lock));
+		if (child)
+			child->parent_bit = node->parent_bit;
+		*rcu_dereference_protected(node->parent_bit, lockdep_is_held(lock)) = child;
+		kfree_rcu(node, rcu);
+
+		/* TODO: Note that we currently don't walk up and down in order to
+		 * free any potential filler nodes. This means that this function
+		 * doesn't free up as much as it could, which could be revisited
+		 * at some point.
+		 */
+	}
 }
 
 int wg_allowedips_read_node(struct allowedips_node *node, u8 ip[16], u8 *cidr)
--- a/drivers/net/wireguard/allowedips.h
+++ b/drivers/net/wireguard/allowedips.h
@@ -15,14 +15,11 @@ struct wg_peer;
 struct allowedips_node {
 	struct wg_peer __rcu *peer;
 	struct allowedips_node __rcu *bit[2];
-	/* While it may seem scandalous that we waste space for v4,
-	 * we're alloc'ing to the nearest power of 2 anyway, so this
-	 * doesn't actually make a difference.
-	 */
-	u8 bits[16] __aligned(__alignof(u64));
 	u8 cidr, bit_at_a, bit_at_b, bitlen;
+	u8 bits[16] __aligned(__alignof(u64));
 
-	/* Keep rarely used list at bottom to be beyond cache line. */
+	/* Keep rarely used members at bottom to be beyond cache line. */
+	struct allowedips_node *__rcu *parent_bit; /* XXX: this puts us at 68->128 bytes instead of 60->64 bytes!! */
 	union {
 		struct list_head peer_list;
 		struct rcu_head rcu;