From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Fri, 4 Jun 2021 17:17:36 +0200
Subject: [PATCH] wireguard: allowedips: remove nodes in O(1)

commit f634f418c227c912e7ea95a3299efdc9b10e4022 upstream.

Previously, deleting peers would require traversing the entire trie in
order to rebalance nodes and safely free them. This meant that removing
1000 peers from a trie with a half million nodes would take an extremely
long time, during which we're holding the rtnl lock. Large-scale users
were reporting 200ms latencies added to the networking stack as a whole
every time their userspace software would queue up significant removals.
That's a serious situation.

This commit fixes that by maintaining a double pointer to the parent's
bit pointer for each node, and then using the already existing node list
belonging to each peer to go directly to the node, fix up its pointers,
and free it with RCU. This means removal is O(1) instead of O(n), and we
don't use gobs of stack.

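To make the unlink concrete, here is a minimal userspace sketch of the
parent-bit idiom, with the RCU, locking, and peer-list machinery elided;
the type and function names are illustrative stand-ins, not the kernel's:

#include <stdlib.h>

struct node {
        struct node *bit[2];      /* the two child slots of the trie */
        struct node **parent_bit; /* address of whichever pointer refers
                                   * to this node: the parent's bit[0],
                                   * the parent's bit[1], or the root */
};

/* Unlink a node that has at most one child, in O(1): splice the
 * surviving child (or NULL) into the slot pointing at us, then free.
 * No traversal is needed; nodes with two children stay behind as
 * passthrough fillers, as in the patch.
 */
static void node_remove(struct node *node)
{
        struct node *child = node->bit[!node->bit[0]];

        if (child)
                child->parent_bit = node->parent_bit;
        *node->parent_bit = child;
        free(node);
}
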
The removal algorithm has the same downside as the code that it fixes:
it won't collapse needlessly long runs of fillers. We can enhance that
in the future if it ever becomes a problem. This commit documents that
limitation with a TODO comment in code, a small but meaningful
improvement over the prior situation.

Currently the biggest flaw, which the next commit addresses, is that
this increases the node size on 64-bit machines from 60 bytes to 68
bytes. 60 rounds up to 64, but 68 rounds up to 128. So we wind up using
twice as much memory per node, because of power-of-two allocations,
which is a big bummer. We'll need to figure something out there.

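For reference, the arithmetic behind those numbers, assuming the usual
LP64 member sizes for the struct allowedips_node layout shown in this
patch:

        peer pointer                          8
        bit[2] child pointers                16
        cidr, bit_at_a, bit_at_b, bitlen      4
        bits[16] key                         16
        peer_list/rcu union                  16
                                            ---
        without parent_bit                   60  -> kmalloc-64
        with the new 8-byte parent_bit       68  -> kmalloc-128
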
Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
Cc: stable@vger.kernel.org
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 drivers/net/wireguard/allowedips.c | 132 ++++++++++++-----------------
 drivers/net/wireguard/allowedips.h |   9 +-
 2 files changed, 57 insertions(+), 84 deletions(-)

--- a/drivers/net/wireguard/allowedips.c
+++ b/drivers/net/wireguard/allowedips.c
@@ -66,60 +66,6 @@ static void root_remove_peer_lists(struc
         }
 }
 
-static void walk_remove_by_peer(struct allowedips_node __rcu **top,
-                                struct wg_peer *peer, struct mutex *lock)
-{
-#define REF(p) rcu_access_pointer(p)
-#define DEREF(p) rcu_dereference_protected(*(p), lockdep_is_held(lock))
-#define PUSH(p) ({ \
-                WARN_ON(IS_ENABLED(DEBUG) && len >= 128); \
-                stack[len++] = p; \
-        })
-
-        struct allowedips_node __rcu **stack[128], **nptr;
-        struct allowedips_node *node, *prev;
-        unsigned int len;
-
-        if (unlikely(!peer || !REF(*top)))
-                return;
-
-        for (prev = NULL, len = 0, PUSH(top); len > 0; prev = node) {
-                nptr = stack[len - 1];
-                node = DEREF(nptr);
-                if (!node) {
-                        --len;
-                        continue;
-                }
-                if (!prev || REF(prev->bit[0]) == node ||
-                    REF(prev->bit[1]) == node) {
-                        if (REF(node->bit[0]))
-                                PUSH(&node->bit[0]);
-                        else if (REF(node->bit[1]))
-                                PUSH(&node->bit[1]);
-                } else if (REF(node->bit[0]) == prev) {
-                        if (REF(node->bit[1]))
-                                PUSH(&node->bit[1]);
-                } else {
-                        if (rcu_dereference_protected(node->peer,
-                                lockdep_is_held(lock)) == peer) {
-                                RCU_INIT_POINTER(node->peer, NULL);
-                                list_del_init(&node->peer_list);
-                                if (!node->bit[0] || !node->bit[1]) {
-                                        rcu_assign_pointer(*nptr, DEREF(
-                                               &node->bit[!REF(node->bit[0])]));
-                                        kfree_rcu(node, rcu);
-                                        node = DEREF(nptr);
-                                }
-                        }
-                        --len;
-                }
-        }
-
-#undef REF
-#undef DEREF
-#undef PUSH
-}
-
 static unsigned int fls128(u64 a, u64 b)
 {
         return a ? fls64(a) + 64U : fls64(b);
@@ -224,6 +170,7 @@ static int add(struct allowedips_node __
                 RCU_INIT_POINTER(node->peer, peer);
                 list_add_tail(&node->peer_list, &peer->allowedips_list);
                 copy_and_assign_cidr(node, key, cidr, bits);
+                rcu_assign_pointer(node->parent_bit, trie);
                 rcu_assign_pointer(*trie, node);
                 return 0;
         }
@@ -243,9 +190,9 @@ static int add(struct allowedips_node __
         if (!node) {
                 down = rcu_dereference_protected(*trie, lockdep_is_held(lock));
         } else {
-                down = rcu_dereference_protected(CHOOSE_NODE(node, key),
-                                                 lockdep_is_held(lock));
+                down = rcu_dereference_protected(CHOOSE_NODE(node, key), lockdep_is_held(lock));
                 if (!down) {
+                        rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(node, key));
                         rcu_assign_pointer(CHOOSE_NODE(node, key), newnode);
                         return 0;
                 }
@@ -254,29 +201,37 @@ static int add(struct allowedips_node __
         parent = node;
 
         if (newnode->cidr == cidr) {
+                rcu_assign_pointer(down->parent_bit, &CHOOSE_NODE(newnode, down->bits));
                 rcu_assign_pointer(CHOOSE_NODE(newnode, down->bits), down);
-                if (!parent)
+                if (!parent) {
+                        rcu_assign_pointer(newnode->parent_bit, trie);
                         rcu_assign_pointer(*trie, newnode);
-                else
-                        rcu_assign_pointer(CHOOSE_NODE(parent, newnode->bits),
-                                           newnode);
-        } else {
-                node = kzalloc(sizeof(*node), GFP_KERNEL);
-                if (unlikely(!node)) {
-                        list_del(&newnode->peer_list);
-                        kfree(newnode);
-                        return -ENOMEM;
+                } else {
+                        rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(parent, newnode->bits));
+                        rcu_assign_pointer(CHOOSE_NODE(parent, newnode->bits), newnode);
                 }
-                INIT_LIST_HEAD(&node->peer_list);
-                copy_and_assign_cidr(node, newnode->bits, cidr, bits);
+                return 0;
+        }
+
+        node = kzalloc(sizeof(*node), GFP_KERNEL);
+        if (unlikely(!node)) {
+                list_del(&newnode->peer_list);
+                kfree(newnode);
+                return -ENOMEM;
+        }
+        INIT_LIST_HEAD(&node->peer_list);
+        copy_and_assign_cidr(node, newnode->bits, cidr, bits);
 
-                rcu_assign_pointer(CHOOSE_NODE(node, down->bits), down);
-                rcu_assign_pointer(CHOOSE_NODE(node, newnode->bits), newnode);
-                if (!parent)
-                        rcu_assign_pointer(*trie, node);
-                else
-                        rcu_assign_pointer(CHOOSE_NODE(parent, node->bits),
-                                           node);
+        rcu_assign_pointer(down->parent_bit, &CHOOSE_NODE(node, down->bits));
+        rcu_assign_pointer(CHOOSE_NODE(node, down->bits), down);
+        rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(node, newnode->bits));
+        rcu_assign_pointer(CHOOSE_NODE(node, newnode->bits), newnode);
+        if (!parent) {
+                rcu_assign_pointer(node->parent_bit, trie);
+                rcu_assign_pointer(*trie, node);
+        } else {
+                rcu_assign_pointer(node->parent_bit, &CHOOSE_NODE(parent, node->bits));
+                rcu_assign_pointer(CHOOSE_NODE(parent, node->bits), node);
         }
         return 0;
 }
@@ -335,9 +290,30 @@ int wg_allowedips_insert_v6(struct allow
 void wg_allowedips_remove_by_peer(struct allowedips *table,
                                   struct wg_peer *peer, struct mutex *lock)
 {
+        struct allowedips_node *node, *child, *tmp;
+
+        if (list_empty(&peer->allowedips_list))
+                return;
         ++table->seq;
-        walk_remove_by_peer(&table->root4, peer, lock);
-        walk_remove_by_peer(&table->root6, peer, lock);
+        list_for_each_entry_safe(node, tmp, &peer->allowedips_list, peer_list) {
+                list_del_init(&node->peer_list);
+                RCU_INIT_POINTER(node->peer, NULL);
+                if (node->bit[0] && node->bit[1])
+                        continue;
+                child = rcu_dereference_protected(
+                                node->bit[!rcu_access_pointer(node->bit[0])],
+                                lockdep_is_held(lock));
+                if (child)
+                        child->parent_bit = node->parent_bit;
+                *rcu_dereference_protected(node->parent_bit, lockdep_is_held(lock)) = child;
+                kfree_rcu(node, rcu);
+
+                /* TODO: Note that we currently don't walk up and down in order to
+                 * free any potential filler nodes. This means that this function
+                 * doesn't free up as much as it could, which could be revisited
+                 * at some point.
+                 */
+        }
 }
 
 int wg_allowedips_read_node(struct allowedips_node *node, u8 ip[16], u8 *cidr)
--- a/drivers/net/wireguard/allowedips.h
+++ b/drivers/net/wireguard/allowedips.h
@@ -15,14 +15,11 @@ struct wg_peer;
 struct allowedips_node {
         struct wg_peer __rcu *peer;
         struct allowedips_node __rcu *bit[2];
-        /* While it may seem scandalous that we waste space for v4,
-         * we're alloc'ing to the nearest power of 2 anyway, so this
-         * doesn't actually make a difference.
-         */
-        u8 bits[16] __aligned(__alignof(u64));
         u8 cidr, bit_at_a, bit_at_b, bitlen;
+        u8 bits[16] __aligned(__alignof(u64));
 
-        /* Keep rarely used list at bottom to be beyond cache line. */
+        /* Keep rarely used members at bottom to be beyond cache line. */
+        struct allowedips_node *__rcu *parent_bit; /* XXX: this puts us at 68->128 bytes instead of 60->64 bytes!! */
         union {
                 struct list_head peer_list;
                 struct rcu_head rcu;
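
The XXX note above is the same power-of-two story as the commit message:
kmalloc serves objects from power-of-two size caches, so a 68-byte node
lands in a 128-byte slot. A tiny userspace sketch of that rounding,
modelling pure power-of-two classes as the commit message does (some
kernel configs also have odd-sized 96/192-byte caches); the helper name
here is ours, not the kernel's:

#include <stdio.h>

/* Round a request up to the next power-of-two size class (minimum 8). */
static size_t bucket_for(size_t size)
{
        size_t bucket = 8;

        while (bucket < size)
                bucket <<= 1;
        return bucket;
}

int main(void)
{
        printf("60 -> kmalloc-%zu\n", bucket_for(60)); /* kmalloc-64 */
        printf("68 -> kmalloc-%zu\n", bucket_for(68)); /* kmalloc-128 */
        return 0;
}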