blob: 6f01964ec4ef728b4ac09bd1a6488826fbbf0e70 [file] [log] [blame]
yuezonghe824eb0c2024-06-27 02:32:26 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Generic INET transport hashtables
7 *
8 * Authors: Lotsa people, from code originally in tcp
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 */
15
16#include <linux/module.h>
17#include <linux/random.h>
18#include <linux/sched.h>
19#include <linux/slab.h>
20#include <linux/wait.h>
21
22#include <net/inet_connection_sock.h>
23#include <net/inet_hashtables.h>
24#include <net/secure_seq.h>
25#include <net/ip.h>
26
27
28
29
/*
 * Allocate and initialize a new local port bind bucket.
 * The bindhash mutex for snum's hash chain must be held here.
 *
 * Returns the new bucket linked onto @head's chain, or NULL if the
 * slab allocation failed.
 */
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
						 struct net *net,
						 struct inet_bind_hashbucket *head,
						 const unsigned short snum)
{
	/* GFP_ATOMIC: callers hold the bhash chain spinlock, so no sleeping. */
	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);

	if (tb != NULL) {
		/* NOTE(review): netslab_inc() looks like vendor slab
		 * accounting, paired with netslab_dec() on destroy — confirm. */
		netslab_inc(INET_HASHTABLES_SLAB);
		/* Bucket pins its netns until inet_bind_bucket_destroy(). */
		write_pnet(&tb->ib_net, hold_net(net));
		tb->port = snum;
		tb->fastreuse = 0;
		tb->num_owners = 0;
		INIT_HLIST_HEAD(&tb->owners);
		/* Publish on the chain last; safe under the caller-held lock. */
		hlist_add_head(&tb->node, &head->chain);
	}
	return tb;
}
52
/*
 * Caller must hold hashbucket lock for this tb with local BH disabled
 *
 * Frees @tb only when no socket owns the port any more; a bucket that
 * still has owners is left untouched.
 */
void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
{
	if (hlist_empty(&tb->owners)) {
		__hlist_del(&tb->node);
		/* Drop the netns ref taken in inet_bind_bucket_create(). */
		release_net(ib_net(tb));
		netslab_dec(INET_HASHTABLES_SLAB);
		kmem_cache_free(cachep, tb);
	}
}
65
66void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
67 const unsigned short snum)
68{
69 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
70
71 atomic_inc(&hashinfo->bsockets);
72
73 inet_sk(sk)->inet_num = snum;
74 sk_add_bind_node(sk, &tb->owners);
75 tb->num_owners++;
76 inet_csk(sk)->icsk_bind_hash = tb;
77}
78
/*
 * Get rid of any references to a local port held by the given sock.
 * Caller must have BHs disabled (see inet_put_port()).
 */
static void __inet_put_port(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num,
			hashinfo->bhash_size);
	struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
	struct inet_bind_bucket *tb;

	atomic_dec(&hashinfo->bsockets);

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	/* Unlink from the bucket's owner list and clear the back-pointer. */
	__sk_del_bind_node(sk);
	tb->num_owners--;
	inet_csk(sk)->icsk_bind_hash = NULL;
	inet_sk(sk)->inet_num = 0;
	/* Frees tb only if this socket was the port's last owner. */
	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
	spin_unlock(&head->lock);
}
101
/* BH-safe wrapper: __inet_put_port() requires bottom halves disabled. */
void inet_put_port(struct sock *sk)
{
	local_bh_disable();
	__inet_put_port(sk);
	local_bh_enable();
}
EXPORT_SYMBOL(inet_put_port);
109
/*
 * Make @child an owner of the local port it carries in inet_num, sharing
 * the listener @sk's bind bucket when the ports match.  Returns 0 on
 * success, -ENOMEM if a needed bind bucket could not be allocated.
 */
int __inet_inherit_port(struct sock *sk, struct sock *child)
{
	struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
	unsigned short port = inet_sk(child)->inet_num;
	const int bhash = inet_bhashfn(sock_net(sk), port,
			table->bhash_size);
	struct inet_bind_hashbucket *head = &table->bhash[bhash];
	struct inet_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	if (tb->port != port) {
		/* NOTE: using tproxy and redirecting skbs to a proxy
		 * on a different listener port breaks the assumption
		 * that the listener socket's icsk_bind_hash is the same
		 * as that of the child socket. We have to look up or
		 * create a new bind bucket for the child here. */
		struct hlist_node *node;
		/* Search this chain for an existing bucket for (net, port). */
		inet_bind_bucket_for_each(tb, node, &head->chain) {
			if (net_eq(ib_net(tb), sock_net(sk)) &&
			    tb->port == port)
				break;
		}
		/* node == NULL means the walk fell off the end: no bucket. */
		if (!node) {
			tb = inet_bind_bucket_create(table->bind_bucket_cachep,
						     sock_net(sk), head, port);
			if (!tb) {
				spin_unlock(&head->lock);
				return -ENOMEM;
			}
		}
	}
	inet_bind_hash(child, tb, port);
	spin_unlock(&head->lock);

	return 0;
}
EXPORT_SYMBOL_GPL(__inet_inherit_port);
148
149static inline int compute_score(struct sock *sk, struct net *net,
150 const unsigned short hnum, const __be32 daddr,
151 const int dif)
152{
153 int score = -1;
154 struct inet_sock *inet = inet_sk(sk);
155
156 if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
157 !ipv6_only_sock(sk)) {
158 __be32 rcv_saddr = inet->inet_rcv_saddr;
159 score = sk->sk_family == PF_INET ? 1 : 0;
160 if (rcv_saddr) {
161 if (rcv_saddr != daddr)
162 return -1;
163 score += 2;
164 }
165 if (sk->sk_bound_dev_if) {
166 if (sk->sk_bound_dev_if != dif)
167 return -1;
168 score += 2;
169 }
170 }
171 return score;
172}
173
174/*
175 * Don't inline this cruft. Here are some nice properties to exploit here. The
176 * BSD API does not allow a listening sock to specify the remote port nor the
177 * remote address for the connection. So always assume those are both
178 * wildcarded during the search since they can never be otherwise.
179 */
180
181
/*
 * Lockless (RCU) lookup of the best listening socket for an incoming
 * packet addressed to (daddr, hnum) on device @dif.  Returns the socket
 * with a reference held, or NULL.
 */
struct sock *__inet_lookup_listener(struct net *net,
				    struct inet_hashinfo *hashinfo,
				    const __be32 daddr, const unsigned short hnum,
				    const int dif)
{
	struct sock *sk, *result;
	struct hlist_nulls_node *node;
	unsigned int hash = inet_lhashfn(net, hnum);
	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
	int score, hiscore;

	rcu_read_lock();
begin:
	result = NULL;
	hiscore = -1;
	/* Keep the highest-scoring listener (exact address/device beats
	 * wildcard; see compute_score()). */
	sk_nulls_for_each_rcu(sk, node, &ilb->head) {
		score = compute_score(sk, net, hnum, daddr, dif);
		if (score > hiscore) {
			result = sk;
			hiscore = score;
		}
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
		goto begin;
	if (result) {
		/* Socket may be freed/reused under RCU: only take a ref if
		 * it is still live, then re-validate that it still matches
		 * at least as well as when we chose it. */
		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
			result = NULL;
		else if (unlikely(compute_score(result, net, hnum, daddr,
				  dif) < hiscore)) {
			sock_put(result);
			goto begin;
		}
	}
	rcu_read_unlock();
	/* NOTE(review): net_run_track() appears to be vendor tracing — confirm. */
	net_run_track(PRT_SOCK,"SOCK");
	return result;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);
225
/*
 * Lockless (RCU) lookup in the established hash for the exact 4-tuple
 * (saddr:sport -> daddr:hnum) on device @dif.  Checks the established
 * chain first, then the TIME_WAIT chain of the same slot.  Returns the
 * socket with a reference held, or NULL.
 */
struct sock * __inet_lookup_established(struct net *net,
				  struct inet_hashinfo *hashinfo,
				  const __be32 saddr, const __be16 sport,
				  const __be32 daddr, const u16 hnum,
				  const int dif)
{
	INET_ADDR_COOKIE(acookie, saddr, daddr)
	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	const struct hlist_nulls_node *node;
	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyways.
	 */
	unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
	unsigned int slot = hash & hashinfo->ehash_mask;
	struct inet_ehash_bucket *head = &hashinfo->ehash[slot];

	rcu_read_lock();
begin:
	sk_nulls_for_each_rcu(sk, node, &head->chain) {
		if (INET_MATCH(sk, net, hash, acookie,
					saddr, daddr, ports, dif)) {
			/* Socket may be recycled under RCU: grab a ref only
			 * if still live, then re-check the match held. */
			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
				goto begintw;
			if (unlikely(!INET_MATCH(sk, net, hash, acookie,
				saddr, daddr, ports, dif))) {
				sock_put(sk);
				goto begin;
			}
			goto out;
		}
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(node) != slot)
		goto begin;

begintw:
	/* Must check for a TIME_WAIT'er before going to listener hash. */
	sk_nulls_for_each_rcu(sk, node, &head->twchain) {
		if (INET_TW_MATCH(sk, net, hash, acookie,
					saddr, daddr, ports, dif)) {
			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
				sk = NULL;
				goto out;
			}
			if (unlikely(!INET_TW_MATCH(sk, net, hash, acookie,
				 saddr, daddr, ports, dif))) {
				inet_twsk_put(inet_twsk(sk));
				goto begintw;
			}
			goto out;
		}
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(node) != slot)
		goto begintw;
	sk = NULL;
out:
	rcu_read_unlock();
	/* NOTE(review): net_run_track() appears to be vendor tracing — confirm. */
	net_run_track(PRT_SOCK,"SOCK");
	return sk;
}
EXPORT_SYMBOL_GPL(__inet_lookup_established);
297
/* called with local bh disabled */
/*
 * Verify that binding @sk to local port @lport yields a unique 4-tuple
 * in the established hash.  A TIME_WAIT socket occupying the tuple may
 * be recycled if twsk_unique() allows.  On success the socket is hashed
 * into the ehash and 0 is returned; if @twp is non-NULL the displaced
 * timewait socket (or NULL) is handed back for the caller to release,
 * otherwise it is descheduled and released here.  Returns
 * -EADDRNOTAVAIL when the tuple is already taken.
 */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
				    struct sock *sk, __u16 lport,
				    struct inet_timewait_sock **twp)
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_sock *inet = inet_sk(sk);
	/* Local side plays the "destination" role in ehash convention. */
	__be32 daddr = inet->inet_rcv_saddr;
	__be32 saddr = inet->inet_daddr;
	int dif = sk->sk_bound_dev_if;
	INET_ADDR_COOKIE(acookie, saddr, daddr)
	const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
	struct net *net = sock_net(sk);
	unsigned int hash = inet_ehashfn(net, daddr, lport,
					 saddr, inet->inet_dport);
	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
	spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
	struct sock *sk2;
	const struct hlist_nulls_node *node;
	struct inet_timewait_sock *tw;
	int twrefcnt = 0;

	spin_lock(lock);

	/* Check TIME-WAIT sockets first. */
	sk_nulls_for_each(sk2, node, &head->twchain) {
		tw = inet_twsk(sk2);

		if (INET_TW_MATCH(sk2, net, hash, acookie,
					saddr, daddr, ports, dif)) {
			if (twsk_unique(sk, sk2, twp))
				goto unique;
			else
				goto not_unique;
		}
	}
	tw = NULL;

	/* And established part... */
	sk_nulls_for_each(sk2, node, &head->chain) {
		if (INET_MATCH(sk2, net, hash, acookie,
					saddr, daddr, ports, dif))
			goto not_unique;
	}

unique:
	/* Must record num and sport now. Otherwise we will see
	 * in hash table socket with a funny identity. */
	inet->inet_num = lport;
	inet->inet_sport = htons(lport);
	sk->sk_hash = hash;
	WARN_ON(!sk_unhashed(sk));
	__sk_nulls_add_node_rcu(sk, &head->chain);
	if (tw) {
		/* Displace the recycled timewait socket from the ehash. */
		twrefcnt = inet_twsk_unhash(tw);
		NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
	}
	spin_unlock(lock);
	/* Drop refs released by unhashing, outside the chain lock. */
	if (twrefcnt)
		inet_twsk_put(tw);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);

	if (twp) {
		*twp = tw;
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		inet_twsk_deschedule(tw, death_row);

		inet_twsk_put(tw);
	}
	return 0;

not_unique:
	spin_unlock(lock);
	return -EADDRNOTAVAIL;
}
374
375static inline u32 inet_sk_port_offset(const struct sock *sk)
376{
377 const struct inet_sock *inet = inet_sk(sk);
378 return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
379 inet->inet_daddr,
380 inet->inet_dport);
381}
382
/*
 * Insert non-listening socket @sk into the established hash, optionally
 * displacing timewait socket @tw from the same slot.  Returns the number
 * of references the caller must drop on @tw (0 if none was unhashed).
 */
int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct hlist_nulls_head *list;
	spinlock_t *lock;
	struct inet_ehash_bucket *head;
	int twrefcnt = 0;

	WARN_ON(!sk_unhashed(sk));

	sk->sk_hash = inet_sk_ehashfn(sk);
	head = inet_ehash_bucket(hashinfo, sk->sk_hash);
	list = &head->chain;
	lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

	spin_lock(lock);
	__sk_nulls_add_node_rcu(sk, list);
	if (tw) {
		/* A recycled timewait must hash to the same slot. */
		WARN_ON(sk->sk_hash != tw->tw_hash);
		twrefcnt = inet_twsk_unhash(tw);
	}
	spin_unlock(lock);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	return twrefcnt;
}
EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
409
/*
 * Insert @sk into the proper lookup table: listeners go into the
 * listening hash, everything else into the established hash.  Caller
 * must have BHs disabled (see inet_hash()).
 */
static void __inet_hash(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct inet_listen_hashbucket *ilb;

	if (sk->sk_state != TCP_LISTEN) {
		__inet_hash_nolisten(sk, NULL);
		return;
	}

	WARN_ON(!sk_unhashed(sk));
	ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];

	spin_lock(&ilb->lock);
	__sk_nulls_add_node_rcu(sk, &ilb->head);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	spin_unlock(&ilb->lock);
}
428
429void inet_hash(struct sock *sk)
430{
431 if (sk->sk_state != TCP_CLOSE) {
432 local_bh_disable();
433 __inet_hash(sk);
434 local_bh_enable();
435 }
436}
437EXPORT_SYMBOL_GPL(inet_hash);
438
439void inet_unhash(struct sock *sk)
440{
441 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
442 spinlock_t *lock;
443 int done;
444
445 if (sk_unhashed(sk))
446 return;
447
448 if (sk->sk_state == TCP_LISTEN)
449 lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock;
450 else
451 lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
452
453 spin_lock_bh(lock);
454 done =__sk_nulls_del_node_init_rcu(sk);
455 if (done)
456 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
457 spin_unlock_bh(lock);
458}
459EXPORT_SYMBOL_GPL(inet_unhash);
460
/*
 * Pick (if needed) and claim a local port for an outgoing connection,
 * then hash @sk.  When the socket already has a port (snum != 0) only a
 * uniqueness check against the established hash is performed; otherwise
 * the ephemeral range is probed starting at hint + @port_offset until a
 * port with a unique 4-tuple is found (possibly recycling a TIME_WAIT
 * socket).  Returns 0 on success, -EADDRNOTAVAIL when no usable port
 * exists, or @check_established's error for an explicit port.
 */
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
		struct sock *sk, u32 port_offset,
		int (*check_established)(struct inet_timewait_death_row *,
			struct sock *, __u16, struct inet_timewait_sock **),
		int (*hash)(struct sock *sk, struct inet_timewait_sock *twp))
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	const unsigned short snum = inet_sk(sk)->inet_num;
	struct inet_bind_hashbucket *head;
	struct inet_bind_bucket *tb;
	int ret;
	struct net *net = sock_net(sk);
	/* NOTE(review): starts at 1, presumably for the reference held on
	 * a recycled timewait socket by check_established() — confirm. */
	int twrefcnt = 1;

	if (!snum) {
		int i, remaining, low, high, port;
		/* Rolling global start point so successive connects don't
		 * all probe the same ports. */
		static u32 hint;
		u32 offset = hint + port_offset;
		struct hlist_node *node;
		struct inet_timewait_sock *tw = NULL;

		inet_get_local_port_range(&low, &high);
		remaining = (high - low) + 1;

		local_bh_disable();
		for (i = 1; i <= remaining; i++) {
			port = low + (i + offset) % remaining;
			if (inet_is_reserved_local_port(port))
				continue;
			head = &hinfo->bhash[inet_bhashfn(net, port,
					hinfo->bhash_size)];
			spin_lock(&head->lock);

			/* Does not bother with rcv_saddr checks,
			 * because the established check is already
			 * unique enough.
			 */
			inet_bind_bucket_for_each(tb, node, &head->chain) {
				if (net_eq(ib_net(tb), net) &&
				    tb->port == port) {
					/* fastreuse >= 0 means the port is
					 * held by bind() users: skip it. */
					if (tb->fastreuse >= 0)
						goto next_port;
					WARN_ON(hlist_empty(&tb->owners));
					if (!check_established(death_row, sk,
							       port, &tw))
						goto ok;
					goto next_port;
				}
			}

			/* Port entirely unused: create its bucket. */
			tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
					net, head, port);
			if (!tb) {
				spin_unlock(&head->lock);
				break;
			}
			/* -1 marks a connect()-owned bucket. */
			tb->fastreuse = -1;
			goto ok;

		next_port:
			spin_unlock(&head->lock);
		}
		local_bh_enable();

		return -EADDRNOTAVAIL;

ok:
		hint += i;

		/* Head lock still held and bh's disabled */
		inet_bind_hash(sk, tb, port);
		if (sk_unhashed(sk)) {
			inet_sk(sk)->inet_sport = htons(port);
			twrefcnt += hash(sk, tw);
		}
		if (tw)
			twrefcnt += inet_twsk_bind_unhash(tw, hinfo);
		spin_unlock(&head->lock);

		if (tw) {
			inet_twsk_deschedule(tw, death_row);
			/* Drop every reference accumulated above. */
			while (twrefcnt) {
				twrefcnt--;
				inet_twsk_put(tw);
			}
		}

		ret = 0;
		goto out;
	}

	head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
	tb = inet_csk(sk)->icsk_bind_hash;
	spin_lock_bh(&head->lock);
	/* Sole owner of the bucket: the port is exclusively ours, no
	 * established-hash collision check is needed. */
	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
		hash(sk, NULL);
		spin_unlock_bh(&head->lock);
		/* NOTE(review): net_run_track() appears to be vendor tracing. */
		net_run_track(PRT_SOCK,"SOCK");
		return 0;
	} else {
		/* Deliberate asymmetry: drop only the lock here and keep
		 * BHs disabled for check_established(); the matching
		 * local_bh_enable() is at out:. */
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
		ret = check_established(death_row, sk, snum, NULL);
out:
		local_bh_enable();
		net_run_track(PRT_SOCK,"SOCK");
		return ret;
	}
}
570
571/*
572 * Bind a port for a connect operation and hash it.
573 */
574int inet_hash_connect(struct inet_timewait_death_row *death_row,
575 struct sock *sk)
576{
577 return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
578 __inet_check_established, __inet_hash_nolisten);
579}
580EXPORT_SYMBOL_GPL(inet_hash_connect);
581
582void inet_hashinfo_init(struct inet_hashinfo *h)
583{
584 int i;
585
586 atomic_set(&h->bsockets, 0);
587 for (i = 0; i < INET_LHTABLE_SIZE; i++) {
588 spin_lock_init(&h->listening_hash[i].lock);
589 INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
590 i + LISTENING_NULLS_BASE);
591 }
592}
593EXPORT_SYMBOL_GPL(inet_hashinfo_init);