1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Implementation of the Transmission Control Protocol(TCP).
8 *
9 * IPv4 specific functions
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 */
18
19/*
20 * Changes:
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
29 * ACK bit.
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
40 * coma.
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
46 */
47
48#define pr_fmt(fmt) "TCP: " fmt
49
50#include <linux/bottom_half.h>
51#include <linux/types.h>
52#include <linux/fcntl.h>
53#include <linux/module.h>
54#include <linux/random.h>
55#include <linux/cache.h>
56#include <linux/jhash.h>
57#include <linux/init.h>
58#include <linux/times.h>
59#include <linux/slab.h>
60
61#include <net/net_namespace.h>
62#include <net/icmp.h>
63#include <net/inet_hashtables.h>
64#include <net/tcp.h>
65#include <net/transp_v6.h>
66#include <net/ipv6.h>
67#include <net/inet_common.h>
68#include <net/timewait_sock.h>
69#include <net/xfrm.h>
70#include <net/secure_seq.h>
71#include <net/busy_poll.h>
72
73#include <linux/inet.h>
74#include <linux/ipv6.h>
75#include <linux/stddef.h>
76#include <linux/proc_fs.h>
77#include <linux/seq_file.h>
78#include <linux/inetdevice.h>
79
80#include <crypto/hash.h>
81#include <linux/scatterlist.h>
82
83#include <trace/events/tcp.h>
84
85#ifdef CONFIG_TCP_MD5SIG
86static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
87 __be32 daddr, __be32 saddr, const struct tcphdr *th);
88#endif
89
90struct inet_hashinfo tcp_hashinfo;
91EXPORT_SYMBOL(tcp_hashinfo);
92
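/* Initial sequence number for a passively opened connection, keyed on the
 * address/port 4-tuple of the incoming SYN.
 */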
93static u32 tcp_v4_init_seq(const struct sk_buff *skb)
94{
95 return secure_tcp_seq(ip_hdr(skb)->daddr,
96 ip_hdr(skb)->saddr,
97 tcp_hdr(skb)->dest,
98 tcp_hdr(skb)->source);
99}
100
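/* Keyed per-address-pair timestamp offset, so that raw TCP timestamps
 * are not comparable across different peers.
 */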
101static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
102{
103 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
104}
105
106int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
107{
108 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
109 const struct inet_timewait_sock *tw = inet_twsk(sktw);
110 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111 struct tcp_sock *tp = tcp_sk(sk);
112
113 if (tw->tw_substate == TCP_FIN_WAIT2)
114 reuse = 0;
115
116 if (reuse == 2) {
117 /* Still does not detect *everything* that goes through
118 * lo, since we require a loopback src or dst address
119 * or direct binding to 'lo' interface.
120 */
121 bool loopback = false;
122 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
123 loopback = true;
124#if IS_ENABLED(CONFIG_IPV6)
125 if (tw->tw_family == AF_INET6) {
126 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127 (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
128 (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
129 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
130 (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
131 (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
132 loopback = true;
133 } else
134#endif
135 {
136 if (ipv4_is_loopback(tw->tw_daddr) ||
137 ipv4_is_loopback(tw->tw_rcv_saddr))
138 loopback = true;
139 }
140 if (!loopback)
141 reuse = 0;
142 }
143
144 /* With PAWS, it is safe from the viewpoint
145 of data integrity. Even without PAWS it is safe provided sequence
146 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
147
148 Actually, the idea is close to VJ's one, only timestamp cache is
149 held not per host, but per port pair and TW bucket is used as state
150 holder.
151
 152 If the TW bucket has already been destroyed we fall back to VJ's scheme
153 and use initial timestamp retrieved from peer table.
154 */
155 if (tcptw->tw_ts_recent_stamp &&
156 (!twp || (reuse && time_after32(ktime_get_seconds(),
157 tcptw->tw_ts_recent_stamp)))) {
158 /* inet_twsk_hashdance() sets sk_refcnt after putting twsk
159 * and releasing the bucket lock.
160 */
161 if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
162 return 0;
163
164 /* In case of repair and re-using TIME-WAIT sockets we still
165 * want to be sure that it is safe as above but honor the
166 * sequence numbers and time stamps set as part of the repair
167 * process.
168 *
169 * Without this check re-using a TIME-WAIT socket with TCP
170 * repair would accumulate a -1 on the repair assigned
171 * sequence number. The first time it is reused the sequence
172 * is -1, the second time -2, etc. This fixes that issue
173 * without appearing to create any others.
174 */
175 if (likely(!tp->repair)) {
176 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
177
178 if (!seq)
179 seq = 1;
180 WRITE_ONCE(tp->write_seq, seq);
181 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
182 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
183 }
184
185 return 1;
186 }
187
188 return 0;
189}
190EXPORT_SYMBOL_GPL(tcp_twsk_unique);
191
192static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
193 int addr_len)
194{
195 /* This check is replicated from tcp_v4_connect() and intended to
196 * prevent BPF program called below from accessing bytes that are out
197 * of the bound specified by user in addr_len.
198 */
199 if (addr_len < sizeof(struct sockaddr_in))
200 return -EINVAL;
201
202 sock_owned_by_me(sk);
203
204 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
205}
206
207/* This will initiate an outgoing connection. */
208int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
209{
210 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
211 struct inet_sock *inet = inet_sk(sk);
212 struct tcp_sock *tp = tcp_sk(sk);
213 __be16 orig_sport, orig_dport;
214 __be32 daddr, nexthop;
215 struct flowi4 *fl4;
216 struct rtable *rt;
217 int err;
218 struct ip_options_rcu *inet_opt;
219 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
220
221 if (addr_len < sizeof(struct sockaddr_in))
222 return -EINVAL;
223
224 if (usin->sin_family != AF_INET)
225 return -EAFNOSUPPORT;
226
227 nexthop = daddr = usin->sin_addr.s_addr;
228 inet_opt = rcu_dereference_protected(inet->inet_opt,
229 lockdep_sock_is_held(sk));
230 if (inet_opt && inet_opt->opt.srr) {
231 if (!daddr)
232 return -EINVAL;
233 nexthop = inet_opt->opt.faddr;
234 }
235
236 orig_sport = inet->inet_sport;
237 orig_dport = usin->sin_port;
238 fl4 = &inet->cork.fl.u.ip4;
239 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
240 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
241 IPPROTO_TCP,
242 orig_sport, orig_dport, sk);
243 if (IS_ERR(rt)) {
244 err = PTR_ERR(rt);
245 if (err == -ENETUNREACH)
246 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
247 return err;
248 }
249
250 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
251 ip_rt_put(rt);
252 return -ENETUNREACH;
253 }
254
255 if (!inet_opt || !inet_opt->opt.srr)
256 daddr = fl4->daddr;
257
258 if (!inet->inet_saddr)
259 inet->inet_saddr = fl4->saddr;
260 sk_rcv_saddr_set(sk, inet->inet_saddr);
261
262 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
263 /* Reset inherited state */
264 tp->rx_opt.ts_recent = 0;
265 tp->rx_opt.ts_recent_stamp = 0;
266 if (likely(!tp->repair))
267 WRITE_ONCE(tp->write_seq, 0);
268 }
269
270 inet->inet_dport = usin->sin_port;
271 sk_daddr_set(sk, daddr);
272
273 inet_csk(sk)->icsk_ext_hdr_len = 0;
274 if (inet_opt)
275 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
276
277 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
278
279 /* Socket identity is still unknown (sport may be zero).
 280 * However, we set the state to SYN-SENT and, without releasing the socket
 281 * lock, select a source port, enter ourselves into the hash tables and
282 * complete initialization after this.
283 */
284 tcp_set_state(sk, TCP_SYN_SENT);
285 err = inet_hash_connect(tcp_death_row, sk);
286 if (err)
287 goto failure;
288
289 sk_set_txhash(sk);
290
291 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
292 inet->inet_sport, inet->inet_dport, sk);
293 if (IS_ERR(rt)) {
294 err = PTR_ERR(rt);
295 rt = NULL;
296 goto failure;
297 }
298 /* OK, now commit destination to socket. */
299 sk->sk_gso_type = SKB_GSO_TCPV4;
300 sk_setup_caps(sk, &rt->dst);
301 rt = NULL;
302
303 if (likely(!tp->repair)) {
304 if (!tp->write_seq)
305 WRITE_ONCE(tp->write_seq,
306 secure_tcp_seq(inet->inet_saddr,
307 inet->inet_daddr,
308 inet->inet_sport,
309 usin->sin_port));
310 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
311 inet->inet_saddr,
312 inet->inet_daddr);
313 }
314
315 inet->inet_id = prandom_u32();
316
317 if (tcp_fastopen_defer_connect(sk, &err))
318 return err;
319 if (err)
320 goto failure;
321
322 err = tcp_connect(sk);
323
324 if (err)
325 goto failure;
326
327 return 0;
328
329failure:
330 /*
331 * This unhashes the socket and releases the local port,
332 * if necessary.
333 */
334 tcp_set_state(sk, TCP_CLOSE);
335 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
336 inet_reset_saddr(sk);
337 ip_rt_put(rt);
338 sk->sk_route_caps = 0;
339 inet->inet_dport = 0;
340 return err;
341}
342EXPORT_SYMBOL(tcp_v4_connect);
343
344/*
345 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
346 * It can be called through tcp_release_cb() if socket was owned by user
347 * at the time tcp_v4_err() was called to handle ICMP message.
348 */
349void tcp_v4_mtu_reduced(struct sock *sk)
350{
351 struct inet_sock *inet = inet_sk(sk);
352 struct dst_entry *dst;
353 u32 mtu;
354
355 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
356 return;
357 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
358 dst = inet_csk_update_pmtu(sk, mtu);
359 if (!dst)
360 return;
361
 362 /* Something is about to go wrong... Remember the soft error
 363 * for the case where this connection will not be able to recover.
364 */
365 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
366 sk->sk_err_soft = EMSGSIZE;
367
368 mtu = dst_mtu(dst);
369
370 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
371 ip_sk_accept_pmtu(sk) &&
372 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
373 tcp_sync_mss(sk, mtu);
374
375 /* Resend the TCP packet because it's
376 * clear that the old packet has been
377 * dropped. This is the new "fast" path mtu
378 * discovery.
379 */
380 tcp_simple_retransmit(sk);
381 } /* else let the usual retransmit timer handle it */
382}
383EXPORT_SYMBOL(tcp_v4_mtu_reduced);
384
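/* Handle an ICMP redirect: revalidate the cached dst and invoke its
 * redirect handler.
 */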
385static void do_redirect(struct sk_buff *skb, struct sock *sk)
386{
387 struct dst_entry *dst = __sk_dst_check(sk, 0);
388
389 if (dst)
390 dst->ops->redirect(dst, sk, skb);
391}
392
393
394/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
395void tcp_req_err(struct sock *sk, u32 seq, bool abort)
396{
397 struct request_sock *req = inet_reqsk(sk);
398 struct net *net = sock_net(sk);
399
400 /* ICMPs are not backlogged, hence we cannot get
401 * an established socket here.
402 */
403 if (seq != tcp_rsk(req)->snt_isn) {
404 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
405 } else if (abort) {
406 /*
407 * Still in SYN_RECV, just remove it silently.
408 * There is no good way to pass the error to the newly
409 * created socket, and POSIX does not want network
410 * errors returned from accept().
411 */
412 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
413 tcp_listendrop(req->rsk_listener);
414 }
415 reqsk_put(req);
416}
417EXPORT_SYMBOL(tcp_req_err);
418
419/*
420 * This routine is called by the ICMP module when it gets some
421 * sort of error condition. If err < 0 then the socket should
422 * be closed and the error returned to the user. If err > 0
423 * it's just the icmp type << 8 | icmp code. After adjustment
424 * header points to the first 8 bytes of the tcp header. We need
425 * to find the appropriate port.
426 *
427 * The locking strategy used here is very "optimistic". When
428 * someone else accesses the socket the ICMP is just dropped
429 * and for some paths there is no check at all.
430 * A more general error queue to queue errors for later handling
431 * is probably better.
432 *
433 */
434
435int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
436{
437 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
438 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
439 struct inet_connection_sock *icsk;
440 struct tcp_sock *tp;
441 struct inet_sock *inet;
442 const int type = icmp_hdr(icmp_skb)->type;
443 const int code = icmp_hdr(icmp_skb)->code;
444 struct sock *sk;
445 struct sk_buff *skb;
446 struct request_sock *fastopen;
447 u32 seq, snd_una;
448 s32 remaining;
449 u32 delta_us;
450 int err;
451 struct net *net = dev_net(icmp_skb->dev);
452
453 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
454 th->dest, iph->saddr, ntohs(th->source),
455 inet_iif(icmp_skb), 0);
456 if (!sk) {
457 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
458 return -ENOENT;
459 }
460 if (sk->sk_state == TCP_TIME_WAIT) {
461 inet_twsk_put(inet_twsk(sk));
462 return 0;
463 }
464 seq = ntohl(th->seq);
465 if (sk->sk_state == TCP_NEW_SYN_RECV) {
466 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
467 type == ICMP_TIME_EXCEEDED ||
468 (type == ICMP_DEST_UNREACH &&
469 (code == ICMP_NET_UNREACH ||
470 code == ICMP_HOST_UNREACH)));
471 return 0;
472 }
473
474 bh_lock_sock(sk);
475 /* If too many ICMPs get dropped on busy
476 * servers this needs to be solved differently.
477 * We do take care of PMTU discovery (RFC1191) special case :
478 * we can receive locally generated ICMP messages while socket is held.
479 */
480 if (sock_owned_by_user(sk)) {
481 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
482 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
483 }
484 if (sk->sk_state == TCP_CLOSE)
485 goto out;
486
487 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
488 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
489 goto out;
490 }
491
492 icsk = inet_csk(sk);
493 tp = tcp_sk(sk);
494 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
495 fastopen = rcu_dereference(tp->fastopen_rsk);
496 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
497 if (sk->sk_state != TCP_LISTEN &&
498 !between(seq, snd_una, tp->snd_nxt)) {
499 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
500 goto out;
501 }
502
503 switch (type) {
504 case ICMP_REDIRECT:
505 if (!sock_owned_by_user(sk))
506 do_redirect(icmp_skb, sk);
507 goto out;
508 case ICMP_SOURCE_QUENCH:
509 /* Just silently ignore these. */
510 goto out;
511 case ICMP_PARAMETERPROB:
512 err = EPROTO;
513 break;
514 case ICMP_DEST_UNREACH:
515 if (code > NR_ICMP_UNREACH)
516 goto out;
517
518 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
519 /* We are not interested in TCP_LISTEN and open_requests
 520 * (SYN-ACKs sent out by Linux are always <576 bytes so
521 * they should go through unfragmented).
522 */
523 if (sk->sk_state == TCP_LISTEN)
524 goto out;
525
526 WRITE_ONCE(tp->mtu_info, info);
527 if (!sock_owned_by_user(sk)) {
528 tcp_v4_mtu_reduced(sk);
529 } else {
530 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
531 sock_hold(sk);
532 }
533 goto out;
534 }
535
536 err = icmp_err_convert[code].errno;
537 /* check if icmp_skb allows revert of backoff
538 * (see draft-zimmermann-tcp-lcd) */
539 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
540 break;
541 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
542 !icsk->icsk_backoff || fastopen)
543 break;
544
545 if (sock_owned_by_user(sk))
546 break;
547
548 skb = tcp_rtx_queue_head(sk);
549 if (WARN_ON_ONCE(!skb))
550 break;
551
552 icsk->icsk_backoff--;
553 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
554 TCP_TIMEOUT_INIT;
555 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
556
557
558 tcp_mstamp_refresh(tp);
559 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
560 remaining = icsk->icsk_rto -
561 usecs_to_jiffies(delta_us);
562
563 if (remaining > 0) {
564 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
565 remaining, TCP_RTO_MAX);
566 } else {
567 /* RTO revert clocked out retransmission.
568 * Will retransmit now */
569 tcp_retransmit_timer(sk);
570 }
571
572 break;
573 case ICMP_TIME_EXCEEDED:
574 err = EHOSTUNREACH;
575 break;
576 default:
577 goto out;
578 }
579
580 switch (sk->sk_state) {
581 case TCP_SYN_SENT:
582 case TCP_SYN_RECV:
 583 /* Only in fast or simultaneous open. If a fast open socket is
 584 * already accepted, it is treated as a connected one below.
585 */
586 if (fastopen && !fastopen->sk)
587 break;
588
589 if (!sock_owned_by_user(sk)) {
590 sk->sk_err = err;
591
592 sk->sk_error_report(sk);
593
594 tcp_done(sk);
595 } else {
596 sk->sk_err_soft = err;
597 }
598 goto out;
599 }
600
601 /* If we've already connected we will keep trying
602 * until we time out, or the user gives up.
603 *
 604 * rfc1122 4.2.3.9 allows us to consider as hard errors
605 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
606 * but it is obsoleted by pmtu discovery).
607 *
608 * Note, that in modern internet, where routing is unreliable
609 * and in each dark corner broken firewalls sit, sending random
 610 * errors ordered by their masters, even these two messages finally lose
611 * their original sense (even Linux sends invalid PORT_UNREACHs)
612 *
613 * Now we are in compliance with RFCs.
614 * --ANK (980905)
615 */
616
617 inet = inet_sk(sk);
618 if (!sock_owned_by_user(sk) && inet->recverr) {
619 sk->sk_err = err;
620 sk->sk_error_report(sk);
621 } else { /* Only an error on timeout */
622 sk->sk_err_soft = err;
623 }
624
625out:
626 bh_unlock_sock(sk);
627 sock_put(sk);
628 return 0;
629}
630
631void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
632{
633 struct tcphdr *th = tcp_hdr(skb);
634
635 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
636 skb->csum_start = skb_transport_header(skb) - skb->head;
637 skb->csum_offset = offsetof(struct tcphdr, check);
638}
639
640/* This routine computes an IPv4 TCP checksum. */
641void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
642{
643 const struct inet_sock *inet = inet_sk(sk);
644
645 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
646}
647EXPORT_SYMBOL(tcp_v4_send_check);
648
649/*
650 * This routine will send an RST to the other tcp.
651 *
 652 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 653 * for the reset?
 654 * Answer: if a packet caused the RST, it is not for a socket
 655 * existing in our system; if it is matched to a socket,
 656 * it is just a duplicate segment or a bug in the other side's TCP.
 657 * So we build the reply based only on parameters
 658 * that arrived with the segment.
659 * Exception: precedence violation. We do not implement it in any case.
660 */
661
662static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
663{
664 const struct tcphdr *th = tcp_hdr(skb);
665 struct {
666 struct tcphdr th;
667#ifdef CONFIG_TCP_MD5SIG
668 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
669#endif
670 } rep;
671 struct ip_reply_arg arg;
672#ifdef CONFIG_TCP_MD5SIG
673 struct tcp_md5sig_key *key = NULL;
674 const __u8 *hash_location = NULL;
675 unsigned char newhash[16];
676 int genhash;
677 struct sock *sk1 = NULL;
678#endif
679 u64 transmit_time = 0;
680 struct sock *ctl_sk;
681 struct net *net;
682
683 /* Never send a reset in response to a reset. */
684 if (th->rst)
685 return;
686
687 /* If sk not NULL, it means we did a successful lookup and incoming
688 * route had to be correct. prequeue might have dropped our dst.
689 */
690 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
691 return;
692
693 /* Swap the send and the receive. */
694 memset(&rep, 0, sizeof(rep));
695 rep.th.dest = th->source;
696 rep.th.source = th->dest;
697 rep.th.doff = sizeof(struct tcphdr) / 4;
698 rep.th.rst = 1;
699
700 if (th->ack) {
701 rep.th.seq = th->ack_seq;
702 } else {
703 rep.th.ack = 1;
704 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
705 skb->len - (th->doff << 2));
706 }
707
708 memset(&arg, 0, sizeof(arg));
709 arg.iov[0].iov_base = (unsigned char *)&rep;
710 arg.iov[0].iov_len = sizeof(rep.th);
711
712 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
713#ifdef CONFIG_TCP_MD5SIG
714 rcu_read_lock();
715 hash_location = tcp_parse_md5sig_option(th);
716 if (sk && sk_fullsock(sk)) {
717 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
718 &ip_hdr(skb)->saddr, AF_INET);
719 } else if (hash_location) {
720 /*
 721 * The active side is lost. Try to find the listening socket through
 722 * the source port, and then find the md5 key through that socket.
 723 * We do not lose security here:
 724 * the incoming packet is checked with the md5 hash of the key we find;
 725 * no RST is generated if the md5 hash doesn't match.
726 */
727 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
728 ip_hdr(skb)->saddr,
729 th->source, ip_hdr(skb)->daddr,
730 ntohs(th->source), inet_iif(skb),
731 tcp_v4_sdif(skb));
732 /* don't send rst if it can't find key */
733 if (!sk1)
734 goto out;
735
736 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
737 &ip_hdr(skb)->saddr, AF_INET);
738 if (!key)
739 goto out;
740
741
742 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
743 if (genhash || memcmp(hash_location, newhash, 16) != 0)
744 goto out;
745
746 }
747
748 if (key) {
749 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
750 (TCPOPT_NOP << 16) |
751 (TCPOPT_MD5SIG << 8) |
752 TCPOLEN_MD5SIG);
753 /* Update length and the length the header thinks exists */
754 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
755 rep.th.doff = arg.iov[0].iov_len / 4;
756
757 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
758 key, ip_hdr(skb)->saddr,
759 ip_hdr(skb)->daddr, &rep.th);
760 }
761#endif
762 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
763 ip_hdr(skb)->saddr, /* XXX */
764 arg.iov[0].iov_len, IPPROTO_TCP, 0);
765 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
766 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
767
 768 /* When the socket is gone, all binding information is lost and
 769 * routing might fail. No choice here: if we choose to force the
 770 * input interface, we will misroute in the case of an asymmetric route.
771 */
772 if (sk) {
773 arg.bound_dev_if = sk->sk_bound_dev_if;
774 if (sk_fullsock(sk))
775 trace_tcp_send_reset(sk, skb);
776 }
777
778 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
779 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
780
781 arg.tos = ip_hdr(skb)->tos;
782 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
783 local_bh_disable();
784 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
785 if (sk) {
786 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
787 inet_twsk(sk)->tw_mark : sk->sk_mark;
788 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
789 inet_twsk(sk)->tw_priority : sk->sk_priority;
790 transmit_time = tcp_transmit_time(sk);
791 }
792 ip_send_unicast_reply(ctl_sk,
793 skb, &TCP_SKB_CB(skb)->header.h4.opt,
794 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
795 &arg, arg.iov[0].iov_len,
796 transmit_time);
797
798 ctl_sk->sk_mark = 0;
799 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
800 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
801 local_bh_enable();
802
803#ifdef CONFIG_TCP_MD5SIG
804out:
805 rcu_read_unlock();
806#endif
807}
808
 809 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 810 outside socket context, is ugly, certainly. What can I do?
811 */
812
813static void tcp_v4_send_ack(const struct sock *sk,
814 struct sk_buff *skb, u32 seq, u32 ack,
815 u32 win, u32 tsval, u32 tsecr, int oif,
816 struct tcp_md5sig_key *key,
817 int reply_flags, u8 tos)
818{
819 const struct tcphdr *th = tcp_hdr(skb);
820 struct {
821 struct tcphdr th;
822 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
823#ifdef CONFIG_TCP_MD5SIG
824 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
825#endif
826 ];
827 } rep;
828 struct net *net = sock_net(sk);
829 struct ip_reply_arg arg;
830 struct sock *ctl_sk;
831 u64 transmit_time;
832
833 memset(&rep.th, 0, sizeof(struct tcphdr));
834 memset(&arg, 0, sizeof(arg));
835
836 arg.iov[0].iov_base = (unsigned char *)&rep;
837 arg.iov[0].iov_len = sizeof(rep.th);
838 if (tsecr) {
839 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
840 (TCPOPT_TIMESTAMP << 8) |
841 TCPOLEN_TIMESTAMP);
842 rep.opt[1] = htonl(tsval);
843 rep.opt[2] = htonl(tsecr);
844 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
845 }
846
847 /* Swap the send and the receive. */
848 rep.th.dest = th->source;
849 rep.th.source = th->dest;
850 rep.th.doff = arg.iov[0].iov_len / 4;
851 rep.th.seq = htonl(seq);
852 rep.th.ack_seq = htonl(ack);
853 rep.th.ack = 1;
854 rep.th.window = htons(win);
855
856#ifdef CONFIG_TCP_MD5SIG
857 if (key) {
858 int offset = (tsecr) ? 3 : 0;
859
860 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
861 (TCPOPT_NOP << 16) |
862 (TCPOPT_MD5SIG << 8) |
863 TCPOLEN_MD5SIG);
864 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
865 rep.th.doff = arg.iov[0].iov_len/4;
866
867 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
868 key, ip_hdr(skb)->saddr,
869 ip_hdr(skb)->daddr, &rep.th);
870 }
871#endif
872 arg.flags = reply_flags;
873 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
874 ip_hdr(skb)->saddr, /* XXX */
875 arg.iov[0].iov_len, IPPROTO_TCP, 0);
876 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
877 if (oif)
878 arg.bound_dev_if = oif;
879 arg.tos = tos;
880 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
881 local_bh_disable();
882 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
883 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
884 inet_twsk(sk)->tw_mark : sk->sk_mark;
885 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
886 inet_twsk(sk)->tw_priority : sk->sk_priority;
887 transmit_time = tcp_transmit_time(sk);
888 ip_send_unicast_reply(ctl_sk,
889 skb, &TCP_SKB_CB(skb)->header.h4.opt,
890 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
891 &arg, arg.iov[0].iov_len,
892 transmit_time);
893
894 ctl_sk->sk_mark = 0;
895 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
896 local_bh_enable();
897}
898
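/* Send the ACK that answers a segment addressed to a TIME-WAIT socket. */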
899static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
900{
901 struct inet_timewait_sock *tw = inet_twsk(sk);
902 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
903
904 tcp_v4_send_ack(sk, skb,
905 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
906 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
907 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
908 tcptw->tw_ts_recent,
909 tw->tw_bound_dev_if,
910 tcp_twsk_md5_key(tcptw),
911 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
912 tw->tw_tos
913 );
914
915 inet_twsk_put(tw);
916}
917
918static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
919 struct request_sock *req)
920{
921 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
922 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
923 */
924 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
925 tcp_sk(sk)->snd_nxt;
926
927 /* RFC 7323 2.3
928 * The window field (SEG.WND) of every outgoing segment, with the
929 * exception of <SYN> segments, MUST be right-shifted by
930 * Rcv.Wind.Shift bits:
931 */
932 tcp_v4_send_ack(sk, skb, seq,
933 tcp_rsk(req)->rcv_nxt,
934 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
935 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
936 req->ts_recent,
937 0,
938 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
939 AF_INET),
940 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
941 ip_hdr(skb)->tos);
942}
943
944/*
945 * Send a SYN-ACK after having received a SYN.
946 * This still operates on a request_sock only, not on a big
947 * socket.
948 */
949static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
950 struct flowi *fl,
951 struct request_sock *req,
952 struct tcp_fastopen_cookie *foc,
953 enum tcp_synack_type synack_type)
954{
955 const struct inet_request_sock *ireq = inet_rsk(req);
956 struct flowi4 fl4;
957 int err = -1;
958 struct sk_buff *skb;
959
960 /* First, grab a route. */
961 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
962 return -1;
963
964 skb = tcp_make_synack(sk, dst, req, foc, synack_type);
965
966 if (skb) {
967 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
968
969 rcu_read_lock();
970 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
971 ireq->ir_rmt_addr,
972 rcu_dereference(ireq->ireq_opt));
973 rcu_read_unlock();
974 err = net_xmit_eval(err);
975 }
976
977 return err;
978}
979
980/*
981 * IPv4 request_sock destructor.
982 */
983static void tcp_v4_reqsk_destructor(struct request_sock *req)
984{
985 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
986}
987
988#ifdef CONFIG_TCP_MD5SIG
989/*
990 * RFC2385 MD5 checksumming requires a mapping of
991 * IP address->MD5 Key.
992 * We need to maintain these in the sk structure.
993 */
994
995DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
996EXPORT_SYMBOL(tcp_md5_needed);
997
998/* Find the Key structure for an address. */
999struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
1000 const union tcp_md5_addr *addr,
1001 int family)
1002{
1003 const struct tcp_sock *tp = tcp_sk(sk);
1004 struct tcp_md5sig_key *key;
1005 const struct tcp_md5sig_info *md5sig;
1006 __be32 mask;
1007 struct tcp_md5sig_key *best_match = NULL;
1008 bool match;
1009
1010 /* caller either holds rcu_read_lock() or socket lock */
1011 md5sig = rcu_dereference_check(tp->md5sig_info,
1012 lockdep_sock_is_held(sk));
1013 if (!md5sig)
1014 return NULL;
1015
1016 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1017 if (key->family != family)
1018 continue;
1019
1020 if (family == AF_INET) {
1021 mask = inet_make_mask(key->prefixlen);
1022 match = (key->addr.a4.s_addr & mask) ==
1023 (addr->a4.s_addr & mask);
1024#if IS_ENABLED(CONFIG_IPV6)
1025 } else if (family == AF_INET6) {
1026 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1027 key->prefixlen);
1028#endif
1029 } else {
1030 match = false;
1031 }
1032
1033 if (match && (!best_match ||
1034 key->prefixlen > best_match->prefixlen))
1035 best_match = key;
1036 }
1037 return best_match;
1038}
1039EXPORT_SYMBOL(__tcp_md5_do_lookup);
1040
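/* Exact-match lookup: address, family and prefix length must all match. */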
1041static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1042 const union tcp_md5_addr *addr,
1043 int family, u8 prefixlen)
1044{
1045 const struct tcp_sock *tp = tcp_sk(sk);
1046 struct tcp_md5sig_key *key;
1047 unsigned int size = sizeof(struct in_addr);
1048 const struct tcp_md5sig_info *md5sig;
1049
1050 /* caller either holds rcu_read_lock() or socket lock */
1051 md5sig = rcu_dereference_check(tp->md5sig_info,
1052 lockdep_sock_is_held(sk));
1053 if (!md5sig)
1054 return NULL;
1055#if IS_ENABLED(CONFIG_IPV6)
1056 if (family == AF_INET6)
1057 size = sizeof(struct in6_addr);
1058#endif
1059 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1060 if (key->family != family)
1061 continue;
1062 if (!memcmp(&key->addr, addr, size) &&
1063 key->prefixlen == prefixlen)
1064 return key;
1065 }
1066 return NULL;
1067}
1068
1069struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1070 const struct sock *addr_sk)
1071{
1072 const union tcp_md5_addr *addr;
1073
1074 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1075 return tcp_md5_do_lookup(sk, addr, AF_INET);
1076}
1077EXPORT_SYMBOL(tcp_v4_md5_lookup);
1078
1079/* This can be called on a newly created socket, from other files */
1080int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1081 int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1082 gfp_t gfp)
1083{
1084 /* Add Key to the list */
1085 struct tcp_md5sig_key *key;
1086 struct tcp_sock *tp = tcp_sk(sk);
1087 struct tcp_md5sig_info *md5sig;
1088
1089 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1090 if (key) {
1091 /* Pre-existing entry - just update that one.
1092 * Note that the key might be used concurrently.
1093 */
1094 memcpy(key->key, newkey, newkeylen);
1095
1096 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1097 * Also note that a reader could catch new key->keylen value
1098 * but old key->key[], this is the reason we use __GFP_ZERO
1099 * at sock_kmalloc() time below these lines.
1100 */
1101 WRITE_ONCE(key->keylen, newkeylen);
1102
1103 return 0;
1104 }
1105
1106 md5sig = rcu_dereference_protected(tp->md5sig_info,
1107 lockdep_sock_is_held(sk));
1108 if (!md5sig) {
1109 md5sig = kmalloc(sizeof(*md5sig), gfp);
1110 if (!md5sig)
1111 return -ENOMEM;
1112
1113 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1114 INIT_HLIST_HEAD(&md5sig->head);
1115 rcu_assign_pointer(tp->md5sig_info, md5sig);
1116 }
1117
1118 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1119 if (!key)
1120 return -ENOMEM;
1121 if (!tcp_alloc_md5sig_pool()) {
1122 sock_kfree_s(sk, key, sizeof(*key));
1123 return -ENOMEM;
1124 }
1125
1126 memcpy(key->key, newkey, newkeylen);
1127 key->keylen = newkeylen;
1128 key->family = family;
1129 key->prefixlen = prefixlen;
1130 memcpy(&key->addr, addr,
1131 (family == AF_INET6) ? sizeof(struct in6_addr) :
1132 sizeof(struct in_addr));
1133 hlist_add_head_rcu(&key->node, &md5sig->head);
1134 return 0;
1135}
1136EXPORT_SYMBOL(tcp_md5_do_add);
1137
1138int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1139 u8 prefixlen)
1140{
1141 struct tcp_md5sig_key *key;
1142
1143 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1144 if (!key)
1145 return -ENOENT;
1146 hlist_del_rcu(&key->node);
1147 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1148 kfree_rcu(key, rcu);
1149 return 0;
1150}
1151EXPORT_SYMBOL(tcp_md5_do_del);
1152
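/* Unlink and free (via RCU) every MD5 key attached to the socket. */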
1153static void tcp_clear_md5_list(struct sock *sk)
1154{
1155 struct tcp_sock *tp = tcp_sk(sk);
1156 struct tcp_md5sig_key *key;
1157 struct hlist_node *n;
1158 struct tcp_md5sig_info *md5sig;
1159
1160 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1161
1162 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1163 hlist_del_rcu(&key->node);
1164 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1165 kfree_rcu(key, rcu);
1166 }
1167}
1168
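/* setsockopt(TCP_MD5SIG / TCP_MD5SIG_EXT): copy the request from user
 * space and add or delete the corresponding key.
 */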
1169static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1170 char __user *optval, int optlen)
1171{
1172 struct tcp_md5sig cmd;
1173 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1174 u8 prefixlen = 32;
1175
1176 if (optlen < sizeof(cmd))
1177 return -EINVAL;
1178
1179 if (copy_from_user(&cmd, optval, sizeof(cmd)))
1180 return -EFAULT;
1181
1182 if (sin->sin_family != AF_INET)
1183 return -EINVAL;
1184
1185 if (optname == TCP_MD5SIG_EXT &&
1186 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1187 prefixlen = cmd.tcpm_prefixlen;
1188 if (prefixlen > 32)
1189 return -EINVAL;
1190 }
1191
1192 if (!cmd.tcpm_keylen)
1193 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1194 AF_INET, prefixlen);
1195
1196 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1197 return -EINVAL;
1198
1199 return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1200 AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1201 GFP_KERNEL);
1202}
1203
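/* Feed the IPv4 pseudo-header and the TCP header (with its checksum
 * zeroed) into the MD5 hash request.
 */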
1204static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1205 __be32 daddr, __be32 saddr,
1206 const struct tcphdr *th, int nbytes)
1207{
1208 struct tcp4_pseudohdr *bp;
1209 struct scatterlist sg;
1210 struct tcphdr *_th;
1211
1212 bp = hp->scratch;
1213 bp->saddr = saddr;
1214 bp->daddr = daddr;
1215 bp->pad = 0;
1216 bp->protocol = IPPROTO_TCP;
1217 bp->len = cpu_to_be16(nbytes);
1218
1219 _th = (struct tcphdr *)(bp + 1);
1220 memcpy(_th, th, sizeof(*th));
1221 _th->check = 0;
1222
1223 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1224 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1225 sizeof(*bp) + sizeof(*th));
1226 return crypto_ahash_update(hp->md5_req);
1227}
1228
1229static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1230 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1231{
1232 struct tcp_md5sig_pool *hp;
1233 struct ahash_request *req;
1234
1235 hp = tcp_get_md5sig_pool();
1236 if (!hp)
1237 goto clear_hash_noput;
1238 req = hp->md5_req;
1239
1240 if (crypto_ahash_init(req))
1241 goto clear_hash;
1242 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1243 goto clear_hash;
1244 if (tcp_md5_hash_key(hp, key))
1245 goto clear_hash;
1246 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1247 if (crypto_ahash_final(req))
1248 goto clear_hash;
1249
1250 tcp_put_md5sig_pool();
1251 return 0;
1252
1253clear_hash:
1254 tcp_put_md5sig_pool();
1255clear_hash_noput:
1256 memset(md5_hash, 0, 16);
1257 return 1;
1258}
1259
1260int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1261 const struct sock *sk,
1262 const struct sk_buff *skb)
1263{
1264 struct tcp_md5sig_pool *hp;
1265 struct ahash_request *req;
1266 const struct tcphdr *th = tcp_hdr(skb);
1267 __be32 saddr, daddr;
1268
1269 if (sk) { /* valid for establish/request sockets */
1270 saddr = sk->sk_rcv_saddr;
1271 daddr = sk->sk_daddr;
1272 } else {
1273 const struct iphdr *iph = ip_hdr(skb);
1274 saddr = iph->saddr;
1275 daddr = iph->daddr;
1276 }
1277
1278 hp = tcp_get_md5sig_pool();
1279 if (!hp)
1280 goto clear_hash_noput;
1281 req = hp->md5_req;
1282
1283 if (crypto_ahash_init(req))
1284 goto clear_hash;
1285
1286 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1287 goto clear_hash;
1288 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1289 goto clear_hash;
1290 if (tcp_md5_hash_key(hp, key))
1291 goto clear_hash;
1292 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1293 if (crypto_ahash_final(req))
1294 goto clear_hash;
1295
1296 tcp_put_md5sig_pool();
1297 return 0;
1298
1299clear_hash:
1300 tcp_put_md5sig_pool();
1301clear_hash_noput:
1302 memset(md5_hash, 0, 16);
1303 return 1;
1304}
1305EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1306
1307#endif
1308
1309/* Called with rcu_read_lock() */
1310static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1311 const struct sk_buff *skb)
1312{
1313#ifdef CONFIG_TCP_MD5SIG
1314 /*
1315 * This gets called for each TCP segment that arrives
1316 * so we want to be efficient.
1317 * We have 3 drop cases:
1318 * o No MD5 hash and one expected.
1319 * o MD5 hash and we're not expecting one.
 1320 * o MD5 hash and it's wrong.
1321 */
1322 const __u8 *hash_location = NULL;
1323 struct tcp_md5sig_key *hash_expected;
1324 const struct iphdr *iph = ip_hdr(skb);
1325 const struct tcphdr *th = tcp_hdr(skb);
1326 int genhash;
1327 unsigned char newhash[16];
1328
1329 hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1330 AF_INET);
1331 hash_location = tcp_parse_md5sig_option(th);
1332
1333 /* We've parsed the options - do we have a hash? */
1334 if (!hash_expected && !hash_location)
1335 return false;
1336
1337 if (hash_expected && !hash_location) {
1338 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1339 return true;
1340 }
1341
1342 if (!hash_expected && hash_location) {
1343 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1344 return true;
1345 }
1346
1347 /* Okay, so this is hash_expected and hash_location -
1348 * so we need to calculate the checksum.
1349 */
1350 genhash = tcp_v4_md5_hash_skb(newhash,
1351 hash_expected,
1352 NULL, skb);
1353
1354 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1355 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1356 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1357 &iph->saddr, ntohs(th->source),
1358 &iph->daddr, ntohs(th->dest),
1359 genhash ? " tcp_v4_calc_md5_hash failed"
1360 : "");
1361 return true;
1362 }
1363 return false;
1364#endif
1365 return false;
1366}
1367
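/* IPv4-specific init of a request sock: record the (swapped) addresses
 * from the incoming SYN and save its IP options.
 */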
1368static void tcp_v4_init_req(struct request_sock *req,
1369 const struct sock *sk_listener,
1370 struct sk_buff *skb)
1371{
1372 struct inet_request_sock *ireq = inet_rsk(req);
1373 struct net *net = sock_net(sk_listener);
1374
1375 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1376 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1377 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1378}
1379
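/* Look up the route used to answer the request (SYN-ACK path). */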
1380static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1381 struct flowi *fl,
1382 const struct request_sock *req)
1383{
1384 return inet_csk_route_req(sk, &fl->u.ip4, req);
1385}
1386
1387struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1388 .family = PF_INET,
1389 .obj_size = sizeof(struct tcp_request_sock),
1390 .rtx_syn_ack = tcp_rtx_synack,
1391 .send_ack = tcp_v4_reqsk_send_ack,
1392 .destructor = tcp_v4_reqsk_destructor,
1393 .send_reset = tcp_v4_send_reset,
1394 .syn_ack_timeout = tcp_syn_ack_timeout,
1395};
1396
1397const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1398 .mss_clamp = TCP_MSS_DEFAULT,
1399#ifdef CONFIG_TCP_MD5SIG
1400 .req_md5_lookup = tcp_v4_md5_lookup,
1401 .calc_md5_hash = tcp_v4_md5_hash_skb,
1402#endif
1403 .init_req = tcp_v4_init_req,
1404#ifdef CONFIG_SYN_COOKIES
1405 .cookie_init_seq = cookie_v4_init_sequence,
1406#endif
1407 .route_req = tcp_v4_route_req,
1408 .init_seq = tcp_v4_init_seq,
1409 .init_ts_off = tcp_v4_init_ts_off,
1410 .send_synack = tcp_v4_send_synack,
1411};
1412
1413int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1414{
 1415 /* Never answer SYNs sent to broadcast or multicast */
1416 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1417 goto drop;
1418
1419 return tcp_conn_request(&tcp_request_sock_ops,
1420 &tcp_request_sock_ipv4_ops, sk, skb);
1421
1422drop:
1423 tcp_listendrop(sk);
1424 return 0;
1425}
1426EXPORT_SYMBOL(tcp_v4_conn_request);
1427
1428
1429/*
1430 * The three way handshake has completed - we got a valid synack -
1431 * now create the new socket.
1432 */
1433struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1434 struct request_sock *req,
1435 struct dst_entry *dst,
1436 struct request_sock *req_unhash,
1437 bool *own_req)
1438{
1439 struct inet_request_sock *ireq;
1440 bool found_dup_sk = false;
1441 struct inet_sock *newinet;
1442 struct tcp_sock *newtp;
1443 struct sock *newsk;
1444#ifdef CONFIG_TCP_MD5SIG
1445 struct tcp_md5sig_key *key;
1446#endif
1447 struct ip_options_rcu *inet_opt;
1448
1449 if (sk_acceptq_is_full(sk))
1450 goto exit_overflow;
1451
1452 newsk = tcp_create_openreq_child(sk, req, skb);
1453 if (!newsk)
1454 goto exit_nonewsk;
1455
1456 newsk->sk_gso_type = SKB_GSO_TCPV4;
1457 inet_sk_rx_dst_set(newsk, skb);
1458
1459 newtp = tcp_sk(newsk);
1460 newinet = inet_sk(newsk);
1461 ireq = inet_rsk(req);
1462 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1463 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1464 newsk->sk_bound_dev_if = ireq->ir_iif;
1465 newinet->inet_saddr = ireq->ir_loc_addr;
1466 inet_opt = rcu_dereference(ireq->ireq_opt);
1467 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1468 newinet->mc_index = inet_iif(skb);
1469 newinet->mc_ttl = ip_hdr(skb)->ttl;
1470 newinet->rcv_tos = ip_hdr(skb)->tos;
1471 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1472 if (inet_opt)
1473 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1474 newinet->inet_id = prandom_u32();
1475
1476 if (!dst) {
1477 dst = inet_csk_route_child_sock(sk, newsk, req);
1478 if (!dst)
1479 goto put_and_exit;
1480 } else {
1481 /* syncookie case : see end of cookie_v4_check() */
1482 }
1483 sk_setup_caps(newsk, dst);
1484
1485 tcp_ca_openreq_child(newsk, dst);
1486
1487 tcp_sync_mss(newsk, dst_mtu(dst));
1488 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1489
1490 tcp_initialize_rcv_mss(newsk);
1491
1492#ifdef CONFIG_TCP_MD5SIG
1493 /* Copy over the MD5 key from the original socket */
1494 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1495 AF_INET);
1496 if (key) {
1497 /*
1498 * We're using one, so create a matching key
1499 * on the newsk structure. If we fail to get
1500 * memory, then we end up not copying the key
1501 * across. Shucks.
1502 */
1503 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1504 AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1505 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1506 }
1507#endif
1508
1509 if (__inet_inherit_port(sk, newsk) < 0)
1510 goto put_and_exit;
1511 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1512 &found_dup_sk);
1513 if (likely(*own_req)) {
1514 tcp_move_syn(newtp, req);
1515 ireq->ireq_opt = NULL;
1516 } else {
1517 newinet->inet_opt = NULL;
1518
1519 if (!req_unhash && found_dup_sk) {
 1520 /* This code path should be executed only in the
 1521 * syncookie case
1522 */
1523 bh_unlock_sock(newsk);
1524 sock_put(newsk);
1525 newsk = NULL;
1526 }
1527 }
1528 return newsk;
1529
1530exit_overflow:
1531 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1532exit_nonewsk:
1533 dst_release(dst);
1534exit:
1535 tcp_listendrop(sk);
1536 return NULL;
1537put_and_exit:
1538 newinet->inet_opt = NULL;
1539 inet_csk_prepare_forced_close(newsk);
1540 tcp_done(newsk);
1541 goto exit;
1542}
1543EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1544
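/* On a listener, try to validate a non-SYN segment as a syncookie ACK
 * and, if it checks out, create the child socket.
 */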
1545static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1546{
1547#ifdef CONFIG_SYN_COOKIES
1548 const struct tcphdr *th = tcp_hdr(skb);
1549
1550 if (!th->syn)
1551 sk = cookie_v4_check(sk, skb);
1552#endif
1553 return sk;
1554}
1555
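/* Compute a syncookie ISN and MSS for a SYN without creating any state;
 * returns an MSS of 0 if syncookies are not available.
 */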
1556u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1557 struct tcphdr *th, u32 *cookie)
1558{
1559 u16 mss = 0;
1560#ifdef CONFIG_SYN_COOKIES
1561 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1562 &tcp_request_sock_ipv4_ops, sk, th);
1563 if (mss) {
1564 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1565 tcp_synq_overflow(sk);
1566 }
1567#endif
1568 return mss;
1569}
1570
 1571 /* The socket must have its spinlock held when we get
1572 * here, unless it is a TCP_LISTEN socket.
1573 *
1574 * We have a potential double-lock case here, so even when
1575 * doing backlog processing we use the BH locking scheme.
1576 * This is because we cannot sleep with the original spinlock
1577 * held.
1578 */
1579int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1580{
1581 struct sock *rsk;
1582
1583 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1584 struct dst_entry *dst;
1585
1586 dst = rcu_dereference_protected(sk->sk_rx_dst,
1587 lockdep_sock_is_held(sk));
1588
1589 sock_rps_save_rxhash(sk, skb);
1590 sk_mark_napi_id(sk, skb);
1591 if (dst) {
1592 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1593 !dst->ops->check(dst, 0)) {
1594 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1595 dst_release(dst);
1596 }
1597 }
1598 tcp_rcv_established(sk, skb);
1599 return 0;
1600 }
1601
1602 if (tcp_checksum_complete(skb))
1603 goto csum_err;
1604
1605 if (sk->sk_state == TCP_LISTEN) {
1606 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1607
1608 if (!nsk)
1609 goto discard;
1610 if (nsk != sk) {
1611 if (tcp_child_process(sk, nsk, skb)) {
1612 rsk = nsk;
1613 goto reset;
1614 }
1615 return 0;
1616 }
1617 } else
1618 sock_rps_save_rxhash(sk, skb);
1619
1620 if (tcp_rcv_state_process(sk, skb)) {
1621 rsk = sk;
1622 goto reset;
1623 }
1624 return 0;
1625
1626reset:
1627 tcp_v4_send_reset(rsk, skb);
1628discard:
1629 kfree_skb(skb);
1630 /* Be careful here. If this function gets more complicated and
1631 * gcc suffers from register pressure on the x86, sk (in %ebx)
1632 * might be destroyed here. This current version compiles correctly,
1633 * but you have been warned.
1634 */
1635 return 0;
1636
1637csum_err:
1638 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1639 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1640 goto discard;
1641}
1642EXPORT_SYMBOL(tcp_v4_do_rcv);
1643
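/* Early demux: look up an established socket before routing so that its
 * cached rx dst can be reused for this packet.
 */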
1644int tcp_v4_early_demux(struct sk_buff *skb)
1645{
1646 const struct iphdr *iph;
1647 const struct tcphdr *th;
1648 struct sock *sk;
1649
1650 if (skb->pkt_type != PACKET_HOST)
1651 return 0;
1652
1653 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1654 return 0;
1655
1656 iph = ip_hdr(skb);
1657 th = tcp_hdr(skb);
1658
1659 if (th->doff < sizeof(struct tcphdr) / 4)
1660 return 0;
1661
1662 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1663 iph->saddr, th->source,
1664 iph->daddr, ntohs(th->dest),
1665 skb->skb_iif, inet_sdif(skb));
1666 if (sk) {
1667 skb->sk = sk;
1668 skb->destructor = sock_edemux;
1669 if (sk_fullsock(sk)) {
1670 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1671
1672 if (dst)
1673 dst = dst_check(dst, 0);
1674 if (dst &&
1675 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1676 skb_dst_set_noref(skb, dst);
1677 }
1678 }
1679 return 0;
1680}
1681
1682bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1683{
1684 u32 tail_gso_size, tail_gso_segs;
1685 struct skb_shared_info *shinfo;
1686 const struct tcphdr *th;
1687 struct tcphdr *thtail;
1688 struct sk_buff *tail;
1689 unsigned int hdrlen;
1690 bool fragstolen;
1691 u32 gso_segs;
1692 u32 gso_size;
1693 u64 limit;
1694 int delta;
1695
1696 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1697 * we can fix skb->truesize to its real value to avoid future drops.
1698 * This is valid because skb is not yet charged to the socket.
 1699 * It has been noticed that pure SACK packets were sometimes dropped
1700 * (if cooked by drivers without copybreak feature).
1701 */
1702 skb_condense(skb);
1703
1704 skb_dst_drop(skb);
1705
1706 if (unlikely(tcp_checksum_complete(skb))) {
1707 bh_unlock_sock(sk);
1708 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1709 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1710 return true;
1711 }
1712
1713 /* Attempt coalescing to last skb in backlog, even if we are
1714 * above the limits.
1715 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1716 */
1717 th = (const struct tcphdr *)skb->data;
1718 hdrlen = th->doff * 4;
1719
1720 tail = sk->sk_backlog.tail;
1721 if (!tail)
1722 goto no_coalesce;
1723 thtail = (struct tcphdr *)tail->data;
1724
1725 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1726 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1727 ((TCP_SKB_CB(tail)->tcp_flags |
1728 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1729 !((TCP_SKB_CB(tail)->tcp_flags &
1730 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1731 ((TCP_SKB_CB(tail)->tcp_flags ^
1732 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1733#ifdef CONFIG_TLS_DEVICE
1734 tail->decrypted != skb->decrypted ||
1735#endif
1736 thtail->doff != th->doff ||
1737 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1738 goto no_coalesce;
1739
1740 __skb_pull(skb, hdrlen);
1741
1742 shinfo = skb_shinfo(skb);
1743 gso_size = shinfo->gso_size ?: skb->len;
1744 gso_segs = shinfo->gso_segs ?: 1;
1745
1746 shinfo = skb_shinfo(tail);
1747 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1748 tail_gso_segs = shinfo->gso_segs ?: 1;
1749
1750 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1751 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1752
1753 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1754 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1755 thtail->window = th->window;
1756 }
1757
1758 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1759 * thtail->fin, so that the fast path in tcp_rcv_established()
1760 * is not entered if we append a packet with a FIN.
1761 * SYN, RST, URG are not present.
1762 * ACK is set on both packets.
1763 * PSH : we do not really care in TCP stack,
1764 * at least for 'GRO' packets.
1765 */
1766 thtail->fin |= th->fin;
1767 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1768
1769 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1770 TCP_SKB_CB(tail)->has_rxtstamp = true;
1771 tail->tstamp = skb->tstamp;
1772 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1773 }
1774
1775 /* Not as strict as GRO. We only need to carry mss max value */
1776 shinfo->gso_size = max(gso_size, tail_gso_size);
1777 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1778
1779 sk->sk_backlog.len += delta;
1780 __NET_INC_STATS(sock_net(sk),
1781 LINUX_MIB_TCPBACKLOGCOALESCE);
1782 kfree_skb_partial(skb, fragstolen);
1783 return false;
1784 }
1785 __skb_push(skb, hdrlen);
1786
1787no_coalesce:
1788 /* sk->sk_backlog.len is reset only at the end of __release_sock().
1789 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
1790 * sk_rcvbuf in normal conditions.
1791 */
1792 limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
1793
1794 limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
1795
1796 /* Only socket owner can try to collapse/prune rx queues
1797 * to reduce memory overhead, so add a little headroom here.
 1798 * Only a few socket backlogs are likely to be non-empty at the same time.
1799 */
1800 limit += 64 * 1024;
1801
1802 limit = min_t(u64, limit, UINT_MAX);
1803
1804 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1805 bh_unlock_sock(sk);
1806 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1807 return true;
1808 }
1809 return false;
1810}
1811EXPORT_SYMBOL(tcp_add_backlog);
1812
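/* Run the attached socket filter, never trimming below the TCP header. */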
1813int tcp_filter(struct sock *sk, struct sk_buff *skb)
1814{
1815 struct tcphdr *th = (struct tcphdr *)skb->data;
1816
1817 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1818}
1819EXPORT_SYMBOL(tcp_filter);
1820
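/* Put the IP control block back in place before the skb is handed to
 * another socket (e.g. after a new lookup or TIME-WAIT processing).
 */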
1821static void tcp_v4_restore_cb(struct sk_buff *skb)
1822{
1823 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1824 sizeof(struct inet_skb_parm));
1825}
1826
1827static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1828 const struct tcphdr *th)
1829{
 1830 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
 1831 * barrier() makes sure the compiler won't play fool^Waliasing games.
1832 */
1833 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1834 sizeof(struct inet_skb_parm));
1835 barrier();
1836
1837 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1838 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1839 skb->len - th->doff * 4);
1840 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1841 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1842 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1843 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1844 TCP_SKB_CB(skb)->sacked = 0;
1845 TCP_SKB_CB(skb)->has_rxtstamp =
1846 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1847}
1848
1849/*
1850 * From tcp_input.c
1851 */
1852
1853int tcp_v4_rcv(struct sk_buff *skb)
1854{
1855 struct net *net = dev_net(skb->dev);
1856 struct sk_buff *skb_to_free;
1857 int sdif = inet_sdif(skb);
1858 const struct iphdr *iph;
1859 const struct tcphdr *th;
1860 bool refcounted;
1861 struct sock *sk;
1862 int ret;
1863
1864 if (skb->pkt_type != PACKET_HOST)
1865 goto discard_it;
1866
1867 /* Count it even if it's bad */
1868 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1869
1870 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1871 goto discard_it;
1872
1873 th = (const struct tcphdr *)skb->data;
1874
1875 if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1876 goto bad_packet;
1877 if (!pskb_may_pull(skb, th->doff * 4))
1878 goto discard_it;
1879
1880 /* An explanation is required here, I think.
1881 * Packet length and doff are validated by header prediction,
 1882 * provided the case of th->doff==0 is eliminated.
1883 * So, we defer the checks. */
1884
1885 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1886 goto csum_error;
1887
1888 th = (const struct tcphdr *)skb->data;
1889 iph = ip_hdr(skb);
1890lookup:
1891 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1892 th->dest, sdif, &refcounted);
1893 if (!sk)
1894 goto no_tcp_socket;
1895
1896process:
1897 if (sk->sk_state == TCP_TIME_WAIT)
1898 goto do_time_wait;
1899
1900 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1901 struct request_sock *req = inet_reqsk(sk);
1902 bool req_stolen = false;
1903 struct sock *nsk;
1904
1905 sk = req->rsk_listener;
1906 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1907 sk_drops_add(sk, skb);
1908 reqsk_put(req);
1909 goto discard_it;
1910 }
1911 if (tcp_checksum_complete(skb)) {
1912 reqsk_put(req);
1913 goto csum_error;
1914 }
1915 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1916 inet_csk_reqsk_queue_drop_and_put(sk, req);
1917 goto lookup;
1918 }
1919 /* We own a reference on the listener, increase it again
1920 * as we might lose it too soon.
1921 */
1922 sock_hold(sk);
1923 refcounted = true;
1924 nsk = NULL;
1925 if (!tcp_filter(sk, skb)) {
1926 th = (const struct tcphdr *)skb->data;
1927 iph = ip_hdr(skb);
1928 tcp_v4_fill_cb(skb, iph, th);
1929 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1930 }
1931 if (!nsk) {
1932 reqsk_put(req);
1933 if (req_stolen) {
1934				/* Another CPU got exclusive access to req
1935				 * and created a full-blown socket.
1936				 * Try to feed this packet to that socket
1937				 * instead of discarding it.
1938 */
1939 tcp_v4_restore_cb(skb);
1940 sock_put(sk);
1941 goto lookup;
1942 }
1943 goto discard_and_relse;
1944 }
1945 if (nsk == sk) {
1946 reqsk_put(req);
1947 tcp_v4_restore_cb(skb);
1948 } else if (tcp_child_process(sk, nsk, skb)) {
1949 tcp_v4_send_reset(nsk, skb);
1950 goto discard_and_relse;
1951 } else {
1952 sock_put(sk);
1953 return 0;
1954 }
1955 }
1956 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1957 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1958 goto discard_and_relse;
1959 }
1960
1961 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1962 goto discard_and_relse;
1963
1964 if (tcp_v4_inbound_md5_hash(sk, skb))
1965 goto discard_and_relse;
1966
1967 nf_reset_ct(skb);
1968
1969 if (tcp_filter(sk, skb))
1970 goto discard_and_relse;
1971 th = (const struct tcphdr *)skb->data;
1972 iph = ip_hdr(skb);
1973 tcp_v4_fill_cb(skb, iph, th);
1974
1975 skb->dev = NULL;
1976
1977 if (sk->sk_state == TCP_LISTEN) {
1978 ret = tcp_v4_do_rcv(sk, skb);
1979 goto put_and_return;
1980 }
1981
1982 sk_incoming_cpu_update(sk);
1983
1984 bh_lock_sock_nested(sk);
1985 tcp_segs_in(tcp_sk(sk), skb);
1986 ret = 0;
1987 if (!sock_owned_by_user(sk)) {
1988 skb_to_free = sk->sk_rx_skb_cache;
1989 sk->sk_rx_skb_cache = NULL;
1990 ret = tcp_v4_do_rcv(sk, skb);
1991 } else {
1992 if (tcp_add_backlog(sk, skb))
1993 goto discard_and_relse;
1994 skb_to_free = NULL;
1995 }
1996 bh_unlock_sock(sk);
1997 if (skb_to_free)
1998 __kfree_skb(skb_to_free);
1999
2000put_and_return:
2001 if (refcounted)
2002 sock_put(sk);
2003
2004 return ret;
2005
2006no_tcp_socket:
2007 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2008 goto discard_it;
2009
2010 tcp_v4_fill_cb(skb, iph, th);
2011
2012 if (tcp_checksum_complete(skb)) {
2013csum_error:
2014 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2015bad_packet:
2016 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2017 } else {
2018 tcp_v4_send_reset(NULL, skb);
2019 }
2020
2021discard_it:
2022 /* Discard frame. */
2023 kfree_skb(skb);
2024 return 0;
2025
2026discard_and_relse:
2027 sk_drops_add(sk, skb);
2028 if (refcounted)
2029 sock_put(sk);
2030 goto discard_it;
2031
2032do_time_wait:
2033 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2034 inet_twsk_put(inet_twsk(sk));
2035 goto discard_it;
2036 }
2037
2038 tcp_v4_fill_cb(skb, iph, th);
2039
2040 if (tcp_checksum_complete(skb)) {
2041 inet_twsk_put(inet_twsk(sk));
2042 goto csum_error;
2043 }
2044 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2045 case TCP_TW_SYN: {
2046 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2047 &tcp_hashinfo, skb,
2048 __tcp_hdrlen(th),
2049 iph->saddr, th->source,
2050 iph->daddr, th->dest,
2051 inet_iif(skb),
2052 sdif);
2053 if (sk2) {
2054 inet_twsk_deschedule_put(inet_twsk(sk));
2055 sk = sk2;
2056 tcp_v4_restore_cb(skb);
2057 refcounted = false;
2058 goto process;
2059 }
2060 }
2061 /* to ACK */
2062 /* fall through */
2063 case TCP_TW_ACK:
2064 tcp_v4_timewait_ack(sk, skb);
2065 break;
2066 case TCP_TW_RST:
2067 tcp_v4_send_reset(sk, skb);
2068 inet_twsk_deschedule_put(inet_twsk(sk));
2069 goto discard_it;
2070 case TCP_TW_SUCCESS:;
2071 }
2072 goto discard_it;
2073}
2074
2075static struct timewait_sock_ops tcp_timewait_sock_ops = {
2076 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2077 .twsk_unique = tcp_twsk_unique,
2078 .twsk_destructor= tcp_twsk_destructor,
2079};
2080
2081void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2082{
2083 struct dst_entry *dst = skb_dst(skb);
2084
2085 if (dst && dst_hold_safe(dst)) {
2086 rcu_assign_pointer(sk->sk_rx_dst, dst);
2087 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2088 }
2089}
2090EXPORT_SYMBOL(inet_sk_rx_dst_set);
2091
2092const struct inet_connection_sock_af_ops ipv4_specific = {
2093 .queue_xmit = ip_queue_xmit,
2094 .send_check = tcp_v4_send_check,
2095 .rebuild_header = inet_sk_rebuild_header,
2096 .sk_rx_dst_set = inet_sk_rx_dst_set,
2097 .conn_request = tcp_v4_conn_request,
2098 .syn_recv_sock = tcp_v4_syn_recv_sock,
2099 .net_header_len = sizeof(struct iphdr),
2100 .setsockopt = ip_setsockopt,
2101 .getsockopt = ip_getsockopt,
2102 .addr2sockaddr = inet_csk_addr2sockaddr,
2103 .sockaddr_len = sizeof(struct sockaddr_in),
2104#ifdef CONFIG_COMPAT
2105 .compat_setsockopt = compat_ip_setsockopt,
2106 .compat_getsockopt = compat_ip_getsockopt,
2107#endif
2108 .mtu_reduced = tcp_v4_mtu_reduced,
2109};
2110EXPORT_SYMBOL(ipv4_specific);
2111
2112#ifdef CONFIG_TCP_MD5SIG
2113static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2114 .md5_lookup = tcp_v4_md5_lookup,
2115 .calc_md5_hash = tcp_v4_md5_hash_skb,
2116 .md5_parse = tcp_v4_parse_md5_keys,
2117};
2118#endif
2119
2120	/* NOTE: a lot of fields are set to zero explicitly by the call to
2121	 * sk_alloc(), so they need not be initialised here.
2122	 */
2123static int tcp_v4_init_sock(struct sock *sk)
2124{
2125 struct inet_connection_sock *icsk = inet_csk(sk);
2126
2127 tcp_init_sock(sk);
2128
2129 icsk->icsk_af_ops = &ipv4_specific;
2130
2131#ifdef CONFIG_TCP_MD5SIG
2132 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2133#endif
2134
2135 return 0;
2136}
2137
2138void tcp_v4_destroy_sock(struct sock *sk)
2139{
2140 struct tcp_sock *tp = tcp_sk(sk);
2141
2142 trace_tcp_destroy_sock(sk);
2143
2144 tcp_clear_xmit_timers(sk);
2145
2146 tcp_cleanup_congestion_control(sk);
2147
2148 tcp_cleanup_ulp(sk);
2149
2150	/* Clean up the write buffer. */
2151 tcp_write_queue_purge(sk);
2152
2153 /* Check if we want to disable active TFO */
2154 tcp_fastopen_active_disable_ofo_check(sk);
2155
2156 /* Cleans up our, hopefully empty, out_of_order_queue. */
2157 skb_rbtree_purge(&tp->out_of_order_queue);
2158
2159#ifdef CONFIG_TCP_MD5SIG
2160 /* Clean up the MD5 key list, if any */
2161 if (tp->md5sig_info) {
2162 tcp_clear_md5_list(sk);
2163 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2164 tp->md5sig_info = NULL;
2165 }
2166#endif
2167
2168 /* Clean up a referenced TCP bind bucket. */
2169 if (inet_csk(sk)->icsk_bind_hash)
2170 inet_put_port(sk);
2171
2172 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2173
2174	/* If the socket was aborted during a connect operation */
2175 tcp_free_fastopen_req(tp);
2176 tcp_fastopen_destroy_cipher(sk);
2177 tcp_saved_syn_free(tp);
2178
2179 sk_sockets_allocated_dec(sk);
2180}
2181EXPORT_SYMBOL(tcp_v4_destroy_sock);
2182
2183#ifdef CONFIG_PROC_FS
2184/* Proc filesystem TCP sock list dumping. */
2185
2186/*
2187	 * Get the next listening socket following cur.  If cur is NULL, get the
2188	 * first socket, starting from the bucket given in st->bucket; when
2189	 * st->bucket is zero, the very first socket in the hash table is returned.
2190 */
2191static void *listening_get_next(struct seq_file *seq, void *cur)
2192{
2193 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2194 struct tcp_iter_state *st = seq->private;
2195 struct net *net = seq_file_net(seq);
2196 struct inet_listen_hashbucket *ilb;
2197 struct hlist_nulls_node *node;
2198 struct sock *sk = cur;
2199
2200 if (!sk) {
2201get_head:
2202 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2203 spin_lock(&ilb->lock);
2204 sk = sk_nulls_head(&ilb->nulls_head);
2205 st->offset = 0;
2206 goto get_sk;
2207 }
2208 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2209 ++st->num;
2210 ++st->offset;
2211
2212 sk = sk_nulls_next(sk);
2213get_sk:
2214 sk_nulls_for_each_from(sk, node) {
2215 if (!net_eq(sock_net(sk), net))
2216 continue;
2217 if (sk->sk_family == afinfo->family)
2218 return sk;
2219 }
2220 spin_unlock(&ilb->lock);
2221 st->offset = 0;
2222 if (++st->bucket < INET_LHTABLE_SIZE)
2223 goto get_head;
2224 return NULL;
2225}
2226
2227static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2228{
2229 struct tcp_iter_state *st = seq->private;
2230 void *rc;
2231
2232 st->bucket = 0;
2233 st->offset = 0;
2234 rc = listening_get_next(seq, NULL);
2235
2236 while (rc && *pos) {
2237 rc = listening_get_next(seq, rc);
2238 --*pos;
2239 }
2240 return rc;
2241}
2242
2243static inline bool empty_bucket(const struct tcp_iter_state *st)
2244{
2245 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2246}
2247
2248/*
2249 * Get first established socket starting from bucket given in st->bucket.
2250 * If st->bucket is zero, the very first socket in the hash is returned.
2251 */
2252static void *established_get_first(struct seq_file *seq)
2253{
2254 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2255 struct tcp_iter_state *st = seq->private;
2256 struct net *net = seq_file_net(seq);
2257 void *rc = NULL;
2258
2259 st->offset = 0;
2260 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2261 struct sock *sk;
2262 struct hlist_nulls_node *node;
2263 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2264
2265 /* Lockless fast path for the common case of empty buckets */
2266 if (empty_bucket(st))
2267 continue;
2268
2269 spin_lock_bh(lock);
2270 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2271 if (sk->sk_family != afinfo->family ||
2272 !net_eq(sock_net(sk), net)) {
2273 continue;
2274 }
2275 rc = sk;
2276 goto out;
2277 }
2278 spin_unlock_bh(lock);
2279 }
2280out:
2281 return rc;
2282}
2283
2284static void *established_get_next(struct seq_file *seq, void *cur)
2285{
2286 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2287 struct sock *sk = cur;
2288 struct hlist_nulls_node *node;
2289 struct tcp_iter_state *st = seq->private;
2290 struct net *net = seq_file_net(seq);
2291
2292 ++st->num;
2293 ++st->offset;
2294
2295 sk = sk_nulls_next(sk);
2296
2297 sk_nulls_for_each_from(sk, node) {
2298 if (sk->sk_family == afinfo->family &&
2299 net_eq(sock_net(sk), net))
2300 return sk;
2301 }
2302
2303 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2304 ++st->bucket;
2305 return established_get_first(seq);
2306}
2307
2308static void *established_get_idx(struct seq_file *seq, loff_t pos)
2309{
2310 struct tcp_iter_state *st = seq->private;
2311 void *rc;
2312
2313 st->bucket = 0;
2314 rc = established_get_first(seq);
2315
2316 while (rc && pos) {
2317 rc = established_get_next(seq, rc);
2318 --pos;
2319 }
2320 return rc;
2321}
2322
2323static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2324{
2325 void *rc;
2326 struct tcp_iter_state *st = seq->private;
2327
2328 st->state = TCP_SEQ_STATE_LISTENING;
2329 rc = listening_get_idx(seq, &pos);
2330
2331 if (!rc) {
2332 st->state = TCP_SEQ_STATE_ESTABLISHED;
2333 rc = established_get_idx(seq, pos);
2334 }
2335
2336 return rc;
2337}
2338
2339static void *tcp_seek_last_pos(struct seq_file *seq)
2340{
2341 struct tcp_iter_state *st = seq->private;
2342 int bucket = st->bucket;
2343 int offset = st->offset;
2344 int orig_num = st->num;
2345 void *rc = NULL;
2346
2347 switch (st->state) {
2348 case TCP_SEQ_STATE_LISTENING:
2349 if (st->bucket >= INET_LHTABLE_SIZE)
2350 break;
2351 st->state = TCP_SEQ_STATE_LISTENING;
2352 rc = listening_get_next(seq, NULL);
2353 while (offset-- && rc && bucket == st->bucket)
2354 rc = listening_get_next(seq, rc);
2355 if (rc)
2356 break;
2357 st->bucket = 0;
2358 st->state = TCP_SEQ_STATE_ESTABLISHED;
2359 /* Fallthrough */
2360 case TCP_SEQ_STATE_ESTABLISHED:
2361 if (st->bucket > tcp_hashinfo.ehash_mask)
2362 break;
2363 rc = established_get_first(seq);
2364 while (offset-- && rc && bucket == st->bucket)
2365 rc = established_get_next(seq, rc);
2366 }
2367
2368 st->num = orig_num;
2369
2370 return rc;
2371}
2372
2373void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2374{
2375 struct tcp_iter_state *st = seq->private;
2376 void *rc;
2377
2378 if (*pos && *pos == st->last_pos) {
2379 rc = tcp_seek_last_pos(seq);
2380 if (rc)
2381 goto out;
2382 }
2383
2384 st->state = TCP_SEQ_STATE_LISTENING;
2385 st->num = 0;
2386 st->bucket = 0;
2387 st->offset = 0;
2388 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2389
2390out:
2391 st->last_pos = *pos;
2392 return rc;
2393}
2394EXPORT_SYMBOL(tcp_seq_start);
2395
2396void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2397{
2398 struct tcp_iter_state *st = seq->private;
2399 void *rc = NULL;
2400
2401 if (v == SEQ_START_TOKEN) {
2402 rc = tcp_get_idx(seq, 0);
2403 goto out;
2404 }
2405
2406 switch (st->state) {
2407 case TCP_SEQ_STATE_LISTENING:
2408 rc = listening_get_next(seq, v);
2409 if (!rc) {
2410 st->state = TCP_SEQ_STATE_ESTABLISHED;
2411 st->bucket = 0;
2412 st->offset = 0;
2413 rc = established_get_first(seq);
2414 }
2415 break;
2416 case TCP_SEQ_STATE_ESTABLISHED:
2417 rc = established_get_next(seq, v);
2418 break;
2419 }
2420out:
2421 ++*pos;
2422 st->last_pos = *pos;
2423 return rc;
2424}
2425EXPORT_SYMBOL(tcp_seq_next);
2426
2427void tcp_seq_stop(struct seq_file *seq, void *v)
2428{
2429 struct tcp_iter_state *st = seq->private;
2430
2431 switch (st->state) {
2432 case TCP_SEQ_STATE_LISTENING:
2433 if (v != SEQ_START_TOKEN)
2434 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2435 break;
2436 case TCP_SEQ_STATE_ESTABLISHED:
2437 if (v)
2438 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2439 break;
2440 }
2441}
2442EXPORT_SYMBOL(tcp_seq_stop);
2443
2444static void get_openreq4(const struct request_sock *req,
2445 struct seq_file *f, int i)
2446{
2447 const struct inet_request_sock *ireq = inet_rsk(req);
2448 long delta = req->rsk_timer.expires - jiffies;
2449
2450 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2451 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2452 i,
2453 ireq->ir_loc_addr,
2454 ireq->ir_num,
2455 ireq->ir_rmt_addr,
2456 ntohs(ireq->ir_rmt_port),
2457 TCP_SYN_RECV,
2458 0, 0, /* could print option size, but that is af dependent. */
2459 1, /* timers active (only the expire timer) */
2460 jiffies_delta_to_clock_t(delta),
2461 req->num_timeout,
2462 from_kuid_munged(seq_user_ns(f),
2463 sock_i_uid(req->rsk_listener)),
2464 0, /* non standard timer */
2465 0, /* open_requests have no inode */
2466 0,
2467 req);
2468}
2469
2470static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2471{
2472 int timer_active;
2473 unsigned long timer_expires;
2474 const struct tcp_sock *tp = tcp_sk(sk);
2475 const struct inet_connection_sock *icsk = inet_csk(sk);
2476 const struct inet_sock *inet = inet_sk(sk);
2477 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2478 __be32 dest = inet->inet_daddr;
2479 __be32 src = inet->inet_rcv_saddr;
2480 __u16 destp = ntohs(inet->inet_dport);
2481 __u16 srcp = ntohs(inet->inet_sport);
2482 int rx_queue;
2483 int state;
2484
2485 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2486 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2487 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2488 timer_active = 1;
2489 timer_expires = icsk->icsk_timeout;
2490 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2491 timer_active = 4;
2492 timer_expires = icsk->icsk_timeout;
2493 } else if (timer_pending(&sk->sk_timer)) {
2494 timer_active = 2;
2495 timer_expires = sk->sk_timer.expires;
2496 } else {
2497 timer_active = 0;
2498 timer_expires = jiffies;
2499 }
2500
2501 state = inet_sk_state_load(sk);
2502 if (state == TCP_LISTEN)
2503 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2504 else
2505 /* Because we don't lock the socket,
2506 * we might find a transient negative value.
2507 */
2508 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2509 READ_ONCE(tp->copied_seq), 0);
2510
2511 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2512 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2513 i, src, srcp, dest, destp, state,
2514 READ_ONCE(tp->write_seq) - tp->snd_una,
2515 rx_queue,
2516 timer_active,
2517 jiffies_delta_to_clock_t(timer_expires - jiffies),
2518 icsk->icsk_retransmits,
2519 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2520 icsk->icsk_probes_out,
2521 sock_i_ino(sk),
2522 refcount_read(&sk->sk_refcnt), sk,
2523 jiffies_to_clock_t(icsk->icsk_rto),
2524 jiffies_to_clock_t(icsk->icsk_ack.ato),
2525 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2526 tp->snd_cwnd,
2527 state == TCP_LISTEN ?
2528 fastopenq->max_qlen :
2529 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2530}
2531
2532static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2533 struct seq_file *f, int i)
2534{
2535 long delta = tw->tw_timer.expires - jiffies;
2536 __be32 dest, src;
2537 __u16 destp, srcp;
2538
2539 dest = tw->tw_daddr;
2540 src = tw->tw_rcv_saddr;
2541 destp = ntohs(tw->tw_dport);
2542 srcp = ntohs(tw->tw_sport);
2543
2544 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2545 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2546 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2547 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2548 refcount_read(&tw->tw_refcnt), tw);
2549}
2550
2551#define TMPSZ 150
2552
2553static int tcp4_seq_show(struct seq_file *seq, void *v)
2554{
2555 struct tcp_iter_state *st;
2556 struct sock *sk = v;
2557
2558 seq_setwidth(seq, TMPSZ - 1);
2559 if (v == SEQ_START_TOKEN) {
2560 seq_puts(seq, " sl local_address rem_address st tx_queue "
2561 "rx_queue tr tm->when retrnsmt uid timeout "
2562 "inode");
2563 goto out;
2564 }
2565 st = seq->private;
2566
2567 if (sk->sk_state == TCP_TIME_WAIT)
2568 get_timewait4_sock(v, seq, st->num);
2569 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2570 get_openreq4(v, seq, st->num);
2571 else
2572 get_tcp4_sock(v, seq, st->num);
2573out:
2574 seq_pad(seq, '\n');
2575 return 0;
2576}
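/* A minimal userspace sketch (illustrative only, not part of the kernel
 * build) showing how the /proc/net/tcp lines emitted by tcp4_seq_show()
 * and get_tcp4_sock() above can be parsed.  The address columns are the
 * raw __be32 values formatted with %08X, while the ports are already
 * converted with ntohs() before printing.
 */
#if 0
#include <stdio.h>
#include <arpa/inet.h>

int main(void)
{
	char line[512];
	unsigned int laddr, lport, raddr, rport, state;
	struct in_addr a;
	FILE *f = fopen("/proc/net/tcp", "r");

	if (!f || !fgets(line, sizeof(line), f))	/* open and skip header line */
		return 1;
	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, " %*d: %x:%x %x:%x %x",
			   &laddr, &lport, &raddr, &rport, &state) != 5)
			continue;
		/* sscanf() recovers the host integer that %08X printed, and
		 * storing it in s_addr reproduces the original __be32, so
		 * inet_ntoa() yields the expected dotted quad.
		 */
		a.s_addr = laddr;
		printf("%s:%u state %02X\n", inet_ntoa(a), lport, state);
	}
	fclose(f);
	return 0;
}
#endif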
2577
2578static const struct seq_operations tcp4_seq_ops = {
2579 .show = tcp4_seq_show,
2580 .start = tcp_seq_start,
2581 .next = tcp_seq_next,
2582 .stop = tcp_seq_stop,
2583};
2584
2585static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2586 .family = AF_INET,
2587};
2588
2589static int __net_init tcp4_proc_init_net(struct net *net)
2590{
2591 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2592 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2593 return -ENOMEM;
2594 return 0;
2595}
2596
2597static void __net_exit tcp4_proc_exit_net(struct net *net)
2598{
2599 remove_proc_entry("tcp", net->proc_net);
2600}
2601
2602static struct pernet_operations tcp4_net_ops = {
2603 .init = tcp4_proc_init_net,
2604 .exit = tcp4_proc_exit_net,
2605};
2606
2607int __init tcp4_proc_init(void)
2608{
2609 return register_pernet_subsys(&tcp4_net_ops);
2610}
2611
2612void tcp4_proc_exit(void)
2613{
2614 unregister_pernet_subsys(&tcp4_net_ops);
2615}
2616#endif /* CONFIG_PROC_FS */
2617
2618struct proto tcp_prot = {
2619 .name = "TCP",
2620 .owner = THIS_MODULE,
2621 .close = tcp_close,
2622 .pre_connect = tcp_v4_pre_connect,
2623 .connect = tcp_v4_connect,
2624 .disconnect = tcp_disconnect,
2625 .accept = inet_csk_accept,
2626 .ioctl = tcp_ioctl,
2627 .init = tcp_v4_init_sock,
2628 .destroy = tcp_v4_destroy_sock,
2629 .shutdown = tcp_shutdown,
2630 .setsockopt = tcp_setsockopt,
2631 .getsockopt = tcp_getsockopt,
2632 .keepalive = tcp_set_keepalive,
2633 .recvmsg = tcp_recvmsg,
2634 .sendmsg = tcp_sendmsg,
2635 .sendpage = tcp_sendpage,
2636 .backlog_rcv = tcp_v4_do_rcv,
2637 .release_cb = tcp_release_cb,
2638 .hash = inet_hash,
2639 .unhash = inet_unhash,
2640 .get_port = inet_csk_get_port,
2641 .enter_memory_pressure = tcp_enter_memory_pressure,
2642 .leave_memory_pressure = tcp_leave_memory_pressure,
2643 .stream_memory_free = tcp_stream_memory_free,
2644 .sockets_allocated = &tcp_sockets_allocated,
2645 .orphan_count = &tcp_orphan_count,
2646 .memory_allocated = &tcp_memory_allocated,
2647 .memory_pressure = &tcp_memory_pressure,
2648 .sysctl_mem = sysctl_tcp_mem,
2649 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2650 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2651 .max_header = MAX_TCP_HEADER,
2652 .obj_size = sizeof(struct tcp_sock),
2653 .slab_flags = SLAB_TYPESAFE_BY_RCU,
2654 .twsk_prot = &tcp_timewait_sock_ops,
2655 .rsk_prot = &tcp_request_sock_ops,
2656 .h.hashinfo = &tcp_hashinfo,
2657 .no_autobind = true,
2658#ifdef CONFIG_COMPAT
2659 .compat_setsockopt = compat_tcp_setsockopt,
2660 .compat_getsockopt = compat_tcp_getsockopt,
2661#endif
2662 .diag_destroy = tcp_abort,
2663};
2664EXPORT_SYMBOL(tcp_prot);
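/* For illustration only (userspace, not part of the kernel build): an
 * ordinary AF_INET stream socket is backed by tcp_prot, so the calls below
 * map onto the handlers in the table above, e.g. connect(2) reaches
 * tcp_v4_connect() via .connect and close(2) reaches tcp_close() via .close.
 * The port and address are arbitrary example values.
 */
#if 0
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in addr;
	int fd = socket(AF_INET, SOCK_STREAM, 0);	/* backed by tcp_prot */

	if (fd < 0)
		return 1;
	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_port = htons(80);			/* example port */
	addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);	/* 127.0.0.1 */
	connect(fd, (struct sockaddr *)&addr, sizeof(addr));
	close(fd);
	return 0;
}
#endif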
2665
2666static void __net_exit tcp_sk_exit(struct net *net)
2667{
2668 int cpu;
2669
2670 if (net->ipv4.tcp_congestion_control)
2671 module_put(net->ipv4.tcp_congestion_control->owner);
2672
2673 for_each_possible_cpu(cpu)
2674 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2675 free_percpu(net->ipv4.tcp_sk);
2676}
2677
2678static int __net_init tcp_sk_init(struct net *net)
2679{
2680 int res, cpu, cnt;
2681
2682 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2683 if (!net->ipv4.tcp_sk)
2684 return -ENOMEM;
2685
2686 for_each_possible_cpu(cpu) {
2687 struct sock *sk;
2688
2689 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2690 IPPROTO_TCP, net);
2691 if (res)
2692 goto fail;
2693 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2694
2695		/* Enforce IP_DF and IPID == 0 for RST and ACK packets
2696		 * sent in SYN-RECV and TIME-WAIT state.
2697 */
2698 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2699
2700 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2701 }
2702
2703 net->ipv4.sysctl_tcp_ecn = 2;
2704 net->ipv4.sysctl_tcp_ecn_fallback = 1;
2705
2706 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2707 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2708 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2709 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2710 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2711
2712 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2713 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2714 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2715
2716 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2717 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2718 net->ipv4.sysctl_tcp_syncookies = 1;
2719 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2720 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2721 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2722 net->ipv4.sysctl_tcp_orphan_retries = 0;
2723 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2724 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2725 net->ipv4.sysctl_tcp_tw_reuse = 2;
2726
2727 cnt = tcp_hashinfo.ehash_mask + 1;
2728 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2729 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2730
2731 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2732 net->ipv4.sysctl_tcp_sack = 1;
2733 net->ipv4.sysctl_tcp_window_scaling = 1;
2734 net->ipv4.sysctl_tcp_timestamps = 1;
2735 net->ipv4.sysctl_tcp_early_retrans = 3;
2736 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2737 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
2738 net->ipv4.sysctl_tcp_retrans_collapse = 1;
2739 net->ipv4.sysctl_tcp_max_reordering = 300;
2740 net->ipv4.sysctl_tcp_dsack = 1;
2741 net->ipv4.sysctl_tcp_app_win = 31;
2742 net->ipv4.sysctl_tcp_adv_win_scale = 1;
2743 net->ipv4.sysctl_tcp_frto = 2;
2744 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2745 /* This limits the percentage of the congestion window which we
2746 * will allow a single TSO frame to consume. Building TSO frames
2747 * which are too large can cause TCP streams to be bursty.
2748 */
2749 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2750 /* Default TSQ limit of 16 TSO segments */
2751 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2752	/* RFC 5961 challenge ACK rate limiting */
2753 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2754 net->ipv4.sysctl_tcp_min_tso_segs = 2;
2755 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2756 net->ipv4.sysctl_tcp_autocorking = 1;
2757 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2758 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2759 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2760 if (net != &init_net) {
2761 memcpy(net->ipv4.sysctl_tcp_rmem,
2762 init_net.ipv4.sysctl_tcp_rmem,
2763 sizeof(init_net.ipv4.sysctl_tcp_rmem));
2764 memcpy(net->ipv4.sysctl_tcp_wmem,
2765 init_net.ipv4.sysctl_tcp_wmem,
2766 sizeof(init_net.ipv4.sysctl_tcp_wmem));
2767 }
2768 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2769 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2770 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2771 spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2772 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2773 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2774
2775 /* Reno is always built in */
2776 if (!net_eq(net, &init_net) &&
2777 try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2778 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2779 else
2780 net->ipv4.tcp_congestion_control = &tcp_reno;
2781
2782 return 0;
2783fail:
2784 tcp_sk_exit(net);
2785
2786 return res;
2787}
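/* Illustrative userspace sketch (not part of the kernel build): the
 * per-netns defaults initialised above are exposed under
 * /proc/sys/net/ipv4/, so e.g. sysctl_tcp_syn_retries can be read back
 * from /proc/sys/net/ipv4/tcp_syn_retries in the corresponding network
 * namespace.
 */
#if 0
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_syn_retries", "r");
	int val;

	if (f && fscanf(f, "%d", &val) == 1)
		printf("tcp_syn_retries = %d\n", val);	/* default: TCP_SYN_RETRIES */
	if (f)
		fclose(f);
	return 0;
}
#endif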
2788
2789static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2790{
2791 struct net *net;
2792
2793 inet_twsk_purge(&tcp_hashinfo, AF_INET);
2794
2795 list_for_each_entry(net, net_exit_list, exit_list)
2796 tcp_fastopen_ctx_destroy(net);
2797}
2798
2799static struct pernet_operations __net_initdata tcp_sk_ops = {
2800 .init = tcp_sk_init,
2801 .exit = tcp_sk_exit,
2802 .exit_batch = tcp_sk_exit_batch,
2803};
2804
2805void __init tcp_v4_init(void)
2806{
2807 if (register_pernet_subsys(&tcp_sk_ops))
2808 panic("Failed to create the TCP control socket.\n");
2809}