1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Implementation of the Transmission Control Protocol(TCP).
8 *
9 * IPv4 specific functions
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 */
18
19/*
20 * Changes:
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
29 * ACK bit.
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
40 * coma.
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
46 */
47
48#define pr_fmt(fmt) "TCP: " fmt
49
50#include <linux/bottom_half.h>
51#include <linux/types.h>
52#include <linux/fcntl.h>
53#include <linux/module.h>
54#include <linux/random.h>
55#include <linux/cache.h>
56#include <linux/jhash.h>
57#include <linux/init.h>
58#include <linux/times.h>
59#include <linux/slab.h>
60
61#include <net/net_namespace.h>
62#include <net/icmp.h>
63#include <net/inet_hashtables.h>
64#include <net/tcp.h>
65#include <net/transp_v6.h>
66#include <net/ipv6.h>
67#include <net/inet_common.h>
68#include <net/timewait_sock.h>
69#include <net/xfrm.h>
70#include <net/secure_seq.h>
71#include <net/busy_poll.h>
72
73#include <linux/inet.h>
74#include <linux/ipv6.h>
75#include <linux/stddef.h>
76#include <linux/proc_fs.h>
77#include <linux/seq_file.h>
78#include <linux/inetdevice.h>
79
80#include <crypto/hash.h>
81#include <linux/scatterlist.h>
82
83#include <trace/events/tcp.h>
84
85#ifdef CONFIG_TCP_MD5SIG
86static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
87 __be32 daddr, __be32 saddr, const struct tcphdr *th);
88#endif
89
90struct inet_hashinfo tcp_hashinfo;
91EXPORT_SYMBOL(tcp_hashinfo);
92
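/* Initial sequence number for a passively opened connection, keyed on the
 * address/port 4-tuple of the incoming SYN.
 */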
93static u32 tcp_v4_init_seq(const struct sk_buff *skb)
94{
95 return secure_tcp_seq(ip_hdr(skb)->daddr,
96 ip_hdr(skb)->saddr,
97 tcp_hdr(skb)->dest,
98 tcp_hdr(skb)->source);
99}
100
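/* Keyed per-address-pair timestamp offset, so that raw TCP timestamps
 * are not comparable across different peers.
 */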
101static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
102{
103 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
104}
105
106int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
107{
108 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
109 const struct inet_timewait_sock *tw = inet_twsk(sktw);
110 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111 struct tcp_sock *tp = tcp_sk(sk);
112
113 if (tw->tw_substate == TCP_FIN_WAIT2)
114 reuse = 0;
115
116 if (reuse == 2) {
117 /* Still does not detect *everything* that goes through
118 * lo, since we require a loopback src or dst address
119 * or direct binding to 'lo' interface.
120 */
121 bool loopback = false;
122 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
123 loopback = true;
124#if IS_ENABLED(CONFIG_IPV6)
125 if (tw->tw_family == AF_INET6) {
126 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127 (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
128 (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
129 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
130 (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
131 (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
132 loopback = true;
133 } else
134#endif
135 {
136 if (ipv4_is_loopback(tw->tw_daddr) ||
137 ipv4_is_loopback(tw->tw_rcv_saddr))
138 loopback = true;
139 }
140 if (!loopback)
141 reuse = 0;
142 }
143
144 /* With PAWS, it is safe from the viewpoint
145 of data integrity. Even without PAWS it is safe provided sequence
146 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
147
148 Actually, the idea is close to VJ's one, only timestamp cache is
149 held not per host, but per port pair and TW bucket is used as state
150 holder.
151
 152 If the TW bucket has already been destroyed we fall back to VJ's scheme
153 and use initial timestamp retrieved from peer table.
154 */
155 if (tcptw->tw_ts_recent_stamp &&
156 (!twp || (reuse && time_after32(ktime_get_seconds(),
157 tcptw->tw_ts_recent_stamp)))) {
158 /* inet_twsk_hashdance() sets sk_refcnt after putting twsk
159 * and releasing the bucket lock.
160 */
161 if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
162 return 0;
163
164 /* In case of repair and re-using TIME-WAIT sockets we still
165 * want to be sure that it is safe as above but honor the
166 * sequence numbers and time stamps set as part of the repair
167 * process.
168 *
169 * Without this check re-using a TIME-WAIT socket with TCP
170 * repair would accumulate a -1 on the repair assigned
171 * sequence number. The first time it is reused the sequence
172 * is -1, the second time -2, etc. This fixes that issue
173 * without appearing to create any others.
174 */
175 if (likely(!tp->repair)) {
176 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
177
178 if (!seq)
179 seq = 1;
180 WRITE_ONCE(tp->write_seq, seq);
181 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
182 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
183 }
184
185 return 1;
186 }
187
188 return 0;
189}
190EXPORT_SYMBOL_GPL(tcp_twsk_unique);
191
192static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
193 int addr_len)
194{
195 /* This check is replicated from tcp_v4_connect() and intended to
196 * prevent BPF program called below from accessing bytes that are out
197 * of the bound specified by user in addr_len.
198 */
199 if (addr_len < sizeof(struct sockaddr_in))
200 return -EINVAL;
201
202 sock_owned_by_me(sk);
203
204 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
205}
206
207/* This will initiate an outgoing connection. */
208int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
209{
210 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
211 struct inet_sock *inet = inet_sk(sk);
212 struct tcp_sock *tp = tcp_sk(sk);
213 __be16 orig_sport, orig_dport;
214 __be32 daddr, nexthop;
215 struct flowi4 *fl4;
216 struct rtable *rt;
217 int err;
218 struct ip_options_rcu *inet_opt;
219 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
220
221 if (addr_len < sizeof(struct sockaddr_in))
222 return -EINVAL;
223
224 if (usin->sin_family != AF_INET)
225 return -EAFNOSUPPORT;
226
227 nexthop = daddr = usin->sin_addr.s_addr;
228 inet_opt = rcu_dereference_protected(inet->inet_opt,
229 lockdep_sock_is_held(sk));
230 if (inet_opt && inet_opt->opt.srr) {
231 if (!daddr)
232 return -EINVAL;
233 nexthop = inet_opt->opt.faddr;
234 }
235
236 orig_sport = inet->inet_sport;
237 orig_dport = usin->sin_port;
238 fl4 = &inet->cork.fl.u.ip4;
239 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
240 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
241 IPPROTO_TCP,
242 orig_sport, orig_dport, sk);
243 if (IS_ERR(rt)) {
244 err = PTR_ERR(rt);
245 if (err == -ENETUNREACH)
246 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
247 return err;
248 }
249
250 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
251 ip_rt_put(rt);
252 return -ENETUNREACH;
253 }
254
255 if (!inet_opt || !inet_opt->opt.srr)
256 daddr = fl4->daddr;
257
258 if (!inet->inet_saddr)
259 inet->inet_saddr = fl4->saddr;
260 sk_rcv_saddr_set(sk, inet->inet_saddr);
261
262 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
263 /* Reset inherited state */
264 tp->rx_opt.ts_recent = 0;
265 tp->rx_opt.ts_recent_stamp = 0;
266 if (likely(!tp->repair))
267 WRITE_ONCE(tp->write_seq, 0);
268 }
269
270 inet->inet_dport = usin->sin_port;
271 sk_daddr_set(sk, daddr);
272
273 inet_csk(sk)->icsk_ext_hdr_len = 0;
274 if (inet_opt)
275 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
276
277 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
278
279 /* Socket identity is still unknown (sport may be zero).
 280 * However, we set the state to SYN-SENT and, without releasing the socket
 281 * lock, select a source port, enter ourselves into the hash tables and
282 * complete initialization after this.
283 */
284 tcp_set_state(sk, TCP_SYN_SENT);
285 err = inet_hash_connect(tcp_death_row, sk);
286 if (err)
287 goto failure;
288
289 sk_set_txhash(sk);
290
291 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
292 inet->inet_sport, inet->inet_dport, sk);
293 if (IS_ERR(rt)) {
294 err = PTR_ERR(rt);
295 rt = NULL;
296 goto failure;
297 }
298 /* OK, now commit destination to socket. */
299 sk->sk_gso_type = SKB_GSO_TCPV4;
300 sk_setup_caps(sk, &rt->dst);
301 rt = NULL;
302
303 if (likely(!tp->repair)) {
304 if (!tp->write_seq)
305 WRITE_ONCE(tp->write_seq,
306 secure_tcp_seq(inet->inet_saddr,
307 inet->inet_daddr,
308 inet->inet_sport,
309 usin->sin_port));
310 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
311 inet->inet_saddr,
312 inet->inet_daddr);
313 }
314
315 inet->inet_id = prandom_u32();
316
317 if (tcp_fastopen_defer_connect(sk, &err))
318 return err;
319 if (err)
320 goto failure;
321
322 err = tcp_connect(sk);
323
324 if (err)
325 goto failure;
326
327 return 0;
328
329failure:
330 /*
331 * This unhashes the socket and releases the local port,
332 * if necessary.
333 */
334 tcp_set_state(sk, TCP_CLOSE);
335 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
336 inet_reset_saddr(sk);
337 ip_rt_put(rt);
338 sk->sk_route_caps = 0;
339 inet->inet_dport = 0;
340 return err;
341}
342EXPORT_SYMBOL(tcp_v4_connect);
343
344/*
345 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
346 * It can be called through tcp_release_cb() if socket was owned by user
347 * at the time tcp_v4_err() was called to handle ICMP message.
348 */
349void tcp_v4_mtu_reduced(struct sock *sk)
350{
351 struct inet_sock *inet = inet_sk(sk);
352 struct dst_entry *dst;
353 u32 mtu;
354
355 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
356 return;
357 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
358 dst = inet_csk_update_pmtu(sk, mtu);
359 if (!dst)
360 return;
361
 362 /* Something is about to go wrong... Remember the soft error
 363 * for the case where this connection will not be able to recover.
364 */
365 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
366 sk->sk_err_soft = EMSGSIZE;
367
368 mtu = dst_mtu(dst);
369
370 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
371 ip_sk_accept_pmtu(sk) &&
372 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
373 tcp_sync_mss(sk, mtu);
374
375 /* Resend the TCP packet because it's
376 * clear that the old packet has been
377 * dropped. This is the new "fast" path mtu
378 * discovery.
379 */
380 tcp_simple_retransmit(sk);
381 } /* else let the usual retransmit timer handle it */
382}
383EXPORT_SYMBOL(tcp_v4_mtu_reduced);
384
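/* Handle an ICMP redirect: revalidate the cached dst and invoke its
 * redirect handler.
 */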
385static void do_redirect(struct sk_buff *skb, struct sock *sk)
386{
387 struct dst_entry *dst = __sk_dst_check(sk, 0);
388
389 if (dst)
390 dst->ops->redirect(dst, sk, skb);
391}
392
393
394/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
395void tcp_req_err(struct sock *sk, u32 seq, bool abort)
396{
397 struct request_sock *req = inet_reqsk(sk);
398 struct net *net = sock_net(sk);
399
400 /* ICMPs are not backlogged, hence we cannot get
401 * an established socket here.
402 */
403 if (seq != tcp_rsk(req)->snt_isn) {
404 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
405 } else if (abort) {
406 /*
407 * Still in SYN_RECV, just remove it silently.
408 * There is no good way to pass the error to the newly
409 * created socket, and POSIX does not want network
410 * errors returned from accept().
411 */
412 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
413 tcp_listendrop(req->rsk_listener);
414 }
415 reqsk_put(req);
416}
417EXPORT_SYMBOL(tcp_req_err);
418
419/*
420 * This routine is called by the ICMP module when it gets some
421 * sort of error condition. If err < 0 then the socket should
422 * be closed and the error returned to the user. If err > 0
423 * it's just the icmp type << 8 | icmp code. After adjustment
424 * header points to the first 8 bytes of the tcp header. We need
425 * to find the appropriate port.
426 *
427 * The locking strategy used here is very "optimistic". When
428 * someone else accesses the socket the ICMP is just dropped
429 * and for some paths there is no check at all.
430 * A more general error queue to queue errors for later handling
431 * is probably better.
432 *
433 */
434
435int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
436{
437 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
438 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
439 struct inet_connection_sock *icsk;
440 struct tcp_sock *tp;
441 struct inet_sock *inet;
442 const int type = icmp_hdr(icmp_skb)->type;
443 const int code = icmp_hdr(icmp_skb)->code;
444 struct sock *sk;
445 struct sk_buff *skb;
446 struct request_sock *fastopen;
447 u32 seq, snd_una;
448 s32 remaining;
449 u32 delta_us;
450 int err;
451 struct net *net = dev_net(icmp_skb->dev);
452
453 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
454 th->dest, iph->saddr, ntohs(th->source),
455 inet_iif(icmp_skb), 0);
456 if (!sk) {
457 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
458 return -ENOENT;
459 }
460 if (sk->sk_state == TCP_TIME_WAIT) {
461 inet_twsk_put(inet_twsk(sk));
462 return 0;
463 }
464 seq = ntohl(th->seq);
465 if (sk->sk_state == TCP_NEW_SYN_RECV) {
466 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
467 type == ICMP_TIME_EXCEEDED ||
468 (type == ICMP_DEST_UNREACH &&
469 (code == ICMP_NET_UNREACH ||
470 code == ICMP_HOST_UNREACH)));
471 return 0;
472 }
473
474 bh_lock_sock(sk);
475 /* If too many ICMPs get dropped on busy
476 * servers this needs to be solved differently.
477 * We do take care of PMTU discovery (RFC1191) special case :
478 * we can receive locally generated ICMP messages while socket is held.
479 */
480 if (sock_owned_by_user(sk)) {
481 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
482 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
483 }
484 if (sk->sk_state == TCP_CLOSE)
485 goto out;
486
487 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
488 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
489 goto out;
490 }
491
492 icsk = inet_csk(sk);
493 tp = tcp_sk(sk);
494 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
495 fastopen = rcu_dereference(tp->fastopen_rsk);
496 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
497 if (sk->sk_state != TCP_LISTEN &&
498 !between(seq, snd_una, tp->snd_nxt)) {
499 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
500 goto out;
501 }
502
503 switch (type) {
504 case ICMP_REDIRECT:
505 if (!sock_owned_by_user(sk))
506 do_redirect(icmp_skb, sk);
507 goto out;
508 case ICMP_SOURCE_QUENCH:
509 /* Just silently ignore these. */
510 goto out;
511 case ICMP_PARAMETERPROB:
512 err = EPROTO;
513 break;
514 case ICMP_DEST_UNREACH:
515 if (code > NR_ICMP_UNREACH)
516 goto out;
517
518 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
519 /* We are not interested in TCP_LISTEN and open_requests
 520 * (SYN-ACKs sent out by Linux are always <576 bytes so
521 * they should go through unfragmented).
522 */
523 if (sk->sk_state == TCP_LISTEN)
524 goto out;
525
526 WRITE_ONCE(tp->mtu_info, info);
527 if (!sock_owned_by_user(sk)) {
528 tcp_v4_mtu_reduced(sk);
529 } else {
530 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
531 sock_hold(sk);
532 }
533 goto out;
534 }
535
536 err = icmp_err_convert[code].errno;
537 /* check if icmp_skb allows revert of backoff
538 * (see draft-zimmermann-tcp-lcd) */
539 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
540 break;
541 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
542 !icsk->icsk_backoff || fastopen)
543 break;
544
545 if (sock_owned_by_user(sk))
546 break;
547
548 skb = tcp_rtx_queue_head(sk);
549 if (WARN_ON_ONCE(!skb))
550 break;
551
552 icsk->icsk_backoff--;
553 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
554 TCP_TIMEOUT_INIT;
555 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
556
557
558 tcp_mstamp_refresh(tp);
559 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
560 remaining = icsk->icsk_rto -
561 usecs_to_jiffies(delta_us);
562
563 if (remaining > 0) {
564 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
565 remaining, TCP_RTO_MAX);
566 } else {
567 /* RTO revert clocked out retransmission.
568 * Will retransmit now */
569 tcp_retransmit_timer(sk);
570 }
571
572 break;
573 case ICMP_TIME_EXCEEDED:
574 err = EHOSTUNREACH;
575 break;
576 default:
577 goto out;
578 }
579
580 switch (sk->sk_state) {
581 case TCP_SYN_SENT:
582 case TCP_SYN_RECV:
 583 /* Only in fast or simultaneous open. If a fast open socket is
 584 * already accepted, it is treated as a connected one below.
585 */
586 if (fastopen && !fastopen->sk)
587 break;
588
589 if (!sock_owned_by_user(sk)) {
590 sk->sk_err = err;
591
592 sk->sk_error_report(sk);
593
594 tcp_done(sk);
595 } else {
596 sk->sk_err_soft = err;
597 }
598 goto out;
599 }
600
601 /* If we've already connected we will keep trying
602 * until we time out, or the user gives up.
603 *
 604 * rfc1122 4.2.3.9 allows us to consider as hard errors
605 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
606 * but it is obsoleted by pmtu discovery).
607 *
608 * Note, that in modern internet, where routing is unreliable
609 * and in each dark corner broken firewalls sit, sending random
 610 * errors ordered by their masters, even these two messages finally lose
611 * their original sense (even Linux sends invalid PORT_UNREACHs)
612 *
613 * Now we are in compliance with RFCs.
614 * --ANK (980905)
615 */
616
617 inet = inet_sk(sk);
618 if (!sock_owned_by_user(sk) && inet->recverr) {
619 sk->sk_err = err;
620 sk->sk_error_report(sk);
621 } else { /* Only an error on timeout */
622 sk->sk_err_soft = err;
623 }
624
625out:
626 bh_unlock_sock(sk);
627 sock_put(sk);
628 return 0;
629}
630
631void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
632{
633 struct tcphdr *th = tcp_hdr(skb);
634
635 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
636 skb->csum_start = skb_transport_header(skb) - skb->head;
637 skb->csum_offset = offsetof(struct tcphdr, check);
638}
639
640/* This routine computes an IPv4 TCP checksum. */
641void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
642{
643 const struct inet_sock *inet = inet_sk(sk);
644
645 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
646}
647EXPORT_SYMBOL(tcp_v4_send_check);
648
649/*
650 * This routine will send an RST to the other tcp.
651 *
 652 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 653 * for the reset?
 654 * Answer: if a packet caused the RST, it is not for a socket
 655 * existing in our system; if it is matched to a socket,
 656 * it is just a duplicate segment or a bug in the other side's TCP.
 657 * So we build the reply based only on parameters
 658 * that arrived with the segment.
659 * Exception: precedence violation. We do not implement it in any case.
660 */
661
662static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
663{
664 const struct tcphdr *th = tcp_hdr(skb);
665 struct {
666 struct tcphdr th;
667#ifdef CONFIG_TCP_MD5SIG
668 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
669#endif
670 } rep;
671 struct ip_reply_arg arg;
672#ifdef CONFIG_TCP_MD5SIG
673 struct tcp_md5sig_key *key = NULL;
674 const __u8 *hash_location = NULL;
675 unsigned char newhash[16];
676 int genhash;
677 struct sock *sk1 = NULL;
678#endif
679 u64 transmit_time = 0;
680 struct sock *ctl_sk;
681 struct net *net;
682
683 /* Never send a reset in response to a reset. */
684 if (th->rst)
685 return;
686
687 /* If sk not NULL, it means we did a successful lookup and incoming
688 * route had to be correct. prequeue might have dropped our dst.
689 */
690 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
691 return;
692
693 /* Swap the send and the receive. */
694 memset(&rep, 0, sizeof(rep));
695 rep.th.dest = th->source;
696 rep.th.source = th->dest;
697 rep.th.doff = sizeof(struct tcphdr) / 4;
698 rep.th.rst = 1;
699
700 if (th->ack) {
701 rep.th.seq = th->ack_seq;
702 } else {
703 rep.th.ack = 1;
704 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
705 skb->len - (th->doff << 2));
706 }
707
708 memset(&arg, 0, sizeof(arg));
709 arg.iov[0].iov_base = (unsigned char *)&rep;
710 arg.iov[0].iov_len = sizeof(rep.th);
711
712 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
713#ifdef CONFIG_TCP_MD5SIG
714 rcu_read_lock();
715 hash_location = tcp_parse_md5sig_option(th);
716 if (sk && sk_fullsock(sk)) {
717 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
718 &ip_hdr(skb)->saddr, AF_INET);
719 } else if (hash_location) {
720 /*
 721 * The active side is lost. Try to find the listening socket through
 722 * the source port, and then find the md5 key through that socket.
 723 * We do not lose security here:
 724 * the incoming packet is checked with the md5 hash of the key we find;
 725 * no RST is generated if the md5 hash doesn't match.
726 */
727 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
728 ip_hdr(skb)->saddr,
729 th->source, ip_hdr(skb)->daddr,
730 ntohs(th->source), inet_iif(skb),
731 tcp_v4_sdif(skb));
732 /* don't send rst if it can't find key */
733 if (!sk1)
734 goto out;
735
736 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
737 &ip_hdr(skb)->saddr, AF_INET);
738 if (!key)
739 goto out;
740
741
742 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
743 if (genhash || memcmp(hash_location, newhash, 16) != 0)
744 goto out;
745
746 }
747
748 if (key) {
749 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
750 (TCPOPT_NOP << 16) |
751 (TCPOPT_MD5SIG << 8) |
752 TCPOLEN_MD5SIG);
753 /* Update length and the length the header thinks exists */
754 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
755 rep.th.doff = arg.iov[0].iov_len / 4;
756
757 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
758 key, ip_hdr(skb)->saddr,
759 ip_hdr(skb)->daddr, &rep.th);
760 }
761#endif
762 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
763 ip_hdr(skb)->saddr, /* XXX */
764 arg.iov[0].iov_len, IPPROTO_TCP, 0);
765 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
766 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
767
 768 /* When the socket is gone, all binding information is lost and
 769 * routing might fail. No choice here: if we choose to force the
 770 * input interface, we will misroute in the case of an asymmetric route.
771 */
772 if (sk) {
773 arg.bound_dev_if = sk->sk_bound_dev_if;
774 if (sk_fullsock(sk))
775 trace_tcp_send_reset(sk, skb);
776 }
777
778 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
779 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
780
781 arg.tos = ip_hdr(skb)->tos;
782 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
783 local_bh_disable();
784 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
785 if (sk) {
786 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
787 inet_twsk(sk)->tw_mark : sk->sk_mark;
788 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
789 inet_twsk(sk)->tw_priority : sk->sk_priority;
790 transmit_time = tcp_transmit_time(sk);
791 }
792 ip_send_unicast_reply(ctl_sk,
793 skb, &TCP_SKB_CB(skb)->header.h4.opt,
794 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
795 &arg, arg.iov[0].iov_len,
796 transmit_time);
797
798 ctl_sk->sk_mark = 0;
799 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
800 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
801 local_bh_enable();
802
803#ifdef CONFIG_TCP_MD5SIG
804out:
805 rcu_read_unlock();
806#endif
807}
808
 809 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 810 outside socket context, is ugly, certainly. What can I do?
811 */
812
813static void tcp_v4_send_ack(const struct sock *sk,
814 struct sk_buff *skb, u32 seq, u32 ack,
815 u32 win, u32 tsval, u32 tsecr, int oif,
816 struct tcp_md5sig_key *key,
817 int reply_flags, u8 tos)
818{
819 const struct tcphdr *th = tcp_hdr(skb);
820 struct {
821 struct tcphdr th;
822 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
823#ifdef CONFIG_TCP_MD5SIG
824 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
825#endif
826 ];
827 } rep;
828 struct net *net = sock_net(sk);
829 struct ip_reply_arg arg;
830 struct sock *ctl_sk;
831 u64 transmit_time;
832
833 memset(&rep.th, 0, sizeof(struct tcphdr));
834 memset(&arg, 0, sizeof(arg));
835
836 arg.iov[0].iov_base = (unsigned char *)&rep;
837 arg.iov[0].iov_len = sizeof(rep.th);
838 if (tsecr) {
839 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
840 (TCPOPT_TIMESTAMP << 8) |
841 TCPOLEN_TIMESTAMP);
842 rep.opt[1] = htonl(tsval);
843 rep.opt[2] = htonl(tsecr);
844 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
845 }
846
847 /* Swap the send and the receive. */
848 rep.th.dest = th->source;
849 rep.th.source = th->dest;
850 rep.th.doff = arg.iov[0].iov_len / 4;
851 rep.th.seq = htonl(seq);
852 rep.th.ack_seq = htonl(ack);
853 rep.th.ack = 1;
854 rep.th.window = htons(win);
855
856#ifdef CONFIG_TCP_MD5SIG
857 if (key) {
858 int offset = (tsecr) ? 3 : 0;
859
860 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
861 (TCPOPT_NOP << 16) |
862 (TCPOPT_MD5SIG << 8) |
863 TCPOLEN_MD5SIG);
864 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
865 rep.th.doff = arg.iov[0].iov_len/4;
866
867 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
868 key, ip_hdr(skb)->saddr,
869 ip_hdr(skb)->daddr, &rep.th);
870 }
871#endif
872 arg.flags = reply_flags;
873 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
874 ip_hdr(skb)->saddr, /* XXX */
875 arg.iov[0].iov_len, IPPROTO_TCP, 0);
876 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
877 if (oif)
878 arg.bound_dev_if = oif;
879 arg.tos = tos;
880 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
881 local_bh_disable();
882 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
883 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
884 inet_twsk(sk)->tw_mark : sk->sk_mark;
885 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
886 inet_twsk(sk)->tw_priority : sk->sk_priority;
887 transmit_time = tcp_transmit_time(sk);
888 ip_send_unicast_reply(ctl_sk,
889 skb, &TCP_SKB_CB(skb)->header.h4.opt,
890 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
891 &arg, arg.iov[0].iov_len,
892 transmit_time);
893
894 ctl_sk->sk_mark = 0;
895 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
896 local_bh_enable();
897}
898
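/* Send the ACK that answers a segment addressed to a TIME-WAIT socket. */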
899static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
900{
901 struct inet_timewait_sock *tw = inet_twsk(sk);
902 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
903
904 tcp_v4_send_ack(sk, skb,
905 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
906 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
907 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
908 tcptw->tw_ts_recent,
909 tw->tw_bound_dev_if,
910 tcp_twsk_md5_key(tcptw),
911 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
912 tw->tw_tos
913 );
914
915 inet_twsk_put(tw);
916}
917
918static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
919 struct request_sock *req)
920{
921 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
922 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
923 */
924 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
925 tcp_sk(sk)->snd_nxt;
926
927 /* RFC 7323 2.3
928 * The window field (SEG.WND) of every outgoing segment, with the
929 * exception of <SYN> segments, MUST be right-shifted by
930 * Rcv.Wind.Shift bits:
931 */
932 tcp_v4_send_ack(sk, skb, seq,
933 tcp_rsk(req)->rcv_nxt,
934 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
935 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
936 req->ts_recent,
937 0,
938 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
939 AF_INET),
940 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
941 ip_hdr(skb)->tos);
942}
943
944/*
945 * Send a SYN-ACK after having received a SYN.
946 * This still operates on a request_sock only, not on a big
947 * socket.
948 */
949static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
950 struct flowi *fl,
951 struct request_sock *req,
952 struct tcp_fastopen_cookie *foc,
953 enum tcp_synack_type synack_type)
954{
955 const struct inet_request_sock *ireq = inet_rsk(req);
956 struct flowi4 fl4;
957 int err = -1;
958 struct sk_buff *skb;
959
960 /* First, grab a route. */
961 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
962 return -1;
963
964 skb = tcp_make_synack(sk, dst, req, foc, synack_type);
965
966 if (skb) {
967 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
968
969 rcu_read_lock();
970 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
971 ireq->ir_rmt_addr,
972 rcu_dereference(ireq->ireq_opt));
973 rcu_read_unlock();
974 err = net_xmit_eval(err);
975 }
976
977 return err;
978}
979
980/*
981 * IPv4 request_sock destructor.
982 */
983static void tcp_v4_reqsk_destructor(struct request_sock *req)
984{
985 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
986}
987
988#ifdef CONFIG_TCP_MD5SIG
989/*
990 * RFC2385 MD5 checksumming requires a mapping of
991 * IP address->MD5 Key.
992 * We need to maintain these in the sk structure.
993 */
994
995DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
996EXPORT_SYMBOL(tcp_md5_needed);
997
998/* Find the Key structure for an address. */
999struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
1000 const union tcp_md5_addr *addr,
1001 int family)
1002{
1003 const struct tcp_sock *tp = tcp_sk(sk);
1004 struct tcp_md5sig_key *key;
1005 const struct tcp_md5sig_info *md5sig;
1006 __be32 mask;
1007 struct tcp_md5sig_key *best_match = NULL;
1008 bool match;
1009
1010 /* caller either holds rcu_read_lock() or socket lock */
1011 md5sig = rcu_dereference_check(tp->md5sig_info,
1012 lockdep_sock_is_held(sk));
1013 if (!md5sig)
1014 return NULL;
1015
1016 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1017 if (key->family != family)
1018 continue;
1019
1020 if (family == AF_INET) {
1021 mask = inet_make_mask(key->prefixlen);
1022 match = (key->addr.a4.s_addr & mask) ==
1023 (addr->a4.s_addr & mask);
1024#if IS_ENABLED(CONFIG_IPV6)
1025 } else if (family == AF_INET6) {
1026 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1027 key->prefixlen);
1028#endif
1029 } else {
1030 match = false;
1031 }
1032
1033 if (match && (!best_match ||
1034 key->prefixlen > best_match->prefixlen))
1035 best_match = key;
1036 }
1037 return best_match;
1038}
1039EXPORT_SYMBOL(__tcp_md5_do_lookup);
1040
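/* Exact-match lookup: address, family and prefix length must all match. */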
1041static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1042 const union tcp_md5_addr *addr,
1043 int family, u8 prefixlen)
1044{
1045 const struct tcp_sock *tp = tcp_sk(sk);
1046 struct tcp_md5sig_key *key;
1047 unsigned int size = sizeof(struct in_addr);
1048 const struct tcp_md5sig_info *md5sig;
1049
1050 /* caller either holds rcu_read_lock() or socket lock */
1051 md5sig = rcu_dereference_check(tp->md5sig_info,
1052 lockdep_sock_is_held(sk));
1053 if (!md5sig)
1054 return NULL;
1055#if IS_ENABLED(CONFIG_IPV6)
1056 if (family == AF_INET6)
1057 size = sizeof(struct in6_addr);
1058#endif
1059 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1060 if (key->family != family)
1061 continue;
1062 if (!memcmp(&key->addr, addr, size) &&
1063 key->prefixlen == prefixlen)
1064 return key;
1065 }
1066 return NULL;
1067}
1068
1069struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1070 const struct sock *addr_sk)
1071{
1072 const union tcp_md5_addr *addr;
1073
1074 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1075 return tcp_md5_do_lookup(sk, addr, AF_INET);
1076}
1077EXPORT_SYMBOL(tcp_v4_md5_lookup);
1078
1079/* This can be called on a newly created socket, from other files */
1080int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1081 int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1082 gfp_t gfp)
1083{
1084 /* Add Key to the list */
1085 struct tcp_md5sig_key *key;
1086 struct tcp_sock *tp = tcp_sk(sk);
1087 struct tcp_md5sig_info *md5sig;
1088
1089 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1090 if (key) {
1091 /* Pre-existing entry - just update that one.
1092 * Note that the key might be used concurrently.
1093 */
1094 memcpy(key->key, newkey, newkeylen);
1095
1096 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1097 * Also note that a reader could catch new key->keylen value
1098 * but old key->key[], this is the reason we use __GFP_ZERO
1099 * at sock_kmalloc() time below these lines.
1100 */
1101 WRITE_ONCE(key->keylen, newkeylen);
1102
1103 return 0;
1104 }
1105
1106 md5sig = rcu_dereference_protected(tp->md5sig_info,
1107 lockdep_sock_is_held(sk));
1108 if (!md5sig) {
1109 md5sig = kmalloc(sizeof(*md5sig), gfp);
1110 if (!md5sig)
1111 return -ENOMEM;
1112
1113 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1114 INIT_HLIST_HEAD(&md5sig->head);
1115 rcu_assign_pointer(tp->md5sig_info, md5sig);
1116 }
1117
1118 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1119 if (!key)
1120 return -ENOMEM;
1121 if (!tcp_alloc_md5sig_pool()) {
1122 sock_kfree_s(sk, key, sizeof(*key));
1123 return -ENOMEM;
1124 }
1125
1126 memcpy(key->key, newkey, newkeylen);
1127 key->keylen = newkeylen;
1128 key->family = family;
1129 key->prefixlen = prefixlen;
1130 memcpy(&key->addr, addr,
1131 (family == AF_INET6) ? sizeof(struct in6_addr) :
1132 sizeof(struct in_addr));
1133 hlist_add_head_rcu(&key->node, &md5sig->head);
1134 return 0;
1135}
1136EXPORT_SYMBOL(tcp_md5_do_add);
1137
1138int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1139 u8 prefixlen)
1140{
1141 struct tcp_md5sig_key *key;
1142
1143 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1144 if (!key)
1145 return -ENOENT;
1146 hlist_del_rcu(&key->node);
1147 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1148 kfree_rcu(key, rcu);
1149 return 0;
1150}
1151EXPORT_SYMBOL(tcp_md5_do_del);
1152
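/* Unlink and free (via RCU) every MD5 key attached to the socket. */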
1153static void tcp_clear_md5_list(struct sock *sk)
1154{
1155 struct tcp_sock *tp = tcp_sk(sk);
1156 struct tcp_md5sig_key *key;
1157 struct hlist_node *n;
1158 struct tcp_md5sig_info *md5sig;
1159
1160 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1161
1162 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1163 hlist_del_rcu(&key->node);
1164 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1165 kfree_rcu(key, rcu);
1166 }
1167}
1168
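/* setsockopt(TCP_MD5SIG / TCP_MD5SIG_EXT): copy the request from user
 * space and add or delete the corresponding key.
 */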
1169static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1170 char __user *optval, int optlen)
1171{
1172 struct tcp_md5sig cmd;
1173 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1174 u8 prefixlen = 32;
1175
1176 if (optlen < sizeof(cmd))
1177 return -EINVAL;
1178
1179 if (copy_from_user(&cmd, optval, sizeof(cmd)))
1180 return -EFAULT;
1181
1182 if (sin->sin_family != AF_INET)
1183 return -EINVAL;
1184
1185 if (optname == TCP_MD5SIG_EXT &&
1186 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1187 prefixlen = cmd.tcpm_prefixlen;
1188 if (prefixlen > 32)
1189 return -EINVAL;
1190 }
1191
1192 if (!cmd.tcpm_keylen)
1193 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1194 AF_INET, prefixlen);
1195
1196 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1197 return -EINVAL;
1198
1199 return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1200 AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1201 GFP_KERNEL);
1202}
1203
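/* Feed the IPv4 pseudo-header and the TCP header (with its checksum
 * zeroed) into the MD5 hash request.
 */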
1204static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1205 __be32 daddr, __be32 saddr,
1206 const struct tcphdr *th, int nbytes)
1207{
1208 struct tcp4_pseudohdr *bp;
1209 struct scatterlist sg;
1210 struct tcphdr *_th;
1211
1212 bp = hp->scratch;
1213 bp->saddr = saddr;
1214 bp->daddr = daddr;
1215 bp->pad = 0;
1216 bp->protocol = IPPROTO_TCP;
1217 bp->len = cpu_to_be16(nbytes);
1218
1219 _th = (struct tcphdr *)(bp + 1);
1220 memcpy(_th, th, sizeof(*th));
1221 _th->check = 0;
1222
1223 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1224 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1225 sizeof(*bp) + sizeof(*th));
1226 return crypto_ahash_update(hp->md5_req);
1227}
1228
1229static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1230 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1231{
1232 struct tcp_md5sig_pool *hp;
1233 struct ahash_request *req;
1234
1235 hp = tcp_get_md5sig_pool();
1236 if (!hp)
1237 goto clear_hash_noput;
1238 req = hp->md5_req;
1239
1240 if (crypto_ahash_init(req))
1241 goto clear_hash;
1242 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1243 goto clear_hash;
1244 if (tcp_md5_hash_key(hp, key))
1245 goto clear_hash;
1246 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1247 if (crypto_ahash_final(req))
1248 goto clear_hash;
1249
1250 tcp_put_md5sig_pool();
1251 return 0;
1252
1253clear_hash:
1254 tcp_put_md5sig_pool();
1255clear_hash_noput:
1256 memset(md5_hash, 0, 16);
1257 return 1;
1258}
1259
1260int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1261 const struct sock *sk,
1262 const struct sk_buff *skb)
1263{
1264 struct tcp_md5sig_pool *hp;
1265 struct ahash_request *req;
1266 const struct tcphdr *th = tcp_hdr(skb);
1267 __be32 saddr, daddr;
1268
1269 if (sk) { /* valid for establish/request sockets */
1270 saddr = sk->sk_rcv_saddr;
1271 daddr = sk->sk_daddr;
1272 } else {
1273 const struct iphdr *iph = ip_hdr(skb);
1274 saddr = iph->saddr;
1275 daddr = iph->daddr;
1276 }
1277
1278 hp = tcp_get_md5sig_pool();
1279 if (!hp)
1280 goto clear_hash_noput;
1281 req = hp->md5_req;
1282
1283 if (crypto_ahash_init(req))
1284 goto clear_hash;
1285
1286 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1287 goto clear_hash;
1288 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1289 goto clear_hash;
1290 if (tcp_md5_hash_key(hp, key))
1291 goto clear_hash;
1292 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1293 if (crypto_ahash_final(req))
1294 goto clear_hash;
1295
1296 tcp_put_md5sig_pool();
1297 return 0;
1298
1299clear_hash:
1300 tcp_put_md5sig_pool();
1301clear_hash_noput:
1302 memset(md5_hash, 0, 16);
1303 return 1;
1304}
1305EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1306
1307#endif
1308
1309/* Called with rcu_read_lock() */
1310static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1311 const struct sk_buff *skb)
1312{
1313#ifdef CONFIG_TCP_MD5SIG
1314 /*
1315 * This gets called for each TCP segment that arrives
1316 * so we want to be efficient.
1317 * We have 3 drop cases:
1318 * o No MD5 hash and one expected.
1319 * o MD5 hash and we're not expecting one.
 1320 * o MD5 hash and it's wrong.
1321 */
1322 const __u8 *hash_location = NULL;
1323 struct tcp_md5sig_key *hash_expected;
1324 const struct iphdr *iph = ip_hdr(skb);
1325 const struct tcphdr *th = tcp_hdr(skb);
1326 int genhash;
1327 unsigned char newhash[16];
1328
1329 hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1330 AF_INET);
1331 hash_location = tcp_parse_md5sig_option(th);
1332
1333 /* We've parsed the options - do we have a hash? */
1334 if (!hash_expected && !hash_location)
1335 return false;
1336
1337 if (hash_expected && !hash_location) {
1338 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1339 return true;
1340 }
1341
1342 if (!hash_expected && hash_location) {
1343 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1344 return true;
1345 }
1346
1347 /* Okay, so this is hash_expected and hash_location -
1348 * so we need to calculate the checksum.
1349 */
1350 genhash = tcp_v4_md5_hash_skb(newhash,
1351 hash_expected,
1352 NULL, skb);
1353
1354 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1355 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1356 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1357 &iph->saddr, ntohs(th->source),
1358 &iph->daddr, ntohs(th->dest),
1359 genhash ? " tcp_v4_calc_md5_hash failed"
1360 : "");
1361 return true;
1362 }
1363 return false;
1364#endif
1365 return false;
1366}
1367
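/* IPv4-specific init of a request sock: record the (swapped) addresses
 * from the incoming SYN and save its IP options.
 */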
1368static void tcp_v4_init_req(struct request_sock *req,
1369 const struct sock *sk_listener,
1370 struct sk_buff *skb)
1371{
1372 struct inet_request_sock *ireq = inet_rsk(req);
1373 struct net *net = sock_net(sk_listener);
1374
1375 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1376 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1377 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1378}
1379
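/* Look up the route used to answer the request (SYN-ACK path). */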
1380static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1381 struct flowi *fl,
1382 const struct request_sock *req)
1383{
1384 return inet_csk_route_req(sk, &fl->u.ip4, req);
1385}
1386
1387struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1388 .family = PF_INET,
1389 .obj_size = sizeof(struct tcp_request_sock),
1390 .rtx_syn_ack = tcp_rtx_synack,
1391 .send_ack = tcp_v4_reqsk_send_ack,
1392 .destructor = tcp_v4_reqsk_destructor,
1393 .send_reset = tcp_v4_send_reset,
1394 .syn_ack_timeout = tcp_syn_ack_timeout,
1395};
1396
1397const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1398 .mss_clamp = TCP_MSS_DEFAULT,
1399#ifdef CONFIG_TCP_MD5SIG
1400 .req_md5_lookup = tcp_v4_md5_lookup,
1401 .calc_md5_hash = tcp_v4_md5_hash_skb,
1402#endif
1403 .init_req = tcp_v4_init_req,
1404#ifdef CONFIG_SYN_COOKIES
1405 .cookie_init_seq = cookie_v4_init_sequence,
1406#endif
1407 .route_req = tcp_v4_route_req,
1408 .init_seq = tcp_v4_init_seq,
1409 .init_ts_off = tcp_v4_init_ts_off,
1410 .send_synack = tcp_v4_send_synack,
1411};
1412
1413int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1414{
 1415 /* Never answer SYNs sent to broadcast or multicast */
1416 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1417 goto drop;
1418
1419 return tcp_conn_request(&tcp_request_sock_ops,
1420 &tcp_request_sock_ipv4_ops, sk, skb);
1421
1422drop:
1423 tcp_listendrop(sk);
1424 return 0;
1425}
1426EXPORT_SYMBOL(tcp_v4_conn_request);
1427
1428
1429/*
1430 * The three way handshake has completed - we got a valid synack -
1431 * now create the new socket.
1432 */
1433struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1434 struct request_sock *req,
1435 struct dst_entry *dst,
1436 struct request_sock *req_unhash,
1437 bool *own_req)
1438{
1439 struct inet_request_sock *ireq;
1440 bool found_dup_sk = false;
1441 struct inet_sock *newinet;
1442 struct tcp_sock *newtp;
1443 struct sock *newsk;
1444#ifdef CONFIG_TCP_MD5SIG
1445 struct tcp_md5sig_key *key;
1446#endif
1447 struct ip_options_rcu *inet_opt;
1448
1449 if (sk_acceptq_is_full(sk))
1450 goto exit_overflow;
1451
1452 newsk = tcp_create_openreq_child(sk, req, skb);
1453 if (!newsk)
1454 goto exit_nonewsk;
1455
1456 newsk->sk_gso_type = SKB_GSO_TCPV4;
1457 inet_sk_rx_dst_set(newsk, skb);
1458
1459 newtp = tcp_sk(newsk);
1460 newinet = inet_sk(newsk);
1461 ireq = inet_rsk(req);
1462 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1463 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1464 newsk->sk_bound_dev_if = ireq->ir_iif;
1465 newinet->inet_saddr = ireq->ir_loc_addr;
1466 inet_opt = rcu_dereference(ireq->ireq_opt);
1467 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1468 newinet->mc_index = inet_iif(skb);
1469 newinet->mc_ttl = ip_hdr(skb)->ttl;
1470 newinet->rcv_tos = ip_hdr(skb)->tos;
1471 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1472 if (inet_opt)
1473 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1474 newinet->inet_id = prandom_u32();
1475
1476 if (!dst) {
1477 dst = inet_csk_route_child_sock(sk, newsk, req);
1478 if (!dst)
1479 goto put_and_exit;
1480 } else {
1481 /* syncookie case : see end of cookie_v4_check() */
1482 }
1483 sk_setup_caps(newsk, dst);
1484
1485 tcp_ca_openreq_child(newsk, dst);
1486
1487 tcp_sync_mss(newsk, dst_mtu(dst));
1488 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1489
1490 tcp_initialize_rcv_mss(newsk);
1491
1492#ifdef CONFIG_TCP_MD5SIG
1493 /* Copy over the MD5 key from the original socket */
1494 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1495 AF_INET);
1496 if (key) {
1497 /*
1498 * We're using one, so create a matching key
1499 * on the newsk structure. If we fail to get
1500 * memory, then we end up not copying the key
1501 * across. Shucks.
1502 */
1503 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1504 AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1505 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1506 }
1507#endif
1508
1509 if (__inet_inherit_port(sk, newsk) < 0)
1510 goto put_and_exit;
1511 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1512 &found_dup_sk);
1513 if (likely(*own_req)) {
1514 tcp_move_syn(newtp, req);
1515 ireq->ireq_opt = NULL;
1516 } else {
1517 newinet->inet_opt = NULL;
1518
1519 if (!req_unhash && found_dup_sk) {
 1520 /* This code path should be executed only in the
 1521 * syncookie case
1522 */
1523 bh_unlock_sock(newsk);
1524 sock_put(newsk);
1525 newsk = NULL;
1526 }
1527 }
1528 return newsk;
1529
1530exit_overflow:
1531 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1532exit_nonewsk:
1533 dst_release(dst);
1534exit:
1535 tcp_listendrop(sk);
1536 return NULL;
1537put_and_exit:
1538 newinet->inet_opt = NULL;
1539 inet_csk_prepare_forced_close(newsk);
1540 tcp_done(newsk);
1541 goto exit;
1542}
1543EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1544
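/* On a listener, try to validate a non-SYN segment as a syncookie ACK
 * and, if it checks out, create the child socket.
 */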
1545static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1546{
1547#ifdef CONFIG_SYN_COOKIES
1548 const struct tcphdr *th = tcp_hdr(skb);
1549
1550 if (!th->syn)
1551 sk = cookie_v4_check(sk, skb);
1552#endif
1553 return sk;
1554}
1555
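/* Compute a syncookie ISN and MSS for a SYN without creating any state;
 * returns an MSS of 0 if syncookies are not available.
 */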
1556u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1557 struct tcphdr *th, u32 *cookie)
1558{
1559 u16 mss = 0;
1560#ifdef CONFIG_SYN_COOKIES
1561 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1562 &tcp_request_sock_ipv4_ops, sk, th);
1563 if (mss) {
1564 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1565 tcp_synq_overflow(sk);
1566 }
1567#endif
1568 return mss;
1569}
1570
 1571 /* The socket must have its spinlock held when we get
1572 * here, unless it is a TCP_LISTEN socket.
1573 *
1574 * We have a potential double-lock case here, so even when
1575 * doing backlog processing we use the BH locking scheme.
1576 * This is because we cannot sleep with the original spinlock
1577 * held.
1578 */
1579int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1580{
1581 struct sock *rsk;
1582
1583 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1584 struct dst_entry *dst;
1585
1586 dst = rcu_dereference_protected(sk->sk_rx_dst,
1587 lockdep_sock_is_held(sk));
1588
1589 sock_rps_save_rxhash(sk, skb);
1590 sk_mark_napi_id(sk, skb);
1591 if (dst) {
1592 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1593 !dst->ops->check(dst, 0)) {
1594 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1595 dst_release(dst);
1596 }
1597 }
1598 tcp_rcv_established(sk, skb);
1599 return 0;
1600 }
1601
1602 if (tcp_checksum_complete(skb))
1603 goto csum_err;
1604
1605 if (sk->sk_state == TCP_LISTEN) {
1606 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1607
1608 if (!nsk)
1609 goto discard;
1610 if (nsk != sk) {
1611 if (tcp_child_process(sk, nsk, skb)) {
1612 rsk = nsk;
1613 goto reset;
1614 }
1615 return 0;
1616 }
1617 } else
1618 sock_rps_save_rxhash(sk, skb);
1619
1620 if (tcp_rcv_state_process(sk, skb)) {
1621 rsk = sk;
1622 goto reset;
1623 }
1624 return 0;
1625
1626reset:
1627 tcp_v4_send_reset(rsk, skb);
1628discard:
1629 kfree_skb(skb);
1630 /* Be careful here. If this function gets more complicated and
1631 * gcc suffers from register pressure on the x86, sk (in %ebx)
1632 * might be destroyed here. This current version compiles correctly,
1633 * but you have been warned.
1634 */
1635 return 0;
1636
1637csum_err:
1638 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1639 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1640 goto discard;
1641}
1642EXPORT_SYMBOL(tcp_v4_do_rcv);
1643
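/* Early demux: look up an established socket before routing so that its
 * cached rx dst can be reused for this packet.
 */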
1644int tcp_v4_early_demux(struct sk_buff *skb)
1645{
1646 const struct iphdr *iph;
1647 const struct tcphdr *th;
1648 struct sock *sk;
1649
1650 if (skb->pkt_type != PACKET_HOST)
1651 return 0;
1652
1653 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1654 return 0;
1655
1656 iph = ip_hdr(skb);
1657 th = tcp_hdr(skb);
1658
1659 if (th->doff < sizeof(struct tcphdr) / 4)
1660 return 0;
1661
1662 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1663 iph->saddr, th->source,
1664 iph->daddr, ntohs(th->dest),
1665 skb->skb_iif, inet_sdif(skb));
1666 if (sk) {
1667 skb->sk = sk;
1668 skb->destructor = sock_edemux;
1669 if (sk_fullsock(sk)) {
1670 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1671
1672 if (dst)
1673 dst = dst_check(dst, 0);
1674 if (dst &&
1675 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1676 skb_dst_set_noref(skb, dst);
1677 }
1678 }
1679 return 0;
1680}
1681
1682bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1683{
1684 u32 tail_gso_size, tail_gso_segs;
1685 struct skb_shared_info *shinfo;
1686 const struct tcphdr *th;
1687 struct tcphdr *thtail;
1688 struct sk_buff *tail;
1689 unsigned int hdrlen;
1690 bool fragstolen;
1691 u32 gso_segs;
1692 u32 gso_size;
1693 u64 limit;
1694 int delta;
1695
1696 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1697 * we can fix skb->truesize to its real value to avoid future drops.
1698 * This is valid because skb is not yet charged to the socket.
 1699 * It has been noticed that pure SACK packets were sometimes dropped
1700 * (if cooked by drivers without copybreak feature).
1701 */
1702 skb_condense(skb);
1703
1704 skb_dst_drop(skb);
1705
1706 if (unlikely(tcp_checksum_complete(skb))) {
1707 bh_unlock_sock(sk);
1708 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1709 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1710 return true;
1711 }
1712
1713 /* Attempt coalescing to last skb in backlog, even if we are
1714 * above the limits.
1715 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1716 */
1717 th = (const struct tcphdr *)skb->data;
1718 hdrlen = th->doff * 4;
1719
1720 tail = sk->sk_backlog.tail;
1721 if (!tail)
1722 goto no_coalesce;
1723 thtail = (struct tcphdr *)tail->data;
1724
1725 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1726 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1727 ((TCP_SKB_CB(tail)->tcp_flags |
1728 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1729 !((TCP_SKB_CB(tail)->tcp_flags &
1730 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1731 ((TCP_SKB_CB(tail)->tcp_flags ^
1732 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1733#ifdef CONFIG_TLS_DEVICE
1734 tail->decrypted != skb->decrypted ||
1735#endif
1736 thtail->doff != th->doff ||
1737 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1738 goto no_coalesce;
1739
1740 __skb_pull(skb, hdrlen);
1741
1742 shinfo = skb_shinfo(skb);
1743 gso_size = shinfo->gso_size ?: skb->len;
1744 gso_segs = shinfo->gso_segs ?: 1;
1745
1746 shinfo = skb_shinfo(tail);
1747 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1748 tail_gso_segs = shinfo->gso_segs ?: 1;
1749
1750 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1751 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1752
1753 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1754 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1755 thtail->window = th->window;
1756 }
1757
1758 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1759 * thtail->fin, so that the fast path in tcp_rcv_established()
1760 * is not entered if we append a packet with a FIN.
1761 * SYN, RST, URG are not present.
1762 * ACK is set on both packets.
1763 * PSH : we do not really care in TCP stack,
1764 * at least for 'GRO' packets.
1765 */
1766 thtail->fin |= th->fin;
1767 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1768
1769 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1770 TCP_SKB_CB(tail)->has_rxtstamp = true;
1771 tail->tstamp = skb->tstamp;
1772 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1773 }
1774
1775 /* Not as strict as GRO. We only need to carry mss max value */
1776 shinfo->gso_size = max(gso_size, tail_gso_size);
1777 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1778
1779 sk->sk_backlog.len += delta;
1780 __NET_INC_STATS(sock_net(sk),
1781 LINUX_MIB_TCPBACKLOGCOALESCE);
1782 kfree_skb_partial(skb, fragstolen);
1783 return false;
1784 }
1785 __skb_push(skb, hdrlen);
1786
1787no_coalesce:
1788 /* sk->sk_backlog.len is reset only at the end of __release_sock().
1789 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
1790 * sk_rcvbuf in normal conditions.
1791 */
1792 limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
1793
1794 limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
1795
1796 /* Only socket owner can try to collapse/prune rx queues
1797 * to reduce memory overhead, so add a little headroom here.
 1798 * Only a few socket backlogs are likely to be non-empty at the same time.
1799 */
1800 limit += 64 * 1024;
1801
1802 limit = min_t(u64, limit, UINT_MAX);
1803
1804 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1805 bh_unlock_sock(sk);
1806 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1807 return true;
1808 }
1809 return false;
1810}
1811EXPORT_SYMBOL(tcp_add_backlog);
1812
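/* Run the attached socket filter, never trimming below the TCP header. */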
1813int tcp_filter(struct sock *sk, struct sk_buff *skb)
1814{
1815 struct tcphdr *th = (struct tcphdr *)skb->data;
1816
1817 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1818}
1819EXPORT_SYMBOL(tcp_filter);
1820
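/* Put the IP control block back in place before the skb is handed to
 * another socket (e.g. after a new lookup or TIME-WAIT processing).
 */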
1821static void tcp_v4_restore_cb(struct sk_buff *skb)
1822{
1823 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1824 sizeof(struct inet_skb_parm));
1825}
1826
1827static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1828 const struct tcphdr *th)
1829{
 1830 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
 1831 * barrier() makes sure the compiler won't play fool^Waliasing games.
1832 */
1833 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1834 sizeof(struct inet_skb_parm));
1835 barrier();
1836
1837 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1838 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1839 skb->len - th->doff * 4);
1840 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1841 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1842 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1843 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1844 TCP_SKB_CB(skb)->sacked = 0;
1845 TCP_SKB_CB(skb)->has_rxtstamp =
1846 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1847}
1848
1849/*
1850 * From tcp_input.c
1851 */
1852
1853int tcp_v4_rcv(struct sk_buff *skb)
1854{
1855 struct net *net = dev_net(skb->dev);
1856 struct sk_buff *skb_to_free;
1857 int sdif = inet_sdif(skb);
1858 const struct iphdr *iph;
1859 const struct tcphdr *th;
1860 bool refcounted;
1861 struct sock *sk;
1862 int ret;
1863
1864 if (skb->pkt_type != PACKET_HOST)
1865 goto discard_it;
1866
1867 /* Count it even if it's bad */
1868 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1869
1870 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1871 goto discard_it;
1872
1873 th = (const struct tcphdr *)skb->data;
1874
1875 if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1876 goto bad_packet;
1877 if (!pskb_may_pull(skb, th->doff * 4))
1878 goto discard_it;
1879
1880 /* An explanation is required here, I think.
1881 * Packet length and doff are validated by header prediction,
 1882 * provided the case of th->doff==0 is eliminated.
1883 * So, we defer the checks. */
1884
1885 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1886 goto csum_error;
1887
1888 th = (const struct tcphdr *)skb->data;
1889 iph = ip_hdr(skb);
1890lookup:
1891 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1892 th->dest, sdif, &refcounted);
1893 if (!sk)
1894 goto no_tcp_socket;
1895
1896process:
1897 if (sk->sk_state == TCP_TIME_WAIT)
1898 goto do_time_wait;
1899
1900 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1901 struct request_sock *req = inet_reqsk(sk);
1902 bool req_stolen = false;
1903 struct sock *nsk;
1904
1905 sk = req->rsk_listener;
1906 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1907 sk_drops_add(sk, skb);
1908 reqsk_put(req);
1909 goto discard_it;
1910 }
1911 if (tcp_checksum_complete(skb)) {
1912 reqsk_put(req);
1913 goto csum_error;
1914 }
1915 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1916 inet_csk_reqsk_queue_drop_and_put(sk, req);
1917 goto lookup;
1918 }
1919 /* We own a reference on the listener, increase it again
1920 * as we might lose it too soon.
1921 */
1922 sock_hold(sk);
1923 refcounted = true;
1924 nsk = NULL;
1925 if (!tcp_filter(sk, skb)) {
1926 th = (const struct tcphdr *)skb->data;
1927 iph = ip_hdr(skb);
1928 tcp_v4_fill_cb(skb, iph, th);
1929 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1930 }
1931 if (!nsk) {
1932 reqsk_put(req);
1933 if (req_stolen) {
1934				/* Another CPU got exclusive access to req
1935				 * and created a full-blown socket.
1936				 * Try to feed this packet to that socket
1937				 * instead of discarding it.
1938 */
1939 tcp_v4_restore_cb(skb);
1940 sock_put(sk);
1941 goto lookup;
1942 }
1943 goto discard_and_relse;
1944 }
1945 if (nsk == sk) {
1946 reqsk_put(req);
1947 tcp_v4_restore_cb(skb);
1948 } else if (tcp_child_process(sk, nsk, skb)) {
1949 tcp_v4_send_reset(nsk, skb);
1950 goto discard_and_relse;
1951 } else {
1952 sock_put(sk);
1953 return 0;
1954 }
1955 }
1956 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1957 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1958 goto discard_and_relse;
1959 }
1960
1961 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1962 goto discard_and_relse;
1963
1964 if (tcp_v4_inbound_md5_hash(sk, skb))
1965 goto discard_and_relse;
1966
1967 nf_reset_ct(skb);
1968
1969 if (tcp_filter(sk, skb))
1970 goto discard_and_relse;
1971 th = (const struct tcphdr *)skb->data;
1972 iph = ip_hdr(skb);
1973 tcp_v4_fill_cb(skb, iph, th);
1974
1975 skb->dev = NULL;
1976
1977 if (sk->sk_state == TCP_LISTEN) {
1978 ret = tcp_v4_do_rcv(sk, skb);
1979 goto put_and_return;
1980 }
1981
1982 sk_incoming_cpu_update(sk);
1983
1984 bh_lock_sock_nested(sk);
1985 tcp_segs_in(tcp_sk(sk), skb);
1986 ret = 0;
1987 if (!sock_owned_by_user(sk)) {
1988 skb_to_free = sk->sk_rx_skb_cache;
1989 sk->sk_rx_skb_cache = NULL;
1990 ret = tcp_v4_do_rcv(sk, skb);
1991 } else {
1992 if (tcp_add_backlog(sk, skb))
1993 goto discard_and_relse;
1994 skb_to_free = NULL;
1995 }
1996 bh_unlock_sock(sk);
1997 if (skb_to_free)
1998 __kfree_skb(skb_to_free);
1999
2000put_and_return:
2001 if (refcounted)
2002 sock_put(sk);
2003
2004 return ret;
2005
2006no_tcp_socket:
2007 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2008 goto discard_it;
2009
2010 tcp_v4_fill_cb(skb, iph, th);
2011
2012 if (tcp_checksum_complete(skb)) {
2013csum_error:
2014 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2015bad_packet:
2016 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2017 } else {
2018 tcp_v4_send_reset(NULL, skb);
2019 }
2020
2021discard_it:
2022 /* Discard frame. */
2023 kfree_skb(skb);
2024 return 0;
2025
2026discard_and_relse:
2027 sk_drops_add(sk, skb);
2028 if (refcounted)
2029 sock_put(sk);
2030 goto discard_it;
2031
2032do_time_wait:
2033 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2034 inet_twsk_put(inet_twsk(sk));
2035 goto discard_it;
2036 }
2037
2038 tcp_v4_fill_cb(skb, iph, th);
2039
2040 if (tcp_checksum_complete(skb)) {
2041 inet_twsk_put(inet_twsk(sk));
2042 goto csum_error;
2043 }
2044 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2045 case TCP_TW_SYN: {
2046 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2047 &tcp_hashinfo, skb,
2048 __tcp_hdrlen(th),
2049 iph->saddr, th->source,
2050 iph->daddr, th->dest,
2051 inet_iif(skb),
2052 sdif);
2053 if (sk2) {
2054 inet_twsk_deschedule_put(inet_twsk(sk));
2055 sk = sk2;
2056 tcp_v4_restore_cb(skb);
2057 refcounted = false;
2058 goto process;
2059 }
2060 }
2061 /* to ACK */
2062 /* fall through */
2063 case TCP_TW_ACK:
2064 tcp_v4_timewait_ack(sk, skb);
2065 break;
2066 case TCP_TW_RST:
2067 tcp_v4_send_reset(sk, skb);
2068 inet_twsk_deschedule_put(inet_twsk(sk));
2069 goto discard_it;
2070 case TCP_TW_SUCCESS:;
2071 }
2072 goto discard_it;
2073}
2074
2075static struct timewait_sock_ops tcp_timewait_sock_ops = {
2076 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2077 .twsk_unique = tcp_twsk_unique,
2078 .twsk_destructor= tcp_twsk_destructor,
2079};
2080
2081void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2082{
2083 struct dst_entry *dst = skb_dst(skb);
2084
2085 if (dst && dst_hold_safe(dst)) {
2086 rcu_assign_pointer(sk->sk_rx_dst, dst);
2087 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2088 }
2089}
2090EXPORT_SYMBOL(inet_sk_rx_dst_set);
2091
2092const struct inet_connection_sock_af_ops ipv4_specific = {
2093 .queue_xmit = ip_queue_xmit,
2094 .send_check = tcp_v4_send_check,
2095 .rebuild_header = inet_sk_rebuild_header,
2096 .sk_rx_dst_set = inet_sk_rx_dst_set,
2097 .conn_request = tcp_v4_conn_request,
2098 .syn_recv_sock = tcp_v4_syn_recv_sock,
2099 .net_header_len = sizeof(struct iphdr),
2100 .setsockopt = ip_setsockopt,
2101 .getsockopt = ip_getsockopt,
2102 .addr2sockaddr = inet_csk_addr2sockaddr,
2103 .sockaddr_len = sizeof(struct sockaddr_in),
2104#ifdef CONFIG_COMPAT
2105 .compat_setsockopt = compat_ip_setsockopt,
2106 .compat_getsockopt = compat_ip_getsockopt,
2107#endif
2108 .mtu_reduced = tcp_v4_mtu_reduced,
2109};
2110EXPORT_SYMBOL(ipv4_specific);
2111
2112#ifdef CONFIG_TCP_MD5SIG
2113static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2114 .md5_lookup = tcp_v4_md5_lookup,
2115 .calc_md5_hash = tcp_v4_md5_hash_skb,
2116 .md5_parse = tcp_v4_parse_md5_keys,
2117};
2118#endif
2119
2120	/* NOTE: a lot of fields are set to zero explicitly by the call to
2121	 * sk_alloc(), so they need not be initialised here.
2122	 */
2123static int tcp_v4_init_sock(struct sock *sk)
2124{
2125 struct inet_connection_sock *icsk = inet_csk(sk);
2126
2127 tcp_init_sock(sk);
2128
2129 icsk->icsk_af_ops = &ipv4_specific;
2130
2131#ifdef CONFIG_TCP_MD5SIG
2132 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2133#endif
2134
2135 return 0;
2136}
2137
2138void tcp_v4_destroy_sock(struct sock *sk)
2139{
2140 struct tcp_sock *tp = tcp_sk(sk);
2141
2142 trace_tcp_destroy_sock(sk);
2143
2144 tcp_clear_xmit_timers(sk);
2145
2146 tcp_cleanup_congestion_control(sk);
2147
2148 tcp_cleanup_ulp(sk);
2149
2150	/* Clean up the write buffer. */
2151 tcp_write_queue_purge(sk);
2152
2153 /* Check if we want to disable active TFO */
2154 tcp_fastopen_active_disable_ofo_check(sk);
2155
2156 /* Cleans up our, hopefully empty, out_of_order_queue. */
2157 skb_rbtree_purge(&tp->out_of_order_queue);
2158
2159#ifdef CONFIG_TCP_MD5SIG
2160 /* Clean up the MD5 key list, if any */
2161 if (tp->md5sig_info) {
2162 tcp_clear_md5_list(sk);
2163 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2164 tp->md5sig_info = NULL;
2165 }
2166#endif
2167
2168 /* Clean up a referenced TCP bind bucket. */
2169 if (inet_csk(sk)->icsk_bind_hash)
2170 inet_put_port(sk);
2171
2172 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2173
2174	/* If the socket was aborted during a connect operation */
2175 tcp_free_fastopen_req(tp);
2176 tcp_fastopen_destroy_cipher(sk);
2177 tcp_saved_syn_free(tp);
2178
2179 sk_sockets_allocated_dec(sk);
2180}
2181EXPORT_SYMBOL(tcp_v4_destroy_sock);
2182
2183#ifdef CONFIG_PROC_FS
2184/* Proc filesystem TCP sock list dumping. */
2185
2186/*
2187	 * Get the next listening socket following cur.  If cur is NULL, get the
2188	 * first socket, starting from the bucket given in st->bucket; when
2189	 * st->bucket is zero, the very first socket in the hash table is returned.
2190 */
2191static void *listening_get_next(struct seq_file *seq, void *cur)
2192{
2193 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2194 struct tcp_iter_state *st = seq->private;
2195 struct net *net = seq_file_net(seq);
2196 struct inet_listen_hashbucket *ilb;
2197 struct hlist_nulls_node *node;
2198 struct sock *sk = cur;
2199
2200 if (!sk) {
2201get_head:
2202 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2203 spin_lock(&ilb->lock);
2204 sk = sk_nulls_head(&ilb->nulls_head);
2205 st->offset = 0;
2206 goto get_sk;
2207 }
2208 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2209 ++st->num;
2210 ++st->offset;
2211
2212 sk = sk_nulls_next(sk);
2213get_sk:
2214 sk_nulls_for_each_from(sk, node) {
2215 if (!net_eq(sock_net(sk), net))
2216 continue;
2217 if (sk->sk_family == afinfo->family)
2218 return sk;
2219 }
2220 spin_unlock(&ilb->lock);
2221 st->offset = 0;
2222 if (++st->bucket < INET_LHTABLE_SIZE)
2223 goto get_head;
2224 return NULL;
2225}
2226
2227static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2228{
2229 struct tcp_iter_state *st = seq->private;
2230 void *rc;
2231
2232 st->bucket = 0;
2233 st->offset = 0;
2234 rc = listening_get_next(seq, NULL);
2235
2236 while (rc && *pos) {
2237 rc = listening_get_next(seq, rc);
2238 --*pos;
2239 }
2240 return rc;
2241}
2242
2243static inline bool empty_bucket(const struct tcp_iter_state *st)
2244{
2245 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2246}
2247
2248/*
2249 * Get first established socket starting from bucket given in st->bucket.
2250 * If st->bucket is zero, the very first socket in the hash is returned.
2251 */
2252static void *established_get_first(struct seq_file *seq)
2253{
2254 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2255 struct tcp_iter_state *st = seq->private;
2256 struct net *net = seq_file_net(seq);
2257 void *rc = NULL;
2258
2259 st->offset = 0;
2260 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2261 struct sock *sk;
2262 struct hlist_nulls_node *node;
2263 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2264
2265 /* Lockless fast path for the common case of empty buckets */
2266 if (empty_bucket(st))
2267 continue;
2268
2269 spin_lock_bh(lock);
2270 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2271 if (sk->sk_family != afinfo->family ||
2272 !net_eq(sock_net(sk), net)) {
2273 continue;
2274 }
2275 rc = sk;
2276 goto out;
2277 }
2278 spin_unlock_bh(lock);
2279 }
2280out:
2281 return rc;
2282}
2283
2284static void *established_get_next(struct seq_file *seq, void *cur)
2285{
2286 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2287 struct sock *sk = cur;
2288 struct hlist_nulls_node *node;
2289 struct tcp_iter_state *st = seq->private;
2290 struct net *net = seq_file_net(seq);
2291
2292 ++st->num;
2293 ++st->offset;
2294
2295 sk = sk_nulls_next(sk);
2296
2297 sk_nulls_for_each_from(sk, node) {
2298 if (sk->sk_family == afinfo->family &&
2299 net_eq(sock_net(sk), net))
2300 return sk;
2301 }
2302
2303 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2304 ++st->bucket;
2305 return established_get_first(seq);
2306}
2307
2308static void *established_get_idx(struct seq_file *seq, loff_t pos)
2309{
2310 struct tcp_iter_state *st = seq->private;
2311 void *rc;
2312
2313 st->bucket = 0;
2314 rc = established_get_first(seq);
2315
2316 while (rc && pos) {
2317 rc = established_get_next(seq, rc);
2318 --pos;
2319 }
2320 return rc;
2321}
2322
2323static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2324{
2325 void *rc;
2326 struct tcp_iter_state *st = seq->private;
2327
2328 st->state = TCP_SEQ_STATE_LISTENING;
2329 rc = listening_get_idx(seq, &pos);
2330
2331 if (!rc) {
2332 st->state = TCP_SEQ_STATE_ESTABLISHED;
2333 rc = established_get_idx(seq, pos);
2334 }
2335
2336 return rc;
2337}
2338
2339static void *tcp_seek_last_pos(struct seq_file *seq)
2340{
2341 struct tcp_iter_state *st = seq->private;
2342 int bucket = st->bucket;
2343 int offset = st->offset;
2344 int orig_num = st->num;
2345 void *rc = NULL;
2346
2347 switch (st->state) {
2348 case TCP_SEQ_STATE_LISTENING:
2349 if (st->bucket >= INET_LHTABLE_SIZE)
2350 break;
2351 st->state = TCP_SEQ_STATE_LISTENING;
2352 rc = listening_get_next(seq, NULL);
2353 while (offset-- && rc && bucket == st->bucket)
2354 rc = listening_get_next(seq, rc);
2355 if (rc)
2356 break;
2357 st->bucket = 0;
2358 st->state = TCP_SEQ_STATE_ESTABLISHED;
2359 /* Fallthrough */
2360 case TCP_SEQ_STATE_ESTABLISHED:
2361 if (st->bucket > tcp_hashinfo.ehash_mask)
2362 break;
2363 rc = established_get_first(seq);
2364 while (offset-- && rc && bucket == st->bucket)
2365 rc = established_get_next(seq, rc);
2366 }
2367
2368 st->num = orig_num;
2369
2370 return rc;
2371}
2372
2373void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2374{
2375 struct tcp_iter_state *st = seq->private;
2376 void *rc;
2377
2378 if (*pos && *pos == st->last_pos) {
2379 rc = tcp_seek_last_pos(seq);
2380 if (rc)
2381 goto out;
2382 }
2383
2384 st->state = TCP_SEQ_STATE_LISTENING;
2385 st->num = 0;
2386 st->bucket = 0;
2387 st->offset = 0;
2388 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2389
2390out:
2391 st->last_pos = *pos;
2392 return rc;
2393}
2394EXPORT_SYMBOL(tcp_seq_start);
2395
2396void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2397{
2398 struct tcp_iter_state *st = seq->private;
2399 void *rc = NULL;
2400
2401 if (v == SEQ_START_TOKEN) {
2402 rc = tcp_get_idx(seq, 0);
2403 goto out;
2404 }
2405
2406 switch (st->state) {
2407 case TCP_SEQ_STATE_LISTENING:
2408 rc = listening_get_next(seq, v);
2409 if (!rc) {
2410 st->state = TCP_SEQ_STATE_ESTABLISHED;
2411 st->bucket = 0;
2412 st->offset = 0;
2413 rc = established_get_first(seq);
2414 }
2415 break;
2416 case TCP_SEQ_STATE_ESTABLISHED:
2417 rc = established_get_next(seq, v);
2418 break;
2419 }
2420out:
2421 ++*pos;
2422 st->last_pos = *pos;
2423 return rc;
2424}
2425EXPORT_SYMBOL(tcp_seq_next);
2426
2427void tcp_seq_stop(struct seq_file *seq, void *v)
2428{
2429 struct tcp_iter_state *st = seq->private;
2430
2431 switch (st->state) {
2432 case TCP_SEQ_STATE_LISTENING:
2433 if (v != SEQ_START_TOKEN)
2434 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2435 break;
2436 case TCP_SEQ_STATE_ESTABLISHED:
2437 if (v)
2438 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2439 break;
2440 }
2441}
2442EXPORT_SYMBOL(tcp_seq_stop);
2443
2444static void get_openreq4(const struct request_sock *req,
2445 struct seq_file *f, int i)
2446{
2447 const struct inet_request_sock *ireq = inet_rsk(req);
2448 long delta = req->rsk_timer.expires - jiffies;
2449
2450 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2451 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2452 i,
2453 ireq->ir_loc_addr,
2454 ireq->ir_num,
2455 ireq->ir_rmt_addr,
2456 ntohs(ireq->ir_rmt_port),
2457 TCP_SYN_RECV,
2458 0, 0, /* could print option size, but that is af dependent. */
2459 1, /* timers active (only the expire timer) */
2460 jiffies_delta_to_clock_t(delta),
2461 req->num_timeout,
2462 from_kuid_munged(seq_user_ns(f),
2463 sock_i_uid(req->rsk_listener)),
2464 0, /* non standard timer */
2465 0, /* open_requests have no inode */
2466 0,
2467 req);
2468}
2469
2470static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2471{
2472 int timer_active;
2473 unsigned long timer_expires;
2474 const struct tcp_sock *tp = tcp_sk(sk);
2475 const struct inet_connection_sock *icsk = inet_csk(sk);
2476 const struct inet_sock *inet = inet_sk(sk);
2477 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2478 __be32 dest = inet->inet_daddr;
2479 __be32 src = inet->inet_rcv_saddr;
2480 __u16 destp = ntohs(inet->inet_dport);
2481 __u16 srcp = ntohs(inet->inet_sport);
2482 int rx_queue;
2483 int state;
2484
2485 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2486 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2487 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2488 timer_active = 1;
2489 timer_expires = icsk->icsk_timeout;
2490 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2491 timer_active = 4;
2492 timer_expires = icsk->icsk_timeout;
2493 } else if (timer_pending(&sk->sk_timer)) {
2494 timer_active = 2;
2495 timer_expires = sk->sk_timer.expires;
2496 } else {
2497 timer_active = 0;
2498 timer_expires = jiffies;
2499 }
2500
2501 state = inet_sk_state_load(sk);
2502 if (state == TCP_LISTEN)
2503 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2504 else
2505 /* Because we don't lock the socket,
2506 * we might find a transient negative value.
2507 */
2508 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2509 READ_ONCE(tp->copied_seq), 0);
2510
2511 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2512 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2513 i, src, srcp, dest, destp, state,
2514 READ_ONCE(tp->write_seq) - tp->snd_una,
2515 rx_queue,
2516 timer_active,
2517 jiffies_delta_to_clock_t(timer_expires - jiffies),
2518 icsk->icsk_retransmits,
2519 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2520 icsk->icsk_probes_out,
2521 sock_i_ino(sk),
2522 refcount_read(&sk->sk_refcnt), sk,
2523 jiffies_to_clock_t(icsk->icsk_rto),
2524 jiffies_to_clock_t(icsk->icsk_ack.ato),
2525 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2526 tp->snd_cwnd,
2527 state == TCP_LISTEN ?
2528 fastopenq->max_qlen :
2529 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2530}
2531
2532static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2533 struct seq_file *f, int i)
2534{
2535 long delta = tw->tw_timer.expires - jiffies;
2536 __be32 dest, src;
2537 __u16 destp, srcp;
2538
2539 dest = tw->tw_daddr;
2540 src = tw->tw_rcv_saddr;
2541 destp = ntohs(tw->tw_dport);
2542 srcp = ntohs(tw->tw_sport);
2543
2544 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2545 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2546 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2547 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2548 refcount_read(&tw->tw_refcnt), tw);
2549}
2550
2551#define TMPSZ 150
2552
2553static int tcp4_seq_show(struct seq_file *seq, void *v)
2554{
2555 struct tcp_iter_state *st;
2556 struct sock *sk = v;
2557
2558 seq_setwidth(seq, TMPSZ - 1);
2559 if (v == SEQ_START_TOKEN) {
2560 seq_puts(seq, " sl local_address rem_address st tx_queue "
2561 "rx_queue tr tm->when retrnsmt uid timeout "
2562 "inode");
2563 goto out;
2564 }
2565 st = seq->private;
2566
2567 if (sk->sk_state == TCP_TIME_WAIT)
2568 get_timewait4_sock(v, seq, st->num);
2569 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2570 get_openreq4(v, seq, st->num);
2571 else
2572 get_tcp4_sock(v, seq, st->num);
2573out:
2574 seq_pad(seq, '\n');
2575 return 0;
2576}
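/* A minimal userspace sketch (illustrative only, not part of the kernel
 * build) showing how the /proc/net/tcp lines emitted by tcp4_seq_show()
 * and get_tcp4_sock() above can be parsed.  The address columns are the
 * raw __be32 values formatted with %08X, while the ports are already
 * converted with ntohs() before printing.
 */
#if 0
#include <stdio.h>
#include <arpa/inet.h>

int main(void)
{
	char line[512];
	unsigned int laddr, lport, raddr, rport, state;
	struct in_addr a;
	FILE *f = fopen("/proc/net/tcp", "r");

	if (!f || !fgets(line, sizeof(line), f))	/* open and skip header line */
		return 1;
	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, " %*d: %x:%x %x:%x %x",
			   &laddr, &lport, &raddr, &rport, &state) != 5)
			continue;
		/* sscanf() recovers the host integer that %08X printed, and
		 * storing it in s_addr reproduces the original __be32, so
		 * inet_ntoa() yields the expected dotted quad.
		 */
		a.s_addr = laddr;
		printf("%s:%u state %02X\n", inet_ntoa(a), lport, state);
	}
	fclose(f);
	return 0;
}
#endif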
2577
2578static const struct seq_operations tcp4_seq_ops = {
2579 .show = tcp4_seq_show,
2580 .start = tcp_seq_start,
2581 .next = tcp_seq_next,
2582 .stop = tcp_seq_stop,
2583};
2584
2585static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2586 .family = AF_INET,
2587};
2588
2589static int __net_init tcp4_proc_init_net(struct net *net)
2590{
2591 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2592 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2593 return -ENOMEM;
2594 return 0;
2595}
2596
2597static void __net_exit tcp4_proc_exit_net(struct net *net)
2598{
2599 remove_proc_entry("tcp", net->proc_net);
2600}
2601
2602static struct pernet_operations tcp4_net_ops = {
2603 .init = tcp4_proc_init_net,
2604 .exit = tcp4_proc_exit_net,
2605};
2606
2607int __init tcp4_proc_init(void)
2608{
2609 return register_pernet_subsys(&tcp4_net_ops);
2610}
2611
2612void tcp4_proc_exit(void)
2613{
2614 unregister_pernet_subsys(&tcp4_net_ops);
2615}
2616#endif /* CONFIG_PROC_FS */
2617
2618struct proto tcp_prot = {
2619 .name = "TCP",
2620 .owner = THIS_MODULE,
2621 .close = tcp_close,
2622 .pre_connect = tcp_v4_pre_connect,
2623 .connect = tcp_v4_connect,
2624 .disconnect = tcp_disconnect,
2625 .accept = inet_csk_accept,
2626 .ioctl = tcp_ioctl,
2627 .init = tcp_v4_init_sock,
2628 .destroy = tcp_v4_destroy_sock,
2629 .shutdown = tcp_shutdown,
2630 .setsockopt = tcp_setsockopt,
2631 .getsockopt = tcp_getsockopt,
2632 .keepalive = tcp_set_keepalive,
2633 .recvmsg = tcp_recvmsg,
2634 .sendmsg = tcp_sendmsg,
2635 .sendpage = tcp_sendpage,
2636 .backlog_rcv = tcp_v4_do_rcv,
2637 .release_cb = tcp_release_cb,
2638 .hash = inet_hash,
2639 .unhash = inet_unhash,
2640 .get_port = inet_csk_get_port,
2641 .enter_memory_pressure = tcp_enter_memory_pressure,
2642 .leave_memory_pressure = tcp_leave_memory_pressure,
2643 .stream_memory_free = tcp_stream_memory_free,
2644 .sockets_allocated = &tcp_sockets_allocated,
2645 .orphan_count = &tcp_orphan_count,
2646 .memory_allocated = &tcp_memory_allocated,
2647 .memory_pressure = &tcp_memory_pressure,
2648 .sysctl_mem = sysctl_tcp_mem,
2649 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2650 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2651 .max_header = MAX_TCP_HEADER,
2652 .obj_size = sizeof(struct tcp_sock),
2653 .slab_flags = SLAB_TYPESAFE_BY_RCU,
2654 .twsk_prot = &tcp_timewait_sock_ops,
2655 .rsk_prot = &tcp_request_sock_ops,
2656 .h.hashinfo = &tcp_hashinfo,
2657 .no_autobind = true,
2658#ifdef CONFIG_COMPAT
2659 .compat_setsockopt = compat_tcp_setsockopt,
2660 .compat_getsockopt = compat_tcp_getsockopt,
2661#endif
2662 .diag_destroy = tcp_abort,
2663};
2664EXPORT_SYMBOL(tcp_prot);
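/* For illustration only (userspace, not part of the kernel build): an
 * ordinary AF_INET stream socket is backed by tcp_prot, so the calls below
 * map onto the handlers in the table above, e.g. connect(2) reaches
 * tcp_v4_connect() via .connect and close(2) reaches tcp_close() via .close.
 * The port and address are arbitrary example values.
 */
#if 0
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in addr;
	int fd = socket(AF_INET, SOCK_STREAM, 0);	/* backed by tcp_prot */

	if (fd < 0)
		return 1;
	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_port = htons(80);			/* example port */
	addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);	/* 127.0.0.1 */
	connect(fd, (struct sockaddr *)&addr, sizeof(addr));
	close(fd);
	return 0;
}
#endif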
2665
2666static void __net_exit tcp_sk_exit(struct net *net)
2667{
2668 int cpu;
2669
2670 if (net->ipv4.tcp_congestion_control)
2671 module_put(net->ipv4.tcp_congestion_control->owner);
2672
2673 for_each_possible_cpu(cpu)
2674 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2675 free_percpu(net->ipv4.tcp_sk);
2676}
2677
2678static int __net_init tcp_sk_init(struct net *net)
2679{
2680 int res, cpu, cnt;
2681
2682 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2683 if (!net->ipv4.tcp_sk)
2684 return -ENOMEM;
2685
2686 for_each_possible_cpu(cpu) {
2687 struct sock *sk;
2688
2689 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2690 IPPROTO_TCP, net);
2691 if (res)
2692 goto fail;
2693 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2694
2695		/* Enforce IP_DF and IPID == 0 for RST and ACK packets
2696		 * sent in SYN-RECV and TIME-WAIT state.
2697 */
2698 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2699
2700 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2701 }
2702
2703 net->ipv4.sysctl_tcp_ecn = 2;
2704 net->ipv4.sysctl_tcp_ecn_fallback = 1;
2705
2706 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2707 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2708 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2709 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2710 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2711
2712 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2713 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2714 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2715
2716 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2717 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2718 net->ipv4.sysctl_tcp_syncookies = 1;
2719 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2720 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2721 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2722 net->ipv4.sysctl_tcp_orphan_retries = 0;
2723 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2724 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2725 net->ipv4.sysctl_tcp_tw_reuse = 2;
2726
2727 cnt = tcp_hashinfo.ehash_mask + 1;
2728 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2729 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2730
2731 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2732 net->ipv4.sysctl_tcp_sack = 1;
2733 net->ipv4.sysctl_tcp_window_scaling = 1;
2734 net->ipv4.sysctl_tcp_timestamps = 1;
2735 net->ipv4.sysctl_tcp_early_retrans = 3;
2736 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2737 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
2738 net->ipv4.sysctl_tcp_retrans_collapse = 1;
2739 net->ipv4.sysctl_tcp_max_reordering = 300;
2740 net->ipv4.sysctl_tcp_dsack = 1;
2741 net->ipv4.sysctl_tcp_app_win = 31;
2742 net->ipv4.sysctl_tcp_adv_win_scale = 1;
2743 net->ipv4.sysctl_tcp_frto = 2;
2744 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2745 /* This limits the percentage of the congestion window which we
2746 * will allow a single TSO frame to consume. Building TSO frames
2747 * which are too large can cause TCP streams to be bursty.
2748 */
2749 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2750 /* Default TSQ limit of 16 TSO segments */
2751 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2752	/* RFC 5961 challenge ACK rate limiting */
2753 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2754 net->ipv4.sysctl_tcp_min_tso_segs = 2;
2755 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2756 net->ipv4.sysctl_tcp_autocorking = 1;
2757 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2758 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2759 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2760 if (net != &init_net) {
2761 memcpy(net->ipv4.sysctl_tcp_rmem,
2762 init_net.ipv4.sysctl_tcp_rmem,
2763 sizeof(init_net.ipv4.sysctl_tcp_rmem));
2764 memcpy(net->ipv4.sysctl_tcp_wmem,
2765 init_net.ipv4.sysctl_tcp_wmem,
2766 sizeof(init_net.ipv4.sysctl_tcp_wmem));
2767 }
2768 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2769 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2770 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2771 spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2772 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2773 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2774
2775 /* Reno is always built in */
2776 if (!net_eq(net, &init_net) &&
2777 try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2778 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2779 else
2780 net->ipv4.tcp_congestion_control = &tcp_reno;
2781
2782 return 0;
2783fail:
2784 tcp_sk_exit(net);
2785
2786 return res;
2787}
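/* Illustrative userspace sketch (not part of the kernel build): the
 * per-netns defaults initialised above are exposed under
 * /proc/sys/net/ipv4/, so e.g. sysctl_tcp_syn_retries can be read back
 * from /proc/sys/net/ipv4/tcp_syn_retries in the corresponding network
 * namespace.
 */
#if 0
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_syn_retries", "r");
	int val;

	if (f && fscanf(f, "%d", &val) == 1)
		printf("tcp_syn_retries = %d\n", val);	/* default: TCP_SYN_RETRIES */
	if (f)
		fclose(f);
	return 0;
}
#endif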
2788
2789static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2790{
2791 struct net *net;
2792
2793 inet_twsk_purge(&tcp_hashinfo, AF_INET);
2794
2795 list_for_each_entry(net, net_exit_list, exit_list)
2796 tcp_fastopen_ctx_destroy(net);
2797}
2798
2799static struct pernet_operations __net_initdata tcp_sk_ops = {
2800 .init = tcp_sk_init,
2801 .exit = tcp_sk_exit,
2802 .exit_batch = tcp_sk_exit_batch,
2803};
2804
2805void __init tcp_v4_init(void)
2806{
2807 if (register_pernet_subsys(&tcp_sk_ops))
2808 panic("Failed to create the TCP control socket.\n");
2809}