Blame - marvell/linux/net/ipv4/tcp_input.c - T108

blob: dccb150e539818f240edef43e214a8fc86219e05 [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* INET An implementation of the TCP/IP protocol suite for the LINUX
				4	* operating system. INET is implemented using the BSD Socket
				5	* interface as the means of communication with the user level.
				6	*
				7	* Implementation of the Transmission Control Protocol(TCP).
				8	*
				9	* Authors: Ross Biro
				10	* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
				11	* Mark Evans, <evansmp@uhura.aston.ac.uk>
				12	* Corey Minyard <wf-rch!minyard@relay.EU.net>
				13	* Florian La Roche, <flla@stud.uni-sb.de>
				14	* Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
				15	* Linus Torvalds, <torvalds@cs.helsinki.fi>
				16	* Alan Cox, <gw4pts@gw4pts.ampr.org>
				17	* Matthew Dillon, <dillon@apollo.west.oic.com>
				18	* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
				19	* Jorge Cwik, <jorge@laser.satlink.net>
				20	*/
				21
				22	/*
				23	* Changes:
				24	* Pedro Roque : Fast Retransmit/Recovery.
				25	* Two receive queues.
				26	* Retransmit queue handled by TCP.
				27	* Better retransmit timer handling.
				28	* New congestion avoidance.
				29	* Header prediction.
				30	* Variable renaming.
				31	*
				32	* Eric : Fast Retransmit.
				33	* Randy Scott : MSS option defines.
				34	* Eric Schenk : Fixes to slow start algorithm.
				35	* Eric Schenk : Yet another double ACK bug.
				36	* Eric Schenk : Delayed ACK bug fixes.
				37	* Eric Schenk : Floyd style fast retrans war avoidance.
				38	* David S. Miller : Don't allow zero congestion window.
				39	* Eric Schenk : Fix retransmitter so that it sends
				40	* next packet on ack of previous packet.
				41	* Andi Kleen : Moved open_request checking here
				42	* and process RSTs for open_requests.
				43	* Andi Kleen : Better prune_queue, and other fixes.
				44	* Andrey Savochkin: Fix RTT measurements in the presence of
				45	* timestamps.
				46	* Andrey Savochkin: Check sequence numbers correctly when
				47	* removing SACKs due to in sequence incoming
				48	* data segments.
				49	* Andi Kleen: Make sure we never ack data there is not
				50	* enough room for. Also make this condition
				51	* a fatal error if it might still happen.
				52	* Andi Kleen: Add tcp_measure_rcv_mss to make
				53	* connections with MSS<min(MTU,ann. MSS)
				54	* work without delayed acks.
				55	* Andi Kleen: Process packets with PSH set in the
				56	* fast path.
				57	* J Hadi Salim: ECN support
				58	* Andrei Gurtov,
				59	* Pasi Sarolahti,
				60	* Panu Kuhlberg: Experimental audit of TCP (re)transmission
				61	* engine. Lots of bugs are found.
				62	* Pasi Sarolahti: F-RTO for dealing with spurious RTOs
				63	*/
				64
				65	#define pr_fmt(fmt) "TCP: " fmt
				66
				67	#include <linux/mm.h>
				68	#include <linux/slab.h>
				69	#include <linux/module.h>
				70	#include <linux/sysctl.h>
				71	#include <linux/kernel.h>
				72	#include <linux/prefetch.h>
				73	#include <net/dst.h>
				74	#include <net/tcp.h>
				75	#include <net/inet_common.h>
				76	#include <linux/ipsec.h>
				77	#include <asm/unaligned.h>
				78	#include <linux/errqueue.h>
				79	#include <trace/events/tcp.h>
				80	#include <linux/jump_label_ratelimit.h>
				81	#include <net/busy_poll.h>
				82	#include <trace/hooks/net.h>
				83
				84	int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
				85
				/* Per-ACK processing flags, OR-ed together while an incoming ACK is handled. */
				#define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
				#define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
				#define FLAG_DATA_ACKED		0x04 /* This ACK acknowledged new data.		*/
				#define FLAG_RETRANS_DATA_ACKED	0x08 /* "" "" some of which was retransmitted.	*/
				#define FLAG_SYN_ACKED		0x10 /* This ACK acknowledged SYN.		*/
				#define FLAG_DATA_SACKED	0x20 /* New SACK.				*/
				#define FLAG_ECE		0x40 /* ECE in this ACK				*/
				#define FLAG_LOST_RETRANS	0x80 /* This ACK marks some retransmission lost */
				#define FLAG_SLOWPATH		0x100 /* Do not skip RFC checks for window update.*/
				#define FLAG_ORIG_SACK_ACKED	0x200 /* Never retransmitted data are (s)acked	*/
				#define FLAG_SND_UNA_ADVANCED	0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
				#define FLAG_DSACKING_ACK	0x800 /* SACK blocks contained D-SACK info */
				#define FLAG_SET_XMIT_TIMER	0x1000 /* Set TLP or RTO timer */
				#define FLAG_SACK_RENEGING	0x2000 /* snd_una advanced to a sacked seq */
				#define FLAG_UPDATE_TS_RECENT	0x4000 /* tcp_replace_ts_recent() */
				#define FLAG_NO_CHALLENGE_ACK	0x8000 /* do not call tcp_send_challenge_ack()	*/
				#define FLAG_ACK_MAYBE_DELAYED	0x10000 /* Likely a delayed ACK */

				/* Composite masks built from the single-bit flags above. */
				#define FLAG_ACKED		(FLAG_DATA_ACKED|FLAG_SYN_ACKED)
				#define FLAG_NOT_DUP		(FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
				#define FLAG_CA_ALERT		(FLAG_DATA_SACKED|FLAG_ECE|FLAG_DSACKING_ACK)
				#define FLAG_FORWARD_PROGRESS	(FLAG_ACKED|FLAG_DATA_SACKED)

				/* TCP header flag masks used by tcp_measure_rcv_mss() and header prediction. */
				#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
				#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))

				#define REXMIT_NONE	0 /* no loss recovery to do */
				#define REXMIT_LOST	1 /* retransmit packets marked lost */
				#define REXMIT_NEW	2 /* FRTO-style transmit of unsent/new packets */
				115
				116	#if IS_ENABLED(CONFIG_TLS_DEVICE)
				117	static DEFINE_STATIC_KEY_DEFERRED_FALSE(clean_acked_data_enabled, HZ);
				118
				119	void clean_acked_data_enable(struct inet_connection_sock *icsk,
				120	void (cad)(struct sock sk, u32 ack_seq))
				121	{
				122	icsk->icsk_clean_acked = cad;
				123	static_branch_deferred_inc(&clean_acked_data_enabled);
				124	}
				125	EXPORT_SYMBOL_GPL(clean_acked_data_enable);
				126
				127	void clean_acked_data_disable(struct inet_connection_sock *icsk)
				128	{
				129	static_branch_slow_dec_deferred(&clean_acked_data_enabled);
				130	icsk->icsk_clean_acked = NULL;
				131	}
				132	EXPORT_SYMBOL_GPL(clean_acked_data_disable);
				133
				134	void clean_acked_data_flush(void)
				135	{
				136	static_key_deferred_flush(&clean_acked_data_enabled);
				137	}
				138	EXPORT_SYMBOL_GPL(clean_acked_data_flush);
				139	#endif
				140
				141	static void tcp_gro_dev_warn(struct sock sk, const struct sk_buff skb,
				142	unsigned int len)
				143	{
				144	static bool __once __read_mostly;
				145
				146	if (!__once) {
				147	struct net_device *dev;
				148
				149	__once = true;
				150
				151	rcu_read_lock();
				152	dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif);
				153	if (!dev \|\| len >= dev->mtu)
				154	pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
				155	dev ? dev->name : "Unknown driver");
				156	rcu_read_unlock();
				157	}
				158	}
				159
				160	/* Adapt the MSS value used to make delayed ack decision to the
				161	* real world.
				162	*/
				163	static void tcp_measure_rcv_mss(struct sock sk, const struct sk_buff skb)
				164	{
				165	struct inet_connection_sock *icsk = inet_csk(sk);
				166	const unsigned int lss = icsk->icsk_ack.last_seg_size;
				167	unsigned int len;
				168
				169	icsk->icsk_ack.last_seg_size = 0;
				170
				171	/* skb->len may jitter because of SACKs, even if peer
				172	* sends good full-sized frames.
				173	*/
				174	len = skb_shinfo(skb)->gso_size ? : skb->len;
				175	if (len >= icsk->icsk_ack.rcv_mss) {
				176	icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
				177	tcp_sk(sk)->advmss);
				178	/* Account for possibly-removed options */
				179	if (unlikely(len > icsk->icsk_ack.rcv_mss +
				180	MAX_TCP_OPTION_SPACE))
				181	tcp_gro_dev_warn(sk, skb, len);
				182	/* If the skb has a len of exactly 1*MSS and has the PSH bit
				183	* set then it is likely the end of an application write. So
				184	* more data may not be arriving soon, and yet the data sender
				185	* may be waiting for an ACK if cwnd-bound or using TX zero
				186	* copy. So we set ICSK_ACK_PUSHED here so that
				187	* tcp_cleanup_rbuf() will send an ACK immediately if the app
				188	* reads all of the data and is not ping-pong. If len > MSS
				189	* then this logic does not matter (and does not hurt) because
				190	* tcp_cleanup_rbuf() will always ACK immediately if the app
				191	* reads data and there is more than an MSS of unACKed data.
				192	*/
				193	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_PSH)
				194	icsk->icsk_ack.pending \|= ICSK_ACK_PUSHED;
				195	} else {
				196	/* Otherwise, we make more careful check taking into account,
				197	* that SACKs block is variable.
				198	*
				199	* "len" is invariant segment length, including TCP header.
				200	*/
				201	len += skb->data - skb_transport_header(skb);
				202	if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) \|\|
				203	/* If PSH is not set, packet should be
				204	* full sized, provided peer TCP is not badly broken.
				205	* This observation (if it is correct 8)) allows
				206	* to handle super-low mtu links fairly.
				207	*/
				208	(len >= TCP_MIN_MSS + sizeof(struct tcphdr) &&
				209	!(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) {
				210	/* Subtract also invariant (if peer is RFC compliant),
				211	* tcp header plus fixed timestamp option length.
				212	* Resulting "len" is MSS free of SACK jitter.
				213	*/
				214	len -= tcp_sk(sk)->tcp_header_len;
				215	icsk->icsk_ack.last_seg_size = len;
				216	if (len == lss) {
				217	icsk->icsk_ack.rcv_mss = len;
				218	return;
				219	}
				220	}
				221	if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)
				222	icsk->icsk_ack.pending \|= ICSK_ACK_PUSHED2;
				223	icsk->icsk_ack.pending \|= ICSK_ACK_PUSHED;
				224	}
				225	}
				226
				227	static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks)
				228	{
				229	struct inet_connection_sock *icsk = inet_csk(sk);
				230	unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
				231
				232	if (quickacks == 0)
				233	quickacks = 2;
				234	quickacks = min(quickacks, max_quickacks);
				235	if (quickacks > icsk->icsk_ack.quick)
				236	icsk->icsk_ack.quick = quickacks;
				237	}
				238
				239	static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
				240	{
				241	struct inet_connection_sock *icsk = inet_csk(sk);
				242
				243	tcp_incr_quickack(sk, max_quickacks);
				244	inet_csk_exit_pingpong_mode(sk);
				245	icsk->icsk_ack.ato = TCP_ATO_MIN;
				246	}
				247
				248	/* Send ACKs quickly, if "quick" count is not exhausted
				249	* and the session is not interactive.
				250	*/
				251
				252	static bool tcp_in_quickack_mode(struct sock *sk)
				253	{
				254	const struct inet_connection_sock *icsk = inet_csk(sk);
				255	const struct dst_entry *dst = __sk_dst_get(sk);
				256
				257	return (dst && dst_metric(dst, RTAX_QUICKACK)) \|\|
				258	(icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk));
				259	}
				260
				261	static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
				262	{
				263	if (tp->ecn_flags & TCP_ECN_OK)
				264	tp->ecn_flags \|= TCP_ECN_QUEUE_CWR;
				265	}
				266
				267	static void tcp_ecn_accept_cwr(struct sock sk, const struct sk_buff skb)
				268	{
				269	if (tcp_hdr(skb)->cwr) {
				270	tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
				271
				272	/* If the sender is telling us it has entered CWR, then its
				273	* cwnd may be very low (even just 1 packet), so we should ACK
				274	* immediately.
				275	*/
				276	if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq)
				277	inet_csk(sk)->icsk_ack.pending \|= ICSK_ACK_NOW;
				278	}
				279	}
				280
				281	static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
				282	{
				283	tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
				284	}
				285
				286	static void __tcp_ecn_check_ce(struct sock sk, const struct sk_buff skb)
				287	{
				288	struct tcp_sock *tp = tcp_sk(sk);
				289
				290	switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
				291	case INET_ECN_NOT_ECT:
				292	/* Funny extension: if ECT is not set on a segment,
				293	* and we already seen ECT on a previous segment,
				294	* it is probably a retransmit.
				295	*/
				296	if (tp->ecn_flags & TCP_ECN_SEEN)
				297	tcp_enter_quickack_mode(sk, 2);
				298	break;
				299	case INET_ECN_CE:
				300	if (tcp_ca_needs_ecn(sk))
				301	tcp_ca_event(sk, CA_EVENT_ECN_IS_CE);
				302
				303	if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
				304	/* Better not delay acks, sender can have a very low cwnd */
				305	tcp_enter_quickack_mode(sk, 2);
				306	tp->ecn_flags \|= TCP_ECN_DEMAND_CWR;
				307	}
				308	tp->ecn_flags \|= TCP_ECN_SEEN;
				309	break;
				310	default:
				311	if (tcp_ca_needs_ecn(sk))
				312	tcp_ca_event(sk, CA_EVENT_ECN_NO_CE);
				313	tp->ecn_flags \|= TCP_ECN_SEEN;
				314	break;
				315	}
				316	}
				317
				318	static void tcp_ecn_check_ce(struct sock sk, const struct sk_buff skb)
				319	{
				320	if (tcp_sk(sk)->ecn_flags & TCP_ECN_OK)
				321	__tcp_ecn_check_ce(sk, skb);
				322	}
				323
				324	static void tcp_ecn_rcv_synack(struct tcp_sock tp, const struct tcphdr th)
				325	{
				326	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece \|\| th->cwr))
				327	tp->ecn_flags &= ~TCP_ECN_OK;
				328	}
				329
				330	static void tcp_ecn_rcv_syn(struct tcp_sock tp, const struct tcphdr th)
				331	{
				332	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece \|\| !th->cwr))
				333	tp->ecn_flags &= ~TCP_ECN_OK;
				334	}
				335
				336	static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock tp, const struct tcphdr th)
				337	{
				338	if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
				339	return true;
				340	return false;
				341	}
				342
				343	/* Buffer size and advertised window tuning.
				344	*
				345	* 1. Tuning sk->sk_sndbuf, when connection enters established state.
				346	*/
				347
				348	static void tcp_sndbuf_expand(struct sock *sk)
				349	{
				350	const struct tcp_sock *tp = tcp_sk(sk);
				351	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
				352	int sndmem, per_mss;
				353	u32 nr_segs;
				354
				355	/* Worst case is non GSO/TSO : each frame consumes one skb
				356	* and skb->head is kmalloced using power of two area of memory
				357	*/
				358	per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
				359	MAX_TCP_HEADER +
				360	SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
				361
				362	per_mss = roundup_pow_of_two(per_mss) +
				363	SKB_DATA_ALIGN(sizeof(struct sk_buff));
				364
				365	nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
				366	nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
				367
				368	/* Fast Recovery (RFC 5681 3.2) :
				369	* Cubic needs 1.7 factor, rounded to 2 to include
				370	* extra cushion (application might react slowly to EPOLLOUT)
				371	*/
				372	sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2;
				373	sndmem = nr_segs per_mss;
				374
				375	if (sk->sk_sndbuf < sndmem)
				376	WRITE_ONCE(sk->sk_sndbuf,
				377	min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]));
				378	}
				379
				380	/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
				381	*
				382	* All tcp_full_space() is split to two parts: "network" buffer, allocated
				383	* forward and advertised in receiver window (tp->rcv_wnd) and
				384	* "application buffer", required to isolate scheduling/application
				385	* latencies from network.
				386	* window_clamp is maximal advertised window. It can be less than
				387	* tcp_full_space(), in this case tcp_full_space() - window_clamp
				388	* is reserved for "application" buffer. The less window_clamp is
				389	* the smoother our behaviour from viewpoint of network, but the lower
				390	* throughput and the higher sensitivity of the connection to losses. 8)
				391	*
				392	* rcv_ssthresh is more strict window_clamp used at "slow start"
				393	* phase to predict further behaviour of this connection.
				394	* It is used for two goals:
				395	* - to enforce header prediction at sender, even when application
				396	* requires some significant "application buffer". It is check #1.
				397	* - to prevent pruning of receive queue because of misprediction
				398	* of receiver window. Check #2.
				399	*
				400	* The scheme does not work when sender sends good segments opening
				401	* window and then starts to feed us spaghetti. But it should work
				402	* in common situations. Otherwise, we have to rely on queue collapsing.
				403	*/
				404
				405	/* Slow part of check#2. */
				406	static int __tcp_grow_window(const struct sock sk, const struct sk_buff skb)
				407	{
				408	struct tcp_sock *tp = tcp_sk(sk);
				409	/* Optimize this! */
				410	int truesize = tcp_win_from_space(sk, skb->truesize) >> 1;
				411	int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
				412
				413	while (tp->rcv_ssthresh <= window) {
				414	if (truesize <= skb->len)
				415	return 2 * inet_csk(sk)->icsk_ack.rcv_mss;
				416
				417	truesize >>= 1;
				418	window >>= 1;
				419	}
				420	return 0;
				421	}
				422
				423	static void tcp_grow_window(struct sock sk, const struct sk_buff skb)
				424	{
				425	struct tcp_sock *tp = tcp_sk(sk);
				426	int room;
				427
				428	room = min_t(int, tp->window_clamp, tcp_space(sk)) - tp->rcv_ssthresh;
				429
				430	/* Check #1 */
				431	if (room > 0 && !tcp_under_memory_pressure(sk)) {
				432	int incr;
				433
				434	/* Check #2. Increase window, if skb with such overhead
				435	* will fit to rcvbuf in future.
				436	*/
				437	if (tcp_win_from_space(sk, skb->truesize) <= skb->len)
				438	incr = 2 * tp->advmss;
				439	else
				440	incr = __tcp_grow_window(sk, skb);
				441
				442	if (incr) {
				443	incr = max_t(int, incr, 2 * skb->len);
				444	tp->rcv_ssthresh += min(room, incr);
				445	inet_csk(sk)->icsk_ack.quick \|= 1;
				446	}
				447	}
				448	}
				449
				450	/* 3. Try to fixup all. It is made immediately after connection enters
				451	* established state.
				452	*/
				453	void tcp_init_buffer_space(struct sock *sk)
				454	{
				455	int tcp_app_win = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_app_win);
				456	struct tcp_sock *tp = tcp_sk(sk);
				457	int maxwin;
				458
				459	if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
				460	tcp_sndbuf_expand(sk);
				461
				462	tcp_mstamp_refresh(tp);
				463	tp->rcvq_space.time = tp->tcp_mstamp;
				464	tp->rcvq_space.seq = tp->copied_seq;
				465
				466	maxwin = tcp_full_space(sk);
				467
				468	if (tp->window_clamp >= maxwin) {
				469	tp->window_clamp = maxwin;
				470
				471	if (tcp_app_win && maxwin > 4 * tp->advmss)
				472	tp->window_clamp = max(maxwin -
				473	(maxwin >> tcp_app_win),
				474	4 * tp->advmss);
				475	}
				476
				477	/* Force reservation of one segment. */
				478	if (tcp_app_win &&
				479	tp->window_clamp > 2 * tp->advmss &&
				480	tp->window_clamp + tp->advmss > maxwin)
				481	tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);
				482
				483	tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
				484	tp->snd_cwnd_stamp = tcp_jiffies32;
				485	tp->rcvq_space.space = min3(tp->rcv_ssthresh, tp->rcv_wnd,
				486	(u32)TCP_INIT_CWND * tp->advmss);
				487	}
				488
				489	/* 4. Recalculate window clamp after socket hit its memory bounds. */
				490	static void tcp_clamp_window(struct sock *sk)
				491	{
				492	struct tcp_sock *tp = tcp_sk(sk);
				493	struct inet_connection_sock *icsk = inet_csk(sk);
				494	struct net *net = sock_net(sk);
				495
				496	icsk->icsk_ack.quick = 0;
				497
				498	if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] &&
				499	!(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
				500	!tcp_under_memory_pressure(sk) &&
				501	sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
				502	WRITE_ONCE(sk->sk_rcvbuf,
				503	min(atomic_read(&sk->sk_rmem_alloc),
				504	net->ipv4.sysctl_tcp_rmem[2]));
				505	}
				506	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
				507	tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
				508	}
				509
				510	/* Initialize RCV_MSS value.
				511	* RCV_MSS is an our guess about MSS used by the peer.
				512	* We haven't any direct information about the MSS.
				513	* It's better to underestimate the RCV_MSS rather than overestimate.
				514	* Overestimations make us ACKing less frequently than needed.
				515	* Underestimations are more easy to detect and fix by tcp_measure_rcv_mss().
				516	*/
				517	void tcp_initialize_rcv_mss(struct sock *sk)
				518	{
				519	const struct tcp_sock *tp = tcp_sk(sk);
				520	unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
				521
				522	hint = min(hint, tp->rcv_wnd / 2);
				523	hint = min(hint, TCP_MSS_DEFAULT);
				524	hint = max(hint, TCP_MIN_MSS);
				525
				526	inet_csk(sk)->icsk_ack.rcv_mss = hint;
				527	}
				528	EXPORT_SYMBOL(tcp_initialize_rcv_mss);
				529
				530	/* Receiver "autotuning" code.
				531	*
				532	* The algorithm for RTT estimation w/o timestamps is based on
				533	* Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
				534	* <http://public.lanl.gov/radiant/pubs.html#DRS>
				535	*
				536	* More detail on this code can be found at
				537	* <http://staff.psc.edu/jheffner/>,
				538	* though this reference is out of date. A new paper
				539	* is pending.
				540	*/
				541	static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
				542	{
				543	u32 new_sample = tp->rcv_rtt_est.rtt_us;
				544	long m = sample;
				545
				546	if (new_sample != 0) {
				547	/* If we sample in larger samples in the non-timestamp
				548	* case, we could grossly overestimate the RTT especially
				549	* with chatty applications or bulk transfer apps which
				550	* are stalled on filesystem I/O.
				551	*
				552	* Also, since we are only going for a minimum in the
				553	* non-timestamp case, we do not smooth things out
				554	* else with timestamps disabled convergence takes too
				555	* long.
				556	*/
				557	if (!win_dep) {
				558	m -= (new_sample >> 3);
				559	new_sample += m;
				560	} else {
				561	m <<= 3;
				562	if (m < new_sample)
				563	new_sample = m;
				564	}
				565	} else {
				566	/* No previous measure. */
				567	new_sample = m << 3;
				568	}
				569
				570	tp->rcv_rtt_est.rtt_us = new_sample;
				571	}
				572
				573	static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
				574	{
				575	u32 delta_us;
				576
				577	if (tp->rcv_rtt_est.time == 0)
				578	goto new_measure;
				579	if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
				580	return;
				581	delta_us = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcv_rtt_est.time);
				582	if (!delta_us)
				583	delta_us = 1;
				584	tcp_rcv_rtt_update(tp, delta_us, 1);
				585
				586	new_measure:
				587	tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
				588	tp->rcv_rtt_est.time = tp->tcp_mstamp;
				589	}
				590
				591	static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
				592	const struct sk_buff *skb)
				593	{
				594	struct tcp_sock *tp = tcp_sk(sk);
				595
				596	if (tp->rx_opt.rcv_tsecr == tp->rcv_rtt_last_tsecr)
				597	return;
				598	tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;
				599
				600	if (TCP_SKB_CB(skb)->end_seq -
				601	TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) {
				602	u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
				603	u32 delta_us;
				604
				605	if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
				606	if (!delta)
				607	delta = 1;
				608	delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
				609	tcp_rcv_rtt_update(tp, delta_us, 0);
				610	}
				611	}
				612	}
				613
				614	/*
				615	* This function should be called every time data is copied to user space.
				616	* It calculates the appropriate TCP receive buffer space.
				617	*/
				618	void tcp_rcv_space_adjust(struct sock *sk)
				619	{
				620	struct tcp_sock *tp = tcp_sk(sk);
				621	u32 copied;
				622	int time;
				623
				624	trace_tcp_rcv_space_adjust(sk);
				625
				626	tcp_mstamp_refresh(tp);
				627	time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time);
				628	if (time < (tp->rcv_rtt_est.rtt_us >> 3) \|\| tp->rcv_rtt_est.rtt_us == 0)
				629	return;
				630
				631	/* Number of bytes copied to user in last RTT */
				632	copied = tp->copied_seq - tp->rcvq_space.seq;
				633	if (copied <= tp->rcvq_space.space)
				634	goto new_measure;
				635
				636	/* A bit of theory :
				637	* copied = bytes received in previous RTT, our base window
				638	* To cope with packet losses, we need a 2x factor
				639	* To cope with slow start, and sender growing its cwin by 100 %
				640	* every RTT, we need a 4x factor, because the ACK we are sending
				641	* now is for the next RTT, not the current one :
				642	* <prev RTT . ><current RTT .. ><next RTT .... >
				643	*/
				644
				645	if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
				646	!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
				647	int rcvmem, rcvbuf;
				648	u64 rcvwin, grow;
				649
				650	/* minimal window to cope with packet losses, assuming
				651	* steady state. Add some cushion because of small variations.
				652	*/
				653	rcvwin = ((u64)copied << 1) + 16 * tp->advmss;
				654
				655	/* Accommodate for sender rate increase (eg. slow start) */
				656	grow = rcvwin * (copied - tp->rcvq_space.space);
				657	do_div(grow, tp->rcvq_space.space);
				658	rcvwin += (grow << 1);
				659
				660	rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
				661	while (tcp_win_from_space(sk, rcvmem) < tp->advmss)
				662	rcvmem += 128;
				663
				664	do_div(rcvwin, tp->advmss);
				665	rcvbuf = min_t(u64, rcvwin * rcvmem,
				666	sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
				667	if (rcvbuf > sk->sk_rcvbuf) {
				668	WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
				669
				670	/* Make the window clamp follow along. */
				671	tp->window_clamp = tcp_win_from_space(sk, rcvbuf);
				672	}
				673	}
				674	tp->rcvq_space.space = copied;
				675
				676	new_measure:
				677	tp->rcvq_space.seq = tp->copied_seq;
				678	tp->rcvq_space.time = tp->tcp_mstamp;
				679	}
				680
				681	/* There is something which you must keep in mind when you analyze the
				682	* behavior of the tp->ato delayed ack timeout interval. When a
				683	* connection starts up, we want to ack as quickly as possible. The
				684	* problem is that "good" TCP's do slow start at the beginning of data
				685	* transmission. The means that until we send the first few ACK's the
				686	* sender will sit on his end and only queue most of his data, because
				687	* he can only send snd_cwnd unacked packets at any given time. For
				688	* each ACK we send, he increments snd_cwnd and transmits more of his
				689	* queue. -DaveM
				690	*/
				691	static void tcp_event_data_recv(struct sock sk, struct sk_buff skb)
				692	{
				693	struct tcp_sock *tp = tcp_sk(sk);
				694	struct inet_connection_sock *icsk = inet_csk(sk);
				695	u32 now;
				696
				697	inet_csk_schedule_ack(sk);
				698
				699	tcp_measure_rcv_mss(sk, skb);
				700
				701	tcp_rcv_rtt_measure(tp);
				702
				703	now = tcp_jiffies32;
				704
				705	if (!icsk->icsk_ack.ato) {
				706	/* The _first_ data packet received, initialize
				707	* delayed ACK engine.
				708	*/
				709	tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
				710	icsk->icsk_ack.ato = TCP_ATO_MIN;
				711	} else {
				712	int m = now - icsk->icsk_ack.lrcvtime;
				713
				714	if (m <= TCP_ATO_MIN / 2) {
				715	/* The fastest case is the first. */
				716	icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;
				717	} else if (m < icsk->icsk_ack.ato) {
				718	icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m;
				719	if (icsk->icsk_ack.ato > icsk->icsk_rto)
				720	icsk->icsk_ack.ato = icsk->icsk_rto;
				721	} else if (m > icsk->icsk_rto) {
				722	/* Too long gap. Apparently sender failed to
				723	* restart window, so that we send ACKs quickly.
				724	*/
				725	tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
				726	sk_mem_reclaim(sk);
				727	}
				728	}
				729	icsk->icsk_ack.lrcvtime = now;
				730
				731	tcp_ecn_check_ce(sk, skb);
				732
				733	if (skb->len >= 128)
				734	tcp_grow_window(sk, skb);
				735	}
				736
				737	/* Called to compute a smoothed rtt estimate. The data fed to this
				738	* routine either comes from timestamps, or from segments that were
				739	* known _not_ to have been retransmitted [see Karn/Partridge
				740	* Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
				741	* piece by Van Jacobson.
				742	* NOTE: the next three routines used to be one big routine.
				743	* To save cycles in the RFC 1323 implementation it was better to break
				744	* it up into three procedures. -- erics
				745	*/
				746	static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
				747	{
				748	struct tcp_sock *tp = tcp_sk(sk);
				749	long m = mrtt_us; /* RTT */
				750	u32 srtt = tp->srtt_us;
				751
				752	/* The following amusing code comes from Jacobson's
				753	* article in SIGCOMM '88. Note that rtt and mdev
				754	* are scaled versions of rtt and mean deviation.
				755	* This is designed to be as fast as possible
				756	* m stands for "measurement".
				757	*
				758	* On a 1990 paper the rto value is changed to:
				759	* RTO = rtt + 4 * mdev
				760	*
				761	* Funny. This algorithm seems to be very broken.
				762	* These formulae increase RTO, when it should be decreased, increase
				763	* too slowly, when it should be increased quickly, decrease too quickly
				764	* etc. I guess in BSD RTO takes ONE value, so that it is absolutely
				765	* does not matter how to _calculate_ it. Seems, it was trap
				766	* that VJ failed to avoid. 8)
				767	*/
				768	if (srtt != 0) {
				769	m -= (srtt >> 3); /* m is now error in rtt est */
				770	srtt += m; /* rtt = 7/8 rtt + 1/8 new */
				771	if (m < 0) {
				772	m = -m; /* m is now abs(error) */
				773	m -= (tp->mdev_us >> 2); /* similar update on mdev */
				774	/* This is similar to one of Eifel findings.
				775	* Eifel blocks mdev updates when rtt decreases.
				776	* This solution is a bit different: we use finer gain
				777	* for mdev in this case (alpha*beta).
				778	* Like Eifel it also prevents growth of rto,
				779	* but also it limits too fast rto decreases,
				780	* happening in pure Eifel.
				781	*/
				782	if (m > 0)
				783	m >>= 3;
				784	} else {
				785	m -= (tp->mdev_us >> 2); /* similar update on mdev */
				786	}
				787	tp->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */
				788	if (tp->mdev_us > tp->mdev_max_us) {
				789	tp->mdev_max_us = tp->mdev_us;
				790	if (tp->mdev_max_us > tp->rttvar_us)
				791	tp->rttvar_us = tp->mdev_max_us;
				792	}
				793	if (after(tp->snd_una, tp->rtt_seq)) {
				794	if (tp->mdev_max_us < tp->rttvar_us)
				795	tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
				796	tp->rtt_seq = tp->snd_nxt;
				797	tp->mdev_max_us = tcp_rto_min_us(sk);
				798
				799	tcp_bpf_rtt(sk);
				800	}
				801	} else {
				802	/* no previous measure. */
				803	srtt = m << 3; /* take the measured time to be rtt */
				804	tp->mdev_us = m << 1; /* make sure rto = 3rtt /
				805	tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
				806	tp->mdev_max_us = tp->rttvar_us;
				807	tp->rtt_seq = tp->snd_nxt;
				808
				809	tcp_bpf_rtt(sk);
				810	}
				811	tp->srtt_us = max(1U, srtt);
				812	}
				813
				814	static void tcp_update_pacing_rate(struct sock *sk)
				815	{
				816	const struct tcp_sock *tp = tcp_sk(sk);
				817	u64 rate;
				818
				819	/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
				820	rate = (u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3);
				821
				822	/* current rate is (cwnd * mss) / srtt
				823	* In Slow Start [1], set sk_pacing_rate to 200 % the current rate.
				824	* In Congestion Avoidance phase, set it to 120 % the current rate.
				825	*
				826	* [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh)
				827	* If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching
				828	* end of slow start and should slow down.
				829	*/
				830	if (tp->snd_cwnd < tp->snd_ssthresh / 2)
				831	rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio;
				832	else
				833	rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio;
				834
				835	rate *= max(tp->snd_cwnd, tp->packets_out);
				836
				837	if (likely(tp->srtt_us))
				838	do_div(rate, tp->srtt_us);
				839
				840	/* WRITE_ONCE() is needed because sch_fq fetches sk_pacing_rate
				841	* without any lock. We want to make sure compiler wont store
				842	* intermediate values in this location.
				843	*/
				844	WRITE_ONCE(sk->sk_pacing_rate, min_t(u64, rate,
				845	sk->sk_max_pacing_rate));
				846	}
				847
				848	/* Calculate rto without backoff. This is the second half of Van Jacobson's
				849	* routine referred to above.
				850	*/
				851	static void tcp_set_rto(struct sock *sk)
				852	{
				853	const struct tcp_sock *tp = tcp_sk(sk);
				854	/* Old crap is replaced with new one. 8)
				855	*
				856	* More seriously:
				857	* 1. If rtt variance happened to be less 50msec, it is hallucination.
				858	* It cannot be less due to utterly erratic ACK generation made
				859	* at least by solaris and freebsd. "Erratic ACKs" has _nothing_
				860	* to do with delayed acks, because at cwnd>2 true delack timeout
				861	* is invisible. Actually, Linux-2.4 also generates erratic
				862	* ACKs in some circumstances.
				863	*/
				864	inet_csk(sk)->icsk_rto = __tcp_set_rto(tp);
				865
				866	/* 2. Fixups made earlier cannot be right.
				867	* If we do not estimate RTO correctly without them,
				868	* all the algo is pure shit and should be replaced
				869	* with correct one. It is exactly, which we pretend to do.
				870	*/
				871
				872	/* NOTE: clamping at TCP_RTO_MIN is not required, current algo
				873	* guarantees that rto is higher.
				874	*/
				875	tcp_bound_rto(sk);
				876	}
				877
				878	__u32 tcp_init_cwnd(const struct tcp_sock tp, const struct dst_entry dst)
				879	{
				880	__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
				881
				882	if (!cwnd)
				883	cwnd = TCP_INIT_CWND;
				884	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
				885	}
				886
				887	/* Take a notice that peer is sending D-SACKs */
				888	static void tcp_dsack_seen(struct tcp_sock *tp)
				889	{
				890	tp->rx_opt.sack_ok \|= TCP_DSACK_SEEN;
				891	tp->rack.dsack_seen = 1;
				892	tp->dsack_dups++;
				893	}
				894
				895	/* It's reordering when higher sequence was delivered (i.e. sacked) before
				896	* some lower never-retransmitted sequence ("low_seq"). The maximum reordering
				897	* distance is approximated in full-mss packet distance ("reordering").
				898	*/
				899	static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq,
				900	const int ts)
				901	{
				902	struct tcp_sock *tp = tcp_sk(sk);
				903	const u32 mss = tp->mss_cache;
				904	u32 fack, metric;
				905
				906	fack = tcp_highest_sack_seq(tp);
				907	if (!before(low_seq, fack))
				908	return;
				909
				910	metric = fack - low_seq;
				911	if ((metric > tp->reordering * mss) && mss) {
				912	#if FASTRETRANS_DEBUG > 1
				913	pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
				914	tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
				915	tp->reordering,
				916	0,
				917	tp->sacked_out,
				918	tp->undo_marker ? tp->undo_retrans : 0);
				919	#endif
				920	tp->reordering = min_t(u32, (metric + mss - 1) / mss,
				921	READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering));
				922	}
				923
				924	/* This exciting event is worth to be remembered. 8) */
				925	tp->reord_seen++;
				926	NET_INC_STATS(sock_net(sk),
				927	ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER);
				928	}
				929
				930	/* This must be called before lost_out is incremented */
				931	static void tcp_verify_retransmit_hint(struct tcp_sock tp, struct sk_buff skb)
				932	{
				933	if ((!tp->retransmit_skb_hint && tp->retrans_out >= tp->lost_out) \|\|
				934	(tp->retransmit_skb_hint &&
				935	before(TCP_SKB_CB(skb)->seq,
				936	TCP_SKB_CB(tp->retransmit_skb_hint)->seq)))
				937	tp->retransmit_skb_hint = skb;
				938	}
				939
				940	/* Sum the number of packets on the wire we have marked as lost.
				941	* There are two cases we care about here:
				942	* a) Packet hasn't been marked lost (nor retransmitted),
				943	* and this is the first loss.
				944	* b) Packet has been marked both lost and retransmitted,
				945	* and this means we think it was lost again.
				946	*/
				947	static void tcp_sum_lost(struct tcp_sock tp, struct sk_buff skb)
				948	{
				949	__u8 sacked = TCP_SKB_CB(skb)->sacked;
				950
				951	if (!(sacked & TCPCB_LOST) \|\|
				952	((sacked & TCPCB_LOST) && (sacked & TCPCB_SACKED_RETRANS)))
				953	tp->lost += tcp_skb_pcount(skb);
				954	}
				955
				956	static void tcp_skb_mark_lost(struct tcp_sock tp, struct sk_buff skb)
				957	{
				958	if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST\|TCPCB_SACKED_ACKED))) {
				959	tcp_verify_retransmit_hint(tp, skb);
				960
				961	tp->lost_out += tcp_skb_pcount(skb);
				962	tcp_sum_lost(tp, skb);
				963	TCP_SKB_CB(skb)->sacked \|= TCPCB_LOST;
				964	}
				965	}
				966
				967	void tcp_skb_mark_lost_uncond_verify(struct tcp_sock tp, struct sk_buff skb)
				968	{
				969	tcp_verify_retransmit_hint(tp, skb);
				970
				971	tcp_sum_lost(tp, skb);
				972	if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST\|TCPCB_SACKED_ACKED))) {
				973	tp->lost_out += tcp_skb_pcount(skb);
				974	TCP_SKB_CB(skb)->sacked \|= TCPCB_LOST;
				975	}
				976	}
				977
				978	/* This procedure tags the retransmission queue when SACKs arrive.
				979	*
				980	* We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
				981	* Packets in queue with these bits set are counted in variables
				982	* sacked_out, retrans_out and lost_out, correspondingly.
				983	*
				984	* Valid combinations are:
				985	* Tag InFlight Description
				986	* 0 1 - orig segment is in flight.
				987	* S 0 - nothing flies, orig reached receiver.
				988	* L 0 - nothing flies, orig lost by net.
				989	* R 2 - both orig and retransmit are in flight.
				990	* L|R 1 - orig is lost, retransmit is in flight.
				991	* S|R 1 - orig reached receiver, retrans is still in flight.
				992	* (L|S|R is logically valid, it could occur when L|R is sacked,
				993	* but it is equivalent to plain S and code short-circuits it to S.
				994	* L|S is logically invalid, it would mean -1 packet in flight 8))
				995	*
				996	* These 6 states form finite state machine, controlled by the following events:
				997	* 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
				998	* 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
				999	* 3. Loss detection event of two flavors:
				1000	* A. Scoreboard estimator decided the packet is lost.
				1001	* A'. Reno "three dupacks" marks head of queue lost.
				1002	* B. SACK arrives sacking SND.NXT at the moment, when the
				1003	* segment was retransmitted.
				1004	* 4. D-SACK added new rule: D-SACK changes any tag to S.
				1005	*
				1006	* It is pleasant to note, that state diagram turns out to be commutative,
				1007	* so that we are allowed not to be bothered by order of our actions,
				1008	* when multiple events arrive simultaneously. (see the function below).
				1009	*
				1010	* Reordering detection.
				1011	* --------------------
				1012	* Reordering metric is maximal distance, which a packet can be displaced
				1013	* in packet stream. With SACKs we can estimate it:
				1014	*
				1015	* 1. SACK fills old hole and the corresponding segment was not
				1016	* ever retransmitted -> reordering. Alas, we cannot use it
				1017	* when segment was retransmitted.
				1018	* 2. The last flaw is solved with D-SACK. D-SACK arrives
				1019	* for retransmitted and already SACKed segment -> reordering..
				1020	* Both of these heuristics are not used in Loss state, when we cannot
				1021	* account for retransmits accurately.
				1022	*
				1023	* SACK block validation.
				1024	* ----------------------
				1025	*
				1026	* SACK block range validation checks that the received SACK block fits to
				1027	* the expected sequence limits, i.e., it is between SND.UNA and SND.NXT.
				1028	* Note that SND.UNA is not included to the range though being valid because
				1029	* it means that the receiver is rather inconsistent with itself reporting
				1030	* SACK reneging when it should advance SND.UNA. Such SACK block this is
				1031	* perfectly valid, however, in light of RFC2018 which explicitly states
				1032	* that "SACK block MUST reflect the newest segment. Even if the newest
				1033	* segment is going to be discarded ...", not that it looks very clever
				1034	* in case of head skb. Due to potential receiver driven attacks, we
				1035	* choose to avoid immediate execution of a walk in write queue due to
				1036	* reneging and defer head skb's loss recovery to standard loss recovery
				1037	* procedure that will eventually trigger (nothing forbids us doing this).
				1038	*
				1039	* Implements also blockage to start_seq wrap-around. Problem lies in the
				1040	* fact that though start_seq (s) is before end_seq (i.e., not reversed),
				1041	* there's no guarantee that it will be before snd_nxt (n). The problem
				1042	* happens when start_seq resides between end_seq wrap (e_w) and snd_nxt
				1043	* wrap (s_w):
				1044	*
				1045	*         <- outs wnd ->                          <- wrapzone ->
				1046	*         u     e      n                         u_w   e_w  s n_w
				1047	*         |     |      |                          |     |   |  |
				1048	* |<------------+------+----- TCP seqno space --------------+---------->|
				1049	* ...-- <2^31 ->|                                           |<--------...
				1050	* ...---- >2^31 ------>|                                    |<--------...
				1051	*
				1052	* Current code wouldn't be vulnerable but it's better still to discard such
				1053	* crazy SACK blocks. Doing this check for start_seq alone closes somewhat
				1054	* similar case (end_seq after snd_nxt wrap) as earlier reversed check in
				1055	* snd_nxt wrap -> snd_una region will then become "well defined", i.e.,
				1056	* equal to the ideal case (infinite seqno space without wrap caused issues).
				1057	*
				1058	* With D-SACK the lower bound is extended to cover sequence space below
				1059	* SND.UNA down to undo_marker, which is the last point of interest. Yet
				1060	* again, D-SACK block must not to go across snd_una (for the same reason as
				1061	* for the normal SACK blocks, explained above). But there all simplicity
				1062	* ends, TCP might receive valid D-SACKs below that. As long as they reside
				1063	* fully below undo_marker they do not affect behavior in anyway and can
				1064	* therefore be safely ignored. In rare cases (which are more or less
				1065	* theoretical ones), the D-SACK will nicely cross that boundary due to skb
				1066	* fragmentation and packet reordering past skb's retransmission. To consider
				1067	* them correctly, the acceptable range must be extended even more though
				1068	* the exact amount is rather hard to quantify. However, tp->max_window can
				1069	* be used as an exaggerated estimate.
				1070	*/
				1071	static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
				1072	u32 start_seq, u32 end_seq)
				1073	{
				1074	/* Too far in future, or reversed (interpretation is ambiguous) */
				1075	if (after(end_seq, tp->snd_nxt) \|\| !before(start_seq, end_seq))
				1076	return false;
				1077
				1078	/* Nasty start_seq wrap-around check (see comments above) */
				1079	if (!before(start_seq, tp->snd_nxt))
				1080	return false;
				1081
				1082	/* In outstanding window? ...This is valid exit for D-SACKs too.
				1083	* start_seq == snd_una is non-sensical (see comments above)
				1084	*/
				1085	if (after(start_seq, tp->snd_una))
				1086	return true;
				1087
				1088	if (!is_dsack \|\| !tp->undo_marker)
				1089	return false;
				1090
				1091	/* ...Then it's D-SACK, and must reside below snd_una completely */
				1092	if (after(end_seq, tp->snd_una))
				1093	return false;
				1094
				1095	if (!before(start_seq, tp->undo_marker))
				1096	return true;
				1097
				1098	/* Too old */
				1099	if (!after(end_seq, tp->undo_marker))
				1100	return false;
				1101
				1102	/* Undo_marker boundary crossing (overestimates a lot). Known already:
				1103	* start_seq < undo_marker and end_seq >= undo_marker.
				1104	*/
				1105	return !before(start_seq, end_seq - tp->max_window);
				1106	}
				1107
				1108	static bool tcp_check_dsack(struct sock sk, const struct sk_buff ack_skb,
				1109	struct tcp_sack_block_wire *sp, int num_sacks,
				1110	u32 prior_snd_una)
				1111	{
				1112	struct tcp_sock *tp = tcp_sk(sk);
				1113	u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
				1114	u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
				1115	bool dup_sack = false;
				1116
				1117	if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
				1118	dup_sack = true;
				1119	tcp_dsack_seen(tp);
				1120	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
				1121	} else if (num_sacks > 1) {
				1122	u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
				1123	u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);
				1124
				1125	if (!after(end_seq_0, end_seq_1) &&
				1126	!before(start_seq_0, start_seq_1)) {
				1127	dup_sack = true;
				1128	tcp_dsack_seen(tp);
				1129	NET_INC_STATS(sock_net(sk),
				1130	LINUX_MIB_TCPDSACKOFORECV);
				1131	}
				1132	}
				1133
				1134	/* D-SACK for already forgotten data... Do dumb counting. */
				1135	if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 &&
				1136	!after(end_seq_0, prior_snd_una) &&
				1137	after(end_seq_0, tp->undo_marker))
				1138	tp->undo_retrans--;
				1139
				1140	return dup_sack;
				1141	}
				1142
				1143	struct tcp_sacktag_state {
				1144	u32 reord;
				1145	/* Timestamps for earliest and latest never-retransmitted segment
				1146	* that was SACKed. RTO needs the earliest RTT to stay conservative,
				1147	* but congestion control should still get an accurate delay signal.
				1148	*/
				1149	u64 first_sackt;
				1150	u64 last_sackt;
				1151	struct rate_sample *rate;
				1152	int flag;
				1153	unsigned int mss_now;
				1154	};
				1155
				1156	/* Check if skb is fully within the SACK block. In presence of GSO skbs,
				1157	* the incoming SACK may not exactly match but we can find smaller MSS
				1158	* aligned portion of it that matches. Therefore we might need to fragment
				1159	* which may fail and creates some hassle (caller must handle error case
				1160	* returns).
				1161	*
				1162	* FIXME: this could be merged to shift decision code
				1163	*/
				1164	static int tcp_match_skb_to_sack(struct sock sk, struct sk_buff skb,
				1165	u32 start_seq, u32 end_seq)
				1166	{
				1167	int err;
				1168	bool in_sack;
				1169	unsigned int pkt_len;
				1170	unsigned int mss;
				1171
				1172	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
				1173	!before(end_seq, TCP_SKB_CB(skb)->end_seq);
				1174
				1175	if (tcp_skb_pcount(skb) > 1 && !in_sack &&
				1176	after(TCP_SKB_CB(skb)->end_seq, start_seq)) {
				1177	mss = tcp_skb_mss(skb);
				1178	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
				1179
				1180	if (!in_sack) {
				1181	pkt_len = start_seq - TCP_SKB_CB(skb)->seq;
				1182	if (pkt_len < mss)
				1183	pkt_len = mss;
				1184	} else {
				1185	pkt_len = end_seq - TCP_SKB_CB(skb)->seq;
				1186	if (pkt_len < mss)
				1187	return -EINVAL;
				1188	}
				1189
				1190	/* Round if necessary so that SACKs cover only full MSSes
				1191	* and/or the remaining small portion (if present)
				1192	*/
				1193	if (pkt_len > mss) {
				1194	unsigned int new_len = (pkt_len / mss) * mss;
				1195	if (!in_sack && new_len < pkt_len)
				1196	new_len += mss;
				1197	pkt_len = new_len;
				1198	}
				1199
				1200	if (pkt_len >= skb->len && !in_sack)
				1201	return 0;
				1202
				1203	err = tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
				1204	pkt_len, mss, GFP_ATOMIC);
				1205	if (err < 0)
				1206	return err;
				1207	}
				1208
				1209	return in_sack;
				1210	}
				1211
				1212	/* Mark the given newly-SACKed range as such, adjusting counters and hints. */
				1213	static u8 tcp_sacktag_one(struct sock *sk,
				1214	struct tcp_sacktag_state *state, u8 sacked,
				1215	u32 start_seq, u32 end_seq,
				1216	int dup_sack, int pcount,
				1217	u64 xmit_time)
				1218	{
				1219	struct tcp_sock *tp = tcp_sk(sk);
				1220
				1221	/* Account D-SACK for retransmitted packet. */
				1222	if (dup_sack && (sacked & TCPCB_RETRANS)) {
				1223	if (tp->undo_marker && tp->undo_retrans > 0 &&
				1224	after(end_seq, tp->undo_marker))
				1225	tp->undo_retrans = max_t(int, 0, tp->undo_retrans - pcount);
				1226	if ((sacked & TCPCB_SACKED_ACKED) &&
				1227	before(start_seq, state->reord))
				1228	state->reord = start_seq;
				1229	}
				1230
				1231	/* Nothing to do; acked frame is about to be dropped (was ACKed). */
				1232	if (!after(end_seq, tp->snd_una))
				1233	return sacked;
				1234
				1235	if (!(sacked & TCPCB_SACKED_ACKED)) {
				1236	tcp_rack_advance(tp, sacked, end_seq, xmit_time);
				1237
				1238	if (sacked & TCPCB_SACKED_RETRANS) {
				1239	/* If the segment is not tagged as lost,
				1240	* we do not clear RETRANS, believing
				1241	* that retransmission is still in flight.
				1242	*/
				1243	if (sacked & TCPCB_LOST) {
				1244	sacked &= ~(TCPCB_LOST\|TCPCB_SACKED_RETRANS);
				1245	tp->lost_out -= pcount;
				1246	tp->retrans_out -= pcount;
				1247	}
				1248	} else {
				1249	if (!(sacked & TCPCB_RETRANS)) {
				1250	/* New sack for not retransmitted frame,
				1251	* which was in hole. It is reordering.
				1252	*/
				1253	if (before(start_seq,
				1254	tcp_highest_sack_seq(tp)) &&
				1255	before(start_seq, state->reord))
				1256	state->reord = start_seq;
				1257
				1258	if (!after(end_seq, tp->high_seq))
				1259	state->flag \|= FLAG_ORIG_SACK_ACKED;
				1260	if (state->first_sackt == 0)
				1261	state->first_sackt = xmit_time;
				1262	state->last_sackt = xmit_time;
				1263	}
				1264
				1265	if (sacked & TCPCB_LOST) {
				1266	sacked &= ~TCPCB_LOST;
				1267	tp->lost_out -= pcount;
				1268	}
				1269	}
				1270
				1271	sacked \|= TCPCB_SACKED_ACKED;
				1272	state->flag \|= FLAG_DATA_SACKED;
				1273	tp->sacked_out += pcount;
				1274	tp->delivered += pcount; /* Out-of-order packets delivered */
				1275
				1276	/* Lost marker hint past SACKed? Tweak RFC3517 cnt */
				1277	if (tp->lost_skb_hint &&
				1278	before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
				1279	tp->lost_cnt_hint += pcount;
				1280	}
				1281
				1282	/* D-SACK. We can detect redundant retransmission in S\|R and plain R
				1283	* frames and clear it. undo_retrans is decreased above, L\|R frames
				1284	* are accounted above as well.
				1285	*/
				1286	if (dup_sack && (sacked & TCPCB_SACKED_RETRANS)) {
				1287	sacked &= ~TCPCB_SACKED_RETRANS;
				1288	tp->retrans_out -= pcount;
				1289	}
				1290
				1291	return sacked;
				1292	}
				1293
				1294	/* Shift newly-SACKed bytes from this skb to the immediately previous
				1295	* already-SACKed sk_buff. Mark the newly-SACKed bytes as such.
				1296	*/
				1297	static bool tcp_shifted_skb(struct sock sk, struct sk_buff prev,
				1298	struct sk_buff *skb,
				1299	struct tcp_sacktag_state *state,
				1300	unsigned int pcount, int shifted, int mss,
				1301	bool dup_sack)
				1302	{
				1303	struct tcp_sock *tp = tcp_sk(sk);
				1304	u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */
				1305	u32 end_seq = start_seq + shifted; /* end of newly-SACKed */
				1306
				1307	BUG_ON(!pcount);
				1308
				1309	/* Adjust counters and hints for the newly sacked sequence
				1310	* range but discard the return value since prev is already
				1311	* marked. We must tag the range first because the seq
				1312	* advancement below implicitly advances
				1313	* tcp_highest_sack_seq() when skb is highest_sack.
				1314	*/
				1315	tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
				1316	start_seq, end_seq, dup_sack, pcount,
				1317	tcp_skb_timestamp_us(skb));
				1318	tcp_rate_skb_delivered(sk, skb, state->rate);
				1319
				1320	if (skb == tp->lost_skb_hint)
				1321	tp->lost_cnt_hint += pcount;
				1322
				1323	TCP_SKB_CB(prev)->end_seq += shifted;
				1324	TCP_SKB_CB(skb)->seq += shifted;
				1325
				1326	tcp_skb_pcount_add(prev, pcount);
				1327	WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount);
				1328	tcp_skb_pcount_add(skb, -pcount);
				1329
				1330	/* When we're adding to gso_segs == 1, gso_size will be zero,
				1331	* in theory this shouldn't be necessary but as long as DSACK
				1332	* code can come after this skb later on it's better to keep
				1333	* setting gso_size to something.
				1334	*/
				1335	if (!TCP_SKB_CB(prev)->tcp_gso_size)
				1336	TCP_SKB_CB(prev)->tcp_gso_size = mss;
				1337
				1338	/* CHECKME: To clear or not to clear? Mimics normal skb currently */
				1339	if (tcp_skb_pcount(skb) <= 1)
				1340	TCP_SKB_CB(skb)->tcp_gso_size = 0;
				1341
				1342	/* Difference in this won't matter, both ACKed by the same cumul. ACK */
				1343	TCP_SKB_CB(prev)->sacked \|= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
				1344
				1345	if (skb->len > 0) {
				1346	BUG_ON(!tcp_skb_pcount(skb));
				1347	NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTED);
				1348	return false;
				1349	}
				1350
				1351	/* Whole SKB was eaten :-) */
				1352
				1353	if (skb == tp->retransmit_skb_hint)
				1354	tp->retransmit_skb_hint = prev;
				1355	if (skb == tp->lost_skb_hint) {
				1356	tp->lost_skb_hint = prev;
				1357	tp->lost_cnt_hint -= tcp_skb_pcount(prev);
				1358	}
				1359
				1360	TCP_SKB_CB(prev)->tcp_flags \|= TCP_SKB_CB(skb)->tcp_flags;
				1361	TCP_SKB_CB(prev)->eor = TCP_SKB_CB(skb)->eor;
				1362	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
				1363	TCP_SKB_CB(prev)->end_seq++;
				1364
				1365	if (skb == tcp_highest_sack(sk))
				1366	tcp_advance_highest_sack(sk, skb);
				1367
				1368	tcp_skb_collapse_tstamp(prev, skb);
				1369	if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp))
				1370	TCP_SKB_CB(prev)->tx.delivered_mstamp = 0;
				1371
				1372	tcp_rtx_queue_unlink_and_free(skb, sk);
				1373
				1374	NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED);
				1375
				1376	return true;
				1377	}
				1378
				1379	/* I wish gso_size would have a bit more sane initialization than
				1380	* something-or-zero which complicates things
				1381	*/
				1382	static int tcp_skb_seglen(const struct sk_buff *skb)
				1383	{
				1384	return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb);
				1385	}
				1386
				/* Shifting pages past head area doesn't work, so only a non-linear skb
				 * with an empty head area qualifies.
				 */
				static int skb_can_shift(const struct sk_buff *skb)
				{
					return skb_is_nonlinear(skb) && !skb_headlen(skb);
				}
				1392
				1393	int tcp_skb_shift(struct sk_buff to, struct sk_buff from,
				1394	int pcount, int shiftlen)
				1395	{
				1396	/* TCP min gso_size is 8 bytes (TCP_MIN_GSO_SIZE)
				1397	* Since TCP_SKB_CB(skb)->tcp_gso_segs is 16 bits, we need
				1398	* to make sure not storing more than 65535 * 8 bytes per skb,
				1399	* even if current MSS is bigger.
				1400	*/
				1401	if (unlikely(to->len + shiftlen >= 65535 * TCP_MIN_GSO_SIZE))
				1402	return 0;
				1403	if (unlikely(tcp_skb_pcount(to) + pcount > 65535))
				1404	return 0;
				1405	return skb_shift(to, from, shiftlen);
				1406	}
				1407
				1408	/* Try collapsing SACK blocks spanning across multiple skbs to a single
				1409	* skb.
				1410	*/
				1411	static struct sk_buff tcp_shift_skb_data(struct sock sk, struct sk_buff *skb,
				1412	struct tcp_sacktag_state *state,
				1413	u32 start_seq, u32 end_seq,
				1414	bool dup_sack)
				1415	{
				1416	struct tcp_sock *tp = tcp_sk(sk);
				1417	struct sk_buff *prev;
				1418	int mss;
				1419	int pcount = 0;
				1420	int len;
				1421	int in_sack;
				1422
				1423	/* Normally R but no L won't result in plain S */
				1424	if (!dup_sack &&
				1425	(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST\|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS)
				1426	goto fallback;
				1427	if (!skb_can_shift(skb))
				1428	goto fallback;
				1429	/* This frame is about to be dropped (was ACKed). */
				1430	if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
				1431	goto fallback;
				1432
				1433	/* Can only happen with delayed DSACK + discard craziness */
				1434	prev = skb_rb_prev(skb);
				1435	if (!prev)
				1436	goto fallback;
				1437
				1438	if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
				1439	goto fallback;
				1440
				1441	if (!tcp_skb_can_collapse_to(prev))
				1442	goto fallback;
				1443
				1444	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
				1445	!before(end_seq, TCP_SKB_CB(skb)->end_seq);
				1446
				1447	if (in_sack) {
				1448	len = skb->len;
				1449	pcount = tcp_skb_pcount(skb);
				1450	mss = tcp_skb_seglen(skb);
				1451
				1452	/* TODO: Fix DSACKs to not fragment already SACKed and we can
				1453	* drop this restriction as unnecessary
				1454	*/
				1455	if (mss != tcp_skb_seglen(prev))
				1456	goto fallback;
				1457	} else {
				1458	if (!after(TCP_SKB_CB(skb)->end_seq, start_seq))
				1459	goto noop;
				1460	/* CHECKME: This is non-MSS split case only?, this will
				1461	* cause skipped skbs due to advancing loop btw, original
				1462	* has that feature too
				1463	*/
				1464	if (tcp_skb_pcount(skb) <= 1)
				1465	goto noop;
				1466
				1467	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
				1468	if (!in_sack) {
				1469	/* TODO: head merge to next could be attempted here
				1470	* if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)),
				1471	* though it might not be worth of the additional hassle
				1472	*
				1473	* ...we can probably just fallback to what was done
				1474	* previously. We could try merging non-SACKed ones
				1475	* as well but it probably isn't going to buy off
				1476	* because later SACKs might again split them, and
				1477	* it would make skb timestamp tracking considerably
				1478	* harder problem.
				1479	*/
				1480	goto fallback;
				1481	}
				1482
				1483	len = end_seq - TCP_SKB_CB(skb)->seq;
				1484	BUG_ON(len < 0);
				1485	BUG_ON(len > skb->len);
				1486
				1487	/* MSS boundaries should be honoured or else pcount will
				1488	* severely break even though it makes things bit trickier.
				1489	* Optimize common case to avoid most of the divides
				1490	*/
				1491	mss = tcp_skb_mss(skb);
				1492
				1493	/* TODO: Fix DSACKs to not fragment already SACKed and we can
				1494	* drop this restriction as unnecessary
				1495	*/
				1496	if (mss != tcp_skb_seglen(prev))
				1497	goto fallback;
				1498
				1499	if (len == mss) {
				1500	pcount = 1;
				1501	} else if (len < mss) {
				1502	goto noop;
				1503	} else {
				1504	pcount = len / mss;
				1505	len = pcount * mss;
				1506	}
				1507	}
				1508
				1509	/* tcp_sacktag_one() won't SACK-tag ranges below snd_una */
				1510	if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una))
				1511	goto fallback;
				1512
				1513	if (!tcp_skb_shift(prev, skb, pcount, len))
				1514	goto fallback;
				1515	if (!tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, dup_sack))
				1516	goto out;
				1517
				1518	/* Hole filled allows collapsing with the next as well, this is very
				1519	* useful when hole on every nth skb pattern happens
				1520	*/
				1521	skb = skb_rb_next(prev);
				1522	if (!skb)
				1523	goto out;
				1524
				1525	if (!skb_can_shift(skb) \|\|
				1526	((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) \|\|
				1527	(mss != tcp_skb_seglen(skb)))
				1528	goto out;
				1529
				1530	len = skb->len;
				1531	pcount = tcp_skb_pcount(skb);
				1532	if (tcp_skb_shift(prev, skb, pcount, len))
				1533	tcp_shifted_skb(sk, prev, skb, state, pcount,
				1534	len, mss, 0);
				1535
				1536	out:
				1537	return prev;
				1538
				1539	noop:
				1540	return skb;
				1541
				1542	fallback:
				1543	NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK);
				1544	return NULL;
				1545	}
				1546
				1547	static struct sk_buff tcp_sacktag_walk(struct sk_buff skb, struct sock *sk,
				1548	struct tcp_sack_block *next_dup,
				1549	struct tcp_sacktag_state *state,
				1550	u32 start_seq, u32 end_seq,
				1551	bool dup_sack_in)
				1552	{
				1553	struct tcp_sock *tp = tcp_sk(sk);
				1554	struct sk_buff *tmp;
				1555
				1556	skb_rbtree_walk_from(skb) {
				1557	int in_sack = 0;
				1558	bool dup_sack = dup_sack_in;
				1559
				1560	/* queue is in-order => we can short-circuit the walk early */
				1561	if (!before(TCP_SKB_CB(skb)->seq, end_seq))
				1562	break;
				1563
				1564	if (next_dup &&
				1565	before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) {
				1566	in_sack = tcp_match_skb_to_sack(sk, skb,
				1567	next_dup->start_seq,
				1568	next_dup->end_seq);
				1569	if (in_sack > 0)
				1570	dup_sack = true;
				1571	}
				1572
				1573	/* skb reference here is a bit tricky to get right, since
				1574	* shifting can eat and free both this skb and the next,
				1575	* so not even _safe variant of the loop is enough.
				1576	*/
				1577	if (in_sack <= 0) {
				1578	tmp = tcp_shift_skb_data(sk, skb, state,
				1579	start_seq, end_seq, dup_sack);
				1580	if (tmp) {
				1581	if (tmp != skb) {
				1582	skb = tmp;
				1583	continue;
				1584	}
				1585
				1586	in_sack = 0;
				1587	} else {
				1588	in_sack = tcp_match_skb_to_sack(sk, skb,
				1589	start_seq,
				1590	end_seq);
				1591	}
				1592	}
				1593
				1594	if (unlikely(in_sack < 0))
				1595	break;
				1596
				1597	if (in_sack) {
				1598	TCP_SKB_CB(skb)->sacked =
				1599	tcp_sacktag_one(sk,
				1600	state,
				1601	TCP_SKB_CB(skb)->sacked,
				1602	TCP_SKB_CB(skb)->seq,
				1603	TCP_SKB_CB(skb)->end_seq,
				1604	dup_sack,
				1605	tcp_skb_pcount(skb),
				1606	tcp_skb_timestamp_us(skb));
				1607	tcp_rate_skb_delivered(sk, skb, state->rate);
				1608	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
				1609	list_del_init(&skb->tcp_tsorted_anchor);
				1610
				1611	if (!before(TCP_SKB_CB(skb)->seq,
				1612	tcp_highest_sack_seq(tp)))
				1613	tcp_advance_highest_sack(sk, skb);
				1614	}
				1615	}
				1616	return skb;
				1617	}
				1618
				1619	static struct sk_buff tcp_sacktag_bsearch(struct sock sk, u32 seq)
				1620	{
				1621	struct rb_node parent, *p = &sk->tcp_rtx_queue.rb_node;
				1622	struct sk_buff *skb;
				1623
				1624	while (*p) {
				1625	parent = *p;
				1626	skb = rb_to_skb(parent);
				1627	if (before(seq, TCP_SKB_CB(skb)->seq)) {
				1628	p = &parent->rb_left;
				1629	continue;
				1630	}
				1631	if (!before(seq, TCP_SKB_CB(skb)->end_seq)) {
				1632	p = &parent->rb_right;
				1633	continue;
				1634	}
				1635	return skb;
				1636	}
				1637	return NULL;
				1638	}
				1639
				1640	static struct sk_buff tcp_sacktag_skip(struct sk_buff skb, struct sock *sk,
				1641	u32 skip_to_seq)
				1642	{
				1643	if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq))
				1644	return skb;
				1645
				1646	return tcp_sacktag_bsearch(sk, skip_to_seq);
				1647	}
				1648
				1649	static struct sk_buff tcp_maybe_skipping_dsack(struct sk_buff skb,
				1650	struct sock *sk,
				1651	struct tcp_sack_block *next_dup,
				1652	struct tcp_sacktag_state *state,
				1653	u32 skip_to_seq)
				1654	{
				1655	if (!next_dup)
				1656	return skb;
				1657
				1658	if (before(next_dup->start_seq, skip_to_seq)) {
				1659	skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq);
				1660	skb = tcp_sacktag_walk(skb, sk, NULL, state,
				1661	next_dup->start_seq, next_dup->end_seq,
				1662	1);
				1663	}
				1664
				1665	return skb;
				1666	}
				1667
				1668	static int tcp_sack_cache_ok(const struct tcp_sock tp, const struct tcp_sack_block cache)
				1669	{
				1670	return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
				1671	}
				1672
				1673	static int
				1674	tcp_sacktag_write_queue(struct sock sk, const struct sk_buff ack_skb,
				1675	u32 prior_snd_una, struct tcp_sacktag_state *state)
				1676	{
				1677	struct tcp_sock *tp = tcp_sk(sk);
				1678	const unsigned char *ptr = (skb_transport_header(ack_skb) +
				1679	TCP_SKB_CB(ack_skb)->sacked);
				1680	struct tcp_sack_block_wire sp_wire = (struct tcp_sack_block_wire )(ptr+2);
				1681	struct tcp_sack_block sp[TCP_NUM_SACKS];
				1682	struct tcp_sack_block *cache;
				1683	struct sk_buff *skb;
				1684	int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3);
				1685	int used_sacks;
				1686	bool found_dup_sack = false;
				1687	int i, j;
				1688	int first_sack_index;
				1689
				1690	state->flag = 0;
				1691	state->reord = tp->snd_nxt;
				1692
				1693	if (!tp->sacked_out)
				1694	tcp_highest_sack_reset(sk);
				1695
				1696	found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
				1697	num_sacks, prior_snd_una);
				1698	if (found_dup_sack) {
				1699	state->flag \|= FLAG_DSACKING_ACK;
				1700	tp->delivered++; /* A spurious retransmission is delivered */
				1701	}
				1702
				1703	/* Eliminate too old ACKs, but take into
				1704	* account more or less fresh ones, they can
				1705	* contain valid SACK info.
				1706	*/
				1707	if (before(TCP_SKB_CB(ack_skb)->ack_seq, prior_snd_una - tp->max_window))
				1708	return 0;
				1709
				1710	if (!tp->packets_out)
				1711	goto out;
				1712
				1713	used_sacks = 0;
				1714	first_sack_index = 0;
				1715	for (i = 0; i < num_sacks; i++) {
				1716	bool dup_sack = !i && found_dup_sack;
				1717
				1718	sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq);
				1719	sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq);
				1720
				1721	if (!tcp_is_sackblock_valid(tp, dup_sack,
				1722	sp[used_sacks].start_seq,
				1723	sp[used_sacks].end_seq)) {
				1724	int mib_idx;
				1725
				1726	if (dup_sack) {
				1727	if (!tp->undo_marker)
				1728	mib_idx = LINUX_MIB_TCPDSACKIGNOREDNOUNDO;
				1729	else
				1730	mib_idx = LINUX_MIB_TCPDSACKIGNOREDOLD;
				1731	} else {
				1732	/* Don't count olds caused by ACK reordering */
				1733	if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) &&
				1734	!after(sp[used_sacks].end_seq, tp->snd_una))
				1735	continue;
				1736	mib_idx = LINUX_MIB_TCPSACKDISCARD;
				1737	}
				1738
				1739	NET_INC_STATS(sock_net(sk), mib_idx);
				1740	if (i == 0)
				1741	first_sack_index = -1;
				1742	continue;
				1743	}
				1744
				1745	/* Ignore very old stuff early */
				1746	if (!after(sp[used_sacks].end_seq, prior_snd_una)) {
				1747	if (i == 0)
				1748	first_sack_index = -1;
				1749	continue;
				1750	}
				1751
				1752	used_sacks++;
				1753	}
				1754
				1755	/* order SACK blocks to allow in order walk of the retrans queue */
				1756	for (i = used_sacks - 1; i > 0; i--) {
				1757	for (j = 0; j < i; j++) {
				1758	if (after(sp[j].start_seq, sp[j + 1].start_seq)) {
				1759	swap(sp[j], sp[j + 1]);
				1760
				1761	/* Track where the first SACK block goes to */
				1762	if (j == first_sack_index)
				1763	first_sack_index = j + 1;
				1764	}
				1765	}
				1766	}
				1767
				1768	state->mss_now = tcp_current_mss(sk);
				1769	skb = NULL;
				1770	i = 0;
				1771
				1772	if (!tp->sacked_out) {
				1773	/* It's already past, so skip checking against it */
				1774	cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
				1775	} else {
				1776	cache = tp->recv_sack_cache;
				1777	/* Skip empty blocks in at head of the cache */
				1778	while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq &&
				1779	!cache->end_seq)
				1780	cache++;
				1781	}
				1782
				1783	while (i < used_sacks) {
				1784	u32 start_seq = sp[i].start_seq;
				1785	u32 end_seq = sp[i].end_seq;
				1786	bool dup_sack = (found_dup_sack && (i == first_sack_index));
				1787	struct tcp_sack_block *next_dup = NULL;
				1788
				1789	if (found_dup_sack && ((i + 1) == first_sack_index))
				1790	next_dup = &sp[i + 1];
				1791
				1792	/* Skip too early cached blocks */
				1793	while (tcp_sack_cache_ok(tp, cache) &&
				1794	!before(start_seq, cache->end_seq))
				1795	cache++;
				1796
				1797	/* Can skip some work by looking recv_sack_cache? */
				1798	if (tcp_sack_cache_ok(tp, cache) && !dup_sack &&
				1799	after(end_seq, cache->start_seq)) {
				1800
				1801	/* Head todo? */
				1802	if (before(start_seq, cache->start_seq)) {
				1803	skb = tcp_sacktag_skip(skb, sk, start_seq);
				1804	skb = tcp_sacktag_walk(skb, sk, next_dup,
				1805	state,
				1806	start_seq,
				1807	cache->start_seq,
				1808	dup_sack);
				1809	}
				1810
				1811	/* Rest of the block already fully processed? */
				1812	if (!after(end_seq, cache->end_seq))
				1813	goto advance_sp;
				1814
				1815	skb = tcp_maybe_skipping_dsack(skb, sk, next_dup,
				1816	state,
				1817	cache->end_seq);
				1818
				1819	/* ...tail remains todo... */
				1820	if (tcp_highest_sack_seq(tp) == cache->end_seq) {
				1821	/* ...but better entrypoint exists! */
				1822	skb = tcp_highest_sack(sk);
				1823	if (!skb)
				1824	break;
				1825	cache++;
				1826	goto walk;
				1827	}
				1828
				1829	skb = tcp_sacktag_skip(skb, sk, cache->end_seq);
				1830	/* Check overlap against next cached too (past this one already) */
				1831	cache++;
				1832	continue;
				1833	}
				1834
				1835	if (!before(start_seq, tcp_highest_sack_seq(tp))) {
				1836	skb = tcp_highest_sack(sk);
				1837	if (!skb)
				1838	break;
				1839	}
				1840	skb = tcp_sacktag_skip(skb, sk, start_seq);
				1841
				1842	walk:
				1843	skb = tcp_sacktag_walk(skb, sk, next_dup, state,
				1844	start_seq, end_seq, dup_sack);
				1845
				1846	advance_sp:
				1847	i++;
				1848	}
				1849
				1850	/* Clear the head of the cache sack blocks so we can skip it next time */
				1851	for (i = 0; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) {
				1852	tp->recv_sack_cache[i].start_seq = 0;
				1853	tp->recv_sack_cache[i].end_seq = 0;
				1854	}
				1855	for (j = 0; j < used_sacks; j++)
				1856	tp->recv_sack_cache[i++] = sp[j];
				1857
				1858	if (inet_csk(sk)->icsk_ca_state != TCP_CA_Loss \|\| tp->undo_marker)
				1859	tcp_check_sack_reordering(sk, state->reord, 0);
				1860
				1861	tcp_verify_left_out(tp);
				1862	out:
				1863
				1864	#if FASTRETRANS_DEBUG > 0
				1865	WARN_ON((int)tp->sacked_out < 0);
				1866	WARN_ON((int)tp->lost_out < 0);
				1867	WARN_ON((int)tp->retrans_out < 0);
				1868	WARN_ON((int)tcp_packets_in_flight(tp) < 0);
				1869	#endif
				1870	return state->flag;
				1871	}
				1872
				1873	/* Limits sacked_out so that sum with lost_out isn't ever larger than
				1874	* packets_out. Returns false if sacked_out adjustement wasn't necessary.
				1875	*/
				1876	static bool tcp_limit_reno_sacked(struct tcp_sock *tp)
				1877	{
				1878	u32 holes;
				1879
				1880	holes = max(tp->lost_out, 1U);
				1881	holes = min(holes, tp->packets_out);
				1882
				1883	if ((tp->sacked_out + holes) > tp->packets_out) {
				1884	tp->sacked_out = tp->packets_out - holes;
				1885	return true;
				1886	}
				1887	return false;
				1888	}
				1889
				1890	/* If we receive more dupacks than we expected counting segments
				1891	* in assumption of absent reordering, interpret this as reordering.
				1892	* The only another reason could be bug in receiver TCP.
				1893	*/
				1894	static void tcp_check_reno_reordering(struct sock *sk, const int addend)
				1895	{
				1896	struct tcp_sock *tp = tcp_sk(sk);
				1897
				1898	if (!tcp_limit_reno_sacked(tp))
				1899	return;
				1900
				1901	tp->reordering = min_t(u32, tp->packets_out + addend,
				1902	READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering));
				1903	tp->reord_seen++;
				1904	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER);
				1905	}
				1906
				1907	/* Emulate SACKs for SACKless connection: account for a new dupack. */
				1908
				1909	static void tcp_add_reno_sack(struct sock *sk, int num_dupack, bool ece_ack)
				1910	{
				1911	if (num_dupack) {
				1912	struct tcp_sock *tp = tcp_sk(sk);
				1913	u32 prior_sacked = tp->sacked_out;
				1914	s32 delivered;
				1915
				1916	tp->sacked_out += num_dupack;
				1917	tcp_check_reno_reordering(sk, 0);
				1918	delivered = tp->sacked_out - prior_sacked;
				1919	if (delivered > 0)
				1920	tp->delivered += delivered;
				1921	tcp_verify_left_out(tp);
				1922	}
				1923	}
				1924
				1925	/* Account for ACK, ACKing some data in Reno Recovery phase. */
				1926
				1927	static void tcp_remove_reno_sacks(struct sock *sk, int acked, bool ece_ack)
				1928	{
				1929	struct tcp_sock *tp = tcp_sk(sk);
				1930
				1931	if (acked > 0) {
				1932	/* One ACK acked hole. The rest eat duplicate ACKs. */
				1933	tp->delivered += max_t(int, acked - tp->sacked_out, 1);
				1934	if (acked - 1 >= tp->sacked_out)
				1935	tp->sacked_out = 0;
				1936	else
				1937	tp->sacked_out -= acked - 1;
				1938	}
				1939	tcp_check_reno_reordering(sk, acked);
				1940	tcp_verify_left_out(tp);
				1941	}
				1942
				1943	static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
				1944	{
				1945	tp->sacked_out = 0;
				1946	}
				1947
				1948	void tcp_clear_retrans(struct tcp_sock *tp)
				1949	{
				1950	tp->retrans_out = 0;
				1951	tp->lost_out = 0;
				1952	tp->undo_marker = 0;
				1953	tp->undo_retrans = -1;
				1954	tp->sacked_out = 0;
				1955	}
				1956
				1957	static inline void tcp_init_undo(struct tcp_sock *tp)
				1958	{
				1959	tp->undo_marker = tp->snd_una;
				1960
				1961	/* Retransmission still in flight may cause DSACKs later. */
				1962	/* First, account for regular retransmits in flight: */
				1963	tp->undo_retrans = tp->retrans_out;
				1964	/* Next, account for TLP retransmits in flight: */
				1965	if (tp->tlp_high_seq && tp->tlp_retrans)
				1966	tp->undo_retrans++;
				1967	/* Finally, avoid 0, because undo_retrans==0 means "can undo now": */
				1968	if (!tp->undo_retrans)
				1969	tp->undo_retrans = -1;
				1970	}
				1971
				1972	static bool tcp_is_rack(const struct sock *sk)
				1973	{
				1974	return READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) &
				1975	TCP_RACK_LOSS_DETECTION;
				1976	}
				1977
				1978	/* If we detect SACK reneging, forget all SACK information
				1979	* and reset tags completely, otherwise preserve SACKs. If receiver
				1980	* dropped its ofo queue, we will know this due to reneging detection.
				1981	*/
				1982	static void tcp_timeout_mark_lost(struct sock *sk)
				1983	{
				1984	struct tcp_sock *tp = tcp_sk(sk);
				1985	struct sk_buff skb, head;
				1986	bool is_reneg; /* is receiver reneging on SACKs? */
				1987
				1988	head = tcp_rtx_queue_head(sk);
				1989	is_reneg = head && (TCP_SKB_CB(head)->sacked & TCPCB_SACKED_ACKED);
				1990	if (is_reneg) {
				1991	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
				1992	tp->sacked_out = 0;
				1993	/* Mark SACK reneging until we recover from this loss event. */
				1994	tp->is_sack_reneg = 1;
				1995	} else if (tcp_is_reno(tp)) {
				1996	tcp_reset_reno_sack(tp);
				1997	}
				1998
				1999	skb = head;
				2000	skb_rbtree_walk_from(skb) {
				2001	if (is_reneg)
				2002	TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
				2003	else if (tcp_is_rack(sk) && skb != head &&
				2004	tcp_rack_skb_timeout(tp, skb, 0) > 0)
				2005	continue; /* Don't mark recently sent ones lost yet */
				2006	tcp_mark_skb_lost(sk, skb);
				2007	}
				2008	tcp_verify_left_out(tp);
				2009	tcp_clear_all_retrans_hints(tp);
				2010	}
				2011
				2012	/* Enter Loss state. */
				2013	void tcp_enter_loss(struct sock *sk)
				2014	{
				2015	const struct inet_connection_sock *icsk = inet_csk(sk);
				2016	struct tcp_sock *tp = tcp_sk(sk);
				2017	struct net *net = sock_net(sk);
				2018	bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
				2019	u8 reordering;
				2020
				2021	tcp_timeout_mark_lost(sk);
				2022
				2023	/* Reduce ssthresh if it has not yet been made inside this window. */
				2024	if (icsk->icsk_ca_state <= TCP_CA_Disorder \|\|
				2025	!after(tp->high_seq, tp->snd_una) \|\|
				2026	(icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
				2027	tp->prior_ssthresh = tcp_current_ssthresh(sk);
				2028	tp->prior_cwnd = tp->snd_cwnd;
				2029	tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
				2030	tcp_ca_event(sk, CA_EVENT_LOSS);
				2031	tcp_init_undo(tp);
				2032	}
				2033	tp->snd_cwnd = tcp_packets_in_flight(tp) + 1;
				2034	tp->snd_cwnd_cnt = 0;
				2035	tp->snd_cwnd_stamp = tcp_jiffies32;
				2036
				2037	/* Timeout in disordered state after receiving substantial DUPACKs
				2038	* suggests that the degree of reordering is over-estimated.
				2039	*/
				2040	reordering = READ_ONCE(net->ipv4.sysctl_tcp_reordering);
				2041	if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
				2042	tp->sacked_out >= reordering)
				2043	tp->reordering = min_t(unsigned int, tp->reordering,
				2044	reordering);
				2045
				2046	tcp_set_ca_state(sk, TCP_CA_Loss);
				2047	tp->high_seq = tp->snd_nxt;
				2048	tp->tlp_high_seq = 0;
				2049	tcp_ecn_queue_cwr(tp);
				2050
				2051	/* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
				2052	* loss recovery is underway except recurring timeout(s) on
				2053	* the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
				2054	*/
				2055	tp->frto = READ_ONCE(net->ipv4.sysctl_tcp_frto) &&
				2056	(new_recovery \|\| icsk->icsk_retransmits) &&
				2057	!inet_csk(sk)->icsk_mtup.probe_size;
				2058	}
				2059
				2060	/* If ACK arrived pointing to a remembered SACK, it means that our
				2061	* remembered SACKs do not reflect real state of receiver i.e.
				2062	* receiver _host_ is heavily congested (or buggy).
				2063	*
				2064	* To avoid big spurious retransmission bursts due to transient SACK
				2065	* scoreboard oddities that look like reneging, we give the receiver a
				2066	* little time (max(RTT/2, 10ms)) to send us some more ACKs that will
				2067	* restore sanity to the SACK scoreboard. If the apparent reneging
				2068	* persists until this RTO then we'll clear the SACK scoreboard.
				2069	*/
				2070	static bool tcp_check_sack_reneging(struct sock sk, int ack_flag)
				2071	{
				2072	if (*ack_flag & FLAG_SACK_RENEGING &&
				2073	*ack_flag & FLAG_SND_UNA_ADVANCED) {
				2074	struct tcp_sock *tp = tcp_sk(sk);
				2075	unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4),
				2076	msecs_to_jiffies(10));
				2077
				2078	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
				2079	delay, TCP_RTO_MAX);
				2080	*ack_flag &= ~FLAG_SET_XMIT_TIMER;
				2081	return true;
				2082	}
				2083	return false;
				2084	}
				2085
				2086	/* Heurestics to calculate number of duplicate ACKs. There's no dupACKs
				2087	* counter when SACK is enabled (without SACK, sacked_out is used for
				2088	* that purpose).
				2089	*
				2090	* With reordering, holes may still be in flight, so RFC3517 recovery
				2091	* uses pure sacked_out (total number of SACKed segments) even though
				2092	* it violates the RFC that uses duplicate ACKs, often these are equal
				2093	* but when e.g. out-of-window ACKs or packet duplication occurs,
				2094	* they differ. Since neither occurs due to loss, TCP should really
				2095	* ignore them.
				2096	*/
				2097	static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
				2098	{
				2099	return tp->sacked_out + 1;
				2100	}
				2101
				2102	/* Linux NewReno/SACK/ECN state machine.
				2103	* --------------------------------------
				2104	*
				2105	* "Open" Normal state, no dubious events, fast path.
				2106	* "Disorder" In all the respects it is "Open",
				2107	* but requires a bit more attention. It is entered when
				2108	* we see some SACKs or dupacks. It is split of "Open"
				2109	* mainly to move some processing from fast path to slow one.
				2110	* "CWR" CWND was reduced due to some Congestion Notification event.
				2111	* It can be ECN, ICMP source quench, local device congestion.
				2112	* "Recovery" CWND was reduced, we are fast-retransmitting.
				2113	* "Loss" CWND was reduced due to RTO timeout or SACK reneging.
				2114	*
				2115	* tcp_fastretrans_alert() is entered:
				2116	* - each incoming ACK, if state is not "Open"
				2117	* - when arrived ACK is unusual, namely:
				2118	* * SACK
				2119	* * Duplicate ACK.
				2120	* * ECN ECE.
				2121	*
				2122	* Counting packets in flight is pretty simple.
				2123	*
				2124	* in_flight = packets_out - left_out + retrans_out
				2125	*
				2126	* packets_out is SND.NXT-SND.UNA counted in packets.
				2127	*
				2128	* retrans_out is number of retransmitted segments.
				2129	*
				2130	* left_out is number of segments left network, but not ACKed yet.
				2131	*
				2132	* left_out = sacked_out + lost_out
				2133	*
				2134	* sacked_out: Packets, which arrived to receiver out of order
				2135	* and hence not ACKed. With SACKs this number is simply
				2136	* amount of SACKed data. Even without SACKs
				2137	* it is easy to give pretty reliable estimate of this number,
				2138	* counting duplicate ACKs.
				2139	*
				2140	* lost_out: Packets lost by network. TCP has no explicit
				2141	* "loss notification" feedback from network (for now).
				2142	* It means that this number can be only _guessed_.
				2143	* Actually, it is the heuristics to predict lossage that
				2144	* distinguishes different algorithms.
				2145	*
				2146	* F.e. after RTO, when all the queue is considered as lost,
				2147	* lost_out = packets_out and in_flight = retrans_out.
				2148	*
				2149	* Essentially, we have now a few algorithms detecting
				2150	* lost packets.
				2151	*
				2152	* If the receiver supports SACK:
				2153	*
				2154	* RFC6675/3517: It is the conventional algorithm. A packet is
				2155	* considered lost if the number of higher sequence packets
				2156	* SACKed is greater than or equal to the DUPACK threshold
				2157	* (reordering). This is implemented in tcp_mark_head_lost and
				2158	* tcp_update_scoreboard.
				2159	*
				2160	* RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm
				2161	* (2017-) that checks timing instead of counting DUPACKs.
				2162	* Essentially a packet is considered lost if it's not S/ACKed
				2163	* after RTT + reordering_window, where both metrics are
				2164	* dynamically measured and adjusted. This is implemented in
				2165	* tcp_rack_mark_lost.
				2166	*
				2167	* If the receiver does not support SACK:
				2168	*
				2169	* NewReno (RFC6582): in Recovery we assume that one segment
				2170	* is lost (classic Reno). While we are in Recovery and
				2171	* a partial ACK arrives, we assume that one more packet
				2172	* is lost (NewReno). These heuristics are the same in NewReno
				2173	* and SACK.
				2174	*
				2175	* Really tricky (and requiring careful tuning) part of algorithm
				2176	* is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
				2177	* The first determines the moment _when_ we should reduce CWND and,
				2178	* hence, slow down forward transmission. In fact, it determines the moment
				2179	* when we decide that hole is caused by loss, rather than by a reorder.
				2180	*
				2181	* tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill
				2182	* holes, caused by lost packets.
				2183	*
				2184	* And the most logically complicated part of algorithm is undo
				2185	* heuristics. We detect false retransmits due to both too early
				2186	* fast retransmit (reordering) and underestimated RTO, analyzing
				2187	* timestamps and D-SACKs. When we detect that some segments were
				2188	* retransmitted by mistake and CWND reduction was wrong, we undo
				2189	* window reduction and abort recovery phase. This logic is hidden
				2190	* inside several functions named tcp_try_undo_<something>.
				2191	*/
				2192
				2193	/* This function decides, when we should leave Disordered state
				2194	* and enter Recovery phase, reducing congestion window.
				2195	*
				2196	* Main question: may we further continue forward transmission
				2197	* with the same cwnd?
				2198	*/
				2199	static bool tcp_time_to_recover(struct sock *sk, int flag)
				2200	{
				2201	struct tcp_sock *tp = tcp_sk(sk);
				2202
				2203	/* Trick#1: The loss is proven. */
				2204	if (tp->lost_out)
				2205	return true;
				2206
				2207	/* Not-A-Trick#2 : Classic rule... */
				2208	if (!tcp_is_rack(sk) && tcp_dupack_heuristics(tp) > tp->reordering)
				2209	return true;
				2210
				2211	return false;
				2212	}
				2213
				2214	/* Detect loss in event "A" above by marking head of queue up as lost.
				2215	* For RFC3517 SACK, a segment is considered lost if it
				2216	* has at least tp->reordering SACKed seqments above it; "packets" refers to
				2217	* the maximum SACKed segments to pass before reaching this limit.
				2218	*/
				2219	static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
				2220	{
				2221	struct tcp_sock *tp = tcp_sk(sk);
				2222	struct sk_buff *skb;
				2223	int cnt;
				2224	/* Use SACK to deduce losses of new sequences sent during recovery */
				2225	const u32 loss_high = tp->snd_nxt;
				2226
				2227	WARN_ON(packets > tp->packets_out);
				2228	skb = tp->lost_skb_hint;
				2229	if (skb) {
				2230	/* Head already handled? */
				2231	if (mark_head && after(TCP_SKB_CB(skb)->seq, tp->snd_una))
				2232	return;
				2233	cnt = tp->lost_cnt_hint;
				2234	} else {
				2235	skb = tcp_rtx_queue_head(sk);
				2236	cnt = 0;
				2237	}
				2238
				2239	skb_rbtree_walk_from(skb) {
				2240	/* TODO: do this better */
				2241	/* this is not the most efficient way to do this... */
				2242	tp->lost_skb_hint = skb;
				2243	tp->lost_cnt_hint = cnt;
				2244
				2245	if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
				2246	break;
				2247
				2248	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
				2249	cnt += tcp_skb_pcount(skb);
				2250
				2251	if (cnt > packets)
				2252	break;
				2253
				2254	tcp_skb_mark_lost(tp, skb);
				2255
				2256	if (mark_head)
				2257	break;
				2258	}
				2259	tcp_verify_left_out(tp);
				2260	}
				2261
				2262	/* Account newly detected lost packet(s) */
				2263
				2264	static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
				2265	{
				2266	struct tcp_sock *tp = tcp_sk(sk);
				2267
				2268	if (tcp_is_sack(tp)) {
				2269	int sacked_upto = tp->sacked_out - tp->reordering;
				2270	if (sacked_upto >= 0)
				2271	tcp_mark_head_lost(sk, sacked_upto, 0);
				2272	else if (fast_rexmit)
				2273	tcp_mark_head_lost(sk, 1, 1);
				2274	}
				2275	}
				2276
				2277	static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when)
				2278	{
				2279	return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
				2280	before(tp->rx_opt.rcv_tsecr, when);
				2281	}
				2282
				2283	/* skb is spurious retransmitted if the returned timestamp echo
				2284	* reply is prior to the skb transmission time
				2285	*/
				2286	static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp,
				2287	const struct sk_buff *skb)
				2288	{
				2289	return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) &&
				2290	tcp_tsopt_ecr_before(tp, tcp_skb_timestamp(skb));
				2291	}
				2292
				2293	/* Nothing was retransmitted or returned timestamp is less
				2294	* than timestamp of the first retransmission.
				2295	*/
				2296	static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
				2297	{
				2298	const struct sock sk = (const struct sock )tp;
				2299
				2300	if (tp->retrans_stamp &&
				2301	tcp_tsopt_ecr_before(tp, tp->retrans_stamp))
				2302	return true; /* got echoed TS before first retransmission */
				2303
				2304	/* Check if nothing was retransmitted (retrans_stamp==0), which may
				2305	* happen in fast recovery due to TSQ. But we ignore zero retrans_stamp
				2306	* in TCP_SYN_SENT, since when we set FLAG_SYN_ACKED we also clear
				2307	* retrans_stamp even if we had retransmitted the SYN.
				2308	*/
				2309	if (!tp->retrans_stamp && /* no record of a retransmit/SYN? */
				2310	sk->sk_state != TCP_SYN_SENT) /* not the FLAG_SYN_ACKED case? */
				2311	return true; /* nothing was retransmitted */
				2312
				2313	return false;
				2314	}
				2315
				2316	/* Undo procedures. */
				2317
				2318	/* We can clear retrans_stamp when there are no retransmissions in the
				2319	* window. It would seem that it is trivially available for us in
				2320	* tp->retrans_out, however, that kind of assumptions doesn't consider
				2321	* what will happen if errors occur when sending retransmission for the
				2322	* second time. ...It could the that such segment has only
				2323	* TCPCB_EVER_RETRANS set at the present time. It seems that checking
				2324	* the head skb is enough except for some reneging corner cases that
				2325	* are not worth the effort.
				2326	*
				2327	* Main reason for all this complexity is the fact that connection dying
				2328	* time now depends on the validity of the retrans_stamp, in particular,
				2329	* that successive retransmissions of a segment must not advance
				2330	* retrans_stamp under any conditions.
				2331	*/
				2332	static bool tcp_any_retrans_done(const struct sock *sk)
				2333	{
				2334	const struct tcp_sock *tp = tcp_sk(sk);
				2335	struct sk_buff *skb;
				2336
				2337	if (tp->retrans_out)
				2338	return true;
				2339
				2340	skb = tcp_rtx_queue_head(sk);
				2341	if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
				2342	return true;
				2343
				2344	return false;
				2345	}
				2346
				2347	/* If loss recovery is finished and there are no retransmits out in the
				2348	* network, then we clear retrans_stamp so that upon the next loss recovery
				2349	* retransmits_timed_out() and timestamp-undo are using the correct value.
				2350	*/
				2351	static void tcp_retrans_stamp_cleanup(struct sock *sk)
				2352	{
				2353	if (!tcp_any_retrans_done(sk))
				2354	tcp_sk(sk)->retrans_stamp = 0;
				2355	}
				2356
				/* Debug helper: when FASTRETRANS_DEBUG > 1, log an undo event tagged
				 * with @msg together with the peer address/port and the socket's cwnd,
				 * left_out, ssthresh, prior_ssthresh and packets_out. Compiles to an
				 * empty function otherwise.
				 */
				static void DBGUNDO(struct sock *sk, const char *msg)
				{
				#if FASTRETRANS_DEBUG > 1
					struct tcp_sock *tp = tcp_sk(sk);
					struct inet_sock *inet = inet_sk(sk);

					if (sk->sk_family == AF_INET) {
						pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
							 msg,
							 &inet->inet_daddr, ntohs(inet->inet_dport),
							 tp->snd_cwnd, tcp_left_out(tp),
							 tp->snd_ssthresh, tp->prior_ssthresh,
							 tp->packets_out);
					}
				#if IS_ENABLED(CONFIG_IPV6)
					else if (sk->sk_family == AF_INET6) {
						pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
							 msg,
							 &sk->sk_v6_daddr, ntohs(inet->inet_dport),
							 tp->snd_cwnd, tcp_left_out(tp),
							 tp->snd_ssthresh, tp->prior_ssthresh,
							 tp->packets_out);
					}
				#endif
				#endif
				}
				2383
				2384	static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
				2385	{
				2386	struct tcp_sock *tp = tcp_sk(sk);
				2387
				2388	if (unmark_loss) {
				2389	struct sk_buff *skb;
				2390
				2391	skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
				2392	TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
				2393	}
				2394	tp->lost_out = 0;
				2395	tcp_clear_all_retrans_hints(tp);
				2396	}
				2397
				2398	if (tp->prior_ssthresh) {
				2399	const struct inet_connection_sock *icsk = inet_csk(sk);
				2400
				2401	tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
				2402
				2403	if (tp->prior_ssthresh > tp->snd_ssthresh) {
				2404	tp->snd_ssthresh = tp->prior_ssthresh;
				2405	tcp_ecn_withdraw_cwr(tp);
				2406	}
				2407	}
				2408	tp->snd_cwnd_stamp = tcp_jiffies32;
				2409	tp->undo_marker = 0;
				2410	tp->rack.advanced = 1; /* Force RACK to re-exam losses */
				2411	}
				2412
				2413	static inline bool tcp_may_undo(const struct tcp_sock *tp)
				2414	{
				2415	return tp->undo_marker && (!tp->undo_retrans \|\| tcp_packet_delayed(tp));
				2416	}
				2417
				2418	static bool tcp_is_non_sack_preventing_reopen(struct sock *sk)
				2419	{
				2420	struct tcp_sock *tp = tcp_sk(sk);
				2421
				2422	if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
				2423	/* Hold old state until something above high_seq
				2424	* is ACKed. For Reno it is MUST to prevent false
				2425	* fast retransmits (RFC2582). SACK TCP is safe. */
				2426	if (!tcp_any_retrans_done(sk))
				2427	tp->retrans_stamp = 0;
				2428	return true;
				2429	}
				2430	return false;
				2431	}
				2432
				2433	/* People celebrate: "We love our President!" */
				2434	static bool tcp_try_undo_recovery(struct sock *sk)
				2435	{
				2436	struct tcp_sock *tp = tcp_sk(sk);
				2437
				2438	if (tcp_may_undo(tp)) {
				2439	int mib_idx;
				2440
				2441	/* Happy end! We did not retransmit anything
				2442	* or our original transmission succeeded.
				2443	*/
				2444	DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
				2445	tcp_undo_cwnd_reduction(sk, false);
				2446	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
				2447	mib_idx = LINUX_MIB_TCPLOSSUNDO;
				2448	else
				2449	mib_idx = LINUX_MIB_TCPFULLUNDO;
				2450
				2451	NET_INC_STATS(sock_net(sk), mib_idx);
				2452	} else if (tp->rack.reo_wnd_persist) {
				2453	tp->rack.reo_wnd_persist--;
				2454	}
				2455	if (tcp_is_non_sack_preventing_reopen(sk))
				2456	return true;
				2457	tcp_set_ca_state(sk, TCP_CA_Open);
				2458	tp->is_sack_reneg = 0;
				2459	return false;
				2460	}
				2461
				2462	/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */
				2463	static bool tcp_try_undo_dsack(struct sock *sk)
				2464	{
				2465	struct tcp_sock *tp = tcp_sk(sk);
				2466
				2467	if (tp->undo_marker && !tp->undo_retrans) {
				2468	tp->rack.reo_wnd_persist = min(TCP_RACK_RECOVERY_THRESH,
				2469	tp->rack.reo_wnd_persist + 1);
				2470	DBGUNDO(sk, "D-SACK");
				2471	tcp_undo_cwnd_reduction(sk, false);
				2472	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
				2473	return true;
				2474	}
				2475	return false;
				2476	}
				2477
				2478	/* Undo during loss recovery after partial ACK or using F-RTO. */
				2479	static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
				2480	{
				2481	struct tcp_sock *tp = tcp_sk(sk);
				2482
				2483	if (frto_undo \|\| tcp_may_undo(tp)) {
				2484	tcp_undo_cwnd_reduction(sk, true);
				2485
				2486	DBGUNDO(sk, "partial loss");
				2487	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
				2488	if (frto_undo)
				2489	NET_INC_STATS(sock_net(sk),
				2490	LINUX_MIB_TCPSPURIOUSRTOS);
				2491	inet_csk(sk)->icsk_retransmits = 0;
				2492	if (tcp_is_non_sack_preventing_reopen(sk))
				2493	return true;
				2494	if (frto_undo \|\| tcp_is_sack(tp)) {
				2495	tcp_set_ca_state(sk, TCP_CA_Open);
				2496	tp->is_sack_reneg = 0;
				2497	}
				2498	return true;
				2499	}
				2500	return false;
				2501	}
				2502
				2503	/* The cwnd reduction in CWR and Recovery uses the PRR algorithm in RFC 6937.
				2504	* It computes the number of packets to send (sndcnt) based on packets newly
				2505	* delivered:
				2506	* 1) If the packets in flight is larger than ssthresh, PRR spreads the
				2507	* cwnd reductions across a full RTT.
				2508	* 2) Otherwise PRR uses packet conservation to send as much as delivered.
				2509	* But when the retransmits are acked without further losses, PRR
				2510	* slow starts cwnd up to ssthresh to speed up the recovery.
				2511	*/
				2512	static void tcp_init_cwnd_reduction(struct sock *sk)
				2513	{
				2514	struct tcp_sock *tp = tcp_sk(sk);
				2515
				2516	tp->high_seq = tp->snd_nxt;
				2517	tp->tlp_high_seq = 0;
				2518	tp->snd_cwnd_cnt = 0;
				2519	tp->prior_cwnd = tp->snd_cwnd;
				2520	tp->prr_delivered = 0;
				2521	tp->prr_out = 0;
				2522	tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
				2523	tcp_ecn_queue_cwr(tp);
				2524	}
				2525
				2526	void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag)
				2527	{
				2528	struct tcp_sock *tp = tcp_sk(sk);
				2529	int sndcnt = 0;
				2530	int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
				2531
				2532	if (newly_acked_sacked <= 0 \|\| WARN_ON_ONCE(!tp->prior_cwnd))
				2533	return;
				2534
				2535	tp->prr_delivered += newly_acked_sacked;
				2536	if (delta < 0) {
				2537	u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
				2538	tp->prior_cwnd - 1;
				2539	sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
				2540	} else if ((flag & (FLAG_RETRANS_DATA_ACKED \| FLAG_LOST_RETRANS)) ==
				2541	FLAG_RETRANS_DATA_ACKED) {
				2542	sndcnt = min_t(int, delta,
				2543	max_t(int, tp->prr_delivered - tp->prr_out,
				2544	newly_acked_sacked) + 1);
				2545	} else {
				2546	sndcnt = min(delta, newly_acked_sacked);
				2547	}
				2548	/* Force a fast retransmit upon entering fast recovery */
				2549	sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1));
				2550	tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
				2551	}
				2552
				2553	static inline void tcp_end_cwnd_reduction(struct sock *sk)
				2554	{
				2555	struct tcp_sock *tp = tcp_sk(sk);
				2556
				2557	if (inet_csk(sk)->icsk_ca_ops->cong_control)
				2558	return;
				2559
				2560	/* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
				2561	if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH &&
				2562	(inet_csk(sk)->icsk_ca_state == TCP_CA_CWR \|\| tp->undo_marker)) {
				2563	tp->snd_cwnd = tp->snd_ssthresh;
				2564	tp->snd_cwnd_stamp = tcp_jiffies32;
				2565	}
				2566	tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
				2567	}
				2568
				2569	/* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */
				2570	void tcp_enter_cwr(struct sock *sk)
				2571	{
				2572	struct tcp_sock *tp = tcp_sk(sk);
				2573
				2574	tp->prior_ssthresh = 0;
				2575	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
				2576	tp->undo_marker = 0;
				2577	tcp_init_cwnd_reduction(sk);
				2578	tcp_set_ca_state(sk, TCP_CA_CWR);
				2579	}
				2580	}
				2581	EXPORT_SYMBOL(tcp_enter_cwr);
				2582
				2583	static void tcp_try_keep_open(struct sock *sk)
				2584	{
				2585	struct tcp_sock *tp = tcp_sk(sk);
				2586	int state = TCP_CA_Open;
				2587
				2588	if (tcp_left_out(tp) \|\| tcp_any_retrans_done(sk))
				2589	state = TCP_CA_Disorder;
				2590
				2591	if (inet_csk(sk)->icsk_ca_state != state) {
				2592	tcp_set_ca_state(sk, state);
				2593	tp->high_seq = tp->snd_nxt;
				2594	}
				2595	}
				2596
				2597	static void tcp_try_to_open(struct sock *sk, int flag)
				2598	{
				2599	struct tcp_sock *tp = tcp_sk(sk);
				2600
				2601	tcp_verify_left_out(tp);
				2602
				2603	if (!tcp_any_retrans_done(sk))
				2604	tp->retrans_stamp = 0;
				2605
				2606	if (flag & FLAG_ECE)
				2607	tcp_enter_cwr(sk);
				2608
				2609	if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
				2610	tcp_try_keep_open(sk);
				2611	}
				2612	}
				2613
				2614	static void tcp_mtup_probe_failed(struct sock *sk)
				2615	{
				2616	struct inet_connection_sock *icsk = inet_csk(sk);
				2617
				2618	icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
				2619	icsk->icsk_mtup.probe_size = 0;
				2620	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPFAIL);
				2621	}
				2622
				2623	static void tcp_mtup_probe_success(struct sock *sk)
				2624	{
				2625	struct tcp_sock *tp = tcp_sk(sk);
				2626	struct inet_connection_sock *icsk = inet_csk(sk);
				2627	u64 val;
				2628
				2629	tp->prior_ssthresh = tcp_current_ssthresh(sk);
				2630
				2631	val = (u64)tp->snd_cwnd * tcp_mss_to_mtu(sk, tp->mss_cache);
				2632	do_div(val, icsk->icsk_mtup.probe_size);
				2633	WARN_ON_ONCE((u32)val != val);
				2634	tp->snd_cwnd = max_t(u32, 1U, val);
				2635
				2636	tp->snd_cwnd_cnt = 0;
				2637	tp->snd_cwnd_stamp = tcp_jiffies32;
				2638	tp->snd_ssthresh = tcp_current_ssthresh(sk);
				2639
				2640	icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
				2641	icsk->icsk_mtup.probe_size = 0;
				2642	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
				2643	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS);
				2644	}
				2645
				2646	/* Do a simple retransmit without using the backoff mechanisms in
				2647	* tcp_timer. This is used for path mtu discovery.
				2648	* The socket is already locked here.
				2649	*/
				2650	void tcp_simple_retransmit(struct sock *sk)
				2651	{
				2652	const struct inet_connection_sock *icsk = inet_csk(sk);
				2653	struct tcp_sock *tp = tcp_sk(sk);
				2654	struct sk_buff *skb;
				2655	unsigned int mss = tcp_current_mss(sk);
				2656
				2657	skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
				2658	if (tcp_skb_seglen(skb) > mss &&
				2659	!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
				2660	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
				2661	TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
				2662	tp->retrans_out -= tcp_skb_pcount(skb);
				2663	}
				2664	tcp_skb_mark_lost_uncond_verify(tp, skb);
				2665	}
				2666	}
				2667
				2668	tcp_clear_retrans_hints_partial(tp);
				2669
				2670	if (!tp->lost_out)
				2671	return;
				2672
				2673	if (tcp_is_reno(tp))
				2674	tcp_limit_reno_sacked(tp);
				2675
				2676	tcp_verify_left_out(tp);
				2677
				2678	/* Don't muck with the congestion window here.
				2679	* Reason is that we do not increase amount of _data_
				2680	* in network, but units changed and effective
				2681	* cwnd/ssthresh really reduced now.
				2682	*/
				2683	if (icsk->icsk_ca_state != TCP_CA_Loss) {
				2684	tp->high_seq = tp->snd_nxt;
				2685	tp->snd_ssthresh = tcp_current_ssthresh(sk);
				2686	tp->prior_ssthresh = 0;
				2687	tp->undo_marker = 0;
				2688	tcp_set_ca_state(sk, TCP_CA_Loss);
				2689	}
				2690	tcp_xmit_retransmit_queue(sk);
				2691	}
				2692	EXPORT_SYMBOL(tcp_simple_retransmit);
				2693
				2694	void tcp_enter_recovery(struct sock *sk, bool ece_ack)
				2695	{
				2696	struct tcp_sock *tp = tcp_sk(sk);
				2697	int mib_idx;
				2698
				2699	/* Start the clock with our fast retransmit, for undo and ETIMEDOUT. */
				2700	tcp_retrans_stamp_cleanup(sk);
				2701
				2702	if (tcp_is_reno(tp))
				2703	mib_idx = LINUX_MIB_TCPRENORECOVERY;
				2704	else
				2705	mib_idx = LINUX_MIB_TCPSACKRECOVERY;
				2706
				2707	NET_INC_STATS(sock_net(sk), mib_idx);
				2708
				2709	tp->prior_ssthresh = 0;
				2710	tcp_init_undo(tp);
				2711
				2712	if (!tcp_in_cwnd_reduction(sk)) {
				2713	if (!ece_ack)
				2714	tp->prior_ssthresh = tcp_current_ssthresh(sk);
				2715	tcp_init_cwnd_reduction(sk);
				2716	}
				2717	tcp_set_ca_state(sk, TCP_CA_Recovery);
				2718	}
				2719
				2720	/* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
				2721	* recovered or spurious. Otherwise retransmits more on partial ACKs.
				2722	*/
				2723	static void tcp_process_loss(struct sock *sk, int flag, int num_dupack,
				2724	int *rexmit)
				2725	{
				2726	struct tcp_sock *tp = tcp_sk(sk);
				2727	bool recovered = !before(tp->snd_una, tp->high_seq);
				2728
				2729	if ((flag & FLAG_SND_UNA_ADVANCED \|\| rcu_access_pointer(tp->fastopen_rsk)) &&
				2730	tcp_try_undo_loss(sk, false))
				2731	return;
				2732
				2733	if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
				2734	/* Step 3.b. A timeout is spurious if not all data are
				2735	* lost, i.e., never-retransmitted data are (s)acked.
				2736	*/
				2737	if ((flag & FLAG_ORIG_SACK_ACKED) &&
				2738	tcp_try_undo_loss(sk, true))
				2739	return;
				2740
				2741	if (after(tp->snd_nxt, tp->high_seq)) {
				2742	if (flag & FLAG_DATA_SACKED \|\| num_dupack)
				2743	tp->frto = 0; /* Step 3.a. loss was real */
				2744	} else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
				2745	tp->high_seq = tp->snd_nxt;
				2746	/* Step 2.b. Try send new data (but deferred until cwnd
				2747	* is updated in tcp_ack()). Otherwise fall back to
				2748	* the conventional recovery.
				2749	*/
				2750	if (!tcp_write_queue_empty(sk) &&
				2751	after(tcp_wnd_end(tp), tp->snd_nxt)) {
				2752	*rexmit = REXMIT_NEW;
				2753	return;
				2754	}
				2755	tp->frto = 0;
				2756	}
				2757	}
				2758
				2759	if (recovered) {
				2760	/* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */
				2761	tcp_try_undo_recovery(sk);
				2762	return;
				2763	}
				2764	if (tcp_is_reno(tp)) {
				2765	/* A Reno DUPACK means new data in F-RTO step 2.b above are
				2766	* delivered. Lower inflight to clock out (re)tranmissions.
				2767	*/
				2768	if (after(tp->snd_nxt, tp->high_seq) && num_dupack)
				2769	tcp_add_reno_sack(sk, num_dupack, flag & FLAG_ECE);
				2770	else if (flag & FLAG_SND_UNA_ADVANCED)
				2771	tcp_reset_reno_sack(tp);
				2772	}
				2773	*rexmit = REXMIT_LOST;
				2774	}
				2775
				2776	static bool tcp_force_fast_retransmit(struct sock *sk)
				2777	{
				2778	struct tcp_sock *tp = tcp_sk(sk);
				2779
				2780	return after(tcp_highest_sack_seq(tp),
				2781	tp->snd_una + tp->reordering * tp->mss_cache);
				2782	}
				2783
				2784	/* Undo during fast recovery after partial ACK. */
				2785	static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una,
				2786	bool *do_lost)
				2787	{
				2788	struct tcp_sock *tp = tcp_sk(sk);
				2789
				2790	if (tp->undo_marker && tcp_packet_delayed(tp)) {
				2791	/* Plain luck! Hole if filled with delayed
				2792	* packet, rather than with a retransmit. Check reordering.
				2793	*/
				2794	tcp_check_sack_reordering(sk, prior_snd_una, 1);
				2795
				2796	/* We are getting evidence that the reordering degree is higher
				2797	* than we realized. If there are no retransmits out then we
				2798	* can undo. Otherwise we clock out new packets but do not
				2799	* mark more packets lost or retransmit more.
				2800	*/
				2801	if (tp->retrans_out)
				2802	return true;
				2803
				2804	if (!tcp_any_retrans_done(sk))
				2805	tp->retrans_stamp = 0;
				2806
				2807	DBGUNDO(sk, "partial recovery");
				2808	tcp_undo_cwnd_reduction(sk, true);
				2809	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
				2810	tcp_try_keep_open(sk);
				2811	} else {
				2812	/* Partial ACK arrived. Force fast retransmit. */
				2813	*do_lost = tcp_force_fast_retransmit(sk);
				2814	}
				2815	return false;
				2816	}
				2817
				2818	static void tcp_identify_packet_loss(struct sock sk, int ack_flag)
				2819	{
				2820	struct tcp_sock *tp = tcp_sk(sk);
				2821
				2822	if (tcp_rtx_queue_empty(sk))
				2823	return;
				2824
				2825	if (unlikely(tcp_is_reno(tp))) {
				2826	tcp_newreno_mark_lost(sk, *ack_flag & FLAG_SND_UNA_ADVANCED);
				2827	} else if (tcp_is_rack(sk)) {
				2828	u32 prior_retrans = tp->retrans_out;
				2829
				2830	if (tcp_rack_mark_lost(sk))
				2831	*ack_flag &= ~FLAG_SET_XMIT_TIMER;
				2832	if (prior_retrans > tp->retrans_out)
				2833	*ack_flag \|= FLAG_LOST_RETRANS;
				2834	}
				2835	}
				2836
				2837	/* Process an event, which can update packets-in-flight not trivially.
				2838	* Main goal of this function is to calculate new estimate for left_out,
				2839	* taking into account both packets sitting in receiver's buffer and
				2840	* packets lost by network.
				2841	*
				2842	* Besides that it updates the congestion state when packet loss or ECN
				2843	* is detected. But it does not reduce the cwnd, it is done by the
				2844	* congestion control later.
				2845	*
				2846	* It does _not_ decide what to send, it is made in function
				2847	* tcp_xmit_retransmit_queue().
				2848	*/
				2849	static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
				2850	int num_dupack, int ack_flag, int rexmit)
				2851	{
				2852	struct inet_connection_sock *icsk = inet_csk(sk);
				2853	struct tcp_sock *tp = tcp_sk(sk);
				2854	int fast_rexmit = 0, flag = *ack_flag;
				2855	bool ece_ack = flag & FLAG_ECE;
				2856	bool do_lost = num_dupack \|\| ((flag & FLAG_DATA_SACKED) &&
				2857	tcp_force_fast_retransmit(sk));
				2858
				2859	if (!tp->packets_out && tp->sacked_out)
				2860	tp->sacked_out = 0;
				2861
				2862	/* Now state machine starts.
				2863	* A. ECE, hence prohibit cwnd undoing, the reduction is required. */
				2864	if (ece_ack)
				2865	tp->prior_ssthresh = 0;
				2866
				2867	/* B. In all the states check for reneging SACKs. */
				2868	if (tcp_check_sack_reneging(sk, ack_flag))
				2869	return;
				2870
				2871	/* C. Check consistency of the current state. */
				2872	tcp_verify_left_out(tp);
				2873
				2874	/* D. Check state exit conditions. State can be terminated
				2875	* when high_seq is ACKed. */
				2876	if (icsk->icsk_ca_state == TCP_CA_Open) {
				2877	WARN_ON(tp->retrans_out != 0);
				2878	tp->retrans_stamp = 0;
				2879	} else if (!before(tp->snd_una, tp->high_seq)) {
				2880	switch (icsk->icsk_ca_state) {
				2881	case TCP_CA_CWR:
				2882	/* CWR is to be held something above high_seq
				2883	* is ACKed for CWR bit to reach receiver. */
				2884	if (tp->snd_una != tp->high_seq) {
				2885	tcp_end_cwnd_reduction(sk);
				2886	tcp_set_ca_state(sk, TCP_CA_Open);
				2887	}
				2888	break;
				2889
				2890	case TCP_CA_Recovery:
				2891	if (tcp_is_reno(tp))
				2892	tcp_reset_reno_sack(tp);
				2893	if (tcp_try_undo_recovery(sk))
				2894	return;
				2895	tcp_end_cwnd_reduction(sk);
				2896	break;
				2897	}
				2898	}
				2899
				2900	/* E. Process state. */
				2901	switch (icsk->icsk_ca_state) {
				2902	case TCP_CA_Recovery:
				2903	if (!(flag & FLAG_SND_UNA_ADVANCED)) {
				2904	if (tcp_is_reno(tp))
				2905	tcp_add_reno_sack(sk, num_dupack, ece_ack);
				2906	} else if (tcp_try_undo_partial(sk, prior_snd_una, &do_lost))
				2907	return;
				2908
				2909	if (tcp_try_undo_dsack(sk))
				2910	tcp_try_to_open(sk, flag);
				2911
				2912	tcp_identify_packet_loss(sk, ack_flag);
				2913	if (icsk->icsk_ca_state != TCP_CA_Recovery) {
				2914	if (!tcp_time_to_recover(sk, flag))
				2915	return;
				2916	/* Undo reverts the recovery state. If loss is evident,
				2917	* starts a new recovery (e.g. reordering then loss);
				2918	*/
				2919	tcp_enter_recovery(sk, ece_ack);
				2920	}
				2921	break;
				2922	case TCP_CA_Loss:
				2923	tcp_process_loss(sk, flag, num_dupack, rexmit);
				2924	tcp_identify_packet_loss(sk, ack_flag);
				2925	if (!(icsk->icsk_ca_state == TCP_CA_Open \|\|
				2926	(*ack_flag & FLAG_LOST_RETRANS)))
				2927	return;
				2928	/* Change state if cwnd is undone or retransmits are lost */
				2929	/* fall through */
				2930	default:
				2931	if (tcp_is_reno(tp)) {
				2932	if (flag & FLAG_SND_UNA_ADVANCED)
				2933	tcp_reset_reno_sack(tp);
				2934	tcp_add_reno_sack(sk, num_dupack, ece_ack);
				2935	}
				2936
				2937	if (icsk->icsk_ca_state <= TCP_CA_Disorder)
				2938	tcp_try_undo_dsack(sk);
				2939
				2940	tcp_identify_packet_loss(sk, ack_flag);
				2941	if (!tcp_time_to_recover(sk, flag)) {
				2942	tcp_try_to_open(sk, flag);
				2943	return;
				2944	}
				2945
				2946	/* MTU probe failure: don't reduce cwnd */
				2947	if (icsk->icsk_ca_state < TCP_CA_CWR &&
				2948	icsk->icsk_mtup.probe_size &&
				2949	tp->snd_una == tp->mtu_probe.probe_seq_start) {
				2950	tcp_mtup_probe_failed(sk);
				2951	/* Restores the reduction we did in tcp_mtup_probe() */
				2952	tp->snd_cwnd++;
				2953	tcp_simple_retransmit(sk);
				2954	return;
				2955	}
				2956
				2957	/* Otherwise enter Recovery state */
				2958	tcp_enter_recovery(sk, ece_ack);
				2959	fast_rexmit = 1;
				2960	}
				2961
				2962	if (!tcp_is_rack(sk) && do_lost)
				2963	tcp_update_scoreboard(sk, fast_rexmit);
				2964	*rexmit = REXMIT_LOST;
				2965	}
				2966
				2967	static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us, const int flag)
				2968	{
				2969	u32 wlen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen) * HZ;
				2970	struct tcp_sock *tp = tcp_sk(sk);
				2971
				2972	if ((flag & FLAG_ACK_MAYBE_DELAYED) && rtt_us > tcp_min_rtt(tp)) {
				2973	/* If the remote keeps returning delayed ACKs, eventually
				2974	* the min filter would pick it up and overestimate the
				2975	* prop. delay when it expires. Skip suspected delayed ACKs.
				2976	*/
				2977	return;
				2978	}
				2979	minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32,
				2980	rtt_us ? : jiffies_to_usecs(1));
				2981	}
				2982
				2983	static bool tcp_ack_update_rtt(struct sock *sk, const int flag,
				2984	long seq_rtt_us, long sack_rtt_us,
				2985	long ca_rtt_us, struct rate_sample *rs)
				2986	{
				2987	const struct tcp_sock *tp = tcp_sk(sk);
				2988
				2989	/* Prefer RTT measured from ACK's timing to TS-ECR. This is because
				2990	* broken middle-boxes or peers may corrupt TS-ECR fields. But
				2991	* Karn's algorithm forbids taking RTT if some retransmitted data
				2992	* is acked (RFC6298).
				2993	*/
				2994	if (seq_rtt_us < 0)
				2995	seq_rtt_us = sack_rtt_us;
				2996
				2997	/* RTTM Rule: A TSecr value received in a segment is used to
				2998	* update the averaged RTT measurement only if the segment
				2999	* acknowledges some new data, i.e., only if it advances the
				3000	* left edge of the send window.
				3001	* See draft-ietf-tcplw-high-performance-00, section 3.3.
				3002	*/
				3003	if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
				3004	flag & FLAG_ACKED) {
				3005	u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
				3006
				3007	if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
				3008	if (!delta)
				3009	delta = 1;
				3010	seq_rtt_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
				3011	ca_rtt_us = seq_rtt_us;
				3012	}
				3013	}
				3014	rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */
				3015	if (seq_rtt_us < 0)
				3016	return false;
				3017
				3018	/* ca_rtt_us >= 0 is counting on the invariant that ca_rtt_us is
				3019	* always taken together with ACK, SACK, or TS-opts. Any negative
				3020	* values will be skipped with the seq_rtt_us < 0 check above.
				3021	*/
				3022	tcp_update_rtt_min(sk, ca_rtt_us, flag);
				3023	tcp_rtt_estimator(sk, seq_rtt_us);
				3024	tcp_set_rto(sk);
				3025
				3026	/* RFC6298: only reset backoff on valid RTT measurement. */
				3027	inet_csk(sk)->icsk_backoff = 0;
				3028	return true;
				3029	}
				3030
				3031	/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
				3032	void tcp_synack_rtt_meas(struct sock sk, struct request_sock req)
				3033	{
				3034	struct rate_sample rs;
				3035	long rtt_us = -1L;
				3036
				3037	if (req && !req->num_retrans && tcp_rsk(req)->snt_synack)
				3038	rtt_us = tcp_stamp_us_delta(tcp_clock_us(), tcp_rsk(req)->snt_synack);
				3039
				3040	tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us, &rs);
				3041	}
				3042
				3043
				3044	static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
				3045	{
				3046	const struct inet_connection_sock *icsk = inet_csk(sk);
				3047
				3048	icsk->icsk_ca_ops->cong_avoid(sk, ack, acked);
				3049	tcp_sk(sk)->snd_cwnd_stamp = tcp_jiffies32;
				3050	}
				3051
				3052	/* Restart timer after forward progress on connection.
				3053	* RFC2988 recommends to restart timer to now+rto.
				3054	*/
				3055	void tcp_rearm_rto(struct sock *sk)
				3056	{
				3057	const struct inet_connection_sock *icsk = inet_csk(sk);
				3058	struct tcp_sock *tp = tcp_sk(sk);
				3059
				3060	/* If the retrans timer is currently being used by Fast Open
				3061	* for SYN-ACK retrans purpose, stay put.
				3062	*/
				3063	if (rcu_access_pointer(tp->fastopen_rsk))
				3064	return;
				3065
				3066	if (!tp->packets_out) {
				3067	inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
				3068	} else {
				3069	u32 rto = inet_csk(sk)->icsk_rto;
				3070	/* Offset the time elapsed after installing regular RTO */
				3071	if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT \|\|
				3072	icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
				3073	s64 delta_us = tcp_rto_delta_us(sk);
				3074	/* delta_us may not be positive if the socket is locked
				3075	* when the retrans timer fires and is rescheduled.
				3076	*/
				3077	rto = usecs_to_jiffies(max_t(int, delta_us, 1));
				3078	}
				3079	tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
				3080	TCP_RTO_MAX, tcp_rtx_queue_head(sk));
				3081	}
				3082	}
				3083
				3084	/* Try to schedule a loss probe; if that doesn't work, then schedule an RTO. */
				3085	static void tcp_set_xmit_timer(struct sock *sk)
				3086	{
				3087	if (!tcp_schedule_loss_probe(sk, true))
				3088	tcp_rearm_rto(sk);
				3089	}
				3090
				3091	/* If we get here, the whole TSO packet has not been acked. */
				3092	static u32 tcp_tso_acked(struct sock sk, struct sk_buff skb)
				3093	{
				3094	struct tcp_sock *tp = tcp_sk(sk);
				3095	u32 packets_acked;
				3096
				3097	BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una));
				3098
				3099	packets_acked = tcp_skb_pcount(skb);
				3100	if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
				3101	return 0;
				3102	packets_acked -= tcp_skb_pcount(skb);
				3103
				3104	if (packets_acked) {
				3105	BUG_ON(tcp_skb_pcount(skb) == 0);
				3106	BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));
				3107	}
				3108
				3109	return packets_acked;
				3110	}
				3111
				3112	static void tcp_ack_tstamp(struct sock sk, struct sk_buff skb,
				3113	u32 prior_snd_una)
				3114	{
				3115	const struct skb_shared_info *shinfo;
				3116
				3117	/* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */
				3118	if (likely(!TCP_SKB_CB(skb)->txstamp_ack))
				3119	return;
				3120
				3121	shinfo = skb_shinfo(skb);
				3122	if (!before(shinfo->tskey, prior_snd_una) &&
				3123	before(shinfo->tskey, tcp_sk(sk)->snd_una)) {
				3124	tcp_skb_tsorted_save(skb) {
				3125	__skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
				3126	} tcp_skb_tsorted_restore(skb);
				3127	}
				3128	}
				3129
				3130	/* Remove acknowledged frames from the retransmission queue. If our packet
				3131	* is before the ack sequence we can discard it as it's confirmed to have
				3132	* arrived at the other end.
				3133	*/
				3134	static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
				3135	u32 prior_snd_una,
				3136	struct tcp_sacktag_state *sack, bool ece_ack)
				3137	{
				3138	const struct inet_connection_sock *icsk = inet_csk(sk);
				3139	u64 first_ackt, last_ackt;
				3140	struct tcp_sock *tp = tcp_sk(sk);
				3141	u32 prior_sacked = tp->sacked_out;
				3142	u32 reord = tp->snd_nxt; /* lowest acked un-retx un-sacked seq */
				3143	struct sk_buff skb, next;
				3144	bool fully_acked = true;
				3145	long sack_rtt_us = -1L;
				3146	long seq_rtt_us = -1L;
				3147	long ca_rtt_us = -1L;
				3148	u32 pkts_acked = 0;
				3149	u32 last_in_flight = 0;
				3150	bool rtt_update;
				3151	int flag = 0;
				3152
				3153	first_ackt = 0;
				3154
				3155	for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) {
				3156	struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
				3157	const u32 start_seq = scb->seq;
				3158	u8 sacked = scb->sacked;
				3159	u32 acked_pcount;
				3160
				3161	tcp_ack_tstamp(sk, skb, prior_snd_una);
				3162
				3163	/* Determine how many packets and what bytes were acked, tso and else */
				3164	if (after(scb->end_seq, tp->snd_una)) {
				3165	if (tcp_skb_pcount(skb) == 1 \|\|
				3166	!after(tp->snd_una, scb->seq))
				3167	break;
				3168
				3169	acked_pcount = tcp_tso_acked(sk, skb);
				3170	if (!acked_pcount)
				3171	break;
				3172	fully_acked = false;
				3173	} else {
				3174	acked_pcount = tcp_skb_pcount(skb);
				3175	}
				3176
				3177	if (unlikely(sacked & TCPCB_RETRANS)) {
				3178	if (sacked & TCPCB_SACKED_RETRANS)
				3179	tp->retrans_out -= acked_pcount;
				3180	flag \|= FLAG_RETRANS_DATA_ACKED;
				3181	} else if (!(sacked & TCPCB_SACKED_ACKED)) {
				3182	last_ackt = tcp_skb_timestamp_us(skb);
				3183	WARN_ON_ONCE(last_ackt == 0);
				3184	if (!first_ackt)
				3185	first_ackt = last_ackt;
				3186
				3187	last_in_flight = TCP_SKB_CB(skb)->tx.in_flight;
				3188	if (before(start_seq, reord))
				3189	reord = start_seq;
				3190	if (!after(scb->end_seq, tp->high_seq))
				3191	flag \|= FLAG_ORIG_SACK_ACKED;
				3192	}
				3193
				3194	if (sacked & TCPCB_SACKED_ACKED) {
				3195	tp->sacked_out -= acked_pcount;
				3196	} else if (tcp_is_sack(tp)) {
				3197	tp->delivered += acked_pcount;
				3198	if (!tcp_skb_spurious_retrans(tp, skb))
				3199	tcp_rack_advance(tp, sacked, scb->end_seq,
				3200	tcp_skb_timestamp_us(skb));
				3201	}
				3202	if (sacked & TCPCB_LOST)
				3203	tp->lost_out -= acked_pcount;
				3204
				3205	tp->packets_out -= acked_pcount;
				3206	pkts_acked += acked_pcount;
				3207	tcp_rate_skb_delivered(sk, skb, sack->rate);
				3208
				3209	/* Initial outgoing SYN's get put onto the write_queue
				3210	* just like anything else we transmit. It is not
				3211	* true data, and if we misinform our callers that
				3212	* this ACK acks real data, we will erroneously exit
				3213	* connection startup slow start one packet too
				3214	* quickly. This is severely frowned upon behavior.
				3215	*/
				3216	if (likely(!(scb->tcp_flags & TCPHDR_SYN))) {
				3217	flag \|= FLAG_DATA_ACKED;
				3218	} else {
				3219	flag \|= FLAG_SYN_ACKED;
				3220	tp->retrans_stamp = 0;
				3221	}
				3222
				3223	if (!fully_acked)
				3224	break;
				3225
				3226	next = skb_rb_next(skb);
				3227	if (unlikely(skb == tp->retransmit_skb_hint))
				3228	tp->retransmit_skb_hint = NULL;
				3229	if (unlikely(skb == tp->lost_skb_hint))
				3230	tp->lost_skb_hint = NULL;
				3231	tcp_highest_sack_replace(sk, skb, next);
				3232	tcp_rtx_queue_unlink_and_free(skb, sk);
				3233	}
				3234
				3235	if (!skb)
				3236	tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
				3237
				3238	if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
				3239	tp->snd_up = tp->snd_una;
				3240
				3241	if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
				3242	flag \|= FLAG_SACK_RENEGING;
				3243
				3244	if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
				3245	seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt);
				3246	ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt);
				3247
				3248	if (pkts_acked == 1 && last_in_flight < tp->mss_cache &&
				3249	last_in_flight && !prior_sacked && fully_acked &&
				3250	sack->rate->prior_delivered + 1 == tp->delivered &&
				3251	!(flag & (FLAG_CA_ALERT \| FLAG_SYN_ACKED))) {
				3252	/* Conservatively mark a delayed ACK. It's typically
				3253	* from a lone runt packet over the round trip to
				3254	* a receiver w/o out-of-order or CE events.
				3255	*/
				3256	flag \|= FLAG_ACK_MAYBE_DELAYED;
				3257	}
				3258	}
				3259	if (sack->first_sackt) {
				3260	sack_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->first_sackt);
				3261	ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->last_sackt);
				3262	}
				3263	rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
				3264	ca_rtt_us, sack->rate);
				3265
				3266	if (flag & FLAG_ACKED) {
				3267	flag \|= FLAG_SET_XMIT_TIMER; /* set TLP or RTO timer */
				3268	if (unlikely(icsk->icsk_mtup.probe_size &&
				3269	!after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
				3270	tcp_mtup_probe_success(sk);
				3271	}
				3272
				3273	if (tcp_is_reno(tp)) {
				3274	tcp_remove_reno_sacks(sk, pkts_acked, ece_ack);
				3275
				3276	/* If any of the cumulatively ACKed segments was
				3277	* retransmitted, non-SACK case cannot confirm that
				3278	* progress was due to original transmission due to
				3279	* lack of TCPCB_SACKED_ACKED bits even if some of
				3280	* the packets may have been never retransmitted.
				3281	*/
				3282	if (flag & FLAG_RETRANS_DATA_ACKED)
				3283	flag &= ~FLAG_ORIG_SACK_ACKED;
				3284	} else {
				3285	int delta;
				3286
				3287	/* Non-retransmitted hole got filled? That's reordering */
				3288	if (before(reord, prior_fack))
				3289	tcp_check_sack_reordering(sk, reord, 0);
				3290
				3291	delta = prior_sacked - tp->sacked_out;
				3292	tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
				3293	}
				3294	} else if (skb && rtt_update && sack_rtt_us >= 0 &&
				3295	sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp,
				3296	tcp_skb_timestamp_us(skb))) {
				3297	/* Do not re-arm RTO if the sack RTT is measured from data sent
				3298	* after when the head was last (re)transmitted. Otherwise the
				3299	* timeout may continue to extend in loss recovery.
				3300	*/
				3301	flag \|= FLAG_SET_XMIT_TIMER; /* set TLP or RTO timer */
				3302	}
				3303
				3304	if (icsk->icsk_ca_ops->pkts_acked) {
				3305	struct ack_sample sample = { .pkts_acked = pkts_acked,
				3306	.rtt_us = sack->rate->rtt_us,
				3307	.in_flight = last_in_flight };
				3308
				3309	icsk->icsk_ca_ops->pkts_acked(sk, &sample);
				3310	}
				3311
				3312	#if FASTRETRANS_DEBUG > 0
				3313	WARN_ON((int)tp->sacked_out < 0);
				3314	WARN_ON((int)tp->lost_out < 0);
				3315	WARN_ON((int)tp->retrans_out < 0);
				3316	if (!tp->packets_out && tcp_is_sack(tp)) {
				3317	icsk = inet_csk(sk);
				3318	if (tp->lost_out) {
				3319	pr_debug("Leak l=%u %d\n",
				3320	tp->lost_out, icsk->icsk_ca_state);
				3321	tp->lost_out = 0;
				3322	}
				3323	if (tp->sacked_out) {
				3324	pr_debug("Leak s=%u %d\n",
				3325	tp->sacked_out, icsk->icsk_ca_state);
				3326	tp->sacked_out = 0;
				3327	}
				3328	if (tp->retrans_out) {
				3329	pr_debug("Leak r=%u %d\n",
				3330	tp->retrans_out, icsk->icsk_ca_state);
				3331	tp->retrans_out = 0;
				3332	}
				3333	}
				3334	#endif
				3335	return flag;
				3336	}
				3337
				3338	static void tcp_ack_probe(struct sock *sk)
				3339	{
				3340	struct inet_connection_sock *icsk = inet_csk(sk);
				3341	struct sk_buff *head = tcp_send_head(sk);
				3342	const struct tcp_sock *tp = tcp_sk(sk);
				3343
				3344	/* Was it a usable window open? */
				3345	if (!head)
				3346	return;
				3347	if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) {
				3348	icsk->icsk_backoff = 0;
				3349	icsk->icsk_probes_tstamp = 0;
				3350	inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
				3351	/* Socket must be waked up by subsequent tcp_data_snd_check().
				3352	* This function is not for random using!
				3353	*/
				3354	} else {
				3355	unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX);
				3356
				3357	when = tcp_clamp_probe0_to_user_timeout(sk, when);
				3358	tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
				3359	when, TCP_RTO_MAX, NULL);
				3360	}
				3361	}
				3362
				3363	static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
				3364	{
				3365	return !(flag & FLAG_NOT_DUP) \|\| (flag & FLAG_CA_ALERT) \|\|
				3366	inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
				3367	}
				3368
				3369	/* Decide wheather to run the increase function of congestion control. */
				3370	static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
				3371	{
				3372	/* If reordering is high then always grow cwnd whenever data is
				3373	* delivered regardless of its ordering. Otherwise stay conservative
				3374	* and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/
				3375	* new SACK or ECE mark may first advance cwnd here and later reduce
				3376	* cwnd in tcp_fastretrans_alert() based on more states.
				3377	*/
				3378	if (tcp_sk(sk)->reordering >
				3379	READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering))
				3380	return flag & FLAG_FORWARD_PROGRESS;
				3381
				3382	return flag & FLAG_DATA_ACKED;
				3383	}
				3384
				3385	/* The "ultimate" congestion control function that aims to replace the rigid
				3386	* cwnd increase and decrease control (tcp_cong_avoid,tcp_*cwnd_reduction).
				3387	* It's called toward the end of processing an ACK with precise rate
				3388	* information. All transmission or retransmission are delayed afterwards.
				3389	*/
				3390	static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
				3391	int flag, const struct rate_sample *rs)
				3392	{
				3393	const struct inet_connection_sock *icsk = inet_csk(sk);
				3394
				3395	if (icsk->icsk_ca_ops->cong_control) {
				3396	icsk->icsk_ca_ops->cong_control(sk, rs);
				3397	return;
				3398	}
				3399
				3400	if (tcp_in_cwnd_reduction(sk)) {
				3401	/* Reduce cwnd if state mandates */
				3402	tcp_cwnd_reduction(sk, acked_sacked, flag);
				3403	} else if (tcp_may_raise_cwnd(sk, flag)) {
				3404	/* Advance cwnd if state allows */
				3405	tcp_cong_avoid(sk, ack, acked_sacked);
				3406	}
				3407	tcp_update_pacing_rate(sk);
				3408	}
				3409
				3410	/* Check that window update is acceptable.
				3411	* The function assumes that snd_una<=ack<=snd_next.
				3412	*/
				3413	static inline bool tcp_may_update_window(const struct tcp_sock *tp,
				3414	const u32 ack, const u32 ack_seq,
				3415	const u32 nwin)
				3416	{
				3417	return after(ack, tp->snd_una) \|\|
				3418	after(ack_seq, tp->snd_wl1) \|\|
				3419	(ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
				3420	}
				3421
				3422	/* If we update tp->snd_una, also update tp->bytes_acked */
				3423	static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
				3424	{
				3425	u32 delta = ack - tp->snd_una;
				3426
				3427	sock_owned_by_me((struct sock *)tp);
				3428	tp->bytes_acked += delta;
				3429	tp->snd_una = ack;
				3430	}
				3431
				3432	/* If we update tp->rcv_nxt, also update tp->bytes_received */
				3433	static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
				3434	{
				3435	u32 delta = seq - tp->rcv_nxt;
				3436
				3437	sock_owned_by_me((struct sock *)tp);
				3438	tp->bytes_received += delta;
				3439	WRITE_ONCE(tp->rcv_nxt, seq);
				3440	}
				3441
				3442	/* Update our send window.
				3443	*
				3444	* Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2
				3445	* and in FreeBSD. NetBSD's one is even worse.) is wrong.
				3446	*/
				3447	static int tcp_ack_update_window(struct sock sk, const struct sk_buff skb, u32 ack,
				3448	u32 ack_seq)
				3449	{
				3450	struct tcp_sock *tp = tcp_sk(sk);
				3451	int flag = 0;
				3452	u32 nwin = ntohs(tcp_hdr(skb)->window);
				3453
				3454	if (likely(!tcp_hdr(skb)->syn))
				3455	nwin <<= tp->rx_opt.snd_wscale;
				3456
				3457	if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
				3458	flag \|= FLAG_WIN_UPDATE;
				3459	tcp_update_wl(tp, ack_seq);
				3460
				3461	if (tp->snd_wnd != nwin) {
				3462	tp->snd_wnd = nwin;
				3463
				3464	/* Note, it is the only place, where
				3465	* fast path is recovered for sending TCP.
				3466	*/
				3467	tp->pred_flags = 0;
				3468	tcp_fast_path_check(sk);
				3469
				3470	if (!tcp_write_queue_empty(sk))
				3471	tcp_slow_start_after_idle_check(sk);
				3472
				3473	if (nwin > tp->max_window) {
				3474	tp->max_window = nwin;
				3475	tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
				3476	}
				3477	}
				3478	}
				3479
				3480	tcp_snd_una_update(tp, ack);
				3481
				3482	return flag;
				3483	}
				3484
				3485	static bool __tcp_oow_rate_limited(struct net *net, int mib_idx,
				3486	u32 *last_oow_ack_time)
				3487	{
				3488	/* Paired with the WRITE_ONCE() in this function. */
				3489	u32 val = READ_ONCE(*last_oow_ack_time);
				3490
				3491	if (val) {
				3492	s32 elapsed = (s32)(tcp_jiffies32 - val);
				3493
				3494	if (0 <= elapsed &&
				3495	elapsed < READ_ONCE(net->ipv4.sysctl_tcp_invalid_ratelimit)) {
				3496	NET_INC_STATS(net, mib_idx);
				3497	return true; /* rate-limited: don't send yet! */
				3498	}
				3499	}
				3500
				3501	/* Paired with the prior READ_ONCE() and with itself,
				3502	* as we might be lockless.
				3503	*/
				3504	WRITE_ONCE(*last_oow_ack_time, tcp_jiffies32);
				3505
				3506	return false; /* not rate-limited: go ahead, send dupack now! */
				3507	}
				3508
				3509	/* Return true if we're currently rate-limiting out-of-window ACKs and
				3510	* thus shouldn't send a dupack right now. We rate-limit dupacks in
				3511	* response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS
				3512	* attacks that send repeated SYNs or ACKs for the same connection. To
				3513	* do this, we do not send a duplicate SYNACK or ACK if the remote
				3514	* endpoint is sending out-of-window SYNs or pure ACKs at a high rate.
				3515	*/
				3516	bool tcp_oow_rate_limited(struct net net, const struct sk_buff skb,
				3517	int mib_idx, u32 *last_oow_ack_time)
				3518	{
				3519	/* Data packets without SYNs are not likely part of an ACK loop. */
				3520	if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) &&
				3521	!tcp_hdr(skb)->syn)
				3522	return false;
				3523
				3524	return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time);
				3525	}
				3526
				3527	/* RFC 5961 7 [ACK Throttling] */
				3528	static void tcp_send_challenge_ack(struct sock sk, const struct sk_buff skb)
				3529	{
				3530	/* unprotected vars, we dont care of overwrites */
				3531	static u32 challenge_timestamp;
				3532	static unsigned int challenge_count;
				3533	struct tcp_sock *tp = tcp_sk(sk);
				3534	struct net *net = sock_net(sk);
				3535	u32 count, now;
				3536
				3537	/* First check our per-socket dupack rate limit. */
				3538	if (__tcp_oow_rate_limited(net,
				3539	LINUX_MIB_TCPACKSKIPPEDCHALLENGE,
				3540	&tp->last_oow_ack_time))
				3541	return;
				3542
				3543	/* Then check host-wide RFC 5961 rate limit. */
				3544	now = jiffies / HZ;
				3545	if (now != READ_ONCE(challenge_timestamp)) {
				3546	u32 ack_limit = READ_ONCE(net->ipv4.sysctl_tcp_challenge_ack_limit);
				3547	u32 half = (ack_limit + 1) >> 1;
				3548
				3549	WRITE_ONCE(challenge_timestamp, now);
				3550	WRITE_ONCE(challenge_count, half + prandom_u32_max(ack_limit));
				3551	}
				3552	count = READ_ONCE(challenge_count);
				3553	if (count > 0) {
				3554	WRITE_ONCE(challenge_count, count - 1);
				3555	NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK);
				3556	tcp_send_ack(sk);
				3557	}
				3558	}
				3559
				3560	static void tcp_store_ts_recent(struct tcp_sock *tp)
				3561	{
				3562	tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
				3563	tp->rx_opt.ts_recent_stamp = ktime_get_seconds();
				3564	}
				3565
				3566	static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
				3567	{
				3568	if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
				3569	/* PAWS bug workaround wrt. ACK frames, the PAWS discard
				3570	* extra check below makes sure this can only happen
				3571	* for pure ACK frames. -DaveM
				3572	*
				3573	* Not only, also it occurs for expired timestamps.
				3574	*/
				3575
				3576	if (tcp_paws_check(&tp->rx_opt, 0))
				3577	tcp_store_ts_recent(tp);
				3578	}
				3579	}
				3580
				3581	/* This routine deals with acks during a TLP episode and ends an episode by
				3582	* resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack
				3583	*/
				3584	static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
				3585	{
				3586	struct tcp_sock *tp = tcp_sk(sk);
				3587
				3588	if (before(ack, tp->tlp_high_seq))
				3589	return;
				3590
				3591	if (!tp->tlp_retrans) {
				3592	/* TLP of new data has been acknowledged */
				3593	tp->tlp_high_seq = 0;
				3594	} else if (flag & FLAG_DSACKING_ACK) {
				3595	/* This DSACK means original and TLP probe arrived; no loss */
				3596	tp->tlp_high_seq = 0;
				3597	} else if (after(ack, tp->tlp_high_seq)) {
				3598	/* ACK advances: there was a loss, so reduce cwnd. Reset
				3599	* tlp_high_seq in tcp_init_cwnd_reduction()
				3600	*/
				3601	tcp_init_cwnd_reduction(sk);
				3602	tcp_set_ca_state(sk, TCP_CA_CWR);
				3603	tcp_end_cwnd_reduction(sk);
				3604	tcp_try_keep_open(sk);
				3605	NET_INC_STATS(sock_net(sk),
				3606	LINUX_MIB_TCPLOSSPROBERECOVERY);
				3607	} else if (!(flag & (FLAG_SND_UNA_ADVANCED \|
				3608	FLAG_NOT_DUP \| FLAG_DATA_SACKED))) {
				3609	/* Pure dupack: original and TLP probe arrived; no loss */
				3610	tp->tlp_high_seq = 0;
				3611	}
				3612	}
				3613
				3614	static inline void tcp_in_ack_event(struct sock *sk, u32 flags)
				3615	{
				3616	const struct inet_connection_sock *icsk = inet_csk(sk);
				3617
				3618	if (icsk->icsk_ca_ops->in_ack_event)
				3619	icsk->icsk_ca_ops->in_ack_event(sk, flags);
				3620	}
				3621
				3622	/* Congestion control has updated the cwnd already. So if we're in
				3623	* loss recovery then now we do any new sends (for FRTO) or
				3624	* retransmits (for CA_Loss or CA_recovery) that make sense.
				3625	*/
				3626	static void tcp_xmit_recovery(struct sock *sk, int rexmit)
				3627	{
				3628	struct tcp_sock *tp = tcp_sk(sk);
				3629
				3630	if (rexmit == REXMIT_NONE \|\| sk->sk_state == TCP_SYN_SENT)
				3631	return;
				3632
				3633	if (unlikely(rexmit == 2)) {
				3634	__tcp_push_pending_frames(sk, tcp_current_mss(sk),
				3635	TCP_NAGLE_OFF);
				3636	if (after(tp->snd_nxt, tp->high_seq))
				3637	return;
				3638	tp->frto = 0;
				3639	}
				3640	tcp_xmit_retransmit_queue(sk);
				3641	}
				3642
				3643	/* Returns the number of packets newly acked or sacked by the current ACK */
				3644	static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
				3645	{
				3646	const struct net *net = sock_net(sk);
				3647	struct tcp_sock *tp = tcp_sk(sk);
				3648	u32 delivered;
				3649
				3650	delivered = tp->delivered - prior_delivered;
				3651	NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered);
				3652	if (flag & FLAG_ECE) {
				3653	tp->delivered_ce += delivered;
				3654	NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered);
				3655	}
				3656	return delivered;
				3657	}
				3658
				3659	/* This routine deals with incoming acks, but not outgoing ones. */
				3660	static int tcp_ack(struct sock sk, const struct sk_buff skb, int flag)
				3661	{
				3662	struct inet_connection_sock *icsk = inet_csk(sk);
				3663	struct tcp_sock *tp = tcp_sk(sk);
				3664	struct tcp_sacktag_state sack_state;
				3665	struct rate_sample rs = { .prior_delivered = 0 };
				3666	u32 prior_snd_una = tp->snd_una;
				3667	bool is_sack_reneg = tp->is_sack_reneg;
				3668	u32 ack_seq = TCP_SKB_CB(skb)->seq;
				3669	u32 ack = TCP_SKB_CB(skb)->ack_seq;
				3670	int num_dupack = 0;
				3671	int prior_packets = tp->packets_out;
				3672	u32 delivered = tp->delivered;
				3673	u32 lost = tp->lost;
				3674	int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
				3675	u32 prior_fack;
				3676
				3677	sack_state.first_sackt = 0;
				3678	sack_state.rate = &rs;
				3679
				3680	/* We very likely will need to access rtx queue. */
				3681	prefetch(sk->tcp_rtx_queue.rb_node);
				3682
				3683	/* If the ack is older than previous acks
				3684	* then we can probably ignore it.
				3685	*/
				3686	if (before(ack, prior_snd_una)) {
				3687	u32 max_window;
				3688
				3689	/* do not accept ACK for bytes we never sent. */
				3690	max_window = min_t(u64, tp->max_window, tp->bytes_acked);
				3691	/* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
				3692	if (before(ack, prior_snd_una - max_window)) {
				3693	if (!(flag & FLAG_NO_CHALLENGE_ACK))
				3694	tcp_send_challenge_ack(sk, skb);
				3695	return -1;
				3696	}
				3697	goto old_ack;
				3698	}
				3699
				3700	/* If the ack includes data we haven't sent yet, discard
				3701	* this segment (RFC793 Section 3.9).
				3702	*/
				3703	if (after(ack, tp->snd_nxt))
				3704	return -1;
				3705
				3706	if (after(ack, prior_snd_una)) {
				3707	flag \|= FLAG_SND_UNA_ADVANCED;
				3708	icsk->icsk_retransmits = 0;
				3709
				3710	#if IS_ENABLED(CONFIG_TLS_DEVICE)
				3711	if (static_branch_unlikely(&clean_acked_data_enabled.key))
				3712	if (icsk->icsk_clean_acked)
				3713	icsk->icsk_clean_acked(sk, ack);
				3714	#endif
				3715	}
				3716
				3717	prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una;
				3718	rs.prior_in_flight = tcp_packets_in_flight(tp);
				3719
				3720	/* ts_recent update must be made after we are sure that the packet
				3721	* is in window.
				3722	*/
				3723	if (flag & FLAG_UPDATE_TS_RECENT)
				3724	tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
				3725
				3726	if ((flag & (FLAG_SLOWPATH \| FLAG_SND_UNA_ADVANCED)) ==
				3727	FLAG_SND_UNA_ADVANCED) {
				3728	/* Window is constant, pure forward advance.
				3729	* No more checks are required.
				3730	* Note, we use the fact that SND.UNA>=SND.WL2.
				3731	*/
				3732	tcp_update_wl(tp, ack_seq);
				3733	tcp_snd_una_update(tp, ack);
				3734	flag \|= FLAG_WIN_UPDATE;
				3735
				3736	tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
				3737
				3738	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS);
				3739	} else {
				3740	u32 ack_ev_flags = CA_ACK_SLOWPATH;
				3741
				3742	if (ack_seq != TCP_SKB_CB(skb)->end_seq)
				3743	flag \|= FLAG_DATA;
				3744	else
				3745	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS);
				3746
				3747	flag \|= tcp_ack_update_window(sk, skb, ack, ack_seq);
				3748
				3749	if (TCP_SKB_CB(skb)->sacked)
				3750	flag \|= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
				3751	&sack_state);
				3752
				3753	if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
				3754	flag \|= FLAG_ECE;
				3755	ack_ev_flags \|= CA_ACK_ECE;
				3756	}
				3757
				3758	if (flag & FLAG_WIN_UPDATE)
				3759	ack_ev_flags \|= CA_ACK_WIN_UPDATE;
				3760
				3761	tcp_in_ack_event(sk, ack_ev_flags);
				3762	}
				3763
				3764	/* This is a deviation from RFC3168 since it states that:
				3765	* "When the TCP data sender is ready to set the CWR bit after reducing
				3766	* the congestion window, it SHOULD set the CWR bit only on the first
				3767	* new data packet that it transmits."
				3768	* We accept CWR on pure ACKs to be more robust
				3769	* with widely-deployed TCP implementations that do this.
				3770	*/
				3771	tcp_ecn_accept_cwr(sk, skb);
				3772
				3773	/* We passed data and got it acked, remove any soft error
				3774	* log. Something worked...
				3775	*/
				3776	sk->sk_err_soft = 0;
				3777	icsk->icsk_probes_out = 0;
				3778	tp->rcv_tstamp = tcp_jiffies32;
				3779	if (!prior_packets)
				3780	goto no_queue;
				3781
				3782	/* See if we can take anything off of the retransmit queue. */
				3783	flag \|= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state,
				3784	flag & FLAG_ECE);
				3785
				3786	tcp_rack_update_reo_wnd(sk, &rs);
				3787
				3788	if (tp->tlp_high_seq)
				3789	tcp_process_tlp_ack(sk, ack, flag);
				3790
				3791	if (tcp_ack_is_dubious(sk, flag)) {
				3792	if (!(flag & (FLAG_SND_UNA_ADVANCED \|
				3793	FLAG_NOT_DUP \| FLAG_DSACKING_ACK))) {
				3794	num_dupack = 1;
				3795	/* Consider if pure acks were aggregated in tcp_add_backlog() */
				3796	if (!(flag & FLAG_DATA))
				3797	num_dupack = max_t(u16, 1, skb_shinfo(skb)->gso_segs);
				3798	}
				3799	tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
				3800	&rexmit);
				3801	}
				3802
				3803	/* If needed, reset TLP/RTO timer when RACK doesn't set. */
				3804	if (flag & FLAG_SET_XMIT_TIMER)
				3805	tcp_set_xmit_timer(sk);
				3806
				3807	if ((flag & FLAG_FORWARD_PROGRESS) \|\| !(flag & FLAG_NOT_DUP))
				3808	sk_dst_confirm(sk);
				3809
				3810	delivered = tcp_newly_delivered(sk, delivered, flag);
				3811	lost = tp->lost - lost; /* freshly marked lost */
				3812	rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
				3813	tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
				3814	tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
				3815	tcp_xmit_recovery(sk, rexmit);
				3816	return 1;
				3817
				3818	no_queue:
				3819	/* If data was DSACKed, see if we can undo a cwnd reduction. */
				3820	if (flag & FLAG_DSACKING_ACK) {
				3821	tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
				3822	&rexmit);
				3823	tcp_newly_delivered(sk, delivered, flag);
				3824	}
				3825	/* If this ack opens up a zero window, clear backoff. It was
				3826	* being used to time the probes, and is probably far higher than
				3827	* it needs to be for normal retransmission.
				3828	*/
				3829	tcp_ack_probe(sk);
				3830
				3831	if (tp->tlp_high_seq)
				3832	tcp_process_tlp_ack(sk, ack, flag);
				3833	return 1;
				3834
				3835	old_ack:
				3836	/* If data was SACKed, tag it and see if we should send more data.
				3837	* If data was DSACKed, see if we can undo a cwnd reduction.
				3838	*/
				3839	if (TCP_SKB_CB(skb)->sacked) {
				3840	flag \|= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
				3841	&sack_state);
				3842	tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
				3843	&rexmit);
				3844	tcp_newly_delivered(sk, delivered, flag);
				3845	tcp_xmit_recovery(sk, rexmit);
				3846	}
				3847
				3848	return 0;
				3849	}
				3850
				3851	static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
				3852	bool syn, struct tcp_fastopen_cookie *foc,
				3853	bool exp_opt)
				3854	{
				3855	/* Valid only in SYN or SYN-ACK with an even length. */
				3856	if (!foc \|\| !syn \|\| len < 0 \|\| (len & 1))
				3857	return;
				3858
				3859	if (len >= TCP_FASTOPEN_COOKIE_MIN &&
				3860	len <= TCP_FASTOPEN_COOKIE_MAX)
				3861	memcpy(foc->val, cookie, len);
				3862	else if (len != 0)
				3863	len = -1;
				3864	foc->len = len;
				3865	foc->exp = exp_opt;
				3866	}
				3867
				3868	static void smc_parse_options(const struct tcphdr *th,
				3869	struct tcp_options_received *opt_rx,
				3870	const unsigned char *ptr,
				3871	int opsize)
				3872	{
				3873	#if IS_ENABLED(CONFIG_SMC)
				3874	if (static_branch_unlikely(&tcp_have_smc)) {
				3875	if (th->syn && !(opsize & 1) &&
				3876	opsize >= TCPOLEN_EXP_SMC_BASE &&
				3877	get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC)
				3878	opt_rx->smc_ok = 1;
				3879	}
				3880	#endif
				3881	}
				3882
				3883	/* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped
				3884	* value on success.
				3885	*/
				3886	static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss)
				3887	{
				3888	const unsigned char ptr = (const unsigned char )(th + 1);
				3889	int length = (th->doff * 4) - sizeof(struct tcphdr);
				3890	u16 mss = 0;
				3891
				3892	while (length > 0) {
				3893	int opcode = *ptr++;
				3894	int opsize;
				3895
				3896	switch (opcode) {
				3897	case TCPOPT_EOL:
				3898	return mss;
				3899	case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
				3900	length--;
				3901	continue;
				3902	default:
				3903	if (length < 2)
				3904	return mss;
				3905	opsize = *ptr++;
				3906	if (opsize < 2) /* "silly options" */
				3907	return mss;
				3908	if (opsize > length)
				3909	return mss; /* fail on partial options */
				3910	if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) {
				3911	u16 in_mss = get_unaligned_be16(ptr);
				3912
				3913	if (in_mss) {
				3914	if (user_mss && user_mss < in_mss)
				3915	in_mss = user_mss;
				3916	mss = in_mss;
				3917	}
				3918	}
				3919	ptr += opsize - 2;
				3920	length -= opsize;
				3921	}
				3922	}
				3923	return mss;
				3924	}
				3925
				3926	/* Look for tcp options. Normally only called on SYN and SYNACK packets.
				3927	* But, this can also be called on packets in the established flow when
				3928	* the fast version below fails.
				3929	*/
				3930	void tcp_parse_options(const struct net *net,
				3931	const struct sk_buff *skb,
				3932	struct tcp_options_received *opt_rx, int estab,
				3933	struct tcp_fastopen_cookie *foc)
				3934	{
				3935	const unsigned char *ptr;
				3936	const struct tcphdr *th = tcp_hdr(skb);
				3937	int length = (th->doff * 4) - sizeof(struct tcphdr);
				3938
				3939	ptr = (const unsigned char *)(th + 1);
				3940	opt_rx->saw_tstamp = 0;
				3941
				3942	while (length > 0) {
				3943	int opcode = *ptr++;
				3944	int opsize;
				3945
				3946	switch (opcode) {
				3947	case TCPOPT_EOL:
				3948	return;
				3949	case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
				3950	length--;
				3951	continue;
				3952	default:
				3953	if (length < 2)
				3954	return;
				3955	opsize = *ptr++;
				3956	if (opsize < 2) /* "silly options" */
				3957	return;
				3958	if (opsize > length)
				3959	return; /* don't parse partial options */
				3960	switch (opcode) {
				3961	case TCPOPT_MSS:
				3962	if (opsize == TCPOLEN_MSS && th->syn && !estab) {
				3963	u16 in_mss = get_unaligned_be16(ptr);
				3964	if (in_mss) {
				3965	if (opt_rx->user_mss &&
				3966	opt_rx->user_mss < in_mss)
				3967	in_mss = opt_rx->user_mss;
				3968	opt_rx->mss_clamp = in_mss;
				3969	}
				3970	}
				3971	break;
				3972	case TCPOPT_WINDOW:
				3973	if (opsize == TCPOLEN_WINDOW && th->syn &&
				3974	!estab && READ_ONCE(net->ipv4.sysctl_tcp_window_scaling)) {
				3975	__u8 snd_wscale = (__u8 )ptr;
				3976	opt_rx->wscale_ok = 1;
				3977	if (snd_wscale > TCP_MAX_WSCALE) {
				3978	net_info_ratelimited("%s: Illegal window scaling value %d > %u received\n",
				3979	__func__,
				3980	snd_wscale,
				3981	TCP_MAX_WSCALE);
				3982	snd_wscale = TCP_MAX_WSCALE;
				3983	}
				3984	opt_rx->snd_wscale = snd_wscale;
				3985	}
				3986	break;
				3987	case TCPOPT_TIMESTAMP:
				3988	if ((opsize == TCPOLEN_TIMESTAMP) &&
				3989	((estab && opt_rx->tstamp_ok) \|\|
				3990	(!estab && READ_ONCE(net->ipv4.sysctl_tcp_timestamps)))) {
				3991	opt_rx->saw_tstamp = 1;
				3992	opt_rx->rcv_tsval = get_unaligned_be32(ptr);
				3993	opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
				3994	}
				3995	break;
				3996	case TCPOPT_SACK_PERM:
				3997	if (opsize == TCPOLEN_SACK_PERM && th->syn &&
				3998	!estab && READ_ONCE(net->ipv4.sysctl_tcp_sack)) {
				3999	opt_rx->sack_ok = TCP_SACK_SEEN;
				4000	tcp_sack_reset(opt_rx);
				4001	}
				4002	break;
				4003
				4004	case TCPOPT_SACK:
				4005	if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
				4006	!((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
				4007	opt_rx->sack_ok) {
				4008	TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
				4009	}
				4010	break;
				4011	#ifdef CONFIG_TCP_MD5SIG
				4012	case TCPOPT_MD5SIG:
				4013	/*
				4014	* The MD5 Hash has already been
				4015	* checked (see tcp_v{4,6}_do_rcv()).
				4016	*/
				4017	break;
				4018	#endif
				4019	case TCPOPT_FASTOPEN:
				4020	tcp_parse_fastopen_option(
				4021	opsize - TCPOLEN_FASTOPEN_BASE,
				4022	ptr, th->syn, foc, false);
				4023	break;
				4024
				4025	case TCPOPT_EXP:
				4026	/* Fast Open option shares code 254 using a
				4027	* 16 bits magic number.
				4028	*/
				4029	if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE &&
				4030	get_unaligned_be16(ptr) ==
				4031	TCPOPT_FASTOPEN_MAGIC)
				4032	tcp_parse_fastopen_option(opsize -
				4033	TCPOLEN_EXP_FASTOPEN_BASE,
				4034	ptr + 2, th->syn, foc, true);
				4035	else
				4036	smc_parse_options(th, opt_rx, ptr,
				4037	opsize);
				4038	break;
				4039
				4040	}
				4041	ptr += opsize-2;
				4042	length -= opsize;
				4043	}
				4044	}
				4045	}
				4046	EXPORT_SYMBOL(tcp_parse_options);
				4047
				4048	static bool tcp_parse_aligned_timestamp(struct tcp_sock tp, const struct tcphdr th)
				4049	{
				4050	const __be32 ptr = (const __be32 )(th + 1);
				4051
				4052	if (*ptr == htonl((TCPOPT_NOP << 24) \| (TCPOPT_NOP << 16)
				4053	\| (TCPOPT_TIMESTAMP << 8) \| TCPOLEN_TIMESTAMP)) {
				4054	tp->rx_opt.saw_tstamp = 1;
				4055	++ptr;
				4056	tp->rx_opt.rcv_tsval = ntohl(*ptr);
				4057	++ptr;
				4058	if (*ptr)
				4059	tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset;
				4060	else
				4061	tp->rx_opt.rcv_tsecr = 0;
				4062	return true;
				4063	}
				4064	return false;
				4065	}
				4066
				4067	/* Fast parse options. This hopes to only see timestamps.
				4068	* If it is wrong it falls back on tcp_parse_options().
				4069	*/
				4070	static bool tcp_fast_parse_options(const struct net *net,
				4071	const struct sk_buff *skb,
				4072	const struct tcphdr th, struct tcp_sock tp)
				4073	{
				4074	/* In the spirit of fast parsing, compare doff directly to constant
				4075	* values. Because equality is used, short doff can be ignored here.
				4076	*/
				4077	if (th->doff == (sizeof(*th) / 4)) {
				4078	tp->rx_opt.saw_tstamp = 0;
				4079	return false;
				4080	} else if (tp->rx_opt.tstamp_ok &&
				4081	th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
				4082	if (tcp_parse_aligned_timestamp(tp, th))
				4083	return true;
				4084	}
				4085
				4086	tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL);
				4087	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
				4088	tp->rx_opt.rcv_tsecr -= tp->tsoffset;
				4089
				4090	return true;
				4091	}
				4092
				#ifdef CONFIG_TCP_MD5SIG
				/*
				 * Parse MD5 Signature option
				 *
				 * Walks the options following @th and returns a pointer to the payload of
				 * the first MD5SIG option if its size is TCPOLEN_MD5SIG.  Returns NULL when
				 * no such option is found, when the option has another size, or when a
				 * malformed option or EOL is met first.
				 */
				const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)
				{
					int length = (th->doff << 2) - sizeof(*th);
					const u8 *ptr = (const u8 *)(th + 1);

					/* If not enough data remaining, we can short cut */
					while (length >= TCPOLEN_MD5SIG) {
						int opcode = *ptr++;
						int opsize;

						switch (opcode) {
						case TCPOPT_EOL:
							return NULL;
						case TCPOPT_NOP:
							length--;
							continue;
						default:
							opsize = *ptr++;
							if (opsize < 2 || opsize > length)
								return NULL;
							if (opcode == TCPOPT_MD5SIG)
								return opsize == TCPOLEN_MD5SIG ? ptr : NULL;
						}
						/* Kind and length bytes were already consumed. */
						ptr += opsize - 2;
						length -= opsize;
					}
					return NULL;
				}
				EXPORT_SYMBOL(tcp_parse_md5sig_option);
				#endif
				4127
				4128	/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
				4129	*
				4130	* It is not fatal. If this ACK does _not_ change critical state (seqs, window)
				4131	* it can pass through stack. So, the following predicate verifies that
				4132	* this segment is not used for anything but congestion avoidance or
				4133	* fast retransmit. Moreover, we even are able to eliminate most of such
				4134	* second order effects, if we apply some small "replay" window (~RTO)
				4135	* to timestamp space.
				4136	*
				4137	* All these measures still do not guarantee that we reject wrapped ACKs
				4138	* on networks with high bandwidth, when sequence space is recycled quickly,
				4139	* but it guarantees that such events will be very rare and do not affect
				4140	* connection seriously. This doesn't look nice, but alas, PAWS is really
				4141	* buggy extension.
				4142	*
				4143	* [ Later note. Even worse! It is buggy for segments _with_ data. RFC
				4144	* states that events when retransmit arrives after original data are rare.
				4145	* It is a blatant lie. VJ forgot about fast retransmit! 8)8) It is
				4146	* the biggest problem on large power networks even with minor reordering.
				4147	* OK, let's give it small replay window. If peer clock is even 1hz, it is safe
				4148	* up to bandwidth of 18Gigabit/sec. 8) ]
				4149	*/
				4150
				4151	static int tcp_disordered_ack(const struct sock sk, const struct sk_buff skb)
				4152	{
				4153	const struct tcp_sock *tp = tcp_sk(sk);
				4154	const struct tcphdr *th = tcp_hdr(skb);
				4155	u32 seq = TCP_SKB_CB(skb)->seq;
				4156	u32 ack = TCP_SKB_CB(skb)->ack_seq;
				4157
				4158	return (/* 1. Pure ACK with correct sequence number. */
				4159	(th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) &&
				4160
				4161	/* 2. ... and duplicate ACK. */
				4162	ack == tp->snd_una &&
				4163
				4164	/* 3. ... and does not update window. */
				4165	!tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
				4166
				4167	/* 4. ... and sits in replay window. */
				4168	(s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
				4169	}
				4170
				4171	static inline bool tcp_paws_discard(const struct sock *sk,
				4172	const struct sk_buff *skb)
				4173	{
				4174	const struct tcp_sock *tp = tcp_sk(sk);
				4175
				4176	return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) &&
				4177	!tcp_disordered_ack(sk, skb);
				4178	}
				4179
				4180	/* Check segment sequence number for validity.
				4181	*
				4182	* Segment controls are considered valid, if the segment
				4183	* fits to the window after truncation to the window. Acceptability
				4184	* of data (and SYN, FIN, of course) is checked separately.
				4185	* See tcp_data_queue(), for example.
				4186	*
				4187	* Also, controls (RST is main one) are accepted using RCV.WUP instead
				4188	* of RCV.NXT. Peer still did not advance his SND.UNA when we
				4189	* delayed ACK, so that his SND.UNA <= our RCV.WUP.
				4190	* (borrowed from freebsd)
				4191	*/
				4192
				4193	static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
				4194	{
				4195	return !before(end_seq, tp->rcv_wup) &&
				4196	!after(seq, tp->rcv_nxt + tcp_receive_window(tp));
				4197	}
				4198
				/* When we get a reset we do this.
				 *
				 * Sets sk->sk_err according to the current state (ECONNREFUSED in
				 * SYN_SENT, EPIPE in CLOSE_WAIT, ECONNRESET otherwise), purges the write
				 * queue, finishes the connection with tcp_done() and, unless the socket
				 * is dead, reports the error.  A socket already in TCP_CLOSE is left
				 * untouched apart from the tracepoint.
				 */
				void tcp_reset(struct sock *sk)
				{
					int err;

					trace_tcp_receive_reset(sk);

					/* We want the right error as BSD sees it (and indeed as we do). */
					switch (sk->sk_state) {
					case TCP_CLOSE:
						return;
					case TCP_SYN_SENT:
						err = ECONNREFUSED;
						break;
					case TCP_CLOSE_WAIT:
						err = EPIPE;
						break;
					default:
						err = ECONNRESET;
						break;
					}
					sk->sk_err = err;

					/* This barrier is coupled with smp_rmb() in tcp_poll() */
					smp_wmb();

					tcp_write_queue_purge(sk);
					tcp_done(sk);

					if (!sock_flag(sk, SOCK_DEAD))
						sk->sk_error_report(sk);
				}
				4226
				4227	/*
				4228	* Process the FIN bit. This now behaves as it is supposed to work
				4229	* and the FIN takes effect when it is validly part of sequence
				4230	* space. Not before when we get holes.
				4231	*
				4232	* If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
				4233	* (and thence onto LAST-ACK and finally, CLOSE, we never enter
				4234	* TIME-WAIT)
				4235	*
				4236	* If we are in FINWAIT-1, a received FIN indicates simultaneous
				4237	* close and we go into CLOSING (and later onto TIME-WAIT)
				4238	*
				4239	* If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
				4240	*/
				4241	void tcp_fin(struct sock *sk)
				4242	{
				4243	struct tcp_sock *tp = tcp_sk(sk);
				4244
				4245	inet_csk_schedule_ack(sk);
				4246
				4247	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown \| RCV_SHUTDOWN);
				4248	sock_set_flag(sk, SOCK_DONE);
				4249
				4250	switch (sk->sk_state) {
				4251	case TCP_SYN_RECV:
				4252	case TCP_ESTABLISHED:
				4253	/* Move to CLOSE_WAIT */
				4254	tcp_set_state(sk, TCP_CLOSE_WAIT);
				4255	inet_csk_enter_pingpong_mode(sk);
				4256	break;
				4257
				4258	case TCP_CLOSE_WAIT:
				4259	case TCP_CLOSING:
				4260	/* Received a retransmission of the FIN, do
				4261	* nothing.
				4262	*/
				4263	break;
				4264	case TCP_LAST_ACK:
				4265	/* RFC793: Remain in the LAST-ACK state. */
				4266	break;
				4267
				4268	case TCP_FIN_WAIT1:
				4269	/* This case occurs when a simultaneous close
				4270	* happens, we must ack the received FIN and
				4271	* enter the CLOSING state.
				4272	*/
				4273	tcp_send_ack(sk);
				4274	tcp_set_state(sk, TCP_CLOSING);
				4275	break;
				4276	case TCP_FIN_WAIT2:
				4277	/* Received a FIN -- send ACK and enter TIME_WAIT. */
				4278	tcp_send_ack(sk);
				4279	tcp_time_wait(sk, TCP_TIME_WAIT, 0);
				4280	break;
				4281	default:
				4282	/* Only TCP_LISTEN and TCP_CLOSE are left, in these
				4283	* cases we should never reach this piece of code.
				4284	*/
				4285	pr_err("%s: Impossible, sk->sk_state=%d\n",
				4286	__func__, sk->sk_state);
				4287	break;
				4288	}
				4289
				4290	/* It _is_ possible, that we have something out-of-order _after_ FIN.
				4291	* Probably, we should reset in this case. For now drop them.
				4292	*/
				4293	skb_rbtree_purge(&tp->out_of_order_queue);
				4294	if (tcp_is_sack(tp))
				4295	tcp_sack_reset(&tp->rx_opt);
				4296	sk_mem_reclaim(sk);
				4297
				4298	if (!sock_flag(sk, SOCK_DEAD)) {
				4299	sk->sk_state_change(sk);
				4300
				4301	/* Do not send POLL_HUP for half duplex close. */
				4302	if (sk->sk_shutdown == SHUTDOWN_MASK \|\|
				4303	sk->sk_state == TCP_CLOSE)
				4304	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
				4305	else
				4306	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
				4307	}
				4308	}
				4309
				4310	static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
				4311	u32 end_seq)
				4312	{
				4313	if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
				4314	if (before(seq, sp->start_seq))
				4315	sp->start_seq = seq;
				4316	if (after(end_seq, sp->end_seq))
				4317	sp->end_seq = end_seq;
				4318	return true;
				4319	}
				4320	return false;
				4321	}
				4322
				4323	static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
				4324	{
				4325	struct tcp_sock *tp = tcp_sk(sk);
				4326
				4327	if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) {
				4328	int mib_idx;
				4329
				4330	if (before(seq, tp->rcv_nxt))
				4331	mib_idx = LINUX_MIB_TCPDSACKOLDSENT;
				4332	else
				4333	mib_idx = LINUX_MIB_TCPDSACKOFOSENT;
				4334
				4335	NET_INC_STATS(sock_net(sk), mib_idx);
				4336
				4337	tp->rx_opt.dsack = 1;
				4338	tp->duplicate_sack[0].start_seq = seq;
				4339	tp->duplicate_sack[0].end_seq = end_seq;
				4340	}
				4341	}
				4342
				4343	static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
				4344	{
				4345	struct tcp_sock *tp = tcp_sk(sk);
				4346
				4347	if (!tp->rx_opt.dsack)
				4348	tcp_dsack_set(sk, seq, end_seq);
				4349	else
				4350	tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
				4351	}
				4352
				4353	static void tcp_rcv_spurious_retrans(struct sock sk, const struct sk_buff skb)
				4354	{
				4355	/* When the ACK path fails or drops most ACKs, the sender would
				4356	* timeout and spuriously retransmit the same segment repeatedly.
				4357	* The receiver remembers and reflects via DSACKs. Leverage the
				4358	* DSACK state and change the txhash to re-route speculatively.
				4359	*/
				4360	if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq)
				4361	sk_rethink_txhash(sk);
				4362	}
				4363
				4364	static void tcp_send_dupack(struct sock sk, const struct sk_buff skb)
				4365	{
				4366	struct tcp_sock *tp = tcp_sk(sk);
				4367
				4368	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
				4369	before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
				4370	NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
				4371	tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
				4372
				4373	if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) {
				4374	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
				4375
				4376	tcp_rcv_spurious_retrans(sk, skb);
				4377	if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
				4378	end_seq = tp->rcv_nxt;
				4379	tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq);
				4380	}
				4381	}
				4382
				4383	tcp_send_ack(sk);
				4384	}
				4385
				4386	/* These routines update the SACK block as out-of-order packets arrive or
				4387	* in-order packets close up the sequence space.
				4388	*/
				4389	static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
				4390	{
				4391	int this_sack;
				4392	struct tcp_sack_block *sp = &tp->selective_acks[0];
				4393	struct tcp_sack_block *swalk = sp + 1;
				4394
				4395	/* See if the recent change to the first SACK eats into
				4396	* or hits the sequence space of other SACK blocks, if so coalesce.
				4397	*/
				4398	for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;) {
				4399	if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) {
				4400	int i;
				4401
				4402	/* Zap SWALK, by moving every further SACK up by one slot.
				4403	* Decrease num_sacks.
				4404	*/
				4405	tp->rx_opt.num_sacks--;
				4406	for (i = this_sack; i < tp->rx_opt.num_sacks; i++)
				4407	sp[i] = sp[i + 1];
				4408	continue;
				4409	}
				4410	this_sack++, swalk++;
				4411	}
				4412	}
				4413
				4414	static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
				4415	{
				4416	struct tcp_sock *tp = tcp_sk(sk);
				4417	struct tcp_sack_block *sp = &tp->selective_acks[0];
				4418	int cur_sacks = tp->rx_opt.num_sacks;
				4419	int this_sack;
				4420
				4421	if (!cur_sacks)
				4422	goto new_sack;
				4423
				4424	for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) {
				4425	if (tcp_sack_extend(sp, seq, end_seq)) {
				4426	/* Rotate this_sack to the first one. */
				4427	for (; this_sack > 0; this_sack--, sp--)
				4428	swap(sp, (sp - 1));
				4429	if (cur_sacks > 1)
				4430	tcp_sack_maybe_coalesce(tp);
				4431	return;
				4432	}
				4433	}
				4434
				4435	/* Could not find an adjacent existing SACK, build a new one,
				4436	* put it at the front, and shift everyone else down. We
				4437	* always know there is at least one SACK present already here.
				4438	*
				4439	* If the sack array is full, forget about the last one.
				4440	*/
				4441	if (this_sack >= TCP_NUM_SACKS) {
				4442	if (tp->compressed_ack > TCP_FASTRETRANS_THRESH)
				4443	tcp_send_ack(sk);
				4444	this_sack--;
				4445	tp->rx_opt.num_sacks--;
				4446	sp--;
				4447	}
				4448	for (; this_sack > 0; this_sack--, sp--)
				4449	sp = (sp - 1);
				4450
				4451	new_sack:
				4452	/* Build the new head SACK, and we're done. */
				4453	sp->start_seq = seq;
				4454	sp->end_seq = end_seq;
				4455	tp->rx_opt.num_sacks++;
				4456	}
				4457
				4458	/* RCV.NXT advances, some SACKs should be eaten. */
				4459
				4460	static void tcp_sack_remove(struct tcp_sock *tp)
				4461	{
				4462	struct tcp_sack_block *sp = &tp->selective_acks[0];
				4463	int num_sacks = tp->rx_opt.num_sacks;
				4464	int this_sack;
				4465
				4466	/* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
				4467	if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
				4468	tp->rx_opt.num_sacks = 0;
				4469	return;
				4470	}
				4471
				4472	for (this_sack = 0; this_sack < num_sacks;) {
				4473	/* Check if the start of the sack is covered by RCV.NXT. */
				4474	if (!before(tp->rcv_nxt, sp->start_seq)) {
				4475	int i;
				4476
				4477	/* RCV.NXT must cover all the block! */
				4478	WARN_ON(before(tp->rcv_nxt, sp->end_seq));
				4479
				4480	/* Zap this SACK, by moving forward any other SACKS. */
				4481	for (i = this_sack+1; i < num_sacks; i++)
				4482	tp->selective_acks[i-1] = tp->selective_acks[i];
				4483	num_sacks--;
				4484	continue;
				4485	}
				4486	this_sack++;
				4487	sp++;
				4488	}
				4489	tp->rx_opt.num_sacks = num_sacks;
				4490	}
				4491
				4492	/**
				4493	* tcp_try_coalesce - try to merge skb to prior one
				4494	* @sk: socket
				4495	* @dest: destination queue
				4496	* @to: prior buffer
				4497	* @from: buffer to add in queue
				4498	* @fragstolen: pointer to boolean
				4499	*
				4500	* Before queueing skb @from after @to, try to merge them
				4501	* to reduce overall memory use and queue lengths, if cost is small.
				4502	* Packets in ofo or receive queues can stay a long time.
				4503	* Better try to coalesce them right now to avoid future collapses.
				4504	* Returns true if caller should free @from instead of queueing it
				4505	*/
				4506	static bool tcp_try_coalesce(struct sock *sk,
				4507	struct sk_buff *to,
				4508	struct sk_buff *from,
				4509	bool *fragstolen)
				4510	{
				4511	int delta;
				4512
				4513	*fragstolen = false;
				4514
				4515	/* Its possible this segment overlaps with prior segment in queue */
				4516	if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
				4517	return false;
				4518
				4519	#ifdef CONFIG_TLS_DEVICE
				4520	if (from->decrypted != to->decrypted)
				4521	return false;
				4522	#endif
				4523
				4524	if (!skb_try_coalesce(to, from, fragstolen, &delta))
				4525	return false;
				4526
				4527	atomic_add(delta, &sk->sk_rmem_alloc);
				4528	sk_mem_charge(sk, delta);
				4529	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
				4530	TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
				4531	TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
				4532	TCP_SKB_CB(to)->tcp_flags \|= TCP_SKB_CB(from)->tcp_flags;
				4533
				4534	if (TCP_SKB_CB(from)->has_rxtstamp) {
				4535	TCP_SKB_CB(to)->has_rxtstamp = true;
				4536	to->tstamp = from->tstamp;
				4537	skb_hwtstamps(to)->hwtstamp = skb_hwtstamps(from)->hwtstamp;
				4538	}
				4539
				4540	return true;
				4541	}
				4542
				4543	static bool tcp_ooo_try_coalesce(struct sock *sk,
				4544	struct sk_buff *to,
				4545	struct sk_buff *from,
				4546	bool *fragstolen)
				4547	{
				4548	bool res = tcp_try_coalesce(sk, to, from, fragstolen);
				4549
				4550	/* In case tcp_drop() is called later, update to->gso_segs */
				4551	if (res) {
				4552	u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) +
				4553	max_t(u16, 1, skb_shinfo(from)->gso_segs);
				4554
				4555	skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
				4556	}
				4557	return res;
				4558	}
				4559
				/* Drop @skb: fire the vendor trace hook, account the drop on @sk
				 * and free the buffer.
				 */
				static void tcp_drop(struct sock *sk, struct sk_buff *skb)
				{
					trace_android_vh_kfree_skb(skb);
					sk_drops_add(sk, skb);
					__kfree_skb(skb);
				}
				4566
				4567	/* This one checks to see if we can put data from the
				4568	* out_of_order queue into the receive_queue.
				4569	*/
				4570	static void tcp_ofo_queue(struct sock *sk)
				4571	{
				4572	struct tcp_sock *tp = tcp_sk(sk);
				4573	__u32 dsack_high = tp->rcv_nxt;
				4574	bool fin, fragstolen, eaten;
				4575	struct sk_buff skb, tail;
				4576	struct rb_node *p;
				4577
				4578	p = rb_first(&tp->out_of_order_queue);
				4579	while (p) {
				4580	skb = rb_to_skb(p);
				4581	if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
				4582	break;
				4583
				4584	if (before(TCP_SKB_CB(skb)->seq, dsack_high)) {
				4585	__u32 dsack = dsack_high;
				4586	if (before(TCP_SKB_CB(skb)->end_seq, dsack_high))
				4587	dsack_high = TCP_SKB_CB(skb)->end_seq;
				4588	tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
				4589	}
				4590	p = rb_next(p);
				4591	rb_erase(&skb->rbnode, &tp->out_of_order_queue);
				4592
				4593	if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
				4594	tcp_drop(sk, skb);
				4595	continue;
				4596	}
				4597
				4598	tail = skb_peek_tail(&sk->sk_receive_queue);
				4599	eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
				4600	tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
				4601	fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
				4602	if (!eaten)
				4603	__skb_queue_tail(&sk->sk_receive_queue, skb);
				4604	else
				4605	kfree_skb_partial(skb, fragstolen);
				4606
				4607	if (unlikely(fin)) {
				4608	tcp_fin(sk);
				4609	/* tcp_fin() purges tp->out_of_order_queue,
				4610	* so we must end this loop right now.
				4611	*/
				4612	break;
				4613	}
				4614	}
				4615	}
				4616
				4617	static bool tcp_prune_ofo_queue(struct sock *sk);
				4618	static int tcp_prune_queue(struct sock *sk);
				4619
				4620	static int tcp_try_rmem_schedule(struct sock sk, struct sk_buff skb,
				4621	unsigned int size)
				4622	{
				4623	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf \|\|
				4624	!sk_rmem_schedule(sk, skb, size)) {
				4625
				4626	if (tcp_prune_queue(sk) < 0)
				4627	return -1;
				4628
				4629	while (!sk_rmem_schedule(sk, skb, size)) {
				4630	if (!tcp_prune_ofo_queue(sk))
				4631	return -1;
				4632	}
				4633	}
				4634	return 0;
				4635	}
				4636
				4637	static void tcp_data_queue_ofo(struct sock sk, struct sk_buff skb)
				4638	{
				4639	struct tcp_sock *tp = tcp_sk(sk);
				4640	struct rb_node *p, parent;
				4641	struct sk_buff *skb1;
				4642	u32 seq, end_seq;
				4643	bool fragstolen;
				4644
				4645	tcp_ecn_check_ce(sk, skb);
				4646
				4647	if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
				4648	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
				4649	sk->sk_data_ready(sk);
				4650	tcp_drop(sk, skb);
				4651	return;
				4652	}
				4653
				4654	/* Disable header prediction. */
				4655	tp->pred_flags = 0;
				4656	inet_csk_schedule_ack(sk);
				4657
				4658	tp->rcv_ooopack += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
				4659	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
				4660	seq = TCP_SKB_CB(skb)->seq;
				4661	end_seq = TCP_SKB_CB(skb)->end_seq;
				4662
				4663	p = &tp->out_of_order_queue.rb_node;
				4664	if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
				4665	/* Initial out of order segment, build 1 SACK. */
				4666	if (tcp_is_sack(tp)) {
				4667	tp->rx_opt.num_sacks = 1;
				4668	tp->selective_acks[0].start_seq = seq;
				4669	tp->selective_acks[0].end_seq = end_seq;
				4670	}
				4671	rb_link_node(&skb->rbnode, NULL, p);
				4672	rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
				4673	tp->ooo_last_skb = skb;
				4674	goto end;
				4675	}
				4676
				4677	/* In the typical case, we are adding an skb to the end of the list.
				4678	* Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
				4679	*/
				4680	if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb,
				4681	skb, &fragstolen)) {
				4682	coalesce_done:
				4683	/* For non sack flows, do not grow window to force DUPACK
				4684	* and trigger fast retransmit.
				4685	*/
				4686	if (tcp_is_sack(tp))
				4687	tcp_grow_window(sk, skb);
				4688	kfree_skb_partial(skb, fragstolen);
				4689	skb = NULL;
				4690	goto add_sack;
				4691	}
				4692	/* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */
				4693	if (!before(seq, TCP_SKB_CB(tp->ooo_last_skb)->end_seq)) {
				4694	parent = &tp->ooo_last_skb->rbnode;
				4695	p = &parent->rb_right;
				4696	goto insert;
				4697	}
				4698
				4699	/* Find place to insert this segment. Handle overlaps on the way. */
				4700	parent = NULL;
				4701	while (*p) {
				4702	parent = *p;
				4703	skb1 = rb_to_skb(parent);
				4704	if (before(seq, TCP_SKB_CB(skb1)->seq)) {
				4705	p = &parent->rb_left;
				4706	continue;
				4707	}
				4708	if (before(seq, TCP_SKB_CB(skb1)->end_seq)) {
				4709	if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
				4710	/* All the bits are present. Drop. */
				4711	NET_INC_STATS(sock_net(sk),
				4712	LINUX_MIB_TCPOFOMERGE);
				4713	tcp_drop(sk, skb);
				4714	skb = NULL;
				4715	tcp_dsack_set(sk, seq, end_seq);
				4716	goto add_sack;
				4717	}
				4718	if (after(seq, TCP_SKB_CB(skb1)->seq)) {
				4719	/* Partial overlap. */
				4720	tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq);
				4721	} else {
				4722	/* skb's seq == skb1's seq and skb covers skb1.
				4723	* Replace skb1 with skb.
				4724	*/
				4725	rb_replace_node(&skb1->rbnode, &skb->rbnode,
				4726	&tp->out_of_order_queue);
				4727	tcp_dsack_extend(sk,
				4728	TCP_SKB_CB(skb1)->seq,
				4729	TCP_SKB_CB(skb1)->end_seq);
				4730	NET_INC_STATS(sock_net(sk),
				4731	LINUX_MIB_TCPOFOMERGE);
				4732	tcp_drop(sk, skb1);
				4733	goto merge_right;
				4734	}
				4735	} else if (tcp_ooo_try_coalesce(sk, skb1,
				4736	skb, &fragstolen)) {
				4737	goto coalesce_done;
				4738	}
				4739	p = &parent->rb_right;
				4740	}
				4741	insert:
				4742	/* Insert segment into RB tree. */
				4743	rb_link_node(&skb->rbnode, parent, p);
				4744	rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
				4745
				4746	merge_right:
				4747	/* Remove other segments covered by skb. */
				4748	while ((skb1 = skb_rb_next(skb)) != NULL) {
				4749	if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
				4750	break;
				4751	if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
				4752	tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
				4753	end_seq);
				4754	break;
				4755	}
				4756	rb_erase(&skb1->rbnode, &tp->out_of_order_queue);
				4757	tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
				4758	TCP_SKB_CB(skb1)->end_seq);
				4759	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
				4760	tcp_drop(sk, skb1);
				4761	}
				4762	/* If there is no skb after us, we are the last_skb ! */
				4763	if (!skb1)
				4764	tp->ooo_last_skb = skb;
				4765
				4766	add_sack:
				4767	if (tcp_is_sack(tp))
				4768	tcp_sack_new_ofo_skb(sk, seq, end_seq);
				4769	end:
				4770	if (skb) {
				4771	/* For non sack flows, do not grow window to force DUPACK
				4772	* and trigger fast retransmit.
				4773	*/
				4774	if (tcp_is_sack(tp))
				4775	tcp_grow_window(sk, skb);
				4776	skb_condense(skb);
				4777	skb_set_owner_r(skb, sk);
				4778	}
				4779	}
				4780
				4781	static int __must_check tcp_queue_rcv(struct sock sk, struct sk_buff skb,
				4782	bool *fragstolen)
				4783	{
				4784	int eaten;
				4785	struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
				4786
				4787	eaten = (tail &&
				4788	tcp_try_coalesce(sk, tail,
				4789	skb, fragstolen)) ? 1 : 0;
				4790	tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq);
				4791	if (!eaten) {
				4792	__skb_queue_tail(&sk->sk_receive_queue, skb);
				4793	skb_set_owner_r(skb, sk);
				4794	}
				4795	return eaten;
				4796	}
				4797
				4798	int tcp_send_rcvq(struct sock sk, struct msghdr msg, size_t size)
				4799	{
				4800	struct sk_buff *skb;
				4801	int err = -ENOMEM;
				4802	int data_len = 0;
				4803	bool fragstolen;
				4804
				4805	if (size == 0)
				4806	return 0;
				4807
				4808	if (size > PAGE_SIZE) {
				4809	int npages = min_t(size_t, size >> PAGE_SHIFT, MAX_SKB_FRAGS);
				4810
				4811	data_len = npages << PAGE_SHIFT;
				4812	size = data_len + (size & ~PAGE_MASK);
				4813	}
				4814	skb = alloc_skb_with_frags(size - data_len, data_len,
				4815	PAGE_ALLOC_COSTLY_ORDER,
				4816	&err, sk->sk_allocation);
				4817	if (!skb)
				4818	goto err;
				4819
				4820	skb_put(skb, size - data_len);
				4821	skb->data_len = data_len;
				4822	skb->len = size;
				4823
				4824	if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
				4825	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
				4826	goto err_free;
				4827	}
				4828
				4829	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
				4830	if (err)
				4831	goto err_free;
				4832
				4833	TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
				4834	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
				4835	TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
				4836
				4837	if (tcp_queue_rcv(sk, skb, &fragstolen)) {
				4838	WARN_ON_ONCE(fragstolen); /* should not happen */
				4839	__kfree_skb(skb);
				4840	}
				4841	return size;
				4842
				4843	err_free:
				4844	kfree_skb(skb);
				4845	err:
				4846	return err;
				4847
				4848	}
				4849
				4850	void tcp_data_ready(struct sock *sk)
				4851	{
				4852	const struct tcp_sock *tp = tcp_sk(sk);
				4853	int avail = tp->rcv_nxt - tp->copied_seq;
				4854
				4855	if (avail < sk->sk_rcvlowat && !tcp_rmem_pressure(sk) &&
				4856	!sock_flag(sk, SOCK_DONE) &&
				4857	tcp_receive_window(tp) > inet_csk(sk)->icsk_ack.rcv_mss)
				4858	return;
				4859
				4860	sk->sk_data_ready(sk);
				4861	}
				4862
				4863	static void tcp_data_queue(struct sock sk, struct sk_buff skb)
				4864	{
				4865	struct tcp_sock *tp = tcp_sk(sk);
				4866	bool fragstolen;
				4867	int eaten;
				4868
				4869	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
				4870	__kfree_skb(skb);
				4871	return;
				4872	}
				4873	skb_dst_drop(skb);
				4874	__skb_pull(skb, tcp_hdr(skb)->doff * 4);
				4875
				4876	tp->rx_opt.dsack = 0;
				4877
				4878	/* Queue data for delivery to the user.
				4879	* Packets in sequence go to the receive queue.
				4880	* Out of sequence packets to the out_of_order_queue.
				4881	*/
				4882	if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
				4883	if (tcp_receive_window(tp) == 0) {
				4884	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
				4885	goto out_of_window;
				4886	}
				4887
				4888	/* Ok. In sequence. In window. */
				4889	queue_and_out:
				4890	if (skb_queue_len(&sk->sk_receive_queue) == 0)
				4891	sk_forced_mem_schedule(sk, skb->truesize);
				4892	else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
				4893	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
				4894	sk->sk_data_ready(sk);
				4895	goto drop;
				4896	}
				4897
				4898	eaten = tcp_queue_rcv(sk, skb, &fragstolen);
				4899	if (skb->len)
				4900	tcp_event_data_recv(sk, skb);
				4901	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
				4902	tcp_fin(sk);
				4903
				4904	if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
				4905	tcp_ofo_queue(sk);
				4906
				4907	/* RFC5681. 4.2. SHOULD send immediate ACK, when
				4908	* gap in queue is filled.
				4909	*/
				4910	if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
				4911	inet_csk(sk)->icsk_ack.pending \|= ICSK_ACK_NOW;
				4912	}
				4913
				4914	if (tp->rx_opt.num_sacks)
				4915	tcp_sack_remove(tp);
				4916
				4917	tcp_fast_path_check(sk);
				4918
				4919	if (eaten > 0)
				4920	kfree_skb_partial(skb, fragstolen);
				4921	if (!sock_flag(sk, SOCK_DEAD))
				4922	tcp_data_ready(sk);
				4923	return;
				4924	}
				4925
				4926	if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
				4927	tcp_rcv_spurious_retrans(sk, skb);
				4928	/* A retransmit, 2nd most common case. Force an immediate ack. */
				4929	NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
				4930	tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
				4931
				4932	out_of_window:
				4933	tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
				4934	inet_csk_schedule_ack(sk);
				4935	drop:
				4936	tcp_drop(sk, skb);
				4937	return;
				4938	}
				4939
				4940	/* Out of window. F.e. zero window probe. */
				4941	if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
				4942	goto out_of_window;
				4943
				4944	if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
				4945	/* Partial packet, seq < rcv_next < end_seq */
				4946	tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
				4947
				4948	/* If window is closed, drop tail of packet. But after
				4949	* remembering D-SACK for its head made in previous line.
				4950	*/
				4951	if (!tcp_receive_window(tp)) {
				4952	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
				4953	goto out_of_window;
				4954	}
				4955	goto queue_and_out;
				4956	}
				4957
				4958	tcp_data_queue_ofo(sk, skb);
				4959	}
				4960
				4961	static struct sk_buff tcp_skb_next(struct sk_buff skb, struct sk_buff_head *list)
				4962	{
				4963	if (list)
				4964	return !skb_queue_is_last(list, skb) ? skb->next : NULL;
				4965
				4966	return skb_rb_next(skb);
				4967	}
				4968
				4969	static struct sk_buff tcp_collapse_one(struct sock sk, struct sk_buff *skb,
				4970	struct sk_buff_head *list,
				4971	struct rb_root *root)
				4972	{
				4973	struct sk_buff *next = tcp_skb_next(skb, list);
				4974
				4975	if (list)
				4976	__skb_unlink(skb, list);
				4977	else
				4978	rb_erase(&skb->rbnode, root);
				4979
				4980	__kfree_skb(skb);
				4981	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
				4982
				4983	return next;
				4984	}
				4985
				4986	/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */
				4987	void tcp_rbtree_insert(struct rb_root root, struct sk_buff skb)
				4988	{
				4989	struct rb_node **p = &root->rb_node;
				4990	struct rb_node *parent = NULL;
				4991	struct sk_buff *skb1;
				4992
				4993	while (*p) {
				4994	parent = *p;
				4995	skb1 = rb_to_skb(parent);
				4996	if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
				4997	p = &parent->rb_left;
				4998	else
				4999	p = &parent->rb_right;
				5000	}
				5001	rb_link_node(&skb->rbnode, parent, p);
				5002	rb_insert_color(&skb->rbnode, root);
				5003	}
				5004
				5005	/* Collapse contiguous sequence of skbs head..tail with
				5006	* sequence numbers start..end.
				5007	*
				5008	* If tail is NULL, this means until the end of the queue.
				5009	*
				5010	* Segments with FIN/SYN are not collapsed (only because this
				5011	* simplifies code)
				5012	*/
				5013	static void
				5014	tcp_collapse(struct sock sk, struct sk_buff_head list, struct rb_root *root,
				5015	struct sk_buff head, struct sk_buff tail, u32 start, u32 end)
				5016	{
				5017	struct sk_buff skb = head, n;
				5018	struct sk_buff_head tmp;
				5019	bool end_of_skbs;
				5020
				5021	/* First, check that queue is collapsible and find
				5022	* the point where collapsing can be useful.
				5023	*/
				5024	restart:
				5025	for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) {
				5026	n = tcp_skb_next(skb, list);
				5027
				5028	/* No new bits? It is possible on ofo queue. */
				5029	if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
				5030	skb = tcp_collapse_one(sk, skb, list, root);
				5031	if (!skb)
				5032	break;
				5033	goto restart;
				5034	}
				5035
				5036	/* The first skb to collapse is:
				5037	* - not SYN/FIN and
				5038	* - bloated or contains data before "start" or
				5039	* overlaps to the next one.
				5040	*/
				5041	if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN \| TCPHDR_FIN)) &&
				5042	(tcp_win_from_space(sk, skb->truesize) > skb->len \|\|
				5043	before(TCP_SKB_CB(skb)->seq, start))) {
				5044	end_of_skbs = false;
				5045	break;
				5046	}
				5047
				5048	if (n && n != tail &&
				5049	TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
				5050	end_of_skbs = false;
				5051	break;
				5052	}
				5053
				5054	/* Decided to skip this, advance start seq. */
				5055	start = TCP_SKB_CB(skb)->end_seq;
				5056	}
				5057	if (end_of_skbs \|\|
				5058	(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN \| TCPHDR_FIN)))
				5059	return;
				5060
				5061	__skb_queue_head_init(&tmp);
				5062
				5063	while (before(start, end)) {
				5064	int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
				5065	struct sk_buff *nskb;
				5066
				5067	nskb = alloc_skb(copy, GFP_ATOMIC);
				5068	if (!nskb)
				5069	break;
				5070
				5071	memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
				5072	#ifdef CONFIG_TLS_DEVICE
				5073	nskb->decrypted = skb->decrypted;
				5074	#endif
				5075	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
				5076	if (list)
				5077	__skb_queue_before(list, skb, nskb);
				5078	else
				5079	__skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */
				5080	skb_set_owner_r(nskb, sk);
				5081
				5082	/* Copy data, releasing collapsed skbs. */
				5083	while (copy > 0) {
				5084	int offset = start - TCP_SKB_CB(skb)->seq;
				5085	int size = TCP_SKB_CB(skb)->end_seq - start;
				5086
				5087	BUG_ON(offset < 0);
				5088	if (size > 0) {
				5089	size = min(copy, size);
				5090	if (skb_copy_bits(skb, offset, skb_put(nskb, size), size))
				5091	BUG();
				5092	TCP_SKB_CB(nskb)->end_seq += size;
				5093	copy -= size;
				5094	start += size;
				5095	}
				5096	if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
				5097	skb = tcp_collapse_one(sk, skb, list, root);
				5098	if (!skb \|\|
				5099	skb == tail \|\|
				5100	(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN \| TCPHDR_FIN)))
				5101	goto end;
				5102	#ifdef CONFIG_TLS_DEVICE
				5103	if (skb->decrypted != nskb->decrypted)
				5104	goto end;
				5105	#endif
				5106	}
				5107	}
				5108	}
				5109	end:
				5110	skb_queue_walk_safe(&tmp, skb, n)
				5111	tcp_rbtree_insert(root, skb);
				5112	}
				5113
				5114	/* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
				5115	* and tcp_collapse() them until all the queue is collapsed.
				5116	*/
				5117	static void tcp_collapse_ofo_queue(struct sock *sk)
				5118	{
				5119	struct tcp_sock *tp = tcp_sk(sk);
				5120	u32 range_truesize, sum_tiny = 0;
				5121	struct sk_buff skb, head;
				5122	u32 start, end;
				5123
				5124	skb = skb_rb_first(&tp->out_of_order_queue);
				5125	new_range:
				5126	if (!skb) {
				5127	tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue);
				5128	return;
				5129	}
				5130	start = TCP_SKB_CB(skb)->seq;
				5131	end = TCP_SKB_CB(skb)->end_seq;
				5132	range_truesize = skb->truesize;
				5133
				5134	for (head = skb;;) {
				5135	skb = skb_rb_next(skb);
				5136
				5137	/* Range is terminated when we see a gap or when
				5138	* we are at the queue end.
				5139	*/
				5140	if (!skb \|\|
				5141	after(TCP_SKB_CB(skb)->seq, end) \|\|
				5142	before(TCP_SKB_CB(skb)->end_seq, start)) {
				5143	/* Do not attempt collapsing tiny skbs */
				5144	if (range_truesize != head->truesize \|\|
				5145	end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) {
				5146	tcp_collapse(sk, NULL, &tp->out_of_order_queue,
				5147	head, skb, start, end);
				5148	} else {
				5149	sum_tiny += range_truesize;
				5150	if (sum_tiny > sk->sk_rcvbuf >> 3)
				5151	return;
				5152	}
				5153	goto new_range;
				5154	}
				5155
				5156	range_truesize += skb->truesize;
				5157	if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
				5158	start = TCP_SKB_CB(skb)->seq;
				5159	if (after(TCP_SKB_CB(skb)->end_seq, end))
				5160	end = TCP_SKB_CB(skb)->end_seq;
				5161	}
				5162	}
				5163
				5164	/*
				5165	* Clean the out-of-order queue to make room.
				5166	* We drop high sequences packets to :
				5167	* 1) Let a chance for holes to be filled.
				5168	* 2) not add too big latencies if thousands of packets sit there.
				5169	* (But if application shrinks SO_RCVBUF, we could still end up
				5170	* freeing whole queue here)
				5171	* 3) Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks.
				5172	*
				5173	* Return true if queue has shrunk.
				5174	*/
				5175	static bool tcp_prune_ofo_queue(struct sock *sk)
				5176	{
				5177	struct tcp_sock *tp = tcp_sk(sk);
				5178	struct rb_node node, prev;
				5179	int goal;
				5180
				5181	if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
				5182	return false;
				5183
				5184	NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
				5185	goal = sk->sk_rcvbuf >> 3;
				5186	node = &tp->ooo_last_skb->rbnode;
				5187	do {
				5188	prev = rb_prev(node);
				5189	rb_erase(node, &tp->out_of_order_queue);
				5190	goal -= rb_to_skb(node)->truesize;
				5191	tcp_drop(sk, rb_to_skb(node));
				5192	if (!prev \|\| goal <= 0) {
				5193	sk_mem_reclaim(sk);
				5194	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
				5195	!tcp_under_memory_pressure(sk))
				5196	break;
				5197	goal = sk->sk_rcvbuf >> 3;
				5198	}
				5199	node = prev;
				5200	} while (node);
				5201	tp->ooo_last_skb = rb_to_skb(prev);
				5202
				5203	/* Reset SACK state. A conforming SACK implementation will
				5204	* do the same at a timeout based retransmit. When a connection
				5205	* is in a sad state like this, we care only about integrity
				5206	* of the connection not performance.
				5207	*/
				5208	if (tp->rx_opt.sack_ok)
				5209	tcp_sack_reset(&tp->rx_opt);
				5210	return true;
				5211	}
				5212
				5213	/* Reduce allocated memory if we can, trying to get
				5214	* the socket within its memory limits again.
				5215	*
					* The steps below are tried in order, and the function returns 0 as
					* soon as sk_rmem_alloc is back at or below sk_rcvbuf:
					* 1) clamp the window, or cap rcv_ssthresh under memory pressure;
					* 2) collapse the out-of-order queue and the receive queue;
					* 3) prune the out-of-order queue (destructive).
					*
				5216	* Return less than zero if we should start dropping frames
				5217	* until the socket owning process reads some of the data
				5218	* to stabilize the situation.
				5219	*/
				5220	static int tcp_prune_queue(struct sock *sk)
				5221	{
				5222	struct tcp_sock *tp = tcp_sk(sk);
				5223
				5224	NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);
				5225
					/* At or above the receive buffer limit: clamp the window.
					* Otherwise, under memory pressure, limit rcv_ssthresh to at
					* most four times advmss.
					*/
				5226	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
				5227	tcp_clamp_window(sk);
				5228	else if (tcp_under_memory_pressure(sk))
				5229	tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
				5230
				5231	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
				5232	return 0;
				5233
					/* Non-destructive step: collapse both queues, then reclaim. */
				5234	tcp_collapse_ofo_queue(sk);
				5235	if (!skb_queue_empty(&sk->sk_receive_queue))
				5236	tcp_collapse(sk, &sk->sk_receive_queue, NULL,
				5237	skb_peek(&sk->sk_receive_queue),
				5238	NULL,
				5239	tp->copied_seq, tp->rcv_nxt);
				5240	sk_mem_reclaim(sk);
				5241
				5242	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
				5243	return 0;
				5244
				5245	/* Collapsing did not help, destructive actions follow.
				5246	* This must not ever occur. */
				5247
				5248	tcp_prune_ofo_queue(sk);
				5249
				5250	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
				5251	return 0;
				5252
				5253	/* If we are really being abused, tell the caller to silently
				5254	* drop receive data on the floor. It will get retransmitted
				5255	* and hopefully then we'll have sufficient space.
				5256	*/
				5257	NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED);
				5258
				5259	/* Massive buffer overcommit. */
					/* Zero pred_flags turns header prediction (the fast path) off. */
				5260	tp->pred_flags = 0;
				5261	return -1;
				5262	}
				5263
					/* Tell whether the send buffer of @sk may be grown right now.
					* Returns false as soon as one of the checks below vetoes the
					* expansion, and true when none of them does.
					*/
				5264	static bool tcp_should_expand_sndbuf(const struct sock *sk)
				5265	{
				5266	const struct tcp_sock *tp = tcp_sk(sk);
				5267
				5268	/* If the user specified a specific send buffer setting, do
				5269	* not modify it.
				5270	*/
				5271	if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
				5272	return false;
				5273
				5274	/* If we are under global TCP memory pressure, do not expand. */
				5275	if (tcp_under_memory_pressure(sk))
				5276	return false;
				5277
				5278	/* If we are under soft global TCP memory pressure, do not expand. */
					/* "Soft" here means allocated memory has reached limit index 0. */
				5279	if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
				5280	return false;
				5281
				5282	/* If we filled the congestion window, do not expand. */
				5283	if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
				5284	return false;
				5285
				5286	return true;
				5287	}
				5288
				5289	/* When incoming ACK allowed to free some skb from write_queue,
				5290	* we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket
				5291	* on the exit from tcp input handler.
				5292	*
					* This helper grows the send buffer when tcp_should_expand_sndbuf()
					* allows it, restamps snd_cwnd_stamp in that case, and then always
					* invokes the socket's sk_write_space() callback.
					*
				5293	* PROBLEM: sndbuf expansion does not work well with largesend.
				5294	*/
				5295	static void tcp_new_space(struct sock *sk)
				5296	{
				5297	struct tcp_sock *tp = tcp_sk(sk);
				5298
				5299	if (tcp_should_expand_sndbuf(sk)) {
				5300	tcp_sndbuf_expand(sk);
				5301	tp->snd_cwnd_stamp = tcp_jiffies32;
				5302	}
				5303
					/* Called whether or not the buffer was expanded. */
				5304	sk->sk_write_space(sk);
				5305	}
				5306
				5307	/* Caller made space either from:
				5308	* 1) Freeing skbs in rtx queues (after tp->snd_una has advanced)
				5309	* 2) Sent skbs from output queue (and thus advancing tp->snd_nxt)
				5310	*
				5311	* We might be able to generate EPOLLOUT to the application if:
				5312	* 1) Space consumed in output/rtx queues is below sk->sk_sndbuf/2
				5313	* 2) notsent amount (tp->write_seq - tp->snd_nxt) became
				5314	* small enough that tcp_stream_memory_free() decides it
				5315	* is time to generate EPOLLOUT.
					*
					* Nothing happens unless SOCK_QUEUE_SHRUNK is set on @sk; the flag
					* is consumed (cleared) here.
				5316	*/
				5317	void tcp_check_space(struct sock *sk)
				5318	{
				5319	if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
				5320	sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
				5321	/* pairs with tcp_poll() */
				5322	smp_mb();
					/* Only act when a socket is attached and it flagged SOCK_NOSPACE. */
				5323	if (sk->sk_socket &&
				5324	test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
				5325	tcp_new_space(sk);
					/* If SOCK_NOSPACE got cleared meanwhile, stop the
					* sndbuf-limited chrono.
					*/
				5326	if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
				5327	tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
				5328	}
				5329	}
				5330	}
				5331
					/* Push any pending frames for @sk, then run tcp_check_space(). */
				5332	static inline void tcp_data_snd_check(struct sock *sk)
				5333	{
				5334	tcp_push_pending_frames(sk);
				5335	tcp_check_space(sk);
				5336	}
				5337
				5338	/*
				5339	* Check if sending an ack is needed.
					*
					* @sk: socket being processed.
					* @ofo_possible: when zero, the out-of-order queue is not examined
					* and any ACK that is not sent immediately becomes a delayed ACK.
					*
					* The ACK is sent immediately, delayed, or (with SACK and a non-empty
					* out-of-order queue) compressed behind compressed_ack_timer.
				5340	*/
				5341	static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
				5342	{
				5343	struct tcp_sock *tp = tcp_sk(sk);
				5344	unsigned long rtt, delay;
				5345
				5346	/* More than one full frame received... */
				5347	if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
				5348	/* ... and right edge of window advances far enough.
				5349	* (tcp_recvmsg() will send ACK otherwise).
				5350	* If application uses SO_RCVLOWAT, we want send ack now if
				5351	* we have not received enough bytes to satisfy the condition.
				5352	*/
				5353	(tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
				5354	__tcp_select_window(sk) >= tp->rcv_wnd)) ||
				5355	/* We ACK each frame or... */
				5356	tcp_in_quickack_mode(sk) ||
				5357	/* Protocol state mandates a one-time immediate ACK */
				5358	inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOW) {
				5359	send_now:
				5360	tcp_send_ack(sk);
				5361	return;
				5362	}
				5363
					/* No out-of-order data to report (or caller ruled it out): delay. */
				5364	if (!ofo_possible || RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
				5365	tcp_send_delayed_ack(sk);
				5366	return;
				5367	}
				5368
					/* Without SACK, or once the compressed-ACK count reaches the
					* tcp_comp_sack_nr sysctl, send the ACK right away.
					*/
				5369	if (!tcp_is_sack(tp) ||
				5370	tp->compressed_ack >= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr))
				5371	goto send_now;
				5372
					/* rcv_nxt moved since the last compression round: account the
					* ACKs compressed beyond TCP_FASTRETRANS_THRESH and restart.
					*/
				5373	if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) {
				5374	tp->compressed_ack_rcv_nxt = tp->rcv_nxt;
				5375	if (tp->compressed_ack > TCP_FASTRETRANS_THRESH)
				5376	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
				5377	tp->compressed_ack - TCP_FASTRETRANS_THRESH);
				5378	tp->compressed_ack = 0;
				5379	}
				5380
					/* The first TCP_FASTRETRANS_THRESH ACKs of a round are never held back. */
				5381	if (++tp->compressed_ack <= TCP_FASTRETRANS_THRESH)
				5382	goto send_now;
				5383
					/* A compression timer is already pending; it will cover this ACK. */
				5384	if (hrtimer_is_queued(&tp->compressed_ack_timer))
				5385	return;
				5386
				5387	/* compress ack timer : 5 % of rtt, but no more than tcp_comp_sack_delay_ns */
				5388
					/* Use the smaller of the receiver-side RTT estimate and a non-zero srtt. */
				5389	rtt = tp->rcv_rtt_est.rtt_us;
				5390	if (tp->srtt_us && tp->srtt_us < rtt)
				5391	rtt = tp->srtt_us;
				5392
					/* NOTE(review): the ">> 3" presumably undoes an 8x fixed-point
					* scaling of the rtt_us fields - confirm against their definitions.
					*/
				5393	delay = min_t(unsigned long,
				5394	READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns),
				5395	rtt * (NSEC_PER_USEC >> 3)/20);
					/* Take a socket reference before arming the timer.
					* NOTE(review): presumably released by the timer handler - confirm.
					*/
				5396	sock_hold(sk);
				5397	hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay),
				5398	HRTIMER_MODE_REL_PINNED_SOFT);
				5399	}
				5400
					/* Run the ACK decision logic, but only when an ACK is scheduled
					* for @sk. Out-of-order handling is enabled (ofo_possible = 1).
					*/
				5401	static inline void tcp_ack_snd_check(struct sock *sk)
				5402	{
				5403	if (!inet_csk_ack_scheduled(sk)) {
				5404	/* We sent a data segment already. */
				5405	return;
				5406	}
				5407	__tcp_ack_snd_check(sk, 1);
				5408	}
				5409
				5410	/*
				5411	* This routine is only called when we have urgent data
				5412	* signaled. Its the 'slow' part of tcp_urg. It could be
				5413	* moved inline now as tcp_urg is only called from one
				5414	* place. We handle URGent data wrong. We have to - as
				5415	* BSD still doesn't use the correction from RFC961.
				5416	* For 1003.1g we should support a new option TCP_STDURG to permit
				5417	* either form (or just set the sysctl tcp_stdurg).
					*
					* @sk: socket that received the segment.
					* @th: TCP header of the segment carrying the urgent pointer.
					*
					* On acceptance of a new urgent pointer this sends SIGURG, records
					* the pointer in tp->urg_seq with urg_data = TCP_URG_NOTYET, and
					* clears pred_flags.
				5418	*/
				5419
				5420	static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
				5421	{
				5422	struct tcp_sock *tp = tcp_sk(sk);
				5423	u32 ptr = ntohs(th->urg_ptr);
				5424
					/* Unless the tcp_stdurg sysctl is set, a non-zero pointer is
					* decremented by one; it is then made absolute by adding the
					* segment's sequence number.
					*/
				5425	if (ptr && !READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_stdurg))
				5426	ptr--;
				5427	ptr += ntohl(th->seq);
				5428
				5429	/* Ignore urgent data that we've already seen and read. */
				5430	if (after(tp->copied_seq, ptr))
				5431	return;
				5432
				5433	/* Do not replay urg ptr.
				5434	*
				5435	* NOTE: interesting situation not covered by specs.
				5436	* Misbehaving sender may send urg ptr, pointing to segment,
				5437	* which we already have in ofo queue. We are not able to fetch
				5438	* such data and will stay in TCP_URG_NOTYET until will be eaten
				5439	* by recvmsg(). Seems, we are not obliged to handle such wicked
				5440	* situations. But it is worth to think about possibility of some
				5441	* DoSes using some hypothetical application level deadlock.
				5442	*/
				5443	if (before(ptr, tp->rcv_nxt))
				5444	return;
				5445
				5446	/* Do we already have a newer (or duplicate) urgent pointer? */
				5447	if (tp->urg_data && !after(ptr, tp->urg_seq))
				5448	return;
				5449
				5450	/* Tell the world about our new urgent pointer. */
				5451	sk_send_sigurg(sk);
				5452
				5453	/* We may be adding urgent data when the last byte read was
				5454	* urgent. To do this requires some care. We cannot just ignore
				5455	* tp->copied_seq since we would read the last urgent byte again
				5456	* as data, nor can we alter copied_seq until this data arrives
				5457	* or we break the semantics of SIOCATMARK (and thus sockatmark())
				5458	*
				5459	* NOTE. Double Dutch. Rendering to plain English: author of comment
				5460	* above did something sort of send("A", MSG_OOB); send("B", MSG_OOB);
				5461	* and expect that both A and B disappear from stream. This is _wrong_.
				5462	* Though this happens in BSD with high probability, this is occasional.
				5463	* Any application relying on this is buggy. Note also, that fix "works"
				5464	* only in this artificial test. Insert some normal data between A and B and we will
				5465	* decline of BSD again. Verdict: it is better to remove to trap
				5466	* buggy users.
				5467	*/
				5468	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
				5469	!sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
				5470	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
					/* Step over the old urgent byte; free the head skb of the
					* receive queue if copied_seq has reached its end_seq.
					*/
				5471	tp->copied_seq++;
				5472	if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
				5473	__skb_unlink(skb, &sk->sk_receive_queue);
				5474	__kfree_skb(skb);
				5475	}
				5476	}
				5477
				5478	tp->urg_data = TCP_URG_NOTYET;
				5479	WRITE_ONCE(tp->urg_seq, ptr);
				5480
				5481	/* Disable header prediction. */
				5482	tp->pred_flags = 0;
				5483	}
				5484
				5485	/* This is the 'fast' part of urgent handling. */
					/* @sk: receiving socket.
					* @skb: segment being processed; still includes the TCP header,
					* since the offset computed below adds th->doff * 4.
					* @th: TCP header of @skb.
					*
					* If the pending urgent byte lies inside @skb, it is copied into
					* tp->urg_data (tagged TCP_URG_VALID) and sk_data_ready() is called
					* unless the socket is SOCK_DEAD.
					*/
				5486	static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th)
				5487	{
				5488	struct tcp_sock *tp = tcp_sk(sk);
				5489
				5490	/* Check if we get a new urgent pointer - normally not. */
				5491	if (th->urg)
				5492	tcp_check_urg(sk, th);
				5493
				5494	/* Do we wait for any urgent data? - normally not... */
				5495	if (tp->urg_data == TCP_URG_NOTYET) {
					/* Offset of the urgent byte from the start of skb data. */
				5496	u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) -
				5497	th->syn;
				5498
				5499	/* Is the urgent pointer pointing into this packet? */
				5500	if (ptr < skb->len) {
				5501	u8 tmp;
				5502	if (skb_copy_bits(skb, ptr, &tmp, 1))
				5503	BUG();
				5504	tp->urg_data = TCP_URG_VALID | tmp;
				5505	if (!sock_flag(sk, SOCK_DEAD))
				5506	sk->sk_data_ready(sk);
				5507	}
				5508	}
				5509	}
				5510
				5511	/* Accept RST for rcv_nxt - 1 after a FIN.
				5512	* When tcp connections are abruptly terminated from Mac OSX (via ^C), a
				5513	* FIN is sent followed by a RST packet. The RST is sent with the same
				5514	* sequence number as the FIN, and thus according to RFC 5961 a challenge
				5515	* ACK should be sent. However, Mac OSX rate limits replies to challenge
				5516	* ACKs on the closed socket. In addition middleboxes can drop either the
				5517	* challenge ACK or a subsequent RST.
					*
					* Returns true when the sequence number of @skb equals rcv_nxt - 1
					* and @sk is in CLOSE_WAIT, LAST_ACK or CLOSING state.
				5518	*/
				5519	static bool tcp_reset_check(const struct sock *sk, const struct sk_buff *skb)
				5520	{
				5521	struct tcp_sock *tp = tcp_sk(sk);
				5522
				5523	return unlikely(TCP_SKB_CB(skb)->seq == (tp->rcv_nxt - 1) &&
				5524	(1 << sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK |
				5525	TCPF_CLOSING));
				5526	}
				5527
				5528	/* Does PAWS and seqno based validation of an incoming segment, flags will
				5529	* play significant role here.
					*
					* @sk: receiving socket.
					* @skb: incoming segment.
					* @th: TCP header of @skb.
					* @syn_inerr: when non-zero, a challenged SYN also bumps TCP_MIB_INERRS.
					*
					* Returns true when the segment may be processed further. Returns
					* false after the segment has been dropped with tcp_drop(); on that
					* path a duplicate ACK or challenge ACK may have been sent, or the
					* connection may have been reset.
				5530	*/
				5531	static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
				5532	const struct tcphdr *th, int syn_inerr)
				5533	{
				5534	struct tcp_sock *tp = tcp_sk(sk);
				5535	bool rst_seq_match = false;
				5536
				5537	/* RFC1323: H1. Apply PAWS check first. */
				5538	if (tcp_fast_parse_options(sock_net(sk), skb, th, tp) &&
				5539	tp->rx_opt.saw_tstamp &&
				5540	tcp_paws_discard(sk, skb)) {
				5541	if (!th->rst) {
				5542	NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
					/* The duplicate ACK is rate limited. */
				5543	if (!tcp_oow_rate_limited(sock_net(sk), skb,
				5544	LINUX_MIB_TCPACKSKIPPEDPAWS,
				5545	&tp->last_oow_ack_time))
				5546	tcp_send_dupack(sk, skb);
				5547	goto discard;
				5548	}
				5549	/* Reset is accepted even if it did not pass PAWS. */
				5550	}
				5551
				5552	/* Step 1: check sequence number */
				5553	if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
				5554	/* RFC793, page 37: "In all states except SYN-SENT, all reset
				5555	* (RST) segments are validated by checking their SEQ-fields."
				5556	* And page 69: "If an incoming segment is not acceptable,
				5557	* an acknowledgment should be sent in reply (unless the RST
				5558	* bit is set, if so drop the segment and return)".
				5559	*/
				5560	if (!th->rst) {
				5561	if (th->syn)
				5562	goto syn_challenge;
				5563	if (!tcp_oow_rate_limited(sock_net(sk), skb,
				5564	LINUX_MIB_TCPACKSKIPPEDSEQ,
				5565	&tp->last_oow_ack_time))
				5566	tcp_send_dupack(sk, skb);
				5567	} else if (tcp_reset_check(sk, skb)) {
					/* RST at rcv_nxt - 1 after a FIN: see tcp_reset_check(). */
				5568	tcp_reset(sk);
				5569	}
				5570	goto discard;
				5571	}
				5572
				5573	/* Step 2: check RST bit */
				5574	if (th->rst) {
				5575	/* RFC 5961 3.2 (extend to match against (RCV.NXT - 1) after a
				5576	* FIN and SACK too if available):
				5577	* If seq num matches RCV.NXT or (RCV.NXT - 1) after a FIN, or
				5578	* the right-most SACK block,
				5579	* then
				5580	* RESET the connection
				5581	* else
				5582	* Send a challenge ACK
				5583	*/
				5584	if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt ||
				5585	tcp_reset_check(sk, skb)) {
				5586	rst_seq_match = true;
				5587	} else if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) {
				5588	struct tcp_sack_block *sp = &tp->selective_acks[0];
				5589	int max_sack = sp[0].end_seq;
				5590	int this_sack;
				5591
					/* Find the largest end_seq among the SACK blocks. */
				5592	for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;
				5593	++this_sack) {
				5594	max_sack = after(sp[this_sack].end_seq,
				5595	max_sack) ?
				5596	sp[this_sack].end_seq : max_sack;
				5597	}
				5598
				5599	if (TCP_SKB_CB(skb)->seq == max_sack)
				5600	rst_seq_match = true;
				5601	}
				5602
				5603	if (rst_seq_match)
				5604	tcp_reset(sk);
				5605	else {
				5606	/* Disable TFO if RST is out-of-order
				5607	* and no data has been received
				5608	* for current active TFO socket
				5609	*/
				5610	if (tp->syn_fastopen && !tp->data_segs_in &&
				5611	sk->sk_state == TCP_ESTABLISHED)
				5612	tcp_fastopen_active_disable(sk);
				5613	tcp_send_challenge_ack(sk, skb);
				5614	}
				5615	goto discard;
				5616	}
				5617
				5618	/* step 3: check security and precedence [ignored] */
				5619
				5620	/* step 4: Check for a SYN
				5621	* RFC 5961 4.2 : Send a challenge ack
				5622	*/
				5623	if (th->syn) {
					/* Also reached from step 1 for an out-of-window SYN. */
				5624	syn_challenge:
				5625	if (syn_inerr)
				5626	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
				5627	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
				5628	tcp_send_challenge_ack(sk, skb);
				5629	goto discard;
				5630	}
				5631
				5632	return true;
				5633
				5634	discard:
				5635	tcp_drop(sk, skb);
				5636	return false;
				5637	}
				5638
				5639	/*
				5640	* TCP receive function for the ESTABLISHED state.
				5641	*
				5642	* It is split into a fast path and a slow path. The fast path is
				5643	* disabled when:
				5644	* - A zero window was announced from us - zero window probing
				5645	* is only handled properly in the slow path.
				5646	* - Out of order segments arrived.
				5647	* - Urgent data is expected.
				5648	* - There is no buffer space left
				5649	* - Unexpected TCP flags/window values/header lengths are received
				5650	* (detected by checking the TCP header against pred_flags)
				5651	* - Data is sent in both directions. Fast path only supports pure senders
				5652	* or pure receivers (this means either the sequence number or the ack
				5653	* value must stay constant)
				5654	* - Unexpected TCP option.
				5655	*
				5656	* When these conditions are not satisfied it drops into a standard
				5657	* receive procedure patterned after RFC793 to handle all cases.
				5658	* The first three cases are guaranteed by proper pred_flags setting,
				5659	* the rest is checked inline. Fast processing is turned on in
				5660	* tcp_data_queue when everything is OK.
					*
					* @sk: established socket receiving the segment.
					* @skb: incoming segment; skb->data is read as the TCP header.
					* On every path visible here @skb is either queued, freed or dropped.
				5661	*/
				5662	void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
				5663	{
				5664	const struct tcphdr *th = (const struct tcphdr *)skb->data;
				5665	struct tcp_sock *tp = tcp_sk(sk);
				5666	unsigned int len = skb->len;
				5667
				5668	/* TCP congestion window tracking */
				5669	trace_tcp_probe(sk, skb);
				5670
				5671	tcp_mstamp_refresh(tp);
					/* Set the socket's cached rx dst if none is recorded yet. */
				5672	if (unlikely(!rcu_access_pointer(sk->sk_rx_dst)))
				5673	inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
				5674	/*
				5675	* Header prediction.
				5676	* The code loosely follows the one in the famous
				5677	* "30 instruction TCP receive" Van Jacobson mail.
				5678	*
				5679	* Van's trick is to deposit buffers into socket queue
				5680	* on a device interrupt, to call tcp_recv function
				5681	* on the receive process context and checksum and copy
				5682	* the buffer to user space. smart...
				5683	*
				5684	* Our current scheme is not silly either but we take the
				5685	* extra cost of the net_bh soft interrupt processing...
				5686	* We do checksum and copy also but from device to kernel.
				5687	*/
				5688
				5689	tp->rx_opt.saw_tstamp = 0;
				5690
				5691	/* pred_flags is 0xS?10 << 16 + snd_wnd
				5692	* if header_prediction is to be made
				5693	* 'S' will always be tp->tcp_header_len >> 2
				5694	* '?' will be 0 for the fast path, otherwise pred_flags is 0 to
				5695	* turn it off (when there are holes in the receive
				5696	* space for instance)
				5697	* PSH flag is ignored.
				5698	*/
				5699
					/* Fast path: header matches the prediction, the segment is the
					* next one expected, and it does not ACK beyond snd_nxt.
					*/
				5700	if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
				5701	TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
				5702	!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
				5703	int tcp_header_len = tp->tcp_header_len;
				5704
				5705	/* Timestamp header prediction: tcp_header_len
				5706	* is automatically equal to th->doff*4 due to pred_flags
				5707	* match.
				5708	*/
				5709
				5710	/* Check timestamp */
				5711	if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
				5712	/* No? Slow path! */
				5713	if (!tcp_parse_aligned_timestamp(tp, th))
				5714	goto slow_path;
				5715
				5716	/* If PAWS failed, check it more carefully in slow path */
				5717	if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
				5718	goto slow_path;
				5719
				5720	/* DO NOT update ts_recent here, if checksum fails
				5721	* and timestamp was corrupted part, it will result
				5722	* in a hung connection since we will drop all
				5723	* future packets due to the PAWS test.
				5724	*/
				5725	}
				5726
				5727	if (len <= tcp_header_len) {
				5728	/* Bulk data transfer: sender */
				5729	if (len == tcp_header_len) {
				5730	/* Predicted packet is in window by definition.
				5731	* seq == rcv_nxt and rcv_wup <= rcv_nxt.
				5732	* Hence, check seq<=rcv_wup reduces to:
				5733	*/
				5734	if (tcp_header_len ==
				5735	(sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
				5736	tp->rcv_nxt == tp->rcv_wup)
				5737	tcp_store_ts_recent(tp);
				5738
				5739	/* We know that such packets are checksummed
				5740	* on entry.
				5741	*/
				5742	tcp_ack(sk, skb, 0);
				5743	__kfree_skb(skb);
				5744	tcp_data_snd_check(sk);
				5745	/* When receiving pure ack in fast path, update
				5746	* last ts ecr directly instead of calling
				5747	* tcp_rcv_rtt_measure_ts()
				5748	*/
				5749	tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;
				5750	return;
				5751	} else { /* Header too small */
				5752	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
				5753	goto discard;
				5754	}
				5755	} else {
				5756	int eaten = 0;
				5757	bool fragstolen = false;
				5758
				5759	if (tcp_checksum_complete(skb))
				5760	goto csum_error;
				5761
					/* Not enough forward-allocated memory for this skb:
					* continue at step5 of the slow path.
					*/
				5762	if ((int)skb->truesize > sk->sk_forward_alloc)
				5763	goto step5;
				5764
				5765	/* Predicted packet is in window by definition.
				5766	* seq == rcv_nxt and rcv_wup <= rcv_nxt.
				5767	* Hence, check seq<=rcv_wup reduces to:
				5768	*/
				5769	if (tcp_header_len ==
				5770	(sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
				5771	tp->rcv_nxt == tp->rcv_wup)
				5772	tcp_store_ts_recent(tp);
				5773
				5774	tcp_rcv_rtt_measure_ts(sk, skb);
				5775
				5776	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
				5777
				5778	/* Bulk data transfer: receiver */
				5779	__skb_pull(skb, tcp_header_len);
				5780	eaten = tcp_queue_rcv(sk, skb, &fragstolen);
				5781
				5782	tcp_event_data_recv(sk, skb);
				5783
				5784	if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
				5785	/* Well, only one small jumplet in fast path... */
				5786	tcp_ack(sk, skb, FLAG_DATA);
				5787	tcp_data_snd_check(sk);
				5788	if (!inet_csk_ack_scheduled(sk))
				5789	goto no_ack;
				5790	} else {
				5791	tcp_update_wl(tp, TCP_SKB_CB(skb)->seq);
				5792	}
				5793
				5794	__tcp_ack_snd_check(sk, 0);
				5795	no_ack:
					/* A non-zero result from tcp_queue_rcv() means the skb
					* itself was not queued and must be released here.
					*/
				5796	if (eaten)
				5797	kfree_skb_partial(skb, fragstolen);
				5798	tcp_data_ready(sk);
				5799	return;
				5800	}
				5801	}
				5802
				5803	slow_path:
					/* A segment shorter than its own header length is counted
					* together with checksum failures.
					*/
				5804	if (len < (th->doff << 2) || tcp_checksum_complete(skb))
				5805	goto csum_error;
				5806
					/* Segments carrying none of ACK, RST or SYN are dropped. */
				5807	if (!th->ack && !th->rst && !th->syn)
				5808	goto discard;
				5809
				5810	/*
				5811	* Standard slow path.
				5812	*/
				5813
					/* On failure tcp_validate_incoming() has already dropped the skb. */
				5814	if (!tcp_validate_incoming(sk, skb, th, 1))
				5815	return;
				5816
				5817	step5:
				5818	if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
				5819	goto discard;
				5820
				5821	tcp_rcv_rtt_measure_ts(sk, skb);
				5822
				5823	/* Process urgent data. */
				5824	tcp_urg(sk, skb, th);
				5825
				5826	/* step 7: process the segment text */
				5827	tcp_data_queue(sk, skb);
				5828
				5829	tcp_data_snd_check(sk);
				5830	tcp_ack_snd_check(sk);
				5831	return;
				5832
				5833	csum_error:
				5834	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
				5835	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
				5836
				5837	discard:
				5838	tcp_drop(sk, skb);
				5839	}
				5840	EXPORT_SYMBOL(tcp_rcv_established);
				5841
					/* Prepare @sk for data transfer: MTU probing state, rebuilt header,
					* metrics, initial congestion window, congestion control and buffer
					* space.
					*
					* @bpf_op: operation code handed to tcp_call_bpf() once the initial
					* cwnd has been set.
					*/
				5842	void tcp_init_transfer(struct sock *sk, int bpf_op)
				5843	{
				5844	struct inet_connection_sock *icsk = inet_csk(sk);
				5845	struct tcp_sock *tp = tcp_sk(sk);
				5846
				5847	tcp_mtup_init(sk);
				5848	icsk->icsk_af_ops->rebuild_header(sk);
				5849	tcp_init_metrics(sk);
				5850
				5851	/* Initialize the congestion window to start the transfer.
				5852	* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
				5853	* retransmitted. In light of RFC6298 more aggressive 1sec
				5854	* initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
				5855	* retransmission has occurred.
				5856	*/
				5857	if (tp->total_retrans > 1 && tp->undo_marker)
				5858	tp->snd_cwnd = 1;
				5859	else
				5860	tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
				5861	tp->snd_cwnd_stamp = tcp_jiffies32;
				5862
					/* The BPF hook runs before congestion control is initialised. */
				5863	tcp_call_bpf(sk, bpf_op, 0, NULL);
				5864	tcp_init_congestion_control(sk);
				5865	tcp_init_buffer_space(sk);
				5866	}
				5867
					/* Complete an active open: move @sk to TCP_ESTABLISHED and set up
					* the state needed for data transfer.
					*
					* @sk: connecting socket.
					* @skb: segment that completed the handshake; may be NULL, in which
					* case the rx dst, security hook and NAPI id steps are skipped.
					*/
				5868	void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
				5869	{
				5870	struct tcp_sock *tp = tcp_sk(sk);
				5871	struct inet_connection_sock *icsk = inet_csk(sk);
				5872
				5873	tcp_set_state(sk, TCP_ESTABLISHED);
				5874	icsk->icsk_ack.lrcvtime = tcp_jiffies32;
				5875
				5876	if (skb) {
				5877	icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
				5878	security_inet_conn_established(sk, skb);
				5879	sk_mark_napi_id(sk, skb);
				5880	}
				5881
				5882	tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);
				5883
				5884	/* Prevent spurious tcp_cwnd_restart() on first data
				5885	* packet.
				5886	*/
				5887	tp->lsndtime = tcp_jiffies32;
				5888
				5889	if (sock_flag(sk, SOCK_KEEPOPEN))
				5890	inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
				5891
					/* Header prediction is enabled only when the send window scale
					* is zero; otherwise pred_flags stays 0 (fast path off).
					*/
				5892	if (!tp->rx_opt.snd_wscale)
				5893	__tcp_fast_path_on(tp, tp->snd_wnd);
				5894	else
				5895	tp->pred_flags = 0;
				5896	}
				5897
					/* Process the Fast Open aspects of a received SYN-ACK.
					*
					* @sk: connecting socket.
					* @synack: the received SYN-ACK segment.
					* @cookie: Fast Open cookie parsed from the SYN-ACK; its len is set
					* to -1 here when the cookie was not solicited.
					*
					* Updates the Fast Open cache via tcp_fastopen_cache_set(). Returns
					* true when data sent in the SYN is still unacked and retransmission
					* was started, false otherwise.
					*/
				5898	static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
				5899	struct tcp_fastopen_cookie *cookie)
				5900	{
				5901	struct tcp_sock *tp = tcp_sk(sk);
					/* Non-NULL only when SYN data was sent and an skb is still at the
					* head of the rtx queue.
					*/
				5902	struct sk_buff *data = tp->syn_data ? tcp_rtx_queue_head(sk) : NULL;
				5903	u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
				5904	bool syn_drop = false;
				5905
				5906	if (mss == tp->rx_opt.user_mss) {
				5907	struct tcp_options_received opt;
				5908
				5909	/* Get original SYNACK MSS value if user MSS sets mss_clamp */
				5910	tcp_clear_options(&opt);
				5911	opt.user_mss = opt.mss_clamp = 0;
				5912	tcp_parse_options(sock_net(sk), synack, &opt, 0, NULL);
				5913	mss = opt.mss_clamp;
				5914	}
				5915
				5916	if (!tp->syn_fastopen) {
				5917	/* Ignore an unsolicited cookie */
				5918	cookie->len = -1;
				5919	} else if (tp->total_retrans) {
				5920	/* SYN timed out and the SYN-ACK neither has a cookie nor
				5921	* acknowledges data. Presumably the remote received only
				5922	* the retransmitted (regular) SYNs: either the original
				5923	* SYN-data or the corresponding SYN-ACK was dropped.
				5924	*/
				5925	syn_drop = (cookie->len < 0 && data);
				5926	} else if (cookie->len < 0 && !tp->syn_data) {
				5927	/* We requested a cookie but didn't get it. If we did not use
				5928	* the (old) exp opt format then try so next time (try_exp=1).
				5929	* Otherwise we go back to use the RFC7413 opt (try_exp=2).
				5930	*/
				5931	try_exp = tp->syn_fastopen_exp ? 2 : 1;
				5932	}
				5933
				5934	tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
				5935
				5936	if (data) { /* Retransmit unacked data in SYN */
				5937	if (tp->total_retrans)
				5938	tp->fastopen_client_fail = TFO_SYN_RETRANSMITTED;
				5939	else
				5940	tp->fastopen_client_fail = TFO_DATA_NOT_ACKED;
					/* Stop at the first skb that fails to retransmit. */
				5941	skb_rbtree_walk_from(data) {
				5942	if (__tcp_retransmit_skb(sk, data, 1))
				5943	break;
				5944	}
				5945	tcp_rearm_rto(sk);
				5946	NET_INC_STATS(sock_net(sk),
				5947	LINUX_MIB_TCPFASTOPENACTIVEFAIL);
				5948	return true;
				5949	}
				5950	tp->syn_data_acked = tp->syn_data;
				5951	if (tp->syn_data_acked) {
				5952	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
				5953	/* SYN-data is counted as two separate packets in tcp_ack() */
				5954	if (tp->delivered > 1)
				5955	--tp->delivered;
				5956	}
				5957
				5958	tcp_fastopen_add_skb(sk, synack);
				5959
				5960	return false;
				5961	}
				5962
					/* Clear tp->syn_smc when it is set but rx_opt.smc_ok is not.
					* The check runs only with CONFIG_SMC enabled and the tcp_have_smc
					* static branch on; otherwise the function body is empty.
					*/
				5963	static void smc_check_reset_syn(struct tcp_sock *tp)
				5964	{
				5965	#if IS_ENABLED(CONFIG_SMC)
				5966	if (static_branch_unlikely(&tcp_have_smc)) {
				5967	if (tp->syn_smc && !tp->rx_opt.smc_ok)
				5968	tp->syn_smc = 0;
				5969	}
				5970	#endif
				5971	}
				5972
					/* Clear tp->undo_marker when the timestamp echoed by the peer shows
					* that a SYN/SYNACK timeout was spurious. Nothing else is modified.
					*/
				5973	static void tcp_try_undo_spurious_syn(struct sock *sk)
				5974	{
				5975	struct tcp_sock *tp = tcp_sk(sk);
				5976	u32 syn_stamp;
				5977
				5978	/* undo_marker is set when SYN or SYNACK times out. The timeout is
				5979	* spurious if the ACK's timestamp option echo value matches the
				5980	* original SYN timestamp.
				5981	*/
					/* retrans_stamp is the value compared against the echoed timestamp. */
				5982	syn_stamp = tp->retrans_stamp;
				5983	if (tp->undo_marker && syn_stamp && tp->rx_opt.saw_tstamp &&
				5984	syn_stamp == tp->rx_opt.rcv_tsecr)
				5985	tp->undo_marker = 0;
				5986	}
				5987
				5988	static int tcp_rcv_synsent_state_process(struct sock sk, struct sk_buff skb,
				5989	const struct tcphdr *th)
				5990	{
				5991	struct inet_connection_sock *icsk = inet_csk(sk);
				5992	struct tcp_sock *tp = tcp_sk(sk);
				5993	struct tcp_fastopen_cookie foc = { .len = -1 };
				5994	int saved_clamp = tp->rx_opt.mss_clamp;
				5995	bool fastopen_fail;
				5996
				5997	tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc);
				5998	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
				5999	tp->rx_opt.rcv_tsecr -= tp->tsoffset;
				6000
				6001	if (th->ack) {
				6002	/* rfc793:
				6003	* "If the state is SYN-SENT then
				6004	* first check the ACK bit
				6005	* If the ACK bit is set
				6006	* If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
				6007	* a reset (unless the RST bit is set, if so drop
				6008	* the segment and return)"
				6009	*/
				6010	if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) \|\|
				6011	after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt))
				6012	goto reset_and_undo;
				6013
				6014	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
				6015	!between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
				6016	tcp_time_stamp(tp))) {
				6017	NET_INC_STATS(sock_net(sk),
				6018	LINUX_MIB_PAWSACTIVEREJECTED);
				6019	goto reset_and_undo;
				6020	}
				6021
				6022	/* Now ACK is acceptable.
				6023	*
				6024	* "If the RST bit is set
				6025	* If the ACK was acceptable then signal the user "error:
				6026	* connection reset", drop the segment, enter CLOSED state,
				6027	* delete TCB, and return."
				6028	*/
				6029
				6030	if (th->rst) {
				6031	tcp_reset(sk);
				6032	goto discard;
				6033	}
				6034
				6035	/* rfc793:
				6036	* "fifth, if neither of the SYN or RST bits is set then
				6037	* drop the segment and return."
				6038	*
				6039	* See note below!
				6040	* --ANK(990513)
				6041	*/
				6042	if (!th->syn)
				6043	goto discard_and_undo;
				6044
				6045	/* rfc793:
				6046	* "If the SYN bit is on ...
				6047	* are acceptable then ...
				6048	* (our SYN has been ACKed), change the connection
				6049	* state to ESTABLISHED..."
				6050	*/
				6051
				6052	tcp_ecn_rcv_synack(tp, th);
				6053
				6054	tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
				6055	tcp_try_undo_spurious_syn(sk);
				6056	tcp_ack(sk, skb, FLAG_SLOWPATH);
				6057
				6058	/* Ok.. it's good. Set up sequence numbers and
				6059	* move to established.
				6060	*/
				6061	WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
				6062	tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
				6063
				6064	/* RFC1323: The window in SYN & SYN/ACK segments is
				6065	* never scaled.
				6066	*/
				6067	tp->snd_wnd = ntohs(th->window);
				6068
				6069	if (!tp->rx_opt.wscale_ok) {
				6070	tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
				6071	tp->window_clamp = min(tp->window_clamp, 65535U);
				6072	}
				6073
				6074	if (tp->rx_opt.saw_tstamp) {
				6075	tp->rx_opt.tstamp_ok = 1;
				6076	tp->tcp_header_len =
				6077	sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
				6078	tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
				6079	tcp_store_ts_recent(tp);
				6080	} else {
				6081	tp->tcp_header_len = sizeof(struct tcphdr);
				6082	}
				6083
				6084	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
				6085	tcp_initialize_rcv_mss(sk);
				6086
				6087	/* Remember, tcp_poll() does not lock socket!
				6088	* Change state from SYN-SENT only after copied_seq
				6089	* is initialized. */
				6090	WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
				6091
				6092	smc_check_reset_syn(tp);
				6093
				6094	smp_mb();
				6095
				6096	tcp_finish_connect(sk, skb);
				6097
				6098	fastopen_fail = (tp->syn_fastopen \|\| tp->syn_data) &&
				6099	tcp_rcv_fastopen_synack(sk, skb, &foc);
				6100
				6101	if (!sock_flag(sk, SOCK_DEAD)) {
				6102	sk->sk_state_change(sk);
				6103	sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
				6104	}
				6105	if (fastopen_fail)
				6106	return -1;
				6107	if (sk->sk_write_pending \|\|
				6108	icsk->icsk_accept_queue.rskq_defer_accept \|\|
				6109	inet_csk_in_pingpong_mode(sk)) {
				6110	/* Save one ACK. Data will be ready after
				6111	* several ticks, if write_pending is set.
				6112	*
				6113	* It may be deleted, but with this feature tcpdumps
				6114	* look so _wonderfully_ clever, that I was not able
				6115	* to stand against the temptation 8) --ANK
				6116	*/
				6117	inet_csk_schedule_ack(sk);
				6118	tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
				6119	inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
				6120	TCP_DELACK_MAX, TCP_RTO_MAX);
				6121
				6122	discard:
				6123	tcp_drop(sk, skb);
				6124	return 0;
				6125	} else {
				6126	tcp_send_ack(sk);
				6127	}
				6128	return -1;
				6129	}
				6130
				6131	/* No ACK in the segment */
				6132
				6133	if (th->rst) {
				6134	/* rfc793:
				6135	* "If the RST bit is set
				6136	*
				6137	* Otherwise (no ACK) drop the segment and return."
				6138	*/
				6139
				6140	goto discard_and_undo;
				6141	}
				6142
				6143	/* PAWS check. */
				6144	if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp &&
				6145	tcp_paws_reject(&tp->rx_opt, 0))
				6146	goto discard_and_undo;
				6147
				6148	if (th->syn) {
				6149	/* We see SYN without ACK. It is attempt of
				6150	* simultaneous connect with crossed SYNs.
				6151	* Particularly, it can be connect to self.
				6152	*/
				6153	tcp_set_state(sk, TCP_SYN_RECV);
				6154
				6155	if (tp->rx_opt.saw_tstamp) {
				6156	tp->rx_opt.tstamp_ok = 1;
				6157	tcp_store_ts_recent(tp);
				6158	tp->tcp_header_len =
				6159	sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
				6160	} else {
				6161	tp->tcp_header_len = sizeof(struct tcphdr);
				6162	}
				6163
				6164	WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
				6165	WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
				6166	tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
				6167
				6168	/* RFC1323: The window in SYN & SYN/ACK segments is
				6169	* never scaled.
				6170	*/
				6171	tp->snd_wnd = ntohs(th->window);
				6172	tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
				6173	tp->max_window = tp->snd_wnd;
				6174
				6175	tcp_ecn_rcv_syn(tp, th);
				6176
				6177	tcp_mtup_init(sk);
				6178	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
				6179	tcp_initialize_rcv_mss(sk);
				6180
				6181	tcp_send_synack(sk);
				6182	#if 0
				6183	/* Note, we could accept data and URG from this segment.
				6184	* There are no obstacles to make this (except that we must
				6185	* either change tcp_recvmsg() to prevent it from returning data
				6186	* before 3WHS completes per RFC793, or employ TCP Fast Open).
				6187	*
				6188	* However, if we ignore data in ACKless segments sometimes,
				6189	* we have no reasons to accept it sometimes.
				6190	* Also, seems the code doing it in step6 of tcp_rcv_state_process
				6191	* is not flawless. So, discard packet for sanity.
				6192	* Uncomment this return to process the data.
				6193	*/
				6194	return -1;
				6195	#else
				6196	goto discard;
				6197	#endif
				6198	}
				6199	/* "fifth, if neither of the SYN or RST bits is set then
				6200	* drop the segment and return."
				6201	*/
				6202
				6203	discard_and_undo:
				6204	tcp_clear_options(&tp->rx_opt);
				6205	tp->rx_opt.mss_clamp = saved_clamp;
				6206	goto discard;
				6207
				6208	reset_and_undo:
				6209	tcp_clear_options(&tp->rx_opt);
				6210	tp->rx_opt.mss_clamp = saved_clamp;
				6211	return 1;
				6212	}
				6213
				6214	static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
				6215	{
				6216	struct tcp_sock *tp = tcp_sk(sk);
				6217	struct request_sock *req;
				6218
				6219	/* If we are still handling the SYNACK RTO, see if timestamp ECR allows
				6220	* undo. If peer SACKs triggered fast recovery, we can't undo here.
				6221	*/
				6222	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss && !tp->packets_out)
				6223	tcp_try_undo_recovery(sk);
				6224
				6225	/* Reset rtx states to prevent spurious retransmits_timed_out() */
				6226	tp->retrans_stamp = 0;
				6227	inet_csk(sk)->icsk_retransmits = 0;
				6228
				6229	/* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1,
				6230	* we no longer need req so release it.
				6231	*/
				6232	req = rcu_dereference_protected(tp->fastopen_rsk,
				6233	lockdep_sock_is_held(sk));
				6234	reqsk_fastopen_remove(sk, req, false);
				6235
				6236	/* Re-arm the timer because data may have been sent out.
				6237	* This is similar to the regular data transmission case
				6238	* when new data has just been ack'ed.
				6239	*
				6240	* (TFO) - we could try to be more aggressive and
				6241	* retransmitting any data sooner based on when they
				6242	* are sent out.
				6243	*/
				6244	tcp_rearm_rto(sk);
				6245	}
				6246
				6247	/*
				6248	* This function implements the receiving procedure of RFC 793 for
				6249	* all states except ESTABLISHED and TIME_WAIT.
				6250	* It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
				6251	* address independent.
				6252	*/
				6253
				6254	int tcp_rcv_state_process(struct sock sk, struct sk_buff skb)
				6255	{
				6256	struct tcp_sock *tp = tcp_sk(sk);
				6257	struct inet_connection_sock *icsk = inet_csk(sk);
				6258	const struct tcphdr *th = tcp_hdr(skb);
				6259	struct request_sock *req;
				6260	int queued = 0;
				6261	bool acceptable;
				6262
				6263	switch (sk->sk_state) {
				6264	case TCP_CLOSE:
				6265	goto discard;
				6266
				6267	case TCP_LISTEN:
				6268	if (th->ack)
				6269	return 1;
				6270
				6271	if (th->rst)
				6272	goto discard;
				6273
				6274	if (th->syn) {
				6275	if (th->fin)
				6276	goto discard;
				6277	/* It is possible that we process SYN packets from backlog,
				6278	* so we need to make sure to disable BH and RCU right there.
				6279	*/
				6280	rcu_read_lock();
				6281	local_bh_disable();
				6282	acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0;
				6283	local_bh_enable();
				6284	rcu_read_unlock();
				6285
				6286	if (!acceptable)
				6287	return 1;
				6288	consume_skb(skb);
				6289	return 0;
				6290	}
				6291	goto discard;
				6292
				6293	case TCP_SYN_SENT:
				6294	tp->rx_opt.saw_tstamp = 0;
				6295	tcp_mstamp_refresh(tp);
				6296	queued = tcp_rcv_synsent_state_process(sk, skb, th);
				6297	if (queued >= 0)
				6298	return queued;
				6299
				6300	/* Do step6 onward by hand. */
				6301	tcp_urg(sk, skb, th);
				6302	__kfree_skb(skb);
				6303	tcp_data_snd_check(sk);
				6304	return 0;
				6305	}
				6306
				6307	tcp_mstamp_refresh(tp);
				6308	tp->rx_opt.saw_tstamp = 0;
				6309	req = rcu_dereference_protected(tp->fastopen_rsk,
				6310	lockdep_sock_is_held(sk));
				6311	if (req) {
				6312	bool req_stolen;
				6313
				6314	WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
				6315	sk->sk_state != TCP_FIN_WAIT1);
				6316
				6317	if (!tcp_check_req(sk, skb, req, true, &req_stolen))
				6318	goto discard;
				6319	}
				6320
				6321	if (!th->ack && !th->rst && !th->syn)
				6322	goto discard;
				6323
				6324	if (!tcp_validate_incoming(sk, skb, th, 0))
				6325	return 0;
				6326
				6327	/* step 5: check the ACK field */
				6328	acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH \|
				6329	FLAG_UPDATE_TS_RECENT \|
				6330	FLAG_NO_CHALLENGE_ACK) > 0;
				6331
				6332	if (!acceptable) {
				6333	if (sk->sk_state == TCP_SYN_RECV)
				6334	return 1; /* send one RST */
				6335	tcp_send_challenge_ack(sk, skb);
				6336	goto discard;
				6337	}
				6338	switch (sk->sk_state) {
				6339	case TCP_SYN_RECV:
				6340	tp->delivered++; /* SYN-ACK delivery isn't tracked in tcp_ack */
				6341	if (!tp->srtt_us)
				6342	tcp_synack_rtt_meas(sk, req);
				6343
				6344	if (req) {
				6345	tcp_rcv_synrecv_state_fastopen(sk);
				6346	} else {
				6347	tcp_try_undo_spurious_syn(sk);
				6348	tp->retrans_stamp = 0;
				6349	tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
				6350	WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
				6351	}
				6352	smp_mb();
				6353	tcp_set_state(sk, TCP_ESTABLISHED);
				6354	sk->sk_state_change(sk);
				6355
				6356	/* Note, that this wakeup is only for marginal crossed SYN case.
				6357	* Passively open sockets are not waked up, because
				6358	* sk->sk_sleep == NULL and sk->sk_socket == NULL.
				6359	*/
				6360	if (sk->sk_socket)
				6361	sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
				6362
				6363	tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
				6364	tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
				6365	tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
				6366
				6367	if (tp->rx_opt.tstamp_ok)
				6368	tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
				6369
				6370	if (!inet_csk(sk)->icsk_ca_ops->cong_control)
				6371	tcp_update_pacing_rate(sk);
				6372
				6373	/* Prevent spurious tcp_cwnd_restart() on first data packet */
				6374	tp->lsndtime = tcp_jiffies32;
				6375
				6376	tcp_initialize_rcv_mss(sk);
				6377	tcp_fast_path_on(tp);
				6378	if (sk->sk_shutdown & SEND_SHUTDOWN)
				6379	tcp_shutdown(sk, SEND_SHUTDOWN);
				6380	break;
				6381
				6382	case TCP_FIN_WAIT1: {
				6383	int tmo;
				6384
				6385	if (req)
				6386	tcp_rcv_synrecv_state_fastopen(sk);
				6387
				6388	if (tp->snd_una != tp->write_seq)
				6389	break;
				6390
				6391	tcp_set_state(sk, TCP_FIN_WAIT2);
				6392	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown \| SEND_SHUTDOWN);
				6393
				6394	sk_dst_confirm(sk);
				6395
				6396	if (!sock_flag(sk, SOCK_DEAD)) {
				6397	/* Wake up lingering close() */
				6398	sk->sk_state_change(sk);
				6399	break;
				6400	}
				6401
				6402	if (tp->linger2 < 0) {
				6403	tcp_done(sk);
				6404	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
				6405	return 1;
				6406	}
				6407	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
				6408	after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
				6409	/* Receive out of order FIN after close() */
				6410	if (tp->syn_fastopen && th->fin)
				6411	tcp_fastopen_active_disable(sk);
				6412	tcp_done(sk);
				6413	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
				6414	return 1;
				6415	}
				6416
				6417	tmo = tcp_fin_time(sk);
				6418	if (tmo > TCP_TIMEWAIT_LEN) {
				6419	inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
				6420	} else if (th->fin \|\| sock_owned_by_user(sk)) {
				6421	/* Bad case. We could lose such FIN otherwise.
				6422	* It is not a big problem, but it looks confusing
				6423	* and not so rare event. We still can lose it now,
				6424	* if it spins in bh_lock_sock(), but it is really
				6425	* marginal case.
				6426	*/
				6427	inet_csk_reset_keepalive_timer(sk, tmo);
				6428	} else {
				6429	tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
				6430	goto discard;
				6431	}
				6432	break;
				6433	}
				6434
				6435	case TCP_CLOSING:
				6436	if (tp->snd_una == tp->write_seq) {
				6437	tcp_time_wait(sk, TCP_TIME_WAIT, 0);
				6438	goto discard;
				6439	}
				6440	break;
				6441
				6442	case TCP_LAST_ACK:
				6443	if (tp->snd_una == tp->write_seq) {
				6444	tcp_update_metrics(sk);
				6445	tcp_done(sk);
				6446	goto discard;
				6447	}
				6448	break;
				6449	}
				6450
				6451	/* step 6: check the URG bit */
				6452	tcp_urg(sk, skb, th);
				6453
				6454	/* step 7: process the segment text */
				6455	switch (sk->sk_state) {
				6456	case TCP_CLOSE_WAIT:
				6457	case TCP_CLOSING:
				6458	case TCP_LAST_ACK:
				6459	if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
				6460	break;
				6461	/* fall through */
				6462	case TCP_FIN_WAIT1:
				6463	case TCP_FIN_WAIT2:
				6464	/* RFC 793 says to queue data in these states,
				6465	* RFC 1122 says we MUST send a reset.
				6466	* BSD 4.4 also does reset.
				6467	*/
				6468	if (sk->sk_shutdown & RCV_SHUTDOWN) {
				6469	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
				6470	after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
				6471	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
				6472	tcp_reset(sk);
				6473	return 1;
				6474	}
				6475	}
				6476	/* Fall through */
				6477	case TCP_ESTABLISHED:
				6478	tcp_data_queue(sk, skb);
				6479	queued = 1;
				6480	break;
				6481	}
				6482
				6483	/* tcp_data could move socket to TIME-WAIT */
				6484	if (sk->sk_state != TCP_CLOSE) {
				6485	tcp_data_snd_check(sk);
				6486	tcp_ack_snd_check(sk);
				6487	}
				6488
				6489	if (!queued) {
				6490	discard:
				6491	tcp_drop(sk, skb);
				6492	}
				6493	return 0;
				6494	}
				6495	EXPORT_SYMBOL(tcp_rcv_state_process);
				6496
				6497	static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
				6498	{
				6499	struct inet_request_sock *ireq = inet_rsk(req);
				6500
				6501	if (family == AF_INET)
				6502	net_dbg_ratelimited("drop open request from %pI4/%u\n",
				6503	&ireq->ir_rmt_addr, port);
				6504	#if IS_ENABLED(CONFIG_IPV6)
				6505	else if (family == AF_INET6)
				6506	net_dbg_ratelimited("drop open request from %pI6/%u\n",
				6507	&ireq->ir_v6_rmt_addr, port);
				6508	#endif
				6509	}
				6510
				6511	/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
				6512	*
				6513	* If we receive a SYN packet with these bits set, it means a
				6514	* network is playing bad games with TOS bits. In order to
				6515	* avoid possible false congestion notifications, we disable
				6516	* TCP ECN negotiation.
				6517	*
				6518	* Exception: tcp_ca wants ECN. This is required for DCTCP
				6519	* congestion control: Linux DCTCP asserts ECT on all packets,
				6520	* including SYN, which is most optimal solution; however,
				6521	* others, such as FreeBSD do not.
				6522	*
				6523	* Exception: At least one of the reserved bits of the TCP header (th->res1) is
				6524	* set, indicating the use of a future TCP extension (such as AccECN). See
				6525	* RFC8311 §4.3 which updates RFC3168 to allow the development of such
				6526	* extensions.
				6527	*/
				6528	static void tcp_ecn_create_request(struct request_sock *req,
				6529	const struct sk_buff *skb,
				6530	const struct sock *listen_sk,
				6531	const struct dst_entry *dst)
				6532	{
				6533	const struct tcphdr *th = tcp_hdr(skb);
				6534	const struct net *net = sock_net(listen_sk);
				6535	bool th_ecn = th->ece && th->cwr;
				6536	bool ect, ecn_ok;
				6537	u32 ecn_ok_dst;
				6538
				6539	if (!th_ecn)
				6540	return;
				6541
				6542	ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
				6543	ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK);
				6544	ecn_ok = net->ipv4.sysctl_tcp_ecn \|\| ecn_ok_dst;
				6545
				6546	if (((!ect \|\| th->res1) && ecn_ok) \|\| tcp_ca_needs_ecn(listen_sk) \|\|
				6547	(ecn_ok_dst & DST_FEATURE_ECN_CA) \|\|
				6548	tcp_bpf_ca_needs_ecn((struct sock *)req))
				6549	inet_rsk(req)->ecn_ok = 1;
				6550	}
				6551
				6552	static void tcp_openreq_init(struct request_sock *req,
				6553	const struct tcp_options_received *rx_opt,
				6554	struct sk_buff skb, const struct sock sk)
				6555	{
				6556	struct inet_request_sock *ireq = inet_rsk(req);
				6557
				6558	req->rsk_rcv_wnd = 0; /* So that tcp_send_synack() knows! */
				6559	req->cookie_ts = 0;
				6560	tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
				6561	tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
				6562	tcp_rsk(req)->snt_synack = 0;
				6563	tcp_rsk(req)->last_oow_ack_time = 0;
				6564	req->mss = rx_opt->mss_clamp;
				6565	req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
				6566	ireq->tstamp_ok = rx_opt->tstamp_ok;
				6567	ireq->sack_ok = rx_opt->sack_ok;
				6568	ireq->snd_wscale = rx_opt->snd_wscale;
				6569	ireq->wscale_ok = rx_opt->wscale_ok;
				6570	ireq->acked = 0;
				6571	ireq->ecn_ok = 0;
				6572	ireq->ir_rmt_port = tcp_hdr(skb)->source;
				6573	ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
				6574	ireq->ir_mark = inet_request_mark(sk, skb);
				6575	#if IS_ENABLED(CONFIG_SMC)
				6576	ireq->smc_ok = rx_opt->smc_ok;
				6577	#endif
				6578	}
				6579
				6580	struct request_sock inet_reqsk_alloc(const struct request_sock_ops ops,
				6581	struct sock *sk_listener,
				6582	bool attach_listener)
				6583	{
				6584	struct request_sock *req = reqsk_alloc(ops, sk_listener,
				6585	attach_listener);
				6586
				6587	if (req) {
				6588	struct inet_request_sock *ireq = inet_rsk(req);
				6589
				6590	ireq->ireq_opt = NULL;
				6591	#if IS_ENABLED(CONFIG_IPV6)
				6592	ireq->pktopts = NULL;
				6593	#endif
				6594	atomic64_set(&ireq->ir_cookie, 0);
				6595	ireq->ireq_state = TCP_NEW_SYN_RECV;
				6596	write_pnet(&ireq->ireq_net, sock_net(sk_listener));
				6597	ireq->ireq_family = sk_listener->sk_family;
				6598	}
				6599
				6600	return req;
				6601	}
				6602	EXPORT_SYMBOL(inet_reqsk_alloc);
				6603
				6604	/*
				6605	* Return true if a syncookie should be sent
				6606	*/
				6607	static bool tcp_syn_flood_action(const struct sock sk, const char proto)
				6608	{
				6609	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
				6610	const char *msg = "Dropping request";
				6611	struct net *net = sock_net(sk);
				6612	bool want_cookie = false;
				6613	u8 syncookies;
				6614
				6615	syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies);
				6616
				6617	#ifdef CONFIG_SYN_COOKIES
				6618	if (syncookies) {
				6619	msg = "Sending cookies";
				6620	want_cookie = true;
				6621	__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
				6622	} else
				6623	#endif
				6624	__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
				6625
				6626	if (!queue->synflood_warned && syncookies != 2 &&
				6627	xchg(&queue->synflood_warned, 1) == 0)
				6628	net_info_ratelimited("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
				6629	proto, sk->sk_num, msg);
				6630
				6631	return want_cookie;
				6632	}
				6633
				6634	static void tcp_reqsk_record_syn(const struct sock *sk,
				6635	struct request_sock *req,
				6636	const struct sk_buff *skb)
				6637	{
				6638	if (tcp_sk(sk)->save_syn) {
				6639	u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb);
				6640	u32 *copy;
				6641
				6642	copy = kmalloc(len + sizeof(u32), GFP_ATOMIC);
				6643	if (copy) {
				6644	copy[0] = len;
				6645	memcpy(&copy[1], skb_network_header(skb), len);
				6646	req->saved_syn = copy;
				6647	}
				6648	}
				6649	}
				6650
				6651	/* If a SYN cookie is required and supported, returns a clamped MSS value to be
				6652	* used for SYN cookie generation.
				6653	*/
				6654	u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops,
				6655	const struct tcp_request_sock_ops *af_ops,
				6656	struct sock sk, struct tcphdr th)
				6657	{
				6658	struct tcp_sock *tp = tcp_sk(sk);
				6659	u16 mss;
				6660
				6661	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies) != 2 &&
				6662	!inet_csk_reqsk_queue_is_full(sk))
				6663	return 0;
				6664
				6665	if (!tcp_syn_flood_action(sk, rsk_ops->slab_name))
				6666	return 0;
				6667
				6668	if (sk_acceptq_is_full(sk)) {
				6669	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
				6670	return 0;
				6671	}
				6672
				6673	mss = tcp_parse_mss_option(th, tp->rx_opt.user_mss);
				6674	if (!mss)
				6675	mss = af_ops->mss_clamp;
				6676
				6677	return mss;
				6678	}
				6679	EXPORT_SYMBOL_GPL(tcp_get_syncookie_mss);
				6680
				6681	int tcp_conn_request(struct request_sock_ops *rsk_ops,
				6682	const struct tcp_request_sock_ops *af_ops,
				6683	struct sock sk, struct sk_buff skb)
				6684	{
				6685	struct tcp_fastopen_cookie foc = { .len = -1 };
				6686	__u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
				6687	struct tcp_options_received tmp_opt;
				6688	struct tcp_sock *tp = tcp_sk(sk);
				6689	struct net *net = sock_net(sk);
				6690	struct sock *fastopen_sk = NULL;
				6691	struct request_sock *req;
				6692	bool want_cookie = false;
				6693	struct dst_entry *dst;
				6694	struct flowi fl;
				6695	u8 syncookies;
				6696
				6697	syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies);
				6698
				6699	/* TW buckets are converted to open requests without
				6700	* limitations, they conserve resources and peer is
				6701	* evidently real one.
				6702	*/
				6703	if ((syncookies == 2 \|\| inet_csk_reqsk_queue_is_full(sk)) && !isn) {
				6704	want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name);
				6705	if (!want_cookie)
				6706	goto drop;
				6707	}
				6708
				6709	if (sk_acceptq_is_full(sk)) {
				6710	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
				6711	goto drop;
				6712	}
				6713
				6714	req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie);
				6715	if (!req)
				6716	goto drop;
				6717
				6718	tcp_rsk(req)->af_specific = af_ops;
				6719	tcp_rsk(req)->ts_off = 0;
				6720
				6721	tcp_clear_options(&tmp_opt);
				6722	tmp_opt.mss_clamp = af_ops->mss_clamp;
				6723	tmp_opt.user_mss = tp->rx_opt.user_mss;
				6724	tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0,
				6725	want_cookie ? NULL : &foc);
				6726
				6727	if (want_cookie && !tmp_opt.saw_tstamp)
				6728	tcp_clear_options(&tmp_opt);
				6729
				6730	if (IS_ENABLED(CONFIG_SMC) && want_cookie)
				6731	tmp_opt.smc_ok = 0;
				6732
				6733	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
				6734	tcp_openreq_init(req, &tmp_opt, skb, sk);
				6735	inet_rsk(req)->no_srccheck = inet_sk(sk)->transparent;
				6736
				6737	/* Note: tcp_v6_init_req() might override ir_iif for link locals */
				6738	inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);
				6739
				6740	af_ops->init_req(req, sk, skb);
				6741
				6742	if (security_inet_conn_request(sk, skb, req))
				6743	goto drop_and_free;
				6744
				6745	if (tmp_opt.tstamp_ok)
				6746	tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb);
				6747
				6748	dst = af_ops->route_req(sk, &fl, req);
				6749	if (!dst)
				6750	goto drop_and_free;
				6751
				6752	if (!want_cookie && !isn) {
				6753	int max_syn_backlog = READ_ONCE(net->ipv4.sysctl_max_syn_backlog);
				6754
				6755	/* Kill the following clause, if you dislike this way. */
				6756	if (!syncookies &&
				6757	(max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
				6758	(max_syn_backlog >> 2)) &&
				6759	!tcp_peer_is_proven(req, dst)) {
				6760	/* Without syncookies last quarter of
				6761	* backlog is filled with destinations,
				6762	* proven to be alive.
				6763	* It means that we continue to communicate
				6764	* to destinations, already remembered
				6765	* to the moment of synflood.
				6766	*/
				6767	pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
				6768	rsk_ops->family);
				6769	goto drop_and_release;
				6770	}
				6771
				6772	isn = af_ops->init_seq(skb);
				6773	}
				6774
				6775	tcp_ecn_create_request(req, skb, sk, dst);
				6776
				6777	if (want_cookie) {
				6778	isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
				6779	req->cookie_ts = tmp_opt.tstamp_ok;
				6780	if (!tmp_opt.tstamp_ok)
				6781	inet_rsk(req)->ecn_ok = 0;
				6782	}
				6783
				6784	tcp_rsk(req)->snt_isn = isn;
				6785	tcp_rsk(req)->txhash = net_tx_rndhash();
				6786	tcp_openreq_init_rwin(req, sk, dst);
				6787	sk_rx_queue_set(req_to_sk(req), skb);
				6788	if (!want_cookie) {
				6789	tcp_reqsk_record_syn(sk, req, skb);
				6790	fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
				6791	}
				6792	if (fastopen_sk) {
				6793	af_ops->send_synack(fastopen_sk, dst, &fl, req,
				6794	&foc, TCP_SYNACK_FASTOPEN);
				6795	/* Add the child socket directly into the accept queue */
				6796	if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) {
				6797	reqsk_fastopen_remove(fastopen_sk, req, false);
				6798	bh_unlock_sock(fastopen_sk);
				6799	sock_put(fastopen_sk);
				6800	goto drop_and_free;
				6801	}
				6802	sk->sk_data_ready(sk);
				6803	bh_unlock_sock(fastopen_sk);
				6804	sock_put(fastopen_sk);
				6805	} else {
				6806	tcp_rsk(req)->tfo_listener = false;
				6807	if (!want_cookie)
				6808	inet_csk_reqsk_queue_hash_add(sk, req,
				6809	tcp_timeout_init((struct sock *)req));
				6810	af_ops->send_synack(sk, dst, &fl, req, &foc,
				6811	!want_cookie ? TCP_SYNACK_NORMAL :
				6812	TCP_SYNACK_COOKIE);
				6813	if (want_cookie) {
				6814	reqsk_free(req);
				6815	return 0;
				6816	}
				6817	}
				6818	reqsk_put(req);
				6819	return 0;
				6820
				6821	drop_and_release:
				6822	dst_release(dst);
				6823	drop_and_free:
				6824	__reqsk_free(req);
				6825	drop:
				6826	tcp_listendrop(sk);
				6827	return 0;
				6828	}
				6829	EXPORT_SYMBOL(tcp_conn_request);