Blame - src/kernel/linux/v4.19/net/ipv4/tcp_input.c - T800

blob: 5a8c6db42548b3f4a36c8d3eefb75eae5ded2f70 [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* INET An implementation of the TCP/IP protocol suite for the LINUX
				4	* operating system. INET is implemented using the BSD Socket
				5	* interface as the means of communication with the user level.
				6	*
				7	* Implementation of the Transmission Control Protocol(TCP).
				8	*
				9	* Authors: Ross Biro
				10	* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
				11	* Mark Evans, <evansmp@uhura.aston.ac.uk>
				12	* Corey Minyard <wf-rch!minyard@relay.EU.net>
				13	* Florian La Roche, <flla@stud.uni-sb.de>
				14	* Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
				15	* Linus Torvalds, <torvalds@cs.helsinki.fi>
				16	* Alan Cox, <gw4pts@gw4pts.ampr.org>
				17	* Matthew Dillon, <dillon@apollo.west.oic.com>
				18	* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
				19	* Jorge Cwik, <jorge@laser.satlink.net>
				20	*/
				21
				22	/*
				23	* Changes:
				24	* Pedro Roque : Fast Retransmit/Recovery.
				25	* Two receive queues.
				26	* Retransmit queue handled by TCP.
				27	* Better retransmit timer handling.
				28	* New congestion avoidance.
				29	* Header prediction.
				30	* Variable renaming.
				31	*
				32	* Eric : Fast Retransmit.
				33	* Randy Scott : MSS option defines.
				34	* Eric Schenk : Fixes to slow start algorithm.
				35	* Eric Schenk : Yet another double ACK bug.
				36	* Eric Schenk : Delayed ACK bug fixes.
				37	* Eric Schenk : Floyd style fast retrans war avoidance.
				38	* David S. Miller : Don't allow zero congestion window.
				39	* Eric Schenk : Fix retransmitter so that it sends
				40	* next packet on ack of previous packet.
				41	* Andi Kleen : Moved open_request checking here
				42	* and process RSTs for open_requests.
				43	* Andi Kleen : Better prune_queue, and other fixes.
				44	* Andrey Savochkin: Fix RTT measurements in the presence of
				45	* timestamps.
				46	* Andrey Savochkin: Check sequence numbers correctly when
				47	* removing SACKs due to in sequence incoming
				48	* data segments.
				49	* Andi Kleen: Make sure we never ack data there is not
				50	* enough room for. Also make this condition
				51	* a fatal error if it might still happen.
				52	* Andi Kleen: Add tcp_measure_rcv_mss to make
				53	* connections with MSS<min(MTU,ann. MSS)
				54	* work without delayed acks.
				55	* Andi Kleen: Process packets with PSH set in the
				56	* fast path.
				57	* J Hadi Salim: ECN support
				58	* Andrei Gurtov,
				59	* Pasi Sarolahti,
				60	* Panu Kuhlberg: Experimental audit of TCP (re)transmission
				61	* engine. Lots of bugs are found.
				62	* Pasi Sarolahti: F-RTO for dealing with spurious RTOs
				63	*/
				64
				65	#define pr_fmt(fmt) "TCP: " fmt
				66
				67	#include <linux/mm.h>
				68	#include <linux/slab.h>
				69	#include <linux/module.h>
				70	#include <linux/sysctl.h>
				71	#include <linux/kernel.h>
				72	#include <linux/prefetch.h>
				73	#include <net/dst.h>
				74	#include <net/tcp.h>
				75	#include <net/inet_common.h>
				76	#include <linux/ipsec.h>
				77	#include <asm/unaligned.h>
				78	#include <linux/errqueue.h>
				79	#include <trace/events/tcp.h>
				80	#include <linux/static_key.h>
				81	#include <net/busy_poll.h>
				82	#include <net/ra_nat.h>
				83
				84	int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
				85
				86	#define FLAG_DATA 0x01 /* Incoming frame contained data. */
				87	#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
				88	#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
				89	#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
				90	#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
				91	#define FLAG_DATA_SACKED 0x20 /* New SACK. */
				92	#define FLAG_ECE 0x40 /* ECE in this ACK */
				93	#define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */
				94	#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
				95	#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
				96	#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
				97	#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
				98	#define FLAG_SET_XMIT_TIMER 0x1000 /* Set TLP or RTO timer */
				99	#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
				100	#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
				101	#define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */
				102	#define FLAG_ACK_MAYBE_DELAYED 0x10000 /* Likely a delayed ACK */
				103
				104	#define FLAG_ACKED (FLAG_DATA_ACKED\|FLAG_SYN_ACKED)
				105	#define FLAG_NOT_DUP (FLAG_DATA\|FLAG_WIN_UPDATE\|FLAG_ACKED)
				106	#define FLAG_CA_ALERT (FLAG_DATA_SACKED\|FLAG_ECE\|FLAG_DSACKING_ACK)
				107	#define FLAG_FORWARD_PROGRESS (FLAG_ACKED\|FLAG_DATA_SACKED)
				108
				109	#define TCP_REMNANT (TCP_FLAG_FIN\|TCP_FLAG_URG\|TCP_FLAG_SYN\|TCP_FLAG_PSH)
				110	#define TCP_HP_BITS (~(TCP_RESERVED_BITS\|TCP_FLAG_PSH))
				111
				112	#define REXMIT_NONE 0 /* no loss recovery to do */
				113	#define REXMIT_LOST 1 /* retransmit packets marked lost */
				114	#define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */
				115
				116	#if IS_ENABLED(CONFIG_TLS_DEVICE)
				117	static DEFINE_STATIC_KEY_FALSE(clean_acked_data_enabled);
				118
				119	void clean_acked_data_enable(struct inet_connection_sock *icsk,
				120	void (cad)(struct sock sk, u32 ack_seq))
				121	{
				122	icsk->icsk_clean_acked = cad;
				123	static_branch_inc(&clean_acked_data_enabled);
				124	}
				125	EXPORT_SYMBOL_GPL(clean_acked_data_enable);
				126
				127	void clean_acked_data_disable(struct inet_connection_sock *icsk)
				128	{
				129	static_branch_dec(&clean_acked_data_enabled);
				130	icsk->icsk_clean_acked = NULL;
				131	}
				132	EXPORT_SYMBOL_GPL(clean_acked_data_disable);
				133	#endif
				134
				135	static void tcp_gro_dev_warn(struct sock sk, const struct sk_buff skb,
				136	unsigned int len)
				137	{
				138	static bool __once __read_mostly;
				139
				140	if (!__once) {
				141	struct net_device *dev;
				142
				143	__once = true;
				144
				145	rcu_read_lock();
				146	dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif);
				147	if (!dev \|\| len >= dev->mtu)
				148	pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
				149	dev ? dev->name : "Unknown driver");
				150	rcu_read_unlock();
				151	}
				152	}
				153
				154	/* Adapt the MSS value used to make delayed ack decision to the
				155	* real world.
				156	*/
				157	static void tcp_measure_rcv_mss(struct sock sk, const struct sk_buff skb)
				158	{
				159	struct inet_connection_sock *icsk = inet_csk(sk);
				160	const unsigned int lss = icsk->icsk_ack.last_seg_size;
				161	unsigned int len;
				162
				163	icsk->icsk_ack.last_seg_size = 0;
				164
				165	/* skb->len may jitter because of SACKs, even if peer
				166	* sends good full-sized frames.
				167	*/
				168	len = skb_shinfo(skb)->gso_size ? : skb->len;
				169	if (len >= icsk->icsk_ack.rcv_mss) {
				170	icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
				171	tcp_sk(sk)->advmss);
				172	/* Account for possibly-removed options */
				173	if (unlikely(len > icsk->icsk_ack.rcv_mss +
				174	MAX_TCP_OPTION_SPACE))
				175	tcp_gro_dev_warn(sk, skb, len);
				176	} else {
				177	/* Otherwise, we make more careful check taking into account,
				178	* that SACKs block is variable.
				179	*
				180	* "len" is invariant segment length, including TCP header.
				181	*/
				182	len += skb->data - skb_transport_header(skb);
				183	if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) \|\|
				184	/* If PSH is not set, packet should be
				185	* full sized, provided peer TCP is not badly broken.
				186	* This observation (if it is correct 8)) allows
				187	* to handle super-low mtu links fairly.
				188	*/
				189	(len >= TCP_MIN_MSS + sizeof(struct tcphdr) &&
				190	!(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) {
				191	/* Subtract also invariant (if peer is RFC compliant),
				192	* tcp header plus fixed timestamp option length.
				193	* Resulting "len" is MSS free of SACK jitter.
				194	*/
				195	len -= tcp_sk(sk)->tcp_header_len;
				196	icsk->icsk_ack.last_seg_size = len;
				197	if (len == lss) {
				198	icsk->icsk_ack.rcv_mss = len;
				199	return;
				200	}
				201	}
				202	if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)
				203	icsk->icsk_ack.pending \|= ICSK_ACK_PUSHED2;
				204	icsk->icsk_ack.pending \|= ICSK_ACK_PUSHED;
				205	}
				206	}
				207
				208	static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks)
				209	{
				210	struct inet_connection_sock *icsk = inet_csk(sk);
				211	unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
				212
				213	if (quickacks == 0)
				214	quickacks = 2;
				215	quickacks = min(quickacks, max_quickacks);
				216	if (quickacks > icsk->icsk_ack.quick)
				217	icsk->icsk_ack.quick = quickacks;
				218	}
				219
				220	void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
				221	{
				222	struct inet_connection_sock *icsk = inet_csk(sk);
				223
				224	tcp_incr_quickack(sk, max_quickacks);
				225	icsk->icsk_ack.pingpong = 0;
				226	icsk->icsk_ack.ato = TCP_ATO_MIN;
				227	}
				228	EXPORT_SYMBOL(tcp_enter_quickack_mode);
				229
				230	/* Send ACKs quickly, if "quick" count is not exhausted
				231	* and the session is not interactive.
				232	*/
				233
				234	static bool tcp_in_quickack_mode(struct sock *sk)
				235	{
				236	const struct inet_connection_sock *icsk = inet_csk(sk);
				237	const struct dst_entry *dst = __sk_dst_get(sk);
				238
				239	return (dst && dst_metric(dst, RTAX_QUICKACK)) \|\|
				240	(icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong);
				241	}
				242
				243	static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
				244	{
				245	if (tp->ecn_flags & TCP_ECN_OK)
				246	tp->ecn_flags \|= TCP_ECN_QUEUE_CWR;
				247	}
				248
				249	static void tcp_ecn_accept_cwr(struct sock sk, const struct sk_buff skb)
				250	{
				251	if (tcp_hdr(skb)->cwr) {
				252	tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
				253
				254	/* If the sender is telling us it has entered CWR, then its
				255	* cwnd may be very low (even just 1 packet), so we should ACK
				256	* immediately.
				257	*/
				258	inet_csk(sk)->icsk_ack.pending \|= ICSK_ACK_NOW;
				259	}
				260	}
				261
				262	static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
				263	{
				264	tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
				265	}
				266
				267	static void __tcp_ecn_check_ce(struct sock sk, const struct sk_buff skb)
				268	{
				269	struct tcp_sock *tp = tcp_sk(sk);
				270
				271	switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
				272	case INET_ECN_NOT_ECT:
				273	/* Funny extension: if ECT is not set on a segment,
				274	* and we already seen ECT on a previous segment,
				275	* it is probably a retransmit.
				276	*/
				277	if (tp->ecn_flags & TCP_ECN_SEEN)
				278	tcp_enter_quickack_mode(sk, 2);
				279	break;
				280	case INET_ECN_CE:
				281	if (tcp_ca_needs_ecn(sk))
				282	tcp_ca_event(sk, CA_EVENT_ECN_IS_CE);
				283
				284	if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
				285	/* Better not delay acks, sender can have a very low cwnd */
				286	tcp_enter_quickack_mode(sk, 2);
				287	tp->ecn_flags \|= TCP_ECN_DEMAND_CWR;
				288	}
				289	tp->ecn_flags \|= TCP_ECN_SEEN;
				290	break;
				291	default:
				292	if (tcp_ca_needs_ecn(sk))
				293	tcp_ca_event(sk, CA_EVENT_ECN_NO_CE);
				294	tp->ecn_flags \|= TCP_ECN_SEEN;
				295	break;
				296	}
				297	}
				298
				299	static void tcp_ecn_check_ce(struct sock sk, const struct sk_buff skb)
				300	{
				301	if (tcp_sk(sk)->ecn_flags & TCP_ECN_OK)
				302	__tcp_ecn_check_ce(sk, skb);
				303	}
				304
				305	static void tcp_ecn_rcv_synack(struct tcp_sock tp, const struct tcphdr th)
				306	{
				307	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece \|\| th->cwr))
				308	tp->ecn_flags &= ~TCP_ECN_OK;
				309	}
				310
				311	static void tcp_ecn_rcv_syn(struct tcp_sock tp, const struct tcphdr th)
				312	{
				313	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece \|\| !th->cwr))
				314	tp->ecn_flags &= ~TCP_ECN_OK;
				315	}
				316
				317	static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock tp, const struct tcphdr th)
				318	{
				319	if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
				320	return true;
				321	return false;
				322	}
				323
				324	/* Buffer size and advertised window tuning.
				325	*
				326	* 1. Tuning sk->sk_sndbuf, when connection enters established state.
				327	*/
				328
				329	static void tcp_sndbuf_expand(struct sock *sk)
				330	{
				331	const struct tcp_sock *tp = tcp_sk(sk);
				332	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
				333	int sndmem, per_mss;
				334	u32 nr_segs;
				335
				336	/* Worst case is non GSO/TSO : each frame consumes one skb
				337	* and skb->head is kmalloced using power of two area of memory
				338	*/
				339	per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
				340	MAX_TCP_HEADER +
				341	SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
				342
				343	per_mss = roundup_pow_of_two(per_mss) +
				344	SKB_DATA_ALIGN(sizeof(struct sk_buff));
				345
				346	nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
				347	nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
				348
				349	/* Fast Recovery (RFC 5681 3.2) :
				350	* Cubic needs 1.7 factor, rounded to 2 to include
				351	* extra cushion (application might react slowly to EPOLLOUT)
				352	*/
				353	sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2;
				354	sndmem = nr_segs per_mss;
				355
				356	if (sk->sk_sndbuf < sndmem)
				357	sk->sk_sndbuf = min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]);
				358	}
				359
				360	/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
				361	*
				362	* All tcp_full_space() is split to two parts: "network" buffer, allocated
				363	* forward and advertised in receiver window (tp->rcv_wnd) and
				364	* "application buffer", required to isolate scheduling/application
				365	* latencies from network.
				366	* window_clamp is maximal advertised window. It can be less than
				367	* tcp_full_space(), in this case tcp_full_space() - window_clamp
				368	* is reserved for "application" buffer. The less window_clamp is
				369	* the smoother our behaviour from viewpoint of network, but the lower
				370	* throughput and the higher sensitivity of the connection to losses. 8)
				371	*
				372	* rcv_ssthresh is more strict window_clamp used at "slow start"
				373	* phase to predict further behaviour of this connection.
				374	* It is used for two goals:
				375	* - to enforce header prediction at sender, even when application
				376	* requires some significant "application buffer". It is check #1.
				377	* - to prevent pruning of receive queue because of misprediction
				378	* of receiver window. Check #2.
				379	*
				380	* The scheme does not work when sender sends good segments opening
				381	* window and then starts to feed us spaghetti. But it should work
				382	* in common situations. Otherwise, we have to rely on queue collapsing.
				383	*/
				384
				385	/* Slow part of check#2. */
				386	static int __tcp_grow_window(const struct sock sk, const struct sk_buff skb)
				387	{
				388	struct tcp_sock *tp = tcp_sk(sk);
				389	/* Optimize this! */
				390	int truesize = tcp_win_from_space(sk, skb->truesize) >> 1;
				391	int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
				392
				393	while (tp->rcv_ssthresh <= window) {
				394	if (truesize <= skb->len)
				395	return 2 * inet_csk(sk)->icsk_ack.rcv_mss;
				396
				397	truesize >>= 1;
				398	window >>= 1;
				399	}
				400	return 0;
				401	}
				402
				403	static void tcp_grow_window(struct sock sk, const struct sk_buff skb)
				404	{
				405	struct tcp_sock *tp = tcp_sk(sk);
				406	int room;
				407
				408	room = min_t(int, tp->window_clamp, tcp_space(sk)) - tp->rcv_ssthresh;
				409
				410	/* Check #1 */
				411	if (room > 0 && !tcp_under_memory_pressure(sk)) {
				412	int incr;
				413
				414	/* Check #2. Increase window, if skb with such overhead
				415	* will fit to rcvbuf in future.
				416	*/
				417	if (tcp_win_from_space(sk, skb->truesize) <= skb->len)
				418	incr = 2 * tp->advmss;
				419	else
				420	incr = __tcp_grow_window(sk, skb);
				421
				422	if (incr) {
				423	incr = max_t(int, incr, 2 * skb->len);
				424	tp->rcv_ssthresh += min(room, incr);
				425	inet_csk(sk)->icsk_ack.quick \|= 1;
				426	}
				427	}
				428	}
				429
				430	/* 3. Try to fixup all. It is made immediately after connection enters
				431	* established state.
				432	*/
				433	void tcp_init_buffer_space(struct sock *sk)
				434	{
				435	int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win;
				436	struct tcp_sock *tp = tcp_sk(sk);
				437	int maxwin;
				438
				439	if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
				440	tcp_sndbuf_expand(sk);
				441
				442	tp->rcvq_space.space = min_t(u32, tp->rcv_wnd, TCP_INIT_CWND * tp->advmss);
				443	tcp_mstamp_refresh(tp);
				444	tp->rcvq_space.time = tp->tcp_mstamp;
				445	tp->rcvq_space.seq = tp->copied_seq;
				446
				447	maxwin = tcp_full_space(sk);
				448
				449	if (tp->window_clamp >= maxwin) {
				450	tp->window_clamp = maxwin;
				451
				452	if (tcp_app_win && maxwin > 4 * tp->advmss)
				453	tp->window_clamp = max(maxwin -
				454	(maxwin >> tcp_app_win),
				455	4 * tp->advmss);
				456	}
				457
				458	/* Force reservation of one segment. */
				459	if (tcp_app_win &&
				460	tp->window_clamp > 2 * tp->advmss &&
				461	tp->window_clamp + tp->advmss > maxwin)
				462	tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);
				463
				464	tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
				465	tp->snd_cwnd_stamp = tcp_jiffies32;
				466	}
				467
				468	/* 4. Recalculate window clamp after socket hit its memory bounds. */
				469	static void tcp_clamp_window(struct sock *sk)
				470	{
				471	struct tcp_sock *tp = tcp_sk(sk);
				472	struct inet_connection_sock *icsk = inet_csk(sk);
				473	struct net *net = sock_net(sk);
				474
				475	icsk->icsk_ack.quick = 0;
				476
				477	if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] &&
				478	!(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
				479	!tcp_under_memory_pressure(sk) &&
				480	sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
				481	sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
				482	net->ipv4.sysctl_tcp_rmem[2]);
				483	}
				484	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
				485	tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
				486	}
				487
				488	/* Initialize RCV_MSS value.
				489	* RCV_MSS is an our guess about MSS used by the peer.
				490	* We haven't any direct information about the MSS.
				491	* It's better to underestimate the RCV_MSS rather than overestimate.
				492	* Overestimations make us ACKing less frequently than needed.
				493	* Underestimations are more easy to detect and fix by tcp_measure_rcv_mss().
				494	*/
				495	void tcp_initialize_rcv_mss(struct sock *sk)
				496	{
				497	const struct tcp_sock *tp = tcp_sk(sk);
				498	unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
				499
				500	hint = min(hint, tp->rcv_wnd / 2);
				501	hint = min(hint, TCP_MSS_DEFAULT);
				502	hint = max(hint, TCP_MIN_MSS);
				503
				504	inet_csk(sk)->icsk_ack.rcv_mss = hint;
				505	}
				506	EXPORT_SYMBOL(tcp_initialize_rcv_mss);
				507
				508	/* Receiver "autotuning" code.
				509	*
				510	* The algorithm for RTT estimation w/o timestamps is based on
				511	* Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
				512	* <http://public.lanl.gov/radiant/pubs.html#DRS>
				513	*
				514	* More detail on this code can be found at
				515	* <http://staff.psc.edu/jheffner/>,
				516	* though this reference is out of date. A new paper
				517	* is pending.
				518	*/
				519	static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
				520	{
				521	u32 new_sample = tp->rcv_rtt_est.rtt_us;
				522	long m = sample;
				523
				524	if (new_sample != 0) {
				525	/* If we sample in larger samples in the non-timestamp
				526	* case, we could grossly overestimate the RTT especially
				527	* with chatty applications or bulk transfer apps which
				528	* are stalled on filesystem I/O.
				529	*
				530	* Also, since we are only going for a minimum in the
				531	* non-timestamp case, we do not smooth things out
				532	* else with timestamps disabled convergence takes too
				533	* long.
				534	*/
				535	if (!win_dep) {
				536	m -= (new_sample >> 3);
				537	new_sample += m;
				538	} else {
				539	m <<= 3;
				540	if (m < new_sample)
				541	new_sample = m;
				542	}
				543	} else {
				544	/* No previous measure. */
				545	new_sample = m << 3;
				546	}
				547
				548	tp->rcv_rtt_est.rtt_us = new_sample;
				549	}
				550
				551	static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
				552	{
				553	u32 delta_us;
				554
				555	if (tp->rcv_rtt_est.time == 0)
				556	goto new_measure;
				557	if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
				558	return;
				559	delta_us = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcv_rtt_est.time);
				560	if (!delta_us)
				561	delta_us = 1;
				562	tcp_rcv_rtt_update(tp, delta_us, 1);
				563
				564	new_measure:
				565	tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
				566	tp->rcv_rtt_est.time = tp->tcp_mstamp;
				567	}
				568
				569	static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
				570	const struct sk_buff *skb)
				571	{
				572	struct tcp_sock *tp = tcp_sk(sk);
				573
				574	if (tp->rx_opt.rcv_tsecr == tp->rcv_rtt_last_tsecr)
				575	return;
				576	tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;
				577
				578	if (TCP_SKB_CB(skb)->end_seq -
				579	TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) {
				580	u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
				581	u32 delta_us;
				582
				583	if (!delta)
				584	delta = 1;
				585	delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
				586	tcp_rcv_rtt_update(tp, delta_us, 0);
				587	}
				588	}
				589
				590	/*
				591	* This function should be called every time data is copied to user space.
				592	* It calculates the appropriate TCP receive buffer space.
				593	*/
				594	void tcp_rcv_space_adjust(struct sock *sk)
				595	{
				596	struct tcp_sock *tp = tcp_sk(sk);
				597	u32 copied;
				598	int time;
				599
				600	trace_tcp_rcv_space_adjust(sk);
				601
				602	tcp_mstamp_refresh(tp);
				603	time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time);
				604	if (time < (tp->rcv_rtt_est.rtt_us >> 3) \|\| tp->rcv_rtt_est.rtt_us == 0)
				605	return;
				606
				607	/* Number of bytes copied to user in last RTT */
				608	copied = tp->copied_seq - tp->rcvq_space.seq;
				609	if (copied <= tp->rcvq_space.space)
				610	goto new_measure;
				611
				612	/* A bit of theory :
				613	* copied = bytes received in previous RTT, our base window
				614	* To cope with packet losses, we need a 2x factor
				615	* To cope with slow start, and sender growing its cwin by 100 %
				616	* every RTT, we need a 4x factor, because the ACK we are sending
				617	* now is for the next RTT, not the current one :
				618	* <prev RTT . ><current RTT .. ><next RTT .... >
				619	*/
				620
				621	if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
				622	!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
				623	int rcvmem, rcvbuf;
				624	u64 rcvwin, grow;
				625
				626	/* minimal window to cope with packet losses, assuming
				627	* steady state. Add some cushion because of small variations.
				628	*/
				629	rcvwin = ((u64)copied << 1) + 16 * tp->advmss;
				630
				631	/* Accommodate for sender rate increase (eg. slow start) */
				632	grow = rcvwin * (copied - tp->rcvq_space.space);
				633	do_div(grow, tp->rcvq_space.space);
				634	rcvwin += (grow << 1);
				635
				636	rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
				637	while (tcp_win_from_space(sk, rcvmem) < tp->advmss)
				638	rcvmem += 128;
				639
				640	do_div(rcvwin, tp->advmss);
				641	rcvbuf = min_t(u64, rcvwin * rcvmem,
				642	sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
				643	if (rcvbuf > sk->sk_rcvbuf) {
				644	sk->sk_rcvbuf = rcvbuf;
				645
				646	/* Make the window clamp follow along. */
				647	tp->window_clamp = tcp_win_from_space(sk, rcvbuf);
				648	}
				649	}
				650	tp->rcvq_space.space = copied;
				651
				652	new_measure:
				653	tp->rcvq_space.seq = tp->copied_seq;
				654	tp->rcvq_space.time = tp->tcp_mstamp;
				655	}
				656
				657	/* There is something which you must keep in mind when you analyze the
				658	* behavior of the tp->ato delayed ack timeout interval. When a
				659	* connection starts up, we want to ack as quickly as possible. The
				660	* problem is that "good" TCP's do slow start at the beginning of data
				661	* transmission. The means that until we send the first few ACK's the
				662	* sender will sit on his end and only queue most of his data, because
				663	* he can only send snd_cwnd unacked packets at any given time. For
				664	* each ACK we send, he increments snd_cwnd and transmits more of his
				665	* queue. -DaveM
				666	*/
				667	static void tcp_event_data_recv(struct sock sk, struct sk_buff skb)
				668	{
				669	struct tcp_sock *tp = tcp_sk(sk);
				670	struct inet_connection_sock *icsk = inet_csk(sk);
				671	u32 now;
				672
				673	inet_csk_schedule_ack(sk);
				674
				675	tcp_measure_rcv_mss(sk, skb);
				676
				677	tcp_rcv_rtt_measure(tp);
				678
				679	now = tcp_jiffies32;
				680
				681	if (!icsk->icsk_ack.ato) {
				682	/* The _first_ data packet received, initialize
				683	* delayed ACK engine.
				684	*/
				685	tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
				686	icsk->icsk_ack.ato = TCP_ATO_MIN;
				687	} else {
				688	int m = now - icsk->icsk_ack.lrcvtime;
				689
				690	if (m <= TCP_ATO_MIN / 2) {
				691	/* The fastest case is the first. */
				692	icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;
				693	} else if (m < icsk->icsk_ack.ato) {
				694	icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m;
				695	if (icsk->icsk_ack.ato > icsk->icsk_rto)
				696	icsk->icsk_ack.ato = icsk->icsk_rto;
				697	} else if (m > icsk->icsk_rto) {
				698	/* Too long gap. Apparently sender failed to
				699	* restart window, so that we send ACKs quickly.
				700	*/
				701	tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
				702	sk_mem_reclaim(sk);
				703	}
				704	}
				705	icsk->icsk_ack.lrcvtime = now;
				706
				707	tcp_ecn_check_ce(sk, skb);
				708
				709	if (skb->len >= 128)
				710	tcp_grow_window(sk, skb);
				711	}
				712
				713	/* Called to compute a smoothed rtt estimate. The data fed to this
				714	* routine either comes from timestamps, or from segments that were
				715	* known _not_ to have been retransmitted [see Karn/Partridge
				716	* Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
				717	* piece by Van Jacobson.
				718	* NOTE: the next three routines used to be one big routine.
				719	* To save cycles in the RFC 1323 implementation it was better to break
				720	* it up into three procedures. -- erics
				721	*/
				722	static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
				723	{
				724	struct tcp_sock *tp = tcp_sk(sk);
				725	long m = mrtt_us; /* RTT */
				726	u32 srtt = tp->srtt_us;
				727
				728	/* The following amusing code comes from Jacobson's
				729	* article in SIGCOMM '88. Note that rtt and mdev
				730	* are scaled versions of rtt and mean deviation.
				731	* This is designed to be as fast as possible
				732	* m stands for "measurement".
				733	*
				734	* On a 1990 paper the rto value is changed to:
				735	* RTO = rtt + 4 * mdev
				736	*
				737	* Funny. This algorithm seems to be very broken.
				738	* These formulae increase RTO, when it should be decreased, increase
				739	* too slowly, when it should be increased quickly, decrease too quickly
				740	* etc. I guess in BSD RTO takes ONE value, so that it is absolutely
				741	* does not matter how to _calculate_ it. Seems, it was trap
				742	* that VJ failed to avoid. 8)
				743	*/
				744	if (srtt != 0) {
				745	m -= (srtt >> 3); /* m is now error in rtt est */
				746	srtt += m; /* rtt = 7/8 rtt + 1/8 new */
				747	if (m < 0) {
				748	m = -m; /* m is now abs(error) */
				749	m -= (tp->mdev_us >> 2); /* similar update on mdev */
				750	/* This is similar to one of Eifel findings.
				751	* Eifel blocks mdev updates when rtt decreases.
				752	* This solution is a bit different: we use finer gain
				753	* for mdev in this case (alpha*beta).
				754	* Like Eifel it also prevents growth of rto,
				755	* but also it limits too fast rto decreases,
				756	* happening in pure Eifel.
				757	*/
				758	if (m > 0)
				759	m >>= 3;
				760	} else {
				761	m -= (tp->mdev_us >> 2); /* similar update on mdev */
				762	}
				763	tp->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */
				764	if (tp->mdev_us > tp->mdev_max_us) {
				765	tp->mdev_max_us = tp->mdev_us;
				766	if (tp->mdev_max_us > tp->rttvar_us)
				767	tp->rttvar_us = tp->mdev_max_us;
				768	}
				769	if (after(tp->snd_una, tp->rtt_seq)) {
				770	if (tp->mdev_max_us < tp->rttvar_us)
				771	tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
				772	tp->rtt_seq = tp->snd_nxt;
				773	tp->mdev_max_us = tcp_rto_min_us(sk);
				774	}
				775	} else {
				776	/* no previous measure. */
				777	srtt = m << 3; /* take the measured time to be rtt */
				778	tp->mdev_us = m << 1; /* make sure rto = 3rtt /
				779	tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
				780	tp->mdev_max_us = tp->rttvar_us;
				781	tp->rtt_seq = tp->snd_nxt;
				782	}
				783	tp->srtt_us = max(1U, srtt);
				784	}
				785
				786	static void tcp_update_pacing_rate(struct sock *sk)
				787	{
				788	const struct tcp_sock *tp = tcp_sk(sk);
				789	u64 rate;
				790
				791	/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
				792	rate = (u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3);
				793
				794	/* current rate is (cwnd * mss) / srtt
				795	* In Slow Start [1], set sk_pacing_rate to 200 % the current rate.
				796	* In Congestion Avoidance phase, set it to 120 % the current rate.
				797	*
				798	* [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh)
				799	* If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching
				800	* end of slow start and should slow down.
				801	*/
				802	if (tp->snd_cwnd < tp->snd_ssthresh / 2)
				803	rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio;
				804	else
				805	rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio;
				806
				807	rate *= max(tp->snd_cwnd, tp->packets_out);
				808
				809	if (likely(tp->srtt_us))
				810	do_div(rate, tp->srtt_us);
				811
				812	/* WRITE_ONCE() is needed because sch_fq fetches sk_pacing_rate
				813	* without any lock. We want to make sure compiler wont store
				814	* intermediate values in this location.
				815	*/
				816	WRITE_ONCE(sk->sk_pacing_rate, min_t(u64, rate,
				817	sk->sk_max_pacing_rate));
				818	}
				819
				820	/* Calculate rto without backoff. This is the second half of Van Jacobson's
				821	* routine referred to above.
				822	*/
				823	static void tcp_set_rto(struct sock *sk)
				824	{
				825	const struct tcp_sock *tp = tcp_sk(sk);
				826	/* Old crap is replaced with new one. 8)
				827	*
				828	* More seriously:
				829	* 1. If rtt variance happened to be less 50msec, it is hallucination.
				830	* It cannot be less due to utterly erratic ACK generation made
				831	* at least by solaris and freebsd. "Erratic ACKs" has _nothing_
				832	* to do with delayed acks, because at cwnd>2 true delack timeout
				833	* is invisible. Actually, Linux-2.4 also generates erratic
				834	* ACKs in some circumstances.
				835	*/
				836	inet_csk(sk)->icsk_rto = __tcp_set_rto(tp);
				837
				838	/* 2. Fixups made earlier cannot be right.
				839	* If we do not estimate RTO correctly without them,
				840	* all the algo is pure shit and should be replaced
				841	* with correct one. It is exactly, which we pretend to do.
				842	*/
				843
				844	/* NOTE: clamping at TCP_RTO_MIN is not required, current algo
				845	* guarantees that rto is higher.
				846	*/
				847	tcp_bound_rto(sk);
				848	}
				849
				850	__u32 tcp_init_cwnd(const struct tcp_sock tp, const struct dst_entry dst)
				851	{
				852	__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
				853
				854	if (!cwnd)
				855	cwnd = TCP_INIT_CWND;
				856	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
				857	}
				858
				859	/* Take a notice that peer is sending D-SACKs */
				860	static void tcp_dsack_seen(struct tcp_sock *tp)
				861	{
				862	tp->rx_opt.sack_ok \|= TCP_DSACK_SEEN;
				863	tp->rack.dsack_seen = 1;
				864	tp->dsack_dups++;
				865	}
				866
				867	/* It's reordering when higher sequence was delivered (i.e. sacked) before
				868	* some lower never-retransmitted sequence ("low_seq"). The maximum reordering
				869	* distance is approximated in full-mss packet distance ("reordering").
				870	*/
				871	static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq,
				872	const int ts)
				873	{
				874	struct tcp_sock *tp = tcp_sk(sk);
				875	const u32 mss = tp->mss_cache;
				876	u32 fack, metric;
				877
				878	fack = tcp_highest_sack_seq(tp);
				879	if (!before(low_seq, fack))
				880	return;
				881
				882	metric = fack - low_seq;
				883	if ((metric > tp->reordering * mss) && mss) {
				884	#if FASTRETRANS_DEBUG > 1
				885	pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
				886	tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
				887	tp->reordering,
				888	0,
				889	tp->sacked_out,
				890	tp->undo_marker ? tp->undo_retrans : 0);
				891	#endif
				892	tp->reordering = min_t(u32, (metric + mss - 1) / mss,
				893	sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
				894	}
				895
				896	/* This exciting event is worth to be remembered. 8) */
				897	tp->reord_seen++;
				898	NET_INC_STATS(sock_net(sk),
				899	ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER);
				900	}
				901
				902	/* This must be called before lost_out is incremented */
				903	static void tcp_verify_retransmit_hint(struct tcp_sock tp, struct sk_buff skb)
				904	{
				905	if ((!tp->retransmit_skb_hint && tp->retrans_out >= tp->lost_out) \|\|
				906	(tp->retransmit_skb_hint &&
				907	before(TCP_SKB_CB(skb)->seq,
				908	TCP_SKB_CB(tp->retransmit_skb_hint)->seq)))
				909	tp->retransmit_skb_hint = skb;
				910	}
				911
				912	/* Sum the number of packets on the wire we have marked as lost.
				913	* There are two cases we care about here:
				914	* a) Packet hasn't been marked lost (nor retransmitted),
				915	* and this is the first loss.
				916	* b) Packet has been marked both lost and retransmitted,
				917	* and this means we think it was lost again.
				918	*/
				919	static void tcp_sum_lost(struct tcp_sock tp, struct sk_buff skb)
				920	{
				921	__u8 sacked = TCP_SKB_CB(skb)->sacked;
				922
				923	if (!(sacked & TCPCB_LOST) \|\|
				924	((sacked & TCPCB_LOST) && (sacked & TCPCB_SACKED_RETRANS)))
				925	tp->lost += tcp_skb_pcount(skb);
				926	}
				927
				928	static void tcp_skb_mark_lost(struct tcp_sock tp, struct sk_buff skb)
				929	{
				930	if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST\|TCPCB_SACKED_ACKED))) {
				931	tcp_verify_retransmit_hint(tp, skb);
				932
				933	tp->lost_out += tcp_skb_pcount(skb);
				934	tcp_sum_lost(tp, skb);
				935	TCP_SKB_CB(skb)->sacked \|= TCPCB_LOST;
				936	}
				937	}
				938
				939	void tcp_skb_mark_lost_uncond_verify(struct tcp_sock tp, struct sk_buff skb)
				940	{
				941	tcp_verify_retransmit_hint(tp, skb);
				942
				943	tcp_sum_lost(tp, skb);
				944	if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST\|TCPCB_SACKED_ACKED))) {
				945	tp->lost_out += tcp_skb_pcount(skb);
				946	TCP_SKB_CB(skb)->sacked \|= TCPCB_LOST;
				947	}
				948	}
				949
				950	/* This procedure tags the retransmission queue when SACKs arrive.
				951	*
				952	* We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
				953	* Packets in queue with these bits set are counted in variables
				954	* sacked_out, retrans_out and lost_out, correspondingly.
				955	*
				956	* Valid combinations are:
				957	* Tag	InFlight	Description
				958	* 0	1		- orig segment is in flight.
				959	* S	0		- nothing flies, orig reached receiver.
				960	* L	0		- nothing flies, orig lost by net.
				961	* R	2		- both orig and retransmit are in flight.
				962	* L|R	1		- orig is lost, retransmit is in flight.
				963	* S|R	1		- orig reached receiver, retrans is still in flight.
				964	* (L|S|R is logically valid, it could occur when L|R is sacked,
				965	* but it is equivalent to plain S and code short-circuits it to S.
				966	* L|S is logically invalid, it would mean -1 packet in flight 8))
				967	*
				968	* These 6 states form finite state machine, controlled by the following events:
				969	* 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
				970	* 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
				971	* 3. Loss detection event of two flavors:
				972	* A. Scoreboard estimator decided the packet is lost.
				973	* A'. Reno "three dupacks" marks head of queue lost.
				974	* B. SACK arrives sacking SND.NXT at the moment, when the
				975	* segment was retransmitted.
				976	* 4. D-SACK added new rule: D-SACK changes any tag to S.
				977	*
				978	* It is pleasant to note, that state diagram turns out to be commutative,
				979	* so that we are allowed not to be bothered by order of our actions,
				980	* when multiple events arrive simultaneously. (see the function below).
				981	*
				982	* Reordering detection.
				983	* --------------------
				984	* Reordering metric is maximal distance, which a packet can be displaced
				985	* in packet stream. With SACKs we can estimate it:
				986	*
				987	* 1. SACK fills old hole and the corresponding segment was not
				988	* ever retransmitted -> reordering. Alas, we cannot use it
				989	* when segment was retransmitted.
				990	* 2. The last flaw is solved with D-SACK. D-SACK arrives
				991	* for retransmitted and already SACKed segment -> reordering..
				992	* Both of these heuristics are not used in Loss state, when we cannot
				993	* account for retransmits accurately.
				994	*
				995	* SACK block validation.
				996	* ----------------------
				997	*
				998	* SACK block range validation checks that the received SACK block fits to
				999	* the expected sequence limits, i.e., it is between SND.UNA and SND.NXT.
				1000	* Note that SND.UNA is not included to the range though being valid because
				1001	* it means that the receiver is rather inconsistent with itself reporting
				1002	* SACK reneging when it should advance SND.UNA. Such a SACK block is
				1003	* perfectly valid, however, in light of RFC2018 which explicitly states
				1004	* that "SACK block MUST reflect the newest segment. Even if the newest
				1005	* segment is going to be discarded ...", not that it looks very clever
				1006	* in case of head skb. Due to potential receiver driven attacks, we
				1007	* choose to avoid immediate execution of a walk in write queue due to
				1008	* reneging and defer head skb's loss recovery to standard loss recovery
				1009	* procedure that will eventually trigger (nothing forbids us doing this).
				1010	*
				1011	* Implements also blockage to start_seq wrap-around. Problem lies in the
				1012	* fact that though start_seq (s) is before end_seq (i.e., not reversed),
				1013	* there's no guarantee that it will be before snd_nxt (n). The problem
				1014	* happens when start_seq resides between end_seq wrap (e_w) and snd_nxt
				1015	* wrap (s_w):
				1016	*
				1017	* <- outs wnd -> <- wrapzone ->
				1018	* u e n u_w e_w s n_w
				1019	* \| \| \| \| \| \| \|
				1020	* \|<------------+------+----- TCP seqno space --------------+---------->\|
				1021	* ...-- <2^31 ->\| \|<--------...
				1022	* ...---- >2^31 ------>\| \|<--------...
				1023	*
				1024	* Current code wouldn't be vulnerable but it's better still to discard such
				1025	* crazy SACK blocks. Doing this check for start_seq alone closes somewhat
				1026	* similar case (end_seq after snd_nxt wrap) as earlier reversed check in
				1027	* snd_nxt wrap -> snd_una region will then become "well defined", i.e.,
				1028	* equal to the ideal case (infinite seqno space without wrap caused issues).
				1029	*
				1030	* With D-SACK the lower bound is extended to cover sequence space below
				1031	* SND.UNA down to undo_marker, which is the last point of interest. Yet
				1032	* again, D-SACK block must not to go across snd_una (for the same reason as
				1033	* for the normal SACK blocks, explained above). But there all simplicity
				1034	* ends, TCP might receive valid D-SACKs below that. As long as they reside
				1035	* fully below undo_marker they do not affect behavior in anyway and can
				1036	* therefore be safely ignored. In rare cases (which are more or less
				1037	* theoretical ones), the D-SACK will nicely cross that boundary due to skb
				1038	* fragmentation and packet reordering past skb's retransmission. To consider
				1039	* them correctly, the acceptable range must be extended even more though
				1040	* the exact amount is rather hard to quantify. However, tp->max_window can
				1041	* be used as an exaggerated estimate.
				1042	*/
				1043	static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
				1044	u32 start_seq, u32 end_seq)
				1045	{
				1046	/* Too far in future, or reversed (interpretation is ambiguous) */
				1047	if (after(end_seq, tp->snd_nxt) \|\| !before(start_seq, end_seq))
				1048	return false;
				1049
				1050	/* Nasty start_seq wrap-around check (see comments above) */
				1051	if (!before(start_seq, tp->snd_nxt))
				1052	return false;
				1053
				1054	/* In outstanding window? ...This is valid exit for D-SACKs too.
				1055	* start_seq == snd_una is non-sensical (see comments above)
				1056	*/
				1057	if (after(start_seq, tp->snd_una))
				1058	return true;
				1059
				1060	if (!is_dsack \|\| !tp->undo_marker)
				1061	return false;
				1062
				1063	/* ...Then it's D-SACK, and must reside below snd_una completely */
				1064	if (after(end_seq, tp->snd_una))
				1065	return false;
				1066
				1067	if (!before(start_seq, tp->undo_marker))
				1068	return true;
				1069
				1070	/* Too old */
				1071	if (!after(end_seq, tp->undo_marker))
				1072	return false;
				1073
				1074	/* Undo_marker boundary crossing (overestimates a lot). Known already:
				1075	* start_seq < undo_marker and end_seq >= undo_marker.
				1076	*/
				1077	return !before(start_seq, end_seq - tp->max_window);
				1078	}
				1079
				1080	static bool tcp_check_dsack(struct sock sk, const struct sk_buff ack_skb,
				1081	struct tcp_sack_block_wire *sp, int num_sacks,
				1082	u32 prior_snd_una)
				1083	{
				1084	struct tcp_sock *tp = tcp_sk(sk);
				1085	u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
				1086	u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
				1087	bool dup_sack = false;
				1088
				1089	if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
				1090	dup_sack = true;
				1091	tcp_dsack_seen(tp);
				1092	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
				1093	} else if (num_sacks > 1) {
				1094	u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
				1095	u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);
				1096
				1097	if (!after(end_seq_0, end_seq_1) &&
				1098	!before(start_seq_0, start_seq_1)) {
				1099	dup_sack = true;
				1100	tcp_dsack_seen(tp);
				1101	NET_INC_STATS(sock_net(sk),
				1102	LINUX_MIB_TCPDSACKOFORECV);
				1103	}
				1104	}
				1105
				1106	/* D-SACK for already forgotten data... Do dumb counting. */
				1107	if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 &&
				1108	!after(end_seq_0, prior_snd_una) &&
				1109	after(end_seq_0, tp->undo_marker))
				1110	tp->undo_retrans--;
				1111
				1112	return dup_sack;
				1113	}
				1114
				1115	struct tcp_sacktag_state {
				1116	u32 reord;
				1117	/* Timestamps for earliest and latest never-retransmitted segment
				1118	* that was SACKed. RTO needs the earliest RTT to stay conservative,
				1119	* but congestion control should still get an accurate delay signal.
				1120	*/
				1121	u64 first_sackt;
				1122	u64 last_sackt;
				1123	struct rate_sample *rate;
				1124	int flag;
				1125	unsigned int mss_now;
				1126	};
				1127
				1128	/* Check if skb is fully within the SACK block. In presence of GSO skbs,
				1129	* the incoming SACK may not exactly match but we can find smaller MSS
				1130	* aligned portion of it that matches. Therefore we might need to fragment
				1131	* which may fail and creates some hassle (caller must handle error case
				1132	* returns).
				1133	*
				1134	* FIXME: this could be merged to shift decision code
				1135	*/
				1136	static int tcp_match_skb_to_sack(struct sock sk, struct sk_buff skb,
				1137	u32 start_seq, u32 end_seq)
				1138	{
				1139	int err;
				1140	bool in_sack;
				1141	unsigned int pkt_len;
				1142	unsigned int mss;
				1143
				1144	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
				1145	!before(end_seq, TCP_SKB_CB(skb)->end_seq);
				1146
				1147	if (tcp_skb_pcount(skb) > 1 && !in_sack &&
				1148	after(TCP_SKB_CB(skb)->end_seq, start_seq)) {
				1149	mss = tcp_skb_mss(skb);
				1150	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
				1151
				1152	if (!in_sack) {
				1153	pkt_len = start_seq - TCP_SKB_CB(skb)->seq;
				1154	if (pkt_len < mss)
				1155	pkt_len = mss;
				1156	} else {
				1157	pkt_len = end_seq - TCP_SKB_CB(skb)->seq;
				1158	if (pkt_len < mss)
				1159	return -EINVAL;
				1160	}
				1161
				1162	/* Round if necessary so that SACKs cover only full MSSes
				1163	* and/or the remaining small portion (if present)
				1164	*/
				1165	if (pkt_len > mss) {
				1166	unsigned int new_len = (pkt_len / mss) * mss;
				1167	if (!in_sack && new_len < pkt_len)
				1168	new_len += mss;
				1169	pkt_len = new_len;
				1170	}
				1171
				1172	if (pkt_len >= skb->len && !in_sack)
				1173	return 0;
				1174
				1175	err = tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
				1176	pkt_len, mss, GFP_ATOMIC);
				1177	if (err < 0)
				1178	return err;
				1179	}
				1180
				1181	return in_sack;
				1182	}
				1183
				1184	/* Mark the given newly-SACKed range as such, adjusting counters and hints. */
				1185	static u8 tcp_sacktag_one(struct sock *sk,
				1186	struct tcp_sacktag_state *state, u8 sacked,
				1187	u32 start_seq, u32 end_seq,
				1188	int dup_sack, int pcount,
				1189	u64 xmit_time)
				1190	{
				1191	struct tcp_sock *tp = tcp_sk(sk);
				1192
				1193	/* Account D-SACK for retransmitted packet. */
				1194	if (dup_sack && (sacked & TCPCB_RETRANS)) {
				1195	if (tp->undo_marker && tp->undo_retrans > 0 &&
				1196	after(end_seq, tp->undo_marker))
				1197	tp->undo_retrans--;
				1198	if ((sacked & TCPCB_SACKED_ACKED) &&
				1199	before(start_seq, state->reord))
				1200	state->reord = start_seq;
				1201	}
				1202
				1203	/* Nothing to do; acked frame is about to be dropped (was ACKed). */
				1204	if (!after(end_seq, tp->snd_una))
				1205	return sacked;
				1206
				1207	if (!(sacked & TCPCB_SACKED_ACKED)) {
				1208	tcp_rack_advance(tp, sacked, end_seq, xmit_time);
				1209
				1210	if (sacked & TCPCB_SACKED_RETRANS) {
				1211	/* If the segment is not tagged as lost,
				1212	* we do not clear RETRANS, believing
				1213	* that retransmission is still in flight.
				1214	*/
				1215	if (sacked & TCPCB_LOST) {
				1216	sacked &= ~(TCPCB_LOST\|TCPCB_SACKED_RETRANS);
				1217	tp->lost_out -= pcount;
				1218	tp->retrans_out -= pcount;
				1219	}
				1220	} else {
				1221	if (!(sacked & TCPCB_RETRANS)) {
				1222	/* New sack for not retransmitted frame,
				1223	* which was in hole. It is reordering.
				1224	*/
				1225	if (before(start_seq,
				1226	tcp_highest_sack_seq(tp)) &&
				1227	before(start_seq, state->reord))
				1228	state->reord = start_seq;
				1229
				1230	if (!after(end_seq, tp->high_seq))
				1231	state->flag \|= FLAG_ORIG_SACK_ACKED;
				1232	if (state->first_sackt == 0)
				1233	state->first_sackt = xmit_time;
				1234	state->last_sackt = xmit_time;
				1235	}
				1236
				1237	if (sacked & TCPCB_LOST) {
				1238	sacked &= ~TCPCB_LOST;
				1239	tp->lost_out -= pcount;
				1240	}
				1241	}
				1242
				1243	sacked \|= TCPCB_SACKED_ACKED;
				1244	state->flag \|= FLAG_DATA_SACKED;
				1245	tp->sacked_out += pcount;
				1246	tp->delivered += pcount; /* Out-of-order packets delivered */
				1247
				1248	/* Lost marker hint past SACKed? Tweak RFC3517 cnt */
				1249	if (tp->lost_skb_hint &&
				1250	before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
				1251	tp->lost_cnt_hint += pcount;
				1252	}
				1253
				1254	/* D-SACK. We can detect redundant retransmission in S\|R and plain R
				1255	* frames and clear it. undo_retrans is decreased above, L\|R frames
				1256	* are accounted above as well.
				1257	*/
				1258	if (dup_sack && (sacked & TCPCB_SACKED_RETRANS)) {
				1259	sacked &= ~TCPCB_SACKED_RETRANS;
				1260	tp->retrans_out -= pcount;
				1261	}
				1262
				1263	return sacked;
				1264	}
				1265
				1266	/* Shift newly-SACKed bytes from this skb to the immediately previous
				1267	* already-SACKed sk_buff. Mark the newly-SACKed bytes as such.
				1268	*/
				1269	static bool tcp_shifted_skb(struct sock sk, struct sk_buff prev,
				1270	struct sk_buff *skb,
				1271	struct tcp_sacktag_state *state,
				1272	unsigned int pcount, int shifted, int mss,
				1273	bool dup_sack)
				1274	{
				1275	struct tcp_sock *tp = tcp_sk(sk);
				1276	u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */
				1277	u32 end_seq = start_seq + shifted; /* end of newly-SACKed */
				1278
				1279	BUG_ON(!pcount);
				1280
				1281	/* Adjust counters and hints for the newly sacked sequence
				1282	* range but discard the return value since prev is already
				1283	* marked. We must tag the range first because the seq
				1284	* advancement below implicitly advances
				1285	* tcp_highest_sack_seq() when skb is highest_sack.
				1286	*/
				1287	tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
				1288	start_seq, end_seq, dup_sack, pcount,
				1289	skb->skb_mstamp);
				1290	tcp_rate_skb_delivered(sk, skb, state->rate);
				1291
				1292	if (skb == tp->lost_skb_hint)
				1293	tp->lost_cnt_hint += pcount;
				1294
				1295	TCP_SKB_CB(prev)->end_seq += shifted;
				1296	TCP_SKB_CB(skb)->seq += shifted;
				1297
				1298	tcp_skb_pcount_add(prev, pcount);
				1299	WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount);
				1300	tcp_skb_pcount_add(skb, -pcount);
				1301
				1302	/* When we're adding to gso_segs == 1, gso_size will be zero,
				1303	* in theory this shouldn't be necessary but as long as DSACK
				1304	* code can come after this skb later on it's better to keep
				1305	* setting gso_size to something.
				1306	*/
				1307	if (!TCP_SKB_CB(prev)->tcp_gso_size)
				1308	TCP_SKB_CB(prev)->tcp_gso_size = mss;
				1309
				1310	/* CHECKME: To clear or not to clear? Mimics normal skb currently */
				1311	if (tcp_skb_pcount(skb) <= 1)
				1312	TCP_SKB_CB(skb)->tcp_gso_size = 0;
				1313
				1314	/* Difference in this won't matter, both ACKed by the same cumul. ACK */
				1315	TCP_SKB_CB(prev)->sacked \|= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
				1316
				1317	if (skb->len > 0) {
				1318	BUG_ON(!tcp_skb_pcount(skb));
				1319	NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTED);
				1320	return false;
				1321	}
				1322
				1323	/* Whole SKB was eaten :-) */
				1324
				1325	if (skb == tp->retransmit_skb_hint)
				1326	tp->retransmit_skb_hint = prev;
				1327	if (skb == tp->lost_skb_hint) {
				1328	tp->lost_skb_hint = prev;
				1329	tp->lost_cnt_hint -= tcp_skb_pcount(prev);
				1330	}
				1331
				1332	TCP_SKB_CB(prev)->tcp_flags \|= TCP_SKB_CB(skb)->tcp_flags;
				1333	TCP_SKB_CB(prev)->eor = TCP_SKB_CB(skb)->eor;
				1334	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
				1335	TCP_SKB_CB(prev)->end_seq++;
				1336
				1337	if (skb == tcp_highest_sack(sk))
				1338	tcp_advance_highest_sack(sk, skb);
				1339
				1340	tcp_skb_collapse_tstamp(prev, skb);
				1341	if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp))
				1342	TCP_SKB_CB(prev)->tx.delivered_mstamp = 0;
				1343
				1344	tcp_rtx_queue_unlink_and_free(skb, sk);
				1345
				1346	NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED);
				1347
				1348	return true;
				1349	}
				1350
				1351	/* I wish gso_size would have a bit more sane initialization than
				1352	* something-or-zero which complicates things
				1353	*/
				1354	static int tcp_skb_seglen(const struct sk_buff *skb)
				1355	{
				1356	return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb);
				1357	}
				1358
				1359	/* Shifting pages past head area doesn't work */
				1360	static int skb_can_shift(const struct sk_buff *skb)
				1361	{
				1362	return !skb_headlen(skb) && skb_is_nonlinear(skb);
				1363	}
				1364
				1365	int tcp_skb_shift(struct sk_buff to, struct sk_buff from,
				1366	int pcount, int shiftlen)
				1367	{
				1368	/* TCP min gso_size is 8 bytes (TCP_MIN_GSO_SIZE)
				1369	* Since TCP_SKB_CB(skb)->tcp_gso_segs is 16 bits, we need
				1370	* to make sure not storing more than 65535 * 8 bytes per skb,
				1371	* even if current MSS is bigger.
				1372	*/
				1373	if (unlikely(to->len + shiftlen >= 65535 * TCP_MIN_GSO_SIZE))
				1374	return 0;
				1375	if (unlikely(tcp_skb_pcount(to) + pcount > 65535))
				1376	return 0;
				1377	return skb_shift(to, from, shiftlen);
				1378	}
				1379
				1380	/* Try collapsing SACK blocks spanning across multiple skbs to a single
				1381	* skb.
				1382	*/
				1383	static struct sk_buff tcp_shift_skb_data(struct sock sk, struct sk_buff *skb,
				1384	struct tcp_sacktag_state *state,
				1385	u32 start_seq, u32 end_seq,
				1386	bool dup_sack)
				1387	{
				1388	struct tcp_sock *tp = tcp_sk(sk);
				1389	struct sk_buff *prev;
				1390	int mss;
				1391	int pcount = 0;
				1392	int len;
				1393	int in_sack;
				1394
				1395	/* Normally R but no L won't result in plain S */
				1396	if (!dup_sack &&
				1397	(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST\|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS)
				1398	goto fallback;
				1399	if (!skb_can_shift(skb))
				1400	goto fallback;
				1401	/* This frame is about to be dropped (was ACKed). */
				1402	if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
				1403	goto fallback;
				1404
				1405	/* Can only happen with delayed DSACK + discard craziness */
				1406	prev = skb_rb_prev(skb);
				1407	if (!prev)
				1408	goto fallback;
				1409
				1410	if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
				1411	goto fallback;
				1412
				1413	if (!tcp_skb_can_collapse_to(prev))
				1414	goto fallback;
				1415
				1416	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
				1417	!before(end_seq, TCP_SKB_CB(skb)->end_seq);
				1418
				1419	if (in_sack) {
				1420	len = skb->len;
				1421	pcount = tcp_skb_pcount(skb);
				1422	mss = tcp_skb_seglen(skb);
				1423
				1424	/* TODO: Fix DSACKs to not fragment already SACKed and we can
				1425	* drop this restriction as unnecessary
				1426	*/
				1427	if (mss != tcp_skb_seglen(prev))
				1428	goto fallback;
				1429	} else {
				1430	if (!after(TCP_SKB_CB(skb)->end_seq, start_seq))
				1431	goto noop;
				1432	/* CHECKME: This is non-MSS split case only?, this will
				1433	* cause skipped skbs due to advancing loop btw, original
				1434	* has that feature too
				1435	*/
				1436	if (tcp_skb_pcount(skb) <= 1)
				1437	goto noop;
				1438
				1439	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
				1440	if (!in_sack) {
				1441	/* TODO: head merge to next could be attempted here
				1442	* if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)),
				1443	* though it might not be worth of the additional hassle
				1444	*
				1445	* ...we can probably just fallback to what was done
				1446	* previously. We could try merging non-SACKed ones
				1447	* as well but it probably isn't going to buy off
				1448	* because later SACKs might again split them, and
				1449	* it would make skb timestamp tracking considerably
				1450	* harder problem.
				1451	*/
				1452	goto fallback;
				1453	}
				1454
				1455	len = end_seq - TCP_SKB_CB(skb)->seq;
				1456	BUG_ON(len < 0);
				1457	BUG_ON(len > skb->len);
				1458
				1459	/* MSS boundaries should be honoured or else pcount will
				1460	* severely break even though it makes things bit trickier.
				1461	* Optimize common case to avoid most of the divides
				1462	*/
				1463	mss = tcp_skb_mss(skb);
				1464
				1465	/* TODO: Fix DSACKs to not fragment already SACKed and we can
				1466	* drop this restriction as unnecessary
				1467	*/
				1468	if (mss != tcp_skb_seglen(prev))
				1469	goto fallback;
				1470
				1471	if (len == mss) {
				1472	pcount = 1;
				1473	} else if (len < mss) {
				1474	goto noop;
				1475	} else {
				1476	pcount = len / mss;
				1477	len = pcount * mss;
				1478	}
				1479	}
				1480
				1481	/* tcp_sacktag_one() won't SACK-tag ranges below snd_una */
				1482	if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una))
				1483	goto fallback;
				1484
				1485	if (!tcp_skb_shift(prev, skb, pcount, len))
				1486	goto fallback;
				1487	if (!tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, dup_sack))
				1488	goto out;
				1489
				1490	/* Hole filled allows collapsing with the next as well, this is very
				1491	* useful when hole on every nth skb pattern happens
				1492	*/
				1493	skb = skb_rb_next(prev);
				1494	if (!skb)
				1495	goto out;
				1496
				1497	if (!skb_can_shift(skb) \|\|
				1498	((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) \|\|
				1499	(mss != tcp_skb_seglen(skb)))
				1500	goto out;
				1501
				1502	len = skb->len;
				1503	pcount = tcp_skb_pcount(skb);
				1504	if (tcp_skb_shift(prev, skb, pcount, len))
				1505	tcp_shifted_skb(sk, prev, skb, state, pcount,
				1506	len, mss, 0);
				1507
				1508	out:
				1509	return prev;
				1510
				1511	noop:
				1512	return skb;
				1513
				1514	fallback:
				1515	NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK);
				1516	return NULL;
				1517	}
				1518
				1519	static struct sk_buff tcp_sacktag_walk(struct sk_buff skb, struct sock *sk,
				1520	struct tcp_sack_block *next_dup,
				1521	struct tcp_sacktag_state *state,
				1522	u32 start_seq, u32 end_seq,
				1523	bool dup_sack_in)
				1524	{
				1525	struct tcp_sock *tp = tcp_sk(sk);
				1526	struct sk_buff *tmp;
				1527
				1528	skb_rbtree_walk_from(skb) {
				1529	int in_sack = 0;
				1530	bool dup_sack = dup_sack_in;
				1531
				1532	/* queue is in-order => we can short-circuit the walk early */
				1533	if (!before(TCP_SKB_CB(skb)->seq, end_seq))
				1534	break;
				1535
				1536	if (next_dup &&
				1537	before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) {
				1538	in_sack = tcp_match_skb_to_sack(sk, skb,
				1539	next_dup->start_seq,
				1540	next_dup->end_seq);
				1541	if (in_sack > 0)
				1542	dup_sack = true;
				1543	}
				1544
				1545	/* skb reference here is a bit tricky to get right, since
				1546	* shifting can eat and free both this skb and the next,
				1547	* so not even _safe variant of the loop is enough.
				1548	*/
				1549	if (in_sack <= 0) {
				1550	tmp = tcp_shift_skb_data(sk, skb, state,
				1551	start_seq, end_seq, dup_sack);
				1552	if (tmp) {
				1553	if (tmp != skb) {
				1554	skb = tmp;
				1555	continue;
				1556	}
				1557
				1558	in_sack = 0;
				1559	} else {
				1560	in_sack = tcp_match_skb_to_sack(sk, skb,
				1561	start_seq,
				1562	end_seq);
				1563	}
				1564	}
				1565
				1566	if (unlikely(in_sack < 0))
				1567	break;
				1568
				1569	if (in_sack) {
				1570	TCP_SKB_CB(skb)->sacked =
				1571	tcp_sacktag_one(sk,
				1572	state,
				1573	TCP_SKB_CB(skb)->sacked,
				1574	TCP_SKB_CB(skb)->seq,
				1575	TCP_SKB_CB(skb)->end_seq,
				1576	dup_sack,
				1577	tcp_skb_pcount(skb),
				1578	skb->skb_mstamp);
				1579	tcp_rate_skb_delivered(sk, skb, state->rate);
				1580	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
				1581	list_del_init(&skb->tcp_tsorted_anchor);
				1582
				1583	if (!before(TCP_SKB_CB(skb)->seq,
				1584	tcp_highest_sack_seq(tp)))
				1585	tcp_advance_highest_sack(sk, skb);
				1586	}
				1587	}
				1588	return skb;
				1589	}
				1590
				1591	static struct sk_buff tcp_sacktag_bsearch(struct sock sk,
				1592	struct tcp_sacktag_state *state,
				1593	u32 seq)
				1594	{
				1595	struct rb_node parent, *p = &sk->tcp_rtx_queue.rb_node;
				1596	struct sk_buff *skb;
				1597
				1598	while (*p) {
				1599	parent = *p;
				1600	skb = rb_to_skb(parent);
				1601	if (before(seq, TCP_SKB_CB(skb)->seq)) {
				1602	p = &parent->rb_left;
				1603	continue;
				1604	}
				1605	if (!before(seq, TCP_SKB_CB(skb)->end_seq)) {
				1606	p = &parent->rb_right;
				1607	continue;
				1608	}
				1609	return skb;
				1610	}
				1611	return NULL;
				1612	}
				1613
				1614	static struct sk_buff tcp_sacktag_skip(struct sk_buff skb, struct sock *sk,
				1615	struct tcp_sacktag_state *state,
				1616	u32 skip_to_seq)
				1617	{
				1618	if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq))
				1619	return skb;
				1620
				1621	return tcp_sacktag_bsearch(sk, state, skip_to_seq);
				1622	}
				1623
				1624	static struct sk_buff tcp_maybe_skipping_dsack(struct sk_buff skb,
				1625	struct sock *sk,
				1626	struct tcp_sack_block *next_dup,
				1627	struct tcp_sacktag_state *state,
				1628	u32 skip_to_seq)
				1629	{
				1630	if (!next_dup)
				1631	return skb;
				1632
				1633	if (before(next_dup->start_seq, skip_to_seq)) {
				1634	skb = tcp_sacktag_skip(skb, sk, state, next_dup->start_seq);
				1635	skb = tcp_sacktag_walk(skb, sk, NULL, state,
				1636	next_dup->start_seq, next_dup->end_seq,
				1637	1);
				1638	}
				1639
				1640	return skb;
				1641	}
				1642
				1643	static int tcp_sack_cache_ok(const struct tcp_sock tp, const struct tcp_sack_block cache)
				1644	{
				1645	return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
				1646	}
				1647
				1648	static int
				1649	tcp_sacktag_write_queue(struct sock sk, const struct sk_buff ack_skb,
				1650	u32 prior_snd_una, struct tcp_sacktag_state *state)
				1651	{
				1652	struct tcp_sock *tp = tcp_sk(sk);
				1653	const unsigned char *ptr = (skb_transport_header(ack_skb) +
				1654	TCP_SKB_CB(ack_skb)->sacked);
				1655	struct tcp_sack_block_wire sp_wire = (struct tcp_sack_block_wire )(ptr+2);
				1656	struct tcp_sack_block sp[TCP_NUM_SACKS];
				1657	struct tcp_sack_block *cache;
				1658	struct sk_buff *skb;
				1659	int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3);
				1660	int used_sacks;
				1661	bool found_dup_sack = false;
				1662	int i, j;
				1663	int first_sack_index;
				1664
				1665	state->flag = 0;
				1666	state->reord = tp->snd_nxt;
				1667
				1668	if (!tp->sacked_out)
				1669	tcp_highest_sack_reset(sk);
				1670
				1671	found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
				1672	num_sacks, prior_snd_una);
				1673	if (found_dup_sack) {
				1674	state->flag \|= FLAG_DSACKING_ACK;
				1675	tp->delivered++; /* A spurious retransmission is delivered */
				1676	}
				1677
				1678	/* Eliminate too old ACKs, but take into
				1679	* account more or less fresh ones, they can
				1680	* contain valid SACK info.
				1681	*/
				1682	if (before(TCP_SKB_CB(ack_skb)->ack_seq, prior_snd_una - tp->max_window))
				1683	return 0;
				1684
				1685	if (!tp->packets_out)
				1686	goto out;
				1687
				1688	used_sacks = 0;
				1689	first_sack_index = 0;
				1690	for (i = 0; i < num_sacks; i++) {
				1691	bool dup_sack = !i && found_dup_sack;
				1692
				1693	sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq);
				1694	sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq);
				1695
				1696	if (!tcp_is_sackblock_valid(tp, dup_sack,
				1697	sp[used_sacks].start_seq,
				1698	sp[used_sacks].end_seq)) {
				1699	int mib_idx;
				1700
				1701	if (dup_sack) {
				1702	if (!tp->undo_marker)
				1703	mib_idx = LINUX_MIB_TCPDSACKIGNOREDNOUNDO;
				1704	else
				1705	mib_idx = LINUX_MIB_TCPDSACKIGNOREDOLD;
				1706	} else {
				1707	/* Don't count olds caused by ACK reordering */
				1708	if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) &&
				1709	!after(sp[used_sacks].end_seq, tp->snd_una))
				1710	continue;
				1711	mib_idx = LINUX_MIB_TCPSACKDISCARD;
				1712	}
				1713
				1714	NET_INC_STATS(sock_net(sk), mib_idx);
				1715	if (i == 0)
				1716	first_sack_index = -1;
				1717	continue;
				1718	}
				1719
				1720	/* Ignore very old stuff early */
				1721	if (!after(sp[used_sacks].end_seq, prior_snd_una)) {
				1722	if (i == 0)
				1723	first_sack_index = -1;
				1724	continue;
				1725	}
				1726
				1727	used_sacks++;
				1728	}
				1729
				1730	/* order SACK blocks to allow in order walk of the retrans queue */
				1731	for (i = used_sacks - 1; i > 0; i--) {
				1732	for (j = 0; j < i; j++) {
				1733	if (after(sp[j].start_seq, sp[j + 1].start_seq)) {
				1734	swap(sp[j], sp[j + 1]);
				1735
				1736	/* Track where the first SACK block goes to */
				1737	if (j == first_sack_index)
				1738	first_sack_index = j + 1;
				1739	}
				1740	}
				1741	}
				1742
				1743	state->mss_now = tcp_current_mss(sk);
				1744	skb = NULL;
				1745	i = 0;
				1746
				1747	if (!tp->sacked_out) {
				1748	/* It's already past, so skip checking against it */
				1749	cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
				1750	} else {
				1751	cache = tp->recv_sack_cache;
				1752	/* Skip empty blocks in at head of the cache */
				1753	while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq &&
				1754	!cache->end_seq)
				1755	cache++;
				1756	}
				1757
				1758	while (i < used_sacks) {
				1759	u32 start_seq = sp[i].start_seq;
				1760	u32 end_seq = sp[i].end_seq;
				1761	bool dup_sack = (found_dup_sack && (i == first_sack_index));
				1762	struct tcp_sack_block *next_dup = NULL;
				1763
				1764	if (found_dup_sack && ((i + 1) == first_sack_index))
				1765	next_dup = &sp[i + 1];
				1766
				1767	/* Skip too early cached blocks */
				1768	while (tcp_sack_cache_ok(tp, cache) &&
				1769	!before(start_seq, cache->end_seq))
				1770	cache++;
				1771
				1772	/* Can skip some work by looking recv_sack_cache? */
				1773	if (tcp_sack_cache_ok(tp, cache) && !dup_sack &&
				1774	after(end_seq, cache->start_seq)) {
				1775
				1776	/* Head todo? */
				1777	if (before(start_seq, cache->start_seq)) {
				1778	skb = tcp_sacktag_skip(skb, sk, state,
				1779	start_seq);
				1780	skb = tcp_sacktag_walk(skb, sk, next_dup,
				1781	state,
				1782	start_seq,
				1783	cache->start_seq,
				1784	dup_sack);
				1785	}
				1786
				1787	/* Rest of the block already fully processed? */
				1788	if (!after(end_seq, cache->end_seq))
				1789	goto advance_sp;
				1790
				1791	skb = tcp_maybe_skipping_dsack(skb, sk, next_dup,
				1792	state,
				1793	cache->end_seq);
				1794
				1795	/* ...tail remains todo... */
				1796	if (tcp_highest_sack_seq(tp) == cache->end_seq) {
				1797	/* ...but better entrypoint exists! */
				1798	skb = tcp_highest_sack(sk);
				1799	if (!skb)
				1800	break;
				1801	cache++;
				1802	goto walk;
				1803	}
				1804
				1805	skb = tcp_sacktag_skip(skb, sk, state, cache->end_seq);
				1806	/* Check overlap against next cached too (past this one already) */
				1807	cache++;
				1808	continue;
				1809	}
				1810
				1811	if (!before(start_seq, tcp_highest_sack_seq(tp))) {
				1812	skb = tcp_highest_sack(sk);
				1813	if (!skb)
				1814	break;
				1815	}
				1816	skb = tcp_sacktag_skip(skb, sk, state, start_seq);
				1817
				1818	walk:
				1819	skb = tcp_sacktag_walk(skb, sk, next_dup, state,
				1820	start_seq, end_seq, dup_sack);
				1821
				1822	advance_sp:
				1823	i++;
				1824	}
				1825
				1826	/* Clear the head of the cache sack blocks so we can skip it next time */
				1827	for (i = 0; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) {
				1828	tp->recv_sack_cache[i].start_seq = 0;
				1829	tp->recv_sack_cache[i].end_seq = 0;
				1830	}
				1831	for (j = 0; j < used_sacks; j++)
				1832	tp->recv_sack_cache[i++] = sp[j];
				1833
				1834	if (inet_csk(sk)->icsk_ca_state != TCP_CA_Loss \|\| tp->undo_marker)
				1835	tcp_check_sack_reordering(sk, state->reord, 0);
				1836
				1837	tcp_verify_left_out(tp);
				1838	out:
				1839
				1840	#if FASTRETRANS_DEBUG > 0
				1841	WARN_ON((int)tp->sacked_out < 0);
				1842	WARN_ON((int)tp->lost_out < 0);
				1843	WARN_ON((int)tp->retrans_out < 0);
				1844	WARN_ON((int)tcp_packets_in_flight(tp) < 0);
				1845	#endif
				1846	return state->flag;
				1847	}
				1848
				1849	/* Limits sacked_out so that sum with lost_out isn't ever larger than
				1850	* packets_out. Returns false if sacked_out adjustement wasn't necessary.
				1851	*/
				1852	static bool tcp_limit_reno_sacked(struct tcp_sock *tp)
				1853	{
				1854	u32 holes;
				1855
				1856	holes = max(tp->lost_out, 1U);
				1857	holes = min(holes, tp->packets_out);
				1858
				1859	if ((tp->sacked_out + holes) > tp->packets_out) {
				1860	tp->sacked_out = tp->packets_out - holes;
				1861	return true;
				1862	}
				1863	return false;
				1864	}
				1865
				1866	/* If we receive more dupacks than we expected counting segments
				1867	* in assumption of absent reordering, interpret this as reordering.
				1868	* The only another reason could be bug in receiver TCP.
				1869	*/
				1870	static void tcp_check_reno_reordering(struct sock *sk, const int addend)
				1871	{
				1872	struct tcp_sock *tp = tcp_sk(sk);
				1873
				1874	if (!tcp_limit_reno_sacked(tp))
				1875	return;
				1876
				1877	tp->reordering = min_t(u32, tp->packets_out + addend,
				1878	sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
				1879	tp->reord_seen++;
				1880	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER);
				1881	}
				1882
				1883	/* Emulate SACKs for SACKless connection: account for a new dupack. */
				1884
				1885	static void tcp_add_reno_sack(struct sock *sk)
				1886	{
				1887	struct tcp_sock *tp = tcp_sk(sk);
				1888	u32 prior_sacked = tp->sacked_out;
				1889
				1890	tp->sacked_out++;
				1891	tcp_check_reno_reordering(sk, 0);
				1892	if (tp->sacked_out > prior_sacked)
				1893	tp->delivered++; /* Some out-of-order packet is delivered */
				1894	tcp_verify_left_out(tp);
				1895	}
				1896
				1897	/* Account for ACK, ACKing some data in Reno Recovery phase. */
				1898
				1899	static void tcp_remove_reno_sacks(struct sock *sk, int acked)
				1900	{
				1901	struct tcp_sock *tp = tcp_sk(sk);
				1902
				1903	if (acked > 0) {
				1904	/* One ACK acked hole. The rest eat duplicate ACKs. */
				1905	tp->delivered += max_t(int, acked - tp->sacked_out, 1);
				1906	if (acked - 1 >= tp->sacked_out)
				1907	tp->sacked_out = 0;
				1908	else
				1909	tp->sacked_out -= acked - 1;
				1910	}
				1911	tcp_check_reno_reordering(sk, acked);
				1912	tcp_verify_left_out(tp);
				1913	}
				1914
				1915	static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
				1916	{
				1917	tp->sacked_out = 0;
				1918	}
				1919
				1920	void tcp_clear_retrans(struct tcp_sock *tp)
				1921	{
				1922	tp->retrans_out = 0;
				1923	tp->lost_out = 0;
				1924	tp->undo_marker = 0;
				1925	tp->undo_retrans = -1;
				1926	tp->sacked_out = 0;
				1927	}
				1928
				1929	static inline void tcp_init_undo(struct tcp_sock *tp)
				1930	{
				1931	tp->undo_marker = tp->snd_una;
				1932	/* Retransmission still in flight may cause DSACKs later. */
				1933	tp->undo_retrans = tp->retrans_out ? : -1;
				1934	}
				1935
				1936	static bool tcp_is_rack(const struct sock *sk)
				1937	{
				1938	return sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION;
				1939	}
				1940
				1941	/* If we detect SACK reneging, forget all SACK information
				1942	* and reset tags completely, otherwise preserve SACKs. If receiver
				1943	* dropped its ofo queue, we will know this due to reneging detection.
				1944	*/
				1945	static void tcp_timeout_mark_lost(struct sock *sk)
				1946	{
				1947	struct tcp_sock *tp = tcp_sk(sk);
				1948	struct sk_buff skb, head;
				1949	bool is_reneg; /* is receiver reneging on SACKs? */
				1950
				1951	head = tcp_rtx_queue_head(sk);
				1952	is_reneg = head && (TCP_SKB_CB(head)->sacked & TCPCB_SACKED_ACKED);
				1953	if (is_reneg) {
				1954	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
				1955	tp->sacked_out = 0;
				1956	/* Mark SACK reneging until we recover from this loss event. */
				1957	tp->is_sack_reneg = 1;
				1958	} else if (tcp_is_reno(tp)) {
				1959	tcp_reset_reno_sack(tp);
				1960	}
				1961
				1962	skb = head;
				1963	skb_rbtree_walk_from(skb) {
				1964	if (is_reneg)
				1965	TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
				1966	else if (tcp_is_rack(sk) && skb != head &&
				1967	tcp_rack_skb_timeout(tp, skb, 0) > 0)
				1968	continue; /* Don't mark recently sent ones lost yet */
				1969	tcp_mark_skb_lost(sk, skb);
				1970	}
				1971	tcp_verify_left_out(tp);
				1972	tcp_clear_all_retrans_hints(tp);
				1973	}
				1974
				1975	/* Enter Loss state. */
				1976	void tcp_enter_loss(struct sock *sk)
				1977	{
				1978	const struct inet_connection_sock *icsk = inet_csk(sk);
				1979	struct tcp_sock *tp = tcp_sk(sk);
				1980	struct net *net = sock_net(sk);
				1981	bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
				1982
				1983	tcp_timeout_mark_lost(sk);
				1984
				1985	/* Reduce ssthresh if it has not yet been made inside this window. */
				1986	if (icsk->icsk_ca_state <= TCP_CA_Disorder \|\|
				1987	!after(tp->high_seq, tp->snd_una) \|\|
				1988	(icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
				1989	tp->prior_ssthresh = tcp_current_ssthresh(sk);
				1990	tp->prior_cwnd = tp->snd_cwnd;
				1991	tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
				1992	tcp_ca_event(sk, CA_EVENT_LOSS);
				1993	tcp_init_undo(tp);
				1994	}
				1995	tp->snd_cwnd = tcp_packets_in_flight(tp) + 1;
				1996	tp->snd_cwnd_cnt = 0;
				1997	tp->snd_cwnd_stamp = tcp_jiffies32;
				1998
				1999	/* Timeout in disordered state after receiving substantial DUPACKs
				2000	* suggests that the degree of reordering is over-estimated.
				2001	*/
				2002	if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
				2003	tp->sacked_out >= net->ipv4.sysctl_tcp_reordering)
				2004	tp->reordering = min_t(unsigned int, tp->reordering,
				2005	net->ipv4.sysctl_tcp_reordering);
				2006	tcp_set_ca_state(sk, TCP_CA_Loss);
				2007	tp->high_seq = tp->snd_nxt;
				2008	tcp_ecn_queue_cwr(tp);
				2009
				2010	/* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
				2011	* loss recovery is underway except recurring timeout(s) on
				2012	* the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
				2013	*/
				2014	tp->frto = net->ipv4.sysctl_tcp_frto &&
				2015	(new_recovery \|\| icsk->icsk_retransmits) &&
				2016	!inet_csk(sk)->icsk_mtup.probe_size;
				2017	}
				2018
				2019	/* If ACK arrived pointing to a remembered SACK, it means that our
				2020	* remembered SACKs do not reflect real state of receiver i.e.
				2021	* receiver _host_ is heavily congested (or buggy).
				2022	*
				2023	* To avoid big spurious retransmission bursts due to transient SACK
				2024	* scoreboard oddities that look like reneging, we give the receiver a
				2025	* little time (max(RTT/2, 10ms)) to send us some more ACKs that will
				2026	* restore sanity to the SACK scoreboard. If the apparent reneging
				2027	* persists until this RTO then we'll clear the SACK scoreboard.
				2028	*/
				2029	static bool tcp_check_sack_reneging(struct sock *sk, int flag)
				2030	{
				2031	if (flag & FLAG_SACK_RENEGING) {
				2032	struct tcp_sock *tp = tcp_sk(sk);
				2033	unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4),
				2034	msecs_to_jiffies(10));
				2035
				2036	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
				2037	delay, TCP_RTO_MAX);
				2038	return true;
				2039	}
				2040	return false;
				2041	}
				2042
				2043	/* Heurestics to calculate number of duplicate ACKs. There's no dupACKs
				2044	* counter when SACK is enabled (without SACK, sacked_out is used for
				2045	* that purpose).
				2046	*
				2047	* With reordering, holes may still be in flight, so RFC3517 recovery
				2048	* uses pure sacked_out (total number of SACKed segments) even though
				2049	* it violates the RFC that uses duplicate ACKs, often these are equal
				2050	* but when e.g. out-of-window ACKs or packet duplication occurs,
				2051	* they differ. Since neither occurs due to loss, TCP should really
				2052	* ignore them.
				2053	*/
				2054	static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
				2055	{
				2056	return tp->sacked_out + 1;
				2057	}
				2058
				2059	/* Linux NewReno/SACK/ECN state machine.
				2060	* --------------------------------------
				2061	*
				2062	* "Open" Normal state, no dubious events, fast path.
				2063	* "Disorder" In all the respects it is "Open",
				2064	* but requires a bit more attention. It is entered when
				2065	* we see some SACKs or dupacks. It is split of "Open"
				2066	* mainly to move some processing from fast path to slow one.
				2067	* "CWR" CWND was reduced due to some Congestion Notification event.
				2068	* It can be ECN, ICMP source quench, local device congestion.
				2069	* "Recovery" CWND was reduced, we are fast-retransmitting.
				2070	* "Loss" CWND was reduced due to RTO timeout or SACK reneging.
				2071	*
				2072	* tcp_fastretrans_alert() is entered:
				2073	* - each incoming ACK, if state is not "Open"
				2074	* - when arrived ACK is unusual, namely:
				2075	* * SACK
				2076	* * Duplicate ACK.
				2077	* * ECN ECE.
				2078	*
				2079	* Counting packets in flight is pretty simple.
				2080	*
				2081	* in_flight = packets_out - left_out + retrans_out
				2082	*
				2083	* packets_out is SND.NXT-SND.UNA counted in packets.
				2084	*
				2085	* retrans_out is number of retransmitted segments.
				2086	*
				2087	* left_out is number of segments left network, but not ACKed yet.
				2088	*
				2089	* left_out = sacked_out + lost_out
				2090	*
				2091	* sacked_out: Packets, which arrived to receiver out of order
				2092	* and hence not ACKed. With SACKs this number is simply
				2093	* amount of SACKed data. Even without SACKs
				2094	* it is easy to give pretty reliable estimate of this number,
				2095	* counting duplicate ACKs.
				2096	*
				2097	* lost_out: Packets lost by network. TCP has no explicit
				2098	* "loss notification" feedback from network (for now).
				2099	* It means that this number can be only _guessed_.
				2100	* Actually, it is the heuristics to predict lossage that
				2101	* distinguishes different algorithms.
				2102	*
				2103	* F.e. after RTO, when all the queue is considered as lost,
				2104	* lost_out = packets_out and in_flight = retrans_out.
				2105	*
				2106	* Essentially, we have now a few algorithms detecting
				2107	* lost packets.
				2108	*
				2109	* If the receiver supports SACK:
				2110	*
				2111	* RFC6675/3517: It is the conventional algorithm. A packet is
				2112	* considered lost if the number of higher sequence packets
				2113	* SACKed is greater than or equal to the DUPACK threshold
				2114	* (reordering). This is implemented in tcp_mark_head_lost and
				2115	* tcp_update_scoreboard.
				2116	*
				2117	* RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm
				2118	* (2017-) that checks timing instead of counting DUPACKs.
				2119	* Essentially a packet is considered lost if it's not S/ACKed
				2120	* after RTT + reordering_window, where both metrics are
				2121	* dynamically measured and adjusted. This is implemented in
				2122	* tcp_rack_mark_lost.
				2123	*
				2124	* If the receiver does not support SACK:
				2125	*
				2126	* NewReno (RFC6582): in Recovery we assume that one segment
				2127	* is lost (classic Reno). While we are in Recovery and
				2128	* a partial ACK arrives, we assume that one more packet
				2129	* is lost (NewReno). These heuristics are the same in NewReno
				2130	* and SACK.
				2131	*
				2132	* Really tricky (and requiring careful tuning) part of algorithm
				2133	* is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
				2134	* The first determines the moment _when_ we should reduce CWND and,
				2135	* hence, slow down forward transmission. In fact, it determines the moment
				2136	* when we decide that hole is caused by loss, rather than by a reorder.
				2137	*
				2138	* tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill
				2139	* holes, caused by lost packets.
				2140	*
				2141	* And the most logically complicated part of algorithm is undo
				2142	* heuristics. We detect false retransmits due to both too early
				2143	* fast retransmit (reordering) and underestimated RTO, analyzing
				2144	* timestamps and D-SACKs. When we detect that some segments were
				2145	* retransmitted by mistake and CWND reduction was wrong, we undo
				2146	* window reduction and abort recovery phase. This logic is hidden
				2147	* inside several functions named tcp_try_undo_<something>.
				2148	*/
				2149
				2150	/* This function decides, when we should leave Disordered state
				2151	* and enter Recovery phase, reducing congestion window.
				2152	*
				2153	* Main question: may we further continue forward transmission
				2154	* with the same cwnd?
				2155	*/
				2156	static bool tcp_time_to_recover(struct sock *sk, int flag)
				2157	{
				2158	struct tcp_sock *tp = tcp_sk(sk);
				2159
				2160	/* Trick#1: The loss is proven. */
				2161	if (tp->lost_out)
				2162	return true;
				2163
				2164	/* Not-A-Trick#2 : Classic rule... */
				2165	if (!tcp_is_rack(sk) && tcp_dupack_heuristics(tp) > tp->reordering)
				2166	return true;
				2167
				2168	return false;
				2169	}
				2170
				2171	/* Detect loss in event "A" above by marking head of queue up as lost.
				2172	* For non-SACK(Reno) senders, the first "packets" number of segments
				2173	* are considered lost. For RFC3517 SACK, a segment is considered lost if it
				2174	* has at least tp->reordering SACKed seqments above it; "packets" refers to
				2175	* the maximum SACKed segments to pass before reaching this limit.
				2176	*/
				2177	static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
				2178	{
				2179	struct tcp_sock *tp = tcp_sk(sk);
				2180	struct sk_buff *skb;
				2181	int cnt, oldcnt, lost;
				2182	unsigned int mss;
				2183	/* Use SACK to deduce losses of new sequences sent during recovery */
				2184	const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq;
				2185
				2186	WARN_ON(packets > tp->packets_out);
				2187	skb = tp->lost_skb_hint;
				2188	if (skb) {
				2189	/* Head already handled? */
				2190	if (mark_head && after(TCP_SKB_CB(skb)->seq, tp->snd_una))
				2191	return;
				2192	cnt = tp->lost_cnt_hint;
				2193	} else {
				2194	skb = tcp_rtx_queue_head(sk);
				2195	cnt = 0;
				2196	}
				2197
				2198	skb_rbtree_walk_from(skb) {
				2199	/* TODO: do this better */
				2200	/* this is not the most efficient way to do this... */
				2201	tp->lost_skb_hint = skb;
				2202	tp->lost_cnt_hint = cnt;
				2203
				2204	if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
				2205	break;
				2206
				2207	oldcnt = cnt;
				2208	if (tcp_is_reno(tp) \|\|
				2209	(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
				2210	cnt += tcp_skb_pcount(skb);
				2211
				2212	if (cnt > packets) {
				2213	if (tcp_is_sack(tp) \|\|
				2214	(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) \|\|
				2215	(oldcnt >= packets))
				2216	break;
				2217
				2218	mss = tcp_skb_mss(skb);
				2219	/* If needed, chop off the prefix to mark as lost. */
				2220	lost = (packets - oldcnt) * mss;
				2221	if (lost < skb->len &&
				2222	tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
				2223	lost, mss, GFP_ATOMIC) < 0)
				2224	break;
				2225	cnt = packets;
				2226	}
				2227
				2228	tcp_skb_mark_lost(tp, skb);
				2229
				2230	if (mark_head)
				2231	break;
				2232	}
				2233	tcp_verify_left_out(tp);
				2234	}
				2235
				2236	/* Account newly detected lost packet(s) */
				2237
				2238	static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
				2239	{
				2240	struct tcp_sock *tp = tcp_sk(sk);
				2241
				2242	if (tcp_is_sack(tp)) {
				2243	int sacked_upto = tp->sacked_out - tp->reordering;
				2244	if (sacked_upto >= 0)
				2245	tcp_mark_head_lost(sk, sacked_upto, 0);
				2246	else if (fast_rexmit)
				2247	tcp_mark_head_lost(sk, 1, 1);
				2248	}
				2249	}
				2250
				2251	static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when)
				2252	{
				2253	return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
				2254	before(tp->rx_opt.rcv_tsecr, when);
				2255	}
				2256
				2257	/* skb is spurious retransmitted if the returned timestamp echo
				2258	* reply is prior to the skb transmission time
				2259	*/
				2260	static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp,
				2261	const struct sk_buff *skb)
				2262	{
				2263	return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) &&
				2264	tcp_tsopt_ecr_before(tp, tcp_skb_timestamp(skb));
				2265	}
				2266
				2267	/* Nothing was retransmitted or returned timestamp is less
				2268	* than timestamp of the first retransmission.
				2269	*/
				2270	static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
				2271	{
				2272	return !tp->retrans_stamp \|\|
				2273	tcp_tsopt_ecr_before(tp, tp->retrans_stamp);
				2274	}
				2275
				2276	/* Undo procedures. */
				2277
				2278	/* We can clear retrans_stamp when there are no retransmissions in the
				2279	* window. It would seem that it is trivially available for us in
				2280	* tp->retrans_out, however, that kind of assumptions doesn't consider
				2281	* what will happen if errors occur when sending retransmission for the
				2282	* second time. ...It could the that such segment has only
				2283	* TCPCB_EVER_RETRANS set at the present time. It seems that checking
				2284	* the head skb is enough except for some reneging corner cases that
				2285	* are not worth the effort.
				2286	*
				2287	* Main reason for all this complexity is the fact that connection dying
				2288	* time now depends on the validity of the retrans_stamp, in particular,
				2289	* that successive retransmissions of a segment must not advance
				2290	* retrans_stamp under any conditions.
				2291	*/
				2292	static bool tcp_any_retrans_done(const struct sock *sk)
				2293	{
				2294	const struct tcp_sock *tp = tcp_sk(sk);
				2295	struct sk_buff *skb;
				2296
				2297	if (tp->retrans_out)
				2298	return true;
				2299
				2300	skb = tcp_rtx_queue_head(sk);
				2301	if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
				2302	return true;
				2303
				2304	return false;
				2305	}
				2306
				/* Debug helper for the undo paths: when FASTRETRANS_DEBUG > 1, logs @msg
				 * together with the peer address/port, cwnd, left_out, ssthresh,
				 * prior_ssthresh and packets_out. Compiles to an empty function otherwise.
				 */
				static void DBGUNDO(struct sock *sk, const char *msg)
				{
				#if FASTRETRANS_DEBUG > 1
					struct tcp_sock *tp = tcp_sk(sk);
					struct inet_sock *inet = inet_sk(sk);

					if (sk->sk_family == AF_INET) {
						pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
							 msg,
							 &inet->inet_daddr, ntohs(inet->inet_dport),
							 tp->snd_cwnd, tcp_left_out(tp),
							 tp->snd_ssthresh, tp->prior_ssthresh,
							 tp->packets_out);
					}
				#if IS_ENABLED(CONFIG_IPV6)
					else if (sk->sk_family == AF_INET6) {
						pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
							 msg,
							 &sk->sk_v6_daddr, ntohs(inet->inet_dport),
							 tp->snd_cwnd, tcp_left_out(tp),
							 tp->snd_ssthresh, tp->prior_ssthresh,
							 tp->packets_out);
					}
				#endif
				#endif
				}
				2333
				2334	static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
				2335	{
				2336	struct tcp_sock *tp = tcp_sk(sk);
				2337
				2338	if (unmark_loss) {
				2339	struct sk_buff *skb;
				2340
				2341	skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
				2342	TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
				2343	}
				2344	tp->lost_out = 0;
				2345	tcp_clear_all_retrans_hints(tp);
				2346	}
				2347
				2348	if (tp->prior_ssthresh) {
				2349	const struct inet_connection_sock *icsk = inet_csk(sk);
				2350
				2351	tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
				2352
				2353	if (tp->prior_ssthresh > tp->snd_ssthresh) {
				2354	tp->snd_ssthresh = tp->prior_ssthresh;
				2355	tcp_ecn_withdraw_cwr(tp);
				2356	}
				2357	}
				2358	tp->snd_cwnd_stamp = tcp_jiffies32;
				2359	tp->undo_marker = 0;
				2360	tp->rack.advanced = 1; /* Force RACK to re-exam losses */
				2361	}
				2362
				2363	static inline bool tcp_may_undo(const struct tcp_sock *tp)
				2364	{
				2365	return tp->undo_marker && (!tp->undo_retrans \|\| tcp_packet_delayed(tp));
				2366	}
				2367
				2368	/* People celebrate: "We love our President!" */
				2369	static bool tcp_try_undo_recovery(struct sock *sk)
				2370	{
				2371	struct tcp_sock *tp = tcp_sk(sk);
				2372
				2373	if (tcp_may_undo(tp)) {
				2374	int mib_idx;
				2375
				2376	/* Happy end! We did not retransmit anything
				2377	* or our original transmission succeeded.
				2378	*/
				2379	DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
				2380	tcp_undo_cwnd_reduction(sk, false);
				2381	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
				2382	mib_idx = LINUX_MIB_TCPLOSSUNDO;
				2383	else
				2384	mib_idx = LINUX_MIB_TCPFULLUNDO;
				2385
				2386	NET_INC_STATS(sock_net(sk), mib_idx);
				2387	} else if (tp->rack.reo_wnd_persist) {
				2388	tp->rack.reo_wnd_persist--;
				2389	}
				2390	if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
				2391	/* Hold old state until something above high_seq
				2392	* is ACKed. For Reno it is MUST to prevent false
				2393	* fast retransmits (RFC2582). SACK TCP is safe. */
				2394	if (!tcp_any_retrans_done(sk))
				2395	tp->retrans_stamp = 0;
				2396	return true;
				2397	}
				2398	tcp_set_ca_state(sk, TCP_CA_Open);
				2399	tp->is_sack_reneg = 0;
				2400	return false;
				2401	}
				2402
				2403	/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */
				2404	static bool tcp_try_undo_dsack(struct sock *sk)
				2405	{
				2406	struct tcp_sock *tp = tcp_sk(sk);
				2407
				2408	if (tp->undo_marker && !tp->undo_retrans) {
				2409	tp->rack.reo_wnd_persist = min(TCP_RACK_RECOVERY_THRESH,
				2410	tp->rack.reo_wnd_persist + 1);
				2411	DBGUNDO(sk, "D-SACK");
				2412	tcp_undo_cwnd_reduction(sk, false);
				2413	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
				2414	return true;
				2415	}
				2416	return false;
				2417	}
				2418
				2419	/* Undo during loss recovery after partial ACK or using F-RTO. */
				2420	static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
				2421	{
				2422	struct tcp_sock *tp = tcp_sk(sk);
				2423
				2424	if (frto_undo \|\| tcp_may_undo(tp)) {
				2425	tcp_undo_cwnd_reduction(sk, true);
				2426
				2427	DBGUNDO(sk, "partial loss");
				2428	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
				2429	if (frto_undo)
				2430	NET_INC_STATS(sock_net(sk),
				2431	LINUX_MIB_TCPSPURIOUSRTOS);
				2432	inet_csk(sk)->icsk_retransmits = 0;
				2433	if (frto_undo \|\| tcp_is_sack(tp)) {
				2434	tcp_set_ca_state(sk, TCP_CA_Open);
				2435	tp->is_sack_reneg = 0;
				2436	}
				2437	return true;
				2438	}
				2439	return false;
				2440	}
				2441
				2442	/* The cwnd reduction in CWR and Recovery uses the PRR algorithm in RFC 6937.
				2443	* It computes the number of packets to send (sndcnt) based on packets newly
				2444	* delivered:
				2445	* 1) If the packets in flight is larger than ssthresh, PRR spreads the
				2446	* cwnd reductions across a full RTT.
				2447	* 2) Otherwise PRR uses packet conservation to send as much as delivered.
				2448	* But when the retransmits are acked without further losses, PRR
				2449	* slow starts cwnd up to ssthresh to speed up the recovery.
				2450	*/
				2451	static void tcp_init_cwnd_reduction(struct sock *sk)
				2452	{
				2453	struct tcp_sock *tp = tcp_sk(sk);
				2454
				2455	tp->high_seq = tp->snd_nxt;
				2456	tp->tlp_high_seq = 0;
				2457	tp->snd_cwnd_cnt = 0;
				2458	tp->prior_cwnd = tp->snd_cwnd;
				2459	tp->prr_delivered = 0;
				2460	tp->prr_out = 0;
				2461	tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
				2462	tcp_ecn_queue_cwr(tp);
				2463	}
				2464
				2465	void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag)
				2466	{
				2467	struct tcp_sock *tp = tcp_sk(sk);
				2468	int sndcnt = 0;
				2469	int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
				2470
				2471	if (newly_acked_sacked <= 0 \|\| WARN_ON_ONCE(!tp->prior_cwnd))
				2472	return;
				2473
				2474	tp->prr_delivered += newly_acked_sacked;
				2475	if (delta < 0) {
				2476	u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
				2477	tp->prior_cwnd - 1;
				2478	sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
				2479	} else if ((flag & FLAG_RETRANS_DATA_ACKED) &&
				2480	!(flag & FLAG_LOST_RETRANS)) {
				2481	sndcnt = min_t(int, delta,
				2482	max_t(int, tp->prr_delivered - tp->prr_out,
				2483	newly_acked_sacked) + 1);
				2484	} else {
				2485	sndcnt = min(delta, newly_acked_sacked);
				2486	}
				2487	/* Force a fast retransmit upon entering fast recovery */
				2488	sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1));
				2489	tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
				2490	}
				2491
				2492	static inline void tcp_end_cwnd_reduction(struct sock *sk)
				2493	{
				2494	struct tcp_sock *tp = tcp_sk(sk);
				2495
				2496	if (inet_csk(sk)->icsk_ca_ops->cong_control)
				2497	return;
				2498
				2499	/* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
				2500	if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH &&
				2501	(inet_csk(sk)->icsk_ca_state == TCP_CA_CWR \|\| tp->undo_marker)) {
				2502	tp->snd_cwnd = tp->snd_ssthresh;
				2503	tp->snd_cwnd_stamp = tcp_jiffies32;
				2504	}
				2505	tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
				2506	}
				2507
				2508	/* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */
				2509	void tcp_enter_cwr(struct sock *sk)
				2510	{
				2511	struct tcp_sock *tp = tcp_sk(sk);
				2512
				2513	tp->prior_ssthresh = 0;
				2514	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
				2515	tp->undo_marker = 0;
				2516	tcp_init_cwnd_reduction(sk);
				2517	tcp_set_ca_state(sk, TCP_CA_CWR);
				2518	}
				2519	}
				2520	EXPORT_SYMBOL(tcp_enter_cwr);
				2521
				2522	static void tcp_try_keep_open(struct sock *sk)
				2523	{
				2524	struct tcp_sock *tp = tcp_sk(sk);
				2525	int state = TCP_CA_Open;
				2526
				2527	if (tcp_left_out(tp) \|\| tcp_any_retrans_done(sk))
				2528	state = TCP_CA_Disorder;
				2529
				2530	if (inet_csk(sk)->icsk_ca_state != state) {
				2531	tcp_set_ca_state(sk, state);
				2532	tp->high_seq = tp->snd_nxt;
				2533	}
				2534	}
				2535
				2536	static void tcp_try_to_open(struct sock *sk, int flag)
				2537	{
				2538	struct tcp_sock *tp = tcp_sk(sk);
				2539
				2540	tcp_verify_left_out(tp);
				2541
				2542	if (!tcp_any_retrans_done(sk))
				2543	tp->retrans_stamp = 0;
				2544
				2545	if (flag & FLAG_ECE)
				2546	tcp_enter_cwr(sk);
				2547
				2548	if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
				2549	tcp_try_keep_open(sk);
				2550	}
				2551	}
				2552
				2553	static void tcp_mtup_probe_failed(struct sock *sk)
				2554	{
				2555	struct inet_connection_sock *icsk = inet_csk(sk);
				2556
				2557	icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
				2558	icsk->icsk_mtup.probe_size = 0;
				2559	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPFAIL);
				2560	}
				2561
				2562	static void tcp_mtup_probe_success(struct sock *sk)
				2563	{
				2564	struct tcp_sock *tp = tcp_sk(sk);
				2565	struct inet_connection_sock *icsk = inet_csk(sk);
				2566
				2567	/* FIXME: breaks with very large cwnd */
				2568	tp->prior_ssthresh = tcp_current_ssthresh(sk);
				2569	tp->snd_cwnd = tp->snd_cwnd *
				2570	tcp_mss_to_mtu(sk, tp->mss_cache) /
				2571	icsk->icsk_mtup.probe_size;
				2572	tp->snd_cwnd_cnt = 0;
				2573	tp->snd_cwnd_stamp = tcp_jiffies32;
				2574	tp->snd_ssthresh = tcp_current_ssthresh(sk);
				2575
				2576	icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
				2577	icsk->icsk_mtup.probe_size = 0;
				2578	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
				2579	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS);
				2580	}
				2581
				2582	/* Do a simple retransmit without using the backoff mechanisms in
				2583	* tcp_timer. This is used for path mtu discovery.
				2584	* The socket is already locked here.
				2585	*/
				2586	void tcp_simple_retransmit(struct sock *sk)
				2587	{
				2588	const struct inet_connection_sock *icsk = inet_csk(sk);
				2589	struct tcp_sock *tp = tcp_sk(sk);
				2590	struct sk_buff *skb;
				2591	unsigned int mss = tcp_current_mss(sk);
				2592
				2593	skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
				2594	if (tcp_skb_seglen(skb) > mss &&
				2595	!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
				2596	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
				2597	TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
				2598	tp->retrans_out -= tcp_skb_pcount(skb);
				2599	}
				2600	tcp_skb_mark_lost_uncond_verify(tp, skb);
				2601	}
				2602	}
				2603
				2604	tcp_clear_retrans_hints_partial(tp);
				2605
				2606	if (!tp->lost_out)
				2607	return;
				2608
				2609	if (tcp_is_reno(tp))
				2610	tcp_limit_reno_sacked(tp);
				2611
				2612	tcp_verify_left_out(tp);
				2613
				2614	/* Don't muck with the congestion window here.
				2615	* Reason is that we do not increase amount of _data_
				2616	* in network, but units changed and effective
				2617	* cwnd/ssthresh really reduced now.
				2618	*/
				2619	if (icsk->icsk_ca_state != TCP_CA_Loss) {
				2620	tp->high_seq = tp->snd_nxt;
				2621	tp->snd_ssthresh = tcp_current_ssthresh(sk);
				2622	tp->prior_ssthresh = 0;
				2623	tp->undo_marker = 0;
				2624	tcp_set_ca_state(sk, TCP_CA_Loss);
				2625	}
				2626	tcp_xmit_retransmit_queue(sk);
				2627	}
				2628	EXPORT_SYMBOL(tcp_simple_retransmit);
				2629
				2630	void tcp_enter_recovery(struct sock *sk, bool ece_ack)
				2631	{
				2632	struct tcp_sock *tp = tcp_sk(sk);
				2633	int mib_idx;
				2634
				2635	if (tcp_is_reno(tp))
				2636	mib_idx = LINUX_MIB_TCPRENORECOVERY;
				2637	else
				2638	mib_idx = LINUX_MIB_TCPSACKRECOVERY;
				2639
				2640	NET_INC_STATS(sock_net(sk), mib_idx);
				2641
				2642	tp->prior_ssthresh = 0;
				2643	tcp_init_undo(tp);
				2644
				2645	if (!tcp_in_cwnd_reduction(sk)) {
				2646	if (!ece_ack)
				2647	tp->prior_ssthresh = tcp_current_ssthresh(sk);
				2648	tcp_init_cwnd_reduction(sk);
				2649	}
				2650	tcp_set_ca_state(sk, TCP_CA_Recovery);
				2651	}
				2652
				2653	/* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
				2654	* recovered or spurious. Otherwise retransmits more on partial ACKs.
				2655	*/
				2656	static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
				2657	int *rexmit)
				2658	{
				2659	struct tcp_sock *tp = tcp_sk(sk);
				2660	bool recovered = !before(tp->snd_una, tp->high_seq);
				2661
				2662	if ((flag & FLAG_SND_UNA_ADVANCED) &&
				2663	tcp_try_undo_loss(sk, false))
				2664	return;
				2665
				2666	if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
				2667	/* Step 3.b. A timeout is spurious if not all data are
				2668	* lost, i.e., never-retransmitted data are (s)acked.
				2669	*/
				2670	if ((flag & FLAG_ORIG_SACK_ACKED) &&
				2671	tcp_try_undo_loss(sk, true))
				2672	return;
				2673
				2674	if (after(tp->snd_nxt, tp->high_seq)) {
				2675	if (flag & FLAG_DATA_SACKED \|\| is_dupack)
				2676	tp->frto = 0; /* Step 3.a. loss was real */
				2677	} else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
				2678	tp->high_seq = tp->snd_nxt;
				2679	/* Step 2.b. Try send new data (but deferred until cwnd
				2680	* is updated in tcp_ack()). Otherwise fall back to
				2681	* the conventional recovery.
				2682	*/
				2683	if (!tcp_write_queue_empty(sk) &&
				2684	after(tcp_wnd_end(tp), tp->snd_nxt)) {
				2685	*rexmit = REXMIT_NEW;
				2686	return;
				2687	}
				2688	tp->frto = 0;
				2689	}
				2690	}
				2691
				2692	if (recovered) {
				2693	/* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */
				2694	tcp_try_undo_recovery(sk);
				2695	return;
				2696	}
				2697	if (tcp_is_reno(tp)) {
				2698	/* A Reno DUPACK means new data in F-RTO step 2.b above are
				2699	* delivered. Lower inflight to clock out (re)tranmissions.
				2700	*/
				2701	if (after(tp->snd_nxt, tp->high_seq) && is_dupack)
				2702	tcp_add_reno_sack(sk);
				2703	else if (flag & FLAG_SND_UNA_ADVANCED)
				2704	tcp_reset_reno_sack(tp);
				2705	}
				2706	*rexmit = REXMIT_LOST;
				2707	}
				2708
				2709	/* Undo during fast recovery after partial ACK. */
				2710	static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una)
				2711	{
				2712	struct tcp_sock *tp = tcp_sk(sk);
				2713
				2714	if (tp->undo_marker && tcp_packet_delayed(tp)) {
				2715	/* Plain luck! Hole if filled with delayed
				2716	* packet, rather than with a retransmit. Check reordering.
				2717	*/
				2718	tcp_check_sack_reordering(sk, prior_snd_una, 1);
				2719
				2720	/* We are getting evidence that the reordering degree is higher
				2721	* than we realized. If there are no retransmits out then we
				2722	* can undo. Otherwise we clock out new packets but do not
				2723	* mark more packets lost or retransmit more.
				2724	*/
				2725	if (tp->retrans_out)
				2726	return true;
				2727
				2728	if (!tcp_any_retrans_done(sk))
				2729	tp->retrans_stamp = 0;
				2730
				2731	DBGUNDO(sk, "partial recovery");
				2732	tcp_undo_cwnd_reduction(sk, true);
				2733	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
				2734	tcp_try_keep_open(sk);
				2735	return true;
				2736	}
				2737	return false;
				2738	}
				2739
				2740	static void tcp_identify_packet_loss(struct sock sk, int ack_flag)
				2741	{
				2742	struct tcp_sock *tp = tcp_sk(sk);
				2743
				2744	if (tcp_rtx_queue_empty(sk))
				2745	return;
				2746
				2747	if (unlikely(tcp_is_reno(tp))) {
				2748	tcp_newreno_mark_lost(sk, *ack_flag & FLAG_SND_UNA_ADVANCED);
				2749	} else if (tcp_is_rack(sk)) {
				2750	u32 prior_retrans = tp->retrans_out;
				2751
				2752	tcp_rack_mark_lost(sk);
				2753	if (prior_retrans > tp->retrans_out)
				2754	*ack_flag \|= FLAG_LOST_RETRANS;
				2755	}
				2756	}
				2757
				2758	static bool tcp_force_fast_retransmit(struct sock *sk)
				2759	{
				2760	struct tcp_sock *tp = tcp_sk(sk);
				2761
				2762	return after(tcp_highest_sack_seq(tp),
				2763	tp->snd_una + tp->reordering * tp->mss_cache);
				2764	}
				2765
				2766	/* Process an event, which can update packets-in-flight not trivially.
				2767	* Main goal of this function is to calculate new estimate for left_out,
				2768	* taking into account both packets sitting in receiver's buffer and
				2769	* packets lost by network.
				2770	*
				2771	* Besides that it updates the congestion state when packet loss or ECN
				2772	* is detected. But it does not reduce the cwnd, it is done by the
				2773	* congestion control later.
				2774	*
				2775	* It does _not_ decide what to send, it is made in function
				2776	* tcp_xmit_retransmit_queue().
				2777	*/
				2778	static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
				2779	bool is_dupack, int ack_flag, int rexmit)
				2780	{
				2781	struct inet_connection_sock *icsk = inet_csk(sk);
				2782	struct tcp_sock *tp = tcp_sk(sk);
				2783	int fast_rexmit = 0, flag = *ack_flag;
				2784	bool do_lost = is_dupack \|\| ((flag & FLAG_DATA_SACKED) &&
				2785	tcp_force_fast_retransmit(sk));
				2786
				2787	if (!tp->packets_out && tp->sacked_out)
				2788	tp->sacked_out = 0;
				2789
				2790	/* Now state machine starts.
				2791	* A. ECE, hence prohibit cwnd undoing, the reduction is required. */
				2792	if (flag & FLAG_ECE)
				2793	tp->prior_ssthresh = 0;
				2794
				2795	/* B. In all the states check for reneging SACKs. */
				2796	if (tcp_check_sack_reneging(sk, flag))
				2797	return;
				2798
				2799	/* C. Check consistency of the current state. */
				2800	tcp_verify_left_out(tp);
				2801
				2802	/* D. Check state exit conditions. State can be terminated
				2803	* when high_seq is ACKed. */
				2804	if (icsk->icsk_ca_state == TCP_CA_Open) {
				2805	WARN_ON(tp->retrans_out != 0);
				2806	tp->retrans_stamp = 0;
				2807	} else if (!before(tp->snd_una, tp->high_seq)) {
				2808	switch (icsk->icsk_ca_state) {
				2809	case TCP_CA_CWR:
				2810	/* CWR is to be held something above high_seq
				2811	* is ACKed for CWR bit to reach receiver. */
				2812	if (tp->snd_una != tp->high_seq) {
				2813	tcp_end_cwnd_reduction(sk);
				2814	tcp_set_ca_state(sk, TCP_CA_Open);
				2815	}
				2816	break;
				2817
				2818	case TCP_CA_Recovery:
				2819	if (tcp_is_reno(tp))
				2820	tcp_reset_reno_sack(tp);
				2821	if (tcp_try_undo_recovery(sk))
				2822	return;
				2823	tcp_end_cwnd_reduction(sk);
				2824	break;
				2825	}
				2826	}
				2827
				2828	/* E. Process state. */
				2829	switch (icsk->icsk_ca_state) {
				2830	case TCP_CA_Recovery:
				2831	if (!(flag & FLAG_SND_UNA_ADVANCED)) {
				2832	if (tcp_is_reno(tp) && is_dupack)
				2833	tcp_add_reno_sack(sk);
				2834	} else {
				2835	if (tcp_try_undo_partial(sk, prior_snd_una))
				2836	return;
				2837	/* Partial ACK arrived. Force fast retransmit. */
				2838	do_lost = tcp_is_reno(tp) \|\|
				2839	tcp_force_fast_retransmit(sk);
				2840	}
				2841	if (tcp_try_undo_dsack(sk)) {
				2842	tcp_try_keep_open(sk);
				2843	return;
				2844	}
				2845	tcp_identify_packet_loss(sk, ack_flag);
				2846	break;
				2847	case TCP_CA_Loss:
				2848	tcp_process_loss(sk, flag, is_dupack, rexmit);
				2849	tcp_identify_packet_loss(sk, ack_flag);
				2850	if (!(icsk->icsk_ca_state == TCP_CA_Open \|\|
				2851	(*ack_flag & FLAG_LOST_RETRANS)))
				2852	return;
				2853	/* Change state if cwnd is undone or retransmits are lost */
				2854	/* fall through */
				2855	default:
				2856	if (tcp_is_reno(tp)) {
				2857	if (flag & FLAG_SND_UNA_ADVANCED)
				2858	tcp_reset_reno_sack(tp);
				2859	if (is_dupack)
				2860	tcp_add_reno_sack(sk);
				2861	}
				2862
				2863	if (icsk->icsk_ca_state <= TCP_CA_Disorder)
				2864	tcp_try_undo_dsack(sk);
				2865
				2866	tcp_identify_packet_loss(sk, ack_flag);
				2867	if (!tcp_time_to_recover(sk, flag)) {
				2868	tcp_try_to_open(sk, flag);
				2869	return;
				2870	}
				2871
				2872	/* MTU probe failure: don't reduce cwnd */
				2873	if (icsk->icsk_ca_state < TCP_CA_CWR &&
				2874	icsk->icsk_mtup.probe_size &&
				2875	tp->snd_una == tp->mtu_probe.probe_seq_start) {
				2876	tcp_mtup_probe_failed(sk);
				2877	/* Restores the reduction we did in tcp_mtup_probe() */
				2878	tp->snd_cwnd++;
				2879	tcp_simple_retransmit(sk);
				2880	return;
				2881	}
				2882
				2883	/* Otherwise enter Recovery state */
				2884	tcp_enter_recovery(sk, (flag & FLAG_ECE));
				2885	fast_rexmit = 1;
				2886	}
				2887
				2888	if (!tcp_is_rack(sk) && do_lost)
				2889	tcp_update_scoreboard(sk, fast_rexmit);
				2890	*rexmit = REXMIT_LOST;
				2891	}
				2892
				2893	static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us, const int flag)
				2894	{
				2895	u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ;
				2896	struct tcp_sock *tp = tcp_sk(sk);
				2897
				2898	if ((flag & FLAG_ACK_MAYBE_DELAYED) && rtt_us > tcp_min_rtt(tp)) {
				2899	/* If the remote keeps returning delayed ACKs, eventually
				2900	* the min filter would pick it up and overestimate the
				2901	* prop. delay when it expires. Skip suspected delayed ACKs.
				2902	*/
				2903	return;
				2904	}
				2905	minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32,
				2906	rtt_us ? : jiffies_to_usecs(1));
				2907	}
				2908
				2909	static bool tcp_ack_update_rtt(struct sock *sk, const int flag,
				2910	long seq_rtt_us, long sack_rtt_us,
				2911	long ca_rtt_us, struct rate_sample *rs)
				2912	{
				2913	const struct tcp_sock *tp = tcp_sk(sk);
				2914
				2915	/* Prefer RTT measured from ACK's timing to TS-ECR. This is because
				2916	* broken middle-boxes or peers may corrupt TS-ECR fields. But
				2917	* Karn's algorithm forbids taking RTT if some retransmitted data
				2918	* is acked (RFC6298).
				2919	*/
				2920	if (seq_rtt_us < 0)
				2921	seq_rtt_us = sack_rtt_us;
				2922
				2923	/* RTTM Rule: A TSecr value received in a segment is used to
				2924	* update the averaged RTT measurement only if the segment
				2925	* acknowledges some new data, i.e., only if it advances the
				2926	* left edge of the send window.
				2927	* See draft-ietf-tcplw-high-performance-00, section 3.3.
				2928	*/
				2929	if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
				2930	flag & FLAG_ACKED) {
				2931	u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
				2932	u32 delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
				2933
				2934	seq_rtt_us = ca_rtt_us = delta_us;
				2935	}
				2936	rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */
				2937	if (seq_rtt_us < 0)
				2938	return false;
				2939
				2940	/* ca_rtt_us >= 0 is counting on the invariant that ca_rtt_us is
				2941	* always taken together with ACK, SACK, or TS-opts. Any negative
				2942	* values will be skipped with the seq_rtt_us < 0 check above.
				2943	*/
				2944	tcp_update_rtt_min(sk, ca_rtt_us, flag);
				2945	tcp_rtt_estimator(sk, seq_rtt_us);
				2946	tcp_set_rto(sk);
				2947
				2948	/* RFC6298: only reset backoff on valid RTT measurement. */
				2949	inet_csk(sk)->icsk_backoff = 0;
				2950	return true;
				2951	}
				2952
				2953	/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
				2954	void tcp_synack_rtt_meas(struct sock sk, struct request_sock req)
				2955	{
				2956	struct rate_sample rs;
				2957	long rtt_us = -1L;
				2958
				2959	if (req && !req->num_retrans && tcp_rsk(req)->snt_synack)
				2960	rtt_us = tcp_stamp_us_delta(tcp_clock_us(), tcp_rsk(req)->snt_synack);
				2961
				2962	tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us, &rs);
				2963	}
				2964
				2965
				2966	static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
				2967	{
				2968	const struct inet_connection_sock *icsk = inet_csk(sk);
				2969
				2970	icsk->icsk_ca_ops->cong_avoid(sk, ack, acked);
				2971	tcp_sk(sk)->snd_cwnd_stamp = tcp_jiffies32;
				2972	}
				2973
				2974	/* Restart timer after forward progress on connection.
				2975	* RFC2988 recommends to restart timer to now+rto.
				2976	*/
				2977	void tcp_rearm_rto(struct sock *sk)
				2978	{
				2979	const struct inet_connection_sock *icsk = inet_csk(sk);
				2980	struct tcp_sock *tp = tcp_sk(sk);
				2981
				2982	/* If the retrans timer is currently being used by Fast Open
				2983	* for SYN-ACK retrans purpose, stay put.
				2984	*/
				2985	if (tp->fastopen_rsk)
				2986	return;
				2987
				2988	if (!tp->packets_out) {
				2989	inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
				2990	} else {
				2991	u32 rto = inet_csk(sk)->icsk_rto;
				2992	/* Offset the time elapsed after installing regular RTO */
				2993	if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT \|\|
				2994	icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
				2995	s64 delta_us = tcp_rto_delta_us(sk);
				2996	/* delta_us may not be positive if the socket is locked
				2997	* when the retrans timer fires and is rescheduled.
				2998	*/
				2999	rto = usecs_to_jiffies(max_t(int, delta_us, 1));
				3000	}
				3001	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
				3002	TCP_RTO_MAX);
				3003	}
				3004	}
				3005
				3006	/* Try to schedule a loss probe; if that doesn't work, then schedule an RTO. */
				3007	static void tcp_set_xmit_timer(struct sock *sk)
				3008	{
				3009	if (!tcp_schedule_loss_probe(sk, true))
				3010	tcp_rearm_rto(sk);
				3011	}
				3012
				3013	/* If we get here, the whole TSO packet has not been acked. */
				3014	static u32 tcp_tso_acked(struct sock sk, struct sk_buff skb)
				3015	{
				3016	struct tcp_sock *tp = tcp_sk(sk);
				3017	u32 packets_acked;
				3018
				3019	BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una));
				3020
				3021	packets_acked = tcp_skb_pcount(skb);
				3022	if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
				3023	return 0;
				3024	packets_acked -= tcp_skb_pcount(skb);
				3025
				3026	if (packets_acked) {
				3027	BUG_ON(tcp_skb_pcount(skb) == 0);
				3028	BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));
				3029	}
				3030
				3031	return packets_acked;
				3032	}
				3033
				3034	static void tcp_ack_tstamp(struct sock sk, struct sk_buff skb,
				3035	u32 prior_snd_una)
				3036	{
				3037	const struct skb_shared_info *shinfo;
				3038
				3039	/* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */
				3040	if (likely(!TCP_SKB_CB(skb)->txstamp_ack))
				3041	return;
				3042
				3043	shinfo = skb_shinfo(skb);
				3044	if (!before(shinfo->tskey, prior_snd_una) &&
				3045	before(shinfo->tskey, tcp_sk(sk)->snd_una)) {
				3046	tcp_skb_tsorted_save(skb) {
				3047	__skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
				3048	} tcp_skb_tsorted_restore(skb);
				3049	}
				3050	}
				3051
				3052	/* Remove acknowledged frames from the retransmission queue. If our packet
				3053	* is before the ack sequence we can discard it as it's confirmed to have
				3054	* arrived at the other end.
				3055	*/
				3056	static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
				3057	u32 prior_snd_una,
				3058	struct tcp_sacktag_state *sack)
				3059	{
				3060	const struct inet_connection_sock *icsk = inet_csk(sk);
				3061	u64 first_ackt, last_ackt;
				3062	struct tcp_sock *tp = tcp_sk(sk);
				3063	u32 prior_sacked = tp->sacked_out;
				3064	u32 reord = tp->snd_nxt; /* lowest acked un-retx un-sacked seq */
				3065	struct sk_buff skb, next;
				3066	bool fully_acked = true;
				3067	long sack_rtt_us = -1L;
				3068	long seq_rtt_us = -1L;
				3069	long ca_rtt_us = -1L;
				3070	u32 pkts_acked = 0;
				3071	u32 last_in_flight = 0;
				3072	bool rtt_update;
				3073	int flag = 0;
				3074
				3075	first_ackt = 0;
				3076
				3077	for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) {
				3078	struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
				3079	const u32 start_seq = scb->seq;
				3080	u8 sacked = scb->sacked;
				3081	u32 acked_pcount;
				3082
				3083	tcp_ack_tstamp(sk, skb, prior_snd_una);
				3084
				3085	/* Determine how many packets and what bytes were acked, tso and else */
				3086	if (after(scb->end_seq, tp->snd_una)) {
				3087	if (tcp_skb_pcount(skb) == 1 \|\|
				3088	!after(tp->snd_una, scb->seq))
				3089	break;
				3090
				3091	acked_pcount = tcp_tso_acked(sk, skb);
				3092	if (!acked_pcount)
				3093	break;
				3094	fully_acked = false;
				3095	} else {
				3096	acked_pcount = tcp_skb_pcount(skb);
				3097	}
				3098
				3099	if (unlikely(sacked & TCPCB_RETRANS)) {
				3100	if (sacked & TCPCB_SACKED_RETRANS)
				3101	tp->retrans_out -= acked_pcount;
				3102	flag \|= FLAG_RETRANS_DATA_ACKED;
				3103	} else if (!(sacked & TCPCB_SACKED_ACKED)) {
				3104	last_ackt = skb->skb_mstamp;
				3105	WARN_ON_ONCE(last_ackt == 0);
				3106	if (!first_ackt)
				3107	first_ackt = last_ackt;
				3108
				3109	last_in_flight = TCP_SKB_CB(skb)->tx.in_flight;
				3110	if (before(start_seq, reord))
				3111	reord = start_seq;
				3112	if (!after(scb->end_seq, tp->high_seq))
				3113	flag \|= FLAG_ORIG_SACK_ACKED;
				3114	}
				3115
				3116	if (sacked & TCPCB_SACKED_ACKED) {
				3117	tp->sacked_out -= acked_pcount;
				3118	} else if (tcp_is_sack(tp)) {
				3119	tp->delivered += acked_pcount;
				3120	if (!tcp_skb_spurious_retrans(tp, skb))
				3121	tcp_rack_advance(tp, sacked, scb->end_seq,
				3122	skb->skb_mstamp);
				3123	}
				3124	if (sacked & TCPCB_LOST)
				3125	tp->lost_out -= acked_pcount;
				3126
				3127	tp->packets_out -= acked_pcount;
				3128	pkts_acked += acked_pcount;
				3129	tcp_rate_skb_delivered(sk, skb, sack->rate);
				3130
				3131	/* Initial outgoing SYN's get put onto the write_queue
				3132	* just like anything else we transmit. It is not
				3133	* true data, and if we misinform our callers that
				3134	* this ACK acks real data, we will erroneously exit
				3135	* connection startup slow start one packet too
				3136	* quickly. This is severely frowned upon behavior.
				3137	*/
				3138	if (likely(!(scb->tcp_flags & TCPHDR_SYN))) {
				3139	flag \|= FLAG_DATA_ACKED;
				3140	} else {
				3141	flag \|= FLAG_SYN_ACKED;
				3142	tp->retrans_stamp = 0;
				3143	}
				3144
				3145	if (!fully_acked)
				3146	break;
				3147
				3148	next = skb_rb_next(skb);
				3149	if (unlikely(skb == tp->retransmit_skb_hint))
				3150	tp->retransmit_skb_hint = NULL;
				3151	if (unlikely(skb == tp->lost_skb_hint))
				3152	tp->lost_skb_hint = NULL;
				3153	tcp_rtx_queue_unlink_and_free(skb, sk);
				3154	}
				3155
				3156	if (!skb)
				3157	tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
				3158
				3159	if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
				3160	tp->snd_up = tp->snd_una;
				3161
				3162	if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
				3163	flag \|= FLAG_SACK_RENEGING;
				3164
				3165	if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
				3166	seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt);
				3167	ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt);
				3168
				3169	if (pkts_acked == 1 && last_in_flight < tp->mss_cache &&
				3170	last_in_flight && !prior_sacked && fully_acked &&
				3171	sack->rate->prior_delivered + 1 == tp->delivered &&
				3172	!(flag & (FLAG_CA_ALERT \| FLAG_SYN_ACKED))) {
				3173	/* Conservatively mark a delayed ACK. It's typically
				3174	* from a lone runt packet over the round trip to
				3175	* a receiver w/o out-of-order or CE events.
				3176	*/
				3177	flag \|= FLAG_ACK_MAYBE_DELAYED;
				3178	}
				3179	}
				3180	if (sack->first_sackt) {
				3181	sack_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->first_sackt);
				3182	ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->last_sackt);
				3183	}
				3184	rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
				3185	ca_rtt_us, sack->rate);
				3186
				3187	if (flag & FLAG_ACKED) {
				3188	flag \|= FLAG_SET_XMIT_TIMER; /* set TLP or RTO timer */
				3189	if (unlikely(icsk->icsk_mtup.probe_size &&
				3190	!after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
				3191	tcp_mtup_probe_success(sk);
				3192	}
				3193
				3194	if (tcp_is_reno(tp)) {
				3195	tcp_remove_reno_sacks(sk, pkts_acked);
				3196
				3197	/* If any of the cumulatively ACKed segments was
				3198	* retransmitted, non-SACK case cannot confirm that
				3199	* progress was due to original transmission due to
				3200	* lack of TCPCB_SACKED_ACKED bits even if some of
				3201	* the packets may have been never retransmitted.
				3202	*/
				3203	if (flag & FLAG_RETRANS_DATA_ACKED)
				3204	flag &= ~FLAG_ORIG_SACK_ACKED;
				3205	} else {
				3206	int delta;
				3207
				3208	/* Non-retransmitted hole got filled? That's reordering */
				3209	if (before(reord, prior_fack))
				3210	tcp_check_sack_reordering(sk, reord, 0);
				3211
				3212	delta = prior_sacked - tp->sacked_out;
				3213	tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
				3214	}
				3215	} else if (skb && rtt_update && sack_rtt_us >= 0 &&
				3216	sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) {
				3217	/* Do not re-arm RTO if the sack RTT is measured from data sent
				3218	* after when the head was last (re)transmitted. Otherwise the
				3219	* timeout may continue to extend in loss recovery.
				3220	*/
				3221	flag \|= FLAG_SET_XMIT_TIMER; /* set TLP or RTO timer */
				3222	}
				3223
				3224	if (icsk->icsk_ca_ops->pkts_acked) {
				3225	struct ack_sample sample = { .pkts_acked = pkts_acked,
				3226	.rtt_us = sack->rate->rtt_us,
				3227	.in_flight = last_in_flight };
				3228
				3229	icsk->icsk_ca_ops->pkts_acked(sk, &sample);
				3230	}
				3231
				3232	#if FASTRETRANS_DEBUG > 0
				3233	WARN_ON((int)tp->sacked_out < 0);
				3234	WARN_ON((int)tp->lost_out < 0);
				3235	WARN_ON((int)tp->retrans_out < 0);
				3236	if (!tp->packets_out && tcp_is_sack(tp)) {
				3237	icsk = inet_csk(sk);
				3238	if (tp->lost_out) {
				3239	pr_debug("Leak l=%u %d\n",
				3240	tp->lost_out, icsk->icsk_ca_state);
				3241	tp->lost_out = 0;
				3242	}
				3243	if (tp->sacked_out) {
				3244	pr_debug("Leak s=%u %d\n",
				3245	tp->sacked_out, icsk->icsk_ca_state);
				3246	tp->sacked_out = 0;
				3247	}
				3248	if (tp->retrans_out) {
				3249	pr_debug("Leak r=%u %d\n",
				3250	tp->retrans_out, icsk->icsk_ca_state);
				3251	tp->retrans_out = 0;
				3252	}
				3253	}
				3254	#endif
				3255	return flag;
				3256	}
				3257
				3258	static void tcp_ack_probe(struct sock *sk)
				3259	{
				3260	struct inet_connection_sock *icsk = inet_csk(sk);
				3261	struct sk_buff *head = tcp_send_head(sk);
				3262	const struct tcp_sock *tp = tcp_sk(sk);
				3263
				3264	/* Was it a usable window open? */
				3265	if (!head)
				3266	return;
				3267	if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) {
				3268	icsk->icsk_backoff = 0;
				3269	inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
				3270	/* Socket must be waked up by subsequent tcp_data_snd_check().
				3271	* This function is not for random using!
				3272	*/
				3273	} else {
				3274	unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX);
				3275
				3276	inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
				3277	when, TCP_RTO_MAX);
				3278	}
				3279	}
				3280
				3281	static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
				3282	{
				3283	return !(flag & FLAG_NOT_DUP) \|\| (flag & FLAG_CA_ALERT) \|\|
				3284	inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
				3285	}
				3286
				3287	/* Decide wheather to run the increase function of congestion control. */
				3288	static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
				3289	{
				3290	/* If reordering is high then always grow cwnd whenever data is
				3291	* delivered regardless of its ordering. Otherwise stay conservative
				3292	* and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/
				3293	* new SACK or ECE mark may first advance cwnd here and later reduce
				3294	* cwnd in tcp_fastretrans_alert() based on more states.
				3295	*/
				3296	if (tcp_sk(sk)->reordering > sock_net(sk)->ipv4.sysctl_tcp_reordering)
				3297	return flag & FLAG_FORWARD_PROGRESS;
				3298
				3299	return flag & FLAG_DATA_ACKED;
				3300	}
				3301
				3302	/* The "ultimate" congestion control function that aims to replace the rigid
				3303	* cwnd increase and decrease control (tcp_cong_avoid,tcp_*cwnd_reduction).
				3304	* It's called toward the end of processing an ACK with precise rate
				3305	* information. All transmission or retransmission are delayed afterwards.
				3306	*/
				3307	static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
				3308	int flag, const struct rate_sample *rs)
				3309	{
				3310	const struct inet_connection_sock *icsk = inet_csk(sk);
				3311
				3312	if (icsk->icsk_ca_ops->cong_control) {
				3313	icsk->icsk_ca_ops->cong_control(sk, rs);
				3314	return;
				3315	}
				3316
				3317	if (tcp_in_cwnd_reduction(sk)) {
				3318	/* Reduce cwnd if state mandates */
				3319	tcp_cwnd_reduction(sk, acked_sacked, flag);
				3320	} else if (tcp_may_raise_cwnd(sk, flag)) {
				3321	/* Advance cwnd if state allows */
				3322	tcp_cong_avoid(sk, ack, acked_sacked);
				3323	}
				3324	tcp_update_pacing_rate(sk);
				3325	}
				3326
				3327	/* Check that window update is acceptable.
				3328	* The function assumes that snd_una<=ack<=snd_next.
				3329	*/
				3330	static inline bool tcp_may_update_window(const struct tcp_sock *tp,
				3331	const u32 ack, const u32 ack_seq,
				3332	const u32 nwin)
				3333	{
				3334	return after(ack, tp->snd_una) \|\|
				3335	after(ack_seq, tp->snd_wl1) \|\|
				3336	(ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
				3337	}
				3338
				3339	/* If we update tp->snd_una, also update tp->bytes_acked */
				3340	static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
				3341	{
				3342	u32 delta = ack - tp->snd_una;
				3343
				3344	sock_owned_by_me((struct sock *)tp);
				3345	tp->bytes_acked += delta;
				3346	tp->snd_una = ack;
				3347	}
				3348
				3349	/* If we update tp->rcv_nxt, also update tp->bytes_received */
				3350	static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
				3351	{
				3352	u32 delta = seq - tp->rcv_nxt;
				3353
				3354	sock_owned_by_me((struct sock *)tp);
				3355	tp->bytes_received += delta;
				3356	WRITE_ONCE(tp->rcv_nxt, seq);
				3357	}
				3358
				3359	/* Update our send window.
				3360	*
				3361	* Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2
				3362	* and in FreeBSD. NetBSD's one is even worse.) is wrong.
				3363	*/
				3364	static int tcp_ack_update_window(struct sock sk, const struct sk_buff skb, u32 ack,
				3365	u32 ack_seq)
				3366	{
				3367	struct tcp_sock *tp = tcp_sk(sk);
				3368	int flag = 0;
				3369	u32 nwin = ntohs(tcp_hdr(skb)->window);
				3370
				3371	if (likely(!tcp_hdr(skb)->syn))
				3372	nwin <<= tp->rx_opt.snd_wscale;
				3373
				3374	if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
				3375	flag \|= FLAG_WIN_UPDATE;
				3376	tcp_update_wl(tp, ack_seq);
				3377
				3378	if (tp->snd_wnd != nwin) {
				3379	tp->snd_wnd = nwin;
				3380
				3381	/* Note, it is the only place, where
				3382	* fast path is recovered for sending TCP.
				3383	*/
				3384	tp->pred_flags = 0;
				3385	tcp_fast_path_check(sk);
				3386
				3387	if (!tcp_write_queue_empty(sk))
				3388	tcp_slow_start_after_idle_check(sk);
				3389
				3390	if (nwin > tp->max_window) {
				3391	tp->max_window = nwin;
				3392	tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
				3393	}
				3394	}
				3395	}
				3396
				3397	tcp_snd_una_update(tp, ack);
				3398
				3399	return flag;
				3400	}
				3401
				3402	static bool __tcp_oow_rate_limited(struct net *net, int mib_idx,
				3403	u32 *last_oow_ack_time)
				3404	{
				3405	if (*last_oow_ack_time) {
				3406	s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time);
				3407
				3408	if (0 <= elapsed && elapsed < net->ipv4.sysctl_tcp_invalid_ratelimit) {
				3409	NET_INC_STATS(net, mib_idx);
				3410	return true; /* rate-limited: don't send yet! */
				3411	}
				3412	}
				3413
				3414	*last_oow_ack_time = tcp_jiffies32;
				3415
				3416	return false; /* not rate-limited: go ahead, send dupack now! */
				3417	}
				3418
				3419	/* Return true if we're currently rate-limiting out-of-window ACKs and
				3420	* thus shouldn't send a dupack right now. We rate-limit dupacks in
				3421	* response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS
				3422	* attacks that send repeated SYNs or ACKs for the same connection. To
				3423	* do this, we do not send a duplicate SYNACK or ACK if the remote
				3424	* endpoint is sending out-of-window SYNs or pure ACKs at a high rate.
				3425	*/
				3426	bool tcp_oow_rate_limited(struct net net, const struct sk_buff skb,
				3427	int mib_idx, u32 *last_oow_ack_time)
				3428	{
				3429	/* Data packets without SYNs are not likely part of an ACK loop. */
				3430	if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) &&
				3431	!tcp_hdr(skb)->syn)
				3432	return false;
				3433
				3434	return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time);
				3435	}
				3436
				3437	/* RFC 5961 7 [ACK Throttling] */
				3438	static void tcp_send_challenge_ack(struct sock sk, const struct sk_buff skb)
				3439	{
				3440	/* unprotected vars, we dont care of overwrites */
				3441	static u32 challenge_timestamp;
				3442	static unsigned int challenge_count;
				3443	struct tcp_sock *tp = tcp_sk(sk);
				3444	struct net *net = sock_net(sk);
				3445	u32 count, now;
				3446
				3447	/* First check our per-socket dupack rate limit. */
				3448	if (__tcp_oow_rate_limited(net,
				3449	LINUX_MIB_TCPACKSKIPPEDCHALLENGE,
				3450	&tp->last_oow_ack_time))
				3451	return;
				3452
				3453	/* Then check host-wide RFC 5961 rate limit. */
				3454	now = jiffies / HZ;
				3455	if (now != challenge_timestamp) {
				3456	u32 ack_limit = net->ipv4.sysctl_tcp_challenge_ack_limit;
				3457	u32 half = (ack_limit + 1) >> 1;
				3458
				3459	challenge_timestamp = now;
				3460	WRITE_ONCE(challenge_count, half + prandom_u32_max(ack_limit));
				3461	}
				3462	count = READ_ONCE(challenge_count);
				3463	if (count > 0) {
				3464	WRITE_ONCE(challenge_count, count - 1);
				3465	NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK);
				3466	tcp_send_ack(sk);
				3467	}
				3468	}
				3469
				3470	static void tcp_store_ts_recent(struct tcp_sock *tp)
				3471	{
				3472	tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
				3473	tp->rx_opt.ts_recent_stamp = ktime_get_seconds();
				3474	}
				3475
				3476	static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
				3477	{
				3478	if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
				3479	/* PAWS bug workaround wrt. ACK frames, the PAWS discard
				3480	* extra check below makes sure this can only happen
				3481	* for pure ACK frames. -DaveM
				3482	*
				3483	* Not only, also it occurs for expired timestamps.
				3484	*/
				3485
				3486	if (tcp_paws_check(&tp->rx_opt, 0))
				3487	tcp_store_ts_recent(tp);
				3488	}
				3489	}
				3490
				3491	/* This routine deals with acks during a TLP episode.
				3492	* We mark the end of a TLP episode on receiving TLP dupack or when
				3493	* ack is after tlp_high_seq.
				3494	* Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe.
				3495	*/
				3496	static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
				3497	{
				3498	struct tcp_sock *tp = tcp_sk(sk);
				3499
				3500	if (before(ack, tp->tlp_high_seq))
				3501	return;
				3502
				3503	if (flag & FLAG_DSACKING_ACK) {
				3504	/* This DSACK means original and TLP probe arrived; no loss */
				3505	tp->tlp_high_seq = 0;
				3506	} else if (after(ack, tp->tlp_high_seq)) {
				3507	/* ACK advances: there was a loss, so reduce cwnd. Reset
				3508	* tlp_high_seq in tcp_init_cwnd_reduction()
				3509	*/
				3510	tcp_init_cwnd_reduction(sk);
				3511	tcp_set_ca_state(sk, TCP_CA_CWR);
				3512	tcp_end_cwnd_reduction(sk);
				3513	tcp_try_keep_open(sk);
				3514	NET_INC_STATS(sock_net(sk),
				3515	LINUX_MIB_TCPLOSSPROBERECOVERY);
				3516	} else if (!(flag & (FLAG_SND_UNA_ADVANCED \|
				3517	FLAG_NOT_DUP \| FLAG_DATA_SACKED))) {
				3518	/* Pure dupack: original and TLP probe arrived; no loss */
				3519	tp->tlp_high_seq = 0;
				3520	}
				3521	}
				3522
				3523	static inline void tcp_in_ack_event(struct sock *sk, u32 flags)
				3524	{
				3525	const struct inet_connection_sock *icsk = inet_csk(sk);
				3526
				3527	if (icsk->icsk_ca_ops->in_ack_event)
				3528	icsk->icsk_ca_ops->in_ack_event(sk, flags);
				3529	}
				3530
				3531	/* Congestion control has updated the cwnd already. So if we're in
				3532	* loss recovery then now we do any new sends (for FRTO) or
				3533	* retransmits (for CA_Loss or CA_recovery) that make sense.
				3534	*/
				3535	static void tcp_xmit_recovery(struct sock *sk, int rexmit)
				3536	{
				3537	struct tcp_sock *tp = tcp_sk(sk);
				3538
				3539	if (rexmit == REXMIT_NONE)
				3540	return;
				3541
				3542	if (unlikely(rexmit == 2)) {
				3543	__tcp_push_pending_frames(sk, tcp_current_mss(sk),
				3544	TCP_NAGLE_OFF);
				3545	if (after(tp->snd_nxt, tp->high_seq))
				3546	return;
				3547	tp->frto = 0;
				3548	}
				3549	tcp_xmit_retransmit_queue(sk);
				3550	}
				3551
				3552	/* Returns the number of packets newly acked or sacked by the current ACK */
				3553	static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
				3554	{
				3555	const struct net *net = sock_net(sk);
				3556	struct tcp_sock *tp = tcp_sk(sk);
				3557	u32 delivered;
				3558
				3559	delivered = tp->delivered - prior_delivered;
				3560	NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered);
				3561	if (flag & FLAG_ECE) {
				3562	tp->delivered_ce += delivered;
				3563	NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered);
				3564	}
				3565	return delivered;
				3566	}
				3567
				3568	/* This routine deals with incoming acks, but not outgoing ones. */
				3569	static int tcp_ack(struct sock sk, const struct sk_buff skb, int flag)
				3570	{
				3571	struct inet_connection_sock *icsk = inet_csk(sk);
				3572	struct tcp_sock *tp = tcp_sk(sk);
				3573	struct tcp_sacktag_state sack_state;
				3574	struct rate_sample rs = { .prior_delivered = 0 };
				3575	u32 prior_snd_una = tp->snd_una;
				3576	bool is_sack_reneg = tp->is_sack_reneg;
				3577	u32 ack_seq = TCP_SKB_CB(skb)->seq;
				3578	u32 ack = TCP_SKB_CB(skb)->ack_seq;
				3579	bool is_dupack = false;
				3580	int prior_packets = tp->packets_out;
				3581	u32 delivered = tp->delivered;
				3582	u32 lost = tp->lost;
				3583	int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
				3584	u32 prior_fack;
				3585
				3586	sack_state.first_sackt = 0;
				3587	sack_state.rate = &rs;
				3588
				3589	/* We very likely will need to access rtx queue. */
				3590	prefetch(sk->tcp_rtx_queue.rb_node);
				3591
				3592	/* If the ack is older than previous acks
				3593	* then we can probably ignore it.
				3594	*/
				3595	if (before(ack, prior_snd_una)) {
				3596	/* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
				3597	if (before(ack, prior_snd_una - tp->max_window)) {
				3598	if (!(flag & FLAG_NO_CHALLENGE_ACK))
				3599	tcp_send_challenge_ack(sk, skb);
				3600	return -1;
				3601	}
				3602	goto old_ack;
				3603	}
				3604
				3605	/* If the ack includes data we haven't sent yet, discard
				3606	* this segment (RFC793 Section 3.9).
				3607	*/
				3608	if (after(ack, tp->snd_nxt))
				3609	goto invalid_ack;
				3610
				3611	if (after(ack, prior_snd_una)) {
				3612	flag \|= FLAG_SND_UNA_ADVANCED;
				3613	icsk->icsk_retransmits = 0;
				3614
				3615	#if IS_ENABLED(CONFIG_TLS_DEVICE)
				3616	if (static_branch_unlikely(&clean_acked_data_enabled))
				3617	if (icsk->icsk_clean_acked)
				3618	icsk->icsk_clean_acked(sk, ack);
				3619	#endif
				3620	}
				3621
				3622	prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una;
				3623	rs.prior_in_flight = tcp_packets_in_flight(tp);
				3624
				3625	/* ts_recent update must be made after we are sure that the packet
				3626	* is in window.
				3627	*/
				3628	if (flag & FLAG_UPDATE_TS_RECENT)
				3629	tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
				3630
				3631	if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
				3632	/* Window is constant, pure forward advance.
				3633	* No more checks are required.
				3634	* Note, we use the fact that SND.UNA>=SND.WL2.
				3635	*/
				3636	tcp_update_wl(tp, ack_seq);
				3637	tcp_snd_una_update(tp, ack);
				3638	flag \|= FLAG_WIN_UPDATE;
				3639
				3640	tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
				3641
				3642	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS);
				3643	} else {
				3644	u32 ack_ev_flags = CA_ACK_SLOWPATH;
				3645
				3646	if (ack_seq != TCP_SKB_CB(skb)->end_seq)
				3647	flag \|= FLAG_DATA;
				3648	else
				3649	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS);
				3650
				3651	flag \|= tcp_ack_update_window(sk, skb, ack, ack_seq);
				3652
				3653	if (TCP_SKB_CB(skb)->sacked)
				3654	flag \|= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
				3655	&sack_state);
				3656
				3657	if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
				3658	flag \|= FLAG_ECE;
				3659	ack_ev_flags \|= CA_ACK_ECE;
				3660	}
				3661
				3662	if (flag & FLAG_WIN_UPDATE)
				3663	ack_ev_flags \|= CA_ACK_WIN_UPDATE;
				3664
				3665	tcp_in_ack_event(sk, ack_ev_flags);
				3666	}
				3667
				3668	/* We passed data and got it acked, remove any soft error
				3669	* log. Something worked...
				3670	*/
				3671	sk->sk_err_soft = 0;
				3672	icsk->icsk_probes_out = 0;
				3673	tp->rcv_tstamp = tcp_jiffies32;
				3674	if (!prior_packets)
				3675	goto no_queue;
				3676
				3677	/* See if we can take anything off of the retransmit queue. */
				3678	flag \|= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state);
				3679
				3680	tcp_rack_update_reo_wnd(sk, &rs);
				3681
				3682	if (tp->tlp_high_seq)
				3683	tcp_process_tlp_ack(sk, ack, flag);
				3684	/* If needed, reset TLP/RTO timer; RACK may later override this. */
				3685	if (flag & FLAG_SET_XMIT_TIMER)
				3686	tcp_set_xmit_timer(sk);
				3687
				3688	if (tcp_ack_is_dubious(sk, flag)) {
				3689	is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED \| FLAG_NOT_DUP));
				3690	tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
				3691	&rexmit);
				3692	}
				3693
				3694	if ((flag & FLAG_FORWARD_PROGRESS) \|\| !(flag & FLAG_NOT_DUP))
				3695	sk_dst_confirm(sk);
				3696
				3697	delivered = tcp_newly_delivered(sk, delivered, flag);
				3698	lost = tp->lost - lost; /* freshly marked lost */
				3699	rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
				3700	tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
				3701	tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
				3702	tcp_xmit_recovery(sk, rexmit);
				3703	return 1;
				3704
				3705	no_queue:
				3706	/* If data was DSACKed, see if we can undo a cwnd reduction. */
				3707	if (flag & FLAG_DSACKING_ACK) {
				3708	tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
				3709	&rexmit);
				3710	tcp_newly_delivered(sk, delivered, flag);
				3711	}
				3712	/* If this ack opens up a zero window, clear backoff. It was
				3713	* being used to time the probes, and is probably far higher than
				3714	* it needs to be for normal retransmission.
				3715	*/
				3716	tcp_ack_probe(sk);
				3717
				3718	if (tp->tlp_high_seq)
				3719	tcp_process_tlp_ack(sk, ack, flag);
				3720	return 1;
				3721
				3722	invalid_ack:
				3723	SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
				3724	return -1;
				3725
				3726	old_ack:
				3727	/* If data was SACKed, tag it and see if we should send more data.
				3728	* If data was DSACKed, see if we can undo a cwnd reduction.
				3729	*/
				3730	if (TCP_SKB_CB(skb)->sacked) {
				3731	flag \|= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
				3732	&sack_state);
				3733	tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
				3734	&rexmit);
				3735	tcp_newly_delivered(sk, delivered, flag);
				3736	tcp_xmit_recovery(sk, rexmit);
				3737	}
				3738
				3739	SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
				3740	return 0;
				3741	}
				3742
				3743	static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
				3744	bool syn, struct tcp_fastopen_cookie *foc,
				3745	bool exp_opt)
				3746	{
				3747	/* Valid only in SYN or SYN-ACK with an even length. */
				3748	if (!foc \|\| !syn \|\| len < 0 \|\| (len & 1))
				3749	return;
				3750
				3751	if (len >= TCP_FASTOPEN_COOKIE_MIN &&
				3752	len <= TCP_FASTOPEN_COOKIE_MAX)
				3753	memcpy(foc->val, cookie, len);
				3754	else if (len != 0)
				3755	len = -1;
				3756	foc->len = len;
				3757	foc->exp = exp_opt;
				3758	}
				3759
				3760	static void smc_parse_options(const struct tcphdr *th,
				3761	struct tcp_options_received *opt_rx,
				3762	const unsigned char *ptr,
				3763	int opsize)
				3764	{
				3765	#if IS_ENABLED(CONFIG_SMC)
				3766	if (static_branch_unlikely(&tcp_have_smc)) {
				3767	if (th->syn && !(opsize & 1) &&
				3768	opsize >= TCPOLEN_EXP_SMC_BASE &&
				3769	get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC)
				3770	opt_rx->smc_ok = 1;
				3771	}
				3772	#endif
				3773	}
				3774
				3775	/* Look for tcp options. Normally only called on SYN and SYNACK packets.
				3776	* But, this can also be called on packets in the established flow when
				3777	* the fast version below fails.
				3778	*/
				3779	void tcp_parse_options(const struct net *net,
				3780	const struct sk_buff *skb,
				3781	struct tcp_options_received *opt_rx, int estab,
				3782	struct tcp_fastopen_cookie *foc)
				3783	{
				3784	const unsigned char *ptr;
				3785	const struct tcphdr *th = tcp_hdr(skb);
				3786	int length = (th->doff * 4) - sizeof(struct tcphdr);
				3787
				3788	ptr = (const unsigned char *)(th + 1);
				3789	opt_rx->saw_tstamp = 0;
				3790
				3791	while (length > 0) {
				3792	int opcode = *ptr++;
				3793	int opsize;
				3794
				3795	switch (opcode) {
				3796	case TCPOPT_EOL:
				3797	return;
				3798	case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
				3799	length--;
				3800	continue;
				3801	default:
				3802	opsize = *ptr++;
				3803	if (opsize < 2) /* "silly options" */
				3804	return;
				3805	if (opsize > length)
				3806	return; /* don't parse partial options */
				3807	switch (opcode) {
				3808	case TCPOPT_MSS:
				3809	if (opsize == TCPOLEN_MSS && th->syn && !estab) {
				3810	u16 in_mss = get_unaligned_be16(ptr);
				3811	if (in_mss) {
				3812	if (opt_rx->user_mss &&
				3813	opt_rx->user_mss < in_mss)
				3814	in_mss = opt_rx->user_mss;
				3815	opt_rx->mss_clamp = in_mss;
				3816	}
				3817	}
				3818	break;
				3819	case TCPOPT_WINDOW:
				3820	if (opsize == TCPOLEN_WINDOW && th->syn &&
				3821	!estab && net->ipv4.sysctl_tcp_window_scaling) {
				3822	__u8 snd_wscale = (__u8 )ptr;
				3823	opt_rx->wscale_ok = 1;
				3824	if (snd_wscale > TCP_MAX_WSCALE) {
				3825	net_info_ratelimited("%s: Illegal window scaling value %d > %u received\n",
				3826	__func__,
				3827	snd_wscale,
				3828	TCP_MAX_WSCALE);
				3829	snd_wscale = TCP_MAX_WSCALE;
				3830	}
				3831	opt_rx->snd_wscale = snd_wscale;
				3832	}
				3833	break;
				3834	case TCPOPT_TIMESTAMP:
				3835	if ((opsize == TCPOLEN_TIMESTAMP) &&
				3836	((estab && opt_rx->tstamp_ok) \|\|
				3837	(!estab && net->ipv4.sysctl_tcp_timestamps))) {
				3838	opt_rx->saw_tstamp = 1;
				3839	opt_rx->rcv_tsval = get_unaligned_be32(ptr);
				3840	opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
				3841	}
				3842	break;
				3843	case TCPOPT_SACK_PERM:
				3844	if (opsize == TCPOLEN_SACK_PERM && th->syn &&
				3845	!estab && net->ipv4.sysctl_tcp_sack) {
				3846	opt_rx->sack_ok = TCP_SACK_SEEN;
				3847	tcp_sack_reset(opt_rx);
				3848	}
				3849	break;
				3850
				3851	case TCPOPT_SACK:
				3852	if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
				3853	!((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
				3854	opt_rx->sack_ok) {
				3855	TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
				3856	}
				3857	break;
				3858	#ifdef CONFIG_TCP_MD5SIG
				3859	case TCPOPT_MD5SIG:
				3860	/*
				3861	* The MD5 Hash has already been
				3862	* checked (see tcp_v{4,6}_do_rcv()).
				3863	*/
				3864	break;
				3865	#endif
				3866	case TCPOPT_FASTOPEN:
				3867	tcp_parse_fastopen_option(
				3868	opsize - TCPOLEN_FASTOPEN_BASE,
				3869	ptr, th->syn, foc, false);
				3870	break;
				3871
				3872	case TCPOPT_EXP:
				3873	/* Fast Open option shares code 254 using a
				3874	* 16 bits magic number.
				3875	*/
				3876	if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE &&
				3877	get_unaligned_be16(ptr) ==
				3878	TCPOPT_FASTOPEN_MAGIC)
				3879	tcp_parse_fastopen_option(opsize -
				3880	TCPOLEN_EXP_FASTOPEN_BASE,
				3881	ptr + 2, th->syn, foc, true);
				3882	else
				3883	smc_parse_options(th, opt_rx, ptr,
				3884	opsize);
				3885	break;
				3886
				3887	}
				3888	ptr += opsize-2;
				3889	length -= opsize;
				3890	}
				3891	}
				3892	}
				3893	EXPORT_SYMBOL(tcp_parse_options);
				3894
				3895	static bool tcp_parse_aligned_timestamp(struct tcp_sock tp, const struct tcphdr th)
				3896	{
				3897	const __be32 ptr = (const __be32 )(th + 1);
				3898
				3899	if (*ptr == htonl((TCPOPT_NOP << 24) \| (TCPOPT_NOP << 16)
				3900	\| (TCPOPT_TIMESTAMP << 8) \| TCPOLEN_TIMESTAMP)) {
				3901	tp->rx_opt.saw_tstamp = 1;
				3902	++ptr;
				3903	tp->rx_opt.rcv_tsval = ntohl(*ptr);
				3904	++ptr;
				3905	if (*ptr)
				3906	tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset;
				3907	else
				3908	tp->rx_opt.rcv_tsecr = 0;
				3909	return true;
				3910	}
				3911	return false;
				3912	}
				3913
				3914	/* Fast parse options. This hopes to only see timestamps.
				3915	* If it is wrong it falls back on tcp_parse_options().
				3916	*/
				3917	static bool tcp_fast_parse_options(const struct net *net,
				3918	const struct sk_buff *skb,
				3919	const struct tcphdr th, struct tcp_sock tp)
				3920	{
				3921	/* In the spirit of fast parsing, compare doff directly to constant
				3922	* values. Because equality is used, short doff can be ignored here.
				3923	*/
				3924	if (th->doff == (sizeof(*th) / 4)) {
				3925	tp->rx_opt.saw_tstamp = 0;
				3926	return false;
				3927	} else if (tp->rx_opt.tstamp_ok &&
				3928	th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
				3929	if (tcp_parse_aligned_timestamp(tp, th))
				3930	return true;
				3931	}
				3932
				3933	tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL);
				3934	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
				3935	tp->rx_opt.rcv_tsecr -= tp->tsoffset;
				3936
				3937	return true;
				3938	}
				3939
				3940	#ifdef CONFIG_TCP_MD5SIG
				3941	/*
				3942	* Parse MD5 Signature option
				3943	*/
				3944	const u8 tcp_parse_md5sig_option(const struct tcphdr th)
				3945	{
				3946	int length = (th->doff << 2) - sizeof(*th);
				3947	const u8 ptr = (const u8 )(th + 1);
				3948
				3949	/* If not enough data remaining, we can short cut */
				3950	while (length >= TCPOLEN_MD5SIG) {
				3951	int opcode = *ptr++;
				3952	int opsize;
				3953
				3954	switch (opcode) {
				3955	case TCPOPT_EOL:
				3956	return NULL;
				3957	case TCPOPT_NOP:
				3958	length--;
				3959	continue;
				3960	default:
				3961	opsize = *ptr++;
				3962	if (opsize < 2 \|\| opsize > length)
				3963	return NULL;
				3964	if (opcode == TCPOPT_MD5SIG)
				3965	return opsize == TCPOLEN_MD5SIG ? ptr : NULL;
				3966	}
				3967	ptr += opsize - 2;
				3968	length -= opsize;
				3969	}
				3970	return NULL;
				3971	}
				3972	EXPORT_SYMBOL(tcp_parse_md5sig_option);
				3973	#endif
				3974
				3975	/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
				3976	*
				3977	* It is not fatal. If this ACK does _not_ change critical state (seqs, window)
				3978	* it can pass through stack. So, the following predicate verifies that
				3979	* this segment is not used for anything but congestion avoidance or
				3980	* fast retransmit. Moreover, we even are able to eliminate most of such
				3981	* second order effects, if we apply some small "replay" window (~RTO)
				3982	* to timestamp space.
				3983	*
				3984	* All these measures still do not guarantee that we reject wrapped ACKs
				3985	* on networks with high bandwidth, when sequence space is recycled fastly,
				3986	* but it guarantees that such events will be very rare and do not affect
				3987	* connection seriously. This doesn't look nice, but alas, PAWS is really
				3988	* buggy extension.
				3989	*
				3990	* [ Later note. Even worse! It is buggy for segments _with_ data. RFC
				3991	* states that events when retransmit arrives after original data are rare.
				3992	* It is a blatant lie. VJ forgot about fast retransmit! 8)8) It is
				3993	* the biggest problem on large power networks even with minor reordering.
				3994	* OK, let's give it small replay window. If peer clock is even 1hz, it is safe
				3995	* up to bandwidth of 18Gigabit/sec. 8) ]
				3996	*/
				3997
				3998	static int tcp_disordered_ack(const struct sock sk, const struct sk_buff skb)
				3999	{
				4000	const struct tcp_sock *tp = tcp_sk(sk);
				4001	const struct tcphdr *th = tcp_hdr(skb);
				4002	u32 seq = TCP_SKB_CB(skb)->seq;
				4003	u32 ack = TCP_SKB_CB(skb)->ack_seq;
				4004
				4005	return (/* 1. Pure ACK with correct sequence number. */
				4006	(th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) &&
				4007
				4008	/* 2. ... and duplicate ACK. */
				4009	ack == tp->snd_una &&
				4010
				4011	/* 3. ... and does not update window. */
				4012	!tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
				4013
				4014	/* 4. ... and sits in replay window. */
				4015	(s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
				4016	}
				4017
				4018	static inline bool tcp_paws_discard(const struct sock *sk,
				4019	const struct sk_buff *skb)
				4020	{
				4021	const struct tcp_sock *tp = tcp_sk(sk);
				4022
				4023	return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) &&
				4024	!tcp_disordered_ack(sk, skb);
				4025	}
				4026
				4027	/* Check segment sequence number for validity.
				4028	*
				4029	* Segment controls are considered valid, if the segment
				4030	* fits to the window after truncation to the window. Acceptability
				4031	* of data (and SYN, FIN, of course) is checked separately.
				4032	* See tcp_data_queue(), for example.
				4033	*
				4034	* Also, controls (RST is main one) are accepted using RCV.WUP instead
				4035	* of RCV.NXT. Peer still did not advance his SND.UNA when we
				4036	* delayed ACK, so that hisSND.UNA<=ourRCV.WUP.
				4037	* (borrowed from freebsd)
				4038	*/
				4039
				4040	static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
				4041	{
				4042	return !before(end_seq, tp->rcv_wup) &&
				4043	!after(seq, tp->rcv_nxt + tcp_receive_window(tp));
				4044	}
				4045
				4046	/* When we get a reset we do this. */
				4047	void tcp_reset(struct sock *sk)
				4048	{
				4049	trace_tcp_receive_reset(sk);
				4050
				4051	/* We want the right error as BSD sees it (and indeed as we do). */
				4052	switch (sk->sk_state) {
				4053	case TCP_SYN_SENT:
				4054	sk->sk_err = ECONNREFUSED;
				4055	break;
				4056	case TCP_CLOSE_WAIT:
				4057	sk->sk_err = EPIPE;
				4058	break;
				4059	case TCP_CLOSE:
				4060	return;
				4061	default:
				4062	sk->sk_err = ECONNRESET;
				4063	}
				4064	/* This barrier is coupled with smp_rmb() in tcp_poll() */
				4065	smp_wmb();
				4066
				4067	tcp_write_queue_purge(sk);
				4068	tcp_done(sk);
				4069
				4070	if (!sock_flag(sk, SOCK_DEAD))
				4071	sk->sk_error_report(sk);
				4072	}
				4073
				4074	/*
				4075	* Process the FIN bit. This now behaves as it is supposed to work
				4076	* and the FIN takes effect when it is validly part of sequence
				4077	* space. Not before when we get holes.
				4078	*
				4079	* If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
				4080	* (and thence onto LAST-ACK and finally, CLOSE, we never enter
				4081	* TIME-WAIT)
				4082	*
				4083	* If we are in FINWAIT-1, a received FIN indicates simultaneous
				4084	* close and we go into CLOSING (and later onto TIME-WAIT)
				4085	*
				4086	* If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
				4087	*/
				4088	void tcp_fin(struct sock *sk)
				4089	{
				4090	struct tcp_sock *tp = tcp_sk(sk);
				4091
				4092	inet_csk_schedule_ack(sk);
				4093
				4094	sk->sk_shutdown \|= RCV_SHUTDOWN;
				4095	sock_set_flag(sk, SOCK_DONE);
				4096
				4097	switch (sk->sk_state) {
				4098	case TCP_SYN_RECV:
				4099	case TCP_ESTABLISHED:
				4100	/* Move to CLOSE_WAIT */
				4101	tcp_set_state(sk, TCP_CLOSE_WAIT);
				4102	inet_csk(sk)->icsk_ack.pingpong = 1;
				4103	break;
				4104
				4105	case TCP_CLOSE_WAIT:
				4106	case TCP_CLOSING:
				4107	/* Received a retransmission of the FIN, do
				4108	* nothing.
				4109	*/
				4110	break;
				4111	case TCP_LAST_ACK:
				4112	/* RFC793: Remain in the LAST-ACK state. */
				4113	break;
				4114
				4115	case TCP_FIN_WAIT1:
				4116	/* This case occurs when a simultaneous close
				4117	* happens, we must ack the received FIN and
				4118	* enter the CLOSING state.
				4119	*/
				4120	tcp_send_ack(sk);
				4121	tcp_set_state(sk, TCP_CLOSING);
				4122	break;
				4123	case TCP_FIN_WAIT2:
				4124	/* Received a FIN -- send ACK and enter TIME_WAIT. */
				4125	tcp_send_ack(sk);
				4126	tcp_time_wait(sk, TCP_TIME_WAIT, 0);
				4127	break;
				4128	default:
				4129	/* Only TCP_LISTEN and TCP_CLOSE are left, in these
				4130	* cases we should never reach this piece of code.
				4131	*/
				4132	pr_err("%s: Impossible, sk->sk_state=%d\n",
				4133	__func__, sk->sk_state);
				4134	break;
				4135	}
				4136
				4137	/* It _is_ possible, that we have something out-of-order _after_ FIN.
				4138	* Probably, we should reset in this case. For now drop them.
				4139	*/
				4140	skb_rbtree_purge(&tp->out_of_order_queue);
				4141	if (tcp_is_sack(tp))
				4142	tcp_sack_reset(&tp->rx_opt);
				4143	sk_mem_reclaim(sk);
				4144
				4145	if (!sock_flag(sk, SOCK_DEAD)) {
				4146	sk->sk_state_change(sk);
				4147
				4148	/* Do not send POLL_HUP for half duplex close. */
				4149	if (sk->sk_shutdown == SHUTDOWN_MASK \|\|
				4150	sk->sk_state == TCP_CLOSE)
				4151	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
				4152	else
				4153	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
				4154	}
				4155	}
				4156
				4157	static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
				4158	u32 end_seq)
				4159	{
				4160	if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
				4161	if (before(seq, sp->start_seq))
				4162	sp->start_seq = seq;
				4163	if (after(end_seq, sp->end_seq))
				4164	sp->end_seq = end_seq;
				4165	return true;
				4166	}
				4167	return false;
				4168	}
				4169
				4170	static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
				4171	{
				4172	struct tcp_sock *tp = tcp_sk(sk);
				4173
				4174	if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
				4175	int mib_idx;
				4176
				4177	if (before(seq, tp->rcv_nxt))
				4178	mib_idx = LINUX_MIB_TCPDSACKOLDSENT;
				4179	else
				4180	mib_idx = LINUX_MIB_TCPDSACKOFOSENT;
				4181
				4182	NET_INC_STATS(sock_net(sk), mib_idx);
				4183
				4184	tp->rx_opt.dsack = 1;
				4185	tp->duplicate_sack[0].start_seq = seq;
				4186	tp->duplicate_sack[0].end_seq = end_seq;
				4187	}
				4188	}
				4189
				4190	static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
				4191	{
				4192	struct tcp_sock *tp = tcp_sk(sk);
				4193
				4194	if (!tp->rx_opt.dsack)
				4195	tcp_dsack_set(sk, seq, end_seq);
				4196	else
				4197	tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
				4198	}
				4199
				4200	static void tcp_send_dupack(struct sock sk, const struct sk_buff skb)
				4201	{
				4202	struct tcp_sock *tp = tcp_sk(sk);
				4203
				4204	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
				4205	before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
				4206	NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
				4207	tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
				4208
				4209	if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
				4210	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
				4211
				4212	if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
				4213	end_seq = tp->rcv_nxt;
				4214	tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq);
				4215	}
				4216	}
				4217
				4218	tcp_send_ack(sk);
				4219	}
				4220
				4221	/* These routines update the SACK block as out-of-order packets arrive or
				4222	* in-order packets close up the sequence space.
				4223	*/
				4224	static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
				4225	{
				4226	int this_sack;
				4227	struct tcp_sack_block *sp = &tp->selective_acks[0];
				4228	struct tcp_sack_block *swalk = sp + 1;
				4229
				4230	/* See if the recent change to the first SACK eats into
				4231	* or hits the sequence space of other SACK blocks, if so coalesce.
				4232	*/
				4233	for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;) {
				4234	if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) {
				4235	int i;
				4236
				4237	/* Zap SWALK, by moving every further SACK up by one slot.
				4238	* Decrease num_sacks.
				4239	*/
				4240	tp->rx_opt.num_sacks--;
				4241	for (i = this_sack; i < tp->rx_opt.num_sacks; i++)
				4242	sp[i] = sp[i + 1];
				4243	continue;
				4244	}
				4245	this_sack++, swalk++;
				4246	}
				4247	}
				4248
				4249	static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
				4250	{
				4251	struct tcp_sock *tp = tcp_sk(sk);
				4252	struct tcp_sack_block *sp = &tp->selective_acks[0];
				4253	int cur_sacks = tp->rx_opt.num_sacks;
				4254	int this_sack;
				4255
				4256	if (!cur_sacks)
				4257	goto new_sack;
				4258
				4259	for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) {
				4260	if (tcp_sack_extend(sp, seq, end_seq)) {
				4261	/* Rotate this_sack to the first one. */
				4262	for (; this_sack > 0; this_sack--, sp--)
				4263	swap(sp, (sp - 1));
				4264	if (cur_sacks > 1)
				4265	tcp_sack_maybe_coalesce(tp);
				4266	return;
				4267	}
				4268	}
				4269
				4270	/* Could not find an adjacent existing SACK, build a new one,
				4271	* put it at the front, and shift everyone else down. We
				4272	* always know there is at least one SACK present already here.
				4273	*
				4274	* If the sack array is full, forget about the last one.
				4275	*/
				4276	if (this_sack >= TCP_NUM_SACKS) {
				4277	if (tp->compressed_ack > TCP_FASTRETRANS_THRESH)
				4278	tcp_send_ack(sk);
				4279	this_sack--;
				4280	tp->rx_opt.num_sacks--;
				4281	sp--;
				4282	}
				4283	for (; this_sack > 0; this_sack--, sp--)
				4284	sp = (sp - 1);
				4285
				4286	new_sack:
				4287	/* Build the new head SACK, and we're done. */
				4288	sp->start_seq = seq;
				4289	sp->end_seq = end_seq;
				4290	tp->rx_opt.num_sacks++;
				4291	}
				4292
				4293	/* RCV.NXT advances, some SACKs should be eaten. */
				4294
				4295	static void tcp_sack_remove(struct tcp_sock *tp)
				4296	{
				4297	struct tcp_sack_block *sp = &tp->selective_acks[0];
				4298	int num_sacks = tp->rx_opt.num_sacks;
				4299	int this_sack;
				4300
				4301	/* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
				4302	if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
				4303	tp->rx_opt.num_sacks = 0;
				4304	return;
				4305	}
				4306
				4307	for (this_sack = 0; this_sack < num_sacks;) {
				4308	/* Check if the start of the sack is covered by RCV.NXT. */
				4309	if (!before(tp->rcv_nxt, sp->start_seq)) {
				4310	int i;
				4311
				4312	/* RCV.NXT must cover all the block! */
				4313	WARN_ON(before(tp->rcv_nxt, sp->end_seq));
				4314
				4315	/* Zap this SACK, by moving forward any other SACKS. */
				4316	for (i = this_sack+1; i < num_sacks; i++)
				4317	tp->selective_acks[i-1] = tp->selective_acks[i];
				4318	num_sacks--;
				4319	continue;
				4320	}
				4321	this_sack++;
				4322	sp++;
				4323	}
				4324	tp->rx_opt.num_sacks = num_sacks;
				4325	}
				4326
				4327	/**
				4328	* tcp_try_coalesce - try to merge skb to prior one
				4329	* @sk: socket
				4330	* @dest: destination queue
				4331	* @to: prior buffer
				4332	* @from: buffer to add in queue
				4333	* @fragstolen: pointer to boolean
				4334	*
				4335	* Before queueing skb @from after @to, try to merge them
				4336	* to reduce overall memory use and queue lengths, if cost is small.
				4337	* Packets in ofo or receive queues can stay a long time.
				4338	* Better try to coalesce them right now to avoid future collapses.
				4339	* Returns true if caller should free @from instead of queueing it
				4340	*/
				4341	static bool tcp_try_coalesce(struct sock *sk,
				4342	struct sk_buff *to,
				4343	struct sk_buff *from,
				4344	bool *fragstolen)
				4345	{
				4346	int delta;
				4347
				4348	*fragstolen = false;
				4349
				4350	/* Its possible this segment overlaps with prior segment in queue */
				4351	if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
				4352	return false;
				4353
				4354	#ifdef CONFIG_TLS_DEVICE
				4355	if (from->decrypted != to->decrypted)
				4356	return false;
				4357	#endif
				4358
				4359	if (!skb_try_coalesce(to, from, fragstolen, &delta))
				4360	return false;
				4361
				4362	atomic_add(delta, &sk->sk_rmem_alloc);
				4363	sk_mem_charge(sk, delta);
				4364	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
				4365	TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
				4366	TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
				4367	TCP_SKB_CB(to)->tcp_flags \|= TCP_SKB_CB(from)->tcp_flags;
				4368
				4369	if (TCP_SKB_CB(from)->has_rxtstamp) {
				4370	TCP_SKB_CB(to)->has_rxtstamp = true;
				4371	to->tstamp = from->tstamp;
				4372	skb_hwtstamps(to)->hwtstamp = skb_hwtstamps(from)->hwtstamp;
				4373	}
				4374
				4375	return true;
				4376	}
				4377
				4378	static bool tcp_ooo_try_coalesce(struct sock *sk,
				4379	struct sk_buff *to,
				4380	struct sk_buff *from,
				4381	bool *fragstolen)
				4382	{
				4383	bool res = tcp_try_coalesce(sk, to, from, fragstolen);
				4384
				4385	/* In case tcp_drop() is called later, update to->gso_segs */
				4386	if (res) {
				4387	u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) +
				4388	max_t(u16, 1, skb_shinfo(from)->gso_segs);
				4389
				4390	skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
				4391	}
				4392	return res;
				4393	}
				4394
				/* Drop @skb on behalf of @sk: record it through sk_drops_add(), then
				 * release it with __kfree_skb().
				 */
				static void tcp_drop(struct sock *sk, struct sk_buff *skb)
				{
					sk_drops_add(sk, skb);
					__kfree_skb(skb);
				}
				4400
				4401	/* This one checks to see if we can put data from the
				4402	* out_of_order queue into the receive_queue.
				4403	*/
				4404	static void tcp_ofo_queue(struct sock *sk)
				4405	{
				4406	struct tcp_sock *tp = tcp_sk(sk);
				4407	__u32 dsack_high = tp->rcv_nxt;
				4408	bool fin, fragstolen, eaten;
				4409	struct sk_buff skb, tail;
				4410	struct rb_node *p;
				4411
				4412	p = rb_first(&tp->out_of_order_queue);
				4413	while (p) {
				4414	skb = rb_to_skb(p);
				4415	if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
				4416	break;
				4417
				4418	if (before(TCP_SKB_CB(skb)->seq, dsack_high)) {
				4419	__u32 dsack = dsack_high;
				4420	if (before(TCP_SKB_CB(skb)->end_seq, dsack_high))
				4421	dsack_high = TCP_SKB_CB(skb)->end_seq;
				4422	tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
				4423	}
				4424	p = rb_next(p);
				4425	rb_erase(&skb->rbnode, &tp->out_of_order_queue);
				4426
				4427	if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
				4428	SOCK_DEBUG(sk, "ofo packet was already received\n");
				4429	tcp_drop(sk, skb);
				4430	continue;
				4431	}
				4432	SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
				4433	tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
				4434	TCP_SKB_CB(skb)->end_seq);
				4435
				4436	tail = skb_peek_tail(&sk->sk_receive_queue);
				4437	eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
				4438	tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
				4439	fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
				4440	if (!eaten)
				4441	__skb_queue_tail(&sk->sk_receive_queue, skb);
				4442	else
				4443	kfree_skb_partial(skb, fragstolen);
				4444
				4445	if (unlikely(fin)) {
				4446	tcp_fin(sk);
				4447	/* tcp_fin() purges tp->out_of_order_queue,
				4448	* so we must end this loop right now.
				4449	*/
				4450	break;
				4451	}
				4452	}
				4453	}
				4454
				4455	static bool tcp_prune_ofo_queue(struct sock *sk);
				4456	static int tcp_prune_queue(struct sock *sk);
				4457
				4458	static int tcp_try_rmem_schedule(struct sock sk, struct sk_buff skb,
				4459	unsigned int size)
				4460	{
				4461	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf \|\|
				4462	!sk_rmem_schedule(sk, skb, size)) {
				4463
				4464	if (tcp_prune_queue(sk) < 0)
				4465	return -1;
				4466
				4467	while (!sk_rmem_schedule(sk, skb, size)) {
				4468	if (!tcp_prune_ofo_queue(sk))
				4469	return -1;
				4470	}
				4471	}
				4472	return 0;
				4473	}
				4474
				4475	static void tcp_data_queue_ofo(struct sock sk, struct sk_buff skb)
				4476	{
				4477	struct tcp_sock *tp = tcp_sk(sk);
				4478	struct rb_node *p, parent;
				4479	struct sk_buff *skb1;
				4480	u32 seq, end_seq;
				4481	bool fragstolen;
				4482
				4483	tcp_ecn_check_ce(sk, skb);
				4484
				4485	if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
				4486	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
				4487	tcp_drop(sk, skb);
				4488	return;
				4489	}
				4490
				4491	/* Disable header prediction. */
				4492	tp->pred_flags = 0;
				4493	inet_csk_schedule_ack(sk);
				4494
				4495	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
				4496	seq = TCP_SKB_CB(skb)->seq;
				4497	end_seq = TCP_SKB_CB(skb)->end_seq;
				4498	SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
				4499	tp->rcv_nxt, seq, end_seq);
				4500
				4501	p = &tp->out_of_order_queue.rb_node;
				4502	if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
				4503	/* Initial out of order segment, build 1 SACK. */
				4504	if (tcp_is_sack(tp)) {
				4505	tp->rx_opt.num_sacks = 1;
				4506	tp->selective_acks[0].start_seq = seq;
				4507	tp->selective_acks[0].end_seq = end_seq;
				4508	}
				4509	rb_link_node(&skb->rbnode, NULL, p);
				4510	rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
				4511	tp->ooo_last_skb = skb;
				4512	goto end;
				4513	}
				4514
				4515	/* In the typical case, we are adding an skb to the end of the list.
				4516	* Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
				4517	*/
				4518	if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb,
				4519	skb, &fragstolen)) {
				4520	coalesce_done:
				4521	tcp_grow_window(sk, skb);
				4522	kfree_skb_partial(skb, fragstolen);
				4523	skb = NULL;
				4524	goto add_sack;
				4525	}
				4526	/* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */
				4527	if (!before(seq, TCP_SKB_CB(tp->ooo_last_skb)->end_seq)) {
				4528	parent = &tp->ooo_last_skb->rbnode;
				4529	p = &parent->rb_right;
				4530	goto insert;
				4531	}
				4532
				4533	/* Find place to insert this segment. Handle overlaps on the way. */
				4534	parent = NULL;
				4535	while (*p) {
				4536	parent = *p;
				4537	skb1 = rb_to_skb(parent);
				4538	if (before(seq, TCP_SKB_CB(skb1)->seq)) {
				4539	p = &parent->rb_left;
				4540	continue;
				4541	}
				4542	if (before(seq, TCP_SKB_CB(skb1)->end_seq)) {
				4543	if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
				4544	/* All the bits are present. Drop. */
				4545	NET_INC_STATS(sock_net(sk),
				4546	LINUX_MIB_TCPOFOMERGE);
				4547	tcp_drop(sk, skb);
				4548	skb = NULL;
				4549	tcp_dsack_set(sk, seq, end_seq);
				4550	goto add_sack;
				4551	}
				4552	if (after(seq, TCP_SKB_CB(skb1)->seq)) {
				4553	/* Partial overlap. */
				4554	tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq);
				4555	} else {
				4556	/* skb's seq == skb1's seq and skb covers skb1.
				4557	* Replace skb1 with skb.
				4558	*/
				4559	rb_replace_node(&skb1->rbnode, &skb->rbnode,
				4560	&tp->out_of_order_queue);
				4561	tcp_dsack_extend(sk,
				4562	TCP_SKB_CB(skb1)->seq,
				4563	TCP_SKB_CB(skb1)->end_seq);
				4564	NET_INC_STATS(sock_net(sk),
				4565	LINUX_MIB_TCPOFOMERGE);
				4566	tcp_drop(sk, skb1);
				4567	goto merge_right;
				4568	}
				4569	} else if (tcp_ooo_try_coalesce(sk, skb1,
				4570	skb, &fragstolen)) {
				4571	goto coalesce_done;
				4572	}
				4573	p = &parent->rb_right;
				4574	}
				4575	insert:
				4576	/* Insert segment into RB tree. */
				4577	rb_link_node(&skb->rbnode, parent, p);
				4578	rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
				4579
				4580	merge_right:
				4581	/* Remove other segments covered by skb. */
				4582	while ((skb1 = skb_rb_next(skb)) != NULL) {
				4583	if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
				4584	break;
				4585	if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
				4586	tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
				4587	end_seq);
				4588	break;
				4589	}
				4590	rb_erase(&skb1->rbnode, &tp->out_of_order_queue);
				4591	tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
				4592	TCP_SKB_CB(skb1)->end_seq);
				4593	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
				4594	tcp_drop(sk, skb1);
				4595	}
				4596	/* If there is no skb after us, we are the last_skb ! */
				4597	if (!skb1)
				4598	tp->ooo_last_skb = skb;
				4599
				4600	add_sack:
				4601	if (tcp_is_sack(tp))
				4602	tcp_sack_new_ofo_skb(sk, seq, end_seq);
				4603	end:
				4604	if (skb) {
				4605	tcp_grow_window(sk, skb);
				4606	skb_condense(skb);
				4607	skb_set_owner_r(skb, sk);
				4608	}
				4609	}
				4610
				4611	static int __must_check tcp_queue_rcv(struct sock sk, struct sk_buff skb, int hdrlen,
				4612	bool *fragstolen)
				4613	{
				4614	int eaten;
				4615	struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
				4616
				4617	__skb_pull(skb, hdrlen);
				4618	eaten = (tail &&
				4619	tcp_try_coalesce(sk, tail,
				4620	skb, fragstolen)) ? 1 : 0;
				4621	tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq);
				4622	if (!eaten) {
				4623	__skb_queue_tail(&sk->sk_receive_queue, skb);
				4624	skb_set_owner_r(skb, sk);
				4625	}
				4626	return eaten;
				4627	}
				4628
				4629	int tcp_send_rcvq(struct sock sk, struct msghdr msg, size_t size)
				4630	{
				4631	struct sk_buff *skb;
				4632	int err = -ENOMEM;
				4633	int data_len = 0;
				4634	bool fragstolen;
				4635
				4636	if (size == 0)
				4637	return 0;
				4638
				4639	if (size > PAGE_SIZE) {
				4640	int npages = min_t(size_t, size >> PAGE_SHIFT, MAX_SKB_FRAGS);
				4641
				4642	data_len = npages << PAGE_SHIFT;
				4643	size = data_len + (size & ~PAGE_MASK);
				4644	}
				4645	skb = alloc_skb_with_frags(size - data_len, data_len,
				4646	PAGE_ALLOC_COSTLY_ORDER,
				4647	&err, sk->sk_allocation);
				4648	if (!skb)
				4649	goto err;
				4650
				4651	skb_put(skb, size - data_len);
				4652	skb->data_len = data_len;
				4653	skb->len = size;
				4654
				4655	if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
				4656	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
				4657	goto err_free;
				4658	}
				4659
				4660	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
				4661	if (err)
				4662	goto err_free;
				4663
				4664	TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
				4665	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
				4666	TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
				4667
				4668	if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) {
				4669	WARN_ON_ONCE(fragstolen); /* should not happen */
				4670	__kfree_skb(skb);
				4671	}
				4672	return size;
				4673
				4674	err_free:
				4675	kfree_skb(skb);
				4676	err:
				4677	return err;
				4678
				4679	}
				4680
				4681	void tcp_data_ready(struct sock *sk)
				4682	{
				4683	const struct tcp_sock *tp = tcp_sk(sk);
				4684	int avail = tp->rcv_nxt - tp->copied_seq;
				4685
				4686	if (avail < sk->sk_rcvlowat && !sock_flag(sk, SOCK_DONE))
				4687	return;
				4688
				4689	sk->sk_data_ready(sk);
				4690	}
				4691
				4692	static void tcp_data_queue(struct sock sk, struct sk_buff skb)
				4693	{
				4694	struct tcp_sock *tp = tcp_sk(sk);
				4695	bool fragstolen;
				4696	int eaten;
				4697
				4698	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
				4699	__kfree_skb(skb);
				4700	return;
				4701	}
				4702	skb_dst_drop(skb);
				4703	__skb_pull(skb, tcp_hdr(skb)->doff * 4);
				4704
				4705	tcp_ecn_accept_cwr(sk, skb);
				4706
				4707	tp->rx_opt.dsack = 0;
				4708
				4709	/* Queue data for delivery to the user.
				4710	* Packets in sequence go to the receive queue.
				4711	* Out of sequence packets to the out_of_order_queue.
				4712	*/
				4713	if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
				4714	if (tcp_receive_window(tp) == 0) {
				4715	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
				4716	goto out_of_window;
				4717	}
				4718
				4719	/* Ok. In sequence. In window. */
				4720	queue_and_out:
				4721	if (skb_queue_len(&sk->sk_receive_queue) == 0)
				4722	sk_forced_mem_schedule(sk, skb->truesize);
				4723	else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
				4724	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
				4725	goto drop;
				4726	}
				4727
				4728	eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
				4729	if (skb->len)
				4730	tcp_event_data_recv(sk, skb);
				4731	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
				4732	tcp_fin(sk);
				4733
				4734	if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
				4735	tcp_ofo_queue(sk);
				4736
				4737	/* RFC5681. 4.2. SHOULD send immediate ACK, when
				4738	* gap in queue is filled.
				4739	*/
				4740	if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
				4741	inet_csk(sk)->icsk_ack.pending \|= ICSK_ACK_NOW;
				4742	}
				4743
				4744	if (tp->rx_opt.num_sacks)
				4745	tcp_sack_remove(tp);
				4746
				4747	tcp_fast_path_check(sk);
				4748
				4749	if (eaten > 0)
				4750	kfree_skb_partial(skb, fragstolen);
				4751	if (!sock_flag(sk, SOCK_DEAD))
				4752	tcp_data_ready(sk);
				4753	return;
				4754	}
				4755
				4756	if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
				4757	/* A retransmit, 2nd most common case. Force an immediate ack. */
				4758	NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
				4759	tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
				4760
				4761	out_of_window:
				4762	tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
				4763	inet_csk_schedule_ack(sk);
				4764	drop:
				4765	tcp_drop(sk, skb);
				4766	return;
				4767	}
				4768
				4769	/* Out of window. F.e. zero window probe. */
				4770	if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
				4771	goto out_of_window;
				4772
				4773	if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
				4774	/* Partial packet, seq < rcv_next < end_seq */
				4775	SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
				4776	tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
				4777	TCP_SKB_CB(skb)->end_seq);
				4778
				4779	tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
				4780
				4781	/* If window is closed, drop tail of packet. But after
				4782	* remembering D-SACK for its head made in previous line.
				4783	*/
				4784	if (!tcp_receive_window(tp)) {
				4785	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
				4786	goto out_of_window;
				4787	}
				4788	goto queue_and_out;
				4789	}
				4790
				4791	tcp_data_queue_ofo(sk, skb);
				4792	}
				4793
				4794	static struct sk_buff tcp_skb_next(struct sk_buff skb, struct sk_buff_head *list)
				4795	{
				4796	if (list)
				4797	return !skb_queue_is_last(list, skb) ? skb->next : NULL;
				4798
				4799	return skb_rb_next(skb);
				4800	}
				4801
				4802	static struct sk_buff tcp_collapse_one(struct sock sk, struct sk_buff *skb,
				4803	struct sk_buff_head *list,
				4804	struct rb_root *root)
				4805	{
				4806	struct sk_buff *next = tcp_skb_next(skb, list);
				4807
				4808	if (list)
				4809	__skb_unlink(skb, list);
				4810	else
				4811	rb_erase(&skb->rbnode, root);
				4812
				4813	__kfree_skb(skb);
				4814	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
				4815
				4816	return next;
				4817	}
				4818
				4819	/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */
				4820	void tcp_rbtree_insert(struct rb_root root, struct sk_buff skb)
				4821	{
				4822	struct rb_node **p = &root->rb_node;
				4823	struct rb_node *parent = NULL;
				4824	struct sk_buff *skb1;
				4825
				4826	while (*p) {
				4827	parent = *p;
				4828	skb1 = rb_to_skb(parent);
				4829	if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
				4830	p = &parent->rb_left;
				4831	else
				4832	p = &parent->rb_right;
				4833	}
				4834	rb_link_node(&skb->rbnode, parent, p);
				4835	rb_insert_color(&skb->rbnode, root);
				4836	}
				4837
				4838	/* Collapse contiguous sequence of skbs head..tail with
				4839	* sequence numbers start..end.
				4840	*
				4841	* If tail is NULL, this means until the end of the queue.
				4842	*
				4843	* Segments with FIN/SYN are not collapsed (only because this
				4844	* simplifies code)
				4845	*/
				4846	static void
				4847	tcp_collapse(struct sock sk, struct sk_buff_head list, struct rb_root *root,
				4848	struct sk_buff head, struct sk_buff tail, u32 start, u32 end)
				4849	{
				4850	struct sk_buff skb = head, n;
				4851	struct sk_buff_head tmp;
				4852	bool end_of_skbs;
				4853
				4854	/* First, check that queue is collapsible and find
				4855	* the point where collapsing can be useful.
				4856	*/
				4857	restart:
				4858	for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) {
				4859	n = tcp_skb_next(skb, list);
				4860
				4861	/* No new bits? It is possible on ofo queue. */
				4862	if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
				4863	skb = tcp_collapse_one(sk, skb, list, root);
				4864	if (!skb)
				4865	break;
				4866	goto restart;
				4867	}
				4868
				4869	/* The first skb to collapse is:
				4870	* - not SYN/FIN and
				4871	* - bloated or contains data before "start" or
				4872	* overlaps to the next one.
				4873	*/
				4874	if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN \| TCPHDR_FIN)) &&
				4875	(tcp_win_from_space(sk, skb->truesize) > skb->len \|\|
				4876	before(TCP_SKB_CB(skb)->seq, start))) {
				4877	end_of_skbs = false;
				4878	break;
				4879	}
				4880
				4881	if (n && n != tail &&
				4882	TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
				4883	end_of_skbs = false;
				4884	break;
				4885	}
				4886
				4887	/* Decided to skip this, advance start seq. */
				4888	start = TCP_SKB_CB(skb)->end_seq;
				4889	}
				4890	if (end_of_skbs \|\|
				4891	(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN \| TCPHDR_FIN)))
				4892	return;
				4893
				4894	__skb_queue_head_init(&tmp);
				4895
				4896	while (before(start, end)) {
				4897	int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
				4898	struct sk_buff *nskb;
				4899
				4900	nskb = alloc_skb(copy, GFP_ATOMIC);
				4901	if (!nskb)
				4902	break;
				4903
				4904	memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
				4905	#ifdef CONFIG_TLS_DEVICE
				4906	nskb->decrypted = skb->decrypted;
				4907	#endif
				4908	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
				4909	if (list)
				4910	__skb_queue_before(list, skb, nskb);
				4911	else
				4912	__skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */
				4913	skb_set_owner_r(nskb, sk);
				4914
				4915	/* Copy data, releasing collapsed skbs. */
				4916	while (copy > 0) {
				4917	int offset = start - TCP_SKB_CB(skb)->seq;
				4918	int size = TCP_SKB_CB(skb)->end_seq - start;
				4919
				4920	BUG_ON(offset < 0);
				4921	if (size > 0) {
				4922	size = min(copy, size);
				4923	if (skb_copy_bits(skb, offset, skb_put(nskb, size), size))
				4924	BUG();
				4925	TCP_SKB_CB(nskb)->end_seq += size;
				4926	copy -= size;
				4927	start += size;
				4928	}
				4929	if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
				4930	skb = tcp_collapse_one(sk, skb, list, root);
				4931	if (!skb \|\|
				4932	skb == tail \|\|
				4933	(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN \| TCPHDR_FIN)))
				4934	goto end;
				4935	#ifdef CONFIG_TLS_DEVICE
				4936	if (skb->decrypted != nskb->decrypted)
				4937	goto end;
				4938	#endif
				4939	}
				4940	}
				4941	}
				4942	end:
				4943	skb_queue_walk_safe(&tmp, skb, n)
				4944	tcp_rbtree_insert(root, skb);
				4945	}
				4946
				4947	/* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
				4948	* and tcp_collapse() them until all the queue is collapsed.
				4949	*/
				4950	static void tcp_collapse_ofo_queue(struct sock *sk)
				4951	{
				4952	struct tcp_sock *tp = tcp_sk(sk);
				4953	u32 range_truesize, sum_tiny = 0;
				4954	struct sk_buff skb, head;
				4955	u32 start, end;
				4956
				4957	skb = skb_rb_first(&tp->out_of_order_queue);
				4958	new_range:
				4959	if (!skb) {
				4960	tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue);
				4961	return;
				4962	}
				4963	start = TCP_SKB_CB(skb)->seq;
				4964	end = TCP_SKB_CB(skb)->end_seq;
				4965	range_truesize = skb->truesize;
				4966
				4967	for (head = skb;;) {
				4968	skb = skb_rb_next(skb);
				4969
				4970	/* Range is terminated when we see a gap or when
				4971	* we are at the queue end.
				4972	*/
				4973	if (!skb \|\|
				4974	after(TCP_SKB_CB(skb)->seq, end) \|\|
				4975	before(TCP_SKB_CB(skb)->end_seq, start)) {
				4976	/* Do not attempt collapsing tiny skbs */
				4977	if (range_truesize != head->truesize \|\|
				4978	end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) {
				4979	tcp_collapse(sk, NULL, &tp->out_of_order_queue,
				4980	head, skb, start, end);
				4981	} else {
				4982	sum_tiny += range_truesize;
				4983	if (sum_tiny > sk->sk_rcvbuf >> 3)
				4984	return;
				4985	}
				4986	goto new_range;
				4987	}
				4988
				4989	range_truesize += skb->truesize;
				4990	if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
				4991	start = TCP_SKB_CB(skb)->seq;
				4992	if (after(TCP_SKB_CB(skb)->end_seq, end))
				4993	end = TCP_SKB_CB(skb)->end_seq;
				4994	}
				4995	}
				4996
				4997	/*
				4998	* Clean the out-of-order queue to make room.
				4999	* We drop high sequences packets to :
				5000	* 1) Let a chance for holes to be filled.
				5001	* 2) not add too big latencies if thousands of packets sit there.
				5002	* (But if application shrinks SO_RCVBUF, we could still end up
				5003	* freeing whole queue here)
				5004	* 3) Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks.
				5005	*
				5006	* Return true if queue has shrunk.
				5007	*/
				5008	static bool tcp_prune_ofo_queue(struct sock *sk)
				5009	{
				5010	struct tcp_sock *tp = tcp_sk(sk);
				5011	struct rb_node node, prev;
				5012	int goal;
				5013
				5014	if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
				5015	return false;
				5016
				5017	NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
				5018	goal = sk->sk_rcvbuf >> 3;
				5019	node = &tp->ooo_last_skb->rbnode;
				5020	do {
				5021	prev = rb_prev(node);
				5022	rb_erase(node, &tp->out_of_order_queue);
				5023	goal -= rb_to_skb(node)->truesize;
				5024	tcp_drop(sk, rb_to_skb(node));
				5025	if (!prev \|\| goal <= 0) {
				5026	sk_mem_reclaim(sk);
				5027	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
				5028	!tcp_under_memory_pressure(sk))
				5029	break;
				5030	goal = sk->sk_rcvbuf >> 3;
				5031	}
				5032	node = prev;
				5033	} while (node);
				5034	tp->ooo_last_skb = rb_to_skb(prev);
				5035
				5036	/* Reset SACK state. A conforming SACK implementation will
				5037	* do the same at a timeout based retransmit. When a connection
				5038	* is in a sad state like this, we care only about integrity
				5039	* of the connection not performance.
				5040	*/
				5041	if (tp->rx_opt.sack_ok)
				5042	tcp_sack_reset(&tp->rx_opt);
				5043	return true;
				5044	}
				5045
				5046	/* Reduce allocated memory if we can, trying to get
				5047	* the socket within its memory limits again.
				5048	*
				5049	* Return less than zero if we should start dropping frames
				5050	* until the socket owning process reads some of the data
				5051	* to stabilize the situation.
				5052	*/
				5053	static int tcp_prune_queue(struct sock *sk)
				5054	{
				5055	struct tcp_sock *tp = tcp_sk(sk);
				5056
				5057	SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
				5058
				5059	NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);
				5060
				5061	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
				5062	tcp_clamp_window(sk);
				5063	else if (tcp_under_memory_pressure(sk))
				5064	tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
				5065
				5066	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
				5067	return 0;
				5068
				5069	tcp_collapse_ofo_queue(sk);
				5070	if (!skb_queue_empty(&sk->sk_receive_queue))
				5071	tcp_collapse(sk, &sk->sk_receive_queue, NULL,
				5072	skb_peek(&sk->sk_receive_queue),
				5073	NULL,
				5074	tp->copied_seq, tp->rcv_nxt);
				5075	sk_mem_reclaim(sk);
				5076
				5077	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
				5078	return 0;
				5079
				5080	/* Collapsing did not help, destructive actions follow.
				5081	* This must not ever occur. */
				5082
				5083	tcp_prune_ofo_queue(sk);
				5084
				5085	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
				5086	return 0;
				5087
				5088	/* If we are really being abused, tell the caller to silently
				5089	* drop receive data on the floor. It will get retransmitted
				5090	* and hopefully then we'll have sufficient space.
				5091	*/
				5092	NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED);
				5093
				5094	/* Massive buffer overcommit. */
				5095	tp->pred_flags = 0;
				5096	return -1;
				5097	}
				5098
				5099	static bool tcp_should_expand_sndbuf(const struct sock *sk)
				5100	{
				5101	const struct tcp_sock *tp = tcp_sk(sk);
				5102
				5103	/* If the user specified a specific send buffer setting, do
				5104	* not modify it.
				5105	*/
				5106	if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
				5107	return false;
				5108
				5109	/* If we are under global TCP memory pressure, do not expand. */
				5110	if (tcp_under_memory_pressure(sk))
				5111	return false;
				5112
				5113	/* If we are under soft global TCP memory pressure, do not expand. */
				5114	if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
				5115	return false;
				5116
				5117	/* If we filled the congestion window, do not expand. */
				5118	if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
				5119	return false;
				5120
				5121	return true;
				5122	}
				5123
				5124	/* When incoming ACK allowed to free some skb from write_queue,
				5125	* we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket
				5126	* on the exit from tcp input handler.
				5127	*
				5128	* PROBLEM: sndbuf expansion does not work well with largesend.
				5129	*/
				5130	static void tcp_new_space(struct sock *sk)
				5131	{
				5132	struct tcp_sock *tp = tcp_sk(sk);
				5133
				5134	if (tcp_should_expand_sndbuf(sk)) {
				5135	tcp_sndbuf_expand(sk);
				5136	tp->snd_cwnd_stamp = tcp_jiffies32;
				5137	}
				5138
				5139	sk->sk_write_space(sk);
				5140	}
				5141
				5142	static void tcp_check_space(struct sock *sk)
				5143	{
				5144	if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
				5145	sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
				5146	/* pairs with tcp_poll() */
				5147	smp_mb();
				5148	if (sk->sk_socket &&
				5149	test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
				5150	tcp_new_space(sk);
				5151	if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
				5152	tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
				5153	}
				5154	}
				5155	}
				5156
				/* Send-side housekeeping: push pending frames first, then run the
				 * write-space check.
				 */
				static inline void tcp_data_snd_check(struct sock *sk)
				{
					tcp_push_pending_frames(sk);

					tcp_check_space(sk);
				}
				5162
				5163	/*
				5164	* Check if sending an ack is needed.
				5165	*/
				5166	static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
				5167	{
				5168	struct tcp_sock *tp = tcp_sk(sk);
				5169	unsigned long rtt, delay;
				5170
				5171	/* More than one full frame received... */
				5172	if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
				5173	/* ... and right edge of window advances far enough.
				5174	* (tcp_recvmsg() will send ACK otherwise).
				5175	* If application uses SO_RCVLOWAT, we want send ack now if
				5176	* we have not received enough bytes to satisfy the condition.
				5177	*/
				5178	(tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat \|\|
				5179	__tcp_select_window(sk) >= tp->rcv_wnd)) \|\|
				5180	/* We ACK each frame or... */
				5181	tcp_in_quickack_mode(sk) \|\|
				5182	/* Protocol state mandates a one-time immediate ACK */
				5183	inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOW) {
				5184	send_now:
				5185	tcp_send_ack(sk);
				5186	return;
				5187	}
				5188
				5189	if (!ofo_possible \|\| RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
				5190	tcp_send_delayed_ack(sk);
				5191	return;
				5192	}
				5193
				5194	if (!tcp_is_sack(tp) \|\|
				5195	tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr)
				5196	goto send_now;
				5197
				5198	if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) {
				5199	tp->compressed_ack_rcv_nxt = tp->rcv_nxt;
				5200	if (tp->compressed_ack > TCP_FASTRETRANS_THRESH)
				5201	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
				5202	tp->compressed_ack - TCP_FASTRETRANS_THRESH);
				5203	tp->compressed_ack = 0;
				5204	}
				5205
				5206	if (++tp->compressed_ack <= TCP_FASTRETRANS_THRESH)
				5207	goto send_now;
				5208
				5209	if (hrtimer_is_queued(&tp->compressed_ack_timer))
				5210	return;
				5211
				5212	/* compress ack timer : 5 % of rtt, but no more than tcp_comp_sack_delay_ns */
				5213
				5214	rtt = tp->rcv_rtt_est.rtt_us;
				5215	if (tp->srtt_us && tp->srtt_us < rtt)
				5216	rtt = tp->srtt_us;
				5217
				5218	delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns,
				5219	rtt * (NSEC_PER_USEC >> 3)/20);
				5220	sock_hold(sk);
				5221	hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay),
				5222	HRTIMER_MODE_REL_PINNED_SOFT);
				5223	}
				5224
				/* Run the ACK decision logic only when an ACK is scheduled; otherwise
				 * we sent a data segment already and there is nothing to do.
				 */
				static inline void tcp_ack_snd_check(struct sock *sk)
				{
					if (inet_csk_ack_scheduled(sk))
						__tcp_ack_snd_check(sk, 1);
				}
				5233
				5234	/*
				5235	* This routine is only called when we have urgent data
				5236	* signaled. Its the 'slow' part of tcp_urg. It could be
				5237	* moved inline now as tcp_urg is only called from one
				5238	* place. We handle URGent data wrong. We have to - as
				5239	* BSD still doesn't use the correction from RFC961.
				5240	* For 1003.1g we should support a new option TCP_STDURG to permit
				5241	* either form (or just set the sysctl tcp_stdurg).
				5242	*/
				5243
				5244	static void tcp_check_urg(struct sock sk, const struct tcphdr th)
				5245	{
				5246	struct tcp_sock *tp = tcp_sk(sk);
				5247	u32 ptr = ntohs(th->urg_ptr);
				5248
				5249	if (ptr && !sock_net(sk)->ipv4.sysctl_tcp_stdurg)
				5250	ptr--;
				5251	ptr += ntohl(th->seq);
				5252
				5253	/* Ignore urgent data that we've already seen and read. */
				5254	if (after(tp->copied_seq, ptr))
				5255	return;
				5256
				5257	/* Do not replay urg ptr.
				5258	*
				5259	* NOTE: interesting situation not covered by specs.
				5260	* Misbehaving sender may send urg ptr, pointing to segment,
				5261	* which we already have in ofo queue. We are not able to fetch
				5262	* such data and will stay in TCP_URG_NOTYET until will be eaten
				5263	* by recvmsg(). Seems, we are not obliged to handle such wicked
				5264	* situations. But it is worth to think about possibility of some
				5265	* DoSes using some hypothetical application level deadlock.
				5266	*/
				5267	if (before(ptr, tp->rcv_nxt))
				5268	return;
				5269
				5270	/* Do we already have a newer (or duplicate) urgent pointer? */
				5271	if (tp->urg_data && !after(ptr, tp->urg_seq))
				5272	return;
				5273
				5274	/* Tell the world about our new urgent pointer. */
				5275	sk_send_sigurg(sk);
				5276
				5277	/* We may be adding urgent data when the last byte read was
				5278	* urgent. To do this requires some care. We cannot just ignore
				5279	* tp->copied_seq since we would read the last urgent byte again
				5280	* as data, nor can we alter copied_seq until this data arrives
				5281	* or we break the semantics of SIOCATMARK (and thus sockatmark())
				5282	*
				5283	* NOTE. Double Dutch. Rendering to plain English: author of comment
				5284	* above did something sort of send("A", MSG_OOB); send("B", MSG_OOB);
				5285	* and expect that both A and B disappear from stream. This is _wrong_.
				5286	* Though this happens in BSD with high probability, this is occasional.
				5287	* Any application relying on this is buggy. Note also, that fix "works"
				5288	* only in this artificial test. Insert some normal data between A and B and we will
				5289	* decline of BSD again. Verdict: it is better to remove to trap
				5290	* buggy users.
				5291	*/
				5292	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
				5293	!sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
				5294	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
				5295	tp->copied_seq++;
				5296	if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
				5297	__skb_unlink(skb, &sk->sk_receive_queue);
				5298	__kfree_skb(skb);
				5299	}
				5300	}
				5301
				5302	tp->urg_data = TCP_URG_NOTYET;
				5303	tp->urg_seq = ptr;
				5304
				5305	/* Disable header prediction. */
				5306	tp->pred_flags = 0;
				5307	}
				5308
				5309	/* This is the 'fast' part of urgent handling. */
				5310	static void tcp_urg(struct sock sk, struct sk_buff skb, const struct tcphdr *th)
				5311	{
				5312	struct tcp_sock *tp = tcp_sk(sk);
				5313
				5314	/* Check if we get a new urgent pointer - normally not. */
				5315	if (th->urg)
				5316	tcp_check_urg(sk, th);
				5317
				5318	/* Do we wait for any urgent data? - normally not... */
				5319	if (tp->urg_data == TCP_URG_NOTYET) {
				5320	u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) -
				5321	th->syn;
				5322
				5323	/* Is the urgent pointer pointing into this packet? */
				5324	if (ptr < skb->len) {
				5325	u8 tmp;
				5326	if (skb_copy_bits(skb, ptr, &tmp, 1))
				5327	BUG();
				5328	tp->urg_data = TCP_URG_VALID \| tmp;
				5329	if (!sock_flag(sk, SOCK_DEAD))
				5330	sk->sk_data_ready(sk);
				5331	}
				5332	}
				5333	}
				5334
				5335	/* Accept RST for rcv_nxt - 1 after a FIN.
				5336	* When tcp connections are abruptly terminated from Mac OSX (via ^C), a
				5337	* FIN is sent followed by a RST packet. The RST is sent with the same
				5338	* sequence number as the FIN, and thus according to RFC 5961 a challenge
				5339	* ACK should be sent. However, Mac OSX rate limits replies to challenge
				5340	* ACKs on the closed socket. In addition middleboxes can drop either the
				5341	* challenge ACK or a subsequent RST.
				5342	*/
				5343	static bool tcp_reset_check(const struct sock sk, const struct sk_buff skb)
				5344	{
				5345	struct tcp_sock *tp = tcp_sk(sk);
				5346
				5347	return unlikely(TCP_SKB_CB(skb)->seq == (tp->rcv_nxt - 1) &&
				5348	(1 << sk->sk_state) & (TCPF_CLOSE_WAIT \| TCPF_LAST_ACK \|
				5349	TCPF_CLOSING));
				5350	}
				5351
				5352	/* Does PAWS and seqno based validation of an incoming segment, flags will
				5353	* play significant role here.
				5354	*/
				5355	static bool tcp_validate_incoming(struct sock sk, struct sk_buff skb,
				5356	const struct tcphdr *th, int syn_inerr)
				5357	{
				5358	struct tcp_sock *tp = tcp_sk(sk);
				5359	bool rst_seq_match = false;
				5360
				5361	/* RFC1323: H1. Apply PAWS check first. */
				5362	if (tcp_fast_parse_options(sock_net(sk), skb, th, tp) &&
				5363	tp->rx_opt.saw_tstamp &&
				5364	tcp_paws_discard(sk, skb)) {
				5365	if (!th->rst) {
				5366	NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
				5367	if (!tcp_oow_rate_limited(sock_net(sk), skb,
				5368	LINUX_MIB_TCPACKSKIPPEDPAWS,
				5369	&tp->last_oow_ack_time))
				5370	tcp_send_dupack(sk, skb);
				5371	goto discard;
				5372	}
				5373	/* Reset is accepted even if it did not pass PAWS. */
				5374	}
				5375
				5376	/* Step 1: check sequence number */
				5377	if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
				5378	/* RFC793, page 37: "In all states except SYN-SENT, all reset
				5379	* (RST) segments are validated by checking their SEQ-fields."
				5380	* And page 69: "If an incoming segment is not acceptable,
				5381	* an acknowledgment should be sent in reply (unless the RST
				5382	* bit is set, if so drop the segment and return)".
				5383	*/
				5384	if (!th->rst) {
				5385	if (th->syn)
				5386	goto syn_challenge;
				5387	if (!tcp_oow_rate_limited(sock_net(sk), skb,
				5388	LINUX_MIB_TCPACKSKIPPEDSEQ,
				5389	&tp->last_oow_ack_time))
				5390	tcp_send_dupack(sk, skb);
				5391	} else if (tcp_reset_check(sk, skb)) {
				5392	tcp_reset(sk);
				5393	}
				5394	goto discard;
				5395	}
				5396
				5397	/* Step 2: check RST bit */
				5398	if (th->rst) {
				5399	/* RFC 5961 3.2 (extend to match against (RCV.NXT - 1) after a
				5400	* FIN and SACK too if available):
				5401	* If seq num matches RCV.NXT or (RCV.NXT - 1) after a FIN, or
				5402	* the right-most SACK block,
				5403	* then
				5404	* RESET the connection
				5405	* else
				5406	* Send a challenge ACK
				5407	*/
				5408	if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt \|\|
				5409	tcp_reset_check(sk, skb)) {
				5410	rst_seq_match = true;
				5411	} else if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) {
				5412	struct tcp_sack_block *sp = &tp->selective_acks[0];
				5413	int max_sack = sp[0].end_seq;
				5414	int this_sack;
				5415
				5416	for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;
				5417	++this_sack) {
				5418	max_sack = after(sp[this_sack].end_seq,
				5419	max_sack) ?
				5420	sp[this_sack].end_seq : max_sack;
				5421	}
				5422
				5423	if (TCP_SKB_CB(skb)->seq == max_sack)
				5424	rst_seq_match = true;
				5425	}
				5426
				5427	if (rst_seq_match)
				5428	tcp_reset(sk);
				5429	else {
				5430	/* Disable TFO if RST is out-of-order
				5431	* and no data has been received
				5432	* for current active TFO socket
				5433	*/
				5434	if (tp->syn_fastopen && !tp->data_segs_in &&
				5435	sk->sk_state == TCP_ESTABLISHED)
				5436	tcp_fastopen_active_disable(sk);
				5437	tcp_send_challenge_ack(sk, skb);
				5438	}
				5439	goto discard;
				5440	}
				5441
				5442	/* step 3: check security and precedence [ignored] */
				5443
				5444	/* step 4: Check for a SYN
				5445	* RFC 5961 4.2 : Send a challenge ack
				5446	*/
				5447	if (th->syn) {
				5448	syn_challenge:
				5449	if (syn_inerr)
				5450	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
				5451	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
				5452	tcp_send_challenge_ack(sk, skb);
				5453	goto discard;
				5454	}
				5455
				5456	return true;
				5457
				5458	discard:
				5459	tcp_drop(sk, skb);
				5460	return false;
				5461	}
				5462
				5463	/*
				5464	* TCP receive function for the ESTABLISHED state.
				5465	*
				5466	* It is split into a fast path and a slow path. The fast path is
				5467	* disabled when:
				5468	* - A zero window was announced from us - zero window probing
				5469	* is only handled properly in the slow path.
				5470	* - Out of order segments arrived.
				5471	* - Urgent data is expected.
				5472	* - There is no buffer space left
				5473	* - Unexpected TCP flags/window values/header lengths are received
				5474	* (detected by checking the TCP header against pred_flags)
				5475	* - Data is sent in both directions. Fast path only supports pure senders
				5476	* or pure receivers (this means either the sequence number or the ack
				5477	* value must stay constant)
				5478	* - Unexpected TCP option.
				5479	*
				5480	* When these conditions are not satisfied it drops into a standard
				5481	* receive procedure patterned after RFC793 to handle all cases.
				5482	* The first three cases are guaranteed by proper pred_flags setting,
				5483	* the rest is checked inline. Fast processing is turned on in
				5484	* tcp_data_queue when everything is OK.
				5485	*/
				5486	void tcp_rcv_established(struct sock sk, struct sk_buff skb)
				5487	{
				5488	const struct tcphdr th = (const struct tcphdr )skb->data;
				5489	struct tcp_sock *tp = tcp_sk(sk);
				5490	unsigned int len = skb->len;
				5491
				5492	/avoid tcp receive path error accelerate/
				5493	hwnat_magic_tag_set_zero(skb);
				5494	/* TCP congestion window tracking */
				5495	trace_tcp_probe(sk, skb);
				5496
				5497	tcp_mstamp_refresh(tp);
				5498	if (unlikely(!sk->sk_rx_dst))
				5499	inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
				5500	/*
				5501	* Header prediction.
				5502	* The code loosely follows the one in the famous
				5503	* "30 instruction TCP receive" Van Jacobson mail.
				5504	*
				5505	* Van's trick is to deposit buffers into socket queue
				5506	* on a device interrupt, to call tcp_recv function
				5507	* on the receive process context and checksum and copy
				5508	* the buffer to user space. smart...
				5509	*
				5510	* Our current scheme is not silly either but we take the
				5511	* extra cost of the net_bh soft interrupt processing...
				5512	* We do checksum and copy also but from device to kernel.
				5513	*/
				5514
				5515	tp->rx_opt.saw_tstamp = 0;
				5516
				5517	/* pred_flags is 0xS?10 << 16 + snd_wnd
				5518	* if header_prediction is to be made
				5519	* 'S' will always be tp->tcp_header_len >> 2
				5520	* '?' will be 0 for the fast path, otherwise pred_flags is 0 to
				5521	* turn it off (when there are holes in the receive
				5522	* space for instance)
				5523	* PSH flag is ignored.
				5524	*/
				5525
				5526	if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
				5527	TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
				5528	!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
				5529	int tcp_header_len = tp->tcp_header_len;
				5530
				5531	/* Timestamp header prediction: tcp_header_len
				5532	* is automatically equal to th->doff*4 due to pred_flags
				5533	* match.
				5534	*/
				5535
				5536	/* Check timestamp */
				5537	if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
				5538	/* No? Slow path! */
				5539	if (!tcp_parse_aligned_timestamp(tp, th))
				5540	goto slow_path;
				5541
				5542	/* If PAWS failed, check it more carefully in slow path */
				5543	if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
				5544	goto slow_path;
				5545
				5546	/* DO NOT update ts_recent here, if checksum fails
				5547	* and timestamp was corrupted part, it will result
				5548	* in a hung connection since we will drop all
				5549	* future packets due to the PAWS test.
				5550	*/
				5551	}
				5552
				5553	if (len <= tcp_header_len) {
				5554	/* Bulk data transfer: sender */
				5555	if (len == tcp_header_len) {
				5556	/* Predicted packet is in window by definition.
				5557	* seq == rcv_nxt and rcv_wup <= rcv_nxt.
				5558	* Hence, check seq<=rcv_wup reduces to:
				5559	*/
				5560	if (tcp_header_len ==
				5561	(sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
				5562	tp->rcv_nxt == tp->rcv_wup)
				5563	tcp_store_ts_recent(tp);
				5564
				5565	/* We know that such packets are checksummed
				5566	* on entry.
				5567	*/
				5568	tcp_ack(sk, skb, 0);
				5569	__kfree_skb(skb);
				5570	tcp_data_snd_check(sk);
				5571	/* When receiving pure ack in fast path, update
				5572	* last ts ecr directly instead of calling
				5573	* tcp_rcv_rtt_measure_ts()
				5574	*/
				5575	tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;
				5576	return;
				5577	} else { /* Header too small */
				5578	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
				5579	goto discard;
				5580	}
				5581	} else {
				5582	int eaten = 0;
				5583	bool fragstolen = false;
				5584
				5585	if (tcp_checksum_complete(skb))
				5586	goto csum_error;
				5587
				5588	if ((int)skb->truesize > sk->sk_forward_alloc)
				5589	goto step5;
				5590
				5591	/* Predicted packet is in window by definition.
				5592	* seq == rcv_nxt and rcv_wup <= rcv_nxt.
				5593	* Hence, check seq<=rcv_wup reduces to:
				5594	*/
				5595	if (tcp_header_len ==
				5596	(sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
				5597	tp->rcv_nxt == tp->rcv_wup)
				5598	tcp_store_ts_recent(tp);
				5599
				5600	tcp_rcv_rtt_measure_ts(sk, skb);
				5601
				5602	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
				5603
				5604	/* Bulk data transfer: receiver */
				5605	eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
				5606	&fragstolen);
				5607
				5608	tcp_event_data_recv(sk, skb);
				5609
				5610	if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
				5611	/* Well, only one small jumplet in fast path... */
				5612	tcp_ack(sk, skb, FLAG_DATA);
				5613	tcp_data_snd_check(sk);
				5614	if (!inet_csk_ack_scheduled(sk))
				5615	goto no_ack;
				5616	}
				5617
				5618	__tcp_ack_snd_check(sk, 0);
				5619	no_ack:
				5620	if (eaten)
				5621	kfree_skb_partial(skb, fragstolen);
				5622	tcp_data_ready(sk);
				5623	return;
				5624	}
				5625	}
				5626
				5627	slow_path:
				5628	if (len < (th->doff << 2) \|\| tcp_checksum_complete(skb))
				5629	goto csum_error;
				5630
				5631	if (!th->ack && !th->rst && !th->syn)
				5632	goto discard;
				5633
				5634	/*
				5635	* Standard slow path.
				5636	*/
				5637
				5638	if (!tcp_validate_incoming(sk, skb, th, 1))
				5639	return;
				5640
				5641	step5:
				5642	if (tcp_ack(sk, skb, FLAG_SLOWPATH \| FLAG_UPDATE_TS_RECENT) < 0)
				5643	goto discard;
				5644
				5645	tcp_rcv_rtt_measure_ts(sk, skb);
				5646
				5647	/* Process urgent data. */
				5648	tcp_urg(sk, skb, th);
				5649
				5650	/* step 7: process the segment text */
				5651	tcp_data_queue(sk, skb);
				5652
				5653	tcp_data_snd_check(sk);
				5654	tcp_ack_snd_check(sk);
				5655	return;
				5656
				5657	csum_error:
				5658	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
				5659	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
				5660
				5661	discard:
				5662	tcp_drop(sk, skb);
				5663	}
				5664	EXPORT_SYMBOL(tcp_rcv_established);
				5665
				5666	void tcp_finish_connect(struct sock sk, struct sk_buff skb)
				5667	{
				5668	struct tcp_sock *tp = tcp_sk(sk);
				5669	struct inet_connection_sock *icsk = inet_csk(sk);
				5670
				5671	tcp_set_state(sk, TCP_ESTABLISHED);
				5672	icsk->icsk_ack.lrcvtime = tcp_jiffies32;
				5673
				5674	if (skb) {
				5675	icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
				5676	security_inet_conn_established(sk, skb);
				5677	sk_mark_napi_id(sk, skb);
				5678	}
				5679
				5680	tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);
				5681
				5682	/* Prevent spurious tcp_cwnd_restart() on first data
				5683	* packet.
				5684	*/
				5685	tp->lsndtime = tcp_jiffies32;
				5686
				5687	if (sock_flag(sk, SOCK_KEEPOPEN))
				5688	inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
				5689
				5690	if (!tp->rx_opt.snd_wscale)
				5691	__tcp_fast_path_on(tp, tp->snd_wnd);
				5692	else
				5693	tp->pred_flags = 0;
				5694	}
				5695
				5696	static bool tcp_rcv_fastopen_synack(struct sock sk, struct sk_buff synack,
				5697	struct tcp_fastopen_cookie *cookie)
				5698	{
				5699	struct tcp_sock *tp = tcp_sk(sk);
				5700	struct sk_buff *data = tp->syn_data ? tcp_rtx_queue_head(sk) : NULL;
				5701	u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
				5702	bool syn_drop = false;
				5703
				5704	if (mss == tp->rx_opt.user_mss) {
				5705	struct tcp_options_received opt;
				5706
				5707	/* Get original SYNACK MSS value if user MSS sets mss_clamp */
				5708	tcp_clear_options(&opt);
				5709	opt.user_mss = opt.mss_clamp = 0;
				5710	tcp_parse_options(sock_net(sk), synack, &opt, 0, NULL);
				5711	mss = opt.mss_clamp;
				5712	}
				5713
				5714	if (!tp->syn_fastopen) {
				5715	/* Ignore an unsolicited cookie */
				5716	cookie->len = -1;
				5717	} else if (tp->total_retrans) {
				5718	/* SYN timed out and the SYN-ACK neither has a cookie nor
				5719	* acknowledges data. Presumably the remote received only
				5720	* the retransmitted (regular) SYNs: either the original
				5721	* SYN-data or the corresponding SYN-ACK was dropped.
				5722	*/
				5723	syn_drop = (cookie->len < 0 && data);
				5724	} else if (cookie->len < 0 && !tp->syn_data) {
				5725	/* We requested a cookie but didn't get it. If we did not use
				5726	* the (old) exp opt format then try so next time (try_exp=1).
				5727	* Otherwise we go back to use the RFC7413 opt (try_exp=2).
				5728	*/
				5729	try_exp = tp->syn_fastopen_exp ? 2 : 1;
				5730	}
				5731
				5732	tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
				5733
				5734	if (data) { /* Retransmit unacked data in SYN */
				5735	skb_rbtree_walk_from(data) {
				5736	if (__tcp_retransmit_skb(sk, data, 1))
				5737	break;
				5738	}
				5739	tcp_rearm_rto(sk);
				5740	NET_INC_STATS(sock_net(sk),
				5741	LINUX_MIB_TCPFASTOPENACTIVEFAIL);
				5742	return true;
				5743	}
				5744	tp->syn_data_acked = tp->syn_data;
				5745	if (tp->syn_data_acked) {
				5746	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
				5747	/* SYN-data is counted as two separate packets in tcp_ack() */
				5748	if (tp->delivered > 1)
				5749	--tp->delivered;
				5750	}
				5751
				5752	tcp_fastopen_add_skb(sk, synack);
				5753
				5754	return false;
				5755	}
				5756
				5757	static void smc_check_reset_syn(struct tcp_sock *tp)
				5758	{
				5759	#if IS_ENABLED(CONFIG_SMC)
				5760	if (static_branch_unlikely(&tcp_have_smc)) {
				5761	if (tp->syn_smc && !tp->rx_opt.smc_ok)
				5762	tp->syn_smc = 0;
				5763	}
				5764	#endif
				5765	}
				5766
				5767	static int tcp_rcv_synsent_state_process(struct sock sk, struct sk_buff skb,
				5768	const struct tcphdr *th)
				5769	{
				5770	struct inet_connection_sock *icsk = inet_csk(sk);
				5771	struct tcp_sock *tp = tcp_sk(sk);
				5772	struct tcp_fastopen_cookie foc = { .len = -1 };
				5773	int saved_clamp = tp->rx_opt.mss_clamp;
				5774	bool fastopen_fail;
				5775
				5776	tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc);
				5777	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
				5778	tp->rx_opt.rcv_tsecr -= tp->tsoffset;
				5779
				5780	if (th->ack) {
				5781	/* rfc793:
				5782	* "If the state is SYN-SENT then
				5783	* first check the ACK bit
				5784	* If the ACK bit is set
				5785	* If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
				5786	* a reset (unless the RST bit is set, if so drop
				5787	* the segment and return)"
				5788	*/
				5789	if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) \|\|
				5790	after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt))
				5791	goto reset_and_undo;
				5792
				5793	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
				5794	!between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
				5795	tcp_time_stamp(tp))) {
				5796	NET_INC_STATS(sock_net(sk),
				5797	LINUX_MIB_PAWSACTIVEREJECTED);
				5798	goto reset_and_undo;
				5799	}
				5800
				5801	/* Now ACK is acceptable.
				5802	*
				5803	* "If the RST bit is set
				5804	* If the ACK was acceptable then signal the user "error:
				5805	* connection reset", drop the segment, enter CLOSED state,
				5806	* delete TCB, and return."
				5807	*/
				5808
				5809	if (th->rst) {
				5810	tcp_reset(sk);
				5811	goto discard;
				5812	}
				5813
				5814	/* rfc793:
				5815	* "fifth, if neither of the SYN or RST bits is set then
				5816	* drop the segment and return."
				5817	*
				5818	* See note below!
				5819	* --ANK(990513)
				5820	*/
				5821	if (!th->syn)
				5822	goto discard_and_undo;
				5823
				5824	/* rfc793:
				5825	* "If the SYN bit is on ...
				5826	* are acceptable then ...
				5827	* (our SYN has been ACKed), change the connection
				5828	* state to ESTABLISHED..."
				5829	*/
				5830
				5831	tcp_ecn_rcv_synack(tp, th);
				5832
				5833	tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
				5834	tcp_ack(sk, skb, FLAG_SLOWPATH);
				5835
				5836	/* Ok.. it's good. Set up sequence numbers and
				5837	* move to established.
				5838	*/
				5839	WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
				5840	tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
				5841
				5842	/* RFC1323: The window in SYN & SYN/ACK segments is
				5843	* never scaled.
				5844	*/
				5845	tp->snd_wnd = ntohs(th->window);
				5846
				5847	if (!tp->rx_opt.wscale_ok) {
				5848	tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
				5849	tp->window_clamp = min(tp->window_clamp, 65535U);
				5850	}
				5851
				5852	if (tp->rx_opt.saw_tstamp) {
				5853	tp->rx_opt.tstamp_ok = 1;
				5854	tp->tcp_header_len =
				5855	sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
				5856	tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
				5857	tcp_store_ts_recent(tp);
				5858	} else {
				5859	tp->tcp_header_len = sizeof(struct tcphdr);
				5860	}
				5861
				5862	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
				5863	tcp_initialize_rcv_mss(sk);
				5864
				5865	/* Remember, tcp_poll() does not lock socket!
				5866	* Change state from SYN-SENT only after copied_seq
				5867	* is initialized. */
				5868	tp->copied_seq = tp->rcv_nxt;
				5869
				5870	smc_check_reset_syn(tp);
				5871
				5872	smp_mb();
				5873
				5874	tcp_finish_connect(sk, skb);
				5875
				5876	fastopen_fail = (tp->syn_fastopen \|\| tp->syn_data) &&
				5877	tcp_rcv_fastopen_synack(sk, skb, &foc);
				5878
				5879	if (!sock_flag(sk, SOCK_DEAD)) {
				5880	sk->sk_state_change(sk);
				5881	sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
				5882	}
				5883	if (fastopen_fail)
				5884	return -1;
				5885	if (sk->sk_write_pending \|\|
				5886	icsk->icsk_accept_queue.rskq_defer_accept \|\|
				5887	icsk->icsk_ack.pingpong) {
				5888	/* Save one ACK. Data will be ready after
				5889	* several ticks, if write_pending is set.
				5890	*
				5891	* It may be deleted, but with this feature tcpdumps
				5892	* look so _wonderfully_ clever, that I was not able
				5893	* to stand against the temptation 8) --ANK
				5894	*/
				5895	inet_csk_schedule_ack(sk);
				5896	tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
				5897	inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
				5898	TCP_DELACK_MAX, TCP_RTO_MAX);
				5899
				5900	discard:
				5901	tcp_drop(sk, skb);
				5902	return 0;
				5903	} else {
				5904	tcp_send_ack(sk);
				5905	}
				5906	return -1;
				5907	}
				5908
				5909	/* No ACK in the segment */
				5910
				5911	if (th->rst) {
				5912	/* rfc793:
				5913	* "If the RST bit is set
				5914	*
				5915	* Otherwise (no ACK) drop the segment and return."
				5916	*/
				5917
				5918	goto discard_and_undo;
				5919	}
				5920
				5921	/* PAWS check. */
				5922	if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp &&
				5923	tcp_paws_reject(&tp->rx_opt, 0))
				5924	goto discard_and_undo;
				5925
				5926	if (th->syn) {
				5927	/* We see SYN without ACK. It is attempt of
				5928	* simultaneous connect with crossed SYNs.
				5929	* Particularly, it can be connect to self.
				5930	*/
				5931	tcp_set_state(sk, TCP_SYN_RECV);
				5932
				5933	if (tp->rx_opt.saw_tstamp) {
				5934	tp->rx_opt.tstamp_ok = 1;
				5935	tcp_store_ts_recent(tp);
				5936	tp->tcp_header_len =
				5937	sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
				5938	} else {
				5939	tp->tcp_header_len = sizeof(struct tcphdr);
				5940	}
				5941
				5942	WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
				5943	tp->copied_seq = tp->rcv_nxt;
				5944	tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
				5945
				5946	/* RFC1323: The window in SYN & SYN/ACK segments is
				5947	* never scaled.
				5948	*/
				5949	tp->snd_wnd = ntohs(th->window);
				5950	tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
				5951	tp->max_window = tp->snd_wnd;
				5952
				5953	tcp_ecn_rcv_syn(tp, th);
				5954
				5955	tcp_mtup_init(sk);
				5956	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
				5957	tcp_initialize_rcv_mss(sk);
				5958
				5959	tcp_send_synack(sk);
				5960	#if 0
				5961	/* Note, we could accept data and URG from this segment.
				5962	* There are no obstacles to make this (except that we must
				5963	* either change tcp_recvmsg() to prevent it from returning data
				5964	* before 3WHS completes per RFC793, or employ TCP Fast Open).
				5965	*
				5966	* However, if we ignore data in ACKless segments sometimes,
				5967	* we have no reasons to accept it sometimes.
				5968	* Also, seems the code doing it in step6 of tcp_rcv_state_process
				5969	* is not flawless. So, discard packet for sanity.
				5970	* Uncomment this return to process the data.
				5971	*/
				5972	return -1;
				5973	#else
				5974	goto discard;
				5975	#endif
				5976	}
				5977	/* "fifth, if neither of the SYN or RST bits is set then
				5978	* drop the segment and return."
				5979	*/
				5980
				5981	discard_and_undo:
				5982	tcp_clear_options(&tp->rx_opt);
				5983	tp->rx_opt.mss_clamp = saved_clamp;
				5984	goto discard;
				5985
				5986	reset_and_undo:
				5987	tcp_clear_options(&tp->rx_opt);
				5988	tp->rx_opt.mss_clamp = saved_clamp;
				5989	return 1;
				5990	}
				5991
				5992	/*
				5993	* This function implements the receiving procedure of RFC 793 for
				5994	* all states except ESTABLISHED and TIME_WAIT.
				5995	* It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
				5996	* address independent.
				5997	*/
				5998
				5999	int tcp_rcv_state_process(struct sock sk, struct sk_buff skb)
				6000	{
				6001	struct tcp_sock *tp = tcp_sk(sk);
				6002	struct inet_connection_sock *icsk = inet_csk(sk);
				6003	const struct tcphdr *th = tcp_hdr(skb);
				6004	struct request_sock *req;
				6005	int queued = 0;
				6006	bool acceptable;
				6007
				6008	switch (sk->sk_state) {
				6009	case TCP_CLOSE:
				6010	goto discard;
				6011
				6012	case TCP_LISTEN:
				6013	if (th->ack)
				6014	return 1;
				6015
				6016	if (th->rst)
				6017	goto discard;
				6018
				6019	if (th->syn) {
				6020	if (th->fin)
				6021	goto discard;
				6022	/* It is possible that we process SYN packets from backlog,
				6023	* so we need to make sure to disable BH and RCU right there.
				6024	*/
				6025	rcu_read_lock();
				6026	local_bh_disable();
				6027	acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0;
				6028	local_bh_enable();
				6029	rcu_read_unlock();
				6030
				6031	if (!acceptable)
				6032	return 1;
				6033	consume_skb(skb);
				6034	return 0;
				6035	}
				6036	goto discard;
				6037
				6038	case TCP_SYN_SENT:
				6039	tp->rx_opt.saw_tstamp = 0;
				6040	tcp_mstamp_refresh(tp);
				6041	queued = tcp_rcv_synsent_state_process(sk, skb, th);
				6042	if (queued >= 0)
				6043	return queued;
				6044
				6045	/* Do step6 onward by hand. */
				6046	tcp_urg(sk, skb, th);
				6047	__kfree_skb(skb);
				6048	tcp_data_snd_check(sk);
				6049	return 0;
				6050	}
				6051
				6052	tcp_mstamp_refresh(tp);
				6053	tp->rx_opt.saw_tstamp = 0;
				6054	req = tp->fastopen_rsk;
				6055	if (req) {
				6056	bool req_stolen;
				6057
				6058	WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
				6059	sk->sk_state != TCP_FIN_WAIT1);
				6060
				6061	if (!tcp_check_req(sk, skb, req, true, &req_stolen))
				6062	goto discard;
				6063	}
				6064
				6065	if (!th->ack && !th->rst && !th->syn)
				6066	goto discard;
				6067
				6068	if (!tcp_validate_incoming(sk, skb, th, 0))
				6069	return 0;
				6070
				6071	/* step 5: check the ACK field */
				6072	acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH \|
				6073	FLAG_UPDATE_TS_RECENT \|
				6074	FLAG_NO_CHALLENGE_ACK) > 0;
				6075
				6076	if (!acceptable) {
				6077	if (sk->sk_state == TCP_SYN_RECV)
				6078	return 1; /* send one RST */
				6079	tcp_send_challenge_ack(sk, skb);
				6080	goto discard;
				6081	}
				6082	switch (sk->sk_state) {
				6083	case TCP_SYN_RECV:
				6084	tp->delivered++; /* SYN-ACK delivery isn't tracked in tcp_ack */
				6085	if (!tp->srtt_us)
				6086	tcp_synack_rtt_meas(sk, req);
				6087
				6088	/* Once we leave TCP_SYN_RECV, we no longer need req
				6089	* so release it.
				6090	*/
				6091	if (req) {
				6092	inet_csk(sk)->icsk_retransmits = 0;
				6093	reqsk_fastopen_remove(sk, req, false);
				6094	/* Re-arm the timer because data may have been sent out.
				6095	* This is similar to the regular data transmission case
				6096	* when new data has just been ack'ed.
				6097	*
				6098	* (TFO) - we could try to be more aggressive and
				6099	* retransmitting any data sooner based on when they
				6100	* are sent out.
				6101	*/
				6102	tcp_rearm_rto(sk);
				6103	} else {
				6104	tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
				6105	tp->copied_seq = tp->rcv_nxt;
				6106	}
				6107	smp_mb();
				6108	tcp_set_state(sk, TCP_ESTABLISHED);
				6109	sk->sk_state_change(sk);
				6110
				6111	/* Note, that this wakeup is only for marginal crossed SYN case.
				6112	* Passively open sockets are not waked up, because
				6113	* sk->sk_sleep == NULL and sk->sk_socket == NULL.
				6114	*/
				6115	if (sk->sk_socket)
				6116	sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
				6117
				6118	tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
				6119	tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
				6120	tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
				6121
				6122	if (tp->rx_opt.tstamp_ok)
				6123	tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
				6124
				6125	if (!inet_csk(sk)->icsk_ca_ops->cong_control)
				6126	tcp_update_pacing_rate(sk);
				6127
				6128	/* Prevent spurious tcp_cwnd_restart() on first data packet */
				6129	tp->lsndtime = tcp_jiffies32;
				6130
				6131	tcp_initialize_rcv_mss(sk);
				6132	tcp_fast_path_on(tp);
				6133	break;
				6134
				6135	case TCP_FIN_WAIT1: {
				6136	int tmo;
				6137
				6138	/* If we enter the TCP_FIN_WAIT1 state and we are a
				6139	* Fast Open socket and this is the first acceptable
				6140	* ACK we have received, this would have acknowledged
				6141	* our SYNACK so stop the SYNACK timer.
				6142	*/
				6143	if (req) {
				6144	/* We no longer need the request sock. */
				6145	reqsk_fastopen_remove(sk, req, false);
				6146	tcp_rearm_rto(sk);
				6147	}
				6148	if (tp->snd_una != tp->write_seq)
				6149	break;
				6150
				6151	tcp_set_state(sk, TCP_FIN_WAIT2);
				6152	sk->sk_shutdown \|= SEND_SHUTDOWN;
				6153
				6154	sk_dst_confirm(sk);
				6155
				6156	if (!sock_flag(sk, SOCK_DEAD)) {
				6157	/* Wake up lingering close() */
				6158	sk->sk_state_change(sk);
				6159	break;
				6160	}
				6161
				6162	if (tp->linger2 < 0) {
				6163	tcp_done(sk);
				6164	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
				6165	return 1;
				6166	}
				6167	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
				6168	after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
				6169	/* Receive out of order FIN after close() */
				6170	if (tp->syn_fastopen && th->fin)
				6171	tcp_fastopen_active_disable(sk);
				6172	tcp_done(sk);
				6173	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
				6174	return 1;
				6175	}
				6176
				6177	tmo = tcp_fin_time(sk);
				6178	if (tmo > TCP_TIMEWAIT_LEN) {
				6179	inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
				6180	} else if (th->fin \|\| sock_owned_by_user(sk)) {
				6181	/* Bad case. We could lose such FIN otherwise.
				6182	* It is not a big problem, but it looks confusing
				6183	* and not so rare event. We still can lose it now,
				6184	* if it spins in bh_lock_sock(), but it is really
				6185	* marginal case.
				6186	*/
				6187	inet_csk_reset_keepalive_timer(sk, tmo);
				6188	} else {
				6189	tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
				6190	goto discard;
				6191	}
				6192	break;
				6193	}
				6194
				6195	case TCP_CLOSING:
				6196	if (tp->snd_una == tp->write_seq) {
				6197	tcp_time_wait(sk, TCP_TIME_WAIT, 0);
				6198	goto discard;
				6199	}
				6200	break;
				6201
				6202	case TCP_LAST_ACK:
				6203	if (tp->snd_una == tp->write_seq) {
				6204	tcp_update_metrics(sk);
				6205	tcp_done(sk);
				6206	goto discard;
				6207	}
				6208	break;
				6209	}
				6210
				6211	/* step 6: check the URG bit */
				6212	tcp_urg(sk, skb, th);
				6213
				6214	/* step 7: process the segment text */
				6215	switch (sk->sk_state) {
				6216	case TCP_CLOSE_WAIT:
				6217	case TCP_CLOSING:
				6218	case TCP_LAST_ACK:
				6219	if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
				6220	break;
				6221	/* fall through */
				6222	case TCP_FIN_WAIT1:
				6223	case TCP_FIN_WAIT2:
				6224	/* RFC 793 says to queue data in these states,
				6225	* RFC 1122 says we MUST send a reset.
				6226	* BSD 4.4 also does reset.
				6227	*/
				6228	if (sk->sk_shutdown & RCV_SHUTDOWN) {
				6229	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
				6230	after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
				6231	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
				6232	tcp_reset(sk);
				6233	return 1;
				6234	}
				6235	}
				6236	/* Fall through */
				6237	case TCP_ESTABLISHED:
				6238	tcp_data_queue(sk, skb);
				6239	queued = 1;
				6240	break;
				6241	}
				6242
				6243	/* tcp_data could move socket to TIME-WAIT */
				6244	if (sk->sk_state != TCP_CLOSE) {
				6245	tcp_data_snd_check(sk);
				6246	tcp_ack_snd_check(sk);
				6247	}
				6248
				6249	if (!queued) {
				6250	discard:
				6251	tcp_drop(sk, skb);
				6252	}
				6253	return 0;
				6254	}
				6255	EXPORT_SYMBOL(tcp_rcv_state_process);
				6256
				6257	static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
				6258	{
				6259	struct inet_request_sock *ireq = inet_rsk(req);
				6260
				6261	if (family == AF_INET)
				6262	net_dbg_ratelimited("drop open request from %pI4/%u\n",
				6263	&ireq->ir_rmt_addr, port);
				6264	#if IS_ENABLED(CONFIG_IPV6)
				6265	else if (family == AF_INET6)
				6266	net_dbg_ratelimited("drop open request from %pI6/%u\n",
				6267	&ireq->ir_v6_rmt_addr, port);
				6268	#endif
				6269	}
				6270
				6271	/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
				6272	*
				6273	* If we receive a SYN packet with these bits set, it means a
				6274	* network is playing bad games with TOS bits. In order to
				6275	* avoid possible false congestion notifications, we disable
				6276	* TCP ECN negotiation.
				6277	*
				6278	* Exception: tcp_ca wants ECN. This is required for DCTCP
				6279	* congestion control: Linux DCTCP asserts ECT on all packets,
				6280	* including SYN, which is most optimal solution; however,
				6281	* others, such as FreeBSD do not.
				6282	*/
				6283	static void tcp_ecn_create_request(struct request_sock *req,
				6284	const struct sk_buff *skb,
				6285	const struct sock *listen_sk,
				6286	const struct dst_entry *dst)
				6287	{
				6288	const struct tcphdr *th = tcp_hdr(skb);
				6289	const struct net *net = sock_net(listen_sk);
				6290	bool th_ecn = th->ece && th->cwr;
				6291	bool ect, ecn_ok;
				6292	u32 ecn_ok_dst;
				6293
				6294	if (!th_ecn)
				6295	return;
				6296
				6297	ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
				6298	ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK);
				6299	ecn_ok = net->ipv4.sysctl_tcp_ecn \|\| ecn_ok_dst;
				6300
				6301	if ((!ect && ecn_ok) \|\| tcp_ca_needs_ecn(listen_sk) \|\|
				6302	(ecn_ok_dst & DST_FEATURE_ECN_CA) \|\|
				6303	tcp_bpf_ca_needs_ecn((struct sock *)req))
				6304	inet_rsk(req)->ecn_ok = 1;
				6305	}
				6306
				6307	static void tcp_openreq_init(struct request_sock *req,
				6308	const struct tcp_options_received *rx_opt,
				6309	struct sk_buff skb, const struct sock sk)
				6310	{
				6311	struct inet_request_sock *ireq = inet_rsk(req);
				6312
				6313	req->rsk_rcv_wnd = 0; /* So that tcp_send_synack() knows! */
				6314	req->cookie_ts = 0;
				6315	tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
				6316	tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
				6317	tcp_rsk(req)->snt_synack = tcp_clock_us();
				6318	tcp_rsk(req)->last_oow_ack_time = 0;
				6319	req->mss = rx_opt->mss_clamp;
				6320	req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
				6321	ireq->tstamp_ok = rx_opt->tstamp_ok;
				6322	ireq->sack_ok = rx_opt->sack_ok;
				6323	ireq->snd_wscale = rx_opt->snd_wscale;
				6324	ireq->wscale_ok = rx_opt->wscale_ok;
				6325	ireq->acked = 0;
				6326	ireq->ecn_ok = 0;
				6327	ireq->ir_rmt_port = tcp_hdr(skb)->source;
				6328	ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
				6329	ireq->ir_mark = inet_request_mark(sk, skb);
				6330	#if IS_ENABLED(CONFIG_SMC)
				6331	ireq->smc_ok = rx_opt->smc_ok;
				6332	#endif
				6333	}
				6334
				6335	struct request_sock inet_reqsk_alloc(const struct request_sock_ops ops,
				6336	struct sock *sk_listener,
				6337	bool attach_listener)
				6338	{
				6339	struct request_sock *req = reqsk_alloc(ops, sk_listener,
				6340	attach_listener);
				6341
				6342	if (req) {
				6343	struct inet_request_sock *ireq = inet_rsk(req);
				6344
				6345	ireq->ireq_opt = NULL;
				6346	#if IS_ENABLED(CONFIG_IPV6)
				6347	ireq->pktopts = NULL;
				6348	#endif
				6349	atomic64_set(&ireq->ir_cookie, 0);
				6350	ireq->ireq_state = TCP_NEW_SYN_RECV;
				6351	write_pnet(&ireq->ireq_net, sock_net(sk_listener));
				6352	ireq->ireq_family = sk_listener->sk_family;
				6353	}
				6354
				6355	return req;
				6356	}
				6357	EXPORT_SYMBOL(inet_reqsk_alloc);
				6358
				6359	/*
				6360	* Return true if a syncookie should be sent
				6361	*/
				6362	static bool tcp_syn_flood_action(const struct sock *sk,
				6363	const struct sk_buff *skb,
				6364	const char *proto)
				6365	{
				6366	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
				6367	const char *msg = "Dropping request";
				6368	bool want_cookie = false;
				6369	struct net *net = sock_net(sk);
				6370
				6371	#ifdef CONFIG_SYN_COOKIES
				6372	if (net->ipv4.sysctl_tcp_syncookies) {
				6373	msg = "Sending cookies";
				6374	want_cookie = true;
				6375	__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
				6376	} else
				6377	#endif
				6378	__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
				6379
				6380	if (!queue->synflood_warned &&
				6381	net->ipv4.sysctl_tcp_syncookies != 2 &&
				6382	xchg(&queue->synflood_warned, 1) == 0)
				6383	net_info_ratelimited("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
				6384	proto, ntohs(tcp_hdr(skb)->dest), msg);
				6385
				6386	return want_cookie;
				6387	}
				6388
				6389	static void tcp_reqsk_record_syn(const struct sock *sk,
				6390	struct request_sock *req,
				6391	const struct sk_buff *skb)
				6392	{
				6393	if (tcp_sk(sk)->save_syn) {
				6394	u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb);
				6395	u32 *copy;
				6396
				6397	copy = kmalloc(len + sizeof(u32), GFP_ATOMIC);
				6398	if (copy) {
				6399	copy[0] = len;
				6400	memcpy(&copy[1], skb_network_header(skb), len);
				6401	req->saved_syn = copy;
				6402	}
				6403	}
				6404	}
				6405
				6406	int tcp_conn_request(struct request_sock_ops *rsk_ops,
				6407	const struct tcp_request_sock_ops *af_ops,
				6408	struct sock sk, struct sk_buff skb)
				6409	{
				6410	struct tcp_fastopen_cookie foc = { .len = -1 };
				6411	__u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
				6412	struct tcp_options_received tmp_opt;
				6413	struct tcp_sock *tp = tcp_sk(sk);
				6414	struct net *net = sock_net(sk);
				6415	struct sock *fastopen_sk = NULL;
				6416	struct request_sock *req;
				6417	bool want_cookie = false;
				6418	struct dst_entry *dst;
				6419	struct flowi fl;
				6420
				6421	/* TW buckets are converted to open requests without
				6422	* limitations, they conserve resources and peer is
				6423	* evidently real one.
				6424	*/
				6425	if ((net->ipv4.sysctl_tcp_syncookies == 2 \|\|
				6426	inet_csk_reqsk_queue_is_full(sk)) && !isn) {
				6427	want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
				6428	if (!want_cookie)
				6429	goto drop;
				6430	}
				6431
				6432	if (sk_acceptq_is_full(sk)) {
				6433	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
				6434	goto drop;
				6435	}
				6436
				6437	req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie);
				6438	if (!req)
				6439	goto drop;
				6440
				6441	tcp_rsk(req)->af_specific = af_ops;
				6442	tcp_rsk(req)->ts_off = 0;
				6443
				6444	tcp_clear_options(&tmp_opt);
				6445	tmp_opt.mss_clamp = af_ops->mss_clamp;
				6446	tmp_opt.user_mss = tp->rx_opt.user_mss;
				6447	tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0,
				6448	want_cookie ? NULL : &foc);
				6449
				6450	if (want_cookie && !tmp_opt.saw_tstamp)
				6451	tcp_clear_options(&tmp_opt);
				6452
				6453	if (IS_ENABLED(CONFIG_SMC) && want_cookie)
				6454	tmp_opt.smc_ok = 0;
				6455
				6456	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
				6457	tcp_openreq_init(req, &tmp_opt, skb, sk);
				6458	inet_rsk(req)->no_srccheck = inet_sk(sk)->transparent;
				6459
				6460	/* Note: tcp_v6_init_req() might override ir_iif for link locals */
				6461	inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);
				6462
				6463	af_ops->init_req(req, sk, skb);
				6464
				6465	if (security_inet_conn_request(sk, skb, req))
				6466	goto drop_and_free;
				6467
				6468	if (tmp_opt.tstamp_ok)
				6469	tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb);
				6470
				6471	dst = af_ops->route_req(sk, &fl, req);
				6472	if (!dst)
				6473	goto drop_and_free;
				6474
				6475	if (!want_cookie && !isn) {
				6476	/* Kill the following clause, if you dislike this way. */
				6477	if (!net->ipv4.sysctl_tcp_syncookies &&
				6478	(net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
				6479	(net->ipv4.sysctl_max_syn_backlog >> 2)) &&
				6480	!tcp_peer_is_proven(req, dst)) {
				6481	/* Without syncookies last quarter of
				6482	* backlog is filled with destinations,
				6483	* proven to be alive.
				6484	* It means that we continue to communicate
				6485	* to destinations, already remembered
				6486	* to the moment of synflood.
				6487	*/
				6488	pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
				6489	rsk_ops->family);
				6490	goto drop_and_release;
				6491	}
				6492
				6493	isn = af_ops->init_seq(skb);
				6494	}
				6495
				6496	tcp_ecn_create_request(req, skb, sk, dst);
				6497
				6498	if (want_cookie) {
				6499	isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
				6500	req->cookie_ts = tmp_opt.tstamp_ok;
				6501	if (!tmp_opt.tstamp_ok)
				6502	inet_rsk(req)->ecn_ok = 0;
				6503	}
				6504
				6505	tcp_rsk(req)->snt_isn = isn;
				6506	tcp_rsk(req)->txhash = net_tx_rndhash();
				6507	tcp_openreq_init_rwin(req, sk, dst);
				6508	sk_rx_queue_set(req_to_sk(req), skb);
				6509	if (!want_cookie) {
				6510	tcp_reqsk_record_syn(sk, req, skb);
				6511	fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
				6512	}
				6513	if (fastopen_sk) {
				6514	af_ops->send_synack(fastopen_sk, dst, &fl, req,
				6515	&foc, TCP_SYNACK_FASTOPEN);
				6516	/* Add the child socket directly into the accept queue */
				6517	if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) {
				6518	reqsk_fastopen_remove(fastopen_sk, req, false);
				6519	bh_unlock_sock(fastopen_sk);
				6520	sock_put(fastopen_sk);
				6521	reqsk_put(req);
				6522	goto drop;
				6523	}
				6524	sk->sk_data_ready(sk);
				6525	bh_unlock_sock(fastopen_sk);
				6526	sock_put(fastopen_sk);
				6527	} else {
				6528	tcp_rsk(req)->tfo_listener = false;
				6529	if (!want_cookie)
				6530	inet_csk_reqsk_queue_hash_add(sk, req,
				6531	tcp_timeout_init((struct sock *)req));
				6532	af_ops->send_synack(sk, dst, &fl, req, &foc,
				6533	!want_cookie ? TCP_SYNACK_NORMAL :
				6534	TCP_SYNACK_COOKIE);
				6535	if (want_cookie) {
				6536	reqsk_free(req);
				6537	return 0;
				6538	}
				6539	}
				6540	reqsk_put(req);
				6541	return 0;
				6542
				6543	drop_and_release:
				6544	dst_release(dst);
				6545	drop_and_free:
				6546	reqsk_free(req);
				6547	drop:
				6548	tcp_listendrop(sk);
				6549	return 0;
				6550	}
				6551	EXPORT_SYMBOL(tcp_conn_request);