Blame - ap/os/linux/linux-3.4.x/net/ipv4/tcp_input.c - R306

blob: 924af66bca94d16d20365d13e33391f456403a03 [file] [log] [blame]

yuezonghe	824eb0c	2024-06-27 02:32:26 -0700	[diff] [blame]	1	/*
				2	* INET An implementation of the TCP/IP protocol suite for the LINUX
				3	* operating system. INET is implemented using the BSD Socket
				4	* interface as the means of communication with the user level.
				5	*
				6	* Implementation of the Transmission Control Protocol(TCP).
				7	*
				8	* Authors: Ross Biro
				9	* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
				10	* Mark Evans, <evansmp@uhura.aston.ac.uk>
				11	* Corey Minyard <wf-rch!minyard@relay.EU.net>
				12	* Florian La Roche, <flla@stud.uni-sb.de>
				13	* Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
				14	* Linus Torvalds, <torvalds@cs.helsinki.fi>
				15	* Alan Cox, <gw4pts@gw4pts.ampr.org>
				16	* Matthew Dillon, <dillon@apollo.west.oic.com>
				17	* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
				18	* Jorge Cwik, <jorge@laser.satlink.net>
				19	*/
				20
				21	/*
				22	* Changes:
				23	* Pedro Roque : Fast Retransmit/Recovery.
				24	* Two receive queues.
				25	* Retransmit queue handled by TCP.
				26	* Better retransmit timer handling.
				27	* New congestion avoidance.
				28	* Header prediction.
				29	* Variable renaming.
				30	*
				31	* Eric : Fast Retransmit.
				32	* Randy Scott : MSS option defines.
				33	* Eric Schenk : Fixes to slow start algorithm.
				34	* Eric Schenk : Yet another double ACK bug.
				35	* Eric Schenk : Delayed ACK bug fixes.
				36	* Eric Schenk : Floyd style fast retrans war avoidance.
				37	* David S. Miller : Don't allow zero congestion window.
				38	* Eric Schenk : Fix retransmitter so that it sends
				39	* next packet on ack of previous packet.
				40	* Andi Kleen : Moved open_request checking here
				41	* and process RSTs for open_requests.
				42	* Andi Kleen : Better prune_queue, and other fixes.
				43	* Andrey Savochkin: Fix RTT measurements in the presence of
				44	* timestamps.
				45	* Andrey Savochkin: Check sequence numbers correctly when
				46	* removing SACKs due to in sequence incoming
				47	* data segments.
				48	* Andi Kleen: Make sure we never ack data there is not
				49	* enough room for. Also make this condition
				50	* a fatal error if it might still happen.
				51	* Andi Kleen: Add tcp_measure_rcv_mss to make
				52	* connections with MSS<min(MTU,ann. MSS)
				53	* work without delayed acks.
				54	* Andi Kleen: Process packets with PSH set in the
				55	* fast path.
				56	* J Hadi Salim: ECN support
				57	* Andrei Gurtov,
				58	* Pasi Sarolahti,
				59	* Panu Kuhlberg: Experimental audit of TCP (re)transmission
				60	* engine. Lots of bugs are found.
				61	* Pasi Sarolahti: F-RTO for dealing with spurious RTOs
				62	*/
				63
				64	#define pr_fmt(fmt) "TCP: " fmt
				65
				66	#include <linux/mm.h>
				67	#include <linux/slab.h>
				68	#include <linux/module.h>
				69	#include <linux/sysctl.h>
				70	#include <linux/kernel.h>
				71	#include <net/dst.h>
				72	#include <net/tcp.h>
				73	#include <net/inet_common.h>
				74	#include <linux/ipsec.h>
				75	#include <asm/unaligned.h>
				76	#include <net/netdma.h>
				77	#include <net/SI/errno_track.h>
				78	#include <net/SI/sock_track.h>
				79
				80
					/* File-scope TCP tunables. The initializers below are the compile-time
					 * defaults; variables without an initializer start at 0.
					 * NOTE(review): presumably wired to sysctl entries elsewhere - the
					 * registration is not visible in this file.
					 */
				81	int sysctl_tcp_timestamps __read_mostly = 1;
				82	int sysctl_tcp_window_scaling __read_mostly = 1;
				83	int sysctl_tcp_sack __read_mostly = 1;
				84	int sysctl_tcp_fack __read_mostly = 1;
				85	int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
				86	EXPORT_SYMBOL(sysctl_tcp_reordering);
				87	int sysctl_tcp_ecn __read_mostly = 2;
				88	EXPORT_SYMBOL(sysctl_tcp_ecn);
				89	int sysctl_tcp_dsack __read_mostly = 1;
				90	int sysctl_tcp_app_win __read_mostly = 31; /* used as a right-shift count on the full receive space */
				91	int sysctl_tcp_adv_win_scale __read_mostly = 1;
				92	EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
				93
				94	/* rfc5961 challenge ack rate limiting */
				95	int sysctl_tcp_challenge_ack_limit = 100;
				96
				97	int sysctl_tcp_stdurg __read_mostly;
				98	int sysctl_tcp_rfc1337 __read_mostly;
				99	int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
				100	int sysctl_tcp_frto __read_mostly = 2;
				101	int sysctl_tcp_frto_response __read_mostly;
				102	int sysctl_tcp_nometrics_save __read_mostly; /* non-zero: tcp_update_metrics() returns without saving */
				103
				104	int sysctl_tcp_thin_dupack __read_mostly;
				105
				106	int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; /* gates receive-buffer growth in tcp_rcv_space_adjust() */
				107	int sysctl_tcp_abc __read_mostly;
				108
				/* Event bits describing what an incoming ACK/segment did. */
				#define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
				#define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
				#define FLAG_DATA_ACKED		0x04 /* This ACK acknowledged new data.		*/
				#define FLAG_RETRANS_DATA_ACKED	0x08 /* "" "" some of which was retransmitted.	*/
				#define FLAG_SYN_ACKED		0x10 /* This ACK acknowledged SYN.		*/
				#define FLAG_DATA_SACKED	0x20 /* New SACK.				*/
				#define FLAG_ECE		0x40 /* ECE in this ACK				*/
				#define FLAG_SLOWPATH		0x100 /* Do not skip RFC checks for window update.*/
				#define FLAG_ONLY_ORIG_SACKED	0x200 /* SACKs only non-rexmit sent before RTO */
				#define FLAG_SND_UNA_ADVANCED	0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
				#define FLAG_DSACKING_ACK	0x800 /* SACK blocks contained D-SACK info */
				#define FLAG_NONHEAD_RETRANS_ACKED	0x1000 /* Non-head rexmitted data was ACKed */
				#define FLAG_SACK_RENEGING	0x2000 /* snd_una advanced to a sacked seq */
				#define FLAG_UPDATE_TS_RECENT	0x4000 /* tcp_replace_ts_recent() */

				/* Composite masks built from the single-bit flags above. */
				#define FLAG_ACKED		(FLAG_DATA_ACKED|FLAG_SYN_ACKED)
				#define FLAG_NOT_DUP		(FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
				#define FLAG_CA_ALERT		(FLAG_DATA_SACKED|FLAG_ECE)
				#define FLAG_FORWARD_PROGRESS	(FLAG_ACKED|FLAG_DATA_SACKED)
				#define FLAG_ANY_PROGRESS	(FLAG_FORWARD_PROGRESS|FLAG_SND_UNA_ADVANCED)

				/* TCP header flag-word masks. */
				#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
				#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
				132
				133	/* Adapt the MSS value used to make delayed ack decision to the
				134	* real world.
				135	*/
				136	static void tcp_measure_rcv_mss(struct sock sk, const struct sk_buff skb)
				137	{
				138	struct inet_connection_sock *icsk = inet_csk(sk);
				139	const unsigned int lss = icsk->icsk_ack.last_seg_size;
				140	unsigned int len;
				141
				142	icsk->icsk_ack.last_seg_size = 0;
				143
				144	/* skb->len may jitter because of SACKs, even if peer
				145	* sends good full-sized frames.
				146	*/
				147	len = skb_shinfo(skb)->gso_size ? : skb->len;
				148	if (len >= icsk->icsk_ack.rcv_mss) {
				149	icsk->icsk_ack.rcv_mss = len;
				150	} else {
				151	/* Otherwise, we make more careful check taking into account,
				152	* that SACKs block is variable.
				153	*
				154	* "len" is invariant segment length, including TCP header.
				155	*/
				156	len += skb->data - skb_transport_header(skb);
				157	if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) \|\|
				158	/* If PSH is not set, packet should be
				159	* full sized, provided peer TCP is not badly broken.
				160	* This observation (if it is correct 8)) allows
				161	* to handle super-low mtu links fairly.
				162	*/
				163	(len >= TCP_MIN_MSS + sizeof(struct tcphdr) &&
				164	!(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) {
				165	/* Subtract also invariant (if peer is RFC compliant),
				166	* tcp header plus fixed timestamp option length.
				167	* Resulting "len" is MSS free of SACK jitter.
				168	*/
				169	len -= tcp_sk(sk)->tcp_header_len;
				170	icsk->icsk_ack.last_seg_size = len;
				171	if (len == lss) {
				172	icsk->icsk_ack.rcv_mss = len;
				173	return;
				174	}
				175	}
				176	if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)
				177	icsk->icsk_ack.pending \|= ICSK_ACK_PUSHED2;
				178	icsk->icsk_ack.pending \|= ICSK_ACK_PUSHED;
				179	}
				180	}
				181
				182	static void tcp_incr_quickack(struct sock *sk)
				183	{
				184	struct inet_connection_sock *icsk = inet_csk(sk);
				185	unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
				186
				187	if (quickacks == 0)
				188	quickacks = 2;
				189	if (quickacks > icsk->icsk_ack.quick)
				190	icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
				191	}
				192
				193	static void tcp_enter_quickack_mode(struct sock *sk)
				194	{
				195	struct inet_connection_sock *icsk = inet_csk(sk);
				196	tcp_incr_quickack(sk);
				197	icsk->icsk_ack.pingpong = 0;
				198	icsk->icsk_ack.ato = TCP_ATO_MIN;
				199	}
				200
				201	/* Send ACKs quickly, if "quick" count is not exhausted
				202	* and the session is not interactive.
				203	*/
				204
				205	static inline int tcp_in_quickack_mode(const struct sock *sk)
				206	{
				207	const struct inet_connection_sock *icsk = inet_csk(sk);
				208	return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
				209	}
				210
				211	static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp)
				212	{
				213	if (tp->ecn_flags & TCP_ECN_OK)
				214	tp->ecn_flags \|= TCP_ECN_QUEUE_CWR;
				215	}
				216
				217	static inline void TCP_ECN_accept_cwr(struct tcp_sock tp, const struct sk_buff skb)
				218	{
				219	if (tcp_hdr(skb)->cwr)
				220	tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
				221	}
				222
				223	static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp)
				224	{
				225	tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
				226	}
				227
				228	static inline void TCP_ECN_check_ce(struct tcp_sock tp, const struct sk_buff skb)
				229	{
				230	if (!(tp->ecn_flags & TCP_ECN_OK))
				231	return;
				232
				233	switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
				234	case INET_ECN_NOT_ECT:
				235	/* Funny extension: if ECT is not set on a segment,
				236	* and we already seen ECT on a previous segment,
				237	* it is probably a retransmit.
				238	*/
				239	if (tp->ecn_flags & TCP_ECN_SEEN)
				240	tcp_enter_quickack_mode((struct sock *)tp);
				241	break;
				242	case INET_ECN_CE:
				243	tp->ecn_flags \|= TCP_ECN_DEMAND_CWR;
				244	/* fallinto */
				245	default:
				246	tp->ecn_flags \|= TCP_ECN_SEEN;
				247	}
				248	}
				249
				250	static inline void TCP_ECN_rcv_synack(struct tcp_sock tp, const struct tcphdr th)
				251	{
				252	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece \|\| th->cwr))
				253	tp->ecn_flags &= ~TCP_ECN_OK;
				254	}
				255
				256	static inline void TCP_ECN_rcv_syn(struct tcp_sock tp, const struct tcphdr th)
				257	{
				258	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece \|\| !th->cwr))
				259	tp->ecn_flags &= ~TCP_ECN_OK;
				260	}
				261
				262	static inline int TCP_ECN_rcv_ecn_echo(const struct tcp_sock tp, const struct tcphdr th)
				263	{
				264	if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
				265	return 1;
				266	return 0;
				267	}
				268
				269	/* Buffer size and advertised window tuning.
				270	*
				271	* 1. Tuning sk->sk_sndbuf, when connection enters established state.
				272	*/
				273
				274	static void tcp_fixup_sndbuf(struct sock *sk)
				275	{
				276	int sndmem = SKB_TRUESIZE(tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER);
				277
				278	sndmem *= TCP_INIT_CWND;
				279	if (sk->sk_sndbuf < sndmem)
				280	sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
				281	}
				282
				283	/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
				284	*
				285	* All tcp_full_space() is split to two parts: "network" buffer, allocated
				286	* forward and advertised in receiver window (tp->rcv_wnd) and
				287	* "application buffer", required to isolate scheduling/application
				288	* latencies from network.
				289	* window_clamp is maximal advertised window. It can be less than
				290	* tcp_full_space(), in this case tcp_full_space() - window_clamp
				291	* is reserved for "application" buffer. The less window_clamp is
				292	* the smoother our behaviour from viewpoint of network, but the lower
				293	* throughput and the higher sensitivity of the connection to losses. 8)
				294	*
				295	* rcv_ssthresh is more strict window_clamp used at "slow start"
				296	* phase to predict further behaviour of this connection.
				297	* It is used for two goals:
				298	* - to enforce header prediction at sender, even when application
				299	* requires some significant "application buffer". It is check #1.
				300	* - to prevent pruning of receive queue because of misprediction
				301	* of receiver window. Check #2.
				302	*
				303	* The scheme does not work when sender sends good segments opening
				304	* window and then starts to feed us spaghetti. But it should work
				305	* in common situations. Otherwise, we have to rely on queue collapsing.
				306	*/
				307
				308	/* Slow part of check#2. */
				309	static int __tcp_grow_window(const struct sock sk, const struct sk_buff skb)
				310	{
				311	struct tcp_sock *tp = tcp_sk(sk);
				312	/* Optimize this! */
				313	int truesize = tcp_win_from_space(skb->truesize) >> 1;
				314	int window = tcp_win_from_space(sysctl_tcp_rmem[2]) >> 1;
				315
				316	while (tp->rcv_ssthresh <= window) {
				317	if (truesize <= skb->len)
				318	return 2 * inet_csk(sk)->icsk_ack.rcv_mss;
				319
				320	truesize >>= 1;
				321	window >>= 1;
				322	}
				323	return 0;
				324	}
				325
				326	static void tcp_grow_window(struct sock sk, const struct sk_buff skb)
				327	{
				328	struct tcp_sock *tp = tcp_sk(sk);
				329
				330	/* Check #1 */
				331	if (tp->rcv_ssthresh < tp->window_clamp &&
				332	(int)tp->rcv_ssthresh < tcp_space(sk) &&
				333	!sk_under_memory_pressure(sk)) {
				334	int incr;
				335
				336	/* Check #2. Increase window, if skb with such overhead
				337	* will fit to rcvbuf in future.
				338	*/
				339	if (tcp_win_from_space(skb->truesize) <= skb->len)
				340	incr = 2 * tp->advmss;
				341	else
				342	incr = __tcp_grow_window(sk, skb);
				343
				344	if (incr) {
				345	incr = max_t(int, incr, 2 * skb->len);
				346	tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
				347	tp->window_clamp);
				348	inet_csk(sk)->icsk_ack.quick \|= 1;
				349	}
				350	}
				351	}
				352
				353	/* 3. Tuning rcvbuf, when connection enters established state. */
				354
				355	static void tcp_fixup_rcvbuf(struct sock *sk)
				356	{
				357	u32 mss = tcp_sk(sk)->advmss;
				358	u32 icwnd = TCP_DEFAULT_INIT_RCVWND;
				359	int rcvmem;
				360
				361	/* Limit to 10 segments if mss <= 1460,
				362	* or 14600/mss segments, with a minimum of two segments.
				363	*/
				364	if (mss > 1460)
				365	icwnd = max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2);
				366
				367	rcvmem = SKB_TRUESIZE(mss + MAX_TCP_HEADER);
				368	while (tcp_win_from_space(rcvmem) < mss)
				369	rcvmem += 128;
				370
				371	rcvmem *= icwnd;
				372
				373	if (sk->sk_rcvbuf < rcvmem)
				374	sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]);
				375	}
				376
				377	/* 4. Try to fixup all. It is made immediately after connection enters
				378	* established state.
				379	*/
				380	static void tcp_init_buffer_space(struct sock *sk)
				381	{
				382	struct tcp_sock *tp = tcp_sk(sk);
				383	int maxwin;
				384
				385	if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
				386	tcp_fixup_rcvbuf(sk);
				387	if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
				388	tcp_fixup_sndbuf(sk);
				389
				390	tp->rcvq_space.space = tp->rcv_wnd;
				391
				392	maxwin = tcp_full_space(sk);
				393
				394	if (tp->window_clamp >= maxwin) {
				395	tp->window_clamp = maxwin;
				396
				397	if (sysctl_tcp_app_win && maxwin > 4 * tp->advmss)
				398	tp->window_clamp = max(maxwin -
				399	(maxwin >> sysctl_tcp_app_win),
				400	4 * tp->advmss);
				401	}
				402
				403	/* Force reservation of one segment. */
				404	if (sysctl_tcp_app_win &&
				405	tp->window_clamp > 2 * tp->advmss &&
				406	tp->window_clamp + tp->advmss > maxwin)
				407	tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);
				408
				409	tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
				410	tp->snd_cwnd_stamp = tcp_time_stamp;
				411	}
				412
				413	/* 5. Recalculate window clamp after socket hit its memory bounds. */
				414	static void tcp_clamp_window(struct sock *sk)
				415	{
				416	struct tcp_sock *tp = tcp_sk(sk);
				417	struct inet_connection_sock *icsk = inet_csk(sk);
				418
				419	icsk->icsk_ack.quick = 0;
				420
				421	if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
				422	!(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
				423	!sk_under_memory_pressure(sk) &&
				424	sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
				425	sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
				426	sysctl_tcp_rmem[2]);
				427	}
				428	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
				429	tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
				430	}
				431
				432	/* Initialize RCV_MSS value.
				433	* RCV_MSS is an our guess about MSS used by the peer.
				434	* We haven't any direct information about the MSS.
				435	* It's better to underestimate the RCV_MSS rather than overestimate.
				436	* Overestimations make us ACKing less frequently than needed.
				437	* Underestimations are more easy to detect and fix by tcp_measure_rcv_mss().
				438	*/
				439	void tcp_initialize_rcv_mss(struct sock *sk)
				440	{
				441	const struct tcp_sock *tp = tcp_sk(sk);
				442	unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
				443
				444	hint = min(hint, tp->rcv_wnd / 2);
				445	hint = min(hint, TCP_MSS_DEFAULT);
				446	hint = max(hint, TCP_MIN_MSS);
				447
				448	inet_csk(sk)->icsk_ack.rcv_mss = hint;
				449	}
				450	EXPORT_SYMBOL(tcp_initialize_rcv_mss);
				451
				452	/* Receiver "autotuning" code.
				453	*
				454	* The algorithm for RTT estimation w/o timestamps is based on
				455	* Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
				456	* <http://public.lanl.gov/radiant/pubs.html#DRS>
				457	*
				458	* More detail on this code can be found at
				459	* <http://staff.psc.edu/jheffner/>,
				460	* though this reference is out of date. A new paper
				461	* is pending.
				462	*/
				463	static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
				464	{
				465	u32 new_sample = tp->rcv_rtt_est.rtt;
				466	long m = sample;
				467
				468	if (m == 0)
				469	m = 1;
				470
				471	if (new_sample != 0) {
				472	/* If we sample in larger samples in the non-timestamp
				473	* case, we could grossly overestimate the RTT especially
				474	* with chatty applications or bulk transfer apps which
				475	* are stalled on filesystem I/O.
				476	*
				477	* Also, since we are only going for a minimum in the
				478	* non-timestamp case, we do not smooth things out
				479	* else with timestamps disabled convergence takes too
				480	* long.
				481	*/
				482	if (!win_dep) {
				483	m -= (new_sample >> 3);
				484	new_sample += m;
				485	} else {
				486	m <<= 3;
				487	if (m < new_sample)
				488	new_sample = m;
				489	}
				490	} else {
				491	/* No previous measure. */
				492	new_sample = m << 3;
				493	}
				494
				495	if (tp->rcv_rtt_est.rtt != new_sample)
				496	tp->rcv_rtt_est.rtt = new_sample;
				497	}
				498
				499	static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
				500	{
				501	if (tp->rcv_rtt_est.time == 0)
				502	goto new_measure;
				503	if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
				504	return;
				505	tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rcv_rtt_est.time, 1);
				506
				507	new_measure:
				508	tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
				509	tp->rcv_rtt_est.time = tcp_time_stamp;
				510	}
				511
				512	static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
				513	const struct sk_buff *skb)
				514	{
				515	struct tcp_sock *tp = tcp_sk(sk);
				516	if (tp->rx_opt.rcv_tsecr &&
				517	(TCP_SKB_CB(skb)->end_seq -
				518	TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss))
				519	tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0);
				520	}
				521
				522	/*
				523	* This function should be called every time data is copied to user space.
				524	* It calculates the appropriate TCP receive buffer space.
				525	*/
				526	void tcp_rcv_space_adjust(struct sock *sk)
				527	{
				528	struct tcp_sock *tp = tcp_sk(sk);
				529	int time;
				530	int space;
				531
				532	if (tp->rcvq_space.time == 0)
				533	goto new_measure;
				534
				535	time = tcp_time_stamp - tp->rcvq_space.time;
				536	if (time < (tp->rcv_rtt_est.rtt >> 3) \|\| tp->rcv_rtt_est.rtt == 0)
				537	return;
				538
				539	space = 2 * (tp->copied_seq - tp->rcvq_space.seq);
				540
				541	space = max(tp->rcvq_space.space, space);
				542
				543	if (tp->rcvq_space.space != space) {
				544	int rcvmem;
				545
				546	tp->rcvq_space.space = space;
				547
				548	if (sysctl_tcp_moderate_rcvbuf &&
				549	!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
				550	int new_clamp = space;
				551
				552	/* Receive space grows, normalize in order to
				553	* take into account packet headers and sk_buff
				554	* structure overhead.
				555	*/
				556	space /= tp->advmss;
				557	if (!space)
				558	space = 1;
				559	rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
				560	while (tcp_win_from_space(rcvmem) < tp->advmss)
				561	rcvmem += 128;
				562	space *= rcvmem;
				563	space = min(space, sysctl_tcp_rmem[2]);
				564	if (space > sk->sk_rcvbuf) {
				565	sk->sk_rcvbuf = space;
				566
				567	/* Make the window clamp follow along. */
				568	tp->window_clamp = new_clamp;
				569	}
				570	}
				571	}
				572
				573	new_measure:
				574	tp->rcvq_space.seq = tp->copied_seq;
				575	tp->rcvq_space.time = tcp_time_stamp;
				576	}
				577
				578	/* There is something which you must keep in mind when you analyze the
				579	* behavior of the tp->ato delayed ack timeout interval. When a
				580	* connection starts up, we want to ack as quickly as possible. The
				581	* problem is that "good" TCP's do slow start at the beginning of data
				582	* transmission. The means that until we send the first few ACK's the
				583	* sender will sit on his end and only queue most of his data, because
				584	* he can only send snd_cwnd unacked packets at any given time. For
				585	* each ACK we send, he increments snd_cwnd and transmits more of his
				586	* queue. -DaveM
				587	*/
				588	static void tcp_event_data_recv(struct sock sk, struct sk_buff skb)
				589	{
				590	struct tcp_sock *tp = tcp_sk(sk);
				591	struct inet_connection_sock *icsk = inet_csk(sk);
				592	u32 now;
				593
				594	inet_csk_schedule_ack(sk);
				595
				596	tcp_measure_rcv_mss(sk, skb);
				597
				598	tcp_rcv_rtt_measure(tp);
				599
				600	now = tcp_time_stamp;
				601
				602	if (!icsk->icsk_ack.ato) {
				603	/* The _first_ data packet received, initialize
				604	* delayed ACK engine.
				605	*/
				606	tcp_incr_quickack(sk);
				607	icsk->icsk_ack.ato = TCP_ATO_MIN;
				608	} else {
				609	int m = now - icsk->icsk_ack.lrcvtime;
				610
				611	if (m <= TCP_ATO_MIN / 2) {
				612	/* The fastest case is the first. */
				613	icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;
				614	} else if (m < icsk->icsk_ack.ato) {
				615	icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m;
				616	if (icsk->icsk_ack.ato > icsk->icsk_rto)
				617	icsk->icsk_ack.ato = icsk->icsk_rto;
				618	} else if (m > icsk->icsk_rto) {
				619	/* Too long gap. Apparently sender failed to
				620	* restart window, so that we send ACKs quickly.
				621	*/
				622	tcp_incr_quickack(sk);
				623	sk_mem_reclaim(sk);
				624	}
				625	}
				626	icsk->icsk_ack.lrcvtime = now;
				627
				628	TCP_ECN_check_ce(tp, skb);
				629
				630	if (skb->len >= 128)
				631	tcp_grow_window(sk, skb);
				632	}
				633
				634	/* Called to compute a smoothed rtt estimate. The data fed to this
				635	* routine either comes from timestamps, or from segments that were
				636	* known _not_ to have been retransmitted [see Karn/Partridge
				637	* Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
				638	* piece by Van Jacobson.
				639	* NOTE: the next three routines used to be one big routine.
				640	* To save cycles in the RFC 1323 implementation it was better to break
				641	* it up into three procedures. -- erics
				642	*/
				643	static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
				644	{
				645	struct tcp_sock *tp = tcp_sk(sk);
				646	long m = mrtt; /* RTT */
				647
				648	/* The following amusing code comes from Jacobson's
				649	* article in SIGCOMM '88. Note that rtt and mdev
				650	* are scaled versions of rtt and mean deviation.
				651	* This is designed to be as fast as possible
				652	* m stands for "measurement".
				653	*
				654	* On a 1990 paper the rto value is changed to:
				655	* RTO = rtt + 4 * mdev
				656	*
				657	* Funny. This algorithm seems to be very broken.
				658	* These formulae increase RTO, when it should be decreased, increase
				659	* too slowly, when it should be increased quickly, decrease too quickly
				660	* etc. I guess in BSD RTO takes ONE value, so that it is absolutely
				661	* does not matter how to _calculate_ it. Seems, it was trap
				662	* that VJ failed to avoid. 8)
				663	*/
				664	if (m == 0)
				665	m = 1;
				666	if (tp->srtt != 0) {
				667	m -= (tp->srtt >> 3); /* m is now error in rtt est */
				668	tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */
				669	if (m < 0) {
				670	m = -m; /* m is now abs(error) */
				671	m -= (tp->mdev >> 2); /* similar update on mdev */
				672	/* This is similar to one of Eifel findings.
				673	* Eifel blocks mdev updates when rtt decreases.
				674	* This solution is a bit different: we use finer gain
				675	* for mdev in this case (alpha*beta).
				676	* Like Eifel it also prevents growth of rto,
				677	* but also it limits too fast rto decreases,
				678	* happening in pure Eifel.
				679	*/
				680	if (m > 0)
				681	m >>= 3;
				682	} else {
				683	m -= (tp->mdev >> 2); /* similar update on mdev */
				684	}
				685	tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
				686	if (tp->mdev > tp->mdev_max) {
				687	tp->mdev_max = tp->mdev;
				688	if (tp->mdev_max > tp->rttvar)
				689	tp->rttvar = tp->mdev_max;
				690	}
				691	if (after(tp->snd_una, tp->rtt_seq)) {
				692	if (tp->mdev_max < tp->rttvar)
				693	tp->rttvar -= (tp->rttvar - tp->mdev_max) >> 2;
				694	tp->rtt_seq = tp->snd_nxt;
				695	tp->mdev_max = tcp_rto_min(sk);
				696	}
				697	} else {
				698	/* no previous measure. */
				699	tp->srtt = m << 3; /* take the measured time to be rtt */
				700	tp->mdev = m << 1; /* make sure rto = 3rtt /
				701	tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
				702	tp->rtt_seq = tp->snd_nxt;
				703	}
				704	}
				705
				706	/* Calculate rto without backoff. This is the second half of Van Jacobson's
				707	* routine referred to above.
				708	*/
				709	static inline void tcp_set_rto(struct sock *sk)
				710	{
				711	const struct tcp_sock *tp = tcp_sk(sk);
				712	/* Old crap is replaced with new one. 8)
				713	*
				714	* More seriously:
				715	* 1. If rtt variance happened to be less 50msec, it is hallucination.
				716	* It cannot be less due to utterly erratic ACK generation made
				717	* at least by solaris and freebsd. "Erratic ACKs" has _nothing_
				718	* to do with delayed acks, because at cwnd>2 true delack timeout
				719	* is invisible. Actually, Linux-2.4 also generates erratic
				720	* ACKs in some circumstances.
				721	*/
				722	inet_csk(sk)->icsk_rto = __tcp_set_rto(tp);
				723
				724	/* 2. Fixups made earlier cannot be right.
				725	* If we do not estimate RTO correctly without them,
				726	* all the algo is pure shit and should be replaced
				727	* with correct one. It is exactly, which we pretend to do.
				728	*/
				729
				730	/* NOTE: clamping at TCP_RTO_MIN is not required, current algo
				731	* guarantees that rto is higher.
				732	*/
				733	tcp_bound_rto(sk);
				734	}
				735
				736	/* Save metrics learned by this TCP session.
				737	This function is called only, when TCP finishes successfully
				738	i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
				739	*/
				740	void tcp_update_metrics(struct sock *sk)
				741	{
				742	struct tcp_sock *tp = tcp_sk(sk);
				743	struct dst_entry *dst = __sk_dst_get(sk);
				744
				745	if (sysctl_tcp_nometrics_save)
				746	return;
				747
				748	dst_confirm(dst);
				749
				750	if (dst && (dst->flags & DST_HOST)) {
				751	const struct inet_connection_sock *icsk = inet_csk(sk);
				752	int m;
				753	unsigned long rtt;
				754
				755	if (icsk->icsk_backoff \|\| !tp->srtt) {
				756	/* This session failed to estimate rtt. Why?
				757	* Probably, no packets returned in time.
				758	* Reset our results.
				759	*/
				760	if (!(dst_metric_locked(dst, RTAX_RTT)))
				761	dst_metric_set(dst, RTAX_RTT, 0);
				762	return;
				763	}
				764
				765	rtt = dst_metric_rtt(dst, RTAX_RTT);
				766	m = rtt - tp->srtt;
				767
				768	/* If newly calculated rtt larger than stored one,
				769	* store new one. Otherwise, use EWMA. Remember,
				770	* rtt overestimation is always better than underestimation.
				771	*/
				772	if (!(dst_metric_locked(dst, RTAX_RTT))) {
				773	if (m <= 0)
				774	set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt);
				775	else
				776	set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3));
				777	}
				778
				779	if (!(dst_metric_locked(dst, RTAX_RTTVAR))) {
				780	unsigned long var;
				781	if (m < 0)
				782	m = -m;
				783
				784	/* Scale deviation to rttvar fixed point */
				785	m >>= 1;
				786	if (m < tp->mdev)
				787	m = tp->mdev;
				788
				789	var = dst_metric_rtt(dst, RTAX_RTTVAR);
				790	if (m >= var)
				791	var = m;
				792	else
				793	var -= (var - m) >> 2;
				794
				795	set_dst_metric_rtt(dst, RTAX_RTTVAR, var);
				796	}
				797
				798	if (tcp_in_initial_slowstart(tp)) {
				799	/* Slow start still did not finish. */
				800	if (dst_metric(dst, RTAX_SSTHRESH) &&
				801	!dst_metric_locked(dst, RTAX_SSTHRESH) &&
				802	(tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
				803	dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1);
				804	if (!dst_metric_locked(dst, RTAX_CWND) &&
				805	tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
				806	dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd);
				807	} else if (tp->snd_cwnd > tp->snd_ssthresh &&
				808	icsk->icsk_ca_state == TCP_CA_Open) {
				809	/* Cong. avoidance phase, cwnd is reliable. */
				810	if (!dst_metric_locked(dst, RTAX_SSTHRESH))
				811	dst_metric_set(dst, RTAX_SSTHRESH,
				812	max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
				813	if (!dst_metric_locked(dst, RTAX_CWND))
				814	dst_metric_set(dst, RTAX_CWND,
				815	(dst_metric(dst, RTAX_CWND) +
				816	tp->snd_cwnd) >> 1);
				817	} else {
				818	/* Else slow start did not finish, cwnd is non-sense,
				819	ssthresh may be also invalid.
				820	*/
				821	if (!dst_metric_locked(dst, RTAX_CWND))
				822	dst_metric_set(dst, RTAX_CWND,
				823	(dst_metric(dst, RTAX_CWND) +
				824	tp->snd_ssthresh) >> 1);
				825	if (dst_metric(dst, RTAX_SSTHRESH) &&
				826	!dst_metric_locked(dst, RTAX_SSTHRESH) &&
				827	tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH))
				828	dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh);
				829	}
				830
				831	if (!dst_metric_locked(dst, RTAX_REORDERING)) {
				832	if (dst_metric(dst, RTAX_REORDERING) < tp->reordering &&
				833	tp->reordering != sysctl_tcp_reordering)
				834	dst_metric_set(dst, RTAX_REORDERING, tp->reordering);
				835	}
				836	}
				837	}
				838
				839	__u32 tcp_init_cwnd(const struct tcp_sock tp, const struct dst_entry dst)
				840	{
				841	__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
				842
				843	if (!cwnd)
				844	cwnd = TCP_INIT_CWND;
				845	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
				846	}
				847
				848	/* Set slow start threshold and cwnd not falling to slow start */
				849	void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
				850	{
				851	struct tcp_sock *tp = tcp_sk(sk);
				852	const struct inet_connection_sock *icsk = inet_csk(sk);
				853
				854	tp->prior_ssthresh = 0;
				855	tp->bytes_acked = 0;
				856	if (icsk->icsk_ca_state < TCP_CA_CWR) {
				857	tp->undo_marker = 0;
				858	if (set_ssthresh)
				859	tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
				860	tp->snd_cwnd = min(tp->snd_cwnd,
				861	tcp_packets_in_flight(tp) + 1U);
				862	tp->snd_cwnd_cnt = 0;
				863	tp->high_seq = tp->snd_nxt;
				864	tp->snd_cwnd_stamp = tcp_time_stamp;
				865	TCP_ECN_queue_cwr(tp);
				866
				867	tcp_set_ca_state(sk, TCP_CA_CWR);
				868	}
				869	}
				870
				871	/*
				872	* Packet counting of FACK is based on in-order assumptions, therefore TCP
				873	* disables it when reordering is detected
				874	*/
				875	static void tcp_disable_fack(struct tcp_sock *tp)
				876	{
				877	/* RFC3517 uses different metric in lost marker => reset on change */
				878	if (tcp_is_fack(tp))
				879	tp->lost_skb_hint = NULL;
				880	tp->rx_opt.sack_ok &= ~TCP_FACK_ENABLED;
				881	}
				882
				883	/* Take a notice that peer is sending D-SACKs */
				884	static void tcp_dsack_seen(struct tcp_sock *tp)
				885	{
				886	tp->rx_opt.sack_ok \|= TCP_DSACK_SEEN;
				887	}
				888
				889	/* Initialize metrics on socket. */
				890
				891	static void tcp_init_metrics(struct sock *sk)
				892	{
				893	struct tcp_sock *tp = tcp_sk(sk);
				894	struct dst_entry *dst = __sk_dst_get(sk);
				895
				896	if (dst == NULL)
				897	goto reset;
				898
				899	dst_confirm(dst);
				900
				901	if (dst_metric_locked(dst, RTAX_CWND))
				902	tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND);
				903	if (dst_metric(dst, RTAX_SSTHRESH)) {
				904	tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
				905	if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
				906	tp->snd_ssthresh = tp->snd_cwnd_clamp;
				907	} else {
				908	/* ssthresh may have been reduced unnecessarily during.
				909	* 3WHS. Restore it back to its initial default.
				910	*/
				911	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
				912	}
				913	if (dst_metric(dst, RTAX_REORDERING) &&
				914	tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
				915	tcp_disable_fack(tp);
				916	tp->reordering = dst_metric(dst, RTAX_REORDERING);
				917	}
				918
				919	if (dst_metric(dst, RTAX_RTT) == 0 \|\| tp->srtt == 0)
				920	goto reset;
				921
				922	/* Initial rtt is determined from SYN,SYN-ACK.
				923	* The segment is small and rtt may appear much
				924	* less than real one. Use per-dst memory
				925	* to make it more realistic.
				926	*
				927	* A bit of theory. RTT is time passed after "normal" sized packet
				928	* is sent until it is ACKed. In normal circumstances sending small
				929	* packets force peer to delay ACKs and calculation is correct too.
				930	* The algorithm is adaptive and, provided we follow specs, it
				931	* NEVER underestimate RTT. BUT! If peer tries to make some clever
				932	* tricks sort of "quick acks" for time long enough to decrease RTT
				933	* to low value, and then abruptly stops to do it and starts to delay
				934	* ACKs, wait for troubles.
				935	*/
				936	if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) {
				937	tp->srtt = dst_metric_rtt(dst, RTAX_RTT);
				938	tp->rtt_seq = tp->snd_nxt;
				939	}
				940	if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) {
				941	tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR);
				942	tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
				943	}
				944	tcp_set_rto(sk);
				945	reset:
				946	if (tp->srtt == 0) {
				947	/* RFC2988bis: We've failed to get a valid RTT sample from
				948	* 3WHS. This is most likely due to retransmission,
				949	* including spurious one. Reset the RTO back to 3secs
				950	* from the more aggressive 1sec to avoid more spurious
				951	* retransmission.
				952	*/
				953	tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
				954	inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
				955	}
				956	/* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
				957	* retransmitted. In light of RFC2988bis' more aggressive 1sec
				958	* initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
				959	* retransmission has occurred.
				960	*/
				961	if (tp->total_retrans > 1)
				962	tp->snd_cwnd = 1;
				963	else
				964	tp->snd_cwnd = tcp_init_cwnd(tp, dst);
				965	tp->snd_cwnd_stamp = tcp_time_stamp;
				966	}
				967
				968	static void tcp_update_reordering(struct sock *sk, const int metric,
				969	const int ts)
				970	{
				971	struct tcp_sock *tp = tcp_sk(sk);
				972	if (metric > tp->reordering) {
				973	int mib_idx;
				974
				975	tp->reordering = min(TCP_MAX_REORDERING, metric);
				976
				977	/* This exciting event is worth to be remembered. 8) */
				978	if (ts)
				979	mib_idx = LINUX_MIB_TCPTSREORDER;
				980	else if (tcp_is_reno(tp))
				981	mib_idx = LINUX_MIB_TCPRENOREORDER;
				982	else if (tcp_is_fack(tp))
				983	mib_idx = LINUX_MIB_TCPFACKREORDER;
				984	else
				985	mib_idx = LINUX_MIB_TCPSACKREORDER;
				986
				987	NET_INC_STATS_BH(sock_net(sk), mib_idx);
				988	#if FASTRETRANS_DEBUG > 1
				989	printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n",
				990	tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
				991	tp->reordering,
				992	tp->fackets_out,
				993	tp->sacked_out,
				994	tp->undo_marker ? tp->undo_retrans : 0);
				995	#endif
				996	tcp_disable_fack(tp);
				997	}
				998	}
				999
				1000	/* This must be called before lost_out is incremented */
				1001	static void tcp_verify_retransmit_hint(struct tcp_sock tp, struct sk_buff skb)
				1002	{
				1003	if ((tp->retransmit_skb_hint == NULL) \|\|
				1004	before(TCP_SKB_CB(skb)->seq,
				1005	TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
				1006	tp->retransmit_skb_hint = skb;
				1007
				1008	if (!tp->lost_out \|\|
				1009	after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high))
				1010	tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
				1011	}
				1012
				1013	static void tcp_skb_mark_lost(struct tcp_sock tp, struct sk_buff skb)
				1014	{
				1015	if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST\|TCPCB_SACKED_ACKED))) {
				1016	tcp_verify_retransmit_hint(tp, skb);
				1017
				1018	tp->lost_out += tcp_skb_pcount(skb);
				1019	TCP_SKB_CB(skb)->sacked \|= TCPCB_LOST;
				1020	}
				1021	}
				1022
				1023	static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,
				1024	struct sk_buff *skb)
				1025	{
				1026	tcp_verify_retransmit_hint(tp, skb);
				1027
				1028	if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST\|TCPCB_SACKED_ACKED))) {
				1029	tp->lost_out += tcp_skb_pcount(skb);
				1030	TCP_SKB_CB(skb)->sacked \|= TCPCB_LOST;
				1031	}
				1032	}
				1033
				1034	/* This procedure tags the retransmission queue when SACKs arrive.
				1035	*
				1036	* We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
				1037	* Packets in queue with these bits set are counted in variables
				1038	* sacked_out, retrans_out and lost_out, correspondingly.
				1039	*
				1040	* Valid combinations are:
				1041	* Tag InFlight Description
				1042	* 0 1 - orig segment is in flight.
				1043	* S 0 - nothing flies, orig reached receiver.
				1044	* L 0 - nothing flies, orig lost by net.
				1045	* R 2 - both orig and retransmit are in flight.
				1046	* L|R 1 - orig is lost, retransmit is in flight.
				1047	* S|R 1 - orig reached receiver, retrans is still in flight.
				1048	* (L|S|R is logically valid, it could occur when L|R is sacked,
				1049	* but it is equivalent to plain S and code short-circuits it to S.
				1050	* L|S is logically invalid, it would mean -1 packet in flight 8))
				1051	*
				1052	* These 6 states form finite state machine, controlled by the following events:
				1053	* 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
				1054	* 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
				1055	* 3. Loss detection event of two flavors:
				1056	* A. Scoreboard estimator decided the packet is lost.
				1057	* A'. Reno "three dupacks" marks head of queue lost.
				1058	* A''. Its FACK modification, head until snd.fack is lost.
				1059	* B. SACK arrives sacking SND.NXT at the moment, when the
				1060	* segment was retransmitted.
				1061	* 4. D-SACK added new rule: D-SACK changes any tag to S.
				1062	*
				1063	* It is pleasant to note, that state diagram turns out to be commutative,
				1064	* so that we are allowed not to be bothered by order of our actions,
				1065	* when multiple events arrive simultaneously. (see the function below).
				1066	*
				1067	* Reordering detection.
				1068	* --------------------
				1069	* Reordering metric is maximal distance, which a packet can be displaced
				1070	* in packet stream. With SACKs we can estimate it:
				1071	*
				1072	* 1. SACK fills old hole and the corresponding segment was not
				1073	* ever retransmitted -> reordering. Alas, we cannot use it
				1074	* when segment was retransmitted.
				1075	* 2. The last flaw is solved with D-SACK. D-SACK arrives
				1076	* for retransmitted and already SACKed segment -> reordering..
				1077	* Both of these heuristics are not used in Loss state, when we cannot
				1078	* account for retransmits accurately.
				1079	*
				1080	* SACK block validation.
				1081	* ----------------------
				1082	*
				1083	* SACK block range validation checks that the received SACK block fits to
				1084	* the expected sequence limits, i.e., it is between SND.UNA and SND.NXT.
				1085	* Note that SND.UNA is not included to the range though being valid because
				1086	* it means that the receiver is rather inconsistent with itself reporting
				1087	* SACK reneging when it should advance SND.UNA. Such a SACK block is
				1088	* perfectly valid, however, in light of RFC2018 which explicitly states
				1089	* that "SACK block MUST reflect the newest segment. Even if the newest
				1090	* segment is going to be discarded ...", not that it looks very clever
				1091	* in case of head skb. Due to potential receiver driven attacks, we
				1092	* choose to avoid immediate execution of a walk in write queue due to
				1093	* reneging and defer head skb's loss recovery to standard loss recovery
				1094	* procedure that will eventually trigger (nothing forbids us doing this).
				1095	*
				1096	* Implements also blockage to start_seq wrap-around. Problem lies in the
				1097	* fact that though start_seq (s) is before end_seq (i.e., not reversed),
				1098	* there's no guarantee that it will be before snd_nxt (n). The problem
				1099	* happens when start_seq resides between end_seq wrap (e_w) and snd_nxt
				1100	* wrap (s_w):
				1101	*
				1102	*         <- outs wnd ->                          <- wrapzone ->
				1103	*         u     e      n                         u_w   e_w  s n_w
				1104	*         |     |      |                          |     |   |  |
				1105	* |<------------+------+----- TCP seqno space --------------+---------->|
				1106	* ...-- <2^31 ->|                                           |<--------...
				1107	* ...---- >2^31 ------>|                                    |<--------...
				1108	*
				1109	* Current code wouldn't be vulnerable but it's better still to discard such
				1110	* crazy SACK blocks. Doing this check for start_seq alone closes somewhat
				1111	* similar case (end_seq after snd_nxt wrap) as earlier reversed check in
				1112	* snd_nxt wrap -> snd_una region will then become "well defined", i.e.,
				1113	* equal to the ideal case (infinite seqno space without wrap caused issues).
				1114	*
				1115	* With D-SACK the lower bound is extended to cover sequence space below
				1116	* SND.UNA down to undo_marker, which is the last point of interest. Yet
				1117	* again, D-SACK block must not go across snd_una (for the same reason as
				1118	* for the normal SACK blocks, explained above). But there all simplicity
				1119	* ends, TCP might receive valid D-SACKs below that. As long as they reside
				1120	* fully below undo_marker they do not affect behavior in any way and can
				1121	* therefore be safely ignored. In rare cases (which are more or less
				1122	* theoretical ones), the D-SACK will nicely cross that boundary due to skb
				1123	* fragmentation and packet reordering past skb's retransmission. To consider
				1124	* them correctly, the acceptable range must be extended even more though
				1125	* the exact amount is rather hard to quantify. However, tp->max_window can
				1126	* be used as an exaggerated estimate.
				1127	*/
				1128	static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack,
				1129	u32 start_seq, u32 end_seq)
				1130	{
				1131	/* Too far in future, or reversed (interpretation is ambiguous) */
				1132	if (after(end_seq, tp->snd_nxt) \|\| !before(start_seq, end_seq))
				1133	return 0;
				1134
				1135	/* Nasty start_seq wrap-around check (see comments above) */
				1136	if (!before(start_seq, tp->snd_nxt))
				1137	return 0;
				1138
				1139	/* In outstanding window? ...This is valid exit for D-SACKs too.
				1140	* start_seq == snd_una is non-sensical (see comments above)
				1141	*/
				1142	if (after(start_seq, tp->snd_una))
				1143	return 1;
				1144
				1145	if (!is_dsack \|\| !tp->undo_marker)
				1146	return 0;
				1147
				1148	/* ...Then it's D-SACK, and must reside below snd_una completely */
				1149	if (after(end_seq, tp->snd_una))
				1150	return 0;
				1151
				1152	if (!before(start_seq, tp->undo_marker))
				1153	return 1;
				1154
				1155	/* Too old */
				1156	if (!after(end_seq, tp->undo_marker))
				1157	return 0;
				1158
				1159	/* Undo_marker boundary crossing (overestimates a lot). Known already:
				1160	* start_seq < undo_marker and end_seq >= undo_marker.
				1161	*/
				1162	return !before(start_seq, end_seq - tp->max_window);
				1163	}
				1164
				1165	/* Check for lost retransmit. This superb idea is borrowed from "ratehalving".
				1166	* Event "B". Later note: FACK people cheated me again 8), we have to account
				1167	* for reordering! Ugly, but should help.
				1168	*
				1169	* Search retransmitted skbs from write_queue that were sent when snd_nxt was
				1170	* less than what is now known to be received by the other end (derived from
				1171	* highest SACK block). Also calculate the lowest snd_nxt among the remaining
				1172	* retransmitted skbs to avoid some costly processing per ACKs.
				1173	*/
				1174	static void tcp_mark_lost_retrans(struct sock *sk)
				1175	{
				1176	const struct inet_connection_sock *icsk = inet_csk(sk);
				1177	struct tcp_sock *tp = tcp_sk(sk);
				1178	struct sk_buff *skb;
				1179	int cnt = 0;
				1180	u32 new_low_seq = tp->snd_nxt;
				1181	u32 received_upto = tcp_highest_sack_seq(tp);
				1182
				1183	if (!tcp_is_fack(tp) \|\| !tp->retrans_out \|\|
				1184	!after(received_upto, tp->lost_retrans_low) \|\|
				1185	icsk->icsk_ca_state != TCP_CA_Recovery)
				1186	return;
				1187
				1188	tcp_for_write_queue(skb, sk) {
				1189	u32 ack_seq = TCP_SKB_CB(skb)->ack_seq;
				1190
				1191	if (skb == tcp_send_head(sk))
				1192	break;
				1193	if (cnt == tp->retrans_out)
				1194	break;
				1195	if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
				1196	continue;
				1197
				1198	if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS))
				1199	continue;
				1200
				1201	/* TODO: We would like to get rid of tcp_is_fack(tp) only
				1202	* constraint here (see above) but figuring out that at
				1203	* least tp->reordering SACK blocks reside between ack_seq
				1204	* and received_upto is not easy task to do cheaply with
				1205	* the available datastructures.
				1206	*
				1207	* Whether FACK should check here for tp->reordering segs
				1208	* in-between one could argue for either way (it would be
				1209	* rather simple to implement as we could count fack_count
				1210	* during the walk and do tp->fackets_out - fack_count).
				1211	*/
				1212	if (after(received_upto, ack_seq)) {
				1213	TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
				1214	tp->retrans_out -= tcp_skb_pcount(skb);
				1215
				1216	tcp_skb_mark_lost_uncond_verify(tp, skb);
				1217	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT);
				1218	} else {
				1219	if (before(ack_seq, new_low_seq))
				1220	new_low_seq = ack_seq;
				1221	cnt += tcp_skb_pcount(skb);
				1222	}
				1223	}
				1224
				1225	if (tp->retrans_out)
				1226	tp->lost_retrans_low = new_low_seq;
				1227	}
				1228
				1229	static int tcp_check_dsack(struct sock sk, const struct sk_buff ack_skb,
				1230	struct tcp_sack_block_wire *sp, int num_sacks,
				1231	u32 prior_snd_una)
				1232	{
				1233	struct tcp_sock *tp = tcp_sk(sk);
				1234	u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
				1235	u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
				1236	int dup_sack = 0;
				1237
				1238	if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
				1239	dup_sack = 1;
				1240	tcp_dsack_seen(tp);
				1241	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
				1242	} else if (num_sacks > 1) {
				1243	u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
				1244	u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);
				1245
				1246	if (!after(end_seq_0, end_seq_1) &&
				1247	!before(start_seq_0, start_seq_1)) {
				1248	dup_sack = 1;
				1249	tcp_dsack_seen(tp);
				1250	NET_INC_STATS_BH(sock_net(sk),
				1251	LINUX_MIB_TCPDSACKOFORECV);
				1252	}
				1253	}
				1254
				1255	/* D-SACK for already forgotten data... Do dumb counting. */
				1256	if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 &&
				1257	!after(end_seq_0, prior_snd_una) &&
				1258	after(end_seq_0, tp->undo_marker))
				1259	tp->undo_retrans--;
				1260
				1261	return dup_sack;
				1262	}
				1263
				1264	struct tcp_sacktag_state {	/* state threaded through the SACK tagging helpers below */
				1265	int reord;	/* lowered to fack_count by tcp_sacktag_one() when reordering is seen */
				1266	int fack_count;	/* running packet count, advanced by pcount as skbs are walked/shifted */
				1267	int flag;	/* FLAG_* bits (e.g. FLAG_DATA_SACKED) OR'ed in while tagging */
				1268	};
				1269
				1270	/* Check if skb is fully within the SACK block. In presence of GSO skbs,
				1271	* the incoming SACK may not exactly match but we can find smaller MSS
				1272	* aligned portion of it that matches. Therefore we might need to fragment
				1273	* which may fail and creates some hassle (caller must handle error case
				1274	* returns).
				1275	*
				1276	* FIXME: this could be merged to shift decision code
				1277	*/
				1278	static int tcp_match_skb_to_sack(struct sock sk, struct sk_buff skb,
				1279	u32 start_seq, u32 end_seq)
				1280	{
				1281	int in_sack, err;
				1282	unsigned int pkt_len;
				1283	unsigned int mss;
				1284
				1285	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
				1286	!before(end_seq, TCP_SKB_CB(skb)->end_seq);
				1287
				1288	if (tcp_skb_pcount(skb) > 1 && !in_sack &&
				1289	after(TCP_SKB_CB(skb)->end_seq, start_seq)) {
				1290	mss = tcp_skb_mss(skb);
				1291	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
				1292
				1293	if (!in_sack) {
				1294	pkt_len = start_seq - TCP_SKB_CB(skb)->seq;
				1295	if (pkt_len < mss)
				1296	pkt_len = mss;
				1297	} else {
				1298	pkt_len = end_seq - TCP_SKB_CB(skb)->seq;
				1299	if (pkt_len < mss)
				1300	return -EINVAL;
				1301	}
				1302
				1303	/* Round if necessary so that SACKs cover only full MSSes
				1304	* and/or the remaining small portion (if present)
				1305	*/
				1306	if (pkt_len > mss) {
				1307	unsigned int new_len = (pkt_len / mss) * mss;
				1308	if (!in_sack && new_len < pkt_len) {
				1309	new_len += mss;
				1310	if (new_len >= skb->len)
				1311	return 0;
				1312	}
				1313	pkt_len = new_len;
				1314	}
				1315	err = tcp_fragment(sk, skb, pkt_len, mss);
				1316	if (err < 0)
				1317	return err;
				1318	}
				1319
				1320	return in_sack;
				1321	}
				1322
				1323	/* Mark the given newly-SACKed range as such, adjusting counters and hints. */
				1324	static u8 tcp_sacktag_one(struct sock *sk,
				1325	struct tcp_sacktag_state *state, u8 sacked,
				1326	u32 start_seq, u32 end_seq,
				1327	int dup_sack, int pcount)
				1328	{
				1329	struct tcp_sock *tp = tcp_sk(sk);
				1330	int fack_count = state->fack_count;
				1331
				1332	/* Account D-SACK for retransmitted packet. */
				1333	if (dup_sack && (sacked & TCPCB_RETRANS)) {
				1334	if (tp->undo_marker && tp->undo_retrans > 0 &&
				1335	after(end_seq, tp->undo_marker))
				1336	tp->undo_retrans--;
				1337	if (sacked & TCPCB_SACKED_ACKED)
				1338	state->reord = min(fack_count, state->reord);
				1339	}
				1340
				1341	/* Nothing to do; acked frame is about to be dropped (was ACKed). */
				1342	if (!after(end_seq, tp->snd_una))
				1343	return sacked;
				1344
				1345	if (!(sacked & TCPCB_SACKED_ACKED)) {
				1346	if (sacked & TCPCB_SACKED_RETRANS) {
				1347	/* If the segment is not tagged as lost,
				1348	* we do not clear RETRANS, believing
				1349	* that retransmission is still in flight.
				1350	*/
				1351	if (sacked & TCPCB_LOST) {
				1352	sacked &= ~(TCPCB_LOST\|TCPCB_SACKED_RETRANS);
				1353	tp->lost_out -= pcount;
				1354	tp->retrans_out -= pcount;
				1355	}
				1356	} else {
				1357	if (!(sacked & TCPCB_RETRANS)) {
				1358	/* New sack for not retransmitted frame,
				1359	* which was in hole. It is reordering.
				1360	*/
				1361	if (before(start_seq,
				1362	tcp_highest_sack_seq(tp)))
				1363	state->reord = min(fack_count,
				1364	state->reord);
				1365
				1366	/* SACK enhanced F-RTO (RFC4138; Appendix B) */
				1367	if (!after(end_seq, tp->frto_highmark))
				1368	state->flag \|= FLAG_ONLY_ORIG_SACKED;
				1369	}
				1370
				1371	if (sacked & TCPCB_LOST) {
				1372	sacked &= ~TCPCB_LOST;
				1373	tp->lost_out -= pcount;
				1374	}
				1375	}
				1376
				1377	sacked \|= TCPCB_SACKED_ACKED;
				1378	state->flag \|= FLAG_DATA_SACKED;
				1379	tp->sacked_out += pcount;
				1380
				1381	fack_count += pcount;
				1382
				1383	/* Lost marker hint past SACKed? Tweak RFC3517 cnt */
				1384	if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) &&
				1385	before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
				1386	tp->lost_cnt_hint += pcount;
				1387
				1388	if (fack_count > tp->fackets_out)
				1389	tp->fackets_out = fack_count;
				1390	}
				1391
				1392	/* D-SACK. We can detect redundant retransmission in S\|R and plain R
				1393	* frames and clear it. undo_retrans is decreased above, L\|R frames
				1394	* are accounted above as well.
				1395	*/
				1396	if (dup_sack && (sacked & TCPCB_SACKED_RETRANS)) {
				1397	sacked &= ~TCPCB_SACKED_RETRANS;
				1398	tp->retrans_out -= pcount;
				1399	}
				1400
				1401	return sacked;
				1402	}
				1403
				1404	/* Shift newly-SACKed bytes from this skb to the immediately previous
				1405	* already-SACKed sk_buff. Mark the newly-SACKed bytes as such.
				1406	*/
				1407	static int tcp_shifted_skb(struct sock sk, struct sk_buff skb,
				1408	struct tcp_sacktag_state *state,
				1409	unsigned int pcount, int shifted, int mss,
				1410	int dup_sack)
				1411	{
				1412	struct tcp_sock *tp = tcp_sk(sk);
				1413	struct sk_buff *prev = tcp_write_queue_prev(sk, skb);
				1414	u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */
				1415	u32 end_seq = start_seq + shifted; /* end of newly-SACKed */
				1416
				1417	BUG_ON(!pcount);
				1418
				1419	/* Adjust counters and hints for the newly sacked sequence
				1420	* range but discard the return value since prev is already
				1421	* marked. We must tag the range first because the seq
				1422	* advancement below implicitly advances
				1423	* tcp_highest_sack_seq() when skb is highest_sack.
				1424	*/
				1425	tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
				1426	start_seq, end_seq, dup_sack, pcount);
				1427
				1428	if (skb == tp->lost_skb_hint)
				1429	tp->lost_cnt_hint += pcount;
				1430
				1431	TCP_SKB_CB(prev)->end_seq += shifted;
				1432	TCP_SKB_CB(skb)->seq += shifted;
				1433
				1434	skb_shinfo(prev)->gso_segs += pcount;
				1435	BUG_ON(skb_shinfo(skb)->gso_segs < pcount);
				1436	skb_shinfo(skb)->gso_segs -= pcount;
				1437
				1438	/* When we're adding to gso_segs == 1, gso_size will be zero,
				1439	* in theory this shouldn't be necessary but as long as DSACK
				1440	* code can come after this skb later on it's better to keep
				1441	* setting gso_size to something.
				1442	*/
				1443	if (!skb_shinfo(prev)->gso_size) {
				1444	skb_shinfo(prev)->gso_size = mss;
				1445	skb_shinfo(prev)->gso_type = sk->sk_gso_type;
				1446	}
				1447
				1448	/* CHECKME: To clear or not to clear? Mimics normal skb currently */
				1449	if (skb_shinfo(skb)->gso_segs <= 1) {
				1450	skb_shinfo(skb)->gso_size = 0;
				1451	skb_shinfo(skb)->gso_type = 0;
				1452	}
				1453
				1454	/* Difference in this won't matter, both ACKed by the same cumul. ACK */
				1455	TCP_SKB_CB(prev)->sacked \|= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
				1456
				1457	if (skb->len > 0) {
				1458	BUG_ON(!tcp_skb_pcount(skb));
				1459	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED);
				1460	return 0;
				1461	}
				1462
				1463	/* Whole SKB was eaten :-) */
				1464
				1465	if (skb == tp->retransmit_skb_hint)
				1466	tp->retransmit_skb_hint = prev;
				1467	if (skb == tp->scoreboard_skb_hint)
				1468	tp->scoreboard_skb_hint = prev;
				1469	if (skb == tp->lost_skb_hint) {
				1470	tp->lost_skb_hint = prev;
				1471	tp->lost_cnt_hint -= tcp_skb_pcount(prev);
				1472	}
				1473
				1474	TCP_SKB_CB(prev)->tcp_flags \|= TCP_SKB_CB(skb)->tcp_flags;
				1475	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
				1476	TCP_SKB_CB(prev)->end_seq++;
				1477
				1478	if (skb == tcp_highest_sack(sk))
				1479	tcp_advance_highest_sack(sk, skb);
				1480
				1481	tcp_unlink_write_queue(skb, sk);
				1482	sk_wmem_free_skb(sk, skb);
				1483
				1484	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED);
				1485
				1486	return 1;
				1487	}
				1488
				1489	/* I wish gso_size would have a bit more sane initialization than
				1490	* something-or-zero which complicates things
				1491	*/
				1492	static int tcp_skb_seglen(const struct sk_buff *skb)
				1493	{
				1494	return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb);
				1495	}
				1496
				1497	/* Shifting pages past head area doesn't work */
				1498	static int skb_can_shift(const struct sk_buff *skb)
				1499	{
				1500	return !skb_headlen(skb) && skb_is_nonlinear(skb);
				1501	}
				1502	//hub:CVE-2019-11477
				1503	int tcp_skb_shift(struct sk_buff * to, struct sk_buff * from, int pcount, int shiftlen)
				1504	{
				1505	/* TCP min gso_size is 8 bytes(TCP_MIN_GSO_SIZE)
				1506	* Since TCP_SKB_CB(skb)->tcp_gso_segs is 16 bits, we need
				1507	* to make sure not storing more then 65535*8 bytes per skb,
				1508	* event if current MSS is bigger.
				1509	*/
				1510	if(unlikely(to->len + shiftlen >= 65535*TCP_MIN_GSO_SIZE))
				1511	return 0;
				1512	if(unlikely(tcp_skb_pcount(to) + pcount > 65535))
				1513	return 0;
				1514
				1515	return skb_shift(to, from, shiftlen);
				1516	}
				1517
				1518	/* Try collapsing SACK blocks spanning across multiple skbs to a single
				1519	* skb.
				1520	*/
				1521	static struct sk_buff tcp_shift_skb_data(struct sock sk, struct sk_buff *skb,
				1522	struct tcp_sacktag_state *state,
				1523	u32 start_seq, u32 end_seq,
				1524	int dup_sack)
				1525	{
				1526	struct tcp_sock *tp = tcp_sk(sk);
				1527	struct sk_buff *prev;
				1528	int mss;
				1529	int pcount = 0;
				1530	int len;
				1531	int in_sack;
				1532
				1533	if (!sk_can_gso(sk))
				1534	goto fallback;
				1535
				1536	/* Normally R but no L won't result in plain S */
				1537	if (!dup_sack &&
				1538	(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST\|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS)
				1539	goto fallback;
				1540	if (!skb_can_shift(skb))
				1541	goto fallback;
				1542	/* This frame is about to be dropped (was ACKed). */
				1543	if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
				1544	goto fallback;
				1545
				1546	/* Can only happen with delayed DSACK + discard craziness */
				1547	if (unlikely(skb == tcp_write_queue_head(sk)))
				1548	goto fallback;
				1549	prev = tcp_write_queue_prev(sk, skb);
				1550
				1551	if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
				1552	goto fallback;
				1553
				1554	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
				1555	!before(end_seq, TCP_SKB_CB(skb)->end_seq);
				1556
				1557	if (in_sack) {
				1558	len = skb->len;
				1559	pcount = tcp_skb_pcount(skb);
				1560	mss = tcp_skb_seglen(skb);
				1561
				1562	/* TODO: Fix DSACKs to not fragment already SACKed and we can
				1563	* drop this restriction as unnecessary
				1564	*/
				1565	if (mss != tcp_skb_seglen(prev))
				1566	goto fallback;
				1567	} else {
				1568	if (!after(TCP_SKB_CB(skb)->end_seq, start_seq))
				1569	goto noop;
				1570	/* CHECKME: This is non-MSS split case only?, this will
				1571	* cause skipped skbs due to advancing loop btw, original
				1572	* has that feature too
				1573	*/
				1574	if (tcp_skb_pcount(skb) <= 1)
				1575	goto noop;
				1576
				1577	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
				1578	if (!in_sack) {
				1579	/* TODO: head merge to next could be attempted here
				1580	* if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)),
				1581	* though it might not be worth of the additional hassle
				1582	*
				1583	* ...we can probably just fallback to what was done
				1584	* previously. We could try merging non-SACKed ones
				1585	* as well but it probably isn't going to buy off
				1586	* because later SACKs might again split them, and
				1587	* it would make skb timestamp tracking considerably
				1588	* harder problem.
				1589	*/
				1590	goto fallback;
				1591	}
				1592
				1593	len = end_seq - TCP_SKB_CB(skb)->seq;
				1594	BUG_ON(len < 0);
				1595	BUG_ON(len > skb->len);
				1596
				1597	/* MSS boundaries should be honoured or else pcount will
				1598	* severely break even though it makes things bit trickier.
				1599	* Optimize common case to avoid most of the divides
				1600	*/
				1601	mss = tcp_skb_mss(skb);
				1602
				1603	/* TODO: Fix DSACKs to not fragment already SACKed and we can
				1604	* drop this restriction as unnecessary
				1605	*/
				1606	if (mss != tcp_skb_seglen(prev))
				1607	goto fallback;
				1608
				1609	if (len == mss) {
				1610	pcount = 1;
				1611	} else if (len < mss) {
				1612	goto noop;
				1613	} else {
				1614	pcount = len / mss;
				1615	len = pcount * mss;
				1616	}
				1617	}
				1618
				1619	/* tcp_sacktag_one() won't SACK-tag ranges below snd_una */
				1620	if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una))
				1621	goto fallback;
				1622
				1623	if (!tcp_skb_shift(prev, skb, pcount, len)) //hub:CVE-2019-11477
				1624	goto fallback;
				1625	if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack))
				1626	goto out;
				1627
				1628	/* Hole filled allows collapsing with the next as well, this is very
				1629	* useful when hole on every nth skb pattern happens
				1630	*/
				1631	if (prev == tcp_write_queue_tail(sk))
				1632	goto out;
				1633	skb = tcp_write_queue_next(sk, prev);
				1634
				1635	if (!skb_can_shift(skb) \|\|
				1636	(skb == tcp_send_head(sk)) \|\|
				1637	((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) \|\|
				1638	(mss != tcp_skb_seglen(skb)))
				1639	goto out;
				1640
				1641	len = skb->len;
				1642	//hub:CVE-2019-11477
				1643	pcount = tcp_skb_pcount(skb);
				1644	if (tcp_skb_shift(prev, skb, pcount, len)) {
				1645	tcp_shifted_skb(sk, skb, state, pcount, len, mss, 0);
				1646	}
				1647
				1648	out:
				1649	state->fack_count += pcount;
				1650	return prev;
				1651
				1652	noop:
				1653	return skb;
				1654
				1655	fallback:
				1656	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK);
				1657	return NULL;
				1658	}
				1659
				1660	static struct sk_buff tcp_sacktag_walk(struct sk_buff skb, struct sock *sk,
				1661	struct tcp_sack_block *next_dup,
				1662	struct tcp_sacktag_state *state,
				1663	u32 start_seq, u32 end_seq,
				1664	int dup_sack_in)
				1665	{
				1666	struct tcp_sock *tp = tcp_sk(sk);
				1667	struct sk_buff *tmp;
				1668
				1669	tcp_for_write_queue_from(skb, sk) {
				1670	int in_sack = 0;
				1671	int dup_sack = dup_sack_in;
				1672
				1673	if (skb == tcp_send_head(sk))
				1674	break;
				1675
				1676	/* queue is in-order => we can short-circuit the walk early */
				1677	if (!before(TCP_SKB_CB(skb)->seq, end_seq))
				1678	break;
				1679
				1680	if ((next_dup != NULL) &&
				1681	before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) {
				1682	in_sack = tcp_match_skb_to_sack(sk, skb,
				1683	next_dup->start_seq,
				1684	next_dup->end_seq);
				1685	if (in_sack > 0)
				1686	dup_sack = 1;
				1687	}
				1688
				1689	/* skb reference here is a bit tricky to get right, since
				1690	* shifting can eat and free both this skb and the next,
				1691	* so not even _safe variant of the loop is enough.
				1692	*/
				1693	if (in_sack <= 0) {
				1694	tmp = tcp_shift_skb_data(sk, skb, state,
				1695	start_seq, end_seq, dup_sack);
				1696	if (tmp != NULL) {
				1697	if (tmp != skb) {
				1698	skb = tmp;
				1699	continue;
				1700	}
				1701
				1702	in_sack = 0;
				1703	} else {
				1704	in_sack = tcp_match_skb_to_sack(sk, skb,
				1705	start_seq,
				1706	end_seq);
				1707	}
				1708	}
				1709
				1710	if (unlikely(in_sack < 0))
				1711	break;
				1712
				1713	if (in_sack) {
				1714	TCP_SKB_CB(skb)->sacked =
				1715	tcp_sacktag_one(sk,
				1716	state,
				1717	TCP_SKB_CB(skb)->sacked,
				1718	TCP_SKB_CB(skb)->seq,
				1719	TCP_SKB_CB(skb)->end_seq,
				1720	dup_sack,
				1721	tcp_skb_pcount(skb));
				1722
				1723	if (!before(TCP_SKB_CB(skb)->seq,
				1724	tcp_highest_sack_seq(tp)))
				1725	tcp_advance_highest_sack(sk, skb);
				1726	}
				1727
				1728	state->fack_count += tcp_skb_pcount(skb);
				1729	}
				1730	return skb;
				1731	}
				1732
				1733	/* Avoid all extra work that is being done by sacktag while walking in
				1734	* a normal way
				1735	*/
				1736	static struct sk_buff tcp_sacktag_skip(struct sk_buff skb, struct sock *sk,
				1737	struct tcp_sacktag_state *state,
				1738	u32 skip_to_seq)
				1739	{
				1740	tcp_for_write_queue_from(skb, sk) {
				1741	if (skb == tcp_send_head(sk))
				1742	break;
				1743
				1744	if (after(TCP_SKB_CB(skb)->end_seq, skip_to_seq))
				1745	break;
				1746
				1747	state->fack_count += tcp_skb_pcount(skb);
				1748	}
				1749	return skb;
				1750	}
				1751
				1752	static struct sk_buff tcp_maybe_skipping_dsack(struct sk_buff skb,
				1753	struct sock *sk,
				1754	struct tcp_sack_block *next_dup,
				1755	struct tcp_sacktag_state *state,
				1756	u32 skip_to_seq)
				1757	{
				1758	if (next_dup == NULL)
				1759	return skb;
				1760
				1761	if (before(next_dup->start_seq, skip_to_seq)) {
				1762	skb = tcp_sacktag_skip(skb, sk, state, next_dup->start_seq);
				1763	skb = tcp_sacktag_walk(skb, sk, NULL, state,
				1764	next_dup->start_seq, next_dup->end_seq,
				1765	1);
				1766	}
				1767
				1768	return skb;
				1769	}
				1770
				1771	static int tcp_sack_cache_ok(const struct tcp_sock tp, const struct tcp_sack_block cache)
				1772	{
				1773	return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
				1774	}
				1775
				1776	static int
				1777	tcp_sacktag_write_queue(struct sock sk, const struct sk_buff ack_skb,
				1778	u32 prior_snd_una)
				1779	{
				1780	const struct inet_connection_sock *icsk = inet_csk(sk);
				1781	struct tcp_sock *tp = tcp_sk(sk);
				1782	const unsigned char *ptr = (skb_transport_header(ack_skb) +
				1783	TCP_SKB_CB(ack_skb)->sacked);
				1784	struct tcp_sack_block_wire sp_wire = (struct tcp_sack_block_wire )(ptr+2);
				1785	struct tcp_sack_block sp[TCP_NUM_SACKS];
				1786	struct tcp_sack_block *cache;
				1787	struct tcp_sacktag_state state;
				1788	struct sk_buff *skb;
				1789	int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3);
				1790	int used_sacks;
				1791	int found_dup_sack = 0;
				1792	int i, j;
				1793	int first_sack_index;
				1794
				1795	state.flag = 0;
				1796	state.reord = tp->packets_out;
				1797
				1798	if (!tp->sacked_out) {
				1799	if (WARN_ON(tp->fackets_out))
				1800	tp->fackets_out = 0;
				1801	tcp_highest_sack_reset(sk);
				1802	}
				1803
				1804	found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
				1805	num_sacks, prior_snd_una);
				1806	if (found_dup_sack)
				1807	state.flag \|= FLAG_DSACKING_ACK;
				1808
				1809	/* Eliminate too old ACKs, but take into
				1810	* account more or less fresh ones, they can
				1811	* contain valid SACK info.
				1812	*/
				1813	if (before(TCP_SKB_CB(ack_skb)->ack_seq, prior_snd_una - tp->max_window))
				1814	return 0;
				1815
				1816	if (!tp->packets_out)
				1817	goto out;
				1818
				1819	used_sacks = 0;
				1820	first_sack_index = 0;
				1821	for (i = 0; i < num_sacks; i++) {
				1822	int dup_sack = !i && found_dup_sack;
				1823
				1824	sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq);
				1825	sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq);
				1826
				1827	if (!tcp_is_sackblock_valid(tp, dup_sack,
				1828	sp[used_sacks].start_seq,
				1829	sp[used_sacks].end_seq)) {
				1830	int mib_idx;
				1831
				1832	if (dup_sack) {
				1833	if (!tp->undo_marker)
				1834	mib_idx = LINUX_MIB_TCPDSACKIGNOREDNOUNDO;
				1835	else
				1836	mib_idx = LINUX_MIB_TCPDSACKIGNOREDOLD;
				1837	} else {
				1838	/* Don't count olds caused by ACK reordering */
				1839	if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) &&
				1840	!after(sp[used_sacks].end_seq, tp->snd_una))
				1841	continue;
				1842	mib_idx = LINUX_MIB_TCPSACKDISCARD;
				1843	}
				1844
				1845	NET_INC_STATS_BH(sock_net(sk), mib_idx);
				1846	if (i == 0)
				1847	first_sack_index = -1;
				1848	continue;
				1849	}
				1850
				1851	/* Ignore very old stuff early */
				1852	if (!after(sp[used_sacks].end_seq, prior_snd_una))
				1853	continue;
				1854
				1855	used_sacks++;
				1856	}
				1857
				1858	/* order SACK blocks to allow in order walk of the retrans queue */
				1859	for (i = used_sacks - 1; i > 0; i--) {
				1860	for (j = 0; j < i; j++) {
				1861	if (after(sp[j].start_seq, sp[j + 1].start_seq)) {
				1862	swap(sp[j], sp[j + 1]);
				1863
				1864	/* Track where the first SACK block goes to */
				1865	if (j == first_sack_index)
				1866	first_sack_index = j + 1;
				1867	}
				1868	}
				1869	}
				1870
				1871	skb = tcp_write_queue_head(sk);
				1872	state.fack_count = 0;
				1873	i = 0;
				1874
				1875	if (!tp->sacked_out) {
				1876	/* It's already past, so skip checking against it */
				1877	cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
				1878	} else {
				1879	cache = tp->recv_sack_cache;
				1880	/* Skip empty blocks in at head of the cache */
				1881	while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq &&
				1882	!cache->end_seq)
				1883	cache++;
				1884	}
				1885
				1886	while (i < used_sacks) {
				1887	u32 start_seq = sp[i].start_seq;
				1888	u32 end_seq = sp[i].end_seq;
				1889	int dup_sack = (found_dup_sack && (i == first_sack_index));
				1890	struct tcp_sack_block *next_dup = NULL;
				1891
				1892	if (found_dup_sack && ((i + 1) == first_sack_index))
				1893	next_dup = &sp[i + 1];
				1894
				1895	/* Skip too early cached blocks */
				1896	while (tcp_sack_cache_ok(tp, cache) &&
				1897	!before(start_seq, cache->end_seq))
				1898	cache++;
				1899
				1900	/* Can skip some work by looking recv_sack_cache? */
				1901	if (tcp_sack_cache_ok(tp, cache) && !dup_sack &&
				1902	after(end_seq, cache->start_seq)) {
				1903
				1904	/* Head todo? */
				1905	if (before(start_seq, cache->start_seq)) {
				1906	skb = tcp_sacktag_skip(skb, sk, &state,
				1907	start_seq);
				1908	skb = tcp_sacktag_walk(skb, sk, next_dup,
				1909	&state,
				1910	start_seq,
				1911	cache->start_seq,
				1912	dup_sack);
				1913	}
				1914
				1915	/* Rest of the block already fully processed? */
				1916	if (!after(end_seq, cache->end_seq))
				1917	goto advance_sp;
				1918
				1919	skb = tcp_maybe_skipping_dsack(skb, sk, next_dup,
				1920	&state,
				1921	cache->end_seq);
				1922
				1923	/* ...tail remains todo... */
				1924	if (tcp_highest_sack_seq(tp) == cache->end_seq) {
				1925	/* ...but better entrypoint exists! */
				1926	skb = tcp_highest_sack(sk);
				1927	if (skb == NULL)
				1928	break;
				1929	state.fack_count = tp->fackets_out;
				1930	cache++;
				1931	goto walk;
				1932	}
				1933
				1934	skb = tcp_sacktag_skip(skb, sk, &state, cache->end_seq);
				1935	/* Check overlap against next cached too (past this one already) */
				1936	cache++;
				1937	continue;
				1938	}
				1939
				1940	if (!before(start_seq, tcp_highest_sack_seq(tp))) {
				1941	skb = tcp_highest_sack(sk);
				1942	if (skb == NULL)
				1943	break;
				1944	state.fack_count = tp->fackets_out;
				1945	}
				1946	skb = tcp_sacktag_skip(skb, sk, &state, start_seq);
				1947
				1948	walk:
				1949	skb = tcp_sacktag_walk(skb, sk, next_dup, &state,
				1950	start_seq, end_seq, dup_sack);
				1951
				1952	advance_sp:
				1953	/* SACK enhanced FRTO (RFC4138, Appendix B): Clearing correct
				1954	* due to in-order walk
				1955	*/
				1956	if (after(end_seq, tp->frto_highmark))
				1957	state.flag &= ~FLAG_ONLY_ORIG_SACKED;
				1958
				1959	i++;
				1960	}
				1961
				1962	/* Clear the head of the cache sack blocks so we can skip it next time */
				1963	for (i = 0; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) {
				1964	tp->recv_sack_cache[i].start_seq = 0;
				1965	tp->recv_sack_cache[i].end_seq = 0;
				1966	}
				1967	for (j = 0; j < used_sacks; j++)
				1968	tp->recv_sack_cache[i++] = sp[j];
				1969
				1970	tcp_mark_lost_retrans(sk);
				1971
				1972	tcp_verify_left_out(tp);
				1973
				1974	if ((state.reord < tp->fackets_out) &&
				1975	((icsk->icsk_ca_state != TCP_CA_Loss) \|\| tp->undo_marker) &&
				1976	(!tp->frto_highmark \|\| after(tp->snd_una, tp->frto_highmark)))
				1977	tcp_update_reordering(sk, tp->fackets_out - state.reord, 0);
				1978
				1979	out:
				1980
				1981	#if FASTRETRANS_DEBUG > 0
				1982	WARN_ON((int)tp->sacked_out < 0);
				1983	WARN_ON((int)tp->lost_out < 0);
				1984	WARN_ON((int)tp->retrans_out < 0);
				1985	WARN_ON((int)tcp_packets_in_flight(tp) < 0);
				1986	#endif
				1987	return state.flag;
				1988	}
				1989
				1990	/* Limits sacked_out so that sum with lost_out isn't ever larger than
				1991	* packets_out. Returns zero if sacked_out adjustement wasn't necessary.
				1992	*/
				1993	static int tcp_limit_reno_sacked(struct tcp_sock *tp)
				1994	{
				1995	u32 holes;
				1996
				1997	holes = max(tp->lost_out, 1U);
				1998	holes = min(holes, tp->packets_out);
				1999
				2000	if ((tp->sacked_out + holes) > tp->packets_out) {
				2001	tp->sacked_out = tp->packets_out - holes;
				2002	return 1;
				2003	}
				2004	return 0;
				2005	}
				2006
				2007	/* If we receive more dupacks than we expected counting segments
				2008	* in assumption of absent reordering, interpret this as reordering.
				2009	* The only another reason could be bug in receiver TCP.
				2010	*/
				2011	static void tcp_check_reno_reordering(struct sock *sk, const int addend)
				2012	{
				2013	struct tcp_sock *tp = tcp_sk(sk);
				2014	if (tcp_limit_reno_sacked(tp))
				2015	tcp_update_reordering(sk, tp->packets_out + addend, 0);
				2016	}
				2017
				2018	/* Emulate SACKs for SACKless connection: account for a new dupack. */
				2019
				2020	static void tcp_add_reno_sack(struct sock *sk)
				2021	{
				2022	struct tcp_sock *tp = tcp_sk(sk);
				2023	tp->sacked_out++;
				2024	tcp_check_reno_reordering(sk, 0);
				2025	tcp_verify_left_out(tp);
				2026	}
				2027
				2028	/* Account for ACK, ACKing some data in Reno Recovery phase. */
				2029
				2030	static void tcp_remove_reno_sacks(struct sock *sk, int acked)
				2031	{
				2032	struct tcp_sock *tp = tcp_sk(sk);
				2033
				2034	if (acked > 0) {
				2035	/* One ACK acked hole. The rest eat duplicate ACKs. */
				2036	if (acked - 1 >= tp->sacked_out)
				2037	tp->sacked_out = 0;
				2038	else
				2039	tp->sacked_out -= acked - 1;
				2040	}
				2041	tcp_check_reno_reordering(sk, acked);
				2042	tcp_verify_left_out(tp);
				2043	}
				2044
				2045	static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
				2046	{
				2047	tp->sacked_out = 0;
				2048	}
				2049
				2050	static int tcp_is_sackfrto(const struct tcp_sock *tp)
				2051	{
				2052	return (sysctl_tcp_frto == 0x2) && !tcp_is_reno(tp);
				2053	}
				2054
				2055	/* F-RTO can only be used if TCP has never retransmitted anything other than
				2056	* head (SACK enhanced variant from Appendix B of RFC4138 is more robust here)
				2057	*/
				2058	int tcp_use_frto(struct sock *sk)
				2059	{
				2060	const struct tcp_sock *tp = tcp_sk(sk);
				2061	const struct inet_connection_sock *icsk = inet_csk(sk);
				2062	struct sk_buff *skb;
				2063
				2064	if (!sysctl_tcp_frto)
				2065	return 0;
				2066
				2067	/* MTU probe and F-RTO won't really play nicely along currently */
				2068	if (icsk->icsk_mtup.probe_size)
				2069	return 0;
				2070
				2071	if (tcp_is_sackfrto(tp))
				2072	return 1;
				2073
				2074	/* Avoid expensive walking of rexmit queue if possible */
				2075	if (tp->retrans_out > 1)
				2076	return 0;
				2077
				2078	skb = tcp_write_queue_head(sk);
				2079	if (tcp_skb_is_last(sk, skb))
				2080	return 1;
				2081	skb = tcp_write_queue_next(sk, skb); /* Skips head */
				2082	tcp_for_write_queue_from(skb, sk) {
				2083	if (skb == tcp_send_head(sk))
				2084	break;
				2085	if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
				2086	return 0;
				2087	/* Short-circuit when first non-SACKed skb has been checked */
				2088	if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
				2089	break;
				2090	}
				2091	return 1;
				2092	}
				2093
				2094	/* RTO occurred, but do not yet enter Loss state. Instead, defer RTO
				2095	* recovery a bit and use heuristics in tcp_process_frto() to detect if
				2096	* the RTO was spurious. Only clear SACKED_RETRANS of the head here to
				2097	* keep retrans_out counting accurate (with SACK F-RTO, other than head
				2098	* may still have that bit set); TCPCB_LOST and remaining SACKED_RETRANS
				2099	* bits are handled if the Loss state is really to be entered (in
				2100	* tcp_enter_frto_loss).
				2101	*
				2102	* Do like tcp_enter_loss() would; when RTO expires the second time it
				2103	* does:
				2104	* "Reduce ssthresh if it has not yet been made inside this window."
				2105	*/
				2106	void tcp_enter_frto(struct sock *sk)
				2107	{
				2108	const struct inet_connection_sock *icsk = inet_csk(sk);
				2109	struct tcp_sock *tp = tcp_sk(sk);
				2110	struct sk_buff *skb;
				2111
				2112	if ((!tp->frto_counter && icsk->icsk_ca_state <= TCP_CA_Disorder) \|\|
				2113	tp->snd_una == tp->high_seq \|\|
				2114	((icsk->icsk_ca_state == TCP_CA_Loss \|\| tp->frto_counter) &&
				2115	!icsk->icsk_retransmits)) {
				2116	tp->prior_ssthresh = tcp_current_ssthresh(sk);
				2117	/* Our state is too optimistic in ssthresh() call because cwnd
				2118	* is not reduced until tcp_enter_frto_loss() when previous F-RTO
				2119	* recovery has not yet completed. Pattern would be this: RTO,
				2120	* Cumulative ACK, RTO (2xRTO for the same segment does not end
				2121	* up here twice).
				2122	* RFC4138 should be more specific on what to do, even though
				2123	* RTO is quite unlikely to occur after the first Cumulative ACK
				2124	* due to back-off and complexity of triggering events ...
				2125	*/
				2126	if (tp->frto_counter) {
				2127	u32 stored_cwnd;
				2128	stored_cwnd = tp->snd_cwnd;
				2129	tp->snd_cwnd = 2;
				2130	tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
				2131	tp->snd_cwnd = stored_cwnd;
				2132	} else {
				2133	tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
				2134	}
				2135	/* ... in theory, cong.control module could do "any tricks" in
				2136	* ssthresh(), which means that ca_state, lost bits and lost_out
				2137	* counter would have to be faked before the call occurs. We
				2138	* consider that too expensive, unlikely and hacky, so modules
				2139	* using these in ssthresh() must deal these incompatibility
				2140	* issues if they receives CA_EVENT_FRTO and frto_counter != 0
				2141	*/
				2142	tcp_ca_event(sk, CA_EVENT_FRTO);
				2143	}
				2144
				2145	tp->undo_marker = tp->snd_una;
				2146	tp->undo_retrans = 0;
				2147
				2148	skb = tcp_write_queue_head(sk);
				2149	if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
				2150	tp->undo_marker = 0;
				2151	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
				2152	TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
				2153	tp->retrans_out -= tcp_skb_pcount(skb);
				2154	}
				2155	tcp_verify_left_out(tp);
				2156
				2157	/* Too bad if TCP was application limited */
				2158	tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1);
				2159
				2160	/* Earlier loss recovery underway (see RFC4138; Appendix B).
				2161	* The last condition is necessary at least in tp->frto_counter case.
				2162	*/
				2163	if (tcp_is_sackfrto(tp) && (tp->frto_counter \|\|
				2164	((1 << icsk->icsk_ca_state) & (TCPF_CA_Recovery\|TCPF_CA_Loss))) &&
				2165	after(tp->high_seq, tp->snd_una)) {
				2166	tp->frto_highmark = tp->high_seq;
				2167	} else {
				2168	tp->frto_highmark = tp->snd_nxt;
				2169	}
				2170	tcp_set_ca_state(sk, TCP_CA_Disorder);
				2171	tp->high_seq = tp->snd_nxt;
				2172	tp->frto_counter = 1;
				2173	}
				2174
				2175	/* Enter Loss state after F-RTO was applied. Dupack arrived after RTO,
				2176	* which indicates that we should follow the traditional RTO recovery,
				2177	* i.e. mark everything lost and do go-back-N retransmission.
				2178	*/
				2179	static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
				2180	{
				2181	struct tcp_sock *tp = tcp_sk(sk);
				2182	struct sk_buff *skb;
				2183
				2184	tp->lost_out = 0;
				2185	tp->retrans_out = 0;
				2186	if (tcp_is_reno(tp))
				2187	tcp_reset_reno_sack(tp);
				2188
				2189	tcp_for_write_queue(skb, sk) {
				2190	if (skb == tcp_send_head(sk))
				2191	break;
				2192
				2193	TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
				2194	/*
				2195	* Count the retransmission made on RTO correctly (only when
				2196	* waiting for the first ACK and did not get it)...
				2197	*/
				2198	if ((tp->frto_counter == 1) && !(flag & FLAG_DATA_ACKED)) {
				2199	/* For some reason this R-bit might get cleared? */
				2200	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
				2201	tp->retrans_out += tcp_skb_pcount(skb);
				2202	/* ...enter this if branch just for the first segment */
				2203	flag \|= FLAG_DATA_ACKED;
				2204	} else {
				2205	if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
				2206	tp->undo_marker = 0;
				2207	TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
				2208	}
				2209
				2210	/* Marking forward transmissions that were made after RTO lost
				2211	* can cause unnecessary retransmissions in some scenarios,
				2212	* SACK blocks will mitigate that in some but not in all cases.
				2213	* We used to not mark them but it was causing break-ups with
				2214	* receivers that do only in-order receival.
				2215	*
				2216	* TODO: we could detect presence of such receiver and select
				2217	* different behavior per flow.
				2218	*/
				2219	if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
				2220	TCP_SKB_CB(skb)->sacked \|= TCPCB_LOST;
				2221	tp->lost_out += tcp_skb_pcount(skb);
				2222	tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
				2223	}
				2224	}
				2225	tcp_verify_left_out(tp);
				2226
				2227	tp->snd_cwnd = tcp_packets_in_flight(tp) + allowed_segments;
				2228	tp->snd_cwnd_cnt = 0;
				2229	tp->snd_cwnd_stamp = tcp_time_stamp;
				2230	tp->frto_counter = 0;
				2231	tp->bytes_acked = 0;
				2232
				2233	tp->reordering = min_t(unsigned int, tp->reordering,
				2234	sysctl_tcp_reordering);
				2235	tcp_set_ca_state(sk, TCP_CA_Loss);
				2236	tp->high_seq = tp->snd_nxt;
				2237	TCP_ECN_queue_cwr(tp);
				2238
				2239	tcp_clear_all_retrans_hints(tp);
				2240	}
				2241
				2242	static void tcp_clear_retrans_partial(struct tcp_sock *tp)
				2243	{
				2244	tp->retrans_out = 0;
				2245	tp->lost_out = 0;
				2246
				2247	tp->undo_marker = 0;
				2248	tp->undo_retrans = -1;
				2249	}
				2250
				2251	void tcp_clear_retrans(struct tcp_sock *tp)
				2252	{
				2253	tcp_clear_retrans_partial(tp);
				2254
				2255	tp->fackets_out = 0;
				2256	tp->sacked_out = 0;
				2257	}
				2258
				2259	/* Enter Loss state. If "how" is not zero, forget all SACK information
				2260	* and reset tags completely, otherwise preserve SACKs. If receiver
				2261	* dropped its ofo queue, we will know this due to reneging detection.
				2262	*/
				2263	void tcp_enter_loss(struct sock *sk, int how)
				2264	{
				2265	const struct inet_connection_sock *icsk = inet_csk(sk);
				2266	struct tcp_sock *tp = tcp_sk(sk);
				2267	struct sk_buff *skb;
				2268
				2269	/* Reduce ssthresh if it has not yet been made inside this window. */
				2270	if (icsk->icsk_ca_state <= TCP_CA_Disorder \|\| tp->snd_una == tp->high_seq \|\|
				2271	(icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
				2272	tp->prior_ssthresh = tcp_current_ssthresh(sk);
				2273	tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
				2274	tcp_ca_event(sk, CA_EVENT_LOSS);
				2275	}
				2276	tp->snd_cwnd = 1;
				2277	tp->snd_cwnd_cnt = 0;
				2278	tp->snd_cwnd_stamp = tcp_time_stamp;
				2279
				2280	tp->bytes_acked = 0;
				2281	tcp_clear_retrans_partial(tp);
				2282
				2283	if (tcp_is_reno(tp))
				2284	tcp_reset_reno_sack(tp);
				2285
				2286	tp->undo_marker = tp->snd_una;
				2287	if (how) {
				2288	tp->sacked_out = 0;
				2289	tp->fackets_out = 0;
				2290	}
				2291	tcp_clear_all_retrans_hints(tp);
				2292
				2293	tcp_for_write_queue(skb, sk) {
				2294	if (skb == tcp_send_head(sk))
				2295	break;
				2296
				2297	if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
				2298	tp->undo_marker = 0;
				2299	TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)\|TCPCB_SACKED_ACKED;
				2300	if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) \|\| how) {
				2301	TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
				2302	TCP_SKB_CB(skb)->sacked \|= TCPCB_LOST;
				2303	tp->lost_out += tcp_skb_pcount(skb);
				2304	tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
				2305	}
				2306	}
				2307	tcp_verify_left_out(tp);
				2308
				2309	tp->reordering = min_t(unsigned int, tp->reordering,
				2310	sysctl_tcp_reordering);
				2311	tcp_set_ca_state(sk, TCP_CA_Loss);
				2312	tp->high_seq = tp->snd_nxt;
				2313	TCP_ECN_queue_cwr(tp);
				2314	/* Abort F-RTO algorithm if one is in progress */
				2315	tp->frto_counter = 0;
				2316	}
				2317
				2318	/* If ACK arrived pointing to a remembered SACK, it means that our
				2319	* remembered SACKs do not reflect real state of receiver i.e.
				2320	* receiver _host_ is heavily congested (or buggy).
				2321	*
				2322	* Do processing similar to RTO timeout.
				2323	*/
				2324	static int tcp_check_sack_reneging(struct sock *sk, int flag)
				2325	{
				2326	if (flag & FLAG_SACK_RENEGING) {
				2327	struct inet_connection_sock *icsk = inet_csk(sk);
				2328	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
				2329
				2330	tcp_enter_loss(sk, 1);
				2331	icsk->icsk_retransmits++;
				2332	tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
				2333	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
				2334	icsk->icsk_rto, TCP_RTO_MAX);
				2335	return 1;
				2336	}
				2337	return 0;
				2338	}
				2339
				2340	static inline int tcp_fackets_out(const struct tcp_sock *tp)
				2341	{
				2342	return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out;
				2343	}
				2344
				2345	/* Heurestics to calculate number of duplicate ACKs. There's no dupACKs
				2346	* counter when SACK is enabled (without SACK, sacked_out is used for
				2347	* that purpose).
				2348	*
				2349	* Instead, with FACK TCP uses fackets_out that includes both SACKed
				2350	* segments up to the highest received SACK block so far and holes in
				2351	* between them.
				2352	*
				2353	* With reordering, holes may still be in flight, so RFC3517 recovery
				2354	* uses pure sacked_out (total number of SACKed segments) even though
				2355	* it violates the RFC that uses duplicate ACKs, often these are equal
				2356	* but when e.g. out-of-window ACKs or packet duplication occurs,
				2357	* they differ. Since neither occurs due to loss, TCP should really
				2358	* ignore them.
				2359	*/
				2360	static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
				2361	{
				2362	return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
				2363	}
				2364
				2365	static inline int tcp_skb_timedout(const struct sock *sk,
				2366	const struct sk_buff *skb)
				2367	{
				2368	return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto;
				2369	}
				2370
				2371	static inline int tcp_head_timedout(const struct sock *sk)
				2372	{
				2373	const struct tcp_sock *tp = tcp_sk(sk);
				2374
				2375	return tp->packets_out &&
				2376	tcp_skb_timedout(sk, tcp_write_queue_head(sk));
				2377	}
				2378
				2379	/* Linux NewReno/SACK/FACK/ECN state machine.
				2380	* --------------------------------------
				2381	*
				2382	* "Open" Normal state, no dubious events, fast path.
				2383	* "Disorder" In all the respects it is "Open",
				2384	* but requires a bit more attention. It is entered when
				2385	* we see some SACKs or dupacks. It is split of "Open"
				2386	* mainly to move some processing from fast path to slow one.
				2387	* "CWR" CWND was reduced due to some Congestion Notification event.
				2388	* It can be ECN, ICMP source quench, local device congestion.
				2389	* "Recovery" CWND was reduced, we are fast-retransmitting.
				2390	* "Loss" CWND was reduced due to RTO timeout or SACK reneging.
				2391	*
				2392	* tcp_fastretrans_alert() is entered:
				2393	* - each incoming ACK, if state is not "Open"
				2394	* - when arrived ACK is unusual, namely:
				2395	* * SACK
				2396	* * Duplicate ACK.
				2397	* * ECN ECE.
				2398	*
				2399	* Counting packets in flight is pretty simple.
				2400	*
				2401	* in_flight = packets_out - left_out + retrans_out
				2402	*
				2403	* packets_out is SND.NXT-SND.UNA counted in packets.
				2404	*
				2405	* retrans_out is number of retransmitted segments.
				2406	*
				2407	* left_out is the number of segments that have left the network but are not ACKed yet.
				2408	*
				2409	* left_out = sacked_out + lost_out
				2410	*
				2411	* sacked_out: Packets, which arrived to receiver out of order
				2412	* and hence not ACKed. With SACKs this number is simply
				2413	* amount of SACKed data. Even without SACKs
				2414	* it is easy to give pretty reliable estimate of this number,
				2415	* counting duplicate ACKs.
				2416	*
				2417	* lost_out: Packets lost by network. TCP has no explicit
				2418	* "loss notification" feedback from network (for now).
				2419	* It means that this number can be only _guessed_.
				2420	* Actually, it is the heuristics to predict lossage that
				2421	* distinguishes different algorithms.
				2422	*
				2423	* F.e. after RTO, when all the queue is considered as lost,
				2424	* lost_out = packets_out and in_flight = retrans_out.
				2425	*
				2426	* Essentially, we have now two algorithms counting
				2427	* lost packets.
				2428	*
				2429	* FACK: It is the simplest heuristics. As soon as we decided
				2430	* that something is lost, we decide that _all_ not SACKed
				2431	* packets until the most forward SACK are lost. I.e.
				2432	* lost_out = fackets_out - sacked_out and left_out = fackets_out.
				2433	* It is absolutely correct estimate, if network does not reorder
				2434	* packets. And it loses any connection to reality when reordering
				2435	* takes place. We use FACK by default until reordering
				2436	* is suspected on the path to this destination.
				2437	*
				2438	* NewReno: when Recovery is entered, we assume that one segment
				2439	* is lost (classic Reno). While we are in Recovery and
				2440	* a partial ACK arrives, we assume that one more packet
				2441	* is lost (NewReno). These heuristics are the same in NewReno
				2442	* and SACK.
				2443	*
				2444	* Imagine, that's all! Forget about all this shamanism about CWND inflation
				2445	* deflation etc. CWND is real congestion window, never inflated, changes
				2446	* only according to classic VJ rules.
				2447	*
				2448	* Really tricky (and requiring careful tuning) part of algorithm
				2449	* is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
				2450	* The first determines the moment _when_ we should reduce CWND and,
				2451	* hence, slow down forward transmission. In fact, it determines the moment
				2452	* when we decide that hole is caused by loss, rather than by a reorder.
				2453	*
				2454	* tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill
				2455	* holes, caused by lost packets.
				2456	*
				2457	* And the most logically complicated part of algorithm is undo
				2458	* heuristics. We detect false retransmits due to both too early
				2459	* fast retransmit (reordering) and underestimated RTO, analyzing
				2460	* timestamps and D-SACKs. When we detect that some segments were
				2461	* retransmitted by mistake and CWND reduction was wrong, we undo
				2462	* window reduction and abort recovery phase. This logic is hidden
				2463	* inside several functions named tcp_try_undo_<something>.
				2464	*/
				2465
				2466	/* This function decides, when we should leave Disordered state
				2467	* and enter Recovery phase, reducing congestion window.
				2468	*
				2469	* Main question: may we further continue forward transmission
				2470	* with the same cwnd?
				2471	*/
				2472	static int tcp_time_to_recover(struct sock *sk)
				2473	{
				2474	struct tcp_sock *tp = tcp_sk(sk);
				2475	__u32 packets_out;
				2476
				2477	/* Do not perform any recovery during F-RTO algorithm */
				2478	if (tp->frto_counter)
				2479	return 0;
				2480
				2481	/* Trick#1: The loss is proven. */
				2482	if (tp->lost_out)
				2483	return 1;
				2484
				2485	/* Not-A-Trick#2 : Classic rule... */
				2486	if (tcp_dupack_heuristics(tp) > tp->reordering)
				2487	return 1;
				2488
				2489	/* Trick#3 : when we use RFC2988 timer restart, fast
				2490	* retransmit can be triggered by timeout of queue head.
				2491	*/
				2492	if (tcp_is_fack(tp) && tcp_head_timedout(sk))
				2493	return 1;
				2494
				2495	/* Trick#4: It is still not OK... But will it be useful to delay
				2496	* recovery more?
				2497	*/
				2498	packets_out = tp->packets_out;
				2499	if (packets_out <= tp->reordering &&
				2500	tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) &&
				2501	!tcp_may_send_now(sk)) {
				2502	/* We have nothing to send. This connection is limited
				2503	* either by receiver window or by application.
				2504	*/
				2505	return 1;
				2506	}
				2507
				2508	/* If a thin stream is detected, retransmit after first
				2509	* received dupack. Employ only if SACK is supported in order
				2510	* to avoid possible corner-case series of spurious retransmissions
				2511	* Use only if there are no unsent data.
				2512	*/
				2513	if ((tp->thin_dupack \|\| sysctl_tcp_thin_dupack) &&
				2514	tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
				2515	tcp_is_sack(tp) && !tcp_send_head(sk))
				2516	return 1;
				2517
				2518	return 0;
				2519	}
				2520
				2521	/* New heuristics: it is possible only after we switched to restart timer
				2522	* each time when something is ACKed. Hence, we can detect timed out packets
				2523	* during fast retransmit without falling to slow start.
				2524	*
				2525	* Usefulness of this as is very questionable, since we should know which of
				2526	* the segments is the next to timeout which is relatively expensive to find
				2527	* in general case unless we add some data structure just for that. The
				2528	* current approach certainly won't find the right one too often and when it
				2529	* finally does find _something_ it usually marks large part of the window
				2530	* right away (because a retransmission with a larger timestamp blocks the
				2531	* loop from advancing). -ij
				2532	*/
				2533	static void tcp_timeout_skbs(struct sock *sk)
				2534	{
				2535	struct tcp_sock *tp = tcp_sk(sk);
				2536	struct sk_buff *skb;
				2537
				2538	if (!tcp_is_fack(tp) \|\| !tcp_head_timedout(sk))
				2539	return;
				2540
				2541	skb = tp->scoreboard_skb_hint;
				2542	if (tp->scoreboard_skb_hint == NULL)
				2543	skb = tcp_write_queue_head(sk);
				2544
				2545	tcp_for_write_queue_from(skb, sk) {
				2546	if (skb == tcp_send_head(sk))
				2547	break;
				2548	if (!tcp_skb_timedout(sk, skb))
				2549	break;
				2550
				2551	tcp_skb_mark_lost(tp, skb);
				2552	}
				2553
				2554	tp->scoreboard_skb_hint = skb;
				2555
				2556	tcp_verify_left_out(tp);
				2557	}
				2558
				2559	/* Detect loss in event "A" above by marking head of queue up as lost.
				2560	* For FACK or non-SACK(Reno) senders, the first "packets" number of segments
				2561	* are considered lost. For RFC3517 SACK, a segment is considered lost if it
				2562	* has at least tp->reordering SACKed seqments above it; "packets" refers to
				2563	* the maximum SACKed segments to pass before reaching this limit.
				2564	*/
				2565	static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
				2566	{
				2567	struct tcp_sock *tp = tcp_sk(sk);
				2568	struct sk_buff *skb;
				2569	int cnt, oldcnt;
				2570	int err;
				2571	unsigned int mss;
				2572	/* Use SACK to deduce losses of new sequences sent during recovery */
				2573	const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq;
				2574
				2575	WARN_ON(packets > tp->packets_out);
				2576	if (tp->lost_skb_hint) {
				2577	skb = tp->lost_skb_hint;
				2578	cnt = tp->lost_cnt_hint;
				2579	/* Head already handled? */
				2580	if (mark_head && skb != tcp_write_queue_head(sk))
				2581	return;
				2582	} else {
				2583	skb = tcp_write_queue_head(sk);
				2584	cnt = 0;
				2585	}
				2586
				2587	tcp_for_write_queue_from(skb, sk) {
				2588	if (skb == tcp_send_head(sk))
				2589	break;
				2590	/* TODO: do this better */
				2591	/* this is not the most efficient way to do this... */
				2592	tp->lost_skb_hint = skb;
				2593	tp->lost_cnt_hint = cnt;
				2594
				2595	if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
				2596	break;
				2597
				2598	oldcnt = cnt;
				2599	if (tcp_is_fack(tp) \|\| tcp_is_reno(tp) \|\|
				2600	(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
				2601	cnt += tcp_skb_pcount(skb);
				2602
				2603	if (cnt > packets) {
				2604	if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) \|\|
				2605	(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) \|\|
				2606	(oldcnt >= packets))
				2607	break;
				2608
				2609	mss = skb_shinfo(skb)->gso_size;
				2610	err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, mss);
				2611	if (err < 0)
				2612	break;
				2613	cnt = packets;
				2614	}
				2615
				2616	tcp_skb_mark_lost(tp, skb);
				2617
				2618	if (mark_head)
				2619	break;
				2620	}
				2621	tcp_verify_left_out(tp);
				2622	}
				2623
				2624	/* Account newly detected lost packet(s) */
				2625
				2626	static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
				2627	{
				2628	struct tcp_sock *tp = tcp_sk(sk);
				2629
				2630	if (tcp_is_reno(tp)) {
				2631	tcp_mark_head_lost(sk, 1, 1);
				2632	} else if (tcp_is_fack(tp)) {
				2633	int lost = tp->fackets_out - tp->reordering;
				2634	if (lost <= 0)
				2635	lost = 1;
				2636	tcp_mark_head_lost(sk, lost, 0);
				2637	} else {
				2638	int sacked_upto = tp->sacked_out - tp->reordering;
				2639	if (sacked_upto >= 0)
				2640	tcp_mark_head_lost(sk, sacked_upto, 0);
				2641	else if (fast_rexmit)
				2642	tcp_mark_head_lost(sk, 1, 1);
				2643	}
				2644
				2645	tcp_timeout_skbs(sk);
				2646	}
				2647
				2648	/* CWND moderation, preventing bursts due to too big ACKs
				2649	* in dubious situations.
				2650	*/
				2651	static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
				2652	{
				2653	tp->snd_cwnd = min(tp->snd_cwnd,
				2654	tcp_packets_in_flight(tp) + tcp_max_burst(tp));
				2655	tp->snd_cwnd_stamp = tcp_time_stamp;
				2656	}
				2657
				2658	/* Lower bound on congestion window is slow start threshold
				2659	* unless congestion avoidance choice decides to overide it.
				2660	*/
				2661	static inline u32 tcp_cwnd_min(const struct sock *sk)
				2662	{
				2663	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
				2664
				2665	return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh;
				2666	}
				2667
				2668	/* Decrease cwnd each second ack. */
				2669	static void tcp_cwnd_down(struct sock *sk, int flag)
				2670	{
				2671	struct tcp_sock *tp = tcp_sk(sk);
				2672	int decr = tp->snd_cwnd_cnt + 1;
				2673
				2674	if ((flag & (FLAG_ANY_PROGRESS \| FLAG_DSACKING_ACK)) \|\|
				2675	(tcp_is_reno(tp) && !(flag & FLAG_NOT_DUP))) {
				2676	tp->snd_cwnd_cnt = decr & 1;
				2677	decr >>= 1;
				2678
				2679	if (decr && tp->snd_cwnd > tcp_cwnd_min(sk))
				2680	tp->snd_cwnd -= decr;
				2681
				2682	tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1);
				2683	tp->snd_cwnd_stamp = tcp_time_stamp;
				2684	}
				2685	}
				2686
				2687	/* Nothing was retransmitted or returned timestamp is less
				2688	* than timestamp of the first retransmission.
				2689	*/
				2690	static inline int tcp_packet_delayed(const struct tcp_sock *tp)
				2691	{
				2692	return !tp->retrans_stamp \|\|
				2693	(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
				2694	before(tp->rx_opt.rcv_tsecr, tp->retrans_stamp));
				2695	}
				2696
				2697	/* Undo procedures. */
				2698
				#if FASTRETRANS_DEBUG > 1
				/* Debug trace for the undo paths: prints @msg together with the peer
				 * address/port, snd_cwnd, tcp_left_out(), current and prior ssthresh and
				 * packets_out. Compiled in only when FASTRETRANS_DEBUG > 1; otherwise
				 * DBGUNDO() expands to an empty statement.
				 *
				 * @sk:  socket being traced (must be a pointer: the body dereferences it)
				 * @msg: short tag naming the undo path, printed with %s
				 */
				static void DBGUNDO(struct sock *sk, const char *msg)
				{
					struct tcp_sock *tp = tcp_sk(sk);
					struct inet_sock *inet = inet_sk(sk);

					if (sk->sk_family == AF_INET) {
						printk(KERN_DEBUG "Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
						       msg,
						       &inet->inet_daddr, ntohs(inet->inet_dport),
						       tp->snd_cwnd, tcp_left_out(tp),
						       tp->snd_ssthresh, tp->prior_ssthresh,
						       tp->packets_out);
					}
				#if IS_ENABLED(CONFIG_IPV6)
					else if (sk->sk_family == AF_INET6) {
						struct ipv6_pinfo *np = inet6_sk(sk);
						printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
						       msg,
						       &np->daddr, ntohs(inet->inet_dport),
						       tp->snd_cwnd, tcp_left_out(tp),
						       tp->snd_ssthresh, tp->prior_ssthresh,
						       tp->packets_out);
					}
				#endif
				}
				#else
				#define DBGUNDO(x...) do { } while (0)
				#endif
				2728
				2729	static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh)
				2730	{
				2731	struct tcp_sock *tp = tcp_sk(sk);
				2732
				2733	if (tp->prior_ssthresh) {
				2734	const struct inet_connection_sock *icsk = inet_csk(sk);
				2735
				2736	if (icsk->icsk_ca_ops->undo_cwnd)
				2737	tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
				2738	else
				2739	tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1);
				2740
				2741	if (undo_ssthresh && tp->prior_ssthresh > tp->snd_ssthresh) {
				2742	tp->snd_ssthresh = tp->prior_ssthresh;
				2743	TCP_ECN_withdraw_cwr(tp);
				2744	}
				2745	} else {
				2746	tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
				2747	}
				2748	tp->snd_cwnd_stamp = tcp_time_stamp;
				2749	}
				2750
				2751	static inline int tcp_may_undo(const struct tcp_sock *tp)
				2752	{
				2753	return tp->undo_marker && (!tp->undo_retrans \|\| tcp_packet_delayed(tp));
				2754	}
				2755
				2756	/* People celebrate: "We love our President!" */
				2757	static int tcp_try_undo_recovery(struct sock *sk)
				2758	{
				2759	struct tcp_sock *tp = tcp_sk(sk);
				2760
				2761	if (tcp_may_undo(tp)) {
				2762	int mib_idx;
				2763
				2764	/* Happy end! We did not retransmit anything
				2765	* or our original transmission succeeded.
				2766	*/
				2767	DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
				2768	tcp_undo_cwr(sk, true);
				2769	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
				2770	mib_idx = LINUX_MIB_TCPLOSSUNDO;
				2771	else
				2772	mib_idx = LINUX_MIB_TCPFULLUNDO;
				2773
				2774	NET_INC_STATS_BH(sock_net(sk), mib_idx);
				2775	tp->undo_marker = 0;
				2776	}
				2777	if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
				2778	/* Hold old state until something above high_seq
				2779	* is ACKed. For Reno it is MUST to prevent false
				2780	* fast retransmits (RFC2582). SACK TCP is safe. */
				2781	tcp_moderate_cwnd(tp);
				2782	return 1;
				2783	}
				2784	tcp_set_ca_state(sk, TCP_CA_Open);
				2785	return 0;
				2786	}
				2787
				2788	/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */
				2789	static void tcp_try_undo_dsack(struct sock *sk)
				2790	{
				2791	struct tcp_sock *tp = tcp_sk(sk);
				2792
				2793	if (tp->undo_marker && !tp->undo_retrans) {
				2794	DBGUNDO(sk, "D-SACK");
				2795	tcp_undo_cwr(sk, true);
				2796	tp->undo_marker = 0;
				2797	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
				2798	}
				2799	}
				2800
				2801	/* We can clear retrans_stamp when there are no retransmissions in the
				2802	* window. It would seem that it is trivially available for us in
				2803	* tp->retrans_out, however, that kind of assumptions doesn't consider
				2804	* what will happen if errors occur when sending retransmission for the
				2805	* second time. ...It could the that such segment has only
				2806	* TCPCB_EVER_RETRANS set at the present time. It seems that checking
				2807	* the head skb is enough except for some reneging corner cases that
				2808	* are not worth the effort.
				2809	*
				2810	* Main reason for all this complexity is the fact that connection dying
				2811	* time now depends on the validity of the retrans_stamp, in particular,
				2812	* that successive retransmissions of a segment must not advance
				2813	* retrans_stamp under any conditions.
				2814	*/
				2815	static int tcp_any_retrans_done(const struct sock *sk)
				2816	{
				2817	const struct tcp_sock *tp = tcp_sk(sk);
				2818	struct sk_buff *skb;
				2819
				2820	if (tp->retrans_out)
				2821	return 1;
				2822
				2823	skb = tcp_write_queue_head(sk);
				2824	if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
				2825	return 1;
				2826
				2827	return 0;
				2828	}
				2829
				2830	/* Undo during fast recovery after partial ACK. */
				2831
				2832	static int tcp_try_undo_partial(struct sock *sk, int acked)
				2833	{
				2834	struct tcp_sock *tp = tcp_sk(sk);
				2835	/* Partial ACK arrived. Force Hoe's retransmit. */
				2836	int failed = tcp_is_reno(tp) \|\| (tcp_fackets_out(tp) > tp->reordering);
				2837
				2838	if (tcp_may_undo(tp)) {
				2839	/* Plain luck! Hole if filled with delayed
				2840	* packet, rather than with a retransmit.
				2841	*/
				2842	if (!tcp_any_retrans_done(sk))
				2843	tp->retrans_stamp = 0;
				2844
				2845	tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
				2846
				2847	DBGUNDO(sk, "Hoe");
				2848	tcp_undo_cwr(sk, false);
				2849	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
				2850
				2851	/* So... Do not make Hoe's retransmit yet.
				2852	* If the first packet was delayed, the rest
				2853	* ones are most probably delayed as well.
				2854	*/
				2855	failed = 0;
				2856	}
				2857	return failed;
				2858	}
				2859
				2860	/* Undo during loss recovery after partial ACK. */
				2861	static int tcp_try_undo_loss(struct sock *sk)
				2862	{
				2863	struct tcp_sock *tp = tcp_sk(sk);
				2864
				2865	if (tcp_may_undo(tp)) {
				2866	struct sk_buff *skb;
				2867	tcp_for_write_queue(skb, sk) {
				2868	if (skb == tcp_send_head(sk))
				2869	break;
				2870	TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
				2871	}
				2872
				2873	tcp_clear_all_retrans_hints(tp);
				2874
				2875	DBGUNDO(sk, "partial loss");
				2876	tp->lost_out = 0;
				2877	tcp_undo_cwr(sk, true);
				2878	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
				2879	inet_csk(sk)->icsk_retransmits = 0;
				2880	tp->undo_marker = 0;
				2881	if (tcp_is_sack(tp))
				2882	tcp_set_ca_state(sk, TCP_CA_Open);
				2883	return 1;
				2884	}
				2885	return 0;
				2886	}
				2887
				2888	static inline void tcp_complete_cwr(struct sock *sk)
				2889	{
				2890	struct tcp_sock *tp = tcp_sk(sk);
				2891
				2892	/* Do not moderate cwnd if it's already undone in cwr or recovery. */
				2893	if (tp->undo_marker) {
				2894	if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) {
				2895	tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
				2896	tp->snd_cwnd_stamp = tcp_time_stamp;
				2897	} else if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH) {
				2898	/* PRR algorithm. */
				2899	tp->snd_cwnd = tp->snd_ssthresh;
				2900	tp->snd_cwnd_stamp = tcp_time_stamp;
				2901	}
				2902	}
				2903	tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
				2904	}
				2905
				2906	static void tcp_try_keep_open(struct sock *sk)
				2907	{
				2908	struct tcp_sock *tp = tcp_sk(sk);
				2909	int state = TCP_CA_Open;
				2910
				2911	if (tcp_left_out(tp) \|\| tcp_any_retrans_done(sk))
				2912	state = TCP_CA_Disorder;
				2913
				2914	if (inet_csk(sk)->icsk_ca_state != state) {
				2915	tcp_set_ca_state(sk, state);
				2916	tp->high_seq = tp->snd_nxt;
				2917	}
				2918	}
				2919
				2920	static void tcp_try_to_open(struct sock *sk, int flag)
				2921	{
				2922	struct tcp_sock *tp = tcp_sk(sk);
				2923
				2924	tcp_verify_left_out(tp);
				2925
				2926	if (!tp->frto_counter && !tcp_any_retrans_done(sk))
				2927	tp->retrans_stamp = 0;
				2928
				2929	if (flag & FLAG_ECE)
				2930	tcp_enter_cwr(sk, 1);
				2931
				2932	if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
				2933	tcp_try_keep_open(sk);
				2934	if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
				2935	tcp_moderate_cwnd(tp);
				2936	} else {
				2937	tcp_cwnd_down(sk, flag);
				2938	}
				2939	}
				2940
				2941	static void tcp_mtup_probe_failed(struct sock *sk)
				2942	{
				2943	struct inet_connection_sock *icsk = inet_csk(sk);
				2944
				2945	icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
				2946	icsk->icsk_mtup.probe_size = 0;
				2947	}
				2948
				2949	static void tcp_mtup_probe_success(struct sock *sk)
				2950	{
				2951	struct tcp_sock *tp = tcp_sk(sk);
				2952	struct inet_connection_sock *icsk = inet_csk(sk);
				2953
				2954	/* FIXME: breaks with very large cwnd */
				2955	tp->prior_ssthresh = tcp_current_ssthresh(sk);
				2956	tp->snd_cwnd = tp->snd_cwnd *
				2957	tcp_mss_to_mtu(sk, tp->mss_cache) /
				2958	icsk->icsk_mtup.probe_size;
				2959	tp->snd_cwnd_cnt = 0;
				2960	tp->snd_cwnd_stamp = tcp_time_stamp;
				2961	tp->snd_ssthresh = tcp_current_ssthresh(sk);
				2962
				2963	icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
				2964	icsk->icsk_mtup.probe_size = 0;
				2965	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
				2966	}
				2967
				2968	/* Do a simple retransmit without using the backoff mechanisms in
				2969	* tcp_timer. This is used for path mtu discovery.
				2970	* The socket is already locked here.
				2971	*/
				2972	void tcp_simple_retransmit(struct sock *sk)
				2973	{
				2974	const struct inet_connection_sock *icsk = inet_csk(sk);
				2975	struct tcp_sock *tp = tcp_sk(sk);
				2976	struct sk_buff *skb;
				2977	unsigned int mss = tcp_current_mss(sk);
				2978	u32 prior_lost = tp->lost_out;
				2979
				2980	tcp_for_write_queue(skb, sk) {
				2981	if (skb == tcp_send_head(sk))
				2982	break;
				2983	if (tcp_skb_seglen(skb) > mss &&
				2984	!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
				2985	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
				2986	TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
				2987	tp->retrans_out -= tcp_skb_pcount(skb);
				2988	}
				2989	tcp_skb_mark_lost_uncond_verify(tp, skb);
				2990	}
				2991	}
				2992
				2993	tcp_clear_retrans_hints_partial(tp);
				2994
				2995	if (prior_lost == tp->lost_out)
				2996	return;
				2997
				2998	if (tcp_is_reno(tp))
				2999	tcp_limit_reno_sacked(tp);
				3000
				3001	tcp_verify_left_out(tp);
				3002
				3003	/* Don't muck with the congestion window here.
				3004	* Reason is that we do not increase amount of _data_
				3005	* in network, but units changed and effective
				3006	* cwnd/ssthresh really reduced now.
				3007	*/
				3008	if (icsk->icsk_ca_state != TCP_CA_Loss) {
				3009	tp->high_seq = tp->snd_nxt;
				3010	tp->snd_ssthresh = tcp_current_ssthresh(sk);
				3011	tp->prior_ssthresh = 0;
				3012	tp->undo_marker = 0;
				3013	tcp_set_ca_state(sk, TCP_CA_Loss);
				3014	}
				3015	tcp_xmit_retransmit_queue(sk);
				3016	}
				3017	EXPORT_SYMBOL(tcp_simple_retransmit);
				3018
				3019	/* This function implements the PRR algorithm, specifcally the PRR-SSRB
				3020	* (proportional rate reduction with slow start reduction bound) as described in
				3021	* http://www.ietf.org/id/draft-mathis-tcpm-proportional-rate-reduction-01.txt.
				3022	* It computes the number of packets to send (sndcnt) based on packets newly
				3023	* delivered:
				3024	* 1) If the packets in flight is larger than ssthresh, PRR spreads the
				3025	* cwnd reductions across a full RTT.
				3026	* 2) If packets in flight is lower than ssthresh (such as due to excess
				3027	* losses and/or application stalls), do not perform any further cwnd
				3028	* reductions, but instead slow start up to ssthresh.
				3029	*/
				3030	static void tcp_update_cwnd_in_recovery(struct sock *sk, int newly_acked_sacked,
				3031	int fast_rexmit, int flag)
				3032	{
				3033	struct tcp_sock *tp = tcp_sk(sk);
				3034	int sndcnt = 0;
				3035	int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
				3036	if (newly_acked_sacked <= 0 \|\| WARN_ON_ONCE(!tp->prior_cwnd))
				3037	return;
				3038
				3039	if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) {
				3040	u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
				3041	tp->prior_cwnd - 1;
				3042	sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
				3043	} else {
				3044	sndcnt = min_t(int, delta,
				3045	max_t(int, tp->prr_delivered - tp->prr_out,
				3046	newly_acked_sacked) + 1);
				3047	}
				3048
				3049	sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
				3050	tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
				3051	}
				3052
				3053	/* Process an event, which can update packets-in-flight not trivially.
				3054	* Main goal of this function is to calculate new estimate for left_out,
				3055	* taking into account both packets sitting in receiver's buffer and
				3056	* packets lost by network.
				3057	*
				3058	* Besides that it does CWND reduction, when packet loss is detected
				3059	* and changes state of machine.
				3060	*
				3061	* It does _not_ decide what to send, it is made in function
				3062	* tcp_xmit_retransmit_queue().
				3063	*/
				3064	static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
				3065	int prior_sacked, int prior_packets,
				3066	bool is_dupack, int flag)
				3067	{
				3068	struct inet_connection_sock *icsk = inet_csk(sk);
				3069	struct tcp_sock *tp = tcp_sk(sk);
				3070	int do_lost = is_dupack \|\| ((flag & FLAG_DATA_SACKED) &&
				3071	(tcp_fackets_out(tp) > tp->reordering));
				3072	int newly_acked_sacked = 0;
				3073	int fast_rexmit = 0, mib_idx;
				3074
				3075	if (WARN_ON(!tp->packets_out && tp->sacked_out))
				3076	tp->sacked_out = 0;
				3077	if (WARN_ON(!tp->sacked_out && tp->fackets_out))
				3078	tp->fackets_out = 0;
				3079
				3080	/* Now state machine starts.
				3081	* A. ECE, hence prohibit cwnd undoing, the reduction is required. */
				3082	if (flag & FLAG_ECE)
				3083	tp->prior_ssthresh = 0;
				3084
				3085	/* B. In all the states check for reneging SACKs. */
				3086	if (tcp_check_sack_reneging(sk, flag))
				3087	return;
				3088
				3089	/* C. Check consistency of the current state. */
				3090	tcp_verify_left_out(tp);
				3091
				3092	/* D. Check state exit conditions. State can be terminated
				3093	* when high_seq is ACKed. */
				3094	if (icsk->icsk_ca_state == TCP_CA_Open) {
				3095	WARN_ON(tp->retrans_out != 0);
				3096	tp->retrans_stamp = 0;
				3097	} else if (!before(tp->snd_una, tp->high_seq)) {
				3098	switch (icsk->icsk_ca_state) {
				3099	case TCP_CA_Loss:
				3100	icsk->icsk_retransmits = 0;
				3101	if (tcp_try_undo_recovery(sk))
				3102	return;
				3103	break;
				3104
				3105	case TCP_CA_CWR:
				3106	/* CWR is to be held something above high_seq
				3107	* is ACKed for CWR bit to reach receiver. */
				3108	if (tp->snd_una != tp->high_seq) {
				3109	tcp_complete_cwr(sk);
				3110	tcp_set_ca_state(sk, TCP_CA_Open);
				3111	}
				3112	break;
				3113
				3114	case TCP_CA_Recovery:
				3115	if (tcp_is_reno(tp))
				3116	tcp_reset_reno_sack(tp);
				3117	if (tcp_try_undo_recovery(sk))
				3118	return;
				3119	tcp_complete_cwr(sk);
				3120	break;
				3121	}
				3122	}
				3123
				3124	/* E. Process state. */
				3125	switch (icsk->icsk_ca_state) {
				3126	case TCP_CA_Recovery:
				3127	if (!(flag & FLAG_SND_UNA_ADVANCED)) {
				3128	if (tcp_is_reno(tp) && is_dupack)
				3129	tcp_add_reno_sack(sk);
				3130	} else
				3131	do_lost = tcp_try_undo_partial(sk, pkts_acked);
				3132	newly_acked_sacked = prior_packets - tp->packets_out +
				3133	tp->sacked_out - prior_sacked;
				3134	break;
				3135	case TCP_CA_Loss:
				3136	if (flag & FLAG_DATA_ACKED)
				3137	icsk->icsk_retransmits = 0;
				3138	if (tcp_is_reno(tp) && flag & FLAG_SND_UNA_ADVANCED)
				3139	tcp_reset_reno_sack(tp);
				3140	if (!tcp_try_undo_loss(sk)) {
				3141	tcp_moderate_cwnd(tp);
				3142	tcp_xmit_retransmit_queue(sk);
				3143	return;
				3144	}
				3145	if (icsk->icsk_ca_state != TCP_CA_Open)
				3146	return;
				3147	/* Loss is undone; fall through to processing in Open state. */
				3148	default:
				3149	if (tcp_is_reno(tp)) {
				3150	if (flag & FLAG_SND_UNA_ADVANCED)
				3151	tcp_reset_reno_sack(tp);
				3152	if (is_dupack)
				3153	tcp_add_reno_sack(sk);
				3154	}
				3155	newly_acked_sacked = prior_packets - tp->packets_out +
				3156	tp->sacked_out - prior_sacked;
				3157
				3158	if (icsk->icsk_ca_state <= TCP_CA_Disorder)
				3159	tcp_try_undo_dsack(sk);
				3160
				3161	if (!tcp_time_to_recover(sk)) {
				3162	tcp_try_to_open(sk, flag);
				3163	return;
				3164	}
				3165
				3166	/* MTU probe failure: don't reduce cwnd */
				3167	if (icsk->icsk_ca_state < TCP_CA_CWR &&
				3168	icsk->icsk_mtup.probe_size &&
				3169	tp->snd_una == tp->mtu_probe.probe_seq_start) {
				3170	tcp_mtup_probe_failed(sk);
				3171	/* Restores the reduction we did in tcp_mtup_probe() */
				3172	tp->snd_cwnd++;
				3173	tcp_simple_retransmit(sk);
				3174	return;
				3175	}
				3176
				3177	/* Otherwise enter Recovery state */
				3178
				3179	if (tcp_is_reno(tp))
				3180	mib_idx = LINUX_MIB_TCPRENORECOVERY;
				3181	else
				3182	mib_idx = LINUX_MIB_TCPSACKRECOVERY;
				3183
				3184	NET_INC_STATS_BH(sock_net(sk), mib_idx);
				3185
				3186	tp->high_seq = tp->snd_nxt;
				3187	tp->prior_ssthresh = 0;
				3188	tp->undo_marker = tp->snd_una;
				3189	tp->undo_retrans = tp->retrans_out ? : -1;
				3190
				3191	if (icsk->icsk_ca_state < TCP_CA_CWR) {
				3192	if (!(flag & FLAG_ECE))
				3193	tp->prior_ssthresh = tcp_current_ssthresh(sk);
				3194	tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
				3195	TCP_ECN_queue_cwr(tp);
				3196	}
				3197
				3198	tp->bytes_acked = 0;
				3199	tp->snd_cwnd_cnt = 0;
				3200	tp->prior_cwnd = tp->snd_cwnd;
				3201	tp->prr_delivered = 0;
				3202	tp->prr_out = 0;
				3203	tcp_set_ca_state(sk, TCP_CA_Recovery);
				3204	fast_rexmit = 1;
				3205	}
				3206
				3207	if (do_lost \|\| (tcp_is_fack(tp) && tcp_head_timedout(sk)))
				3208	tcp_update_scoreboard(sk, fast_rexmit);
				3209	tp->prr_delivered += newly_acked_sacked;
				3210	tcp_update_cwnd_in_recovery(sk, newly_acked_sacked, fast_rexmit, flag);
				3211	tcp_xmit_retransmit_queue(sk);
				3212	}
				3213
				3214	void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt)
				3215	{
				3216	tcp_rtt_estimator(sk, seq_rtt);
				3217	tcp_set_rto(sk);
				3218	inet_csk(sk)->icsk_backoff = 0;
				3219	}
				3220	EXPORT_SYMBOL(tcp_valid_rtt_meas);
				3221
				3222	/* Read draft-ietf-tcplw-high-performance before mucking
				3223	* with this code. (Supersedes RFC1323)
				3224	*/
				3225	static void tcp_ack_saw_tstamp(struct sock *sk, int flag)
				3226	{
				3227	/* RTTM Rule: A TSecr value received in a segment is used to
				3228	* update the averaged RTT measurement only if the segment
				3229	* acknowledges some new data, i.e., only if it advances the
				3230	* left edge of the send window.
				3231	*
				3232	* See draft-ietf-tcplw-high-performance-00, section 3.3.
				3233	* 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
				3234	*
				3235	* Changed: reset backoff as soon as we see the first valid sample.
				3236	* If we do not, we get strongly overestimated rto. With timestamps
				3237	* samples are accepted even from very old segments: f.e., when rtt=1
				3238	* increases to 8, we retransmit 5 times and after 8 seconds delayed
				3239	* answer arrives rto becomes 120 seconds! If at least one of segments
				3240	* in window is lost... Voila. --ANK (010210)
				3241	*/
				3242	struct tcp_sock *tp = tcp_sk(sk);
				3243
				3244	tcp_valid_rtt_meas(sk, tcp_time_stamp - tp->rx_opt.rcv_tsecr);
				3245	}
				3246
				3247	static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag)
				3248	{
				3249	/* We don't have a timestamp. Can only use
				3250	* packets that are not retransmitted to determine
				3251	* rtt estimates. Also, we must not reset the
				3252	* backoff for rto until we get a non-retransmitted
				3253	* packet. This allows us to deal with a situation
				3254	* where the network delay has increased suddenly.
				3255	* I.e. Karn's algorithm. (SIGCOMM '87, p5.)
				3256	*/
				3257
				3258	if (flag & FLAG_RETRANS_DATA_ACKED)
				3259	return;
				3260
				3261	tcp_valid_rtt_meas(sk, seq_rtt);
				3262	}
				3263
				3264	static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
				3265	const s32 seq_rtt)
				3266	{
				3267	const struct tcp_sock *tp = tcp_sk(sk);
				3268	/* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
				3269	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
				3270	tcp_ack_saw_tstamp(sk, flag);
				3271	else if (seq_rtt >= 0)
				3272	tcp_ack_no_tstamp(sk, seq_rtt, flag);
				3273	}
				3274
				3275	static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
				3276	{
				3277	const struct inet_connection_sock *icsk = inet_csk(sk);
				3278	icsk->icsk_ca_ops->cong_avoid(sk, ack, in_flight);
				3279	tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;
				3280	}
				3281
				3282	/* Restart timer after forward progress on connection.
				3283	* RFC2988 recommends to restart timer to now+rto.
				3284	*/
				3285	static void tcp_rearm_rto(struct sock *sk)
				3286	{
				3287	const struct tcp_sock *tp = tcp_sk(sk);
				3288
				3289	if (!tp->packets_out) {
				3290	inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
				3291	} else {
				3292	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
				3293	inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
				3294	}
				3295	}
				3296
				3297	/* If we get here, the whole TSO packet has not been acked. */
				3298	static u32 tcp_tso_acked(struct sock sk, struct sk_buff skb)
				3299	{
				3300	struct tcp_sock *tp = tcp_sk(sk);
				3301	u32 packets_acked;
				3302
				3303	BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una));
				3304
				3305	packets_acked = tcp_skb_pcount(skb);
				3306	if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
				3307	return 0;
				3308	packets_acked -= tcp_skb_pcount(skb);
				3309
				3310	if (packets_acked) {
				3311	BUG_ON(tcp_skb_pcount(skb) == 0);
				3312	BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));
				3313	}
				3314
				3315	return packets_acked;
				3316	}
				3317
				3318	/* Remove acknowledged frames from the retransmission queue. If our packet
				3319	* is before the ack sequence we can discard it as it's confirmed to have
				3320	* arrived at the other end.
				3321	*/
				3322	static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
				3323	u32 prior_snd_una)
				3324	{
				3325	struct tcp_sock *tp = tcp_sk(sk);
				3326	const struct inet_connection_sock *icsk = inet_csk(sk);
				3327	struct sk_buff *skb;
				3328	u32 now = tcp_time_stamp;
				3329	int fully_acked = 1;
				3330	int flag = 0;
				3331	u32 pkts_acked = 0;
				3332	u32 reord = tp->packets_out;
				3333	u32 prior_sacked = tp->sacked_out;
				3334	s32 seq_rtt = -1;
				3335	s32 ca_seq_rtt = -1;
				3336	ktime_t last_ackt = net_invalid_timestamp();
				3337
				3338	while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
				3339	struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
				3340	u32 acked_pcount;
				3341	u8 sacked = scb->sacked;
				3342
				3343	/* Determine how many packets and what bytes were acked, tso and else */
				3344	if (after(scb->end_seq, tp->snd_una)) {
				3345	if (tcp_skb_pcount(skb) == 1 \|\|
				3346	!after(tp->snd_una, scb->seq))
				3347	break;
				3348
				3349	acked_pcount = tcp_tso_acked(sk, skb);
				3350	if (!acked_pcount)
				3351	break;
				3352
				3353	fully_acked = 0;
				3354	} else {
				3355	acked_pcount = tcp_skb_pcount(skb);
				3356	}
				3357
				3358	if (sacked & TCPCB_RETRANS) {
				3359	if (sacked & TCPCB_SACKED_RETRANS)
				3360	tp->retrans_out -= acked_pcount;
				3361	flag \|= FLAG_RETRANS_DATA_ACKED;
				3362	ca_seq_rtt = -1;
				3363	seq_rtt = -1;
				3364	if ((flag & FLAG_DATA_ACKED) \|\| (acked_pcount > 1))
				3365	flag \|= FLAG_NONHEAD_RETRANS_ACKED;
				3366	} else {
				3367	ca_seq_rtt = now - scb->when;
				3368	last_ackt = skb->tstamp;
				3369	if (seq_rtt < 0) {
				3370	seq_rtt = ca_seq_rtt;
				3371	}
				3372	if (!(sacked & TCPCB_SACKED_ACKED))
				3373	reord = min(pkts_acked, reord);
				3374	}
				3375
				3376	if (sacked & TCPCB_SACKED_ACKED)
				3377	tp->sacked_out -= acked_pcount;
				3378	if (sacked & TCPCB_LOST)
				3379	tp->lost_out -= acked_pcount;
				3380
				3381	tp->packets_out -= acked_pcount;
				3382	pkts_acked += acked_pcount;
				3383
				3384	/* Initial outgoing SYN's get put onto the write_queue
				3385	* just like anything else we transmit. It is not
				3386	* true data, and if we misinform our callers that
				3387	* this ACK acks real data, we will erroneously exit
				3388	* connection startup slow start one packet too
				3389	* quickly. This is severely frowned upon behavior.
				3390	*/
				3391	if (!(scb->tcp_flags & TCPHDR_SYN)) {
				3392	flag \|= FLAG_DATA_ACKED;
				3393	} else {
				3394	flag \|= FLAG_SYN_ACKED;
				3395	tp->retrans_stamp = 0;
				3396	}
				3397
				3398	if (!fully_acked)
				3399	break;
				3400
				3401	tcp_unlink_write_queue(skb, sk);
				3402	sk_wmem_free_skb(sk, skb);
				3403	tp->scoreboard_skb_hint = NULL;
				3404	if (skb == tp->retransmit_skb_hint)
				3405	tp->retransmit_skb_hint = NULL;
				3406	if (skb == tp->lost_skb_hint)
				3407	tp->lost_skb_hint = NULL;
				3408	}
				3409
				3410	if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
				3411	tp->snd_up = tp->snd_una;
				3412
				3413	if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
				3414	flag \|= FLAG_SACK_RENEGING;
				3415
				3416	if (flag & FLAG_ACKED) {
				3417	const struct tcp_congestion_ops *ca_ops
				3418	= inet_csk(sk)->icsk_ca_ops;
				3419
				3420	if (unlikely(icsk->icsk_mtup.probe_size &&
				3421	!after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
				3422	tcp_mtup_probe_success(sk);
				3423	}
				3424
				3425	tcp_ack_update_rtt(sk, flag, seq_rtt);
				3426	tcp_rearm_rto(sk);
				3427
				3428	if (tcp_is_reno(tp)) {
				3429	tcp_remove_reno_sacks(sk, pkts_acked);
				3430	} else {
				3431	int delta;
				3432
				3433	/* Non-retransmitted hole got filled? That's reordering */
				3434	if (reord < prior_fackets)
				3435	tcp_update_reordering(sk, tp->fackets_out - reord, 0);
				3436
				3437	delta = tcp_is_fack(tp) ? pkts_acked :
				3438	prior_sacked - tp->sacked_out;
				3439	tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
				3440	}
				3441
				3442	tp->fackets_out -= min(pkts_acked, tp->fackets_out);
				3443
				3444	if (ca_ops->pkts_acked) {
				3445	s32 rtt_us = -1;
				3446
				3447	/* Is the ACK triggering packet unambiguous? */
				3448	if (!(flag & FLAG_RETRANS_DATA_ACKED)) {
				3449	/* High resolution needed and available? */
				3450	if (ca_ops->flags & TCP_CONG_RTT_STAMP &&
				3451	!ktime_equal(last_ackt,
				3452	net_invalid_timestamp()))
				3453	rtt_us = ktime_us_delta(ktime_get_real(),
				3454	last_ackt);
				3455	else if (ca_seq_rtt >= 0)
				3456	rtt_us = jiffies_to_usecs(ca_seq_rtt);
				3457	}
				3458
				3459	ca_ops->pkts_acked(sk, pkts_acked, rtt_us);
				3460	}
				3461	}
				3462
				3463	#if FASTRETRANS_DEBUG > 0
				3464	WARN_ON((int)tp->sacked_out < 0);
				3465	WARN_ON((int)tp->lost_out < 0);
				3466	WARN_ON((int)tp->retrans_out < 0);
				3467	if (!tp->packets_out && tcp_is_sack(tp)) {
				3468	icsk = inet_csk(sk);
				3469	if (tp->lost_out) {
				3470	printk(KERN_DEBUG "Leak l=%u %d\n",
				3471	tp->lost_out, icsk->icsk_ca_state);
				3472	tp->lost_out = 0;
				3473	}
				3474	if (tp->sacked_out) {
				3475	printk(KERN_DEBUG "Leak s=%u %d\n",
				3476	tp->sacked_out, icsk->icsk_ca_state);
				3477	tp->sacked_out = 0;
				3478	}
				3479	if (tp->retrans_out) {
				3480	printk(KERN_DEBUG "Leak r=%u %d\n",
				3481	tp->retrans_out, icsk->icsk_ca_state);
				3482	tp->retrans_out = 0;
				3483	}
				3484	}
				3485	#endif
				3486	return flag;
				3487	}
				3488
				3489	static void tcp_ack_probe(struct sock *sk)
				3490	{
				3491	const struct tcp_sock *tp = tcp_sk(sk);
				3492	struct inet_connection_sock *icsk = inet_csk(sk);
				3493
				3494	/* Was it a usable window open? */
				3495
				3496	if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) {
				3497	icsk->icsk_backoff = 0;
				3498	inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
				3499	/* Socket must be waked up by subsequent tcp_data_snd_check().
				3500	* This function is not for random using!
				3501	*/
				3502	} else {
				3503	inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
				3504	min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
				3505	TCP_RTO_MAX);
				3506	}
				3507	}
				3508
				3509	static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag)
				3510	{
				3511	return !(flag & FLAG_NOT_DUP) \|\| (flag & FLAG_CA_ALERT) \|\|
				3512	inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
				3513	}
				3514
				3515	static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
				3516	{
				3517	const struct tcp_sock *tp = tcp_sk(sk);
				3518	return (!(flag & FLAG_ECE) \|\| tp->snd_cwnd < tp->snd_ssthresh) &&
				3519	!((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery \| TCPF_CA_CWR));
				3520	}
				3521
				3522	/* Check that window update is acceptable.
				3523	* The function assumes that snd_una<=ack<=snd_next.
				3524	*/
				3525	static inline int tcp_may_update_window(const struct tcp_sock *tp,
				3526	const u32 ack, const u32 ack_seq,
				3527	const u32 nwin)
				3528	{
				3529	return after(ack, tp->snd_una) \|\|
				3530	after(ack_seq, tp->snd_wl1) \|\|
				3531	(ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
				3532	}
				3533
				3534	/* Update our send window.
				3535	*
				3536	* Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2
				3537	* and in FreeBSD. NetBSD's one is even worse.) is wrong.
				3538	*/
				3539	static int tcp_ack_update_window(struct sock sk, const struct sk_buff skb, u32 ack,
				3540	u32 ack_seq)
				3541	{
				3542	struct tcp_sock *tp = tcp_sk(sk);
				3543	int flag = 0;
				3544	u32 nwin = ntohs(tcp_hdr(skb)->window);
				3545
				3546	if (likely(!tcp_hdr(skb)->syn))
				3547	nwin <<= tp->rx_opt.snd_wscale;
				3548
				3549	if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
				3550	flag \|= FLAG_WIN_UPDATE;
				3551	tcp_update_wl(tp, ack_seq);
				3552
				3553	if (tp->snd_wnd != nwin) {
				3554	tp->snd_wnd = nwin;
				3555
				3556	/* Note, it is the only place, where
				3557	* fast path is recovered for sending TCP.
				3558	*/
				3559	tp->pred_flags = 0;
				3560	tcp_fast_path_check(sk);
				3561
				3562	if (nwin > tp->max_window) {
				3563	tp->max_window = nwin;
				3564	tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
				3565	}
				3566	}
				3567	}
				3568
				3569	tp->snd_una = ack;
				3570
				3571	return flag;
				3572	}
				3573
				3574	/* A very conservative spurious RTO response algorithm: reduce cwnd and
				3575	* continue in congestion avoidance.
				3576	*/
				3577	static void tcp_conservative_spur_to_response(struct tcp_sock *tp)
				3578	{
				3579	tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
				3580	tp->snd_cwnd_cnt = 0;
				3581	tp->bytes_acked = 0;
				3582	TCP_ECN_queue_cwr(tp);
				3583	tcp_moderate_cwnd(tp);
				3584	}
				3585
				/* A conservative spurious RTO response algorithm: reduce cwnd using
				 * rate halving and continue in congestion avoidance.
				 *
				 * Implemented as a plain call to tcp_enter_cwr() with a second argument
				 * of 0.
				 */
				static void tcp_ratehalving_spur_to_response(struct sock *sk)
				{
					tcp_enter_cwr(sk, 0);
				}
				3593
				3594	static void tcp_undo_spur_to_response(struct sock *sk, int flag)
				3595	{
				3596	if (flag & FLAG_ECE)
				3597	tcp_ratehalving_spur_to_response(sk);
				3598	else
				3599	tcp_undo_cwr(sk, true);
				3600	}
				3601
				3602	/* F-RTO spurious RTO detection algorithm (RFC4138)
				3603	*
				3604	* F-RTO affects during two new ACKs following RTO (well, almost, see inline
				3605	* comments). State (ACK number) is kept in frto_counter. When ACK advances
				3606	* window (but not to or beyond highest sequence sent before RTO):
				3607	* On First ACK, send two new segments out.
				3608	* On Second ACK, RTO was likely spurious. Do spurious response (response
				3609	* algorithm is not part of the F-RTO detection algorithm
				3610	* given in RFC4138 but can be selected separately).
				3611	* Otherwise (basically on duplicate ACK), RTO was (likely) caused by a loss
				3612	* and TCP falls back to conventional RTO recovery. F-RTO allows overriding
				3613	* of Nagle, this is done using frto_counter states 2 and 3, when a new data
				3614	* segment of any size sent during F-RTO, state 2 is upgraded to 3.
				3615	*
				3616	* Rationale: if the RTO was spurious, new ACKs should arrive from the
				3617	* original window even after we transmit two new data segments.
				3618	*
				3619	* SACK version:
				3620	* on first step, wait until first cumulative ACK arrives, then move to
				3621	* the second step. In second step, the next ACK decides.
				3622	*
				3623	* F-RTO is implemented (mainly) in four functions:
				3624	* - tcp_use_frto() is used to determine if TCP is can use F-RTO
				3625	* - tcp_enter_frto() prepares TCP state on RTO if F-RTO is used, it is
				3626	* called when tcp_use_frto() showed green light
				3627	* - tcp_process_frto() handles incoming ACKs during F-RTO algorithm
				3628	* - tcp_enter_frto_loss() is called if there is not enough evidence
				3629	* to prove that the RTO is indeed spurious. It transfers the control
				3630	* from F-RTO to the conventional RTO recovery
				3631	*/
				3632	static int tcp_process_frto(struct sock *sk, int flag)
				3633	{
				3634	struct tcp_sock *tp = tcp_sk(sk);
				3635
				3636	tcp_verify_left_out(tp);
				3637
				3638	/* Duplicate the behavior from Loss state (fastretrans_alert) */
				3639	if (flag & FLAG_DATA_ACKED)
				3640	inet_csk(sk)->icsk_retransmits = 0;
				3641
				3642	if ((flag & FLAG_NONHEAD_RETRANS_ACKED) \|\|
				3643	((tp->frto_counter >= 2) && (flag & FLAG_RETRANS_DATA_ACKED)))
				3644	tp->undo_marker = 0;
				3645
				3646	if (!before(tp->snd_una, tp->frto_highmark)) {
				3647	tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag);
				3648	return 1;
				3649	}
				3650
				3651	if (!tcp_is_sackfrto(tp)) {
				3652	/* RFC4138 shortcoming in step 2; should also have case c):
				3653	* ACK isn't duplicate nor advances window, e.g., opposite dir
				3654	* data, winupdate
				3655	*/
				3656	if (!(flag & FLAG_ANY_PROGRESS) && (flag & FLAG_NOT_DUP))
				3657	return 1;
				3658
				3659	if (!(flag & FLAG_DATA_ACKED)) {
				3660	tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3),
				3661	flag);
				3662	return 1;
				3663	}
				3664	} else {
				3665	if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) {
				3666	if (!tcp_packets_in_flight(tp)) {
				3667	tcp_enter_frto_loss(sk, 2, flag);
				3668	return true;
				3669	}
				3670
				3671	/* Prevent sending of new data. */
				3672	tp->snd_cwnd = min(tp->snd_cwnd,
				3673	tcp_packets_in_flight(tp));
				3674	return 1;
				3675	}
				3676
				3677	if ((tp->frto_counter >= 2) &&
				3678	(!(flag & FLAG_FORWARD_PROGRESS) \|\|
				3679	((flag & FLAG_DATA_SACKED) &&
				3680	!(flag & FLAG_ONLY_ORIG_SACKED)))) {
				3681	/* RFC4138 shortcoming (see comment above) */
				3682	if (!(flag & FLAG_FORWARD_PROGRESS) &&
				3683	(flag & FLAG_NOT_DUP))
				3684	return 1;
				3685
				3686	tcp_enter_frto_loss(sk, 3, flag);
				3687	return 1;
				3688	}
				3689	}
				3690
				3691	if (tp->frto_counter == 1) {
				3692	/* tcp_may_send_now needs to see updated state */
				3693	tp->snd_cwnd = tcp_packets_in_flight(tp) + 2;
				3694	tp->frto_counter = 2;
				3695
				3696	if (!tcp_may_send_now(sk))
				3697	tcp_enter_frto_loss(sk, 2, flag);
				3698
				3699	return 1;
				3700	} else {
				3701	switch (sysctl_tcp_frto_response) {
				3702	case 2:
				3703	tcp_undo_spur_to_response(sk, flag);
				3704	break;
				3705	case 1:
				3706	tcp_conservative_spur_to_response(tp);
				3707	break;
				3708	default:
				3709	tcp_ratehalving_spur_to_response(sk);
				3710	break;
				3711	}
				3712	tp->frto_counter = 0;
				3713	tp->undo_marker = 0;
				3714	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS);
				3715	}
				3716	return 0;
				3717	}
				3718
				3719	/* RFC 5961 7 [ACK Throttling] */
				3720	static void tcp_send_challenge_ack(struct sock *sk)
				3721	{
				3722	/* unprotected vars, we dont care of overwrites */
				3723	static u32 challenge_timestamp;
				3724	static unsigned int challenge_count;
				3725	u32 now = jiffies / HZ;
				3726
				3727	if (now != challenge_timestamp) {
				3728	challenge_timestamp = now;
				3729	challenge_count = 0;
				3730	}
				3731	if (++challenge_count <= sysctl_tcp_challenge_ack_limit) {
				3732	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
				3733	tcp_send_ack(sk);
				3734	}
				3735	}
				3736
				3737	static void tcp_store_ts_recent(struct tcp_sock *tp)
				3738	{
				3739	tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
				3740	tp->rx_opt.ts_recent_stamp = get_seconds();
				3741	}
				3742
				3743	static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
				3744	{
				3745	if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
				3746	/* PAWS bug workaround wrt. ACK frames, the PAWS discard
				3747	* extra check below makes sure this can only happen
				3748	* for pure ACK frames. -DaveM
				3749	*
				3750	* Not only, also it occurs for expired timestamps.
				3751	*/
				3752
				3753	if (tcp_paws_check(&tp->rx_opt, 0))
				3754	tcp_store_ts_recent(tp);
				3755	}
				3756	}
				3757
				3758	/* This routine deals with incoming acks, but not outgoing ones. */
				3759	static int tcp_ack(struct sock sk, const struct sk_buff skb, int flag)
				3760	{
				3761	struct inet_connection_sock *icsk = inet_csk(sk);
				3762	struct tcp_sock *tp = tcp_sk(sk);
				3763	u32 prior_snd_una = tp->snd_una;
				3764	u32 ack_seq = TCP_SKB_CB(skb)->seq;
				3765	u32 ack = TCP_SKB_CB(skb)->ack_seq;
				3766	bool is_dupack = false;
				3767	u32 prior_in_flight;
				3768	u32 prior_fackets;
				3769	int prior_packets = tp->packets_out;
				3770	int prior_sacked = tp->sacked_out;
				3771	int pkts_acked = 0;
				3772	int previous_packets_out = 0;
				3773	int frto_cwnd = 0;
				3774
				3775	/* If the ack is older than previous acks
				3776	* then we can probably ignore it.
				3777	*/
				3778	if (before(ack, prior_snd_una)) {
				3779	/* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
				3780	if (before(ack, prior_snd_una - tp->max_window)) {
				3781	tcp_send_challenge_ack(sk);
				3782	return -1;
				3783	}
				3784	goto old_ack;
				3785	}
				3786
				3787	/* If the ack includes data we haven't sent yet, discard
				3788	* this segment (RFC793 Section 3.9).
				3789	*/
				3790	if (after(ack, tp->snd_nxt))
				3791	goto invalid_ack;
				3792
				3793	if (after(ack, prior_snd_una))
				3794	flag \|= FLAG_SND_UNA_ADVANCED;
				3795
				3796	if (sysctl_tcp_abc) {
				3797	if (icsk->icsk_ca_state < TCP_CA_CWR)
				3798	tp->bytes_acked += ack - prior_snd_una;
				3799	else if (icsk->icsk_ca_state == TCP_CA_Loss)
				3800	/* we assume just one segment left network */
				3801	tp->bytes_acked += min(ack - prior_snd_una,
				3802	tp->mss_cache);
				3803	}
				3804
				3805	prior_fackets = tp->fackets_out;
				3806	prior_in_flight = tcp_packets_in_flight(tp);
				3807
				3808	/* ts_recent update must be made after we are sure that the packet
				3809	* is in window.
				3810	*/
				3811	if (flag & FLAG_UPDATE_TS_RECENT)
				3812	tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
				3813
				3814	if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
				3815	/* Window is constant, pure forward advance.
				3816	* No more checks are required.
				3817	* Note, we use the fact that SND.UNA>=SND.WL2.
				3818	*/
				3819	tcp_update_wl(tp, ack_seq);
				3820	tp->snd_una = ack;
				3821	flag \|= FLAG_WIN_UPDATE;
				3822
				3823	tcp_ca_event(sk, CA_EVENT_FAST_ACK);
				3824
				3825	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS);
				3826	} else {
				3827	if (ack_seq != TCP_SKB_CB(skb)->end_seq)
				3828	flag \|= FLAG_DATA;
				3829	else
				3830	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPUREACKS);
				3831
				3832	flag \|= tcp_ack_update_window(sk, skb, ack, ack_seq);
				3833
				3834	if (TCP_SKB_CB(skb)->sacked)
				3835	flag \|= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
				3836
				3837	if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
				3838	flag \|= FLAG_ECE;
				3839
				3840	tcp_ca_event(sk, CA_EVENT_SLOW_ACK);
				3841	}
				3842
				3843	/* We passed data and got it acked, remove any soft error
				3844	* log. Something worked...
				3845	*/
				3846	sk->sk_err_soft = 0;
				3847	icsk->icsk_probes_out = 0;
				3848	tp->rcv_tstamp = tcp_time_stamp;
				3849	if (!prior_packets)
				3850	goto no_queue;
				3851
				3852	/* See if we can take anything off of the retransmit queue. */
				3853	previous_packets_out = tp->packets_out;
				3854	flag \|= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una);
				3855
				3856	pkts_acked = previous_packets_out - tp->packets_out;
				3857
				3858	if (tp->frto_counter)
				3859	frto_cwnd = tcp_process_frto(sk, flag);
				3860	/* Guarantee sacktag reordering detection against wrap-arounds */
				3861	if (before(tp->frto_highmark, tp->snd_una))
				3862	tp->frto_highmark = 0;
				3863
				3864	if (tcp_ack_is_dubious(sk, flag)) {
				3865	/* Advance CWND, if state allows this. */
				3866	if ((flag & FLAG_DATA_ACKED) && !frto_cwnd &&
				3867	tcp_may_raise_cwnd(sk, flag))
				3868	tcp_cong_avoid(sk, ack, prior_in_flight);
				3869	is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED \| FLAG_NOT_DUP));
				3870	tcp_fastretrans_alert(sk, pkts_acked, prior_sacked,
				3871	prior_packets, is_dupack, flag);
				3872	} else {
				3873	if ((flag & FLAG_DATA_ACKED) && !frto_cwnd)
				3874	tcp_cong_avoid(sk, ack, prior_in_flight);
				3875	}
				3876
				3877	if ((flag & FLAG_FORWARD_PROGRESS) \|\| !(flag & FLAG_NOT_DUP))
				3878	dst_confirm(__sk_dst_get(sk));
				3879
				3880	return 1;
				3881
				3882	no_queue:
				3883	/* If data was DSACKed, see if we can undo a cwnd reduction. */
				3884	if (flag & FLAG_DSACKING_ACK)
				3885	tcp_fastretrans_alert(sk, pkts_acked, prior_sacked,
				3886	prior_packets, is_dupack, flag);
				3887	/* If this ack opens up a zero window, clear backoff. It was
				3888	* being used to time the probes, and is probably far higher than
				3889	* it needs to be for normal retransmission.
				3890	*/
				3891	if (tcp_send_head(sk))
				3892	tcp_ack_probe(sk);
				3893	return 1;
				3894
				3895	invalid_ack:
				3896	SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
				3897	return -1;
				3898
				3899	old_ack:
				3900	/* If data was SACKed, tag it and see if we should send more data.
				3901	* If data was DSACKed, see if we can undo a cwnd reduction.
				3902	*/
				3903	if (TCP_SKB_CB(skb)->sacked) {
				3904	flag \|= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
				3905	tcp_fastretrans_alert(sk, pkts_acked, prior_sacked,
				3906	prior_packets, is_dupack, flag);
				3907	}
				3908
				3909	SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
				3910	return 0;
				3911	}
				3912
				3913	/* Look for tcp options. Normally only called on SYN and SYNACK packets.
				3914	* But, this can also be called on packets in the established flow when
				3915	* the fast version below fails.
				3916	*/
				3917	void tcp_parse_options(const struct sk_buff skb, struct tcp_options_received opt_rx,
				3918	const u8 **hvpp, int estab)
				3919	{
				3920	const unsigned char *ptr;
				3921	const struct tcphdr *th = tcp_hdr(skb);
				3922	int length = (th->doff * 4) - sizeof(struct tcphdr);
				3923
				3924	ptr = (const unsigned char *)(th + 1);
				3925	opt_rx->saw_tstamp = 0;
				3926
				3927	while (length > 0) {
				3928	int opcode = *ptr++;
				3929	int opsize;
				3930
				3931	switch (opcode) {
				3932	case TCPOPT_EOL:
				3933	return;
				3934	case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
				3935	length--;
				3936	continue;
				3937	default:
				3938	opsize = *ptr++;
				3939	if (opsize < 2) /* "silly options" */
				3940	return;
				3941	if (opsize > length)
				3942	return; /* don't parse partial options */
				3943	switch (opcode) {
				3944	case TCPOPT_MSS:
				3945	if (opsize == TCPOLEN_MSS && th->syn && !estab) {
				3946	u16 in_mss = get_unaligned_be16(ptr);
				3947	if (in_mss) {
				3948	if (opt_rx->user_mss &&
				3949	opt_rx->user_mss < in_mss)
				3950	in_mss = opt_rx->user_mss;
				3951	opt_rx->mss_clamp = in_mss;
				3952	}
				3953	}
				3954	break;
				3955	case TCPOPT_WINDOW:
				3956	if (opsize == TCPOLEN_WINDOW && th->syn &&
				3957	!estab && sysctl_tcp_window_scaling) {
				3958	__u8 snd_wscale = (__u8 )ptr;
				3959	opt_rx->wscale_ok = 1;
				3960	if (snd_wscale > 14) {
				3961	if (net_ratelimit())
				3962	pr_info("%s: Illegal window scaling value %d >14 received\n",
				3963	__func__,
				3964	snd_wscale);
				3965	snd_wscale = 14;
				3966	}
				3967	opt_rx->snd_wscale = snd_wscale;
				3968	}
				3969	break;
				3970	case TCPOPT_TIMESTAMP:
				3971	if ((opsize == TCPOLEN_TIMESTAMP) &&
				3972	((estab && opt_rx->tstamp_ok) \|\|
				3973	(!estab && sysctl_tcp_timestamps))) {
				3974	opt_rx->saw_tstamp = 1;
				3975	opt_rx->rcv_tsval = get_unaligned_be32(ptr);
				3976	opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
				3977	}
				3978	break;
				3979	case TCPOPT_SACK_PERM:
				3980	if (opsize == TCPOLEN_SACK_PERM && th->syn &&
				3981	!estab && sysctl_tcp_sack) {
				3982	opt_rx->sack_ok = TCP_SACK_SEEN;
				3983	tcp_sack_reset(opt_rx);
				3984	}
				3985	break;
				3986
				3987	case TCPOPT_SACK:
				3988	if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
				3989	!((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
				3990	opt_rx->sack_ok) {
				3991	TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
				3992	}
				3993	break;
				3994	#ifdef CONFIG_TCP_MD5SIG
				3995	case TCPOPT_MD5SIG:
				3996	/*
				3997	* The MD5 Hash has already been
				3998	* checked (see tcp_v{4,6}_do_rcv()).
				3999	*/
				4000	break;
				4001	#endif
				4002	case TCPOPT_COOKIE:
				4003	/* This option is variable length.
				4004	*/
				4005	switch (opsize) {
				4006	case TCPOLEN_COOKIE_BASE:
				4007	/* not yet implemented */
				4008	break;
				4009	case TCPOLEN_COOKIE_PAIR:
				4010	/* not yet implemented */
				4011	break;
				4012	case TCPOLEN_COOKIE_MIN+0:
				4013	case TCPOLEN_COOKIE_MIN+2:
				4014	case TCPOLEN_COOKIE_MIN+4:
				4015	case TCPOLEN_COOKIE_MIN+6:
				4016	case TCPOLEN_COOKIE_MAX:
				4017	/* 16-bit multiple */
				4018	opt_rx->cookie_plus = opsize;
				4019	*hvpp = ptr;
				4020	break;
				4021	default:
				4022	/* ignore option */
				4023	break;
				4024	}
				4025	break;
				4026	}
				4027
				4028	ptr += opsize-2;
				4029	length -= opsize;
				4030	}
				4031	}
				4032	}
				4033	EXPORT_SYMBOL(tcp_parse_options);
				4034
				4035	static int tcp_parse_aligned_timestamp(struct tcp_sock tp, const struct tcphdr th)
				4036	{
				4037	const __be32 ptr = (const __be32 )(th + 1);
				4038
				4039	if (*ptr == htonl((TCPOPT_NOP << 24) \| (TCPOPT_NOP << 16)
				4040	\| (TCPOPT_TIMESTAMP << 8) \| TCPOLEN_TIMESTAMP)) {
				4041	tp->rx_opt.saw_tstamp = 1;
				4042	++ptr;
				4043	tp->rx_opt.rcv_tsval = ntohl(*ptr);
				4044	++ptr;
				4045	tp->rx_opt.rcv_tsecr = ntohl(*ptr);
				4046	return 1;
				4047	}
				4048	return 0;
				4049	}
				4050
				4051	/* Fast parse options. This hopes to only see timestamps.
				4052	* If it is wrong it falls back on tcp_parse_options().
				4053	*/
				4054	static int tcp_fast_parse_options(const struct sk_buff *skb,
				4055	const struct tcphdr *th,
				4056	struct tcp_sock tp, const u8 *hvpp)
				4057	{
				4058	/* In the spirit of fast parsing, compare doff directly to constant
				4059	* values. Because equality is used, short doff can be ignored here.
				4060	*/
				4061	if (th->doff == (sizeof(*th) / 4)) {
				4062	tp->rx_opt.saw_tstamp = 0;
				4063	return 0;
				4064	} else if (tp->rx_opt.tstamp_ok &&
				4065	th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
				4066	if (tcp_parse_aligned_timestamp(tp, th))
				4067	return 1;
				4068	}
				4069	tcp_parse_options(skb, &tp->rx_opt, hvpp, 1);
				4070	return 1;
				4071	}
				4072
				#ifdef CONFIG_TCP_MD5SIG
				/*
				 * Parse MD5 Signature option
				 *
				 * Walks the options following @th and returns a pointer to the payload of
				 * the first MD5 signature option, provided its length is exactly
				 * TCPOLEN_MD5SIG.  Returns NULL when there is no such option, when the
				 * option area is too short to hold one, or when the option list is
				 * malformed (length below 2, length beyond the remaining bytes, or an
				 * option kind with no room left for its length byte).
				 */
				const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)
				{
					int length = (th->doff << 2) - sizeof(*th);
					const u8 *ptr = (const u8 *)(th + 1);

					/* If the TCP option is too short, we can short cut */
					if (length < TCPOLEN_MD5SIG)
						return NULL;

					while (length > 0) {
						int opcode = *ptr++;
						int opsize;

						switch(opcode) {
						case TCPOPT_EOL:
							return NULL;
						case TCPOPT_NOP:
							length--;
							continue;
						default:
							/* The kind byte was the last byte of the option
							 * area: reading a length byte would run past it.
							 */
							if (length < 2)
								return NULL;
							opsize = *ptr++;
							if (opsize < 2 || opsize > length)
								return NULL;
							if (opcode == TCPOPT_MD5SIG)
								return opsize == TCPOLEN_MD5SIG ? ptr : NULL;
						}
						ptr += opsize - 2;
						length -= opsize;
					}
					return NULL;
				}
				EXPORT_SYMBOL(tcp_parse_md5sig_option);
				#endif
				4110
				4111	/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
				4112	*
				4113	* It is not fatal. If this ACK does _not_ change critical state (seqs, window)
				4114	* it can pass through stack. So, the following predicate verifies that
				4115	* this segment is not used for anything but congestion avoidance or
				4116	* fast retransmit. Moreover, we even are able to eliminate most of such
				4117	* second order effects, if we apply some small "replay" window (~RTO)
				4118	* to timestamp space.
				4119	*
				4120	* All these measures still do not guarantee that we reject wrapped ACKs
				4121	* on networks with high bandwidth, when sequence space is recycled fastly,
				4122	* but it guarantees that such events will be very rare and do not affect
				4123	* connection seriously. This doesn't look nice, but alas, PAWS is really
				4124	* buggy extension.
				4125	*
				4126	* [ Later note. Even worse! It is buggy for segments _with_ data. RFC
				4127	* states that events when retransmit arrives after original data are rare.
				4128	* It is a blatant lie. VJ forgot about fast retransmit! 8)8) It is
				4129	* the biggest problem on large power networks even with minor reordering.
				4130	* OK, let's give it small replay window. If peer clock is even 1hz, it is safe
				4131	* up to bandwidth of 18Gigabit/sec. 8) ]
				4132	*/
				4133
				4134	static int tcp_disordered_ack(const struct sock sk, const struct sk_buff skb)
				4135	{
				4136	const struct tcp_sock *tp = tcp_sk(sk);
				4137	const struct tcphdr *th = tcp_hdr(skb);
				4138	u32 seq = TCP_SKB_CB(skb)->seq;
				4139	u32 ack = TCP_SKB_CB(skb)->ack_seq;
				4140
				4141	return (/* 1. Pure ACK with correct sequence number. */
				4142	(th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) &&
				4143
				4144	/* 2. ... and duplicate ACK. */
				4145	ack == tp->snd_una &&
				4146
				4147	/* 3. ... and does not update window. */
				4148	!tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
				4149
				4150	/* 4. ... and sits in replay window. */
				4151	(s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
				4152	}
				4153
				4154	static inline int tcp_paws_discard(const struct sock *sk,
				4155	const struct sk_buff *skb)
				4156	{
				4157	const struct tcp_sock *tp = tcp_sk(sk);
				4158
				4159	return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) &&
				4160	!tcp_disordered_ack(sk, skb);
				4161	}
				4162
				4163	/* Check segment sequence number for validity.
				4164	*
				4165	* Segment controls are considered valid, if the segment
				4166	* fits to the window after truncation to the window. Acceptability
				4167	* of data (and SYN, FIN, of course) is checked separately.
				4168	* See tcp_data_queue(), for example.
				4169	*
				4170	* Also, controls (RST is main one) are accepted using RCV.WUP instead
				4171	* of RCV.NXT. Peer still did not advance his SND.UNA when we
				4172	* delayed ACK, so that hisSND.UNA<=ourRCV.WUP.
				4173	* (borrowed from freebsd)
				4174	*/
				4175
				4176	static inline int tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
				4177	{
				4178	return !before(end_seq, tp->rcv_wup) &&
				4179	!after(seq, tp->rcv_nxt + tcp_receive_window(tp));
				4180	}
				4181
				/* When we get a reset we do this.
				 *
				 * Picks sk_err from the current socket state, reports the error unless
				 * the socket is already dead, and finishes the connection with
				 * tcp_done().  A socket already in TCP_CLOSE is left untouched apart
				 * from the bookkeeping done by the two tracking macros.
				 * NOTE(review): TCP_PKT_STATS_INC, TCP_SOCK_TRACK and ERRNO_TRACK are
				 * defined outside this chunk; presumably statistics/tracing hooks.
				 */
				static void tcp_reset(struct sock *sk)
				{
					TCP_PKT_STATS_INC(TCP_RST_RECV_NUM);
					TCP_SOCK_TRACK(sk, TCP_RST_RECV);

					/* We want the right error as BSD sees it (and indeed as we do). */
					switch (sk->sk_state) {
					case TCP_CLOSE:
						return;
					case TCP_SYN_SENT:
						sk->sk_err = ECONNREFUSED;
						break;
					case TCP_CLOSE_WAIT:
						sk->sk_err = EPIPE;
						break;
					default:
						sk->sk_err = ECONNRESET;
						ERRNO_TRACK(-ECONNRESET);
						break;
					}

					/* This barrier is coupled with smp_rmb() in tcp_poll() */
					smp_wmb();

					if (!sock_flag(sk, SOCK_DEAD))
						sk->sk_error_report(sk);

					tcp_done(sk);
				}
				4210
				4211	/*
				4212	* Process the FIN bit. This now behaves as it is supposed to work
				4213	* and the FIN takes effect when it is validly part of sequence
				4214	* space. Not before when we get holes.
				4215	*
				4216	* If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
				4217	* (and thence onto LAST-ACK and finally, CLOSE, we never enter
				4218	* TIME-WAIT)
				4219	*
				4220	* If we are in FINWAIT-1, a received FIN indicates simultaneous
				4221	* close and we go into CLOSING (and later onto TIME-WAIT)
				4222	*
				4223	* If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
				4224	*/
				4225	static void tcp_fin(struct sock *sk)
				4226	{
				4227	struct tcp_sock *tp = tcp_sk(sk);
				4228
				4229	TCP_SOCK_TRACK(sk, TCP_FIN_RECV);
				4230
				4231	inet_csk_schedule_ack(sk);
				4232
				4233	sk->sk_shutdown \|= RCV_SHUTDOWN;
				4234	sock_set_flag(sk, SOCK_DONE);
				4235
				4236	switch (sk->sk_state) {
				4237	case TCP_SYN_RECV:
				4238	case TCP_ESTABLISHED:
				4239	/* Move to CLOSE_WAIT */
				4240	tcp_set_state(sk, TCP_CLOSE_WAIT);
				4241	inet_csk(sk)->icsk_ack.pingpong = 1;
				4242	break;
				4243
				4244	case TCP_CLOSE_WAIT:
				4245	case TCP_CLOSING:
				4246	/* Received a retransmission of the FIN, do
				4247	* nothing.
				4248	*/
				4249	break;
				4250	case TCP_LAST_ACK:
				4251	/* RFC793: Remain in the LAST-ACK state. */
				4252	break;
				4253
				4254	case TCP_FIN_WAIT1:
				4255	/* This case occurs when a simultaneous close
				4256	* happens, we must ack the received FIN and
				4257	* enter the CLOSING state.
				4258	*/
				4259	tcp_send_ack(sk);
				4260	tcp_set_state(sk, TCP_CLOSING);
				4261	break;
				4262	case TCP_FIN_WAIT2:
				4263	/* Received a FIN -- send ACK and enter TIME_WAIT. */
				4264	tcp_send_ack(sk);
				4265	tcp_time_wait(sk, TCP_TIME_WAIT, 0);
				4266	break;
				4267	default:
				4268	/* Only TCP_LISTEN and TCP_CLOSE are left, in these
				4269	* cases we should never reach this piece of code.
				4270	*/
				4271	pr_err("%s: Impossible, sk->sk_state=%d\n",
				4272	__func__, sk->sk_state);
				4273	break;
				4274	}
				4275
				4276	/* It _is_ possible, that we have something out-of-order _after_ FIN.
				4277	* Probably, we should reset in this case. For now drop them.
				4278	*/
				4279	__skb_queue_purge(&tp->out_of_order_queue);
				4280	if (tcp_is_sack(tp))
				4281	tcp_sack_reset(&tp->rx_opt);
				4282	sk_mem_reclaim(sk);
				4283
				4284	if (!sock_flag(sk, SOCK_DEAD)) {
				4285	sk->sk_state_change(sk);
				4286
				4287	/* Do not send POLL_HUP for half duplex close. */
				4288	if (sk->sk_shutdown == SHUTDOWN_MASK \|\|
				4289	sk->sk_state == TCP_CLOSE)
				4290	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
				4291	else
				4292	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
				4293	}
				4294	}
				4295
				4296	static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
				4297	u32 end_seq)
				4298	{
				4299	if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
				4300	if (before(seq, sp->start_seq))
				4301	sp->start_seq = seq;
				4302	if (after(end_seq, sp->end_seq))
				4303	sp->end_seq = end_seq;
				4304	return 1;
				4305	}
				4306	return 0;
				4307	}
				4308
				4309	static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
				4310	{
				4311	struct tcp_sock *tp = tcp_sk(sk);
				4312
				4313	if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
				4314	int mib_idx;
				4315
				4316	if (before(seq, tp->rcv_nxt))
				4317	mib_idx = LINUX_MIB_TCPDSACKOLDSENT;
				4318	else
				4319	mib_idx = LINUX_MIB_TCPDSACKOFOSENT;
				4320
				4321	NET_INC_STATS_BH(sock_net(sk), mib_idx);
				4322
				4323	tp->rx_opt.dsack = 1;
				4324	tp->duplicate_sack[0].start_seq = seq;
				4325	tp->duplicate_sack[0].end_seq = end_seq;
				4326	}
				4327	}
				4328
				4329	static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
				4330	{
				4331	struct tcp_sock *tp = tcp_sk(sk);
				4332
				4333	if (!tp->rx_opt.dsack)
				4334	tcp_dsack_set(sk, seq, end_seq);
				4335	else
				4336	tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
				4337	}
				4338
				4339	static void tcp_send_dupack(struct sock sk, const struct sk_buff skb)
				4340	{
				4341	struct tcp_sock *tp = tcp_sk(sk);
				4342
				4343	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
				4344	before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
				4345	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
				4346	tcp_enter_quickack_mode(sk);
				4347
				4348	if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
				4349	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
				4350
				4351	if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
				4352	end_seq = tp->rcv_nxt;
				4353	tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq);
				4354	}
				4355	}
				4356
				4357	tcp_send_ack(sk);
				4358	}
				4359
				4360	/* These routines update the SACK block as out-of-order packets arrive or
				4361	* in-order packets close up the sequence space.
				4362	*/
				4363	static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
				4364	{
				4365	int this_sack;
				4366	struct tcp_sack_block *sp = &tp->selective_acks[0];
				4367	struct tcp_sack_block *swalk = sp + 1;
				4368
				4369	/* See if the recent change to the first SACK eats into
				4370	* or hits the sequence space of other SACK blocks, if so coalesce.
				4371	*/
				4372	for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;) {
				4373	if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) {
				4374	int i;
				4375
				4376	/* Zap SWALK, by moving every further SACK up by one slot.
				4377	* Decrease num_sacks.
				4378	*/
				4379	tp->rx_opt.num_sacks--;
				4380	for (i = this_sack; i < tp->rx_opt.num_sacks; i++)
				4381	sp[i] = sp[i + 1];
				4382	continue;
				4383	}
				4384	this_sack++, swalk++;
				4385	}
				4386	}
				4387
				4388	static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
				4389	{
				4390	struct tcp_sock *tp = tcp_sk(sk);
				4391	struct tcp_sack_block *sp = &tp->selective_acks[0];
				4392	int cur_sacks = tp->rx_opt.num_sacks;
				4393	int this_sack;
				4394
				4395	if (!cur_sacks)
				4396	goto new_sack;
				4397
				4398	for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) {
				4399	if (tcp_sack_extend(sp, seq, end_seq)) {
				4400	/* Rotate this_sack to the first one. */
				4401	for (; this_sack > 0; this_sack--, sp--)
				4402	swap(sp, (sp - 1));
				4403	if (cur_sacks > 1)
				4404	tcp_sack_maybe_coalesce(tp);
				4405	return;
				4406	}
				4407	}
				4408
				4409	/* Could not find an adjacent existing SACK, build a new one,
				4410	* put it at the front, and shift everyone else down. We
				4411	* always know there is at least one SACK present already here.
				4412	*
				4413	* If the sack array is full, forget about the last one.
				4414	*/
				4415	if (this_sack >= TCP_NUM_SACKS) {
				4416	this_sack--;
				4417	tp->rx_opt.num_sacks--;
				4418	sp--;
				4419	}
				4420	for (; this_sack > 0; this_sack--, sp--)
				4421	sp = (sp - 1);
				4422
				4423	new_sack:
				4424	/* Build the new head SACK, and we're done. */
				4425	sp->start_seq = seq;
				4426	sp->end_seq = end_seq;
				4427	tp->rx_opt.num_sacks++;
				4428	}
				4429
				4430	/* RCV.NXT advances, some SACKs should be eaten. */
				4431
				4432	static void tcp_sack_remove(struct tcp_sock *tp)
				4433	{
				4434	struct tcp_sack_block *sp = &tp->selective_acks[0];
				4435	int num_sacks = tp->rx_opt.num_sacks;
				4436	int this_sack;
				4437
				4438	/* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
				4439	if (skb_queue_empty(&tp->out_of_order_queue)) {
				4440	tp->rx_opt.num_sacks = 0;
				4441	return;
				4442	}
				4443
				4444	for (this_sack = 0; this_sack < num_sacks;) {
				4445	/* Check if the start of the sack is covered by RCV.NXT. */
				4446	if (!before(tp->rcv_nxt, sp->start_seq)) {
				4447	int i;
				4448
				4449	/* RCV.NXT must cover all the block! */
				4450	WARN_ON(before(tp->rcv_nxt, sp->end_seq));
				4451
				4452	/* Zap this SACK, by moving forward any other SACKS. */
				4453	for (i=this_sack+1; i < num_sacks; i++)
				4454	tp->selective_acks[i-1] = tp->selective_acks[i];
				4455	num_sacks--;
				4456	continue;
				4457	}
				4458	this_sack++;
				4459	sp++;
				4460	}
				4461	tp->rx_opt.num_sacks = num_sacks;
				4462	}
				4463
				4464	/* This one checks to see if we can put data from the
				4465	* out_of_order queue into the receive_queue.
				4466	*/
				4467	static void tcp_ofo_queue(struct sock *sk)
				4468	{
				4469	struct tcp_sock *tp = tcp_sk(sk);
				4470	__u32 dsack_high = tp->rcv_nxt;
				4471	struct sk_buff *skb;
				4472
				4473	while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) {
				4474	if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
				4475	break;
				4476
				4477	if (before(TCP_SKB_CB(skb)->seq, dsack_high)) {
				4478	__u32 dsack = dsack_high;
				4479	if (before(TCP_SKB_CB(skb)->end_seq, dsack_high))
				4480	dsack_high = TCP_SKB_CB(skb)->end_seq;
				4481	tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
				4482	}
				4483
				4484	if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
				4485	SOCK_DEBUG(sk, "ofo packet was already received\n");
				4486	__skb_unlink(skb, &tp->out_of_order_queue);
				4487	__kfree_skb(skb);
				4488	continue;
				4489	}
				4490	SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
				4491	tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
				4492	TCP_SKB_CB(skb)->end_seq);
				4493
				4494	__skb_unlink(skb, &tp->out_of_order_queue);
				4495	__skb_queue_tail(&sk->sk_receive_queue, skb);
				4496	tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
				4497	if (tcp_hdr(skb)->fin)
				4498	tcp_fin(sk);
				4499	}
				4500	}
				4501
				4502	static int tcp_prune_ofo_queue(struct sock *sk);
				4503	static int tcp_prune_queue(struct sock *sk);
				4504
				4505	static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size)
				4506	{
				4507	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf \|\|
				4508	!sk_rmem_schedule(sk, size)) {
				4509
				4510	if (tcp_prune_queue(sk) < 0)
				4511	return -1;
				4512
				4513	if (!sk_rmem_schedule(sk, size)) {
				4514	if (!tcp_prune_ofo_queue(sk))
				4515	return -1;
				4516
				4517	if (!sk_rmem_schedule(sk, size))
				4518	return -1;
				4519	}
				4520	}
				4521	return 0;
				4522	}
				4523
				4524	static void tcp_data_queue_ofo(struct sock sk, struct sk_buff skb)
				4525	{
				4526	struct tcp_sock *tp = tcp_sk(sk);
				4527	struct sk_buff *skb1;
				4528	u32 seq, end_seq;
				4529
				4530	TCP_ECN_check_ce(tp, skb);
				4531
				4532	if (tcp_try_rmem_schedule(sk, skb->truesize)) {
				4533	/* TODO: should increment a counter */
				4534	TCP_SOCK_TRACK(sk, TCP_RECV_BUFF_FULL);
				4535	__kfree_skb(skb);
				4536	return;
				4537	}
				4538
				4539	/* Disable header prediction. */
				4540	tp->pred_flags = 0;
				4541	inet_csk_schedule_ack(sk);
				4542
				4543	SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
				4544	tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
				4545
				4546	skb1 = skb_peek_tail(&tp->out_of_order_queue);
				4547	if (!skb1) {
				4548	/* Initial out of order segment, build 1 SACK. */
				4549	if (tcp_is_sack(tp)) {
				4550	tp->rx_opt.num_sacks = 1;
				4551	tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
				4552	tp->selective_acks[0].end_seq =
				4553	TCP_SKB_CB(skb)->end_seq;
				4554	}
				4555	__skb_queue_head(&tp->out_of_order_queue, skb);
				4556	goto end;
				4557	}
				4558
				4559	seq = TCP_SKB_CB(skb)->seq;
				4560	end_seq = TCP_SKB_CB(skb)->end_seq;
				4561
				4562	if (seq == TCP_SKB_CB(skb1)->end_seq) {
				4563	/* Packets in ofo can stay in queue a long time.
				4564	* Better try to coalesce them right now
				4565	* to avoid future tcp_collapse_ofo_queue(),
				4566	* probably the most expensive function in tcp stack.
				4567	*/
				4568	if (skb->len <= skb_tailroom(skb1) && !tcp_hdr(skb)->fin) {
				4569	NET_INC_STATS_BH(sock_net(sk),
				4570	LINUX_MIB_TCPRCVCOALESCE);
				4571	BUG_ON(skb_copy_bits(skb, 0,
				4572	skb_put(skb1, skb->len),
				4573	skb->len));
				4574	TCP_SKB_CB(skb1)->end_seq = end_seq;
				4575	TCP_SKB_CB(skb1)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
				4576	__kfree_skb(skb);
				4577	skb = NULL;
				4578	} else {
				4579	__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
				4580	}
				4581
				4582	if (!tp->rx_opt.num_sacks \|\|
				4583	tp->selective_acks[0].end_seq != seq)
				4584	goto add_sack;
				4585
				4586	/* Common case: data arrive in order after hole. */
				4587	tp->selective_acks[0].end_seq = end_seq;
				4588	goto end;
				4589	}
				4590
				4591	/* Find place to insert this segment. */
				4592	while (1) {
				4593	if (!after(TCP_SKB_CB(skb1)->seq, seq))
				4594	break;
				4595	if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) {
				4596	skb1 = NULL;
				4597	break;
				4598	}
				4599	skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1);
				4600	}
				4601
				4602	/* Do skb overlap to previous one? */
				4603	if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
				4604	if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
				4605	/* All the bits are present. Drop. */
				4606	__kfree_skb(skb);
				4607	skb = NULL;
				4608	tcp_dsack_set(sk, seq, end_seq);
				4609	goto add_sack;
				4610	}
				4611	if (after(seq, TCP_SKB_CB(skb1)->seq)) {
				4612	/* Partial overlap. */
				4613	tcp_dsack_set(sk, seq,
				4614	TCP_SKB_CB(skb1)->end_seq);
				4615	} else {
				4616	if (skb_queue_is_first(&tp->out_of_order_queue,
				4617	skb1))
				4618	skb1 = NULL;
				4619	else
				4620	skb1 = skb_queue_prev(
				4621	&tp->out_of_order_queue,
				4622	skb1);
				4623	}
				4624	}
				4625	if (!skb1)
				4626	__skb_queue_head(&tp->out_of_order_queue, skb);
				4627	else
				4628	__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
				4629
				4630	/* And clean segments covered by new one as whole. */
				4631	while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) {
				4632	skb1 = skb_queue_next(&tp->out_of_order_queue, skb);
				4633
				4634	if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
				4635	break;
				4636	if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
				4637	tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
				4638	end_seq);
				4639	break;
				4640	}
				4641	__skb_unlink(skb1, &tp->out_of_order_queue);
				4642	tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
				4643	TCP_SKB_CB(skb1)->end_seq);
				4644	__kfree_skb(skb1);
				4645	}
				4646
				4647	add_sack:
				4648	if (tcp_is_sack(tp))
				4649	tcp_sack_new_ofo_skb(sk, seq, end_seq);
				4650	end:
				4651	if (skb)
				4652	skb_set_owner_r(skb, sk);
				4653	}
				4654
				4655
				4656	static void tcp_data_queue(struct sock sk, struct sk_buff skb)
				4657	{
				4658	const struct tcphdr *th = tcp_hdr(skb);
				4659	struct tcp_sock *tp = tcp_sk(sk);
				4660	int eaten = -1;
				4661
				4662	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
				4663	goto drop;
				4664
				4665	skb_dst_drop(skb);
				4666	__skb_pull(skb, th->doff * 4);
				4667
				4668	TCP_ECN_accept_cwr(tp, skb);
				4669
				4670	tp->rx_opt.dsack = 0;
				4671
				4672	/* Queue data for delivery to the user.
				4673	* Packets in sequence go to the receive queue.
				4674	* Out of sequence packets to the out_of_order_queue.
				4675	*/
				4676	if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
				4677	if (tcp_receive_window(tp) == 0)
				4678	goto out_of_window;
				4679
				4680	/* Ok. In sequence. In window. */
				4681	if (tp->ucopy.task == current &&
				4682	tp->copied_seq == tp->rcv_nxt && tp->ucopy.len &&
				4683	sock_owned_by_user(sk) && !tp->urg_data) {
				4684	int chunk = min_t(unsigned int, skb->len,
				4685	tp->ucopy.len);
				4686
				4687	__set_current_state(TASK_RUNNING);
				4688
				4689	local_bh_enable();
				4690	if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) {
				4691	tp->ucopy.len -= chunk;
				4692	tp->copied_seq += chunk;
				4693	eaten = (chunk == skb->len);
				4694	tcp_rcv_space_adjust(sk);
				4695	}
				4696	local_bh_disable();
				4697	}
				4698
				4699	if (eaten <= 0) {
				4700	queue_and_out:
				4701	if (eaten < 0 &&
				4702	tcp_try_rmem_schedule(sk, skb->truesize))
				4703	{
				4704	TCP_SOCK_TRACK(sk, TCP_RECV_BUFF_FULL);
				4705	goto drop;
				4706	}
				4707
				4708	skb_set_owner_r(skb, sk);
				4709	__skb_queue_tail(&sk->sk_receive_queue, skb);
				4710	}
				4711	tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
				4712	if (skb->len)
				4713	tcp_event_data_recv(sk, skb);
				4714	if (th->fin)
				4715	tcp_fin(sk);
				4716
				4717	if (!skb_queue_empty(&tp->out_of_order_queue)) {
				4718
				4719	/ÔÚ¿ìËÙÖØ´«Ê±£¬µ±Á¬ÐøÊÕµ½3¸öÒ»ÑùµÄACKÊ±£¬ÖØ´«±¨ÎÄ£¬ËùÒÔ¿ÉÒÔÈÏÎªÂÒÐò´ïµ½3¸ö£¬³öÏÖ¶ª°ü/
				4720	if(tp->out_of_order_queue.qlen >= 3)
				4721	TCP_PKT_STATS_INC(TCP_RECV_DROPS);
				4722
				4723	tcp_ofo_queue(sk);
				4724
				4725	/* RFC2581. 4.2. SHOULD send immediate ACK, when
				4726	* gap in queue is filled.
				4727	*/
				4728	if (skb_queue_empty(&tp->out_of_order_queue))
				4729	inet_csk(sk)->icsk_ack.pingpong = 0;
				4730	}
				4731
				4732	if (tp->rx_opt.num_sacks)
				4733	tcp_sack_remove(tp);
				4734
				4735	tcp_fast_path_check(sk);
				4736
				4737	if (eaten > 0)
				4738	__kfree_skb(skb);
				4739	else if (!sock_flag(sk, SOCK_DEAD))
				4740	sk->sk_data_ready(sk, 0);
				4741	return;
				4742	}
				4743
				4744	if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
				4745	/* A retransmit, 2nd most common case. Force an immediate ack. */
				4746	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
				4747	tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
				4748
				4749	/´ËÊ±½ÓÊÕ¶Ë»ØµÄµÄACK°ü¶ªÊ§/
				4750	TCP_PKT_STATS_INC(TCP_SEND_DROPS);
				4751
				4752	out_of_window:
				4753	TCP_SOCK_TRACK(sk, TCP_RECV_WINDOW_FULL);
				4754	tcp_enter_quickack_mode(sk);
				4755	inet_csk_schedule_ack(sk);
				4756	drop:
				4757	__kfree_skb(skb);
				4758	return;
				4759	}
				4760
				4761	/* Out of window. F.e. zero window probe. */
				4762	if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
				4763	goto out_of_window;
				4764
				4765	tcp_enter_quickack_mode(sk);
				4766
				4767	if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
				4768	/* Partial packet, seq < rcv_next < end_seq */
				4769	SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
				4770	tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
				4771	TCP_SKB_CB(skb)->end_seq);
				4772
				4773	tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
				4774
				4775	/* If window is closed, drop tail of packet. But after
				4776	* remembering D-SACK for its head made in previous line.
				4777	*/
				4778	if (!tcp_receive_window(tp))
				4779	goto out_of_window;
				4780	goto queue_and_out;
				4781	}
				4782
				4783	tcp_data_queue_ofo(sk, skb);
				4784	}
				4785
				4786	static struct sk_buff tcp_collapse_one(struct sock sk, struct sk_buff *skb,
				4787	struct sk_buff_head *list)
				4788	{
				4789	struct sk_buff *next = NULL;
				4790
				4791	if (!skb_queue_is_last(list, skb))
				4792	next = skb_queue_next(list, skb);
				4793
				4794	__skb_unlink(skb, list);
				4795	__kfree_skb(skb);
				4796	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
				4797
				4798	return next;
				4799	}
				4800
				4801	/* Collapse contiguous sequence of skbs head..tail with
				4802	* sequence numbers start..end.
				4803	*
				4804	* If tail is NULL, this means until the end of the list.
				4805	*
				4806	* Segments with FIN/SYN are not collapsed (only because this
				4807	* simplifies code)
				4808	*/
				4809	static void
				4810	tcp_collapse(struct sock sk, struct sk_buff_head list,
				4811	struct sk_buff head, struct sk_buff tail,
				4812	u32 start, u32 end)
				4813	{
				4814	struct sk_buff skb, n;
				4815	bool end_of_skbs;
				4816
				4817	/* First, check that queue is collapsible and find
				4818	* the point where collapsing can be useful. */
				4819	skb = head;
				4820	restart:
				4821	end_of_skbs = true;
				4822	skb_queue_walk_from_safe(list, skb, n) {
				4823	if (skb == tail)
				4824	break;
				4825	/* No new bits? It is possible on ofo queue. */
				4826	if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
				4827	skb = tcp_collapse_one(sk, skb, list);
				4828	if (!skb)
				4829	break;
				4830	goto restart;
				4831	}
				4832
				4833	/* The first skb to collapse is:
				4834	* - not SYN/FIN and
				4835	* - bloated or contains data before "start" or
				4836	* overlaps to the next one.
				4837	*/
				4838	if (!tcp_hdr(skb)->syn && !tcp_hdr(skb)->fin &&
				4839	(tcp_win_from_space(skb->truesize) > skb->len \|\|
				4840	before(TCP_SKB_CB(skb)->seq, start))) {
				4841	end_of_skbs = false;
				4842	break;
				4843	}
				4844
				4845	if (!skb_queue_is_last(list, skb)) {
				4846	struct sk_buff *next = skb_queue_next(list, skb);
				4847	if (next != tail &&
				4848	TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) {
				4849	end_of_skbs = false;
				4850	break;
				4851	}
				4852	}
				4853
				4854	/* Decided to skip this, advance start seq. */
				4855	start = TCP_SKB_CB(skb)->end_seq;
				4856	}
				4857	if (end_of_skbs \|\| tcp_hdr(skb)->syn \|\| tcp_hdr(skb)->fin)
				4858	return;
				4859
				4860	while (before(start, end)) {
				4861	struct sk_buff *nskb;
				4862	unsigned int header = skb_headroom(skb);
				4863	int copy = SKB_MAX_ORDER(header, 0);
				4864
				4865	/* Too big header? This can happen with IPv6. */
				4866	if (copy < 0)
				4867	return;
				4868	if (end - start < copy)
				4869	copy = end - start;
				4870	nskb = alloc_skb(copy + header, GFP_ATOMIC);
				4871	if (!nskb)
				4872	return;
				4873
				4874	skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head);
				4875	skb_set_network_header(nskb, (skb_network_header(skb) -
				4876	skb->head));
				4877	skb_set_transport_header(nskb, (skb_transport_header(skb) -
				4878	skb->head));
				4879	skb_reserve(nskb, header);
				4880	memcpy(nskb->head, skb->head, header);
				4881	memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
				4882	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
				4883	__skb_queue_before(list, skb, nskb);
				4884	skb_set_owner_r(nskb, sk);
				4885
				4886	/* Copy data, releasing collapsed skbs. */
				4887	while (copy > 0) {
				4888	int offset = start - TCP_SKB_CB(skb)->seq;
				4889	int size = TCP_SKB_CB(skb)->end_seq - start;
				4890
				4891	BUG_ON(offset < 0);
				4892	if (size > 0) {
				4893	size = min(copy, size);
				4894	if (skb_copy_bits(skb, offset, skb_put(nskb, size), size))
				4895	BUG();
				4896	TCP_SKB_CB(nskb)->end_seq += size;
				4897	copy -= size;
				4898	start += size;
				4899	}
				4900	if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
				4901	skb = tcp_collapse_one(sk, skb, list);
				4902	if (!skb \|\|
				4903	skb == tail \|\|
				4904	tcp_hdr(skb)->syn \|\|
				4905	tcp_hdr(skb)->fin)
				4906	return;
				4907	}
				4908	}
				4909	}
				4910	}
				4911
				4912	/* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
				4913	* and tcp_collapse() them until all the queue is collapsed.
				4914	*/
				4915	static void tcp_collapse_ofo_queue(struct sock *sk)
				4916	{
				4917	struct tcp_sock *tp = tcp_sk(sk);
				4918	struct sk_buff *skb = skb_peek(&tp->out_of_order_queue);
				4919	struct sk_buff *head;
				4920	u32 start, end;
				4921
				4922	if (skb == NULL)
				4923	return;
				4924
				4925	start = TCP_SKB_CB(skb)->seq;
				4926	end = TCP_SKB_CB(skb)->end_seq;
				4927	head = skb;
				4928
				4929	for (;;) {
				4930	struct sk_buff *next = NULL;
				4931
				4932	if (!skb_queue_is_last(&tp->out_of_order_queue, skb))
				4933	next = skb_queue_next(&tp->out_of_order_queue, skb);
				4934	skb = next;
				4935
				4936	/* Segment is terminated when we see gap or when
				4937	* we are at the end of all the queue. */
				4938	if (!skb \|\|
				4939	after(TCP_SKB_CB(skb)->seq, end) \|\|
				4940	before(TCP_SKB_CB(skb)->end_seq, start)) {
				4941	tcp_collapse(sk, &tp->out_of_order_queue,
				4942	head, skb, start, end);
				4943	head = skb;
				4944	if (!skb)
				4945	break;
				4946	/* Start new segment */
				4947	start = TCP_SKB_CB(skb)->seq;
				4948	end = TCP_SKB_CB(skb)->end_seq;
				4949	} else {
				4950	if (before(TCP_SKB_CB(skb)->seq, start))
				4951	start = TCP_SKB_CB(skb)->seq;
				4952	if (after(TCP_SKB_CB(skb)->end_seq, end))
				4953	end = TCP_SKB_CB(skb)->end_seq;
				4954	}
				4955	}
				4956	}
				4957
				4958	/*
				4959	* Purge the out-of-order queue.
				4960	* Return true if queue was pruned.
				4961	*/
				4962	static int tcp_prune_ofo_queue(struct sock *sk)
				4963	{
				4964	struct tcp_sock *tp = tcp_sk(sk);
				4965	int res = 0;
				4966
				4967	if (!skb_queue_empty(&tp->out_of_order_queue)) {
				4968	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
				4969	__skb_queue_purge(&tp->out_of_order_queue);
				4970
				4971	/* Reset SACK state. A conforming SACK implementation will
				4972	* do the same at a timeout based retransmit. When a connection
				4973	* is in a sad state like this, we care only about integrity
				4974	* of the connection not performance.
				4975	*/
				4976	if (tp->rx_opt.sack_ok)
				4977	tcp_sack_reset(&tp->rx_opt);
				4978	sk_mem_reclaim(sk);
				4979	res = 1;
				4980	}
				4981	return res;
				4982	}
				4983
				4984	/* Reduce allocated memory if we can, trying to get
				4985	* the socket within its memory limits again.
				4986	*
				4987	* Return less than zero if we should start dropping frames
				4988	* until the socket owning process reads some of the data
				4989	* to stabilize the situation.
				4990	*/
				4991	static int tcp_prune_queue(struct sock *sk)
				4992	{
				4993	struct tcp_sock *tp = tcp_sk(sk);
				4994
				4995	SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
				4996
				4997	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PRUNECALLED);
				4998
				4999	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
				5000	tcp_clamp_window(sk);
				5001	else if (sk_under_memory_pressure(sk))
				5002	tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
				5003
				5004	tcp_collapse_ofo_queue(sk);
				5005	if (!skb_queue_empty(&sk->sk_receive_queue))
				5006	tcp_collapse(sk, &sk->sk_receive_queue,
				5007	skb_peek(&sk->sk_receive_queue),
				5008	NULL,
				5009	tp->copied_seq, tp->rcv_nxt);
				5010	sk_mem_reclaim(sk);
				5011
				5012	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
				5013	return 0;
				5014
				5015	/* Collapsing did not help, destructive actions follow.
				5016	* This must not ever occur. */
				5017
				5018	tcp_prune_ofo_queue(sk);
				5019
				5020	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
				5021	return 0;
				5022
				5023	/* If we are really being abused, tell the caller to silently
				5024	* drop receive data on the floor. It will get retransmitted
				5025	* and hopefully then we'll have sufficient space.
				5026	*/
				5027	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_RCVPRUNED);
				5028
				5029	/* Massive buffer overcommit. */
				5030	tp->pred_flags = 0;
				5031	return -1;
				5032	}
				5033
				5034	/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
				5035	* As additional protections, we do not touch cwnd in retransmission phases,
				5036	* and if application hit its sndbuf limit recently.
				5037	*/
				5038	void tcp_cwnd_application_limited(struct sock *sk)
				5039	{
				5040	struct tcp_sock *tp = tcp_sk(sk);
				5041
				5042	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
				5043	sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
				5044	/* Limited by application or receiver window. */
				5045	u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
				5046	u32 win_used = max(tp->snd_cwnd_used, init_win);
				5047	if (win_used < tp->snd_cwnd) {
				5048	tp->snd_ssthresh = tcp_current_ssthresh(sk);
				5049	tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
				5050	}
				5051	tp->snd_cwnd_used = 0;
				5052	}
				5053	tp->snd_cwnd_stamp = tcp_time_stamp;
				5054	}
				5055
				5056	static int tcp_should_expand_sndbuf(const struct sock *sk)
				5057	{
				5058	const struct tcp_sock *tp = tcp_sk(sk);
				5059
				5060	/* If the user specified a specific send buffer setting, do
				5061	* not modify it.
				5062	*/
				5063	if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
				5064	return 0;
				5065
				5066	/* If we are under global TCP memory pressure, do not expand. */
				5067	if (sk_under_memory_pressure(sk))
				5068	return 0;
				5069
				5070	/* If we are under soft global TCP memory pressure, do not expand. */
				5071	if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
				5072	return 0;
				5073
				5074	/* If we filled the congestion window, do not expand. */
				5075	if (tp->packets_out >= tp->snd_cwnd)
				5076	return 0;
				5077
				5078	return 1;
				5079	}
				5080
				5081	/* When incoming ACK allowed to free some skb from write_queue,
				5082	* we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket
				5083	* on the exit from tcp input handler.
				5084	*
				5085	* PROBLEM: sndbuf expansion does not work well with largesend.
				5086	*/
				5087	static void tcp_new_space(struct sock *sk)
				5088	{
				5089	struct tcp_sock *tp = tcp_sk(sk);
				5090
				5091	if (tcp_should_expand_sndbuf(sk)) {
				5092	int sndmem = SKB_TRUESIZE(max_t(u32,
				5093	tp->rx_opt.mss_clamp,
				5094	tp->mss_cache) +
				5095	MAX_TCP_HEADER);
				5096	int demanded = max_t(unsigned int, tp->snd_cwnd,
				5097	tp->reordering + 1);
				5098	sndmem = 2 demanded;
				5099	if (sndmem > sk->sk_sndbuf)
				5100	sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
				5101	tp->snd_cwnd_stamp = tcp_time_stamp;
				5102	}
				5103
				5104	sk->sk_write_space(sk);
				5105	}
				5106
				5107	static void tcp_check_space(struct sock *sk)
				5108	{
				5109	if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
				5110	sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
				5111	if (sk->sk_socket &&
				5112	test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
				5113	tcp_new_space(sk);
				5114	}
				5115	}
				5116
				/* Send-side housekeeping: push any pending frames first, then check
				 * whether freed write-queue space should be reported.
				 */
				static inline void tcp_data_snd_check(struct sock *sk)
				{
					tcp_push_pending_frames(sk);

					tcp_check_space(sk);
				}
				5122
				5123	/*
				5124	* Check if sending an ack is needed.
				5125	*/
				5126	static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
				5127	{
				5128	struct tcp_sock *tp = tcp_sk(sk);
				5129
				5130	/* More than one full frame received... */
				5131	if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
				5132	/* ... and right edge of window advances far enough.
				5133	* (tcp_recvmsg() will send ACK otherwise). Or...
				5134	*/
				5135	__tcp_select_window(sk) >= tp->rcv_wnd) \|\|
				5136	/* We ACK each frame or... */
				5137	tcp_in_quickack_mode(sk) \|\|
				5138	/* We have out of order data. */
				5139	(ofo_possible && skb_peek(&tp->out_of_order_queue))) {
				5140	/* Then ack it now */
				5141	tcp_send_ack(sk);
				5142	} else {
				5143	/* Else, send delayed ack. */
				5144	tcp_send_delayed_ack(sk);
				5145	}
				5146	}
				5147
				/* Run the ACK decision only when an ACK is still scheduled.  If none is
				 * scheduled we sent a data segment already and there is nothing to do.
				 */
				static inline void tcp_ack_snd_check(struct sock *sk)
				{
					if (inet_csk_ack_scheduled(sk))
						__tcp_ack_snd_check(sk, 1);
				}
				5156
				5157	/*
				5158	* This routine is only called when we have urgent data
				5159	* signaled. Its the 'slow' part of tcp_urg. It could be
				5160	* moved inline now as tcp_urg is only called from one
				5161	* place. We handle URGent data wrong. We have to - as
				5162	* BSD still doesn't use the correction from RFC961.
				5163	* For 1003.1g we should support a new option TCP_STDURG to permit
				5164	* either form (or just set the sysctl tcp_stdurg).
				5165	*/
				5166
				5167	static void tcp_check_urg(struct sock sk, const struct tcphdr th)
				5168	{
				5169	struct tcp_sock *tp = tcp_sk(sk);
				5170	u32 ptr = ntohs(th->urg_ptr);
				5171
				5172	if (ptr && !sysctl_tcp_stdurg)
				5173	ptr--;
				5174	ptr += ntohl(th->seq);
				5175
				5176	/* Ignore urgent data that we've already seen and read. */
				5177	if (after(tp->copied_seq, ptr))
				5178	return;
				5179
				5180	/* Do not replay urg ptr.
				5181	*
				5182	* NOTE: interesting situation not covered by specs.
				5183	* Misbehaving sender may send urg ptr, pointing to segment,
				5184	* which we already have in ofo queue. We are not able to fetch
				5185	* such data and will stay in TCP_URG_NOTYET until will be eaten
				5186	* by recvmsg(). Seems, we are not obliged to handle such wicked
				5187	* situations. But it is worth to think about possibility of some
				5188	* DoSes using some hypothetical application level deadlock.
				5189	*/
				5190	if (before(ptr, tp->rcv_nxt))
				5191	return;
				5192
				5193	/* Do we already have a newer (or duplicate) urgent pointer? */
				5194	if (tp->urg_data && !after(ptr, tp->urg_seq))
				5195	return;
				5196
				5197	/* Tell the world about our new urgent pointer. */
				5198	sk_send_sigurg(sk);
				5199
				5200	/* We may be adding urgent data when the last byte read was
				5201	* urgent. To do this requires some care. We cannot just ignore
				5202	* tp->copied_seq since we would read the last urgent byte again
				5203	* as data, nor can we alter copied_seq until this data arrives
				5204	* or we break the semantics of SIOCATMARK (and thus sockatmark())
				5205	*
				5206	* NOTE. Double Dutch. Rendering to plain English: author of comment
				5207	* above did something sort of send("A", MSG_OOB); send("B", MSG_OOB);
				5208	* and expect that both A and B disappear from stream. This is _wrong_.
				5209	* Though this happens in BSD with high probability, this is occasional.
				5210	* Any application relying on this is buggy. Note also, that fix "works"
				5211	* only in this artificial test. Insert some normal data between A and B and we will
				5212	* decline of BSD again. Verdict: it is better to remove to trap
				5213	* buggy users.
				5214	*/
				5215	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
				5216	!sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
				5217	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
				5218	tp->copied_seq++;
				5219	if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
				5220	__skb_unlink(skb, &sk->sk_receive_queue);
				5221	__kfree_skb(skb);
				5222	}
				5223	}
				5224
				5225	tp->urg_data = TCP_URG_NOTYET;
				5226	tp->urg_seq = ptr;
				5227
				5228	/* Disable header prediction. */
				5229	tp->pred_flags = 0;
				5230	}
				5231
				5232	/* This is the 'fast' part of urgent handling. */
				5233	static void tcp_urg(struct sock sk, struct sk_buff skb, const struct tcphdr *th)
				5234	{
				5235	struct tcp_sock *tp = tcp_sk(sk);
				5236
				5237	/* Check if we get a new urgent pointer - normally not. */
				5238	if (th->urg)
				5239	tcp_check_urg(sk, th);
				5240
				5241	/* Do we wait for any urgent data? - normally not... */
				5242	if (tp->urg_data == TCP_URG_NOTYET) {
				5243	u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) -
				5244	th->syn;
				5245
				5246	/* Is the urgent pointer pointing into this packet? */
				5247	if (ptr < skb->len) {
				5248	u8 tmp;
				5249	if (skb_copy_bits(skb, ptr, &tmp, 1))
				5250	BUG();
				5251	tp->urg_data = TCP_URG_VALID \| tmp;
				5252	if (!sock_flag(sk, SOCK_DEAD))
				5253	sk->sk_data_ready(sk, 0);
				5254	}
				5255	}
				5256	}
				5257
				5258	static int tcp_copy_to_iovec(struct sock sk, struct sk_buff skb, int hlen)
				5259	{
				5260	struct tcp_sock *tp = tcp_sk(sk);
				5261	int chunk = skb->len - hlen;
				5262	int err;
				5263
				5264	local_bh_enable();
				5265	if (skb_csum_unnecessary(skb))
				5266	err = skb_copy_datagram_iovec(skb, hlen, tp->ucopy.iov, chunk);
				5267	else
				5268	err = skb_copy_and_csum_datagram_iovec(skb, hlen,
				5269	tp->ucopy.iov);
				5270
				5271	if (!err) {
				5272	tp->ucopy.len -= chunk;
				5273	tp->copied_seq += chunk;
				5274	tcp_rcv_space_adjust(sk);
				5275	}
				5276
				5277	local_bh_disable();
				5278	return err;
				5279	}
				5280
				5281	static __sum16 __tcp_checksum_complete_user(struct sock *sk,
				5282	struct sk_buff *skb)
				5283	{
				5284	__sum16 result;
				5285
				5286	if (sock_owned_by_user(sk)) {
				5287	local_bh_enable();
				5288	result = __tcp_checksum_complete(skb);
				5289	local_bh_disable();
				5290	} else {
				5291	result = __tcp_checksum_complete(skb);
				5292	}
				5293	return result;
				5294	}
				5295
				/*
				 * Returns non-zero (1) when @skb still needs checksum verification and
				 * __tcp_checksum_complete_user() reports a non-zero result; returns 0
				 * when no checksum is needed or the computed result is zero.
				 */
				static inline int tcp_checksum_complete_user(struct sock *sk,
									     struct sk_buff *skb)
				{
					if (skb_csum_unnecessary(skb))
						return 0;

					return __tcp_checksum_complete_user(sk, skb) != 0;
				}
				5302
				#ifdef CONFIG_NET_DMA
				/*
				 * Try to start a DMA copy of the payload of @skb (past @hlen header
				 * bytes) straight into the reader's iovec.
				 *
				 * Nothing is attempted while a reader wakeup is already pending.  A DMA
				 * channel is looked up on demand when a pinned list exists.  The copy
				 * is only issued for skbs that need no checksum; on success the
				 * returned cookie is stored, ucopy.len/copied_seq are advanced and the
				 * reader is woken when the request is complete, PSH is set, or more
				 * than half of sk_rcvbuf is in use.  If no copy can be issued but the
				 * segment carries data, the reader is woken instead.
				 *
				 * Returns 1 if the copy was started, 0 otherwise.
				 */
				static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
								  int hlen)
				{
					struct tcp_sock *tp = tcp_sk(sk);
					int chunk = skb->len - hlen;
					int dma_cookie;
					int copied_early = 0;

					if (tp->ucopy.wakeup)
						return 0;

					if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
						tp->ucopy.dma_chan = net_dma_find_channel();

					if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) {

						dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan,
											 skb, hlen,
											 tp->ucopy.iov, chunk,
											 tp->ucopy.pinned_list);

						/* The copy could not be issued: report "not copied". */
						if (dma_cookie < 0)
							goto out;

						tp->ucopy.dma_cookie = dma_cookie;
						copied_early = 1;

						tp->ucopy.len -= chunk;
						tp->copied_seq += chunk;
						tcp_rcv_space_adjust(sk);

						if ((tp->ucopy.len == 0) ||
						    (tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) ||
						    (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) {
							tp->ucopy.wakeup = 1;
							sk->sk_data_ready(sk, 0);
						}
					} else if (chunk > 0) {
						tp->ucopy.wakeup = 1;
						sk->sk_data_ready(sk, 0);
					}
				out:
					return copied_early;
				}
				#endif /* CONFIG_NET_DMA */
				5349
				5350	/* Does PAWS and seqno based validation of an incoming segment, flags will
				5351	* play significant role here.
				5352	*/
				5353	static bool tcp_validate_incoming(struct sock sk, struct sk_buff skb,
				5354	const struct tcphdr *th, int syn_inerr)
				5355	{
				5356	const u8 *hash_location;
				5357	struct tcp_sock *tp = tcp_sk(sk);
				5358
				5359	/* RFC1323: H1. Apply PAWS check first. */
				5360	if (tcp_fast_parse_options(skb, th, tp, &hash_location) &&
				5361	tp->rx_opt.saw_tstamp &&
				5362	tcp_paws_discard(sk, skb)) {
				5363	if (!th->rst) {
				5364	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
				5365	tcp_send_dupack(sk, skb);
				5366	goto discard;
				5367	}
				5368	/* Reset is accepted even if it did not pass PAWS. */
				5369	}
				5370
				5371	/* Step 1: check sequence number */
				5372	if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
				5373	/* RFC793, page 37: "In all states except SYN-SENT, all reset
				5374	* (RST) segments are validated by checking their SEQ-fields."
				5375	* And page 69: "If an incoming segment is not acceptable,
				5376	* an acknowledgment should be sent in reply (unless the RST
				5377	* bit is set, if so drop the segment and return)".
				5378	*/
				5379	if (!th->rst) {
				5380	if (th->syn)
				5381	goto syn_challenge;
				5382	tcp_send_dupack(sk, skb);
				5383	}
				5384	goto discard;
				5385	}
				5386
				5387	/* Step 2: check RST bit */
				5388	if (th->rst) {
				5389	/* RFC 5961 3.2 :
				5390	* If sequence number exactly matches RCV.NXT, then
				5391	* RESET the connection
				5392	* else
				5393	* Send a challenge ACK
				5394	*/
				5395	if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt)
				5396	tcp_reset(sk);
				5397	else
				5398	tcp_send_challenge_ack(sk);
				5399	goto discard;
				5400	}
				5401
				5402	/* step 3: check security and precedence [ignored] */
				5403
				5404	/* step 4: Check for a SYN
				5405	* RFC 5691 4.2 : Send a challenge ack
				5406	*/
				5407	if (th->syn) {
				5408	syn_challenge:
				5409	if (syn_inerr)
				5410	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
				5411	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
				5412	tcp_send_challenge_ack(sk);
				5413	goto discard;
				5414	}
				5415
				5416	return true;
				5417
				5418	discard:
				5419	__kfree_skb(skb);
				5420	return false;
				5421	}
				5422
				5423	/*
				5424	* TCP receive function for the ESTABLISHED state.
				5425	*
				5426	* It is split into a fast path and a slow path. The fast path is
				5427	* disabled when:
				5428	* - A zero window was announced from us - zero window probing
				5429	* is only handled properly in the slow path.
				5430	* - Out of order segments arrived.
				5431	* - Urgent data is expected.
				5432	* - There is no buffer space left
				5433	* - Unexpected TCP flags/window values/header lengths are received
				5434	* (detected by checking the TCP header against pred_flags)
				5435	* - Data is sent in both directions. Fast path only supports pure senders
				5436	* or pure receivers (this means either the sequence number or the ack
				5437	* value must stay constant)
				5438	* - Unexpected TCP option.
				5439	*
				5440	* When these conditions are not satisfied it drops into a standard
				5441	* receive procedure patterned after RFC793 to handle all cases.
				5442	* The first three cases are guaranteed by proper pred_flags setting,
				5443	* the rest is checked inline. Fast processing is turned on in
				5444	* tcp_data_queue when everything is OK.
				5445	*/
				5446	int tcp_rcv_established(struct sock sk, struct sk_buff skb,
				5447	const struct tcphdr *th, unsigned int len)
				5448	{
				5449	struct tcp_sock *tp = tcp_sk(sk);
				5450
				5451	/*
				5452	* Header prediction.
				5453	* The code loosely follows the one in the famous
				5454	* "30 instruction TCP receive" Van Jacobson mail.
				5455	*
				5456	* Van's trick is to deposit buffers into socket queue
				5457	* on a device interrupt, to call tcp_recv function
				5458	* on the receive process context and checksum and copy
				5459	* the buffer to user space. smart...
				5460	*
				5461	* Our current scheme is not silly either but we take the
				5462	* extra cost of the net_bh soft interrupt processing...
				5463	* We do checksum and copy also but from device to kernel.
				5464	*/
				5465
				5466	tp->rx_opt.saw_tstamp = 0;
				5467
				5468	/* pred_flags is 0xS?10 << 16 + snd_wnd
				5469	* if header_prediction is to be made
				5470	* 'S' will always be tp->tcp_header_len >> 2
				5471	* '?' will be 0 for the fast path, otherwise pred_flags is 0 to
				5472	* turn it off (when there are holes in the receive
				5473	* space for instance)
				5474	* PSH flag is ignored.
				5475	*/
				5476
				5477	if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
				5478	TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
				5479	!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
				5480	int tcp_header_len = tp->tcp_header_len;
				5481
				5482	/* Timestamp header prediction: tcp_header_len
				5483	* is automatically equal to th->doff*4 due to pred_flags
				5484	* match.
				5485	*/
				5486
				5487	/* Check timestamp */
				5488	if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
				5489	/* No? Slow path! */
				5490	if (!tcp_parse_aligned_timestamp(tp, th))
				5491	goto slow_path;
				5492
				5493	/* If PAWS failed, check it more carefully in slow path */
				5494	if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
				5495	goto slow_path;
				5496
				5497	/* DO NOT update ts_recent here, if checksum fails
				5498	* and timestamp was corrupted part, it will result
				5499	* in a hung connection since we will drop all
				5500	* future packets due to the PAWS test.
				5501	*/
				5502	}
				5503
				5504	if (len <= tcp_header_len) {
				5505	/* Bulk data transfer: sender */
				5506	if (len == tcp_header_len) {
				5507	/* Predicted packet is in window by definition.
				5508	* seq == rcv_nxt and rcv_wup <= rcv_nxt.
				5509	* Hence, check seq<=rcv_wup reduces to:
				5510	*/
				5511	if (tcp_header_len ==
				5512	(sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
				5513	tp->rcv_nxt == tp->rcv_wup)
				5514	tcp_store_ts_recent(tp);
				5515
				5516	/* We know that such packets are checksummed
				5517	* on entry.
				5518	*/
				5519	tcp_ack(sk, skb, 0);
				5520	__kfree_skb(skb);
				5521	tcp_data_snd_check(sk);
				5522	return 0;
				5523	} else { /* Header too small */
				5524	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
				5525	goto discard;
				5526	}
				5527	} else {
				5528	int eaten = 0;
				5529	int copied_early = 0;
				5530
				5531	if (tp->copied_seq == tp->rcv_nxt &&
				5532	len - tcp_header_len <= tp->ucopy.len) {
				5533	#ifdef CONFIG_NET_DMA
				5534	if (tp->ucopy.task == current &&
				5535	sock_owned_by_user(sk) &&
				5536	tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
				5537	copied_early = 1;
				5538	eaten = 1;
				5539	}
				5540	#endif
				5541	if (tp->ucopy.task == current &&
				5542	sock_owned_by_user(sk) && !copied_early) {
				5543	__set_current_state(TASK_RUNNING);
				5544
				5545	if (!tcp_copy_to_iovec(sk, skb, tcp_header_len))
				5546	eaten = 1;
				5547	}
				5548	if (eaten) {
				5549	/* Predicted packet is in window by definition.
				5550	* seq == rcv_nxt and rcv_wup <= rcv_nxt.
				5551	* Hence, check seq<=rcv_wup reduces to:
				5552	*/
				5553	if (tcp_header_len ==
				5554	(sizeof(struct tcphdr) +
				5555	TCPOLEN_TSTAMP_ALIGNED) &&
				5556	tp->rcv_nxt == tp->rcv_wup)
				5557	tcp_store_ts_recent(tp);
				5558
				5559	tcp_rcv_rtt_measure_ts(sk, skb);
				5560
				5561	__skb_pull(skb, tcp_header_len);
				5562	tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
				5563	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
				5564	}
				5565	if (copied_early)
				5566	tcp_cleanup_rbuf(sk, skb->len);
				5567	}
				5568	if (!eaten) {
				5569	if (tcp_checksum_complete_user(sk, skb))
				5570	goto csum_error;
				5571
				5572	if ((int)skb->truesize > sk->sk_forward_alloc)
				5573	goto step5;
				5574
				5575	/* Predicted packet is in window by definition.
				5576	* seq == rcv_nxt and rcv_wup <= rcv_nxt.
				5577	* Hence, check seq<=rcv_wup reduces to:
				5578	*/
				5579	if (tcp_header_len ==
				5580	(sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
				5581	tp->rcv_nxt == tp->rcv_wup)
				5582	tcp_store_ts_recent(tp);
				5583
				5584	tcp_rcv_rtt_measure_ts(sk, skb);
				5585
				5586	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS);
				5587
				5588	/* Bulk data transfer: receiver */
				5589	__skb_pull(skb, tcp_header_len);
				5590	__skb_queue_tail(&sk->sk_receive_queue, skb);
				5591	skb_set_owner_r(skb, sk);
				5592	tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
				5593	}
				5594
				5595	tcp_event_data_recv(sk, skb);
				5596
				5597	if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
				5598	/* Well, only one small jumplet in fast path... */
				5599	tcp_ack(sk, skb, FLAG_DATA);
				5600	tcp_data_snd_check(sk);
				5601	if (!inet_csk_ack_scheduled(sk))
				5602	goto no_ack;
				5603	}
				5604
				5605	if (!copied_early \|\| tp->rcv_nxt != tp->rcv_wup)
				5606	__tcp_ack_snd_check(sk, 0);
				5607	no_ack:
				5608	#ifdef CONFIG_NET_DMA
				5609	if (copied_early)
				5610	__skb_queue_tail(&sk->sk_async_wait_queue, skb);
				5611	else
				5612	#endif
				5613	if (eaten)
				5614	__kfree_skb(skb);
				5615	else
				5616	sk->sk_data_ready(sk, 0);
				5617	return 0;
				5618	}
				5619	}
				5620
				5621	slow_path:
				5622	if (len < (th->doff << 2) \|\| tcp_checksum_complete_user(sk, skb))
				5623	goto csum_error;
				5624
				5625	/*
				5626	* Standard slow path.
				5627	*/
				5628
				5629	if (!tcp_validate_incoming(sk, skb, th, 1))
				5630	return 0;
				5631
				5632	step5:
				5633	if (th->ack &&
				5634	tcp_ack(sk, skb, FLAG_SLOWPATH \| FLAG_UPDATE_TS_RECENT) < 0)
				5635	goto discard;
				5636
				5637	tcp_rcv_rtt_measure_ts(sk, skb);
				5638
				5639	/* Process urgent data. */
				5640	tcp_urg(sk, skb, th);
				5641
				5642	/* step 7: process the segment text */
				5643	tcp_data_queue(sk, skb);
				5644
				5645	tcp_data_snd_check(sk);
				5646	tcp_ack_snd_check(sk);
				5647	return 0;
				5648
				5649	csum_error:
				5650	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
				5651
				5652	discard:
				5653	__kfree_skb(skb);
				5654	return 0;
				5655	}
				5656	EXPORT_SYMBOL(tcp_rcv_established);
				5657
				5658	static int tcp_rcv_synsent_state_process(struct sock sk, struct sk_buff skb,
				5659	const struct tcphdr *th, unsigned int len)
				5660	{
				5661	const u8 *hash_location;
				5662	struct inet_connection_sock *icsk = inet_csk(sk);
				5663	struct tcp_sock *tp = tcp_sk(sk);
				5664	struct tcp_cookie_values *cvp = tp->cookie_values;
				5665	int saved_clamp = tp->rx_opt.mss_clamp;
				5666
				5667	tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0);
				5668
				5669	if (th->ack) {
				5670	/* rfc793:
				5671	* "If the state is SYN-SENT then
				5672	* first check the ACK bit
				5673	* If the ACK bit is set
				5674	* If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
				5675	* a reset (unless the RST bit is set, if so drop
				5676	* the segment and return)"
				5677	*
				5678	* We do not send data with SYN, so that RFC-correct
				5679	* test reduces to:
				5680	*/
				5681	if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt)
				5682	goto reset_and_undo;
				5683
				5684	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
				5685	!between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
				5686	tcp_time_stamp)) {
				5687	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED);
				5688	goto reset_and_undo;
				5689	}
				5690
				5691	/* Now ACK is acceptable.
				5692	*
				5693	* "If the RST bit is set
				5694	* If the ACK was acceptable then signal the user "error:
				5695	* connection reset", drop the segment, enter CLOSED state,
				5696	* delete TCB, and return."
				5697	*/
				5698
				5699	if (th->rst) {
				5700	tcp_reset(sk);
				5701	goto discard;
				5702	}
				5703
				5704	/* rfc793:
				5705	* "fifth, if neither of the SYN or RST bits is set then
				5706	* drop the segment and return."
				5707	*
				5708	* See note below!
				5709	* --ANK(990513)
				5710	*/
				5711	if (!th->syn)
				5712	goto discard_and_undo;
				5713
				5714	/* rfc793:
				5715	* "If the SYN bit is on ...
				5716	* are acceptable then ...
				5717	* (our SYN has been ACKed), change the connection
				5718	* state to ESTABLISHED..."
				5719	*/
				5720
				5721	TCP_ECN_rcv_synack(tp, th);
				5722
				5723	tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
				5724	tcp_ack(sk, skb, FLAG_SLOWPATH);
				5725
				5726	/* Ok.. it's good. Set up sequence numbers and
				5727	* move to established.
				5728	*/
				5729	tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
				5730	tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
				5731
				5732	/* RFC1323: The window in SYN & SYN/ACK segments is
				5733	* never scaled.
				5734	*/
				5735	tp->snd_wnd = ntohs(th->window);
				5736	tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
				5737
				5738	if (!tp->rx_opt.wscale_ok) {
				5739	tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
				5740	tp->window_clamp = min(tp->window_clamp, 65535U);
				5741	}
				5742
				5743	if (tp->rx_opt.saw_tstamp) {
				5744	tp->rx_opt.tstamp_ok = 1;
				5745	tp->tcp_header_len =
				5746	sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
				5747	tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
				5748	tcp_store_ts_recent(tp);
				5749	} else {
				5750	tp->tcp_header_len = sizeof(struct tcphdr);
				5751	}
				5752
				5753	if (tcp_is_sack(tp) && sysctl_tcp_fack)
				5754	tcp_enable_fack(tp);
				5755
				5756	tcp_mtup_init(sk);
				5757	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
				5758	tcp_initialize_rcv_mss(sk);
				5759
				5760	/* Remember, tcp_poll() does not lock socket!
				5761	* Change state from SYN-SENT only after copied_seq
				5762	* is initialized. */
				5763	tp->copied_seq = tp->rcv_nxt;
				5764
				5765	if (cvp != NULL &&
				5766	cvp->cookie_pair_size > 0 &&
				5767	tp->rx_opt.cookie_plus > 0) {
				5768	int cookie_size = tp->rx_opt.cookie_plus
				5769	- TCPOLEN_COOKIE_BASE;
				5770	int cookie_pair_size = cookie_size
				5771	+ cvp->cookie_desired;
				5772
				5773	/* A cookie extension option was sent and returned.
				5774	* Note that each incoming SYNACK replaces the
				5775	* Responder cookie. The initial exchange is most
				5776	* fragile, as protection against spoofing relies
				5777	* entirely upon the sequence and timestamp (above).
				5778	* This replacement strategy allows the correct pair to
				5779	* pass through, while any others will be filtered via
				5780	* Responder verification later.
				5781	*/
				5782	if (sizeof(cvp->cookie_pair) >= cookie_pair_size) {
				5783	memcpy(&cvp->cookie_pair[cvp->cookie_desired],
				5784	hash_location, cookie_size);
				5785	cvp->cookie_pair_size = cookie_pair_size;
				5786	}
				5787	}
				5788
				5789	smp_mb();
				5790	tcp_set_state(sk, TCP_ESTABLISHED);
				5791
				5792	security_inet_conn_established(sk, skb);
				5793
				5794	/* Make sure socket is routed, for correct metrics. */
				5795	icsk->icsk_af_ops->rebuild_header(sk);
				5796
				5797	tcp_init_metrics(sk);
				5798
				5799	tcp_init_congestion_control(sk);
				5800
				5801	/* Prevent spurious tcp_cwnd_restart() on first data
				5802	* packet.
				5803	*/
				5804	tp->lsndtime = tcp_time_stamp;
				5805
				5806	tcp_init_buffer_space(sk);
				5807
				5808	if (sock_flag(sk, SOCK_KEEPOPEN))
				5809	inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
				5810
				5811	if (!tp->rx_opt.snd_wscale)
				5812	__tcp_fast_path_on(tp, tp->snd_wnd);
				5813	else
				5814	tp->pred_flags = 0;
				5815
				5816	if (!sock_flag(sk, SOCK_DEAD)) {
				5817	sk->sk_state_change(sk);
				5818	sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
				5819	}
				5820
				5821	if (sk->sk_write_pending \|\|
				5822	icsk->icsk_accept_queue.rskq_defer_accept \|\|
				5823	icsk->icsk_ack.pingpong) {
				5824	/* Save one ACK. Data will be ready after
				5825	* several ticks, if write_pending is set.
				5826	*
				5827	* It may be deleted, but with this feature tcpdumps
				5828	* look so _wonderfully_ clever, that I was not able
				5829	* to stand against the temptation 8) --ANK
				5830	*/
				5831	inet_csk_schedule_ack(sk);
				5832	icsk->icsk_ack.lrcvtime = tcp_time_stamp;
				5833	icsk->icsk_ack.ato = TCP_ATO_MIN;
				5834	tcp_incr_quickack(sk);
				5835	tcp_enter_quickack_mode(sk);
				5836	inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
				5837	TCP_DELACK_MAX, TCP_RTO_MAX);
				5838
				5839	discard:
				5840	__kfree_skb(skb);
				5841	return 0;
				5842	} else {
				5843	tcp_send_ack(sk);
				5844	}
				5845	return -1;
				5846	}
				5847
				5848	/* No ACK in the segment */
				5849
				5850	if (th->rst) {
				5851	/* rfc793:
				5852	* "If the RST bit is set
				5853	*
				5854	* Otherwise (no ACK) drop the segment and return."
				5855	*/
				5856
				5857	goto discard_and_undo;
				5858	}
				5859
				5860	/* PAWS check. */
				5861	if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp &&
				5862	tcp_paws_reject(&tp->rx_opt, 0))
				5863	goto discard_and_undo;
				5864
				5865	if (th->syn) {
				5866	/* We see SYN without ACK. It is attempt of
				5867	* simultaneous connect with crossed SYNs.
				5868	* Particularly, it can be connect to self.
				5869	*/
				5870	tcp_set_state(sk, TCP_SYN_RECV);
				5871
				5872	if (tp->rx_opt.saw_tstamp) {
				5873	tp->rx_opt.tstamp_ok = 1;
				5874	tcp_store_ts_recent(tp);
				5875	tp->tcp_header_len =
				5876	sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
				5877	} else {
				5878	tp->tcp_header_len = sizeof(struct tcphdr);
				5879	}
				5880
				5881	tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
				5882	tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
				5883
				5884	/* RFC1323: The window in SYN & SYN/ACK segments is
				5885	* never scaled.
				5886	*/
				5887	tp->snd_wnd = ntohs(th->window);
				5888	tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
				5889	tp->max_window = tp->snd_wnd;
				5890
				5891	TCP_ECN_rcv_syn(tp, th);
				5892
				5893	tcp_mtup_init(sk);
				5894	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
				5895	tcp_initialize_rcv_mss(sk);
				5896
				5897	tcp_send_synack(sk);
				5898	#if 0
				5899	/* Note, we could accept data and URG from this segment.
				5900	* There are no obstacles to make this.
				5901	*
				5902	* However, if we ignore data in ACKless segments sometimes,
				5903	* we have no reasons to accept it sometimes.
				5904	* Also, seems the code doing it in step6 of tcp_rcv_state_process
				5905	* is not flawless. So, discard packet for sanity.
				5906	* Uncomment this return to process the data.
				5907	*/
				5908	return -1;
				5909	#else
				5910	goto discard;
				5911	#endif
				5912	}
				5913	/* "fifth, if neither of the SYN or RST bits is set then
				5914	* drop the segment and return."
				5915	*/
				5916
				5917	discard_and_undo:
				5918	tcp_clear_options(&tp->rx_opt);
				5919	tp->rx_opt.mss_clamp = saved_clamp;
				5920	goto discard;
				5921
				5922	reset_and_undo:
				5923	tcp_clear_options(&tp->rx_opt);
				5924	tp->rx_opt.mss_clamp = saved_clamp;
				5925	return 1;
				5926	}
				5927
				5928	/*
				5929	* This function implements the receiving procedure of RFC 793 for
				5930	* all states except ESTABLISHED and TIME_WAIT.
				5931	* It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
				5932	* address independent.
				5933	*/
				5934
				5935	int tcp_rcv_state_process(struct sock sk, struct sk_buff skb,
				5936	const struct tcphdr *th, unsigned int len)
				5937	{
				5938	struct tcp_sock *tp = tcp_sk(sk);
				5939	struct inet_connection_sock *icsk = inet_csk(sk);
				5940	int queued = 0;
				5941
				5942	tp->rx_opt.saw_tstamp = 0;
				5943
				5944	switch (sk->sk_state) {
				5945	case TCP_CLOSE:
				5946	goto discard;
				5947
				5948	case TCP_LISTEN:
				5949	if (th->ack)
				5950	return 1;
				5951
				5952	if (th->rst)
				5953	goto discard;
				5954
				5955	if (th->syn) {
				5956	if (th->fin)
				5957	goto discard;
				5958	if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
				5959	return 1;
				5960
				5961	/* Now we have several options: In theory there is
				5962	* nothing else in the frame. KA9Q has an option to
				5963	* send data with the syn, BSD accepts data with the
				5964	* syn up to the [to be] advertised window and
				5965	* Solaris 2.1 gives you a protocol error. For now
				5966	* we just ignore it, that fits the spec precisely
				5967	* and avoids incompatibilities. It would be nice in
				5968	* future to drop through and process the data.
				5969	*
				5970	* Now that TTCP is starting to be used we ought to
				5971	* queue this data.
				5972	* But, this leaves one open to an easy denial of
				5973	* service attack, and SYN cookies can't defend
				5974	* against this problem. So, we drop the data
				5975	* in the interest of security over speed unless
				5976	* it's still in use.
				5977	*/
				5978	kfree_skb(skb);
				5979	return 0;
				5980	}
				5981	goto discard;
				5982
				5983	case TCP_SYN_SENT:
				5984	queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
				5985	if (queued >= 0)
				5986	return queued;
				5987
				5988	/* Do step6 onward by hand. */
				5989	tcp_urg(sk, skb, th);
				5990	__kfree_skb(skb);
				5991	tcp_data_snd_check(sk);
				5992	return 0;
				5993	}
				5994
				5995	if (!tcp_validate_incoming(sk, skb, th, 0))
				5996	return 0;
				5997
				5998	/* step 5: check the ACK field */
				5999	if (th->ack) {
				6000	int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH \|
				6001	FLAG_UPDATE_TS_RECENT) > 0;
				6002
				6003	switch (sk->sk_state) {
				6004	case TCP_SYN_RECV:
				6005	if (acceptable) {
				6006	tp->copied_seq = tp->rcv_nxt;
				6007	smp_mb();
				6008	tcp_set_state(sk, TCP_ESTABLISHED);
				6009	sk->sk_state_change(sk);
				6010
				6011	/* Note, that this wakeup is only for marginal
				6012	* crossed SYN case. Passively open sockets
				6013	* are not waked up, because sk->sk_sleep ==
				6014	* NULL and sk->sk_socket == NULL.
				6015	*/
				6016	if (sk->sk_socket)
				6017	sk_wake_async(sk,
				6018	SOCK_WAKE_IO, POLL_OUT);
				6019
				6020	tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
				6021	tp->snd_wnd = ntohs(th->window) <<
				6022	tp->rx_opt.snd_wscale;
				6023	tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
				6024
				6025	if (tp->rx_opt.tstamp_ok)
				6026	tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
				6027
				6028	/* Make sure socket is routed, for
				6029	* correct metrics.
				6030	*/
				6031	icsk->icsk_af_ops->rebuild_header(sk);
				6032
				6033	tcp_init_metrics(sk);
				6034
				6035	tcp_init_congestion_control(sk);
				6036
				6037	/* Prevent spurious tcp_cwnd_restart() on
				6038	* first data packet.
				6039	*/
				6040	tp->lsndtime = tcp_time_stamp;
				6041
				6042	tcp_mtup_init(sk);
				6043	tcp_initialize_rcv_mss(sk);
				6044	tcp_init_buffer_space(sk);
				6045	tcp_fast_path_on(tp);
				6046	} else {
				6047	return 1;
				6048	}
				6049	break;
				6050
				6051	case TCP_FIN_WAIT1:
				6052	if (tp->snd_una == tp->write_seq) {
				6053	tcp_set_state(sk, TCP_FIN_WAIT2);
				6054	sk->sk_shutdown \|= SEND_SHUTDOWN;
				6055	dst_confirm(__sk_dst_get(sk));
				6056
				6057	if (!sock_flag(sk, SOCK_DEAD))
				6058	/* Wake up lingering close() */
				6059	sk->sk_state_change(sk);
				6060	else {
				6061	int tmo;
				6062
				6063	if (tp->linger2 < 0 \|\|
				6064	(TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
				6065	after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
				6066	tcp_done(sk);
				6067	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
				6068	return 1;
				6069	}
				6070
				6071	tmo = tcp_fin_time(sk);
				6072	if (tmo > TCP_TIMEWAIT_LEN) {
				6073	inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
				6074	} else if (th->fin \|\| sock_owned_by_user(sk)) {
				6075	/* Bad case. We could lose such FIN otherwise.
				6076	* It is not a big problem, but it looks confusing
				6077	* and not so rare event. We still can lose it now,
				6078	* if it spins in bh_lock_sock(), but it is really
				6079	* marginal case.
				6080	*/
				6081	inet_csk_reset_keepalive_timer(sk, tmo);
				6082	} else {
				6083	tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
				6084	goto discard;
				6085	}
				6086	}
				6087	}
				6088	break;
				6089
				6090	case TCP_CLOSING:
				6091	if (tp->snd_una == tp->write_seq) {
				6092	tcp_time_wait(sk, TCP_TIME_WAIT, 0);
				6093	goto discard;
				6094	}
				6095	break;
				6096
				6097	case TCP_LAST_ACK:
				6098	if (tp->snd_una == tp->write_seq) {
				6099	tcp_update_metrics(sk);
				6100	tcp_done(sk);
				6101	goto discard;
				6102	}
				6103	break;
				6104	}
				6105	} else
				6106	goto discard;
				6107
				6108	/* step 6: check the URG bit */
				6109	tcp_urg(sk, skb, th);
				6110
				6111	/* step 7: process the segment text */
				6112	switch (sk->sk_state) {
				6113	case TCP_CLOSE_WAIT:
				6114	case TCP_CLOSING:
				6115	case TCP_LAST_ACK:
				6116	if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
				6117	break;
				6118	case TCP_FIN_WAIT1:
				6119	case TCP_FIN_WAIT2:
				6120	/* RFC 793 says to queue data in these states,
				6121	* RFC 1122 says we MUST send a reset.
				6122	* BSD 4.4 also does reset.
				6123	*/
				6124	if (sk->sk_shutdown & RCV_SHUTDOWN) {
				6125	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
				6126	after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
				6127	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
				6128	tcp_reset(sk);
				6129	return 1;
				6130	}
				6131	}
				6132	/* Fall through */
				6133	case TCP_ESTABLISHED:
				6134	tcp_data_queue(sk, skb);
				6135	queued = 1;
				6136	break;
				6137	}
				6138
				6139	/* tcp_data could move socket to TIME-WAIT */
				6140	if (sk->sk_state != TCP_CLOSE) {
				6141	tcp_data_snd_check(sk);
				6142	tcp_ack_snd_check(sk);
				6143	}
				6144
				6145	if (!queued) {
				6146	discard:
				6147	__kfree_skb(skb);
				6148	}
				6149	return 0;
				6150	}
				6151	EXPORT_SYMBOL(tcp_rcv_state_process);