Blame - src/kernel/linux/v4.19/net/ipv4/tcp_output.c - T800

blob: 1cc20edf476238c5d7af5cc44611360a98d946ed [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	/*
				2	* INET An implementation of the TCP/IP protocol suite for the LINUX
				3	* operating system. INET is implemented using the BSD Socket
				4	* interface as the means of communication with the user level.
				5	*
				6	* Implementation of the Transmission Control Protocol(TCP).
				7	*
				8	* Authors: Ross Biro
				9	* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
				10	* Mark Evans, <evansmp@uhura.aston.ac.uk>
				11	* Corey Minyard <wf-rch!minyard@relay.EU.net>
				12	* Florian La Roche, <flla@stud.uni-sb.de>
				13	* Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
				14	* Linus Torvalds, <torvalds@cs.helsinki.fi>
				15	* Alan Cox, <gw4pts@gw4pts.ampr.org>
				16	* Matthew Dillon, <dillon@apollo.west.oic.com>
				17	* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
				18	* Jorge Cwik, <jorge@laser.satlink.net>
				19	*/
				20
				21	/*
				22	* Changes: Pedro Roque : Retransmit queue handled by TCP.
				23	* : Fragmentation on mtu decrease
				24	* : Segment collapse on retransmit
				25	* : AF independence
				26	*
				27	* Linus Torvalds : send_delayed_ack
				28	* David S. Miller : Charge memory using the right skb
				29	* during syn/ack processing.
				30	* David S. Miller : Output engine completely rewritten.
				31	* Andrea Arcangeli: SYNACK carry ts_recent in tsecr.
				32	* Cacophonix Gaul : draft-minshall-nagle-01
				33	* J Hadi Salim : ECN support
				34	*
				35	*/
				36
				37	#define pr_fmt(fmt) "TCP: " fmt
				38
				39	#include <net/tcp.h>
				40
				41	#include <linux/compiler.h>
				42	#include <linux/gfp.h>
				43	#include <linux/module.h>
				44	#include <linux/static_key.h>
				45
				46	#include <trace/events/tcp.h>
				47
				48	static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
				49	int push_one, gfp_t gfp);
				50
				51	/* Account for new data that has been sent to the network. */
				52	static void tcp_event_new_data_sent(struct sock sk, struct sk_buff skb)
				53	{
				54	struct inet_connection_sock *icsk = inet_csk(sk);
				55	struct tcp_sock *tp = tcp_sk(sk);
				56	unsigned int prior_packets = tp->packets_out;
				57
				58	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
				59
				60	__skb_unlink(skb, &sk->sk_write_queue);
				61	tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);
				62
				63	if (tp->highest_sack == NULL)
				64	tp->highest_sack = skb;
				65
				66	tp->packets_out += tcp_skb_pcount(skb);
				67	if (!prior_packets \|\| icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
				68	tcp_rearm_rto(sk);
				69
				70	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
				71	tcp_skb_pcount(skb));
				72	}
				73
				74	/* SND.NXT, if window was not shrunk or the amount of shrunk was less than one
				75	* window scaling factor due to loss of precision.
				76	* If window has been shrunk, what should we make? It is not clear at all.
				77	* Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
				78	* Anything in between SND.UNA...SND.UNA+SND.WND also can be already
				79	* invalid. OK, let's make this for now:
				80	*/
				81	static inline __u32 tcp_acceptable_seq(const struct sock *sk)
				82	{
				83	const struct tcp_sock *tp = tcp_sk(sk);
				84
				85	if (!before(tcp_wnd_end(tp), tp->snd_nxt) \|\|
				86	(tp->rx_opt.wscale_ok &&
				87	((tp->snd_nxt - tcp_wnd_end(tp)) < (1 << tp->rx_opt.rcv_wscale))))
				88	return tp->snd_nxt;
				89	else
				90	return tcp_wnd_end(tp);
				91	}
				92
				93	/* Calculate mss to advertise in SYN segment.
				94	* RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
				95	*
				96	* 1. It is independent of path mtu.
				97	* 2. Ideally, it is maximal possible segment size i.e. 65535-40.
				98	* 3. For IPv4 it is reasonable to calculate it from maximal MTU of
				99	* attached devices, because some buggy hosts are confused by
				100	* large MSS.
				101	* 4. We do not make 3, we advertise MSS, calculated from first
				102	* hop device mtu, but allow to raise it to ip_rt_min_advmss.
				103	* This may be overridden via information stored in routing table.
				104	* 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
				105	* probably even Jumbo".
				106	*/
				107	static __u16 tcp_advertise_mss(struct sock *sk)
				108	{
				109	struct tcp_sock *tp = tcp_sk(sk);
				110	const struct dst_entry *dst = __sk_dst_get(sk);
				111	int mss = tp->advmss;
				112
				113	if (dst) {
				114	unsigned int metric = dst_metric_advmss(dst);
				115
				116	if (metric < mss) {
				117	mss = metric;
				118	tp->advmss = mss;
				119	}
				120	}
				121
				122	return (__u16)mss;
				123	}
				124
				125	/* RFC2861. Reset CWND after idle period longer RTO to "restart window".
				126	* This is the first part of cwnd validation mechanism.
				127	*/
				128	void tcp_cwnd_restart(struct sock *sk, s32 delta)
				129	{
				130	struct tcp_sock *tp = tcp_sk(sk);
				131	u32 restart_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
				132	u32 cwnd = tp->snd_cwnd;
				133
				134	tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
				135
				136	tp->snd_ssthresh = tcp_current_ssthresh(sk);
				137	restart_cwnd = min(restart_cwnd, cwnd);
				138
				139	while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
				140	cwnd >>= 1;
				141	tp->snd_cwnd = max(cwnd, restart_cwnd);
				142	tp->snd_cwnd_stamp = tcp_jiffies32;
				143	tp->snd_cwnd_used = 0;
				144	}
				145
				146	/* Congestion state accounting after a packet has been sent. */
				147	static void tcp_event_data_sent(struct tcp_sock *tp,
				148	struct sock *sk)
				149	{
				150	struct inet_connection_sock *icsk = inet_csk(sk);
				151	const u32 now = tcp_jiffies32;
				152
				153	if (tcp_packets_in_flight(tp) == 0)
				154	tcp_ca_event(sk, CA_EVENT_TX_START);
				155
				156	tp->lsndtime = now;
				157
				158	/* If it is a reply for ato after last received
				159	* packet, enter pingpong mode.
				160	*/
				161	if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
				162	icsk->icsk_ack.pingpong = 1;
				163	}
				164
				165	/* Account for an ACK we sent. */
				166	static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
				167	u32 rcv_nxt)
				168	{
				169	struct tcp_sock *tp = tcp_sk(sk);
				170
				171	if (unlikely(tp->compressed_ack > TCP_FASTRETRANS_THRESH)) {
				172	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
				173	tp->compressed_ack - TCP_FASTRETRANS_THRESH);
				174	tp->compressed_ack = TCP_FASTRETRANS_THRESH;
				175	if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
				176	__sock_put(sk);
				177	}
				178
				179	if (unlikely(rcv_nxt != tp->rcv_nxt))
				180	return; /* Special ACK sent by DCTCP to reflect ECN */
				181	tcp_dec_quickack_mode(sk, pkts);
				182	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
				183	}
				184
				185	/* Determine a window scaling and initial window to offer.
				186	* Based on the assumption that the given amount of space
				187	* will be offered. Store the results in the tp structure.
				188	* NOTE: for smooth operation initial space offering should
				189	* be a multiple of mss if possible. We assume here that mss >= 1.
				190	* This MUST be enforced by all callers.
				191	*/
				192	void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
				193	__u32 rcv_wnd, __u32 window_clamp,
				194	int wscale_ok, __u8 *rcv_wscale,
				195	__u32 init_rcv_wnd)
				196	{
				197	unsigned int space = (__space < 0 ? 0 : __space);
				198
				199	/* If no clamp set the clamp to the max possible scaled window */
				200	if (*window_clamp == 0)
				201	(*window_clamp) = (U16_MAX << TCP_MAX_WSCALE);
				202	space = min(*window_clamp, space);
				203
				204	/* Quantize space offering to a multiple of mss if possible. */
				205	if (space > mss)
				206	space = rounddown(space, mss);
				207
				208	/* NOTE: offering an initial window larger than 32767
				209	* will break some buggy TCP stacks. If the admin tells us
				210	* it is likely we could be speaking with such a buggy stack
				211	* we will truncate our initial window offering to 32K-1
				212	* unless the remote has sent us a window scaling option,
				213	* which we interpret as a sign the remote TCP is not
				214	* misinterpreting the window field as a signed quantity.
				215	*/
				216	if (sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
				217	(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
				218	else
				219	(*rcv_wnd) = min_t(u32, space, U16_MAX);
				220
				221	if (init_rcv_wnd)
				222	rcv_wnd = min(rcv_wnd, init_rcv_wnd * mss);
				223
				224	(*rcv_wscale) = 0;
				225	if (wscale_ok) {
				226	/* Set window scaling on max possible window */
				227	space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
				228	space = max_t(u32, space, sysctl_rmem_max);
				229	space = min_t(u32, space, *window_clamp);
				230	while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) {
				231	space >>= 1;
				232	(*rcv_wscale)++;
				233	}
				234	}
				235	/* Set the clamp no higher than max representable value */
				236	(window_clamp) = min_t(__u32, U16_MAX << (rcv_wscale), *window_clamp);
				237	}
				238	EXPORT_SYMBOL(tcp_select_initial_window);
				239
				240	/* Chose a new window to advertise, update state in tcp_sock for the
				241	* socket, and return result with RFC1323 scaling applied. The return
				242	* value can be stuffed directly into th->window for an outgoing
				243	* frame.
				244	*/
				245	static u16 tcp_select_window(struct sock *sk)
				246	{
				247	struct tcp_sock *tp = tcp_sk(sk);
				248	u32 old_win = tp->rcv_wnd;
				249	u32 cur_win = tcp_receive_window(tp);
				250	u32 new_win = __tcp_select_window(sk);
				251
				252	/* Never shrink the offered window */
				253	if (new_win < cur_win) {
				254	/* Danger Will Robinson!
				255	* Don't update rcv_wup/rcv_wnd here or else
				256	* we will not be able to advertise a zero
				257	* window in time. --DaveM
				258	*
				259	* Relax Will Robinson.
				260	*/
				261	if (new_win == 0)
				262	NET_INC_STATS(sock_net(sk),
				263	LINUX_MIB_TCPWANTZEROWINDOWADV);
				264	new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
				265	}
				266	tp->rcv_wnd = new_win;
				267	tp->rcv_wup = tp->rcv_nxt;
				268
				269	/* Make sure we do not exceed the maximum possible
				270	* scaled window.
				271	*/
				272	if (!tp->rx_opt.rcv_wscale &&
				273	sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
				274	new_win = min(new_win, MAX_TCP_WINDOW);
				275	else
				276	new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
				277
				278	/* RFC1323 scaling applied */
				279	new_win >>= tp->rx_opt.rcv_wscale;
				280
				281	/* If we advertise zero window, disable fast path. */
				282	if (new_win == 0) {
				283	tp->pred_flags = 0;
				284	if (old_win)
				285	NET_INC_STATS(sock_net(sk),
				286	LINUX_MIB_TCPTOZEROWINDOWADV);
				287	} else if (old_win == 0) {
				288	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV);
				289	}
				290
				291	return new_win;
				292	}
				293
				294	/* Packet ECN state for a SYN-ACK */
				295	static void tcp_ecn_send_synack(struct sock sk, struct sk_buff skb)
				296	{
				297	const struct tcp_sock *tp = tcp_sk(sk);
				298
				299	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
				300	if (!(tp->ecn_flags & TCP_ECN_OK))
				301	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
				302	else if (tcp_ca_needs_ecn(sk) \|\|
				303	tcp_bpf_ca_needs_ecn(sk))
				304	INET_ECN_xmit(sk);
				305	}
				306
				307	/* Packet ECN state for a SYN. */
				308	static void tcp_ecn_send_syn(struct sock sk, struct sk_buff skb)
				309	{
				310	struct tcp_sock *tp = tcp_sk(sk);
				311	bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
				312	bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 \|\|
				313	tcp_ca_needs_ecn(sk) \|\| bpf_needs_ecn;
				314
				315	if (!use_ecn) {
				316	const struct dst_entry *dst = __sk_dst_get(sk);
				317
				318	if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
				319	use_ecn = true;
				320	}
				321
				322	tp->ecn_flags = 0;
				323
				324	if (use_ecn) {
				325	TCP_SKB_CB(skb)->tcp_flags \|= TCPHDR_ECE \| TCPHDR_CWR;
				326	tp->ecn_flags = TCP_ECN_OK;
				327	if (tcp_ca_needs_ecn(sk) \|\| bpf_needs_ecn)
				328	INET_ECN_xmit(sk);
				329	}
				330	}
				331
				332	static void tcp_ecn_clear_syn(struct sock sk, struct sk_buff skb)
				333	{
				334	if (sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)
				335	/* tp->ecn_flags are cleared at a later point in time when
				336	* SYN ACK is ultimatively being received.
				337	*/
				338	TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE \| TCPHDR_CWR);
				339	}
				340
				341	static void
				342	tcp_ecn_make_synack(const struct request_sock req, struct tcphdr th)
				343	{
				344	if (inet_rsk(req)->ecn_ok)
				345	th->ece = 1;
				346	}
				347
				348	/* Set up ECN state for a packet on a ESTABLISHED socket that is about to
				349	* be sent.
				350	*/
				351	static void tcp_ecn_send(struct sock sk, struct sk_buff skb,
				352	struct tcphdr *th, int tcp_header_len)
				353	{
				354	struct tcp_sock *tp = tcp_sk(sk);
				355
				356	if (tp->ecn_flags & TCP_ECN_OK) {
				357	/* Not-retransmitted data segment: set ECT and inject CWR. */
				358	if (skb->len != tcp_header_len &&
				359	!before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
				360	INET_ECN_xmit(sk);
				361	if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
				362	tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
				363	th->cwr = 1;
				364	skb_shinfo(skb)->gso_type \|= SKB_GSO_TCP_ECN;
				365	}
				366	} else if (!tcp_ca_needs_ecn(sk)) {
				367	/* ACK or retransmitted segment: clear ECT\|CE */
				368	INET_ECN_dontxmit(sk);
				369	}
				370	if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
				371	th->ece = 1;
				372	}
				373	}
				374
				375	/* Constructs common control bits of non-data skb. If SYN/FIN is present,
				376	* auto increment end seqno.
				377	*/
				378	static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
				379	{
				380	skb->ip_summed = CHECKSUM_PARTIAL;
				381
				382	TCP_SKB_CB(skb)->tcp_flags = flags;
				383	TCP_SKB_CB(skb)->sacked = 0;
				384
				385	tcp_skb_pcount_set(skb, 1);
				386
				387	TCP_SKB_CB(skb)->seq = seq;
				388	if (flags & (TCPHDR_SYN \| TCPHDR_FIN))
				389	seq++;
				390	TCP_SKB_CB(skb)->end_seq = seq;
				391	}
				392
				393	static inline bool tcp_urg_mode(const struct tcp_sock *tp)
				394	{
				395	return tp->snd_una != tp->snd_up;
				396	}
				397
				398	#define OPTION_SACK_ADVERTISE (1 << 0)
				399	#define OPTION_TS (1 << 1)
				400	#define OPTION_MD5 (1 << 2)
				401	#define OPTION_WSCALE (1 << 3)
				402	#define OPTION_FAST_OPEN_COOKIE (1 << 8)
				403	#define OPTION_SMC (1 << 9)
				404
				405	static void smc_options_write(__be32 ptr, u16 options)
				406	{
				407	#if IS_ENABLED(CONFIG_SMC)
				408	if (static_branch_unlikely(&tcp_have_smc)) {
				409	if (unlikely(OPTION_SMC & *options)) {
				410	*ptr++ = htonl((TCPOPT_NOP << 24) \|
				411	(TCPOPT_NOP << 16) \|
				412	(TCPOPT_EXP << 8) \|
				413	(TCPOLEN_EXP_SMC_BASE));
				414	*ptr++ = htonl(TCPOPT_SMC_MAGIC);
				415	}
				416	}
				417	#endif
				418	}
				419
				420	struct tcp_out_options {
				421	u16 options; /* bit field of OPTION_* */
				422	u16 mss; /* 0 to disable */
				423	u8 ws; /* window scale, 0 to disable */
				424	u8 num_sack_blocks; /* number of SACK blocks to include */
				425	u8 hash_size; /* bytes in hash_location */
				426	__u8 hash_location; / temporary pointer, overloaded */
				427	__u32 tsval, tsecr; /* need to include OPTION_TS */
				428	struct tcp_fastopen_cookie fastopen_cookie; / Fast open cookie */
				429	};
				430
				431	/* Write previously computed TCP options to the packet.
				432	*
				433	* Beware: Something in the Internet is very sensitive to the ordering of
				434	* TCP options, we learned this through the hard way, so be careful here.
				435	* Luckily we can at least blame others for their non-compliance but from
				436	* inter-operability perspective it seems that we're somewhat stuck with
				437	* the ordering which we have been using if we want to keep working with
				438	* those broken things (not that it currently hurts anybody as there isn't
				439	* particular reason why the ordering would need to be changed).
				440	*
				441	* At least SACK_PERM as the first option is known to lead to a disaster
				442	* (but it may well be that other scenarios fail similarly).
				443	*/
				444	static void tcp_options_write(__be32 ptr, struct tcp_sock tp,
				445	struct tcp_out_options *opts)
				446	{
				447	u16 options = opts->options; /* mungable copy */
				448
				449	if (unlikely(OPTION_MD5 & options)) {
				450	*ptr++ = htonl((TCPOPT_NOP << 24) \| (TCPOPT_NOP << 16) \|
				451	(TCPOPT_MD5SIG << 8) \| TCPOLEN_MD5SIG);
				452	/* overload cookie hash location */
				453	opts->hash_location = (__u8 *)ptr;
				454	ptr += 4;
				455	}
				456
				457	if (unlikely(opts->mss)) {
				458	*ptr++ = htonl((TCPOPT_MSS << 24) \|
				459	(TCPOLEN_MSS << 16) \|
				460	opts->mss);
				461	}
				462
				463	if (likely(OPTION_TS & options)) {
				464	if (unlikely(OPTION_SACK_ADVERTISE & options)) {
				465	*ptr++ = htonl((TCPOPT_SACK_PERM << 24) \|
				466	(TCPOLEN_SACK_PERM << 16) \|
				467	(TCPOPT_TIMESTAMP << 8) \|
				468	TCPOLEN_TIMESTAMP);
				469	options &= ~OPTION_SACK_ADVERTISE;
				470	} else {
				471	*ptr++ = htonl((TCPOPT_NOP << 24) \|
				472	(TCPOPT_NOP << 16) \|
				473	(TCPOPT_TIMESTAMP << 8) \|
				474	TCPOLEN_TIMESTAMP);
				475	}
				476	*ptr++ = htonl(opts->tsval);
				477	*ptr++ = htonl(opts->tsecr);
				478	}
				479
				480	if (unlikely(OPTION_SACK_ADVERTISE & options)) {
				481	*ptr++ = htonl((TCPOPT_NOP << 24) \|
				482	(TCPOPT_NOP << 16) \|
				483	(TCPOPT_SACK_PERM << 8) \|
				484	TCPOLEN_SACK_PERM);
				485	}
				486
				487	if (unlikely(OPTION_WSCALE & options)) {
				488	*ptr++ = htonl((TCPOPT_NOP << 24) \|
				489	(TCPOPT_WINDOW << 16) \|
				490	(TCPOLEN_WINDOW << 8) \|
				491	opts->ws);
				492	}
				493
				494	if (unlikely(opts->num_sack_blocks)) {
				495	struct tcp_sack_block *sp = tp->rx_opt.dsack ?
				496	tp->duplicate_sack : tp->selective_acks;
				497	int this_sack;
				498
				499	*ptr++ = htonl((TCPOPT_NOP << 24) \|
				500	(TCPOPT_NOP << 16) \|
				501	(TCPOPT_SACK << 8) \|
				502	(TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
				503	TCPOLEN_SACK_PERBLOCK)));
				504
				505	for (this_sack = 0; this_sack < opts->num_sack_blocks;
				506	++this_sack) {
				507	*ptr++ = htonl(sp[this_sack].start_seq);
				508	*ptr++ = htonl(sp[this_sack].end_seq);
				509	}
				510
				511	tp->rx_opt.dsack = 0;
				512	}
				513
				514	if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
				515	struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
				516	u8 p = (u8 )ptr;
				517	u32 len; /* Fast Open option length */
				518
				519	if (foc->exp) {
				520	len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
				521	*ptr = htonl((TCPOPT_EXP << 24) \| (len << 16) \|
				522	TCPOPT_FASTOPEN_MAGIC);
				523	p += TCPOLEN_EXP_FASTOPEN_BASE;
				524	} else {
				525	len = TCPOLEN_FASTOPEN_BASE + foc->len;
				526	*p++ = TCPOPT_FASTOPEN;
				527	*p++ = len;
				528	}
				529
				530	memcpy(p, foc->val, foc->len);
				531	if ((len & 3) == 2) {
				532	p[foc->len] = TCPOPT_NOP;
				533	p[foc->len + 1] = TCPOPT_NOP;
				534	}
				535	ptr += (len + 3) >> 2;
				536	}
				537
				538	smc_options_write(ptr, &options);
				539	}
				540
				541	static void smc_set_option(const struct tcp_sock *tp,
				542	struct tcp_out_options *opts,
				543	unsigned int *remaining)
				544	{
				545	#if IS_ENABLED(CONFIG_SMC)
				546	if (static_branch_unlikely(&tcp_have_smc)) {
				547	if (tp->syn_smc) {
				548	if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
				549	opts->options \|= OPTION_SMC;
				550	*remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
				551	}
				552	}
				553	}
				554	#endif
				555	}
				556
				557	static void smc_set_option_cond(const struct tcp_sock *tp,
				558	const struct inet_request_sock *ireq,
				559	struct tcp_out_options *opts,
				560	unsigned int *remaining)
				561	{
				562	#if IS_ENABLED(CONFIG_SMC)
				563	if (static_branch_unlikely(&tcp_have_smc)) {
				564	if (tp->syn_smc && ireq->smc_ok) {
				565	if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
				566	opts->options \|= OPTION_SMC;
				567	*remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
				568	}
				569	}
				570	}
				571	#endif
				572	}
				573
				574	/* Compute TCP options for SYN packets. This is not the final
				575	* network wire format yet.
				576	*/
				577	static unsigned int tcp_syn_options(struct sock sk, struct sk_buff skb,
				578	struct tcp_out_options *opts,
				579	struct tcp_md5sig_key **md5)
				580	{
				581	struct tcp_sock *tp = tcp_sk(sk);
				582	unsigned int remaining = MAX_TCP_OPTION_SPACE;
				583	struct tcp_fastopen_request *fastopen = tp->fastopen_req;
				584
				585	*md5 = NULL;
				586	#ifdef CONFIG_TCP_MD5SIG
				587	if (unlikely(rcu_access_pointer(tp->md5sig_info))) {
				588	*md5 = tp->af_specific->md5_lookup(sk, sk);
				589	if (*md5) {
				590	opts->options \|= OPTION_MD5;
				591	remaining -= TCPOLEN_MD5SIG_ALIGNED;
				592	}
				593	}
				594	#endif
				595
				596	/* We always get an MSS option. The option bytes which will be seen in
				597	* normal data packets should timestamps be used, must be in the MSS
				598	* advertised. But we subtract them from tp->mss_cache so that
				599	* calculations in tcp_sendmsg are simpler etc. So account for this
				600	* fact here if necessary. If we don't do this correctly, as a
				601	* receiver we won't recognize data packets as being full sized when we
				602	* should, and thus we won't abide by the delayed ACK rules correctly.
				603	* SACKs don't matter, we never delay an ACK when we have any of those
				604	* going out. */
				605	opts->mss = tcp_advertise_mss(sk);
				606	remaining -= TCPOLEN_MSS_ALIGNED;
				607
				608	if (likely(sock_net(sk)->ipv4.sysctl_tcp_timestamps && !*md5)) {
				609	opts->options \|= OPTION_TS;
				610	opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
				611	opts->tsecr = tp->rx_opt.ts_recent;
				612	remaining -= TCPOLEN_TSTAMP_ALIGNED;
				613	}
				614	if (likely(sock_net(sk)->ipv4.sysctl_tcp_window_scaling)) {
				615	opts->ws = tp->rx_opt.rcv_wscale;
				616	opts->options \|= OPTION_WSCALE;
				617	remaining -= TCPOLEN_WSCALE_ALIGNED;
				618	}
				619	if (likely(sock_net(sk)->ipv4.sysctl_tcp_sack)) {
				620	opts->options \|= OPTION_SACK_ADVERTISE;
				621	if (unlikely(!(OPTION_TS & opts->options)))
				622	remaining -= TCPOLEN_SACKPERM_ALIGNED;
				623	}
				624
				625	if (fastopen && fastopen->cookie.len >= 0) {
				626	u32 need = fastopen->cookie.len;
				627
				628	need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE :
				629	TCPOLEN_FASTOPEN_BASE;
				630	need = (need + 3) & ~3U; /* Align to 32 bits */
				631	if (remaining >= need) {
				632	opts->options \|= OPTION_FAST_OPEN_COOKIE;
				633	opts->fastopen_cookie = &fastopen->cookie;
				634	remaining -= need;
				635	tp->syn_fastopen = 1;
				636	tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0;
				637	}
				638	}
				639
				640	smc_set_option(tp, opts, &remaining);
				641
				642	return MAX_TCP_OPTION_SPACE - remaining;
				643	}
				644
				645	/* Set up TCP options for SYN-ACKs. */
				646	static unsigned int tcp_synack_options(const struct sock *sk,
				647	struct request_sock *req,
				648	unsigned int mss, struct sk_buff *skb,
				649	struct tcp_out_options *opts,
				650	const struct tcp_md5sig_key *md5,
				651	struct tcp_fastopen_cookie *foc)
				652	{
				653	struct inet_request_sock *ireq = inet_rsk(req);
				654	unsigned int remaining = MAX_TCP_OPTION_SPACE;
				655
				656	#ifdef CONFIG_TCP_MD5SIG
				657	if (md5) {
				658	opts->options \|= OPTION_MD5;
				659	remaining -= TCPOLEN_MD5SIG_ALIGNED;
				660
				661	/* We can't fit any SACK blocks in a packet with MD5 + TS
				662	* options. There was discussion about disabling SACK
				663	* rather than TS in order to fit in better with old,
				664	* buggy kernels, but that was deemed to be unnecessary.
				665	*/
				666	ireq->tstamp_ok &= !ireq->sack_ok;
				667	}
				668	#endif
				669
				670	/* We always send an MSS option. */
				671	opts->mss = mss;
				672	remaining -= TCPOLEN_MSS_ALIGNED;
				673
				674	if (likely(ireq->wscale_ok)) {
				675	opts->ws = ireq->rcv_wscale;
				676	opts->options \|= OPTION_WSCALE;
				677	remaining -= TCPOLEN_WSCALE_ALIGNED;
				678	}
				679	if (likely(ireq->tstamp_ok)) {
				680	opts->options \|= OPTION_TS;
				681	opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off;
				682	opts->tsecr = req->ts_recent;
				683	remaining -= TCPOLEN_TSTAMP_ALIGNED;
				684	}
				685	if (likely(ireq->sack_ok)) {
				686	opts->options \|= OPTION_SACK_ADVERTISE;
				687	if (unlikely(!ireq->tstamp_ok))
				688	remaining -= TCPOLEN_SACKPERM_ALIGNED;
				689	}
				690	if (foc != NULL && foc->len >= 0) {
				691	u32 need = foc->len;
				692
				693	need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE :
				694	TCPOLEN_FASTOPEN_BASE;
				695	need = (need + 3) & ~3U; /* Align to 32 bits */
				696	if (remaining >= need) {
				697	opts->options \|= OPTION_FAST_OPEN_COOKIE;
				698	opts->fastopen_cookie = foc;
				699	remaining -= need;
				700	}
				701	}
				702
				703	smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
				704
				705	return MAX_TCP_OPTION_SPACE - remaining;
				706	}
				707
				708	/* Compute TCP options for ESTABLISHED sockets. This is not the
				709	* final wire format yet.
				710	*/
				711	static unsigned int tcp_established_options(struct sock sk, struct sk_buff skb,
				712	struct tcp_out_options *opts,
				713	struct tcp_md5sig_key **md5)
				714	{
				715	struct tcp_sock *tp = tcp_sk(sk);
				716	unsigned int size = 0;
				717	unsigned int eff_sacks;
				718
				719	opts->options = 0;
				720
				721	*md5 = NULL;
				722	#ifdef CONFIG_TCP_MD5SIG
				723	if (unlikely(rcu_access_pointer(tp->md5sig_info))) {
				724	*md5 = tp->af_specific->md5_lookup(sk, sk);
				725	if (*md5) {
				726	opts->options \|= OPTION_MD5;
				727	size += TCPOLEN_MD5SIG_ALIGNED;
				728	}
				729	}
				730	#endif
				731
				732	if (likely(tp->rx_opt.tstamp_ok)) {
				733	opts->options \|= OPTION_TS;
				734	opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0;
				735	opts->tsecr = tp->rx_opt.ts_recent;
				736	size += TCPOLEN_TSTAMP_ALIGNED;
				737	}
				738
				739	eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
				740	if (unlikely(eff_sacks)) {
				741	const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
				742	opts->num_sack_blocks =
				743	min_t(unsigned int, eff_sacks,
				744	(remaining - TCPOLEN_SACK_BASE_ALIGNED) /
				745	TCPOLEN_SACK_PERBLOCK);
				746	if (likely(opts->num_sack_blocks))
				747	size += TCPOLEN_SACK_BASE_ALIGNED +
				748	opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
				749	}
				750
				751	return size;
				752	}
				753
				754
				755	/* TCP SMALL QUEUES (TSQ)
				756	*
				757	* TSQ goal is to keep small amount of skbs per tcp flow in tx queues (qdisc+dev)
				758	* to reduce RTT and bufferbloat.
				759	* We do this using a special skb destructor (tcp_wfree).
				760	*
				761	* Its important tcp_wfree() can be replaced by sock_wfree() in the event skb
				762	* needs to be reallocated in a driver.
				763	* The invariant being skb->truesize subtracted from sk->sk_wmem_alloc
				764	*
				765	* Since transmit from skb destructor is forbidden, we use a tasklet
				766	* to process all sockets that eventually need to send more skbs.
				767	* We use one tasklet per cpu, with its own queue of sockets.
				768	*/
				769	struct tsq_tasklet {
				770	struct tasklet_struct tasklet;
				771	struct list_head head; /* queue of tcp sockets */
				772	};
				773	static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
				774
				775	static void tcp_tsq_write(struct sock *sk)
				776	{
				777	if ((1 << sk->sk_state) &
				778	(TCPF_ESTABLISHED \| TCPF_FIN_WAIT1 \| TCPF_CLOSING \|
				779	TCPF_CLOSE_WAIT \| TCPF_LAST_ACK)) {
				780	struct tcp_sock *tp = tcp_sk(sk);
				781
				782	if (tp->lost_out > tp->retrans_out &&
				783	tp->snd_cwnd > tcp_packets_in_flight(tp)) {
				784	tcp_mstamp_refresh(tp);
				785	tcp_xmit_retransmit_queue(sk);
				786	}
				787
				788	tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle,
				789	0, GFP_ATOMIC);
				790	}
				791	}
				792
				793	static void tcp_tsq_handler(struct sock *sk)
				794	{
				795	bh_lock_sock(sk);
				796	if (!sock_owned_by_user(sk))
				797	tcp_tsq_write(sk);
				798	else if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
				799	sock_hold(sk);
				800	bh_unlock_sock(sk);
				801	}
				802	/*
				803	* One tasklet per cpu tries to send more skbs.
				804	* We run in tasklet context but need to disable irqs when
				805	* transferring tsq->head because tcp_wfree() might
				806	* interrupt us (non NAPI drivers)
				807	*/
				808	static void tcp_tasklet_func(unsigned long data)
				809	{
				810	struct tsq_tasklet tsq = (struct tsq_tasklet )data;
				811	LIST_HEAD(list);
				812	unsigned long flags;
				813	struct list_head q, n;
				814	struct tcp_sock *tp;
				815	struct sock *sk;
				816
				817	local_irq_save(flags);
				818	list_splice_init(&tsq->head, &list);
				819	local_irq_restore(flags);
				820
				821	list_for_each_safe(q, n, &list) {
				822	tp = list_entry(q, struct tcp_sock, tsq_node);
				823	list_del(&tp->tsq_node);
				824
				825	sk = (struct sock *)tp;
				826	smp_mb__before_atomic();
				827	clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
				828
				829	tcp_tsq_handler(sk);
				830	sk_free(sk);
				831	}
				832	}
				833
				834	#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED \| \
				835	TCPF_WRITE_TIMER_DEFERRED \| \
				836	TCPF_DELACK_TIMER_DEFERRED \| \
				837	TCPF_MTU_REDUCED_DEFERRED)
				838	/**
				839	* tcp_release_cb - tcp release_sock() callback
				840	* @sk: socket
				841	*
				842	* called from release_sock() to perform protocol dependent
				843	* actions before socket release.
				844	*/
				845	void tcp_release_cb(struct sock *sk)
				846	{
				847	unsigned long flags, nflags;
				848
				849	/* perform an atomic operation only if at least one flag is set */
				850	do {
				851	flags = sk->sk_tsq_flags;
				852	if (!(flags & TCP_DEFERRED_ALL))
				853	return;
				854	nflags = flags & ~TCP_DEFERRED_ALL;
				855	} while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
				856
				857	if (flags & TCPF_TSQ_DEFERRED) {
				858	tcp_tsq_write(sk);
				859	__sock_put(sk);
				860	}
				861	/* Here begins the tricky part :
				862	* We are called from release_sock() with :
				863	* 1) BH disabled
				864	* 2) sk_lock.slock spinlock held
				865	* 3) socket owned by us (sk->sk_lock.owned == 1)
				866	*
				867	* But following code is meant to be called from BH handlers,
				868	* so we should keep BH disabled, but early release socket ownership
				869	*/
				870	sock_release_ownership(sk);
				871
				872	if (flags & TCPF_WRITE_TIMER_DEFERRED) {
				873	tcp_write_timer_handler(sk);
				874	__sock_put(sk);
				875	}
				876	if (flags & TCPF_DELACK_TIMER_DEFERRED) {
				877	tcp_delack_timer_handler(sk);
				878	__sock_put(sk);
				879	}
				880	if (flags & TCPF_MTU_REDUCED_DEFERRED) {
				881	inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
				882	__sock_put(sk);
				883	}
				884	}
				885	EXPORT_SYMBOL(tcp_release_cb);
				886
				887	void __init tcp_tasklet_init(void)
				888	{
				889	int i;
				890
				891	for_each_possible_cpu(i) {
				892	struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
				893
				894	INIT_LIST_HEAD(&tsq->head);
				895	tasklet_init(&tsq->tasklet,
				896	tcp_tasklet_func,
				897	(unsigned long)tsq);
				898	}
				899	}
				900
				901	/*
				902	* Write buffer destructor automatically called from kfree_skb.
				903	* We can't xmit new skbs from this context, as we might already
				904	* hold qdisc lock.
				905	*/
				906	void tcp_wfree(struct sk_buff *skb)
				907	{
				908	struct sock *sk = skb->sk;
				909	struct tcp_sock *tp = tcp_sk(sk);
				910	unsigned long flags, nval, oval;
				911
				912	/* Keep one reference on sk_wmem_alloc.
				913	* Will be released by sk_free() from here or tcp_tasklet_func()
				914	*/
				915	WARN_ON(refcount_sub_and_test(skb->truesize - 1, &sk->sk_wmem_alloc));
				916
				917	/* If this softirq is serviced by ksoftirqd, we are likely under stress.
				918	* Wait until our queues (qdisc + devices) are drained.
				919	* This gives :
				920	* - less callbacks to tcp_write_xmit(), reducing stress (batches)
				921	* - chance for incoming ACK (processed by another cpu maybe)
				922	* to migrate this flow (skb->ooo_okay will be eventually set)
				923	*/
				924	if (refcount_read(&sk->sk_wmem_alloc) >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
				925	goto out;
				926
				927	for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
				928	struct tsq_tasklet *tsq;
				929	bool empty;
				930
				931	if (!(oval & TSQF_THROTTLED) \|\| (oval & TSQF_QUEUED))
				932	goto out;
				933
				934	nval = (oval & ~TSQF_THROTTLED) \| TSQF_QUEUED;
				935	nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
				936	if (nval != oval)
				937	continue;
				938
				939	/* queue this socket to tasklet queue */
				940	local_irq_save(flags);
				941	tsq = this_cpu_ptr(&tsq_tasklet);
				942	empty = list_empty(&tsq->head);
				943	list_add(&tp->tsq_node, &tsq->head);
				944	if (empty)
				945	tasklet_schedule(&tsq->tasklet);
				946	local_irq_restore(flags);
				947	return;
				948	}
				949	out:
				950	sk_free(sk);
				951	}
				952
				953	/* Note: Called under soft irq.
				954	* We can call TCP stack right away, unless socket is owned by user.
				955	*/
				956	enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
				957	{
				958	struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer);
				959	struct sock sk = (struct sock )tp;
				960
				961	tcp_tsq_handler(sk);
				962	sock_put(sk);
				963
				964	return HRTIMER_NORESTART;
				965	}
				966
				967	static void tcp_internal_pacing(struct sock sk, const struct sk_buff skb)
				968	{
				969	u64 len_ns;
				970	u32 rate;
				971
				972	if (!tcp_needs_internal_pacing(sk))
				973	return;
				974	rate = sk->sk_pacing_rate;
				975	if (!rate \|\| rate == ~0U)
				976	return;
				977
				978	len_ns = (u64)skb->len * NSEC_PER_SEC;
				979	do_div(len_ns, rate);
				980	hrtimer_start(&tcp_sk(sk)->pacing_timer,
				981	ktime_add_ns(ktime_get(), len_ns),
				982	HRTIMER_MODE_ABS_PINNED_SOFT);
				983	sock_hold(sk);
				984	}
				985
				986	static void tcp_update_skb_after_send(struct tcp_sock tp, struct sk_buff skb)
				987	{
				988	skb->skb_mstamp = tp->tcp_mstamp;
				989	list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
				990	}
				991
				992	/* This routine actually transmits TCP packets queued in by
				993	* tcp_do_sendmsg(). This is used by both the initial
				994	* transmission and possible later retransmissions.
				995	* All SKB's seen here are completely headerless. It is our
				996	* job to build the TCP header, and pass the packet down to
				997	* IP so it can do the same plus pass the packet off to the
				998	* device.
				999	*
				1000	* We are working here with either a clone of the original
				1001	* SKB, or a fresh unique copy made by the retransmit engine.
				1002	*/
				1003	static int __tcp_transmit_skb(struct sock sk, struct sk_buff skb,
				1004	int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
				1005	{
				1006	const struct inet_connection_sock *icsk = inet_csk(sk);
				1007	struct inet_sock *inet;
				1008	struct tcp_sock *tp;
				1009	struct tcp_skb_cb *tcb;
				1010	struct tcp_out_options opts;
				1011	unsigned int tcp_options_size, tcp_header_size;
				1012	struct sk_buff *oskb = NULL;
				1013	struct tcp_md5sig_key *md5;
				1014	struct tcphdr *th;
				1015	int err;
				1016
				1017	BUG_ON(!skb \|\| !tcp_skb_pcount(skb));
				1018	tp = tcp_sk(sk);
				1019
				1020	if (clone_it) {
				1021	TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
				1022	- tp->snd_una;
				1023	oskb = skb;
				1024
				1025	tcp_skb_tsorted_save(oskb) {
				1026	if (unlikely(skb_cloned(oskb)))
				1027	skb = pskb_copy(oskb, gfp_mask);
				1028	else
				1029	skb = skb_clone(oskb, gfp_mask);
				1030	} tcp_skb_tsorted_restore(oskb);
				1031
				1032	if (unlikely(!skb))
				1033	return -ENOBUFS;
				1034	}
				1035	skb->skb_mstamp = tp->tcp_mstamp;
				1036
				1037	inet = inet_sk(sk);
				1038	tcb = TCP_SKB_CB(skb);
				1039	memset(&opts, 0, sizeof(opts));
				1040
				1041	if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
				1042	tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
				1043	else
				1044	tcp_options_size = tcp_established_options(sk, skb, &opts,
				1045	&md5);
				1046	tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
				1047
				1048	/* if no packet is in qdisc/device queue, then allow XPS to select
				1049	* another queue. We can be called from tcp_tsq_handler()
				1050	* which holds one reference to sk.
				1051	*
				1052	* TODO: Ideally, in-flight pure ACK packets should not matter here.
				1053	* One way to get this would be to set skb->truesize = 2 on them.
				1054	*/
				1055	skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);
				1056
				1057	/* If we had to use memory reserve to allocate this skb,
				1058	* this might cause drops if packet is looped back :
				1059	* Other socket might not have SOCK_MEMALLOC.
				1060	* Packets not looped back do not care about pfmemalloc.
				1061	*/
				1062	skb->pfmemalloc = 0;
				1063
				1064	skb_push(skb, tcp_header_size);
				1065	skb_reset_transport_header(skb);
				1066
				1067	skb_orphan(skb);
				1068	skb->sk = sk;
				1069	skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
				1070	skb_set_hash_from_sk(skb, sk);
				1071	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
				1072
				1073	skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);
				1074
				1075	/* Build TCP header and checksum it. */
				1076	th = (struct tcphdr *)skb->data;
				1077	th->source = inet->inet_sport;
				1078	th->dest = inet->inet_dport;
				1079	th->seq = htonl(tcb->seq);
				1080	th->ack_seq = htonl(rcv_nxt);
				1081	(((__be16 )th) + 6) = htons(((tcp_header_size >> 2) << 12) \|
				1082	tcb->tcp_flags);
				1083
				1084	th->check = 0;
				1085	th->urg_ptr = 0;
				1086
				1087	/* The urg_mode check is necessary during a below snd_una win probe */
				1088	if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
				1089	if (before(tp->snd_up, tcb->seq + 0x10000)) {
				1090	th->urg_ptr = htons(tp->snd_up - tcb->seq);
				1091	th->urg = 1;
				1092	} else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
				1093	th->urg_ptr = htons(0xFFFF);
				1094	th->urg = 1;
				1095	}
				1096	}
				1097
				1098	tcp_options_write((__be32 *)(th + 1), tp, &opts);
				1099	skb_shinfo(skb)->gso_type = sk->sk_gso_type;
				1100	if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
				1101	th->window = htons(tcp_select_window(sk));
				1102	tcp_ecn_send(sk, skb, th, tcp_header_size);
				1103	} else {
				1104	/* RFC1323: The window in SYN & SYN/ACK segments
				1105	* is never scaled.
				1106	*/
				1107	th->window = htons(min(tp->rcv_wnd, 65535U));
				1108	}
				1109	#ifdef CONFIG_TCP_MD5SIG
				1110	/* Calculate the MD5 hash, as we have all we need now */
				1111	if (md5) {
				1112	sk_nocaps_add(sk, NETIF_F_GSO_MASK);
				1113	tp->af_specific->calc_md5_hash(opts.hash_location,
				1114	md5, sk, skb);
				1115	}
				1116	#endif
				1117
				1118	icsk->icsk_af_ops->send_check(sk, skb);
				1119
				1120	if (likely(tcb->tcp_flags & TCPHDR_ACK))
				1121	tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);
				1122
				1123	if (skb->len != tcp_header_size) {
				1124	tcp_event_data_sent(tp, sk);
				1125	tp->data_segs_out += tcp_skb_pcount(skb);
				1126	tp->bytes_sent += skb->len - tcp_header_size;
				1127	tcp_internal_pacing(sk, skb);
				1128	}
				1129
				1130	if (after(tcb->end_seq, tp->snd_nxt) \|\| tcb->seq == tcb->end_seq)
				1131	TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
				1132	tcp_skb_pcount(skb));
				1133
				1134	tp->segs_out += tcp_skb_pcount(skb);
				1135	/* OK, its time to fill skb_shinfo(skb)->gso_{segs\|size} */
				1136	skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
				1137	skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
				1138
				1139	/* Our usage of tstamp should remain private */
				1140	skb->tstamp = 0;
				1141
				1142	/* Cleanup our debris for IP stacks */
				1143	memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
				1144	sizeof(struct inet6_skb_parm)));
				1145
				1146	err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
				1147
				1148	if (unlikely(err > 0)) {
				1149	tcp_enter_cwr(sk);
				1150	err = net_xmit_eval(err);
				1151	}
				1152	if (!err && oskb) {
				1153	tcp_update_skb_after_send(tp, oskb);
				1154	tcp_rate_skb_sent(sk, oskb);
				1155	}
				1156	return err;
				1157	}
				1158
				1159	static int tcp_transmit_skb(struct sock sk, struct sk_buff skb, int clone_it,
				1160	gfp_t gfp_mask)
				1161	{
				1162	return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask,
				1163	tcp_sk(sk)->rcv_nxt);
				1164	}
				1165
				1166	/* This routine just queues the buffer for sending.
				1167	*
				1168	* NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
				1169	* otherwise socket can stall.
				1170	*/
				1171	static void tcp_queue_skb(struct sock sk, struct sk_buff skb)
				1172	{
				1173	struct tcp_sock *tp = tcp_sk(sk);
				1174
				1175	/* Advance write_seq and place onto the write_queue. */
				1176	tp->write_seq = TCP_SKB_CB(skb)->end_seq;
				1177	__skb_header_release(skb);
				1178	tcp_add_write_queue_tail(sk, skb);
				1179	sk->sk_wmem_queued += skb->truesize;
				1180	sk_mem_charge(sk, skb->truesize);
				1181	}
				1182
				1183	/* Initialize TSO segments for a packet. */
				1184	static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
				1185	{
				1186	if (skb->len <= mss_now) {
				1187	/* Avoid the costly divide in the normal
				1188	* non-TSO case.
				1189	*/
				1190	tcp_skb_pcount_set(skb, 1);
				1191	TCP_SKB_CB(skb)->tcp_gso_size = 0;
				1192	} else {
				1193	tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now));
				1194	TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
				1195	}
				1196	}
				1197
				1198	/* Pcount in the middle of the write queue got changed, we need to do various
				1199	* tweaks to fix counters
				1200	*/
				1201	static void tcp_adjust_pcount(struct sock sk, const struct sk_buff skb, int decr)
				1202	{
				1203	struct tcp_sock *tp = tcp_sk(sk);
				1204
				1205	tp->packets_out -= decr;
				1206
				1207	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
				1208	tp->sacked_out -= decr;
				1209	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
				1210	tp->retrans_out -= decr;
				1211	if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
				1212	tp->lost_out -= decr;
				1213
				1214	/* Reno case is special. Sigh... */
				1215	if (tcp_is_reno(tp) && decr > 0)
				1216	tp->sacked_out -= min_t(u32, tp->sacked_out, decr);
				1217
				1218	if (tp->lost_skb_hint &&
				1219	before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
				1220	(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
				1221	tp->lost_cnt_hint -= decr;
				1222
				1223	tcp_verify_left_out(tp);
				1224	}
				1225
				1226	static bool tcp_has_tx_tstamp(const struct sk_buff *skb)
				1227	{
				1228	return TCP_SKB_CB(skb)->txstamp_ack \|\|
				1229	(skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP);
				1230	}
				1231
				1232	static void tcp_fragment_tstamp(struct sk_buff skb, struct sk_buff skb2)
				1233	{
				1234	struct skb_shared_info *shinfo = skb_shinfo(skb);
				1235
				1236	if (unlikely(tcp_has_tx_tstamp(skb)) &&
				1237	!before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) {
				1238	struct skb_shared_info *shinfo2 = skb_shinfo(skb2);
				1239	u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP;
				1240
				1241	shinfo->tx_flags &= ~tsflags;
				1242	shinfo2->tx_flags \|= tsflags;
				1243	swap(shinfo->tskey, shinfo2->tskey);
				1244	TCP_SKB_CB(skb2)->txstamp_ack = TCP_SKB_CB(skb)->txstamp_ack;
				1245	TCP_SKB_CB(skb)->txstamp_ack = 0;
				1246	}
				1247	}
				1248
				1249	static void tcp_skb_fragment_eor(struct sk_buff skb, struct sk_buff skb2)
				1250	{
				1251	TCP_SKB_CB(skb2)->eor = TCP_SKB_CB(skb)->eor;
				1252	TCP_SKB_CB(skb)->eor = 0;
				1253	}
				1254
				1255	/* Insert buff after skb on the write or rtx queue of sk. */
				1256	static void tcp_insert_write_queue_after(struct sk_buff *skb,
				1257	struct sk_buff *buff,
				1258	struct sock *sk,
				1259	enum tcp_queue tcp_queue)
				1260	{
				1261	if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE)
				1262	__skb_queue_after(&sk->sk_write_queue, skb, buff);
				1263	else
				1264	tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
				1265	}
				1266
				1267	/* Function to create two new TCP segments. Shrinks the given segment
				1268	* to the specified size and appends a new segment with the rest of the
				1269	* packet to the list. This won't be called frequently, I hope.
				1270	* Remember, these are still headerless SKBs at this point.
				1271	*/
				1272	int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
				1273	struct sk_buff *skb, u32 len,
				1274	unsigned int mss_now, gfp_t gfp)
				1275	{
				1276	struct tcp_sock *tp = tcp_sk(sk);
				1277	struct sk_buff *buff;
				1278	int nsize, old_factor;
				1279	long limit;
				1280	int nlen;
				1281	u8 flags;
				1282
				1283	if (WARN_ON(len > skb->len))
				1284	return -EINVAL;
				1285
				1286	nsize = skb_headlen(skb) - len;
				1287	if (nsize < 0)
				1288	nsize = 0;
				1289
				1290	/* tcp_sendmsg() can overshoot sk_wmem_queued by one full size skb.
				1291	* We need some allowance to not penalize applications setting small
				1292	* SO_SNDBUF values.
				1293	* Also allow first and last skb in retransmit queue to be split.
				1294	*/
				1295	limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_MAX_SIZE);
				1296	if (unlikely((sk->sk_wmem_queued >> 1) > limit &&
				1297	tcp_queue != TCP_FRAG_IN_WRITE_QUEUE &&
				1298	skb != tcp_rtx_queue_head(sk) &&
				1299	skb != tcp_rtx_queue_tail(sk))) {
				1300	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG);
				1301	return -ENOMEM;
				1302	}
				1303
				1304	if (skb_unclone(skb, gfp))
				1305	return -ENOMEM;
				1306
				1307	/* Get a new skb... force flag on. */
				1308	buff = sk_stream_alloc_skb(sk, nsize, gfp, true);
				1309	if (!buff)
				1310	return -ENOMEM; /* We'll just try again later. */
				1311
				1312	sk->sk_wmem_queued += buff->truesize;
				1313	sk_mem_charge(sk, buff->truesize);
				1314	nlen = skb->len - len - nsize;
				1315	buff->truesize += nlen;
				1316	skb->truesize -= nlen;
				1317
				1318	/* Correct the sequence numbers. */
				1319	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
				1320	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
				1321	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
				1322
				1323	/* PSH and FIN should only be set in the second packet. */
				1324	flags = TCP_SKB_CB(skb)->tcp_flags;
				1325	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN \| TCPHDR_PSH);
				1326	TCP_SKB_CB(buff)->tcp_flags = flags;
				1327	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
				1328	tcp_skb_fragment_eor(skb, buff);
				1329
				1330	skb_split(skb, buff, len);
				1331
				1332	buff->ip_summed = CHECKSUM_PARTIAL;
				1333
				1334	buff->tstamp = skb->tstamp;
				1335	tcp_fragment_tstamp(skb, buff);
				1336
				1337	old_factor = tcp_skb_pcount(skb);
				1338
				1339	/* Fix up tso_factor for both original and new SKB. */
				1340	tcp_set_skb_tso_segs(skb, mss_now);
				1341	tcp_set_skb_tso_segs(buff, mss_now);
				1342
				1343	/* Update delivered info for the new segment */
				1344	TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx;
				1345
				1346	/* If this packet has been sent out already, we must
				1347	* adjust the various packet counters.
				1348	*/
				1349	if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
				1350	int diff = old_factor - tcp_skb_pcount(skb) -
				1351	tcp_skb_pcount(buff);
				1352
				1353	if (diff)
				1354	tcp_adjust_pcount(sk, skb, diff);
				1355	}
				1356
				1357	/* Link BUFF into the send queue. */
				1358	__skb_header_release(buff);
				1359	tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
				1360	if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE)
				1361	list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor);
				1362
				1363	return 0;
				1364	}
				1365
				1366	/* This is similar to __pskb_pull_tail(). The difference is that pulled
				1367	* data is not copied, but immediately discarded.
				1368	*/
				1369	static int __pskb_trim_head(struct sk_buff *skb, int len)
				1370	{
				1371	struct skb_shared_info *shinfo;
				1372	int i, k, eat;
				1373
				1374	eat = min_t(int, len, skb_headlen(skb));
				1375	if (eat) {
				1376	__skb_pull(skb, eat);
				1377	len -= eat;
				1378	if (!len)
				1379	return 0;
				1380	}
				1381	eat = len;
				1382	k = 0;
				1383	shinfo = skb_shinfo(skb);
				1384	for (i = 0; i < shinfo->nr_frags; i++) {
				1385	int size = skb_frag_size(&shinfo->frags[i]);
				1386
				1387	if (size <= eat) {
				1388	skb_frag_unref(skb, i);
				1389	eat -= size;
				1390	} else {
				1391	shinfo->frags[k] = shinfo->frags[i];
				1392	if (eat) {
				1393	shinfo->frags[k].page_offset += eat;
				1394	skb_frag_size_sub(&shinfo->frags[k], eat);
				1395	eat = 0;
				1396	}
				1397	k++;
				1398	}
				1399	}
				1400	shinfo->nr_frags = k;
				1401
				1402	skb->data_len -= len;
				1403	skb->len = skb->data_len;
				1404	return len;
				1405	}
				1406
				1407	/* Remove acked data from a packet in the transmit queue. */
				1408	int tcp_trim_head(struct sock sk, struct sk_buff skb, u32 len)
				1409	{
				1410	u32 delta_truesize;
				1411
				1412	if (skb_unclone(skb, GFP_ATOMIC))
				1413	return -ENOMEM;
				1414
				1415	delta_truesize = __pskb_trim_head(skb, len);
				1416
				1417	TCP_SKB_CB(skb)->seq += len;
				1418	skb->ip_summed = CHECKSUM_PARTIAL;
				1419
				1420	if (delta_truesize) {
				1421	skb->truesize -= delta_truesize;
				1422	sk->sk_wmem_queued -= delta_truesize;
				1423	sk_mem_uncharge(sk, delta_truesize);
				1424	sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
				1425	}
				1426
				1427	/* Any change of skb->len requires recalculation of tso factor. */
				1428	if (tcp_skb_pcount(skb) > 1)
				1429	tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb));
				1430
				1431	return 0;
				1432	}
				1433
				1434	/* Calculate MSS not accounting any TCP options. */
				1435	static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu)
				1436	{
				1437	const struct tcp_sock *tp = tcp_sk(sk);
				1438	const struct inet_connection_sock *icsk = inet_csk(sk);
				1439	int mss_now;
				1440
				1441	/* Calculate base mss without TCP options:
				1442	It is MMS_S - sizeof(tcphdr) of rfc1122
				1443	*/
				1444	mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
				1445
				1446	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
				1447	if (icsk->icsk_af_ops->net_frag_header_len) {
				1448	const struct dst_entry *dst = __sk_dst_get(sk);
				1449
				1450	if (dst && dst_allfrag(dst))
				1451	mss_now -= icsk->icsk_af_ops->net_frag_header_len;
				1452	}
				1453
				1454	/* Clamp it (mss_clamp does not include tcp options) */
				1455	if (mss_now > tp->rx_opt.mss_clamp)
				1456	mss_now = tp->rx_opt.mss_clamp;
				1457
				1458	/* Now subtract optional transport overhead */
				1459	mss_now -= icsk->icsk_ext_hdr_len;
				1460
				1461	/* Then reserve room for full set of TCP options and 8 bytes of data */
				1462	mss_now = max(mss_now, sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss);
				1463	return mss_now;
				1464	}
				1465
				1466	/* Calculate MSS. Not accounting for SACKs here. */
				1467	int tcp_mtu_to_mss(struct sock *sk, int pmtu)
				1468	{
				1469	/* Subtract TCP options size, not including SACKs */
				1470	return __tcp_mtu_to_mss(sk, pmtu) -
				1471	(tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr));
				1472	}
				1473
				1474	/* Inverse of above */
				1475	int tcp_mss_to_mtu(struct sock *sk, int mss)
				1476	{
				1477	const struct tcp_sock *tp = tcp_sk(sk);
				1478	const struct inet_connection_sock *icsk = inet_csk(sk);
				1479	int mtu;
				1480
				1481	mtu = mss +
				1482	tp->tcp_header_len +
				1483	icsk->icsk_ext_hdr_len +
				1484	icsk->icsk_af_ops->net_header_len;
				1485
				1486	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
				1487	if (icsk->icsk_af_ops->net_frag_header_len) {
				1488	const struct dst_entry *dst = __sk_dst_get(sk);
				1489
				1490	if (dst && dst_allfrag(dst))
				1491	mtu += icsk->icsk_af_ops->net_frag_header_len;
				1492	}
				1493	return mtu;
				1494	}
				1495	EXPORT_SYMBOL(tcp_mss_to_mtu);
				1496
				1497	/* MTU probing init per socket */
				1498	void tcp_mtup_init(struct sock *sk)
				1499	{
				1500	struct tcp_sock *tp = tcp_sk(sk);
				1501	struct inet_connection_sock *icsk = inet_csk(sk);
				1502	struct net *net = sock_net(sk);
				1503
				1504	icsk->icsk_mtup.enabled = net->ipv4.sysctl_tcp_mtu_probing > 1;
				1505	icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
				1506	icsk->icsk_af_ops->net_header_len;
				1507	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
				1508	icsk->icsk_mtup.probe_size = 0;
				1509	if (icsk->icsk_mtup.enabled)
				1510	icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
				1511	}
				1512	EXPORT_SYMBOL(tcp_mtup_init);
				1513
				1514	/* This function synchronize snd mss to current pmtu/exthdr set.
				1515
				1516	tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
				1517	for TCP options, but includes only bare TCP header.
				1518
				1519	tp->rx_opt.mss_clamp is mss negotiated at connection setup.
				1520	It is minimum of user_mss and mss received with SYN.
				1521	It also does not include TCP options.
				1522
				1523	inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function.
				1524
				1525	tp->mss_cache is current effective sending mss, including
				1526	all tcp options except for SACKs. It is evaluated,
				1527	taking into account current pmtu, but never exceeds
				1528	tp->rx_opt.mss_clamp.
				1529
				1530	NOTE1. rfc1122 clearly states that advertised MSS
				1531	DOES NOT include either tcp or ip options.
				1532
				1533	NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
				1534	are READ ONLY outside this function. --ANK (980731)
				1535	*/
				1536	unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
				1537	{
				1538	struct tcp_sock *tp = tcp_sk(sk);
				1539	struct inet_connection_sock *icsk = inet_csk(sk);
				1540	int mss_now;
				1541
				1542	if (icsk->icsk_mtup.search_high > pmtu)
				1543	icsk->icsk_mtup.search_high = pmtu;
				1544
				1545	mss_now = tcp_mtu_to_mss(sk, pmtu);
				1546	mss_now = tcp_bound_to_half_wnd(tp, mss_now);
				1547
				1548	/* And store cached results */
				1549	icsk->icsk_pmtu_cookie = pmtu;
				1550	if (icsk->icsk_mtup.enabled)
				1551	mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
				1552	tp->mss_cache = mss_now;
				1553
				1554	return mss_now;
				1555	}
				1556	EXPORT_SYMBOL(tcp_sync_mss);
				1557
				1558	/* Compute the current effective MSS, taking SACKs and IP options,
				1559	* and even PMTU discovery events into account.
				1560	*/
				1561	unsigned int tcp_current_mss(struct sock *sk)
				1562	{
				1563	const struct tcp_sock *tp = tcp_sk(sk);
				1564	const struct dst_entry *dst = __sk_dst_get(sk);
				1565	u32 mss_now;
				1566	unsigned int header_len;
				1567	struct tcp_out_options opts;
				1568	struct tcp_md5sig_key *md5;
				1569
				1570	mss_now = tp->mss_cache;
				1571
				1572	if (dst) {
				1573	u32 mtu = dst_mtu(dst);
				1574	if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
				1575	mss_now = tcp_sync_mss(sk, mtu);
				1576	}
				1577
				1578	header_len = tcp_established_options(sk, NULL, &opts, &md5) +
				1579	sizeof(struct tcphdr);
				1580	/* The mss_cache is sized based on tp->tcp_header_len, which assumes
				1581	* some common options. If this is an odd packet (because we have SACK
				1582	* blocks etc) then our calculated header_len will be different, and
				1583	* we have to adjust mss_now correspondingly */
				1584	if (header_len != tp->tcp_header_len) {
				1585	int delta = (int) header_len - tp->tcp_header_len;
				1586	mss_now -= delta;
				1587	}
				1588
				1589	return mss_now;
				1590	}
				1591
				1592	/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
				1593	* As additional protections, we do not touch cwnd in retransmission phases,
				1594	* and if application hit its sndbuf limit recently.
				1595	*/
				1596	static void tcp_cwnd_application_limited(struct sock *sk)
				1597	{
				1598	struct tcp_sock *tp = tcp_sk(sk);
				1599
				1600	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
				1601	sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
				1602	/* Limited by application or receiver window. */
				1603	u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
				1604	u32 win_used = max(tp->snd_cwnd_used, init_win);
				1605	if (win_used < tp->snd_cwnd) {
				1606	tp->snd_ssthresh = tcp_current_ssthresh(sk);
				1607	tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
				1608	}
				1609	tp->snd_cwnd_used = 0;
				1610	}
				1611	tp->snd_cwnd_stamp = tcp_jiffies32;
				1612	}
				1613
				1614	static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
				1615	{
				1616	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
				1617	struct tcp_sock *tp = tcp_sk(sk);
				1618
				1619	/* Track the maximum number of outstanding packets in each
				1620	* window, and remember whether we were cwnd-limited then.
				1621	*/
				1622	if (!before(tp->snd_una, tp->max_packets_seq) \|\|
				1623	tp->packets_out > tp->max_packets_out) {
				1624	tp->max_packets_out = tp->packets_out;
				1625	tp->max_packets_seq = tp->snd_nxt;
				1626	tp->is_cwnd_limited = is_cwnd_limited;
				1627	}
				1628
				1629	if (tcp_is_cwnd_limited(sk)) {
				1630	/* Network is feed fully. */
				1631	tp->snd_cwnd_used = 0;
				1632	tp->snd_cwnd_stamp = tcp_jiffies32;
				1633	} else {
				1634	/* Network starves. */
				1635	if (tp->packets_out > tp->snd_cwnd_used)
				1636	tp->snd_cwnd_used = tp->packets_out;
				1637
				1638	if (sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle &&
				1639	(s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto &&
				1640	!ca_ops->cong_control)
				1641	tcp_cwnd_application_limited(sk);
				1642
				1643	/* The following conditions together indicate the starvation
				1644	* is caused by insufficient sender buffer:
				1645	* 1) just sent some data (see tcp_write_xmit)
				1646	* 2) not cwnd limited (this else condition)
				1647	* 3) no more data to send (tcp_write_queue_empty())
				1648	* 4) application is hitting buffer limit (SOCK_NOSPACE)
				1649	*/
				1650	if (tcp_write_queue_empty(sk) && sk->sk_socket &&
				1651	test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
				1652	(1 << sk->sk_state) & (TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT))
				1653	tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
				1654	}
				1655	}
				1656
				1657	/* Minshall's variant of the Nagle send check. */
				1658	static bool tcp_minshall_check(const struct tcp_sock *tp)
				1659	{
				1660	return after(tp->snd_sml, tp->snd_una) &&
				1661	!after(tp->snd_sml, tp->snd_nxt);
				1662	}
				1663
				1664	/* Update snd_sml if this skb is under mss
				1665	* Note that a TSO packet might end with a sub-mss segment
				1666	* The test is really :
				1667	* if ((skb->len % mss) != 0)
				1668	* tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
				1669	* But we can avoid doing the divide again given we already have
				1670	* skb_pcount = skb->len / mss_now
				1671	*/
				1672	static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
				1673	const struct sk_buff *skb)
				1674	{
				1675	if (skb->len < tcp_skb_pcount(skb) * mss_now)
				1676	tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
				1677	}
				1678
				1679	/* Return false, if packet can be sent now without violation Nagle's rules:
				1680	* 1. It is full sized. (provided by caller in %partial bool)
				1681	* 2. Or it contains FIN. (already checked by caller)
				1682	* 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
				1683	* 4. Or TCP_CORK is not set, and all sent packets are ACKed.
				1684	* With Minshall's modification: all sent small packets are ACKed.
				1685	*/
				1686	static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
				1687	int nonagle)
				1688	{
				1689	return partial &&
				1690	((nonagle & TCP_NAGLE_CORK) \|\|
				1691	(!nonagle && tp->packets_out && tcp_minshall_check(tp)));
				1692	}
				1693
				1694	/* Return how many segs we'd like on a TSO packet,
				1695	* to send one TSO packet per ms
				1696	*/
				1697	static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
				1698	int min_tso_segs)
				1699	{
				1700	u32 bytes, segs;
				1701
				1702	bytes = min(sk->sk_pacing_rate >> sk->sk_pacing_shift,
				1703	sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
				1704
				1705	/* Goal is to send at least one packet per ms,
				1706	* not one big TSO packet every 100 ms.
				1707	* This preserves ACK clocking and is consistent
				1708	* with tcp_tso_should_defer() heuristic.
				1709	*/
				1710	segs = max_t(u32, bytes / mss_now, min_tso_segs);
				1711
				1712	return segs;
				1713	}
				1714
				1715	/* Return the number of segments we want in the skb we are transmitting.
				1716	* See if congestion control module wants to decide; otherwise, autosize.
				1717	*/
				1718	static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
				1719	{
				1720	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
				1721	u32 min_tso, tso_segs;
				1722
				1723	min_tso = ca_ops->min_tso_segs ?
				1724	ca_ops->min_tso_segs(sk) :
				1725	sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs;
				1726
				1727	tso_segs = tcp_tso_autosize(sk, mss_now, min_tso);
				1728	return min_t(u32, tso_segs, sk->sk_gso_max_segs);
				1729	}
				1730
				1731	/* Returns the portion of skb which can be sent right away */
				1732	static unsigned int tcp_mss_split_point(const struct sock *sk,
				1733	const struct sk_buff *skb,
				1734	unsigned int mss_now,
				1735	unsigned int max_segs,
				1736	int nonagle)
				1737	{
				1738	const struct tcp_sock *tp = tcp_sk(sk);
				1739	u32 partial, needed, window, max_len;
				1740
				1741	window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
				1742	max_len = mss_now * max_segs;
				1743
				1744	if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
				1745	return max_len;
				1746
				1747	needed = min(skb->len, window);
				1748
				1749	if (max_len <= needed)
				1750	return max_len;
				1751
				1752	partial = needed % mss_now;
				1753	/* If last segment is not a full MSS, check if Nagle rules allow us
				1754	* to include this last segment in this skb.
				1755	* Otherwise, we'll split the skb at last MSS boundary
				1756	*/
				1757	if (tcp_nagle_check(partial != 0, tp, nonagle))
				1758	return needed - partial;
				1759
				1760	return needed;
				1761	}
				1762
				1763	/* Can at least one segment of SKB be sent right now, according to the
				1764	* congestion window rules? If so, return how many segments are allowed.
				1765	*/
				1766	static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
				1767	const struct sk_buff *skb)
				1768	{
				1769	u32 in_flight, cwnd, halfcwnd;
				1770
				1771	/* Don't be strict about the congestion window for the final FIN. */
				1772	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
				1773	tcp_skb_pcount(skb) == 1)
				1774	return 1;
				1775
				1776	in_flight = tcp_packets_in_flight(tp);
				1777	cwnd = tp->snd_cwnd;
				1778	if (in_flight >= cwnd)
				1779	return 0;
				1780
				1781	/* For better scheduling, ensure we have at least
				1782	* 2 GSO packets in flight.
				1783	*/
				1784	halfcwnd = max(cwnd >> 1, 1U);
				1785	return min(halfcwnd, cwnd - in_flight);
				1786	}
				1787
				/* Initialize TSO state of a skb.
				 * This must be invoked the first time we consider transmitting
				 * SKB onto the wire.
				 *
				 * The segment count is (re)computed when it has not been set yet, or when
				 * a multi-segment skb was sized for an MSS other than mss_now.
				 * Returns the resulting segment count of the skb.
				 */
				static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
				{
					int tso_segs = tcp_skb_pcount(skb);

					if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
						tcp_set_skb_tso_segs(skb, mss_now);
						tso_segs = tcp_skb_pcount(skb);
					}
					return tso_segs;
				}
				1802
				1803
				1804	/* Return true if the Nagle test allows this packet to be
				1805	* sent now.
				1806	*/
				1807	static inline bool tcp_nagle_test(const struct tcp_sock tp, const struct sk_buff skb,
				1808	unsigned int cur_mss, int nonagle)
				1809	{
				1810	/* Nagle rule does not apply to frames, which sit in the middle of the
				1811	* write_queue (they have no chances to get new data).
				1812	*
				1813	* This is implemented in the callers, where they modify the 'nonagle'
				1814	* argument based upon the location of SKB in the send queue.
				1815	*/
				1816	if (nonagle & TCP_NAGLE_PUSH)
				1817	return true;
				1818
				1819	/* Don't use the nagle rule for urgent data (or for the final FIN). */
				1820	if (tcp_urg_mode(tp) \|\| (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
				1821	return true;
				1822
				1823	if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle))
				1824	return true;
				1825
				1826	return false;
				1827	}
				1828
				1829	/* Does at least the first segment of SKB fit into the send window? */
				1830	static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
				1831	const struct sk_buff *skb,
				1832	unsigned int cur_mss)
				1833	{
				1834	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
				1835
				1836	if (skb->len > cur_mss)
				1837	end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
				1838
				1839	return !after(end_seq, tcp_wnd_end(tp));
				1840	}
				1841
				1842	/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
				1843	* which is put after SKB on the list. It is very much like
				1844	* tcp_fragment() except that it may make several kinds of assumptions
				1845	* in order to speed up the splitting operation. In particular, we
				1846	* know that all the data is in scatter-gather pages, and that the
				1847	* packet has never been sent out before (and thus is not cloned).
				1848	*/
				1849	static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue,
				1850	struct sk_buff *skb, unsigned int len,
				1851	unsigned int mss_now, gfp_t gfp)
				1852	{
				1853	struct sk_buff *buff;
				1854	int nlen = skb->len - len;
				1855	u8 flags;
				1856
				1857	/* All of a TSO frame must be composed of paged data. */
				1858	if (skb->len != skb->data_len)
				1859	return tcp_fragment(sk, tcp_queue, skb, len, mss_now, gfp);
				1860
				1861	buff = sk_stream_alloc_skb(sk, 0, gfp, true);
				1862	if (unlikely(!buff))
				1863	return -ENOMEM;
				1864
				1865	sk->sk_wmem_queued += buff->truesize;
				1866	sk_mem_charge(sk, buff->truesize);
				1867	buff->truesize += nlen;
				1868	skb->truesize -= nlen;
				1869
				1870	/* Correct the sequence numbers. */
				1871	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
				1872	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
				1873	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
				1874
				1875	/* PSH and FIN should only be set in the second packet. */
				1876	flags = TCP_SKB_CB(skb)->tcp_flags;
				1877	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN \| TCPHDR_PSH);
				1878	TCP_SKB_CB(buff)->tcp_flags = flags;
				1879
				1880	/* This packet was never sent out yet, so no SACK bits. */
				1881	TCP_SKB_CB(buff)->sacked = 0;
				1882
				1883	tcp_skb_fragment_eor(skb, buff);
				1884
				1885	buff->ip_summed = CHECKSUM_PARTIAL;
				1886	skb_split(skb, buff, len);
				1887	tcp_fragment_tstamp(skb, buff);
				1888
				1889	/* Fix up tso_factor for both original and new SKB. */
				1890	tcp_set_skb_tso_segs(skb, mss_now);
				1891	tcp_set_skb_tso_segs(buff, mss_now);
				1892
				1893	/* Link BUFF into the send queue. */
				1894	__skb_header_release(buff);
				1895	tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
				1896
				1897	return 0;
				1898	}
				1899
				1900	/* Try to defer sending, if possible, in order to minimize the amount
				1901	* of TSO splitting we do. View it as a kind of TSO Nagle test.
				1902	*
				1903	* This algorithm is from John Heffner.
				1904	*/
				1905	static bool tcp_tso_should_defer(struct sock sk, struct sk_buff skb,
				1906	bool *is_cwnd_limited,
				1907	bool *is_rwnd_limited,
				1908	u32 max_segs)
				1909	{
				1910	const struct inet_connection_sock *icsk = inet_csk(sk);
				1911	u32 age, send_win, cong_win, limit, in_flight;
				1912	struct tcp_sock *tp = tcp_sk(sk);
				1913	struct sk_buff *head;
				1914	int win_divisor;
				1915
				1916	if (icsk->icsk_ca_state >= TCP_CA_Recovery)
				1917	goto send_now;
				1918
				1919	/* Avoid bursty behavior by allowing defer
				1920	* only if the last write was recent.
				1921	*/
				1922	if ((s32)(tcp_jiffies32 - tp->lsndtime) > 0)
				1923	goto send_now;
				1924
				1925	in_flight = tcp_packets_in_flight(tp);
				1926
				1927	BUG_ON(tcp_skb_pcount(skb) <= 1);
				1928	BUG_ON(tp->snd_cwnd <= in_flight);
				1929
				1930	send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
				1931
				1932	/* From in_flight test above, we know that cwnd > in_flight. */
				1933	cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
				1934
				1935	limit = min(send_win, cong_win);
				1936
				1937	/* If a full-sized TSO skb can be sent, do it. */
				1938	if (limit >= max_segs * tp->mss_cache)
				1939	goto send_now;
				1940
				1941	/* Middle in queue won't get any more data, full sendable already? */
				1942	if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
				1943	goto send_now;
				1944
				1945	win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor);
				1946	if (win_divisor) {
				1947	u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
				1948
				1949	/* If at least some fraction of a window is available,
				1950	* just use it.
				1951	*/
				1952	chunk /= win_divisor;
				1953	if (limit >= chunk)
				1954	goto send_now;
				1955	} else {
				1956	/* Different approach, try not to defer past a single
				1957	* ACK. Receiver should ACK every other full sized
				1958	* frame, so if we have space for more than 3 frames
				1959	* then send now.
				1960	*/
				1961	if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache)
				1962	goto send_now;
				1963	}
				1964
				1965	/* TODO : use tsorted_sent_queue ? */
				1966	head = tcp_rtx_queue_head(sk);
				1967	if (!head)
				1968	goto send_now;
				1969	age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp);
				1970	/* If next ACK is likely to come too late (half srtt), do not defer */
				1971	if (age < (tp->srtt_us >> 4))
				1972	goto send_now;
				1973
				1974	/* Ok, it looks like it is advisable to defer.
				1975	* Three cases are tracked :
				1976	* 1) We are cwnd-limited
				1977	* 2) We are rwnd-limited
				1978	* 3) We are application limited.
				1979	*/
				1980	if (cong_win < send_win) {
				1981	if (cong_win <= skb->len) {
				1982	*is_cwnd_limited = true;
				1983	return true;
				1984	}
				1985	} else {
				1986	if (send_win <= skb->len) {
				1987	*is_rwnd_limited = true;
				1988	return true;
				1989	}
				1990	}
				1991
				1992	/* If this packet won't get more data, do not wait. */
				1993	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
				1994	goto send_now;
				1995
				1996	return true;
				1997
				1998	send_now:
				1999	return false;
				2000	}
				2001
				2002	static inline void tcp_mtu_check_reprobe(struct sock *sk)
				2003	{
				2004	struct inet_connection_sock *icsk = inet_csk(sk);
				2005	struct tcp_sock *tp = tcp_sk(sk);
				2006	struct net *net = sock_net(sk);
				2007	u32 interval;
				2008	s32 delta;
				2009
				2010	interval = net->ipv4.sysctl_tcp_probe_interval;
				2011	delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp;
				2012	if (unlikely(delta >= interval * HZ)) {
				2013	int mss = tcp_current_mss(sk);
				2014
				2015	/* Update current search range */
				2016	icsk->icsk_mtup.probe_size = 0;
				2017	icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
				2018	sizeof(struct tcphdr) +
				2019	icsk->icsk_af_ops->net_header_len;
				2020	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
				2021
				2022	/* Update probe time stamp */
				2023	icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
				2024	}
				2025	}
				2026
				2027	static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
				2028	{
				2029	struct sk_buff skb, next;
				2030
				2031	skb = tcp_send_head(sk);
				2032	tcp_for_write_queue_from_safe(skb, next, sk) {
				2033	if (len <= skb->len)
				2034	break;
				2035
				2036	if (unlikely(TCP_SKB_CB(skb)->eor) \|\| tcp_has_tx_tstamp(skb))
				2037	return false;
				2038
				2039	len -= skb->len;
				2040	}
				2041
				2042	return true;
				2043	}
				2044
				2045	/* Create a new MTU probe if we are ready.
				2046	* MTU probe is regularly attempting to increase the path MTU by
				2047	* deliberately sending larger packets. This discovers routing
				2048	* changes resulting in larger path MTUs.
				2049	*
				2050	* Returns 0 if we should wait to probe (no cwnd available),
				2051	* 1 if a probe was sent,
				2052	* -1 otherwise
				2053	*/
				2054	static int tcp_mtu_probe(struct sock *sk)
				2055	{
				2056	struct inet_connection_sock *icsk = inet_csk(sk);
				2057	struct tcp_sock *tp = tcp_sk(sk);
				2058	struct sk_buff skb, nskb, *next;
				2059	struct net *net = sock_net(sk);
				2060	int probe_size;
				2061	int size_needed;
				2062	int copy, len;
				2063	int mss_now;
				2064	int interval;
				2065
				2066	/* Not currently probing/verifying,
				2067	* not in recovery,
				2068	* have enough cwnd, and
				2069	* not SACKing (the variable headers throw things off)
				2070	*/
				2071	if (likely(!icsk->icsk_mtup.enabled \|\|
				2072	icsk->icsk_mtup.probe_size \|\|
				2073	inet_csk(sk)->icsk_ca_state != TCP_CA_Open \|\|
				2074	tp->snd_cwnd < 11 \|\|
				2075	tp->rx_opt.num_sacks \|\| tp->rx_opt.dsack))
				2076	return -1;
				2077
				2078	/* Use binary search for probe_size between tcp_mss_base,
				2079	* and current mss_clamp. if (search_high - search_low)
				2080	* smaller than a threshold, backoff from probing.
				2081	*/
				2082	mss_now = tcp_current_mss(sk);
				2083	probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
				2084	icsk->icsk_mtup.search_low) >> 1);
				2085	size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
				2086	interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
				2087	/* When misfortune happens, we are reprobing actively,
				2088	* and then reprobe timer has expired. We stick with current
				2089	* probing process by not resetting search range to its orignal.
				2090	*/
				2091	if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) \|\|
				2092	interval < net->ipv4.sysctl_tcp_probe_threshold) {
				2093	/* Check whether enough time has elaplased for
				2094	* another round of probing.
				2095	*/
				2096	tcp_mtu_check_reprobe(sk);
				2097	return -1;
				2098	}
				2099
				2100	/* Have enough data in the send queue to probe? */
				2101	if (tp->write_seq - tp->snd_nxt < size_needed)
				2102	return -1;
				2103
				2104	if (tp->snd_wnd < size_needed)
				2105	return -1;
				2106	if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
				2107	return 0;
				2108
				2109	/* Do we need to wait to drain cwnd? With none in flight, don't stall */
				2110	if (tcp_packets_in_flight(tp) + 2 > tp->snd_cwnd) {
				2111	if (!tcp_packets_in_flight(tp))
				2112	return -1;
				2113	else
				2114	return 0;
				2115	}
				2116
				2117	if (!tcp_can_coalesce_send_queue_head(sk, probe_size))
				2118	return -1;
				2119
				2120	/* We're allowed to probe. Build it now. */
				2121	nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false);
				2122	if (!nskb)
				2123	return -1;
				2124	sk->sk_wmem_queued += nskb->truesize;
				2125	sk_mem_charge(sk, nskb->truesize);
				2126
				2127	skb = tcp_send_head(sk);
				2128
				2129	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
				2130	TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
				2131	TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK;
				2132	TCP_SKB_CB(nskb)->sacked = 0;
				2133	nskb->csum = 0;
				2134	nskb->ip_summed = CHECKSUM_PARTIAL;
				2135
				2136	tcp_insert_write_queue_before(nskb, skb, sk);
				2137	tcp_highest_sack_replace(sk, skb, nskb);
				2138
				2139	len = 0;
				2140	tcp_for_write_queue_from_safe(skb, next, sk) {
				2141	copy = min_t(int, skb->len, probe_size - len);
				2142	skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
				2143
				2144	if (skb->len <= copy) {
				2145	/* We've eaten all the data from this skb.
				2146	* Throw it away. */
				2147	TCP_SKB_CB(nskb)->tcp_flags \|= TCP_SKB_CB(skb)->tcp_flags;
				2148	/* If this is the last SKB we copy and eor is set
				2149	* we need to propagate it to the new skb.
				2150	*/
				2151	TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor;
				2152	tcp_skb_collapse_tstamp(nskb, skb);
				2153	tcp_unlink_write_queue(skb, sk);
				2154	sk_wmem_free_skb(sk, skb);
				2155	} else {
				2156	TCP_SKB_CB(nskb)->tcp_flags \|= TCP_SKB_CB(skb)->tcp_flags &
				2157	~(TCPHDR_FIN\|TCPHDR_PSH);
				2158	if (!skb_shinfo(skb)->nr_frags) {
				2159	skb_pull(skb, copy);
				2160	} else {
				2161	__pskb_trim_head(skb, copy);
				2162	tcp_set_skb_tso_segs(skb, mss_now);
				2163	}
				2164	TCP_SKB_CB(skb)->seq += copy;
				2165	}
				2166
				2167	len += copy;
				2168
				2169	if (len >= probe_size)
				2170	break;
				2171	}
				2172	tcp_init_tso_segs(nskb, nskb->len);
				2173
				2174	/* We're ready to send. If this fails, the probe will
				2175	* be resegmented into mss-sized pieces by tcp_write_xmit().
				2176	*/
				2177	if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
				2178	/* Decrement cwnd here because we are sending
				2179	* effectively two packets. */
				2180	tp->snd_cwnd--;
				2181	tcp_event_new_data_sent(sk, nskb);
				2182
				2183	icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
				2184	tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
				2185	tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
				2186
				2187	return 1;
				2188	}
				2189
				2190	return -1;
				2191	}
				2192
				2193	static bool tcp_pacing_check(const struct sock *sk)
				2194	{
				2195	return tcp_needs_internal_pacing(sk) &&
				2196	hrtimer_is_queued(&tcp_sk(sk)->pacing_timer);
				2197	}
				2198
				2199	/* TCP Small Queues :
				2200	* Control number of packets in qdisc/devices to two packets / or ~1 ms.
				2201	* (These limits are doubled for retransmits)
				2202	* This allows for :
				2203	* - better RTT estimation and ACK scheduling
				2204	* - faster recovery
				2205	* - high rates
				2206	* Alas, some drivers / subsystems require a fair amount
				2207	* of queued bytes to ensure line rate.
				2208	* One example is wifi aggregation (802.11 AMPDU)
				2209	*/
				2210	static bool tcp_small_queue_check(struct sock sk, const struct sk_buff skb,
				2211	unsigned int factor)
				2212	{
				2213	unsigned int limit;
				2214
				2215	limit = max(2 * skb->truesize, sk->sk_pacing_rate >> sk->sk_pacing_shift);
				2216	limit = min_t(u32, limit,
				2217	sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
				2218	limit <<= factor;
				2219
				2220	if (refcount_read(&sk->sk_wmem_alloc) > limit) {
				2221	/* Always send skb if rtx queue is empty.
				2222	* No need to wait for TX completion to call us back,
				2223	* after softirq/tasklet schedule.
				2224	* This helps when TX completions are delayed too much.
				2225	*/
				2226	if (tcp_rtx_queue_empty(sk))
				2227	return false;
				2228
				2229	set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
				2230	/* It is possible TX completion already happened
				2231	* before we set TSQ_THROTTLED, so we must
				2232	* test again the condition.
				2233	*/
				2234	smp_mb__after_atomic();
				2235	if (refcount_read(&sk->sk_wmem_alloc) > limit)
				2236	return true;
				2237	}
				2238	return false;
				2239	}
				2240
				2241	static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new)
				2242	{
				2243	const u32 now = tcp_jiffies32;
				2244	enum tcp_chrono old = tp->chrono_type;
				2245
				2246	if (old > TCP_CHRONO_UNSPEC)
				2247	tp->chrono_stat[old - 1] += now - tp->chrono_start;
				2248	tp->chrono_start = now;
				2249	tp->chrono_type = new;
				2250	}
				2251
				2252	void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type)
				2253	{
				2254	struct tcp_sock *tp = tcp_sk(sk);
				2255
				2256	/* If there are multiple conditions worthy of tracking in a
				2257	* chronograph then the highest priority enum takes precedence
				2258	* over the other conditions. So that if something "more interesting"
				2259	* starts happening, stop the previous chrono and start a new one.
				2260	*/
				2261	if (type > tp->chrono_type)
				2262	tcp_chrono_set(tp, type);
				2263	}
				2264
				2265	void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
				2266	{
				2267	struct tcp_sock *tp = tcp_sk(sk);
				2268
				2269
				2270	/* There are multiple conditions worthy of tracking in a
				2271	* chronograph, so that the highest priority enum takes
				2272	* precedence over the other conditions (see tcp_chrono_start).
				2273	* If a condition stops, we only stop chrono tracking if
				2274	* it's the "most interesting" or current chrono we are
				2275	* tracking and starts busy chrono if we have pending data.
				2276	*/
				2277	if (tcp_rtx_and_write_queues_empty(sk))
				2278	tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
				2279	else if (type == tp->chrono_type)
				2280	tcp_chrono_set(tp, TCP_CHRONO_BUSY);
				2281	}
				2282
				2283	/* This routine writes packets to the network. It advances the
				2284	* send_head. This happens as incoming acks open up the remote
				2285	* window for us.
				2286	*
				2287	* LARGESEND note: !tcp_urg_mode is overkill, only frames between
				2288	* snd_up-64k-mss .. snd_up cannot be large. However, taking into
				2289	* account rare use of URG, this is not a big flaw.
				2290	*
				2291	* Send at most one packet when push_one > 0. Temporarily ignore
				2292	* cwnd limit to force at most one packet out when push_one == 2.
				2293
				2294	* Returns true, if no segments are in flight and we have queued segments,
				2295	* but cannot send anything now because of SWS or another problem.
				2296	*/
				2297	static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
				2298	int push_one, gfp_t gfp)
				2299	{
				2300	struct tcp_sock *tp = tcp_sk(sk);
				2301	struct sk_buff *skb;
				2302	unsigned int tso_segs, sent_pkts;
				2303	int cwnd_quota;
				2304	int result;
				2305	bool is_cwnd_limited = false, is_rwnd_limited = false;
				2306	u32 max_segs;
				2307
				2308	sent_pkts = 0;
				2309
				2310	tcp_mstamp_refresh(tp);
				2311	if (!push_one) {
				2312	/* Do MTU probing. */
				2313	result = tcp_mtu_probe(sk);
				2314	if (!result) {
				2315	return false;
				2316	} else if (result > 0) {
				2317	sent_pkts = 1;
				2318	}
				2319	}
				2320
				2321	max_segs = tcp_tso_segs(sk, mss_now);
				2322	while ((skb = tcp_send_head(sk))) {
				2323	unsigned int limit;
				2324
				2325	if (tcp_pacing_check(sk))
				2326	break;
				2327
				2328	tso_segs = tcp_init_tso_segs(skb, mss_now);
				2329	BUG_ON(!tso_segs);
				2330
				2331	if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
				2332	/* "skb_mstamp" is used as a start point for the retransmit timer */
				2333	tcp_update_skb_after_send(tp, skb);
				2334	goto repair; /* Skip network transmission */
				2335	}
				2336
				2337	cwnd_quota = tcp_cwnd_test(tp, skb);
				2338	if (!cwnd_quota) {
				2339	if (push_one == 2)
				2340	/* Force out a loss probe pkt. */
				2341	cwnd_quota = 1;
				2342	else
				2343	break;
				2344	}
				2345
				2346	if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
				2347	is_rwnd_limited = true;
				2348	break;
				2349	}
				2350
				2351	if (tso_segs == 1) {
				2352	if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
				2353	(tcp_skb_is_last(sk, skb) ?
				2354	nonagle : TCP_NAGLE_PUSH))))
				2355	break;
				2356	} else {
				2357	if (!push_one &&
				2358	tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
				2359	&is_rwnd_limited, max_segs))
				2360	break;
				2361	}
				2362
				2363	limit = mss_now;
				2364	if (tso_segs > 1 && !tcp_urg_mode(tp))
				2365	limit = tcp_mss_split_point(sk, skb, mss_now,
				2366	min_t(unsigned int,
				2367	cwnd_quota,
				2368	max_segs),
				2369	nonagle);
				2370
				2371	if (skb->len > limit &&
				2372	unlikely(tso_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
				2373	skb, limit, mss_now, gfp)))
				2374	break;
				2375
				2376	if (tcp_small_queue_check(sk, skb, 0))
				2377	break;
				2378
				2379	/* Argh, we hit an empty skb(), presumably a thread
				2380	* is sleeping in sendmsg()/sk_stream_wait_memory().
				2381	* We do not want to send a pure-ack packet and have
				2382	* a strange looking rtx queue with empty packet(s).
				2383	*/
				2384	if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq)
				2385	break;
				2386
				2387	if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
				2388	break;
				2389
				2390	repair:
				2391	/* Advance the send_head. This one is sent out.
				2392	* This call will increment packets_out.
				2393	*/
				2394	tcp_event_new_data_sent(sk, skb);
				2395
				2396	tcp_minshall_update(tp, mss_now, skb);
				2397	sent_pkts += tcp_skb_pcount(skb);
				2398
				2399	if (push_one)
				2400	break;
				2401	}
				2402
				2403	if (is_rwnd_limited)
				2404	tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED);
				2405	else
				2406	tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED);
				2407
				2408	if (likely(sent_pkts)) {
				2409	if (tcp_in_cwnd_reduction(sk))
				2410	tp->prr_out += sent_pkts;
				2411
				2412	/* Send one loss probe per tail loss episode. */
				2413	if (push_one != 2)
				2414	tcp_schedule_loss_probe(sk, false);
				2415	is_cwnd_limited \|= (tcp_packets_in_flight(tp) >= tp->snd_cwnd);
				2416	tcp_cwnd_validate(sk, is_cwnd_limited);
				2417	return false;
				2418	}
				2419	return !tp->packets_out && !tcp_write_queue_empty(sk);
				2420	}
				2421
				2422	bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
				2423	{
				2424	struct inet_connection_sock *icsk = inet_csk(sk);
				2425	struct tcp_sock *tp = tcp_sk(sk);
				2426	u32 timeout, rto_delta_us;
				2427	int early_retrans;
				2428
				2429	/* Don't do any loss probe on a Fast Open connection before 3WHS
				2430	* finishes.
				2431	*/
				2432	if (tp->fastopen_rsk)
				2433	return false;
				2434
				2435	early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans;
				2436	/* Schedule a loss probe in 2*RTT for SACK capable connections
				2437	* not in loss recovery, that are either limited by cwnd or application.
				2438	*/
				2439	if ((early_retrans != 3 && early_retrans != 4) \|\|
				2440	!tp->packets_out \|\| !tcp_is_sack(tp) \|\|
				2441	(icsk->icsk_ca_state != TCP_CA_Open &&
				2442	icsk->icsk_ca_state != TCP_CA_CWR))
				2443	return false;
				2444
				2445	/* Probe timeout is 2*rtt. Add minimum RTO to account
				2446	* for delayed ack when there's one outstanding packet. If no RTT
				2447	* sample is available then probe after TCP_TIMEOUT_INIT.
				2448	*/
				2449	if (tp->srtt_us) {
				2450	timeout = usecs_to_jiffies(tp->srtt_us >> 2);
				2451	if (tp->packets_out == 1)
				2452	timeout += TCP_RTO_MIN;
				2453	else
				2454	timeout += TCP_TIMEOUT_MIN;
				2455	} else {
				2456	timeout = TCP_TIMEOUT_INIT;
				2457	}
				2458
				2459	/* If the RTO formula yields an earlier time, then use that time. */
				2460	rto_delta_us = advancing_rto ?
				2461	jiffies_to_usecs(inet_csk(sk)->icsk_rto) :
				2462	tcp_rto_delta_us(sk); /* How far in future is RTO? */
				2463	if (rto_delta_us > 0)
				2464	timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
				2465
				2466	inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
				2467	TCP_RTO_MAX);
				2468	return true;
				2469	}
				2470
				2471	/* Thanks to skb fast clones, we can detect if a prior transmit of
				2472	* a packet is still in a qdisc or driver queue.
				2473	* In this case, there is very little point doing a retransmit !
				2474	*/
				2475	static bool skb_still_in_host_queue(const struct sock *sk,
				2476	const struct sk_buff *skb)
				2477	{
				2478	if (unlikely(skb_fclone_busy(sk, skb))) {
				2479	NET_INC_STATS(sock_net(sk),
				2480	LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
				2481	return true;
				2482	}
				2483	return false;
				2484	}
				2485
				2486	/* When probe timeout (PTO) fires, try send a new segment if possible, else
				2487	* retransmit the last segment.
				2488	*/
				2489	void tcp_send_loss_probe(struct sock *sk)
				2490	{
				2491	struct tcp_sock *tp = tcp_sk(sk);
				2492	struct sk_buff *skb;
				2493	int pcount;
				2494	int mss = tcp_current_mss(sk);
				2495
				2496	skb = tcp_send_head(sk);
				2497	if (skb && tcp_snd_wnd_test(tp, skb, mss)) {
				2498	pcount = tp->packets_out;
				2499	tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
				2500	if (tp->packets_out > pcount)
				2501	goto probe_sent;
				2502	goto rearm_timer;
				2503	}
				2504	skb = skb_rb_last(&sk->tcp_rtx_queue);
				2505	if (unlikely(!skb)) {
				2506	WARN_ONCE(tp->packets_out,
				2507	"invalid inflight: %u state %u cwnd %u mss %d\n",
				2508	tp->packets_out, sk->sk_state, tp->snd_cwnd, mss);
				2509	inet_csk(sk)->icsk_pending = 0;
				2510	return;
				2511	}
				2512
				2513	/* At most one outstanding TLP retransmission. */
				2514	if (tp->tlp_high_seq)
				2515	goto rearm_timer;
				2516
				2517	if (skb_still_in_host_queue(sk, skb))
				2518	goto rearm_timer;
				2519
				2520	pcount = tcp_skb_pcount(skb);
				2521	if (WARN_ON(!pcount))
				2522	goto rearm_timer;
				2523
				2524	if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
				2525	if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
				2526	(pcount - 1) * mss, mss,
				2527	GFP_ATOMIC)))
				2528	goto rearm_timer;
				2529	skb = skb_rb_next(skb);
				2530	}
				2531
				2532	if (WARN_ON(!skb \|\| !tcp_skb_pcount(skb)))
				2533	goto rearm_timer;
				2534
				2535	if (__tcp_retransmit_skb(sk, skb, 1))
				2536	goto rearm_timer;
				2537
				2538	/* Record snd_nxt for loss detection. */
				2539	tp->tlp_high_seq = tp->snd_nxt;
				2540
				2541	probe_sent:
				2542	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
				2543	/* Reset s.t. tcp_rearm_rto will restart timer from now */
				2544	inet_csk(sk)->icsk_pending = 0;
				2545	rearm_timer:
				2546	tcp_rearm_rto(sk);
				2547	}
				2548
				2549	/* Push out any pending frames which were held back due to
				2550	* TCP_CORK or attempt at coalescing tiny packets.
				2551	* The socket must be locked by the caller.
				2552	*/
				2553	void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
				2554	int nonagle)
				2555	{
				2556	/* If we are closed, the bytes will have to remain here.
				2557	* In time closedown will finish, we empty the write queue and
				2558	* all will be happy.
				2559	*/
				2560	if (unlikely(sk->sk_state == TCP_CLOSE))
				2561	return;
				2562
				2563	if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
				2564	sk_gfp_mask(sk, GFP_ATOMIC)))
				2565	tcp_check_probe_timer(sk);
				2566	}
				2567
				2568	/* Send _single_ skb sitting at the send head. This function requires
				2569	* true push pending frames to setup probe timer etc.
				2570	*/
				2571	void tcp_push_one(struct sock *sk, unsigned int mss_now)
				2572	{
				2573	struct sk_buff *skb = tcp_send_head(sk);
				2574
				2575	BUG_ON(!skb \|\| skb->len < mss_now);
				2576
				2577	tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
				2578	}
				2579
				2580	/* This function returns the amount that we can raise the
				2581	* usable window based on the following constraints
				2582	*
				2583	* 1. The window can never be shrunk once it is offered (RFC 793)
				2584	* 2. We limit memory per socket
				2585	*
				2586	* RFC 1122:
				2587	* "the suggested [SWS] avoidance algorithm for the receiver is to keep
				2588	* RECV.NEXT + RCV.WIN fixed until:
				2589	* RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
				2590	*
				2591	* i.e. don't raise the right edge of the window until you can raise
				2592	* it at least MSS bytes.
				2593	*
				2594	* Unfortunately, the recommended algorithm breaks header prediction,
				2595	* since header prediction assumes th->window stays fixed.
				2596	*
				2597	* Strictly speaking, keeping th->window fixed violates the receiver
				2598	* side SWS prevention criteria. The problem is that under this rule
				2599	* a stream of single byte packets will cause the right side of the
				2600	* window to always advance by a single byte.
				2601	*
				2602	* Of course, if the sender implements sender side SWS prevention
				2603	* then this will not be a problem.
				2604	*
				2605	* BSD seems to make the following compromise:
				2606	*
				2607	* If the free space is less than the 1/4 of the maximum
				2608	* space available and the free space is less than 1/2 mss,
				2609	* then set the window to 0.
				2610	* [ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
				2611	* Otherwise, just prevent the window from shrinking
				2612	* and from being larger than the largest representable value.
				2613	*
				2614	* This prevents incremental opening of the window in the regime
				2615	* where TCP is limited by the speed of the reader side taking
				2616	* data out of the TCP receive queue. It does nothing about
				2617	* those cases where the window is constrained on the sender side
				2618	* because the pipeline is full.
				2619	*
				2620	* BSD also seems to "accidentally" limit itself to windows that are a
				2621	* multiple of MSS, at least until the free space gets quite small.
				2622	* This would appear to be a side effect of the mbuf implementation.
				2623	* Combining these two algorithms results in the observed behavior
				2624	* of having a fixed window size at almost all times.
				2625	*
				2626	* Below we obtain similar behavior by forcing the offered window to
				2627	* a multiple of the mss when it is feasible to do so.
				2628	*
				2629	* Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
				2630	* Regular options like TIMESTAMP are taken into account.
				2631	*/
				2632	u32 __tcp_select_window(struct sock *sk)
				2633	{
				2634	struct inet_connection_sock *icsk = inet_csk(sk);
				2635	struct tcp_sock *tp = tcp_sk(sk);
				2636	/* MSS for the peer's data. Previous versions used mss_clamp
				2637	* here. I don't know if the value based on our guesses
				2638	* of peer's MSS is better for the performance. It's more correct
				2639	* but may be worse for the performance because of rcv_mss
				2640	* fluctuations. --SAW 1998/11/1
				2641	*/
				2642	int mss = icsk->icsk_ack.rcv_mss;
				2643	int free_space = tcp_space(sk);
				2644	int allowed_space = tcp_full_space(sk);
				2645	int full_space = min_t(int, tp->window_clamp, allowed_space);
				2646	int window;
				2647
				2648	if (unlikely(mss > full_space)) {
				2649	mss = full_space;
				2650	if (mss <= 0)
				2651	return 0;
				2652	}
				2653	if (free_space < (full_space >> 1)) {
				2654	icsk->icsk_ack.quick = 0;
				2655
				2656	if (tcp_under_memory_pressure(sk))
				2657	tp->rcv_ssthresh = min(tp->rcv_ssthresh,
				2658	4U * tp->advmss);
				2659
				2660	/* free_space might become our new window, make sure we don't
				2661	* increase it due to wscale.
				2662	*/
				2663	free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);
				2664
				2665	/* if free space is less than mss estimate, or is below 1/16th
				2666	* of the maximum allowed, try to move to zero-window, else
				2667	* tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and
				2668	* new incoming data is dropped due to memory limits.
				2669	* With large window, mss test triggers way too late in order
				2670	* to announce zero window in time before rmem limit kicks in.
				2671	*/
				2672	if (free_space < (allowed_space >> 4) \|\| free_space < mss)
				2673	return 0;
				2674	}
				2675
				2676	if (free_space > tp->rcv_ssthresh)
				2677	free_space = tp->rcv_ssthresh;
				2678
				2679	/* Don't do rounding if we are using window scaling, since the
				2680	* scaled window will not line up with the MSS boundary anyway.
				2681	*/
				2682	if (tp->rx_opt.rcv_wscale) {
				2683	window = free_space;
				2684
				2685	/* Advertise enough space so that it won't get scaled away.
				2686	* Import case: prevent zero window announcement if
				2687	* 1<<rcv_wscale > mss.
				2688	*/
				2689	window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale));
				2690	} else {
				2691	window = tp->rcv_wnd;
				2692	/* Get the largest window that is a nice multiple of mss.
				2693	* Window clamp already applied above.
				2694	* If our current window offering is within 1 mss of the
				2695	* free space we just keep it. This prevents the divide
				2696	* and multiply from happening most of the time.
				2697	* We also don't do any window rounding when the free space
				2698	* is too small.
				2699	*/
				2700	if (window <= free_space - mss \|\| window > free_space)
				2701	window = rounddown(free_space, mss);
				2702	else if (mss == full_space &&
				2703	free_space > window + (full_space >> 1))
				2704	window = free_space;
				2705	}
				2706
				2707	return window;
				2708	}
				2709
				2710	void tcp_skb_collapse_tstamp(struct sk_buff *skb,
				2711	const struct sk_buff *next_skb)
				2712	{
				2713	if (unlikely(tcp_has_tx_tstamp(next_skb))) {
				2714	const struct skb_shared_info *next_shinfo =
				2715	skb_shinfo(next_skb);
				2716	struct skb_shared_info *shinfo = skb_shinfo(skb);
				2717
				2718	shinfo->tx_flags \|= next_shinfo->tx_flags & SKBTX_ANY_TSTAMP;
				2719	shinfo->tskey = next_shinfo->tskey;
				2720	TCP_SKB_CB(skb)->txstamp_ack \|=
				2721	TCP_SKB_CB(next_skb)->txstamp_ack;
				2722	}
				2723	}
				2724
				2725	/* Collapses two adjacent SKB's during retransmission. */
				2726	static bool tcp_collapse_retrans(struct sock sk, struct sk_buff skb)
				2727	{
				2728	struct tcp_sock *tp = tcp_sk(sk);
				2729	struct sk_buff *next_skb = skb_rb_next(skb);
				2730	int next_skb_size;
				2731
				2732	next_skb_size = next_skb->len;
				2733
				2734	BUG_ON(tcp_skb_pcount(skb) != 1 \|\| tcp_skb_pcount(next_skb) != 1);
				2735
				2736	if (next_skb_size) {
				2737	if (next_skb_size <= skb_availroom(skb))
				2738	skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size),
				2739	next_skb_size);
				2740	else if (!tcp_skb_shift(skb, next_skb, 1, next_skb_size))
				2741	return false;
				2742	}
				2743	tcp_highest_sack_replace(sk, next_skb, skb);
				2744
				2745	/* Update sequence range on original skb. */
				2746	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
				2747
				2748	/* Merge over control information. This moves PSH/FIN etc. over */
				2749	TCP_SKB_CB(skb)->tcp_flags \|= TCP_SKB_CB(next_skb)->tcp_flags;
				2750
				2751	/* All done, get rid of second SKB and account for it so
				2752	* packet counting does not break.
				2753	*/
				2754	TCP_SKB_CB(skb)->sacked \|= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
				2755	TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor;
				2756
				2757	/* changed transmit queue under us so clear hints */
				2758	tcp_clear_retrans_hints_partial(tp);
				2759	if (next_skb == tp->retransmit_skb_hint)
				2760	tp->retransmit_skb_hint = skb;
				2761
				2762	tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb));
				2763
				2764	tcp_skb_collapse_tstamp(skb, next_skb);
				2765
				2766	tcp_rtx_queue_unlink_and_free(next_skb, sk);
				2767	return true;
				2768	}
				2769
				2770	/* Check if coalescing SKBs is legal. */
				2771	static bool tcp_can_collapse(const struct sock sk, const struct sk_buff skb)
				2772	{
				2773	if (tcp_skb_pcount(skb) > 1)
				2774	return false;
				2775	if (skb_cloned(skb))
				2776	return false;
				2777	/* Some heuristics for collapsing over SACK'd could be invented */
				2778	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
				2779	return false;
				2780
				2781	return true;
				2782	}
				2783
				2784	/* Collapse packets in the retransmit queue to make to create
				2785	* less packets on the wire. This is only done on retransmission.
				2786	*/
				2787	static void tcp_retrans_try_collapse(struct sock sk, struct sk_buff to,
				2788	int space)
				2789	{
				2790	struct tcp_sock *tp = tcp_sk(sk);
				2791	struct sk_buff skb = to, tmp;
				2792	bool first = true;
				2793
				2794	if (!sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse)
				2795	return;
				2796	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
				2797	return;
				2798
				2799	skb_rbtree_walk_from_safe(skb, tmp) {
				2800	if (!tcp_can_collapse(sk, skb))
				2801	break;
				2802
				2803	if (!tcp_skb_can_collapse_to(to))
				2804	break;
				2805
				2806	space -= skb->len;
				2807
				2808	if (first) {
				2809	first = false;
				2810	continue;
				2811	}
				2812
				2813	if (space < 0)
				2814	break;
				2815
				2816	if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
				2817	break;
				2818
				2819	if (!tcp_collapse_retrans(sk, to))
				2820	break;
				2821	}
				2822	}
				2823
				2824	/* This retransmits one SKB. Policy decisions and retransmit queue
				2825	* state updates are done by the caller. Returns non-zero if an
				2826	* error occurred which prevented the send.
				2827	*/
				2828	int __tcp_retransmit_skb(struct sock sk, struct sk_buff skb, int segs)
				2829	{
				2830	struct inet_connection_sock *icsk = inet_csk(sk);
				2831	struct tcp_sock *tp = tcp_sk(sk);
				2832	unsigned int cur_mss;
				2833	int diff, len, err;
				2834
				2835
				2836	/* Inconclusive MTU probe */
				2837	if (icsk->icsk_mtup.probe_size)
				2838	icsk->icsk_mtup.probe_size = 0;
				2839
				2840	/* Do not sent more than we queued. 1/4 is reserved for possible
				2841	* copying overhead: fragmentation, tunneling, mangling etc.
				2842	*/
				2843	if (refcount_read(&sk->sk_wmem_alloc) >
				2844	min_t(u32, sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2),
				2845	sk->sk_sndbuf))
				2846	return -EAGAIN;
				2847
				2848	if (skb_still_in_host_queue(sk, skb))
				2849	return -EBUSY;
				2850
				2851	if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
				2852	if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) {
				2853	WARN_ON_ONCE(1);
				2854	return -EINVAL;
				2855	}
				2856	if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
				2857	return -ENOMEM;
				2858	}
				2859
				2860	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
				2861	return -EHOSTUNREACH; /* Routing failure or similar. */
				2862
				2863	cur_mss = tcp_current_mss(sk);
				2864
				2865	/* If receiver has shrunk his window, and skb is out of
				2866	* new window, do not retransmit it. The exception is the
				2867	* case, when window is shrunk to zero. In this case
				2868	* our retransmit serves as a zero window probe.
				2869	*/
				2870	if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
				2871	TCP_SKB_CB(skb)->seq != tp->snd_una)
				2872	return -EAGAIN;
				2873
				2874	len = cur_mss * segs;
				2875	if (skb->len > len) {
				2876	if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len,
				2877	cur_mss, GFP_ATOMIC))
				2878	return -ENOMEM; /* We'll try again later. */
				2879	} else {
				2880	if (skb_unclone(skb, GFP_ATOMIC))
				2881	return -ENOMEM;
				2882
				2883	diff = tcp_skb_pcount(skb);
				2884	tcp_set_skb_tso_segs(skb, cur_mss);
				2885	diff -= tcp_skb_pcount(skb);
				2886	if (diff)
				2887	tcp_adjust_pcount(sk, skb, diff);
				2888	if (skb->len < cur_mss)
				2889	tcp_retrans_try_collapse(sk, skb, cur_mss);
				2890	}
				2891
				2892	/* RFC3168, section 6.1.1.1. ECN fallback */
				2893	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
				2894	tcp_ecn_clear_syn(sk, skb);
				2895
				2896	/* Update global and local TCP statistics. */
				2897	segs = tcp_skb_pcount(skb);
				2898	TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
				2899	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
				2900	__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
				2901	tp->total_retrans += segs;
				2902	tp->bytes_retrans += skb->len;
				2903
				2904	/* make sure skb->data is aligned on arches that require it
				2905	* and check if ack-trimming & collapsing extended the headroom
				2906	* beyond what csum_start can cover.
				2907	*/
				2908	if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) \|\|
				2909	skb_headroom(skb) >= 0xFFFF)) {
				2910	struct sk_buff *nskb;
				2911
				2912	tcp_skb_tsorted_save(skb) {
				2913	nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
				2914	err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
				2915	-ENOBUFS;
				2916	} tcp_skb_tsorted_restore(skb);
				2917
				2918	if (!err) {
				2919	tcp_update_skb_after_send(tp, skb);
				2920	tcp_rate_skb_sent(sk, skb);
				2921	}
				2922	} else {
				2923	err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
				2924	}
				2925
				2926	if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
				2927	tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB,
				2928	TCP_SKB_CB(skb)->seq, segs, err);
				2929
				2930	if (likely(!err)) {
				2931	TCP_SKB_CB(skb)->sacked \|= TCPCB_EVER_RETRANS;
				2932	trace_tcp_retransmit_skb(sk, skb);
				2933	} else if (err != -EBUSY) {
				2934	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
				2935	}
				2936	return err;
				2937	}
				2938
				2939	int tcp_retransmit_skb(struct sock sk, struct sk_buff skb, int segs)
				2940	{
				2941	struct tcp_sock *tp = tcp_sk(sk);
				2942	int err = __tcp_retransmit_skb(sk, skb, segs);
				2943
				2944	if (err == 0) {
				2945	#if FASTRETRANS_DEBUG > 0
				2946	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
				2947	net_dbg_ratelimited("retrans_out leaked\n");
				2948	}
				2949	#endif
				2950	TCP_SKB_CB(skb)->sacked \|= TCPCB_RETRANS;
				2951	tp->retrans_out += tcp_skb_pcount(skb);
				2952
				2953	/* Save stamp of the first retransmit. */
				2954	if (!tp->retrans_stamp)
				2955	tp->retrans_stamp = tcp_skb_timestamp(skb);
				2956
				2957	}
				2958
				2959	if (tp->undo_retrans < 0)
				2960	tp->undo_retrans = 0;
				2961	tp->undo_retrans += tcp_skb_pcount(skb);
				2962	return err;
				2963	}
				2964
				2965	/* This gets called after a retransmit timeout, and the initially
				2966	* retransmitted data is acknowledged. It tries to continue
				2967	* resending the rest of the retransmit queue, until either
				2968	* we've sent it all or the congestion window limit is reached.
				2969	*/
				2970	void tcp_xmit_retransmit_queue(struct sock *sk)
				2971	{
				2972	const struct inet_connection_sock *icsk = inet_csk(sk);
				2973	struct sk_buff skb, rtx_head, *hole = NULL;
				2974	struct tcp_sock *tp = tcp_sk(sk);
				2975	u32 max_segs;
				2976	int mib_idx;
				2977
				2978	if (!tp->packets_out)
				2979	return;
				2980
				2981	rtx_head = tcp_rtx_queue_head(sk);
				2982	skb = tp->retransmit_skb_hint ?: rtx_head;
				2983	max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
				2984	skb_rbtree_walk_from(skb) {
				2985	__u8 sacked;
				2986	int segs;
				2987
				2988	if (tcp_pacing_check(sk))
				2989	break;
				2990
				2991	/* we could do better than to assign each time */
				2992	if (!hole)
				2993	tp->retransmit_skb_hint = skb;
				2994
				2995	segs = tp->snd_cwnd - tcp_packets_in_flight(tp);
				2996	if (segs <= 0)
				2997	return;
				2998	sacked = TCP_SKB_CB(skb)->sacked;
				2999	/* In case tcp_shift_skb_data() have aggregated large skbs,
				3000	* we need to make sure not sending too bigs TSO packets
				3001	*/
				3002	segs = min_t(int, segs, max_segs);
				3003
				3004	if (tp->retrans_out >= tp->lost_out) {
				3005	break;
				3006	} else if (!(sacked & TCPCB_LOST)) {
				3007	if (!hole && !(sacked & (TCPCB_SACKED_RETRANS\|TCPCB_SACKED_ACKED)))
				3008	hole = skb;
				3009	continue;
				3010
				3011	} else {
				3012	if (icsk->icsk_ca_state != TCP_CA_Loss)
				3013	mib_idx = LINUX_MIB_TCPFASTRETRANS;
				3014	else
				3015	mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
				3016	}
				3017
				3018	if (sacked & (TCPCB_SACKED_ACKED\|TCPCB_SACKED_RETRANS))
				3019	continue;
				3020
				3021	if (tcp_small_queue_check(sk, skb, 1))
				3022	return;
				3023
				3024	if (tcp_retransmit_skb(sk, skb, segs))
				3025	return;
				3026
				3027	NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb));
				3028
				3029	if (tcp_in_cwnd_reduction(sk))
				3030	tp->prr_out += tcp_skb_pcount(skb);
				3031
				3032	if (skb == rtx_head &&
				3033	icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
				3034	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
				3035	inet_csk(sk)->icsk_rto,
				3036	TCP_RTO_MAX);
				3037	}
				3038	}
				3039
				3040	/* We allow to exceed memory limits for FIN packets to expedite
				3041	* connection tear down and (memory) recovery.
				3042	* Otherwise tcp_send_fin() could be tempted to either delay FIN
				3043	* or even be forced to close flow without any FIN.
				3044	* In general, we want to allow one skb per socket to avoid hangs
				3045	* with edge trigger epoll()
				3046	*/
				3047	void sk_forced_mem_schedule(struct sock *sk, int size)
				3048	{
				3049	int amt;
				3050
				3051	if (size <= sk->sk_forward_alloc)
				3052	return;
				3053	amt = sk_mem_pages(size);
				3054	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
				3055	sk_memory_allocated_add(sk, amt);
				3056
				3057	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
				3058	mem_cgroup_charge_skmem(sk->sk_memcg, amt);
				3059	}
				3060
				3061	/* Send a FIN. The caller locks the socket for us.
				3062	* We should try to send a FIN packet really hard, but eventually give up.
				3063	*/
				3064	void tcp_send_fin(struct sock *sk)
				3065	{
				3066	struct sk_buff skb, tskb = tcp_write_queue_tail(sk);
				3067	struct tcp_sock *tp = tcp_sk(sk);
				3068
				3069	/* Optimization, tack on the FIN if we have one skb in write queue and
				3070	* this skb was not yet sent, or we are under memory pressure.
				3071	* Note: in the latter case, FIN packet will be sent after a timeout,
				3072	* as TCP stack thinks it has already been transmitted.
				3073	*/
				3074	if (!tskb && tcp_under_memory_pressure(sk))
				3075	tskb = skb_rb_last(&sk->tcp_rtx_queue);
				3076
				3077	if (tskb) {
				3078	coalesce:
				3079	TCP_SKB_CB(tskb)->tcp_flags \|= TCPHDR_FIN;
				3080	TCP_SKB_CB(tskb)->end_seq++;
				3081	tp->write_seq++;
				3082	if (tcp_write_queue_empty(sk)) {
				3083	/* This means tskb was already sent.
				3084	* Pretend we included the FIN on previous transmit.
				3085	* We need to set tp->snd_nxt to the value it would have
				3086	* if FIN had been sent. This is because retransmit path
				3087	* does not change tp->snd_nxt.
				3088	*/
				3089	tp->snd_nxt++;
				3090	return;
				3091	}
				3092	} else {
				3093	skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
				3094	if (unlikely(!skb)) {
				3095	if (tskb)
				3096	goto coalesce;
				3097	return;
				3098	}
				3099	INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
				3100	skb_reserve(skb, MAX_TCP_HEADER);
				3101	sk_forced_mem_schedule(sk, skb->truesize);
				3102	/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
				3103	tcp_init_nondata_skb(skb, tp->write_seq,
				3104	TCPHDR_ACK \| TCPHDR_FIN);
				3105	tcp_queue_skb(sk, skb);
				3106	}
				3107	__tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
				3108	}
				3109
				3110	/* We get here when a process closes a file descriptor (either due to
				3111	* an explicit close() or as a byproduct of exit()'ing) and there
				3112	* was unread data in the receive queue. This behavior is recommended
				3113	* by RFC 2525, section 2.17. -DaveM
				3114	*/
				3115	void tcp_send_active_reset(struct sock *sk, gfp_t priority)
				3116	{
				3117	struct sk_buff *skb;
				3118
				3119	TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
				3120
				3121	/* NOTE: No TCP options attached and we never retransmit this. */
				3122	skb = alloc_skb(MAX_TCP_HEADER, priority);
				3123	if (!skb) {
				3124	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
				3125	return;
				3126	}
				3127
				3128	/* Reserve space for headers and prepare control bits. */
				3129	skb_reserve(skb, MAX_TCP_HEADER);
				3130	tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
				3131	TCPHDR_ACK \| TCPHDR_RST);
				3132	tcp_mstamp_refresh(tcp_sk(sk));
				3133	/* Send it off. */
				3134	if (tcp_transmit_skb(sk, skb, 0, priority))
				3135	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
				3136
				3137	/* skb of trace_tcp_send_reset() keeps the skb that caused RST,
				3138	* skb here is different to the troublesome skb, so use NULL
				3139	*/
				3140	trace_tcp_send_reset(sk, NULL);
				3141	}
				3142
				3143	/* Send a crossed SYN-ACK during socket establishment.
				3144	* WARNING: This routine must only be called when we have already sent
				3145	* a SYN packet that crossed the incoming SYN that caused this routine
				3146	* to get called. If this assumption fails then the initial rcv_wnd
				3147	* and rcv_wscale values will not be correct.
				3148	*/
				3149	int tcp_send_synack(struct sock *sk)
				3150	{
				3151	struct sk_buff *skb;
				3152
				3153	skb = tcp_rtx_queue_head(sk);
				3154	if (!skb \|\| !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
				3155	pr_err("%s: wrong queue state\n", __func__);
				3156	return -EFAULT;
				3157	}
				3158	if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
				3159	if (skb_cloned(skb)) {
				3160	struct sk_buff *nskb;
				3161
				3162	tcp_skb_tsorted_save(skb) {
				3163	nskb = skb_copy(skb, GFP_ATOMIC);
				3164	} tcp_skb_tsorted_restore(skb);
				3165	if (!nskb)
				3166	return -ENOMEM;
				3167	INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor);
				3168	tcp_rtx_queue_unlink_and_free(skb, sk);
				3169	__skb_header_release(nskb);
				3170	tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb);
				3171	sk->sk_wmem_queued += nskb->truesize;
				3172	sk_mem_charge(sk, nskb->truesize);
				3173	skb = nskb;
				3174	}
				3175
				3176	TCP_SKB_CB(skb)->tcp_flags \|= TCPHDR_ACK;
				3177	tcp_ecn_send_synack(sk, skb);
				3178	}
				3179	return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
				3180	}
				3181
				3182	/**
				3183	* tcp_make_synack - Prepare a SYN-ACK.
				3184	* sk: listener socket
				3185	* dst: dst entry attached to the SYNACK
				3186	* req: request_sock pointer
				3187	*
				3188	* Allocate one skb and build a SYNACK packet.
				3189	* @dst is consumed : Caller should not use it again.
				3190	*/
				3191	struct sk_buff tcp_make_synack(const struct sock sk, struct dst_entry *dst,
				3192	struct request_sock *req,
				3193	struct tcp_fastopen_cookie *foc,
				3194	enum tcp_synack_type synack_type)
				3195	{
				3196	struct inet_request_sock *ireq = inet_rsk(req);
				3197	const struct tcp_sock *tp = tcp_sk(sk);
				3198	struct tcp_md5sig_key *md5 = NULL;
				3199	struct tcp_out_options opts;
				3200	struct sk_buff *skb;
				3201	int tcp_header_size;
				3202	struct tcphdr *th;
				3203	int mss;
				3204
				3205	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
				3206	if (unlikely(!skb)) {
				3207	dst_release(dst);
				3208	return NULL;
				3209	}
				3210	/* Reserve space for headers. */
				3211	skb_reserve(skb, MAX_TCP_HEADER);
				3212
				3213	switch (synack_type) {
				3214	case TCP_SYNACK_NORMAL:
				3215	skb_set_owner_w(skb, req_to_sk(req));
				3216	break;
				3217	case TCP_SYNACK_COOKIE:
				3218	/* Under synflood, we do not attach skb to a socket,
				3219	* to avoid false sharing.
				3220	*/
				3221	break;
				3222	case TCP_SYNACK_FASTOPEN:
				3223	/* sk is a const pointer, because we want to express multiple
				3224	* cpu might call us concurrently.
				3225	* sk->sk_wmem_alloc in an atomic, we can promote to rw.
				3226	*/
				3227	skb_set_owner_w(skb, (struct sock *)sk);
				3228	break;
				3229	}
				3230	skb_dst_set(skb, dst);
				3231
				3232	mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
				3233
				3234	memset(&opts, 0, sizeof(opts));
				3235	#ifdef CONFIG_SYN_COOKIES
				3236	if (unlikely(req->cookie_ts))
				3237	skb->skb_mstamp = cookie_init_timestamp(req);
				3238	else
				3239	#endif
				3240	skb->skb_mstamp = tcp_clock_us();
				3241
				3242	#ifdef CONFIG_TCP_MD5SIG
				3243	rcu_read_lock();
				3244	md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
				3245	#endif
				3246	skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
				3247	tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
				3248	foc) + sizeof(*th);
				3249
				3250	skb_push(skb, tcp_header_size);
				3251	skb_reset_transport_header(skb);
				3252
				3253	th = (struct tcphdr *)skb->data;
				3254	memset(th, 0, sizeof(struct tcphdr));
				3255	th->syn = 1;
				3256	th->ack = 1;
				3257	tcp_ecn_make_synack(req, th);
				3258	th->source = htons(ireq->ir_num);
				3259	th->dest = ireq->ir_rmt_port;
				3260	skb->mark = ireq->ir_mark;
				3261	skb->ip_summed = CHECKSUM_PARTIAL;
				3262	th->seq = htonl(tcp_rsk(req)->snt_isn);
				3263	/* XXX data is queued and acked as is. No buffer/window check */
				3264	th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
				3265
				3266	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
				3267	th->window = htons(min(req->rsk_rcv_wnd, 65535U));
				3268	tcp_options_write((__be32 *)(th + 1), NULL, &opts);
				3269	th->doff = (tcp_header_size >> 2);
				3270	__TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
				3271
				3272	#ifdef CONFIG_TCP_MD5SIG
				3273	/* Okay, we have all we need - do the md5 hash if needed */
				3274	if (md5)
				3275	tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
				3276	md5, req_to_sk(req), skb);
				3277	rcu_read_unlock();
				3278	#endif
				3279
				3280	/* Do not fool tcpdump (if any), clean our debris */
				3281	skb->tstamp = 0;
				3282	return skb;
				3283	}
				3284	EXPORT_SYMBOL(tcp_make_synack);
				3285
				3286	static void tcp_ca_dst_init(struct sock sk, const struct dst_entry dst)
				3287	{
				3288	struct inet_connection_sock *icsk = inet_csk(sk);
				3289	const struct tcp_congestion_ops *ca;
				3290	u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
				3291
				3292	if (ca_key == TCP_CA_UNSPEC)
				3293	return;
				3294
				3295	rcu_read_lock();
				3296	ca = tcp_ca_find_key(ca_key);
				3297	if (likely(ca && try_module_get(ca->owner))) {
				3298	module_put(icsk->icsk_ca_ops->owner);
				3299	icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
				3300	icsk->icsk_ca_ops = ca;
				3301	}
				3302	rcu_read_unlock();
				3303	}
				3304
				3305	/* Do all connect socket setups that can be done AF independent. */
				3306	static void tcp_connect_init(struct sock *sk)
				3307	{
				3308	const struct dst_entry *dst = __sk_dst_get(sk);
				3309	struct tcp_sock *tp = tcp_sk(sk);
				3310	__u8 rcv_wscale;
				3311	u32 rcv_wnd;
				3312
				3313	/* We'll fix this up when we get a response from the other end.
				3314	* See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
				3315	*/
				3316	tp->tcp_header_len = sizeof(struct tcphdr);
				3317	if (sock_net(sk)->ipv4.sysctl_tcp_timestamps)
				3318	tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED;
				3319
				3320	#ifdef CONFIG_TCP_MD5SIG
				3321	if (tp->af_specific->md5_lookup(sk, sk))
				3322	tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
				3323	#endif
				3324
				3325	/* If user gave his TCP_MAXSEG, record it to clamp */
				3326	if (tp->rx_opt.user_mss)
				3327	tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
				3328	tp->max_window = 0;
				3329	tcp_mtup_init(sk);
				3330	tcp_sync_mss(sk, dst_mtu(dst));
				3331
				3332	tcp_ca_dst_init(sk, dst);
				3333
				3334	if (!tp->window_clamp)
				3335	tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
				3336	tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
				3337
				3338	tcp_initialize_rcv_mss(sk);
				3339
				3340	/* limit the window selection if the user enforce a smaller rx buffer */
				3341	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
				3342	(tp->window_clamp > tcp_full_space(sk) \|\| tp->window_clamp == 0))
				3343	tp->window_clamp = tcp_full_space(sk);
				3344
				3345	rcv_wnd = tcp_rwnd_init_bpf(sk);
				3346	if (rcv_wnd == 0)
				3347	rcv_wnd = dst_metric(dst, RTAX_INITRWND);
				3348
				3349	tcp_select_initial_window(sk, tcp_full_space(sk),
				3350	tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
				3351	&tp->rcv_wnd,
				3352	&tp->window_clamp,
				3353	sock_net(sk)->ipv4.sysctl_tcp_window_scaling,
				3354	&rcv_wscale,
				3355	rcv_wnd);
				3356
				3357	tp->rx_opt.rcv_wscale = rcv_wscale;
				3358	tp->rcv_ssthresh = tp->rcv_wnd;
				3359
				3360	sk->sk_err = 0;
				3361	sock_reset_flag(sk, SOCK_DONE);
				3362	tp->snd_wnd = 0;
				3363	tcp_init_wl(tp, 0);
				3364	tcp_write_queue_purge(sk);
				3365	tp->snd_una = tp->write_seq;
				3366	tp->snd_sml = tp->write_seq;
				3367	tp->snd_up = tp->write_seq;
				3368	tp->snd_nxt = tp->write_seq;
				3369
				3370	if (likely(!tp->repair))
				3371	tp->rcv_nxt = 0;
				3372	else
				3373	tp->rcv_tstamp = tcp_jiffies32;
				3374	tp->rcv_wup = tp->rcv_nxt;
				3375	tp->copied_seq = tp->rcv_nxt;
				3376
				3377	inet_csk(sk)->icsk_rto = tcp_timeout_init(sk);
				3378	inet_csk(sk)->icsk_retransmits = 0;
				3379	tcp_clear_retrans(tp);
				3380	}
				3381
				3382	static void tcp_connect_queue_skb(struct sock sk, struct sk_buff skb)
				3383	{
				3384	struct tcp_sock *tp = tcp_sk(sk);
				3385	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
				3386
				3387	tcb->end_seq += skb->len;
				3388	__skb_header_release(skb);
				3389	sk->sk_wmem_queued += skb->truesize;
				3390	sk_mem_charge(sk, skb->truesize);
				3391	tp->write_seq = tcb->end_seq;
				3392	tp->packets_out += tcp_skb_pcount(skb);
				3393	}
				3394
				3395	/* Build and send a SYN with data and (cached) Fast Open cookie. However,
				3396	* queue a data-only packet after the regular SYN, such that regular SYNs
				3397	* are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges
				3398	* only the SYN sequence, the data are retransmitted in the first ACK.
				3399	* If cookie is not cached or other error occurs, falls back to send a
				3400	* regular SYN with Fast Open cookie request option.
				3401	*/
				3402	static int tcp_send_syn_data(struct sock sk, struct sk_buff syn)
				3403	{
				3404	struct tcp_sock *tp = tcp_sk(sk);
				3405	struct tcp_fastopen_request *fo = tp->fastopen_req;
				3406	int space, err = 0;
				3407	struct sk_buff *syn_data;
				3408
				3409	tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */
				3410	if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie))
				3411	goto fallback;
				3412
				3413	/* MSS for SYN-data is based on cached MSS and bounded by PMTU and
				3414	* user-MSS. Reserve maximum option space for middleboxes that add
				3415	* private TCP options. The cost is reduced data space in SYN :(
				3416	*/
				3417	tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp);
				3418
				3419	space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
				3420	MAX_TCP_OPTION_SPACE;
				3421
				3422	space = min_t(size_t, space, fo->size);
				3423
				3424	/* limit to order-0 allocations */
				3425	space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER));
				3426
				3427	syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation, false);
				3428	if (!syn_data)
				3429	goto fallback;
				3430	syn_data->ip_summed = CHECKSUM_PARTIAL;
				3431	memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
				3432	if (space) {
				3433	int copied = copy_from_iter(skb_put(syn_data, space), space,
				3434	&fo->data->msg_iter);
				3435	if (unlikely(!copied)) {
				3436	tcp_skb_tsorted_anchor_cleanup(syn_data);
				3437	kfree_skb(syn_data);
				3438	goto fallback;
				3439	}
				3440	if (copied != space) {
				3441	skb_trim(syn_data, copied);
				3442	space = copied;
				3443	}
				3444	}
				3445	/* No more data pending in inet_wait_for_connect() */
				3446	if (space == fo->size)
				3447	fo->data = NULL;
				3448	fo->copied = space;
				3449
				3450	tcp_connect_queue_skb(sk, syn_data);
				3451	if (syn_data->len)
				3452	tcp_chrono_start(sk, TCP_CHRONO_BUSY);
				3453
				3454	err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
				3455
				3456	syn->skb_mstamp = syn_data->skb_mstamp;
				3457
				3458	/* Now full SYN+DATA was cloned and sent (or not),
				3459	* remove the SYN from the original skb (syn_data)
				3460	* we keep in write queue in case of a retransmit, as we
				3461	* also have the SYN packet (with no data) in the same queue.
				3462	*/
				3463	TCP_SKB_CB(syn_data)->seq++;
				3464	TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK \| TCPHDR_PSH;
				3465	if (!err) {
				3466	tp->syn_data = (fo->copied > 0);
				3467	tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data);
				3468	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
				3469	goto done;
				3470	}
				3471
				3472	/* data was not sent, put it in write_queue */
				3473	__skb_queue_tail(&sk->sk_write_queue, syn_data);
				3474	tp->packets_out -= tcp_skb_pcount(syn_data);
				3475
				3476	fallback:
				3477	/* Send a regular SYN with Fast Open cookie request option */
				3478	if (fo->cookie.len > 0)
				3479	fo->cookie.len = 0;
				3480	err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
				3481	if (err)
				3482	tp->syn_fastopen = 0;
				3483	done:
				3484	fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */
				3485	return err;
				3486	}
				3487
				/* Build the initial SYN for @sk and transmit it.
				 *
				 * Returns 0 once the SYN has been queued and the retransmit timer
				 * armed (also in repair mode, where no SYN is sent at all),
				 * -EHOSTUNREACH when the header cannot be rebuilt, -ENOBUFS when no
				 * skb can be allocated, and -ECONNREFUSED when the transmit path
				 * reports it.  Any other transmit error is not propagated.
				 */
				int tcp_connect(struct sock *sk)
				{
					struct tcp_sock *tp = tcp_sk(sk);
					struct sk_buff *syn;
					struct sk_buff *unsent;
					int rc;

					tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL);

					/* Routing failure or similar. */
					if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
						return -EHOSTUNREACH;

					tcp_connect_init(sk);

					if (unlikely(tp->repair)) {
						tcp_finish_connect(sk, NULL);
						return 0;
					}

					syn = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true);
					if (unlikely(!syn))
						return -ENOBUFS;

					tcp_init_nondata_skb(syn, tp->write_seq++, TCPHDR_SYN);
					tcp_mstamp_refresh(tp);
					tp->retrans_stamp = tcp_time_stamp(tp);
					tcp_connect_queue_skb(sk, syn);
					tcp_ecn_send_syn(sk, syn);
					tcp_rbtree_insert(&sk->tcp_rtx_queue, syn);

					/* With a Fast Open request pending, the SYN may carry data. */
					if (tp->fastopen_req)
						rc = tcp_send_syn_data(sk, syn);
					else
						rc = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
					if (rc == -ECONNREFUSED)
						return rc;

					/* snd_nxt is only moved after the transmit attempt so that
					 * this packet gets counted in tcpOutSegs.  If something is
					 * still waiting at the send head, point at its sequence instead.
					 */
					unsent = tcp_send_head(sk);
					if (unlikely(unsent)) {
						tp->snd_nxt = TCP_SKB_CB(unsent)->seq;
						tp->pushed_seq = TCP_SKB_CB(unsent)->seq;
					} else {
						tp->snd_nxt = tp->write_seq;
						tp->pushed_seq = tp->write_seq;
					}
					TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);

					/* Timer for repeating the SYN until an answer. */
					inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
								  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
					return 0;
				}
				EXPORT_SYMBOL(tcp_connect);
				3542
				3543	/* Send out a delayed ack, the caller does the policy checking
				3544	* to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
				3545	* for details.
				3546	*/
				3547	void tcp_send_delayed_ack(struct sock *sk)
				3548	{
				3549	struct inet_connection_sock *icsk = inet_csk(sk);
				3550	int ato = icsk->icsk_ack.ato;
				3551	unsigned long timeout;
				3552
				3553	if (ato > TCP_DELACK_MIN) {
				3554	const struct tcp_sock *tp = tcp_sk(sk);
				3555	int max_ato = HZ / 2;
				3556
				3557	if (icsk->icsk_ack.pingpong \|\|
				3558	(icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
				3559	max_ato = TCP_DELACK_MAX;
				3560
				3561	/* Slow path, intersegment interval is "high". */
				3562
				3563	/* If some rtt estimate is known, use it to bound delayed ack.
				3564	* Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
				3565	* directly.
				3566	*/
				3567	if (tp->srtt_us) {
				3568	int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3),
				3569	TCP_DELACK_MIN);
				3570
				3571	if (rtt < max_ato)
				3572	max_ato = rtt;
				3573	}
				3574
				3575	ato = min(ato, max_ato);
				3576	}
				3577
				3578	/* Stay within the limit we were given */
				3579	timeout = jiffies + ato;
				3580
				3581	/* Use new timeout only if there wasn't a older one earlier. */
				3582	if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
				3583	/* If delack timer was blocked or is about to expire,
				3584	* send ACK now.
				3585	*/
				3586	if (icsk->icsk_ack.blocked \|\|
				3587	time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
				3588	tcp_send_ack(sk);
				3589	return;
				3590	}
				3591
				3592	if (!time_before(timeout, icsk->icsk_ack.timeout))
				3593	timeout = icsk->icsk_ack.timeout;
				3594	}
				3595	icsk->icsk_ack.pending \|= ICSK_ACK_SCHED \| ICSK_ACK_TIMER;
				3596	icsk->icsk_ack.timeout = timeout;
				3597	sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
				3598	}
				3599
				3600	/* This routine sends an ack and also updates the window. */
				3601	void __tcp_send_ack(struct sock *sk, u32 rcv_nxt)
				3602	{
				3603	struct sk_buff *buff;
				3604
				3605	/* If we have been reset, we may not send again. */
				3606	if (sk->sk_state == TCP_CLOSE)
				3607	return;
				3608
				3609	/* We are not putting this on the write queue, so
				3610	* tcp_transmit_skb() will set the ownership to this
				3611	* sock.
				3612	*/
				3613	buff = alloc_skb(MAX_TCP_HEADER,
				3614	sk_gfp_mask(sk, GFP_ATOMIC \| __GFP_NOWARN));
				3615	if (unlikely(!buff)) {
				3616	inet_csk_schedule_ack(sk);
				3617	inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
				3618	inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
				3619	TCP_DELACK_MAX, TCP_RTO_MAX);
				3620	return;
				3621	}
				3622
				3623	/* Reserve space for headers and prepare control bits. */
				3624	skb_reserve(buff, MAX_TCP_HEADER);
				3625	tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);
				3626
				3627	/* We do not want pure acks influencing TCP Small Queues or fq/pacing
				3628	* too much.
				3629	* SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784
				3630	*/
				3631	skb_set_tcp_pure_ack(buff);
				3632
				3633	/* Send it off, this clears delayed acks for us. */
				3634	__tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt);
				3635	}
				3636	EXPORT_SYMBOL_GPL(__tcp_send_ack);
				3637
				3638	void tcp_send_ack(struct sock *sk)
				3639	{
				3640	__tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt);
				3641	}
				3642
				3643	/* This routine sends a packet with an out of date sequence
				3644	* number. It assumes the other end will try to ack it.
				3645	*
				3646	* Question: what should we make while urgent mode?
				3647	* 4.4BSD forces sending single byte of data. We cannot send
				3648	* out of window data, because we have SND.NXT==SND.MAX...
				3649	*
				3650	* Current solution: to send TWO zero-length segments in urgent mode:
				3651	* one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
				3652	* out-of-date with SND.UNA-1 to probe window.
				3653	*/
				3654	static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
				3655	{
				3656	struct tcp_sock *tp = tcp_sk(sk);
				3657	struct sk_buff *skb;
				3658
				3659	/* We don't queue it, tcp_transmit_skb() sets ownership. */
				3660	skb = alloc_skb(MAX_TCP_HEADER,
				3661	sk_gfp_mask(sk, GFP_ATOMIC \| __GFP_NOWARN));
				3662	if (!skb)
				3663	return -1;
				3664
				3665	/* Reserve space for headers and set control bits. */
				3666	skb_reserve(skb, MAX_TCP_HEADER);
				3667	/* Use a previous sequence. This should cause the other
				3668	* end to send an ack. Don't queue or clone SKB, just
				3669	* send it.
				3670	*/
				3671	tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
				3672	NET_INC_STATS(sock_net(sk), mib);
				3673	return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0);
				3674	}
				3675
				3676	/* Called from setsockopt( ... TCP_REPAIR ) */
				3677	void tcp_send_window_probe(struct sock *sk)
				3678	{
				3679	if (sk->sk_state == TCP_ESTABLISHED) {
				3680	tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
				3681	tcp_mstamp_refresh(tcp_sk(sk));
				3682	tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE);
				3683	}
				3684	}
				3685
				3686	/* Initiate keepalive or window probe from timer. */
				3687	int tcp_write_wakeup(struct sock *sk, int mib)
				3688	{
				3689	struct tcp_sock *tp = tcp_sk(sk);
				3690	struct sk_buff *skb;
				3691
				3692	if (sk->sk_state == TCP_CLOSE)
				3693	return -1;
				3694
				3695	skb = tcp_send_head(sk);
				3696	if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
				3697	int err;
				3698	unsigned int mss = tcp_current_mss(sk);
				3699	unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
				3700
				3701	if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
				3702	tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
				3703
				3704	/* We are probing the opening of a window
				3705	* but the window size is != 0
				3706	* must have been a result SWS avoidance ( sender )
				3707	*/
				3708	if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq \|\|
				3709	skb->len > mss) {
				3710	seg_size = min(seg_size, mss);
				3711	TCP_SKB_CB(skb)->tcp_flags \|= TCPHDR_PSH;
				3712	if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
				3713	skb, seg_size, mss, GFP_ATOMIC))
				3714	return -1;
				3715	} else if (!tcp_skb_pcount(skb))
				3716	tcp_set_skb_tso_segs(skb, mss);
				3717
				3718	TCP_SKB_CB(skb)->tcp_flags \|= TCPHDR_PSH;
				3719	err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
				3720	if (!err)
				3721	tcp_event_new_data_sent(sk, skb);
				3722	return err;
				3723	} else {
				3724	if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
				3725	tcp_xmit_probe_skb(sk, 1, mib);
				3726	return tcp_xmit_probe_skb(sk, 0, mib);
				3727	}
				3728	}
				3729
				3730	/* A window probe timeout has occurred. If window is not closed send
				3731	* a partial packet else a zero probe.
				3732	*/
				3733	void tcp_send_probe0(struct sock *sk)
				3734	{
				3735	struct inet_connection_sock *icsk = inet_csk(sk);
				3736	struct tcp_sock *tp = tcp_sk(sk);
				3737	struct net *net = sock_net(sk);
				3738	unsigned long probe_max;
				3739	int err;
				3740
				3741	err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
				3742
				3743	if (tp->packets_out \|\| tcp_write_queue_empty(sk)) {
				3744	/* Cancel probe timer, if it is not required. */
				3745	icsk->icsk_probes_out = 0;
				3746	icsk->icsk_backoff = 0;
				3747	return;
				3748	}
				3749
				3750	if (err <= 0) {
				3751	if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2)
				3752	icsk->icsk_backoff++;
				3753	icsk->icsk_probes_out++;
				3754	probe_max = TCP_RTO_MAX;
				3755	} else {
				3756	/* If packet was not sent due to local congestion,
				3757	* do not backoff and do not remember icsk_probes_out.
				3758	* Let local senders to fight for local resources.
				3759	*
				3760	* Use accumulated backoff yet.
				3761	*/
				3762	if (!icsk->icsk_probes_out)
				3763	icsk->icsk_probes_out = 1;
				3764	probe_max = TCP_RESOURCE_PROBE_INTERVAL;
				3765	}
				3766	inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
				3767	tcp_probe0_when(sk, probe_max),
				3768	TCP_RTO_MAX);
				3769	}
				3770
				3771	int tcp_rtx_synack(const struct sock sk, struct request_sock req)
				3772	{
				3773	const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
				3774	struct flowi fl;
				3775	int res;
				3776
				3777	tcp_rsk(req)->txhash = net_tx_rndhash();
				3778	res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL);
				3779	if (!res) {
				3780	__TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
				3781	__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
				3782	if (unlikely(tcp_passive_fastopen(sk)))
				3783	tcp_sk(sk)->total_retrans++;
				3784	trace_tcp_retransmit_synack(sk, req);
				3785	}
				3786	return res;
				3787	}
				3788	EXPORT_SYMBOL(tcp_rtx_synack);