Blame - src/kernel/linux/v4.14/net/ipv4/tcp_output.c - T103

blob: 06eab04e949107642a180e550dfaadad64cf7797 [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame^]	1	/*
				2	* INET An implementation of the TCP/IP protocol suite for the LINUX
				3	* operating system. INET is implemented using the BSD Socket
				4	* interface as the means of communication with the user level.
				5	*
				6	* Implementation of the Transmission Control Protocol(TCP).
				7	*
				8	* Authors: Ross Biro
				9	* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
				10	* Mark Evans, <evansmp@uhura.aston.ac.uk>
				11	* Corey Minyard <wf-rch!minyard@relay.EU.net>
				12	* Florian La Roche, <flla@stud.uni-sb.de>
				13	* Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
				14	* Linus Torvalds, <torvalds@cs.helsinki.fi>
				15	* Alan Cox, <gw4pts@gw4pts.ampr.org>
				16	* Matthew Dillon, <dillon@apollo.west.oic.com>
				17	* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
				18	* Jorge Cwik, <jorge@laser.satlink.net>
				19	*/
				20
				21	/*
				22	* Changes: Pedro Roque : Retransmit queue handled by TCP.
				23	* : Fragmentation on mtu decrease
				24	* : Segment collapse on retransmit
				25	* : AF independence
				26	*
				27	* Linus Torvalds : send_delayed_ack
				28	* David S. Miller : Charge memory using the right skb
				29	* during syn/ack processing.
				30	* David S. Miller : Output engine completely rewritten.
				31	* Andrea Arcangeli: SYNACK carry ts_recent in tsecr.
				32	* Cacophonix Gaul : draft-minshall-nagle-01
				33	* J Hadi Salim : ECN support
				34	*
				35	*/
				36
				37	#define pr_fmt(fmt) "TCP: " fmt
				38
				39	#include <net/tcp.h>
				40
				41	#include <linux/compiler.h>
				42	#include <linux/gfp.h>
				43	#include <linux/module.h>
				44
				45	/* People can turn this off for buggy TCP's found in printers etc. */
				46	int sysctl_tcp_retrans_collapse __read_mostly = 1;
				47
				48	/* People can turn this on to work with those rare, broken TCPs that
				49	* interpret the window field as a signed quantity.
				50	*/
				51	int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
				52
				53	/* Default TSQ limit of four TSO segments */
				54	int sysctl_tcp_limit_output_bytes __read_mostly = 262144;
				55
				56	/* This limits the percentage of the congestion window which we
				57	* will allow a single TSO frame to consume. Building TSO frames
				58	* which are too large can cause TCP streams to be bursty.
				59	*/
				60	int sysctl_tcp_tso_win_divisor __read_mostly = 3;
				61
				62	/* By default, RFC2861 behavior. */
				63	int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
				64
				65	static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
				66	int push_one, gfp_t gfp);
				67
				68	/* Account for new data that has been sent to the network. */
				69	static void tcp_event_new_data_sent(struct sock sk, const struct sk_buff skb)
				70	{
				71	struct inet_connection_sock *icsk = inet_csk(sk);
				72	struct tcp_sock *tp = tcp_sk(sk);
				73	unsigned int prior_packets = tp->packets_out;
				74
				75	tcp_advance_send_head(sk, skb);
				76	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
				77
				78	tp->packets_out += tcp_skb_pcount(skb);
				79	if (!prior_packets \|\| icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
				80	tcp_rearm_rto(sk);
				81
				82	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
				83	tcp_skb_pcount(skb));
				84	}
				85
				86	/* SND.NXT, if window was not shrunk or the amount of shrunk was less than one
				87	* window scaling factor due to loss of precision.
				88	* If window has been shrunk, what should we make? It is not clear at all.
				89	* Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
				90	* Anything in between SND.UNA...SND.UNA+SND.WND also can be already
				91	* invalid. OK, let's make this for now:
				92	*/
				93	static inline __u32 tcp_acceptable_seq(const struct sock *sk)
				94	{
				95	const struct tcp_sock *tp = tcp_sk(sk);
				96
				97	if (!before(tcp_wnd_end(tp), tp->snd_nxt) \|\|
				98	(tp->rx_opt.wscale_ok &&
				99	((tp->snd_nxt - tcp_wnd_end(tp)) < (1 << tp->rx_opt.rcv_wscale))))
				100	return tp->snd_nxt;
				101	else
				102	return tcp_wnd_end(tp);
				103	}
				104
				105	/* Calculate mss to advertise in SYN segment.
				106	* RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
				107	*
				108	* 1. It is independent of path mtu.
				109	* 2. Ideally, it is maximal possible segment size i.e. 65535-40.
				110	* 3. For IPv4 it is reasonable to calculate it from maximal MTU of
				111	* attached devices, because some buggy hosts are confused by
				112	* large MSS.
				113	* 4. We do not make 3, we advertise MSS, calculated from first
				114	* hop device mtu, but allow to raise it to ip_rt_min_advmss.
				115	* This may be overridden via information stored in routing table.
				116	* 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
				117	* probably even Jumbo".
				118	*/
				119	static __u16 tcp_advertise_mss(struct sock *sk)
				120	{
				121	struct tcp_sock *tp = tcp_sk(sk);
				122	const struct dst_entry *dst = __sk_dst_get(sk);
				123	int mss = tp->advmss;
				124
				125	if (dst) {
				126	unsigned int metric = dst_metric_advmss(dst);
				127
				128	if (metric < mss) {
				129	mss = metric;
				130	tp->advmss = mss;
				131	}
				132	}
				133
				134	return (__u16)mss;
				135	}
				136
				137	/* RFC2861. Reset CWND after idle period longer RTO to "restart window".
				138	* This is the first part of cwnd validation mechanism.
				139	*/
				140	void tcp_cwnd_restart(struct sock *sk, s32 delta)
				141	{
				142	struct tcp_sock *tp = tcp_sk(sk);
				143	u32 restart_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
				144	u32 cwnd = tp->snd_cwnd;
				145
				146	tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
				147
				148	tp->snd_ssthresh = tcp_current_ssthresh(sk);
				149	restart_cwnd = min(restart_cwnd, cwnd);
				150
				151	while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
				152	cwnd >>= 1;
				153	tp->snd_cwnd = max(cwnd, restart_cwnd);
				154	tp->snd_cwnd_stamp = tcp_jiffies32;
				155	tp->snd_cwnd_used = 0;
				156	}
				157
				158	/* Congestion state accounting after a packet has been sent. */
				159	static void tcp_event_data_sent(struct tcp_sock *tp,
				160	struct sock *sk)
				161	{
				162	struct inet_connection_sock *icsk = inet_csk(sk);
				163	const u32 now = tcp_jiffies32;
				164
				165	if (tcp_packets_in_flight(tp) == 0)
				166	tcp_ca_event(sk, CA_EVENT_TX_START);
				167
				168	tp->lsndtime = now;
				169
				170	/* If it is a reply for ato after last received
				171	* packet, enter pingpong mode.
				172	*/
				173	if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
				174	icsk->icsk_ack.pingpong = 1;
				175	}
				176
				177	/* Account for an ACK we sent. */
				178	static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
				179	u32 rcv_nxt)
				180	{
				181	struct tcp_sock *tp = tcp_sk(sk);
				182
				183	if (unlikely(rcv_nxt != tp->rcv_nxt))
				184	return; /* Special ACK sent by DCTCP to reflect ECN */
				185	tcp_dec_quickack_mode(sk, pkts);
				186	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
				187	}
				188
				189
				190	u32 tcp_default_init_rwnd(u32 mss)
				191	{
				192	/* Initial receive window should be twice of TCP_INIT_CWND to
				193	* enable proper sending of new unsent data during fast recovery
				194	* (RFC 3517, Section 4, NextSeg() rule (2)). Further place a
				195	* limit when mss is larger than 1460.
				196	*/
				197	u32 init_rwnd = sysctl_tcp_default_init_rwnd;
				198
				199	if (mss > 1460)
				200	init_rwnd = max((1460 * init_rwnd) / mss, 2U);
				201	return init_rwnd;
				202	}
				203
				204	/* Determine a window scaling and initial window to offer.
				205	* Based on the assumption that the given amount of space
				206	* will be offered. Store the results in the tp structure.
				207	* NOTE: for smooth operation initial space offering should
				208	* be a multiple of mss if possible. We assume here that mss >= 1.
				209	* This MUST be enforced by all callers.
				210	*/
				211	void tcp_select_initial_window(int __space, __u32 mss,
				212	__u32 rcv_wnd, __u32 window_clamp,
				213	int wscale_ok, __u8 *rcv_wscale,
				214	__u32 init_rcv_wnd)
				215	{
				216	unsigned int space = (__space < 0 ? 0 : __space);
				217
				218	/* If no clamp set the clamp to the max possible scaled window */
				219	if (*window_clamp == 0)
				220	(*window_clamp) = (U16_MAX << TCP_MAX_WSCALE);
				221	space = min(*window_clamp, space);
				222
				223	/* Quantize space offering to a multiple of mss if possible. */
				224	if (space > mss)
				225	space = rounddown(space, mss);
				226
				227	/* NOTE: offering an initial window larger than 32767
				228	* will break some buggy TCP stacks. If the admin tells us
				229	* it is likely we could be speaking with such a buggy stack
				230	* we will truncate our initial window offering to 32K-1
				231	* unless the remote has sent us a window scaling option,
				232	* which we interpret as a sign the remote TCP is not
				233	* misinterpreting the window field as a signed quantity.
				234	*/
				235	if (sysctl_tcp_workaround_signed_windows)
				236	(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
				237	else
				238	(*rcv_wnd) = space;
				239
				240	(*rcv_wscale) = 0;
				241	if (wscale_ok) {
				242	/* Set window scaling on max possible window */
				243	space = max_t(u32, space, sysctl_tcp_rmem[2]);
				244	space = max_t(u32, space, sysctl_rmem_max);
				245	space = min_t(u32, space, *window_clamp);
				246	while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) {
				247	space >>= 1;
				248	(*rcv_wscale)++;
				249	}
				250	}
				251
				252	if (mss > (1 << *rcv_wscale)) {
				253	if (!init_rcv_wnd) /* Use default unless specified otherwise */
				254	init_rcv_wnd = tcp_default_init_rwnd(mss);
				255	rcv_wnd = min(rcv_wnd, init_rcv_wnd * mss);
				256	}
				257
				258	/* Set the clamp no higher than max representable value */
				259	(window_clamp) = min_t(__u32, U16_MAX << (rcv_wscale), *window_clamp);
				260	}
				261	EXPORT_SYMBOL(tcp_select_initial_window);
				262
				263	/* Chose a new window to advertise, update state in tcp_sock for the
				264	* socket, and return result with RFC1323 scaling applied. The return
				265	* value can be stuffed directly into th->window for an outgoing
				266	* frame.
				267	*/
				268	static u16 tcp_select_window(struct sock *sk)
				269	{
				270	struct tcp_sock *tp = tcp_sk(sk);
				271	u32 old_win = tp->rcv_wnd;
				272	u32 cur_win = tcp_receive_window(tp);
				273	u32 new_win = __tcp_select_window(sk);
				274
				275	/* Never shrink the offered window */
				276	if (new_win < cur_win) {
				277	/* Danger Will Robinson!
				278	* Don't update rcv_wup/rcv_wnd here or else
				279	* we will not be able to advertise a zero
				280	* window in time. --DaveM
				281	*
				282	* Relax Will Robinson.
				283	*/
				284	if (new_win == 0)
				285	NET_INC_STATS(sock_net(sk),
				286	LINUX_MIB_TCPWANTZEROWINDOWADV);
				287	new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
				288	}
				289	tp->rcv_wnd = new_win;
				290	tp->rcv_wup = tp->rcv_nxt;
				291
				292	/* Make sure we do not exceed the maximum possible
				293	* scaled window.
				294	*/
				295	if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows)
				296	new_win = min(new_win, MAX_TCP_WINDOW);
				297	else
				298	new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
				299
				300	/* RFC1323 scaling applied */
				301	new_win >>= tp->rx_opt.rcv_wscale;
				302
				303	/* If we advertise zero window, disable fast path. */
				304	if (new_win == 0) {
				305	tp->pred_flags = 0;
				306	if (old_win)
				307	NET_INC_STATS(sock_net(sk),
				308	LINUX_MIB_TCPTOZEROWINDOWADV);
				309	} else if (old_win == 0) {
				310	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV);
				311	}
				312
				313	return new_win;
				314	}
				315
				316	/* Packet ECN state for a SYN-ACK */
				317	static void tcp_ecn_send_synack(struct sock sk, struct sk_buff skb)
				318	{
				319	const struct tcp_sock *tp = tcp_sk(sk);
				320
				321	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
				322	if (!(tp->ecn_flags & TCP_ECN_OK))
				323	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
				324	else if (tcp_ca_needs_ecn(sk) \|\|
				325	tcp_bpf_ca_needs_ecn(sk))
				326	INET_ECN_xmit(sk);
				327	}
				328
				329	/* Packet ECN state for a SYN. */
				330	static void tcp_ecn_send_syn(struct sock sk, struct sk_buff skb)
				331	{
				332	struct tcp_sock *tp = tcp_sk(sk);
				333	bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
				334	bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 \|\|
				335	tcp_ca_needs_ecn(sk) \|\| bpf_needs_ecn;
				336
				337	if (!use_ecn) {
				338	const struct dst_entry *dst = __sk_dst_get(sk);
				339
				340	if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
				341	use_ecn = true;
				342	}
				343
				344	tp->ecn_flags = 0;
				345
				346	if (use_ecn) {
				347	TCP_SKB_CB(skb)->tcp_flags \|= TCPHDR_ECE \| TCPHDR_CWR;
				348	tp->ecn_flags = TCP_ECN_OK;
				349	if (tcp_ca_needs_ecn(sk) \|\| bpf_needs_ecn)
				350	INET_ECN_xmit(sk);
				351	}
				352	}
				353
				354	static void tcp_ecn_clear_syn(struct sock sk, struct sk_buff skb)
				355	{
				356	if (sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)
				357	/* tp->ecn_flags are cleared at a later point in time when
				358	* SYN ACK is ultimatively being received.
				359	*/
				360	TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE \| TCPHDR_CWR);
				361	}
				362
				363	static void
				364	tcp_ecn_make_synack(const struct request_sock req, struct tcphdr th)
				365	{
				366	if (inet_rsk(req)->ecn_ok)
				367	th->ece = 1;
				368	}
				369
				370	/* Set up ECN state for a packet on a ESTABLISHED socket that is about to
				371	* be sent.
				372	*/
				373	static void tcp_ecn_send(struct sock sk, struct sk_buff skb,
				374	struct tcphdr *th, int tcp_header_len)
				375	{
				376	struct tcp_sock *tp = tcp_sk(sk);
				377
				378	if (tp->ecn_flags & TCP_ECN_OK) {
				379	/* Not-retransmitted data segment: set ECT and inject CWR. */
				380	if (skb->len != tcp_header_len &&
				381	!before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
				382	INET_ECN_xmit(sk);
				383	if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
				384	tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
				385	th->cwr = 1;
				386	skb_shinfo(skb)->gso_type \|= SKB_GSO_TCP_ECN;
				387	}
				388	} else if (!tcp_ca_needs_ecn(sk)) {
				389	/* ACK or retransmitted segment: clear ECT\|CE */
				390	INET_ECN_dontxmit(sk);
				391	}
				392	if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
				393	th->ece = 1;
				394	}
				395	}
				396
				397	/* Constructs common control bits of non-data skb. If SYN/FIN is present,
				398	* auto increment end seqno.
				399	*/
				400	static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
				401	{
				402	skb->ip_summed = CHECKSUM_PARTIAL;
				403	skb->csum = 0;
				404
				405	TCP_SKB_CB(skb)->tcp_flags = flags;
				406	TCP_SKB_CB(skb)->sacked = 0;
				407
				408	tcp_skb_pcount_set(skb, 1);
				409
				410	TCP_SKB_CB(skb)->seq = seq;
				411	if (flags & (TCPHDR_SYN \| TCPHDR_FIN))
				412	seq++;
				413	TCP_SKB_CB(skb)->end_seq = seq;
				414	}
				415
				416	static inline bool tcp_urg_mode(const struct tcp_sock *tp)
				417	{
				418	return tp->snd_una != tp->snd_up;
				419	}
				420
				421	#define OPTION_SACK_ADVERTISE (1 << 0)
				422	#define OPTION_TS (1 << 1)
				423	#define OPTION_MD5 (1 << 2)
				424	#define OPTION_WSCALE (1 << 3)
				425	#define OPTION_FAST_OPEN_COOKIE (1 << 8)
				426
				427	struct tcp_out_options {
				428	u16 options; /* bit field of OPTION_* */
				429	u16 mss; /* 0 to disable */
				430	u8 ws; /* window scale, 0 to disable */
				431	u8 num_sack_blocks; /* number of SACK blocks to include */
				432	u8 hash_size; /* bytes in hash_location */
				433	__u8 hash_location; / temporary pointer, overloaded */
				434	__u32 tsval, tsecr; /* need to include OPTION_TS */
				435	struct tcp_fastopen_cookie fastopen_cookie; / Fast open cookie */
				436	};
				437
				438	/* Write previously computed TCP options to the packet.
				439	*
				440	* Beware: Something in the Internet is very sensitive to the ordering of
				441	* TCP options, we learned this through the hard way, so be careful here.
				442	* Luckily we can at least blame others for their non-compliance but from
				443	* inter-operability perspective it seems that we're somewhat stuck with
				444	* the ordering which we have been using if we want to keep working with
				445	* those broken things (not that it currently hurts anybody as there isn't
				446	* particular reason why the ordering would need to be changed).
				447	*
				448	* At least SACK_PERM as the first option is known to lead to a disaster
				449	* (but it may well be that other scenarios fail similarly).
				450	*/
				451	static void tcp_options_write(__be32 ptr, struct tcp_sock tp,
				452	struct tcp_out_options *opts)
				453	{
				454	u16 options = opts->options; /* mungable copy */
				455
				456	if (unlikely(OPTION_MD5 & options)) {
				457	*ptr++ = htonl((TCPOPT_NOP << 24) \| (TCPOPT_NOP << 16) \|
				458	(TCPOPT_MD5SIG << 8) \| TCPOLEN_MD5SIG);
				459	/* overload cookie hash location */
				460	opts->hash_location = (__u8 *)ptr;
				461	ptr += 4;
				462	}
				463
				464	if (unlikely(opts->mss)) {
				465	*ptr++ = htonl((TCPOPT_MSS << 24) \|
				466	(TCPOLEN_MSS << 16) \|
				467	opts->mss);
				468	}
				469
				470	if (likely(OPTION_TS & options)) {
				471	if (unlikely(OPTION_SACK_ADVERTISE & options)) {
				472	*ptr++ = htonl((TCPOPT_SACK_PERM << 24) \|
				473	(TCPOLEN_SACK_PERM << 16) \|
				474	(TCPOPT_TIMESTAMP << 8) \|
				475	TCPOLEN_TIMESTAMP);
				476	options &= ~OPTION_SACK_ADVERTISE;
				477	} else {
				478	*ptr++ = htonl((TCPOPT_NOP << 24) \|
				479	(TCPOPT_NOP << 16) \|
				480	(TCPOPT_TIMESTAMP << 8) \|
				481	TCPOLEN_TIMESTAMP);
				482	}
				483	*ptr++ = htonl(opts->tsval);
				484	*ptr++ = htonl(opts->tsecr);
				485	}
				486
				487	if (unlikely(OPTION_SACK_ADVERTISE & options)) {
				488	*ptr++ = htonl((TCPOPT_NOP << 24) \|
				489	(TCPOPT_NOP << 16) \|
				490	(TCPOPT_SACK_PERM << 8) \|
				491	TCPOLEN_SACK_PERM);
				492	}
				493
				494	if (unlikely(OPTION_WSCALE & options)) {
				495	*ptr++ = htonl((TCPOPT_NOP << 24) \|
				496	(TCPOPT_WINDOW << 16) \|
				497	(TCPOLEN_WINDOW << 8) \|
				498	opts->ws);
				499	}
				500
				501	if (unlikely(opts->num_sack_blocks)) {
				502	struct tcp_sack_block *sp = tp->rx_opt.dsack ?
				503	tp->duplicate_sack : tp->selective_acks;
				504	int this_sack;
				505
				506	*ptr++ = htonl((TCPOPT_NOP << 24) \|
				507	(TCPOPT_NOP << 16) \|
				508	(TCPOPT_SACK << 8) \|
				509	(TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
				510	TCPOLEN_SACK_PERBLOCK)));
				511
				512	for (this_sack = 0; this_sack < opts->num_sack_blocks;
				513	++this_sack) {
				514	*ptr++ = htonl(sp[this_sack].start_seq);
				515	*ptr++ = htonl(sp[this_sack].end_seq);
				516	}
				517
				518	tp->rx_opt.dsack = 0;
				519	}
				520
				521	if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
				522	struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
				523	u8 p = (u8 )ptr;
				524	u32 len; /* Fast Open option length */
				525
				526	if (foc->exp) {
				527	len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
				528	*ptr = htonl((TCPOPT_EXP << 24) \| (len << 16) \|
				529	TCPOPT_FASTOPEN_MAGIC);
				530	p += TCPOLEN_EXP_FASTOPEN_BASE;
				531	} else {
				532	len = TCPOLEN_FASTOPEN_BASE + foc->len;
				533	*p++ = TCPOPT_FASTOPEN;
				534	*p++ = len;
				535	}
				536
				537	memcpy(p, foc->val, foc->len);
				538	if ((len & 3) == 2) {
				539	p[foc->len] = TCPOPT_NOP;
				540	p[foc->len + 1] = TCPOPT_NOP;
				541	}
				542	ptr += (len + 3) >> 2;
				543	}
				544	}
				545
				546	/* Compute TCP options for SYN packets. This is not the final
				547	* network wire format yet.
				548	*/
				549	static unsigned int tcp_syn_options(struct sock sk, struct sk_buff skb,
				550	struct tcp_out_options *opts,
				551	struct tcp_md5sig_key **md5)
				552	{
				553	struct tcp_sock *tp = tcp_sk(sk);
				554	unsigned int remaining = MAX_TCP_OPTION_SPACE;
				555	struct tcp_fastopen_request *fastopen = tp->fastopen_req;
				556
				557	#ifdef CONFIG_TCP_MD5SIG
				558	*md5 = tp->af_specific->md5_lookup(sk, sk);
				559	if (*md5) {
				560	opts->options \|= OPTION_MD5;
				561	remaining -= TCPOLEN_MD5SIG_ALIGNED;
				562	}
				563	#else
				564	*md5 = NULL;
				565	#endif
				566
				567	/* We always get an MSS option. The option bytes which will be seen in
				568	* normal data packets should timestamps be used, must be in the MSS
				569	* advertised. But we subtract them from tp->mss_cache so that
				570	* calculations in tcp_sendmsg are simpler etc. So account for this
				571	* fact here if necessary. If we don't do this correctly, as a
				572	* receiver we won't recognize data packets as being full sized when we
				573	* should, and thus we won't abide by the delayed ACK rules correctly.
				574	* SACKs don't matter, we never delay an ACK when we have any of those
				575	* going out. */
				576	opts->mss = tcp_advertise_mss(sk);
				577	remaining -= TCPOLEN_MSS_ALIGNED;
				578
				579	if (likely(sock_net(sk)->ipv4.sysctl_tcp_timestamps && !*md5)) {
				580	opts->options \|= OPTION_TS;
				581	opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
				582	opts->tsecr = tp->rx_opt.ts_recent;
				583	remaining -= TCPOLEN_TSTAMP_ALIGNED;
				584	}
				585	if (likely(sock_net(sk)->ipv4.sysctl_tcp_window_scaling)) {
				586	opts->ws = tp->rx_opt.rcv_wscale;
				587	opts->options \|= OPTION_WSCALE;
				588	remaining -= TCPOLEN_WSCALE_ALIGNED;
				589	}
				590	if (likely(sock_net(sk)->ipv4.sysctl_tcp_sack)) {
				591	opts->options \|= OPTION_SACK_ADVERTISE;
				592	if (unlikely(!(OPTION_TS & opts->options)))
				593	remaining -= TCPOLEN_SACKPERM_ALIGNED;
				594	}
				595
				596	if (fastopen && fastopen->cookie.len >= 0) {
				597	u32 need = fastopen->cookie.len;
				598
				599	need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE :
				600	TCPOLEN_FASTOPEN_BASE;
				601	need = (need + 3) & ~3U; /* Align to 32 bits */
				602	if (remaining >= need) {
				603	opts->options \|= OPTION_FAST_OPEN_COOKIE;
				604	opts->fastopen_cookie = &fastopen->cookie;
				605	remaining -= need;
				606	tp->syn_fastopen = 1;
				607	tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0;
				608	}
				609	}
				610
				611	return MAX_TCP_OPTION_SPACE - remaining;
				612	}
				613
				614	/* Set up TCP options for SYN-ACKs. */
				615	static unsigned int tcp_synack_options(struct request_sock *req,
				616	unsigned int mss, struct sk_buff *skb,
				617	struct tcp_out_options *opts,
				618	const struct tcp_md5sig_key *md5,
				619	struct tcp_fastopen_cookie *foc,
				620	enum tcp_synack_type synack_type)
				621	{
				622	struct inet_request_sock *ireq = inet_rsk(req);
				623	unsigned int remaining = MAX_TCP_OPTION_SPACE;
				624
				625	#ifdef CONFIG_TCP_MD5SIG
				626	if (md5) {
				627	opts->options \|= OPTION_MD5;
				628	remaining -= TCPOLEN_MD5SIG_ALIGNED;
				629
				630	/* We can't fit any SACK blocks in a packet with MD5 + TS
				631	* options. There was discussion about disabling SACK
				632	* rather than TS in order to fit in better with old,
				633	* buggy kernels, but that was deemed to be unnecessary.
				634	*/
				635	if (synack_type != TCP_SYNACK_COOKIE)
				636	ireq->tstamp_ok &= !ireq->sack_ok;
				637	}
				638	#endif
				639
				640	/* We always send an MSS option. */
				641	opts->mss = mss;
				642	remaining -= TCPOLEN_MSS_ALIGNED;
				643
				644	if (likely(ireq->wscale_ok)) {
				645	opts->ws = ireq->rcv_wscale;
				646	opts->options \|= OPTION_WSCALE;
				647	remaining -= TCPOLEN_WSCALE_ALIGNED;
				648	}
				649	if (likely(ireq->tstamp_ok)) {
				650	opts->options \|= OPTION_TS;
				651	opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off;
				652	opts->tsecr = req->ts_recent;
				653	remaining -= TCPOLEN_TSTAMP_ALIGNED;
				654	}
				655	if (likely(ireq->sack_ok)) {
				656	opts->options \|= OPTION_SACK_ADVERTISE;
				657	if (unlikely(!ireq->tstamp_ok))
				658	remaining -= TCPOLEN_SACKPERM_ALIGNED;
				659	}
				660	if (foc != NULL && foc->len >= 0) {
				661	u32 need = foc->len;
				662
				663	need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE :
				664	TCPOLEN_FASTOPEN_BASE;
				665	need = (need + 3) & ~3U; /* Align to 32 bits */
				666	if (remaining >= need) {
				667	opts->options \|= OPTION_FAST_OPEN_COOKIE;
				668	opts->fastopen_cookie = foc;
				669	remaining -= need;
				670	}
				671	}
				672
				673	return MAX_TCP_OPTION_SPACE - remaining;
				674	}
				675
				676	/* Compute TCP options for ESTABLISHED sockets. This is not the
				677	* final wire format yet.
				678	*/
				679	static unsigned int tcp_established_options(struct sock sk, struct sk_buff skb,
				680	struct tcp_out_options *opts,
				681	struct tcp_md5sig_key **md5)
				682	{
				683	struct tcp_sock *tp = tcp_sk(sk);
				684	unsigned int size = 0;
				685	unsigned int eff_sacks;
				686
				687	opts->options = 0;
				688
				689	#ifdef CONFIG_TCP_MD5SIG
				690	*md5 = tp->af_specific->md5_lookup(sk, sk);
				691	if (unlikely(*md5)) {
				692	opts->options \|= OPTION_MD5;
				693	size += TCPOLEN_MD5SIG_ALIGNED;
				694	}
				695	#else
				696	*md5 = NULL;
				697	#endif
				698
				699	if (likely(tp->rx_opt.tstamp_ok)) {
				700	opts->options \|= OPTION_TS;
				701	opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0;
				702	opts->tsecr = tp->rx_opt.ts_recent;
				703	size += TCPOLEN_TSTAMP_ALIGNED;
				704	}
				705
				706	eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
				707	if (unlikely(eff_sacks)) {
				708	const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
				709	opts->num_sack_blocks =
				710	min_t(unsigned int, eff_sacks,
				711	(remaining - TCPOLEN_SACK_BASE_ALIGNED) /
				712	TCPOLEN_SACK_PERBLOCK);
				713	if (likely(opts->num_sack_blocks))
				714	size += TCPOLEN_SACK_BASE_ALIGNED +
				715	opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
				716	}
				717
				718	return size;
				719	}
				720
				721
				722	/* TCP SMALL QUEUES (TSQ)
				723	*
				724	* TSQ goal is to keep small amount of skbs per tcp flow in tx queues (qdisc+dev)
				725	* to reduce RTT and bufferbloat.
				726	* We do this using a special skb destructor (tcp_wfree).
				727	*
				728	* Its important tcp_wfree() can be replaced by sock_wfree() in the event skb
				729	* needs to be reallocated in a driver.
				730	* The invariant being skb->truesize subtracted from sk->sk_wmem_alloc
				731	*
				732	* Since transmit from skb destructor is forbidden, we use a tasklet
				733	* to process all sockets that eventually need to send more skbs.
				734	* We use one tasklet per cpu, with its own queue of sockets.
				735	*/
				736	struct tsq_tasklet {
				737	struct tasklet_struct tasklet;
				738	struct list_head head; /* queue of tcp sockets */
				739	};
				740	static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
				741
				742	static void tcp_tsq_handler(struct sock *sk)
				743	{
				744	if ((1 << sk->sk_state) &
				745	(TCPF_ESTABLISHED \| TCPF_FIN_WAIT1 \| TCPF_CLOSING \|
				746	TCPF_CLOSE_WAIT \| TCPF_LAST_ACK)) {
				747	struct tcp_sock *tp = tcp_sk(sk);
				748
				749	if (tp->lost_out > tp->retrans_out &&
				750	tp->snd_cwnd > tcp_packets_in_flight(tp)) {
				751	tcp_mstamp_refresh(tp);
				752	tcp_xmit_retransmit_queue(sk);
				753	}
				754
				755	tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle,
				756	0, GFP_ATOMIC);
				757	}
				758	}
				759	/*
				760	* One tasklet per cpu tries to send more skbs.
				761	* We run in tasklet context but need to disable irqs when
				762	* transferring tsq->head because tcp_wfree() might
				763	* interrupt us (non NAPI drivers)
				764	*/
				765	static void tcp_tasklet_func(unsigned long data)
				766	{
				767	struct tsq_tasklet tsq = (struct tsq_tasklet )data;
				768	LIST_HEAD(list);
				769	unsigned long flags;
				770	struct list_head q, n;
				771	struct tcp_sock *tp;
				772	struct sock *sk;
				773
				774	local_irq_save(flags);
				775	list_splice_init(&tsq->head, &list);
				776	local_irq_restore(flags);
				777
				778	list_for_each_safe(q, n, &list) {
				779	tp = list_entry(q, struct tcp_sock, tsq_node);
				780	list_del(&tp->tsq_node);
				781
				782	sk = (struct sock *)tp;
				783	smp_mb__before_atomic();
				784	clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
				785
				786	if (!sk->sk_lock.owned &&
				787	test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) {
				788	bh_lock_sock(sk);
				789	if (!sock_owned_by_user(sk)) {
				790	clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
				791	tcp_tsq_handler(sk);
				792	}
				793	bh_unlock_sock(sk);
				794	}
				795
				796	sk_free(sk);
				797	}
				798	}
				799
				800	#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED \| \
				801	TCPF_WRITE_TIMER_DEFERRED \| \
				802	TCPF_DELACK_TIMER_DEFERRED \| \
				803	TCPF_MTU_REDUCED_DEFERRED)
				804	/**
				805	* tcp_release_cb - tcp release_sock() callback
				806	* @sk: socket
				807	*
				808	* called from release_sock() to perform protocol dependent
				809	* actions before socket release.
				810	*/
				811	void tcp_release_cb(struct sock *sk)
				812	{
				813	unsigned long flags, nflags;
				814
				815	/* perform an atomic operation only if at least one flag is set */
				816	do {
				817	flags = sk->sk_tsq_flags;
				818	if (!(flags & TCP_DEFERRED_ALL))
				819	return;
				820	nflags = flags & ~TCP_DEFERRED_ALL;
				821	} while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
				822
				823	if (flags & TCPF_TSQ_DEFERRED)
				824	tcp_tsq_handler(sk);
				825
				826	/* Here begins the tricky part :
				827	* We are called from release_sock() with :
				828	* 1) BH disabled
				829	* 2) sk_lock.slock spinlock held
				830	* 3) socket owned by us (sk->sk_lock.owned == 1)
				831	*
				832	* But following code is meant to be called from BH handlers,
				833	* so we should keep BH disabled, but early release socket ownership
				834	*/
				835	sock_release_ownership(sk);
				836
				837	if (flags & TCPF_WRITE_TIMER_DEFERRED) {
				838	tcp_write_timer_handler(sk);
				839	__sock_put(sk);
				840	}
				841	if (flags & TCPF_DELACK_TIMER_DEFERRED) {
				842	tcp_delack_timer_handler(sk);
				843	__sock_put(sk);
				844	}
				845	if (flags & TCPF_MTU_REDUCED_DEFERRED) {
				846	inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
				847	__sock_put(sk);
				848	}
				849	}
				850	EXPORT_SYMBOL(tcp_release_cb);
				851
				852	void __init tcp_tasklet_init(void)
				853	{
				854	int i;
				855
				856	for_each_possible_cpu(i) {
				857	struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
				858
				859	INIT_LIST_HEAD(&tsq->head);
				860	tasklet_init(&tsq->tasklet,
				861	tcp_tasklet_func,
				862	(unsigned long)tsq);
				863	}
				864	}
				865
				866	/*
				867	* Write buffer destructor automatically called from kfree_skb.
				868	* We can't xmit new skbs from this context, as we might already
				869	* hold qdisc lock.
				870	*/
				871	void tcp_wfree(struct sk_buff *skb)
				872	{
				873	struct sock *sk = skb->sk;
				874	struct tcp_sock *tp = tcp_sk(sk);
				875	unsigned long flags, nval, oval;
				876
				877	/* Keep one reference on sk_wmem_alloc.
				878	* Will be released by sk_free() from here or tcp_tasklet_func()
				879	*/
				880	WARN_ON(refcount_sub_and_test(skb->truesize - 1, &sk->sk_wmem_alloc));
				881
				882	/* If this softirq is serviced by ksoftirqd, we are likely under stress.
				883	* Wait until our queues (qdisc + devices) are drained.
				884	* This gives :
				885	* - less callbacks to tcp_write_xmit(), reducing stress (batches)
				886	* - chance for incoming ACK (processed by another cpu maybe)
				887	* to migrate this flow (skb->ooo_okay will be eventually set)
				888	*/
				889	if (refcount_read(&sk->sk_wmem_alloc) >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
				890	goto out;
				891
				892	for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
				893	struct tsq_tasklet *tsq;
				894	bool empty;
				895
				896	if (!(oval & TSQF_THROTTLED) \|\| (oval & TSQF_QUEUED))
				897	goto out;
				898
				899	nval = (oval & ~TSQF_THROTTLED) \| TSQF_QUEUED \| TCPF_TSQ_DEFERRED;
				900	nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
				901	if (nval != oval)
				902	continue;
				903
				904	/* queue this socket to tasklet queue */
				905	local_irq_save(flags);
				906	tsq = this_cpu_ptr(&tsq_tasklet);
				907	empty = list_empty(&tsq->head);
				908	list_add(&tp->tsq_node, &tsq->head);
				909	if (empty)
				910	tasklet_schedule(&tsq->tasklet);
				911	local_irq_restore(flags);
				912	return;
				913	}
				914	out:
				915	sk_free(sk);
				916	}
				917
				918	/* Note: Called under hard irq.
				919	* We can not call TCP stack right away.
				920	*/
				921	enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
				922	{
				923	struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer);
				924	struct sock sk = (struct sock )tp;
				925	unsigned long nval, oval;
				926
				927	for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
				928	struct tsq_tasklet *tsq;
				929	bool empty;
				930
				931	if (oval & TSQF_QUEUED)
				932	break;
				933
				934	nval = (oval & ~TSQF_THROTTLED) \| TSQF_QUEUED \| TCPF_TSQ_DEFERRED;
				935	nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
				936	if (nval != oval)
				937	continue;
				938
				939	if (!refcount_inc_not_zero(&sk->sk_wmem_alloc))
				940	break;
				941	/* queue this socket to tasklet queue */
				942	tsq = this_cpu_ptr(&tsq_tasklet);
				943	empty = list_empty(&tsq->head);
				944	list_add(&tp->tsq_node, &tsq->head);
				945	if (empty)
				946	tasklet_schedule(&tsq->tasklet);
				947	break;
				948	}
				949	return HRTIMER_NORESTART;
				950	}
				951
				952	/* BBR congestion control needs pacing.
				953	* Same remark for SO_MAX_PACING_RATE.
				954	* sch_fq packet scheduler is efficiently handling pacing,
				955	* but is not always installed/used.
				956	* Return true if TCP stack should pace packets itself.
				957	*/
				958	static bool tcp_needs_internal_pacing(const struct sock *sk)
				959	{
				960	return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED;
				961	}
				962
				963	static void tcp_internal_pacing(struct sock sk, const struct sk_buff skb)
				964	{
				965	u64 len_ns;
				966	u32 rate;
				967
				968	if (!tcp_needs_internal_pacing(sk))
				969	return;
				970	rate = sk->sk_pacing_rate;
				971	if (!rate \|\| rate == ~0U)
				972	return;
				973
				974	/* Should account for header sizes as sch_fq does,
				975	* but lets make things simple.
				976	*/
				977	len_ns = (u64)skb->len * NSEC_PER_SEC;
				978	do_div(len_ns, rate);
				979	hrtimer_start(&tcp_sk(sk)->pacing_timer,
				980	ktime_add_ns(ktime_get(), len_ns),
				981	HRTIMER_MODE_ABS_PINNED);
				982	}
				983
				984	/* This routine actually transmits TCP packets queued in by
				985	* tcp_do_sendmsg(). This is used by both the initial
				986	* transmission and possible later retransmissions.
				987	* All SKB's seen here are completely headerless. It is our
				988	* job to build the TCP header, and pass the packet down to
				989	* IP so it can do the same plus pass the packet off to the
				990	* device.
				991	*
				992	* We are working here with either a clone of the original
				993	* SKB, or a fresh unique copy made by the retransmit engine.
				994	*/
				995	static int __tcp_transmit_skb(struct sock sk, struct sk_buff skb,
				996	int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
				997	{
				998	const struct inet_connection_sock *icsk = inet_csk(sk);
				999	struct inet_sock *inet;
				1000	struct tcp_sock *tp;
				1001	struct tcp_skb_cb *tcb;
				1002	struct tcp_out_options opts;
				1003	unsigned int tcp_options_size, tcp_header_size;
				1004	struct sk_buff *oskb = NULL;
				1005	struct tcp_md5sig_key *md5;
				1006	struct tcphdr *th;
				1007	int err;
				1008
				1009	BUG_ON(!skb \|\| !tcp_skb_pcount(skb));
				1010	tp = tcp_sk(sk);
				1011
				1012	if (clone_it) {
				1013	TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
				1014	- tp->snd_una;
				1015	oskb = skb;
				1016	if (unlikely(skb_cloned(skb)))
				1017	skb = pskb_copy(skb, gfp_mask);
				1018	else
				1019	skb = skb_clone(skb, gfp_mask);
				1020	if (unlikely(!skb))
				1021	return -ENOBUFS;
				1022	}
				1023	skb->skb_mstamp = tp->tcp_mstamp;
				1024
				1025	inet = inet_sk(sk);
				1026	tcb = TCP_SKB_CB(skb);
				1027	memset(&opts, 0, sizeof(opts));
				1028
				1029	if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
				1030	tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
				1031	else
				1032	tcp_options_size = tcp_established_options(sk, skb, &opts,
				1033	&md5);
				1034	tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
				1035
				1036	/* if no packet is in qdisc/device queue, then allow XPS to select
				1037	* another queue. We can be called from tcp_tsq_handler()
				1038	* which holds one reference to sk_wmem_alloc.
				1039	*
				1040	* TODO: Ideally, in-flight pure ACK packets should not matter here.
				1041	* One way to get this would be to set skb->truesize = 2 on them.
				1042	*/
				1043	skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);
				1044
				1045	/* If we had to use memory reserve to allocate this skb,
				1046	* this might cause drops if packet is looped back :
				1047	* Other socket might not have SOCK_MEMALLOC.
				1048	* Packets not looped back do not care about pfmemalloc.
				1049	*/
				1050	skb->pfmemalloc = 0;
				1051
				1052	skb_push(skb, tcp_header_size);
				1053	skb_reset_transport_header(skb);
				1054
				1055	skb_orphan(skb);
				1056	skb->sk = sk;
				1057	skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
				1058	skb_set_hash_from_sk(skb, sk);
				1059	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
				1060
				1061	skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);
				1062
				1063	/* Build TCP header and checksum it. */
				1064	th = (struct tcphdr *)skb->data;
				1065	th->source = inet->inet_sport;
				1066	th->dest = inet->inet_dport;
				1067	th->seq = htonl(tcb->seq);
				1068	th->ack_seq = htonl(rcv_nxt);
				1069	(((__be16 )th) + 6) = htons(((tcp_header_size >> 2) << 12) \|
				1070	tcb->tcp_flags);
				1071
				1072	th->check = 0;
				1073	th->urg_ptr = 0;
				1074
				1075	/* The urg_mode check is necessary during a below snd_una win probe */
				1076	if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
				1077	if (before(tp->snd_up, tcb->seq + 0x10000)) {
				1078	th->urg_ptr = htons(tp->snd_up - tcb->seq);
				1079	th->urg = 1;
				1080	} else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
				1081	th->urg_ptr = htons(0xFFFF);
				1082	th->urg = 1;
				1083	}
				1084	}
				1085
				1086	tcp_options_write((__be32 *)(th + 1), tp, &opts);
				1087	skb_shinfo(skb)->gso_type = sk->sk_gso_type;
				1088	if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
				1089	th->window = htons(tcp_select_window(sk));
				1090	tcp_ecn_send(sk, skb, th, tcp_header_size);
				1091	} else {
				1092	/* RFC1323: The window in SYN & SYN/ACK segments
				1093	* is never scaled.
				1094	*/
				1095	th->window = htons(min(tp->rcv_wnd, 65535U));
				1096	}
				1097	#ifdef CONFIG_TCP_MD5SIG
				1098	/* Calculate the MD5 hash, as we have all we need now */
				1099	if (md5) {
				1100	sk_nocaps_add(sk, NETIF_F_GSO_MASK);
				1101	tp->af_specific->calc_md5_hash(opts.hash_location,
				1102	md5, sk, skb);
				1103	}
				1104	#endif
				1105
				1106	icsk->icsk_af_ops->send_check(sk, skb);
				1107
				1108	if (likely(tcb->tcp_flags & TCPHDR_ACK))
				1109	tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);
				1110
				1111	if (skb->len != tcp_header_size) {
				1112	tcp_event_data_sent(tp, sk);
				1113	tp->data_segs_out += tcp_skb_pcount(skb);
				1114	tcp_internal_pacing(sk, skb);
				1115	}
				1116
				1117	if (after(tcb->end_seq, tp->snd_nxt) \|\| tcb->seq == tcb->end_seq)
				1118	TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
				1119	tcp_skb_pcount(skb));
				1120
				1121	tp->segs_out += tcp_skb_pcount(skb);
				1122	/* OK, its time to fill skb_shinfo(skb)->gso_{segs\|size} */
				1123	skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
				1124	skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
				1125
				1126	/* Our usage of tstamp should remain private */
				1127	skb->tstamp = 0;
				1128
				1129	/* Cleanup our debris for IP stacks */
				1130	memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
				1131	sizeof(struct inet6_skb_parm)));
				1132
				1133	err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
				1134
				1135	if (unlikely(err > 0)) {
				1136	tcp_enter_cwr(sk);
				1137	err = net_xmit_eval(err);
				1138	}
				1139	if (!err && oskb) {
				1140	oskb->skb_mstamp = tp->tcp_mstamp;
				1141	tcp_rate_skb_sent(sk, oskb);
				1142	}
				1143	return err;
				1144	}
				1145
				1146	static int tcp_transmit_skb(struct sock sk, struct sk_buff skb, int clone_it,
				1147	gfp_t gfp_mask)
				1148	{
				1149	return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask,
				1150	tcp_sk(sk)->rcv_nxt);
				1151	}
				1152
				1153	/* This routine just queues the buffer for sending.
				1154	*
				1155	* NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
				1156	* otherwise socket can stall.
				1157	*/
				1158	static void tcp_queue_skb(struct sock sk, struct sk_buff skb)
				1159	{
				1160	struct tcp_sock *tp = tcp_sk(sk);
				1161
				1162	/* Advance write_seq and place onto the write_queue. */
				1163	tp->write_seq = TCP_SKB_CB(skb)->end_seq;
				1164	__skb_header_release(skb);
				1165	tcp_add_write_queue_tail(sk, skb);
				1166	sk->sk_wmem_queued += skb->truesize;
				1167	sk_mem_charge(sk, skb->truesize);
				1168	}
				1169
				1170	/* Initialize TSO segments for a packet. */
				1171	static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
				1172	{
				1173	if (skb->len <= mss_now \|\| skb->ip_summed == CHECKSUM_NONE) {
				1174	/* Avoid the costly divide in the normal
				1175	* non-TSO case.
				1176	*/
				1177	tcp_skb_pcount_set(skb, 1);
				1178	TCP_SKB_CB(skb)->tcp_gso_size = 0;
				1179	} else {
				1180	tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now));
				1181	TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
				1182	}
				1183	}
				1184
				1185	/* When a modification to fackets out becomes necessary, we need to check
				1186	* skb is counted to fackets_out or not.
				1187	*/
				1188	static void tcp_adjust_fackets_out(struct sock sk, const struct sk_buff skb,
				1189	int decr)
				1190	{
				1191	struct tcp_sock *tp = tcp_sk(sk);
				1192
				1193	if (!tp->sacked_out \|\| tcp_is_reno(tp))
				1194	return;
				1195
				1196	if (after(tcp_highest_sack_seq(tp), TCP_SKB_CB(skb)->seq))
				1197	tp->fackets_out -= decr;
				1198	}
				1199
				1200	/* Pcount in the middle of the write queue got changed, we need to do various
				1201	* tweaks to fix counters
				1202	*/
				1203	static void tcp_adjust_pcount(struct sock sk, const struct sk_buff skb, int decr)
				1204	{
				1205	struct tcp_sock *tp = tcp_sk(sk);
				1206
				1207	tp->packets_out -= decr;
				1208
				1209	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
				1210	tp->sacked_out -= decr;
				1211	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
				1212	tp->retrans_out -= decr;
				1213	if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
				1214	tp->lost_out -= decr;
				1215
				1216	/* Reno case is special. Sigh... */
				1217	if (tcp_is_reno(tp) && decr > 0)
				1218	tp->sacked_out -= min_t(u32, tp->sacked_out, decr);
				1219
				1220	tcp_adjust_fackets_out(sk, skb, decr);
				1221
				1222	if (tp->lost_skb_hint &&
				1223	before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
				1224	(tcp_is_fack(tp) \|\| (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)))
				1225	tp->lost_cnt_hint -= decr;
				1226
				1227	tcp_verify_left_out(tp);
				1228	}
				1229
				1230	static bool tcp_has_tx_tstamp(const struct sk_buff *skb)
				1231	{
				1232	return TCP_SKB_CB(skb)->txstamp_ack \|\|
				1233	(skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP);
				1234	}
				1235
				1236	static void tcp_fragment_tstamp(struct sk_buff skb, struct sk_buff skb2)
				1237	{
				1238	struct skb_shared_info *shinfo = skb_shinfo(skb);
				1239
				1240	if (unlikely(tcp_has_tx_tstamp(skb)) &&
				1241	!before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) {
				1242	struct skb_shared_info *shinfo2 = skb_shinfo(skb2);
				1243	u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP;
				1244
				1245	shinfo->tx_flags &= ~tsflags;
				1246	shinfo2->tx_flags \|= tsflags;
				1247	swap(shinfo->tskey, shinfo2->tskey);
				1248	TCP_SKB_CB(skb2)->txstamp_ack = TCP_SKB_CB(skb)->txstamp_ack;
				1249	TCP_SKB_CB(skb)->txstamp_ack = 0;
				1250	}
				1251	}
				1252
				1253	static void tcp_skb_fragment_eor(struct sk_buff skb, struct sk_buff skb2)
				1254	{
				1255	TCP_SKB_CB(skb2)->eor = TCP_SKB_CB(skb)->eor;
				1256	TCP_SKB_CB(skb)->eor = 0;
				1257	}
				1258
				1259	/* Function to create two new TCP segments. Shrinks the given segment
				1260	* to the specified size and appends a new segment with the rest of the
				1261	* packet to the list. This won't be called frequently, I hope.
				1262	* Remember, these are still headerless SKBs at this point.
				1263	*/
				1264	int tcp_fragment(struct sock sk, struct sk_buff skb, u32 len,
				1265	unsigned int mss_now, gfp_t gfp)
				1266	{
				1267	struct tcp_sock *tp = tcp_sk(sk);
				1268	struct sk_buff *buff;
				1269	int nsize, old_factor;
				1270	long limit;
				1271	int nlen;
				1272	u8 flags;
				1273
				1274	if (WARN_ON(len > skb->len))
				1275	return -EINVAL;
				1276
				1277	nsize = skb_headlen(skb) - len;
				1278	if (nsize < 0)
				1279	nsize = 0;
				1280
				1281	/* tcp_sendmsg() can overshoot sk_wmem_queued by one full size skb.
				1282	* We need some allowance to not penalize applications setting small
				1283	* SO_SNDBUF values.
				1284	* Also allow first and last skb in retransmit queue to be split.
				1285	*/
				1286	limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_MAX_SIZE);
				1287	if (unlikely((sk->sk_wmem_queued >> 1) > limit &&
				1288	skb != tcp_rtx_queue_head(sk) &&
				1289	skb != tcp_rtx_queue_tail(sk))) {
				1290	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG);
				1291	return -ENOMEM;
				1292	}
				1293
				1294	if (skb_unclone(skb, gfp))
				1295	return -ENOMEM;
				1296
				1297	/* Get a new skb... force flag on. */
				1298	buff = sk_stream_alloc_skb(sk, nsize, gfp, true);
				1299	if (!buff)
				1300	return -ENOMEM; /* We'll just try again later. */
				1301
				1302	sk->sk_wmem_queued += buff->truesize;
				1303	sk_mem_charge(sk, buff->truesize);
				1304	nlen = skb->len - len - nsize;
				1305	buff->truesize += nlen;
				1306	skb->truesize -= nlen;
				1307
				1308	/* Correct the sequence numbers. */
				1309	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
				1310	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
				1311	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
				1312
				1313	/* PSH and FIN should only be set in the second packet. */
				1314	flags = TCP_SKB_CB(skb)->tcp_flags;
				1315	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN \| TCPHDR_PSH);
				1316	TCP_SKB_CB(buff)->tcp_flags = flags;
				1317	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
				1318	tcp_skb_fragment_eor(skb, buff);
				1319
				1320	if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
				1321	/* Copy and checksum data tail into the new buffer. */
				1322	buff->csum = csum_partial_copy_nocheck(skb->data + len,
				1323	skb_put(buff, nsize),
				1324	nsize, 0);
				1325
				1326	skb_trim(skb, len);
				1327
				1328	skb->csum = csum_block_sub(skb->csum, buff->csum, len);
				1329	} else {
				1330	skb->ip_summed = CHECKSUM_PARTIAL;
				1331	skb_split(skb, buff, len);
				1332	}
				1333
				1334	buff->ip_summed = skb->ip_summed;
				1335
				1336	buff->tstamp = skb->tstamp;
				1337	tcp_fragment_tstamp(skb, buff);
				1338
				1339	old_factor = tcp_skb_pcount(skb);
				1340
				1341	/* Fix up tso_factor for both original and new SKB. */
				1342	tcp_set_skb_tso_segs(skb, mss_now);
				1343	tcp_set_skb_tso_segs(buff, mss_now);
				1344
				1345	/* Update delivered info for the new segment */
				1346	TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx;
				1347
				1348	/* If this packet has been sent out already, we must
				1349	* adjust the various packet counters.
				1350	*/
				1351	if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
				1352	int diff = old_factor - tcp_skb_pcount(skb) -
				1353	tcp_skb_pcount(buff);
				1354
				1355	if (diff)
				1356	tcp_adjust_pcount(sk, skb, diff);
				1357	}
				1358
				1359	/* Link BUFF into the send queue. */
				1360	__skb_header_release(buff);
				1361	tcp_insert_write_queue_after(skb, buff, sk);
				1362
				1363	return 0;
				1364	}
				1365
				1366	/* This is similar to __pskb_pull_tail(). The difference is that pulled
				1367	* data is not copied, but immediately discarded.
				1368	*/
				1369	static int __pskb_trim_head(struct sk_buff *skb, int len)
				1370	{
				1371	struct skb_shared_info *shinfo;
				1372	int i, k, eat;
				1373
				1374	eat = min_t(int, len, skb_headlen(skb));
				1375	if (eat) {
				1376	__skb_pull(skb, eat);
				1377	len -= eat;
				1378	if (!len)
				1379	return 0;
				1380	}
				1381	eat = len;
				1382	k = 0;
				1383	shinfo = skb_shinfo(skb);
				1384	for (i = 0; i < shinfo->nr_frags; i++) {
				1385	int size = skb_frag_size(&shinfo->frags[i]);
				1386
				1387	if (size <= eat) {
				1388	skb_frag_unref(skb, i);
				1389	eat -= size;
				1390	} else {
				1391	shinfo->frags[k] = shinfo->frags[i];
				1392	if (eat) {
				1393	shinfo->frags[k].page_offset += eat;
				1394	skb_frag_size_sub(&shinfo->frags[k], eat);
				1395	eat = 0;
				1396	}
				1397	k++;
				1398	}
				1399	}
				1400	shinfo->nr_frags = k;
				1401
				1402	skb->data_len -= len;
				1403	skb->len = skb->data_len;
				1404	return len;
				1405	}
				1406
				1407	/* Remove acked data from a packet in the transmit queue. */
				1408	int tcp_trim_head(struct sock sk, struct sk_buff skb, u32 len)
				1409	{
				1410	u32 delta_truesize;
				1411
				1412	if (skb_unclone(skb, GFP_ATOMIC))
				1413	return -ENOMEM;
				1414
				1415	delta_truesize = __pskb_trim_head(skb, len);
				1416
				1417	TCP_SKB_CB(skb)->seq += len;
				1418	skb->ip_summed = CHECKSUM_PARTIAL;
				1419
				1420	if (delta_truesize) {
				1421	skb->truesize -= delta_truesize;
				1422	sk->sk_wmem_queued -= delta_truesize;
				1423	sk_mem_uncharge(sk, delta_truesize);
				1424	sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
				1425	}
				1426
				1427	/* Any change of skb->len requires recalculation of tso factor. */
				1428	if (tcp_skb_pcount(skb) > 1)
				1429	tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb));
				1430
				1431	return 0;
				1432	}
				1433
				1434	/* Calculate MSS not accounting any TCP options. */
				1435	static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu)
				1436	{
				1437	const struct tcp_sock *tp = tcp_sk(sk);
				1438	const struct inet_connection_sock *icsk = inet_csk(sk);
				1439	int mss_now;
				1440
				1441	/* Calculate base mss without TCP options:
				1442	It is MMS_S - sizeof(tcphdr) of rfc1122
				1443	*/
				1444	mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
				1445
				1446	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
				1447	if (icsk->icsk_af_ops->net_frag_header_len) {
				1448	const struct dst_entry *dst = __sk_dst_get(sk);
				1449
				1450	if (dst && dst_allfrag(dst))
				1451	mss_now -= icsk->icsk_af_ops->net_frag_header_len;
				1452	}
				1453
				1454	/* Clamp it (mss_clamp does not include tcp options) */
				1455	if (mss_now > tp->rx_opt.mss_clamp)
				1456	mss_now = tp->rx_opt.mss_clamp;
				1457
				1458	/* Now subtract optional transport overhead */
				1459	mss_now -= icsk->icsk_ext_hdr_len;
				1460
				1461	/* Then reserve room for full set of TCP options and 8 bytes of data */
				1462	mss_now = max(mss_now, sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss);
				1463	return mss_now;
				1464	}
				1465
				1466	/* Calculate MSS. Not accounting for SACKs here. */
				1467	int tcp_mtu_to_mss(struct sock *sk, int pmtu)
				1468	{
				1469	/* Subtract TCP options size, not including SACKs */
				1470	return __tcp_mtu_to_mss(sk, pmtu) -
				1471	(tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr));
				1472	}
				1473
				1474	/* Inverse of above */
				1475	int tcp_mss_to_mtu(struct sock *sk, int mss)
				1476	{
				1477	const struct tcp_sock *tp = tcp_sk(sk);
				1478	const struct inet_connection_sock *icsk = inet_csk(sk);
				1479	int mtu;
				1480
				1481	mtu = mss +
				1482	tp->tcp_header_len +
				1483	icsk->icsk_ext_hdr_len +
				1484	icsk->icsk_af_ops->net_header_len;
				1485
				1486	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
				1487	if (icsk->icsk_af_ops->net_frag_header_len) {
				1488	const struct dst_entry *dst = __sk_dst_get(sk);
				1489
				1490	if (dst && dst_allfrag(dst))
				1491	mtu += icsk->icsk_af_ops->net_frag_header_len;
				1492	}
				1493	return mtu;
				1494	}
				1495	EXPORT_SYMBOL(tcp_mss_to_mtu);
				1496
				1497	/* MTU probing init per socket */
				1498	void tcp_mtup_init(struct sock *sk)
				1499	{
				1500	struct tcp_sock *tp = tcp_sk(sk);
				1501	struct inet_connection_sock *icsk = inet_csk(sk);
				1502	struct net *net = sock_net(sk);
				1503
				1504	icsk->icsk_mtup.enabled = net->ipv4.sysctl_tcp_mtu_probing > 1;
				1505	icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
				1506	icsk->icsk_af_ops->net_header_len;
				1507	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
				1508	icsk->icsk_mtup.probe_size = 0;
				1509	if (icsk->icsk_mtup.enabled)
				1510	icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
				1511	}
				1512	EXPORT_SYMBOL(tcp_mtup_init);
				1513
				1514	/* This function synchronize snd mss to current pmtu/exthdr set.
				1515
				1516	tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
				1517	for TCP options, but includes only bare TCP header.
				1518
				1519	tp->rx_opt.mss_clamp is mss negotiated at connection setup.
				1520	It is minimum of user_mss and mss received with SYN.
				1521	It also does not include TCP options.
				1522
				1523	inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function.
				1524
				1525	tp->mss_cache is current effective sending mss, including
				1526	all tcp options except for SACKs. It is evaluated,
				1527	taking into account current pmtu, but never exceeds
				1528	tp->rx_opt.mss_clamp.
				1529
				1530	NOTE1. rfc1122 clearly states that advertised MSS
				1531	DOES NOT include either tcp or ip options.
				1532
				1533	NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
				1534	are READ ONLY outside this function. --ANK (980731)
				1535	*/
				1536	unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
				1537	{
				1538	struct tcp_sock *tp = tcp_sk(sk);
				1539	struct inet_connection_sock *icsk = inet_csk(sk);
				1540	int mss_now;
				1541
				1542	if (icsk->icsk_mtup.search_high > pmtu)
				1543	icsk->icsk_mtup.search_high = pmtu;
				1544
				1545	mss_now = tcp_mtu_to_mss(sk, pmtu);
				1546	mss_now = tcp_bound_to_half_wnd(tp, mss_now);
				1547
				1548	/* And store cached results */
				1549	icsk->icsk_pmtu_cookie = pmtu;
				1550	if (icsk->icsk_mtup.enabled)
				1551	mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
				1552	tp->mss_cache = mss_now;
				1553
				1554	return mss_now;
				1555	}
				1556	EXPORT_SYMBOL(tcp_sync_mss);
				1557
				1558	/* Compute the current effective MSS, taking SACKs and IP options,
				1559	* and even PMTU discovery events into account.
				1560	*/
				1561	unsigned int tcp_current_mss(struct sock *sk)
				1562	{
				1563	const struct tcp_sock *tp = tcp_sk(sk);
				1564	const struct dst_entry *dst = __sk_dst_get(sk);
				1565	u32 mss_now;
				1566	unsigned int header_len;
				1567	struct tcp_out_options opts;
				1568	struct tcp_md5sig_key *md5;
				1569
				1570	mss_now = tp->mss_cache;
				1571
				1572	if (dst) {
				1573	u32 mtu = dst_mtu(dst);
				1574	if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
				1575	mss_now = tcp_sync_mss(sk, mtu);
				1576	}
				1577
				1578	header_len = tcp_established_options(sk, NULL, &opts, &md5) +
				1579	sizeof(struct tcphdr);
				1580	/* The mss_cache is sized based on tp->tcp_header_len, which assumes
				1581	* some common options. If this is an odd packet (because we have SACK
				1582	* blocks etc) then our calculated header_len will be different, and
				1583	* we have to adjust mss_now correspondingly */
				1584	if (header_len != tp->tcp_header_len) {
				1585	int delta = (int) header_len - tp->tcp_header_len;
				1586	mss_now -= delta;
				1587	}
				1588
				1589	return mss_now;
				1590	}
				1591
				1592	/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
				1593	* As additional protections, we do not touch cwnd in retransmission phases,
				1594	* and if application hit its sndbuf limit recently.
				1595	*/
				1596	static void tcp_cwnd_application_limited(struct sock *sk)
				1597	{
				1598	struct tcp_sock *tp = tcp_sk(sk);
				1599
				1600	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
				1601	sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
				1602	/* Limited by application or receiver window. */
				1603	u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
				1604	u32 win_used = max(tp->snd_cwnd_used, init_win);
				1605	if (win_used < tp->snd_cwnd) {
				1606	tp->snd_ssthresh = tcp_current_ssthresh(sk);
				1607	tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
				1608	}
				1609	tp->snd_cwnd_used = 0;
				1610	}
				1611	tp->snd_cwnd_stamp = tcp_jiffies32;
				1612	}
				1613
				1614	static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
				1615	{
				1616	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
				1617	struct tcp_sock *tp = tcp_sk(sk);
				1618
				1619	/* Track the maximum number of outstanding packets in each
				1620	* window, and remember whether we were cwnd-limited then.
				1621	*/
				1622	if (!before(tp->snd_una, tp->max_packets_seq) \|\|
				1623	tp->packets_out > tp->max_packets_out) {
				1624	tp->max_packets_out = tp->packets_out;
				1625	tp->max_packets_seq = tp->snd_nxt;
				1626	tp->is_cwnd_limited = is_cwnd_limited;
				1627	}
				1628
				1629	if (tcp_is_cwnd_limited(sk)) {
				1630	/* Network is feed fully. */
				1631	tp->snd_cwnd_used = 0;
				1632	tp->snd_cwnd_stamp = tcp_jiffies32;
				1633	} else {
				1634	/* Network starves. */
				1635	if (tp->packets_out > tp->snd_cwnd_used)
				1636	tp->snd_cwnd_used = tp->packets_out;
				1637
				1638	if (sysctl_tcp_slow_start_after_idle &&
				1639	(s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto &&
				1640	!ca_ops->cong_control)
				1641	tcp_cwnd_application_limited(sk);
				1642
				1643	/* The following conditions together indicate the starvation
				1644	* is caused by insufficient sender buffer:
				1645	* 1) just sent some data (see tcp_write_xmit)
				1646	* 2) not cwnd limited (this else condition)
				1647	* 3) no more data to send (null tcp_send_head )
				1648	* 4) application is hitting buffer limit (SOCK_NOSPACE)
				1649	*/
				1650	if (!tcp_send_head(sk) && sk->sk_socket &&
				1651	test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
				1652	(1 << sk->sk_state) & (TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT))
				1653	tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
				1654	}
				1655	}
				1656
				1657	/* Minshall's variant of the Nagle send check. */
				1658	static bool tcp_minshall_check(const struct tcp_sock *tp)
				1659	{
				1660	return after(tp->snd_sml, tp->snd_una) &&
				1661	!after(tp->snd_sml, tp->snd_nxt);
				1662	}
				1663
				1664	/* Update snd_sml if this skb is under mss
				1665	* Note that a TSO packet might end with a sub-mss segment
				1666	* The test is really :
				1667	* if ((skb->len % mss) != 0)
				1668	* tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
				1669	* But we can avoid doing the divide again given we already have
				1670	* skb_pcount = skb->len / mss_now
				1671	*/
				1672	static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
				1673	const struct sk_buff *skb)
				1674	{
				1675	if (skb->len < tcp_skb_pcount(skb) * mss_now)
				1676	tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
				1677	}
				1678
				1679	/* Return false, if packet can be sent now without violation Nagle's rules:
				1680	* 1. It is full sized. (provided by caller in %partial bool)
				1681	* 2. Or it contains FIN. (already checked by caller)
				1682	* 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
				1683	* 4. Or TCP_CORK is not set, and all sent packets are ACKed.
				1684	* With Minshall's modification: all sent small packets are ACKed.
				1685	*/
				1686	static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
				1687	int nonagle)
				1688	{
				1689	return partial &&
				1690	((nonagle & TCP_NAGLE_CORK) \|\|
				1691	(!nonagle && tp->packets_out && tcp_minshall_check(tp)));
				1692	}
				1693
				1694	/* Return how many segs we'd like on a TSO packet,
				1695	* to send one TSO packet per ms
				1696	*/
				1697	u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
				1698	int min_tso_segs)
				1699	{
				1700	u32 bytes, segs;
				1701
				1702	bytes = min(sk->sk_pacing_rate >> 10,
				1703	sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
				1704
				1705	/* Goal is to send at least one packet per ms,
				1706	* not one big TSO packet every 100 ms.
				1707	* This preserves ACK clocking and is consistent
				1708	* with tcp_tso_should_defer() heuristic.
				1709	*/
				1710	segs = max_t(u32, bytes / mss_now, min_tso_segs);
				1711
				1712	return segs;
				1713	}
				1714	EXPORT_SYMBOL(tcp_tso_autosize);
				1715
				1716	/* Return the number of segments we want in the skb we are transmitting.
				1717	* See if congestion control module wants to decide; otherwise, autosize.
				1718	*/
				1719	static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
				1720	{
				1721	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
				1722	u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0;
				1723
				1724	if (!tso_segs)
				1725	tso_segs = tcp_tso_autosize(sk, mss_now,
				1726	sysctl_tcp_min_tso_segs);
				1727	return min_t(u32, tso_segs, sk->sk_gso_max_segs);
				1728	}
				1729
				1730	/* Returns the portion of skb which can be sent right away */
				1731	static unsigned int tcp_mss_split_point(const struct sock *sk,
				1732	const struct sk_buff *skb,
				1733	unsigned int mss_now,
				1734	unsigned int max_segs,
				1735	int nonagle)
				1736	{
				1737	const struct tcp_sock *tp = tcp_sk(sk);
				1738	u32 partial, needed, window, max_len;
				1739
				1740	window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
				1741	max_len = mss_now * max_segs;
				1742
				1743	if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
				1744	return max_len;
				1745
				1746	needed = min(skb->len, window);
				1747
				1748	if (max_len <= needed)
				1749	return max_len;
				1750
				1751	partial = needed % mss_now;
				1752	/* If last segment is not a full MSS, check if Nagle rules allow us
				1753	* to include this last segment in this skb.
				1754	* Otherwise, we'll split the skb at last MSS boundary
				1755	*/
				1756	if (tcp_nagle_check(partial != 0, tp, nonagle))
				1757	return needed - partial;
				1758
				1759	return needed;
				1760	}
				1761
				1762	/* Can at least one segment of SKB be sent right now, according to the
				1763	* congestion window rules? If so, return how many segments are allowed.
				1764	*/
				1765	static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
				1766	const struct sk_buff *skb)
				1767	{
				1768	u32 in_flight, cwnd, halfcwnd;
				1769
				1770	/* Don't be strict about the congestion window for the final FIN. */
				1771	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
				1772	tcp_skb_pcount(skb) == 1)
				1773	return 1;
				1774
				1775	in_flight = tcp_packets_in_flight(tp);
				1776	cwnd = tp->snd_cwnd;
				1777	if (in_flight >= cwnd)
				1778	return 0;
				1779
				1780	/* For better scheduling, ensure we have at least
				1781	* 2 GSO packets in flight.
				1782	*/
				1783	halfcwnd = max(cwnd >> 1, 1U);
				1784	return min(halfcwnd, cwnd - in_flight);
				1785	}
				1786
				1787	/* Initialize TSO state of a skb.
				1788	* This must be invoked the first time we consider transmitting
				1789	* SKB onto the wire.
				1790	*/
				1791	static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
				1792	{
				1793	int tso_segs = tcp_skb_pcount(skb);
				1794
				1795	if (!tso_segs \|\| (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
				1796	tcp_set_skb_tso_segs(skb, mss_now);
				1797	tso_segs = tcp_skb_pcount(skb);
				1798	}
				1799	return tso_segs;
				1800	}
				1801
				1802
				1803	/* Return true if the Nagle test allows this packet to be
				1804	* sent now.
				1805	*/
				1806	static inline bool tcp_nagle_test(const struct tcp_sock tp, const struct sk_buff skb,
				1807	unsigned int cur_mss, int nonagle)
				1808	{
				1809	/* Nagle rule does not apply to frames, which sit in the middle of the
				1810	* write_queue (they have no chances to get new data).
				1811	*
				1812	* This is implemented in the callers, where they modify the 'nonagle'
				1813	* argument based upon the location of SKB in the send queue.
				1814	*/
				1815	if (nonagle & TCP_NAGLE_PUSH)
				1816	return true;
				1817
				1818	/* Don't use the nagle rule for urgent data (or for the final FIN). */
				1819	if (tcp_urg_mode(tp) \|\| (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
				1820	return true;
				1821
				1822	if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle))
				1823	return true;
				1824
				1825	return false;
				1826	}
				1827
				1828	/* Does at least the first segment of SKB fit into the send window? */
				1829	static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
				1830	const struct sk_buff *skb,
				1831	unsigned int cur_mss)
				1832	{
				1833	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
				1834
				1835	if (skb->len > cur_mss)
				1836	end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
				1837
				1838	return !after(end_seq, tcp_wnd_end(tp));
				1839	}
				1840
				1841	/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
				1842	* which is put after SKB on the list. It is very much like
				1843	* tcp_fragment() except that it may make several kinds of assumptions
				1844	* in order to speed up the splitting operation. In particular, we
				1845	* know that all the data is in scatter-gather pages, and that the
				1846	* packet has never been sent out before (and thus is not cloned).
				1847	*/
				1848	static int tso_fragment(struct sock sk, struct sk_buff skb, unsigned int len,
				1849	unsigned int mss_now, gfp_t gfp)
				1850	{
				1851	struct sk_buff *buff;
				1852	int nlen = skb->len - len;
				1853	u8 flags;
				1854
				1855	/* All of a TSO frame must be composed of paged data. */
				1856	if (skb->len != skb->data_len)
				1857	return tcp_fragment(sk, skb, len, mss_now, gfp);
				1858
				1859	buff = sk_stream_alloc_skb(sk, 0, gfp, true);
				1860	if (unlikely(!buff))
				1861	return -ENOMEM;
				1862
				1863	sk->sk_wmem_queued += buff->truesize;
				1864	sk_mem_charge(sk, buff->truesize);
				1865	buff->truesize += nlen;
				1866	skb->truesize -= nlen;
				1867
				1868	/* Correct the sequence numbers. */
				1869	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
				1870	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
				1871	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
				1872
				1873	/* PSH and FIN should only be set in the second packet. */
				1874	flags = TCP_SKB_CB(skb)->tcp_flags;
				1875	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN \| TCPHDR_PSH);
				1876	TCP_SKB_CB(buff)->tcp_flags = flags;
				1877
				1878	/* This packet was never sent out yet, so no SACK bits. */
				1879	TCP_SKB_CB(buff)->sacked = 0;
				1880
				1881	tcp_skb_fragment_eor(skb, buff);
				1882
				1883	buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;
				1884	skb_split(skb, buff, len);
				1885	tcp_fragment_tstamp(skb, buff);
				1886
				1887	/* Fix up tso_factor for both original and new SKB. */
				1888	tcp_set_skb_tso_segs(skb, mss_now);
				1889	tcp_set_skb_tso_segs(buff, mss_now);
				1890
				1891	/* Link BUFF into the send queue. */
				1892	__skb_header_release(buff);
				1893	tcp_insert_write_queue_after(skb, buff, sk);
				1894
				1895	return 0;
				1896	}
				1897
				1898	/* Try to defer sending, if possible, in order to minimize the amount
				1899	* of TSO splitting we do. View it as a kind of TSO Nagle test.
				1900	*
				1901	* This algorithm is from John Heffner.
				1902	*/
				1903	static bool tcp_tso_should_defer(struct sock sk, struct sk_buff skb,
				1904	bool *is_cwnd_limited,
				1905	bool *is_rwnd_limited,
				1906	u32 max_segs)
				1907	{
				1908	const struct inet_connection_sock *icsk = inet_csk(sk);
				1909	u32 age, send_win, cong_win, limit, in_flight;
				1910	struct tcp_sock *tp = tcp_sk(sk);
				1911	struct sk_buff *head;
				1912	int win_divisor;
				1913
				1914	if (icsk->icsk_ca_state >= TCP_CA_Recovery)
				1915	goto send_now;
				1916
				1917	/* Avoid bursty behavior by allowing defer
				1918	* only if the last write was recent.
				1919	*/
				1920	if ((s32)(tcp_jiffies32 - tp->lsndtime) > 0)
				1921	goto send_now;
				1922
				1923	in_flight = tcp_packets_in_flight(tp);
				1924
				1925	BUG_ON(tcp_skb_pcount(skb) <= 1 \|\| (tp->snd_cwnd <= in_flight));
				1926
				1927	send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
				1928
				1929	/* From in_flight test above, we know that cwnd > in_flight. */
				1930	cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
				1931
				1932	limit = min(send_win, cong_win);
				1933
				1934	/* If a full-sized TSO skb can be sent, do it. */
				1935	if (limit >= max_segs * tp->mss_cache)
				1936	goto send_now;
				1937
				1938	/* Middle in queue won't get any more data, full sendable already? */
				1939	if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
				1940	goto send_now;
				1941
				1942	win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor);
				1943	if (win_divisor) {
				1944	u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
				1945
				1946	/* If at least some fraction of a window is available,
				1947	* just use it.
				1948	*/
				1949	chunk /= win_divisor;
				1950	if (limit >= chunk)
				1951	goto send_now;
				1952	} else {
				1953	/* Different approach, try not to defer past a single
				1954	* ACK. Receiver should ACK every other full sized
				1955	* frame, so if we have space for more than 3 frames
				1956	* then send now.
				1957	*/
				1958	if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache)
				1959	goto send_now;
				1960	}
				1961
				1962	head = tcp_write_queue_head(sk);
				1963
				1964	age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp);
				1965	/* If next ACK is likely to come too late (half srtt), do not defer */
				1966	if (age < (tp->srtt_us >> 4))
				1967	goto send_now;
				1968
				1969	/* Ok, it looks like it is advisable to defer.
				1970	* Three cases are tracked :
				1971	* 1) We are cwnd-limited
				1972	* 2) We are rwnd-limited
				1973	* 3) We are application limited.
				1974	*/
				1975	if (cong_win < send_win) {
				1976	if (cong_win <= skb->len) {
				1977	*is_cwnd_limited = true;
				1978	return true;
				1979	}
				1980	} else {
				1981	if (send_win <= skb->len) {
				1982	*is_rwnd_limited = true;
				1983	return true;
				1984	}
				1985	}
				1986
				1987	/* If this packet won't get more data, do not wait. */
				1988	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
				1989	goto send_now;
				1990
				1991	return true;
				1992
				1993	send_now:
				1994	return false;
				1995	}
				1996
				1997	static inline void tcp_mtu_check_reprobe(struct sock *sk)
				1998	{
				1999	struct inet_connection_sock *icsk = inet_csk(sk);
				2000	struct tcp_sock *tp = tcp_sk(sk);
				2001	struct net *net = sock_net(sk);
				2002	u32 interval;
				2003	s32 delta;
				2004
				2005	interval = net->ipv4.sysctl_tcp_probe_interval;
				2006	delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp;
				2007	if (unlikely(delta >= interval * HZ)) {
				2008	int mss = tcp_current_mss(sk);
				2009
				2010	/* Update current search range */
				2011	icsk->icsk_mtup.probe_size = 0;
				2012	icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
				2013	sizeof(struct tcphdr) +
				2014	icsk->icsk_af_ops->net_header_len;
				2015	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
				2016
				2017	/* Update probe time stamp */
				2018	icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
				2019	}
				2020	}
				2021
				2022	static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
				2023	{
				2024	struct sk_buff skb, next;
				2025
				2026	skb = tcp_send_head(sk);
				2027	tcp_for_write_queue_from_safe(skb, next, sk) {
				2028	if (len <= skb->len)
				2029	break;
				2030
				2031	if (unlikely(TCP_SKB_CB(skb)->eor) \|\| tcp_has_tx_tstamp(skb))
				2032	return false;
				2033
				2034	len -= skb->len;
				2035	}
				2036
				2037	return true;
				2038	}
				2039
				2040	/* Create a new MTU probe if we are ready.
				2041	* MTU probe is regularly attempting to increase the path MTU by
				2042	* deliberately sending larger packets. This discovers routing
				2043	* changes resulting in larger path MTUs.
				2044	*
				2045	* Returns 0 if we should wait to probe (no cwnd available),
				2046	* 1 if a probe was sent,
				2047	* -1 otherwise
				2048	*/
				2049	static int tcp_mtu_probe(struct sock *sk)
				2050	{
				2051	struct inet_connection_sock *icsk = inet_csk(sk);
				2052	struct tcp_sock *tp = tcp_sk(sk);
				2053	struct sk_buff skb, nskb, *next;
				2054	struct net *net = sock_net(sk);
				2055	int probe_size;
				2056	int size_needed;
				2057	int copy, len;
				2058	int mss_now;
				2059	int interval;
				2060
				2061	/* Not currently probing/verifying,
				2062	* not in recovery,
				2063	* have enough cwnd, and
				2064	* not SACKing (the variable headers throw things off)
				2065	*/
				2066	if (likely(!icsk->icsk_mtup.enabled \|\|
				2067	icsk->icsk_mtup.probe_size \|\|
				2068	inet_csk(sk)->icsk_ca_state != TCP_CA_Open \|\|
				2069	tp->snd_cwnd < 11 \|\|
				2070	tp->rx_opt.num_sacks \|\| tp->rx_opt.dsack))
				2071	return -1;
				2072
				2073	/* Use binary search for probe_size between tcp_mss_base,
				2074	* and current mss_clamp. if (search_high - search_low)
				2075	* smaller than a threshold, backoff from probing.
				2076	*/
				2077	mss_now = tcp_current_mss(sk);
				2078	probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
				2079	icsk->icsk_mtup.search_low) >> 1);
				2080	size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
				2081	interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
				2082	/* When misfortune happens, we are reprobing actively,
				2083	* and then reprobe timer has expired. We stick with current
				2084	* probing process by not resetting search range to its orignal.
				2085	*/
				2086	if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) \|\|
				2087	interval < net->ipv4.sysctl_tcp_probe_threshold) {
				2088	/* Check whether enough time has elaplased for
				2089	* another round of probing.
				2090	*/
				2091	tcp_mtu_check_reprobe(sk);
				2092	return -1;
				2093	}
				2094
				2095	/* Have enough data in the send queue to probe? */
				2096	if (tp->write_seq - tp->snd_nxt < size_needed)
				2097	return -1;
				2098
				2099	if (tp->snd_wnd < size_needed)
				2100	return -1;
				2101	if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
				2102	return 0;
				2103
				2104	/* Do we need to wait to drain cwnd? With none in flight, don't stall */
				2105	if (tcp_packets_in_flight(tp) + 2 > tp->snd_cwnd) {
				2106	if (!tcp_packets_in_flight(tp))
				2107	return -1;
				2108	else
				2109	return 0;
				2110	}
				2111
				2112	if (!tcp_can_coalesce_send_queue_head(sk, probe_size))
				2113	return -1;
				2114
				2115	/* We're allowed to probe. Build it now. */
				2116	nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false);
				2117	if (!nskb)
				2118	return -1;
				2119	sk->sk_wmem_queued += nskb->truesize;
				2120	sk_mem_charge(sk, nskb->truesize);
				2121
				2122	skb = tcp_send_head(sk);
				2123
				2124	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
				2125	TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
				2126	TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK;
				2127	TCP_SKB_CB(nskb)->sacked = 0;
				2128	nskb->csum = 0;
				2129	nskb->ip_summed = skb->ip_summed;
				2130
				2131	tcp_insert_write_queue_before(nskb, skb, sk);
				2132	tcp_highest_sack_replace(sk, skb, nskb);
				2133
				2134	len = 0;
				2135	tcp_for_write_queue_from_safe(skb, next, sk) {
				2136	copy = min_t(int, skb->len, probe_size - len);
				2137	if (nskb->ip_summed) {
				2138	skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
				2139	} else {
				2140	__wsum csum = skb_copy_and_csum_bits(skb, 0,
				2141	skb_put(nskb, copy),
				2142	copy, 0);
				2143	nskb->csum = csum_block_add(nskb->csum, csum, len);
				2144	}
				2145
				2146	if (skb->len <= copy) {
				2147	/* We've eaten all the data from this skb.
				2148	* Throw it away. */
				2149	TCP_SKB_CB(nskb)->tcp_flags \|= TCP_SKB_CB(skb)->tcp_flags;
				2150	/* If this is the last SKB we copy and eor is set
				2151	* we need to propagate it to the new skb.
				2152	*/
				2153	TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor;
				2154	tcp_skb_collapse_tstamp(nskb, skb);
				2155	tcp_unlink_write_queue(skb, sk);
				2156	sk_wmem_free_skb(sk, skb);
				2157	} else {
				2158	TCP_SKB_CB(nskb)->tcp_flags \|= TCP_SKB_CB(skb)->tcp_flags &
				2159	~(TCPHDR_FIN\|TCPHDR_PSH);
				2160	if (!skb_shinfo(skb)->nr_frags) {
				2161	skb_pull(skb, copy);
				2162	if (skb->ip_summed != CHECKSUM_PARTIAL)
				2163	skb->csum = csum_partial(skb->data,
				2164	skb->len, 0);
				2165	} else {
				2166	__pskb_trim_head(skb, copy);
				2167	tcp_set_skb_tso_segs(skb, mss_now);
				2168	}
				2169	TCP_SKB_CB(skb)->seq += copy;
				2170	}
				2171
				2172	len += copy;
				2173
				2174	if (len >= probe_size)
				2175	break;
				2176	}
				2177	tcp_init_tso_segs(nskb, nskb->len);
				2178
				2179	/* We're ready to send. If this fails, the probe will
				2180	* be resegmented into mss-sized pieces by tcp_write_xmit().
				2181	*/
				2182	if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
				2183	/* Decrement cwnd here because we are sending
				2184	* effectively two packets. */
				2185	tp->snd_cwnd--;
				2186	tcp_event_new_data_sent(sk, nskb);
				2187
				2188	icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
				2189	tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
				2190	tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
				2191
				2192	return 1;
				2193	}
				2194
				2195	return -1;
				2196	}
				2197
				2198	static bool tcp_pacing_check(const struct sock *sk)
				2199	{
				2200	return tcp_needs_internal_pacing(sk) &&
				2201	hrtimer_active(&tcp_sk(sk)->pacing_timer);
				2202	}
				2203
				2204	/* TCP Small Queues :
				2205	* Control number of packets in qdisc/devices to two packets / or ~1 ms.
				2206	* (These limits are doubled for retransmits)
				2207	* This allows for :
				2208	* - better RTT estimation and ACK scheduling
				2209	* - faster recovery
				2210	* - high rates
				2211	* Alas, some drivers / subsystems require a fair amount
				2212	* of queued bytes to ensure line rate.
				2213	* One example is wifi aggregation (802.11 AMPDU)
				2214	*/
				2215	static bool tcp_small_queue_check(struct sock sk, const struct sk_buff skb,
				2216	unsigned int factor)
				2217	{
				2218	unsigned int limit;
				2219
				2220	limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10);
				2221	limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes);
				2222	limit <<= factor;
				2223
				2224	if (refcount_read(&sk->sk_wmem_alloc) > limit) {
				2225	/* Always send the 1st or 2nd skb in write queue.
				2226	* No need to wait for TX completion to call us back,
				2227	* after softirq/tasklet schedule.
				2228	* This helps when TX completions are delayed too much.
				2229	*/
				2230	if (skb == sk->sk_write_queue.next \|\|
				2231	skb->prev == sk->sk_write_queue.next)
				2232	return false;
				2233
				2234	set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
				2235	/* It is possible TX completion already happened
				2236	* before we set TSQ_THROTTLED, so we must
				2237	* test again the condition.
				2238	*/
				2239	smp_mb__after_atomic();
				2240	if (refcount_read(&sk->sk_wmem_alloc) > limit)
				2241	return true;
				2242	}
				2243	return false;
				2244	}
				2245
				2246	static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new)
				2247	{
				2248	const u32 now = tcp_jiffies32;
				2249	enum tcp_chrono old = tp->chrono_type;
				2250
				2251	if (old > TCP_CHRONO_UNSPEC)
				2252	tp->chrono_stat[old - 1] += now - tp->chrono_start;
				2253	tp->chrono_start = now;
				2254	tp->chrono_type = new;
				2255	}
				2256
				2257	void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type)
				2258	{
				2259	struct tcp_sock *tp = tcp_sk(sk);
				2260
				2261	/* If there are multiple conditions worthy of tracking in a
				2262	* chronograph then the highest priority enum takes precedence
				2263	* over the other conditions. So that if something "more interesting"
				2264	* starts happening, stop the previous chrono and start a new one.
				2265	*/
				2266	if (type > tp->chrono_type)
				2267	tcp_chrono_set(tp, type);
				2268	}
				2269
				2270	void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
				2271	{
				2272	struct tcp_sock *tp = tcp_sk(sk);
				2273
				2274
				2275	/* There are multiple conditions worthy of tracking in a
				2276	* chronograph, so that the highest priority enum takes
				2277	* precedence over the other conditions (see tcp_chrono_start).
				2278	* If a condition stops, we only stop chrono tracking if
				2279	* it's the "most interesting" or current chrono we are
				2280	* tracking and starts busy chrono if we have pending data.
				2281	*/
				2282	if (tcp_write_queue_empty(sk))
				2283	tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
				2284	else if (type == tp->chrono_type)
				2285	tcp_chrono_set(tp, TCP_CHRONO_BUSY);
				2286	}
				2287
				2288	/* This routine writes packets to the network. It advances the
				2289	* send_head. This happens as incoming acks open up the remote
				2290	* window for us.
				2291	*
				2292	* LARGESEND note: !tcp_urg_mode is overkill, only frames between
				2293	* snd_up-64k-mss .. snd_up cannot be large. However, taking into
				2294	* account rare use of URG, this is not a big flaw.
				2295	*
				2296	* Send at most one packet when push_one > 0. Temporarily ignore
				2297	* cwnd limit to force at most one packet out when push_one == 2.
				2298
				2299	* Returns true, if no segments are in flight and we have queued segments,
				2300	* but cannot send anything now because of SWS or another problem.
				2301	*/
				2302	static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
				2303	int push_one, gfp_t gfp)
				2304	{
				2305	struct tcp_sock *tp = tcp_sk(sk);
				2306	struct sk_buff *skb;
				2307	unsigned int tso_segs, sent_pkts;
				2308	int cwnd_quota;
				2309	int result;
				2310	bool is_cwnd_limited = false, is_rwnd_limited = false;
				2311	u32 max_segs;
				2312
				2313	sent_pkts = 0;
				2314
				2315	tcp_mstamp_refresh(tp);
				2316	if (!push_one) {
				2317	/* Do MTU probing. */
				2318	result = tcp_mtu_probe(sk);
				2319	if (!result) {
				2320	return false;
				2321	} else if (result > 0) {
				2322	sent_pkts = 1;
				2323	}
				2324	}
				2325
				2326	max_segs = tcp_tso_segs(sk, mss_now);
				2327	while ((skb = tcp_send_head(sk))) {
				2328	unsigned int limit;
				2329
				2330	if (tcp_pacing_check(sk))
				2331	break;
				2332
				2333	tso_segs = tcp_init_tso_segs(skb, mss_now);
				2334	BUG_ON(!tso_segs);
				2335
				2336	if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
				2337	/* "skb_mstamp" is used as a start point for the retransmit timer */
				2338	skb->skb_mstamp = tp->tcp_mstamp;
				2339	goto repair; /* Skip network transmission */
				2340	}
				2341
				2342	cwnd_quota = tcp_cwnd_test(tp, skb);
				2343	if (!cwnd_quota) {
				2344	if (push_one == 2)
				2345	/* Force out a loss probe pkt. */
				2346	cwnd_quota = 1;
				2347	else
				2348	break;
				2349	}
				2350
				2351	if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
				2352	is_rwnd_limited = true;
				2353	break;
				2354	}
				2355
				2356	if (tso_segs == 1) {
				2357	if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
				2358	(tcp_skb_is_last(sk, skb) ?
				2359	nonagle : TCP_NAGLE_PUSH))))
				2360	break;
				2361	} else {
				2362	if (!push_one &&
				2363	tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
				2364	&is_rwnd_limited, max_segs))
				2365	break;
				2366	}
				2367
				2368	limit = mss_now;
				2369	if (tso_segs > 1 && !tcp_urg_mode(tp))
				2370	limit = tcp_mss_split_point(sk, skb, mss_now,
				2371	min_t(unsigned int,
				2372	cwnd_quota,
				2373	max_segs),
				2374	nonagle);
				2375
				2376	if (skb->len > limit &&
				2377	unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
				2378	break;
				2379
				2380	if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
				2381	clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
				2382	if (tcp_small_queue_check(sk, skb, 0))
				2383	break;
				2384
				2385	/* Argh, we hit an empty skb(), presumably a thread
				2386	* is sleeping in sendmsg()/sk_stream_wait_memory().
				2387	* We do not want to send a pure-ack packet and have
				2388	* a strange looking rtx queue with empty packet(s).
				2389	*/
				2390	if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq)
				2391	break;
				2392
				2393	if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
				2394	break;
				2395
				2396	repair:
				2397	/* Advance the send_head. This one is sent out.
				2398	* This call will increment packets_out.
				2399	*/
				2400	tcp_event_new_data_sent(sk, skb);
				2401
				2402	tcp_minshall_update(tp, mss_now, skb);
				2403	sent_pkts += tcp_skb_pcount(skb);
				2404
				2405	if (push_one)
				2406	break;
				2407	}
				2408
				2409	if (is_rwnd_limited)
				2410	tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED);
				2411	else
				2412	tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED);
				2413
				2414	if (likely(sent_pkts)) {
				2415	if (tcp_in_cwnd_reduction(sk))
				2416	tp->prr_out += sent_pkts;
				2417
				2418	/* Send one loss probe per tail loss episode. */
				2419	if (push_one != 2)
				2420	tcp_schedule_loss_probe(sk, false);
				2421	is_cwnd_limited \|= (tcp_packets_in_flight(tp) >= tp->snd_cwnd);
				2422	tcp_cwnd_validate(sk, is_cwnd_limited);
				2423	return false;
				2424	}
				2425	return !tp->packets_out && tcp_send_head(sk);
				2426	}
				2427
				2428	bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
				2429	{
				2430	struct inet_connection_sock *icsk = inet_csk(sk);
				2431	struct tcp_sock *tp = tcp_sk(sk);
				2432	u32 timeout, rto_delta_us;
				2433
				2434	/* Don't do any loss probe on a Fast Open connection before 3WHS
				2435	* finishes.
				2436	*/
				2437	if (tp->fastopen_rsk)
				2438	return false;
				2439
				2440	/* Schedule a loss probe in 2*RTT for SACK capable connections
				2441	* in Open state, that are either limited by cwnd or application.
				2442	*/
				2443	if ((sysctl_tcp_early_retrans != 3 && sysctl_tcp_early_retrans != 4) \|\|
				2444	!tp->packets_out \|\| !tcp_is_sack(tp) \|\|
				2445	icsk->icsk_ca_state != TCP_CA_Open)
				2446	return false;
				2447
				2448	if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
				2449	tcp_send_head(sk))
				2450	return false;
				2451
				2452	/* Probe timeout is 2*rtt. Add minimum RTO to account
				2453	* for delayed ack when there's one outstanding packet. If no RTT
				2454	* sample is available then probe after TCP_TIMEOUT_INIT.
				2455	*/
				2456	if (tp->srtt_us) {
				2457	timeout = usecs_to_jiffies(tp->srtt_us >> 2);
				2458	if (tp->packets_out == 1)
				2459	timeout += TCP_RTO_MIN;
				2460	else
				2461	timeout += TCP_TIMEOUT_MIN;
				2462	} else {
				2463	timeout = TCP_TIMEOUT_INIT;
				2464	}
				2465
				2466	/* If the RTO formula yields an earlier time, then use that time. */
				2467	rto_delta_us = advancing_rto ?
				2468	jiffies_to_usecs(inet_csk(sk)->icsk_rto) :
				2469	tcp_rto_delta_us(sk); /* How far in future is RTO? */
				2470	if (rto_delta_us > 0)
				2471	timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
				2472
				2473	inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
				2474	TCP_RTO_MAX);
				2475	return true;
				2476	}
				2477
				2478	/* Thanks to skb fast clones, we can detect if a prior transmit of
				2479	* a packet is still in a qdisc or driver queue.
				2480	* In this case, there is very little point doing a retransmit !
				2481	*/
				2482	static bool skb_still_in_host_queue(const struct sock *sk,
				2483	const struct sk_buff *skb)
				2484	{
				2485	if (unlikely(skb_fclone_busy(sk, skb))) {
				2486	NET_INC_STATS(sock_net(sk),
				2487	LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
				2488	return true;
				2489	}
				2490	return false;
				2491	}
				2492
				2493	/* When probe timeout (PTO) fires, try send a new segment if possible, else
				2494	* retransmit the last segment.
				2495	*/
				2496	void tcp_send_loss_probe(struct sock *sk)
				2497	{
				2498	struct tcp_sock *tp = tcp_sk(sk);
				2499	struct sk_buff *skb;
				2500	int pcount;
				2501	int mss = tcp_current_mss(sk);
				2502
				2503	/* At most one outstanding TLP */
				2504	if (tp->tlp_high_seq)
				2505	goto rearm_timer;
				2506
				2507	tp->tlp_retrans = 0;
				2508	skb = tcp_send_head(sk);
				2509	if (skb) {
				2510	if (tcp_snd_wnd_test(tp, skb, mss)) {
				2511	pcount = tp->packets_out;
				2512	tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
				2513	if (tp->packets_out > pcount)
				2514	goto probe_sent;
				2515	goto rearm_timer;
				2516	}
				2517	skb = tcp_write_queue_prev(sk, skb);
				2518	} else {
				2519	skb = tcp_write_queue_tail(sk);
				2520	}
				2521
				2522	if (unlikely(!skb)) {
				2523	WARN_ONCE(tp->packets_out,
				2524	"invalid inflight: %u state %u cwnd %u mss %d\n",
				2525	tp->packets_out, sk->sk_state, tp->snd_cwnd, mss);
				2526	inet_csk(sk)->icsk_pending = 0;
				2527	return;
				2528	}
				2529
				2530	if (skb_still_in_host_queue(sk, skb))
				2531	goto rearm_timer;
				2532
				2533	pcount = tcp_skb_pcount(skb);
				2534	if (WARN_ON(!pcount))
				2535	goto rearm_timer;
				2536
				2537	if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
				2538	if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss,
				2539	GFP_ATOMIC)))
				2540	goto rearm_timer;
				2541	skb = tcp_write_queue_next(sk, skb);
				2542	}
				2543
				2544	if (WARN_ON(!skb \|\| !tcp_skb_pcount(skb)))
				2545	goto rearm_timer;
				2546
				2547	if (__tcp_retransmit_skb(sk, skb, 1))
				2548	goto rearm_timer;
				2549
				2550	tp->tlp_retrans = 1;
				2551
				2552	probe_sent:
				2553	/* Record snd_nxt for loss detection. */
				2554	tp->tlp_high_seq = tp->snd_nxt;
				2555
				2556	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
				2557	/* Reset s.t. tcp_rearm_rto will restart timer from now */
				2558	inet_csk(sk)->icsk_pending = 0;
				2559	rearm_timer:
				2560	tcp_rearm_rto(sk);
				2561	}
				2562
				2563	/* Push out any pending frames which were held back due to
				2564	* TCP_CORK or attempt at coalescing tiny packets.
				2565	* The socket must be locked by the caller.
				2566	*/
				2567	void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
				2568	int nonagle)
				2569	{
				2570	/* If we are closed, the bytes will have to remain here.
				2571	* In time closedown will finish, we empty the write queue and
				2572	* all will be happy.
				2573	*/
				2574	if (unlikely(sk->sk_state == TCP_CLOSE))
				2575	return;
				2576
				2577	if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
				2578	sk_gfp_mask(sk, GFP_ATOMIC)))
				2579	tcp_check_probe_timer(sk);
				2580	}
				2581
				2582	/* Send _single_ skb sitting at the send head. This function requires
				2583	* true push pending frames to setup probe timer etc.
				2584	*/
				2585	void tcp_push_one(struct sock *sk, unsigned int mss_now)
				2586	{
				2587	struct sk_buff *skb = tcp_send_head(sk);
				2588
				2589	BUG_ON(!skb \|\| skb->len < mss_now);
				2590
				2591	tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
				2592	}
				2593
				2594	/* This function returns the amount that we can raise the
				2595	* usable window based on the following constraints
				2596	*
				2597	* 1. The window can never be shrunk once it is offered (RFC 793)
				2598	* 2. We limit memory per socket
				2599	*
				2600	* RFC 1122:
				2601	* "the suggested [SWS] avoidance algorithm for the receiver is to keep
				2602	* RECV.NEXT + RCV.WIN fixed until:
				2603	* RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
				2604	*
				2605	* i.e. don't raise the right edge of the window until you can raise
				2606	* it at least MSS bytes.
				2607	*
				2608	* Unfortunately, the recommended algorithm breaks header prediction,
				2609	* since header prediction assumes th->window stays fixed.
				2610	*
				2611	* Strictly speaking, keeping th->window fixed violates the receiver
				2612	* side SWS prevention criteria. The problem is that under this rule
				2613	* a stream of single byte packets will cause the right side of the
				2614	* window to always advance by a single byte.
				2615	*
				2616	* Of course, if the sender implements sender side SWS prevention
				2617	* then this will not be a problem.
				2618	*
				2619	* BSD seems to make the following compromise:
				2620	*
				2621	* If the free space is less than the 1/4 of the maximum
				2622	* space available and the free space is less than 1/2 mss,
				2623	* then set the window to 0.
				2624	* [ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
				2625	* Otherwise, just prevent the window from shrinking
				2626	* and from being larger than the largest representable value.
				2627	*
				2628	* This prevents incremental opening of the window in the regime
				2629	* where TCP is limited by the speed of the reader side taking
				2630	* data out of the TCP receive queue. It does nothing about
				2631	* those cases where the window is constrained on the sender side
				2632	* because the pipeline is full.
				2633	*
				2634	* BSD also seems to "accidentally" limit itself to windows that are a
				2635	* multiple of MSS, at least until the free space gets quite small.
				2636	* This would appear to be a side effect of the mbuf implementation.
				2637	* Combining these two algorithms results in the observed behavior
				2638	* of having a fixed window size at almost all times.
				2639	*
				2640	* Below we obtain similar behavior by forcing the offered window to
				2641	* a multiple of the mss when it is feasible to do so.
				2642	*
				2643	* Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
				2644	* Regular options like TIMESTAMP are taken into account.
				2645	*/
				2646	u32 __tcp_select_window(struct sock *sk)
				2647	{
				2648	struct inet_connection_sock *icsk = inet_csk(sk);
				2649	struct tcp_sock *tp = tcp_sk(sk);
				2650	/* MSS for the peer's data. Previous versions used mss_clamp
				2651	* here. I don't know if the value based on our guesses
				2652	* of peer's MSS is better for the performance. It's more correct
				2653	* but may be worse for the performance because of rcv_mss
				2654	* fluctuations. --SAW 1998/11/1
				2655	*/
				2656	int mss = icsk->icsk_ack.rcv_mss;
				2657	int free_space = tcp_space(sk);
				2658	int allowed_space = tcp_full_space(sk);
				2659	int full_space = min_t(int, tp->window_clamp, allowed_space);
				2660	int window;
				2661
				2662	if (unlikely(mss > full_space)) {
				2663	mss = full_space;
				2664	if (mss <= 0)
				2665	return 0;
				2666	}
				2667	if (free_space < (full_space >> 1)) {
				2668	icsk->icsk_ack.quick = 0;
				2669
				2670	if (tcp_under_memory_pressure(sk))
				2671	tp->rcv_ssthresh = min(tp->rcv_ssthresh,
				2672	4U * tp->advmss);
				2673
				2674	/* free_space might become our new window, make sure we don't
				2675	* increase it due to wscale.
				2676	*/
				2677	free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);
				2678
				2679	/* if free space is less than mss estimate, or is below 1/16th
				2680	* of the maximum allowed, try to move to zero-window, else
				2681	* tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and
				2682	* new incoming data is dropped due to memory limits.
				2683	* With large window, mss test triggers way too late in order
				2684	* to announce zero window in time before rmem limit kicks in.
				2685	*/
				2686	if (free_space < (allowed_space >> 4) \|\| free_space < mss)
				2687	return 0;
				2688	}
				2689
				2690	if (free_space > tp->rcv_ssthresh)
				2691	free_space = tp->rcv_ssthresh;
				2692
				2693	/* Don't do rounding if we are using window scaling, since the
				2694	* scaled window will not line up with the MSS boundary anyway.
				2695	*/
				2696	if (tp->rx_opt.rcv_wscale) {
				2697	window = free_space;
				2698
				2699	/* Advertise enough space so that it won't get scaled away.
				2700	* Import case: prevent zero window announcement if
				2701	* 1<<rcv_wscale > mss.
				2702	*/
				2703	window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale));
				2704	} else {
				2705	window = tp->rcv_wnd;
				2706	/* Get the largest window that is a nice multiple of mss.
				2707	* Window clamp already applied above.
				2708	* If our current window offering is within 1 mss of the
				2709	* free space we just keep it. This prevents the divide
				2710	* and multiply from happening most of the time.
				2711	* We also don't do any window rounding when the free space
				2712	* is too small.
				2713	*/
				2714	if (window <= free_space - mss \|\| window > free_space)
				2715	window = rounddown(free_space, mss);
				2716	else if (mss == full_space &&
				2717	free_space > window + (full_space >> 1))
				2718	window = free_space;
				2719	}
				2720
				2721	return window;
				2722	}
				2723
				2724	void tcp_skb_collapse_tstamp(struct sk_buff *skb,
				2725	const struct sk_buff *next_skb)
				2726	{
				2727	if (unlikely(tcp_has_tx_tstamp(next_skb))) {
				2728	const struct skb_shared_info *next_shinfo =
				2729	skb_shinfo(next_skb);
				2730	struct skb_shared_info *shinfo = skb_shinfo(skb);
				2731
				2732	shinfo->tx_flags \|= next_shinfo->tx_flags & SKBTX_ANY_TSTAMP;
				2733	shinfo->tskey = next_shinfo->tskey;
				2734	TCP_SKB_CB(skb)->txstamp_ack \|=
				2735	TCP_SKB_CB(next_skb)->txstamp_ack;
				2736	}
				2737	}
				2738
				2739	/* Collapses two adjacent SKB's during retransmission. */
				2740	static bool tcp_collapse_retrans(struct sock sk, struct sk_buff skb)
				2741	{
				2742	struct tcp_sock *tp = tcp_sk(sk);
				2743	struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
				2744	int skb_size, next_skb_size;
				2745
				2746	skb_size = skb->len;
				2747	next_skb_size = next_skb->len;
				2748
				2749	BUG_ON(tcp_skb_pcount(skb) != 1 \|\| tcp_skb_pcount(next_skb) != 1);
				2750
				2751	if (next_skb_size) {
				2752	if (next_skb_size <= skb_availroom(skb))
				2753	skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size),
				2754	next_skb_size);
				2755	else if (!tcp_skb_shift(skb, next_skb, 1, next_skb_size))
				2756	return false;
				2757	}
				2758	tcp_highest_sack_replace(sk, next_skb, skb);
				2759
				2760	tcp_unlink_write_queue(next_skb, sk);
				2761
				2762	if (next_skb->ip_summed == CHECKSUM_PARTIAL)
				2763	skb->ip_summed = CHECKSUM_PARTIAL;
				2764
				2765	if (skb->ip_summed != CHECKSUM_PARTIAL)
				2766	skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);
				2767
				2768	/* Update sequence range on original skb. */
				2769	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
				2770
				2771	/* Merge over control information. This moves PSH/FIN etc. over */
				2772	TCP_SKB_CB(skb)->tcp_flags \|= TCP_SKB_CB(next_skb)->tcp_flags;
				2773
				2774	/* All done, get rid of second SKB and account for it so
				2775	* packet counting does not break.
				2776	*/
				2777	TCP_SKB_CB(skb)->sacked \|= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
				2778	TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor;
				2779
				2780	/* changed transmit queue under us so clear hints */
				2781	tcp_clear_retrans_hints_partial(tp);
				2782	if (next_skb == tp->retransmit_skb_hint)
				2783	tp->retransmit_skb_hint = skb;
				2784
				2785	tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb));
				2786
				2787	tcp_skb_collapse_tstamp(skb, next_skb);
				2788
				2789	sk_wmem_free_skb(sk, next_skb);
				2790	return true;
				2791	}
				2792
				2793	/* Check if coalescing SKBs is legal. */
				2794	static bool tcp_can_collapse(const struct sock sk, const struct sk_buff skb)
				2795	{
				2796	if (tcp_skb_pcount(skb) > 1)
				2797	return false;
				2798	if (skb_cloned(skb))
				2799	return false;
				2800	if (skb == tcp_send_head(sk))
				2801	return false;
				2802	/* Some heuristics for collapsing over SACK'd could be invented */
				2803	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
				2804	return false;
				2805
				2806	return true;
				2807	}
				2808
				2809	/* Collapse packets in the retransmit queue to make to create
				2810	* less packets on the wire. This is only done on retransmission.
				2811	*/
				2812	static void tcp_retrans_try_collapse(struct sock sk, struct sk_buff to,
				2813	int space)
				2814	{
				2815	struct tcp_sock *tp = tcp_sk(sk);
				2816	struct sk_buff skb = to, tmp;
				2817	bool first = true;
				2818
				2819	if (!sysctl_tcp_retrans_collapse)
				2820	return;
				2821	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
				2822	return;
				2823
				2824	tcp_for_write_queue_from_safe(skb, tmp, sk) {
				2825	if (!tcp_can_collapse(sk, skb))
				2826	break;
				2827
				2828	if (!tcp_skb_can_collapse_to(to))
				2829	break;
				2830
				2831	space -= skb->len;
				2832
				2833	if (first) {
				2834	first = false;
				2835	continue;
				2836	}
				2837
				2838	if (space < 0)
				2839	break;
				2840
				2841	if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
				2842	break;
				2843
				2844	if (!tcp_collapse_retrans(sk, to))
				2845	break;
				2846	}
				2847	}
				2848
				2849	/* This retransmits one SKB. Policy decisions and retransmit queue
				2850	* state updates are done by the caller. Returns non-zero if an
				2851	* error occurred which prevented the send.
				2852	*/
				2853	int __tcp_retransmit_skb(struct sock sk, struct sk_buff skb, int segs)
				2854	{
				2855	struct inet_connection_sock *icsk = inet_csk(sk);
				2856	struct tcp_sock *tp = tcp_sk(sk);
				2857	unsigned int cur_mss;
				2858	int diff, len, err;
				2859
				2860
				2861	/* Inconclusive MTU probe */
				2862	if (icsk->icsk_mtup.probe_size)
				2863	icsk->icsk_mtup.probe_size = 0;
				2864
				2865	/* Do not sent more than we queued. 1/4 is reserved for possible
				2866	* copying overhead: fragmentation, tunneling, mangling etc.
				2867	*/
				2868	if (refcount_read(&sk->sk_wmem_alloc) >
				2869	min_t(u32, sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2),
				2870	sk->sk_sndbuf))
				2871	return -EAGAIN;
				2872
				2873	if (skb_still_in_host_queue(sk, skb))
				2874	return -EBUSY;
				2875
				2876	if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
				2877	if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) {
				2878	WARN_ON_ONCE(1);
				2879	return -EINVAL;
				2880	}
				2881	if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
				2882	return -ENOMEM;
				2883	}
				2884
				2885	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
				2886	return -EHOSTUNREACH; /* Routing failure or similar. */
				2887
				2888	cur_mss = tcp_current_mss(sk);
				2889
				2890	/* If receiver has shrunk his window, and skb is out of
				2891	* new window, do not retransmit it. The exception is the
				2892	* case, when window is shrunk to zero. In this case
				2893	* our retransmit serves as a zero window probe.
				2894	*/
				2895	if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
				2896	TCP_SKB_CB(skb)->seq != tp->snd_una)
				2897	return -EAGAIN;
				2898
				2899	len = cur_mss * segs;
				2900	if (skb->len > len) {
				2901	if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC))
				2902	return -ENOMEM; /* We'll try again later. */
				2903	} else {
				2904	if (skb_unclone(skb, GFP_ATOMIC))
				2905	return -ENOMEM;
				2906
				2907	diff = tcp_skb_pcount(skb);
				2908	tcp_set_skb_tso_segs(skb, cur_mss);
				2909	diff -= tcp_skb_pcount(skb);
				2910	if (diff)
				2911	tcp_adjust_pcount(sk, skb, diff);
				2912	if (skb->len < cur_mss)
				2913	tcp_retrans_try_collapse(sk, skb, cur_mss);
				2914	}
				2915
				2916	/* RFC3168, section 6.1.1.1. ECN fallback */
				2917	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
				2918	tcp_ecn_clear_syn(sk, skb);
				2919
				2920	/* Update global and local TCP statistics. */
				2921	segs = tcp_skb_pcount(skb);
				2922	TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
				2923	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
				2924	__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
				2925	tp->total_retrans += segs;
				2926
				2927	/* make sure skb->data is aligned on arches that require it
				2928	* and check if ack-trimming & collapsing extended the headroom
				2929	* beyond what csum_start can cover.
				2930	*/
				2931	if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) \|\|
				2932	skb_headroom(skb) >= 0xFFFF)) {
				2933	struct sk_buff *nskb;
				2934
				2935	nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
				2936	err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
				2937	-ENOBUFS;
				2938	if (!err) {
				2939	skb->skb_mstamp = tp->tcp_mstamp;
				2940	tcp_rate_skb_sent(sk, skb);
				2941	}
				2942	} else {
				2943	err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
				2944	}
				2945
				2946	if (likely(!err)) {
				2947	TCP_SKB_CB(skb)->sacked \|= TCPCB_EVER_RETRANS;
				2948	} else if (err != -EBUSY) {
				2949	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
				2950	}
				2951	return err;
				2952	}
				2953
				2954	int tcp_retransmit_skb(struct sock sk, struct sk_buff skb, int segs)
				2955	{
				2956	struct tcp_sock *tp = tcp_sk(sk);
				2957	int err = __tcp_retransmit_skb(sk, skb, segs);
				2958
				2959	if (err == 0) {
				2960	#if FASTRETRANS_DEBUG > 0
				2961	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
				2962	net_dbg_ratelimited("retrans_out leaked\n");
				2963	}
				2964	#endif
				2965	TCP_SKB_CB(skb)->sacked \|= TCPCB_RETRANS;
				2966	tp->retrans_out += tcp_skb_pcount(skb);
				2967
				2968	/* Save stamp of the first retransmit. */
				2969	if (!tp->retrans_stamp)
				2970	tp->retrans_stamp = tcp_skb_timestamp(skb);
				2971
				2972	}
				2973
				2974	if (tp->undo_retrans < 0)
				2975	tp->undo_retrans = 0;
				2976	tp->undo_retrans += tcp_skb_pcount(skb);
				2977	return err;
				2978	}
				2979
				2980	/* This gets called after a retransmit timeout, and the initially
				2981	* retransmitted data is acknowledged. It tries to continue
				2982	* resending the rest of the retransmit queue, until either
				2983	* we've sent it all or the congestion window limit is reached.
				2984	* If doing SACK, the first ACK which comes back for a timeout
				2985	* based retransmit packet might feed us FACK information again.
				2986	* If so, we use it to avoid unnecessarily retransmissions.
				2987	*/
				2988	void tcp_xmit_retransmit_queue(struct sock *sk)
				2989	{
				2990	const struct inet_connection_sock *icsk = inet_csk(sk);
				2991	struct tcp_sock *tp = tcp_sk(sk);
				2992	struct sk_buff *skb;
				2993	struct sk_buff *hole = NULL;
				2994	u32 max_segs;
				2995	int mib_idx;
				2996
				2997	if (!tp->packets_out)
				2998	return;
				2999
				3000	if (tp->retransmit_skb_hint) {
				3001	skb = tp->retransmit_skb_hint;
				3002	} else {
				3003	skb = tcp_write_queue_head(sk);
				3004	}
				3005
				3006	max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
				3007	tcp_for_write_queue_from(skb, sk) {
				3008	__u8 sacked;
				3009	int segs;
				3010
				3011	if (skb == tcp_send_head(sk))
				3012	break;
				3013
				3014	if (tcp_pacing_check(sk))
				3015	break;
				3016
				3017	/* we could do better than to assign each time */
				3018	if (!hole)
				3019	tp->retransmit_skb_hint = skb;
				3020
				3021	segs = tp->snd_cwnd - tcp_packets_in_flight(tp);
				3022	if (segs <= 0)
				3023	return;
				3024	sacked = TCP_SKB_CB(skb)->sacked;
				3025	/* In case tcp_shift_skb_data() have aggregated large skbs,
				3026	* we need to make sure not sending too bigs TSO packets
				3027	*/
				3028	segs = min_t(int, segs, max_segs);
				3029
				3030	if (tp->retrans_out >= tp->lost_out) {
				3031	break;
				3032	} else if (!(sacked & TCPCB_LOST)) {
				3033	if (!hole && !(sacked & (TCPCB_SACKED_RETRANS\|TCPCB_SACKED_ACKED)))
				3034	hole = skb;
				3035	continue;
				3036
				3037	} else {
				3038	if (icsk->icsk_ca_state != TCP_CA_Loss)
				3039	mib_idx = LINUX_MIB_TCPFASTRETRANS;
				3040	else
				3041	mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
				3042	}
				3043
				3044	if (sacked & (TCPCB_SACKED_ACKED\|TCPCB_SACKED_RETRANS))
				3045	continue;
				3046
				3047	if (tcp_small_queue_check(sk, skb, 1))
				3048	return;
				3049
				3050	if (tcp_retransmit_skb(sk, skb, segs))
				3051	return;
				3052
				3053	NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb));
				3054
				3055	if (tcp_in_cwnd_reduction(sk))
				3056	tp->prr_out += tcp_skb_pcount(skb);
				3057
				3058	if (skb == tcp_write_queue_head(sk) &&
				3059	icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
				3060	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
				3061	inet_csk(sk)->icsk_rto,
				3062	TCP_RTO_MAX);
				3063	}
				3064	}
				3065
				3066	/* We allow to exceed memory limits for FIN packets to expedite
				3067	* connection tear down and (memory) recovery.
				3068	* Otherwise tcp_send_fin() could be tempted to either delay FIN
				3069	* or even be forced to close flow without any FIN.
				3070	* In general, we want to allow one skb per socket to avoid hangs
				3071	* with edge trigger epoll()
				3072	*/
				3073	void sk_forced_mem_schedule(struct sock *sk, int size)
				3074	{
				3075	int amt;
				3076
				3077	if (size <= sk->sk_forward_alloc)
				3078	return;
				3079	amt = sk_mem_pages(size);
				3080	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
				3081	sk_memory_allocated_add(sk, amt);
				3082
				3083	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
				3084	mem_cgroup_charge_skmem(sk->sk_memcg, amt);
				3085	}
				3086
				3087	/* Send a FIN. The caller locks the socket for us.
				3088	* We should try to send a FIN packet really hard, but eventually give up.
				3089	*/
				3090	void tcp_send_fin(struct sock *sk)
				3091	{
				3092	struct sk_buff skb, tskb = tcp_write_queue_tail(sk);
				3093	struct tcp_sock *tp = tcp_sk(sk);
				3094
				3095	/* Optimization, tack on the FIN if we have one skb in write queue and
				3096	* this skb was not yet sent, or we are under memory pressure.
				3097	* Note: in the latter case, FIN packet will be sent after a timeout,
				3098	* as TCP stack thinks it has already been transmitted.
				3099	*/
				3100	if (tskb && (tcp_send_head(sk) \|\| tcp_under_memory_pressure(sk))) {
				3101	coalesce:
				3102	TCP_SKB_CB(tskb)->tcp_flags \|= TCPHDR_FIN;
				3103	TCP_SKB_CB(tskb)->end_seq++;
				3104	tp->write_seq++;
				3105	if (!tcp_send_head(sk)) {
				3106	/* This means tskb was already sent.
				3107	* Pretend we included the FIN on previous transmit.
				3108	* We need to set tp->snd_nxt to the value it would have
				3109	* if FIN had been sent. This is because retransmit path
				3110	* does not change tp->snd_nxt.
				3111	*/
				3112	tp->snd_nxt++;
				3113	return;
				3114	}
				3115	} else {
				3116	skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
				3117	if (unlikely(!skb)) {
				3118	if (tskb)
				3119	goto coalesce;
				3120	return;
				3121	}
				3122	skb_reserve(skb, MAX_TCP_HEADER);
				3123	sk_forced_mem_schedule(sk, skb->truesize);
				3124	/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
				3125	tcp_init_nondata_skb(skb, tp->write_seq,
				3126	TCPHDR_ACK \| TCPHDR_FIN);
				3127	tcp_queue_skb(sk, skb);
				3128	}
				3129	__tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
				3130	}
				3131
				3132	/* We get here when a process closes a file descriptor (either due to
				3133	* an explicit close() or as a byproduct of exit()'ing) and there
				3134	* was unread data in the receive queue. This behavior is recommended
				3135	* by RFC 2525, section 2.17. -DaveM
				3136	*/
				3137	void tcp_send_active_reset(struct sock *sk, gfp_t priority)
				3138	{
				3139	struct sk_buff *skb;
				3140
				3141	TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
				3142
				3143	/* NOTE: No TCP options attached and we never retransmit this. */
				3144	skb = alloc_skb(MAX_TCP_HEADER, priority);
				3145	if (!skb) {
				3146	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
				3147	return;
				3148	}
				3149
				3150	/* Reserve space for headers and prepare control bits. */
				3151	skb_reserve(skb, MAX_TCP_HEADER);
				3152	tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
				3153	TCPHDR_ACK \| TCPHDR_RST);
				3154	tcp_mstamp_refresh(tcp_sk(sk));
				3155	/* Send it off. */
				3156	if (tcp_transmit_skb(sk, skb, 0, priority))
				3157	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
				3158	}
				3159
				3160	/* Send a crossed SYN-ACK during socket establishment.
				3161	* WARNING: This routine must only be called when we have already sent
				3162	* a SYN packet that crossed the incoming SYN that caused this routine
				3163	* to get called. If this assumption fails then the initial rcv_wnd
				3164	* and rcv_wscale values will not be correct.
				3165	*/
				3166	int tcp_send_synack(struct sock *sk)
				3167	{
				3168	struct sk_buff *skb;
				3169
				3170	skb = tcp_write_queue_head(sk);
				3171	if (!skb \|\| !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
				3172	pr_debug("%s: wrong queue state\n", __func__);
				3173	return -EFAULT;
				3174	}
				3175	if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
				3176	if (skb_cloned(skb)) {
				3177	struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
				3178	if (!nskb)
				3179	return -ENOMEM;
				3180	tcp_unlink_write_queue(skb, sk);
				3181	__skb_header_release(nskb);
				3182	__tcp_add_write_queue_head(sk, nskb);
				3183	sk_wmem_free_skb(sk, skb);
				3184	sk->sk_wmem_queued += nskb->truesize;
				3185	sk_mem_charge(sk, nskb->truesize);
				3186	skb = nskb;
				3187	}
				3188
				3189	TCP_SKB_CB(skb)->tcp_flags \|= TCPHDR_ACK;
				3190	tcp_ecn_send_synack(sk, skb);
				3191	}
				3192	return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
				3193	}
				3194
				3195	/**
				3196	* tcp_make_synack - Prepare a SYN-ACK.
				3197	* sk: listener socket
				3198	* dst: dst entry attached to the SYNACK
				3199	* req: request_sock pointer
				3200	*
				3201	* Allocate one skb and build a SYNACK packet.
				3202	* @dst is consumed : Caller should not use it again.
				3203	*/
				3204	struct sk_buff tcp_make_synack(const struct sock sk, struct dst_entry *dst,
				3205	struct request_sock *req,
				3206	struct tcp_fastopen_cookie *foc,
				3207	enum tcp_synack_type synack_type)
				3208	{
				3209	struct inet_request_sock *ireq = inet_rsk(req);
				3210	const struct tcp_sock *tp = tcp_sk(sk);
				3211	struct tcp_md5sig_key *md5 = NULL;
				3212	struct tcp_out_options opts;
				3213	struct sk_buff *skb;
				3214	int tcp_header_size;
				3215	struct tcphdr *th;
				3216	int mss;
				3217
				3218	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
				3219	if (unlikely(!skb)) {
				3220	dst_release(dst);
				3221	return NULL;
				3222	}
				3223	/* Reserve space for headers. */
				3224	skb_reserve(skb, MAX_TCP_HEADER);
				3225
				3226	switch (synack_type) {
				3227	case TCP_SYNACK_NORMAL:
				3228	skb_set_owner_w(skb, req_to_sk(req));
				3229	break;
				3230	case TCP_SYNACK_COOKIE:
				3231	/* Under synflood, we do not attach skb to a socket,
				3232	* to avoid false sharing.
				3233	*/
				3234	break;
				3235	case TCP_SYNACK_FASTOPEN:
				3236	/* sk is a const pointer, because we want to express multiple
				3237	* cpu might call us concurrently.
				3238	* sk->sk_wmem_alloc in an atomic, we can promote to rw.
				3239	*/
				3240	skb_set_owner_w(skb, (struct sock *)sk);
				3241	break;
				3242	}
				3243	skb_dst_set(skb, dst);
				3244
				3245	mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
				3246
				3247	memset(&opts, 0, sizeof(opts));
				3248	#ifdef CONFIG_SYN_COOKIES
				3249	if (unlikely(req->cookie_ts))
				3250	skb->skb_mstamp = cookie_init_timestamp(req);
				3251	else
				3252	#endif
				3253	skb->skb_mstamp = tcp_clock_us();
				3254
				3255	#ifdef CONFIG_TCP_MD5SIG
				3256	rcu_read_lock();
				3257	md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
				3258	#endif
				3259	skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
				3260	tcp_header_size = tcp_synack_options(req, mss, skb, &opts, md5,
				3261	foc, synack_type) + sizeof(*th);
				3262
				3263	skb_push(skb, tcp_header_size);
				3264	skb_reset_transport_header(skb);
				3265
				3266	th = (struct tcphdr *)skb->data;
				3267	memset(th, 0, sizeof(struct tcphdr));
				3268	th->syn = 1;
				3269	th->ack = 1;
				3270	tcp_ecn_make_synack(req, th);
				3271	th->source = htons(ireq->ir_num);
				3272	th->dest = ireq->ir_rmt_port;
				3273	skb->mark = ireq->ir_mark;
				3274	skb->ip_summed = CHECKSUM_PARTIAL;
				3275	th->seq = htonl(tcp_rsk(req)->snt_isn);
				3276	/* XXX data is queued and acked as is. No buffer/window check */
				3277	th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
				3278
				3279	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
				3280	th->window = htons(min(req->rsk_rcv_wnd, 65535U));
				3281	tcp_options_write((__be32 *)(th + 1), NULL, &opts);
				3282	th->doff = (tcp_header_size >> 2);
				3283	__TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
				3284
				3285	#ifdef CONFIG_TCP_MD5SIG
				3286	/* Okay, we have all we need - do the md5 hash if needed */
				3287	if (md5)
				3288	tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
				3289	md5, req_to_sk(req), skb);
				3290	rcu_read_unlock();
				3291	#endif
				3292
				3293	/* Do not fool tcpdump (if any), clean our debris */
				3294	skb->tstamp = 0;
				3295	return skb;
				3296	}
				3297	EXPORT_SYMBOL(tcp_make_synack);
				3298
				3299	static void tcp_ca_dst_init(struct sock sk, const struct dst_entry dst)
				3300	{
				3301	struct inet_connection_sock *icsk = inet_csk(sk);
				3302	const struct tcp_congestion_ops *ca;
				3303	u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
				3304
				3305	if (ca_key == TCP_CA_UNSPEC)
				3306	return;
				3307
				3308	rcu_read_lock();
				3309	ca = tcp_ca_find_key(ca_key);
				3310	if (likely(ca && try_module_get(ca->owner))) {
				3311	module_put(icsk->icsk_ca_ops->owner);
				3312	icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
				3313	icsk->icsk_ca_ops = ca;
				3314	}
				3315	rcu_read_unlock();
				3316	}
				3317
				3318	/* Do all connect socket setups that can be done AF independent. */
				3319	static void tcp_connect_init(struct sock *sk)
				3320	{
				3321	const struct dst_entry *dst = __sk_dst_get(sk);
				3322	struct tcp_sock *tp = tcp_sk(sk);
				3323	__u8 rcv_wscale;
				3324	u32 rcv_wnd;
				3325
				3326	/* We'll fix this up when we get a response from the other end.
				3327	* See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
				3328	*/
				3329	tp->tcp_header_len = sizeof(struct tcphdr);
				3330	if (sock_net(sk)->ipv4.sysctl_tcp_timestamps)
				3331	tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED;
				3332
				3333	#ifdef CONFIG_TCP_MD5SIG
				3334	if (tp->af_specific->md5_lookup(sk, sk))
				3335	tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
				3336	#endif
				3337
				3338	/* If user gave his TCP_MAXSEG, record it to clamp */
				3339	if (tp->rx_opt.user_mss)
				3340	tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
				3341	tp->max_window = 0;
				3342	tcp_mtup_init(sk);
				3343	tcp_sync_mss(sk, dst_mtu(dst));
				3344
				3345	tcp_ca_dst_init(sk, dst);
				3346
				3347	if (!tp->window_clamp)
				3348	tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
				3349	tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
				3350
				3351	tcp_initialize_rcv_mss(sk);
				3352
				3353	/* limit the window selection if the user enforce a smaller rx buffer */
				3354	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
				3355	(tp->window_clamp > tcp_full_space(sk) \|\| tp->window_clamp == 0))
				3356	tp->window_clamp = tcp_full_space(sk);
				3357
				3358	rcv_wnd = tcp_rwnd_init_bpf(sk);
				3359	if (rcv_wnd == 0)
				3360	rcv_wnd = dst_metric(dst, RTAX_INITRWND);
				3361
				3362	tcp_select_initial_window(tcp_full_space(sk),
				3363	tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
				3364	&tp->rcv_wnd,
				3365	&tp->window_clamp,
				3366	sock_net(sk)->ipv4.sysctl_tcp_window_scaling,
				3367	&rcv_wscale,
				3368	rcv_wnd);
				3369
				3370	tp->rx_opt.rcv_wscale = rcv_wscale;
				3371	tp->rcv_ssthresh = tp->rcv_wnd;
				3372
				3373	sk->sk_err = 0;
				3374	sock_reset_flag(sk, SOCK_DONE);
				3375	tp->snd_wnd = 0;
				3376	tcp_init_wl(tp, 0);
				3377	tcp_write_queue_purge(sk);
				3378	tp->snd_una = tp->write_seq;
				3379	tp->snd_sml = tp->write_seq;
				3380	tp->snd_up = tp->write_seq;
				3381	tp->snd_nxt = tp->write_seq;
				3382
				3383	if (likely(!tp->repair))
				3384	tp->rcv_nxt = 0;
				3385	else
				3386	tp->rcv_tstamp = tcp_jiffies32;
				3387	tp->rcv_wup = tp->rcv_nxt;
				3388	tp->copied_seq = tp->rcv_nxt;
				3389
				3390	inet_csk(sk)->icsk_rto = tcp_timeout_init(sk);
				3391	inet_csk(sk)->icsk_retransmits = 0;
				3392	tcp_clear_retrans(tp);
				3393	}
				3394
				3395	static void tcp_connect_queue_skb(struct sock sk, struct sk_buff skb)
				3396	{
				3397	struct tcp_sock *tp = tcp_sk(sk);
				3398	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
				3399
				3400	tcb->end_seq += skb->len;
				3401	__skb_header_release(skb);
				3402	__tcp_add_write_queue_tail(sk, skb);
				3403	sk->sk_wmem_queued += skb->truesize;
				3404	sk_mem_charge(sk, skb->truesize);
				3405	tp->write_seq = tcb->end_seq;
				3406	tp->packets_out += tcp_skb_pcount(skb);
				3407	}
				3408
				3409	/* Build and send a SYN with data and (cached) Fast Open cookie. However,
				3410	* queue a data-only packet after the regular SYN, such that regular SYNs
				3411	* are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges
				3412	* only the SYN sequence, the data are retransmitted in the first ACK.
				3413	* If cookie is not cached or other error occurs, falls back to send a
				3414	* regular SYN with Fast Open cookie request option.
				3415	*/
				3416	static int tcp_send_syn_data(struct sock sk, struct sk_buff syn)
				3417	{
				3418	struct tcp_sock *tp = tcp_sk(sk);
				3419	struct tcp_fastopen_request *fo = tp->fastopen_req;
				3420	int space, err = 0;
				3421	struct sk_buff *syn_data;
				3422
				3423	tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */
				3424	if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie))
				3425	goto fallback;
				3426
				3427	/* MSS for SYN-data is based on cached MSS and bounded by PMTU and
				3428	* user-MSS. Reserve maximum option space for middleboxes that add
				3429	* private TCP options. The cost is reduced data space in SYN :(
				3430	*/
				3431	tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp);
				3432
				3433	space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
				3434	MAX_TCP_OPTION_SPACE;
				3435
				3436	space = min_t(size_t, space, fo->size);
				3437
				3438	/* limit to order-0 allocations */
				3439	space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER));
				3440
				3441	syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation, false);
				3442	if (!syn_data)
				3443	goto fallback;
				3444	syn_data->ip_summed = CHECKSUM_PARTIAL;
				3445	memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
				3446	if (space) {
				3447	int copied = copy_from_iter(skb_put(syn_data, space), space,
				3448	&fo->data->msg_iter);
				3449	if (unlikely(!copied)) {
				3450	kfree_skb(syn_data);
				3451	goto fallback;
				3452	}
				3453	if (copied != space) {
				3454	skb_trim(syn_data, copied);
				3455	space = copied;
				3456	}
				3457	}
				3458	/* No more data pending in inet_wait_for_connect() */
				3459	if (space == fo->size)
				3460	fo->data = NULL;
				3461	fo->copied = space;
				3462
				3463	tcp_connect_queue_skb(sk, syn_data);
				3464	if (syn_data->len)
				3465	tcp_chrono_start(sk, TCP_CHRONO_BUSY);
				3466
				3467	err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
				3468
				3469	syn->skb_mstamp = syn_data->skb_mstamp;
				3470
				3471	/* Now full SYN+DATA was cloned and sent (or not),
				3472	* remove the SYN from the original skb (syn_data)
				3473	* we keep in write queue in case of a retransmit, as we
				3474	* also have the SYN packet (with no data) in the same queue.
				3475	*/
				3476	TCP_SKB_CB(syn_data)->seq++;
				3477	TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK \| TCPHDR_PSH;
				3478	if (!err) {
				3479	tp->syn_data = (fo->copied > 0);
				3480	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
				3481	goto done;
				3482	}
				3483
				3484	/* data was not sent, this is our new send_head */
				3485	sk->sk_send_head = syn_data;
				3486	tp->packets_out -= tcp_skb_pcount(syn_data);
				3487
				3488	fallback:
				3489	/* Send a regular SYN with Fast Open cookie request option */
				3490	if (fo->cookie.len > 0)
				3491	fo->cookie.len = 0;
				3492	err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
				3493	if (err)
				3494	tp->syn_fastopen = 0;
				3495	done:
				3496	fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */
				3497	return err;
				3498	}
				3499
				/* Build a SYN and send it off.
				 *
				 * Returns 0 once the SYN has been queued and the retransmit timer armed
				 * (also in repair mode, where no SYN is sent at all), -EHOSTUNREACH if
				 * the header cannot be rebuilt, -ENOBUFS if the SYN skb cannot be
				 * allocated, and -ECONNREFUSED if the transmit path reports it.  Any
				 * other transmit error is not propagated to the caller.
				 */
				int tcp_connect(struct sock *sk)
				{
					struct inet_connection_sock *icsk = inet_csk(sk);
					struct tcp_sock *tp = tcp_sk(sk);
					struct sk_buff *syn_skb;
					struct sk_buff *unsent;
					u32 next_seq;
					int rc;

					tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB);

					if (icsk->icsk_af_ops->rebuild_header(sk))
						return -EHOSTUNREACH; /* Routing failure or similar. */

					tcp_connect_init(sk);

					if (unlikely(tp->repair)) {
						tcp_finish_connect(sk, NULL);
						return 0;
					}

					syn_skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true);
					if (unlikely(!syn_skb))
						return -ENOBUFS;

					tcp_init_nondata_skb(syn_skb, tp->write_seq++, TCPHDR_SYN);
					tcp_mstamp_refresh(tp);
					tp->retrans_stamp = tcp_time_stamp(tp);
					tcp_connect_queue_skb(sk, syn_skb);
					tcp_ecn_send_syn(sk, syn_skb);

					/* Send off SYN; include data in Fast Open. */
					if (tp->fastopen_req)
						rc = tcp_send_syn_data(sk, syn_skb);
					else
						rc = tcp_transmit_skb(sk, syn_skb, 1, sk->sk_allocation);
					if (rc == -ECONNREFUSED)
						return rc;

					/* We change tp->snd_nxt after the tcp_transmit_skb() call
					 * in order to make this packet get counted in tcpOutSegs.
					 * If something is still waiting at the send head, snd_nxt and
					 * pushed_seq point at its first sequence number instead.
					 */
					unsent = tcp_send_head(sk);
					if (unlikely(unsent))
						next_seq = TCP_SKB_CB(unsent)->seq;
					else
						next_seq = tp->write_seq;
					tp->snd_nxt = next_seq;
					tp->pushed_seq = next_seq;
					TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);

					/* Timer for repeating the SYN until an answer. */
					inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
								  icsk->icsk_rto, TCP_RTO_MAX);
					return 0;
				}
				EXPORT_SYMBOL(tcp_connect);
				3553
				3554	/* Send out a delayed ack, the caller does the policy checking
				3555	* to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
				3556	* for details.
				3557	*/
				3558	void tcp_send_delayed_ack(struct sock *sk)
				3559	{
				3560	struct inet_connection_sock *icsk = inet_csk(sk);
				3561	int ato = icsk->icsk_ack.ato;
				3562	unsigned long timeout;
				3563
				3564	if (ato > TCP_DELACK_MIN) {
				3565	const struct tcp_sock *tp = tcp_sk(sk);
				3566	int max_ato = HZ / 2;
				3567
				3568	if (icsk->icsk_ack.pingpong \|\|
				3569	(icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
				3570	max_ato = TCP_DELACK_MAX;
				3571
				3572	/* Slow path, intersegment interval is "high". */
				3573
				3574	/* If some rtt estimate is known, use it to bound delayed ack.
				3575	* Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
				3576	* directly.
				3577	*/
				3578	if (tp->srtt_us) {
				3579	int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3),
				3580	TCP_DELACK_MIN);
				3581
				3582	if (rtt < max_ato)
				3583	max_ato = rtt;
				3584	}
				3585
				3586	ato = min(ato, max_ato);
				3587	}
				3588
				3589	/* Stay within the limit we were given */
				3590	timeout = jiffies + ato;
				3591
				3592	/* Use new timeout only if there wasn't a older one earlier. */
				3593	if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
				3594	/* If delack timer was blocked or is about to expire,
				3595	* send ACK now.
				3596	*/
				3597	if (icsk->icsk_ack.blocked \|\|
				3598	time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
				3599	tcp_send_ack(sk);
				3600	return;
				3601	}
				3602
				3603	if (!time_before(timeout, icsk->icsk_ack.timeout))
				3604	timeout = icsk->icsk_ack.timeout;
				3605	}
				3606	icsk->icsk_ack.pending \|= ICSK_ACK_SCHED \| ICSK_ACK_TIMER;
				3607	icsk->icsk_ack.timeout = timeout;
				3608	sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
				3609	}
				3610
				3611	/* This routine sends an ack and also updates the window. */
				3612	void __tcp_send_ack(struct sock *sk, u32 rcv_nxt)
				3613	{
				3614	struct sk_buff *buff;
				3615
				3616	/* If we have been reset, we may not send again. */
				3617	if (sk->sk_state == TCP_CLOSE)
				3618	return;
				3619
				3620	/* We are not putting this on the write queue, so
				3621	* tcp_transmit_skb() will set the ownership to this
				3622	* sock.
				3623	*/
				3624	buff = alloc_skb(MAX_TCP_HEADER,
				3625	sk_gfp_mask(sk, GFP_ATOMIC \| __GFP_NOWARN));
				3626	if (unlikely(!buff)) {
				3627	inet_csk_schedule_ack(sk);
				3628	inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
				3629	inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
				3630	TCP_DELACK_MAX, TCP_RTO_MAX);
				3631	return;
				3632	}
				3633
				3634	/* Reserve space for headers and prepare control bits. */
				3635	skb_reserve(buff, MAX_TCP_HEADER);
				3636	tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);
				3637
				3638	/* We do not want pure acks influencing TCP Small Queues or fq/pacing
				3639	* too much.
				3640	* SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784
				3641	*/
				3642	skb_set_tcp_pure_ack(buff);
				3643
				3644	/* Send it off, this clears delayed acks for us. */
				3645	__tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt);
				3646	}
				3647	EXPORT_SYMBOL_GPL(__tcp_send_ack);
				3648
				3649	void tcp_send_ack(struct sock *sk)
				3650	{
				3651	__tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt);
				3652	}
				3653
				3654	/* This routine sends a packet with an out of date sequence
				3655	* number. It assumes the other end will try to ack it.
				3656	*
				3657	* Question: what should we make while urgent mode?
				3658	* 4.4BSD forces sending single byte of data. We cannot send
				3659	* out of window data, because we have SND.NXT==SND.MAX...
				3660	*
				3661	* Current solution: to send TWO zero-length segments in urgent mode:
				3662	* one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
				3663	* out-of-date with SND.UNA-1 to probe window.
				3664	*/
				3665	static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
				3666	{
				3667	struct tcp_sock *tp = tcp_sk(sk);
				3668	struct sk_buff *skb;
				3669
				3670	/* We don't queue it, tcp_transmit_skb() sets ownership. */
				3671	skb = alloc_skb(MAX_TCP_HEADER,
				3672	sk_gfp_mask(sk, GFP_ATOMIC \| __GFP_NOWARN));
				3673	if (!skb)
				3674	return -1;
				3675
				3676	/* Reserve space for headers and set control bits. */
				3677	skb_reserve(skb, MAX_TCP_HEADER);
				3678	/* Use a previous sequence. This should cause the other
				3679	* end to send an ack. Don't queue or clone SKB, just
				3680	* send it.
				3681	*/
				3682	tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
				3683	NET_INC_STATS(sock_net(sk), mib);
				3684	return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0);
				3685	}
				3686
				3687	/* Called from setsockopt( ... TCP_REPAIR ) */
				3688	void tcp_send_window_probe(struct sock *sk)
				3689	{
				3690	if (sk->sk_state == TCP_ESTABLISHED) {
				3691	tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
				3692	tcp_mstamp_refresh(tcp_sk(sk));
				3693	tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE);
				3694	}
				3695	}
				3696
				3697	/* Initiate keepalive or window probe from timer. */
				3698	int tcp_write_wakeup(struct sock *sk, int mib)
				3699	{
				3700	struct tcp_sock *tp = tcp_sk(sk);
				3701	struct sk_buff *skb;
				3702
				3703	if (sk->sk_state == TCP_CLOSE)
				3704	return -1;
				3705
				3706	skb = tcp_send_head(sk);
				3707	if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
				3708	int err;
				3709	unsigned int mss = tcp_current_mss(sk);
				3710	unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
				3711
				3712	if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
				3713	tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
				3714
				3715	/* We are probing the opening of a window
				3716	* but the window size is != 0
				3717	* must have been a result SWS avoidance ( sender )
				3718	*/
				3719	if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq \|\|
				3720	skb->len > mss) {
				3721	seg_size = min(seg_size, mss);
				3722	TCP_SKB_CB(skb)->tcp_flags \|= TCPHDR_PSH;
				3723	if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC))
				3724	return -1;
				3725	} else if (!tcp_skb_pcount(skb))
				3726	tcp_set_skb_tso_segs(skb, mss);
				3727
				3728	TCP_SKB_CB(skb)->tcp_flags \|= TCPHDR_PSH;
				3729	err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
				3730	if (!err)
				3731	tcp_event_new_data_sent(sk, skb);
				3732	return err;
				3733	} else {
				3734	if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
				3735	tcp_xmit_probe_skb(sk, 1, mib);
				3736	return tcp_xmit_probe_skb(sk, 0, mib);
				3737	}
				3738	}
				3739
				3740	/* A window probe timeout has occurred. If window is not closed send
				3741	* a partial packet else a zero probe.
				3742	*/
				3743	void tcp_send_probe0(struct sock *sk)
				3744	{
				3745	struct inet_connection_sock *icsk = inet_csk(sk);
				3746	struct tcp_sock *tp = tcp_sk(sk);
				3747	struct net *net = sock_net(sk);
				3748	unsigned long probe_max;
				3749	int err;
				3750
				3751	err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
				3752
				3753	if (tp->packets_out \|\| !tcp_send_head(sk)) {
				3754	/* Cancel probe timer, if it is not required. */
				3755	icsk->icsk_probes_out = 0;
				3756	icsk->icsk_backoff = 0;
				3757	return;
				3758	}
				3759
				3760	if (err <= 0) {
				3761	if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2)
				3762	icsk->icsk_backoff++;
				3763	icsk->icsk_probes_out++;
				3764	probe_max = TCP_RTO_MAX;
				3765	} else {
				3766	/* If packet was not sent due to local congestion,
				3767	* do not backoff and do not remember icsk_probes_out.
				3768	* Let local senders to fight for local resources.
				3769	*
				3770	* Use accumulated backoff yet.
				3771	*/
				3772	if (!icsk->icsk_probes_out)
				3773	icsk->icsk_probes_out = 1;
				3774	probe_max = TCP_RESOURCE_PROBE_INTERVAL;
				3775	}
				3776	inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
				3777	tcp_probe0_when(sk, probe_max),
				3778	TCP_RTO_MAX);
				3779	}
				3780
				3781	int tcp_rtx_synack(const struct sock sk, struct request_sock req)
				3782	{
				3783	const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
				3784	struct flowi fl;
				3785	int res;
				3786
				3787	tcp_rsk(req)->txhash = net_tx_rndhash();
				3788	res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL);
				3789	if (!res) {
				3790	__TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
				3791	__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
				3792	if (unlikely(tcp_passive_fastopen(sk)))
				3793	tcp_sk(sk)->total_retrans++;
				3794	}
				3795	return res;
				3796	}
				3797	EXPORT_SYMBOL(tcp_rtx_synack);