1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Mark Evans, <evansmp@uhura.aston.ac.uk>
11 * Corey Minyard <wf-rch!minyard@relay.EU.net>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
14 * Linus Torvalds, <torvalds@cs.helsinki.fi>
15 * Alan Cox, <gw4pts@gw4pts.ampr.org>
16 * Matthew Dillon, <dillon@apollo.west.oic.com>
17 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
18 * Jorge Cwik, <jorge@laser.satlink.net>
19 */
20
21/*
22 * Changes: Pedro Roque : Retransmit queue handled by TCP.
23 * : Fragmentation on mtu decrease
24 * : Segment collapse on retransmit
25 * : AF independence
26 *
27 * Linus Torvalds : send_delayed_ack
28 * David S. Miller : Charge memory using the right skb
29 * during syn/ack processing.
30 * David S. Miller : Output engine completely rewritten.
31 * Andrea Arcangeli: SYNACK carry ts_recent in tsecr.
32 * Cacophonix Gaul : draft-minshall-nagle-01
33 * J Hadi Salim : ECN support
34 *
35 */
36
37#include <net/tcp.h>
38
39#include <linux/compiler.h>
40#include <linux/gfp.h>
41#include <linux/module.h>
42#include <net/SI/sock_track.h>
43
44/* People can turn this off for buggy TCPs found in printers etc. */
45int sysctl_tcp_retrans_collapse __read_mostly = 1;
46
47/* People can turn this on to work with those rare, broken TCPs that
48 * interpret the window field as a signed quantity.
49 */
50int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
51
52/* This limits the percentage of the congestion window which we
53 * will allow a single TSO frame to consume. Building TSO frames
54 * which are too large can cause TCP streams to be bursty.
55 */
56int sysctl_tcp_tso_win_divisor __read_mostly = 3;
57
58int sysctl_tcp_mtu_probing __read_mostly = 0;
59int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS;
60
61/* By default, RFC2861 behavior. */
62int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
63
64int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */
65EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size);
66
67
68/* Account for new data that has been sent to the network. */
69static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
70{
71 struct tcp_sock *tp = tcp_sk(sk);
72 unsigned int prior_packets = tp->packets_out;
73
74 tcp_advance_send_head(sk, skb);
75 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
76
77 /* Don't override Nagle indefinitely with F-RTO */
78 if (tp->frto_counter == 2)
79 tp->frto_counter = 3;
80
81 tp->packets_out += tcp_skb_pcount(skb);
82 if (!prior_packets)
83 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
84 inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
85}
86
87/* SND.NXT, if window was not shrunk.
88 * If window has been shrunk, what should we make? It is not clear at all.
89 * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
90 * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
91 * invalid. OK, let's make this for now:
92 */
93static inline __u32 tcp_acceptable_seq(const struct sock *sk)
94{
95 const struct tcp_sock *tp = tcp_sk(sk);
96
97 if (!before(tcp_wnd_end(tp), tp->snd_nxt))
98 return tp->snd_nxt;
99 else
100 return tcp_wnd_end(tp);
101}
102
103/* Calculate mss to advertise in SYN segment.
104 * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
105 *
106 * 1. It is independent of path mtu.
107 * 2. Ideally, it is maximal possible segment size i.e. 65535-40.
108 * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
109 * attached devices, because some buggy hosts are confused by
110 * large MSS.
111 * 4. We do not make 3, we advertise MSS, calculated from first
112 * hop device mtu, but allow to raise it to ip_rt_min_advmss.
113 * This may be overridden via information stored in routing table.
114 * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
115 * probably even Jumbo".
116 */
117static __u16 tcp_advertise_mss(struct sock *sk)
118{
119 struct tcp_sock *tp = tcp_sk(sk);
120 const struct dst_entry *dst = __sk_dst_get(sk);
121 int mss = tp->advmss;
122
123 if (dst) {
124 unsigned int metric = dst_metric_advmss(dst);
125
126 if (metric < mss) {
127 mss = metric;
128 tp->advmss = mss;
129 }
130 }
131
132 return (__u16)mss;
133}
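
/* Illustrative sketch (not part of the original file): how the advertised
 * MSS relates to the first-hop MTU described above. For plain IPv4 + TCP
 * headers with no options, MSS = MTU - 20 - 20, so a 1500-byte Ethernet
 * MTU yields the familiar 1460. The helper name is hypothetical, not a
 * kernel API.
 */
static inline unsigned int example_advmss_from_mtu(unsigned int mtu)
{
	/* 20 bytes IPv4 header + 20 bytes bare TCP header */
	return mtu - 20 - 20;	/* e.g. 1500 -> 1460 */
}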
134
135/* RFC2861. Reset CWND after an idle period longer than RTO to the "restart window".
136 * This is the first part of the cwnd validation mechanism. */
137static void tcp_cwnd_restart(struct sock *sk, const struct dst_entry *dst)
138{
139 struct tcp_sock *tp = tcp_sk(sk);
140 s32 delta = tcp_time_stamp - tp->lsndtime;
141 u32 restart_cwnd = tcp_init_cwnd(tp, dst);
142 u32 cwnd = tp->snd_cwnd;
143
144 tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
145
146 tp->snd_ssthresh = tcp_current_ssthresh(sk);
147 restart_cwnd = min(restart_cwnd, cwnd);
148
149 while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
150 cwnd >>= 1;
151 tp->snd_cwnd = max(cwnd, restart_cwnd);
152 tp->snd_cwnd_stamp = tcp_time_stamp;
153 tp->snd_cwnd_used = 0;
154}
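
/* Illustrative sketch (not part of the original file): the RFC2861 restart
 * above halves cwnd once per RTO of idle time, but never drops below the
 * restart window. E.g. cwnd = 40, restart_cwnd = 10, three idle RTOs:
 * 40 -> 20 -> 10, then stop. The helper name is hypothetical.
 */
static inline u32 example_cwnd_after_idle(u32 cwnd, u32 restart_cwnd,
					  unsigned int idle_rtos)
{
	while (idle_rtos-- > 0 && cwnd > restart_cwnd)
		cwnd >>= 1;
	return max(cwnd, restart_cwnd);
}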
155
156/* Congestion state accounting after a packet has been sent. */
157static void tcp_event_data_sent(struct tcp_sock *tp,
158 struct sock *sk)
159{
160 struct inet_connection_sock *icsk = inet_csk(sk);
161 const u32 now = tcp_time_stamp;
162
163 if (sysctl_tcp_slow_start_after_idle &&
164 (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto))
165 tcp_cwnd_restart(sk, __sk_dst_get(sk));
166
167 tp->lsndtime = now;
168
169 /* If this is a reply sent within ato of the last received
170 * packet, enter pingpong mode.
171 */
172 if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
173 icsk->icsk_ack.pingpong = 1;
174}
175
176/* Account for an ACK we sent. */
177static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
178{
179 tcp_dec_quickack_mode(sk, pkts);
180 inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
181}
182
183/* Determine a window scaling and initial window to offer.
184 * Based on the assumption that the given amount of space
185 * will be offered. Store the results in the tp structure.
186 * NOTE: for smooth operation initial space offering should
187 * be a multiple of mss if possible. We assume here that mss >= 1.
188 * This MUST be enforced by all callers.
189 */
190void tcp_select_initial_window(int __space, __u32 mss,
191 __u32 *rcv_wnd, __u32 *window_clamp,
192 int wscale_ok, __u8 *rcv_wscale,
193 __u32 init_rcv_wnd)
194{
195 unsigned int space = (__space < 0 ? 0 : __space);
196
197 /* If no clamp set the clamp to the max possible scaled window */
198 if (*window_clamp == 0)
199 (*window_clamp) = (65535 << 14);
200 space = min(*window_clamp, space);
201
202 /* Quantize space offering to a multiple of mss if possible. */
203 if (space > mss)
204 space = (space / mss) * mss;
205
206 /* NOTE: offering an initial window larger than 32767
207 * will break some buggy TCP stacks. If the admin tells us
208 * it is likely we could be speaking with such a buggy stack
209 * we will truncate our initial window offering to 32K-1
210 * unless the remote has sent us a window scaling option,
211 * which we interpret as a sign the remote TCP is not
212 * misinterpreting the window field as a signed quantity.
213 */
214 if (sysctl_tcp_workaround_signed_windows)
215 (*rcv_wnd) = min(space, MAX_TCP_WINDOW);
216 else
217 (*rcv_wnd) = space;
218
219 (*rcv_wscale) = 0;
220 if (wscale_ok) {
221 /* Set window scaling on max possible window
222 * See RFC1323 for an explanation of the limit to 14
223 */
224 space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
225 space = min_t(u32, space, *window_clamp);
226 while (space > 65535 && (*rcv_wscale) < 14) {
227 space >>= 1;
228 (*rcv_wscale)++;
229 }
230 }
231
232 /* Set initial window to a value large enough for senders starting with
233 * initial congestion window of TCP_DEFAULT_INIT_RCVWND. Place
234 * a limit on the initial window when mss is larger than 1460.
235 */
236 if (mss > (1 << *rcv_wscale)) {
237 int init_cwnd = TCP_DEFAULT_INIT_RCVWND;
238 if (mss > 1460)
239 init_cwnd =
240 max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2);
241 /* when initializing use the value from init_rcv_wnd
242 * rather than the default from above
243 */
244 if (init_rcv_wnd)
245 *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
246 else
247 *rcv_wnd = min(*rcv_wnd, init_cwnd * mss);
248 }
249
250 /* Set the clamp no higher than max representable value */
251 (*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
252}
253EXPORT_SYMBOL(tcp_select_initial_window);
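
/* Illustrative sketch (not part of the original file): the scaling loop
 * above picks the smallest rcv_wscale (capped at 14, per RFC1323) that
 * lets the receive buffer fit in the 16-bit window field. E.g. a 1 MiB
 * buffer needs wscale = 5, since 1048576 >> 5 = 32768 <= 65535. The
 * helper below is a hypothetical restatement of that loop.
 */
static inline __u8 example_rcv_wscale_for(u32 space)
{
	__u8 wscale = 0;

	while (space > 65535 && wscale < 14) {
		space >>= 1;
		wscale++;
	}
	return wscale;
}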
254
255/* Choose a new window to advertise, update state in tcp_sock for the
256 * socket, and return result with RFC1323 scaling applied. The return
257 * value can be stuffed directly into th->window for an outgoing
258 * frame.
259 */
260static u16 tcp_select_window(struct sock *sk)
261{
262 struct tcp_sock *tp = tcp_sk(sk);
263 u32 cur_win = tcp_receive_window(tp);
264 u32 new_win = __tcp_select_window(sk);
265
266 /* Never shrink the offered window */
267 if (new_win < cur_win) {
268 /* Danger Will Robinson!
269 * Don't update rcv_wup/rcv_wnd here or else
270 * we will not be able to advertise a zero
271 * window in time. --DaveM
272 *
273 * Relax Will Robinson.
274 */
275 new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
276 }
277 tp->rcv_wnd = new_win;
278 tp->rcv_wup = tp->rcv_nxt;
279
280 /* Make sure we do not exceed the maximum possible
281 * scaled window.
282 */
283 if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows)
284 new_win = min(new_win, MAX_TCP_WINDOW);
285 else
286 new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
287
288 /* RFC1323 scaling applied */
289 new_win >>= tp->rx_opt.rcv_wscale;
290
291 /* If we advertise zero window, disable fast path. */
292 if (new_win == 0)
293 tp->pred_flags = 0;
294
295 return new_win;
296}
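
/* Illustrative sketch (not part of the original file): the value returned
 * above is the advertised window right-shifted by rcv_wscale; the peer
 * reconstructs it as th->window << rcv_wscale. E.g. a 262144-byte window
 * with wscale = 7 goes on the wire as 2048. Hypothetical helper:
 */
static inline u32 example_window_on_wire(u32 rcv_wnd, u8 rcv_wscale)
{
	return rcv_wnd >> rcv_wscale;	/* 262144 >> 7 == 2048 */
}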
297
298/* Packet ECN state for a SYN-ACK */
299static inline void TCP_ECN_send_synack(const struct tcp_sock *tp, struct sk_buff *skb)
300{
301 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
302 if (!(tp->ecn_flags & TCP_ECN_OK))
303 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
304}
305
306/* Packet ECN state for a SYN. */
307static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
308{
309 struct tcp_sock *tp = tcp_sk(sk);
310
311 tp->ecn_flags = 0;
312 if (sysctl_tcp_ecn == 1) {
313 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
314 tp->ecn_flags = TCP_ECN_OK;
315 }
316}
317
318static __inline__ void
319TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th)
320{
321 if (inet_rsk(req)->ecn_ok)
322 th->ece = 1;
323}
324
325/* Set up ECN state for a packet on a ESTABLISHED socket that is about to
326 * be sent.
327 */
328static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
329 int tcp_header_len)
330{
331 struct tcp_sock *tp = tcp_sk(sk);
332
333 if (tp->ecn_flags & TCP_ECN_OK) {
334 /* Not-retransmitted data segment: set ECT and inject CWR. */
335 if (skb->len != tcp_header_len &&
336 !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
337 INET_ECN_xmit(sk);
338 if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
339 tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
340 tcp_hdr(skb)->cwr = 1;
341 skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
342 }
343 } else {
344 /* ACK or retransmitted segment: clear ECT|CE */
345 INET_ECN_dontxmit(sk);
346 }
347 if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
348 tcp_hdr(skb)->ece = 1;
349 }
350}
351
352/* Constructs common control bits of non-data skb. If SYN/FIN is present,
353 * auto increment end seqno.
354 */
355static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
356{
357 skb->ip_summed = CHECKSUM_PARTIAL;
358 skb->csum = 0;
359
360 TCP_SKB_CB(skb)->tcp_flags = flags;
361 TCP_SKB_CB(skb)->sacked = 0;
362
363 skb_shinfo(skb)->gso_segs = 1;
364 skb_shinfo(skb)->gso_size = 0;
365 skb_shinfo(skb)->gso_type = 0;
366
367 TCP_SKB_CB(skb)->seq = seq;
368 if (flags & (TCPHDR_SYN | TCPHDR_FIN))
369 seq++;
370 TCP_SKB_CB(skb)->end_seq = seq;
371}
372
373static inline int tcp_urg_mode(const struct tcp_sock *tp)
374{
375 return tp->snd_una != tp->snd_up;
376}
377
378#define OPTION_SACK_ADVERTISE (1 << 0)
379#define OPTION_TS (1 << 1)
380#define OPTION_MD5 (1 << 2)
381#define OPTION_WSCALE (1 << 3)
382#define OPTION_COOKIE_EXTENSION (1 << 4)
383
384struct tcp_out_options {
385 u8 options; /* bit field of OPTION_* */
386 u8 ws; /* window scale, 0 to disable */
387 u8 num_sack_blocks; /* number of SACK blocks to include */
388 u8 hash_size; /* bytes in hash_location */
389 u16 mss; /* 0 to disable */
390 __u32 tsval, tsecr; /* need to include OPTION_TS */
391 __u8 *hash_location; /* temporary pointer, overloaded */
392};
393
394/* The sysctl int routines are generic, so check consistency here.
395 */
396static u8 tcp_cookie_size_check(u8 desired)
397{
398 int cookie_size;
399
400 if (desired > 0)
401 /* previously specified */
402 return desired;
403
404 cookie_size = ACCESS_ONCE(sysctl_tcp_cookie_size);
405 if (cookie_size <= 0)
406 /* no default specified */
407 return 0;
408
409 if (cookie_size <= TCP_COOKIE_MIN)
410 /* value too small, specify minimum */
411 return TCP_COOKIE_MIN;
412
413 if (cookie_size >= TCP_COOKIE_MAX)
414 /* value too large, specify maximum */
415 return TCP_COOKIE_MAX;
416
417 if (cookie_size & 1)
418 /* 8-bit multiple, illegal, fix it */
419 cookie_size++;
420
421 return (u8)cookie_size;
422}
423
424/* Write previously computed TCP options to the packet.
425 *
426 * Beware: Something in the Internet is very sensitive to the ordering of
427 * TCP options; we learned this the hard way, so be careful here.
428 * Luckily we can at least blame others for their non-compliance, but from
429 * an interoperability perspective it seems that we're somewhat stuck with
430 * the ordering we have been using if we want to keep working with
431 * those broken things (not that it currently hurts anybody, as there isn't
432 * a particular reason why the ordering would need to be changed).
433 *
434 * At least SACK_PERM as the first option is known to lead to a disaster
435 * (but it may well be that other scenarios fail similarly).
436 */
437static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
438 struct tcp_out_options *opts)
439{
440 u8 options = opts->options; /* mungable copy */
441
442 /* Having both authentication and cookies for security is redundant,
443 * and there's certainly not enough room. Instead, the cookie-less
444 * extension variant is proposed.
445 *
446 * Consider the pessimal case with authentication. The options
447 * could look like:
448 * COOKIE|MD5(20) + MSS(4) + SACK|TS(12) + WSCALE(4) == 40
449 */
450 if (unlikely(OPTION_MD5 & options)) {
451 if (unlikely(OPTION_COOKIE_EXTENSION & options)) {
452 *ptr++ = htonl((TCPOPT_COOKIE << 24) |
453 (TCPOLEN_COOKIE_BASE << 16) |
454 (TCPOPT_MD5SIG << 8) |
455 TCPOLEN_MD5SIG);
456 } else {
457 *ptr++ = htonl((TCPOPT_NOP << 24) |
458 (TCPOPT_NOP << 16) |
459 (TCPOPT_MD5SIG << 8) |
460 TCPOLEN_MD5SIG);
461 }
462 options &= ~OPTION_COOKIE_EXTENSION;
463 /* overload cookie hash location */
464 opts->hash_location = (__u8 *)ptr;
465 ptr += 4;
466 }
467
468 if (unlikely(opts->mss)) {
469 *ptr++ = htonl((TCPOPT_MSS << 24) |
470 (TCPOLEN_MSS << 16) |
471 opts->mss);
472 }
473
474 if (likely(OPTION_TS & options)) {
475 if (unlikely(OPTION_SACK_ADVERTISE & options)) {
476 *ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
477 (TCPOLEN_SACK_PERM << 16) |
478 (TCPOPT_TIMESTAMP << 8) |
479 TCPOLEN_TIMESTAMP);
480 options &= ~OPTION_SACK_ADVERTISE;
481 } else {
482 *ptr++ = htonl((TCPOPT_NOP << 24) |
483 (TCPOPT_NOP << 16) |
484 (TCPOPT_TIMESTAMP << 8) |
485 TCPOLEN_TIMESTAMP);
486 }
487 *ptr++ = htonl(opts->tsval);
488 *ptr++ = htonl(opts->tsecr);
489 }
490
491 /* The specification requires the cookie to come after the timestamp, so do it now.
492 *
493 * Consider the pessimal case without authentication. The options
494 * could look like:
495 * MSS(4) + SACK|TS(12) + COOKIE(20) + WSCALE(4) == 40
496 */
497 if (unlikely(OPTION_COOKIE_EXTENSION & options)) {
498 __u8 *cookie_copy = opts->hash_location;
499 u8 cookie_size = opts->hash_size;
500
501 /* 8-bit multiple handled in tcp_cookie_size_check() above,
502 * and elsewhere.
503 */
504 if (0x2 & cookie_size) {
505 __u8 *p = (__u8 *)ptr;
506
507 /* 16-bit multiple */
508 *p++ = TCPOPT_COOKIE;
509 *p++ = TCPOLEN_COOKIE_BASE + cookie_size;
510 *p++ = *cookie_copy++;
511 *p++ = *cookie_copy++;
512 ptr++;
513 cookie_size -= 2;
514 } else {
515 /* 32-bit multiple */
516 *ptr++ = htonl(((TCPOPT_NOP << 24) |
517 (TCPOPT_NOP << 16) |
518 (TCPOPT_COOKIE << 8) |
519 TCPOLEN_COOKIE_BASE) +
520 cookie_size);
521 }
522
523 if (cookie_size > 0) {
524 memcpy(ptr, cookie_copy, cookie_size);
525 ptr += (cookie_size / 4);
526 }
527 }
528
529 if (unlikely(OPTION_SACK_ADVERTISE & options)) {
530 *ptr++ = htonl((TCPOPT_NOP << 24) |
531 (TCPOPT_NOP << 16) |
532 (TCPOPT_SACK_PERM << 8) |
533 TCPOLEN_SACK_PERM);
534 }
535
536 if (unlikely(OPTION_WSCALE & options)) {
537 *ptr++ = htonl((TCPOPT_NOP << 24) |
538 (TCPOPT_WINDOW << 16) |
539 (TCPOLEN_WINDOW << 8) |
540 opts->ws);
541 }
542
543 if (unlikely(opts->num_sack_blocks)) {
544 struct tcp_sack_block *sp = tp->rx_opt.dsack ?
545 tp->duplicate_sack : tp->selective_acks;
546 int this_sack;
547
548 *ptr++ = htonl((TCPOPT_NOP << 24) |
549 (TCPOPT_NOP << 16) |
550 (TCPOPT_SACK << 8) |
551 (TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
552 TCPOLEN_SACK_PERBLOCK)));
553
554 for (this_sack = 0; this_sack < opts->num_sack_blocks;
555 ++this_sack) {
556 *ptr++ = htonl(sp[this_sack].start_seq);
557 *ptr++ = htonl(sp[this_sack].end_seq);
558 }
559
560 tp->rx_opt.dsack = 0;
561 }
562}
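
/* Illustrative sketch (not part of the original file): with the ordering
 * written above, a typical SYN without MD5 or cookie options carries
 * MSS(4) + SACK_PERM|TS(12) + NOP|WSCALE(4) = 20 bytes of options. A
 * hypothetical hand-built encoding of just the MSS word, mirroring the
 * MSS branch above (e.g. mss = 1460):
 */
static inline void example_write_mss_option(__be32 *ptr, u16 mss)
{
	*ptr = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);
}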
563
564/* Compute TCP options for SYN packets. This is not the final
565 * network wire format yet.
566 */
567static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
568 struct tcp_out_options *opts,
569 struct tcp_md5sig_key **md5)
570{
571 struct tcp_sock *tp = tcp_sk(sk);
572 struct tcp_cookie_values *cvp = tp->cookie_values;
573 unsigned remaining = MAX_TCP_OPTION_SPACE;
574 u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ?
575 tcp_cookie_size_check(cvp->cookie_desired) :
576 0;
577
578#ifdef CONFIG_TCP_MD5SIG
579 *md5 = tp->af_specific->md5_lookup(sk, sk);
580 if (*md5) {
581 opts->options |= OPTION_MD5;
582 remaining -= TCPOLEN_MD5SIG_ALIGNED;
583 }
584#else
585 *md5 = NULL;
586#endif
587
588 /* We always get an MSS option. The option bytes which will be seen in
589 * normal data packets (should timestamps be used) must be counted in the
590 * advertised MSS. But we subtract them from tp->mss_cache so that
591 * calculations in tcp_sendmsg are simpler etc. So account for this
592 * fact here if necessary. If we don't do this correctly, as a
593 * receiver we won't recognize data packets as being full sized when we
594 * should, and thus we won't abide by the delayed ACK rules correctly.
595 * SACKs don't matter, we never delay an ACK when we have any of those
596 * going out. */
597 opts->mss = tcp_advertise_mss(sk);
598 remaining -= TCPOLEN_MSS_ALIGNED;
599
600 if (likely(sysctl_tcp_timestamps && *md5 == NULL)) {
601 opts->options |= OPTION_TS;
602 opts->tsval = TCP_SKB_CB(skb)->when;
603 opts->tsecr = tp->rx_opt.ts_recent;
604 remaining -= TCPOLEN_TSTAMP_ALIGNED;
605 }
606 if (likely(sysctl_tcp_window_scaling)) {
607 opts->ws = tp->rx_opt.rcv_wscale;
608 opts->options |= OPTION_WSCALE;
609 remaining -= TCPOLEN_WSCALE_ALIGNED;
610 }
611 if (likely(sysctl_tcp_sack)) {
612 opts->options |= OPTION_SACK_ADVERTISE;
613 if (unlikely(!(OPTION_TS & opts->options)))
614 remaining -= TCPOLEN_SACKPERM_ALIGNED;
615 }
616
617 /* Note that timestamps are required by the specification.
618 *
619 * Odd numbers of bytes are prohibited by the specification, ensuring
620 * that the cookie is 16-bit aligned, and the resulting cookie pair is
621 * 32-bit aligned.
622 */
623 if (*md5 == NULL &&
624 (OPTION_TS & opts->options) &&
625 cookie_size > 0) {
626 int need = TCPOLEN_COOKIE_BASE + cookie_size;
627
628 if (0x2 & need) {
629 /* 32-bit multiple */
630 need += 2; /* NOPs */
631
632 if (need > remaining) {
633 /* try shrinking cookie to fit */
634 cookie_size -= 2;
635 need -= 4;
636 }
637 }
638 while (need > remaining && TCP_COOKIE_MIN <= cookie_size) {
639 cookie_size -= 4;
640 need -= 4;
641 }
642 if (TCP_COOKIE_MIN <= cookie_size) {
643 opts->options |= OPTION_COOKIE_EXTENSION;
644 opts->hash_location = (__u8 *)&cvp->cookie_pair[0];
645 opts->hash_size = cookie_size;
646
647 /* Remember for future incarnations. */
648 cvp->cookie_desired = cookie_size;
649
650 if (cvp->cookie_desired != cvp->cookie_pair_size) {
651 /* Currently use random bytes as a nonce,
652 * assuming these are completely unpredictable
653 * by hostile users of the same system.
654 */
655 get_random_bytes(&cvp->cookie_pair[0],
656 cookie_size);
657 cvp->cookie_pair_size = cookie_size;
658 }
659
660 remaining -= need;
661 }
662 }
663 return MAX_TCP_OPTION_SPACE - remaining;
664}
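
/* Illustrative sketch (not part of the original file): the accounting above
 * starts from MAX_TCP_OPTION_SPACE (40 bytes) and subtracts the aligned
 * size of each option chosen. With timestamps, window scaling and SACK
 * enabled and no MD5, that is 40 - 4 - 12 - 4 = 20 bytes consumed;
 * SACK_PERM rides in the timestamp word, so nothing extra is charged.
 * The helper name is hypothetical.
 */
static inline unsigned int example_syn_option_bytes(void)
{
	unsigned int remaining = MAX_TCP_OPTION_SPACE;	/* 40 */

	remaining -= TCPOLEN_MSS_ALIGNED;	/* 4 */
	remaining -= TCPOLEN_TSTAMP_ALIGNED;	/* 12, SACK_PERM shares it */
	remaining -= TCPOLEN_WSCALE_ALIGNED;	/* 4 */
	return MAX_TCP_OPTION_SPACE - remaining;	/* 20 */
}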
665
666/* Set up TCP options for SYN-ACKs. */
667static unsigned tcp_synack_options(struct sock *sk,
668 struct request_sock *req,
669 unsigned mss, struct sk_buff *skb,
670 struct tcp_out_options *opts,
671 struct tcp_md5sig_key **md5,
672 struct tcp_extend_values *xvp)
673{
674 struct inet_request_sock *ireq = inet_rsk(req);
675 unsigned remaining = MAX_TCP_OPTION_SPACE;
676 u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ?
677 xvp->cookie_plus :
678 0;
679
680#ifdef CONFIG_TCP_MD5SIG
681 *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req);
682 if (*md5) {
683 opts->options |= OPTION_MD5;
684 remaining -= TCPOLEN_MD5SIG_ALIGNED;
685
686 /* We can't fit any SACK blocks in a packet with MD5 + TS
687 * options. There was discussion about disabling SACK
688 * rather than TS in order to fit in better with old,
689 * buggy kernels, but that was deemed to be unnecessary.
690 */
691 ireq->tstamp_ok &= !ireq->sack_ok;
692 }
693#else
694 *md5 = NULL;
695#endif
696
697 /* We always send an MSS option. */
698 opts->mss = mss;
699 remaining -= TCPOLEN_MSS_ALIGNED;
700
701 if (likely(ireq->wscale_ok)) {
702 opts->ws = ireq->rcv_wscale;
703 opts->options |= OPTION_WSCALE;
704 remaining -= TCPOLEN_WSCALE_ALIGNED;
705 }
706 if (likely(ireq->tstamp_ok)) {
707 opts->options |= OPTION_TS;
708 opts->tsval = TCP_SKB_CB(skb)->when;
709 opts->tsecr = req->ts_recent;
710 remaining -= TCPOLEN_TSTAMP_ALIGNED;
711 }
712 if (likely(ireq->sack_ok)) {
713 opts->options |= OPTION_SACK_ADVERTISE;
714 if (unlikely(!ireq->tstamp_ok))
715 remaining -= TCPOLEN_SACKPERM_ALIGNED;
716 }
717
718 /* Similar rationale to tcp_syn_options() applies here, too.
719 * If the <SYN> options fit, the same options should fit now!
720 */
721 if (*md5 == NULL &&
722 ireq->tstamp_ok &&
723 cookie_plus > TCPOLEN_COOKIE_BASE) {
724 int need = cookie_plus; /* has TCPOLEN_COOKIE_BASE */
725
726 if (0x2 & need) {
727 /* 32-bit multiple */
728 need += 2; /* NOPs */
729 }
730 if (need <= remaining) {
731 opts->options |= OPTION_COOKIE_EXTENSION;
732 opts->hash_size = cookie_plus - TCPOLEN_COOKIE_BASE;
733 remaining -= need;
734 } else {
735 /* There's no error return, so flag it. */
736 xvp->cookie_out_never = 1; /* true */
737 opts->hash_size = 0;
738 }
739 }
740 return MAX_TCP_OPTION_SPACE - remaining;
741}
742
743/* Compute TCP options for ESTABLISHED sockets. This is not the
744 * final wire format yet.
745 */
746static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb,
747 struct tcp_out_options *opts,
748 struct tcp_md5sig_key **md5)
749{
750 struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;
751 struct tcp_sock *tp = tcp_sk(sk);
752 unsigned size = 0;
753 unsigned int eff_sacks;
754
755#ifdef CONFIG_TCP_MD5SIG
756 *md5 = tp->af_specific->md5_lookup(sk, sk);
757 if (unlikely(*md5)) {
758 opts->options |= OPTION_MD5;
759 size += TCPOLEN_MD5SIG_ALIGNED;
760 }
761#else
762 *md5 = NULL;
763#endif
764
765 if (likely(tp->rx_opt.tstamp_ok)) {
766 opts->options |= OPTION_TS;
767 opts->tsval = tcb ? tcb->when : 0;
768 opts->tsecr = tp->rx_opt.ts_recent;
769 size += TCPOLEN_TSTAMP_ALIGNED;
770 }
771
772 eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
773 if (unlikely(eff_sacks)) {
774 const unsigned remaining = MAX_TCP_OPTION_SPACE - size;
775 opts->num_sack_blocks =
776 min_t(unsigned, eff_sacks,
777 (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
778 TCPOLEN_SACK_PERBLOCK);
779 size += TCPOLEN_SACK_BASE_ALIGNED +
780 opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
781 }
782
783 return size;
784}
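
/* Illustrative sketch (not part of the original file): with timestamps in
 * use, 40 - 12 = 28 option bytes remain, so the min_t() clamp above allows
 * at most (28 - 4) / 8 = 3 SACK blocks per segment. Hypothetical helper:
 */
static inline unsigned int example_max_sack_blocks(unsigned int size_used)
{
	unsigned int remaining = MAX_TCP_OPTION_SPACE - size_used;

	return (remaining - TCPOLEN_SACK_BASE_ALIGNED) / TCPOLEN_SACK_PERBLOCK;
}
/* example_max_sack_blocks(TCPOLEN_TSTAMP_ALIGNED) == 3 */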
785
786/* This routine actually transmits TCP packets queued in by
787 * tcp_do_sendmsg(). This is used by both the initial
788 * transmission and possible later retransmissions.
789 * All SKB's seen here are completely headerless. It is our
790 * job to build the TCP header, and pass the packet down to
791 * IP so it can do the same plus pass the packet off to the
792 * device.
793 *
794 * We are working here with either a clone of the original
795 * SKB, or a fresh unique copy made by the retransmit engine.
796 */
797static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
798 gfp_t gfp_mask)
799{
800 const struct inet_connection_sock *icsk = inet_csk(sk);
801 struct inet_sock *inet;
802 struct tcp_sock *tp;
803 struct tcp_skb_cb *tcb;
804 struct tcp_out_options opts;
805 unsigned tcp_options_size, tcp_header_size;
806 struct tcp_md5sig_key *md5;
807 struct tcphdr *th;
808 int err;
809
810 BUG_ON(!skb || !tcp_skb_pcount(skb));
811
812 /* If congestion control is doing timestamping, we must
813 * take such a timestamp before we potentially clone/copy.
814 */
815 if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
816 __net_timestamp(skb);
817
818 if (likely(clone_it)) {
819 if (unlikely(skb_cloned(skb)))
820 skb = pskb_copy(skb, gfp_mask);
821 else
822 skb = skb_clone(skb, gfp_mask);
823 if (unlikely(!skb))
824 return -ENOBUFS;
825 }
826
827 inet = inet_sk(sk);
828 tp = tcp_sk(sk);
829 tcb = TCP_SKB_CB(skb);
830 memset(&opts, 0, sizeof(opts));
831
832 if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
833 tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
834 else
835 tcp_options_size = tcp_established_options(sk, skb, &opts,
836 &md5);
837 tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
838
839 if (tcp_packets_in_flight(tp) == 0)
840 tcp_ca_event(sk, CA_EVENT_TX_START);
841
842 /* if no packet is in qdisc/device queue, then allow XPS to select
843 * another queue.
844 */
845 skb->ooo_okay = sk_wmem_alloc_get(sk) == 0;
846
847 skb_push(skb, tcp_header_size);
848 skb_reset_transport_header(skb);
849 skb_set_owner_w(skb, sk);
850
851 /* Build TCP header and checksum it. */
852 th = tcp_hdr(skb);
853 th->source = inet->inet_sport;
854 th->dest = inet->inet_dport;
855 th->seq = htonl(tcb->seq);
856 th->ack_seq = htonl(tp->rcv_nxt);
857 *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
858 tcb->tcp_flags);
859
860 if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
861 /* RFC1323: The window in SYN & SYN/ACK segments
862 * is never scaled.
863 */
864 th->window = htons(min(tp->rcv_wnd, 65535U));
865 } else {
866 th->window = htons(tcp_select_window(sk));
867 }
868 th->check = 0;
869 th->urg_ptr = 0;
870
871 /* The urg_mode check is necessary during a below snd_una win probe */
872 if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
873 if (before(tp->snd_up, tcb->seq + 0x10000)) {
874 th->urg_ptr = htons(tp->snd_up - tcb->seq);
875 th->urg = 1;
876 } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
877 th->urg_ptr = htons(0xFFFF);
878 th->urg = 1;
879 }
880 }
881
882 tcp_options_write((__be32 *)(th + 1), tp, &opts);
883 if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0))
884 TCP_ECN_send(sk, skb, tcp_header_size);
885
886#ifdef CONFIG_TCP_MD5SIG
887 /* Calculate the MD5 hash, as we have all we need now */
888 if (md5) {
889 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
890 tp->af_specific->calc_md5_hash(opts.hash_location,
891 md5, sk, NULL, skb);
892 }
893#endif
894
895 icsk->icsk_af_ops->send_check(sk, skb);
896
897 if (likely(tcb->tcp_flags & TCPHDR_ACK))
898 tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
899
900 if (skb->len != tcp_header_size)
901 tcp_event_data_sent(tp, sk);
902
903 if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
904 TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
905 tcp_skb_pcount(skb));
906
907 TCP_PKT_STATS_INC(TCP_SEND_PKTS);
908
909 err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl);
910 if (likely(err <= 0))
911 return err;
912
913 tcp_enter_cwr(sk, 1);
914
915 return net_xmit_eval(err);
916}
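
/* Illustrative sketch (not part of the original file): the raw halfword
 * store in tcp_transmit_skb() packs the 4-bit data offset (header length
 * in 32-bit words) and the flag bits into bytes 12-13 of the TCP header
 * in a single write. Hypothetical helper producing the same network-order
 * halfword; a 20-byte header (doff = 5) with only ACK set gives 0x5010:
 */
static inline __be16 example_doff_flags_word(unsigned int tcp_header_size,
					     u8 flags)
{
	return htons(((tcp_header_size >> 2) << 12) | flags);
}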
917
918/* This routine just queues the buffer for sending.
919 *
920 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
921 * otherwise socket can stall.
922 */
923static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
924{
925 struct tcp_sock *tp = tcp_sk(sk);
926
927 /* Advance write_seq and place onto the write_queue. */
928 tp->write_seq = TCP_SKB_CB(skb)->end_seq;
929 skb_header_release(skb);
930 tcp_add_write_queue_tail(sk, skb);
931 sk->sk_wmem_queued += skb->truesize;
932 sk_mem_charge(sk, skb->truesize);
933}
934
935/* Initialize TSO segments for a packet. */
936static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
937 unsigned int mss_now)
938{
939 /* Make sure we own this skb before messing gso_size/gso_segs */
940 WARN_ON_ONCE(skb_cloned(skb));
941
942 if (skb->len <= mss_now || !sk_can_gso(sk) ||
943 skb->ip_summed == CHECKSUM_NONE) {
944 /* Avoid the costly divide in the normal
945 * non-TSO case.
946 */
947 skb_shinfo(skb)->gso_segs = 1;
948 skb_shinfo(skb)->gso_size = 0;
949 skb_shinfo(skb)->gso_type = 0;
950 } else {
951 skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss_now);
952 skb_shinfo(skb)->gso_size = mss_now;
953 skb_shinfo(skb)->gso_type = sk->sk_gso_type;
954 }
955}
956
957/* When a modification to fackets_out becomes necessary, we need to check
958 * whether the skb is counted in fackets_out or not.
959 */
960static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb,
961 int decr)
962{
963 struct tcp_sock *tp = tcp_sk(sk);
964
965 if (!tp->sacked_out || tcp_is_reno(tp))
966 return;
967
968 if (after(tcp_highest_sack_seq(tp), TCP_SKB_CB(skb)->seq))
969 tp->fackets_out -= decr;
970}
971
972/* Pcount of a packet in the middle of the write queue got changed; we need to do
973 * various tweaks to fix the counters.
974 */
975static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
976{
977 struct tcp_sock *tp = tcp_sk(sk);
978
979 tp->packets_out -= decr;
980
981 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
982 tp->sacked_out -= decr;
983 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
984 tp->retrans_out -= decr;
985 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
986 tp->lost_out -= decr;
987
988 /* Reno case is special. Sigh... */
989 if (tcp_is_reno(tp) && decr > 0)
990 tp->sacked_out -= min_t(u32, tp->sacked_out, decr);
991
992 tcp_adjust_fackets_out(sk, skb, decr);
993
994 if (tp->lost_skb_hint &&
995 before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
996 (tcp_is_fack(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)))
997 tp->lost_cnt_hint -= decr;
998
999 tcp_verify_left_out(tp);
1000}
1001
1002/* Function to create two new TCP segments. Shrinks the given segment
1003 * to the specified size and appends a new segment with the rest of the
1004 * packet to the list. This won't be called frequently, I hope.
1005 * Remember, these are still headerless SKBs at this point.
1006 */
1007int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
1008 unsigned int mss_now)
1009{
1010 struct tcp_sock *tp = tcp_sk(sk);
1011 struct sk_buff *buff;
1012 int nsize, old_factor;
1013 int nlen;
1014 u8 flags;
1015
1016 if (WARN_ON(len > skb->len))
1017 return -EINVAL;
1018
1019 nsize = skb_headlen(skb) - len;
1020 if (nsize < 0)
1021 nsize = 0;
1022 //CVE-2019-11478
1023 if (unlikely((sk->sk_wmem_queued >> 1) > sk->sk_sndbuf)) {
1024 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG);
1025 return -ENOMEM;
1026 }
1027
1028 if (skb_unclone(skb, GFP_ATOMIC))
1029 return -ENOMEM;
1030
1031 /* Get a new skb... force flag on. */
1032 buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
1033 if (buff == NULL)
1034 return -ENOMEM; /* We'll just try again later. */
1035
1036 sk->sk_wmem_queued += buff->truesize;
1037 sk_mem_charge(sk, buff->truesize);
1038 nlen = skb->len - len - nsize;
1039 buff->truesize += nlen;
1040 skb->truesize -= nlen;
1041
1042 /* Correct the sequence numbers. */
1043 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
1044 TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
1045 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
1046
1047 /* PSH and FIN should only be set in the second packet. */
1048 flags = TCP_SKB_CB(skb)->tcp_flags;
1049 TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
1050 TCP_SKB_CB(buff)->tcp_flags = flags;
1051 TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
1052
1053 if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
1054 /* Copy and checksum data tail into the new buffer. */
1055 buff->csum = csum_partial_copy_nocheck(skb->data + len,
1056 skb_put(buff, nsize),
1057 nsize, 0);
1058
1059 skb_trim(skb, len);
1060
1061 skb->csum = csum_block_sub(skb->csum, buff->csum, len);
1062 } else {
1063 skb->ip_summed = CHECKSUM_PARTIAL;
1064 skb_split(skb, buff, len);
1065 }
1066
1067 buff->ip_summed = skb->ip_summed;
1068
1069 /* Looks stupid, but our code really uses when of
1070 * skbs, which it never sent before. --ANK
1071 */
1072 TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
1073 buff->tstamp = skb->tstamp;
1074
1075 old_factor = tcp_skb_pcount(skb);
1076
1077 /* Fix up tso_factor for both original and new SKB. */
1078 tcp_set_skb_tso_segs(sk, skb, mss_now);
1079 tcp_set_skb_tso_segs(sk, buff, mss_now);
1080
1081 /* If this packet has been sent out already, we must
1082 * adjust the various packet counters.
1083 */
1084 if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
1085 int diff = old_factor - tcp_skb_pcount(skb) -
1086 tcp_skb_pcount(buff);
1087
1088 if (diff)
1089 tcp_adjust_pcount(sk, skb, diff);
1090 }
1091
1092 /* Link BUFF into the send queue. */
1093 skb_header_release(buff);
1094 tcp_insert_write_queue_after(skb, buff, sk);
1095
1096 return 0;
1097}
1098
1099/* This is similar to __pskb_pull_head() (it will go to core/skbuff.c
1100 * eventually). The difference is that the pulled data is not copied, but
1101 * immediately discarded.
1102 */
1103static void __pskb_trim_head(struct sk_buff *skb, int len)
1104{
1105 int i, k, eat;
1106
1107 eat = min_t(int, len, skb_headlen(skb));
1108 if (eat) {
1109 __skb_pull(skb, eat);
1110 len -= eat;
1111 if (!len)
1112 return;
1113 }
1114 eat = len;
1115 k = 0;
1116 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1117 int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
1118
1119 if (size <= eat) {
1120 skb_frag_unref(skb, i);
1121 eat -= size;
1122 } else {
1123 skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
1124 if (eat) {
1125 skb_shinfo(skb)->frags[k].page_offset += eat;
1126 skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat);
1127 eat = 0;
1128 }
1129 k++;
1130 }
1131 }
1132 skb_shinfo(skb)->nr_frags = k;
1133
1134 skb_reset_tail_pointer(skb);
1135 skb->data_len -= len;
1136 skb->len = skb->data_len;
1137}
1138
1139/* Remove acked data from a packet in the transmit queue. */
1140int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
1141{
1142 if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
1143 return -ENOMEM;
1144
1145 __pskb_trim_head(skb, len);
1146
1147 TCP_SKB_CB(skb)->seq += len;
1148 skb->ip_summed = CHECKSUM_PARTIAL;
1149
1150 skb->truesize -= len;
1151 sk->sk_wmem_queued -= len;
1152 sk_mem_uncharge(sk, len);
1153 sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
1154
1155 /* Any change of skb->len requires recalculation of tso factor. */
1156 if (tcp_skb_pcount(skb) > 1)
1157 tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb));
1158
1159 return 0;
1160}
1161
1162/* Calculate MSS. Not accounting for SACKs here. */
1163int tcp_mtu_to_mss(const struct sock *sk, int pmtu)
1164{
1165 const struct tcp_sock *tp = tcp_sk(sk);
1166 const struct inet_connection_sock *icsk = inet_csk(sk);
1167 int mss_now;
1168
1169 /* Calculate base mss without TCP options:
1170 It is MMS_S - sizeof(tcphdr) of rfc1122
1171 */
1172 mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
1173
1174 /* Clamp it (mss_clamp does not include tcp options) */
1175 if (mss_now > tp->rx_opt.mss_clamp)
1176 mss_now = tp->rx_opt.mss_clamp;
1177
1178 /* Now subtract optional transport overhead */
1179 mss_now -= icsk->icsk_ext_hdr_len;
1180
1181 /* Then reserve room for full set of TCP options and 8 bytes of data */
1182 //hub:CVE-2019-11477
1183 if(mss_now < TCP_MIN_SND_MSS)
1184 mss_now = TCP_MIN_SND_MSS;
1185
1186 /* Now subtract TCP options size, not including SACKs */
1187 mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
1188
1189 return mss_now;
1190}
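
/* Illustrative sketch (not part of the original file): for a plain IPv4
 * socket with no extension headers and timestamps enabled (12 bytes of
 * options in tcp_header_len), tcp_mtu_to_mss(sk, 1500) works out to
 * 1500 - 20 - 20 - 12 = 1448. A hypothetical stand-alone version of that
 * arithmetic, including the CVE-2019-11477 floor applied above:
 */
static inline int example_mss_from_pmtu(int pmtu, int net_hdr_len,
					int opts_len)
{
	int mss = pmtu - net_hdr_len - (int)sizeof(struct tcphdr);

	if (mss < TCP_MIN_SND_MSS)
		mss = TCP_MIN_SND_MSS;
	return mss - opts_len;	/* (1500, 20, 12) -> 1448 */
}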
1191
1192/* Inverse of above */
1193int tcp_mss_to_mtu(const struct sock *sk, int mss)
1194{
1195 const struct tcp_sock *tp = tcp_sk(sk);
1196 const struct inet_connection_sock *icsk = inet_csk(sk);
1197 int mtu;
1198
1199 mtu = mss +
1200 tp->tcp_header_len +
1201 icsk->icsk_ext_hdr_len +
1202 icsk->icsk_af_ops->net_header_len;
1203
1204 return mtu;
1205}
1206
1207/* MTU probing init per socket */
1208void tcp_mtup_init(struct sock *sk)
1209{
1210 struct tcp_sock *tp = tcp_sk(sk);
1211 struct inet_connection_sock *icsk = inet_csk(sk);
1212
1213 icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1;
1214 icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
1215 icsk->icsk_af_ops->net_header_len;
1216 icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss);
1217 icsk->icsk_mtup.probe_size = 0;
1218}
1219EXPORT_SYMBOL(tcp_mtup_init);
1220
1221/* This function synchronizes snd mss to the current pmtu/exthdr set.
1222
1223 tp->rx_opt.user_mss is the mss set by the user via TCP_MAXSEG. It does NOT
1224 account for TCP options, but includes only the bare TCP header.
1225
1226 tp->rx_opt.mss_clamp is mss negotiated at connection setup.
1227 It is minimum of user_mss and mss received with SYN.
1228 It also does not include TCP options.
1229
1230 inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function.
1231
1232 tp->mss_cache is current effective sending mss, including
1233 all tcp options except for SACKs. It is evaluated,
1234 taking into account current pmtu, but never exceeds
1235 tp->rx_opt.mss_clamp.
1236
1237 NOTE1. rfc1122 clearly states that advertised MSS
1238 DOES NOT include either tcp or ip options.
1239
1240 NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
1241 are READ ONLY outside this function. --ANK (980731)
1242 */
1243unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
1244{
1245 struct tcp_sock *tp = tcp_sk(sk);
1246 struct inet_connection_sock *icsk = inet_csk(sk);
1247 int mss_now;
1248
1249 if (icsk->icsk_mtup.search_high > pmtu)
1250 icsk->icsk_mtup.search_high = pmtu;
1251
1252 mss_now = tcp_mtu_to_mss(sk, pmtu);
1253 mss_now = tcp_bound_to_half_wnd(tp, mss_now);
1254
1255 /* And store cached results */
1256 icsk->icsk_pmtu_cookie = pmtu;
1257 if (icsk->icsk_mtup.enabled)
1258 mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
1259 tp->mss_cache = mss_now;
1260
1261 return mss_now;
1262}
1263EXPORT_SYMBOL(tcp_sync_mss);
1264
1265/* Compute the current effective MSS, taking SACKs and IP options,
1266 * and even PMTU discovery events into account.
1267 */
1268unsigned int tcp_current_mss(struct sock *sk)
1269{
1270 const struct tcp_sock *tp = tcp_sk(sk);
1271 const struct dst_entry *dst = __sk_dst_get(sk);
1272 u32 mss_now;
1273 unsigned header_len;
1274 struct tcp_out_options opts;
1275 struct tcp_md5sig_key *md5;
1276
1277 mss_now = tp->mss_cache;
1278
1279 if (dst) {
1280 u32 mtu = dst_mtu(dst);
1281 if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
1282 mss_now = tcp_sync_mss(sk, mtu);
1283 }
1284
1285 header_len = tcp_established_options(sk, NULL, &opts, &md5) +
1286 sizeof(struct tcphdr);
1287 /* The mss_cache is sized based on tp->tcp_header_len, which assumes
1288 * some common options. If this is an odd packet (because we have SACK
1289 * blocks etc) then our calculated header_len will be different, and
1290 * we have to adjust mss_now correspondingly */
1291 if (header_len != tp->tcp_header_len) {
1292 int delta = (int) header_len - tp->tcp_header_len;
1293 mss_now -= delta;
1294 }
1295
1296 return mss_now;
1297}
1298
1299/* Congestion window validation. (RFC2861) */
1300static void tcp_cwnd_validate(struct sock *sk)
1301{
1302 struct tcp_sock *tp = tcp_sk(sk);
1303
1304 if (tp->packets_out >= tp->snd_cwnd) {
1305 /* Network is fed fully. */
1306 tp->snd_cwnd_used = 0;
1307 tp->snd_cwnd_stamp = tcp_time_stamp;
1308 } else {
1309 /* Network starves. */
1310 if (tp->packets_out > tp->snd_cwnd_used)
1311 tp->snd_cwnd_used = tp->packets_out;
1312
1313 if (sysctl_tcp_slow_start_after_idle &&
1314 (s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto)
1315 tcp_cwnd_application_limited(sk);
1316 }
1317}
1318
1319/* Returns the portion of skb which can be sent right away without
1320 * introducing MSS oddities to segment boundaries. In rare cases where
1321 * mss_now != mss_cache, we will request caller to create a small skb
1322 * per input skb which could be mostly avoided here (if desired).
1323 *
1324 * We explicitly want to create a request for splitting write queue tail
1325 * to a small skb for Nagle purposes while avoiding unnecessary modulos,
1326 * thus all the complexity (cwnd_len is always MSS multiple which we
1327 * return whenever allowed by the other factors). Basically we need the
1328 * modulo only when the receiver window alone is the limiting factor or
1329 * when we would be allowed to send the split-due-to-Nagle skb fully.
1330 */
1331static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb,
1332 unsigned int mss_now, unsigned int max_segs)
1333{
1334 const struct tcp_sock *tp = tcp_sk(sk);
1335 u32 needed, window, max_len;
1336
1337 window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
1338 max_len = mss_now * max_segs;
1339
1340 if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
1341 return max_len;
1342
1343 needed = min(skb->len, window);
1344
1345 if (max_len <= needed)
1346 return max_len;
1347
1348 return needed - needed % mss_now;
1349}
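
/* Illustrative sketch (not part of the original file): when the receive
 * window, rather than the TSO budget, is the limiting factor, the code
 * above rounds the sendable amount down to an MSS boundary. E.g.
 * needed = 10000 and mss_now = 1448 gives 10000 - 10000 % 1448 = 8688,
 * i.e. exactly six full segments. Hypothetical helper:
 */
static inline unsigned int example_split_point(unsigned int needed,
					       unsigned int mss_now)
{
	return needed - needed % mss_now;	/* (10000, 1448) -> 8688 */
}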
1350
1351/* Can at least one segment of SKB be sent right now, according to the
1352 * congestion window rules? If so, return how many segments are allowed.
1353 */
1354static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
1355 const struct sk_buff *skb)
1356{
1357 u32 in_flight, cwnd;
1358
1359 /* Don't be strict about the congestion window for the final FIN. */
1360 if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
1361 tcp_skb_pcount(skb) == 1)
1362 return 1;
1363
1364 in_flight = tcp_packets_in_flight(tp);
1365 cwnd = tp->snd_cwnd;
1366 if (in_flight < cwnd)
1367 return (cwnd - in_flight);
1368
1369 return 0;
1370}
1371
1372/* Initialize TSO state of a skb.
1373 * This must be invoked the first time we consider transmitting
1374 * SKB onto the wire.
1375 */
1376static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
1377 unsigned int mss_now)
1378{
1379 int tso_segs = tcp_skb_pcount(skb);
1380
1381 if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
1382 tcp_set_skb_tso_segs(sk, skb, mss_now);
1383 tso_segs = tcp_skb_pcount(skb);
1384 }
1385 return tso_segs;
1386}
1387
1388/* Minshall's variant of the Nagle send check. */
1389static inline int tcp_minshall_check(const struct tcp_sock *tp)
1390{
1391 return after(tp->snd_sml, tp->snd_una) &&
1392 !after(tp->snd_sml, tp->snd_nxt);
1393}
1394
1395/* Return 0 if the packet can be sent now without violating Nagle's rules:
1396 * 1. It is full sized.
1397 * 2. Or it contains FIN. (already checked by caller)
1398 * 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
1399 * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
1400 * With Minshall's modification: all sent small packets are ACKed.
1401 */
1402static inline int tcp_nagle_check(const struct tcp_sock *tp,
1403 const struct sk_buff *skb,
1404 unsigned mss_now, int nonagle)
1405{
1406 return skb->len < mss_now &&
1407 ((nonagle & TCP_NAGLE_CORK) ||
1408 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
1409}
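
/* Illustrative sketch (not part of the original file): the check above
 * holds a segment back only when it is smaller than one MSS and either
 * TCP_CORK is set, or Nagle is active while a previously sent small
 * segment is still unacknowledged (Minshall's variant). Hypothetical
 * restatement with plain flags:
 */
static inline int example_nagle_holds_back(unsigned int len, unsigned int mss,
					   int corked, int small_seg_unacked)
{
	return len < mss && (corked || small_seg_unacked);
}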
1410
1411/* Return non-zero if the Nagle test allows this packet to be
1412 * sent now.
1413 */
1414static inline int tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
1415 unsigned int cur_mss, int nonagle)
1416{
1417 /* The Nagle rule does not apply to frames which sit in the middle of the
1418 * write_queue (they have no chance to get new data).
1419 *
1420 * This is implemented in the callers, where they modify the 'nonagle'
1421 * argument based upon the location of SKB in the send queue.
1422 */
1423 if (nonagle & TCP_NAGLE_PUSH)
1424 return 1;
1425
1426 /* Don't use the nagle rule for urgent data (or for the final FIN).
1427 * Nagle can be ignored during F-RTO too (see RFC4138).
1428 */
1429 if (tcp_urg_mode(tp) || (tp->frto_counter == 2) ||
1430 (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
1431 return 1;
1432
1433 if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
1434 return 1;
1435
1436 return 0;
1437}
1438
1439/* Does at least the first segment of SKB fit into the send window? */
1440static inline int tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb,
1441 unsigned int cur_mss)
1442{
1443 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
1444
1445 if (skb->len > cur_mss)
1446 end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
1447
1448 return !after(end_seq, tcp_wnd_end(tp));
1449}
1450
1451/* This checks if the data bearing packet SKB (usually tcp_send_head(sk))
1452 * should be put on the wire right now. If so, it returns the number of
1453 * packets allowed by the congestion window.
1454 */
1455static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb,
1456 unsigned int cur_mss, int nonagle)
1457{
1458 const struct tcp_sock *tp = tcp_sk(sk);
1459 unsigned int cwnd_quota;
1460
1461 tcp_init_tso_segs(sk, skb, cur_mss);
1462
1463 if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
1464 return 0;
1465
1466 cwnd_quota = tcp_cwnd_test(tp, skb);
1467 if (cwnd_quota && !tcp_snd_wnd_test(tp, skb, cur_mss))
1468 cwnd_quota = 0;
1469
1470 return cwnd_quota;
1471}
1472
1473/* Test if sending is allowed right now. */
1474int tcp_may_send_now(struct sock *sk)
1475{
1476 const struct tcp_sock *tp = tcp_sk(sk);
1477 struct sk_buff *skb = tcp_send_head(sk);
1478
1479 return skb &&
1480 tcp_snd_test(sk, skb, tcp_current_mss(sk),
1481 (tcp_skb_is_last(sk, skb) ?
1482 tp->nonagle : TCP_NAGLE_PUSH));
1483}
1484
1485/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
1486 * which is put after SKB on the list. It is very much like
1487 * tcp_fragment() except that it may make several kinds of assumptions
1488 * in order to speed up the splitting operation. In particular, we
1489 * know that all the data is in scatter-gather pages, and that the
1490 * packet has never been sent out before (and thus is not cloned).
1491 */
1492static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
1493 unsigned int mss_now, gfp_t gfp)
1494{
1495 struct sk_buff *buff;
1496 int nlen = skb->len - len;
1497 u8 flags;
1498
1499 /* All of a TSO frame must be composed of paged data. */
1500 if (skb->len != skb->data_len)
1501 return tcp_fragment(sk, skb, len, mss_now);
1502
1503 buff = sk_stream_alloc_skb(sk, 0, gfp);
1504 if (unlikely(buff == NULL))
1505 return -ENOMEM;
1506
1507 sk->sk_wmem_queued += buff->truesize;
1508 sk_mem_charge(sk, buff->truesize);
1509 buff->truesize += nlen;
1510 skb->truesize -= nlen;
1511
1512 /* Correct the sequence numbers. */
1513 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
1514 TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
1515 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
1516
1517 /* PSH and FIN should only be set in the second packet. */
1518 flags = TCP_SKB_CB(skb)->tcp_flags;
1519 TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
1520 TCP_SKB_CB(buff)->tcp_flags = flags;
1521
1522 /* This packet was never sent out yet, so no SACK bits. */
1523 TCP_SKB_CB(buff)->sacked = 0;
1524
1525 buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;
1526 skb_split(skb, buff, len);
1527
1528 /* Fix up tso_factor for both original and new SKB. */
1529 tcp_set_skb_tso_segs(sk, skb, mss_now);
1530 tcp_set_skb_tso_segs(sk, buff, mss_now);
1531
1532 /* Link BUFF into the send queue. */
1533 skb_header_release(buff);
1534 tcp_insert_write_queue_after(skb, buff, sk);
1535
1536 return 0;
1537}
1538
1539/* Try to defer sending, if possible, in order to minimize the amount
1540 * of TSO splitting we do. View it as a kind of TSO Nagle test.
1541 *
1542 * This algorithm is from John Heffner.
1543 */
1544static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
1545{
1546 struct tcp_sock *tp = tcp_sk(sk);
1547 const struct inet_connection_sock *icsk = inet_csk(sk);
1548 u32 send_win, cong_win, limit, in_flight;
1549 int win_divisor;
1550
1551 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
1552 goto send_now;
1553
1554 if (icsk->icsk_ca_state != TCP_CA_Open)
1555 goto send_now;
1556
1557 /* Defer for less than two clock ticks. */
1558 if (tp->tso_deferred &&
1559 (((u32)jiffies << 1) >> 1) - (tp->tso_deferred >> 1) > 1)
1560 goto send_now;
1561
1562 in_flight = tcp_packets_in_flight(tp);
1563
1564 BUG_ON(tcp_skb_pcount(skb) <= 1 || (tp->snd_cwnd <= in_flight));
1565
1566 send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
1567
1568 /* From in_flight test above, we know that cwnd > in_flight. */
1569 cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
1570
1571 limit = min(send_win, cong_win);
1572
1573 /* If a full-sized TSO skb can be sent, do it. */
1574 if (limit >= min_t(unsigned int, sk->sk_gso_max_size,
1575 sk->sk_gso_max_segs * tp->mss_cache))
1576 goto send_now;
1577
1578 /* An skb in the middle of the queue won't get more data; fully sendable already? */
1579 if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
1580 goto send_now;
1581
1582 win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor);
1583 if (win_divisor) {
1584 u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
1585
1586 /* If at least some fraction of a window is available,
1587 * just use it.
1588 */
1589 chunk /= win_divisor;
1590 if (limit >= chunk)
1591 goto send_now;
1592 } else {
1593 /* Different approach, try not to defer past a single
1594 * ACK. Receiver should ACK every other full sized
1595 * frame, so if we have space for more than 3 frames
1596 * then send now.
1597 */
1598 if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache)
1599 goto send_now;
1600 }
1601
1602 /* Ok, it looks like it is advisable to defer.
1603 * Do not rearm the timer if already set to not break TCP ACK clocking.
1604 */
1605 if (!tp->tso_deferred)
1606 tp->tso_deferred = 1 | (jiffies << 1);
1607
1608 return 1;
1609
1610send_now:
1611 tp->tso_deferred = 0;
1612 return 0;
1613}
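
/* Illustrative sketch (not part of the original file): with the default
 * tcp_tso_win_divisor of 3, the deferral heuristic above stops waiting
 * once roughly a third of the usable window could be sent in one burst,
 * e.g. min(snd_wnd, snd_cwnd * mss_cache) = 90000 gives a 30000-byte
 * threshold. Hypothetical helper:
 */
static inline u32 example_tso_defer_chunk(u32 usable_window, u32 win_divisor)
{
	return usable_window / win_divisor;	/* 90000 / 3 == 30000 */
}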
1614
1615/* Create a new MTU probe if we are ready.
1616 * MTU probing regularly attempts to increase the path MTU by
1617 * deliberately sending larger packets. This discovers routing
1618 * changes resulting in larger path MTUs.
1619 *
1620 * Returns 0 if we should wait to probe (no cwnd available),
1621 * 1 if a probe was sent,
1622 * -1 otherwise
1623 */
1624static int tcp_mtu_probe(struct sock *sk)
1625{
1626 struct tcp_sock *tp = tcp_sk(sk);
1627 struct inet_connection_sock *icsk = inet_csk(sk);
1628 struct sk_buff *skb, *nskb, *next;
1629 int len;
1630 int probe_size;
1631 int size_needed;
1632 int copy;
1633 int mss_now;
1634
1635 /* Not currently probing/verifying,
1636 * not in recovery,
1637 * have enough cwnd, and
1638 * not SACKing (the variable headers throw things off) */
1639 if (!icsk->icsk_mtup.enabled ||
1640 icsk->icsk_mtup.probe_size ||
1641 inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
1642 tp->snd_cwnd < 11 ||
1643 tp->rx_opt.num_sacks || tp->rx_opt.dsack)
1644 return -1;
1645
1646 /* Very simple search strategy: just double the MSS. */
1647 mss_now = tcp_current_mss(sk);
1648 probe_size = 2 * tp->mss_cache;
1649 size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
1650 if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
1651 /* TODO: set timer for probe_converge_event */
1652 return -1;
1653 }
1654
1655 /* Have enough data in the send queue to probe? */
1656 if (tp->write_seq - tp->snd_nxt < size_needed)
1657 return -1;
1658
1659 if (tp->snd_wnd < size_needed)
1660 return -1;
1661 if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
1662 return 0;
1663
1664 /* Do we need to wait to drain cwnd? With none in flight, don't stall */
1665 if (tcp_packets_in_flight(tp) + 2 > tp->snd_cwnd) {
1666 if (!tcp_packets_in_flight(tp))
1667 return -1;
1668 else
1669 return 0;
1670 }
1671
1672 /* We're allowed to probe. Build it now. */
1673 if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL)
1674 return -1;
1675 sk->sk_wmem_queued += nskb->truesize;
1676 sk_mem_charge(sk, nskb->truesize);
1677
1678 skb = tcp_send_head(sk);
1679
1680 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
1681 TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
1682 TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK;
1683 TCP_SKB_CB(nskb)->sacked = 0;
1684 nskb->csum = 0;
1685 nskb->ip_summed = skb->ip_summed;
1686
1687 tcp_insert_write_queue_before(nskb, skb, sk);
1688
1689 len = 0;
1690 tcp_for_write_queue_from_safe(skb, next, sk) {
1691 copy = min_t(int, skb->len, probe_size - len);
1692 if (nskb->ip_summed)
1693 skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
1694 else
1695 nskb->csum = skb_copy_and_csum_bits(skb, 0,
1696 skb_put(nskb, copy),
1697 copy, nskb->csum);
1698
1699 if (skb->len <= copy) {
1700 /* We've eaten all the data from this skb.
1701 * Throw it away. */
1702 TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1703 tcp_unlink_write_queue(skb, sk);
1704 sk_wmem_free_skb(sk, skb);
1705 } else {
1706 TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags &
1707 ~(TCPHDR_FIN|TCPHDR_PSH);
1708 if (!skb_shinfo(skb)->nr_frags) {
1709 skb_pull(skb, copy);
1710 if (skb->ip_summed != CHECKSUM_PARTIAL)
1711 skb->csum = csum_partial(skb->data,
1712 skb->len, 0);
1713 } else {
1714 __pskb_trim_head(skb, copy);
1715 tcp_set_skb_tso_segs(sk, skb, mss_now);
1716 }
1717 TCP_SKB_CB(skb)->seq += copy;
1718 }
1719
1720 len += copy;
1721
1722 if (len >= probe_size)
1723 break;
1724 }
1725 tcp_init_tso_segs(sk, nskb, nskb->len);
1726
1727 /* We're ready to send. If this fails, the probe will
1728 * be resegmented into mss-sized pieces by tcp_write_xmit(). */
1729 TCP_SKB_CB(nskb)->when = tcp_time_stamp;
1730 if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
1731 /* Decrement cwnd here because we are sending
1732 * effectively two packets. */
1733 tp->snd_cwnd--;
1734 tcp_event_new_data_sent(sk, nskb);
1735
1736 icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
1737 tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
1738 tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
1739
1740 return 1;
1741 }
1742
1743 return -1;
1744}
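
/* Illustrative sketch (not part of the original file): the probe built
 * above is simply twice the current MSS, and the send queue must hold
 * enough extra data to survive the probe being lost. With mss_cache =
 * 1448 and reordering = 3 that is probe_size = 2896 and size_needed =
 * 2896 + 4 * 1448 = 8688 bytes. Hypothetical arithmetic helper:
 */
static inline int example_probe_size_needed(int mss, int reordering)
{
	return 2 * mss + (reordering + 1) * mss;	/* (1448, 3) -> 8688 */
}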
1745
1746/* This routine writes packets to the network. It advances the
1747 * send_head. This happens as incoming acks open up the remote
1748 * window for us.
1749 *
1750 * LARGESEND note: !tcp_urg_mode is overkill, only frames between
1751 * snd_up-64k-mss .. snd_up cannot be large. However, taking into
1752 * account rare use of URG, this is not a big flaw.
1753 *
1754 * Returns 1, if no segments are in flight and we have queued segments, but
1755 * cannot send anything now because of SWS or another problem.
1756 */
1757static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1758 int push_one, gfp_t gfp)
1759{
1760 struct tcp_sock *tp = tcp_sk(sk);
1761 struct sk_buff *skb;
1762 unsigned int tso_segs, sent_pkts;
1763 int cwnd_quota;
1764 int result;
1765
1766 sent_pkts = 0;
1767
1768 if (!push_one) {
1769 /* Do MTU probing. */
1770 result = tcp_mtu_probe(sk);
1771 if (!result) {
1772 return 0;
1773 } else if (result > 0) {
1774 sent_pkts = 1;
1775 }
1776 }
1777
1778 while ((skb = tcp_send_head(sk))) {
1779 unsigned int limit;
1780
1781 tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
1782 BUG_ON(!tso_segs);
1783
1784 cwnd_quota = tcp_cwnd_test(tp, skb);
1785 if (!cwnd_quota)
1786 break;
1787
1788 if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
1789 break;
1790
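		/* A single-segment skb is subject to the Nagle test; a TSO skb
		 * may instead be deferred in the hope of building a larger frame.
		 */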
1791 if (tso_segs == 1) {
1792 if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
1793 (tcp_skb_is_last(sk, skb) ?
1794 nonagle : TCP_NAGLE_PUSH))))
1795 break;
1796 } else {
1797 if (!push_one && tcp_tso_should_defer(sk, skb))
1798 break;
1799 }
1800
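		/* Work out how much of this skb may go out now: for TSO skbs the
		 * limit is derived from the cwnd quota and the GSO segment cap,
		 * and the skb is split below if it exceeds that limit.
		 */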
1801 limit = mss_now;
1802 if (tso_segs > 1 && !tcp_urg_mode(tp))
1803 limit = tcp_mss_split_point(sk, skb, mss_now,
1804 min_t(unsigned int,
1805 cwnd_quota,
1806 sk->sk_gso_max_segs));
1807
1808 if (skb->len > limit &&
1809 unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
1810 break;
1811
1812 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1813
1814 if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
1815 break;
1816
1817 /* Advance the send_head. This one is sent out.
1818 * This call will increment packets_out.
1819 */
1820 tcp_event_new_data_sent(sk, skb);
1821
1822 tcp_minshall_update(tp, mss_now, skb);
1823 sent_pkts += tcp_skb_pcount(skb);
1824
1825 if (push_one)
1826 break;
1827 }
1828 if (inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery)
1829 tp->prr_out += sent_pkts;
1830
1831 if (likely(sent_pkts)) {
1832 tcp_cwnd_validate(sk);
1833 return 0;
1834 }
1835 return !tp->packets_out && tcp_send_head(sk);
1836}
1837
1838/* Push out any pending frames which were held back due to
1839 * TCP_CORK or attempt at coalescing tiny packets.
1840 * The socket must be locked by the caller.
1841 */
1842void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
1843 int nonagle)
1844{
1845 /* If we are closed, the bytes will have to remain here.
1846 * In time closedown will finish, we empty the write queue and
1847 * all will be happy.
1848 */
1849 if (unlikely(sk->sk_state == TCP_CLOSE))
1850 return;
1851
1852 if (tcp_write_xmit(sk, cur_mss, nonagle, 0, GFP_ATOMIC))
1853 tcp_check_probe_timer(sk);
1854}
1855
1856/* Send a _single_ skb sitting at the send head. Use __tcp_push_pending_frames()
1857 * instead when the probe timer etc. must also be set up.
1858 */
1859void tcp_push_one(struct sock *sk, unsigned int mss_now)
1860{
1861 struct sk_buff *skb = tcp_send_head(sk);
1862
1863 BUG_ON(!skb || skb->len < mss_now);
1864
1865 tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
1866}
1867
1868/* This function returns the amount that we can raise the
1869 * usable window based on the following constraints
1870 *
1871 * 1. The window can never be shrunk once it is offered (RFC 793)
1872 * 2. We limit memory per socket
1873 *
1874 * RFC 1122:
1875 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
1876 * RCV.NXT + RCV.WND fixed until:
1877 * RCV.BUFF - RCV.USER - RCV.WND >= min(1/2 RCV.BUFF, MSS)"
1878 *
1879 * i.e. don't raise the right edge of the window until you can raise
1880 * it at least MSS bytes.
1881 *
1882 * Unfortunately, the recommended algorithm breaks header prediction,
1883 * since header prediction assumes th->window stays fixed.
1884 *
1885 * Strictly speaking, keeping th->window fixed violates the receiver
1886 * side SWS prevention criteria. The problem is that under this rule
1887 * a stream of single byte packets will cause the right side of the
1888 * window to always advance by a single byte.
1889 *
1890 * Of course, if the sender implements sender side SWS prevention
1891 * then this will not be a problem.
1892 *
1893 * BSD seems to make the following compromise:
1894 *
1895 * If the free space is less than 1/4 of the maximum
1896 * space available and the free space is less than 1/2 mss,
1897 * then set the window to 0.
1898 * [ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
1899 * Otherwise, just prevent the window from shrinking
1900 * and from being larger than the largest representable value.
1901 *
1902 * This prevents incremental opening of the window in the regime
1903 * where TCP is limited by the speed of the reader side taking
1904 * data out of the TCP receive queue. It does nothing about
1905 * those cases where the window is constrained on the sender side
1906 * because the pipeline is full.
1907 *
1908 * BSD also seems to "accidentally" limit itself to windows that are a
1909 * multiple of MSS, at least until the free space gets quite small.
1910 * This would appear to be a side effect of the mbuf implementation.
1911 * Combining these two algorithms results in the observed behavior
1912 * of having a fixed window size at almost all times.
1913 *
1914 * Below we obtain similar behavior by forcing the offered window to
1915 * a multiple of the mss when it is feasible to do so.
1916 *
1917 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
1918 * Regular options like TIMESTAMP are taken into account.
1919 */
1920u32 __tcp_select_window(struct sock *sk)
1921{
1922 struct inet_connection_sock *icsk = inet_csk(sk);
1923 struct tcp_sock *tp = tcp_sk(sk);
1924 /* MSS for the peer's data. Previous versions used mss_clamp
1925 * here. I don't know if the value based on our guesses
1926 * of peer's MSS is better for the performance. It's more correct
1927 * but may be worse for the performance because of rcv_mss
1928 * fluctuations. --SAW 1998/11/1
1929 */
1930 int mss = icsk->icsk_ack.rcv_mss;
1931 int free_space = tcp_space(sk);
1932 int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
1933 int window;
1934
1935 if (mss > full_space)
1936 mss = full_space;
1937
1938 if (free_space < (full_space >> 1)) {
1939 icsk->icsk_ack.quick = 0;
1940
1941 if (sk_under_memory_pressure(sk))
1942 tp->rcv_ssthresh = min(tp->rcv_ssthresh,
1943 4U * tp->advmss);
1944
1945 if (free_space < mss)
1946 return 0;
1947 }
1948
1949 if (free_space > tp->rcv_ssthresh)
1950 free_space = tp->rcv_ssthresh;
1951
1952 /* Don't do rounding if we are using window scaling, since the
1953 * scaled window will not line up with the MSS boundary anyway.
1954 */
1955 window = tp->rcv_wnd;
1956 if (tp->rx_opt.rcv_wscale) {
1957 window = free_space;
1958
1959 /* Advertise enough space so that it won't get scaled away.
1960 * Important case: prevent zero window announcement if
1961 * 1<<rcv_wscale > mss.
1962 */
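		/* e.g. with rcv_wscale == 7 the advertisement is rounded up to
		 * the next multiple of 128 bytes, so a small window is not
		 * scaled down to zero.
		 */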
1963 if (((window >> tp->rx_opt.rcv_wscale) << tp->rx_opt.rcv_wscale) != window)
1964 window = (((window >> tp->rx_opt.rcv_wscale) + 1)
1965 << tp->rx_opt.rcv_wscale);
1966 } else {
1967 /* Get the largest window that is a nice multiple of mss.
1968 * Window clamp already applied above.
1969 * If our current window offering is within 1 mss of the
1970 * free space we just keep it. This prevents the divide
1971 * and multiply from happening most of the time.
1972 * We also don't do any window rounding when the free space
1973 * is too small.
1974 */
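		/* e.g. free_space = 10000 and mss = 1460 give window = 6 * 1460 = 8760. */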
1975 if (window <= free_space - mss || window > free_space)
1976 window = (free_space / mss) * mss;
1977 else if (mss == full_space &&
1978 free_space > window + (full_space >> 1))
1979 window = free_space;
1980 }
1981
1982 return window;
1983}
1984
1985/* Collapses two adjacent SKB's during retransmission. */
1986static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
1987{
1988 struct tcp_sock *tp = tcp_sk(sk);
1989 struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
1990 int skb_size, next_skb_size;
1991
1992 skb_size = skb->len;
1993 next_skb_size = next_skb->len;
1994
1995 BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
1996
1997 tcp_highest_sack_combine(sk, next_skb, skb);
1998
1999 tcp_unlink_write_queue(next_skb, sk);
2000
2001 skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size),
2002 next_skb_size);
2003
2004 if (next_skb->ip_summed == CHECKSUM_PARTIAL)
2005 skb->ip_summed = CHECKSUM_PARTIAL;
2006
2007 if (skb->ip_summed != CHECKSUM_PARTIAL)
2008 skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);
2009
2010 /* Update sequence range on original skb. */
2011 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
2012
2013 /* Merge over control information. This moves PSH/FIN etc. over */
2014 TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags;
2015
2016 /* All done, get rid of second SKB and account for it so
2017 * packet counting does not break.
2018 */
2019 TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
2020
2021 /* changed transmit queue under us so clear hints */
2022 tcp_clear_retrans_hints_partial(tp);
2023 if (next_skb == tp->retransmit_skb_hint)
2024 tp->retransmit_skb_hint = skb;
2025
2026 tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb));
2027
2028 sk_wmem_free_skb(sk, next_skb);
2029}
2030
2031/* Check if coalescing SKBs is legal. */
2032static int tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
2033{
2034 if (tcp_skb_pcount(skb) > 1)
2035 return 0;
2036 /* TODO: SACK collapsing could be used to remove this condition */
2037 if (skb_shinfo(skb)->nr_frags != 0)
2038 return 0;
2039 if (skb_cloned(skb))
2040 return 0;
2041 if (skb == tcp_send_head(sk))
2042 return 0;
2043 /* Some heuristics for collapsing over SACK'd data could be invented */
2044 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
2045 return 0;
2046
2047 return 1;
2048}
2049
2050/* Collapse packets in the retransmit queue to create fewer packets
2051 * on the wire. This is only done on retransmission.
2052 */
2053static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
2054 int space)
2055{
2056 struct tcp_sock *tp = tcp_sk(sk);
2057 struct sk_buff *skb = to, *tmp;
2058 int first = 1;
2059
2060 if (!sysctl_tcp_retrans_collapse)
2061 return;
2062 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
2063 return;
2064
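	/* Walk the skbs following 'to', merging each one into it while it
	 * fits in the remaining space and stays inside the send window.
	 */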
2065 tcp_for_write_queue_from_safe(skb, tmp, sk) {
2066 if (!tcp_can_collapse(sk, skb))
2067 break;
2068
2069 space -= skb->len;
2070
2071 if (first) {
2072 first = 0;
2073 continue;
2074 }
2075
2076 if (space < 0)
2077 break;
2078 /* Punt if not enough space exists in the first SKB for
2079 * the data in the second
2080 */
2081 if (skb->len > skb_availroom(to))
2082 break;
2083
2084 if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
2085 break;
2086
2087 tcp_collapse_retrans(sk, to);
2088 }
2089}
2090
2091/* This retransmits one SKB. Policy decisions and retransmit queue
2092 * state updates are done by the caller. Returns non-zero if an
2093 * error occurred which prevented the send.
2094 */
2095int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
2096{
2097 struct tcp_sock *tp = tcp_sk(sk);
2098 struct inet_connection_sock *icsk = inet_csk(sk);
2099 unsigned int cur_mss;
2100 int err;
2101
2102 /* Inconclusive MTU probe */
2103 if (icsk->icsk_mtup.probe_size) {
2104 icsk->icsk_mtup.probe_size = 0;
2105 }
2106
2107 /* Do not send more than we have queued. 1/4 is reserved for possible
2108 * copying overhead: fragmentation, tunneling, mangling etc.
2109 */
2110 if (atomic_read(&sk->sk_wmem_alloc) >
2111 min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
2112 return -EAGAIN;
2113
2114 if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
2115 if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
2116 BUG();
2117 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
2118 return -ENOMEM;
2119 }
2120
2121 if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
2122 return -EHOSTUNREACH; /* Routing failure or similar. */
2123
2124 cur_mss = tcp_current_mss(sk);
2125
2126 /* If receiver has shrunk his window, and skb is out of
2127 * new window, do not retransmit it. The exception is the
2128 * case, when window is shrunk to zero. In this case
2129 * our retransmit serves as a zero window probe.
2130 */
2131 if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
2132 TCP_SKB_CB(skb)->seq != tp->snd_una)
2133 return -EAGAIN;
2134
2135 if (skb->len > cur_mss) {
2136 if (tcp_fragment(sk, skb, cur_mss, cur_mss))
2137 return -ENOMEM; /* We'll try again later. */
2138 } else {
2139 int oldpcount = tcp_skb_pcount(skb);
2140
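		/* The MSS may have grown since the original transmission;
		 * recompute the TSO segment count and fix up the in-flight
		 * packet accounting accordingly.
		 */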
2141 if (unlikely(oldpcount > 1)) {
2142 if (skb_unclone(skb, GFP_ATOMIC))
2143 return -ENOMEM;
2144 tcp_init_tso_segs(sk, skb, cur_mss);
2145 tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb));
2146 }
2147 }
2148
2149 tcp_retrans_try_collapse(sk, skb, cur_mss);
2150
2151 /* Some Solaris stacks overoptimize and ignore the FIN on a
2152 * retransmit when old data is attached. So strip it off
2153 * since it is cheap to do so and saves bytes on the network.
2154 */
2155 if (skb->len > 0 &&
2156 (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
2157 tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
2158 if (!pskb_trim(skb, 0)) {
2159 /* Reuse, even though it does some unnecessary work */
2160 tcp_init_nondata_skb(skb, TCP_SKB_CB(skb)->end_seq - 1,
2161 TCP_SKB_CB(skb)->tcp_flags);
2162 skb->ip_summed = CHECKSUM_NONE;
2163 }
2164 }
2165
2166 /* Make a copy, if the first transmission SKB clone we made
2167 * is still in somebody's hands, else make a clone.
2168 */
2169 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2170
2171 /* make sure skb->data is aligned on arches that require it
2172 * and check if ack-trimming & collapsing extended the headroom
2173 * beyond what csum_start can cover.
2174 */
2175 if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) ||
2176 skb_headroom(skb) >= 0xFFFF)) {
2177 struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER,
2178 GFP_ATOMIC);
2179 err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
2180 -ENOBUFS;
2181 } else {
2182 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
2183 }
2184
2185 if (err == 0) {
2186 /* Update global TCP statistics. */
2187 TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
2188
2189 TCP_PKT_STATS_INC(TCP_RETRANS_PKTS);
2190 TCP_PKT_STATS_INC(TCP_SEND_DROPS);
2191
2192 tp->total_retrans++;
2193
2194#if FASTRETRANS_DEBUG > 0
2195 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
2196 if (net_ratelimit())
2197 printk(KERN_DEBUG "retrans_out leaked.\n");
2198 }
2199#endif
2200 if (!tp->retrans_out)
2201 tp->lost_retrans_low = tp->snd_nxt;
2202 TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
2203 tp->retrans_out += tcp_skb_pcount(skb);
2204
2205 /* Save stamp of the first retransmit. */
2206 if (!tp->retrans_stamp)
2207 tp->retrans_stamp = TCP_SKB_CB(skb)->when;
2208
2209 /* snd_nxt is stored to detect loss of retransmitted segment,
2210 * see tcp_input.c tcp_sacktag_write_queue().
2211 */
2212 TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
2213 }
2214
2215 if (tp->undo_retrans < 0)
2216 tp->undo_retrans = 0;
2217 tp->undo_retrans += tcp_skb_pcount(skb);
2218 return err;
2219}
2220
2221/* Check if forward retransmits are possible in the current
2222 * window/congestion state.
2223 */
2224static int tcp_can_forward_retransmit(struct sock *sk)
2225{
2226 const struct inet_connection_sock *icsk = inet_csk(sk);
2227 const struct tcp_sock *tp = tcp_sk(sk);
2228
2229 /* Forward retransmissions are possible only during Recovery. */
2230 if (icsk->icsk_ca_state != TCP_CA_Recovery)
2231 return 0;
2232
2233 /* No forward retransmissions in Reno are possible. */
2234 if (tcp_is_reno(tp))
2235 return 0;
2236
2237 /* Yeah, we have to make a difficult choice between forward transmission
2238 * and retransmission... Both ways have their merits...
2239 *
2240 * For now we do not retransmit anything, while we have some new
2241 * segments to send. In the other cases, follow rule 3 for
2242 * NextSeg() specified in RFC3517.
2243 */
2244
2245 if (tcp_may_send_now(sk))
2246 return 0;
2247
2248 return 1;
2249}
2250
2251/* This gets called after a retransmit timeout, and the initially
2252 * retransmitted data is acknowledged. It tries to continue
2253 * resending the rest of the retransmit queue, until either
2254 * we've sent it all or the congestion window limit is reached.
2255 * If doing SACK, the first ACK which comes back for a timeout
2256 * based retransmit packet might feed us FACK information again.
2257 * If so, we use it to avoid unnecessary retransmissions.
2258 */
2259void tcp_xmit_retransmit_queue(struct sock *sk)
2260{
2261 const struct inet_connection_sock *icsk = inet_csk(sk);
2262 struct tcp_sock *tp = tcp_sk(sk);
2263 struct sk_buff *skb;
2264 struct sk_buff *hole = NULL;
2265 u32 last_lost;
2266 int mib_idx;
2267 int fwd_rexmitting = 0;
2268
2269 if (!tp->packets_out)
2270 return;
2271
2272 if (!tp->lost_out)
2273 tp->retransmit_high = tp->snd_una;
2274
2275 if (tp->retransmit_skb_hint) {
2276 skb = tp->retransmit_skb_hint;
2277 last_lost = TCP_SKB_CB(skb)->end_seq;
2278 if (after(last_lost, tp->retransmit_high))
2279 last_lost = tp->retransmit_high;
2280 } else {
2281 skb = tcp_write_queue_head(sk);
2282 last_lost = tp->snd_una;
2283 }
2284
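	/* Walk the queue from the hint (or head): first retransmit segments
	 * marked lost, up to retransmit_high; then, if allowed, forward
	 * retransmit un-SACKed segments below the highest SACK, backtracking
	 * to 'hole' (the first such candidate skipped) if one was seen.
	 */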
2285 tcp_for_write_queue_from(skb, sk) {
2286 __u8 sacked = TCP_SKB_CB(skb)->sacked;
2287
2288 if (skb == tcp_send_head(sk))
2289 break;
2290 /* we could do better than to assign each time */
2291 if (hole == NULL)
2292 tp->retransmit_skb_hint = skb;
2293
2294 /* Assume this retransmit will generate
2295 * only one packet for congestion window
2296 * calculation purposes. This works because
2297 * tcp_retransmit_skb() will chop up the
2298 * packet to be MSS sized and all the
2299 * packet counting works out.
2300 */
2301 if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
2302 return;
2303
2304 if (fwd_rexmitting) {
2305begin_fwd:
2306 if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
2307 break;
2308 mib_idx = LINUX_MIB_TCPFORWARDRETRANS;
2309
2310 } else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) {
2311 tp->retransmit_high = last_lost;
2312 if (!tcp_can_forward_retransmit(sk))
2313 break;
2314 /* Backtrack if necessary to an skb not yet marked lost */
2315 if (hole != NULL) {
2316 skb = hole;
2317 hole = NULL;
2318 }
2319 fwd_rexmitting = 1;
2320 goto begin_fwd;
2321
2322 } else if (!(sacked & TCPCB_LOST)) {
2323 if (hole == NULL && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
2324 hole = skb;
2325 continue;
2326
2327 } else {
2328 last_lost = TCP_SKB_CB(skb)->end_seq;
2329 if (icsk->icsk_ca_state != TCP_CA_Loss)
2330 mib_idx = LINUX_MIB_TCPFASTRETRANS;
2331 else
2332 mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
2333 }
2334
2335 if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
2336 continue;
2337
2338 if (tcp_retransmit_skb(sk, skb)) {
2339 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
2340 return;
2341 }
2342 NET_INC_STATS_BH(sock_net(sk), mib_idx);
2343
2344 if (inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery)
2345 tp->prr_out += tcp_skb_pcount(skb);
2346
2347 if (skb == tcp_write_queue_head(sk))
2348 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
2349 inet_csk(sk)->icsk_rto,
2350 TCP_RTO_MAX);
2351 }
2352}
2353
2354/* Send a fin. The caller locks the socket for us. This cannot be
2355 * allowed to fail queueing a FIN frame under any circumstances.
2356 */
2357void tcp_send_fin(struct sock *sk)
2358{
2359 struct tcp_sock *tp = tcp_sk(sk);
2360 struct sk_buff *skb = tcp_write_queue_tail(sk);
2361 int mss_now;
2362
2363 /* Optimization, tack on the FIN if we have a queue of
2364 * unsent frames. But be careful about outgoing SACKS
2365 * and IP options.
2366 */
2367 mss_now = tcp_current_mss(sk);
2368
2369 if (tcp_send_head(sk) != NULL) {
2370 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
2371 TCP_SKB_CB(skb)->end_seq++;
2372 tp->write_seq++;
2373 } else {
2374 /* Socket is locked, keep trying until memory is available. */
2375 for (;;) {
2376 skb = alloc_skb_fclone(MAX_TCP_HEADER,
2377 sk->sk_allocation);
2378 if (skb)
2379 break;
2380 yield();
2381 }
2382
2383 /* Reserve space for headers and prepare control bits. */
2384 skb_reserve(skb, MAX_TCP_HEADER);
2385 /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
2386 tcp_init_nondata_skb(skb, tp->write_seq,
2387 TCPHDR_ACK | TCPHDR_FIN);
2388 tcp_queue_skb(sk, skb);
2389 }
2390
2391 TCP_SOCK_TRACK(sk, TCP_FIN_SEND);
2392
2393 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
2394}
2395
2396/* We get here when a process closes a file descriptor (either due to
2397 * an explicit close() or as a byproduct of exit()'ing) and there
2398 * was unread data in the receive queue. This behavior is recommended
2399 * by RFC 2525, section 2.17. -DaveM
2400 */
2401void tcp_send_active_reset(struct sock *sk, gfp_t priority)
2402{
2403 struct sk_buff *skb;
2404
2405 /* NOTE: No TCP options attached and we never retransmit this. */
2406 skb = alloc_skb(MAX_TCP_HEADER, priority);
2407 if (!skb) {
2408 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
2409 return;
2410 }
2411
2412 /* Reserve space for headers and prepare control bits. */
2413 skb_reserve(skb, MAX_TCP_HEADER);
2414 tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
2415 TCPHDR_ACK | TCPHDR_RST);
2416 /* Send it off. */
2417 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2418 if (tcp_transmit_skb(sk, skb, 0, priority))
2419 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
2420
2421 TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
2422 TCP_PKT_STATS_INC(TCP_RST_SEND_NUM);
2423
2424 TCP_SOCK_TRACK(sk, TCP_RST_SEND);
2425}
2426
2427/* Send a crossed SYN-ACK during socket establishment.
2428 * WARNING: This routine must only be called when we have already sent
2429 * a SYN packet that crossed the incoming SYN that caused this routine
2430 * to get called. If this assumption fails then the initial rcv_wnd
2431 * and rcv_wscale values will not be correct.
2432 */
2433int tcp_send_synack(struct sock *sk)
2434{
2435 struct sk_buff *skb;
2436
2437 skb = tcp_write_queue_head(sk);
2438 if (skb == NULL || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
2439 printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
2440 return -EFAULT;
2441 }
2442 if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
2443 if (skb_cloned(skb)) {
2444 struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
2445 if (nskb == NULL)
2446 return -ENOMEM;
2447 tcp_unlink_write_queue(skb, sk);
2448 skb_header_release(nskb);
2449 __tcp_add_write_queue_head(sk, nskb);
2450 sk_wmem_free_skb(sk, skb);
2451 sk->sk_wmem_queued += nskb->truesize;
2452 sk_mem_charge(sk, nskb->truesize);
2453 skb = nskb;
2454 }
2455
2456 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;
2457 TCP_ECN_send_synack(tcp_sk(sk), skb);
2458 }
2459 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2460 return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
2461}
2462
2463/* Prepare a SYN-ACK. */
2464struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2465 struct request_sock *req,
2466 struct request_values *rvp)
2467{
2468 struct tcp_out_options opts;
2469 struct tcp_extend_values *xvp = tcp_xv(rvp);
2470 struct inet_request_sock *ireq = inet_rsk(req);
2471 struct tcp_sock *tp = tcp_sk(sk);
2472 const struct tcp_cookie_values *cvp = tp->cookie_values;
2473 struct tcphdr *th;
2474 struct sk_buff *skb;
2475 struct tcp_md5sig_key *md5;
2476 int tcp_header_size;
2477 int mss;
2478 int s_data_desired = 0;
2479
2480 if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired)
2481 s_data_desired = cvp->s_data_desired;
2482 skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15 + s_data_desired, 1, GFP_ATOMIC);
2483 if (skb == NULL)
2484 return NULL;
2485
2486 /* Reserve space for headers. */
2487 skb_reserve(skb, MAX_TCP_HEADER);
2488
2489 skb_dst_set(skb, dst_clone(dst));
2490
2491 mss = dst_metric_advmss(dst);
2492 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
2493 mss = tp->rx_opt.user_mss;
2494
2495 if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
2496 __u8 rcv_wscale;
2497 /* Set this up on the first call only */
2498 req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
2499
2500 /* limit the window selection if the user enforces a smaller rx buffer */
2501 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
2502 (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
2503 req->window_clamp = tcp_full_space(sk);
2504
2505 /* tcp_full_space because it is guaranteed to be the first packet */
2506 tcp_select_initial_window(tcp_full_space(sk),
2507 mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
2508 &req->rcv_wnd,
2509 &req->window_clamp,
2510 ireq->wscale_ok,
2511 &rcv_wscale,
2512 dst_metric(dst, RTAX_INITRWND));
2513 ireq->rcv_wscale = rcv_wscale;
2514 }
2515
2516 memset(&opts, 0, sizeof(opts));
2517#ifdef CONFIG_SYN_COOKIES
2518 if (unlikely(req->cookie_ts))
2519 TCP_SKB_CB(skb)->when = cookie_init_timestamp(req);
2520 else
2521#endif
2522 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2523 tcp_header_size = tcp_synack_options(sk, req, mss,
2524 skb, &opts, &md5, xvp)
2525 + sizeof(*th);
2526
2527 skb_push(skb, tcp_header_size);
2528 skb_reset_transport_header(skb);
2529
2530 th = tcp_hdr(skb);
2531 memset(th, 0, sizeof(struct tcphdr));
2532 th->syn = 1;
2533 th->ack = 1;
2534 TCP_ECN_make_synack(req, th);
2535 th->source = ireq->loc_port;
2536 th->dest = ireq->rmt_port;
2537 /* Setting of flags is superfluous here for callers (and ECE is
2538 * not even correctly set)
2539 */
2540 tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,
2541 TCPHDR_SYN | TCPHDR_ACK);
2542
2543 if (OPTION_COOKIE_EXTENSION & opts.options) {
2544 if (s_data_desired) {
2545 u8 *buf = skb_put(skb, s_data_desired);
2546
2547 /* copy data directly from the listening socket. */
2548 memcpy(buf, cvp->s_data_payload, s_data_desired);
2549 TCP_SKB_CB(skb)->end_seq += s_data_desired;
2550 }
2551
2552 if (opts.hash_size > 0) {
2553 __u32 workspace[SHA_WORKSPACE_WORDS];
2554 u32 *mess = &xvp->cookie_bakery[COOKIE_DIGEST_WORDS];
2555 u32 *tail = &mess[COOKIE_MESSAGE_WORDS-1];
2556
2557 /* Secret recipe depends on the Timestamp, (future)
2558 * Sequence and Acknowledgment Numbers, Initiator
2559 * Cookie, and others handled by IP variant caller.
2560 */
2561 *tail-- ^= opts.tsval;
2562 *tail-- ^= tcp_rsk(req)->rcv_isn + 1;
2563 *tail-- ^= TCP_SKB_CB(skb)->seq + 1;
2564
2565 /* recommended */
2566 *tail-- ^= (((__force u32)th->dest << 16) | (__force u32)th->source);
2567 *tail-- ^= (u32)(unsigned long)cvp; /* per sockopt */
2568
2569 sha_transform((__u32 *)&xvp->cookie_bakery[0],
2570 (char *)mess,
2571 &workspace[0]);
2572 opts.hash_location =
2573 (__u8 *)&xvp->cookie_bakery[0];
2574 }
2575 }
2576
2577 th->seq = htonl(TCP_SKB_CB(skb)->seq);
2578 th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
2579
2580 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
2581 th->window = htons(min(req->rcv_wnd, 65535U));
2582 tcp_options_write((__be32 *)(th + 1), tp, &opts);
2583 th->doff = (tcp_header_size >> 2);
2584 TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb));
2585
2586 TCP_PKT_STATS_INC(TCP_SEND_PKTS);
2587
2588#ifdef CONFIG_TCP_MD5SIG
2589 /* Okay, we have all we need - do the md5 hash if needed */
2590 if (md5) {
2591 tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
2592 md5, NULL, req, skb);
2593 }
2594#endif
2595
2596 return skb;
2597}
2598EXPORT_SYMBOL(tcp_make_synack);
2599
2600/* Do all connect socket setups that can be done AF independent. */
2601static void tcp_connect_init(struct sock *sk)
2602{
2603 const struct dst_entry *dst = __sk_dst_get(sk);
2604 struct tcp_sock *tp = tcp_sk(sk);
2605 __u8 rcv_wscale;
2606
2607 /* We'll fix this up when we get a response from the other end.
2608 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
2609 */
2610 tp->tcp_header_len = sizeof(struct tcphdr) +
2611 (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
2612
2613#ifdef CONFIG_TCP_MD5SIG
2614 if (tp->af_specific->md5_lookup(sk, sk) != NULL)
2615 tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
2616#endif
2617
2618 /* If the user set TCP_MAXSEG, record it as the mss clamp */
2619 if (tp->rx_opt.user_mss)
2620 tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
2621 tp->max_window = 0;
2622 tcp_mtup_init(sk);
2623 tcp_sync_mss(sk, dst_mtu(dst));
2624
2625 if (!tp->window_clamp)
2626 tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
2627 tp->advmss = dst_metric_advmss(dst);
2628 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss)
2629 tp->advmss = tp->rx_opt.user_mss;
2630
2631 tcp_initialize_rcv_mss(sk);
2632
2633 /* limit the window selection if the user enforces a smaller rx buffer */
2634 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
2635 (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
2636 tp->window_clamp = tcp_full_space(sk);
2637
2638 tcp_select_initial_window(tcp_full_space(sk),
2639 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
2640 &tp->rcv_wnd,
2641 &tp->window_clamp,
2642 sysctl_tcp_window_scaling,
2643 &rcv_wscale,
2644 dst_metric(dst, RTAX_INITRWND));
2645
2646 tp->rx_opt.rcv_wscale = rcv_wscale;
2647 tp->rcv_ssthresh = tp->rcv_wnd;
2648
2649 sk->sk_err = 0;
2650 sock_reset_flag(sk, SOCK_DONE);
2651 tp->snd_wnd = 0;
2652 tcp_init_wl(tp, 0);
2653 tp->snd_una = tp->write_seq;
2654 tp->snd_sml = tp->write_seq;
2655 tp->snd_up = tp->write_seq;
2656 tp->rcv_nxt = 0;
2657 tp->rcv_wup = 0;
2658 tp->copied_seq = 0;
2659
2660 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
2661 inet_csk(sk)->icsk_retransmits = 0;
2662 tcp_clear_retrans(tp);
2663}
2664
2665/* Build a SYN and send it off. */
2666int tcp_connect(struct sock *sk)
2667{
2668 struct tcp_sock *tp = tcp_sk(sk);
2669 struct sk_buff *buff;
2670 int err;
2671
2672 tcp_connect_init(sk);
2673
2674 buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
2675 if (unlikely(buff == NULL))
2676 return -ENOBUFS;
2677
2678 /* Reserve space for headers. */
2679 skb_reserve(buff, MAX_TCP_HEADER);
2680
2681 tp->snd_nxt = tp->write_seq;
2682 tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
2683 TCP_ECN_send_syn(sk, buff);
2684
2685 /* Send it off. */
2686 TCP_SKB_CB(buff)->when = tcp_time_stamp;
2687 tp->retrans_stamp = TCP_SKB_CB(buff)->when;
2688 skb_header_release(buff);
2689 __tcp_add_write_queue_tail(sk, buff);
2690 sk->sk_wmem_queued += buff->truesize;
2691 sk_mem_charge(sk, buff->truesize);
2692 tp->packets_out += tcp_skb_pcount(buff);
2693 err = tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
2694 if (err == -ECONNREFUSED)
2695 return err;
2696
2697 /* We change tp->snd_nxt after the tcp_transmit_skb() call
2698 * in order to make this packet get counted in tcpOutSegs.
2699 */
2700 tp->snd_nxt = tp->write_seq;
2701 tp->pushed_seq = tp->write_seq;
2702 TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
2703
2704 /* Timer for repeating the SYN until an answer arrives. */
2705 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
2706 inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
2707 return 0;
2708}
2709EXPORT_SYMBOL(tcp_connect);
2710
2711/* Send out a delayed ack; the caller does the policy checking
2712 * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
2713 * for details.
2714 */
2715void tcp_send_delayed_ack(struct sock *sk)
2716{
2717 struct inet_connection_sock *icsk = inet_csk(sk);
2718 int ato = icsk->icsk_ack.ato;
2719 unsigned long timeout;
2720
2721 if (ato > TCP_DELACK_MIN) {
2722 const struct tcp_sock *tp = tcp_sk(sk);
2723 int max_ato = HZ / 2;
2724
2725 if (icsk->icsk_ack.pingpong ||
2726 (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
2727 max_ato = TCP_DELACK_MAX;
2728
2729 /* Slow path, intersegment interval is "high". */
2730
2731 /* If some rtt estimate is known, use it to bound delayed ack.
2732 * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
2733 * directly.
2734 */
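		/* srtt is stored as 8 * smoothed RTT in jiffies, so the shift
		 * below recovers the RTT estimate itself.
		 */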
2735 if (tp->srtt) {
2736 int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN);
2737
2738 if (rtt < max_ato)
2739 max_ato = rtt;
2740 }
2741
2742 ato = min(ato, max_ato);
2743 }
2744
2745 /* Stay within the limit we were given */
2746 timeout = jiffies + ato;
2747
2748 /* Use the new timeout only if there wasn't an older one earlier. */
2749 if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
2750 /* If delack timer was blocked or is about to expire,
2751 * send ACK now.
2752 */
2753 if (icsk->icsk_ack.blocked ||
2754 time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
2755 tcp_send_ack(sk);
2756 return;
2757 }
2758
2759 if (!time_before(timeout, icsk->icsk_ack.timeout))
2760 timeout = icsk->icsk_ack.timeout;
2761 }
2762 icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
2763 icsk->icsk_ack.timeout = timeout;
2764 sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
2765}
2766
2767/* This routine sends an ack and also updates the window. */
2768void tcp_send_ack(struct sock *sk)
2769{
2770 struct sk_buff *buff;
2771
2772 /* If we have been reset, we may not send again. */
2773 if (sk->sk_state == TCP_CLOSE)
2774 return;
2775
2776 /* We are not putting this on the write queue, so
2777 * tcp_transmit_skb() will set the ownership to this
2778 * sock.
2779 */
2780 buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
2781 if (buff == NULL) {
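		/* Allocation failed: fall back to the delayed-ACK machinery so
		 * the ACK is retried from the timer shortly.
		 */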
2782 inet_csk_schedule_ack(sk);
2783 inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
2784 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
2785 TCP_DELACK_MAX, TCP_RTO_MAX);
2786 return;
2787 }
2788
2789 /* Reserve space for headers and prepare control bits. */
2790 skb_reserve(buff, MAX_TCP_HEADER);
2791 tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);
2792
2793 /* Send it off, this clears delayed acks for us. */
2794 TCP_SKB_CB(buff)->when = tcp_time_stamp;
2795 tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC);
2796}
2797
2798/* This routine sends a packet with an out of date sequence
2799 * number. It assumes the other end will try to ack it.
2800 *
2801 * Question: what should we do while in urgent mode?
2802 * 4.4BSD forces sending single byte of data. We cannot send
2803 * out of window data, because we have SND.NXT==SND.MAX...
2804 *
2805 * Current solution: to send TWO zero-length segments in urgent mode:
2806 * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
2807 * out-of-date with SND.UNA-1 to probe window.
2808 */
2809static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
2810{
2811 struct tcp_sock *tp = tcp_sk(sk);
2812 struct sk_buff *skb;
2813
2814 /* We don't queue it, tcp_transmit_skb() sets ownership. */
2815 skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
2816 if (skb == NULL)
2817 return -1;
2818
2819 /* Reserve space for headers and set control bits. */
2820 skb_reserve(skb, MAX_TCP_HEADER);
2821 /* Use a previous sequence. This should cause the other
2822 * end to send an ack. Don't queue or clone SKB, just
2823 * send it.
2824 */
2825 tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
2826 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2827 return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
2828}
2829
2830/* Initiate keepalive or window probe from timer. */
2831int tcp_write_wakeup(struct sock *sk)
2832{
2833 struct tcp_sock *tp = tcp_sk(sk);
2834 struct sk_buff *skb;
2835
2836 if (sk->sk_state == TCP_CLOSE)
2837 return -1;
2838
2839 if ((skb = tcp_send_head(sk)) != NULL &&
2840 before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
2841 int err;
2842 unsigned int mss = tcp_current_mss(sk);
2843 unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
2844
2845 if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
2846 tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
2847
2848 /* We are probing the opening of a window
2849 * but the window size is != 0; this must have been
2850 * the result of sender-side SWS avoidance.
2851 */
2852 if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
2853 skb->len > mss) {
2854 seg_size = min(seg_size, mss);
2855 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
2856 if (tcp_fragment(sk, skb, seg_size, mss))
2857 return -1;
2858 } else if (!tcp_skb_pcount(skb))
2859 tcp_set_skb_tso_segs(sk, skb, mss);
2860
2861 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
2862 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2863 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
2864 if (!err)
2865 tcp_event_new_data_sent(sk, skb);
2866 return err;
2867 } else {
2868 if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
2869 tcp_xmit_probe_skb(sk, 1);
2870 return tcp_xmit_probe_skb(sk, 0);
2871 }
2872}
2873
2874/* A window probe timeout has occurred. If the window is not closed, send
2875 * a partial packet, else a zero window probe.
2876 */
2877void tcp_send_probe0(struct sock *sk)
2878{
2879 struct inet_connection_sock *icsk = inet_csk(sk);
2880 struct tcp_sock *tp = tcp_sk(sk);
2881 int err;
2882
2883 err = tcp_write_wakeup(sk);
2884
2885 if (tp->packets_out || !tcp_send_head(sk)) {
2886 /* Cancel probe timer, if it is not required. */
2887 icsk->icsk_probes_out = 0;
2888 icsk->icsk_backoff = 0;
2889 return;
2890 }
2891
2892 if (err <= 0) {
2893 if (icsk->icsk_backoff < sysctl_tcp_retries2)
2894 icsk->icsk_backoff++;
2895 icsk->icsk_probes_out++;
2896 inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
2897 min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
2898 TCP_RTO_MAX);
2899 } else {
2900 /* If packet was not sent due to local congestion,
2901 * do not backoff and do not remember icsk_probes_out.
2902 * Let local senders fight for local resources.
2903 *
2904 * Still use the accumulated backoff, though.
2905 */
2906 if (!icsk->icsk_probes_out)
2907 icsk->icsk_probes_out = 1;
2908 inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
2909 min(icsk->icsk_rto << icsk->icsk_backoff,
2910 TCP_RESOURCE_PROBE_INTERVAL),
2911 TCP_RTO_MAX);
2912 }
2913}