Blame - src/kernel/linux/v4.14/net/ipv4/tcp.c - T103

blob: c9f6f28e54f3e08904b33cb2717e3fca9bbef685 [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame^]	1	/*
				2	* INET An implementation of the TCP/IP protocol suite for the LINUX
				3	* operating system. INET is implemented using the BSD Socket
				4	* interface as the means of communication with the user level.
				5	*
				6	* Implementation of the Transmission Control Protocol(TCP).
				7	*
				8	* Authors: Ross Biro
				9	* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
				10	* Mark Evans, <evansmp@uhura.aston.ac.uk>
				11	* Corey Minyard <wf-rch!minyard@relay.EU.net>
				12	* Florian La Roche, <flla@stud.uni-sb.de>
				13	* Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
				14	* Linus Torvalds, <torvalds@cs.helsinki.fi>
				15	* Alan Cox, <gw4pts@gw4pts.ampr.org>
				16	* Matthew Dillon, <dillon@apollo.west.oic.com>
				17	* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
				18	* Jorge Cwik, <jorge@laser.satlink.net>
				19	*
				20	* Fixes:
				21	* Alan Cox : Numerous verify_area() calls
				22	* Alan Cox : Set the ACK bit on a reset
				23	* Alan Cox : Stopped it crashing if it closed while
				24	* sk->inuse=1 and was trying to connect
				25	* (tcp_err()).
				26	* Alan Cox : All icmp error handling was broken
				27	* pointers passed where wrong and the
				28	* socket was looked up backwards. Nobody
				29	* tested any icmp error code obviously.
				30	* Alan Cox : tcp_err() now handled properly. It
				31	* wakes people on errors. poll
				32	* behaves and the icmp error race
				33	* has gone by moving it into sock.c
				34	* Alan Cox : tcp_send_reset() fixed to work for
				35	* everything not just packets for
				36	* unknown sockets.
				37	* Alan Cox : tcp option processing.
				38	* Alan Cox : Reset tweaked (still not 100%) [Had
				39	* syn rule wrong]
				40	* Herp Rosmanith : More reset fixes
				41	* Alan Cox : No longer acks invalid rst frames.
				42	* Acking any kind of RST is right out.
				43	* Alan Cox : Sets an ignore me flag on an rst
				44	* receive otherwise odd bits of prattle
				45	* escape still
				46	* Alan Cox : Fixed another acking RST frame bug.
				47	* Should stop LAN workplace lockups.
				48	* Alan Cox : Some tidyups using the new skb list
				49	* facilities
				50	* Alan Cox : sk->keepopen now seems to work
				51	* Alan Cox : Pulls options out correctly on accepts
				52	* Alan Cox : Fixed assorted sk->rqueue->next errors
				53	* Alan Cox : PSH doesn't end a TCP read. Switched a
				54	* bit to skb ops.
				55	* Alan Cox : Tidied tcp_data to avoid a potential
				56	* nasty.
				57	* Alan Cox : Added some better commenting, as the
				58	* tcp is hard to follow
				59	* Alan Cox : Removed incorrect check for 20 * psh
				60	* Michael O'Reilly : ack < copied bug fix.
				61	* Johannes Stille : Misc tcp fixes (not all in yet).
				62	* Alan Cox : FIN with no memory -> CRASH
				63	* Alan Cox : Added socket option proto entries.
				64	* Also added awareness of them to accept.
				65	* Alan Cox : Added TCP options (SOL_TCP)
				66	* Alan Cox : Switched wakeup calls to callbacks,
				67	* so the kernel can layer network
				68	* sockets.
				69	* Alan Cox : Use ip_tos/ip_ttl settings.
				70	* Alan Cox : Handle FIN (more) properly (we hope).
				71	* Alan Cox : RST frames sent on unsynchronised
				72	* state ack error.
				73	* Alan Cox : Put in missing check for SYN bit.
				74	* Alan Cox : Added tcp_select_window() aka NET2E
				75	* window non shrink trick.
				76	* Alan Cox : Added a couple of small NET2E timer
				77	* fixes
				78	* Charles Hedrick : TCP fixes
				79	* Toomas Tamm : TCP window fixes
				80	* Alan Cox : Small URG fix to rlogin ^C ack fight
				81	* Charles Hedrick : Rewrote most of it to actually work
				82	* Linus : Rewrote tcp_read() and URG handling
				83	* completely
				84	* Gerhard Koerting: Fixed some missing timer handling
				85	* Matthew Dillon : Reworked TCP machine states as per RFC
				86	* Gerhard Koerting: PC/TCP workarounds
				87	* Adam Caldwell : Assorted timer/timing errors
				88	* Matthew Dillon : Fixed another RST bug
				89	* Alan Cox : Move to kernel side addressing changes.
				90	* Alan Cox : Beginning work on TCP fastpathing
				91	* (not yet usable)
				92	* Arnt Gulbrandsen: Turbocharged tcp_check() routine.
				93	* Alan Cox : TCP fast path debugging
				94	* Alan Cox : Window clamping
				95	* Michael Riepe : Bug in tcp_check()
				96	* Matt Dillon : More TCP improvements and RST bug fixes
				97	* Matt Dillon : Yet more small nasties remove from the
				98	* TCP code (Be very nice to this man if
				99	* tcp finally works 100%) 8)
				100	* Alan Cox : BSD accept semantics.
				101	* Alan Cox : Reset on closedown bug.
				102	* Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
				103	* Michael Pall : Handle poll() after URG properly in
				104	* all cases.
				105	* Michael Pall : Undo the last fix in tcp_read_urg()
				106	* (multi URG PUSH broke rlogin).
				107	* Michael Pall : Fix the multi URG PUSH problem in
				108	* tcp_readable(), poll() after URG
				109	* works now.
				110	* Michael Pall : recv(...,MSG_OOB) never blocks in the
				111	* BSD api.
				112	* Alan Cox : Changed the semantics of sk->socket to
				113	* fix a race and a signal problem with
				114	* accept() and async I/O.
				115	* Alan Cox : Relaxed the rules on tcp_sendto().
				116	* Yury Shevchuk : Really fixed accept() blocking problem.
				117	* Craig I. Hagan : Allow for BSD compatible TIME_WAIT for
				118	* clients/servers which listen in on
				119	* fixed ports.
				120	* Alan Cox : Cleaned the above up and shrank it to
				121	* a sensible code size.
				122	* Alan Cox : Self connect lockup fix.
				123	* Alan Cox : No connect to multicast.
				124	* Ross Biro : Close unaccepted children on master
				125	* socket close.
				126	* Alan Cox : Reset tracing code.
				127	* Alan Cox : Spurious resets on shutdown.
				128	* Alan Cox : Giant 15 minute/60 second timer error
				129	* Alan Cox : Small whoops in polling before an
				130	* accept.
				131	* Alan Cox : Kept the state trace facility since
				132	* it's handy for debugging.
				133	* Alan Cox : More reset handler fixes.
				134	* Alan Cox : Started rewriting the code based on
				135	* the RFC's for other useful protocol
				136	* references see: Comer, KA9Q NOS, and
				137	* for a reference on the difference
				138	* between specifications and how BSD
				139	* works see the 4.4lite source.
				140	* A.N.Kuznetsov : Don't time wait on completion of tidy
				141	* close.
				142	* Linus Torvalds : Fin/Shutdown & copied_seq changes.
				143	* Linus Torvalds : Fixed BSD port reuse to work first syn
				144	* Alan Cox : Reimplemented timers as per the RFC
				145	* and using multiple timers for sanity.
				146	* Alan Cox : Small bug fixes, and a lot of new
				147	* comments.
				148	* Alan Cox : Fixed dual reader crash by locking
				149	* the buffers (much like datagram.c)
				150	* Alan Cox : Fixed stuck sockets in probe. A probe
				151	* now gets fed up of retrying without
				152	* (even a no space) answer.
				153	* Alan Cox : Extracted closing code better
				154	* Alan Cox : Fixed the closing state machine to
				155	* resemble the RFC.
				156	* Alan Cox : More 'per spec' fixes.
				157	* Jorge Cwik : Even faster checksumming.
				158	* Alan Cox : tcp_data() doesn't ack illegal PSH
				159	* only frames. At least one pc tcp stack
				160	* generates them.
				161	* Alan Cox : Cache last socket.
				162	* Alan Cox : Per route irtt.
				163	* Matt Day : poll()->select() match BSD precisely on error
				164	* Alan Cox : New buffers
				165	* Marc Tamsky : Various sk->prot->retransmits and
				166	* sk->retransmits misupdating fixed.
				167	* Fixed tcp_write_timeout: stuck close,
				168	* and TCP syn retries gets used now.
				169	* Mark Yarvis : In tcp_read_wakeup(), don't send an
				170	* ack if state is TCP_CLOSED.
				171	* Alan Cox : Look up device on a retransmit - routes may
				172	* change. Doesn't yet cope with MSS shrink right
				173	* but it's a start!
				174	* Marc Tamsky : Closing in closing fixes.
				175	* Mike Shaver : RFC1122 verifications.
				176	* Alan Cox : rcv_saddr errors.
				177	* Alan Cox : Block double connect().
				178	* Alan Cox : Small hooks for enSKIP.
				179	* Alexey Kuznetsov: Path MTU discovery.
				180	* Alan Cox : Support soft errors.
				181	* Alan Cox : Fix MTU discovery pathological case
				182	* when the remote claims no mtu!
				183	* Marc Tamsky : TCP_CLOSE fix.
				184	* Colin (G3TNE) : Send a reset on syn ack replies in
				185	* window but wrong (fixes NT lpd problems)
				186	* Pedro Roque : Better TCP window handling, delayed ack.
				187	* Joerg Reuter : No modification of locked buffers in
				188	* tcp_do_retransmit()
				189	* Eric Schenk : Changed receiver side silly window
				190	* avoidance algorithm to BSD style
				191	* algorithm. This doubles throughput
				192	* against machines running Solaris,
				193	* and seems to result in general
				194	* improvement.
				195	* Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD
				196	* Willy Konynenberg : Transparent proxying support.
				197	* Mike McLagan : Routing by source
				198	* Keith Owens : Do proper merging with partial SKB's in
				199	* tcp_do_sendmsg to avoid burstiness.
				200	* Eric Schenk : Fix fast close down bug with
				201	* shutdown() followed by close().
				202	* Andi Kleen : Make poll agree with SIGIO
				203	* Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and
				204	* lingertime == 0 (RFC 793 ABORT Call)
				205	* Hirokazu Takahashi : Use copy_from_user() instead of
				206	* csum_and_copy_from_user() if possible.
				207	*
				208	* This program is free software; you can redistribute it and/or
				209	* modify it under the terms of the GNU General Public License
				210	* as published by the Free Software Foundation; either version
				211	* 2 of the License, or (at your option) any later version.
				212	*
				213	* Description of States:
				214	*
				215	* TCP_SYN_SENT sent a connection request, waiting for ack
				216	*
				217	* TCP_SYN_RECV received a connection request, sent ack,
				218	* waiting for final ack in three-way handshake.
				219	*
				220	* TCP_ESTABLISHED connection established
				221	*
				222	* TCP_FIN_WAIT1 our side has shutdown, waiting to complete
				223	* transmission of remaining buffered data
				224	*
				225	* TCP_FIN_WAIT2 all buffered data sent, waiting for remote
				226	* to shutdown
				227	*
				228	* TCP_CLOSING both sides have shutdown but we still have
				229	* data we have to finish sending
				230	*
				231	* TCP_TIME_WAIT timeout to catch resent junk before entering
				232	* closed, can only be entered from FIN_WAIT2
				233	* or CLOSING. Required because the other end
				234	* may not have gotten our last ACK causing it
				235	* to retransmit the data packet (which we ignore)
				236	*
				237	* TCP_CLOSE_WAIT remote side has shutdown and is waiting for
				238	* us to finish writing our data and to shutdown
				239	* (we have to close() to move on to LAST_ACK)
				240	*
				241	* TCP_LAST_ACK our side has shutdown after remote has
				242	* shutdown. There may still be data in our
				243	* buffer that we have to finish sending
				244	*
				245	* TCP_CLOSE socket is finished
				246	*/
				247
				248	#define pr_fmt(fmt) "TCP: " fmt
				249
				250	#include <crypto/hash.h>
				251	#include <linux/kernel.h>
				252	#include <linux/module.h>
				253	#include <linux/types.h>
				254	#include <linux/fcntl.h>
				255	#include <linux/poll.h>
				256	#include <linux/inet_diag.h>
				257	#include <linux/init.h>
				258	#include <linux/fs.h>
				259	#include <linux/skbuff.h>
				260	#include <linux/scatterlist.h>
				261	#include <linux/splice.h>
				262	#include <linux/net.h>
				263	#include <linux/socket.h>
				264	#include <linux/random.h>
				265	#include <linux/bootmem.h>
				266	#include <linux/highmem.h>
				267	#include <linux/swap.h>
				268	#include <linux/cache.h>
				269	#include <linux/err.h>
				270	#include <linux/time.h>
				271	#include <linux/slab.h>
				272	#include <linux/errqueue.h>
				273
				274	#include <net/icmp.h>
				275	#include <net/inet_common.h>
				276	#include <net/tcp.h>
				277	#include <net/xfrm.h>
				278	#include <net/ip.h>
				279	#include <net/sock.h>
				280
				281	#include <linux/uaccess.h>
				282	#include <asm/ioctls.h>
				283	#include <net/busy_poll.h>
				284
/* Tunable with a default of 2.
 * NOTE(review): presumably the minimum number of segments per TSO frame;
 * not consumed in this part of the file -- confirm against the users.
 */
int sysctl_tcp_min_tso_segs __read_mostly = 2;

/* Non-zero enables autocorking; tested by tcp_should_autocork(). */
int sysctl_tcp_autocorking __read_mostly = 1;

struct percpu_counter tcp_orphan_count;
EXPORT_SYMBOL_GPL(tcp_orphan_count);

/* Three-element tunables.  tcp_init_sock() uses element [1] of
 * sysctl_tcp_wmem / sysctl_tcp_rmem as the initial sk_sndbuf / sk_rcvbuf.
 */
long sysctl_tcp_mem[3] __read_mostly;
int sysctl_tcp_wmem[3] __read_mostly;
int sysctl_tcp_rmem[3] __read_mostly;

EXPORT_SYMBOL(sysctl_tcp_mem);
EXPORT_SYMBOL(sysctl_tcp_rmem);
EXPORT_SYMBOL(sysctl_tcp_wmem);

atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
EXPORT_SYMBOL(tcp_memory_allocated);

/*
 * Current number of TCP sockets.
 */
struct percpu_counter tcp_sockets_allocated;
EXPORT_SYMBOL(tcp_sockets_allocated);
				308
/*
 * TCP splice context
 *
 * Built by tcp_splice_read() and handed to tcp_splice_data_recv() through
 * read_descriptor_t.arg.data.
 */
struct tcp_splice_state {
	struct pipe_inode_info *pipe;	/* destination pipe for skb_splice_bits() */
	size_t len;			/* bytes still to splice; decremented by tcp_splice_read() */
	unsigned int flags;		/* splice flags forwarded to skb_splice_bits() */
};
				317
/*
 * Pressure flag: try to collapse.
 * Technical note: it is used by multiple contexts non atomically.
 * All the __sk_mem_schedule() is of this nature: accounting
 * is strict, actions are advisory and have some latency.
 *
 * Zero means "not under pressure".  Otherwise it holds the (non-zero)
 * jiffies value stored by tcp_enter_memory_pressure(), which
 * tcp_leave_memory_pressure() uses to compute how long pressure lasted.
 */
unsigned long tcp_memory_pressure __read_mostly;
EXPORT_SYMBOL_GPL(tcp_memory_pressure);
				326
				327	void tcp_enter_memory_pressure(struct sock *sk)
				328	{
				329	unsigned long val;
				330
				331	if (READ_ONCE(tcp_memory_pressure))
				332	return;
				333	val = jiffies;
				334
				335	if (!val)
				336	val--;
				337	if (!cmpxchg(&tcp_memory_pressure, 0, val))
				338	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
				339	}
				340	EXPORT_SYMBOL_GPL(tcp_enter_memory_pressure);
				341
				342	void tcp_leave_memory_pressure(struct sock *sk)
				343	{
				344	unsigned long val;
				345
				346	if (!READ_ONCE(tcp_memory_pressure))
				347	return;
				348	val = xchg(&tcp_memory_pressure, 0);
				349	if (val)
				350	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO,
				351	jiffies_to_msecs(jiffies - val));
				352	}
				353	EXPORT_SYMBOL_GPL(tcp_leave_memory_pressure);
				354
				355	/* Convert seconds to retransmits based on initial and max timeout */
				356	static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
				357	{
				358	u8 res = 0;
				359
				360	if (seconds > 0) {
				361	int period = timeout;
				362
				363	res = 1;
				364	while (seconds > period && res < 255) {
				365	res++;
				366	timeout <<= 1;
				367	if (timeout > rto_max)
				368	timeout = rto_max;
				369	period += timeout;
				370	}
				371	}
				372	return res;
				373	}
				374
				375	/* Convert retransmits to seconds based on initial and max timeout */
				376	static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
				377	{
				378	int period = 0;
				379
				380	if (retrans > 0) {
				381	period = timeout;
				382	while (--retrans) {
				383	timeout <<= 1;
				384	if (timeout > rto_max)
				385	timeout = rto_max;
				386	period += timeout;
				387	}
				388	}
				389	return period;
				390	}
				391
				392	static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp)
				393	{
				394	u32 rate = READ_ONCE(tp->rate_delivered);
				395	u32 intv = READ_ONCE(tp->rate_interval_us);
				396	u64 rate64 = 0;
				397
				398	if (rate && intv) {
				399	rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
				400	do_div(rate64, intv);
				401	}
				402	return rate64;
				403	}
				404
/* Address-family independent initialization for a tcp_sock.
 *
 * NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 *
 * Leaves the socket in TCP_CLOSE with default RTO, congestion window,
 * MSS, reordering and buffer sizes, and accounts it via
 * sk_sockets_allocated_inc().
 */
void tcp_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	/* Empty out-of-order tree, transmit timers, unlinked tsq_node. */
	tp->out_of_order_queue = RB_ROOT;
	tcp_init_xmit_timers(sk);
	INIT_LIST_HEAD(&tp->tsq_node);

	/* Seed the RTO and the deviation estimate from TCP_TIMEOUT_INIT. */
	icsk->icsk_rto = TCP_TIMEOUT_INIT;
	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
	minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tp->snd_cwnd = TCP_INIT_CWND;

	/* There's a bubble in the pipe until at least the first ACK. */
	tp->app_limited = ~0U;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = TCP_MSS_DEFAULT;

	/* Reordering threshold comes from the socket's network namespace. */
	tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
	tcp_assign_congestion_control(sk);

	tp->tsoffset = 0;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	icsk->icsk_sync_mss = tcp_sync_mss;

	/* Initial buffer sizes are the middle elements of the wmem/rmem tunables. */
	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];

	sk_sockets_allocated_inc(sk);
}
EXPORT_SYMBOL(tcp_init_sock);
				458
				459	static void tcp_tx_timestamp(struct sock sk, u16 tsflags, struct sk_buff skb)
				460	{
				461	if (tsflags && skb) {
				462	struct skb_shared_info *shinfo = skb_shinfo(skb);
				463	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
				464
				465	sock_tx_timestamp(sk, tsflags, &shinfo->tx_flags);
				466	if (tsflags & SOF_TIMESTAMPING_TX_ACK)
				467	tcb->txstamp_ack = 1;
				468	if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
				469	shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
				470	}
				471	}
				472
				473	/*
				474	* Wait for a TCP event.
				475	*
				476	* Note that we don't need to lock the socket, as the upper poll layers
				477	* take care of normal races (between the test and the event) and we don't
				478	* go look at any of the socket buffers directly.
				479	*/
				480	unsigned int tcp_poll(struct file file, struct socket sock, poll_table *wait)
				481	{
				482	unsigned int mask;
				483	struct sock *sk = sock->sk;
				484	const struct tcp_sock *tp = tcp_sk(sk);
				485	int state;
				486
				487	sock_rps_record_flow(sk);
				488
				489	sock_poll_wait(file, sk_sleep(sk), wait);
				490
				491	state = sk_state_load(sk);
				492	if (state == TCP_LISTEN)
				493	return inet_csk_listen_poll(sk);
				494
				495	/* Socket is not locked. We are protected from async events
				496	* by poll logic and correct handling of state changes
				497	* made by other threads is impossible in any case.
				498	*/
				499
				500	mask = 0;
				501
				502	/*
				503	* POLLHUP is certainly not done right. But poll() doesn't
				504	* have a notion of HUP in just one direction, and for a
				505	* socket the read side is more interesting.
				506	*
				507	* Some poll() documentation says that POLLHUP is incompatible
				508	* with the POLLOUT/POLLWR flags, so somebody should check this
				509	* all. But careful, it tends to be safer to return too many
				510	* bits than too few, and you can easily break real applications
				511	* if you don't tell them that something has hung up!
				512	*
				513	* Check-me.
				514	*
				515	* Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
				516	* our fs/select.c). It means that after we received EOF,
				517	* poll always returns immediately, making impossible poll() on write()
				518	* in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
				519	* if and only if shutdown has been made in both directions.
				520	* Actually, it is interesting to look how Solaris and DUX
				521	* solve this dilemma. I would prefer, if POLLHUP were maskable,
				522	* then we could set it on SND_SHUTDOWN. BTW examples given
				523	* in Stevens' books assume exactly this behaviour, it explains
				524	* why POLLHUP is incompatible with POLLOUT. --ANK
				525	*
				526	* NOTE. Check for TCP_CLOSE is added. The goal is to prevent
				527	* blocking on fresh not-connected or disconnected socket. --ANK
				528	*/
				529	if (sk->sk_shutdown == SHUTDOWN_MASK \|\| state == TCP_CLOSE)
				530	mask \|= POLLHUP;
				531	if (sk->sk_shutdown & RCV_SHUTDOWN)
				532	mask \|= POLLIN \| POLLRDNORM \| POLLRDHUP;
				533
				534	/* Connected or passive Fast Open socket? */
				535	if (state != TCP_SYN_SENT &&
				536	(state != TCP_SYN_RECV \|\| tp->fastopen_rsk)) {
				537	int target = sock_rcvlowat(sk, 0, INT_MAX);
				538
				539	if (tp->urg_seq == tp->copied_seq &&
				540	!sock_flag(sk, SOCK_URGINLINE) &&
				541	tp->urg_data)
				542	target++;
				543
				544	if (tp->rcv_nxt - tp->copied_seq >= target)
				545	mask \|= POLLIN \| POLLRDNORM;
				546
				547	if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
				548	if (sk_stream_is_writeable(sk)) {
				549	mask \|= POLLOUT \| POLLWRNORM;
				550	} else { /* send SIGIO later */
				551	sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
				552	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
				553
				554	/* Race breaker. If space is freed after
				555	* wspace test but before the flags are set,
				556	* IO signal will be lost. Memory barrier
				557	* pairs with the input side.
				558	*/
				559	smp_mb__after_atomic();
				560	if (sk_stream_is_writeable(sk))
				561	mask \|= POLLOUT \| POLLWRNORM;
				562	}
				563	} else
				564	mask \|= POLLOUT \| POLLWRNORM;
				565
				566	if (tp->urg_data & TCP_URG_VALID)
				567	mask \|= POLLPRI;
				568	} else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
				569	/* Active TCP fastopen socket with defer_connect
				570	* Return POLLOUT so application can call write()
				571	* in order for kernel to generate SYN+data
				572	*/
				573	mask \|= POLLOUT \| POLLWRNORM;
				574	}
				575	/* This barrier is coupled with smp_wmb() in tcp_reset() */
				576	smp_rmb();
				577	if (sk->sk_err \|\| !skb_queue_empty_lockless(&sk->sk_error_queue))
				578	mask \|= POLLERR;
				579
				580	return mask;
				581	}
				582	EXPORT_SYMBOL(tcp_poll);
				583
				584	int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
				585	{
				586	struct tcp_sock *tp = tcp_sk(sk);
				587	int answ;
				588	bool slow;
				589
				590	switch (cmd) {
				591	case SIOCINQ:
				592	if (sk->sk_state == TCP_LISTEN)
				593	return -EINVAL;
				594
				595	slow = lock_sock_fast(sk);
				596	answ = tcp_inq(sk);
				597	unlock_sock_fast(sk, slow);
				598	break;
				599	case SIOCATMARK:
				600	answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
				601	break;
				602	case SIOCOUTQ:
				603	if (sk->sk_state == TCP_LISTEN)
				604	return -EINVAL;
				605
				606	if ((1 << sk->sk_state) & (TCPF_SYN_SENT \| TCPF_SYN_RECV))
				607	answ = 0;
				608	else
				609	answ = tp->write_seq - tp->snd_una;
				610	break;
				611	case SIOCOUTQNSD:
				612	if (sk->sk_state == TCP_LISTEN)
				613	return -EINVAL;
				614
				615	if ((1 << sk->sk_state) & (TCPF_SYN_SENT \| TCPF_SYN_RECV))
				616	answ = 0;
				617	else
				618	answ = tp->write_seq - tp->snd_nxt;
				619	break;
				620	default:
				621	return -ENOIOCTLCMD;
				622	}
				623
				624	return put_user(answ, (int __user *)arg);
				625	}
				626	EXPORT_SYMBOL(tcp_ioctl);
				627
				628	static inline void tcp_mark_push(struct tcp_sock tp, struct sk_buff skb)
				629	{
				630	TCP_SKB_CB(skb)->tcp_flags \|= TCPHDR_PSH;
				631	tp->pushed_seq = tp->write_seq;
				632	}
				633
				634	static inline bool forced_push(const struct tcp_sock *tp)
				635	{
				636	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
				637	}
				638
				639	static void skb_entail(struct sock sk, struct sk_buff skb)
				640	{
				641	struct tcp_sock *tp = tcp_sk(sk);
				642	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
				643
				644	skb->csum = 0;
				645	tcb->seq = tcb->end_seq = tp->write_seq;
				646	tcb->tcp_flags = TCPHDR_ACK;
				647	tcb->sacked = 0;
				648	__skb_header_release(skb);
				649	tcp_add_write_queue_tail(sk, skb);
				650	sk->sk_wmem_queued += skb->truesize;
				651	sk_mem_charge(sk, skb->truesize);
				652	if (tp->nonagle & TCP_NAGLE_PUSH)
				653	tp->nonagle &= ~TCP_NAGLE_PUSH;
				654
				655	tcp_slow_start_after_idle_check(sk);
				656	}
				657
				658	static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
				659	{
				660	if (flags & MSG_OOB)
				661	tp->snd_up = tp->write_seq;
				662	}
				663
				664	/* If a not yet filled skb is pushed, do not send it if
				665	* we have data packets in Qdisc or NIC queues :
				666	* Because TX completion will happen shortly, it gives a chance
				667	* to coalesce future sendmsg() payload into this skb, without
				668	* need for a timer, and with no latency trade off.
				669	* As packets containing data payload have a bigger truesize
				670	* than pure acks (dataless) packets, the last checks prevent
				671	* autocorking if we only have an ACK in Qdisc/NIC queues,
				672	* or if TX completion was delayed after we processed ACK packet.
				673	*/
				674	static bool tcp_should_autocork(struct sock sk, struct sk_buff skb,
				675	int size_goal)
				676	{
				677	return skb->len < size_goal &&
				678	sysctl_tcp_autocorking &&
				679	skb != tcp_write_queue_head(sk) &&
				680	refcount_read(&sk->sk_wmem_alloc) > skb->truesize;
				681	}
				682
				683	static void tcp_push(struct sock *sk, int flags, int mss_now,
				684	int nonagle, int size_goal)
				685	{
				686	struct tcp_sock *tp = tcp_sk(sk);
				687	struct sk_buff *skb;
				688
				689	if (!tcp_send_head(sk))
				690	return;
				691
				692	skb = tcp_write_queue_tail(sk);
				693	if (!(flags & MSG_MORE) \|\| forced_push(tp))
				694	tcp_mark_push(tp, skb);
				695
				696	tcp_mark_urg(tp, flags);
				697
				698	if (tcp_should_autocork(sk, skb, size_goal)) {
				699
				700	/* avoid atomic op if TSQ_THROTTLED bit is already set */
				701	if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
				702	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
				703	set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
				704	}
				705	/* It is possible TX completion already happened
				706	* before we set TSQ_THROTTLED.
				707	*/
				708	if (refcount_read(&sk->sk_wmem_alloc) > skb->truesize)
				709	return;
				710	}
				711
				712	if (flags & MSG_MORE)
				713	nonagle = TCP_NAGLE_CORK;
				714
				715	__tcp_push_pending_frames(sk, mss_now, nonagle);
				716	}
				717
				718	static int tcp_splice_data_recv(read_descriptor_t rd_desc, struct sk_buff skb,
				719	unsigned int offset, size_t len)
				720	{
				721	struct tcp_splice_state *tss = rd_desc->arg.data;
				722	int ret;
				723
				724	ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe,
				725	min(rd_desc->count, len), tss->flags);
				726	if (ret > 0)
				727	rd_desc->count -= ret;
				728	return ret;
				729	}
				730
				731	static int __tcp_splice_read(struct sock sk, struct tcp_splice_state tss)
				732	{
				733	/* Store TCP splice context information in read_descriptor_t. */
				734	read_descriptor_t rd_desc = {
				735	.arg.data = tss,
				736	.count = tss->len,
				737	};
				738
				739	return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
				740	}
				741
				742	/**
				743	* tcp_splice_read - splice data from TCP socket to a pipe
				744	* @sock: socket to splice from
				745	* @ppos: position (not valid)
				746	* @pipe: pipe to splice to
				747	* @len: number of bytes to splice
				748	* @flags: splice modifier flags
				749	*
				750	* Description:
				751	* Will read pages from given socket and fill them into a pipe.
				752	*
				753	**/
				754	ssize_t tcp_splice_read(struct socket sock, loff_t ppos,
				755	struct pipe_inode_info *pipe, size_t len,
				756	unsigned int flags)
				757	{
				758	struct sock *sk = sock->sk;
				759	struct tcp_splice_state tss = {
				760	.pipe = pipe,
				761	.len = len,
				762	.flags = flags,
				763	};
				764	long timeo;
				765	ssize_t spliced;
				766	int ret;
				767
				768	sock_rps_record_flow(sk);
				769	/*
				770	* We can't seek on a socket input
				771	*/
				772	if (unlikely(*ppos))
				773	return -ESPIPE;
				774
				775	ret = spliced = 0;
				776
				777	lock_sock(sk);
				778
				779	timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
				780	while (tss.len) {
				781	ret = __tcp_splice_read(sk, &tss);
				782	if (ret < 0)
				783	break;
				784	else if (!ret) {
				785	if (spliced)
				786	break;
				787	if (sock_flag(sk, SOCK_DONE))
				788	break;
				789	if (sk->sk_err) {
				790	ret = sock_error(sk);
				791	break;
				792	}
				793	if (sk->sk_shutdown & RCV_SHUTDOWN)
				794	break;
				795	if (sk->sk_state == TCP_CLOSE) {
				796	/*
				797	* This occurs when user tries to read
				798	* from never connected socket.
				799	*/
				800	if (!sock_flag(sk, SOCK_DONE))
				801	ret = -ENOTCONN;
				802	break;
				803	}
				804	if (!timeo) {
				805	ret = -EAGAIN;
				806	break;
				807	}
				808	/* if __tcp_splice_read() got nothing while we have
				809	* an skb in receive queue, we do not want to loop.
				810	* This might happen with URG data.
				811	*/
				812	if (!skb_queue_empty(&sk->sk_receive_queue))
				813	break;
				814	sk_wait_data(sk, &timeo, NULL);
				815	if (signal_pending(current)) {
				816	ret = sock_intr_errno(timeo);
				817	break;
				818	}
				819	continue;
				820	}
				821	tss.len -= ret;
				822	spliced += ret;
				823
				824	if (!timeo)
				825	break;
				826	release_sock(sk);
				827	lock_sock(sk);
				828
				829	if (sk->sk_err \|\| sk->sk_state == TCP_CLOSE \|\|
				830	(sk->sk_shutdown & RCV_SHUTDOWN) \|\|
				831	signal_pending(current))
				832	break;
				833	}
				834
				835	release_sock(sk);
				836
				837	if (spliced)
				838	return spliced;
				839
				840	return ret;
				841	}
				842	EXPORT_SYMBOL(tcp_splice_read);
				843
				844	struct sk_buff sk_stream_alloc_skb(struct sock sk, int size, gfp_t gfp,
				845	bool force_schedule)
				846	{
				847	struct sk_buff *skb;
				848
				849	/* The TCP header must be at least 32-bit aligned. */
				850	size = ALIGN(size, 4);
				851
				852	if (unlikely(tcp_under_memory_pressure(sk)))
				853	sk_mem_reclaim_partial(sk);
				854
				855	skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
				856	if (likely(skb)) {
				857	bool mem_scheduled;
				858
				859	if (force_schedule) {
				860	mem_scheduled = true;
				861	sk_forced_mem_schedule(sk, skb->truesize);
				862	} else {
				863	mem_scheduled = sk_wmem_schedule(sk, skb->truesize);
				864	}
				865	if (likely(mem_scheduled)) {
				866	skb_reserve(skb, sk->sk_prot->max_header);
				867	/*
				868	* Make sure that we have exactly size bytes
				869	* available to the caller, no more, no less.
				870	*/
				871	skb->reserved_tailroom = skb->end - skb->tail - size;
				872	return skb;
				873	}
				874	__kfree_skb(skb);
				875	} else {
				876	sk->sk_prot->enter_memory_pressure(sk);
				877	sk_stream_moderate_sndbuf(sk);
				878	}
				879	return NULL;
				880	}
				881
				882	static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
				883	int large_allowed)
				884	{
				885	struct tcp_sock *tp = tcp_sk(sk);
				886	u32 new_size_goal, size_goal;
				887
				888	if (!large_allowed \|\| !sk_can_gso(sk))
				889	return mss_now;
				890
				891	/* Note : tcp_tso_autosize() will eventually split this later */
				892	new_size_goal = sk->sk_gso_max_size - 1 - MAX_TCP_HEADER;
				893	new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal);
				894
				895	/* We try hard to avoid divides here */
				896	size_goal = tp->gso_segs * mss_now;
				897	if (unlikely(new_size_goal < size_goal \|\|
				898	new_size_goal >= size_goal + mss_now)) {
				899	tp->gso_segs = min_t(u16, new_size_goal / mss_now,
				900	sk->sk_gso_max_segs);
				901	size_goal = tp->gso_segs * mss_now;
				902	}
				903
				904	return max(size_goal, mss_now);
				905	}
				906
				907	static int tcp_send_mss(struct sock sk, int size_goal, int flags)
				908	{
				909	int mss_now;
				910
				911	mss_now = tcp_current_mss(sk);
				912	*size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
				913
				914	return mss_now;
				915	}
				916
				917	/* In some cases, both sendpage() and sendmsg() could have added
				918	* an skb to the write queue, but failed adding payload on it.
				919	* We need to remove it to consume less memory, but more
				920	* importantly be able to generate EPOLLOUT for Edge Trigger epoll()
				921	* users.
				922	*/
				923	static void tcp_remove_empty_skb(struct sock sk, struct sk_buff skb)
				924	{
				925	if (skb && !skb->len &&
				926	TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
				927	tcp_unlink_write_queue(skb, sk);
				928	tcp_check_send_head(sk, skb);
				929	sk_wmem_free_skb(sk, skb);
				930	}
				931	}
				932
				933	ssize_t do_tcp_sendpages(struct sock sk, struct page page, int offset,
				934	size_t size, int flags)
				935	{
				936	struct tcp_sock *tp = tcp_sk(sk);
				937	int mss_now, size_goal;
				938	int err;
				939	ssize_t copied;
				940	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
				941
				942	/* Wait for a connection to finish. One exception is TCP Fast Open
				943	* (passive side) where data is allowed to be sent before a connection
				944	* is fully established.
				945	*/
				946	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT)) &&
				947	!tcp_passive_fastopen(sk)) {
				948	err = sk_stream_wait_connect(sk, &timeo);
				949	if (err != 0)
				950	goto out_err;
				951	}
				952
				953	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
				954
				955	mss_now = tcp_send_mss(sk, &size_goal, flags);
				956	copied = 0;
				957
				958	err = -EPIPE;
				959	if (sk->sk_err \|\| (sk->sk_shutdown & SEND_SHUTDOWN))
				960	goto out_err;
				961
				962	while (size > 0) {
				963	struct sk_buff *skb = tcp_write_queue_tail(sk);
				964	int copy, i;
				965	bool can_coalesce;
				966
				967	if (!tcp_send_head(sk) \|\| (copy = size_goal - skb->len) <= 0 \|\|
				968	!tcp_skb_can_collapse_to(skb)) {
				969	new_segment:
				970	if (!sk_stream_memory_free(sk))
				971	goto wait_for_sndbuf;
				972
				973	skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
				974	skb_queue_empty(&sk->sk_write_queue));
				975	if (!skb)
				976	goto wait_for_memory;
				977
				978	skb_entail(sk, skb);
				979	copy = size_goal;
				980	}
				981
				982	if (copy > size)
				983	copy = size;
				984
				985	i = skb_shinfo(skb)->nr_frags;
				986	can_coalesce = skb_can_coalesce(skb, i, page, offset);
				987	if (!can_coalesce && i >= sysctl_max_skb_frags) {
				988	tcp_mark_push(tp, skb);
				989	goto new_segment;
				990	}
				991	if (!sk_wmem_schedule(sk, copy))
				992	goto wait_for_memory;
				993
				994	if (can_coalesce) {
				995	skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
				996	} else {
				997	get_page(page);
				998	skb_fill_page_desc(skb, i, page, offset, copy);
				999	}
				1000	skb_shinfo(skb)->tx_flags \|= SKBTX_SHARED_FRAG;
				1001
				1002	skb->len += copy;
				1003	skb->data_len += copy;
				1004	skb->truesize += copy;
				1005	sk->sk_wmem_queued += copy;
				1006	sk_mem_charge(sk, copy);
				1007	skb->ip_summed = CHECKSUM_PARTIAL;
				1008	tp->write_seq += copy;
				1009	TCP_SKB_CB(skb)->end_seq += copy;
				1010	tcp_skb_pcount_set(skb, 0);
				1011
				1012	if (!copied)
				1013	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
				1014
				1015	copied += copy;
				1016	offset += copy;
				1017	size -= copy;
				1018	if (!size)
				1019	goto out;
				1020
				1021	if (skb->len < size_goal \|\| (flags & MSG_OOB))
				1022	continue;
				1023
				1024	if (forced_push(tp)) {
				1025	tcp_mark_push(tp, skb);
				1026	__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
				1027	} else if (skb == tcp_send_head(sk))
				1028	tcp_push_one(sk, mss_now);
				1029	continue;
				1030
				1031	wait_for_sndbuf:
				1032	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
				1033	wait_for_memory:
				1034	tcp_push(sk, flags & ~MSG_MORE, mss_now,
				1035	TCP_NAGLE_PUSH, size_goal);
				1036
				1037	err = sk_stream_wait_memory(sk, &timeo);
				1038	if (err != 0)
				1039	goto do_error;
				1040
				1041	mss_now = tcp_send_mss(sk, &size_goal, flags);
				1042	}
				1043
				1044	out:
				1045	if (copied) {
				1046	tcp_tx_timestamp(sk, sk->sk_tsflags, tcp_write_queue_tail(sk));
				1047	if (!(flags & MSG_SENDPAGE_NOTLAST))
				1048	tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
				1049	}
				1050	return copied;
				1051
				1052	do_error:
				1053	tcp_remove_empty_skb(sk, tcp_write_queue_tail(sk));
				1054	if (copied)
				1055	goto out;
				1056	out_err:
				1057	/* make sure we wake any epoll edge trigger waiter */
				1058	if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
				1059	err == -EAGAIN)) {
				1060	sk->sk_write_space(sk);
				1061	tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
				1062	}
				1063	return sk_stream_error(sk, flags, err);
				1064	}
				1065	EXPORT_SYMBOL_GPL(do_tcp_sendpages);
				1066
				1067	int tcp_sendpage_locked(struct sock sk, struct page page, int offset,
				1068	size_t size, int flags)
				1069	{
				1070	if (!(sk->sk_route_caps & NETIF_F_SG) \|\|
				1071	!sk_check_csum_caps(sk))
				1072	return sock_no_sendpage_locked(sk, page, offset, size, flags);
				1073
				1074	tcp_rate_check_app_limited(sk); /* is sending application-limited? */
				1075
				1076	return do_tcp_sendpages(sk, page, offset, size, flags);
				1077	}
				1078	EXPORT_SYMBOL_GPL(tcp_sendpage_locked);
				1079
				1080	int tcp_sendpage(struct sock sk, struct page page, int offset,
				1081	size_t size, int flags)
				1082	{
				1083	int ret;
				1084
				1085	lock_sock(sk);
				1086	ret = tcp_sendpage_locked(sk, page, offset, size, flags);
				1087	release_sock(sk);
				1088
				1089	return ret;
				1090	}
				1091	EXPORT_SYMBOL(tcp_sendpage);
				1092
				1093	/* Do not bother using a page frag for very small frames.
				1094	* But use this heuristic only for the first skb in write queue.
				1095	*
				1096	* Having no payload in skb->head allows better SACK shifting
				1097	* in tcp_shift_skb_data(), reducing sack/rack overhead, because
				1098	* write queue has less skbs.
				1099	* Each skb can hold up to MAX_SKB_FRAGS * 32Kbytes, or ~0.5 MB.
				1100	* This also speeds up tso_fragment(), since it wont fallback
				1101	* to tcp_fragment().
				1102	*/
				1103	static int linear_payload_sz(bool first_skb)
				1104	{
				1105	if (first_skb)
				1106	return SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
				1107	return 0;
				1108	}
				1109
				1110	static int select_size(const struct sock *sk, bool sg, bool first_skb)
				1111	{
				1112	const struct tcp_sock *tp = tcp_sk(sk);
				1113	int tmp = tp->mss_cache;
				1114
				1115	if (sg) {
				1116	if (sk_can_gso(sk)) {
				1117	tmp = linear_payload_sz(first_skb);
				1118	} else {
				1119	int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
				1120
				1121	if (tmp >= pgbreak &&
				1122	tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
				1123	tmp = pgbreak;
				1124	}
				1125	}
				1126
				1127	return tmp;
				1128	}
				1129
				1130	void tcp_free_fastopen_req(struct tcp_sock *tp)
				1131	{
				1132	if (tp->fastopen_req) {
				1133	kfree(tp->fastopen_req);
				1134	tp->fastopen_req = NULL;
				1135	}
				1136	}
				1137
				1138	static int tcp_sendmsg_fastopen(struct sock sk, struct msghdr msg,
				1139	int *copied, size_t size)
				1140	{
				1141	struct tcp_sock *tp = tcp_sk(sk);
				1142	struct inet_sock *inet = inet_sk(sk);
				1143	struct sockaddr *uaddr = msg->msg_name;
				1144	int err, flags;
				1145
				1146	if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) \|\|
				1147	(uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) &&
				1148	uaddr->sa_family == AF_UNSPEC))
				1149	return -EOPNOTSUPP;
				1150	if (tp->fastopen_req)
				1151	return -EALREADY; /* Another Fast Open is in progress */
				1152
				1153	tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
				1154	sk->sk_allocation);
				1155	if (unlikely(!tp->fastopen_req))
				1156	return -ENOBUFS;
				1157	tp->fastopen_req->data = msg;
				1158	tp->fastopen_req->size = size;
				1159
				1160	if (inet->defer_connect) {
				1161	err = tcp_connect(sk);
				1162	/* Same failure procedure as in tcp_v4/6_connect */
				1163	if (err) {
				1164	tcp_set_state(sk, TCP_CLOSE);
				1165	inet->inet_dport = 0;
				1166	sk->sk_route_caps = 0;
				1167	}
				1168	}
				1169	flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
				1170	err = __inet_stream_connect(sk->sk_socket, uaddr,
				1171	msg->msg_namelen, flags, 1);
				1172	/* fastopen_req could already be freed in __inet_stream_connect
				1173	* if the connection times out or gets rst
				1174	*/
				1175	if (tp->fastopen_req) {
				1176	*copied = tp->fastopen_req->copied;
				1177	tcp_free_fastopen_req(tp);
				1178	inet->defer_connect = 0;
				1179	}
				1180	return err;
				1181	}
				1182
				1183	int tcp_sendmsg_locked(struct sock sk, struct msghdr msg, size_t size)
				1184	{
				1185	struct tcp_sock *tp = tcp_sk(sk);
				1186	struct ubuf_info *uarg = NULL;
				1187	struct sk_buff *skb;
				1188	struct sockcm_cookie sockc;
				1189	int flags, err, copied = 0;
				1190	int mss_now = 0, size_goal, copied_syn = 0;
				1191	bool process_backlog = false;
				1192	bool sg;
				1193	long timeo;
				1194
				1195	flags = msg->msg_flags;
				1196
				1197	if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) {
				1198	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT)) {
				1199	err = -EINVAL;
				1200	goto out_err;
				1201	}
				1202
				1203	skb = tcp_send_head(sk) ? tcp_write_queue_tail(sk) : NULL;
				1204	uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb));
				1205	if (!uarg) {
				1206	err = -ENOBUFS;
				1207	goto out_err;
				1208	}
				1209
				1210	if (!(sk_check_csum_caps(sk) && sk->sk_route_caps & NETIF_F_SG))
				1211	uarg->zerocopy = 0;
				1212	}
				1213
				1214	if (unlikely(flags & MSG_FASTOPEN \|\| inet_sk(sk)->defer_connect) &&
				1215	!tp->repair) {
				1216	err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
				1217	if (err == -EINPROGRESS && copied_syn > 0)
				1218	goto out;
				1219	else if (err)
				1220	goto out_err;
				1221	}
				1222
				1223	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
				1224
				1225	tcp_rate_check_app_limited(sk); /* is sending application-limited? */
				1226
				1227	/* Wait for a connection to finish. One exception is TCP Fast Open
				1228	* (passive side) where data is allowed to be sent before a connection
				1229	* is fully established.
				1230	*/
				1231	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT)) &&
				1232	!tcp_passive_fastopen(sk)) {
				1233	err = sk_stream_wait_connect(sk, &timeo);
				1234	if (err != 0)
				1235	goto do_error;
				1236	}
				1237
				1238	if (unlikely(tp->repair)) {
				1239	if (tp->repair_queue == TCP_RECV_QUEUE) {
				1240	copied = tcp_send_rcvq(sk, msg, size);
				1241	goto out_nopush;
				1242	}
				1243
				1244	err = -EINVAL;
				1245	if (tp->repair_queue == TCP_NO_QUEUE)
				1246	goto out_err;
				1247
				1248	/* 'common' sending to sendq */
				1249	}
				1250
				1251	sockc.tsflags = sk->sk_tsflags;
				1252	if (msg->msg_controllen) {
				1253	err = sock_cmsg_send(sk, msg, &sockc);
				1254	if (unlikely(err)) {
				1255	err = -EINVAL;
				1256	goto out_err;
				1257	}
				1258	}
				1259
				1260	/* This should be in poll */
				1261	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
				1262
				1263	/* Ok commence sending. */
				1264	copied = 0;
				1265
				1266	restart:
				1267	mss_now = tcp_send_mss(sk, &size_goal, flags);
				1268
				1269	err = -EPIPE;
				1270	if (sk->sk_err \|\| (sk->sk_shutdown & SEND_SHUTDOWN))
				1271	goto do_error;
				1272
				1273	sg = !!(sk->sk_route_caps & NETIF_F_SG);
				1274
				1275	while (msg_data_left(msg)) {
				1276	int copy = 0;
				1277	int max = size_goal;
				1278
				1279	skb = tcp_write_queue_tail(sk);
				1280	if (tcp_send_head(sk)) {
				1281	if (skb->ip_summed == CHECKSUM_NONE)
				1282	max = mss_now;
				1283	copy = max - skb->len;
				1284	}
				1285
				1286	if (copy <= 0 \|\| !tcp_skb_can_collapse_to(skb)) {
				1287	bool first_skb;
				1288
				1289	new_segment:
				1290	/* Allocate new segment. If the interface is SG,
				1291	* allocate skb fitting to single page.
				1292	*/
				1293	if (!sk_stream_memory_free(sk))
				1294	goto wait_for_sndbuf;
				1295
				1296	if (process_backlog && sk_flush_backlog(sk)) {
				1297	process_backlog = false;
				1298	goto restart;
				1299	}
				1300	first_skb = skb_queue_empty(&sk->sk_write_queue);
				1301	skb = sk_stream_alloc_skb(sk,
				1302	select_size(sk, sg, first_skb),
				1303	sk->sk_allocation,
				1304	first_skb);
				1305	if (!skb)
				1306	goto wait_for_memory;
				1307
				1308	process_backlog = true;
				1309	/*
				1310	* Check whether we can use HW checksum.
				1311	*/
				1312	if (sk_check_csum_caps(sk))
				1313	skb->ip_summed = CHECKSUM_PARTIAL;
				1314
				1315	skb_entail(sk, skb);
				1316	copy = size_goal;
				1317	max = size_goal;
				1318
				1319	/* All packets are restored as if they have
				1320	* already been sent. skb_mstamp isn't set to
				1321	* avoid wrong rtt estimation.
				1322	*/
				1323	if (tp->repair)
				1324	TCP_SKB_CB(skb)->sacked \|= TCPCB_REPAIRED;
				1325	}
				1326
				1327	/* Try to append data to the end of skb. */
				1328	if (copy > msg_data_left(msg))
				1329	copy = msg_data_left(msg);
				1330
				1331	/* Where to copy to? */
				1332	if (skb_availroom(skb) > 0) {
				1333	/* We have some space in skb head. Superb! */
				1334	copy = min_t(int, copy, skb_availroom(skb));
				1335	err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
				1336	if (err)
				1337	goto do_fault;
				1338	} else if (!uarg \|\| !uarg->zerocopy) {
				1339	bool merge = true;
				1340	int i = skb_shinfo(skb)->nr_frags;
				1341	struct page_frag *pfrag = sk_page_frag(sk);
				1342
				1343	if (!sk_page_frag_refill(sk, pfrag))
				1344	goto wait_for_memory;
				1345
				1346	if (!skb_can_coalesce(skb, i, pfrag->page,
				1347	pfrag->offset)) {
				1348	if (i >= sysctl_max_skb_frags \|\| !sg) {
				1349	tcp_mark_push(tp, skb);
				1350	goto new_segment;
				1351	}
				1352	merge = false;
				1353	}
				1354
				1355	copy = min_t(int, copy, pfrag->size - pfrag->offset);
				1356
				1357	if (!sk_wmem_schedule(sk, copy))
				1358	goto wait_for_memory;
				1359
				1360	err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
				1361	pfrag->page,
				1362	pfrag->offset,
				1363	copy);
				1364	if (err)
				1365	goto do_error;
				1366
				1367	/* Update the skb. */
				1368	if (merge) {
				1369	skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
				1370	} else {
				1371	skb_fill_page_desc(skb, i, pfrag->page,
				1372	pfrag->offset, copy);
				1373	page_ref_inc(pfrag->page);
				1374	}
				1375	pfrag->offset += copy;
				1376	} else {
				1377	err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
				1378	if (err == -EMSGSIZE \|\| err == -EEXIST)
				1379	goto new_segment;
				1380	if (err < 0)
				1381	goto do_error;
				1382	copy = err;
				1383	}
				1384
				1385	if (!copied)
				1386	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
				1387
				1388	tp->write_seq += copy;
				1389	TCP_SKB_CB(skb)->end_seq += copy;
				1390	tcp_skb_pcount_set(skb, 0);
				1391
				1392	copied += copy;
				1393	if (!msg_data_left(msg)) {
				1394	if (unlikely(flags & MSG_EOR))
				1395	TCP_SKB_CB(skb)->eor = 1;
				1396	goto out;
				1397	}
				1398
				1399	if (skb->len < max \|\| (flags & MSG_OOB) \|\| unlikely(tp->repair))
				1400	continue;
				1401
				1402	if (forced_push(tp)) {
				1403	tcp_mark_push(tp, skb);
				1404	__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
				1405	} else if (skb == tcp_send_head(sk))
				1406	tcp_push_one(sk, mss_now);
				1407	continue;
				1408
				1409	wait_for_sndbuf:
				1410	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
				1411	wait_for_memory:
				1412	if (copied)
				1413	tcp_push(sk, flags & ~MSG_MORE, mss_now,
				1414	TCP_NAGLE_PUSH, size_goal);
				1415
				1416	err = sk_stream_wait_memory(sk, &timeo);
				1417	if (err != 0)
				1418	goto do_error;
				1419
				1420	mss_now = tcp_send_mss(sk, &size_goal, flags);
				1421	}
				1422
				1423	out:
				1424	if (copied) {
				1425	tcp_tx_timestamp(sk, sockc.tsflags, tcp_write_queue_tail(sk));
				1426	tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
				1427	}
				1428	out_nopush:
				1429	sock_zerocopy_put(uarg);
				1430	return copied + copied_syn;
				1431
				1432	do_error:
				1433	skb = tcp_write_queue_tail(sk);
				1434	do_fault:
				1435	tcp_remove_empty_skb(sk, skb);
				1436
				1437	if (copied + copied_syn)
				1438	goto out;
				1439	out_err:
				1440	sock_zerocopy_put_abort(uarg);
				1441	err = sk_stream_error(sk, flags, err);
				1442	/* make sure we wake any epoll edge trigger waiter */
				1443	if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
				1444	err == -EAGAIN)) {
				1445	sk->sk_write_space(sk);
				1446	tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
				1447	}
				1448	return err;
				1449	}
				1450	EXPORT_SYMBOL_GPL(tcp_sendmsg_locked);
				1451
				1452	int tcp_sendmsg(struct sock sk, struct msghdr msg, size_t size)
				1453	{
				1454	int ret;
				1455
				1456	lock_sock(sk);
				1457	ret = tcp_sendmsg_locked(sk, msg, size);
				1458	release_sock(sk);
				1459
				1460	return ret;
				1461	}
				1462	EXPORT_SYMBOL(tcp_sendmsg);
				1463
				1464	/*
				1465	* Handle reading urgent data. BSD has very simple semantics for
				1466	* this, no blocking and very strange errors 8)
				1467	*/
				1468
				1469	static int tcp_recv_urg(struct sock sk, struct msghdr msg, int len, int flags)
				1470	{
				1471	struct tcp_sock *tp = tcp_sk(sk);
				1472
				1473	/* No URG data to read. */
				1474	if (sock_flag(sk, SOCK_URGINLINE) \|\| !tp->urg_data \|\|
				1475	tp->urg_data == TCP_URG_READ)
				1476	return -EINVAL; /* Yes this is right ! */
				1477
				1478	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
				1479	return -ENOTCONN;
				1480
				1481	if (tp->urg_data & TCP_URG_VALID) {
				1482	int err = 0;
				1483	char c = tp->urg_data;
				1484
				1485	if (!(flags & MSG_PEEK))
				1486	tp->urg_data = TCP_URG_READ;
				1487
				1488	/* Read urgent data. */
				1489	msg->msg_flags \|= MSG_OOB;
				1490
				1491	if (len > 0) {
				1492	if (!(flags & MSG_TRUNC))
				1493	err = memcpy_to_msg(msg, &c, 1);
				1494	len = 1;
				1495	} else
				1496	msg->msg_flags \|= MSG_TRUNC;
				1497
				1498	return err ? -EFAULT : len;
				1499	}
				1500
				1501	if (sk->sk_state == TCP_CLOSE \|\| (sk->sk_shutdown & RCV_SHUTDOWN))
				1502	return 0;
				1503
				1504	/* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
				1505	* the available implementations agree in this case:
				1506	* this call should never block, independent of the
				1507	* blocking state of the socket.
				1508	* Mike <pall@rz.uni-karlsruhe.de>
				1509	*/
				1510	return -EAGAIN;
				1511	}
				1512
				1513	static int tcp_peek_sndq(struct sock sk, struct msghdr msg, int len)
				1514	{
				1515	struct sk_buff *skb;
				1516	int copied = 0, err = 0;
				1517
				1518	/* XXX -- need to support SO_PEEK_OFF */
				1519
				1520	skb_queue_walk(&sk->sk_write_queue, skb) {
				1521	err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
				1522	if (err)
				1523	break;
				1524
				1525	copied += skb->len;
				1526	}
				1527
				1528	return err ?: copied;
				1529	}
				1530
				1531	/* Clean up the receive buffer for full frames taken by the user,
				1532	* then send an ACK if necessary. COPIED is the number of bytes
				1533	* tcp_recvmsg has given to the user so far, it speeds up the
				1534	* calculation of whether or not we must ACK for the sake of
				1535	* a window update.
				1536	*/
				1537	static void tcp_cleanup_rbuf(struct sock *sk, int copied)
				1538	{
				1539	struct tcp_sock *tp = tcp_sk(sk);
				1540	bool time_to_ack = false;
				1541
				1542	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
				1543
				1544	WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
				1545	"cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
				1546	tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
				1547
				1548	if (inet_csk_ack_scheduled(sk)) {
				1549	const struct inet_connection_sock *icsk = inet_csk(sk);
				1550	/* Delayed ACKs frequently hit locked sockets during bulk
				1551	* receive. */
				1552	if (icsk->icsk_ack.blocked \|\|
				1553	/* Once-per-two-segments ACK was not sent by tcp_input.c */
				1554	tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss \|\|
				1555	/*
				1556	* If this read emptied read buffer, we send ACK, if
				1557	* connection is not bidirectional, user drained
				1558	* receive buffer and there was a small segment
				1559	* in queue.
				1560	*/
				1561	(copied > 0 &&
				1562	((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) \|\|
				1563	((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
				1564	!icsk->icsk_ack.pingpong)) &&
				1565	!atomic_read(&sk->sk_rmem_alloc)))
				1566	time_to_ack = true;
				1567	}
				1568
				1569	/* We send an ACK if we can now advertise a non-zero window
				1570	* which has been raised "significantly".
				1571	*
				1572	* Even if window raised up to infinity, do not send window open ACK
				1573	* in states, where we will not receive more. It is useless.
				1574	*/
				1575	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
				1576	__u32 rcv_window_now = tcp_receive_window(tp);
				1577
				1578	/* Optimize, __tcp_select_window() is not cheap. */
				1579	if (2*rcv_window_now <= tp->window_clamp) {
				1580	__u32 new_window = __tcp_select_window(sk);
				1581
				1582	/* Send ACK now, if this read freed lots of space
				1583	* in our buffer. Certainly, new_window is new window.
				1584	* We can advertise it now, if it is not less than current one.
				1585	* "Lots" means "at least twice" here.
				1586	*/
				1587	if (new_window && new_window >= 2 * rcv_window_now)
				1588	time_to_ack = true;
				1589	}
				1590	}
				1591	if (time_to_ack)
				1592	tcp_send_ack(sk);
				1593	}
				1594
				1595	static struct sk_buff tcp_recv_skb(struct sock sk, u32 seq, u32 *off)
				1596	{
				1597	struct sk_buff *skb;
				1598	u32 offset;
				1599
				1600	while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
				1601	offset = seq - TCP_SKB_CB(skb)->seq;
				1602	if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
				1603	pr_err_once("%s: found a SYN, please report !\n", __func__);
				1604	offset--;
				1605	}
				1606	if (offset < skb->len \|\| (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) {
				1607	*off = offset;
				1608	return skb;
				1609	}
				1610	/* This looks weird, but this can happen if TCP collapsing
				1611	* splitted a fat GRO packet, while we released socket lock
				1612	* in skb_splice_bits()
				1613	*/
				1614	sk_eat_skb(sk, skb);
				1615	}
				1616	return NULL;
				1617	}
				1618
				1619	/*
				1620	* This routine provides an alternative to tcp_recvmsg() for routines
				1621	* that would like to handle copying from skbuffs directly in 'sendfile'
				1622	* fashion.
				1623	* Note:
				1624	* - It is assumed that the socket was locked by the caller.
				1625	* - The routine does not block.
				1626	* - At present, there is no support for reading OOB data
				1627	* or for 'peeking' the socket using this routine
				1628	* (although both would be easy to implement).
				1629	*/
				1630	int tcp_read_sock(struct sock sk, read_descriptor_t desc,
				1631	sk_read_actor_t recv_actor)
				1632	{
				1633	struct sk_buff *skb;
				1634	struct tcp_sock *tp = tcp_sk(sk);
				1635	u32 seq = tp->copied_seq;
				1636	u32 offset;
				1637	int copied = 0;
				1638
				1639	if (sk->sk_state == TCP_LISTEN)
				1640	return -ENOTCONN;
				1641	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
				1642	if (offset < skb->len) {
				1643	int used;
				1644	size_t len;
				1645
				1646	len = skb->len - offset;
				1647	/* Stop reading if we hit a patch of urgent data */
				1648	if (tp->urg_data) {
				1649	u32 urg_offset = tp->urg_seq - seq;
				1650	if (urg_offset < len)
				1651	len = urg_offset;
				1652	if (!len)
				1653	break;
				1654	}
				1655	used = recv_actor(desc, skb, offset, len);
				1656	if (used <= 0) {
				1657	if (!copied)
				1658	copied = used;
				1659	break;
				1660	} else if (used <= len) {
				1661	seq += used;
				1662	copied += used;
				1663	offset += used;
				1664	}
				1665	/* If recv_actor drops the lock (e.g. TCP splice
				1666	* receive) the skb pointer might be invalid when
				1667	* getting here: tcp_collapse might have deleted it
				1668	* while aggregating skbs from the socket queue.
				1669	*/
				1670	skb = tcp_recv_skb(sk, seq - 1, &offset);
				1671	if (!skb)
				1672	break;
				1673	/* TCP coalescing might have appended data to the skb.
				1674	* Try to splice more frags
				1675	*/
				1676	if (offset + 1 != skb->len)
				1677	continue;
				1678	}
				1679	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
				1680	sk_eat_skb(sk, skb);
				1681	++seq;
				1682	break;
				1683	}
				1684	sk_eat_skb(sk, skb);
				1685	if (!desc->count)
				1686	break;
				1687	tp->copied_seq = seq;
				1688	}
				1689	tp->copied_seq = seq;
				1690
				1691	tcp_rcv_space_adjust(sk);
				1692
				1693	/* Clean up data we have read: This will do ACK frames. */
				1694	if (copied > 0) {
				1695	tcp_recv_skb(sk, seq, &offset);
				1696	tcp_cleanup_rbuf(sk, copied);
				1697	}
				1698	return copied;
				1699	}
				1700	EXPORT_SYMBOL(tcp_read_sock);
				1701
				1702	int tcp_peek_len(struct socket *sock)
				1703	{
				1704	return tcp_inq(sock->sk);
				1705	}
				1706	EXPORT_SYMBOL(tcp_peek_len);
				1707
				1708	static void tcp_update_recv_tstamps(struct sk_buff *skb,
				1709	struct scm_timestamping *tss)
				1710	{
				1711	if (skb->tstamp)
				1712	tss->ts[0] = ktime_to_timespec(skb->tstamp);
				1713	else
				1714	tss->ts[0] = (struct timespec) {0};
				1715
				1716	if (skb_hwtstamps(skb)->hwtstamp)
				1717	tss->ts[2] = ktime_to_timespec(skb_hwtstamps(skb)->hwtstamp);
				1718	else
				1719	tss->ts[2] = (struct timespec) {0};
				1720	}
				1721
				1722	/* Similar to __sock_recv_timestamp, but does not require an skb */
				1723	void tcp_recv_timestamp(struct msghdr msg, const struct sock sk,
				1724	struct scm_timestamping *tss)
				1725	{
				1726	struct timeval tv;
				1727	bool has_timestamping = false;
				1728
				1729	if (tss->ts[0].tv_sec \|\| tss->ts[0].tv_nsec) {
				1730	if (sock_flag(sk, SOCK_RCVTSTAMP)) {
				1731	if (sock_flag(sk, SOCK_RCVTSTAMPNS)) {
				1732	put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS,
				1733	sizeof(tss->ts[0]), &tss->ts[0]);
				1734	} else {
				1735	tv.tv_sec = tss->ts[0].tv_sec;
				1736	tv.tv_usec = tss->ts[0].tv_nsec / 1000;
				1737
				1738	put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP,
				1739	sizeof(tv), &tv);
				1740	}
				1741	}
				1742
				1743	if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE)
				1744	has_timestamping = true;
				1745	else
				1746	tss->ts[0] = (struct timespec) {0};
				1747	}
				1748
				1749	if (tss->ts[2].tv_sec \|\| tss->ts[2].tv_nsec) {
				1750	if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)
				1751	has_timestamping = true;
				1752	else
				1753	tss->ts[2] = (struct timespec) {0};
				1754	}
				1755
				1756	if (has_timestamping) {
				1757	tss->ts[1] = (struct timespec) {0};
				1758	put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING,
				1759	sizeof(*tss), tss);
				1760	}
				1761	}
				1762
				1763	/*
				1764	* This routine copies from a sock struct into the user buffer.
				1765	*
				1766	* Technical note: in 2.3 we work on _locked_ socket, so that
				1767	* tricks with *seq access order and skb->users are not required.
				1768	* Probably, code can be easily improved even more.
				1769	*/
				1770
				1771	int tcp_recvmsg(struct sock sk, struct msghdr msg, size_t len, int nonblock,
				1772	int flags, int *addr_len)
				1773	{
				1774	struct tcp_sock *tp = tcp_sk(sk);
				1775	int copied = 0;
				1776	u32 peek_seq;
				1777	u32 *seq;
				1778	unsigned long used;
				1779	int err;
				1780	int target; /* Read at least this many bytes */
				1781	long timeo;
				1782	struct sk_buff skb, last;
				1783	u32 urg_hole = 0;
				1784	struct scm_timestamping tss;
				1785	bool has_tss = false;
				1786
				1787	if (unlikely(flags & MSG_ERRQUEUE))
				1788	return inet_recv_error(sk, msg, len, addr_len);
				1789
				1790	if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) &&
				1791	(sk->sk_state == TCP_ESTABLISHED))
				1792	sk_busy_loop(sk, nonblock);
				1793
				1794	lock_sock(sk);
				1795
				1796	err = -ENOTCONN;
				1797	if (sk->sk_state == TCP_LISTEN)
				1798	goto out;
				1799
				1800	timeo = sock_rcvtimeo(sk, nonblock);
				1801
				1802	/* Urgent data needs to be handled specially. */
				1803	if (flags & MSG_OOB)
				1804	goto recv_urg;
				1805
				1806	if (unlikely(tp->repair)) {
				1807	err = -EPERM;
				1808	if (!(flags & MSG_PEEK))
				1809	goto out;
				1810
				1811	if (tp->repair_queue == TCP_SEND_QUEUE)
				1812	goto recv_sndq;
				1813
				1814	err = -EINVAL;
				1815	if (tp->repair_queue == TCP_NO_QUEUE)
				1816	goto out;
				1817
				1818	/* 'common' recv queue MSG_PEEK-ing */
				1819	}
				1820
				1821	seq = &tp->copied_seq;
				1822	if (flags & MSG_PEEK) {
				1823	peek_seq = tp->copied_seq;
				1824	seq = &peek_seq;
				1825	}
				1826
				1827	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
				1828
				1829	do {
				1830	u32 offset;
				1831
				1832	/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
				1833	if (tp->urg_data && tp->urg_seq == *seq) {
				1834	if (copied)
				1835	break;
				1836	if (signal_pending(current)) {
				1837	copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
				1838	break;
				1839	}
				1840	}
				1841
				1842	/* Next get a buffer. */
				1843
				1844	last = skb_peek_tail(&sk->sk_receive_queue);
				1845	skb_queue_walk(&sk->sk_receive_queue, skb) {
				1846	last = skb;
				1847	/* Now that we have two receive queues this
				1848	* shouldn't happen.
				1849	*/
				1850	if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
				1851	"TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n",
				1852	*seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
				1853	flags))
				1854	break;
				1855
				1856	offset = *seq - TCP_SKB_CB(skb)->seq;
				1857	if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
				1858	pr_err_once("%s: found a SYN, please report !\n", __func__);
				1859	offset--;
				1860	}
				1861	if (offset < skb->len)
				1862	goto found_ok_skb;
				1863	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
				1864	goto found_fin_ok;
				1865	WARN(!(flags & MSG_PEEK),
				1866	"TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n",
				1867	*seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
				1868	}
				1869
				1870	/* Well, if we have backlog, try to process it now yet. */
				1871
				1872	if (copied >= target && !sk->sk_backlog.tail)
				1873	break;
				1874
				1875	if (copied) {
				1876	if (sk->sk_err \|\|
				1877	sk->sk_state == TCP_CLOSE \|\|
				1878	(sk->sk_shutdown & RCV_SHUTDOWN) \|\|
				1879	!timeo \|\|
				1880	signal_pending(current))
				1881	break;
				1882	} else {
				1883	if (sock_flag(sk, SOCK_DONE))
				1884	break;
				1885
				1886	if (sk->sk_err) {
				1887	copied = sock_error(sk);
				1888	break;
				1889	}
				1890
				1891	if (sk->sk_shutdown & RCV_SHUTDOWN)
				1892	break;
				1893
				1894	if (sk->sk_state == TCP_CLOSE) {
				1895	if (!sock_flag(sk, SOCK_DONE)) {
				1896	/* This occurs when user tries to read
				1897	* from never connected socket.
				1898	*/
				1899	copied = -ENOTCONN;
				1900	break;
				1901	}
				1902	break;
				1903	}
				1904
				1905	if (!timeo) {
				1906	copied = -EAGAIN;
				1907	break;
				1908	}
				1909
				1910	if (signal_pending(current)) {
				1911	copied = sock_intr_errno(timeo);
				1912	break;
				1913	}
				1914	}
				1915
				1916	tcp_cleanup_rbuf(sk, copied);
				1917
				1918	if (copied >= target) {
				1919	/* Do not sleep, just process backlog. */
				1920	release_sock(sk);
				1921	lock_sock(sk);
				1922	} else {
				1923	sk_wait_data(sk, &timeo, last);
				1924	}
				1925
				1926	if ((flags & MSG_PEEK) &&
				1927	(peek_seq - copied - urg_hole != tp->copied_seq)) {
				1928	net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
				1929	current->comm,
				1930	task_pid_nr(current));
				1931	peek_seq = tp->copied_seq;
				1932	}
				1933	continue;
				1934
				1935	found_ok_skb:
				1936	/* Ok so how much can we use? */
				1937	used = skb->len - offset;
				1938	if (len < used)
				1939	used = len;
				1940
				1941	/* Do we have urgent data here? */
				1942	if (tp->urg_data) {
				1943	u32 urg_offset = tp->urg_seq - *seq;
				1944	if (urg_offset < used) {
				1945	if (!urg_offset) {
				1946	if (!sock_flag(sk, SOCK_URGINLINE)) {
				1947	++*seq;
				1948	urg_hole++;
				1949	offset++;
				1950	used--;
				1951	if (!used)
				1952	goto skip_copy;
				1953	}
				1954	} else
				1955	used = urg_offset;
				1956	}
				1957	}
				1958
				1959	if (!(flags & MSG_TRUNC)) {
				1960	err = skb_copy_datagram_msg(skb, offset, msg, used);
				1961	if (err) {
				1962	/* Exception. Bailout! */
				1963	if (!copied)
				1964	copied = -EFAULT;
				1965	break;
				1966	}
				1967	}
				1968
				1969	*seq += used;
				1970	copied += used;
				1971	len -= used;
				1972
				1973	tcp_rcv_space_adjust(sk);
				1974
				1975	skip_copy:
				1976	if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
				1977	tp->urg_data = 0;
				1978	tcp_fast_path_check(sk);
				1979	}
				1980
				1981	if (TCP_SKB_CB(skb)->has_rxtstamp) {
				1982	tcp_update_recv_tstamps(skb, &tss);
				1983	has_tss = true;
				1984	}
				1985
				1986	if (used + offset < skb->len)
				1987	continue;
				1988
				1989	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
				1990	goto found_fin_ok;
				1991	if (!(flags & MSG_PEEK))
				1992	sk_eat_skb(sk, skb);
				1993	continue;
				1994
				1995	found_fin_ok:
				1996	/* Process the FIN. */
				1997	++*seq;
				1998	if (!(flags & MSG_PEEK))
				1999	sk_eat_skb(sk, skb);
				2000	break;
				2001	} while (len > 0);
				2002
				2003	/* According to UNIX98, msg_name/msg_namelen are ignored
				2004	* on connected socket. I was just happy when found this 8) --ANK
				2005	*/
				2006
				2007	if (has_tss)
				2008	tcp_recv_timestamp(msg, sk, &tss);
				2009
				2010	/* Clean up data we have read: This will do ACK frames. */
				2011	tcp_cleanup_rbuf(sk, copied);
				2012
				2013	release_sock(sk);
				2014	return copied;
				2015
				2016	out:
				2017	release_sock(sk);
				2018	return err;
				2019
				2020	recv_urg:
				2021	err = tcp_recv_urg(sk, msg, len, flags);
				2022	goto out;
				2023
				2024	recv_sndq:
				2025	err = tcp_peek_sndq(sk, msg, len);
				2026	goto out;
				2027	}
				2028	EXPORT_SYMBOL(tcp_recvmsg);
				2029
				2030	void tcp_set_state(struct sock *sk, int state)
				2031	{
				2032	int oldstate = sk->sk_state;
				2033
				2034	switch (state) {
				2035	case TCP_ESTABLISHED:
				2036	if (oldstate != TCP_ESTABLISHED)
				2037	TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
				2038	break;
				2039
				2040	case TCP_CLOSE:
				2041	if (oldstate == TCP_CLOSE_WAIT \|\| oldstate == TCP_ESTABLISHED)
				2042	TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);
				2043
				2044	sk->sk_prot->unhash(sk);
				2045	if (inet_csk(sk)->icsk_bind_hash &&
				2046	!(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
				2047	inet_put_port(sk);
				2048	/* fall through */
				2049	default:
				2050	if (oldstate == TCP_ESTABLISHED)
				2051	TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
				2052	}
				2053
				2054	/* Change state AFTER socket is unhashed to avoid closed
				2055	* socket sitting in hash tables.
				2056	*/
				2057	sk_state_store(sk, state);
				2058
				2059	#ifdef STATE_TRACE
				2060	SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
				2061	#endif
				2062	}
				2063	EXPORT_SYMBOL_GPL(tcp_set_state);
				2064
				2065	/*
				2066	* State processing on a close. This implements the state shift for
				2067	* sending our FIN frame. Note that we only send a FIN for some
				2068	* states. A shutdown() may have already sent the FIN, or we may be
				2069	* closed.
				2070	*/
				2071
				2072	static const unsigned char new_state[16] = {
				2073	/* current state: new state: action: */
				2074	[0 /* (Invalid) */] = TCP_CLOSE,
				2075	[TCP_ESTABLISHED] = TCP_FIN_WAIT1 \| TCP_ACTION_FIN,
				2076	[TCP_SYN_SENT] = TCP_CLOSE,
				2077	[TCP_SYN_RECV] = TCP_FIN_WAIT1 \| TCP_ACTION_FIN,
				2078	[TCP_FIN_WAIT1] = TCP_FIN_WAIT1,
				2079	[TCP_FIN_WAIT2] = TCP_FIN_WAIT2,
				2080	[TCP_TIME_WAIT] = TCP_CLOSE,
				2081	[TCP_CLOSE] = TCP_CLOSE,
				2082	[TCP_CLOSE_WAIT] = TCP_LAST_ACK \| TCP_ACTION_FIN,
				2083	[TCP_LAST_ACK] = TCP_LAST_ACK,
				2084	[TCP_LISTEN] = TCP_CLOSE,
				2085	[TCP_CLOSING] = TCP_CLOSING,
				2086	[TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */
				2087	};
				2088
				2089	static int tcp_close_state(struct sock *sk)
				2090	{
				2091	int next = (int)new_state[sk->sk_state];
				2092	int ns = next & TCP_STATE_MASK;
				2093
				2094	tcp_set_state(sk, ns);
				2095
				2096	return next & TCP_ACTION_FIN;
				2097	}
				2098
				2099	/*
				2100	* Shutdown the sending side of a connection. Much like close except
				2101	* that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD).
				2102	*/
				2103
				2104	void tcp_shutdown(struct sock *sk, int how)
				2105	{
				2106	/* We need to grab some memory, and put together a FIN,
				2107	* and then put it into the queue to be sent.
				2108	* Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
				2109	*/
				2110	if (!(how & SEND_SHUTDOWN))
				2111	return;
				2112
				2113	/* If we've already sent a FIN, or it's a closed state, skip this. */
				2114	if ((1 << sk->sk_state) &
				2115	(TCPF_ESTABLISHED \| TCPF_SYN_SENT \|
				2116	TCPF_SYN_RECV \| TCPF_CLOSE_WAIT)) {
				2117	/* Clear out any half completed packets. FIN if needed. */
				2118	if (tcp_close_state(sk))
				2119	tcp_send_fin(sk);
				2120	}
				2121	}
				2122	EXPORT_SYMBOL(tcp_shutdown);
				2123
				2124	bool tcp_check_oom(struct sock *sk, int shift)
				2125	{
				2126	bool too_many_orphans, out_of_socket_memory;
				2127
				2128	too_many_orphans = tcp_too_many_orphans(sk, shift);
				2129	out_of_socket_memory = tcp_out_of_memory(sk);
				2130
				2131	if (too_many_orphans)
				2132	net_info_ratelimited("too many orphaned sockets\n");
				2133	if (out_of_socket_memory)
				2134	net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
				2135	return too_many_orphans \|\| out_of_socket_memory;
				2136	}
				2137
				2138	void tcp_close(struct sock *sk, long timeout)
				2139	{
				2140	struct sk_buff *skb;
				2141	int data_was_unread = 0;
				2142	int state;
				2143
				2144	lock_sock(sk);
				2145	sk->sk_shutdown = SHUTDOWN_MASK;
				2146
				2147	if (sk->sk_state == TCP_LISTEN) {
				2148	tcp_set_state(sk, TCP_CLOSE);
				2149
				2150	/* Special case. */
				2151	inet_csk_listen_stop(sk);
				2152
				2153	goto adjudge_to_death;
				2154	}
				2155
				2156	/* We need to flush the recv. buffs. We do this only on the
				2157	* descriptor close, not protocol-sourced closes, because the
				2158	* reader process may not have drained the data yet!
				2159	*/
				2160	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
				2161	u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;
				2162
				2163	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
				2164	len--;
				2165	data_was_unread += len;
				2166	__kfree_skb(skb);
				2167	}
				2168
				2169	sk_mem_reclaim(sk);
				2170
				2171	/* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
				2172	if (sk->sk_state == TCP_CLOSE)
				2173	goto adjudge_to_death;
				2174
				2175	/* As outlined in RFC 2525, section 2.17, we send a RST here because
				2176	* data was lost. To witness the awful effects of the old behavior of
				2177	* always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
				2178	* GET in an FTP client, suspend the process, wait for the client to
				2179	* advertise a zero window, then kill -9 the FTP client, wheee...
				2180	* Note: timeout is always zero in such a case.
				2181	*/
				2182	if (unlikely(tcp_sk(sk)->repair)) {
				2183	sk->sk_prot->disconnect(sk, 0);
				2184	} else if (data_was_unread) {
				2185	/* Unread data was tossed, zap the connection. */
				2186	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
				2187	tcp_set_state(sk, TCP_CLOSE);
				2188	tcp_send_active_reset(sk, sk->sk_allocation);
				2189	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
				2190	/* Check zero linger _after_ checking for unread data. */
				2191	sk->sk_prot->disconnect(sk, 0);
				2192	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
				2193	} else if (tcp_close_state(sk)) {
				2194	/* We FIN if the application ate all the data before
				2195	* zapping the connection.
				2196	*/
				2197
				2198	/* RED-PEN. Formally speaking, we have broken TCP state
				2199	* machine. State transitions:
				2200	*
				2201	* TCP_ESTABLISHED -> TCP_FIN_WAIT1
				2202	* TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
				2203	* TCP_CLOSE_WAIT -> TCP_LAST_ACK
				2204	*
				2205	* are legal only when FIN has been sent (i.e. in window),
				2206	* rather than queued out of window. Purists blame.
				2207	*
				2208	* F.e. "RFC state" is ESTABLISHED,
				2209	* if Linux state is FIN-WAIT-1, but FIN is still not sent.
				2210	*
				2211	* The visible declinations are that sometimes
				2212	* we enter time-wait state, when it is not required really
				2213	* (harmless), do not send active resets, when they are
				2214	* required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
				2215	* they look as CLOSING or LAST_ACK for Linux)
				2216	* Probably, I missed some more holelets.
				2217	* --ANK
				2218	* XXX (TFO) - To start off we don't support SYN+ACK+FIN
				2219	* in a single packet! (May consider it later but will
				2220	* probably need API support or TCP_CORK SYN-ACK until
				2221	* data is written and socket is closed.)
				2222	*/
				2223	tcp_send_fin(sk);
				2224	}
				2225
				2226	sk_stream_wait_close(sk, timeout);
				2227
				2228	adjudge_to_death:
				2229	state = sk->sk_state;
				2230	sock_hold(sk);
				2231	sock_orphan(sk);
				2232
				2233	local_bh_disable();
				2234	bh_lock_sock(sk);
				2235	/* remove backlog if any, without releasing ownership. */
				2236	__release_sock(sk);
				2237
				2238	percpu_counter_inc(sk->sk_prot->orphan_count);
				2239
				2240	/* Have we already been destroyed by a softirq or backlog? */
				2241	if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
				2242	goto out;
				2243
				2244	/* This is a (useful) BSD violating of the RFC. There is a
				2245	* problem with TCP as specified in that the other end could
				2246	* keep a socket open forever with no application left this end.
				2247	* We use a 1 minute timeout (about the same as BSD) then kill
				2248	* our end. If they send after that then tough - BUT: long enough
				2249	* that we won't make the old 4*rto = almost no time - whoops
				2250	* reset mistake.
				2251	*
				2252	* Nope, it was not mistake. It is really desired behaviour
				2253	* f.e. on http servers, when such sockets are useless, but
				2254	* consume significant resources. Let's do it with special
				2255	* linger2 option. --ANK
				2256	*/
				2257
				2258	if (sk->sk_state == TCP_FIN_WAIT2) {
				2259	struct tcp_sock *tp = tcp_sk(sk);
				2260	if (tp->linger2 < 0) {
				2261	tcp_set_state(sk, TCP_CLOSE);
				2262	tcp_send_active_reset(sk, GFP_ATOMIC);
				2263	__NET_INC_STATS(sock_net(sk),
				2264	LINUX_MIB_TCPABORTONLINGER);
				2265	} else {
				2266	const int tmo = tcp_fin_time(sk);
				2267
				2268	if (tmo > TCP_TIMEWAIT_LEN) {
				2269	inet_csk_reset_keepalive_timer(sk,
				2270	tmo - TCP_TIMEWAIT_LEN);
				2271	} else {
				2272	tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
				2273	goto out;
				2274	}
				2275	}
				2276	}
				2277	if (sk->sk_state != TCP_CLOSE) {
				2278	sk_mem_reclaim(sk);
				2279	if (tcp_check_oom(sk, 0)) {
				2280	tcp_set_state(sk, TCP_CLOSE);
				2281	tcp_send_active_reset(sk, GFP_ATOMIC);
				2282	__NET_INC_STATS(sock_net(sk),
				2283	LINUX_MIB_TCPABORTONMEMORY);
				2284	} else if (!check_net(sock_net(sk))) {
				2285	/* Not possible to send reset; just close */
				2286	tcp_set_state(sk, TCP_CLOSE);
				2287	}
				2288	}
				2289
				2290	if (sk->sk_state == TCP_CLOSE) {
				2291	struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
				2292	/* We could get here with a non-NULL req if the socket is
				2293	* aborted (e.g., closed with unread data) before 3WHS
				2294	* finishes.
				2295	*/
				2296	if (req)
				2297	reqsk_fastopen_remove(sk, req, false);
				2298	inet_csk_destroy_sock(sk);
				2299	}
				2300	/* Otherwise, socket is reprieved until protocol close. */
				2301
				2302	out:
				2303	bh_unlock_sock(sk);
				2304	local_bh_enable();
				2305	release_sock(sk);
				2306	sock_put(sk);
				2307	}
				2308	EXPORT_SYMBOL(tcp_close);
				2309
				2310	/* These states need RST on ABORT according to RFC793 */
				2311
				2312	static inline bool tcp_need_reset(int state)
				2313	{
				2314	return (1 << state) &
				2315	(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT \| TCPF_FIN_WAIT1 \|
				2316	TCPF_FIN_WAIT2 \| TCPF_SYN_RECV);
				2317	}
				2318
				2319	int tcp_disconnect(struct sock *sk, int flags)
				2320	{
				2321	struct inet_sock *inet = inet_sk(sk);
				2322	struct inet_connection_sock *icsk = inet_csk(sk);
				2323	struct tcp_sock *tp = tcp_sk(sk);
				2324	int err = 0;
				2325	int old_state = sk->sk_state;
				2326
				2327	if (old_state != TCP_CLOSE)
				2328	tcp_set_state(sk, TCP_CLOSE);
				2329
				2330	/* ABORT function of RFC793 */
				2331	if (old_state == TCP_LISTEN) {
				2332	inet_csk_listen_stop(sk);
				2333	} else if (unlikely(tp->repair)) {
				2334	sk->sk_err = ECONNABORTED;
				2335	} else if (tcp_need_reset(old_state) \|\|
				2336	(tp->snd_nxt != tp->write_seq &&
				2337	(1 << old_state) & (TCPF_CLOSING \| TCPF_LAST_ACK))) {
				2338	/* The last check adjusts for discrepancy of Linux wrt. RFC
				2339	* states
				2340	*/
				2341	tcp_send_active_reset(sk, gfp_any());
				2342	sk->sk_err = ECONNRESET;
				2343	} else if (old_state == TCP_SYN_SENT)
				2344	sk->sk_err = ECONNRESET;
				2345
				2346	tcp_clear_xmit_timers(sk);
				2347	__skb_queue_purge(&sk->sk_receive_queue);
				2348	tcp_write_queue_purge(sk);
				2349	tcp_fastopen_active_disable_ofo_check(sk);
				2350	skb_rbtree_purge(&tp->out_of_order_queue);
				2351
				2352	inet->inet_dport = 0;
				2353
				2354	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
				2355	inet_reset_saddr(sk);
				2356
				2357	sk->sk_shutdown = 0;
				2358	sock_reset_flag(sk, SOCK_DONE);
				2359	tp->srtt_us = 0;
				2360	tp->write_seq += tp->max_window + 2;
				2361	if (tp->write_seq == 0)
				2362	tp->write_seq = 1;
				2363	tp->snd_cwnd = 2;
				2364	icsk->icsk_probes_out = 0;
				2365	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
				2366	tp->snd_cwnd_cnt = 0;
				2367	tp->window_clamp = 0;
				2368	tp->delivered = 0;
				2369	if (icsk->icsk_ca_ops->release)
				2370	icsk->icsk_ca_ops->release(sk);
				2371	memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
				2372	tcp_set_ca_state(sk, TCP_CA_Open);
				2373	tp->is_sack_reneg = 0;
				2374	tcp_clear_retrans(tp);
				2375	tp->total_retrans = 0;
				2376	inet_csk_delack_init(sk);
				2377	/* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0
				2378	* issue in __tcp_select_window()
				2379	*/
				2380	icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
				2381	tcp_init_send_head(sk);
				2382	memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
				2383	__sk_dst_reset(sk);
				2384	dst_release(sk->sk_rx_dst);
				2385	sk->sk_rx_dst = NULL;
				2386	tcp_saved_syn_free(tp);
				2387	tp->segs_in = 0;
				2388	tp->segs_out = 0;
				2389	tp->bytes_acked = 0;
				2390	tp->bytes_received = 0;
				2391	tp->data_segs_in = 0;
				2392	tp->data_segs_out = 0;
				2393
				2394	/* Clean up fastopen related fields */
				2395	tcp_free_fastopen_req(tp);
				2396	inet->defer_connect = 0;
				2397
				2398	WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
				2399
				2400	if (sk->sk_frag.page) {
				2401	put_page(sk->sk_frag.page);
				2402	sk->sk_frag.page = NULL;
				2403	sk->sk_frag.offset = 0;
				2404	}
				2405
				2406	sk->sk_error_report(sk);
				2407	return err;
				2408	}
				2409	EXPORT_SYMBOL(tcp_disconnect);
				2410
				2411	static inline bool tcp_can_repair_sock(const struct sock *sk)
				2412	{
				2413	return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
				2414	(sk->sk_state != TCP_LISTEN);
				2415	}
				2416
				2417	static int tcp_repair_set_window(struct tcp_sock tp, char __user optbuf, int len)
				2418	{
				2419	struct tcp_repair_window opt;
				2420
				2421	if (!tp->repair)
				2422	return -EPERM;
				2423
				2424	if (len != sizeof(opt))
				2425	return -EINVAL;
				2426
				2427	if (copy_from_user(&opt, optbuf, sizeof(opt)))
				2428	return -EFAULT;
				2429
				2430	if (opt.max_window < opt.snd_wnd)
				2431	return -EINVAL;
				2432
				2433	if (after(opt.snd_wl1, tp->rcv_nxt + opt.rcv_wnd))
				2434	return -EINVAL;
				2435
				2436	if (after(opt.rcv_wup, tp->rcv_nxt))
				2437	return -EINVAL;
				2438
				2439	tp->snd_wl1 = opt.snd_wl1;
				2440	tp->snd_wnd = opt.snd_wnd;
				2441	tp->max_window = opt.max_window;
				2442
				2443	tp->rcv_wnd = opt.rcv_wnd;
				2444	tp->rcv_wup = opt.rcv_wup;
				2445
				2446	return 0;
				2447	}
				2448
				2449	static int tcp_repair_options_est(struct sock *sk,
				2450	struct tcp_repair_opt __user *optbuf, unsigned int len)
				2451	{
				2452	struct tcp_sock *tp = tcp_sk(sk);
				2453	struct tcp_repair_opt opt;
				2454
				2455	while (len >= sizeof(opt)) {
				2456	if (copy_from_user(&opt, optbuf, sizeof(opt)))
				2457	return -EFAULT;
				2458
				2459	optbuf++;
				2460	len -= sizeof(opt);
				2461
				2462	switch (opt.opt_code) {
				2463	case TCPOPT_MSS:
				2464	tp->rx_opt.mss_clamp = opt.opt_val;
				2465	tcp_mtup_init(sk);
				2466	break;
				2467	case TCPOPT_WINDOW:
				2468	{
				2469	u16 snd_wscale = opt.opt_val & 0xFFFF;
				2470	u16 rcv_wscale = opt.opt_val >> 16;
				2471
				2472	if (snd_wscale > TCP_MAX_WSCALE \|\| rcv_wscale > TCP_MAX_WSCALE)
				2473	return -EFBIG;
				2474
				2475	tp->rx_opt.snd_wscale = snd_wscale;
				2476	tp->rx_opt.rcv_wscale = rcv_wscale;
				2477	tp->rx_opt.wscale_ok = 1;
				2478	}
				2479	break;
				2480	case TCPOPT_SACK_PERM:
				2481	if (opt.opt_val != 0)
				2482	return -EINVAL;
				2483
				2484	tp->rx_opt.sack_ok \|= TCP_SACK_SEEN;
				2485	if (sysctl_tcp_fack)
				2486	tcp_enable_fack(tp);
				2487	break;
				2488	case TCPOPT_TIMESTAMP:
				2489	if (opt.opt_val != 0)
				2490	return -EINVAL;
				2491
				2492	tp->rx_opt.tstamp_ok = 1;
				2493	break;
				2494	}
				2495	}
				2496
				2497	return 0;
				2498	}
				2499
				2500	/*
				2501	* Socket option code for TCP.
				2502	*/
				2503	static int do_tcp_setsockopt(struct sock *sk, int level,
				2504	int optname, char __user *optval, unsigned int optlen)
				2505	{
				2506	struct tcp_sock *tp = tcp_sk(sk);
				2507	struct inet_connection_sock *icsk = inet_csk(sk);
				2508	struct net *net = sock_net(sk);
				2509	int val;
				2510	int err = 0;
				2511
				2512	/* These are data/string values, all the others are ints */
				2513	switch (optname) {
				2514	case TCP_CONGESTION: {
				2515	char name[TCP_CA_NAME_MAX];
				2516
				2517	if (optlen < 1)
				2518	return -EINVAL;
				2519
				2520	val = strncpy_from_user(name, optval,
				2521	min_t(long, TCP_CA_NAME_MAX-1, optlen));
				2522	if (val < 0)
				2523	return -EFAULT;
				2524	name[val] = 0;
				2525
				2526	lock_sock(sk);
				2527	err = tcp_set_congestion_control(sk, name, true, true,
				2528	ns_capable(sock_net(sk)->user_ns,
				2529	CAP_NET_ADMIN));
				2530	release_sock(sk);
				2531	return err;
				2532	}
				2533	case TCP_ULP: {
				2534	char name[TCP_ULP_NAME_MAX];
				2535
				2536	if (optlen < 1)
				2537	return -EINVAL;
				2538
				2539	val = strncpy_from_user(name, optval,
				2540	min_t(long, TCP_ULP_NAME_MAX - 1,
				2541	optlen));
				2542	if (val < 0)
				2543	return -EFAULT;
				2544	name[val] = 0;
				2545
				2546	lock_sock(sk);
				2547	err = tcp_set_ulp(sk, name);
				2548	release_sock(sk);
				2549	return err;
				2550	}
				2551	default:
				2552	/* fallthru */
				2553	break;
				2554	}
				2555
				2556	if (optlen < sizeof(int))
				2557	return -EINVAL;
				2558
				2559	if (get_user(val, (int __user *)optval))
				2560	return -EFAULT;
				2561
				2562	lock_sock(sk);
				2563
				2564	switch (optname) {
				2565	case TCP_MAXSEG:
				2566	/* Values greater than interface MTU won't take effect. However
				2567	* at the point when this call is done we typically don't yet
				2568	* know which interface is going to be used
				2569	*/
				2570	if (val && (val < TCP_MIN_MSS \|\| val > MAX_TCP_WINDOW)) {
				2571	err = -EINVAL;
				2572	break;
				2573	}
				2574	tp->rx_opt.user_mss = val;
				2575	break;
				2576
				2577	case TCP_NODELAY:
				2578	if (val) {
				2579	/* TCP_NODELAY is weaker than TCP_CORK, so that
				2580	* this option on corked socket is remembered, but
				2581	* it is not activated until cork is cleared.
				2582	*
				2583	* However, when TCP_NODELAY is set we make
				2584	* an explicit push, which overrides even TCP_CORK
				2585	* for currently queued segments.
				2586	*/
				2587	tp->nonagle \|= TCP_NAGLE_OFF\|TCP_NAGLE_PUSH;
				2588	tcp_push_pending_frames(sk);
				2589	} else {
				2590	tp->nonagle &= ~TCP_NAGLE_OFF;
				2591	}
				2592	break;
				2593
				2594	case TCP_THIN_LINEAR_TIMEOUTS:
				2595	if (val < 0 \|\| val > 1)
				2596	err = -EINVAL;
				2597	else
				2598	tp->thin_lto = val;
				2599	break;
				2600
				2601	case TCP_THIN_DUPACK:
				2602	if (val < 0 \|\| val > 1)
				2603	err = -EINVAL;
				2604	break;
				2605
				2606	case TCP_REPAIR:
				2607	if (!tcp_can_repair_sock(sk))
				2608	err = -EPERM;
				2609	else if (val == 1) {
				2610	tp->repair = 1;
				2611	sk->sk_reuse = SK_FORCE_REUSE;
				2612	tp->repair_queue = TCP_NO_QUEUE;
				2613	} else if (val == 0) {
				2614	tp->repair = 0;
				2615	sk->sk_reuse = SK_NO_REUSE;
				2616	tcp_send_window_probe(sk);
				2617	} else
				2618	err = -EINVAL;
				2619
				2620	break;
				2621
				2622	case TCP_REPAIR_QUEUE:
				2623	if (!tp->repair)
				2624	err = -EPERM;
				2625	else if ((unsigned int)val < TCP_QUEUES_NR)
				2626	tp->repair_queue = val;
				2627	else
				2628	err = -EINVAL;
				2629	break;
				2630
				2631	case TCP_QUEUE_SEQ:
				2632	if (sk->sk_state != TCP_CLOSE)
				2633	err = -EPERM;
				2634	else if (tp->repair_queue == TCP_SEND_QUEUE)
				2635	tp->write_seq = val;
				2636	else if (tp->repair_queue == TCP_RECV_QUEUE)
				2637	tp->rcv_nxt = val;
				2638	else
				2639	err = -EINVAL;
				2640	break;
				2641
				2642	case TCP_REPAIR_OPTIONS:
				2643	if (!tp->repair)
				2644	err = -EINVAL;
				2645	else if (sk->sk_state == TCP_ESTABLISHED)
				2646	err = tcp_repair_options_est(sk,
				2647	(struct tcp_repair_opt __user *)optval,
				2648	optlen);
				2649	else
				2650	err = -EPERM;
				2651	break;
				2652
				2653	case TCP_CORK:
				2654	/* When set indicates to always queue non-full frames.
				2655	* Later the user clears this option and we transmit
				2656	* any pending partial frames in the queue. This is
				2657	* meant to be used alongside sendfile() to get properly
				2658	* filled frames when the user (for example) must write
				2659	* out headers with a write() call first and then use
				2660	* sendfile to send out the data parts.
				2661	*
				2662	* TCP_CORK can be set together with TCP_NODELAY and it is
				2663	* stronger than TCP_NODELAY.
				2664	*/
				2665	if (val) {
				2666	tp->nonagle \|= TCP_NAGLE_CORK;
				2667	} else {
				2668	tp->nonagle &= ~TCP_NAGLE_CORK;
				2669	if (tp->nonagle&TCP_NAGLE_OFF)
				2670	tp->nonagle \|= TCP_NAGLE_PUSH;
				2671	tcp_push_pending_frames(sk);
				2672	}
				2673	break;
				2674
				2675	case TCP_KEEPIDLE:
				2676	if (val < 1 \|\| val > MAX_TCP_KEEPIDLE)
				2677	err = -EINVAL;
				2678	else {
				2679	tp->keepalive_time = val * HZ;
				2680	if (sock_flag(sk, SOCK_KEEPOPEN) &&
				2681	!((1 << sk->sk_state) &
				2682	(TCPF_CLOSE \| TCPF_LISTEN))) {
				2683	u32 elapsed = keepalive_time_elapsed(tp);
				2684	if (tp->keepalive_time > elapsed)
				2685	elapsed = tp->keepalive_time - elapsed;
				2686	else
				2687	elapsed = 0;
				2688	inet_csk_reset_keepalive_timer(sk, elapsed);
				2689	}
				2690	}
				2691	break;
				2692	case TCP_KEEPINTVL:
				2693	if (val < 1 \|\| val > MAX_TCP_KEEPINTVL)
				2694	err = -EINVAL;
				2695	else
				2696	tp->keepalive_intvl = val * HZ;
				2697	break;
				2698	case TCP_KEEPCNT:
				2699	if (val < 1 \|\| val > MAX_TCP_KEEPCNT)
				2700	err = -EINVAL;
				2701	else
				2702	tp->keepalive_probes = val;
				2703	break;
				2704	case TCP_SYNCNT:
				2705	if (val < 1 \|\| val > MAX_TCP_SYNCNT)
				2706	err = -EINVAL;
				2707	else
				2708	icsk->icsk_syn_retries = val;
				2709	break;
				2710
				2711	case TCP_SAVE_SYN:
				2712	if (val < 0 \|\| val > 1)
				2713	err = -EINVAL;
				2714	else
				2715	tp->save_syn = val;
				2716	break;
				2717
				2718	case TCP_LINGER2:
				2719	if (val < 0)
				2720	tp->linger2 = -1;
				2721	else if (val > net->ipv4.sysctl_tcp_fin_timeout / HZ)
				2722	tp->linger2 = 0;
				2723	else
				2724	tp->linger2 = val * HZ;
				2725	break;
				2726
				2727	case TCP_DEFER_ACCEPT:
				2728	/* Translate value in seconds to number of retransmits */
				2729	icsk->icsk_accept_queue.rskq_defer_accept =
				2730	secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
				2731	TCP_RTO_MAX / HZ);
				2732	break;
				2733
				2734	case TCP_WINDOW_CLAMP:
				2735	if (!val) {
				2736	if (sk->sk_state != TCP_CLOSE) {
				2737	err = -EINVAL;
				2738	break;
				2739	}
				2740	tp->window_clamp = 0;
				2741	} else
				2742	tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
				2743	SOCK_MIN_RCVBUF / 2 : val;
				2744	break;
				2745
				2746	case TCP_QUICKACK:
				2747	if (!val) {
				2748	icsk->icsk_ack.pingpong = 1;
				2749	} else {
				2750	icsk->icsk_ack.pingpong = 0;
				2751	if ((1 << sk->sk_state) &
				2752	(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT) &&
				2753	inet_csk_ack_scheduled(sk)) {
				2754	icsk->icsk_ack.pending \|= ICSK_ACK_PUSHED;
				2755	tcp_cleanup_rbuf(sk, 1);
				2756	if (!(val & 1))
				2757	icsk->icsk_ack.pingpong = 1;
				2758	}
				2759	}
				2760	break;
				2761
				2762	#ifdef CONFIG_TCP_MD5SIG
				2763	case TCP_MD5SIG:
				2764	case TCP_MD5SIG_EXT:
				2765	err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
				2766	break;
				2767	#endif
				2768	case TCP_USER_TIMEOUT:
				2769	/* Cap the max time in ms TCP will retry or probe the window
				2770	* before giving up and aborting (ETIMEDOUT) a connection.
				2771	*/
				2772	if (val < 0)
				2773	err = -EINVAL;
				2774	else
				2775	icsk->icsk_user_timeout = msecs_to_jiffies(val);
				2776	break;
				2777
				2778	case TCP_FASTOPEN:
				2779	if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE \|
				2780	TCPF_LISTEN))) {
				2781	tcp_fastopen_init_key_once(true);
				2782
				2783	fastopen_queue_tune(sk, val);
				2784	} else {
				2785	err = -EINVAL;
				2786	}
				2787	break;
				2788	case TCP_FASTOPEN_CONNECT:
				2789	if (val > 1 \|\| val < 0) {
				2790	err = -EINVAL;
				2791	} else if (sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
				2792	if (sk->sk_state == TCP_CLOSE)
				2793	tp->fastopen_connect = val;
				2794	else
				2795	err = -EINVAL;
				2796	} else {
				2797	err = -EOPNOTSUPP;
				2798	}
				2799	break;
				2800	case TCP_TIMESTAMP:
				2801	if (!tp->repair)
				2802	err = -EPERM;
				2803	else
				2804	tp->tsoffset = val - tcp_time_stamp_raw();
				2805	break;
				2806	case TCP_REPAIR_WINDOW:
				2807	err = tcp_repair_set_window(tp, optval, optlen);
				2808	break;
				2809	case TCP_NOTSENT_LOWAT:
				2810	tp->notsent_lowat = val;
				2811	sk->sk_write_space(sk);
				2812	break;
				2813	default:
				2814	err = -ENOPROTOOPT;
				2815	break;
				2816	}
				2817
				2818	release_sock(sk);
				2819	return err;
				2820	}
				2821
				2822	int tcp_setsockopt(struct sock sk, int level, int optname, char __user optval,
				2823	unsigned int optlen)
				2824	{
				2825	const struct inet_connection_sock *icsk = inet_csk(sk);
				2826
				2827	if (level != SOL_TCP)
				2828	return icsk->icsk_af_ops->setsockopt(sk, level, optname,
				2829	optval, optlen);
				2830	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
				2831	}
				2832	EXPORT_SYMBOL(tcp_setsockopt);
				2833
				#ifdef CONFIG_COMPAT
				/*
				 * compat_tcp_setsockopt - setsockopt() entry point used under CONFIG_COMPAT
				 *
				 * SOL_TCP options go to do_tcp_setsockopt(); every other level is
				 * handed to inet_csk_compat_setsockopt(). The handler's result is
				 * returned unchanged.
				 */
				int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
							  char __user *optval, unsigned int optlen)
				{
					if (level == SOL_TCP)
						return do_tcp_setsockopt(sk, level, optname, optval, optlen);

					return inet_csk_compat_setsockopt(sk, level, optname,
									  optval, optlen);
				}
				EXPORT_SYMBOL(compat_tcp_setsockopt);
				#endif
				2845
				2846	static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
				2847	struct tcp_info *info)
				2848	{
				2849	u64 stats[__TCP_CHRONO_MAX], total = 0;
				2850	enum tcp_chrono i;
				2851
				2852	for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) {
				2853	stats[i] = tp->chrono_stat[i - 1];
				2854	if (i == tp->chrono_type)
				2855	stats[i] += tcp_jiffies32 - tp->chrono_start;
				2856	stats[i] *= USEC_PER_SEC / HZ;
				2857	total += stats[i];
				2858	}
				2859
				2860	info->tcpi_busy_time = total;
				2861	info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED];
				2862	info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED];
				2863	}
				2864
				2865	/* Return information about state of tcp endpoint in API format. */
				2866	void tcp_get_info(struct sock sk, struct tcp_info info)
				2867	{
				2868	const struct tcp_sock tp = tcp_sk(sk); / iff sk_type == SOCK_STREAM */
				2869	const struct inet_connection_sock *icsk = inet_csk(sk);
				2870	u32 now;
				2871	u64 rate64;
				2872	bool slow;
				2873	u32 rate;
				2874
				2875	memset(info, 0, sizeof(*info));
				2876	if (sk->sk_type != SOCK_STREAM)
				2877	return;
				2878
				2879	info->tcpi_state = sk_state_load(sk);
				2880
				2881	/* Report meaningful fields for all TCP states, including listeners */
				2882	rate = READ_ONCE(sk->sk_pacing_rate);
				2883	rate64 = rate != ~0U ? rate : ~0ULL;
				2884	info->tcpi_pacing_rate = rate64;
				2885
				2886	rate = READ_ONCE(sk->sk_max_pacing_rate);
				2887	rate64 = rate != ~0U ? rate : ~0ULL;
				2888	info->tcpi_max_pacing_rate = rate64;
				2889
				2890	info->tcpi_reordering = tp->reordering;
				2891	info->tcpi_snd_cwnd = tp->snd_cwnd;
				2892
				2893	if (info->tcpi_state == TCP_LISTEN) {
				2894	/* listeners aliased fields :
				2895	* tcpi_unacked -> Number of children ready for accept()
				2896	* tcpi_sacked -> max backlog
				2897	*/
				2898	info->tcpi_unacked = sk->sk_ack_backlog;
				2899	info->tcpi_sacked = sk->sk_max_ack_backlog;
				2900	return;
				2901	}
				2902
				2903	slow = lock_sock_fast(sk);
				2904
				2905	info->tcpi_ca_state = icsk->icsk_ca_state;
				2906	info->tcpi_retransmits = icsk->icsk_retransmits;
				2907	info->tcpi_probes = icsk->icsk_probes_out;
				2908	info->tcpi_backoff = icsk->icsk_backoff;
				2909
				2910	if (tp->rx_opt.tstamp_ok)
				2911	info->tcpi_options \|= TCPI_OPT_TIMESTAMPS;
				2912	if (tcp_is_sack(tp))
				2913	info->tcpi_options \|= TCPI_OPT_SACK;
				2914	if (tp->rx_opt.wscale_ok) {
				2915	info->tcpi_options \|= TCPI_OPT_WSCALE;
				2916	info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
				2917	info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
				2918	}
				2919
				2920	if (tp->ecn_flags & TCP_ECN_OK)
				2921	info->tcpi_options \|= TCPI_OPT_ECN;
				2922	if (tp->ecn_flags & TCP_ECN_SEEN)
				2923	info->tcpi_options \|= TCPI_OPT_ECN_SEEN;
				2924	if (tp->syn_data_acked)
				2925	info->tcpi_options \|= TCPI_OPT_SYN_DATA;
				2926
				2927	info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
				2928	info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
				2929	info->tcpi_snd_mss = tp->mss_cache;
				2930	info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
				2931
				2932	info->tcpi_unacked = tp->packets_out;
				2933	info->tcpi_sacked = tp->sacked_out;
				2934
				2935	info->tcpi_lost = tp->lost_out;
				2936	info->tcpi_retrans = tp->retrans_out;
				2937	info->tcpi_fackets = tp->fackets_out;
				2938
				2939	now = tcp_jiffies32;
				2940	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
				2941	info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
				2942	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
				2943
				2944	info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
				2945	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
				2946	info->tcpi_rtt = tp->srtt_us >> 3;
				2947	info->tcpi_rttvar = tp->mdev_us >> 2;
				2948	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
				2949	info->tcpi_advmss = tp->advmss;
				2950
				2951	info->tcpi_rcv_rtt = tp->rcv_rtt_est.rtt_us >> 3;
				2952	info->tcpi_rcv_space = tp->rcvq_space.space;
				2953
				2954	info->tcpi_total_retrans = tp->total_retrans;
				2955
				2956	info->tcpi_bytes_acked = tp->bytes_acked;
				2957	info->tcpi_bytes_received = tp->bytes_received;
				2958	info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);
				2959	tcp_get_info_chrono_stats(tp, info);
				2960
				2961	info->tcpi_segs_out = tp->segs_out;
				2962	info->tcpi_segs_in = tp->segs_in;
				2963
				2964	info->tcpi_min_rtt = tcp_min_rtt(tp);
				2965	info->tcpi_data_segs_in = tp->data_segs_in;
				2966	info->tcpi_data_segs_out = tp->data_segs_out;
				2967
				2968	info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
				2969	rate64 = tcp_compute_delivery_rate(tp);
				2970	if (rate64)
				2971	info->tcpi_delivery_rate = rate64;
				2972	unlock_sock_fast(sk, slow);
				2973	}
				2974	EXPORT_SYMBOL_GPL(tcp_get_info);
				2975
				2976	struct sk_buff tcp_get_timestamping_opt_stats(const struct sock sk)
				2977	{
				2978	const struct tcp_sock *tp = tcp_sk(sk);
				2979	struct sk_buff *stats;
				2980	struct tcp_info info;
				2981	u64 rate64;
				2982	u32 rate;
				2983
				2984	stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) +
				2985	3 * nla_total_size(sizeof(u32)) +
				2986	2 * nla_total_size(sizeof(u8)), GFP_ATOMIC);
				2987	if (!stats)
				2988	return NULL;
				2989
				2990	tcp_get_info_chrono_stats(tp, &info);
				2991	nla_put_u64_64bit(stats, TCP_NLA_BUSY,
				2992	info.tcpi_busy_time, TCP_NLA_PAD);
				2993	nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED,
				2994	info.tcpi_rwnd_limited, TCP_NLA_PAD);
				2995	nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED,
				2996	info.tcpi_sndbuf_limited, TCP_NLA_PAD);
				2997	nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT,
				2998	tp->data_segs_out, TCP_NLA_PAD);
				2999	nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS,
				3000	tp->total_retrans, TCP_NLA_PAD);
				3001
				3002	rate = READ_ONCE(sk->sk_pacing_rate);
				3003	rate64 = rate != ~0U ? rate : ~0ULL;
				3004	nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD);
				3005
				3006	rate64 = tcp_compute_delivery_rate(tp);
				3007	nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD);
				3008
				3009	nla_put_u32(stats, TCP_NLA_SND_CWND, tp->snd_cwnd);
				3010	nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering);
				3011	nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp));
				3012
				3013	nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits);
				3014	nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
				3015	return stats;
				3016	}
				3017
				3018	static int do_tcp_getsockopt(struct sock *sk, int level,
				3019	int optname, char __user optval, int __user optlen)
				3020	{
				3021	struct inet_connection_sock *icsk = inet_csk(sk);
				3022	struct tcp_sock *tp = tcp_sk(sk);
				3023	struct net *net = sock_net(sk);
				3024	int val, len;
				3025
				3026	if (get_user(len, optlen))
				3027	return -EFAULT;
				3028
				3029	len = min_t(unsigned int, len, sizeof(int));
				3030
				3031	if (len < 0)
				3032	return -EINVAL;
				3033
				3034	switch (optname) {
				3035	case TCP_MAXSEG:
				3036	val = tp->mss_cache;
				3037	if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE \| TCPF_LISTEN)))
				3038	val = tp->rx_opt.user_mss;
				3039	if (tp->repair)
				3040	val = tp->rx_opt.mss_clamp;
				3041	break;
				3042	case TCP_NODELAY:
				3043	val = !!(tp->nonagle&TCP_NAGLE_OFF);
				3044	break;
				3045	case TCP_CORK:
				3046	val = !!(tp->nonagle&TCP_NAGLE_CORK);
				3047	break;
				3048	case TCP_KEEPIDLE:
				3049	val = keepalive_time_when(tp) / HZ;
				3050	break;
				3051	case TCP_KEEPINTVL:
				3052	val = keepalive_intvl_when(tp) / HZ;
				3053	break;
				3054	case TCP_KEEPCNT:
				3055	val = keepalive_probes(tp);
				3056	break;
				3057	case TCP_SYNCNT:
				3058	val = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
				3059	break;
				3060	case TCP_LINGER2:
				3061	val = tp->linger2;
				3062	if (val >= 0)
				3063	val = (val ? : net->ipv4.sysctl_tcp_fin_timeout) / HZ;
				3064	break;
				3065	case TCP_DEFER_ACCEPT:
				3066	val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
				3067	TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
				3068	break;
				3069	case TCP_WINDOW_CLAMP:
				3070	val = tp->window_clamp;
				3071	break;
				3072	case TCP_INFO: {
				3073	struct tcp_info info;
				3074
				3075	if (get_user(len, optlen))
				3076	return -EFAULT;
				3077
				3078	tcp_get_info(sk, &info);
				3079
				3080	len = min_t(unsigned int, len, sizeof(info));
				3081	if (put_user(len, optlen))
				3082	return -EFAULT;
				3083	if (copy_to_user(optval, &info, len))
				3084	return -EFAULT;
				3085	return 0;
				3086	}
				3087	case TCP_CC_INFO: {
				3088	const struct tcp_congestion_ops *ca_ops;
				3089	union tcp_cc_info info;
				3090	size_t sz = 0;
				3091	int attr;
				3092
				3093	if (get_user(len, optlen))
				3094	return -EFAULT;
				3095
				3096	ca_ops = icsk->icsk_ca_ops;
				3097	if (ca_ops && ca_ops->get_info)
				3098	sz = ca_ops->get_info(sk, ~0U, &attr, &info);
				3099
				3100	len = min_t(unsigned int, len, sz);
				3101	if (put_user(len, optlen))
				3102	return -EFAULT;
				3103	if (copy_to_user(optval, &info, len))
				3104	return -EFAULT;
				3105	return 0;
				3106	}
				3107	case TCP_QUICKACK:
				3108	val = !icsk->icsk_ack.pingpong;
				3109	break;
				3110
				3111	case TCP_CONGESTION:
				3112	if (get_user(len, optlen))
				3113	return -EFAULT;
				3114	len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
				3115	if (put_user(len, optlen))
				3116	return -EFAULT;
				3117	if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
				3118	return -EFAULT;
				3119	return 0;
				3120
				3121	case TCP_ULP:
				3122	if (get_user(len, optlen))
				3123	return -EFAULT;
				3124	len = min_t(unsigned int, len, TCP_ULP_NAME_MAX);
				3125	if (!icsk->icsk_ulp_ops) {
				3126	if (put_user(0, optlen))
				3127	return -EFAULT;
				3128	return 0;
				3129	}
				3130	if (put_user(len, optlen))
				3131	return -EFAULT;
				3132	if (copy_to_user(optval, icsk->icsk_ulp_ops->name, len))
				3133	return -EFAULT;
				3134	return 0;
				3135
				3136	case TCP_THIN_LINEAR_TIMEOUTS:
				3137	val = tp->thin_lto;
				3138	break;
				3139
				3140	case TCP_THIN_DUPACK:
				3141	val = 0;
				3142	break;
				3143
				3144	case TCP_REPAIR:
				3145	val = tp->repair;
				3146	break;
				3147
				3148	case TCP_REPAIR_QUEUE:
				3149	if (tp->repair)
				3150	val = tp->repair_queue;
				3151	else
				3152	return -EINVAL;
				3153	break;
				3154
				3155	case TCP_REPAIR_WINDOW: {
				3156	struct tcp_repair_window opt;
				3157
				3158	if (get_user(len, optlen))
				3159	return -EFAULT;
				3160
				3161	if (len != sizeof(opt))
				3162	return -EINVAL;
				3163
				3164	if (!tp->repair)
				3165	return -EPERM;
				3166
				3167	opt.snd_wl1 = tp->snd_wl1;
				3168	opt.snd_wnd = tp->snd_wnd;
				3169	opt.max_window = tp->max_window;
				3170	opt.rcv_wnd = tp->rcv_wnd;
				3171	opt.rcv_wup = tp->rcv_wup;
				3172
				3173	if (copy_to_user(optval, &opt, len))
				3174	return -EFAULT;
				3175	return 0;
				3176	}
				3177	case TCP_QUEUE_SEQ:
				3178	if (tp->repair_queue == TCP_SEND_QUEUE)
				3179	val = tp->write_seq;
				3180	else if (tp->repair_queue == TCP_RECV_QUEUE)
				3181	val = tp->rcv_nxt;
				3182	else
				3183	return -EINVAL;
				3184	break;
				3185
				3186	case TCP_USER_TIMEOUT:
				3187	val = jiffies_to_msecs(icsk->icsk_user_timeout);
				3188	break;
				3189
				3190	case TCP_FASTOPEN:
				3191	val = icsk->icsk_accept_queue.fastopenq.max_qlen;
				3192	break;
				3193
				3194	case TCP_FASTOPEN_CONNECT:
				3195	val = tp->fastopen_connect;
				3196	break;
				3197
				3198	case TCP_TIMESTAMP:
				3199	val = tcp_time_stamp_raw() + tp->tsoffset;
				3200	break;
				3201	case TCP_NOTSENT_LOWAT:
				3202	val = tp->notsent_lowat;
				3203	break;
				3204	case TCP_SAVE_SYN:
				3205	val = tp->save_syn;
				3206	break;
				3207	case TCP_SAVED_SYN: {
				3208	if (get_user(len, optlen))
				3209	return -EFAULT;
				3210
				3211	lock_sock(sk);
				3212	if (tp->saved_syn) {
				3213	if (len < tp->saved_syn[0]) {
				3214	if (put_user(tp->saved_syn[0], optlen)) {
				3215	release_sock(sk);
				3216	return -EFAULT;
				3217	}
				3218	release_sock(sk);
				3219	return -EINVAL;
				3220	}
				3221	len = tp->saved_syn[0];
				3222	if (put_user(len, optlen)) {
				3223	release_sock(sk);
				3224	return -EFAULT;
				3225	}
				3226	if (copy_to_user(optval, tp->saved_syn + 1, len)) {
				3227	release_sock(sk);
				3228	return -EFAULT;
				3229	}
				3230	tcp_saved_syn_free(tp);
				3231	release_sock(sk);
				3232	} else {
				3233	release_sock(sk);
				3234	len = 0;
				3235	if (put_user(len, optlen))
				3236	return -EFAULT;
				3237	}
				3238	return 0;
				3239	}
				3240	default:
				3241	return -ENOPROTOOPT;
				3242	}
				3243
				3244	if (put_user(len, optlen))
				3245	return -EFAULT;
				3246	if (copy_to_user(optval, &val, len))
				3247	return -EFAULT;
				3248	return 0;
				3249	}
				3250
				3251	int tcp_getsockopt(struct sock sk, int level, int optname, char __user optval,
				3252	int __user *optlen)
				3253	{
				3254	struct inet_connection_sock *icsk = inet_csk(sk);
				3255
				3256	if (level != SOL_TCP)
				3257	return icsk->icsk_af_ops->getsockopt(sk, level, optname,
				3258	optval, optlen);
				3259	return do_tcp_getsockopt(sk, level, optname, optval, optlen);
				3260	}
				3261	EXPORT_SYMBOL(tcp_getsockopt);
				3262
				#ifdef CONFIG_COMPAT
				/*
				 * CONFIG_COMPAT entry point for getsockopt() on a TCP socket.
				 *
				 * SOL_TCP options are served by do_tcp_getsockopt(); any other level
				 * is handed to inet_csk_compat_getsockopt().
				 */
				int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
							  char __user *optval, int __user *optlen)
				{
					if (level != SOL_TCP)
						return inet_csk_compat_getsockopt(sk, level, optname,
										  optval, optlen);
					return do_tcp_getsockopt(sk, level, optname, optval, optlen);
				}
				EXPORT_SYMBOL(compat_tcp_getsockopt);
				#endif
				3274
				3275	#ifdef CONFIG_TCP_MD5SIG
				3276	static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool);
				3277	static DEFINE_MUTEX(tcp_md5sig_mutex);
				3278	static bool tcp_md5sig_pool_populated = false;
				3279
				3280	static void __tcp_alloc_md5sig_pool(void)
				3281	{
				3282	struct crypto_ahash *hash;
				3283	int cpu;
				3284
				3285	hash = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
				3286	if (IS_ERR(hash))
				3287	return;
				3288
				3289	for_each_possible_cpu(cpu) {
				3290	void *scratch = per_cpu(tcp_md5sig_pool, cpu).scratch;
				3291	struct ahash_request *req;
				3292
				3293	if (!scratch) {
				3294	scratch = kmalloc_node(sizeof(union tcp_md5sum_block) +
				3295	sizeof(struct tcphdr),
				3296	GFP_KERNEL,
				3297	cpu_to_node(cpu));
				3298	if (!scratch)
				3299	return;
				3300	per_cpu(tcp_md5sig_pool, cpu).scratch = scratch;
				3301	}
				3302	if (per_cpu(tcp_md5sig_pool, cpu).md5_req)
				3303	continue;
				3304
				3305	req = ahash_request_alloc(hash, GFP_KERNEL);
				3306	if (!req)
				3307	return;
				3308
				3309	ahash_request_set_callback(req, 0, NULL, NULL);
				3310
				3311	per_cpu(tcp_md5sig_pool, cpu).md5_req = req;
				3312	}
				3313	/* before setting tcp_md5sig_pool_populated, we must commit all writes
				3314	* to memory. See smp_rmb() in tcp_get_md5sig_pool()
				3315	*/
				3316	smp_wmb();
				3317	tcp_md5sig_pool_populated = true;
				3318	}
				3319
				3320	bool tcp_alloc_md5sig_pool(void)
				3321	{
				3322	if (unlikely(!tcp_md5sig_pool_populated)) {
				3323	mutex_lock(&tcp_md5sig_mutex);
				3324
				3325	if (!tcp_md5sig_pool_populated)
				3326	__tcp_alloc_md5sig_pool();
				3327
				3328	mutex_unlock(&tcp_md5sig_mutex);
				3329	}
				3330	return tcp_md5sig_pool_populated;
				3331	}
				3332	EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
				3333
				3334
				3335	/**
				3336	* tcp_get_md5sig_pool - get md5sig_pool for this user
				3337	*
				3338	* We use percpu structure, so if we succeed, we exit with preemption
				3339	* and BH disabled, to make sure another thread or softirq handling
				3340	* wont try to get same context.
				3341	*/
				3342	struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
				3343	{
				3344	local_bh_disable();
				3345
				3346	if (tcp_md5sig_pool_populated) {
				3347	/* coupled with smp_wmb() in __tcp_alloc_md5sig_pool() */
				3348	smp_rmb();
				3349	return this_cpu_ptr(&tcp_md5sig_pool);
				3350	}
				3351	local_bh_enable();
				3352	return NULL;
				3353	}
				3354	EXPORT_SYMBOL(tcp_get_md5sig_pool);
				3355
				3356	int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
				3357	const struct sk_buff *skb, unsigned int header_len)
				3358	{
				3359	struct scatterlist sg;
				3360	const struct tcphdr *tp = tcp_hdr(skb);
				3361	struct ahash_request *req = hp->md5_req;
				3362	unsigned int i;
				3363	const unsigned int head_data_len = skb_headlen(skb) > header_len ?
				3364	skb_headlen(skb) - header_len : 0;
				3365	const struct skb_shared_info *shi = skb_shinfo(skb);
				3366	struct sk_buff *frag_iter;
				3367
				3368	sg_init_table(&sg, 1);
				3369
				3370	sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
				3371	ahash_request_set_crypt(req, &sg, NULL, head_data_len);
				3372	if (crypto_ahash_update(req))
				3373	return 1;
				3374
				3375	for (i = 0; i < shi->nr_frags; ++i) {
				3376	const struct skb_frag_struct *f = &shi->frags[i];
				3377	unsigned int offset = f->page_offset;
				3378	struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
				3379
				3380	sg_set_page(&sg, page, skb_frag_size(f),
				3381	offset_in_page(offset));
				3382	ahash_request_set_crypt(req, &sg, NULL, skb_frag_size(f));
				3383	if (crypto_ahash_update(req))
				3384	return 1;
				3385	}
				3386
				3387	skb_walk_frags(skb, frag_iter)
				3388	if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
				3389	return 1;
				3390
				3391	return 0;
				3392	}
				3393	EXPORT_SYMBOL(tcp_md5_hash_skb_data);
				3394
				3395	int tcp_md5_hash_key(struct tcp_md5sig_pool hp, const struct tcp_md5sig_key key)
				3396	{
				3397	u8 keylen = READ_ONCE(key->keylen); /* paired with WRITE_ONCE() in tcp_md5_do_add */
				3398	struct scatterlist sg;
				3399
				3400	sg_init_one(&sg, key->key, keylen);
				3401	ahash_request_set_crypt(hp->md5_req, &sg, NULL, keylen);
				3402
				3403	/* tcp_md5_do_add() might change key->key under us */
				3404	return crypto_ahash_update(hp->md5_req);
				3405	}
				3406	EXPORT_SYMBOL(tcp_md5_hash_key);
				3407
				3408	#endif
				3409
				3410	void tcp_done(struct sock *sk)
				3411	{
				3412	struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
				3413
				3414	if (sk->sk_state == TCP_SYN_SENT \|\| sk->sk_state == TCP_SYN_RECV)
				3415	TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
				3416
				3417	tcp_set_state(sk, TCP_CLOSE);
				3418	tcp_clear_xmit_timers(sk);
				3419	if (req)
				3420	reqsk_fastopen_remove(sk, req, false);
				3421
				3422	sk->sk_shutdown = SHUTDOWN_MASK;
				3423
				3424	if (!sock_flag(sk, SOCK_DEAD))
				3425	sk->sk_state_change(sk);
				3426	else
				3427	inet_csk_destroy_sock(sk);
				3428	}
				3429	EXPORT_SYMBOL_GPL(tcp_done);
				3430
				3431	int tcp_abort(struct sock *sk, int err)
				3432	{
				3433	if (!sk_fullsock(sk)) {
				3434	if (sk->sk_state == TCP_NEW_SYN_RECV) {
				3435	struct request_sock *req = inet_reqsk(sk);
				3436
				3437	local_bh_disable();
				3438	inet_csk_reqsk_queue_drop(req->rsk_listener, req);
				3439	local_bh_enable();
				3440	return 0;
				3441	}
				3442	return -EOPNOTSUPP;
				3443	}
				3444
				3445	/* Don't race with userspace socket closes such as tcp_close. */
				3446	lock_sock(sk);
				3447
				3448	if (sk->sk_state == TCP_LISTEN) {
				3449	tcp_set_state(sk, TCP_CLOSE);
				3450	inet_csk_listen_stop(sk);
				3451	}
				3452
				3453	/* Don't race with BH socket closes such as inet_csk_listen_stop. */
				3454	local_bh_disable();
				3455	bh_lock_sock(sk);
				3456
				3457	if (!sock_flag(sk, SOCK_DEAD)) {
				3458	sk->sk_err = err;
				3459	/* This barrier is coupled with smp_rmb() in tcp_poll() */
				3460	smp_wmb();
				3461	sk->sk_error_report(sk);
				3462	if (tcp_need_reset(sk->sk_state))
				3463	tcp_send_active_reset(sk, GFP_ATOMIC);
				3464	tcp_done(sk);
				3465	}
				3466
				3467	bh_unlock_sock(sk);
				3468	local_bh_enable();
				3469	tcp_write_queue_purge(sk);
				3470	release_sock(sk);
				3471	return 0;
				3472	}
				3473	EXPORT_SYMBOL_GPL(tcp_abort);
				3474
				3475	extern struct tcp_congestion_ops tcp_reno;
				3476
				3477	static __initdata unsigned long thash_entries;
				3478	static int __init set_thash_entries(char *str)
				3479	{
				3480	ssize_t ret;
				3481
				3482	if (!str)
				3483	return 0;
				3484
				3485	ret = kstrtoul(str, 0, &thash_entries);
				3486	if (ret)
				3487	return 0;
				3488
				3489	return 1;
				3490	}
				3491	__setup("thash_entries=", set_thash_entries);
				3492
				3493	static void __init tcp_init_mem(void)
				3494	{
				3495	unsigned long limit = nr_free_buffer_pages() / 16;
				3496
				3497	limit = max(limit, 128UL);
				3498	sysctl_tcp_mem[0] = limit / 4 * 3; /* 4.68 % */
				3499	sysctl_tcp_mem[1] = limit; /* 6.25 % */
				3500	sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; /* 9.37 % */
				3501	}
				3502
				3503	void __init tcp_init(void)
				3504	{
				3505	int max_rshare, max_wshare, cnt;
				3506	unsigned long limit;
				3507	unsigned int i;
				3508
				3509	BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE);
				3510	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
				3511	FIELD_SIZEOF(struct sk_buff, cb));
				3512
				3513	percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
				3514	percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
				3515	inet_hashinfo_init(&tcp_hashinfo);
				3516	tcp_hashinfo.bind_bucket_cachep =
				3517	kmem_cache_create("tcp_bind_bucket",
				3518	sizeof(struct inet_bind_bucket), 0,
				3519	SLAB_HWCACHE_ALIGN\|SLAB_PANIC, NULL);
				3520
				3521	/* Size and allocate the main established and bind bucket
				3522	* hash tables.
				3523	*
				3524	* The methodology is similar to that of the buffer cache.
				3525	*/
				3526	tcp_hashinfo.ehash =
				3527	alloc_large_system_hash("TCP established",
				3528	sizeof(struct inet_ehash_bucket),
				3529	thash_entries,
				3530	17, /* one slot per 128 KB of memory */
				3531	0,
				3532	NULL,
				3533	&tcp_hashinfo.ehash_mask,
				3534	0,
				3535	thash_entries ? 0 : 512 * 1024);
				3536	for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)
				3537	INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
				3538
				3539	if (inet_ehash_locks_alloc(&tcp_hashinfo))
				3540	panic("TCP: failed to alloc ehash_locks");
				3541	tcp_hashinfo.bhash =
				3542	alloc_large_system_hash("TCP bind",
				3543	sizeof(struct inet_bind_hashbucket),
				3544	tcp_hashinfo.ehash_mask + 1,
				3545	17, /* one slot per 128 KB of memory */
				3546	0,
				3547	&tcp_hashinfo.bhash_size,
				3548	NULL,
				3549	0,
				3550	64 * 1024);
				3551	tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
				3552	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
				3553	spin_lock_init(&tcp_hashinfo.bhash[i].lock);
				3554	INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
				3555	}
				3556
				3557
				3558	cnt = tcp_hashinfo.ehash_mask + 1;
				3559	sysctl_tcp_max_orphans = cnt / 2;
				3560
				3561	tcp_init_mem();
				3562	/* Set per-socket limits to no more than 1/128 the pressure threshold */
				3563	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
				3564	max_wshare = min(4UL10241024, limit);
				3565	max_rshare = min(6UL10241024, limit);
				3566
				3567	sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
				3568	sysctl_tcp_wmem[1] = 16*1024;
				3569	sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
				3570
				3571	sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
				3572	sysctl_tcp_rmem[1] = 87380;
				3573	sysctl_tcp_rmem[2] = max(87380, max_rshare);
				3574
				3575	pr_info("Hash tables configured (established %u bind %u)\n",
				3576	tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
				3577
				3578	tcp_v4_init();
				3579	tcp_metrics_init();
				3580	BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
				3581	tcp_tasklet_init();
				3582	}