Blame - ap/os/linux/linux-3.4.x/net/ipv4/tcp.c - T106_DC

blob: 82bb0e7161eb06b0809ecc0b718b6be21a007a4c [file] [log] [blame]

lh	9ed821d	2023-04-07 01:36:19 -0700	[diff] [blame]	1	/*
				2	* INET An implementation of the TCP/IP protocol suite for the LINUX
				3	* operating system. INET is implemented using the BSD Socket
				4	* interface as the means of communication with the user level.
				5	*
				6	* Implementation of the Transmission Control Protocol(TCP).
				7	*
				8	* Authors: Ross Biro
				9	* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
				10	* Mark Evans, <evansmp@uhura.aston.ac.uk>
				11	* Corey Minyard <wf-rch!minyard@relay.EU.net>
				12	* Florian La Roche, <flla@stud.uni-sb.de>
				13	* Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
				14	* Linus Torvalds, <torvalds@cs.helsinki.fi>
				15	* Alan Cox, <gw4pts@gw4pts.ampr.org>
				16	* Matthew Dillon, <dillon@apollo.west.oic.com>
				17	* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
				18	* Jorge Cwik, <jorge@laser.satlink.net>
				19	*
				20	* Fixes:
				21	* Alan Cox : Numerous verify_area() calls
				22	* Alan Cox : Set the ACK bit on a reset
				23	* Alan Cox : Stopped it crashing if it closed while
				24	* sk->inuse=1 and was trying to connect
				25	* (tcp_err()).
				26	* Alan Cox : All icmp error handling was broken
				27	* pointers passed were wrong and the
				28	* socket was looked up backwards. Nobody
				29	* tested any icmp error code obviously.
				30	* Alan Cox : tcp_err() now handled properly. It
				31	* wakes people on errors. poll
				32	* behaves and the icmp error race
				33	* has gone by moving it into sock.c
				34	* Alan Cox : tcp_send_reset() fixed to work for
				35	* everything not just packets for
				36	* unknown sockets.
				37	* Alan Cox : tcp option processing.
				38	* Alan Cox : Reset tweaked (still not 100%) [Had
				39	* syn rule wrong]
				40	* Herp Rosmanith : More reset fixes
				41	* Alan Cox : No longer acks invalid rst frames.
				42	* Acking any kind of RST is right out.
				43	* Alan Cox : Sets an ignore me flag on an rst
				44	* receive otherwise odd bits of prattle
				45	* escape still
				46	* Alan Cox : Fixed another acking RST frame bug.
				47	* Should stop LAN workplace lockups.
				48	* Alan Cox : Some tidyups using the new skb list
				49	* facilities
				50	* Alan Cox : sk->keepopen now seems to work
				51	* Alan Cox : Pulls options out correctly on accepts
				52	* Alan Cox : Fixed assorted sk->rqueue->next errors
				53	* Alan Cox : PSH doesn't end a TCP read. Switched a
				54	* bit to skb ops.
				55	* Alan Cox : Tidied tcp_data to avoid a potential
				56	* nasty.
				57	* Alan Cox : Added some better commenting, as the
				58	* tcp is hard to follow
				59	* Alan Cox : Removed incorrect check for 20 * psh
				60	* Michael O'Reilly : ack < copied bug fix.
				61	* Johannes Stille : Misc tcp fixes (not all in yet).
				62	* Alan Cox : FIN with no memory -> CRASH
				63	* Alan Cox : Added socket option proto entries.
				64	* Also added awareness of them to accept.
				65	* Alan Cox : Added TCP options (SOL_TCP)
				66	* Alan Cox : Switched wakeup calls to callbacks,
				67	* so the kernel can layer network
				68	* sockets.
				69	* Alan Cox : Use ip_tos/ip_ttl settings.
				70	* Alan Cox : Handle FIN (more) properly (we hope).
				71	* Alan Cox : RST frames sent on unsynchronised
				72	* state ack error.
				73	* Alan Cox : Put in missing check for SYN bit.
				74	* Alan Cox : Added tcp_select_window() aka NET2E
				75	* window non shrink trick.
				76	* Alan Cox : Added a couple of small NET2E timer
				77	* fixes
				78	* Charles Hedrick : TCP fixes
				79	* Toomas Tamm : TCP window fixes
				80	* Alan Cox : Small URG fix to rlogin ^C ack fight
				81	* Charles Hedrick : Rewrote most of it to actually work
				82	* Linus : Rewrote tcp_read() and URG handling
				83	* completely
				84	* Gerhard Koerting: Fixed some missing timer handling
				85	* Matthew Dillon : Reworked TCP machine states as per RFC
				86	* Gerhard Koerting: PC/TCP workarounds
				87	* Adam Caldwell : Assorted timer/timing errors
				88	* Matthew Dillon : Fixed another RST bug
				89	* Alan Cox : Move to kernel side addressing changes.
				90	* Alan Cox : Beginning work on TCP fastpathing
				91	* (not yet usable)
				92	* Arnt Gulbrandsen: Turbocharged tcp_check() routine.
				93	* Alan Cox : TCP fast path debugging
				94	* Alan Cox : Window clamping
				95	* Michael Riepe : Bug in tcp_check()
				96	* Matt Dillon : More TCP improvements and RST bug fixes
				97	* Matt Dillon : Yet more small nasties remove from the
				98	* TCP code (Be very nice to this man if
				99	* tcp finally works 100%) 8)
				100	* Alan Cox : BSD accept semantics.
				101	* Alan Cox : Reset on closedown bug.
				102	* Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
				103	* Michael Pall : Handle poll() after URG properly in
				104	* all cases.
				105	* Michael Pall : Undo the last fix in tcp_read_urg()
				106	* (multi URG PUSH broke rlogin).
				107	* Michael Pall : Fix the multi URG PUSH problem in
				108	* tcp_readable(), poll() after URG
				109	* works now.
				110	* Michael Pall : recv(...,MSG_OOB) never blocks in the
				111	* BSD api.
				112	* Alan Cox : Changed the semantics of sk->socket to
				113	* fix a race and a signal problem with
				114	* accept() and async I/O.
				115	* Alan Cox : Relaxed the rules on tcp_sendto().
				116	* Yury Shevchuk : Really fixed accept() blocking problem.
				117	* Craig I. Hagan : Allow for BSD compatible TIME_WAIT for
				118	* clients/servers which listen in on
				119	* fixed ports.
				120	* Alan Cox : Cleaned the above up and shrank it to
				121	* a sensible code size.
				122	* Alan Cox : Self connect lockup fix.
				123	* Alan Cox : No connect to multicast.
				124	* Ross Biro : Close unaccepted children on master
				125	* socket close.
				126	* Alan Cox : Reset tracing code.
				127	* Alan Cox : Spurious resets on shutdown.
				128	* Alan Cox : Giant 15 minute/60 second timer error
				129	* Alan Cox : Small whoops in polling before an
				130	* accept.
				131	* Alan Cox : Kept the state trace facility since
				132	* it's handy for debugging.
				133	* Alan Cox : More reset handler fixes.
				134	* Alan Cox : Started rewriting the code based on
				135	* the RFC's for other useful protocol
				136	* references see: Comer, KA9Q NOS, and
				137	* for a reference on the difference
				138	* between specifications and how BSD
				139	* works see the 4.4lite source.
				140	* A.N.Kuznetsov : Don't time wait on completion of tidy
				141	* close.
				142	* Linus Torvalds : Fin/Shutdown & copied_seq changes.
				143	* Linus Torvalds : Fixed BSD port reuse to work first syn
				144	* Alan Cox : Reimplemented timers as per the RFC
				145	* and using multiple timers for sanity.
				146	* Alan Cox : Small bug fixes, and a lot of new
				147	* comments.
				148	* Alan Cox : Fixed dual reader crash by locking
				149	* the buffers (much like datagram.c)
				150	* Alan Cox : Fixed stuck sockets in probe. A probe
				151	* now gets fed up of retrying without
				152	* (even a no space) answer.
				153	* Alan Cox : Extracted closing code better
				154	* Alan Cox : Fixed the closing state machine to
				155	* resemble the RFC.
				156	* Alan Cox : More 'per spec' fixes.
				157	* Jorge Cwik : Even faster checksumming.
				158	* Alan Cox : tcp_data() doesn't ack illegal PSH
				159	* only frames. At least one pc tcp stack
				160	* generates them.
				161	* Alan Cox : Cache last socket.
				162	* Alan Cox : Per route irtt.
				163	* Matt Day : poll()->select() match BSD precisely on error
				164	* Alan Cox : New buffers
				165	* Marc Tamsky : Various sk->prot->retransmits and
				166	* sk->retransmits misupdating fixed.
				167	* Fixed tcp_write_timeout: stuck close,
				168	* and TCP syn retries gets used now.
				169	* Mark Yarvis : In tcp_read_wakeup(), don't send an
				170	* ack if state is TCP_CLOSED.
				171	* Alan Cox : Look up device on a retransmit - routes may
				172	* change. Doesn't yet cope with MSS shrink right
				173	* but it's a start!
				174	* Marc Tamsky : Closing in closing fixes.
				175	* Mike Shaver : RFC1122 verifications.
				176	* Alan Cox : rcv_saddr errors.
				177	* Alan Cox : Block double connect().
				178	* Alan Cox : Small hooks for enSKIP.
				179	* Alexey Kuznetsov: Path MTU discovery.
				180	* Alan Cox : Support soft errors.
				181	* Alan Cox : Fix MTU discovery pathological case
				182	* when the remote claims no mtu!
				183	* Marc Tamsky : TCP_CLOSE fix.
				184	* Colin (G3TNE) : Send a reset on syn ack replies in
				185	* window but wrong (fixes NT lpd problems)
				186	* Pedro Roque : Better TCP window handling, delayed ack.
				187	* Joerg Reuter : No modification of locked buffers in
				188	* tcp_do_retransmit()
				189	* Eric Schenk : Changed receiver side silly window
				190	* avoidance algorithm to BSD style
				191	* algorithm. This doubles throughput
				192	* against machines running Solaris,
				193	* and seems to result in general
				194	* improvement.
				195	* Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD
				196	* Willy Konynenberg : Transparent proxying support.
				197	* Mike McLagan : Routing by source
				198	* Keith Owens : Do proper merging with partial SKB's in
				199	* tcp_do_sendmsg to avoid burstiness.
				200	* Eric Schenk : Fix fast close down bug with
				201	* shutdown() followed by close().
				202	* Andi Kleen : Make poll agree with SIGIO
				203	* Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and
				204	* lingertime == 0 (RFC 793 ABORT Call)
				205	* Hirokazu Takahashi : Use copy_from_user() instead of
				206	* csum_and_copy_from_user() if possible.
				207	*
				208	* This program is free software; you can redistribute it and/or
				209	* modify it under the terms of the GNU General Public License
				210	* as published by the Free Software Foundation; either version
				211	* 2 of the License, or(at your option) any later version.
				212	*
				213	* Description of States:
				214	*
				215	* TCP_SYN_SENT sent a connection request, waiting for ack
				216	*
				217	* TCP_SYN_RECV received a connection request, sent ack,
				218	* waiting for final ack in three-way handshake.
				219	*
				220	* TCP_ESTABLISHED connection established
				221	*
				222	* TCP_FIN_WAIT1 our side has shutdown, waiting to complete
				223	* transmission of remaining buffered data
				224	*
				225	* TCP_FIN_WAIT2 all buffered data sent, waiting for remote
				226	* to shutdown
				227	*
				228	* TCP_CLOSING both sides have shutdown but we still have
				229	* data we have to finish sending
				230	*
				231	* TCP_TIME_WAIT timeout to catch resent junk before entering
				232	* closed, can only be entered from FIN_WAIT2
				233	* or CLOSING. Required because the other end
				234	* may not have gotten our last ACK causing it
				235	* to retransmit the data packet (which we ignore)
				236	*
				237	* TCP_CLOSE_WAIT remote side has shutdown and is waiting for
				238	* us to finish writing our data and to shutdown
				239	* (we have to close() to move on to LAST_ACK)
				240	*
				241	* TCP_LAST_ACK our side has shutdown after remote has
				242	* shutdown. There may still be data in our
				243	* buffer that we have to finish sending
				244	*
				245	* TCP_CLOSE socket is finished
				246	*/
				247
				248	#define pr_fmt(fmt) "TCP: " fmt
				249
				250	#include <linux/kernel.h>
				251	#include <linux/module.h>
				252	#include <linux/types.h>
				253	#include <linux/fcntl.h>
				254	#include <linux/poll.h>
				255	#include <linux/init.h>
				256	#include <linux/fs.h>
				257	#include <linux/skbuff.h>
				258	#include <linux/scatterlist.h>
				259	#include <linux/splice.h>
				260	#include <linux/net.h>
				261	#include <linux/socket.h>
				262	#include <linux/random.h>
				263	#include <linux/bootmem.h>
				264	#include <linux/highmem.h>
				265	#include <linux/swap.h>
				266	#include <linux/cache.h>
				267	#include <linux/err.h>
				268	#include <linux/crypto.h>
				269	#include <linux/time.h>
				270	#include <linux/slab.h>
				271	#include <linux/uid_stat.h>
				272
				273	#include <net/icmp.h>
				274	#include <net/tcp.h>
				275	#include <net/xfrm.h>
				276	#include <net/ip.h>
				277	#include <net/ip6_route.h>
				278	#include <net/ipv6.h>
				279	#include <net/transp_v6.h>
				280	#include <net/netdma.h>
				281	#include <net/sock.h>
				282
				283	#include <asm/uaccess.h>
				284	#include <asm/ioctls.h>
				285
				286	#include <net/SI/errno_track.h>
				287	#include <net/SI/sock_track.h>
				288
				289	int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
				290
				291	struct percpu_counter tcp_orphan_count;
				292	EXPORT_SYMBOL_GPL(tcp_orphan_count);
				293
				294	int sysctl_tcp_wmem[3] __read_mostly;
				295	int sysctl_tcp_rmem[3] __read_mostly;
				296
				297	EXPORT_SYMBOL(sysctl_tcp_rmem);
				298	EXPORT_SYMBOL(sysctl_tcp_wmem);
				299
				300	atomic_long_t tcp_memory_allocated; /* Current allocated memory. */
				301	EXPORT_SYMBOL(tcp_memory_allocated);
				302
				303	/*
				304	* Current number of TCP sockets.
				305	*/
				306	struct percpu_counter tcp_sockets_allocated;
				307	EXPORT_SYMBOL(tcp_sockets_allocated);
				308
				/*
				 * TCP splice context.
				 *
				 * Handed to tcp_read_sock() through read_descriptor_t.arg.data so that
				 * tcp_splice_data_recv() can find the destination pipe and the flags.
				 */
				struct tcp_splice_state {
					struct pipe_inode_info *pipe;	/* pipe the data is spliced into */
					size_t len;			/* bytes still to be spliced */
					unsigned int flags;		/* splice modifier flags */
				};
				317
				318	/*
				319	* Pressure flag: try to collapse.
				320	* Technical note: it is used by multiple contexts non atomically.
				321	* All the __sk_mem_schedule() is of this nature: accounting
				322	* is strict, actions are advisory and have some latency.
				323	*/
				324	int tcp_memory_pressure __read_mostly;
				325	EXPORT_SYMBOL(tcp_memory_pressure);
				326
				327	void tcp_enter_memory_pressure(struct sock *sk)
				328	{
				329	if (!tcp_memory_pressure) {
				330	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
				331	tcp_memory_pressure = 1;
				332	}
				333	}
				334	EXPORT_SYMBOL(tcp_enter_memory_pressure);
				335
				/*
				 * Convert seconds to retransmits based on initial and max timeout.
				 *
				 * @seconds: time budget; a value <= 0 yields 0
				 * @timeout: initial timeout, doubled after every retransmit
				 * @rto_max: ceiling applied to the doubled timeout
				 *
				 * Returns the smallest retransmit count whose accumulated back-off
				 * period is at least @seconds, saturated at 255 so it fits in a u8.
				 */
				static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
				{
					u8 res = 0;

					if (seconds > 0) {
						int period = timeout;

						res = 1;
						while (seconds > period && res < 255) {
							res++;
							timeout <<= 1;
							if (timeout > rto_max)
								timeout = rto_max;
							period += timeout;
						}
					}
					return res;
				}
				355
				/*
				 * Convert retransmits to seconds based on initial and max timeout.
				 *
				 * @retrans: number of retransmits; 0 yields 0
				 * @timeout: initial timeout, doubled after every retransmit
				 * @rto_max: ceiling applied to the doubled timeout
				 *
				 * Returns the sum of the first @retrans back-off intervals.
				 */
				static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
				{
					int period = 0;

					if (retrans > 0) {
						period = timeout;
						/* The first interval is already counted above. */
						while (--retrans) {
							timeout <<= 1;
							if (timeout > rto_max)
								timeout = rto_max;
							period += timeout;
						}
					}
					return period;
				}
				372
				373	/*
				374	* Wait for a TCP event.
				375	*
				376	* Note that we don't need to lock the socket, as the upper poll layers
				377	* take care of normal races (between the test and the event) and we don't
				378	* go look at any of the socket buffers directly.
				379	*/
				380	unsigned int tcp_poll(struct file file, struct socket sock, poll_table *wait)
				381	{
				382	unsigned int mask;
				383	struct sock *sk = sock->sk;
				384	const struct tcp_sock *tp = tcp_sk(sk);
				385
				386	sock_poll_wait(file, sk_sleep(sk), wait);
				387	if (sk->sk_state == TCP_LISTEN)
				388	return inet_csk_listen_poll(sk);
				389
				390	/* Socket is not locked. We are protected from async events
				391	* by poll logic and correct handling of state changes
				392	* made by other threads is impossible in any case.
				393	*/
				394
				395	mask = 0;
				396
				397	/*
				398	* POLLHUP is certainly not done right. But poll() doesn't
				399	* have a notion of HUP in just one direction, and for a
				400	* socket the read side is more interesting.
				401	*
				402	* Some poll() documentation says that POLLHUP is incompatible
				403	* with the POLLOUT/POLLWR flags, so somebody should check this
				404	* all. But careful, it tends to be safer to return too many
				405	* bits than too few, and you can easily break real applications
				406	* if you don't tell them that something has hung up!
				407	*
				408	* Check-me.
				409	*
				410	* Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
				411	* our fs/select.c). It means that after we received EOF,
				412	* poll always returns immediately, making impossible poll() on write()
				413	* in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
				414	* if and only if shutdown has been made in both directions.
				415	* Actually, it is interesting to look how Solaris and DUX
				416	* solve this dilemma. I would prefer, if POLLHUP were maskable,
				417	* then we could set it on SND_SHUTDOWN. BTW examples given
				418	* in Stevens' books assume exactly this behaviour, it explains
				419	* why POLLHUP is incompatible with POLLOUT. --ANK
				420	*
				421	* NOTE. Check for TCP_CLOSE is added. The goal is to prevent
				422	* blocking on fresh not-connected or disconnected socket. --ANK
				423	*/
				424	if (sk->sk_shutdown == SHUTDOWN_MASK \|\| sk->sk_state == TCP_CLOSE)
				425	mask \|= POLLHUP;
				426	if (sk->sk_shutdown & RCV_SHUTDOWN)
				427	mask \|= POLLIN \| POLLRDNORM \| POLLRDHUP;
				428
				429	/* Connected? */
				430	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT \| TCPF_SYN_RECV)) {
				431	int target = sock_rcvlowat(sk, 0, INT_MAX);
				432
				433	if (tp->urg_seq == tp->copied_seq &&
				434	!sock_flag(sk, SOCK_URGINLINE) &&
				435	tp->urg_data)
				436	target++;
				437
				438	/* Potential race condition. If read of tp below will
				439	* escape above sk->sk_state, we can be illegally awaken
				440	* in SYN_* states. */
				441	if (tp->rcv_nxt - tp->copied_seq >= target)
				442	mask \|= POLLIN \| POLLRDNORM;
				443
				444	if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
				445	if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
				446	mask \|= POLLOUT \| POLLWRNORM;
				447	} else { /* send SIGIO later */
				448	set_bit(SOCK_ASYNC_NOSPACE,
				449	&sk->sk_socket->flags);
				450	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
				451
				452	/* Race breaker. If space is freed after
				453	* wspace test but before the flags are set,
				454	* IO signal will be lost.
				455	*/
				456	if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
				457	mask \|= POLLOUT \| POLLWRNORM;
				458	}
				459	} else
				460	mask \|= POLLOUT \| POLLWRNORM;
				461
				462	if (tp->urg_data & TCP_URG_VALID)
				463	mask \|= POLLPRI;
				464	}
				465	/* This barrier is coupled with smp_wmb() in tcp_reset() */
				466	smp_rmb();
				467	if (sk->sk_err)
				468	mask \|= POLLERR;
				469
				470	return mask;
				471	}
				472	EXPORT_SYMBOL(tcp_poll);
				473
				474	int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
				475	{
				476	struct tcp_sock *tp = tcp_sk(sk);
				477	int answ;
				478	int retval = 0;
				479
				480	switch (cmd) {
				481	case SIOCINQ:
				482	if (sk->sk_state == TCP_LISTEN)
				483	//return -EINVAL;
				484	return ERRNO_TRACK(-EINVAL);
				485
				486	lock_sock(sk);
				487	if ((1 << sk->sk_state) & (TCPF_SYN_SENT \| TCPF_SYN_RECV))
				488	answ = 0;
				489	else if (sock_flag(sk, SOCK_URGINLINE) \|\|
				490	!tp->urg_data \|\|
				491	before(tp->urg_seq, tp->copied_seq) \|\|
				492	!before(tp->urg_seq, tp->rcv_nxt)) {
				493
				494	answ = tp->rcv_nxt - tp->copied_seq;
				495
				496	/* Subtract 1, if FIN was received */
				497	if (answ && sock_flag(sk, SOCK_DONE))
				498	answ--;
				499	} else
				500	answ = tp->urg_seq - tp->copied_seq;
				501	release_sock(sk);
				502	break;
				503	case SIOCATMARK:
				504	answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
				505	break;
				506	case SIOCOUTQ:
				507	if (sk->sk_state == TCP_LISTEN)
				508	//return -EINVAL;
				509	return ERRNO_TRACK(-EINVAL);
				510
				511	if ((1 << sk->sk_state) & (TCPF_SYN_SENT \| TCPF_SYN_RECV))
				512	answ = 0;
				513	else
				514	answ = tp->write_seq - tp->snd_una;
				515	break;
				516	case SIOCOUTQNSD:
				517	if (sk->sk_state == TCP_LISTEN)
				518	//return -EINVAL;
				519	return ERRNO_TRACK(-EINVAL);
				520
				521	if ((1 << sk->sk_state) & (TCPF_SYN_SENT \| TCPF_SYN_RECV))
				522	answ = 0;
				523	else
				524	answ = tp->write_seq - tp->snd_nxt;
				525	break;
				526	default:
				527	//return -ENOIOCTLCMD;
				528	return ERRNO_TRACK(-ENOIOCTLCMD);
				529	}
				530
				531	retval = put_user(answ, (int __user *)arg);
				532	return ERRNO_TRACK(retval);
				533
				534	//return put_user(answ, (int __user *)arg);
				535	}
				536	EXPORT_SYMBOL(tcp_ioctl);
				537
				538	static inline void tcp_mark_push(struct tcp_sock tp, struct sk_buff skb)
				539	{
				540	TCP_SKB_CB(skb)->tcp_flags \|= TCPHDR_PSH;
				541	tp->pushed_seq = tp->write_seq;
				542	}
				543
				544	static inline int forced_push(const struct tcp_sock *tp)
				545	{
				546	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
				547	}
				548
				549	static inline void skb_entail(struct sock sk, struct sk_buff skb)
				550	{
				551	struct tcp_sock *tp = tcp_sk(sk);
				552	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
				553
				554	skb->csum = 0;
				555	tcb->seq = tcb->end_seq = tp->write_seq;
				556	tcb->tcp_flags = TCPHDR_ACK;
				557	tcb->sacked = 0;
				558	skb_header_release(skb);
				559	tcp_add_write_queue_tail(sk, skb);
				560	sk->sk_wmem_queued += skb->truesize;
				561	sk_mem_charge(sk, skb->truesize);
				562	if (tp->nonagle & TCP_NAGLE_PUSH)
				563	tp->nonagle &= ~TCP_NAGLE_PUSH;
				564	}
				565
				566	static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
				567	{
				568	if (flags & MSG_OOB)
				569	tp->snd_up = tp->write_seq;
				570	}
				571
				572	static inline void tcp_push(struct sock *sk, int flags, int mss_now,
				573	int nonagle)
				574	{
				575	if (tcp_send_head(sk)) {
				576	struct tcp_sock *tp = tcp_sk(sk);
				577
				578	if (!(flags & MSG_MORE) \|\| forced_push(tp))
				579	tcp_mark_push(tp, tcp_write_queue_tail(sk));
				580
				581	tcp_mark_urg(tp, flags);
				582	__tcp_push_pending_frames(sk, mss_now,
				583	(flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
				584	}
				585	}
				586
				587	static int tcp_splice_data_recv(read_descriptor_t rd_desc, struct sk_buff skb,
				588	unsigned int offset, size_t len)
				589	{
				590	struct tcp_splice_state *tss = rd_desc->arg.data;
				591	int ret;
				592
				593	ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len),
				594	tss->flags);
				595	if (ret > 0)
				596	rd_desc->count -= ret;
				597	//return ret;
				598	return ERRNO_TRACK(ret);
				599	}
				600
				601	static int __tcp_splice_read(struct sock sk, struct tcp_splice_state tss)
				602	{
				603	/* Store TCP splice context information in read_descriptor_t. */
				604	read_descriptor_t rd_desc = {
				605	.arg.data = tss,
				606	.count = tss->len,
				607	};
				608
				609	return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
				610	}
				611
				612	/**
				613	* tcp_splice_read - splice data from TCP socket to a pipe
				614	* @sock: socket to splice from
				615	* @ppos: position (not valid)
				616	* @pipe: pipe to splice to
				617	* @len: number of bytes to splice
				618	* @flags: splice modifier flags
				619	*
				620	* Description:
				621	* Will read pages from given socket and fill them into a pipe.
				622	*
				623	**/
				624	ssize_t tcp_splice_read(struct socket sock, loff_t ppos,
				625	struct pipe_inode_info *pipe, size_t len,
				626	unsigned int flags)
				627	{
				628	struct sock *sk = sock->sk;
				629	struct tcp_splice_state tss = {
				630	.pipe = pipe,
				631	.len = len,
				632	.flags = flags,
				633	};
				634	long timeo;
				635	ssize_t spliced;
				636	int ret;
				637
				638	sock_rps_record_flow(sk);
				639	/*
				640	* We can't seek on a socket input
				641	*/
				642	if (unlikely(*ppos))
				643	//return -ESPIPE;
				644	return ERRNO_TRACK(-ESPIPE);
				645
				646	ret = spliced = 0;
				647
				648	lock_sock(sk);
				649
				650	timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
				651	while (tss.len) {
				652	ret = __tcp_splice_read(sk, &tss);
				653	if (ret < 0)
				654	break;
				655	else if (!ret) {
				656	if (spliced)
				657	break;
				658	if (sock_flag(sk, SOCK_DONE))
				659	break;
				660	if (sk->sk_err) {
				661	ret = sock_error(sk);
				662	break;
				663	}
				664	if (sk->sk_shutdown & RCV_SHUTDOWN)
				665	break;
				666	if (sk->sk_state == TCP_CLOSE) {
				667	/*
				668	* This occurs when user tries to read
				669	* from never connected socket.
				670	*/
				671	if (!sock_flag(sk, SOCK_DONE))
				672	ret = -ENOTCONN;
				673	break;
				674	}
				675	if (!timeo) {
				676	ret = ERRNO_TRACK(-EAGAIN);
				677	break;
				678	}
				679	/* if __tcp_splice_read() got nothing while we have
				680	* an skb in receive queue, we do not want to loop.
				681	* This might happen with URG data.
				682	*/
				683	if (!skb_queue_empty(&sk->sk_receive_queue))
				684	break;
				685	sk_wait_data(sk, &timeo);
				686	if (signal_pending(current)) {
				687	ret = sock_intr_errno(timeo);
				688	break;
				689	}
				690	continue;
				691	}
				692	tss.len -= ret;
				693	spliced += ret;
				694
				695	if (!timeo)
				696	break;
				697	release_sock(sk);
				698	lock_sock(sk);
				699
				700	if (sk->sk_err \|\| sk->sk_state == TCP_CLOSE \|\|
				701	(sk->sk_shutdown & RCV_SHUTDOWN) \|\|
				702	signal_pending(current))
				703	break;
				704	}
				705
				706	release_sock(sk);
				707
				708	if (spliced)
				709	//return spliced;
				710	return ERRNO_TRACK(spliced);
				711
				712	//return ret;
				713	return ERRNO_TRACK(ret);
				714	}
				715	EXPORT_SYMBOL(tcp_splice_read);
				716
				717	struct sk_buff sk_stream_alloc_skb(struct sock sk, int size, gfp_t gfp)
				718	{
				719	struct sk_buff *skb;
				720
				721	/* The TCP header must be at least 32-bit aligned. */
				722	size = ALIGN(size, 4);
				723
				724	skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
				725	if (skb) {
				726	if (sk_wmem_schedule(sk, skb->truesize)) {
				727	skb_reserve(skb, sk->sk_prot->max_header);
				728	/*
				729	* Make sure that we have exactly size bytes
				730	* available to the caller, no more, no less.
				731	*/
				732	skb->reserved_tailroom = skb->end - skb->tail - size;
				733	return skb;
				734	}
				735	__kfree_skb(skb);
				736	} else {
				737	sk->sk_prot->enter_memory_pressure(sk);
				738	sk_stream_moderate_sndbuf(sk);
				739	}
				740	return NULL;
				741	}
				742
				743	static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
				744	int large_allowed)
				745	{
				746	struct tcp_sock *tp = tcp_sk(sk);
				747	u32 xmit_size_goal, old_size_goal;
				748
				749	xmit_size_goal = mss_now;
				750
				751	if (large_allowed && sk_can_gso(sk)) {
				752	xmit_size_goal = ((sk->sk_gso_max_size - 1) -
				753	inet_csk(sk)->icsk_af_ops->net_header_len -
				754	inet_csk(sk)->icsk_ext_hdr_len -
				755	tp->tcp_header_len);
				756
				757	xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
				758
				759	/* We try hard to avoid divides here */
				760	old_size_goal = tp->xmit_size_goal_segs * mss_now;
				761
				762	if (likely(old_size_goal <= xmit_size_goal &&
				763	old_size_goal + mss_now > xmit_size_goal)) {
				764	xmit_size_goal = old_size_goal;
				765	} else {
				766	tp->xmit_size_goal_segs =
				767	min_t(u16, xmit_size_goal / mss_now,
				768	sk->sk_gso_max_segs);
				769	xmit_size_goal = tp->xmit_size_goal_segs * mss_now;
				770	}
				771	}
				772
				773	return max(xmit_size_goal, mss_now);
				774	}
				775
				776	static int tcp_send_mss(struct sock sk, int size_goal, int flags)
				777	{
				778	int mss_now;
				779
				780	mss_now = tcp_current_mss(sk);
				781	*size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
				782
				783	return mss_now;
				784	}
				785
				786	static ssize_t do_tcp_sendpages(struct sock sk, struct page *pages, int poffset,
				787	size_t psize, int flags)
				788	{
				789	struct tcp_sock *tp = tcp_sk(sk);
				790	int mss_now, size_goal;
				791	int err;
				792	ssize_t copied;
				793	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
				794	int retval = 0;
				795
				796	/* Wait for a connection to finish. */
				797	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT))
				798	if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
				799	goto out_err;
				800
				801	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
				802
				803	mss_now = tcp_send_mss(sk, &size_goal, flags);
				804	copied = 0;
				805
				806	err = -EPIPE;
				807	if (sk->sk_err \|\| (sk->sk_shutdown & SEND_SHUTDOWN))
				808	goto out_err;
				809
				810	while (psize > 0) {
				811	struct sk_buff *skb = tcp_write_queue_tail(sk);
				812	struct page *page = pages[poffset / PAGE_SIZE];
				813	int copy, i, can_coalesce;
				814	int offset = poffset % PAGE_SIZE;
				815	int size = min_t(size_t, psize, PAGE_SIZE - offset);
				816
				817	if (!tcp_send_head(sk) \|\| (copy = size_goal - skb->len) <= 0) {
				818	new_segment:
				819	if (!sk_stream_memory_free(sk))
				820	goto wait_for_sndbuf;
				821
				822	skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
				823	if (!skb)
				824	goto wait_for_memory;
				825
				826	skb_entail(sk, skb);
				827	copy = size_goal;
				828	}
				829
				830	if (copy > size)
				831	copy = size;
				832
				833	i = skb_shinfo(skb)->nr_frags;
				834	can_coalesce = skb_can_coalesce(skb, i, page, offset);
				835	if (!can_coalesce && i >= MAX_SKB_FRAGS) {
				836	tcp_mark_push(tp, skb);
				837	goto new_segment;
				838	}
				839	if (!sk_wmem_schedule(sk, copy))
				840	goto wait_for_memory;
				841
				842	if (can_coalesce) {
				843	skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
				844	} else {
				845	get_page(page);
				846	skb_fill_page_desc(skb, i, page, offset, copy);
				847	}
				848
				849	skb->len += copy;
				850	skb->data_len += copy;
				851	skb->truesize += copy;
				852	sk->sk_wmem_queued += copy;
				853	sk_mem_charge(sk, copy);
				854	skb->ip_summed = CHECKSUM_PARTIAL;
				855	tp->write_seq += copy;
				856	TCP_SKB_CB(skb)->end_seq += copy;
				857	skb_shinfo(skb)->gso_segs = 0;
				858
				859	if (!copied)
				860	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
				861
				862	copied += copy;
				863	poffset += copy;
				864	if (!(psize -= copy))
				865	goto out;
				866
				867	if (skb->len < size_goal \|\| (flags & MSG_OOB))
				868	continue;
				869
				870	if (forced_push(tp)) {
				871	tcp_mark_push(tp, skb);
				872	__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
				873	} else if (skb == tcp_send_head(sk))
				874	tcp_push_one(sk, mss_now);
				875	continue;
				876
				877	wait_for_sndbuf:
				878	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
				879	wait_for_memory:
				880	tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
				881
				882	if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
				883	goto do_error;
				884
				885	mss_now = tcp_send_mss(sk, &size_goal, flags);
				886	}
				887
				888	out:
				889	if (copied && !(flags & MSG_SENDPAGE_NOTLAST))
				890	tcp_push(sk, flags, mss_now, tp->nonagle);
				891	//return copied;
				892	return ERRNO_TRACK(copied);
				893
				894	do_error:
				895	if (copied)
				896	goto out;
				897	out_err:
				898
				899	retval = sk_stream_error(sk, flags, err);
				900	return ERRNO_TRACK(retval);
				901	}
				902
				903	int tcp_sendpage(struct sock sk, struct page page, int offset,
				904	size_t size, int flags)
				905	{
				906	ssize_t res;
				907	int retval = 0;
				908	if (!(sk->sk_route_caps & NETIF_F_SG) \|\|
				909	!(sk->sk_route_caps & NETIF_F_ALL_CSUM))
				910	{
				911	retval = sock_no_sendpage(sk->sk_socket, page, offset, size,
				912	flags);
				913	return ERRNO_TRACK(retval);
				914	}
				915	lock_sock(sk);
				916	res = do_tcp_sendpages(sk, &page, offset, size, flags);
				917	release_sock(sk);
				918	return res;
				919	}
				920	EXPORT_SYMBOL(tcp_sendpage);
				921
				922	static inline int select_size(const struct sock *sk, bool sg)
				923	{
				924	const struct tcp_sock *tp = tcp_sk(sk);
				925	int tmp = tp->mss_cache;
				926
				927	if (sg) {
				928	if (sk_can_gso(sk)) {
				929	/* Small frames wont use a full page:
				930	* Payload will immediately follow tcp header.
				931	*/
				932	tmp = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
				933	} else {
				934	int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
				935
				936	if (tmp >= pgbreak &&
				937	tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
				938	tmp = pgbreak;
				939	}
				940	}
				941
				942	return tmp;
				943	}
				944
				945	int tcp_sendmsg(struct kiocb iocb, struct sock sk, struct msghdr *msg,
				946	size_t size)
				947	{
				948	struct iovec *iov;
				949	struct tcp_sock *tp = tcp_sk(sk);
				950	struct sk_buff *skb;
				951	int iovlen, flags, err, copied;
				952	int mss_now, size_goal;
				953	bool sg;
				954	long timeo;
				955
				956	lock_sock(sk);
				957
				958	flags = msg->msg_flags;
				959	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
				960
				961	/* Wait for a connection to finish. */
				962	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT))
				963	if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
				964	goto out_err;
				965
				966	/* This should be in poll */
				967	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
				968
				969	mss_now = tcp_send_mss(sk, &size_goal, flags);
				970
				971	/* Ok commence sending. */
				972	iovlen = msg->msg_iovlen;
				973	iov = msg->msg_iov;
				974	copied = 0;
				975
				976	err = -EPIPE;
				977	if (sk->sk_err \|\| (sk->sk_shutdown & SEND_SHUTDOWN))
				978	goto out_err;
				979
				980	sg = !!(sk->sk_route_caps & NETIF_F_SG);
				981
				982	while (--iovlen >= 0) {
				983	size_t seglen = iov->iov_len;
				984	unsigned char __user *from = iov->iov_base;
				985
				986	iov++;
				987
				988	while (seglen > 0) {
				989	int copy = 0;
				990	int max = size_goal;
				991
				992	skb = tcp_write_queue_tail(sk);
				993	if (tcp_send_head(sk)) {
				994	if (skb->ip_summed == CHECKSUM_NONE)
				995	max = mss_now;
				996	copy = max - skb->len;
				997	}
				998
				999	if (copy <= 0) {
				1000	new_segment:
				1001	/* Allocate new segment. If the interface is SG,
				1002	* allocate skb fitting to single page.
				1003	*/
				1004	if (!sk_stream_memory_free(sk))
				1005	goto wait_for_sndbuf;
				1006
				1007	skb = sk_stream_alloc_skb(sk,
				1008	select_size(sk, sg),
				1009	sk->sk_allocation);
				1010	if (!skb)
				1011	goto wait_for_memory;
				1012
				1013	/*
				1014	* Check whether we can use HW checksum.
				1015	*/
				1016	if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
				1017	skb->ip_summed = CHECKSUM_PARTIAL;
				1018
				1019	skb_entail(sk, skb);
				1020	copy = size_goal;
				1021	max = size_goal;
				1022	}
				1023
				1024	/* Try to append data to the end of skb. */
				1025	if (copy > seglen)
				1026	copy = seglen;
				1027
				1028	/* Where to copy to? */
				1029	if (skb_availroom(skb) > 0) {
				1030	/* We have some space in skb head. Superb! */
				1031	copy = min_t(int, copy, skb_availroom(skb));
				1032	err = skb_add_data_nocache(sk, skb, from, copy);
				1033	if (err)
				1034	goto do_fault;
				1035	} else {
				1036	int merge = 0;
				1037	int i = skb_shinfo(skb)->nr_frags;
				1038	struct page *page = sk->sk_sndmsg_page;
				1039	int off;
				1040
				1041	if (page && page_count(page) == 1)
				1042	sk->sk_sndmsg_off = 0;
				1043
				1044	off = sk->sk_sndmsg_off;
				1045
				1046	if (skb_can_coalesce(skb, i, page, off) &&
				1047	off != PAGE_SIZE) {
				1048	/* We can extend the last page
				1049	* fragment. */
				1050	merge = 1;
				1051	} else if (i == MAX_SKB_FRAGS \|\| !sg) {
				1052	/* Need to add new fragment and cannot
				1053	* do this because interface is non-SG,
				1054	* or because all the page slots are
				1055	* busy. */
				1056	tcp_mark_push(tp, skb);
				1057	goto new_segment;
				1058	} else if (page) {
				1059	if (off == PAGE_SIZE) {
				1060	put_page(page);
				1061	sk->sk_sndmsg_page = page = NULL;
				1062	off = 0;
				1063	}
				1064	} else
				1065	off = 0;
				1066
				1067	if (copy > PAGE_SIZE - off)
				1068	copy = PAGE_SIZE - off;
				1069
				1070	if (!sk_wmem_schedule(sk, copy))
				1071	goto wait_for_memory;
				1072
				1073	if (!page) {
				1074	/* Allocate new cache page. */
				1075	if (!(page = sk_stream_alloc_page(sk)))
				1076	goto wait_for_memory;
				1077	}
				1078
				1079	/* Time to copy data. We are close to
				1080	* the end! */
				1081	err = skb_copy_to_page_nocache(sk, from, skb,
				1082	page, off, copy);
				1083	if (err) {
				1084	/* If this page was new, give it to the
				1085	* socket so it does not get leaked.
				1086	*/
				1087	if (!sk->sk_sndmsg_page) {
				1088	sk->sk_sndmsg_page = page;
				1089	sk->sk_sndmsg_off = 0;
				1090	}
				1091	goto do_error;
				1092	}
				1093
				1094	/* Update the skb. */
				1095	if (merge) {
				1096	skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
				1097	} else {
				1098	skb_fill_page_desc(skb, i, page, off, copy);
				1099	if (sk->sk_sndmsg_page) {
				1100	get_page(page);
				1101	} else if (off + copy < PAGE_SIZE) {
				1102	get_page(page);
				1103	sk->sk_sndmsg_page = page;
				1104	}
				1105	}
				1106
				1107	sk->sk_sndmsg_off = off + copy;
				1108	}
				1109
				1110	if (!copied)
				1111	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
				1112
				1113	tp->write_seq += copy;
				1114	TCP_SKB_CB(skb)->end_seq += copy;
				1115	skb_shinfo(skb)->gso_segs = 0;
				1116
				1117	from += copy;
				1118	copied += copy;
				1119	if ((seglen -= copy) == 0 && iovlen == 0)
				1120	goto out;
				1121
				1122	if (skb->len < max \|\| (flags & MSG_OOB))
				1123	continue;
				1124
				1125	if (forced_push(tp)) {
				1126	tcp_mark_push(tp, skb);
				1127	__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
				1128	} else if (skb == tcp_send_head(sk))
				1129	tcp_push_one(sk, mss_now);
				1130	continue;
				1131
				1132	wait_for_sndbuf:
				1133	TCP_SOCK_TRACK(sk, TCP_SEND_BUFF_FULL);
				1134	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
				1135	wait_for_memory:
				1136	if (copied)
				1137	tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
				1138
				1139	if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
				1140	goto do_error;
				1141
				1142	mss_now = tcp_send_mss(sk, &size_goal, flags);
				1143	}
				1144	}
				1145
				1146	out:
				1147	if (copied)
				1148	tcp_push(sk, flags, mss_now, tp->nonagle);
				1149	release_sock(sk);
				1150
				1151	if (copied > 0)
				1152	uid_stat_tcp_snd(current_uid(), copied);
				1153	return ERRNO_TRACK(copied);
				1154	//return copied;
				1155	do_fault:
				1156	if (!skb->len) {
				1157	tcp_unlink_write_queue(skb, sk);
				1158	/* It is the one place in all of TCP, except connection
				1159	* reset, where we can be unlinking the send_head.
				1160	*/
				1161	tcp_check_send_head(sk, skb);
				1162	sk_wmem_free_skb(sk, skb);
				1163	}
				1164
				1165	do_error:
				1166	if (copied)
				1167	goto out;
				1168	out_err:
				1169	err = sk_stream_error(sk, flags, err);
				1170	release_sock(sk);
				1171	//return err;
				1172	return ERRNO_TRACK(err);
				1173	}
				1174	EXPORT_SYMBOL(tcp_sendmsg);
				1175
				1176	/*
				1177	* Handle reading urgent data. BSD has very simple semantics for
				1178	* this, no blocking and very strange errors 8)
				1179	*/
				1180
				1181	static int tcp_recv_urg(struct sock sk, struct msghdr msg, int len, int flags)
				1182	{
				1183	struct tcp_sock *tp = tcp_sk(sk);
				1184	int retval = 0;
				1185
				1186	/* No URG data to read. */
				1187	if (sock_flag(sk, SOCK_URGINLINE) \|\| !tp->urg_data \|\|
				1188	tp->urg_data == TCP_URG_READ)
				1189	//return -EINVAL; /* Yes this is right ! */
				1190	return ERRNO_TRACK(-EINVAL);
				1191	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
				1192	//return -ENOTCONN;
				1193	return ERRNO_TRACK(-ENOTCONN);
				1194
				1195	if (tp->urg_data & TCP_URG_VALID) {
				1196	int err = 0;
				1197	char c = tp->urg_data;
				1198
				1199	if (!(flags & MSG_PEEK))
				1200	tp->urg_data = TCP_URG_READ;
				1201
				1202	/* Read urgent data. */
				1203	msg->msg_flags \|= MSG_OOB;
				1204
				1205	if (len > 0) {
				1206	if (!(flags & MSG_TRUNC))
				1207	err = memcpy_toiovec(msg->msg_iov, &c, 1);
				1208	len = 1;
				1209	} else
				1210	msg->msg_flags \|= MSG_TRUNC;
				1211	retval = err ? -EFAULT : len;
				1212	//return err ? -EFAULT : len;
				1213	return ERRNO_TRACK(retval);
				1214	}
				1215
				1216	if (sk->sk_state == TCP_CLOSE \|\| (sk->sk_shutdown & RCV_SHUTDOWN))
				1217	return 0;
				1218
				1219	/* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
				1220	* the available implementations agree in this case:
				1221	* this call should never block, independent of the
				1222	* blocking state of the socket.
				1223	* Mike <pall@rz.uni-karlsruhe.de>
				1224	*/
				1225	//return -EAGAIN;
				1226	//??????¨ª??¨®??¨¢??????????????¨¤????
				1227	return ERRNO_TRACK(-EAGAIN);
				1228
				1229	}
				1230
				1231	/* Clean up the receive buffer for full frames taken by the user,
				1232	* then send an ACK if necessary. COPIED is the number of bytes
				1233	* tcp_recvmsg has given to the user so far, it speeds up the
				1234	* calculation of whether or not we must ACK for the sake of
				1235	* a window update.
				1236	*/
				1237	void tcp_cleanup_rbuf(struct sock *sk, int copied)
				1238	{
				1239	struct tcp_sock *tp = tcp_sk(sk);
				1240	int time_to_ack = 0;
				1241
				1242	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
				1243
				1244	WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
				1245	"cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
				1246	tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
				1247
				1248	if (inet_csk_ack_scheduled(sk)) {
				1249	const struct inet_connection_sock *icsk = inet_csk(sk);
				1250	/* Delayed ACKs frequently hit locked sockets during bulk
				1251	* receive. */
				1252	if (icsk->icsk_ack.blocked \|\|
				1253	/* Once-per-two-segments ACK was not sent by tcp_input.c */
				1254	tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss \|\|
				1255	/*
				1256	* If this read emptied read buffer, we send ACK, if
				1257	* connection is not bidirectional, user drained
				1258	* receive buffer and there was a small segment
				1259	* in queue.
				1260	*/
				1261	(copied > 0 &&
				1262	((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) \|\|
				1263	((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
				1264	!icsk->icsk_ack.pingpong)) &&
				1265	!atomic_read(&sk->sk_rmem_alloc)))
				1266	time_to_ack = 1;
				1267	}
				1268
				1269	/* We send an ACK if we can now advertise a non-zero window
				1270	* which has been raised "significantly".
				1271	*
				1272	* Even if window raised up to infinity, do not send window open ACK
				1273	* in states, where we will not receive more. It is useless.
				1274	*/
				1275	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
				1276	__u32 rcv_window_now = tcp_receive_window(tp);
				1277
				1278	/* Optimize, __tcp_select_window() is not cheap. */
				1279	if (2*rcv_window_now <= tp->window_clamp) {
				1280	__u32 new_window = __tcp_select_window(sk);
				1281
				1282	/* Send ACK now, if this read freed lots of space
				1283	* in our buffer. Certainly, new_window is new window.
				1284	* We can advertise it now, if it is not less than current one.
				1285	* "Lots" means "at least twice" here.
				1286	*/
				1287	if (new_window && new_window >= 2 * rcv_window_now)
				1288	time_to_ack = 1;
				1289	}
				1290	}
				1291	if (time_to_ack)
				1292	tcp_send_ack(sk);
				1293	}
				1294
				1295	static void tcp_prequeue_process(struct sock *sk)
				1296	{
				1297	struct sk_buff *skb;
				1298	struct tcp_sock *tp = tcp_sk(sk);
				1299
				1300	NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPPREQUEUED);
				1301
				1302	/* RX process wants to run with disabled BHs, though it is not
				1303	* necessary */
				1304	local_bh_disable();
				1305	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
				1306	sk_backlog_rcv(sk, skb);
				1307	local_bh_enable();
				1308
				1309	/* Clear memory counter. */
				1310	tp->ucopy.memory = 0;
				1311	}
				1312
				1313	#ifdef CONFIG_NET_DMA
				1314	static void tcp_service_net_dma(struct sock *sk, bool wait)
				1315	{
				1316	dma_cookie_t done, used;
				1317	dma_cookie_t last_issued;
				1318	struct tcp_sock *tp = tcp_sk(sk);
				1319
				1320	if (!tp->ucopy.dma_chan)
				1321	return;
				1322
				1323	last_issued = tp->ucopy.dma_cookie;
				1324	dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
				1325
				1326	do {
				1327	if (dma_async_memcpy_complete(tp->ucopy.dma_chan,
				1328	last_issued, &done,
				1329	&used) == DMA_SUCCESS) {
				1330	/* Safe to free early-copied skbs now */
				1331	__skb_queue_purge(&sk->sk_async_wait_queue);
				1332	break;
				1333	} else {
				1334	struct sk_buff *skb;
				1335	while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
				1336	(dma_async_is_complete(skb->dma_cookie, done,
				1337	used) == DMA_SUCCESS)) {
				1338	__skb_dequeue(&sk->sk_async_wait_queue);
				1339	kfree_skb(skb);
				1340	}
				1341	}
				1342	} while (wait);
				1343	}
				1344	#endif
				1345
				1346	static inline struct sk_buff tcp_recv_skb(struct sock sk, u32 seq, u32 *off)
				1347	{
				1348	struct sk_buff *skb;
				1349	u32 offset;
				1350
				1351	skb_queue_walk(&sk->sk_receive_queue, skb) {
				1352	offset = seq - TCP_SKB_CB(skb)->seq;
				1353	if (tcp_hdr(skb)->syn)
				1354	offset--;
				1355	if (offset < skb->len \|\| tcp_hdr(skb)->fin) {
				1356	*off = offset;
				1357	return skb;
				1358	}
				1359	}
				1360	return NULL;
				1361	}
				1362
				1363	/*
				1364	* This routine provides an alternative to tcp_recvmsg() for routines
				1365	* that would like to handle copying from skbuffs directly in 'sendfile'
				1366	* fashion.
				1367	* Note:
				1368	* - It is assumed that the socket was locked by the caller.
				1369	* - The routine does not block.
				1370	* - At present, there is no support for reading OOB data
				1371	* or for 'peeking' the socket using this routine
				1372	* (although both would be easy to implement).
				1373	*/
				1374	int tcp_read_sock(struct sock sk, read_descriptor_t desc,
				1375	sk_read_actor_t recv_actor)
				1376	{
				1377	struct sk_buff *skb;
				1378	struct tcp_sock *tp = tcp_sk(sk);
				1379	u32 seq = tp->copied_seq;
				1380	u32 offset;
				1381	int copied = 0;
				1382
				1383	if (sk->sk_state == TCP_LISTEN)
				1384	//return -ENOTCONN;
				1385	return ERRNO_TRACK(-ENOTCONN);
				1386
				1387	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
				1388	if (offset < skb->len) {
				1389	int used;
				1390	size_t len;
				1391
				1392	len = skb->len - offset;
				1393	/* Stop reading if we hit a patch of urgent data */
				1394	if (tp->urg_data) {
				1395	u32 urg_offset = tp->urg_seq - seq;
				1396	if (urg_offset < len)
				1397	len = urg_offset;
				1398	if (!len)
				1399	break;
				1400	}
				1401	used = recv_actor(desc, skb, offset, len);
				1402	if (used < 0) {
				1403	if (!copied)
				1404	copied = used;
				1405	break;
				1406	} else if (used <= len) {
				1407	seq += used;
				1408	copied += used;
				1409	offset += used;
				1410	}
				1411	/*
				1412	* If recv_actor drops the lock (e.g. TCP splice
				1413	* receive) the skb pointer might be invalid when
				1414	* getting here: tcp_collapse might have deleted it
				1415	* while aggregating skbs from the socket queue.
				1416	*/
				1417	skb = tcp_recv_skb(sk, seq-1, &offset);
				1418	if (!skb \|\| (offset+1 != skb->len))
				1419	break;
				1420	}
				1421	if (tcp_hdr(skb)->fin) {
				1422	sk_eat_skb(sk, skb, 0);
				1423	++seq;
				1424	break;
				1425	}
				1426	sk_eat_skb(sk, skb, 0);
				1427	if (!desc->count)
				1428	break;
				1429	tp->copied_seq = seq;
				1430	}
				1431	tp->copied_seq = seq;
				1432
				1433	tcp_rcv_space_adjust(sk);
				1434
				1435	/* Clean up data we have read: This will do ACK frames. */
				1436	if (copied > 0) {
				1437	tcp_cleanup_rbuf(sk, copied);
				1438	uid_stat_tcp_rcv(current_uid(), copied);
				1439	}
				1440
				1441	return copied;
				1442	}
				1443	EXPORT_SYMBOL(tcp_read_sock);
				1444
				1445	/*
				1446	* This routine copies from a sock struct into the user buffer.
				1447	*
				1448	* Technical note: in 2.3 we work on _locked_ socket, so that
				1449	* tricks with *seq access order and skb->users are not required.
				1450	* Probably, code can be easily improved even more.
				1451	*/
				1452
				1453	int tcp_recvmsg(struct kiocb iocb, struct sock sk, struct msghdr *msg,
				1454	size_t len, int nonblock, int flags, int *addr_len)
				1455	{
				1456	struct tcp_sock *tp = tcp_sk(sk);
				1457	int copied = 0;
				1458	u32 peek_seq;
				1459	u32 *seq;
				1460	unsigned long used;
				1461	int err;
				1462	int target; /* Read at least this many bytes */
				1463	long timeo;
				1464	struct task_struct *user_recv = NULL;
				1465	int copied_early = 0;
				1466	struct sk_buff *skb;
				1467	u32 urg_hole = 0;
				1468
				1469	lock_sock(sk);
				1470
				1471	err = -ENOTCONN;
				1472	if (sk->sk_state == TCP_LISTEN)
				1473	goto out;
				1474
				1475	timeo = sock_rcvtimeo(sk, nonblock);
				1476
				1477	/* Urgent data needs to be handled specially. */
				1478	if (flags & MSG_OOB)
				1479	goto recv_urg;
				1480
				1481	seq = &tp->copied_seq;
				1482	if (flags & MSG_PEEK) {
				1483	peek_seq = tp->copied_seq;
				1484	seq = &peek_seq;
				1485	}
				1486
				1487	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
				1488
				1489	#ifdef CONFIG_NET_DMA
				1490	tp->ucopy.dma_chan = NULL;
				1491	preempt_disable();
				1492	skb = skb_peek_tail(&sk->sk_receive_queue);
				1493	{
				1494	int available = 0;
				1495
				1496	if (skb)
				1497	available = TCP_SKB_CB(skb)->seq + skb->len - (*seq);
				1498	if ((available < target) &&
				1499	(len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
				1500	!sysctl_tcp_low_latency &&
				1501	net_dma_find_channel()) {
				1502	preempt_enable_no_resched();
				1503	tp->ucopy.pinned_list =
				1504	dma_pin_iovec_pages(msg->msg_iov, len);
				1505	} else {
				1506	preempt_enable_no_resched();
				1507	}
				1508	}
				1509	#endif
				1510
				1511	do {
				1512	u32 offset;
				1513
				1514	/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
				1515	if (tp->urg_data && tp->urg_seq == *seq) {
				1516	if (copied)
				1517	break;
				1518	if (signal_pending(current)) {
				1519	copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
				1520	ERRNO_TRACK(-EAGAIN);
				1521	break;
				1522	}
				1523	}
				1524
				1525	/* Next get a buffer. */
				1526
				1527	skb_queue_walk(&sk->sk_receive_queue, skb) {
				1528	/* Now that we have two receive queues this
				1529	* shouldn't happen.
				1530	*/
				1531	if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
				1532	"recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n",
				1533	*seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
				1534	flags))
				1535	break;
				1536
				1537	offset = *seq - TCP_SKB_CB(skb)->seq;
				1538	if (tcp_hdr(skb)->syn)
				1539	offset--;
				1540	if (offset < skb->len)
				1541	goto found_ok_skb;
				1542	if (tcp_hdr(skb)->fin)
				1543	goto found_fin_ok;
				1544	WARN(!(flags & MSG_PEEK),
				1545	"recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
				1546	*seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
				1547	}
				1548
				1549	/* Well, if we have backlog, try to process it now yet. */
				1550
				1551	if (copied >= target && !sk->sk_backlog.tail)
				1552	break;
				1553
				1554	if (copied) {
				1555	if (sk->sk_err \|\|
				1556	sk->sk_state == TCP_CLOSE \|\|
				1557	(sk->sk_shutdown & RCV_SHUTDOWN) \|\|
				1558	!timeo \|\|
				1559	signal_pending(current))
				1560	break;
				1561	} else {
				1562	if (sock_flag(sk, SOCK_DONE))
				1563	break;
				1564
				1565	if (sk->sk_err) {
				1566	copied = sock_error(sk);
				1567	break;
				1568	}
				1569
				1570	if (sk->sk_shutdown & RCV_SHUTDOWN)
				1571	break;
				1572
				1573	if (sk->sk_state == TCP_CLOSE) {
				1574	if (!sock_flag(sk, SOCK_DONE)) {
				1575	/* This occurs when user tries to read
				1576	* from never connected socket.
				1577	*/
				1578	copied = -ENOTCONN;
				1579	break;
				1580	}
				1581	break;
				1582	}
				1583
				1584	if (!timeo) {
				1585	copied = ERRNO_TRACK(-EAGAIN);
				1586	break;
				1587	}
				1588
				1589	if (signal_pending(current)) {
				1590	copied = sock_intr_errno(timeo);
				1591	break;
				1592	}
				1593	}
				1594
				1595	tcp_cleanup_rbuf(sk, copied);
				1596
				1597	if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
				1598	/* Install new reader */
				1599	if (!user_recv && !(flags & (MSG_TRUNC \| MSG_PEEK))) {
				1600	user_recv = current;
				1601	tp->ucopy.task = user_recv;
				1602	tp->ucopy.iov = msg->msg_iov;
				1603	}
				1604
				1605	tp->ucopy.len = len;
				1606
				1607	WARN_ON(tp->copied_seq != tp->rcv_nxt &&
				1608	!(flags & (MSG_PEEK \| MSG_TRUNC)));
				1609
				1610	/* Ugly... If prequeue is not empty, we have to
				1611	* process it before releasing socket, otherwise
				1612	* order will be broken at second iteration.
				1613	* More elegant solution is required!!!
				1614	*
				1615	* Look: we have the following (pseudo)queues:
				1616	*
				1617	* 1. packets in flight
				1618	* 2. backlog
				1619	* 3. prequeue
				1620	* 4. receive_queue
				1621	*
				1622	* Each queue can be processed only if the next ones
				1623	* are empty. At this point we have empty receive_queue.
				1624	* But prequeue _can_ be not empty after 2nd iteration,
				1625	* when we jumped to start of loop because backlog
				1626	* processing added something to receive_queue.
				1627	* We cannot release_sock(), because backlog contains
				1628	* packets arrived _after_ prequeued ones.
				1629	*
				1630	* Shortly, algorithm is clear --- to process all
				1631	* the queues in order. We could make it more directly,
				1632	* requeueing packets from backlog to prequeue, if
				1633	* is not empty. It is more elegant, but eats cycles,
				1634	* unfortunately.
				1635	*/
				1636	if (!skb_queue_empty(&tp->ucopy.prequeue))
				1637	goto do_prequeue;
				1638
				1639	/* __ Set realtime policy in scheduler __ */
				1640	}
				1641
				1642	#ifdef CONFIG_NET_DMA
				1643	if (tp->ucopy.dma_chan) {
				1644	if (tp->rcv_wnd == 0 &&
				1645	!skb_queue_empty(&sk->sk_async_wait_queue)) {
				1646	tcp_service_net_dma(sk, true);
				1647	tcp_cleanup_rbuf(sk, copied);
				1648	} else
				1649	dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
				1650	}
				1651	#endif
				1652	if (copied >= target) {
				1653	/* Do not sleep, just process backlog. */
				1654	release_sock(sk);
				1655	lock_sock(sk);
				1656	} else
				1657	sk_wait_data(sk, &timeo);
				1658
				1659	#ifdef CONFIG_NET_DMA
				1660	tcp_service_net_dma(sk, false); /* Don't block */
				1661	tp->ucopy.wakeup = 0;
				1662	#endif
				1663
				1664	if (user_recv) {
				1665	int chunk;
				1666
				1667	/* __ Restore normal policy in scheduler __ */
				1668
				1669	if ((chunk = len - tp->ucopy.len) != 0) {
				1670	NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
				1671	len -= chunk;
				1672	copied += chunk;
				1673	}
				1674
				1675	if (tp->rcv_nxt == tp->copied_seq &&
				1676	!skb_queue_empty(&tp->ucopy.prequeue)) {
				1677	do_prequeue:
				1678	tcp_prequeue_process(sk);
				1679
				1680	if ((chunk = len - tp->ucopy.len) != 0) {
				1681	NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
				1682	len -= chunk;
				1683	copied += chunk;
				1684	}
				1685	}
				1686	}
				1687	if ((flags & MSG_PEEK) &&
				1688	(peek_seq - copied - urg_hole != tp->copied_seq)) {
				1689	if (net_ratelimit())
				1690	printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
				1691	current->comm, task_pid_nr(current));
				1692	peek_seq = tp->copied_seq;
				1693	}
				1694	continue;
				1695
				1696	found_ok_skb:
				1697	/* Ok so how much can we use? */
				1698	used = skb->len - offset;
				1699	if (len < used)
				1700	used = len;
				1701
				1702	/* Do we have urgent data here? */
				1703	if (tp->urg_data) {
				1704	u32 urg_offset = tp->urg_seq - *seq;
				1705	if (urg_offset < used) {
				1706	if (!urg_offset) {
				1707	if (!sock_flag(sk, SOCK_URGINLINE)) {
				1708	++*seq;
				1709	urg_hole++;
				1710	offset++;
				1711	used--;
				1712	if (!used)
				1713	goto skip_copy;
				1714	}
				1715	} else
				1716	used = urg_offset;
				1717	}
				1718	}
				1719
				1720	if (!(flags & MSG_TRUNC)) {
				1721	#ifdef CONFIG_NET_DMA
				1722	if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
				1723	tp->ucopy.dma_chan = net_dma_find_channel();
				1724
				1725	if (tp->ucopy.dma_chan) {
				1726	tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
				1727	tp->ucopy.dma_chan, skb, offset,
				1728	msg->msg_iov, used,
				1729	tp->ucopy.pinned_list);
				1730
				1731	if (tp->ucopy.dma_cookie < 0) {
				1732
				1733	pr_alert("%s: dma_cookie < 0\n",
				1734	__func__);
				1735
				1736	/* Exception. Bailout! */
				1737	if (!copied)
				1738	copied = -EFAULT;
				1739	break;
				1740	}
				1741
				1742	dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
				1743
				1744	if ((offset + used) == skb->len)
				1745	copied_early = 1;
				1746
				1747	} else
				1748	#endif
				1749	{
				1750	err = skb_copy_datagram_iovec(skb, offset,
				1751	msg->msg_iov, used);
				1752	if (err) {
				1753	/* Exception. Bailout! */
				1754	if (!copied)
				1755	copied = -EFAULT;
				1756	break;
				1757	}
				1758	}
				1759	}
				1760
				1761	*seq += used;
				1762	copied += used;
				1763	len -= used;
				1764
				1765	tcp_rcv_space_adjust(sk);
				1766
				1767	skip_copy:
				1768	if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
				1769	tp->urg_data = 0;
				1770	tcp_fast_path_check(sk);
				1771	}
				1772	if (used + offset < skb->len)
				1773	continue;
				1774
				1775	if (tcp_hdr(skb)->fin)
				1776	goto found_fin_ok;
				1777	if (!(flags & MSG_PEEK)) {
				1778	sk_eat_skb(sk, skb, copied_early);
				1779	copied_early = 0;
				1780	}
				1781	continue;
				1782
				1783	found_fin_ok:
				1784	/* Process the FIN. */
				1785	++*seq;
				1786	if (!(flags & MSG_PEEK)) {
				1787	sk_eat_skb(sk, skb, copied_early);
				1788	copied_early = 0;
				1789	}
				1790	break;
				1791	} while (len > 0);
				1792
				1793	if (user_recv) {
				1794	if (!skb_queue_empty(&tp->ucopy.prequeue)) {
				1795	int chunk;
				1796
				1797	tp->ucopy.len = copied > 0 ? len : 0;
				1798
				1799	tcp_prequeue_process(sk);
				1800
				1801	if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
				1802	NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
				1803	len -= chunk;
				1804	copied += chunk;
				1805	}
				1806	}
				1807
				1808	tp->ucopy.task = NULL;
				1809	tp->ucopy.len = 0;
				1810	}
				1811
				1812	#ifdef CONFIG_NET_DMA
				1813	tcp_service_net_dma(sk, true); /* Wait for queue to drain */
				1814	tp->ucopy.dma_chan = NULL;
				1815
				1816	if (tp->ucopy.pinned_list) {
				1817	dma_unpin_iovec_pages(tp->ucopy.pinned_list);
				1818	tp->ucopy.pinned_list = NULL;
				1819	}
				1820	#endif
				1821
				1822	/* According to UNIX98, msg_name/msg_namelen are ignored
				1823	* on connected socket. I was just happy when found this 8) --ANK
				1824	*/
				1825
				1826	/* Clean up data we have read: This will do ACK frames. */
				1827	tcp_cleanup_rbuf(sk, copied);
				1828
				1829	release_sock(sk);
				1830
				1831	if (copied > 0)
				1832	uid_stat_tcp_rcv(current_uid(), copied);
				1833	//return copied;
				1834	return ERRNO_TRACK(copied);
				1835
				1836	out:
				1837	release_sock(sk);
				1838	//return err;
				1839	return ERRNO_TRACK(err);
				1840
				1841	recv_urg:
				1842	err = tcp_recv_urg(sk, msg, len, flags);
				1843	if (err > 0)
				1844	uid_stat_tcp_rcv(current_uid(), err);
				1845	goto out;
				1846	}
				1847	EXPORT_SYMBOL(tcp_recvmsg);
				1848
				1849	void tcp_set_state(struct sock *sk, int state)
				1850	{
				1851	int oldstate = sk->sk_state;
				1852
				1853	switch (state) {
				1854	case TCP_ESTABLISHED:
				1855	if (oldstate != TCP_ESTABLISHED)
				1856	TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
				1857	break;
				1858
				1859	case TCP_CLOSE:
				1860	if (oldstate == TCP_CLOSE_WAIT \|\| oldstate == TCP_ESTABLISHED)
				1861	TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);
				1862
				1863	sk->sk_prot->unhash(sk);
				1864	if (inet_csk(sk)->icsk_bind_hash &&
				1865	!(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
				1866	inet_put_port(sk);
				1867	/* fall through */
				1868	default:
				1869	if (oldstate == TCP_ESTABLISHED)
				1870	TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
				1871	}
				1872
				1873	/* Change state AFTER socket is unhashed to avoid closed
				1874	* socket sitting in hash tables.
				1875	*/
				1876	sk->sk_state = state;
				1877
				1878	#ifdef STATE_TRACE
				1879	SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
				1880	#endif
				1881	}
				1882	EXPORT_SYMBOL_GPL(tcp_set_state);
				1883
				1884	/*
				1885	* State processing on a close. This implements the state shift for
				1886	* sending our FIN frame. Note that we only send a FIN for some
				1887	* states. A shutdown() may have already sent the FIN, or we may be
				1888	* closed.
				1889	*/
				1890
				1891	static const unsigned char new_state[16] = {
				1892	/* current state: new state: action: */
				1893	/* (Invalid) */ TCP_CLOSE,
				1894	/* TCP_ESTABLISHED */ TCP_FIN_WAIT1 \| TCP_ACTION_FIN,
				1895	/* TCP_SYN_SENT */ TCP_CLOSE,
				1896	/* TCP_SYN_RECV */ TCP_FIN_WAIT1 \| TCP_ACTION_FIN,
				1897	/* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1,
				1898	/* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2,
				1899	/* TCP_TIME_WAIT */ TCP_CLOSE,
				1900	/* TCP_CLOSE */ TCP_CLOSE,
				1901	/* TCP_CLOSE_WAIT */ TCP_LAST_ACK \| TCP_ACTION_FIN,
				1902	/* TCP_LAST_ACK */ TCP_LAST_ACK,
				1903	/* TCP_LISTEN */ TCP_CLOSE,
				1904	/* TCP_CLOSING */ TCP_CLOSING,
				1905	};
				1906
				1907	static int tcp_close_state(struct sock *sk)
				1908	{
				1909	int next = (int)new_state[sk->sk_state];
				1910	int ns = next & TCP_STATE_MASK;
				1911
				1912	tcp_set_state(sk, ns);
				1913
				1914	return next & TCP_ACTION_FIN;
				1915	}
				1916
				1917	/*
				1918	* Shutdown the sending side of a connection. Much like close except
				1919	* that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD).
				1920	*/
				1921
				1922	void tcp_shutdown(struct sock *sk, int how)
				1923	{
				1924	/* We need to grab some memory, and put together a FIN,
				1925	* and then put it into the queue to be sent.
				1926	* Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
				1927	*/
				1928	if (!(how & SEND_SHUTDOWN))
				1929	return;
				1930
				1931	/* If we've already sent a FIN, or it's a closed state, skip this. */
				1932	if ((1 << sk->sk_state) &
				1933	(TCPF_ESTABLISHED \| TCPF_SYN_SENT \|
				1934	TCPF_SYN_RECV \| TCPF_CLOSE_WAIT)) {
				1935	/* Clear out any half completed packets. FIN if needed. */
				1936	if (tcp_close_state(sk))
				1937	tcp_send_fin(sk);
				1938	}
				1939	}
				1940	EXPORT_SYMBOL(tcp_shutdown);
				1941
				1942	bool tcp_check_oom(struct sock *sk, int shift)
				1943	{
				1944	bool too_many_orphans, out_of_socket_memory;
				1945
				1946	too_many_orphans = tcp_too_many_orphans(sk, shift);
				1947	out_of_socket_memory = tcp_out_of_memory(sk);
				1948
				1949	if (too_many_orphans && net_ratelimit())
				1950	pr_info("too many orphaned sockets\n");
				1951	if (out_of_socket_memory && net_ratelimit())
				1952	pr_info("out of memory -- consider tuning tcp_mem\n");
				1953	return too_many_orphans \|\| out_of_socket_memory;
				1954	}
				1955
				1956	void tcp_close(struct sock *sk, long timeout)
				1957	{
				1958	struct sk_buff *skb;
				1959	int data_was_unread = 0;
				1960	int state;
				1961
				1962	lock_sock(sk);
				1963	sk->sk_shutdown = SHUTDOWN_MASK;
				1964
				1965	if (sk->sk_state == TCP_LISTEN) {
				1966	tcp_set_state(sk, TCP_CLOSE);
				1967
				1968	/* Special case. */
				1969	inet_csk_listen_stop(sk);
				1970
				1971	goto adjudge_to_death;
				1972	}
				1973
				1974	/* We need to flush the recv. buffs. We do this only on the
				1975	* descriptor close, not protocol-sourced closes, because the
				1976	* reader process may not have drained the data yet!
				1977	*/
				1978	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
				1979	u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
				1980	tcp_hdr(skb)->fin;
				1981	data_was_unread += len;
				1982	__kfree_skb(skb);
				1983	}
				1984
				1985	sk_mem_reclaim(sk);
				1986
				1987	/* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
				1988	if (sk->sk_state == TCP_CLOSE)
				1989	goto adjudge_to_death;
				1990
				1991	/* As outlined in RFC 2525, section 2.17, we send a RST here because
				1992	* data was lost. To witness the awful effects of the old behavior of
				1993	* always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
				1994	* GET in an FTP client, suspend the process, wait for the client to
				1995	* advertise a zero window, then kill -9 the FTP client, wheee...
				1996	* Note: timeout is always zero in such a case.
				1997	*/
				1998	if (data_was_unread) {
				1999	/* Unread data was tossed, zap the connection. */
				2000	NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
				2001	tcp_set_state(sk, TCP_CLOSE);
				2002	tcp_send_active_reset(sk, sk->sk_allocation);
				2003	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
				2004	/* Check zero linger _after_ checking for unread data. */
				2005	sk->sk_prot->disconnect(sk, 0);
				2006	NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
				2007	} else if (tcp_close_state(sk)) {
				2008	/* We FIN if the application ate all the data before
				2009	* zapping the connection.
				2010	*/
				2011
				2012	/* RED-PEN. Formally speaking, we have broken TCP state
				2013	* machine. State transitions:
				2014	*
				2015	* TCP_ESTABLISHED -> TCP_FIN_WAIT1
				2016	* TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
				2017	* TCP_CLOSE_WAIT -> TCP_LAST_ACK
				2018	*
				2019	* are legal only when FIN has been sent (i.e. in window),
				2020	* rather than queued out of window. Purists blame.
				2021	*
				2022	* F.e. "RFC state" is ESTABLISHED,
				2023	* if Linux state is FIN-WAIT-1, but FIN is still not sent.
				2024	*
				2025	* The visible declinations are that sometimes
				2026	* we enter time-wait state, when it is not required really
				2027	* (harmless), do not send active resets, when they are
				2028	* required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
				2029	* they look as CLOSING or LAST_ACK for Linux)
				2030	* Probably, I missed some more holelets.
				2031	* --ANK
				2032	*/
				2033	tcp_send_fin(sk);
				2034	}
				2035
				2036	sk_stream_wait_close(sk, timeout);
				2037
				2038	adjudge_to_death:
				2039	state = sk->sk_state;
				2040	sock_hold(sk);
				2041	sock_orphan(sk);
				2042
				2043	/* It is the last release_sock in its life. It will remove backlog. */
				2044	release_sock(sk);
				2045
				2046
				2047	/* Now socket is owned by kernel and we acquire BH lock
				2048	to finish close. No need to check for user refs.
				2049	*/
				2050	local_bh_disable();
				2051	bh_lock_sock(sk);
				2052	WARN_ON(sock_owned_by_user(sk));
				2053
				2054	percpu_counter_inc(sk->sk_prot->orphan_count);
				2055
				2056	/* Have we already been destroyed by a softirq or backlog? */
				2057	if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
				2058	goto out;
				2059
				2060	/* This is a (useful) BSD violating of the RFC. There is a
				2061	* problem with TCP as specified in that the other end could
				2062	* keep a socket open forever with no application left this end.
				2063	* We use a 3 minute timeout (about the same as BSD) then kill
				2064	* our end. If they send after that then tough - BUT: long enough
				2065	* that we won't make the old 4*rto = almost no time - whoops
				2066	* reset mistake.
				2067	*
				2068	* Nope, it was not mistake. It is really desired behaviour
				2069	* f.e. on http servers, when such sockets are useless, but
				2070	* consume significant resources. Let's do it with special
				2071	* linger2 option. --ANK
				2072	*/
				2073
				2074	if (sk->sk_state == TCP_FIN_WAIT2) {
				2075	struct tcp_sock *tp = tcp_sk(sk);
				2076	if (tp->linger2 < 0) {
				2077	tcp_set_state(sk, TCP_CLOSE);
				2078	tcp_send_active_reset(sk, GFP_ATOMIC);
				2079	NET_INC_STATS_BH(sock_net(sk),
				2080	LINUX_MIB_TCPABORTONLINGER);
				2081	} else {
				2082	const int tmo = tcp_fin_time(sk);
				2083
				2084	if (tmo > TCP_TIMEWAIT_LEN) {
				2085	inet_csk_reset_keepalive_timer(sk,
				2086	tmo - TCP_TIMEWAIT_LEN);
				2087	} else {
				2088	tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
				2089	goto out;
				2090	}
				2091	}
				2092	}
				2093	if (sk->sk_state != TCP_CLOSE) {
				2094	sk_mem_reclaim(sk);
				2095	if (tcp_check_oom(sk, 0)) {
				2096	tcp_set_state(sk, TCP_CLOSE);
				2097	tcp_send_active_reset(sk, GFP_ATOMIC);
				2098	NET_INC_STATS_BH(sock_net(sk),
				2099	LINUX_MIB_TCPABORTONMEMORY);
				2100	}
				2101	}
				2102
				2103	if (sk->sk_state == TCP_CLOSE)
				2104	inet_csk_destroy_sock(sk);
				2105	/* Otherwise, socket is reprieved until protocol close. */
				2106
				2107	out:
				2108	bh_unlock_sock(sk);
				2109	local_bh_enable();
				2110	sock_put(sk);
				2111	}
				2112	EXPORT_SYMBOL(tcp_close);
				2113
				2114	/* These states need RST on ABORT according to RFC793 */
				2115
				2116	static inline int tcp_need_reset(int state)
				2117	{
				2118	return (1 << state) &
				2119	(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT \| TCPF_FIN_WAIT1 \|
				2120	TCPF_FIN_WAIT2 \| TCPF_SYN_RECV);
				2121	}
				2122
				2123	int tcp_disconnect(struct sock *sk, int flags)
				2124	{
				2125	struct inet_sock *inet = inet_sk(sk);
				2126	struct inet_connection_sock *icsk = inet_csk(sk);
				2127	struct tcp_sock *tp = tcp_sk(sk);
				2128	int err = 0;
				2129	int old_state = sk->sk_state;
				2130
				2131	if (old_state != TCP_CLOSE)
				2132	tcp_set_state(sk, TCP_CLOSE);
				2133
				2134	/* ABORT function of RFC793 */
				2135	if (old_state == TCP_LISTEN) {
				2136	inet_csk_listen_stop(sk);
				2137	} else if (tcp_need_reset(old_state) \|\|
				2138	(tp->snd_nxt != tp->write_seq &&
				2139	(1 << old_state) & (TCPF_CLOSING \| TCPF_LAST_ACK))) {
				2140	/* The last check adjusts for discrepancy of Linux wrt. RFC
				2141	* states
				2142	*/
				2143	tcp_send_active_reset(sk, gfp_any());
				2144	sk->sk_err = ECONNRESET;
				2145	} else if (old_state == TCP_SYN_SENT)
				2146	sk->sk_err = ECONNRESET;
				2147
				2148	tcp_clear_xmit_timers(sk);
				2149	__skb_queue_purge(&sk->sk_receive_queue);
				2150	tcp_write_queue_purge(sk);
				2151	__skb_queue_purge(&tp->out_of_order_queue);
				2152	#ifdef CONFIG_NET_DMA
				2153	__skb_queue_purge(&sk->sk_async_wait_queue);
				2154	#endif
				2155
				2156	inet->inet_dport = 0;
				2157
				2158	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
				2159	inet_reset_saddr(sk);
				2160
				2161	sk->sk_shutdown = 0;
				2162	sock_reset_flag(sk, SOCK_DONE);
				2163	tp->srtt = 0;
				2164	if ((tp->write_seq += tp->max_window + 2) == 0)
				2165	tp->write_seq = 1;
				2166	icsk->icsk_backoff = 0;
				2167	tp->snd_cwnd = 2;
				2168	icsk->icsk_probes_out = 0;
				2169	tp->packets_out = 0;
				2170	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
				2171	tp->snd_cwnd_cnt = 0;
				2172	tp->bytes_acked = 0;
				2173	tp->window_clamp = 0;
				2174	tcp_set_ca_state(sk, TCP_CA_Open);
				2175	tcp_clear_retrans(tp);
				2176	inet_csk_delack_init(sk);
lh	758261d	2023-07-13 05:52:04 -0700	[diff] [blame]	2177	/* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0
				2178	* issue in __tcp_select_window()
				2179	*/
				2180	icsk->icsk_ack.rcv_mss = TCP_MIN_MSS; //CVE-2017-14106(BDSA-2017-1152)
lh	9ed821d	2023-04-07 01:36:19 -0700	[diff] [blame]	2181	tcp_init_send_head(sk);
				2182	memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
				2183	__sk_dst_reset(sk);
				2184
				2185	WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
				2186
				2187	sk->sk_error_report(sk);
				2188	//return err;
				2189	return ERRNO_TRACK(err);
				2190	}
				2191	EXPORT_SYMBOL(tcp_disconnect);
				2192
				2193	/*
				2194	* Socket option code for TCP.
				2195	*/
				2196	static int do_tcp_setsockopt(struct sock *sk, int level,
				2197	int optname, char __user *optval, unsigned int optlen)
				2198	{
				2199	struct tcp_sock *tp = tcp_sk(sk);
				2200	struct inet_connection_sock *icsk = inet_csk(sk);
				2201	int val;
				2202	int err = 0;
				2203
				2204	/* These are data/string values, all the others are ints */
				2205	switch (optname) {
				2206	case TCP_CONGESTION: {
				2207	char name[TCP_CA_NAME_MAX];
				2208
				2209	if (optlen < 1)
				2210	return ERRNO_TRACK(-EINVAL);
				2211	//return -EINVAL;
				2212
				2213	val = strncpy_from_user(name, optval,
				2214	min_t(long, TCP_CA_NAME_MAX-1, optlen));
				2215	if (val < 0)
				2216	//return -EFAULT;
				2217	return ERRNO_TRACK(-EFAULT);
				2218	name[val] = 0;
				2219
				2220	lock_sock(sk);
				2221	err = tcp_set_congestion_control(sk, name);
				2222	release_sock(sk);
				2223	//return err;
				2224	return ERRNO_TRACK(err);
				2225	}
				2226	case TCP_COOKIE_TRANSACTIONS: {
				2227	struct tcp_cookie_transactions ctd;
				2228	struct tcp_cookie_values *cvp = NULL;
				2229
				2230	if (sizeof(ctd) > optlen)
				2231	//return -EINVAL;
				2232	return ERRNO_TRACK(-EINVAL);
				2233	if (copy_from_user(&ctd, optval, sizeof(ctd)))
				2234	//return -EFAULT;
				2235	return ERRNO_TRACK(-EFAULT);
				2236
				2237	if (ctd.tcpct_used > sizeof(ctd.tcpct_value) \|\|
				2238	ctd.tcpct_s_data_desired > TCP_MSS_DESIRED)
				2239	//return -EINVAL;
				2240	return ERRNO_TRACK(-EINVAL);
				2241
				2242	if (ctd.tcpct_cookie_desired == 0) {
				2243	/* default to global value */
				2244	} else if ((0x1 & ctd.tcpct_cookie_desired) \|\|
				2245	ctd.tcpct_cookie_desired > TCP_COOKIE_MAX \|\|
				2246	ctd.tcpct_cookie_desired < TCP_COOKIE_MIN) {
				2247	//return -EINVAL;
				2248	return ERRNO_TRACK(-EINVAL);
				2249	}
				2250
				2251	if (TCP_COOKIE_OUT_NEVER & ctd.tcpct_flags) {
				2252	/* Supercedes all other values */
				2253	lock_sock(sk);
				2254	if (tp->cookie_values != NULL) {
				2255	kref_put(&tp->cookie_values->kref,
				2256	tcp_cookie_values_release);
				2257	tp->cookie_values = NULL;
				2258	}
				2259	tp->rx_opt.cookie_in_always = 0; /* false */
				2260	tp->rx_opt.cookie_out_never = 1; /* true */
				2261	release_sock(sk);
				2262	//return err;
				2263	return ERRNO_TRACK(err);
				2264	}
				2265
				2266	/* Allocate ancillary memory before locking.
				2267	*/
				2268	if (ctd.tcpct_used > 0 \|\|
				2269	(tp->cookie_values == NULL &&
				2270	(sysctl_tcp_cookie_size > 0 \|\|
				2271	ctd.tcpct_cookie_desired > 0 \|\|
				2272	ctd.tcpct_s_data_desired > 0))) {
				2273	cvp = kzalloc(sizeof(*cvp) + ctd.tcpct_used,
				2274	GFP_KERNEL);
				2275	if (cvp == NULL)
				2276	//return -ENOMEM;
				2277	return ERRNO_TRACK(-ENOMEM);
				2278
				2279	kref_init(&cvp->kref);
				2280	}
				2281	lock_sock(sk);
				2282	tp->rx_opt.cookie_in_always =
				2283	(TCP_COOKIE_IN_ALWAYS & ctd.tcpct_flags);
				2284	tp->rx_opt.cookie_out_never = 0; /* false */
				2285
				2286	if (tp->cookie_values != NULL) {
				2287	if (cvp != NULL) {
				2288	/* Changed values are recorded by a changed
				2289	* pointer, ensuring the cookie will differ,
				2290	* without separately hashing each value later.
				2291	*/
				2292	kref_put(&tp->cookie_values->kref,
				2293	tcp_cookie_values_release);
				2294	} else {
				2295	cvp = tp->cookie_values;
				2296	}
				2297	}
				2298
				2299	if (cvp != NULL) {
				2300	cvp->cookie_desired = ctd.tcpct_cookie_desired;
				2301
				2302	if (ctd.tcpct_used > 0) {
				2303	memcpy(cvp->s_data_payload, ctd.tcpct_value,
				2304	ctd.tcpct_used);
				2305	cvp->s_data_desired = ctd.tcpct_used;
				2306	cvp->s_data_constant = 1; /* true */
				2307	} else {
				2308	/* No constant payload data. */
				2309	cvp->s_data_desired = ctd.tcpct_s_data_desired;
				2310	cvp->s_data_constant = 0; /* false */
				2311	}
				2312
				2313	tp->cookie_values = cvp;
				2314	}
				2315	release_sock(sk);
				2316	//return err;
				2317	return ERRNO_TRACK(err);
				2318	}
				2319	default:
				2320	/* fallthru */
				2321	break;
				2322	}
				2323
				2324	if (optlen < sizeof(int))
				2325	//return -EINVAL;
				2326	return ERRNO_TRACK(-EINVAL);
				2327	if (get_user(val, (int __user *)optval))
				2328	//return -EFAULT;
				2329	return ERRNO_TRACK(-EFAULT);
				2330
				2331	lock_sock(sk);
				2332
				2333	switch (optname) {
				2334	case TCP_MAXSEG:
				2335	/* Values greater than interface MTU won't take effect. However
				2336	* at the point when this call is done we typically don't yet
				2337	* know which interface is going to be used */
				2338	if (val < TCP_MIN_MSS \|\| val > MAX_TCP_WINDOW) {
				2339	err = -EINVAL;
				2340	break;
				2341	}
				2342	tp->rx_opt.user_mss = val;
				2343	break;
				2344
				2345	case TCP_NODELAY:
				2346	if (val) {
				2347	/* TCP_NODELAY is weaker than TCP_CORK, so that
				2348	* this option on corked socket is remembered, but
				2349	* it is not activated until cork is cleared.
				2350	*
				2351	* However, when TCP_NODELAY is set we make
				2352	* an explicit push, which overrides even TCP_CORK
				2353	* for currently queued segments.
				2354	*/
				2355	tp->nonagle \|= TCP_NAGLE_OFF\|TCP_NAGLE_PUSH;
				2356	tcp_push_pending_frames(sk);
				2357	} else {
				2358	tp->nonagle &= ~TCP_NAGLE_OFF;
				2359	}
				2360	break;
				2361
				2362	case TCP_THIN_LINEAR_TIMEOUTS:
				2363	if (val < 0 \|\| val > 1)
				2364	err = -EINVAL;
				2365	else
				2366	tp->thin_lto = val;
				2367	break;
				2368
				2369	case TCP_THIN_DUPACK:
				2370	if (val < 0 \|\| val > 1)
				2371	err = -EINVAL;
				2372	else
				2373	tp->thin_dupack = val;
				2374	break;
				2375
				2376	case TCP_CORK:
				2377	/* When set indicates to always queue non-full frames.
				2378	* Later the user clears this option and we transmit
				2379	* any pending partial frames in the queue. This is
				2380	* meant to be used alongside sendfile() to get properly
				2381	* filled frames when the user (for example) must write
				2382	* out headers with a write() call first and then use
				2383	* sendfile to send out the data parts.
				2384	*
				2385	* TCP_CORK can be set together with TCP_NODELAY and it is
				2386	* stronger than TCP_NODELAY.
				2387	*/
				2388	if (val) {
				2389	tp->nonagle \|= TCP_NAGLE_CORK;
				2390	} else {
				2391	tp->nonagle &= ~TCP_NAGLE_CORK;
				2392	if (tp->nonagle&TCP_NAGLE_OFF)
				2393	tp->nonagle \|= TCP_NAGLE_PUSH;
				2394	tcp_push_pending_frames(sk);
				2395	}
				2396	break;
				2397
				2398	case TCP_KEEPIDLE:
				2399	if (val < 1 \|\| val > MAX_TCP_KEEPIDLE)
				2400	err = -EINVAL;
				2401	else {
				2402	tp->keepalive_time = val * HZ;
				2403	if (sock_flag(sk, SOCK_KEEPOPEN) &&
				2404	!((1 << sk->sk_state) &
				2405	(TCPF_CLOSE \| TCPF_LISTEN))) {
				2406	u32 elapsed = keepalive_time_elapsed(tp);
				2407	if (tp->keepalive_time > elapsed)
				2408	elapsed = tp->keepalive_time - elapsed;
				2409	else
				2410	elapsed = 0;
				2411	inet_csk_reset_keepalive_timer(sk, elapsed);
				2412	}
				2413	}
				2414	break;
				2415	case TCP_KEEPINTVL:
				2416	if (val < 1 \|\| val > MAX_TCP_KEEPINTVL)
				2417	err = -EINVAL;
				2418	else
				2419	tp->keepalive_intvl = val * HZ;
				2420	break;
				2421	case TCP_KEEPCNT:
				2422	if (val < 1 \|\| val > MAX_TCP_KEEPCNT)
				2423	err = -EINVAL;
				2424	else
				2425	tp->keepalive_probes = val;
				2426	break;
				2427	case TCP_SYNCNT:
				2428	if (val < 1 \|\| val > MAX_TCP_SYNCNT)
				2429	err = -EINVAL;
				2430	else
				2431	icsk->icsk_syn_retries = val;
				2432	break;
				2433
				2434	case TCP_LINGER2:
				2435	if (val < 0)
				2436	tp->linger2 = -1;
				2437	else if (val > sysctl_tcp_fin_timeout / HZ)
				2438	tp->linger2 = 0;
				2439	else
				2440	tp->linger2 = val * HZ;
				2441	break;
				2442
				2443	case TCP_DEFER_ACCEPT:
				2444	/* Translate value in seconds to number of retransmits */
				2445	icsk->icsk_accept_queue.rskq_defer_accept =
				2446	secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
				2447	TCP_RTO_MAX / HZ);
				2448	break;
				2449
				2450	case TCP_WINDOW_CLAMP:
				2451	if (!val) {
				2452	if (sk->sk_state != TCP_CLOSE) {
				2453	err = -EINVAL;
				2454	break;
				2455	}
				2456	tp->window_clamp = 0;
				2457	} else
				2458	tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
				2459	SOCK_MIN_RCVBUF / 2 : val;
				2460	break;
				2461
				2462	case TCP_QUICKACK:
				2463	if (!val) {
				2464	icsk->icsk_ack.pingpong = 1;
				2465	} else {
				2466	icsk->icsk_ack.pingpong = 0;
				2467	if ((1 << sk->sk_state) &
				2468	(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT) &&
				2469	inet_csk_ack_scheduled(sk)) {
				2470	icsk->icsk_ack.pending \|= ICSK_ACK_PUSHED;
				2471	tcp_cleanup_rbuf(sk, 1);
				2472	if (!(val & 1))
				2473	icsk->icsk_ack.pingpong = 1;
				2474	}
				2475	}
				2476	break;
				2477
				2478	#ifdef CONFIG_TCP_MD5SIG
				2479	case TCP_MD5SIG:
				2480	/* Read the IP->Key mappings from userspace */
				2481	err = tp->af_specific->md5_parse(sk, optval, optlen);
				2482	break;
				2483	#endif
				2484	case TCP_USER_TIMEOUT:
				2485	/* Cap the max timeout in ms TCP will retry/retrans
				2486	* before giving up and aborting (ETIMEDOUT) a connection.
				2487	*/
				2488	if (val < 0)
				2489	err = -EINVAL;
				2490	else
				2491	icsk->icsk_user_timeout = msecs_to_jiffies(val);
				2492	break;
				2493	default:
				2494	err = -ENOPROTOOPT;
				2495	break;
				2496	}
				2497
				2498	release_sock(sk);
				2499	//return err;
				2500	return ERRNO_TRACK(err);
				2501	}
				2502
				2503	int tcp_setsockopt(struct sock sk, int level, int optname, char __user optval,
				2504	unsigned int optlen)
				2505	{
				2506	const struct inet_connection_sock *icsk = inet_csk(sk);
				2507	int retval = 0;
				2508	if (level != SOL_TCP)
				2509	{
				2510	retval = icsk->icsk_af_ops->setsockopt(sk, level, optname,
				2511	optval, optlen);
				2512	return ERRNO_TRACK(retval);
				2513	/*return icsk->icsk_af_ops->setsockopt(sk, level, optname,
				2514	optval, optlen);*/
				2515	}
				2516	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
				2517	}
				2518	EXPORT_SYMBOL(tcp_setsockopt);
				2519
				2520	#ifdef CONFIG_COMPAT
				2521	int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
				2522	char __user *optval, unsigned int optlen)
				2523	{
				2524	int retval = 0;
				2525	if (level != SOL_TCP){
				2526	retval = inet_csk_compat_setsockopt(sk, level, optname,
				2527	optval, optlen);
				2528	return ERRNO_TRACK(retval);
				2529	/*return inet_csk_compat_setsockopt(sk, level, optname,
				2530	optval, optlen);*/
				2531	}
				2532	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
				2533	}
				2534	EXPORT_SYMBOL(compat_tcp_setsockopt);
				2535	#endif
				2536
				2537	/* Return information about state of tcp endpoint in API format. */
				2538	void tcp_get_info(const struct sock sk, struct tcp_info info)
				2539	{
				2540	const struct tcp_sock *tp = tcp_sk(sk);
				2541	const struct inet_connection_sock *icsk = inet_csk(sk);
				2542	u32 now = tcp_time_stamp;
				2543
				2544	memset(info, 0, sizeof(*info));
				2545
				2546	info->tcpi_state = sk->sk_state;
				2547	info->tcpi_ca_state = icsk->icsk_ca_state;
				2548	info->tcpi_retransmits = icsk->icsk_retransmits;
				2549	info->tcpi_probes = icsk->icsk_probes_out;
				2550	info->tcpi_backoff = icsk->icsk_backoff;
				2551
				2552	if (tp->rx_opt.tstamp_ok)
				2553	info->tcpi_options \|= TCPI_OPT_TIMESTAMPS;
				2554	if (tcp_is_sack(tp))
				2555	info->tcpi_options \|= TCPI_OPT_SACK;
				2556	if (tp->rx_opt.wscale_ok) {
				2557	info->tcpi_options \|= TCPI_OPT_WSCALE;
				2558	info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
				2559	info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
				2560	}
				2561
				2562	if (tp->ecn_flags & TCP_ECN_OK)
				2563	info->tcpi_options \|= TCPI_OPT_ECN;
				2564	if (tp->ecn_flags & TCP_ECN_SEEN)
				2565	info->tcpi_options \|= TCPI_OPT_ECN_SEEN;
				2566
				2567	info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
				2568	info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
				2569	info->tcpi_snd_mss = tp->mss_cache;
				2570	info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
				2571
				2572	if (sk->sk_state == TCP_LISTEN) {
				2573	info->tcpi_unacked = sk->sk_ack_backlog;
				2574	info->tcpi_sacked = sk->sk_max_ack_backlog;
				2575	} else {
				2576	info->tcpi_unacked = tp->packets_out;
				2577	info->tcpi_sacked = tp->sacked_out;
				2578	}
				2579	info->tcpi_lost = tp->lost_out;
				2580	info->tcpi_retrans = tp->retrans_out;
				2581	info->tcpi_fackets = tp->fackets_out;
				2582
				2583	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
				2584	info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
				2585	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
				2586
				2587	info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
				2588	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
				2589	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
				2590	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
				2591	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
				2592	info->tcpi_snd_cwnd = tp->snd_cwnd;
				2593	info->tcpi_advmss = tp->advmss;
				2594	info->tcpi_reordering = tp->reordering;
				2595
				2596	info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
				2597	info->tcpi_rcv_space = tp->rcvq_space.space;
				2598
				2599	info->tcpi_total_retrans = tp->total_retrans;
				2600	}
				2601	EXPORT_SYMBOL_GPL(tcp_get_info);
				2602
				2603	static int do_tcp_getsockopt(struct sock *sk, int level,
				2604	int optname, char __user optval, int __user optlen)
				2605	{
				2606	struct inet_connection_sock *icsk = inet_csk(sk);
				2607	struct tcp_sock *tp = tcp_sk(sk);
				2608	int val, len;
				2609
				2610	if (get_user(len, optlen))
				2611	//return -EFAULT;
				2612	return ERRNO_TRACK(-EFAULT);
				2613
				2614	len = min_t(unsigned int, len, sizeof(int));
				2615
				2616	if (len < 0)
				2617	//return -EINVAL;
				2618	return ERRNO_TRACK(-EINVAL);
				2619
				2620	switch (optname) {
				2621	case TCP_MAXSEG:
				2622	val = tp->mss_cache;
				2623	if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE \| TCPF_LISTEN)))
				2624	val = tp->rx_opt.user_mss;
				2625	break;
				2626	case TCP_NODELAY:
				2627	val = !!(tp->nonagle&TCP_NAGLE_OFF);
				2628	break;
				2629	case TCP_CORK:
				2630	val = !!(tp->nonagle&TCP_NAGLE_CORK);
				2631	break;
				2632	case TCP_KEEPIDLE:
				2633	val = keepalive_time_when(tp) / HZ;
				2634	break;
				2635	case TCP_KEEPINTVL:
				2636	val = keepalive_intvl_when(tp) / HZ;
				2637	break;
				2638	case TCP_KEEPCNT:
				2639	val = keepalive_probes(tp);
				2640	break;
				2641	case TCP_SYNCNT:
				2642	val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
				2643	break;
				2644	case TCP_LINGER2:
				2645	val = tp->linger2;
				2646	if (val >= 0)
				2647	val = (val ? : sysctl_tcp_fin_timeout) / HZ;
				2648	break;
				2649	case TCP_DEFER_ACCEPT:
				2650	val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
				2651	TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
				2652	break;
				2653	case TCP_WINDOW_CLAMP:
				2654	val = tp->window_clamp;
				2655	break;
				2656	case TCP_INFO: {
				2657	struct tcp_info info;
				2658
				2659	if (get_user(len, optlen))
				2660	//return -EFAULT;
				2661	return ERRNO_TRACK(-EFAULT);
				2662
				2663	tcp_get_info(sk, &info);
				2664
				2665	len = min_t(unsigned int, len, sizeof(info));
				2666	if (put_user(len, optlen))
				2667	//return -EFAULT;
				2668	return ERRNO_TRACK(-EFAULT);
				2669	if (copy_to_user(optval, &info, len))
				2670	//return -EFAULT;
				2671	return ERRNO_TRACK(-EFAULT);
				2672	return 0;
				2673	}
				2674	case TCP_QUICKACK:
				2675	val = !icsk->icsk_ack.pingpong;
				2676	break;
				2677
				2678	case TCP_CONGESTION:
				2679	if (get_user(len, optlen))
				2680	return ERRNO_TRACK(-EFAULT);
				2681	//return -EFAULT;
				2682	len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
				2683	if (put_user(len, optlen))
				2684	//return -EFAULT;
				2685	return ERRNO_TRACK(-EFAULT);
				2686	if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
				2687	//return -EFAULT;
				2688	return ERRNO_TRACK(-EFAULT);
				2689	return 0;
				2690
				2691	case TCP_COOKIE_TRANSACTIONS: {
				2692	struct tcp_cookie_transactions ctd;
				2693	struct tcp_cookie_values *cvp = tp->cookie_values;
				2694
				2695	if (get_user(len, optlen))
				2696	//return -EFAULT;
				2697	return ERRNO_TRACK(-EFAULT);
				2698	if (len < sizeof(ctd))
				2699	//return -EINVAL;
				2700	return ERRNO_TRACK(-EINVAL);
				2701
				2702	memset(&ctd, 0, sizeof(ctd));
				2703	ctd.tcpct_flags = (tp->rx_opt.cookie_in_always ?
				2704	TCP_COOKIE_IN_ALWAYS : 0)
				2705	\| (tp->rx_opt.cookie_out_never ?
				2706	TCP_COOKIE_OUT_NEVER : 0);
				2707
				2708	if (cvp != NULL) {
				2709	ctd.tcpct_flags \|= (cvp->s_data_in ?
				2710	TCP_S_DATA_IN : 0)
				2711	\| (cvp->s_data_out ?
				2712	TCP_S_DATA_OUT : 0);
				2713
				2714	ctd.tcpct_cookie_desired = cvp->cookie_desired;
				2715	ctd.tcpct_s_data_desired = cvp->s_data_desired;
				2716
				2717	memcpy(&ctd.tcpct_value[0], &cvp->cookie_pair[0],
				2718	cvp->cookie_pair_size);
				2719	ctd.tcpct_used = cvp->cookie_pair_size;
				2720	}
				2721
				2722	if (put_user(sizeof(ctd), optlen))
				2723	//return -EFAULT;
				2724	return ERRNO_TRACK(-EFAULT);
				2725	if (copy_to_user(optval, &ctd, sizeof(ctd)))
				2726	//return -EFAULT;
				2727	return ERRNO_TRACK(-EFAULT);
				2728	return 0;
				2729	}
				2730	case TCP_THIN_LINEAR_TIMEOUTS:
				2731	val = tp->thin_lto;
				2732	break;
				2733	case TCP_THIN_DUPACK:
				2734	val = tp->thin_dupack;
				2735	break;
				2736
				2737	case TCP_USER_TIMEOUT:
				2738	val = jiffies_to_msecs(icsk->icsk_user_timeout);
				2739	break;
				2740	default:
				2741	//return -ENOPROTOOPT;
				2742	return ERRNO_TRACK(-ENOPROTOOPT);
				2743	}
				2744
				2745	if (put_user(len, optlen))
				2746	//return -EFAULT;
				2747	return ERRNO_TRACK(-EFAULT);
				2748	if (copy_to_user(optval, &val, len))
				2749	//return -EFAULT;
				2750	return ERRNO_TRACK(-EFAULT);
				2751	return 0;
				2752	}
				2753
				2754	int tcp_getsockopt(struct sock sk, int level, int optname, char __user optval,
				2755	int __user *optlen)
				2756	{
				2757	struct inet_connection_sock *icsk = inet_csk(sk);
				2758	int retval = 0;
				2759	if (level != SOL_TCP){
				2760	retval = icsk->icsk_af_ops->getsockopt(sk, level, optname,
				2761	optval, optlen);
				2762	return ERRNO_TRACK(retval);
				2763	/*
				2764	return icsk->icsk_af_ops->getsockopt(sk, level, optname,
				2765	optval, optlen);*/
				2766	}
				2767	return do_tcp_getsockopt(sk, level, optname, optval, optlen);
				2768	}
				2769	EXPORT_SYMBOL(tcp_getsockopt);
				2770
				2771	#ifdef CONFIG_COMPAT
				2772	int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
				2773	char __user optval, int __user optlen)
				2774	{
				2775	int retval = 0;
				2776	if (level != SOL_TCP){
				2777	retval = inet_csk_compat_getsockopt(sk, level, optname,
				2778	optval, optlen);
				2779	return ERRNO_TRACK(retval);
				2780	/*
				2781	return inet_csk_compat_getsockopt(sk, level, optname,
				2782	optval, optlen);*/
				2783	}
				2784	return do_tcp_getsockopt(sk, level, optname, optval, optlen);
				2785	}
				2786	EXPORT_SYMBOL(compat_tcp_getsockopt);
				2787	#endif
				2788
				2789	struct sk_buff tcp_tso_segment(struct sk_buff skb,
				2790	netdev_features_t features)
				2791	{
				2792	struct sk_buff *segs = ERR_PTR(-EINVAL);
				2793	struct tcphdr *th;
				2794	unsigned thlen;
				2795	unsigned int seq;
				2796	__be32 delta;
				2797	unsigned int oldlen;
				2798	unsigned int mss;
				2799
				2800	if (!pskb_may_pull(skb, sizeof(*th)))
				2801	goto out;
				2802
				2803	th = tcp_hdr(skb);
				2804	thlen = th->doff * 4;
				2805	if (thlen < sizeof(*th))
				2806	goto out;
				2807
				2808	if (!pskb_may_pull(skb, thlen))
				2809	goto out;
				2810
				2811	oldlen = (u16)~skb->len;
				2812	__skb_pull(skb, thlen);
				2813
				2814	mss = skb_shinfo(skb)->gso_size;
				2815	if (unlikely(skb->len <= mss))
				2816	goto out;
				2817
				2818	if (skb_gso_ok(skb, features \| NETIF_F_GSO_ROBUST)) {
				2819	/* Packet is from an untrusted source, reset gso_segs. */
				2820	int type = skb_shinfo(skb)->gso_type;
				2821
				2822	if (unlikely(type &
				2823	~(SKB_GSO_TCPV4 \|
				2824	SKB_GSO_DODGY \|
				2825	SKB_GSO_TCP_ECN \|
				2826	SKB_GSO_TCPV6 \|
				2827	0) \|\|
				2828	!(type & (SKB_GSO_TCPV4 \| SKB_GSO_TCPV6))))
				2829	goto out;
				2830
				2831	skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
				2832
				2833	segs = NULL;
				2834	goto out;
				2835	}
				2836
				2837	segs = skb_segment(skb, features);
				2838	if (IS_ERR(segs))
				2839	goto out;
				2840
				2841	delta = htonl(oldlen + (thlen + mss));
				2842
				2843	skb = segs;
				2844	th = tcp_hdr(skb);
				2845	seq = ntohl(th->seq);
				2846
				2847	do {
				2848	th->fin = th->psh = 0;
				2849
				2850	th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
				2851	(__force u32)delta));
				2852	if (skb->ip_summed != CHECKSUM_PARTIAL)
				2853	th->check =
				2854	csum_fold(csum_partial(skb_transport_header(skb),
				2855	thlen, skb->csum));
				2856
				2857	seq += mss;
				2858	skb = skb->next;
				2859	th = tcp_hdr(skb);
				2860
				2861	th->seq = htonl(seq);
				2862	th->cwr = 0;
				2863	} while (skb->next);
				2864
				2865	delta = htonl(oldlen + (skb->tail - skb->transport_header) +
				2866	skb->data_len);
				2867	th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
				2868	(__force u32)delta));
				2869	if (skb->ip_summed != CHECKSUM_PARTIAL)
				2870	th->check = csum_fold(csum_partial(skb_transport_header(skb),
				2871	thlen, skb->csum));
				2872
				2873	out:
				2874	return segs;
				2875	}
				2876	EXPORT_SYMBOL(tcp_tso_segment);
				2877
				2878	struct sk_buff tcp_gro_receive(struct sk_buff head, struct sk_buff *skb)
				2879	{
				2880	struct sk_buff **pp = NULL;
				2881	struct sk_buff *p;
				2882	struct tcphdr *th;
				2883	struct tcphdr *th2;
				2884	unsigned int len;
				2885	unsigned int thlen;
				2886	__be32 flags;
				2887	unsigned int mss = 1;
				2888	unsigned int hlen;
				2889	unsigned int off;
				2890	int flush = 1;
				2891	int i;
				2892
				2893	off = skb_gro_offset(skb);
				2894	hlen = off + sizeof(*th);
				2895	th = skb_gro_header_fast(skb, off);
				2896	if (skb_gro_header_hard(skb, hlen)) {
				2897	th = skb_gro_header_slow(skb, hlen, off);
				2898	if (unlikely(!th))
				2899	goto out;
				2900	}
				2901
				2902	thlen = th->doff * 4;
				2903	if (thlen < sizeof(*th))
				2904	goto out;
				2905
				2906	hlen = off + thlen;
				2907	if (skb_gro_header_hard(skb, hlen)) {
				2908	th = skb_gro_header_slow(skb, hlen, off);
				2909	if (unlikely(!th))
				2910	goto out;
				2911	}
				2912
				2913	skb_gro_pull(skb, thlen);
				2914
				2915	len = skb_gro_len(skb);
				2916	flags = tcp_flag_word(th);
				2917
				2918	for (; (p = *head); head = &p->next) {
				2919	if (!NAPI_GRO_CB(p)->same_flow)
				2920	continue;
				2921
				2922	th2 = tcp_hdr(p);
				2923
				2924	if ((u32 )&th->source ^ (u32 )&th2->source) {
				2925	NAPI_GRO_CB(p)->same_flow = 0;
				2926	continue;
				2927	}
				2928
				2929	goto found;
				2930	}
				2931
				2932	goto out_check_final;
				2933
				2934	found:
				2935	flush = NAPI_GRO_CB(p)->flush;
				2936	flush \|= (__force int)(flags & TCP_FLAG_CWR);
				2937	flush \|= (__force int)((flags ^ tcp_flag_word(th2)) &
				2938	~(TCP_FLAG_CWR \| TCP_FLAG_FIN \| TCP_FLAG_PSH));
				2939	flush \|= (__force int)(th->ack_seq ^ th2->ack_seq);
				2940	for (i = sizeof(*th); i < thlen; i += 4)
				2941	flush \|= (u32 )((u8 *)th + i) ^
				2942	(u32 )((u8 *)th2 + i);
				2943
				2944	mss = skb_shinfo(p)->gso_size;
				2945
				2946	flush \|= (len - 1) >= mss;
				2947	flush \|= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
				2948
				2949	if (flush \|\| skb_gro_receive(head, skb)) {
				2950	mss = 1;
				2951	goto out_check_final;
				2952	}
				2953
				2954	p = *head;
				2955	th2 = tcp_hdr(p);
				2956	tcp_flag_word(th2) \|= flags & (TCP_FLAG_FIN \| TCP_FLAG_PSH);
				2957
				2958	out_check_final:
				2959	flush = len < mss;
				2960	flush \|= (__force int)(flags & (TCP_FLAG_URG \| TCP_FLAG_PSH \|
				2961	TCP_FLAG_RST \| TCP_FLAG_SYN \|
				2962	TCP_FLAG_FIN));
				2963
				2964	if (p && (!NAPI_GRO_CB(skb)->same_flow \|\| flush))
				2965	pp = head;
				2966
				2967	out:
				2968	NAPI_GRO_CB(skb)->flush \|= flush;
				2969
				2970	return pp;
				2971	}
				2972	EXPORT_SYMBOL(tcp_gro_receive);
				2973
				2974	int tcp_gro_complete(struct sk_buff *skb)
				2975	{
				2976	struct tcphdr *th = tcp_hdr(skb);
				2977
				2978	skb->csum_start = skb_transport_header(skb) - skb->head;
				2979	skb->csum_offset = offsetof(struct tcphdr, check);
				2980	skb->ip_summed = CHECKSUM_PARTIAL;
				2981
				2982	skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
				2983
				2984	if (th->cwr)
				2985	skb_shinfo(skb)->gso_type \|= SKB_GSO_TCP_ECN;
				2986
				2987	return 0;
				2988	}
				2989	EXPORT_SYMBOL(tcp_gro_complete);
				2990
				2991	#ifdef CONFIG_TCP_MD5SIG
				2992	static unsigned long tcp_md5sig_users;
				2993	static struct tcp_md5sig_pool __percpu *tcp_md5sig_pool;
				2994	static DEFINE_SPINLOCK(tcp_md5sig_pool_lock);
				2995
				2996	static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool)
				2997	{
				2998	int cpu;
				2999
				3000	for_each_possible_cpu(cpu) {
				3001	struct tcp_md5sig_pool *p = per_cpu_ptr(pool, cpu);
				3002
				3003	if (p->md5_desc.tfm)
				3004	crypto_free_hash(p->md5_desc.tfm);
				3005	}
				3006	free_percpu(pool);
				3007	}
				3008
				3009	void tcp_free_md5sig_pool(void)
				3010	{
				3011	struct tcp_md5sig_pool __percpu *pool = NULL;
				3012
				3013	spin_lock_bh(&tcp_md5sig_pool_lock);
				3014	if (--tcp_md5sig_users == 0) {
				3015	pool = tcp_md5sig_pool;
				3016	tcp_md5sig_pool = NULL;
				3017	}
				3018	spin_unlock_bh(&tcp_md5sig_pool_lock);
				3019	if (pool)
				3020	__tcp_free_md5sig_pool(pool);
				3021	}
				3022	EXPORT_SYMBOL(tcp_free_md5sig_pool);
				3023
				3024	static struct tcp_md5sig_pool __percpu *
				3025	__tcp_alloc_md5sig_pool(struct sock *sk)
				3026	{
				3027	int cpu;
				3028	struct tcp_md5sig_pool __percpu *pool;
				3029
				3030	pool = alloc_percpu(struct tcp_md5sig_pool);
				3031	if (!pool)
				3032	return NULL;
				3033
				3034	for_each_possible_cpu(cpu) {
				3035	struct crypto_hash *hash;
				3036
				3037	hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
				3038	if (!hash \|\| IS_ERR(hash))
				3039	goto out_free;
				3040
				3041	per_cpu_ptr(pool, cpu)->md5_desc.tfm = hash;
				3042	}
				3043	return pool;
				3044	out_free:
				3045	__tcp_free_md5sig_pool(pool);
				3046	return NULL;
				3047	}
				3048
				3049	struct tcp_md5sig_pool __percpu tcp_alloc_md5sig_pool(struct sock sk)
				3050	{
				3051	struct tcp_md5sig_pool __percpu *pool;
				3052	int alloc = 0;
				3053
				3054	retry:
				3055	spin_lock_bh(&tcp_md5sig_pool_lock);
				3056	pool = tcp_md5sig_pool;
				3057	if (tcp_md5sig_users++ == 0) {
				3058	alloc = 1;
				3059	spin_unlock_bh(&tcp_md5sig_pool_lock);
				3060	} else if (!pool) {
				3061	tcp_md5sig_users--;
				3062	spin_unlock_bh(&tcp_md5sig_pool_lock);
				3063	cpu_relax();
				3064	goto retry;
				3065	} else
				3066	spin_unlock_bh(&tcp_md5sig_pool_lock);
				3067
				3068	if (alloc) {
				3069	/* we cannot hold spinlock here because this may sleep. */
				3070	struct tcp_md5sig_pool __percpu *p;
				3071
				3072	p = __tcp_alloc_md5sig_pool(sk);
				3073	spin_lock_bh(&tcp_md5sig_pool_lock);
				3074	if (!p) {
				3075	tcp_md5sig_users--;
				3076	spin_unlock_bh(&tcp_md5sig_pool_lock);
				3077	return NULL;
				3078	}
				3079	pool = tcp_md5sig_pool;
				3080	if (pool) {
				3081	/* oops, it has already been assigned. */
				3082	spin_unlock_bh(&tcp_md5sig_pool_lock);
				3083	__tcp_free_md5sig_pool(p);
				3084	} else {
				3085	tcp_md5sig_pool = pool = p;
				3086	spin_unlock_bh(&tcp_md5sig_pool_lock);
				3087	}
				3088	}
				3089	return pool;
				3090	}
				3091	EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
				3092
				3093
				3094	/**
				3095	* tcp_get_md5sig_pool - get md5sig_pool for this user
				3096	*
				3097	* We use percpu structure, so if we succeed, we exit with preemption
				3098	* and BH disabled, to make sure another thread or softirq handling
				3099	* wont try to get same context.
				3100	*/
				3101	struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
				3102	{
				3103	struct tcp_md5sig_pool __percpu *p;
				3104
				3105	local_bh_disable();
				3106
				3107	spin_lock(&tcp_md5sig_pool_lock);
				3108	p = tcp_md5sig_pool;
				3109	if (p)
				3110	tcp_md5sig_users++;
				3111	spin_unlock(&tcp_md5sig_pool_lock);
				3112
				3113	if (p)
				3114	return this_cpu_ptr(p);
				3115
				3116	local_bh_enable();
				3117	return NULL;
				3118	}
				3119	EXPORT_SYMBOL(tcp_get_md5sig_pool);
				3120
				3121	void tcp_put_md5sig_pool(void)
				3122	{
				3123	local_bh_enable();
				3124	tcp_free_md5sig_pool();
				3125	}
				3126	EXPORT_SYMBOL(tcp_put_md5sig_pool);
				3127
				3128	int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
				3129	const struct tcphdr *th)
				3130	{
				3131	struct scatterlist sg;
				3132	struct tcphdr hdr;
				3133	int err;
				3134
				3135	/* We are not allowed to change tcphdr, make a local copy */
				3136	memcpy(&hdr, th, sizeof(hdr));
				3137	hdr.check = 0;
				3138
				3139	/* options aren't included in the hash */
				3140	sg_init_one(&sg, &hdr, sizeof(hdr));
				3141	err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(hdr));
				3142	return err;
				3143	}
				3144	EXPORT_SYMBOL(tcp_md5_hash_header);
				3145
				3146	int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
				3147	const struct sk_buff *skb, unsigned int header_len)
				3148	{
				3149	struct scatterlist sg;
				3150	const struct tcphdr *tp = tcp_hdr(skb);
				3151	struct hash_desc *desc = &hp->md5_desc;
				3152	unsigned i;
				3153	const unsigned head_data_len = skb_headlen(skb) > header_len ?
				3154	skb_headlen(skb) - header_len : 0;
				3155	const struct skb_shared_info *shi = skb_shinfo(skb);
				3156	struct sk_buff *frag_iter;
				3157
				3158	sg_init_table(&sg, 1);
				3159
				3160	sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
				3161	if (crypto_hash_update(desc, &sg, head_data_len))
				3162	return 1;
				3163
				3164	for (i = 0; i < shi->nr_frags; ++i) {
				3165	const struct skb_frag_struct *f = &shi->frags[i];
				3166	unsigned int offset = f->page_offset;
				3167	struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
				3168
				3169	sg_set_page(&sg, page, skb_frag_size(f),
				3170	offset_in_page(offset));
				3171	if (crypto_hash_update(desc, &sg, skb_frag_size(f)))
				3172	return 1;
				3173	}
				3174
				3175	skb_walk_frags(skb, frag_iter)
				3176	if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
				3177	return 1;
				3178
				3179	return 0;
				3180	}
				3181	EXPORT_SYMBOL(tcp_md5_hash_skb_data);
				3182
				3183	int tcp_md5_hash_key(struct tcp_md5sig_pool hp, const struct tcp_md5sig_key key)
				3184	{
				3185	struct scatterlist sg;
				3186
				3187	sg_init_one(&sg, key->key, key->keylen);
				3188	return crypto_hash_update(&hp->md5_desc, &sg, key->keylen);
				3189	}
				3190	EXPORT_SYMBOL(tcp_md5_hash_key);
				3191
				3192	#endif
				3193
				3194	/**
				3195	* Each Responder maintains up to two secret values concurrently for
				3196	* efficient secret rollover. Each secret value has 4 states:
				3197	*
				3198	* Generating. (tcp_secret_generating != tcp_secret_primary)
				3199	* Generates new Responder-Cookies, but not yet used for primary
				3200	* verification. This is a short-term state, typically lasting only
				3201	* one round trip time (RTT).
				3202	*
				3203	* Primary. (tcp_secret_generating == tcp_secret_primary)
				3204	* Used both for generation and primary verification.
				3205	*
				3206	* Retiring. (tcp_secret_retiring != tcp_secret_secondary)
				3207	* Used for verification, until the first failure that can be
				3208	* verified by the newer Generating secret. At that time, this
				3209	* cookie's state is changed to Secondary, and the Generating
				3210	* cookie's state is changed to Primary. This is a short-term state,
				3211	* typically lasting only one round trip time (RTT).
				3212	*
				3213	* Secondary. (tcp_secret_retiring == tcp_secret_secondary)
				3214	* Used for secondary verification, after primary verification
				3215	* failures. This state lasts no more than twice the Maximum Segment
				3216	* Lifetime (2MSL). Then, the secret is discarded.
				3217	*/
				3218	struct tcp_cookie_secret {
				3219	/* The secret is divided into two parts. The digest part is the
				3220	* equivalent of previously hashing a secret and saving the state,
				3221	* and serves as an initialization vector (IV). The message part
				3222	* serves as the trailing secret.
				3223	*/
				3224	u32 secrets[COOKIE_WORKSPACE_WORDS];
				3225	unsigned long expires;
				3226	};
				3227
				3228	#define TCP_SECRET_1MSL (HZ * TCP_PAWS_MSL)
				3229	#define TCP_SECRET_2MSL (HZ * TCP_PAWS_MSL * 2)
				3230	#define TCP_SECRET_LIFE (HZ * 600)
				3231
				3232	static struct tcp_cookie_secret tcp_secret_one;
				3233	static struct tcp_cookie_secret tcp_secret_two;
				3234
				3235	/* Essentially a circular list, without dynamic allocation. */
				3236	static struct tcp_cookie_secret *tcp_secret_generating;
				3237	static struct tcp_cookie_secret *tcp_secret_primary;
				3238	static struct tcp_cookie_secret *tcp_secret_retiring;
				3239	static struct tcp_cookie_secret *tcp_secret_secondary;
				3240
				3241	static DEFINE_SPINLOCK(tcp_secret_locker);
				3242
				3243	/* Select a pseudo-random word in the cookie workspace.
				3244	*/
				3245	static inline u32 tcp_cookie_work(const u32 *ws, const int n)
				3246	{
				3247	return ws[COOKIE_DIGEST_WORDS + ((COOKIE_MESSAGE_WORDS-1) & ws[n])];
				3248	}
				3249
				3250	/* Fill bakery[COOKIE_WORKSPACE_WORDS] with generator, updating as needed.
				3251	* Called in softirq context.
				3252	* Returns: 0 for success.
				3253	*/
				3254	int tcp_cookie_generator(u32 *bakery)
				3255	{
				3256	unsigned long jiffy = jiffies;
				3257
				3258	if (unlikely(time_after_eq(jiffy, tcp_secret_generating->expires))) {
				3259	spin_lock_bh(&tcp_secret_locker);
				3260	if (!time_after_eq(jiffy, tcp_secret_generating->expires)) {
				3261	/* refreshed by another */
				3262	memcpy(bakery,
				3263	&tcp_secret_generating->secrets[0],
				3264	COOKIE_WORKSPACE_WORDS);
				3265	} else {
				3266	/* still needs refreshing */
				3267	get_random_bytes(bakery, COOKIE_WORKSPACE_WORDS);
				3268
				3269	/* The first time, paranoia assumes that the
				3270	* randomization function isn't as strong. But,
				3271	* this secret initialization is delayed until
				3272	* the last possible moment (packet arrival).
				3273	* Although that time is observable, it is
				3274	* unpredictably variable. Mash in the most
				3275	* volatile clock bits available, and expire the
				3276	* secret extra quickly.
				3277	*/
				3278	if (unlikely(tcp_secret_primary->expires ==
				3279	tcp_secret_secondary->expires)) {
				3280	struct timespec tv;
				3281
				3282	getnstimeofday(&tv);
				3283	bakery[COOKIE_DIGEST_WORDS+0] ^=
				3284	(u32)tv.tv_nsec;
				3285
				3286	tcp_secret_secondary->expires = jiffy
				3287	+ TCP_SECRET_1MSL
				3288	+ (0x0f & tcp_cookie_work(bakery, 0));
				3289	} else {
				3290	tcp_secret_secondary->expires = jiffy
				3291	+ TCP_SECRET_LIFE
				3292	+ (0xff & tcp_cookie_work(bakery, 1));
				3293	tcp_secret_primary->expires = jiffy
				3294	+ TCP_SECRET_2MSL
				3295	+ (0x1f & tcp_cookie_work(bakery, 2));
				3296	}
				3297	memcpy(&tcp_secret_secondary->secrets[0],
				3298	bakery, COOKIE_WORKSPACE_WORDS);
				3299
				3300	rcu_assign_pointer(tcp_secret_generating,
				3301	tcp_secret_secondary);
				3302	rcu_assign_pointer(tcp_secret_retiring,
				3303	tcp_secret_primary);
				3304	/*
				3305	* Neither call_rcu() nor synchronize_rcu() needed.
				3306	* Retiring data is not freed. It is replaced after
				3307	* further (locked) pointer updates, and a quiet time
				3308	* (minimum 1MSL, maximum LIFE - 2MSL).
				3309	*/
				3310	}
				3311	spin_unlock_bh(&tcp_secret_locker);
				3312	} else {
				3313	rcu_read_lock_bh();
				3314	memcpy(bakery,
				3315	&rcu_dereference(tcp_secret_generating)->secrets[0],
				3316	COOKIE_WORKSPACE_WORDS);
				3317	rcu_read_unlock_bh();
				3318	}
				3319	return 0;
				3320	}
				3321	EXPORT_SYMBOL(tcp_cookie_generator);
				3322
				3323	void tcp_done(struct sock *sk)
				3324	{
				3325	if (sk->sk_state == TCP_SYN_SENT \|\| sk->sk_state == TCP_SYN_RECV)
				3326	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
				3327
				3328	tcp_set_state(sk, TCP_CLOSE);
				3329	tcp_clear_xmit_timers(sk);
				3330
				3331	sk->sk_shutdown = SHUTDOWN_MASK;
				3332
				3333	if (!sock_flag(sk, SOCK_DEAD))
				3334	sk->sk_state_change(sk);
				3335	else
				3336	inet_csk_destroy_sock(sk);
				3337	}
				3338	EXPORT_SYMBOL_GPL(tcp_done);
				3339
				3340	extern struct tcp_congestion_ops tcp_reno;
				3341
				3342	static __initdata unsigned long thash_entries;
				3343	static int __init set_thash_entries(char *str)
				3344	{
				3345	if (!str)
				3346	return 0;
				3347	thash_entries = simple_strtoul(str, &str, 0);
				3348	return 1;
				3349	}
				3350	__setup("thash_entries=", set_thash_entries);
				3351
				3352	void tcp_init_mem(struct net *net)
				3353	{
				3354	unsigned long limit = nr_free_buffer_pages() / 8;
				3355	limit = max(limit, 128UL);
				3356	net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
				3357	net->ipv4.sysctl_tcp_mem[1] = limit;
				3358	net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2;
				3359	}
				3360
				3361	void __init tcp_init(void)
				3362	{
				3363	struct sk_buff *skb = NULL;
				3364	unsigned long limit;
				3365	int max_rshare, max_wshare, cnt;
				3366	unsigned int i;
				3367	unsigned long jiffy = jiffies;
				3368	//hub:CVE-2019-11477
				3369	BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE);
				3370	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
				3371
				3372	percpu_counter_init(&tcp_sockets_allocated, 0);
				3373	percpu_counter_init(&tcp_orphan_count, 0);
				3374	tcp_hashinfo.bind_bucket_cachep =
				3375	kmem_cache_create("tcp_bind_bucket",
				3376	sizeof(struct inet_bind_bucket), 0,
				3377	SLAB_HWCACHE_ALIGN\|SLAB_PANIC, NULL);
				3378
				3379	/* Size and allocate the main established and bind bucket
				3380	* hash tables.
				3381	*
				3382	* The methodology is similar to that of the buffer cache.
				3383	*/
				3384	tcp_hashinfo.ehash =
				3385	alloc_large_system_hash("TCP established",
				3386	sizeof(struct inet_ehash_bucket),
				3387	1024, //thash_entries,
				3388	(totalram_pages >= 128 * 1024) ?
				3389	13 : 15,
				3390	0,
				3391	NULL,
				3392	&tcp_hashinfo.ehash_mask,
				3393	thash_entries ? 0 : 512 * 1024);
				3394	for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) {
				3395	INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
				3396	INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i);
				3397	}
				3398	if (inet_ehash_locks_alloc(&tcp_hashinfo))
				3399	panic("TCP: failed to alloc ehash_locks");
				3400	tcp_hashinfo.bhash =
				3401	alloc_large_system_hash("TCP bind",
				3402	sizeof(struct inet_bind_hashbucket),
				3403	256, //tcp_hashinfo.ehash_mask + 1,
				3404	(totalram_pages >= 128 * 1024) ?
				3405	13 : 15,
				3406	0,
				3407	&tcp_hashinfo.bhash_size,
				3408	NULL,
				3409	64 * 1024);
				3410	tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
				3411	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
				3412	spin_lock_init(&tcp_hashinfo.bhash[i].lock);
				3413	INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
				3414	}
				3415
				3416
				3417	cnt = tcp_hashinfo.ehash_mask + 1;
				3418
				3419	tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
				3420	sysctl_tcp_max_orphans = cnt / 2;
				3421	sysctl_max_syn_backlog = max(128, cnt / 256);
				3422
				3423	tcp_init_mem(&init_net);
				3424	/* Set per-socket limits to no more than 1/128 the pressure threshold */
				3425	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
				3426	max_wshare = min(4UL10241024, limit);
				3427	max_rshare = min(6UL10241024, limit);
				3428
				3429	sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
				3430	sysctl_tcp_wmem[1] = 16*1024;
				3431	sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
				3432
				3433	sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
				3434	sysctl_tcp_rmem[1] = 87380;
				3435	sysctl_tcp_rmem[2] = max(87380, max_rshare);
				3436
				3437	pr_info("Hash tables configured (established %u bind %u)\n",
				3438	tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
				3439
				3440	tcp_register_congestion_control(&tcp_reno);
				3441
				3442	memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets));
				3443	memset(&tcp_secret_two.secrets[0], 0, sizeof(tcp_secret_two.secrets));
				3444	tcp_secret_one.expires = jiffy; /* past due */
				3445	tcp_secret_two.expires = jiffy; /* past due */
				3446	tcp_secret_generating = &tcp_secret_one;
				3447	tcp_secret_primary = &tcp_secret_one;
				3448	tcp_secret_retiring = &tcp_secret_two;
				3449	tcp_secret_secondary = &tcp_secret_two;
				3450	}
				3451
				3452	static int tcp_is_local(struct net *net, __be32 addr) {
				3453	struct rtable *rt;
				3454	struct flowi4 fl4 = { .daddr = addr };
				3455	rt = ip_route_output_key(net, &fl4);
				3456	if (IS_ERR_OR_NULL(rt))
				3457	return 0;
				3458	return rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK);
				3459	}
				3460
				3461	#if defined(CONFIG_IPV6) \|\| defined(CONFIG_IPV6_MODULE)
				3462	static int tcp_is_local6(struct net net, struct in6_addr addr) {
				3463	struct rt6_info *rt6 = rt6_lookup(net, addr, addr, 0, 0);
				3464	return rt6 && rt6->dst.dev && (rt6->dst.dev->flags & IFF_LOOPBACK);
				3465	}
				3466	#endif
				3467
				3468	/*
				3469	* tcp_nuke_addr - destroy all sockets on the given local address
				3470	* if local address is the unspecified address (0.0.0.0 or ::), destroy all
				3471	* sockets with local addresses that are not configured.
				3472	*/
				3473	int tcp_nuke_addr(struct net net, struct sockaddr addr)
				3474	{
				3475	int family = addr->sa_family;
				3476	unsigned int bucket;
				3477
				3478	struct in_addr *in;
				3479	#if defined(CONFIG_IPV6) \|\| defined(CONFIG_IPV6_MODULE)
				3480	struct in6_addr *in6;
				3481	#endif
				3482	if (family == AF_INET) {
				3483	in = &((struct sockaddr_in *)addr)->sin_addr;
				3484	#if defined(CONFIG_IPV6) \|\| defined(CONFIG_IPV6_MODULE)
				3485	} else if (family == AF_INET6) {
				3486	in6 = &((struct sockaddr_in6 *)addr)->sin6_addr;
				3487	#endif
				3488	} else {
				3489	//return -EAFNOSUPPORT;
				3490	return ERRNO_TRACK(-EAFNOSUPPORT);
				3491	}
				3492
				3493	for (bucket = 0; bucket < tcp_hashinfo.ehash_mask; bucket++) {
				3494	struct hlist_nulls_node *node;
				3495	struct sock *sk;
				3496	spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, bucket);
				3497
				3498	restart:
				3499	spin_lock_bh(lock);
				3500	sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[bucket].chain) {
				3501	struct inet_sock *inet = inet_sk(sk);
				3502
				3503	if (sysctl_ip_dynaddr && sk->sk_state == TCP_SYN_SENT)
				3504	continue;
				3505	if (sock_flag(sk, SOCK_DEAD))
				3506	continue;
				3507
				3508	if (family == AF_INET) {
				3509	__be32 s4 = inet->inet_rcv_saddr;
				3510	if (s4 == LOOPBACK4_IPV6)
				3511	continue;
				3512
				3513	if (in->s_addr != s4 &&
				3514	!(in->s_addr == INADDR_ANY &&
				3515	!tcp_is_local(net, s4)))
				3516	continue;
				3517	}
				3518
				3519	#if defined(CONFIG_IPV6) \|\| defined(CONFIG_IPV6_MODULE)
				3520	if (family == AF_INET6) {
				3521	struct in6_addr *s6;
				3522	if (!inet->pinet6)
				3523	continue;
				3524
				3525	s6 = &inet->pinet6->rcv_saddr;
				3526	if (ipv6_addr_type(s6) == IPV6_ADDR_MAPPED)
				3527	continue;
				3528
				3529	if (!ipv6_addr_equal(in6, s6) &&
				3530	!(ipv6_addr_equal(in6, &in6addr_any) &&
				3531	!tcp_is_local6(net, s6)))
				3532	continue;
				3533	}
				3534	#endif
				3535
				3536	sock_hold(sk);
				3537	spin_unlock_bh(lock);
				3538
				3539	local_bh_disable();
				3540	bh_lock_sock(sk);
				3541	sk->sk_err = ETIMEDOUT;
				3542	sk->sk_error_report(sk);
				3543
				3544	tcp_done(sk);
				3545	bh_unlock_sock(sk);
				3546	local_bh_enable();
				3547	sock_put(sk);
				3548
				3549	goto restart;
				3550	}
				3551	spin_unlock_bh(lock);
				3552	}
				3553
				3554	return 0;
				3555	}