// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Generic socket support routines. Memory allocators, socket lock/release
8 * handler for protocols to use and generic option handler.
9 *
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
35 * code. The ACK stuff can wait and needs major
36 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 */
85
86#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87
88#include <asm/unaligned.h>
89#include <linux/capability.h>
90#include <linux/errno.h>
91#include <linux/errqueue.h>
92#include <linux/types.h>
93#include <linux/socket.h>
94#include <linux/in.h>
95#include <linux/kernel.h>
96#include <linux/module.h>
97#include <linux/proc_fs.h>
98#include <linux/seq_file.h>
99#include <linux/sched.h>
100#include <linux/sched/mm.h>
101#include <linux/timer.h>
102#include <linux/string.h>
103#include <linux/sockios.h>
104#include <linux/net.h>
105#include <linux/mm.h>
106#include <linux/slab.h>
107#include <linux/interrupt.h>
108#include <linux/poll.h>
109#include <linux/tcp.h>
110#include <linux/init.h>
111#include <linux/highmem.h>
112#include <linux/user_namespace.h>
113#include <linux/static_key.h>
114#include <linux/memcontrol.h>
115#include <linux/prefetch.h>
116
117#include <linux/uaccess.h>
118
119#include <linux/netdevice.h>
120#include <net/protocol.h>
121#include <linux/skbuff.h>
122#include <net/net_namespace.h>
123#include <net/request_sock.h>
124#include <net/sock.h>
125#include <linux/net_tstamp.h>
126#include <net/xfrm.h>
127#include <linux/ipsec.h>
128#include <net/cls_cgroup.h>
129#include <net/netprio_cgroup.h>
130#include <linux/sock_diag.h>
131
132#include <linux/filter.h>
133#include <net/sock_reuseport.h>
134#include <net/bpf_sk_storage.h>
135
136#include <trace/events/sock.h>
137#include <trace/hooks/net.h>
138
139#include <net/tcp.h>
140#include <net/busy_poll.h>
141
142static DEFINE_MUTEX(proto_list_mutex);
143static LIST_HEAD(proto_list);
144static atomic64_t cookie_gen;
145
146static void sock_inuse_add(struct net *net, int val);
147
148/**
149 * sk_ns_capable - General socket capability test
150 * @sk: Socket to use a capability on or through
151 * @user_ns: The user namespace of the capability to use
152 * @cap: The capability to use
153 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and the current process has the capability
 * @cap in the user namespace @user_ns.
157 */
158bool sk_ns_capable(const struct sock *sk,
159 struct user_namespace *user_ns, int cap)
160{
161 return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
162 ns_capable(user_ns, cap);
163}
164EXPORT_SYMBOL(sk_ns_capable);
165
166/**
167 * sk_capable - Socket global capability test
168 * @sk: Socket to use a capability on or through
169 * @cap: The global capability to use
170 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and the current process has the capability
 * @cap in all user namespaces.
174 */
175bool sk_capable(const struct sock *sk, int cap)
176{
177 return sk_ns_capable(sk, &init_user_ns, cap);
178}
179EXPORT_SYMBOL(sk_capable);
180
181/**
182 * sk_net_capable - Network namespace socket capability test
183 * @sk: Socket to use a capability on or through
184 * @cap: The capability to use
185 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and the current process has the capability
 * @cap over the network namespace the socket is a member of.
189 */
190bool sk_net_capable(const struct sock *sk, int cap)
191{
192 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
193}
194EXPORT_SYMBOL(sk_net_capable);
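
/*
 * Illustrative use of the helpers above (a sketch, not taken from this
 * file): protocol code typically gates privileged operations with e.g.
 *
 *	if (!sk_capable(sk, CAP_NET_RAW))
 *		return -EPERM;
 *
 * which honours both the capability the socket's opener held at creation
 * time and the credentials of the process currently making the call.
 */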
195
196/*
197 * Each address family might have different locking rules, so we have
198 * one slock key per address family and separate keys for internal and
199 * userspace sockets.
200 */
201static struct lock_class_key af_family_keys[AF_MAX];
202static struct lock_class_key af_family_kern_keys[AF_MAX];
203static struct lock_class_key af_family_slock_keys[AF_MAX];
204static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
205
206/*
207 * Make lock validator output more readable. (we pre-construct these
208 * strings build-time, so that runtime initialization of socket
209 * locks is fast):
210 */
211
212#define _sock_locks(x) \
213 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \
214 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \
215 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \
216 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \
217 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \
218 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \
219 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \
220 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \
221 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \
222 x "27" , x "28" , x "AF_CAN" , \
223 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \
224 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
225 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
226 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
227 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
228 x "AF_MAX"
229
230static const char *const af_family_key_strings[AF_MAX+1] = {
231 _sock_locks("sk_lock-")
232};
233static const char *const af_family_slock_key_strings[AF_MAX+1] = {
234 _sock_locks("slock-")
235};
236static const char *const af_family_clock_key_strings[AF_MAX+1] = {
237 _sock_locks("clock-")
238};
239
240static const char *const af_family_kern_key_strings[AF_MAX+1] = {
241 _sock_locks("k-sk_lock-")
242};
243static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
244 _sock_locks("k-slock-")
245};
246static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
247 _sock_locks("k-clock-")
248};
249static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
250 _sock_locks("rlock-")
251};
252static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
253 _sock_locks("wlock-")
254};
255static const char *const af_family_elock_key_strings[AF_MAX+1] = {
256 _sock_locks("elock-")
257};
258
259/*
260 * sk_callback_lock and sk queues locking rules are per-address-family,
261 * so split the lock classes by using a per-AF key:
262 */
263static struct lock_class_key af_callback_keys[AF_MAX];
264static struct lock_class_key af_rlock_keys[AF_MAX];
265static struct lock_class_key af_wlock_keys[AF_MAX];
266static struct lock_class_key af_elock_keys[AF_MAX];
267static struct lock_class_key af_kern_callback_keys[AF_MAX];
268
269/* Run time adjustable parameters. */
270__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
271EXPORT_SYMBOL(sysctl_wmem_max);
272__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
273EXPORT_SYMBOL(sysctl_rmem_max);
274__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
275__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
276
277/* Maximal space eaten by iovec or ancillary data plus some space */
278int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
279EXPORT_SYMBOL(sysctl_optmem_max);
280
281int sysctl_tstamp_allow_data __read_mostly = 1;
282
283DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
284EXPORT_SYMBOL_GPL(memalloc_socks_key);
285
286/**
287 * sk_set_memalloc - sets %SOCK_MEMALLOC
288 * @sk: socket to set it on
289 *
290 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
291 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements.
293 */
294void sk_set_memalloc(struct sock *sk)
295{
296 sock_set_flag(sk, SOCK_MEMALLOC);
297 sk->sk_allocation |= __GFP_MEMALLOC;
298 static_branch_inc(&memalloc_socks_key);
299}
300EXPORT_SYMBOL_GPL(sk_set_memalloc);
301
302void sk_clear_memalloc(struct sock *sk)
303{
304 sock_reset_flag(sk, SOCK_MEMALLOC);
305 sk->sk_allocation &= ~__GFP_MEMALLOC;
306 static_branch_dec(&memalloc_socks_key);
307
308 /*
309 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
310 * progress of swapping. SOCK_MEMALLOC may be cleared while
311 * it has rmem allocations due to the last swapfile being deactivated
312 * but there is a risk that the socket is unusable due to exceeding
313 * the rmem limits. Reclaim the reserves and obey rmem limits again.
314 */
315 sk_mem_reclaim(sk);
316}
317EXPORT_SYMBOL_GPL(sk_clear_memalloc);
318
319int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
320{
321 int ret;
322 unsigned int noreclaim_flag;
323
324 /* these should have been dropped before queueing */
325 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
326
327 noreclaim_flag = memalloc_noreclaim_save();
328 ret = sk->sk_backlog_rcv(sk, skb);
329 memalloc_noreclaim_restore(noreclaim_flag);
330
331 return ret;
332}
333EXPORT_SYMBOL(__sk_backlog_rcv);
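
/*
 * Sketch of the intended SOCK_MEMALLOC lifecycle (illustrative only, e.g.
 * for a swap-over-network transport such as NBD or swap-over-NFS):
 *
 *	sk_set_memalloc(sk);	-> socket may dip into emergency reserves
 *	... swapfile active, socket services writeout under memory pressure ...
 *	sk_clear_memalloc(sk);	-> reserves reclaimed, rmem limits enforced again
 *
 * While the flag is set, __sk_backlog_rcv() above runs the backlog with
 * reclaim disabled so that receive processing cannot recurse into reclaim.
 */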
334
335static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
336{
337 struct __kernel_sock_timeval tv;
338 int size;
339
340 if (timeo == MAX_SCHEDULE_TIMEOUT) {
341 tv.tv_sec = 0;
342 tv.tv_usec = 0;
343 } else {
344 tv.tv_sec = timeo / HZ;
345 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
346 }
347
348 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
349 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
350 *(struct old_timeval32 *)optval = tv32;
351 return sizeof(tv32);
352 }
353
354 if (old_timeval) {
355 struct __kernel_old_timeval old_tv;
356 old_tv.tv_sec = tv.tv_sec;
357 old_tv.tv_usec = tv.tv_usec;
358 *(struct __kernel_old_timeval *)optval = old_tv;
359 size = sizeof(old_tv);
360 } else {
361 *(struct __kernel_sock_timeval *)optval = tv;
362 size = sizeof(tv);
363 }
364
365 return size;
366}
367
368static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool old_timeval)
369{
370 struct __kernel_sock_timeval tv;
371
372 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
373 struct old_timeval32 tv32;
374
375 if (optlen < sizeof(tv32))
376 return -EINVAL;
377
378 if (copy_from_user(&tv32, optval, sizeof(tv32)))
379 return -EFAULT;
380 tv.tv_sec = tv32.tv_sec;
381 tv.tv_usec = tv32.tv_usec;
382 } else if (old_timeval) {
383 struct __kernel_old_timeval old_tv;
384
385 if (optlen < sizeof(old_tv))
386 return -EINVAL;
387 if (copy_from_user(&old_tv, optval, sizeof(old_tv)))
388 return -EFAULT;
389 tv.tv_sec = old_tv.tv_sec;
390 tv.tv_usec = old_tv.tv_usec;
391 } else {
392 if (optlen < sizeof(tv))
393 return -EINVAL;
394 if (copy_from_user(&tv, optval, sizeof(tv)))
395 return -EFAULT;
396 }
397 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
398 return -EDOM;
399
400 if (tv.tv_sec < 0) {
401 static int warned __read_mostly;
402
403 *timeo_p = 0;
404 if (warned < 10 && net_ratelimit()) {
405 warned++;
406 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
407 __func__, current->comm, task_pid_nr(current));
408 }
409 return 0;
410 }
411 *timeo_p = MAX_SCHEDULE_TIMEOUT;
412 if (tv.tv_sec == 0 && tv.tv_usec == 0)
413 return 0;
414 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
415 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
416 return 0;
417}
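
/*
 * Worked example (assuming HZ == 100): a userspace timeval of
 * { .tv_sec = 1, .tv_usec = 500000 } is stored by sock_set_timeout() as
 * 1 * HZ + DIV_ROUND_UP(500000, USEC_PER_SEC / HZ) = 150 jiffies, and
 * sock_get_timeout() converts those 150 jiffies back to 1s 500000us.
 * A timeval of { 0, 0 } means "no timeout" and maps to
 * MAX_SCHEDULE_TIMEOUT in both directions.
 */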
418
419static void sock_warn_obsolete_bsdism(const char *name)
420{
421 static int warned;
422 static char warncomm[TASK_COMM_LEN];
423 if (strcmp(warncomm, current->comm) && warned < 5) {
424 strcpy(warncomm, current->comm);
425 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
426 warncomm, name);
427 warned++;
428 }
429}
430
431static bool sock_needs_netstamp(const struct sock *sk)
432{
433 switch (sk->sk_family) {
434 case AF_UNSPEC:
435 case AF_UNIX:
436 return false;
437 default:
438 return true;
439 }
440}
441
442static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
443{
444 if (sk->sk_flags & flags) {
445 sk->sk_flags &= ~flags;
446 if (sock_needs_netstamp(sk) &&
447 !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
448 net_disable_timestamp();
449 }
450}
451
452
453int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
454{
455 unsigned long flags;
456 struct sk_buff_head *list = &sk->sk_receive_queue;
457
458 if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
459 atomic_inc(&sk->sk_drops);
460 trace_sock_rcvqueue_full(sk, skb);
461 return -ENOMEM;
462 }
463
464 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
465 atomic_inc(&sk->sk_drops);
466 return -ENOBUFS;
467 }
468
469 skb->dev = NULL;
470 skb_set_owner_r(skb, sk);
471
	/* We escape from the RCU-protected region here, so make sure we
	 * don't leak a non-refcounted dst.
	 */
475 skb_dst_force(skb);
476
477 spin_lock_irqsave(&list->lock, flags);
478 sock_skb_set_dropcount(sk, skb);
479 __skb_queue_tail(list, skb);
480 spin_unlock_irqrestore(&list->lock, flags);
481
482 if (!sock_flag(sk, SOCK_DEAD))
483 sk->sk_data_ready(sk);
484 return 0;
485}
486EXPORT_SYMBOL(__sock_queue_rcv_skb);
487
488int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
489{
490 int err;
491
492 err = sk_filter(sk, skb);
493 if (err)
494 return err;
495
496 return __sock_queue_rcv_skb(sk, skb);
497}
498EXPORT_SYMBOL(sock_queue_rcv_skb);
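
/*
 * In short (descriptive note, not from the original source): the two
 * helpers above implement the generic receive path - sk_filter() runs the
 * attached socket filter, the skb is charged to sk_rmem_alloc via
 * skb_set_owner_r(), drop-count bookkeeping is done under the queue lock,
 * and sk_data_ready() wakes any reader unless the socket is already dead.
 */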
499
500int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
501 const int nested, unsigned int trim_cap, bool refcounted)
502{
503 int rc = NET_RX_SUCCESS;
504
505 if (sk_filter_trim_cap(sk, skb, trim_cap))
506 goto discard_and_relse;
507
508 skb->dev = NULL;
509
510 if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
511 atomic_inc(&sk->sk_drops);
512 goto discard_and_relse;
513 }
514 if (nested)
515 bh_lock_sock_nested(sk);
516 else
517 bh_lock_sock(sk);
518 if (!sock_owned_by_user(sk)) {
519 /*
520 * trylock + unlock semantics:
521 */
522 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
523
524 rc = sk_backlog_rcv(sk, skb);
525
526 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
527 } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
528 bh_unlock_sock(sk);
529 atomic_inc(&sk->sk_drops);
530 goto discard_and_relse;
531 }
532
533 bh_unlock_sock(sk);
534out:
535 if (refcounted)
536 sock_put(sk);
537 return rc;
538discard_and_relse:
539 kfree_skb(skb);
540 goto out;
541}
542EXPORT_SYMBOL(__sk_receive_skb);
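
/*
 * Note (added for clarity): __sk_receive_skb() has two delivery paths.
 * If the socket is not owned by a user context, it calls sk_backlog_rcv()
 * directly under the bh lock, using trylock/unlock lockdep annotations;
 * otherwise the skb is appended to the socket backlog, bounded by
 * sk_rcvbuf, and processed later from release_sock().
 */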
543
544u64 sock_gen_cookie(struct sock *sk)
545{
546 while (1) {
547 u64 res = atomic64_read(&sk->sk_cookie);
548
549 if (res)
550 return res;
551 res = atomic64_inc_return(&cookie_gen);
552 atomic64_cmpxchg(&sk->sk_cookie, 0, res);
553 }
554}
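
/*
 * Note on the loop above (added for clarity): several CPUs may race to
 * assign a cookie, but only the first atomic64_cmpxchg() from 0 wins;
 * the losers simply re-read sk->sk_cookie on the next iteration and
 * return the winning value, so a socket's cookie never changes once set.
 */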
555
556struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
557{
558 struct dst_entry *dst = __sk_dst_get(sk);
559
560 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
561 sk_tx_queue_clear(sk);
562 WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
563 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
564 dst_release(dst);
565 return NULL;
566 }
567
568 return dst;
569}
570EXPORT_SYMBOL(__sk_dst_check);
571
572struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
573{
574 struct dst_entry *dst = sk_dst_get(sk);
575
576 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
577 sk_dst_reset(sk);
578 dst_release(dst);
579 return NULL;
580 }
581
582 return dst;
583}
584EXPORT_SYMBOL(sk_dst_check);
585
586static int sock_setbindtodevice_locked(struct sock *sk, int ifindex)
587{
588 int ret = -ENOPROTOOPT;
589#ifdef CONFIG_NETDEVICES
590 struct net *net = sock_net(sk);
591
592 /* Sorry... */
593 ret = -EPERM;
594 if (!ns_capable(net->user_ns, CAP_NET_RAW))
595 goto out;
596
597 ret = -EINVAL;
598 if (ifindex < 0)
599 goto out;
600
601 sk->sk_bound_dev_if = ifindex;
602 if (sk->sk_prot->rehash)
603 sk->sk_prot->rehash(sk);
604 sk_dst_reset(sk);
605
606 ret = 0;
607
608out:
609#endif
610
611 return ret;
612}
613
614static int sock_setbindtodevice(struct sock *sk, char __user *optval,
615 int optlen)
616{
617 int ret = -ENOPROTOOPT;
618#ifdef CONFIG_NETDEVICES
619 struct net *net = sock_net(sk);
620 char devname[IFNAMSIZ];
621 int index;
622
623 ret = -EINVAL;
624 if (optlen < 0)
625 goto out;
626
627 /* Bind this socket to a particular device like "eth0",
628 * as specified in the passed interface name. If the
629 * name is "" or the option length is zero the socket
630 * is not bound.
631 */
632 if (optlen > IFNAMSIZ - 1)
633 optlen = IFNAMSIZ - 1;
634 memset(devname, 0, sizeof(devname));
635
636 ret = -EFAULT;
637 if (copy_from_user(devname, optval, optlen))
638 goto out;
639
640 index = 0;
641 if (devname[0] != '\0') {
642 struct net_device *dev;
643
644 rcu_read_lock();
645 dev = dev_get_by_name_rcu(net, devname);
646 if (dev)
647 index = dev->ifindex;
648 rcu_read_unlock();
649 ret = -ENODEV;
650 if (!dev)
651 goto out;
652 }
653
654 lock_sock(sk);
655 ret = sock_setbindtodevice_locked(sk, index);
656 release_sock(sk);
657
658out:
659#endif
660
661 return ret;
662}
663
664static int sock_getbindtodevice(struct sock *sk, char __user *optval,
665 int __user *optlen, int len)
666{
667 int ret = -ENOPROTOOPT;
668#ifdef CONFIG_NETDEVICES
669 struct net *net = sock_net(sk);
670 char devname[IFNAMSIZ];
671
672 if (sk->sk_bound_dev_if == 0) {
673 len = 0;
674 goto zero;
675 }
676
677 ret = -EINVAL;
678 if (len < IFNAMSIZ)
679 goto out;
680
681 ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
682 if (ret)
683 goto out;
684
685 len = strlen(devname) + 1;
686
687 ret = -EFAULT;
688 if (copy_to_user(optval, devname, len))
689 goto out;
690
691zero:
692 ret = -EFAULT;
693 if (put_user(len, optlen))
694 goto out;
695
696 ret = 0;
697
698out:
699#endif
700
701 return ret;
702}
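
/*
 * Illustrative userspace usage (assumes the standard setsockopt() API):
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0"));
 *
 * binds the socket to eth0 (CAP_NET_RAW required), while an empty name or
 * a zero option length removes the binding again.
 */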
703
704static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
705{
706 if (valbool)
707 sock_set_flag(sk, bit);
708 else
709 sock_reset_flag(sk, bit);
710}
711
712bool sk_mc_loop(struct sock *sk)
713{
714 if (dev_recursion_level())
715 return false;
716 if (!sk)
717 return true;
718 /* IPV6_ADDRFORM can change sk->sk_family under us. */
719 switch (READ_ONCE(sk->sk_family)) {
720 case AF_INET:
721 return inet_sk(sk)->mc_loop;
722#if IS_ENABLED(CONFIG_IPV6)
723 case AF_INET6:
724 return inet6_sk(sk)->mc_loop;
725#endif
726 }
727 WARN_ON_ONCE(1);
728 return true;
729}
730EXPORT_SYMBOL(sk_mc_loop);
731
732/*
733 * This is meant for all protocols to use and covers goings on
734 * at the socket level. Everything here is generic.
735 */
736
737int sock_setsockopt(struct socket *sock, int level, int optname,
738 char __user *optval, unsigned int optlen)
739{
740 struct sock_txtime sk_txtime;
741 struct sock *sk = sock->sk;
742 int val;
743 int valbool;
744 struct linger ling;
745 int ret = 0;
746
747 /*
748 * Options without arguments
749 */
750
751 if (optname == SO_BINDTODEVICE)
752 return sock_setbindtodevice(sk, optval, optlen);
753
754 if (optlen < sizeof(int))
755 return -EINVAL;
756
757 if (get_user(val, (int __user *)optval))
758 return -EFAULT;
759
760 valbool = val ? 1 : 0;
761
762 lock_sock(sk);
763
764 switch (optname) {
765 case SO_DEBUG:
766 if (val && !capable(CAP_NET_ADMIN))
767 ret = -EACCES;
768 else
769 sock_valbool_flag(sk, SOCK_DBG, valbool);
770 break;
771 case SO_REUSEADDR:
772 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
773 break;
774 case SO_REUSEPORT:
775 sk->sk_reuseport = valbool;
776 break;
777 case SO_TYPE:
778 case SO_PROTOCOL:
779 case SO_DOMAIN:
780 case SO_ERROR:
781 ret = -ENOPROTOOPT;
782 break;
783 case SO_DONTROUTE:
784 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
785 sk_dst_reset(sk);
786 break;
787 case SO_BROADCAST:
788 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
789 break;
790 case SO_SNDBUF:
		/* Don't error on this; BSD doesn't, and if you think
		 * about it, this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints.
		 */
796 val = min_t(u32, val, sysctl_wmem_max);
797set_sndbuf:
798 /* Ensure val * 2 fits into an int, to prevent max_t()
799 * from treating it as a negative value.
800 */
801 val = min_t(int, val, INT_MAX / 2);
802 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
803 WRITE_ONCE(sk->sk_sndbuf,
804 max_t(int, val * 2, SOCK_MIN_SNDBUF));
805 /* Wake up sending tasks if we upped the value. */
806 sk->sk_write_space(sk);
807 break;
808
809 case SO_SNDBUFFORCE:
810 if (!capable(CAP_NET_ADMIN)) {
811 ret = -EPERM;
812 break;
813 }
814
815 /* No negative values (to prevent underflow, as val will be
816 * multiplied by 2).
817 */
818 if (val < 0)
819 val = 0;
820 goto set_sndbuf;
821
822 case SO_RCVBUF:
		/* Don't error on this; BSD doesn't, and if you think
		 * about it, this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints.
		 */
828 val = min_t(u32, val, sysctl_rmem_max);
829set_rcvbuf:
830 /* Ensure val * 2 fits into an int, to prevent max_t()
831 * from treating it as a negative value.
832 */
833 val = min_t(int, val, INT_MAX / 2);
834 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
835 /*
836 * We double it on the way in to account for
837 * "struct sk_buff" etc. overhead. Applications
838 * assume that the SO_RCVBUF setting they make will
839 * allow that much actual data to be received on that
840 * socket.
841 *
842 * Applications are unaware that "struct sk_buff" and
843 * other overheads allocate from the receive buffer
844 * during socket buffer allocation.
845 *
846 * And after considering the possible alternatives,
847 * returning the value we actually used in getsockopt
848 * is the most desirable behavior.
849 */
850 WRITE_ONCE(sk->sk_rcvbuf,
851 max_t(int, val * 2, SOCK_MIN_RCVBUF));
852 break;
853
854 case SO_RCVBUFFORCE:
855 if (!capable(CAP_NET_ADMIN)) {
856 ret = -EPERM;
857 break;
858 }
859
860 /* No negative values (to prevent underflow, as val will be
861 * multiplied by 2).
862 */
863 if (val < 0)
864 val = 0;
865 goto set_rcvbuf;
866
867 case SO_KEEPALIVE:
868 if (sk->sk_prot->keepalive)
869 sk->sk_prot->keepalive(sk, valbool);
870 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
871 break;
872
873 case SO_OOBINLINE:
874 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
875 break;
876
877 case SO_NO_CHECK:
878 sk->sk_no_check_tx = valbool;
879 break;
880
881 case SO_PRIORITY:
882 if ((val >= 0 && val <= 6) ||
883 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
884 sk->sk_priority = val;
885 else
886 ret = -EPERM;
887 break;
888
889 case SO_LINGER:
890 if (optlen < sizeof(ling)) {
891 ret = -EINVAL; /* 1003.1g */
892 break;
893 }
894 if (copy_from_user(&ling, optval, sizeof(ling))) {
895 ret = -EFAULT;
896 break;
897 }
898 if (!ling.l_onoff)
899 sock_reset_flag(sk, SOCK_LINGER);
900 else {
901#if (BITS_PER_LONG == 32)
902 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
903 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
904 else
905#endif
906 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
907 sock_set_flag(sk, SOCK_LINGER);
908 }
909 break;
910
911 case SO_BSDCOMPAT:
912 sock_warn_obsolete_bsdism("setsockopt");
913 break;
914
915 case SO_PASSCRED:
916 if (valbool)
917 set_bit(SOCK_PASSCRED, &sock->flags);
918 else
919 clear_bit(SOCK_PASSCRED, &sock->flags);
920 break;
921
922 case SO_TIMESTAMP_OLD:
923 case SO_TIMESTAMP_NEW:
924 case SO_TIMESTAMPNS_OLD:
925 case SO_TIMESTAMPNS_NEW:
926 if (valbool) {
927 if (optname == SO_TIMESTAMP_NEW || optname == SO_TIMESTAMPNS_NEW)
928 sock_set_flag(sk, SOCK_TSTAMP_NEW);
929 else
930 sock_reset_flag(sk, SOCK_TSTAMP_NEW);
931
932 if (optname == SO_TIMESTAMP_OLD || optname == SO_TIMESTAMP_NEW)
933 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
934 else
935 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
936 sock_set_flag(sk, SOCK_RCVTSTAMP);
937 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
938 } else {
939 sock_reset_flag(sk, SOCK_RCVTSTAMP);
940 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
941 }
942 break;
943
944 case SO_TIMESTAMPING_NEW:
945 case SO_TIMESTAMPING_OLD:
946 if (val & ~SOF_TIMESTAMPING_MASK) {
947 ret = -EINVAL;
948 break;
949 }
950
951 if (val & SOF_TIMESTAMPING_OPT_ID &&
952 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
953 if (sk->sk_protocol == IPPROTO_TCP &&
954 sk->sk_type == SOCK_STREAM) {
955 if ((1 << sk->sk_state) &
956 (TCPF_CLOSE | TCPF_LISTEN)) {
957 ret = -EINVAL;
958 break;
959 }
960 sk->sk_tskey = tcp_sk(sk)->snd_una;
961 } else {
962 sk->sk_tskey = 0;
963 }
964 }
965
966 if (val & SOF_TIMESTAMPING_OPT_STATS &&
967 !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
968 ret = -EINVAL;
969 break;
970 }
971
972 sk->sk_tsflags = val;
973 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
974
975 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
976 sock_enable_timestamp(sk,
977 SOCK_TIMESTAMPING_RX_SOFTWARE);
978 else
979 sock_disable_timestamp(sk,
980 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
981 break;
982
983 case SO_RCVLOWAT:
984 if (val < 0)
985 val = INT_MAX;
986 if (sock->ops->set_rcvlowat)
987 ret = sock->ops->set_rcvlowat(sk, val);
988 else
989 WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
990 break;
991
992 case SO_RCVTIMEO_OLD:
993 case SO_RCVTIMEO_NEW:
994 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen, optname == SO_RCVTIMEO_OLD);
995 break;
996
997 case SO_SNDTIMEO_OLD:
998 case SO_SNDTIMEO_NEW:
999 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen, optname == SO_SNDTIMEO_OLD);
1000 break;
1001
1002 case SO_ATTACH_FILTER:
1003 ret = -EINVAL;
1004 if (optlen == sizeof(struct sock_fprog)) {
1005 struct sock_fprog fprog;
1006
1007 ret = -EFAULT;
1008 if (copy_from_user(&fprog, optval, sizeof(fprog)))
1009 break;
1010
1011 ret = sk_attach_filter(&fprog, sk);
1012 }
1013 break;
1014
1015 case SO_ATTACH_BPF:
1016 ret = -EINVAL;
1017 if (optlen == sizeof(u32)) {
1018 u32 ufd;
1019
1020 ret = -EFAULT;
1021 if (copy_from_user(&ufd, optval, sizeof(ufd)))
1022 break;
1023
1024 ret = sk_attach_bpf(ufd, sk);
1025 }
1026 break;
1027
1028 case SO_ATTACH_REUSEPORT_CBPF:
1029 ret = -EINVAL;
1030 if (optlen == sizeof(struct sock_fprog)) {
1031 struct sock_fprog fprog;
1032
1033 ret = -EFAULT;
1034 if (copy_from_user(&fprog, optval, sizeof(fprog)))
1035 break;
1036
1037 ret = sk_reuseport_attach_filter(&fprog, sk);
1038 }
1039 break;
1040
1041 case SO_ATTACH_REUSEPORT_EBPF:
1042 ret = -EINVAL;
1043 if (optlen == sizeof(u32)) {
1044 u32 ufd;
1045
1046 ret = -EFAULT;
1047 if (copy_from_user(&ufd, optval, sizeof(ufd)))
1048 break;
1049
1050 ret = sk_reuseport_attach_bpf(ufd, sk);
1051 }
1052 break;
1053
1054 case SO_DETACH_REUSEPORT_BPF:
1055 ret = reuseport_detach_prog(sk);
1056 break;
1057
1058 case SO_DETACH_FILTER:
1059 ret = sk_detach_filter(sk);
1060 break;
1061
1062 case SO_LOCK_FILTER:
1063 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1064 ret = -EPERM;
1065 else
1066 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1067 break;
1068
1069 case SO_PASSSEC:
1070 if (valbool)
1071 set_bit(SOCK_PASSSEC, &sock->flags);
1072 else
1073 clear_bit(SOCK_PASSSEC, &sock->flags);
1074 break;
1075 case SO_MARK:
1076 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1077 ret = -EPERM;
1078 } else if (val != sk->sk_mark) {
1079 sk->sk_mark = val;
1080 sk_dst_reset(sk);
1081 }
1082 break;
1083
1084 case SO_RXQ_OVFL:
1085 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1086 break;
1087
1088 case SO_WIFI_STATUS:
1089 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1090 break;
1091
1092 case SO_PEEK_OFF:
1093 if (sock->ops->set_peek_off)
1094 ret = sock->ops->set_peek_off(sk, val);
1095 else
1096 ret = -EOPNOTSUPP;
1097 break;
1098
1099 case SO_NOFCS:
1100 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1101 break;
1102
1103 case SO_SELECT_ERR_QUEUE:
1104 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1105 break;
1106
1107#ifdef CONFIG_NET_RX_BUSY_POLL
1108 case SO_BUSY_POLL:
1109 /* allow unprivileged users to decrease the value */
1110 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1111 ret = -EPERM;
1112 else {
1113 if (val < 0)
1114 ret = -EINVAL;
1115 else
1116 WRITE_ONCE(sk->sk_ll_usec, val);
1117 }
1118 break;
1119#endif
1120
1121 case SO_MAX_PACING_RATE:
1122 {
1123 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1124
1125 if (sizeof(ulval) != sizeof(val) &&
1126 optlen >= sizeof(ulval) &&
1127 get_user(ulval, (unsigned long __user *)optval)) {
1128 ret = -EFAULT;
1129 break;
1130 }
1131 if (ulval != ~0UL)
1132 cmpxchg(&sk->sk_pacing_status,
1133 SK_PACING_NONE,
1134 SK_PACING_NEEDED);
1135 /* Pairs with READ_ONCE() from sk_getsockopt() */
1136 WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1137 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1138 break;
1139 }
1140 case SO_INCOMING_CPU:
1141 WRITE_ONCE(sk->sk_incoming_cpu, val);
1142 break;
1143
1144 case SO_CNX_ADVICE:
1145 if (val == 1)
1146 dst_negative_advice(sk);
1147 break;
1148
1149 case SO_ZEROCOPY:
1150 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1151 if (!((sk->sk_type == SOCK_STREAM &&
1152 sk->sk_protocol == IPPROTO_TCP) ||
1153 (sk->sk_type == SOCK_DGRAM &&
1154 sk->sk_protocol == IPPROTO_UDP)))
1155 ret = -ENOTSUPP;
1156 } else if (sk->sk_family != PF_RDS) {
1157 ret = -ENOTSUPP;
1158 }
1159 if (!ret) {
1160 if (val < 0 || val > 1)
1161 ret = -EINVAL;
1162 else
1163 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1164 }
1165 break;
1166
1167 case SO_TXTIME:
1168 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1169 ret = -EPERM;
1170 } else if (optlen != sizeof(struct sock_txtime)) {
1171 ret = -EINVAL;
1172 } else if (copy_from_user(&sk_txtime, optval,
1173 sizeof(struct sock_txtime))) {
1174 ret = -EFAULT;
1175 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1176 ret = -EINVAL;
1177 } else {
1178 sock_valbool_flag(sk, SOCK_TXTIME, true);
1179 sk->sk_clockid = sk_txtime.clockid;
1180 sk->sk_txtime_deadline_mode =
1181 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1182 sk->sk_txtime_report_errors =
1183 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1184 }
1185 break;
1186
1187 case SO_BINDTOIFINDEX:
1188 ret = sock_setbindtodevice_locked(sk, val);
1189 break;
1190
1191 default:
1192 ret = -ENOPROTOOPT;
1193 break;
1194 }
1195 release_sock(sk);
1196 return ret;
1197}
1198EXPORT_SYMBOL(sock_setsockopt);
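
/*
 * Illustrative userspace view of the SO_RCVBUF handling above (a sketch,
 * assuming the standard socket API):
 *
 *	int val = 65536;
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *
 * A later getsockopt(SO_RCVBUF) reports roughly 131072, because the value
 * is doubled on the way in to cover struct sk_buff and other overhead,
 * after being clamped to sysctl_rmem_max unless SO_RCVBUFFORCE is used.
 */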
1199
1200static const struct cred *sk_get_peer_cred(struct sock *sk)
1201{
1202 const struct cred *cred;
1203
1204 spin_lock(&sk->sk_peer_lock);
1205 cred = get_cred(sk->sk_peer_cred);
1206 spin_unlock(&sk->sk_peer_lock);
1207
1208 return cred;
1209}
1210
1211static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1212 struct ucred *ucred)
1213{
1214 ucred->pid = pid_vnr(pid);
1215 ucred->uid = ucred->gid = -1;
1216 if (cred) {
1217 struct user_namespace *current_ns = current_user_ns();
1218
1219 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1220 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1221 }
1222}
1223
1224static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1225{
1226 struct user_namespace *user_ns = current_user_ns();
1227 int i;
1228
1229 for (i = 0; i < src->ngroups; i++)
1230 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1231 return -EFAULT;
1232
1233 return 0;
1234}
1235
1236int sock_getsockopt(struct socket *sock, int level, int optname,
1237 char __user *optval, int __user *optlen)
1238{
1239 struct sock *sk = sock->sk;
1240
1241 union {
1242 int val;
1243 u64 val64;
1244 unsigned long ulval;
1245 struct linger ling;
1246 struct old_timeval32 tm32;
1247 struct __kernel_old_timeval tm;
1248 struct __kernel_sock_timeval stm;
1249 struct sock_txtime txtime;
1250 } v;
1251
1252 int lv = sizeof(int);
1253 int len;
1254
1255 if (get_user(len, optlen))
1256 return -EFAULT;
1257 if (len < 0)
1258 return -EINVAL;
1259
1260 memset(&v, 0, sizeof(v));
1261
1262 switch (optname) {
1263 case SO_DEBUG:
1264 v.val = sock_flag(sk, SOCK_DBG);
1265 break;
1266
1267 case SO_DONTROUTE:
1268 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1269 break;
1270
1271 case SO_BROADCAST:
1272 v.val = sock_flag(sk, SOCK_BROADCAST);
1273 break;
1274
1275 case SO_SNDBUF:
1276 v.val = READ_ONCE(sk->sk_sndbuf);
1277 break;
1278
1279 case SO_RCVBUF:
1280 v.val = READ_ONCE(sk->sk_rcvbuf);
1281 break;
1282
1283 case SO_REUSEADDR:
1284 v.val = sk->sk_reuse;
1285 break;
1286
1287 case SO_REUSEPORT:
1288 v.val = sk->sk_reuseport;
1289 break;
1290
1291 case SO_KEEPALIVE:
1292 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1293 break;
1294
1295 case SO_TYPE:
1296 v.val = sk->sk_type;
1297 break;
1298
1299 case SO_PROTOCOL:
1300 v.val = sk->sk_protocol;
1301 break;
1302
1303 case SO_DOMAIN:
1304 v.val = sk->sk_family;
1305 break;
1306
1307 case SO_ERROR:
1308 v.val = -sock_error(sk);
1309 if (v.val == 0)
1310 v.val = xchg(&sk->sk_err_soft, 0);
1311 break;
1312
1313 case SO_OOBINLINE:
1314 v.val = sock_flag(sk, SOCK_URGINLINE);
1315 break;
1316
1317 case SO_NO_CHECK:
1318 v.val = sk->sk_no_check_tx;
1319 break;
1320
1321 case SO_PRIORITY:
1322 v.val = sk->sk_priority;
1323 break;
1324
1325 case SO_LINGER:
1326 lv = sizeof(v.ling);
1327 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
1328 v.ling.l_linger = sk->sk_lingertime / HZ;
1329 break;
1330
1331 case SO_BSDCOMPAT:
1332 sock_warn_obsolete_bsdism("getsockopt");
1333 break;
1334
1335 case SO_TIMESTAMP_OLD:
1336 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1337 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1338 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1339 break;
1340
1341 case SO_TIMESTAMPNS_OLD:
1342 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1343 break;
1344
1345 case SO_TIMESTAMP_NEW:
1346 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1347 break;
1348
1349 case SO_TIMESTAMPNS_NEW:
1350 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1351 break;
1352
1353 case SO_TIMESTAMPING_OLD:
1354 v.val = sk->sk_tsflags;
1355 break;
1356
1357 case SO_RCVTIMEO_OLD:
1358 case SO_RCVTIMEO_NEW:
1359 lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1360 break;
1361
1362 case SO_SNDTIMEO_OLD:
1363 case SO_SNDTIMEO_NEW:
1364 lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1365 break;
1366
1367 case SO_RCVLOWAT:
1368 v.val = READ_ONCE(sk->sk_rcvlowat);
1369 break;
1370
1371 case SO_SNDLOWAT:
1372 v.val = 1;
1373 break;
1374
1375 case SO_PASSCRED:
1376 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1377 break;
1378
1379 case SO_PEERCRED:
1380 {
1381 struct ucred peercred;
1382 if (len > sizeof(peercred))
1383 len = sizeof(peercred);
1384
1385 spin_lock(&sk->sk_peer_lock);
1386 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1387 spin_unlock(&sk->sk_peer_lock);
1388
1389 if (copy_to_user(optval, &peercred, len))
1390 return -EFAULT;
1391 goto lenout;
1392 }
1393
1394 case SO_PEERGROUPS:
1395 {
1396 const struct cred *cred;
1397 int ret, n;
1398
1399 cred = sk_get_peer_cred(sk);
1400 if (!cred)
1401 return -ENODATA;
1402
1403 n = cred->group_info->ngroups;
1404 if (len < n * sizeof(gid_t)) {
1405 len = n * sizeof(gid_t);
1406 put_cred(cred);
1407 return put_user(len, optlen) ? -EFAULT : -ERANGE;
1408 }
1409 len = n * sizeof(gid_t);
1410
1411 ret = groups_to_user((gid_t __user *)optval, cred->group_info);
1412 put_cred(cred);
1413 if (ret)
1414 return ret;
1415 goto lenout;
1416 }
1417
1418 case SO_PEERNAME:
1419 {
1420 char address[128];
1421
1422 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1423 if (lv < 0)
1424 return -ENOTCONN;
1425 if (lv < len)
1426 return -EINVAL;
1427 if (copy_to_user(optval, address, len))
1428 return -EFAULT;
1429 goto lenout;
1430 }
1431
1432 /* Dubious BSD thing... Probably nobody even uses it, but
1433 * the UNIX standard wants it for whatever reason... -DaveM
1434 */
1435 case SO_ACCEPTCONN:
1436 v.val = sk->sk_state == TCP_LISTEN;
1437 break;
1438
1439 case SO_PASSSEC:
1440 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1441 break;
1442
1443 case SO_PEERSEC:
1444 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1445
1446 case SO_MARK:
1447 v.val = sk->sk_mark;
1448 break;
1449
1450 case SO_RXQ_OVFL:
1451 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1452 break;
1453
1454 case SO_WIFI_STATUS:
1455 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1456 break;
1457
1458 case SO_PEEK_OFF:
1459 if (!sock->ops->set_peek_off)
1460 return -EOPNOTSUPP;
1461
1462 v.val = READ_ONCE(sk->sk_peek_off);
1463 break;
1464 case SO_NOFCS:
1465 v.val = sock_flag(sk, SOCK_NOFCS);
1466 break;
1467
1468 case SO_BINDTODEVICE:
1469 return sock_getbindtodevice(sk, optval, optlen, len);
1470
1471 case SO_GET_FILTER:
1472 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1473 if (len < 0)
1474 return len;
1475
1476 goto lenout;
1477
1478 case SO_LOCK_FILTER:
1479 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1480 break;
1481
1482 case SO_BPF_EXTENSIONS:
1483 v.val = bpf_tell_extensions();
1484 break;
1485
1486 case SO_SELECT_ERR_QUEUE:
1487 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1488 break;
1489
1490#ifdef CONFIG_NET_RX_BUSY_POLL
1491 case SO_BUSY_POLL:
1492 v.val = READ_ONCE(sk->sk_ll_usec);
1493 break;
1494#endif
1495
1496 case SO_MAX_PACING_RATE:
1497 /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
1498 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1499 lv = sizeof(v.ulval);
1500 v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
1501 } else {
1502 /* 32bit version */
1503 v.val = min_t(unsigned long, ~0U,
1504 READ_ONCE(sk->sk_max_pacing_rate));
1505 }
1506 break;
1507
1508 case SO_INCOMING_CPU:
1509 v.val = READ_ONCE(sk->sk_incoming_cpu);
1510 break;
1511
1512 case SO_MEMINFO:
1513 {
1514 u32 meminfo[SK_MEMINFO_VARS];
1515
1516 sk_get_meminfo(sk, meminfo);
1517
1518 len = min_t(unsigned int, len, sizeof(meminfo));
1519 if (copy_to_user(optval, &meminfo, len))
1520 return -EFAULT;
1521
1522 goto lenout;
1523 }
1524
1525#ifdef CONFIG_NET_RX_BUSY_POLL
1526 case SO_INCOMING_NAPI_ID:
1527 v.val = READ_ONCE(sk->sk_napi_id);
1528
1529 /* aggregate non-NAPI IDs down to 0 */
1530 if (v.val < MIN_NAPI_ID)
1531 v.val = 0;
1532
1533 break;
1534#endif
1535
1536 case SO_COOKIE:
1537 lv = sizeof(u64);
1538 if (len < lv)
1539 return -EINVAL;
1540 v.val64 = sock_gen_cookie(sk);
1541 break;
1542
1543 case SO_ZEROCOPY:
1544 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1545 break;
1546
1547 case SO_TXTIME:
1548 lv = sizeof(v.txtime);
1549 v.txtime.clockid = sk->sk_clockid;
1550 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1551 SOF_TXTIME_DEADLINE_MODE : 0;
1552 v.txtime.flags |= sk->sk_txtime_report_errors ?
1553 SOF_TXTIME_REPORT_ERRORS : 0;
1554 break;
1555
1556 case SO_BINDTOIFINDEX:
1557 v.val = sk->sk_bound_dev_if;
1558 break;
1559
1560 default:
1561 /* We implement the SO_SNDLOWAT etc to not be settable
1562 * (1003.1g 7).
1563 */
1564 return -ENOPROTOOPT;
1565 }
1566
1567 if (len > lv)
1568 len = lv;
1569 if (copy_to_user(optval, &v, len))
1570 return -EFAULT;
1571lenout:
1572 if (put_user(len, optlen))
1573 return -EFAULT;
1574 return 0;
1575}
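
/*
 * Note on the tail of sock_getsockopt() (added for clarity): the returned
 * data is truncated to min(len, lv) bytes and the length actually written
 * is reported back through optlen, so callers passing a short buffer get
 * a truncated value rather than an error (except for options such as
 * SO_COOKIE that explicitly demand a minimum size).
 */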
1576
1577/*
1578 * Initialize an sk_lock.
1579 *
1580 * (We also register the sk_lock with the lock validator.)
1581 */
1582static inline void sock_lock_init(struct sock *sk)
1583{
1584 if (sk->sk_kern_sock)
1585 sock_lock_init_class_and_name(
1586 sk,
1587 af_family_kern_slock_key_strings[sk->sk_family],
1588 af_family_kern_slock_keys + sk->sk_family,
1589 af_family_kern_key_strings[sk->sk_family],
1590 af_family_kern_keys + sk->sk_family);
1591 else
1592 sock_lock_init_class_and_name(
1593 sk,
1594 af_family_slock_key_strings[sk->sk_family],
1595 af_family_slock_keys + sk->sk_family,
1596 af_family_key_strings[sk->sk_family],
1597 af_family_keys + sk->sk_family);
1598}
1599
1600/*
1601 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1604 */
1605static void sock_copy(struct sock *nsk, const struct sock *osk)
1606{
1607#ifdef CONFIG_SECURITY_NETWORK
1608 void *sptr = nsk->sk_security;
1609#endif
1610 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1611
1612 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1613 osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1614
1615#ifdef CONFIG_SECURITY_NETWORK
1616 nsk->sk_security = sptr;
1617 security_sk_clone(osk, nsk);
1618#endif
1619}
1620
1621static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1622 int family)
1623{
1624 struct sock *sk;
1625 struct kmem_cache *slab;
1626
1627 slab = prot->slab;
1628 if (slab != NULL) {
1629 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1630 if (!sk)
1631 return sk;
1632 if (want_init_on_alloc(priority))
1633 sk_prot_clear_nulls(sk, prot->obj_size);
1634 } else
1635 sk = kmalloc(prot->obj_size, priority);
1636
1637 if (sk != NULL) {
1638 if (security_sk_alloc(sk, family, priority))
1639 goto out_free;
1640
1641 trace_android_rvh_sk_alloc(sk);
1642
1643 if (!try_module_get(prot->owner))
1644 goto out_free_sec;
1645 sk_tx_queue_clear(sk);
1646 }
1647
1648 return sk;
1649
1650out_free_sec:
1651 security_sk_free(sk);
1652 trace_android_rvh_sk_free(sk);
1653out_free:
1654 if (slab != NULL)
1655 kmem_cache_free(slab, sk);
1656 else
1657 kfree(sk);
1658 return NULL;
1659}
1660
1661static void sk_prot_free(struct proto *prot, struct sock *sk)
1662{
1663 struct kmem_cache *slab;
1664 struct module *owner;
1665
1666 owner = prot->owner;
1667 slab = prot->slab;
1668
1669 cgroup_sk_free(&sk->sk_cgrp_data);
1670 mem_cgroup_sk_free(sk);
1671 security_sk_free(sk);
1672 trace_android_rvh_sk_free(sk);
1673 if (slab != NULL)
1674 kmem_cache_free(slab, sk);
1675 else
1676 kfree(sk);
1677 module_put(owner);
1678}
1679
1680/**
1681 * sk_alloc - All socket objects are allocated here
1682 * @net: the applicable net namespace
1683 * @family: protocol family
1684 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1685 * @prot: struct proto associated with this new sock instance
1686 * @kern: is this to be a kernel socket?
1687 */
1688struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1689 struct proto *prot, int kern)
1690{
1691 struct sock *sk;
1692
1693 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1694 if (sk) {
1695 sk->sk_family = family;
1696 /*
1697 * See comment in struct sock definition to understand
1698 * why we need sk_prot_creator -acme
1699 */
1700 sk->sk_prot = sk->sk_prot_creator = prot;
1701 sk->sk_kern_sock = kern;
1702 sock_lock_init(sk);
1703 sk->sk_net_refcnt = kern ? 0 : 1;
1704 if (likely(sk->sk_net_refcnt)) {
1705 get_net(net);
1706 sock_inuse_add(net, 1);
1707 }
1708
1709 sock_net_set(sk, net);
1710 refcount_set(&sk->sk_wmem_alloc, 1);
1711
1712 mem_cgroup_sk_alloc(sk);
1713 cgroup_sk_alloc(&sk->sk_cgrp_data);
1714 sock_update_classid(&sk->sk_cgrp_data);
1715 sock_update_netprioidx(&sk->sk_cgrp_data);
1716 sk_tx_queue_clear(sk);
1717 }
1718
1719 return sk;
1720}
1721EXPORT_SYMBOL(sk_alloc);
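
/*
 * Note (added for clarity): kernel sockets (@kern != 0) are created with
 * sk_net_refcnt == 0, so they neither hold a reference on the network
 * namespace nor count towards the per-namespace sock_inuse statistics;
 * user sockets take both via get_net() and sock_inuse_add() above.
 */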
1722
1723/* Sockets having SOCK_RCU_FREE will call this function after one RCU
1724 * grace period. This is the case for UDP sockets and TCP listeners.
1725 */
1726static void __sk_destruct(struct rcu_head *head)
1727{
1728 struct sock *sk = container_of(head, struct sock, sk_rcu);
1729 struct sk_filter *filter;
1730
1731 if (sk->sk_destruct)
1732 sk->sk_destruct(sk);
1733
1734 filter = rcu_dereference_check(sk->sk_filter,
1735 refcount_read(&sk->sk_wmem_alloc) == 0);
1736 if (filter) {
1737 sk_filter_uncharge(sk, filter);
1738 RCU_INIT_POINTER(sk->sk_filter, NULL);
1739 }
1740
1741 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1742
1743#ifdef CONFIG_BPF_SYSCALL
1744 bpf_sk_storage_free(sk);
1745#endif
1746
1747 if (atomic_read(&sk->sk_omem_alloc))
1748 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1749 __func__, atomic_read(&sk->sk_omem_alloc));
1750
1751 if (sk->sk_frag.page) {
1752 put_page(sk->sk_frag.page);
1753 sk->sk_frag.page = NULL;
1754 }
1755
1756 /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
1757 put_cred(sk->sk_peer_cred);
1758 put_pid(sk->sk_peer_pid);
1759
1760 if (likely(sk->sk_net_refcnt))
1761 put_net(sock_net(sk));
1762 sk_prot_free(sk->sk_prot_creator, sk);
1763}
1764
1765void sk_destruct(struct sock *sk)
1766{
1767 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1768
1769 if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1770 reuseport_detach_sock(sk);
1771 use_call_rcu = true;
1772 }
1773
1774 if (use_call_rcu)
1775 call_rcu(&sk->sk_rcu, __sk_destruct);
1776 else
1777 __sk_destruct(&sk->sk_rcu);
1778}
1779
1780static void __sk_free(struct sock *sk)
1781{
1782 if (likely(sk->sk_net_refcnt))
1783 sock_inuse_add(sock_net(sk), -1);
1784
1785#ifdef CONFIG_SOCK_DIAG
1786 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1787 sock_diag_broadcast_destroy(sk);
1788 else
1789#endif
1790 sk_destruct(sk);
1791}
1792
1793void sk_free(struct sock *sk)
1794{
1795 /*
	 * We subtract one from sk_wmem_alloc, which tells us whether
	 * some packets are still in some tx queue.
	 * If the count is not zero, sock_wfree() will call __sk_free(sk) later.
1799 */
1800 if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1801 __sk_free(sk);
1802}
1803EXPORT_SYMBOL(sk_free);
1804
1805static void sk_init_common(struct sock *sk)
1806{
1807 skb_queue_head_init(&sk->sk_receive_queue);
1808 skb_queue_head_init(&sk->sk_write_queue);
1809 skb_queue_head_init(&sk->sk_error_queue);
1810
1811 rwlock_init(&sk->sk_callback_lock);
1812 lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1813 af_rlock_keys + sk->sk_family,
1814 af_family_rlock_key_strings[sk->sk_family]);
1815 lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1816 af_wlock_keys + sk->sk_family,
1817 af_family_wlock_key_strings[sk->sk_family]);
1818 lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1819 af_elock_keys + sk->sk_family,
1820 af_family_elock_key_strings[sk->sk_family]);
1821 lockdep_set_class_and_name(&sk->sk_callback_lock,
1822 af_callback_keys + sk->sk_family,
1823 af_family_clock_key_strings[sk->sk_family]);
1824}
1825
1826/**
1827 * sk_clone_lock - clone a socket, and lock its clone
1828 * @sk: the socket to clone
1829 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1830 *
1831 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1832 */
1833struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1834{
1835 struct sock *newsk;
1836 bool is_charged = true;
1837
1838 newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1839 if (newsk != NULL) {
1840 struct sk_filter *filter;
1841
1842 sock_copy(newsk, sk);
1843
1844 newsk->sk_prot_creator = sk->sk_prot;
1845
1846 /* SANITY */
1847 if (likely(newsk->sk_net_refcnt))
1848 get_net(sock_net(newsk));
1849 sk_node_init(&newsk->sk_node);
1850 sock_lock_init(newsk);
1851 bh_lock_sock(newsk);
1852 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
1853 newsk->sk_backlog.len = 0;
1854
1855 atomic_set(&newsk->sk_rmem_alloc, 0);
1856 /*
1857 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1858 */
1859 refcount_set(&newsk->sk_wmem_alloc, 1);
1860 atomic_set(&newsk->sk_omem_alloc, 0);
1861 sk_init_common(newsk);
1862
1863 newsk->sk_dst_cache = NULL;
1864 newsk->sk_dst_pending_confirm = 0;
1865 newsk->sk_wmem_queued = 0;
1866 newsk->sk_forward_alloc = 0;
1867 atomic_set(&newsk->sk_drops, 0);
1868 newsk->sk_send_head = NULL;
1869 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1870 atomic_set(&newsk->sk_zckey, 0);
1871
1872 sock_reset_flag(newsk, SOCK_DONE);
1873
1874 /* sk->sk_memcg will be populated at accept() time */
1875 newsk->sk_memcg = NULL;
1876
1877 cgroup_sk_clone(&newsk->sk_cgrp_data);
1878
1879 rcu_read_lock();
1880 filter = rcu_dereference(sk->sk_filter);
1881 if (filter != NULL)
			/* Though it's an empty new sock, the charging may fail
			 * if sysctl_optmem_max was changed between creation of
			 * the original socket and the cloning.
			 */
1886 is_charged = sk_filter_charge(newsk, filter);
1887 RCU_INIT_POINTER(newsk->sk_filter, filter);
1888 rcu_read_unlock();
1889
1890 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1891 /* We need to make sure that we don't uncharge the new
1892 * socket if we couldn't charge it in the first place
1893 * as otherwise we uncharge the parent's filter.
1894 */
1895 if (!is_charged)
1896 RCU_INIT_POINTER(newsk->sk_filter, NULL);
1897 sk_free_unlock_clone(newsk);
1898 newsk = NULL;
1899 goto out;
1900 }
1901 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1902
1903 if (bpf_sk_storage_clone(sk, newsk)) {
1904 sk_free_unlock_clone(newsk);
1905 newsk = NULL;
1906 goto out;
1907 }
1908
1909 newsk->sk_err = 0;
1910 newsk->sk_err_soft = 0;
1911 newsk->sk_priority = 0;
1912 newsk->sk_incoming_cpu = raw_smp_processor_id();
1913 if (likely(newsk->sk_net_refcnt))
1914 sock_inuse_add(sock_net(newsk), 1);
1915
1916 /*
1917 * Before updating sk_refcnt, we must commit prior changes to memory
1918 * (Documentation/RCU/rculist_nulls.txt for details)
1919 */
1920 smp_wmb();
1921 refcount_set(&newsk->sk_refcnt, 2);
1922
1923 /*
1924 * Increment the counter in the same struct proto as the master
1925 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1926 * is the same as sk->sk_prot->socks, as this field was copied
1927 * with memcpy).
1928 *
1929 * This _changes_ the previous behaviour, where
1930 * tcp_create_openreq_child always was incrementing the
1931 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1932 * to be taken into account in all callers. -acme
1933 */
1934 sk_refcnt_debug_inc(newsk);
1935 sk_set_socket(newsk, NULL);
1936 sk_tx_queue_clear(newsk);
1937 RCU_INIT_POINTER(newsk->sk_wq, NULL);
1938
1939 if (newsk->sk_prot->sockets_allocated)
1940 sk_sockets_allocated_inc(newsk);
1941
1942 if (sock_needs_netstamp(sk) &&
1943 newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1944 net_enable_timestamp();
1945 }
1946out:
1947 return newsk;
1948}
1949EXPORT_SYMBOL_GPL(sk_clone_lock);
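
/*
 * Minimal usage sketch (illustrative, not from this file):
 *
 *	newsk = sk_clone_lock(sk, GFP_ATOMIC);
 *	if (newsk) {
 *		... protocol-specific setup of newsk ...
 *		bh_unlock_sock(newsk);
 *	}
 *
 * The clone is returned bh-locked with sk_refcnt set to 2, and the caller
 * must unlock it even on its own error paths, e.g. via
 * sk_free_unlock_clone() below.
 */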
1950
1951void sk_free_unlock_clone(struct sock *sk)
1952{
	/* It is still a raw copy of the parent, so invalidate
	 * the destructor and do a plain sk_free() */
1955 sk->sk_destruct = NULL;
1956 bh_unlock_sock(sk);
1957 sk_free(sk);
1958}
1959EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1960
1961void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1962{
1963 u32 max_segs = 1;
1964
1965 sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
1966 if (sk->sk_route_caps & NETIF_F_GSO)
1967 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1968 sk->sk_route_caps &= ~sk->sk_route_nocaps;
1969 if (sk_can_gso(sk)) {
1970 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
1971 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1972 } else {
1973 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1974 sk->sk_gso_max_size = dst->dev->gso_max_size;
1975 max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1976 }
1977 }
1978 sk->sk_gso_max_segs = max_segs;
1979 sk_dst_set(sk, dst);
1980}
1981EXPORT_SYMBOL_GPL(sk_setup_caps);
1982
1983/*
1984 * Simple resource managers for sockets.
1985 */
1986
1987
1988/*
1989 * Write buffer destructor automatically called from kfree_skb.
1990 */
1991void sock_wfree(struct sk_buff *skb)
1992{
1993 struct sock *sk = skb->sk;
1994 unsigned int len = skb->truesize;
1995
1996 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1997 /*
1998 * Keep a reference on sk_wmem_alloc, this will be released
1999 * after sk_write_space() call
2000 */
2001 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2002 sk->sk_write_space(sk);
2003 len = 1;
2004 }
2005 /*
2006 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2007 * could not do because of in-flight packets
2008 */
2009 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2010 __sk_free(sk);
2011}
2012EXPORT_SYMBOL(sock_wfree);
2013
2014/* This variant of sock_wfree() is used by TCP,
2015 * since it sets SOCK_USE_WRITE_QUEUE.
2016 */
2017void __sock_wfree(struct sk_buff *skb)
2018{
2019 struct sock *sk = skb->sk;
2020
2021 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2022 __sk_free(sk);
2023}
2024
2025void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2026{
2027 skb_orphan(skb);
2028 skb->sk = sk;
2029#ifdef CONFIG_INET
2030 if (unlikely(!sk_fullsock(sk))) {
2031 skb->destructor = sock_edemux;
2032 sock_hold(sk);
2033 return;
2034 }
2035#endif
2036 skb->destructor = sock_wfree;
2037 skb_set_hash_from_sk(skb, sk);
2038 /*
	 * We used to take a refcount on sk, but the following operation
	 * is enough to guarantee sk_free() won't free this sock until
	 * all in-flight packets are completed.
2042 */
2043 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2044}
2045EXPORT_SYMBOL(skb_set_owner_w);
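
/*
 * Typical pairing with sock_wfree() above (illustrative sketch):
 *
 *	skb = alloc_skb(size, GFP_KERNEL);
 *	if (skb)
 *		skb_set_owner_w(skb, sk);
 *
 * charges skb->truesize to sk->sk_wmem_alloc; when the skb is eventually
 * freed, sock_wfree() uncharges the same amount and, for sockets that do
 * not use their own write queue, calls sk_write_space() so blocked
 * senders can make progress.
 */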
2046
2047static bool can_skb_orphan_partial(const struct sk_buff *skb)
2048{
2049#ifdef CONFIG_TLS_DEVICE
2050 /* Drivers depend on in-order delivery for crypto offload,
2051 * partial orphan breaks out-of-order-OK logic.
2052 */
2053 if (skb->decrypted)
2054 return false;
2055#endif
2056 return (skb->destructor == sock_wfree ||
2057 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2058}
2059
2060/* This helper is used by netem, as it can hold packets in its
2061 * delay queue. We want to allow the owner socket to send more
2062 * packets, as if they were already TX completed by a typical driver.
2063 * But we also want to keep skb->sk set because some packet schedulers
2064 * rely on it (sch_fq for example).
2065 */
2066void skb_orphan_partial(struct sk_buff *skb)
2067{
2068 if (skb_is_tcp_pure_ack(skb))
2069 return;
2070
2071 if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2072 return;
2073
2074 skb_orphan(skb);
2075}
2076EXPORT_SYMBOL(skb_orphan_partial);
2077
2078/*
2079 * Read buffer destructor automatically called from kfree_skb.
2080 */
2081void sock_rfree(struct sk_buff *skb)
2082{
2083 struct sock *sk = skb->sk;
2084 unsigned int len = skb->truesize;
2085
2086 atomic_sub(len, &sk->sk_rmem_alloc);
2087 sk_mem_uncharge(sk, len);
2088}
2089EXPORT_SYMBOL(sock_rfree);
2090
2091/*
2092 * Buffer destructor for skbs that are not used directly in read or write
2093 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2094 */
2095void sock_efree(struct sk_buff *skb)
2096{
2097 sock_put(skb->sk);
2098}
2099EXPORT_SYMBOL(sock_efree);
2100
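/* Accessors for the owning inode's uid and inode number. Both read under
 * sk_callback_lock, so they are safe against a concurrent sock_orphan()
 * detaching sk->sk_socket.
 */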
2101kuid_t sock_i_uid(struct sock *sk)
2102{
2103 kuid_t uid;
2104
2105 read_lock_bh(&sk->sk_callback_lock);
2106 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2107 read_unlock_bh(&sk->sk_callback_lock);
2108 return uid;
2109}
2110EXPORT_SYMBOL(sock_i_uid);
2111
2112unsigned long __sock_i_ino(struct sock *sk)
2113{
2114 unsigned long ino;
2115
2116 read_lock(&sk->sk_callback_lock);
2117 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2118 read_unlock(&sk->sk_callback_lock);
2119 return ino;
2120}
2121EXPORT_SYMBOL(__sock_i_ino);
2122
2123unsigned long sock_i_ino(struct sock *sk)
2124{
2125 unsigned long ino;
2126
2127 local_bh_disable();
2128 ino = __sock_i_ino(sk);
2129 local_bh_enable();
2130 return ino;
2131}
2132EXPORT_SYMBOL(sock_i_ino);
2133
2134/*
2135 * Allocate an skb from the socket's send buffer.
2136 */
2137struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2138 gfp_t priority)
2139{
2140 if (force ||
2141 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2142 struct sk_buff *skb = alloc_skb(size, priority);
2143
2144 if (skb) {
2145 skb_set_owner_w(skb, sk);
2146 return skb;
2147 }
2148 }
2149 return NULL;
2150}
2151EXPORT_SYMBOL(sock_wmalloc);
2152
2153static void sock_ofree(struct sk_buff *skb)
2154{
2155 struct sock *sk = skb->sk;
2156
2157 atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2158}
2159
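/* Allocate an skb charged to the socket's option memory (sk_omem_alloc),
 * bounded by sysctl_optmem_max and released via the sock_ofree() destructor
 * above.
 */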
2160struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2161 gfp_t priority)
2162{
2163 struct sk_buff *skb;
2164
2165 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2166 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2167 sysctl_optmem_max)
2168 return NULL;
2169
2170 skb = alloc_skb(size, priority);
2171 if (!skb)
2172 return NULL;
2173
2174 atomic_add(skb->truesize, &sk->sk_omem_alloc);
2175 skb->sk = sk;
2176 skb->destructor = sock_ofree;
2177 return skb;
2178}
2179
2180/*
2181 * Allocate a memory block from the socket's option memory buffer.
2182 */
2183void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2184{
2185 if ((unsigned int)size <= sysctl_optmem_max &&
2186 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2187 void *mem;
2188 /* First do the add, to avoid the race if kmalloc
2189 * might sleep.
2190 */
2191 atomic_add(size, &sk->sk_omem_alloc);
2192 mem = kmalloc(size, priority);
2193 if (mem)
2194 return mem;
2195 atomic_sub(size, &sk->sk_omem_alloc);
2196 }
2197 return NULL;
2198}
2199EXPORT_SYMBOL(sock_kmalloc);
2200
2201/* Free an option memory block. Note, we actually want the inline
2202 * here as this allows gcc to detect the nullify and fold away the
2203 * condition entirely.
2204 */
2205static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2206 const bool nullify)
2207{
2208 if (WARN_ON_ONCE(!mem))
2209 return;
2210 if (nullify)
2211 kzfree(mem);
2212 else
2213 kfree(mem);
2214 atomic_sub(size, &sk->sk_omem_alloc);
2215}
2216
2217void sock_kfree_s(struct sock *sk, void *mem, int size)
2218{
2219 __sock_kfree_s(sk, mem, size, false);
2220}
2221EXPORT_SYMBOL(sock_kfree_s);
2222
2223void sock_kzfree_s(struct sock *sk, void *mem, int size)
2224{
2225 __sock_kfree_s(sk, mem, size, true);
2226}
2227EXPORT_SYMBOL(sock_kzfree_s);
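/* Illustrative caller pattern (not from this file): option memory obtained
 * with sock_kmalloc() must be returned with sock_kfree_s()/sock_kzfree_s()
 * using the same size, so that sk_omem_alloc stays balanced:
 *
 *	void *opt = sock_kmalloc(sk, len, GFP_KERNEL);
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, len);
 */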
2228
2229/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2230   I think these locks should be removed for datagram sockets.
2231 */
2232static long sock_wait_for_wmem(struct sock *sk, long timeo)
2233{
2234 DEFINE_WAIT(wait);
2235
2236 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2237 for (;;) {
2238 if (!timeo)
2239 break;
2240 if (signal_pending(current))
2241 break;
2242 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2243 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2244 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2245 break;
2246 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2247 break;
2248 if (READ_ONCE(sk->sk_err))
2249 break;
2250 timeo = schedule_timeout(timeo);
2251 }
2252 finish_wait(sk_sleep(sk), &wait);
2253 return timeo;
2254}
2255
2256
2257/*
2258 * Generic send/receive buffer handlers
2259 */
2260
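/* Allocate an skb with @header_len bytes of linear space and @data_len bytes
 * in page fragments, waiting (unless @noblock) until the write allocation
 * drops below sk_sndbuf. On failure *@errcode is set and NULL is returned.
 */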
2261struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2262 unsigned long data_len, int noblock,
2263 int *errcode, int max_page_order)
2264{
2265 struct sk_buff *skb;
2266 long timeo;
2267 int err;
2268
2269 timeo = sock_sndtimeo(sk, noblock);
2270 for (;;) {
2271 err = sock_error(sk);
2272 if (err != 0)
2273 goto failure;
2274
2275 err = -EPIPE;
2276 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2277 goto failure;
2278
2279 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2280 break;
2281
2282 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2283 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2284 err = -EAGAIN;
2285 if (!timeo)
2286 goto failure;
2287 if (signal_pending(current))
2288 goto interrupted;
2289 timeo = sock_wait_for_wmem(sk, timeo);
2290 }
2291 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2292 errcode, sk->sk_allocation);
2293 if (skb)
2294 skb_set_owner_w(skb, sk);
2295 return skb;
2296
2297interrupted:
2298 err = sock_intr_errno(timeo);
2299failure:
2300 *errcode = err;
2301 return NULL;
2302}
2303EXPORT_SYMBOL(sock_alloc_send_pskb);
2304
2305struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2306 int noblock, int *errcode)
2307{
2308 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2309}
2310EXPORT_SYMBOL(sock_alloc_send_skb);
2311
2312int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2313 struct sockcm_cookie *sockc)
2314{
2315 u32 tsflags;
2316
2317 switch (cmsg->cmsg_type) {
2318 case SO_MARK:
2319 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2320 return -EPERM;
2321 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2322 return -EINVAL;
2323 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2324 break;
2325 case SO_TIMESTAMPING_OLD:
2326 case SO_TIMESTAMPING_NEW:
2327 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2328 return -EINVAL;
2329
2330 tsflags = *(u32 *)CMSG_DATA(cmsg);
2331 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2332 return -EINVAL;
2333
2334 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2335 sockc->tsflags |= tsflags;
2336 break;
2337 case SCM_TXTIME:
2338 if (!sock_flag(sk, SOCK_TXTIME))
2339 return -EINVAL;
2340 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2341 return -EINVAL;
2342 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2343 break;
2344 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2345 case SCM_RIGHTS:
2346 case SCM_CREDENTIALS:
2347 break;
2348 default:
2349 return -EINVAL;
2350 }
2351 return 0;
2352}
2353EXPORT_SYMBOL(__sock_cmsg_send);
2354
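/* Walk every control message in @msg and fold the SOL_SOCKET ones (SO_MARK,
 * SO_TIMESTAMPING, SCM_TXTIME) into @sockc via __sock_cmsg_send(); cmsgs for
 * other levels are left for the protocol to handle.
 */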
2355int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2356 struct sockcm_cookie *sockc)
2357{
2358 struct cmsghdr *cmsg;
2359 int ret;
2360
2361 for_each_cmsghdr(cmsg, msg) {
2362 if (!CMSG_OK(msg, cmsg))
2363 return -EINVAL;
2364 if (cmsg->cmsg_level != SOL_SOCKET)
2365 continue;
2366 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2367 if (ret)
2368 return ret;
2369 }
2370 return 0;
2371}
2372EXPORT_SYMBOL(sock_cmsg_send);
2373
2374static void sk_enter_memory_pressure(struct sock *sk)
2375{
2376 if (!sk->sk_prot->enter_memory_pressure)
2377 return;
2378
2379 sk->sk_prot->enter_memory_pressure(sk);
2380}
2381
2382static void sk_leave_memory_pressure(struct sock *sk)
2383{
2384 if (sk->sk_prot->leave_memory_pressure) {
2385 sk->sk_prot->leave_memory_pressure(sk);
2386 } else {
2387 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2388
2389 if (memory_pressure && READ_ONCE(*memory_pressure))
2390 WRITE_ONCE(*memory_pressure, 0);
2391 }
2392}
2393
2394DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2395
2396/**
2397 * skb_page_frag_refill - check that a page_frag contains enough room
2398 * @sz: minimum size of the fragment we want to get
2399 * @pfrag: pointer to page_frag
2400 * @gfp: priority for memory allocation
2401 *
2402 * Note: While this allocator tries to use high order pages, there is
2403 * no guarantee that allocations succeed. Therefore, @sz MUST be
2404 * less than or equal to PAGE_SIZE.
2405 */
2406bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2407{
2408 if (pfrag->page) {
2409 if (page_ref_count(pfrag->page) == 1) {
2410 pfrag->offset = 0;
2411 return true;
2412 }
2413 if (pfrag->offset + sz <= pfrag->size)
2414 return true;
2415 put_page(pfrag->page);
2416 }
2417
2418 pfrag->offset = 0;
2419 if (SKB_FRAG_PAGE_ORDER &&
2420 !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2421 /* Avoid direct reclaim but allow kswapd to wake */
2422 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2423 __GFP_COMP | __GFP_NOWARN |
2424 __GFP_NORETRY,
2425 SKB_FRAG_PAGE_ORDER);
2426 if (likely(pfrag->page)) {
2427 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2428 return true;
2429 }
2430 }
2431 pfrag->page = alloc_page(gfp);
2432 if (likely(pfrag->page)) {
2433 pfrag->size = PAGE_SIZE;
2434 return true;
2435 }
2436 return false;
2437}
2438EXPORT_SYMBOL(skb_page_frag_refill);
2439
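/* Refill the per-socket page fragment; on failure enter memory pressure and
 * moderate the send buffer so the caller backs off.
 */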
2440bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2441{
2442 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2443 return true;
2444
2445 sk_enter_memory_pressure(sk);
2446 sk_stream_moderate_sndbuf(sk);
2447 return false;
2448}
2449EXPORT_SYMBOL(sk_page_frag_refill);
2450
2451static void __lock_sock(struct sock *sk)
2452 __releases(&sk->sk_lock.slock)
2453 __acquires(&sk->sk_lock.slock)
2454{
2455 DEFINE_WAIT(wait);
2456
2457 for (;;) {
2458 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2459 TASK_UNINTERRUPTIBLE);
2460 spin_unlock_bh(&sk->sk_lock.slock);
2461 schedule();
2462 spin_lock_bh(&sk->sk_lock.slock);
2463 if (!sock_owned_by_user(sk))
2464 break;
2465 }
2466 finish_wait(&sk->sk_lock.wq, &wait);
2467}
2468
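/* Process every skb queued on the backlog while the socket was owned by the
 * caller. The socket spinlock is dropped around each batch so that softirq
 * producers can keep appending while we drain.
 */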
2469void __release_sock(struct sock *sk)
2470 __releases(&sk->sk_lock.slock)
2471 __acquires(&sk->sk_lock.slock)
2472{
2473 struct sk_buff *skb, *next;
2474
2475 while ((skb = sk->sk_backlog.head) != NULL) {
2476 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2477
2478 spin_unlock_bh(&sk->sk_lock.slock);
2479
2480 do {
2481 next = skb->next;
2482 prefetch(next);
2483 WARN_ON_ONCE(skb_dst_is_noref(skb));
2484 skb_mark_not_on_list(skb);
2485 sk_backlog_rcv(sk, skb);
2486
2487 cond_resched();
2488
2489 skb = next;
2490 } while (skb != NULL);
2491
2492 spin_lock_bh(&sk->sk_lock.slock);
2493 }
2494
2495 /*
2496	 * Doing the zeroing here guarantees we cannot loop forever
2497 * while a wild producer attempts to flood us.
2498 */
2499 sk->sk_backlog.len = 0;
2500}
2501
2502void __sk_flush_backlog(struct sock *sk)
2503{
2504 spin_lock_bh(&sk->sk_lock.slock);
2505 __release_sock(sk);
2506 spin_unlock_bh(&sk->sk_lock.slock);
2507}
2508
2509/**
2510 * sk_wait_data - wait for data to arrive at sk_receive_queue
2511 * @sk: sock to wait on
2512 * @timeo: for how long
2513 * @skb: last skb seen on sk_receive_queue
2514 *
2515 * Now socket state including sk->sk_err is changed only under lock,
2516 * hence we may omit checks after joining the wait queue.
2517 * We check the receive queue before schedule() only as an optimization;
2518 * it is very likely that release_sock() added new data.
2519 */
2520int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2521{
2522 DEFINE_WAIT_FUNC(wait, woken_wake_function);
2523 int rc;
2524
2525 add_wait_queue(sk_sleep(sk), &wait);
2526 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2527 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2528 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2529 remove_wait_queue(sk_sleep(sk), &wait);
2530 return rc;
2531}
2532EXPORT_SYMBOL(sk_wait_data);
2533
2534/**
2535 * __sk_mem_raise_allocated - increase memory_allocated
2536 * @sk: socket
2537 * @size: memory size to allocate
2538 * @amt: pages to allocate
2539 * @kind: allocation type
2540 *
2541 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2542 */
2543int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2544{
2545 struct proto *prot = sk->sk_prot;
2546 long allocated = sk_memory_allocated_add(sk, amt);
2547 bool charged = true;
2548
2549 if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2550 !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
2551 goto suppress_allocation;
2552
2553 /* Under limit. */
2554 if (allocated <= sk_prot_mem_limits(sk, 0)) {
2555 sk_leave_memory_pressure(sk);
2556 return 1;
2557 }
2558
2559 /* Under pressure. */
2560 if (allocated > sk_prot_mem_limits(sk, 1))
2561 sk_enter_memory_pressure(sk);
2562
2563 /* Over hard limit. */
2564 if (allocated > sk_prot_mem_limits(sk, 2))
2565 goto suppress_allocation;
2566
2567 /* guarantee minimum buffer size under pressure */
2568 if (kind == SK_MEM_RECV) {
2569 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2570 return 1;
2571
2572 } else { /* SK_MEM_SEND */
2573 int wmem0 = sk_get_wmem0(sk, prot);
2574
2575 if (sk->sk_type == SOCK_STREAM) {
2576 if (sk->sk_wmem_queued < wmem0)
2577 return 1;
2578 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2579 return 1;
2580 }
2581 }
2582
2583 if (sk_has_memory_pressure(sk)) {
2584 u64 alloc;
2585
2586 if (!sk_under_memory_pressure(sk))
2587 return 1;
2588 alloc = sk_sockets_allocated_read_positive(sk);
2589 if (sk_prot_mem_limits(sk, 2) > alloc *
2590 sk_mem_pages(sk->sk_wmem_queued +
2591 atomic_read(&sk->sk_rmem_alloc) +
2592 sk->sk_forward_alloc))
2593 return 1;
2594 }
2595
2596suppress_allocation:
2597
2598 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2599 sk_stream_moderate_sndbuf(sk);
2600
2601 /* Fail only if socket is _under_ its sndbuf.
2602		 * In this case we cannot block, so we have to fail.
2603 */
2604 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2605 return 1;
2606 }
2607
2608 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2609 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2610
2611 sk_memory_allocated_sub(sk, amt);
2612
2613 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2614 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2615
2616 return 0;
2617}
2618EXPORT_SYMBOL(__sk_mem_raise_allocated);
2619
2620/**
2621 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2622 * @sk: socket
2623 * @size: memory size to allocate
2624 * @kind: allocation type
2625 *
2626 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2627 * rmem allocation. This function assumes that protocols which have
2628 * memory_pressure use sk_wmem_queued as write buffer accounting.
2629 */
2630int __sk_mem_schedule(struct sock *sk, int size, int kind)
2631{
2632 int ret, amt = sk_mem_pages(size);
2633
2634 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2635 ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2636 if (!ret)
2637 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2638 return ret;
2639}
2640EXPORT_SYMBOL(__sk_mem_schedule);
2641
2642/**
2643 * __sk_mem_reduce_allocated - reclaim memory_allocated
2644 * @sk: socket
2645 * @amount: number of quanta
2646 *
2647 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2648 */
2649void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2650{
2651 sk_memory_allocated_sub(sk, amount);
2652
2653 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2654 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2655
2656 if (sk_under_global_memory_pressure(sk) &&
2657 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2658 sk_leave_memory_pressure(sk);
2659}
2660EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2661
2662/**
2663 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2664 * @sk: socket
2665 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2666 */
2667void __sk_mem_reclaim(struct sock *sk, int amount)
2668{
2669 amount >>= SK_MEM_QUANTUM_SHIFT;
2670 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2671 __sk_mem_reduce_allocated(sk, amount);
2672}
2673EXPORT_SYMBOL(__sk_mem_reclaim);
2674
2675int sk_set_peek_off(struct sock *sk, int val)
2676{
2677 WRITE_ONCE(sk->sk_peek_off, val);
2678 return 0;
2679}
2680EXPORT_SYMBOL_GPL(sk_set_peek_off);
2681
2682/*
2683 * Set of default routines for initialising struct proto_ops when
2684 * the protocol does not support a particular function. In certain
2685 * cases where it makes no sense for a protocol to have a "do nothing"
2686 * function, some default processing is provided.
2687 */
2688
2689int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2690{
2691 return -EOPNOTSUPP;
2692}
2693EXPORT_SYMBOL(sock_no_bind);
2694
2695int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2696 int len, int flags)
2697{
2698 return -EOPNOTSUPP;
2699}
2700EXPORT_SYMBOL(sock_no_connect);
2701
2702int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2703{
2704 return -EOPNOTSUPP;
2705}
2706EXPORT_SYMBOL(sock_no_socketpair);
2707
2708int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2709 bool kern)
2710{
2711 return -EOPNOTSUPP;
2712}
2713EXPORT_SYMBOL(sock_no_accept);
2714
2715int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2716 int peer)
2717{
2718 return -EOPNOTSUPP;
2719}
2720EXPORT_SYMBOL(sock_no_getname);
2721
2722int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2723{
2724 return -EOPNOTSUPP;
2725}
2726EXPORT_SYMBOL(sock_no_ioctl);
2727
2728int sock_no_listen(struct socket *sock, int backlog)
2729{
2730 return -EOPNOTSUPP;
2731}
2732EXPORT_SYMBOL(sock_no_listen);
2733
2734int sock_no_shutdown(struct socket *sock, int how)
2735{
2736 return -EOPNOTSUPP;
2737}
2738EXPORT_SYMBOL(sock_no_shutdown);
2739
2740int sock_no_setsockopt(struct socket *sock, int level, int optname,
2741 char __user *optval, unsigned int optlen)
2742{
2743 return -EOPNOTSUPP;
2744}
2745EXPORT_SYMBOL(sock_no_setsockopt);
2746
2747int sock_no_getsockopt(struct socket *sock, int level, int optname,
2748 char __user *optval, int __user *optlen)
2749{
2750 return -EOPNOTSUPP;
2751}
2752EXPORT_SYMBOL(sock_no_getsockopt);
2753
2754int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2755{
2756 return -EOPNOTSUPP;
2757}
2758EXPORT_SYMBOL(sock_no_sendmsg);
2759
2760int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2761{
2762 return -EOPNOTSUPP;
2763}
2764EXPORT_SYMBOL(sock_no_sendmsg_locked);
2765
2766int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2767 int flags)
2768{
2769 return -EOPNOTSUPP;
2770}
2771EXPORT_SYMBOL(sock_no_recvmsg);
2772
2773int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2774{
2775 /* Mirror missing mmap method error code */
2776 return -ENODEV;
2777}
2778EXPORT_SYMBOL(sock_no_mmap);
2779
2780/*
2781 * When a file is received (via SCM_RIGHTS, etc), we must bump the
2782 * various sock-based usage counts.
2783 */
2784void __receive_sock(struct file *file)
2785{
2786 struct socket *sock;
2787 int error;
2788
2789 /*
2790 * The resulting value of "error" is ignored here since we only
2791 * need to take action when the file is a socket and testing
2792 * "sock" for NULL is sufficient.
2793 */
2794 sock = sock_from_file(file, &error);
2795 if (sock) {
2796 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
2797 sock_update_classid(&sock->sk->sk_cgrp_data);
2798 }
2799}
2800
2801ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2802{
2803 ssize_t res;
2804 struct msghdr msg = {.msg_flags = flags};
2805 struct kvec iov;
2806 char *kaddr = kmap(page);
2807 iov.iov_base = kaddr + offset;
2808 iov.iov_len = size;
2809 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2810 kunmap(page);
2811 return res;
2812}
2813EXPORT_SYMBOL(sock_no_sendpage);
2814
2815ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2816 int offset, size_t size, int flags)
2817{
2818 ssize_t res;
2819 struct msghdr msg = {.msg_flags = flags};
2820 struct kvec iov;
2821 char *kaddr = kmap(page);
2822
2823 iov.iov_base = kaddr + offset;
2824 iov.iov_len = size;
2825 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2826 kunmap(page);
2827 return res;
2828}
2829EXPORT_SYMBOL(sock_no_sendpage_locked);
2830
2831/*
2832 * Default Socket Callbacks
2833 */
2834
2835static void sock_def_wakeup(struct sock *sk)
2836{
2837 struct socket_wq *wq;
2838
2839 rcu_read_lock();
2840 wq = rcu_dereference(sk->sk_wq);
2841 if (skwq_has_sleeper(wq))
2842 wake_up_interruptible_all(&wq->wait);
2843 rcu_read_unlock();
2844}
2845
2846static void sock_def_error_report(struct sock *sk)
2847{
2848 struct socket_wq *wq;
2849
2850 rcu_read_lock();
2851 wq = rcu_dereference(sk->sk_wq);
2852 if (skwq_has_sleeper(wq))
2853 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
2854 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2855 rcu_read_unlock();
2856}
2857
2858static void sock_def_readable(struct sock *sk)
2859{
2860 struct socket_wq *wq;
2861
2862 rcu_read_lock();
2863 wq = rcu_dereference(sk->sk_wq);
2864 if (skwq_has_sleeper(wq))
2865 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2866 EPOLLRDNORM | EPOLLRDBAND);
2867 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2868 rcu_read_unlock();
2869}
2870
2871static void sock_def_write_space(struct sock *sk)
2872{
2873 struct socket_wq *wq;
2874
2875 rcu_read_lock();
2876
2877 /* Do not wake up a writer until he can make "significant"
2878 * progress. --DaveM
2879 */
2880 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
2881 wq = rcu_dereference(sk->sk_wq);
2882 if (skwq_has_sleeper(wq))
2883 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2884 EPOLLWRNORM | EPOLLWRBAND);
2885
2886 /* Should agree with poll, otherwise some programs break */
2887 if (sock_writeable(sk))
2888 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2889 }
2890
2891 rcu_read_unlock();
2892}
2893
2894static void sock_def_destruct(struct sock *sk)
2895{
2896}
2897
2898void sk_send_sigurg(struct sock *sk)
2899{
2900 if (sk->sk_socket && sk->sk_socket->file)
2901 if (send_sigurg(&sk->sk_socket->file->f_owner))
2902 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2903}
2904EXPORT_SYMBOL(sk_send_sigurg);
2905
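/* Socket timer helpers: each pending timer holds a reference on the socket.
 * sk_reset_timer() takes the reference only if the timer was not already
 * pending; the sk_stop_timer() variants drop it only if a pending timer was
 * actually deleted.
 */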
2906void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2907 unsigned long expires)
2908{
2909 if (!mod_timer(timer, expires))
2910 sock_hold(sk);
2911}
2912EXPORT_SYMBOL(sk_reset_timer);
2913
2914void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2915{
2916 if (del_timer(timer))
2917 __sock_put(sk);
2918}
2919EXPORT_SYMBOL(sk_stop_timer);
2920
2921void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
2922{
2923 if (del_timer_sync(timer))
2924 __sock_put(sk);
2925}
2926EXPORT_SYMBOL(sk_stop_timer_sync);
2927
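/* Initialise the generic fields of a freshly allocated struct sock for the
 * given owner @uid: default buffer sizes, timeouts, callbacks and lockdep
 * classes. The refcount is set last, after an smp_wmb(), so lockless readers
 * never observe a half-initialised socket.
 */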
2928void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
2929{
2930 sk_init_common(sk);
2931 sk->sk_send_head = NULL;
2932
2933 timer_setup(&sk->sk_timer, NULL, 0);
2934
2935 sk->sk_allocation = GFP_KERNEL;
2936 sk->sk_rcvbuf = sysctl_rmem_default;
2937 sk->sk_sndbuf = sysctl_wmem_default;
2938 sk->sk_state = TCP_CLOSE;
2939 sk_set_socket(sk, sock);
2940
2941 sock_set_flag(sk, SOCK_ZAPPED);
2942
2943 if (sock) {
2944 sk->sk_type = sock->type;
2945 RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
2946 sock->sk = sk;
2947 } else {
2948 RCU_INIT_POINTER(sk->sk_wq, NULL);
2949 }
2950 sk->sk_uid = uid;
2951
2952 rwlock_init(&sk->sk_callback_lock);
2953 if (sk->sk_kern_sock)
2954 lockdep_set_class_and_name(
2955 &sk->sk_callback_lock,
2956 af_kern_callback_keys + sk->sk_family,
2957 af_family_kern_clock_key_strings[sk->sk_family]);
2958 else
2959 lockdep_set_class_and_name(
2960 &sk->sk_callback_lock,
2961 af_callback_keys + sk->sk_family,
2962 af_family_clock_key_strings[sk->sk_family]);
2963
2964 sk->sk_state_change = sock_def_wakeup;
2965 sk->sk_data_ready = sock_def_readable;
2966 sk->sk_write_space = sock_def_write_space;
2967 sk->sk_error_report = sock_def_error_report;
2968 sk->sk_destruct = sock_def_destruct;
2969
2970 sk->sk_frag.page = NULL;
2971 sk->sk_frag.offset = 0;
2972 sk->sk_peek_off = -1;
2973
2974 sk->sk_peer_pid = NULL;
2975 sk->sk_peer_cred = NULL;
2976 spin_lock_init(&sk->sk_peer_lock);
2977
2978 sk->sk_write_pending = 0;
2979 sk->sk_rcvlowat = 1;
2980 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
2981 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
2982
2983 sk->sk_stamp = SK_DEFAULT_STAMP;
2984#if BITS_PER_LONG==32
2985 seqlock_init(&sk->sk_stamp_seq);
2986#endif
2987 atomic_set(&sk->sk_zckey, 0);
2988
2989#ifdef CONFIG_NET_RX_BUSY_POLL
2990 sk->sk_napi_id = 0;
2991 sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read);
2992#endif
2993
2994 sk->sk_max_pacing_rate = ~0UL;
2995 sk->sk_pacing_rate = ~0UL;
2996 WRITE_ONCE(sk->sk_pacing_shift, 10);
2997 sk->sk_incoming_cpu = -1;
2998
2999 sk_rx_queue_clear(sk);
3000 /*
3001 * Before updating sk_refcnt, we must commit prior changes to memory
3002 * (Documentation/RCU/rculist_nulls.txt for details)
3003 */
3004 smp_wmb();
3005 refcount_set(&sk->sk_refcnt, 1);
3006 atomic_set(&sk->sk_drops, 0);
3007}
3008EXPORT_SYMBOL(sock_init_data_uid);
3009
3010void sock_init_data(struct socket *sock, struct sock *sk)
3011{
3012 kuid_t uid = sock ?
3013 SOCK_INODE(sock)->i_uid :
3014 make_kuid(sock_net(sk)->user_ns, 0);
3015
3016 sock_init_data_uid(sock, sk, uid);
3017}
3018EXPORT_SYMBOL(sock_init_data);
3019
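/* Process-context socket lock. Taking ownership (sk_lock.owned = 1) under the
 * spinlock gives sk_lock mutex semantics; packets arriving while the socket
 * is owned are queued on the backlog and drained by release_sock().
 */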
3020void lock_sock_nested(struct sock *sk, int subclass)
3021{
3022 might_sleep();
3023 spin_lock_bh(&sk->sk_lock.slock);
3024 if (sk->sk_lock.owned)
3025 __lock_sock(sk);
3026 sk->sk_lock.owned = 1;
3027 spin_unlock(&sk->sk_lock.slock);
3028 /*
3029 * The sk_lock has mutex_lock() semantics here:
3030 */
3031 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3032 local_bh_enable();
3033}
3034EXPORT_SYMBOL(lock_sock_nested);
3035
3036void release_sock(struct sock *sk)
3037{
3038 spin_lock_bh(&sk->sk_lock.slock);
3039 if (sk->sk_backlog.tail)
3040 __release_sock(sk);
3041
3042 /* Warning : release_cb() might need to release sk ownership,
3043	 * i.e. call sock_release_ownership(sk) before us.
3044 */
3045 if (sk->sk_prot->release_cb)
3046 sk->sk_prot->release_cb(sk);
3047
3048 sock_release_ownership(sk);
3049 if (waitqueue_active(&sk->sk_lock.wq))
3050 wake_up(&sk->sk_lock.wq);
3051 spin_unlock_bh(&sk->sk_lock.slock);
3052}
3053EXPORT_SYMBOL(release_sock);
3054
3055/**
3056 * lock_sock_fast - fast version of lock_sock
3057 * @sk: socket
3058 *
3059 * This version should be used for very small sections, where the process won't block.
3060 * Returns false if the fast path is taken:
3061 *
3062 * sk_lock.slock locked, owned = 0, BH disabled
3063 *
3064 * Returns true if the slow path is taken:
3065 *
3066 * sk_lock.slock unlocked, owned = 1, BH enabled
3067 */
3068bool lock_sock_fast(struct sock *sk)
3069{
3070 might_sleep();
3071 spin_lock_bh(&sk->sk_lock.slock);
3072
3073 if (!sk->sk_lock.owned)
3074 /*
3075 * Note : We must disable BH
3076 */
3077 return false;
3078
3079 __lock_sock(sk);
3080 sk->sk_lock.owned = 1;
3081 spin_unlock(&sk->sk_lock.slock);
3082 /*
3083 * The sk_lock has mutex_lock() semantics here:
3084 */
3085 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
3086 local_bh_enable();
3087 return true;
3088}
3089EXPORT_SYMBOL(lock_sock_fast);
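/* Illustrative caller pattern (not from this file): the return value must be
 * handed back to unlock_sock_fast() so it can undo whichever path was taken:
 *
 *	bool slow = lock_sock_fast(sk);
 *	... short, non-blocking critical section ...
 *	unlock_sock_fast(sk, slow);
 */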
3090
3091int sock_gettstamp(struct socket *sock, void __user *userstamp,
3092 bool timeval, bool time32)
3093{
3094 struct sock *sk = sock->sk;
3095 struct timespec64 ts;
3096
3097 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3098 ts = ktime_to_timespec64(sock_read_timestamp(sk));
3099 if (ts.tv_sec == -1)
3100 return -ENOENT;
3101 if (ts.tv_sec == 0) {
3102 ktime_t kt = ktime_get_real();
3103		sock_write_timestamp(sk, kt);
3104 ts = ktime_to_timespec64(kt);
3105 }
3106
3107 if (timeval)
3108 ts.tv_nsec /= 1000;
3109
3110#ifdef CONFIG_COMPAT_32BIT_TIME
3111 if (time32)
3112 return put_old_timespec32(&ts, userstamp);
3113#endif
3114#ifdef CONFIG_SPARC64
3115 /* beware of padding in sparc64 timeval */
3116 if (timeval && !in_compat_syscall()) {
3117 struct __kernel_old_timeval __user tv = {
3118 .tv_sec = ts.tv_sec,
3119 .tv_usec = ts.tv_nsec,
3120 };
3121 if (copy_to_user(userstamp, &tv, sizeof(tv)))
3122 return -EFAULT;
3123 return 0;
3124 }
3125#endif
3126 return put_timespec64(&ts, userstamp);
3127}
3128EXPORT_SYMBOL(sock_gettstamp);
3129
3130void sock_enable_timestamp(struct sock *sk, int flag)
3131{
3132 if (!sock_flag(sk, flag)) {
3133 unsigned long previous_flags = sk->sk_flags;
3134
3135 sock_set_flag(sk, flag);
3136 /*
3137 * we just set one of the two flags which require net
3138 * time stamping, but time stamping might have been on
3139 * already because of the other one
3140 */
3141 if (sock_needs_netstamp(sk) &&
3142 !(previous_flags & SK_FLAGS_TIMESTAMP))
3143 net_enable_timestamp();
3144 }
3145}
3146
3147int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3148 int level, int type)
3149{
3150 struct sock_exterr_skb *serr;
3151 struct sk_buff *skb;
3152 int copied, err;
3153
3154 err = -EAGAIN;
3155 skb = sock_dequeue_err_skb(sk);
3156 if (skb == NULL)
3157 goto out;
3158
3159 copied = skb->len;
3160 if (copied > len) {
3161 msg->msg_flags |= MSG_TRUNC;
3162 copied = len;
3163 }
3164 err = skb_copy_datagram_msg(skb, 0, msg, copied);
3165 if (err)
3166 goto out_free_skb;
3167
3168 sock_recv_timestamp(msg, sk, skb);
3169
3170 serr = SKB_EXT_ERR(skb);
3171 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3172
3173 msg->msg_flags |= MSG_ERRQUEUE;
3174 err = copied;
3175
3176out_free_skb:
3177 kfree_skb(skb);
3178out:
3179 return err;
3180}
3181EXPORT_SYMBOL(sock_recv_errqueue);
3182
3183/*
3184 *	Get a socket option on a socket.
3185 *
3186 * FIX: POSIX 1003.1g is very ambiguous here. It states that
3187 * asynchronous errors should be reported by getsockopt. We assume
3188 *	this means if you specify SO_ERROR (otherwise what's the point of it).
3189 */
3190int sock_common_getsockopt(struct socket *sock, int level, int optname,
3191 char __user *optval, int __user *optlen)
3192{
3193 struct sock *sk = sock->sk;
3194
3195 /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3196 return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3197}
3198EXPORT_SYMBOL(sock_common_getsockopt);
3199
3200#ifdef CONFIG_COMPAT
3201int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
3202 char __user *optval, int __user *optlen)
3203{
3204 struct sock *sk = sock->sk;
3205
3206 if (sk->sk_prot->compat_getsockopt != NULL)
3207 return sk->sk_prot->compat_getsockopt(sk, level, optname,
3208 optval, optlen);
3209 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3210}
3211EXPORT_SYMBOL(compat_sock_common_getsockopt);
3212#endif
3213
3214int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3215 int flags)
3216{
3217 struct sock *sk = sock->sk;
3218 int addr_len = 0;
3219 int err;
3220
3221 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3222 flags & ~MSG_DONTWAIT, &addr_len);
3223 if (err >= 0)
3224 msg->msg_namelen = addr_len;
3225 return err;
3226}
3227EXPORT_SYMBOL(sock_common_recvmsg);
3228
3229/*
3230 * Set socket options on an inet socket.
3231 */
3232int sock_common_setsockopt(struct socket *sock, int level, int optname,
3233 char __user *optval, unsigned int optlen)
3234{
3235 struct sock *sk = sock->sk;
3236
3237 /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3238 return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3239}
3240EXPORT_SYMBOL(sock_common_setsockopt);
3241
3242#ifdef CONFIG_COMPAT
3243int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
3244 char __user *optval, unsigned int optlen)
3245{
3246 struct sock *sk = sock->sk;
3247
3248 if (sk->sk_prot->compat_setsockopt != NULL)
3249 return sk->sk_prot->compat_setsockopt(sk, level, optname,
3250 optval, optlen);
3251 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3252}
3253EXPORT_SYMBOL(compat_sock_common_setsockopt);
3254#endif
3255
3256void sk_common_release(struct sock *sk)
3257{
3258 if (sk->sk_prot->destroy)
3259 sk->sk_prot->destroy(sk);
3260
3261 /*
3262 * Observation: when sock_common_release is called, processes have
3263	 * no access to the socket, but the network stack still does.
3264 * Step one, detach it from networking:
3265 *
3266 * A. Remove from hash tables.
3267 */
3268
3269 sk->sk_prot->unhash(sk);
3270
3271 /*
3272	 * At this point the socket cannot receive new packets, but it is possible
3273	 * that some packets are in flight because some CPU is running the receiver
3274	 * and did the hash table lookup before we unhashed the socket. They will
3275	 * reach the receive queue and will be purged by the socket destructor.
3276	 *
3277	 * Also we still have packets pending on the receive queue and probably
3278	 * our own packets waiting in device queues. sock_destroy will drain the
3279	 * receive queue, but transmitted packets will delay socket destruction
3280	 * until the last reference is released.
3281 */
3282
3283 sock_orphan(sk);
3284
3285 xfrm_sk_free_policy(sk);
3286
3287 sk_refcnt_debug_release(sk);
3288
3289 sock_put(sk);
3290}
3291EXPORT_SYMBOL(sk_common_release);
3292
3293void sk_get_meminfo(const struct sock *sk, u32 *mem)
3294{
3295 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3296
3297 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3298 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3299 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3300 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3301 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3302 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3303 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3304 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3305 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3306}
3307
3308#ifdef CONFIG_PROC_FS
3309#define PROTO_INUSE_NR 64 /* should be enough for the first time */
3310struct prot_inuse {
3311 int val[PROTO_INUSE_NR];
3312};
3313
3314static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3315
3316void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3317{
3318 __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3319}
3320EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3321
3322int sock_prot_inuse_get(struct net *net, struct proto *prot)
3323{
3324 int cpu, idx = prot->inuse_idx;
3325 int res = 0;
3326
3327 for_each_possible_cpu(cpu)
3328 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3329
3330 return res >= 0 ? res : 0;
3331}
3332EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3333
3334static void sock_inuse_add(struct net *net, int val)
3335{
3336 this_cpu_add(*net->core.sock_inuse, val);
3337}
3338
3339int sock_inuse_get(struct net *net)
3340{
3341 int cpu, res = 0;
3342
3343 for_each_possible_cpu(cpu)
3344 res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3345
3346 return res;
3347}
3348
3349EXPORT_SYMBOL_GPL(sock_inuse_get);
3350
3351static int __net_init sock_inuse_init_net(struct net *net)
3352{
3353 net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3354 if (net->core.prot_inuse == NULL)
3355 return -ENOMEM;
3356
3357 net->core.sock_inuse = alloc_percpu(int);
3358 if (net->core.sock_inuse == NULL)
3359 goto out;
3360
3361 return 0;
3362
3363out:
3364 free_percpu(net->core.prot_inuse);
3365 return -ENOMEM;
3366}
3367
3368static void __net_exit sock_inuse_exit_net(struct net *net)
3369{
3370 free_percpu(net->core.prot_inuse);
3371 free_percpu(net->core.sock_inuse);
3372}
3373
3374static struct pernet_operations net_inuse_ops = {
3375 .init = sock_inuse_init_net,
3376 .exit = sock_inuse_exit_net,
3377};
3378
3379static __init int net_inuse_init(void)
3380{
3381 if (register_pernet_subsys(&net_inuse_ops))
3382 panic("Cannot initialize net inuse counters");
3383
3384 return 0;
3385}
3386
3387core_initcall(net_inuse_init);
3388
3389static int assign_proto_idx(struct proto *prot)
3390{
3391 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3392
3393 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3394 pr_err("PROTO_INUSE_NR exhausted\n");
3395 return -ENOSPC;
3396 }
3397
3398 set_bit(prot->inuse_idx, proto_inuse_idx);
3399 return 0;
3400}
3401
3402static void release_proto_idx(struct proto *prot)
3403{
3404 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3405 clear_bit(prot->inuse_idx, proto_inuse_idx);
3406}
3407#else
3408static inline int assign_proto_idx(struct proto *prot)
3409{
3410 return 0;
3411}
3412
3413static inline void release_proto_idx(struct proto *prot)
3414{
3415}
3416
3417static void sock_inuse_add(struct net *net, int val)
3418{
3419}
3420#endif
3421
3422static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3423{
3424 if (!twsk_prot)
3425 return;
3426 kfree(twsk_prot->twsk_slab_name);
3427 twsk_prot->twsk_slab_name = NULL;
3428 kmem_cache_destroy(twsk_prot->twsk_slab);
3429 twsk_prot->twsk_slab = NULL;
3430}
3431
3432static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3433{
3434 if (!rsk_prot)
3435 return;
3436 kfree(rsk_prot->slab_name);
3437 rsk_prot->slab_name = NULL;
3438 kmem_cache_destroy(rsk_prot->slab);
3439 rsk_prot->slab = NULL;
3440}
3441
3442static int req_prot_init(const struct proto *prot)
3443{
3444 struct request_sock_ops *rsk_prot = prot->rsk_prot;
3445
3446 if (!rsk_prot)
3447 return 0;
3448
3449 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3450 prot->name);
3451 if (!rsk_prot->slab_name)
3452 return -ENOMEM;
3453
3454 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3455 rsk_prot->obj_size, 0,
3456 SLAB_ACCOUNT | prot->slab_flags,
3457 NULL);
3458
3459 if (!rsk_prot->slab) {
3460 pr_crit("%s: Can't create request sock SLAB cache!\n",
3461 prot->name);
3462 return -ENOMEM;
3463 }
3464 return 0;
3465}
3466
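/* Register a protocol: optionally create its sock, request_sock and
 * timewait_sock slab caches, assign it an inuse counter index and link it
 * on proto_list.
 */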
3467int proto_register(struct proto *prot, int alloc_slab)
3468{
3469 int ret = -ENOBUFS;
3470
3471 if (alloc_slab) {
3472 prot->slab = kmem_cache_create_usercopy(prot->name,
3473 prot->obj_size, 0,
3474 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3475 prot->slab_flags,
3476 prot->useroffset, prot->usersize,
3477 NULL);
3478
3479 if (prot->slab == NULL) {
3480 pr_crit("%s: Can't create sock SLAB cache!\n",
3481 prot->name);
3482 goto out;
3483 }
3484
3485 if (req_prot_init(prot))
3486 goto out_free_request_sock_slab;
3487
3488 if (prot->twsk_prot != NULL) {
3489 prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3490
3491 if (prot->twsk_prot->twsk_slab_name == NULL)
3492 goto out_free_request_sock_slab;
3493
3494 prot->twsk_prot->twsk_slab =
3495 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3496 prot->twsk_prot->twsk_obj_size,
3497 0,
3498 SLAB_ACCOUNT |
3499 prot->slab_flags,
3500 NULL);
3501 if (prot->twsk_prot->twsk_slab == NULL)
3502 goto out_free_timewait_sock_slab;
3503 }
3504 }
3505
3506 mutex_lock(&proto_list_mutex);
3507 ret = assign_proto_idx(prot);
3508 if (ret) {
3509 mutex_unlock(&proto_list_mutex);
3510 goto out_free_timewait_sock_slab;
3511 }
3512 list_add(&prot->node, &proto_list);
3513 mutex_unlock(&proto_list_mutex);
3514 return ret;
3515
3516out_free_timewait_sock_slab:
3517 if (alloc_slab && prot->twsk_prot)
3518 tw_prot_cleanup(prot->twsk_prot);
3519out_free_request_sock_slab:
3520 if (alloc_slab) {
3521 req_prot_cleanup(prot->rsk_prot);
3522
3523 kmem_cache_destroy(prot->slab);
3524 prot->slab = NULL;
3525 }
3526out:
3527 return ret;
3528}
3529EXPORT_SYMBOL(proto_register);
3530
3531void proto_unregister(struct proto *prot)
3532{
3533 mutex_lock(&proto_list_mutex);
3534 release_proto_idx(prot);
3535 list_del(&prot->node);
3536 mutex_unlock(&proto_list_mutex);
3537
3538 kmem_cache_destroy(prot->slab);
3539 prot->slab = NULL;
3540
3541 req_prot_cleanup(prot->rsk_prot);
3542 tw_prot_cleanup(prot->twsk_prot);
3543}
3544EXPORT_SYMBOL(proto_unregister);
3545
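/* Request the matching NETLINK_SOCK_DIAG module for @family/@protocol, after
 * checking that the family (and, for AF_INET, the protocol) is actually
 * registered.
 */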
3546int sock_load_diag_module(int family, int protocol)
3547{
3548 if (!protocol) {
3549 if (!sock_is_registered(family))
3550 return -ENOENT;
3551
3552 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3553 NETLINK_SOCK_DIAG, family);
3554 }
3555
3556#ifdef CONFIG_INET
3557 if (family == AF_INET &&
3558 protocol != IPPROTO_RAW &&
3559 !rcu_access_pointer(inet_protos[protocol]))
3560 return -ENOENT;
3561#endif
3562
3563 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3564 NETLINK_SOCK_DIAG, family, protocol);
3565}
3566EXPORT_SYMBOL(sock_load_diag_module);
3567
3568#ifdef CONFIG_PROC_FS
3569static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3570 __acquires(proto_list_mutex)
3571{
3572 mutex_lock(&proto_list_mutex);
3573 return seq_list_start_head(&proto_list, *pos);
3574}
3575
3576static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3577{
3578 return seq_list_next(v, &proto_list, pos);
3579}
3580
3581static void proto_seq_stop(struct seq_file *seq, void *v)
3582 __releases(proto_list_mutex)
3583{
3584 mutex_unlock(&proto_list_mutex);
3585}
3586
3587static char proto_method_implemented(const void *method)
3588{
3589 return method == NULL ? 'n' : 'y';
3590}
3591static long sock_prot_memory_allocated(struct proto *proto)
3592{
3593 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3594}
3595
3596static const char *sock_prot_memory_pressure(struct proto *proto)
3597{
3598 return proto->memory_pressure != NULL ?
3599 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3600}
3601
3602static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3603{
3604
3605 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
3606 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3607 proto->name,
3608 proto->obj_size,
3609 sock_prot_inuse_get(seq_file_net(seq), proto),
3610 sock_prot_memory_allocated(proto),
3611 sock_prot_memory_pressure(proto),
3612 proto->max_header,
3613 proto->slab == NULL ? "no" : "yes",
3614 module_name(proto->owner),
3615 proto_method_implemented(proto->close),
3616 proto_method_implemented(proto->connect),
3617 proto_method_implemented(proto->disconnect),
3618 proto_method_implemented(proto->accept),
3619 proto_method_implemented(proto->ioctl),
3620 proto_method_implemented(proto->init),
3621 proto_method_implemented(proto->destroy),
3622 proto_method_implemented(proto->shutdown),
3623 proto_method_implemented(proto->setsockopt),
3624 proto_method_implemented(proto->getsockopt),
3625 proto_method_implemented(proto->sendmsg),
3626 proto_method_implemented(proto->recvmsg),
3627 proto_method_implemented(proto->sendpage),
3628 proto_method_implemented(proto->bind),
3629 proto_method_implemented(proto->backlog_rcv),
3630 proto_method_implemented(proto->hash),
3631 proto_method_implemented(proto->unhash),
3632 proto_method_implemented(proto->get_port),
3633 proto_method_implemented(proto->enter_memory_pressure));
3634}
3635
3636static int proto_seq_show(struct seq_file *seq, void *v)
3637{
3638 if (v == &proto_list)
3639 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3640 "protocol",
3641 "size",
3642 "sockets",
3643 "memory",
3644 "press",
3645 "maxhdr",
3646 "slab",
3647 "module",
3648 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3649 else
3650 proto_seq_printf(seq, list_entry(v, struct proto, node));
3651 return 0;
3652}
3653
3654static const struct seq_operations proto_seq_ops = {
3655 .start = proto_seq_start,
3656 .next = proto_seq_next,
3657 .stop = proto_seq_stop,
3658 .show = proto_seq_show,
3659};
3660
3661static __net_init int proto_init_net(struct net *net)
3662{
3663 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3664 sizeof(struct seq_net_private)))
3665 return -ENOMEM;
3666
3667 return 0;
3668}
3669
3670static __net_exit void proto_exit_net(struct net *net)
3671{
3672 remove_proc_entry("protocols", net->proc_net);
3673}
3674
3675
3676static __net_initdata struct pernet_operations proto_net_ops = {
3677 .init = proto_init_net,
3678 .exit = proto_exit_net,
3679};
3680
3681static int __init proto_init(void)
3682{
3683 if (IS_ENABLED(CONFIG_PROC_STRIPPED))
3684 return 0;
3685 return register_pernet_subsys(&proto_net_ops);
3686}
3687
3688subsys_initcall(proto_init);
3689
3690#endif /* PROC_FS */
3691
3692#ifdef CONFIG_NET_RX_BUSY_POLL
3693bool sk_busy_loop_end(void *p, unsigned long start_time)
3694{
3695 struct sock *sk = p;
3696
3697 return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3698 sk_busy_loop_timeout(sk, start_time);
3699}
3700EXPORT_SYMBOL(sk_busy_loop_end);
3701#endif /* CONFIG_NET_RX_BUSY_POLL */