// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Generic socket support routines. Memory allocators, socket lock/release
8 * handler for protocols to use and generic option handler.
9 *
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
35 * code. The ACK stuff can wait and needs major
36 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 */
85
86#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87
88#include <asm/unaligned.h>
89#include <linux/capability.h>
90#include <linux/errno.h>
91#include <linux/errqueue.h>
92#include <linux/types.h>
93#include <linux/socket.h>
94#include <linux/in.h>
95#include <linux/kernel.h>
96#include <linux/module.h>
97#include <linux/proc_fs.h>
98#include <linux/seq_file.h>
99#include <linux/sched.h>
100#include <linux/sched/mm.h>
101#include <linux/timer.h>
102#include <linux/string.h>
103#include <linux/sockios.h>
104#include <linux/net.h>
105#include <linux/mm.h>
106#include <linux/slab.h>
107#include <linux/interrupt.h>
108#include <linux/poll.h>
109#include <linux/tcp.h>
110#include <linux/init.h>
111#include <linux/highmem.h>
112#include <linux/user_namespace.h>
113#include <linux/static_key.h>
114#include <linux/memcontrol.h>
115#include <linux/prefetch.h>
116
117#include <linux/uaccess.h>
118
119#include <linux/netdevice.h>
120#include <net/protocol.h>
121#include <linux/skbuff.h>
122#include <net/net_namespace.h>
123#include <net/request_sock.h>
124#include <net/sock.h>
125#include <linux/net_tstamp.h>
126#include <net/xfrm.h>
127#include <linux/ipsec.h>
128#include <net/cls_cgroup.h>
129#include <net/netprio_cgroup.h>
130#include <linux/sock_diag.h>
131
132#include <linux/filter.h>
133#include <net/sock_reuseport.h>
134#include <net/bpf_sk_storage.h>
135
136#include <trace/events/sock.h>
137#include <trace/hooks/net.h>
138
139#include <net/tcp.h>
140#include <net/busy_poll.h>
141
142static DEFINE_MUTEX(proto_list_mutex);
143static LIST_HEAD(proto_list);
144static atomic64_t cookie_gen;
145
146static void sock_inuse_add(struct net *net, int val);
147
148/**
149 * sk_ns_capable - General socket capability test
150 * @sk: Socket to use a capability on or through
151 * @user_ns: The user namespace of the capability to use
152 * @cap: The capability to use
153 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and the current process has the capability
 * @cap in the user namespace @user_ns.
157 */
158bool sk_ns_capable(const struct sock *sk,
159 struct user_namespace *user_ns, int cap)
160{
161 return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
162 ns_capable(user_ns, cap);
163}
164EXPORT_SYMBOL(sk_ns_capable);
165
166/**
167 * sk_capable - Socket global capability test
168 * @sk: Socket to use a capability on or through
169 * @cap: The global capability to use
170 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and the current process has the capability
 * @cap in all user namespaces.
174 */
175bool sk_capable(const struct sock *sk, int cap)
176{
177 return sk_ns_capable(sk, &init_user_ns, cap);
178}
179EXPORT_SYMBOL(sk_capable);
180
181/**
182 * sk_net_capable - Network namespace socket capability test
183 * @sk: Socket to use a capability on or through
184 * @cap: The capability to use
185 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and the current process has the capability
 * @cap over the network namespace the socket is a member of.
189 */
190bool sk_net_capable(const struct sock *sk, int cap)
191{
192 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
193}
194EXPORT_SYMBOL(sk_net_capable);
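
/*
 * Illustrative use of the helpers above (a sketch, not taken from this
 * file): protocol code typically gates privileged operations with e.g.
 *
 *	if (!sk_capable(sk, CAP_NET_RAW))
 *		return -EPERM;
 *
 * which honours both the capability the socket's opener held at creation
 * time and the credentials of the process currently making the call.
 */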
195
196/*
197 * Each address family might have different locking rules, so we have
198 * one slock key per address family and separate keys for internal and
199 * userspace sockets.
200 */
201static struct lock_class_key af_family_keys[AF_MAX];
202static struct lock_class_key af_family_kern_keys[AF_MAX];
203static struct lock_class_key af_family_slock_keys[AF_MAX];
204static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
205
206/*
207 * Make lock validator output more readable. (we pre-construct these
208 * strings build-time, so that runtime initialization of socket
209 * locks is fast):
210 */
211
212#define _sock_locks(x) \
213 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \
214 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \
215 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \
216 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \
217 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \
218 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \
219 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \
220 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \
221 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \
222 x "27" , x "28" , x "AF_CAN" , \
223 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \
224 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
225 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
226 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
227 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
228 x "AF_MAX"
229
230static const char *const af_family_key_strings[AF_MAX+1] = {
231 _sock_locks("sk_lock-")
232};
233static const char *const af_family_slock_key_strings[AF_MAX+1] = {
234 _sock_locks("slock-")
235};
236static const char *const af_family_clock_key_strings[AF_MAX+1] = {
237 _sock_locks("clock-")
238};
239
240static const char *const af_family_kern_key_strings[AF_MAX+1] = {
241 _sock_locks("k-sk_lock-")
242};
243static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
244 _sock_locks("k-slock-")
245};
246static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
247 _sock_locks("k-clock-")
248};
249static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
250 _sock_locks("rlock-")
251};
252static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
253 _sock_locks("wlock-")
254};
255static const char *const af_family_elock_key_strings[AF_MAX+1] = {
256 _sock_locks("elock-")
257};
258
259/*
260 * sk_callback_lock and sk queues locking rules are per-address-family,
261 * so split the lock classes by using a per-AF key:
262 */
263static struct lock_class_key af_callback_keys[AF_MAX];
264static struct lock_class_key af_rlock_keys[AF_MAX];
265static struct lock_class_key af_wlock_keys[AF_MAX];
266static struct lock_class_key af_elock_keys[AF_MAX];
267static struct lock_class_key af_kern_callback_keys[AF_MAX];
268
269/* Run time adjustable parameters. */
270__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
271EXPORT_SYMBOL(sysctl_wmem_max);
272__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
273EXPORT_SYMBOL(sysctl_rmem_max);
274__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
275__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
276
277/* Maximal space eaten by iovec or ancillary data plus some space */
278int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
279EXPORT_SYMBOL(sysctl_optmem_max);
280
281int sysctl_tstamp_allow_data __read_mostly = 1;
282
283DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
284EXPORT_SYMBOL_GPL(memalloc_socks_key);
285
286/**
287 * sk_set_memalloc - sets %SOCK_MEMALLOC
288 * @sk: socket to set it on
289 *
290 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
291 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements.
293 */
294void sk_set_memalloc(struct sock *sk)
295{
296 sock_set_flag(sk, SOCK_MEMALLOC);
297 sk->sk_allocation |= __GFP_MEMALLOC;
298 static_branch_inc(&memalloc_socks_key);
299}
300EXPORT_SYMBOL_GPL(sk_set_memalloc);
301
302void sk_clear_memalloc(struct sock *sk)
303{
304 sock_reset_flag(sk, SOCK_MEMALLOC);
305 sk->sk_allocation &= ~__GFP_MEMALLOC;
306 static_branch_dec(&memalloc_socks_key);
307
308 /*
309 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
310 * progress of swapping. SOCK_MEMALLOC may be cleared while
311 * it has rmem allocations due to the last swapfile being deactivated
312 * but there is a risk that the socket is unusable due to exceeding
313 * the rmem limits. Reclaim the reserves and obey rmem limits again.
314 */
315 sk_mem_reclaim(sk);
316}
317EXPORT_SYMBOL_GPL(sk_clear_memalloc);
318
319int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
320{
321 int ret;
322 unsigned int noreclaim_flag;
323
324 /* these should have been dropped before queueing */
325 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
326
327 noreclaim_flag = memalloc_noreclaim_save();
328 ret = sk->sk_backlog_rcv(sk, skb);
329 memalloc_noreclaim_restore(noreclaim_flag);
330
331 return ret;
332}
333EXPORT_SYMBOL(__sk_backlog_rcv);
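
/*
 * Sketch of the intended SOCK_MEMALLOC lifecycle (illustrative only, e.g.
 * for a swap-over-network transport such as NBD or swap-over-NFS):
 *
 *	sk_set_memalloc(sk);	-> socket may dip into emergency reserves
 *	... swapfile active, socket services writeout under memory pressure ...
 *	sk_clear_memalloc(sk);	-> reserves reclaimed, rmem limits enforced again
 *
 * While the flag is set, __sk_backlog_rcv() above runs the backlog with
 * reclaim disabled so that receive processing cannot recurse into reclaim.
 */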
334
335static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
336{
337 struct __kernel_sock_timeval tv;
338 int size;
339
340 if (timeo == MAX_SCHEDULE_TIMEOUT) {
341 tv.tv_sec = 0;
342 tv.tv_usec = 0;
343 } else {
344 tv.tv_sec = timeo / HZ;
345 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
346 }
347
348 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
349 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
350 *(struct old_timeval32 *)optval = tv32;
351 return sizeof(tv32);
352 }
353
354 if (old_timeval) {
355 struct __kernel_old_timeval old_tv;
356 old_tv.tv_sec = tv.tv_sec;
357 old_tv.tv_usec = tv.tv_usec;
358 *(struct __kernel_old_timeval *)optval = old_tv;
359 size = sizeof(old_tv);
360 } else {
361 *(struct __kernel_sock_timeval *)optval = tv;
362 size = sizeof(tv);
363 }
364
365 return size;
366}
367
368static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool old_timeval)
369{
370 struct __kernel_sock_timeval tv;
371
372 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
373 struct old_timeval32 tv32;
374
375 if (optlen < sizeof(tv32))
376 return -EINVAL;
377
378 if (copy_from_user(&tv32, optval, sizeof(tv32)))
379 return -EFAULT;
380 tv.tv_sec = tv32.tv_sec;
381 tv.tv_usec = tv32.tv_usec;
382 } else if (old_timeval) {
383 struct __kernel_old_timeval old_tv;
384
385 if (optlen < sizeof(old_tv))
386 return -EINVAL;
387 if (copy_from_user(&old_tv, optval, sizeof(old_tv)))
388 return -EFAULT;
389 tv.tv_sec = old_tv.tv_sec;
390 tv.tv_usec = old_tv.tv_usec;
391 } else {
392 if (optlen < sizeof(tv))
393 return -EINVAL;
394 if (copy_from_user(&tv, optval, sizeof(tv)))
395 return -EFAULT;
396 }
397 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
398 return -EDOM;
399
400 if (tv.tv_sec < 0) {
401 static int warned __read_mostly;
402
403 *timeo_p = 0;
404 if (warned < 10 && net_ratelimit()) {
405 warned++;
406 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
407 __func__, current->comm, task_pid_nr(current));
408 }
409 return 0;
410 }
411 *timeo_p = MAX_SCHEDULE_TIMEOUT;
412 if (tv.tv_sec == 0 && tv.tv_usec == 0)
413 return 0;
414 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
415 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
416 return 0;
417}
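
/*
 * Worked example (assuming HZ == 100): a userspace timeval of
 * { .tv_sec = 1, .tv_usec = 500000 } is stored by sock_set_timeout() as
 * 1 * HZ + DIV_ROUND_UP(500000, USEC_PER_SEC / HZ) = 150 jiffies, and
 * sock_get_timeout() converts those 150 jiffies back to 1s 500000us.
 * A timeval of { 0, 0 } means "no timeout" and maps to
 * MAX_SCHEDULE_TIMEOUT in both directions.
 */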
418
419static void sock_warn_obsolete_bsdism(const char *name)
420{
421 static int warned;
422 static char warncomm[TASK_COMM_LEN];
423 if (strcmp(warncomm, current->comm) && warned < 5) {
424 strcpy(warncomm, current->comm);
425 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
426 warncomm, name);
427 warned++;
428 }
429}
430
431static bool sock_needs_netstamp(const struct sock *sk)
432{
433 switch (sk->sk_family) {
434 case AF_UNSPEC:
435 case AF_UNIX:
436 return false;
437 default:
438 return true;
439 }
440}
441
442static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
443{
444 if (sk->sk_flags & flags) {
445 sk->sk_flags &= ~flags;
446 if (sock_needs_netstamp(sk) &&
447 !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
448 net_disable_timestamp();
449 }
450}
451
452
453int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
454{
455 unsigned long flags;
456 struct sk_buff_head *list = &sk->sk_receive_queue;
457
458 if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
459 atomic_inc(&sk->sk_drops);
460 trace_sock_rcvqueue_full(sk, skb);
461 return -ENOMEM;
462 }
463
464 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
465 atomic_inc(&sk->sk_drops);
466 return -ENOBUFS;
467 }
468
469 skb->dev = NULL;
470 skb_set_owner_r(skb, sk);
471
	/* We escape from the RCU-protected region here, so make sure we
	 * don't leak a non-refcounted dst.
	 */
475 skb_dst_force(skb);
476
477 spin_lock_irqsave(&list->lock, flags);
478 sock_skb_set_dropcount(sk, skb);
479 __skb_queue_tail(list, skb);
480 spin_unlock_irqrestore(&list->lock, flags);
481
482 if (!sock_flag(sk, SOCK_DEAD))
483 sk->sk_data_ready(sk);
484 return 0;
485}
486EXPORT_SYMBOL(__sock_queue_rcv_skb);
487
488int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
489{
490 int err;
491
492 err = sk_filter(sk, skb);
493 if (err)
494 return err;
495
496 return __sock_queue_rcv_skb(sk, skb);
497}
498EXPORT_SYMBOL(sock_queue_rcv_skb);
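
/*
 * In short (descriptive note, not from the original source): the two
 * helpers above implement the generic receive path - sk_filter() runs the
 * attached socket filter, the skb is charged to sk_rmem_alloc via
 * skb_set_owner_r(), drop-count bookkeeping is done under the queue lock,
 * and sk_data_ready() wakes any reader unless the socket is already dead.
 */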
499
500int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
501 const int nested, unsigned int trim_cap, bool refcounted)
502{
503 int rc = NET_RX_SUCCESS;
504
505 if (sk_filter_trim_cap(sk, skb, trim_cap))
506 goto discard_and_relse;
507
508 skb->dev = NULL;
509
510 if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
511 atomic_inc(&sk->sk_drops);
512 goto discard_and_relse;
513 }
514 if (nested)
515 bh_lock_sock_nested(sk);
516 else
517 bh_lock_sock(sk);
518 if (!sock_owned_by_user(sk)) {
519 /*
520 * trylock + unlock semantics:
521 */
522 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
523
524 rc = sk_backlog_rcv(sk, skb);
525
526 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
527 } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
528 bh_unlock_sock(sk);
529 atomic_inc(&sk->sk_drops);
530 goto discard_and_relse;
531 }
532
533 bh_unlock_sock(sk);
534out:
535 if (refcounted)
536 sock_put(sk);
537 return rc;
538discard_and_relse:
539 kfree_skb(skb);
540 goto out;
541}
542EXPORT_SYMBOL(__sk_receive_skb);
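
/*
 * Note (added for clarity): __sk_receive_skb() has two delivery paths.
 * If the socket is not owned by a user context, it calls sk_backlog_rcv()
 * directly under the bh lock, using trylock/unlock lockdep annotations;
 * otherwise the skb is appended to the socket backlog, bounded by
 * sk_rcvbuf, and processed later from release_sock().
 */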
543
544u64 sock_gen_cookie(struct sock *sk)
545{
546 while (1) {
547 u64 res = atomic64_read(&sk->sk_cookie);
548
549 if (res)
550 return res;
551 res = atomic64_inc_return(&cookie_gen);
552 atomic64_cmpxchg(&sk->sk_cookie, 0, res);
553 }
554}
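
/*
 * Note on the loop above (added for clarity): several CPUs may race to
 * assign a cookie, but only the first atomic64_cmpxchg() from 0 wins;
 * the losers simply re-read sk->sk_cookie on the next iteration and
 * return the winning value, so a socket's cookie never changes once set.
 */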
555
556struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
557{
558 struct dst_entry *dst = __sk_dst_get(sk);
559
560 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
561 sk_tx_queue_clear(sk);
562 WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
563 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
564 dst_release(dst);
565 return NULL;
566 }
567
568 return dst;
569}
570EXPORT_SYMBOL(__sk_dst_check);
571
572struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
573{
574 struct dst_entry *dst = sk_dst_get(sk);
575
576 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
577 sk_dst_reset(sk);
578 dst_release(dst);
579 return NULL;
580 }
581
582 return dst;
583}
584EXPORT_SYMBOL(sk_dst_check);
585
586static int sock_setbindtodevice_locked(struct sock *sk, int ifindex)
587{
588 int ret = -ENOPROTOOPT;
589#ifdef CONFIG_NETDEVICES
590 struct net *net = sock_net(sk);
591
592 /* Sorry... */
593 ret = -EPERM;
594 if (!ns_capable(net->user_ns, CAP_NET_RAW))
595 goto out;
596
597 ret = -EINVAL;
598 if (ifindex < 0)
599 goto out;
600
601 sk->sk_bound_dev_if = ifindex;
602 if (sk->sk_prot->rehash)
603 sk->sk_prot->rehash(sk);
604 sk_dst_reset(sk);
605
606 ret = 0;
607
608out:
609#endif
610
611 return ret;
612}
613
614static int sock_setbindtodevice(struct sock *sk, char __user *optval,
615 int optlen)
616{
617 int ret = -ENOPROTOOPT;
618#ifdef CONFIG_NETDEVICES
619 struct net *net = sock_net(sk);
620 char devname[IFNAMSIZ];
621 int index;
622
623 ret = -EINVAL;
624 if (optlen < 0)
625 goto out;
626
627 /* Bind this socket to a particular device like "eth0",
628 * as specified in the passed interface name. If the
629 * name is "" or the option length is zero the socket
630 * is not bound.
631 */
632 if (optlen > IFNAMSIZ - 1)
633 optlen = IFNAMSIZ - 1;
634 memset(devname, 0, sizeof(devname));
635
636 ret = -EFAULT;
637 if (copy_from_user(devname, optval, optlen))
638 goto out;
639
640 index = 0;
641 if (devname[0] != '\0') {
642 struct net_device *dev;
643
644 rcu_read_lock();
645 dev = dev_get_by_name_rcu(net, devname);
646 if (dev)
647 index = dev->ifindex;
648 rcu_read_unlock();
649 ret = -ENODEV;
650 if (!dev)
651 goto out;
652 }
653
654 lock_sock(sk);
655 ret = sock_setbindtodevice_locked(sk, index);
656 release_sock(sk);
657
658out:
659#endif
660
661 return ret;
662}
663
664static int sock_getbindtodevice(struct sock *sk, char __user *optval,
665 int __user *optlen, int len)
666{
667 int ret = -ENOPROTOOPT;
668#ifdef CONFIG_NETDEVICES
669 struct net *net = sock_net(sk);
670 char devname[IFNAMSIZ];
671
672 if (sk->sk_bound_dev_if == 0) {
673 len = 0;
674 goto zero;
675 }
676
677 ret = -EINVAL;
678 if (len < IFNAMSIZ)
679 goto out;
680
681 ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
682 if (ret)
683 goto out;
684
685 len = strlen(devname) + 1;
686
687 ret = -EFAULT;
688 if (copy_to_user(optval, devname, len))
689 goto out;
690
691zero:
692 ret = -EFAULT;
693 if (put_user(len, optlen))
694 goto out;
695
696 ret = 0;
697
698out:
699#endif
700
701 return ret;
702}
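
/*
 * Illustrative userspace usage (assumes the standard setsockopt() API):
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0"));
 *
 * binds the socket to eth0 (CAP_NET_RAW required), while an empty name or
 * a zero option length removes the binding again.
 */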
703
704static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
705{
706 if (valbool)
707 sock_set_flag(sk, bit);
708 else
709 sock_reset_flag(sk, bit);
710}
711
712bool sk_mc_loop(struct sock *sk)
713{
714 if (dev_recursion_level())
715 return false;
716 if (!sk)
717 return true;
718 /* IPV6_ADDRFORM can change sk->sk_family under us. */
719 switch (READ_ONCE(sk->sk_family)) {
720 case AF_INET:
721 return inet_sk(sk)->mc_loop;
722#if IS_ENABLED(CONFIG_IPV6)
723 case AF_INET6:
724 return inet6_sk(sk)->mc_loop;
725#endif
726 }
727 WARN_ON_ONCE(1);
728 return true;
729}
730EXPORT_SYMBOL(sk_mc_loop);
731
732/*
733 * This is meant for all protocols to use and covers goings on
734 * at the socket level. Everything here is generic.
735 */
736
737int sock_setsockopt(struct socket *sock, int level, int optname,
738 char __user *optval, unsigned int optlen)
739{
740 struct sock_txtime sk_txtime;
741 struct sock *sk = sock->sk;
742 int val;
743 int valbool;
744 struct linger ling;
745 int ret = 0;
746
747 /*
748 * Options without arguments
749 */
750
751 if (optname == SO_BINDTODEVICE)
752 return sock_setbindtodevice(sk, optval, optlen);
753
754 if (optlen < sizeof(int))
755 return -EINVAL;
756
757 if (get_user(val, (int __user *)optval))
758 return -EFAULT;
759
760 valbool = val ? 1 : 0;
761
762 lock_sock(sk);
763
764 switch (optname) {
765 case SO_DEBUG:
766 if (val && !capable(CAP_NET_ADMIN))
767 ret = -EACCES;
768 else
769 sock_valbool_flag(sk, SOCK_DBG, valbool);
770 break;
771 case SO_REUSEADDR:
772 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
773 break;
774 case SO_REUSEPORT:
775 sk->sk_reuseport = valbool;
776 break;
777 case SO_TYPE:
778 case SO_PROTOCOL:
779 case SO_DOMAIN:
780 case SO_ERROR:
781 ret = -ENOPROTOOPT;
782 break;
783 case SO_DONTROUTE:
784 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
785 sk_dst_reset(sk);
786 break;
787 case SO_BROADCAST:
788 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
789 break;
790 case SO_SNDBUF:
		/* Don't error on this; BSD doesn't, and if you think
		 * about it, this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints.
		 */
796 val = min_t(u32, val, sysctl_wmem_max);
797set_sndbuf:
798 /* Ensure val * 2 fits into an int, to prevent max_t()
799 * from treating it as a negative value.
800 */
801 val = min_t(int, val, INT_MAX / 2);
802 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
803 WRITE_ONCE(sk->sk_sndbuf,
804 max_t(int, val * 2, SOCK_MIN_SNDBUF));
805 /* Wake up sending tasks if we upped the value. */
806 sk->sk_write_space(sk);
807 break;
808
809 case SO_SNDBUFFORCE:
810 if (!capable(CAP_NET_ADMIN)) {
811 ret = -EPERM;
812 break;
813 }
814
815 /* No negative values (to prevent underflow, as val will be
816 * multiplied by 2).
817 */
818 if (val < 0)
819 val = 0;
820 goto set_sndbuf;
821
822 case SO_RCVBUF:
		/* Don't error on this; BSD doesn't, and if you think
		 * about it, this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints.
		 */
828 val = min_t(u32, val, sysctl_rmem_max);
829set_rcvbuf:
830 /* Ensure val * 2 fits into an int, to prevent max_t()
831 * from treating it as a negative value.
832 */
833 val = min_t(int, val, INT_MAX / 2);
834 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
835 /*
836 * We double it on the way in to account for
837 * "struct sk_buff" etc. overhead. Applications
838 * assume that the SO_RCVBUF setting they make will
839 * allow that much actual data to be received on that
840 * socket.
841 *
842 * Applications are unaware that "struct sk_buff" and
843 * other overheads allocate from the receive buffer
844 * during socket buffer allocation.
845 *
846 * And after considering the possible alternatives,
847 * returning the value we actually used in getsockopt
848 * is the most desirable behavior.
849 */
850 WRITE_ONCE(sk->sk_rcvbuf,
851 max_t(int, val * 2, SOCK_MIN_RCVBUF));
852 break;
853
854 case SO_RCVBUFFORCE:
855 if (!capable(CAP_NET_ADMIN)) {
856 ret = -EPERM;
857 break;
858 }
859
860 /* No negative values (to prevent underflow, as val will be
861 * multiplied by 2).
862 */
863 if (val < 0)
864 val = 0;
865 goto set_rcvbuf;
866
867 case SO_KEEPALIVE:
868 if (sk->sk_prot->keepalive)
869 sk->sk_prot->keepalive(sk, valbool);
870 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
871 break;
872
873 case SO_OOBINLINE:
874 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
875 break;
876
877 case SO_NO_CHECK:
878 sk->sk_no_check_tx = valbool;
879 break;
880
881 case SO_PRIORITY:
882 if ((val >= 0 && val <= 6) ||
883 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
884 sk->sk_priority = val;
885 else
886 ret = -EPERM;
887 break;
888
889 case SO_LINGER:
890 if (optlen < sizeof(ling)) {
891 ret = -EINVAL; /* 1003.1g */
892 break;
893 }
894 if (copy_from_user(&ling, optval, sizeof(ling))) {
895 ret = -EFAULT;
896 break;
897 }
898 if (!ling.l_onoff)
899 sock_reset_flag(sk, SOCK_LINGER);
900 else {
901#if (BITS_PER_LONG == 32)
902 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
903 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
904 else
905#endif
906 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
907 sock_set_flag(sk, SOCK_LINGER);
908 }
909 break;
910
911 case SO_BSDCOMPAT:
912 sock_warn_obsolete_bsdism("setsockopt");
913 break;
914
915 case SO_PASSCRED:
916 if (valbool)
917 set_bit(SOCK_PASSCRED, &sock->flags);
918 else
919 clear_bit(SOCK_PASSCRED, &sock->flags);
920 break;
921
922 case SO_TIMESTAMP_OLD:
923 case SO_TIMESTAMP_NEW:
924 case SO_TIMESTAMPNS_OLD:
925 case SO_TIMESTAMPNS_NEW:
926 if (valbool) {
927 if (optname == SO_TIMESTAMP_NEW || optname == SO_TIMESTAMPNS_NEW)
928 sock_set_flag(sk, SOCK_TSTAMP_NEW);
929 else
930 sock_reset_flag(sk, SOCK_TSTAMP_NEW);
931
932 if (optname == SO_TIMESTAMP_OLD || optname == SO_TIMESTAMP_NEW)
933 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
934 else
935 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
936 sock_set_flag(sk, SOCK_RCVTSTAMP);
937 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
938 } else {
939 sock_reset_flag(sk, SOCK_RCVTSTAMP);
940 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
941 }
942 break;
943
944 case SO_TIMESTAMPING_NEW:
945 case SO_TIMESTAMPING_OLD:
946 if (val & ~SOF_TIMESTAMPING_MASK) {
947 ret = -EINVAL;
948 break;
949 }
950
951 if (val & SOF_TIMESTAMPING_OPT_ID &&
952 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
953 if (sk->sk_protocol == IPPROTO_TCP &&
954 sk->sk_type == SOCK_STREAM) {
955 if ((1 << sk->sk_state) &
956 (TCPF_CLOSE | TCPF_LISTEN)) {
957 ret = -EINVAL;
958 break;
959 }
960 sk->sk_tskey = tcp_sk(sk)->snd_una;
961 } else {
962 sk->sk_tskey = 0;
963 }
964 }
965
966 if (val & SOF_TIMESTAMPING_OPT_STATS &&
967 !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
968 ret = -EINVAL;
969 break;
970 }
971
972 sk->sk_tsflags = val;
973 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
974
975 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
976 sock_enable_timestamp(sk,
977 SOCK_TIMESTAMPING_RX_SOFTWARE);
978 else
979 sock_disable_timestamp(sk,
980 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
981 break;
982
983 case SO_RCVLOWAT:
984 if (val < 0)
985 val = INT_MAX;
986 if (sock->ops->set_rcvlowat)
987 ret = sock->ops->set_rcvlowat(sk, val);
988 else
989 WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
990 break;
991
992 case SO_RCVTIMEO_OLD:
993 case SO_RCVTIMEO_NEW:
994 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen, optname == SO_RCVTIMEO_OLD);
995 break;
996
997 case SO_SNDTIMEO_OLD:
998 case SO_SNDTIMEO_NEW:
999 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen, optname == SO_SNDTIMEO_OLD);
1000 break;
1001
1002 case SO_ATTACH_FILTER:
1003 ret = -EINVAL;
1004 if (optlen == sizeof(struct sock_fprog)) {
1005 struct sock_fprog fprog;
1006
1007 ret = -EFAULT;
1008 if (copy_from_user(&fprog, optval, sizeof(fprog)))
1009 break;
1010
1011 ret = sk_attach_filter(&fprog, sk);
1012 }
1013 break;
1014
1015 case SO_ATTACH_BPF:
1016 ret = -EINVAL;
1017 if (optlen == sizeof(u32)) {
1018 u32 ufd;
1019
1020 ret = -EFAULT;
1021 if (copy_from_user(&ufd, optval, sizeof(ufd)))
1022 break;
1023
1024 ret = sk_attach_bpf(ufd, sk);
1025 }
1026 break;
1027
1028 case SO_ATTACH_REUSEPORT_CBPF:
1029 ret = -EINVAL;
1030 if (optlen == sizeof(struct sock_fprog)) {
1031 struct sock_fprog fprog;
1032
1033 ret = -EFAULT;
1034 if (copy_from_user(&fprog, optval, sizeof(fprog)))
1035 break;
1036
1037 ret = sk_reuseport_attach_filter(&fprog, sk);
1038 }
1039 break;
1040
1041 case SO_ATTACH_REUSEPORT_EBPF:
1042 ret = -EINVAL;
1043 if (optlen == sizeof(u32)) {
1044 u32 ufd;
1045
1046 ret = -EFAULT;
1047 if (copy_from_user(&ufd, optval, sizeof(ufd)))
1048 break;
1049
1050 ret = sk_reuseport_attach_bpf(ufd, sk);
1051 }
1052 break;
1053
1054 case SO_DETACH_REUSEPORT_BPF:
1055 ret = reuseport_detach_prog(sk);
1056 break;
1057
1058 case SO_DETACH_FILTER:
1059 ret = sk_detach_filter(sk);
1060 break;
1061
1062 case SO_LOCK_FILTER:
1063 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1064 ret = -EPERM;
1065 else
1066 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1067 break;
1068
1069 case SO_PASSSEC:
1070 if (valbool)
1071 set_bit(SOCK_PASSSEC, &sock->flags);
1072 else
1073 clear_bit(SOCK_PASSSEC, &sock->flags);
1074 break;
1075 case SO_MARK:
1076 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1077 ret = -EPERM;
1078 } else if (val != sk->sk_mark) {
1079 sk->sk_mark = val;
1080 sk_dst_reset(sk);
1081 }
1082 break;
1083
1084 case SO_RXQ_OVFL:
1085 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1086 break;
1087
1088 case SO_WIFI_STATUS:
1089 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1090 break;
1091
1092 case SO_PEEK_OFF:
1093 if (sock->ops->set_peek_off)
1094 ret = sock->ops->set_peek_off(sk, val);
1095 else
1096 ret = -EOPNOTSUPP;
1097 break;
1098
1099 case SO_NOFCS:
1100 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1101 break;
1102
1103 case SO_SELECT_ERR_QUEUE:
1104 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1105 break;
1106
1107#ifdef CONFIG_NET_RX_BUSY_POLL
1108 case SO_BUSY_POLL:
1109 /* allow unprivileged users to decrease the value */
1110 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1111 ret = -EPERM;
1112 else {
1113 if (val < 0)
1114 ret = -EINVAL;
1115 else
1116 WRITE_ONCE(sk->sk_ll_usec, val);
1117 }
1118 break;
1119#endif
1120
1121 case SO_MAX_PACING_RATE:
1122 {
1123 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1124
1125 if (sizeof(ulval) != sizeof(val) &&
1126 optlen >= sizeof(ulval) &&
1127 get_user(ulval, (unsigned long __user *)optval)) {
1128 ret = -EFAULT;
1129 break;
1130 }
1131 if (ulval != ~0UL)
1132 cmpxchg(&sk->sk_pacing_status,
1133 SK_PACING_NONE,
1134 SK_PACING_NEEDED);
1135 /* Pairs with READ_ONCE() from sk_getsockopt() */
1136 WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1137 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1138 break;
1139 }
1140 case SO_INCOMING_CPU:
1141 WRITE_ONCE(sk->sk_incoming_cpu, val);
1142 break;
1143
1144 case SO_CNX_ADVICE:
1145 if (val == 1)
1146 dst_negative_advice(sk);
1147 break;
1148
1149 case SO_ZEROCOPY:
1150 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1151 if (!((sk->sk_type == SOCK_STREAM &&
1152 sk->sk_protocol == IPPROTO_TCP) ||
1153 (sk->sk_type == SOCK_DGRAM &&
1154 sk->sk_protocol == IPPROTO_UDP)))
1155 ret = -ENOTSUPP;
1156 } else if (sk->sk_family != PF_RDS) {
1157 ret = -ENOTSUPP;
1158 }
1159 if (!ret) {
1160 if (val < 0 || val > 1)
1161 ret = -EINVAL;
1162 else
1163 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1164 }
1165 break;
1166
1167 case SO_TXTIME:
1168 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1169 ret = -EPERM;
1170 } else if (optlen != sizeof(struct sock_txtime)) {
1171 ret = -EINVAL;
1172 } else if (copy_from_user(&sk_txtime, optval,
1173 sizeof(struct sock_txtime))) {
1174 ret = -EFAULT;
1175 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1176 ret = -EINVAL;
1177 } else {
1178 sock_valbool_flag(sk, SOCK_TXTIME, true);
1179 sk->sk_clockid = sk_txtime.clockid;
1180 sk->sk_txtime_deadline_mode =
1181 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1182 sk->sk_txtime_report_errors =
1183 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1184 }
1185 break;
1186
1187 case SO_BINDTOIFINDEX:
1188 ret = sock_setbindtodevice_locked(sk, val);
1189 break;
1190
1191 default:
1192 ret = -ENOPROTOOPT;
1193 break;
1194 }
1195 release_sock(sk);
1196 return ret;
1197}
1198EXPORT_SYMBOL(sock_setsockopt);
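
/*
 * Illustrative userspace view of the SO_RCVBUF handling above (a sketch,
 * assuming the standard socket API):
 *
 *	int val = 65536;
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *
 * A later getsockopt(SO_RCVBUF) reports roughly 131072, because the value
 * is doubled on the way in to cover struct sk_buff and other overhead,
 * after being clamped to sysctl_rmem_max unless SO_RCVBUFFORCE is used.
 */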
1199
1200static const struct cred *sk_get_peer_cred(struct sock *sk)
1201{
1202 const struct cred *cred;
1203
1204 spin_lock(&sk->sk_peer_lock);
1205 cred = get_cred(sk->sk_peer_cred);
1206 spin_unlock(&sk->sk_peer_lock);
1207
1208 return cred;
1209}
1210
1211static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1212 struct ucred *ucred)
1213{
1214 ucred->pid = pid_vnr(pid);
1215 ucred->uid = ucred->gid = -1;
1216 if (cred) {
1217 struct user_namespace *current_ns = current_user_ns();
1218
1219 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1220 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1221 }
1222}
1223
1224static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1225{
1226 struct user_namespace *user_ns = current_user_ns();
1227 int i;
1228
1229 for (i = 0; i < src->ngroups; i++)
1230 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1231 return -EFAULT;
1232
1233 return 0;
1234}
1235
1236int sock_getsockopt(struct socket *sock, int level, int optname,
1237 char __user *optval, int __user *optlen)
1238{
1239 struct sock *sk = sock->sk;
1240
1241 union {
1242 int val;
1243 u64 val64;
1244 unsigned long ulval;
1245 struct linger ling;
1246 struct old_timeval32 tm32;
1247 struct __kernel_old_timeval tm;
1248 struct __kernel_sock_timeval stm;
1249 struct sock_txtime txtime;
1250 } v;
1251
1252 int lv = sizeof(int);
1253 int len;
1254
1255 if (get_user(len, optlen))
1256 return -EFAULT;
1257 if (len < 0)
1258 return -EINVAL;
1259
1260 memset(&v, 0, sizeof(v));
1261
1262 switch (optname) {
1263 case SO_DEBUG:
1264 v.val = sock_flag(sk, SOCK_DBG);
1265 break;
1266
1267 case SO_DONTROUTE:
1268 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1269 break;
1270
1271 case SO_BROADCAST:
1272 v.val = sock_flag(sk, SOCK_BROADCAST);
1273 break;
1274
1275 case SO_SNDBUF:
1276 v.val = READ_ONCE(sk->sk_sndbuf);
1277 break;
1278
1279 case SO_RCVBUF:
1280 v.val = READ_ONCE(sk->sk_rcvbuf);
1281 break;
1282
1283 case SO_REUSEADDR:
1284 v.val = sk->sk_reuse;
1285 break;
1286
1287 case SO_REUSEPORT:
1288 v.val = sk->sk_reuseport;
1289 break;
1290
1291 case SO_KEEPALIVE:
1292 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1293 break;
1294
1295 case SO_TYPE:
1296 v.val = sk->sk_type;
1297 break;
1298
1299 case SO_PROTOCOL:
1300 v.val = sk->sk_protocol;
1301 break;
1302
1303 case SO_DOMAIN:
1304 v.val = sk->sk_family;
1305 break;
1306
1307 case SO_ERROR:
1308 v.val = -sock_error(sk);
1309 if (v.val == 0)
1310 v.val = xchg(&sk->sk_err_soft, 0);
1311 break;
1312
1313 case SO_OOBINLINE:
1314 v.val = sock_flag(sk, SOCK_URGINLINE);
1315 break;
1316
1317 case SO_NO_CHECK:
1318 v.val = sk->sk_no_check_tx;
1319 break;
1320
1321 case SO_PRIORITY:
1322 v.val = sk->sk_priority;
1323 break;
1324
1325 case SO_LINGER:
1326 lv = sizeof(v.ling);
1327 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
1328 v.ling.l_linger = sk->sk_lingertime / HZ;
1329 break;
1330
1331 case SO_BSDCOMPAT:
1332 sock_warn_obsolete_bsdism("getsockopt");
1333 break;
1334
1335 case SO_TIMESTAMP_OLD:
1336 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1337 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1338 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1339 break;
1340
1341 case SO_TIMESTAMPNS_OLD:
1342 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1343 break;
1344
1345 case SO_TIMESTAMP_NEW:
1346 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1347 break;
1348
1349 case SO_TIMESTAMPNS_NEW:
1350 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1351 break;
1352
1353 case SO_TIMESTAMPING_OLD:
1354 v.val = sk->sk_tsflags;
1355 break;
1356
1357 case SO_RCVTIMEO_OLD:
1358 case SO_RCVTIMEO_NEW:
1359 lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1360 break;
1361
1362 case SO_SNDTIMEO_OLD:
1363 case SO_SNDTIMEO_NEW:
1364 lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1365 break;
1366
1367 case SO_RCVLOWAT:
1368 v.val = READ_ONCE(sk->sk_rcvlowat);
1369 break;
1370
1371 case SO_SNDLOWAT:
1372 v.val = 1;
1373 break;
1374
1375 case SO_PASSCRED:
1376 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1377 break;
1378
1379 case SO_PEERCRED:
1380 {
1381 struct ucred peercred;
1382 if (len > sizeof(peercred))
1383 len = sizeof(peercred);
1384
1385 spin_lock(&sk->sk_peer_lock);
1386 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1387 spin_unlock(&sk->sk_peer_lock);
1388
1389 if (copy_to_user(optval, &peercred, len))
1390 return -EFAULT;
1391 goto lenout;
1392 }
1393
1394 case SO_PEERGROUPS:
1395 {
1396 const struct cred *cred;
1397 int ret, n;
1398
1399 cred = sk_get_peer_cred(sk);
1400 if (!cred)
1401 return -ENODATA;
1402
1403 n = cred->group_info->ngroups;
1404 if (len < n * sizeof(gid_t)) {
1405 len = n * sizeof(gid_t);
1406 put_cred(cred);
1407 return put_user(len, optlen) ? -EFAULT : -ERANGE;
1408 }
1409 len = n * sizeof(gid_t);
1410
1411 ret = groups_to_user((gid_t __user *)optval, cred->group_info);
1412 put_cred(cred);
1413 if (ret)
1414 return ret;
1415 goto lenout;
1416 }
1417
1418 case SO_PEERNAME:
1419 {
1420 char address[128];
1421
1422 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1423 if (lv < 0)
1424 return -ENOTCONN;
1425 if (lv < len)
1426 return -EINVAL;
1427 if (copy_to_user(optval, address, len))
1428 return -EFAULT;
1429 goto lenout;
1430 }
1431
1432 /* Dubious BSD thing... Probably nobody even uses it, but
1433 * the UNIX standard wants it for whatever reason... -DaveM
1434 */
1435 case SO_ACCEPTCONN:
1436 v.val = sk->sk_state == TCP_LISTEN;
1437 break;
1438
1439 case SO_PASSSEC:
1440 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1441 break;
1442
1443 case SO_PEERSEC:
1444 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1445
1446 case SO_MARK:
1447 v.val = sk->sk_mark;
1448 break;
1449
1450 case SO_RXQ_OVFL:
1451 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1452 break;
1453
1454 case SO_WIFI_STATUS:
1455 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1456 break;
1457
1458 case SO_PEEK_OFF:
1459 if (!sock->ops->set_peek_off)
1460 return -EOPNOTSUPP;
1461
1462 v.val = READ_ONCE(sk->sk_peek_off);
1463 break;
1464 case SO_NOFCS:
1465 v.val = sock_flag(sk, SOCK_NOFCS);
1466 break;
1467
1468 case SO_BINDTODEVICE:
1469 return sock_getbindtodevice(sk, optval, optlen, len);
1470
1471 case SO_GET_FILTER:
1472 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1473 if (len < 0)
1474 return len;
1475
1476 goto lenout;
1477
1478 case SO_LOCK_FILTER:
1479 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1480 break;
1481
1482 case SO_BPF_EXTENSIONS:
1483 v.val = bpf_tell_extensions();
1484 break;
1485
1486 case SO_SELECT_ERR_QUEUE:
1487 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1488 break;
1489
1490#ifdef CONFIG_NET_RX_BUSY_POLL
1491 case SO_BUSY_POLL:
1492 v.val = READ_ONCE(sk->sk_ll_usec);
1493 break;
1494#endif
1495
1496 case SO_MAX_PACING_RATE:
1497 /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
1498 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1499 lv = sizeof(v.ulval);
1500 v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
1501 } else {
1502 /* 32bit version */
1503 v.val = min_t(unsigned long, ~0U,
1504 READ_ONCE(sk->sk_max_pacing_rate));
1505 }
1506 break;
1507
1508 case SO_INCOMING_CPU:
1509 v.val = READ_ONCE(sk->sk_incoming_cpu);
1510 break;
1511
1512 case SO_MEMINFO:
1513 {
1514 u32 meminfo[SK_MEMINFO_VARS];
1515
1516 sk_get_meminfo(sk, meminfo);
1517
1518 len = min_t(unsigned int, len, sizeof(meminfo));
1519 if (copy_to_user(optval, &meminfo, len))
1520 return -EFAULT;
1521
1522 goto lenout;
1523 }
1524
1525#ifdef CONFIG_NET_RX_BUSY_POLL
1526 case SO_INCOMING_NAPI_ID:
1527 v.val = READ_ONCE(sk->sk_napi_id);
1528
1529 /* aggregate non-NAPI IDs down to 0 */
1530 if (v.val < MIN_NAPI_ID)
1531 v.val = 0;
1532
1533 break;
1534#endif
1535
1536 case SO_COOKIE:
1537 lv = sizeof(u64);
1538 if (len < lv)
1539 return -EINVAL;
1540 v.val64 = sock_gen_cookie(sk);
1541 break;
1542
1543 case SO_ZEROCOPY:
1544 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1545 break;
1546
1547 case SO_TXTIME:
1548 lv = sizeof(v.txtime);
1549 v.txtime.clockid = sk->sk_clockid;
1550 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1551 SOF_TXTIME_DEADLINE_MODE : 0;
1552 v.txtime.flags |= sk->sk_txtime_report_errors ?
1553 SOF_TXTIME_REPORT_ERRORS : 0;
1554 break;
1555
1556 case SO_BINDTOIFINDEX:
1557 v.val = sk->sk_bound_dev_if;
1558 break;
1559
1560 default:
1561 /* We implement the SO_SNDLOWAT etc to not be settable
1562 * (1003.1g 7).
1563 */
1564 return -ENOPROTOOPT;
1565 }
1566
1567 if (len > lv)
1568 len = lv;
1569 if (copy_to_user(optval, &v, len))
1570 return -EFAULT;
1571lenout:
1572 if (put_user(len, optlen))
1573 return -EFAULT;
1574 return 0;
1575}
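
/*
 * Note on the tail of sock_getsockopt() (added for clarity): the returned
 * data is truncated to min(len, lv) bytes and the length actually written
 * is reported back through optlen, so callers passing a short buffer get
 * a truncated value rather than an error (except for options such as
 * SO_COOKIE that explicitly demand a minimum size).
 */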
1576
1577/*
1578 * Initialize an sk_lock.
1579 *
1580 * (We also register the sk_lock with the lock validator.)
1581 */
1582static inline void sock_lock_init(struct sock *sk)
1583{
1584 if (sk->sk_kern_sock)
1585 sock_lock_init_class_and_name(
1586 sk,
1587 af_family_kern_slock_key_strings[sk->sk_family],
1588 af_family_kern_slock_keys + sk->sk_family,
1589 af_family_kern_key_strings[sk->sk_family],
1590 af_family_kern_keys + sk->sk_family);
1591 else
1592 sock_lock_init_class_and_name(
1593 sk,
1594 af_family_slock_key_strings[sk->sk_family],
1595 af_family_slock_keys + sk->sk_family,
1596 af_family_key_strings[sk->sk_family],
1597 af_family_keys + sk->sk_family);
1598}
1599
1600/*
1601 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1604 */
1605static void sock_copy(struct sock *nsk, const struct sock *osk)
1606{
1607#ifdef CONFIG_SECURITY_NETWORK
1608 void *sptr = nsk->sk_security;
1609#endif
1610 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1611
1612 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1613 osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1614
1615#ifdef CONFIG_SECURITY_NETWORK
1616 nsk->sk_security = sptr;
1617 security_sk_clone(osk, nsk);
1618#endif
1619}
1620
1621static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1622 int family)
1623{
1624 struct sock *sk;
1625 struct kmem_cache *slab;
1626
1627 slab = prot->slab;
1628 if (slab != NULL) {
1629 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1630 if (!sk)
1631 return sk;
1632 if (want_init_on_alloc(priority))
1633 sk_prot_clear_nulls(sk, prot->obj_size);
1634 } else
1635 sk = kmalloc(prot->obj_size, priority);
1636
1637 if (sk != NULL) {
1638 if (security_sk_alloc(sk, family, priority))
1639 goto out_free;
1640
1641 trace_android_rvh_sk_alloc(sk);
1642
1643 if (!try_module_get(prot->owner))
1644 goto out_free_sec;
1645 sk_tx_queue_clear(sk);
1646 }
1647
1648 return sk;
1649
1650out_free_sec:
1651 security_sk_free(sk);
1652 trace_android_rvh_sk_free(sk);
1653out_free:
1654 if (slab != NULL)
1655 kmem_cache_free(slab, sk);
1656 else
1657 kfree(sk);
1658 return NULL;
1659}
1660
1661static void sk_prot_free(struct proto *prot, struct sock *sk)
1662{
1663 struct kmem_cache *slab;
1664 struct module *owner;
1665
1666 owner = prot->owner;
1667 slab = prot->slab;
1668
1669 cgroup_sk_free(&sk->sk_cgrp_data);
1670 mem_cgroup_sk_free(sk);
1671 security_sk_free(sk);
1672 trace_android_rvh_sk_free(sk);
1673 if (slab != NULL)
1674 kmem_cache_free(slab, sk);
1675 else
1676 kfree(sk);
1677 module_put(owner);
1678}
1679
1680/**
1681 * sk_alloc - All socket objects are allocated here
1682 * @net: the applicable net namespace
1683 * @family: protocol family
1684 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1685 * @prot: struct proto associated with this new sock instance
1686 * @kern: is this to be a kernel socket?
1687 */
1688struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1689 struct proto *prot, int kern)
1690{
1691 struct sock *sk;
1692
1693 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1694 if (sk) {
1695 sk->sk_family = family;
1696 /*
1697 * See comment in struct sock definition to understand
1698 * why we need sk_prot_creator -acme
1699 */
1700 sk->sk_prot = sk->sk_prot_creator = prot;
1701 sk->sk_kern_sock = kern;
1702 sock_lock_init(sk);
1703 sk->sk_net_refcnt = kern ? 0 : 1;
1704 if (likely(sk->sk_net_refcnt)) {
1705 get_net(net);
1706 sock_inuse_add(net, 1);
1707 }
1708
1709 sock_net_set(sk, net);
1710 refcount_set(&sk->sk_wmem_alloc, 1);
1711
1712 mem_cgroup_sk_alloc(sk);
1713 cgroup_sk_alloc(&sk->sk_cgrp_data);
1714 sock_update_classid(&sk->sk_cgrp_data);
1715 sock_update_netprioidx(&sk->sk_cgrp_data);
1716 sk_tx_queue_clear(sk);
1717 }
1718
1719 return sk;
1720}
1721EXPORT_SYMBOL(sk_alloc);
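
/*
 * Note (added for clarity): kernel sockets (@kern != 0) are created with
 * sk_net_refcnt == 0, so they neither hold a reference on the network
 * namespace nor count towards the per-namespace sock_inuse statistics;
 * user sockets take both via get_net() and sock_inuse_add() above.
 */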
1722
1723/* Sockets having SOCK_RCU_FREE will call this function after one RCU
1724 * grace period. This is the case for UDP sockets and TCP listeners.
1725 */
1726static void __sk_destruct(struct rcu_head *head)
1727{
1728 struct sock *sk = container_of(head, struct sock, sk_rcu);
1729 struct sk_filter *filter;
1730
1731 if (sk->sk_destruct)
1732 sk->sk_destruct(sk);
1733
1734 filter = rcu_dereference_check(sk->sk_filter,
1735 refcount_read(&sk->sk_wmem_alloc) == 0);
1736 if (filter) {
1737 sk_filter_uncharge(sk, filter);
1738 RCU_INIT_POINTER(sk->sk_filter, NULL);
1739 }
1740
1741 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1742
1743#ifdef CONFIG_BPF_SYSCALL
1744 bpf_sk_storage_free(sk);
1745#endif
1746
1747 if (atomic_read(&sk->sk_omem_alloc))
1748 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1749 __func__, atomic_read(&sk->sk_omem_alloc));
1750
1751 if (sk->sk_frag.page) {
1752 put_page(sk->sk_frag.page);
1753 sk->sk_frag.page = NULL;
1754 }
1755
1756 /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
1757 put_cred(sk->sk_peer_cred);
1758 put_pid(sk->sk_peer_pid);
1759
1760 if (likely(sk->sk_net_refcnt))
1761 put_net(sock_net(sk));
1762 sk_prot_free(sk->sk_prot_creator, sk);
1763}
1764
1765void sk_destruct(struct sock *sk)
1766{
1767 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1768
1769 if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1770 reuseport_detach_sock(sk);
1771 use_call_rcu = true;
1772 }
1773
1774 if (use_call_rcu)
1775 call_rcu(&sk->sk_rcu, __sk_destruct);
1776 else
1777 __sk_destruct(&sk->sk_rcu);
1778}
1779
1780static void __sk_free(struct sock *sk)
1781{
1782 if (likely(sk->sk_net_refcnt))
1783 sock_inuse_add(sock_net(sk), -1);
1784
1785#ifdef CONFIG_SOCK_DIAG
1786 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1787 sock_diag_broadcast_destroy(sk);
1788 else
1789#endif
1790 sk_destruct(sk);
1791}
1792
1793void sk_free(struct sock *sk)
1794{
1795 /*
	 * We subtract one from sk_wmem_alloc, which tells us whether
	 * some packets are still in some tx queue.
	 * If the count is not zero, sock_wfree() will call __sk_free(sk) later.
1799 */
1800 if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1801 __sk_free(sk);
1802}
1803EXPORT_SYMBOL(sk_free);
1804
1805static void sk_init_common(struct sock *sk)
1806{
1807 skb_queue_head_init(&sk->sk_receive_queue);
1808 skb_queue_head_init(&sk->sk_write_queue);
1809 skb_queue_head_init(&sk->sk_error_queue);
1810
1811 rwlock_init(&sk->sk_callback_lock);
1812 lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1813 af_rlock_keys + sk->sk_family,
1814 af_family_rlock_key_strings[sk->sk_family]);
1815 lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1816 af_wlock_keys + sk->sk_family,
1817 af_family_wlock_key_strings[sk->sk_family]);
1818 lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1819 af_elock_keys + sk->sk_family,
1820 af_family_elock_key_strings[sk->sk_family]);
1821 lockdep_set_class_and_name(&sk->sk_callback_lock,
1822 af_callback_keys + sk->sk_family,
1823 af_family_clock_key_strings[sk->sk_family]);
1824}
1825
1826/**
1827 * sk_clone_lock - clone a socket, and lock its clone
1828 * @sk: the socket to clone
1829 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1830 *
1831 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1832 */
1833struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1834{
1835 struct sock *newsk;
1836 bool is_charged = true;
1837
1838 newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1839 if (newsk != NULL) {
1840 struct sk_filter *filter;
1841
1842 sock_copy(newsk, sk);
1843
1844 newsk->sk_prot_creator = sk->sk_prot;
1845
1846 /* SANITY */
1847 if (likely(newsk->sk_net_refcnt))
1848 get_net(sock_net(newsk));
1849 sk_node_init(&newsk->sk_node);
1850 sock_lock_init(newsk);
1851 bh_lock_sock(newsk);
1852 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
1853 newsk->sk_backlog.len = 0;
1854
1855 atomic_set(&newsk->sk_rmem_alloc, 0);
1856 /*
1857 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1858 */
1859 refcount_set(&newsk->sk_wmem_alloc, 1);
1860 atomic_set(&newsk->sk_omem_alloc, 0);
1861 sk_init_common(newsk);
1862
1863 newsk->sk_dst_cache = NULL;
1864 newsk->sk_dst_pending_confirm = 0;
1865 newsk->sk_wmem_queued = 0;
1866 newsk->sk_forward_alloc = 0;
1867 atomic_set(&newsk->sk_drops, 0);
1868 newsk->sk_send_head = NULL;
1869 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1870 atomic_set(&newsk->sk_zckey, 0);
1871
1872 sock_reset_flag(newsk, SOCK_DONE);
1873
1874 /* sk->sk_memcg will be populated at accept() time */
1875 newsk->sk_memcg = NULL;
1876
1877 cgroup_sk_clone(&newsk->sk_cgrp_data);
1878
1879 rcu_read_lock();
1880 filter = rcu_dereference(sk->sk_filter);
1881 if (filter != NULL)
			/* Though it's an empty new sock, the charging may fail
			 * if sysctl_optmem_max was changed between creation of
			 * the original socket and the cloning.
			 */
1886 is_charged = sk_filter_charge(newsk, filter);
1887 RCU_INIT_POINTER(newsk->sk_filter, filter);
1888 rcu_read_unlock();
1889
1890 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1891 /* We need to make sure that we don't uncharge the new
1892 * socket if we couldn't charge it in the first place
1893 * as otherwise we uncharge the parent's filter.
1894 */
1895 if (!is_charged)
1896 RCU_INIT_POINTER(newsk->sk_filter, NULL);
1897 sk_free_unlock_clone(newsk);
1898 newsk = NULL;
1899 goto out;
1900 }
1901 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1902
1903 if (bpf_sk_storage_clone(sk, newsk)) {
1904 sk_free_unlock_clone(newsk);
1905 newsk = NULL;
1906 goto out;
1907 }
1908
1909 newsk->sk_err = 0;
1910 newsk->sk_err_soft = 0;
1911 newsk->sk_priority = 0;
1912 newsk->sk_incoming_cpu = raw_smp_processor_id();
1913 if (likely(newsk->sk_net_refcnt))
1914 sock_inuse_add(sock_net(newsk), 1);
1915
1916 /*
1917 * Before updating sk_refcnt, we must commit prior changes to memory
1918 * (Documentation/RCU/rculist_nulls.txt for details)
1919 */
1920 smp_wmb();
1921 refcount_set(&newsk->sk_refcnt, 2);
1922
1923 /*
1924 * Increment the counter in the same struct proto as the master
1925 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1926 * is the same as sk->sk_prot->socks, as this field was copied
1927 * with memcpy).
1928 *
1929 * This _changes_ the previous behaviour, where
1930 * tcp_create_openreq_child always was incrementing the
1931 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1932 * to be taken into account in all callers. -acme
1933 */
1934 sk_refcnt_debug_inc(newsk);
1935 sk_set_socket(newsk, NULL);
1936 sk_tx_queue_clear(newsk);
1937 RCU_INIT_POINTER(newsk->sk_wq, NULL);
1938
1939 if (newsk->sk_prot->sockets_allocated)
1940 sk_sockets_allocated_inc(newsk);
1941
1942 if (sock_needs_netstamp(sk) &&
1943 newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1944 net_enable_timestamp();
1945 }
1946out:
1947 return newsk;
1948}
1949EXPORT_SYMBOL_GPL(sk_clone_lock);
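
/*
 * Minimal usage sketch (illustrative, not from this file):
 *
 *	newsk = sk_clone_lock(sk, GFP_ATOMIC);
 *	if (newsk) {
 *		... protocol-specific setup of newsk ...
 *		bh_unlock_sock(newsk);
 *	}
 *
 * The clone is returned bh-locked with sk_refcnt set to 2, and the caller
 * must unlock it even on its own error paths, e.g. via
 * sk_free_unlock_clone() below.
 */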
1950
1951void sk_free_unlock_clone(struct sock *sk)
1952{
	/* It is still a raw copy of the parent, so invalidate
	 * the destructor and do a plain sk_free() */
1955 sk->sk_destruct = NULL;
1956 bh_unlock_sock(sk);
1957 sk_free(sk);
1958}
1959EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1960
1961void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1962{
1963 u32 max_segs = 1;
1964
1965 sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
1966 if (sk->sk_route_caps & NETIF_F_GSO)
1967 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1968 sk->sk_route_caps &= ~sk->sk_route_nocaps;
1969 if (sk_can_gso(sk)) {
1970 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
1971 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1972 } else {
1973 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1974 sk->sk_gso_max_size = dst->dev->gso_max_size;
1975 max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1976 }
1977 }
1978 sk->sk_gso_max_segs = max_segs;
1979 sk_dst_set(sk, dst);
1980}
1981EXPORT_SYMBOL_GPL(sk_setup_caps);
1982
1983/*
1984 * Simple resource managers for sockets.
1985 */
1986
1987
1988/*
1989 * Write buffer destructor automatically called from kfree_skb.
1990 */
1991void sock_wfree(struct sk_buff *skb)
1992{
1993 struct sock *sk = skb->sk;
1994 unsigned int len = skb->truesize;
1995
1996 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1997 /*
1998 * Keep a reference on sk_wmem_alloc, this will be released
1999 * after sk_write_space() call
2000 */
2001 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2002 sk->sk_write_space(sk);
2003 len = 1;
2004 }
2005 /*
2006 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2007 * could not do because of in-flight packets
2008 */
2009 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2010 __sk_free(sk);
2011}
2012EXPORT_SYMBOL(sock_wfree);
2013
2014/* This variant of sock_wfree() is used by TCP,
2015 * since it sets SOCK_USE_WRITE_QUEUE.
2016 */
2017void __sock_wfree(struct sk_buff *skb)
2018{
2019 struct sock *sk = skb->sk;
2020
2021 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2022 __sk_free(sk);
2023}
2024
2025void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2026{
2027 skb_orphan(skb);
2028 skb->sk = sk;
2029#ifdef CONFIG_INET
2030 if (unlikely(!sk_fullsock(sk))) {
2031 skb->destructor = sock_edemux;
2032 sock_hold(sk);
2033 return;
2034 }
2035#endif
2036 skb->destructor = sock_wfree;
2037 skb_set_hash_from_sk(skb, sk);
2038 /*
	 * We used to take a refcount on sk, but the following operation
	 * is enough to guarantee sk_free() won't free this sock until
	 * all in-flight packets are completed.
2042 */
2043 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2044}
2045EXPORT_SYMBOL(skb_set_owner_w);
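
/*
 * Typical pairing with sock_wfree() above (illustrative sketch):
 *
 *	skb = alloc_skb(size, GFP_KERNEL);
 *	if (skb)
 *		skb_set_owner_w(skb, sk);
 *
 * charges skb->truesize to sk->sk_wmem_alloc; when the skb is eventually
 * freed, sock_wfree() uncharges the same amount and, for sockets that do
 * not use their own write queue, calls sk_write_space() so blocked
 * senders can make progress.
 */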
2046
2047static bool can_skb_orphan_partial(const struct sk_buff *skb)
2048{
2049#ifdef CONFIG_TLS_DEVICE
2050 /* Drivers depend on in-order delivery for crypto offload,
2051 * partial orphan breaks out-of-order-OK logic.
2052 */
2053 if (skb->decrypted)
2054 return false;
2055#endif
2056 return (skb->destructor == sock_wfree ||
2057 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2058}
2059
2060/* This helper is used by netem, as it can hold packets in its
2061 * delay queue. We want to allow the owner socket to send more
2062 * packets, as if they were already TX completed by a typical driver.
2063 * But we also want to keep skb->sk set because some packet schedulers
2064 * rely on it (sch_fq for example).
2065 */
2066void skb_orphan_partial(struct sk_buff *skb)
2067{
2068 if (skb_is_tcp_pure_ack(skb))
2069 return;
2070
2071 if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2072 return;
2073
2074 skb_orphan(skb);
2075}
2076EXPORT_SYMBOL(skb_orphan_partial);
2077
2078/*
2079 * Read buffer destructor automatically called from kfree_skb.
2080 */
2081void sock_rfree(struct sk_buff *skb)
2082{
2083 struct sock *sk = skb->sk;
2084 unsigned int len = skb->truesize;
2085
2086 atomic_sub(len, &sk->sk_rmem_alloc);
2087 sk_mem_uncharge(sk, len);
2088}
2089EXPORT_SYMBOL(sock_rfree);
2090
2091/*
2092 * Buffer destructor for skbs that are not used directly in read or write
2093 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2094 */
2095void sock_efree(struct sk_buff *skb)
2096{
2097 sock_put(skb->sk);
2098}
2099EXPORT_SYMBOL(sock_efree);
2100
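/* Accessors for the owning inode's uid and inode number. Both read under
 * sk_callback_lock, so they are safe against a concurrent sock_orphan()
 * detaching sk->sk_socket.
 */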
2101kuid_t sock_i_uid(struct sock *sk)
2102{
2103 kuid_t uid;
2104
2105 read_lock_bh(&sk->sk_callback_lock);
2106 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2107 read_unlock_bh(&sk->sk_callback_lock);
2108 return uid;
2109}
2110EXPORT_SYMBOL(sock_i_uid);
2111
2112unsigned long __sock_i_ino(struct sock *sk)
2113{
2114 unsigned long ino;
2115
2116 read_lock(&sk->sk_callback_lock);
2117 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2118 read_unlock(&sk->sk_callback_lock);
2119 return ino;
2120}
2121EXPORT_SYMBOL(__sock_i_ino);
2122
2123unsigned long sock_i_ino(struct sock *sk)
2124{
2125 unsigned long ino;
2126
2127 local_bh_disable();
2128 ino = __sock_i_ino(sk);
2129 local_bh_enable();
2130 return ino;
2131}
2132EXPORT_SYMBOL(sock_i_ino);
2133
2134/*
2135 * Allocate an skb from the socket's send buffer.
2136 */
2137struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2138 gfp_t priority)
2139{
2140 if (force ||
2141 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2142 struct sk_buff *skb = alloc_skb(size, priority);
2143
2144 if (skb) {
2145 skb_set_owner_w(skb, sk);
2146 return skb;
2147 }
2148 }
2149 return NULL;
2150}
2151EXPORT_SYMBOL(sock_wmalloc);
2152
2153static void sock_ofree(struct sk_buff *skb)
2154{
2155 struct sock *sk = skb->sk;
2156
2157 atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2158}
2159
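/* Allocate an skb charged to the socket's option memory (sk_omem_alloc),
 * bounded by sysctl_optmem_max and released via the sock_ofree() destructor
 * above.
 */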
2160struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2161 gfp_t priority)
2162{
2163 struct sk_buff *skb;
2164
2165 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2166 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2167 sysctl_optmem_max)
2168 return NULL;
2169
2170 skb = alloc_skb(size, priority);
2171 if (!skb)
2172 return NULL;
2173
2174 atomic_add(skb->truesize, &sk->sk_omem_alloc);
2175 skb->sk = sk;
2176 skb->destructor = sock_ofree;
2177 return skb;
2178}
2179
2180/*
2181 * Allocate a memory block from the socket's option memory buffer.
2182 */
2183void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2184{
2185 if ((unsigned int)size <= sysctl_optmem_max &&
2186 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2187 void *mem;
2188 /* First do the add, to avoid the race if kmalloc
2189 * might sleep.
2190 */
2191 atomic_add(size, &sk->sk_omem_alloc);
2192 mem = kmalloc(size, priority);
2193 if (mem)
2194 return mem;
2195 atomic_sub(size, &sk->sk_omem_alloc);
2196 }
2197 return NULL;
2198}
2199EXPORT_SYMBOL(sock_kmalloc);
2200
2201/* Free an option memory block. Note, we actually want the inline
2202 * here as this allows gcc to detect the nullify and fold away the
2203 * condition entirely.
2204 */
2205static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2206 const bool nullify)
2207{
2208 if (WARN_ON_ONCE(!mem))
2209 return;
2210 if (nullify)
2211 kzfree(mem);
2212 else
2213 kfree(mem);
2214 atomic_sub(size, &sk->sk_omem_alloc);
2215}
2216
2217void sock_kfree_s(struct sock *sk, void *mem, int size)
2218{
2219 __sock_kfree_s(sk, mem, size, false);
2220}
2221EXPORT_SYMBOL(sock_kfree_s);
2222
2223void sock_kzfree_s(struct sock *sk, void *mem, int size)
2224{
2225 __sock_kfree_s(sk, mem, size, true);
2226}
2227EXPORT_SYMBOL(sock_kzfree_s);
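/* Illustrative caller pattern (not from this file): option memory obtained
 * with sock_kmalloc() must be returned with sock_kfree_s()/sock_kzfree_s()
 * using the same size, so that sk_omem_alloc stays balanced:
 *
 *	void *opt = sock_kmalloc(sk, len, GFP_KERNEL);
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, len);
 */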
2228
2229/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2230   I think these locks should be removed for datagram sockets.
2231 */
2232static long sock_wait_for_wmem(struct sock *sk, long timeo)
2233{
2234 DEFINE_WAIT(wait);
2235
2236 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2237 for (;;) {
2238 if (!timeo)
2239 break;
2240 if (signal_pending(current))
2241 break;
2242 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2243 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2244 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2245 break;
2246 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2247 break;
2248 if (READ_ONCE(sk->sk_err))
2249 break;
2250 timeo = schedule_timeout(timeo);
2251 }
2252 finish_wait(sk_sleep(sk), &wait);
2253 return timeo;
2254}
2255
2256
2257/*
2258 * Generic send/receive buffer handlers
2259 */
2260
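/* Allocate an skb with @header_len bytes of linear space and @data_len bytes
 * in page fragments, waiting (unless @noblock) until the write allocation
 * drops below sk_sndbuf. On failure *@errcode is set and NULL is returned.
 */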
2261struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2262 unsigned long data_len, int noblock,
2263 int *errcode, int max_page_order)
2264{
2265 struct sk_buff *skb;
2266 long timeo;
2267 int err;
2268
2269 timeo = sock_sndtimeo(sk, noblock);
2270 for (;;) {
2271 err = sock_error(sk);
2272 if (err != 0)
2273 goto failure;
2274
2275 err = -EPIPE;
2276 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2277 goto failure;
2278
2279 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2280 break;
2281
2282 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2283 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2284 err = -EAGAIN;
2285 if (!timeo)
2286 goto failure;
2287 if (signal_pending(current))
2288 goto interrupted;
2289 timeo = sock_wait_for_wmem(sk, timeo);
2290 }
2291 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2292 errcode, sk->sk_allocation);
2293 if (skb)
2294 skb_set_owner_w(skb, sk);
2295 return skb;
2296
2297interrupted:
2298 err = sock_intr_errno(timeo);
2299failure:
2300 *errcode = err;
2301 return NULL;
2302}
2303EXPORT_SYMBOL(sock_alloc_send_pskb);
2304
2305struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2306 int noblock, int *errcode)
2307{
2308 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2309}
2310EXPORT_SYMBOL(sock_alloc_send_skb);
2311
2312int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2313 struct sockcm_cookie *sockc)
2314{
2315 u32 tsflags;
2316
2317 switch (cmsg->cmsg_type) {
2318 case SO_MARK:
2319 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2320 return -EPERM;
2321 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2322 return -EINVAL;
2323 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2324 break;
2325 case SO_TIMESTAMPING_OLD:
2326 case SO_TIMESTAMPING_NEW:
2327 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2328 return -EINVAL;
2329
2330 tsflags = *(u32 *)CMSG_DATA(cmsg);
2331 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2332 return -EINVAL;
2333
2334 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2335 sockc->tsflags |= tsflags;
2336 break;
2337 case SCM_TXTIME:
2338 if (!sock_flag(sk, SOCK_TXTIME))
2339 return -EINVAL;
2340 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2341 return -EINVAL;
2342 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2343 break;
2344 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2345 case SCM_RIGHTS:
2346 case SCM_CREDENTIALS:
2347 break;
2348 default:
2349 return -EINVAL;
2350 }
2351 return 0;
2352}
2353EXPORT_SYMBOL(__sock_cmsg_send);
2354
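/* Walk every control message in @msg and fold the SOL_SOCKET ones (SO_MARK,
 * SO_TIMESTAMPING, SCM_TXTIME) into @sockc via __sock_cmsg_send(); cmsgs for
 * other levels are left for the protocol to handle.
 */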
2355int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2356 struct sockcm_cookie *sockc)
2357{
2358 struct cmsghdr *cmsg;
2359 int ret;
2360
2361 for_each_cmsghdr(cmsg, msg) {
2362 if (!CMSG_OK(msg, cmsg))
2363 return -EINVAL;
2364 if (cmsg->cmsg_level != SOL_SOCKET)
2365 continue;
2366 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2367 if (ret)
2368 return ret;
2369 }
2370 return 0;
2371}
2372EXPORT_SYMBOL(sock_cmsg_send);
2373
2374static void sk_enter_memory_pressure(struct sock *sk)
2375{
2376 if (!sk->sk_prot->enter_memory_pressure)
2377 return;
2378
2379 sk->sk_prot->enter_memory_pressure(sk);
2380}
2381
2382static void sk_leave_memory_pressure(struct sock *sk)
2383{
2384 if (sk->sk_prot->leave_memory_pressure) {
2385 sk->sk_prot->leave_memory_pressure(sk);
2386 } else {
2387 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2388
2389 if (memory_pressure && READ_ONCE(*memory_pressure))
2390 WRITE_ONCE(*memory_pressure, 0);
2391 }
2392}
2393
2394DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2395
2396/**
2397 * skb_page_frag_refill - check that a page_frag contains enough room
2398 * @sz: minimum size of the fragment we want to get
2399 * @pfrag: pointer to page_frag
2400 * @gfp: priority for memory allocation
2401 *
2402 * Note: While this allocator tries to use high order pages, there is
2403 * no guarantee that allocations succeed. Therefore, @sz MUST be
2404 * less than or equal to PAGE_SIZE.
2405 */
2406bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2407{
2408 if (pfrag->page) {
2409 if (page_ref_count(pfrag->page) == 1) {
2410 pfrag->offset = 0;
2411 return true;
2412 }
2413 if (pfrag->offset + sz <= pfrag->size)
2414 return true;
2415 put_page(pfrag->page);
2416 }
2417
2418 pfrag->offset = 0;
2419 if (SKB_FRAG_PAGE_ORDER &&
2420 !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2421 /* Avoid direct reclaim but allow kswapd to wake */
2422 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2423 __GFP_COMP | __GFP_NOWARN |
2424 __GFP_NORETRY,
2425 SKB_FRAG_PAGE_ORDER);
2426 if (likely(pfrag->page)) {
2427 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2428 return true;
2429 }
2430 }
2431 pfrag->page = alloc_page(gfp);
2432 if (likely(pfrag->page)) {
2433 pfrag->size = PAGE_SIZE;
2434 return true;
2435 }
2436 return false;
2437}
2438EXPORT_SYMBOL(skb_page_frag_refill);
2439
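/* Refill the per-socket page fragment; on failure enter memory pressure and
 * moderate the send buffer so the caller backs off.
 */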
2440bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2441{
2442 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2443 return true;
2444
2445 sk_enter_memory_pressure(sk);
2446 sk_stream_moderate_sndbuf(sk);
2447 return false;
2448}
2449EXPORT_SYMBOL(sk_page_frag_refill);
2450
2451static void __lock_sock(struct sock *sk)
2452 __releases(&sk->sk_lock.slock)
2453 __acquires(&sk->sk_lock.slock)
2454{
2455 DEFINE_WAIT(wait);
2456
2457 for (;;) {
2458 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2459 TASK_UNINTERRUPTIBLE);
2460 spin_unlock_bh(&sk->sk_lock.slock);
2461 schedule();
2462 spin_lock_bh(&sk->sk_lock.slock);
2463 if (!sock_owned_by_user(sk))
2464 break;
2465 }
2466 finish_wait(&sk->sk_lock.wq, &wait);
2467}
2468
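/* Process every skb queued on the backlog while the socket was owned by the
 * caller. The socket spinlock is dropped around each batch so that softirq
 * producers can keep appending while we drain.
 */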
2469void __release_sock(struct sock *sk)
2470 __releases(&sk->sk_lock.slock)
2471 __acquires(&sk->sk_lock.slock)
2472{
2473 struct sk_buff *skb, *next;
2474
2475 while ((skb = sk->sk_backlog.head) != NULL) {
2476 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2477
2478 spin_unlock_bh(&sk->sk_lock.slock);
2479
2480 do {
2481 next = skb->next;
2482 prefetch(next);
2483 WARN_ON_ONCE(skb_dst_is_noref(skb));
2484 skb_mark_not_on_list(skb);
2485 sk_backlog_rcv(sk, skb);
2486
2487 cond_resched();
2488
2489 skb = next;
2490 } while (skb != NULL);
2491
2492 spin_lock_bh(&sk->sk_lock.slock);
2493 }
2494
2495 /*
2496	 * Doing the zeroing here guarantees we cannot loop forever
2497 * while a wild producer attempts to flood us.
2498 */
2499 sk->sk_backlog.len = 0;
2500}
2501
2502void __sk_flush_backlog(struct sock *sk)
2503{
2504 spin_lock_bh(&sk->sk_lock.slock);
2505 __release_sock(sk);
2506 spin_unlock_bh(&sk->sk_lock.slock);
2507}
2508
2509/**
2510 * sk_wait_data - wait for data to arrive at sk_receive_queue
2511 * @sk: sock to wait on
2512 * @timeo: for how long
2513 * @skb: last skb seen on sk_receive_queue
2514 *
2515 * Now socket state including sk->sk_err is changed only under lock,
2516 * hence we may omit checks after joining the wait queue.
2517 * We check the receive queue before schedule() only as an optimization;
2518 * it is very likely that release_sock() added new data.
2519 */
2520int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2521{
2522 DEFINE_WAIT_FUNC(wait, woken_wake_function);
2523 int rc;
2524
2525 add_wait_queue(sk_sleep(sk), &wait);
2526 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2527 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2528 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2529 remove_wait_queue(sk_sleep(sk), &wait);
2530 return rc;
2531}
2532EXPORT_SYMBOL(sk_wait_data);
2533
2534/**
2535 * __sk_mem_raise_allocated - increase memory_allocated
2536 * @sk: socket
2537 * @size: memory size to allocate
2538 * @amt: pages to allocate
2539 * @kind: allocation type
2540 *
2541 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2542 */
2543int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2544{
2545 struct proto *prot = sk->sk_prot;
2546 long allocated = sk_memory_allocated_add(sk, amt);
2547 bool charged = true;
2548
2549 if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2550 !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
2551 goto suppress_allocation;
2552
2553 /* Under limit. */
2554 if (allocated <= sk_prot_mem_limits(sk, 0)) {
2555 sk_leave_memory_pressure(sk);
2556 return 1;
2557 }
2558
2559 /* Under pressure. */
2560 if (allocated > sk_prot_mem_limits(sk, 1))
2561 sk_enter_memory_pressure(sk);
2562
2563 /* Over hard limit. */
2564 if (allocated > sk_prot_mem_limits(sk, 2))
2565 goto suppress_allocation;
2566
2567 /* guarantee minimum buffer size under pressure */
2568 if (kind == SK_MEM_RECV) {
2569 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2570 return 1;
2571
2572 } else { /* SK_MEM_SEND */
2573 int wmem0 = sk_get_wmem0(sk, prot);
2574
2575 if (sk->sk_type == SOCK_STREAM) {
2576 if (sk->sk_wmem_queued < wmem0)
2577 return 1;
2578 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2579 return 1;
2580 }
2581 }
2582
2583 if (sk_has_memory_pressure(sk)) {
2584 u64 alloc;
2585
2586 if (!sk_under_memory_pressure(sk))
2587 return 1;
2588 alloc = sk_sockets_allocated_read_positive(sk);
2589 if (sk_prot_mem_limits(sk, 2) > alloc *
2590 sk_mem_pages(sk->sk_wmem_queued +
2591 atomic_read(&sk->sk_rmem_alloc) +
2592 sk->sk_forward_alloc))
2593 return 1;
2594 }
2595
2596suppress_allocation:
2597
2598 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2599 sk_stream_moderate_sndbuf(sk);
2600
2601 /* Fail only if socket is _under_ its sndbuf.
2602		 * In this case we cannot block, so we have to fail.
2603 */
2604 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2605 return 1;
2606 }
2607
2608 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2609 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2610
2611 sk_memory_allocated_sub(sk, amt);
2612
2613 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2614 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2615
2616 return 0;
2617}
2618EXPORT_SYMBOL(__sk_mem_raise_allocated);
2619
2620/**
2621 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2622 * @sk: socket
2623 * @size: memory size to allocate
2624 * @kind: allocation type
2625 *
2626 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2627 * rmem allocation. This function assumes that protocols which have
2628 * memory_pressure use sk_wmem_queued as write buffer accounting.
2629 */
2630int __sk_mem_schedule(struct sock *sk, int size, int kind)
2631{
2632 int ret, amt = sk_mem_pages(size);
2633
2634 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2635 ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2636 if (!ret)
2637 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2638 return ret;
2639}
2640EXPORT_SYMBOL(__sk_mem_schedule);
2641
2642/**
2643 * __sk_mem_reduce_allocated - reclaim memory_allocated
2644 * @sk: socket
2645 * @amount: number of quanta
2646 *
2647 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2648 */
2649void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2650{
2651 sk_memory_allocated_sub(sk, amount);
2652
2653 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2654 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2655
2656 if (sk_under_global_memory_pressure(sk) &&
2657 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2658 sk_leave_memory_pressure(sk);
2659}
2660EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2661
2662/**
2663 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2664 * @sk: socket
2665 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2666 */
2667void __sk_mem_reclaim(struct sock *sk, int amount)
2668{
2669 amount >>= SK_MEM_QUANTUM_SHIFT;
2670 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2671 __sk_mem_reduce_allocated(sk, amount);
2672}
2673EXPORT_SYMBOL(__sk_mem_reclaim);
2674
2675int sk_set_peek_off(struct sock *sk, int val)
2676{
2677 WRITE_ONCE(sk->sk_peek_off, val);
2678 return 0;
2679}
2680EXPORT_SYMBOL_GPL(sk_set_peek_off);
2681
2682/*
2683 * Set of default routines for initialising struct proto_ops when
2684 * the protocol does not support a particular function. In certain
2685 * cases where it makes no sense for a protocol to have a "do nothing"
2686 * function, some default processing is provided.
2687 */
2688
2689int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2690{
2691 return -EOPNOTSUPP;
2692}
2693EXPORT_SYMBOL(sock_no_bind);
2694
2695int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2696 int len, int flags)
2697{
2698 return -EOPNOTSUPP;
2699}
2700EXPORT_SYMBOL(sock_no_connect);
2701
2702int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2703{
2704 return -EOPNOTSUPP;
2705}
2706EXPORT_SYMBOL(sock_no_socketpair);
2707
2708int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2709 bool kern)
2710{
2711 return -EOPNOTSUPP;
2712}
2713EXPORT_SYMBOL(sock_no_accept);
2714
2715int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2716 int peer)
2717{
2718 return -EOPNOTSUPP;
2719}
2720EXPORT_SYMBOL(sock_no_getname);
2721
2722int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2723{
2724 return -EOPNOTSUPP;
2725}
2726EXPORT_SYMBOL(sock_no_ioctl);
2727
2728int sock_no_listen(struct socket *sock, int backlog)
2729{
2730 return -EOPNOTSUPP;
2731}
2732EXPORT_SYMBOL(sock_no_listen);
2733
2734int sock_no_shutdown(struct socket *sock, int how)
2735{
2736 return -EOPNOTSUPP;
2737}
2738EXPORT_SYMBOL(sock_no_shutdown);
2739
2740int sock_no_setsockopt(struct socket *sock, int level, int optname,
2741 char __user *optval, unsigned int optlen)
2742{
2743 return -EOPNOTSUPP;
2744}
2745EXPORT_SYMBOL(sock_no_setsockopt);
2746
2747int sock_no_getsockopt(struct socket *sock, int level, int optname,
2748 char __user *optval, int __user *optlen)
2749{
2750 return -EOPNOTSUPP;
2751}
2752EXPORT_SYMBOL(sock_no_getsockopt);
2753
2754int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2755{
2756 return -EOPNOTSUPP;
2757}
2758EXPORT_SYMBOL(sock_no_sendmsg);
2759
2760int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2761{
2762 return -EOPNOTSUPP;
2763}
2764EXPORT_SYMBOL(sock_no_sendmsg_locked);
2765
2766int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2767 int flags)
2768{
2769 return -EOPNOTSUPP;
2770}
2771EXPORT_SYMBOL(sock_no_recvmsg);
2772
2773int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2774{
2775 /* Mirror missing mmap method error code */
2776 return -ENODEV;
2777}
2778EXPORT_SYMBOL(sock_no_mmap);
2779
2780/*
2781 * When a file is received (via SCM_RIGHTS, etc), we must bump the
2782 * various sock-based usage counts.
2783 */
2784void __receive_sock(struct file *file)
2785{
2786 struct socket *sock;
2787 int error;
2788
2789 /*
2790 * The resulting value of "error" is ignored here since we only
2791 * need to take action when the file is a socket and testing
2792 * "sock" for NULL is sufficient.
2793 */
2794 sock = sock_from_file(file, &error);
2795 if (sock) {
2796 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
2797 sock_update_classid(&sock->sk->sk_cgrp_data);
2798 }
2799}
2800
2801ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2802{
2803 ssize_t res;
2804 struct msghdr msg = {.msg_flags = flags};
2805 struct kvec iov;
2806 char *kaddr = kmap(page);
2807 iov.iov_base = kaddr + offset;
2808 iov.iov_len = size;
2809 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2810 kunmap(page);
2811 return res;
2812}
2813EXPORT_SYMBOL(sock_no_sendpage);
2814
2815ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2816 int offset, size_t size, int flags)
2817{
2818 ssize_t res;
2819 struct msghdr msg = {.msg_flags = flags};
2820 struct kvec iov;
2821 char *kaddr = kmap(page);
2822
2823 iov.iov_base = kaddr + offset;
2824 iov.iov_len = size;
2825 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2826 kunmap(page);
2827 return res;
2828}
2829EXPORT_SYMBOL(sock_no_sendpage_locked);
2830
2831/*
2832 * Default Socket Callbacks
2833 */
2834
2835static void sock_def_wakeup(struct sock *sk)
2836{
2837 struct socket_wq *wq;
2838
2839 rcu_read_lock();
2840 wq = rcu_dereference(sk->sk_wq);
2841 if (skwq_has_sleeper(wq))
2842 wake_up_interruptible_all(&wq->wait);
2843 rcu_read_unlock();
2844}
2845
2846static void sock_def_error_report(struct sock *sk)
2847{
2848 struct socket_wq *wq;
2849
2850 rcu_read_lock();
2851 wq = rcu_dereference(sk->sk_wq);
2852 if (skwq_has_sleeper(wq))
2853 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
2854 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2855 rcu_read_unlock();
2856}
2857
2858static void sock_def_readable(struct sock *sk)
2859{
2860 struct socket_wq *wq;
2861
2862 rcu_read_lock();
2863 wq = rcu_dereference(sk->sk_wq);
2864 if (skwq_has_sleeper(wq))
2865 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2866 EPOLLRDNORM | EPOLLRDBAND);
2867 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2868 rcu_read_unlock();
2869}
2870
2871static void sock_def_write_space(struct sock *sk)
2872{
2873 struct socket_wq *wq;
2874
2875 rcu_read_lock();
2876
2877 /* Do not wake up a writer until he can make "significant"
2878 * progress. --DaveM
2879 */
2880 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
2881 wq = rcu_dereference(sk->sk_wq);
2882 if (skwq_has_sleeper(wq))
2883 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2884 EPOLLWRNORM | EPOLLWRBAND);
2885
2886 /* Should agree with poll, otherwise some programs break */
2887 if (sock_writeable(sk))
2888 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2889 }
2890
2891 rcu_read_unlock();
2892}
2893
2894static void sock_def_destruct(struct sock *sk)
2895{
2896}
2897
2898void sk_send_sigurg(struct sock *sk)
2899{
2900 if (sk->sk_socket && sk->sk_socket->file)
2901 if (send_sigurg(&sk->sk_socket->file->f_owner))
2902 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2903}
2904EXPORT_SYMBOL(sk_send_sigurg);
2905
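/* Socket timer helpers: each pending timer holds a reference on the socket.
 * sk_reset_timer() takes the reference only if the timer was not already
 * pending; the sk_stop_timer() variants drop it only if a pending timer was
 * actually deleted.
 */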
2906void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2907 unsigned long expires)
2908{
2909 if (!mod_timer(timer, expires))
2910 sock_hold(sk);
2911}
2912EXPORT_SYMBOL(sk_reset_timer);
2913
2914void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2915{
2916 if (del_timer(timer))
2917 __sock_put(sk);
2918}
2919EXPORT_SYMBOL(sk_stop_timer);
2920
2921void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
2922{
2923 if (del_timer_sync(timer))
2924 __sock_put(sk);
2925}
2926EXPORT_SYMBOL(sk_stop_timer_sync);
2927
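/* Initialise the generic fields of a freshly allocated struct sock for the
 * given owner @uid: default buffer sizes, timeouts, callbacks and lockdep
 * classes. The refcount is set last, after an smp_wmb(), so lockless readers
 * never observe a half-initialised socket.
 */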
2928void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
2929{
2930 sk_init_common(sk);
2931 sk->sk_send_head = NULL;
2932
2933 timer_setup(&sk->sk_timer, NULL, 0);
2934
2935 sk->sk_allocation = GFP_KERNEL;
2936 sk->sk_rcvbuf = sysctl_rmem_default;
2937 sk->sk_sndbuf = sysctl_wmem_default;
2938 sk->sk_state = TCP_CLOSE;
2939 sk_set_socket(sk, sock);
2940
2941 sock_set_flag(sk, SOCK_ZAPPED);
2942
2943 if (sock) {
2944 sk->sk_type = sock->type;
2945 RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
2946 sock->sk = sk;
2947 } else {
2948 RCU_INIT_POINTER(sk->sk_wq, NULL);
2949 }
2950 sk->sk_uid = uid;
2951
2952 rwlock_init(&sk->sk_callback_lock);
2953 if (sk->sk_kern_sock)
2954 lockdep_set_class_and_name(
2955 &sk->sk_callback_lock,
2956 af_kern_callback_keys + sk->sk_family,
2957 af_family_kern_clock_key_strings[sk->sk_family]);
2958 else
2959 lockdep_set_class_and_name(
2960 &sk->sk_callback_lock,
2961 af_callback_keys + sk->sk_family,
2962 af_family_clock_key_strings[sk->sk_family]);
2963
2964 sk->sk_state_change = sock_def_wakeup;
2965 sk->sk_data_ready = sock_def_readable;
2966 sk->sk_write_space = sock_def_write_space;
2967 sk->sk_error_report = sock_def_error_report;
2968 sk->sk_destruct = sock_def_destruct;
2969
2970 sk->sk_frag.page = NULL;
2971 sk->sk_frag.offset = 0;
2972 sk->sk_peek_off = -1;
2973
2974 sk->sk_peer_pid = NULL;
2975 sk->sk_peer_cred = NULL;
2976 spin_lock_init(&sk->sk_peer_lock);
2977
2978 sk->sk_write_pending = 0;
2979 sk->sk_rcvlowat = 1;
2980 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
2981 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
2982
2983 sk->sk_stamp = SK_DEFAULT_STAMP;
2984#if BITS_PER_LONG==32
2985 seqlock_init(&sk->sk_stamp_seq);
2986#endif
2987 atomic_set(&sk->sk_zckey, 0);
2988
2989#ifdef CONFIG_NET_RX_BUSY_POLL
2990 sk->sk_napi_id = 0;
2991 sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read);
2992#endif
2993
2994 sk->sk_max_pacing_rate = ~0UL;
2995 sk->sk_pacing_rate = ~0UL;
2996 WRITE_ONCE(sk->sk_pacing_shift, 10);
2997 sk->sk_incoming_cpu = -1;
2998
2999 sk_rx_queue_clear(sk);
3000 /*
3001 * Before updating sk_refcnt, we must commit prior changes to memory
3002 * (Documentation/RCU/rculist_nulls.txt for details)
3003 */
3004 smp_wmb();
3005 refcount_set(&sk->sk_refcnt, 1);
3006 atomic_set(&sk->sk_drops, 0);
3007}
3008EXPORT_SYMBOL(sock_init_data_uid);
3009
3010void sock_init_data(struct socket *sock, struct sock *sk)
3011{
3012 kuid_t uid = sock ?
3013 SOCK_INODE(sock)->i_uid :
3014 make_kuid(sock_net(sk)->user_ns, 0);
3015
3016 sock_init_data_uid(sock, sk, uid);
3017}
3018EXPORT_SYMBOL(sock_init_data);
3019
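/* Process-context socket lock. Taking ownership (sk_lock.owned = 1) under the
 * spinlock gives sk_lock mutex semantics; packets arriving while the socket
 * is owned are queued on the backlog and drained by release_sock().
 */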
3020void lock_sock_nested(struct sock *sk, int subclass)
3021{
3022 might_sleep();
3023 spin_lock_bh(&sk->sk_lock.slock);
3024 if (sk->sk_lock.owned)
3025 __lock_sock(sk);
3026 sk->sk_lock.owned = 1;
3027 spin_unlock(&sk->sk_lock.slock);
3028 /*
3029 * The sk_lock has mutex_lock() semantics here:
3030 */
3031 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3032 local_bh_enable();
3033}
3034EXPORT_SYMBOL(lock_sock_nested);
3035
3036void release_sock(struct sock *sk)
3037{
3038 spin_lock_bh(&sk->sk_lock.slock);
3039 if (sk->sk_backlog.tail)
3040 __release_sock(sk);
3041
3042 /* Warning : release_cb() might need to release sk ownership,
3043	 * i.e. call sock_release_ownership(sk) before us.
3044 */
3045 if (sk->sk_prot->release_cb)
3046 sk->sk_prot->release_cb(sk);
3047
3048 sock_release_ownership(sk);
3049 if (waitqueue_active(&sk->sk_lock.wq))
3050 wake_up(&sk->sk_lock.wq);
3051 spin_unlock_bh(&sk->sk_lock.slock);
3052}
3053EXPORT_SYMBOL(release_sock);
3054
3055/**
3056 * lock_sock_fast - fast version of lock_sock
3057 * @sk: socket
3058 *
3059 * This version should be used for very small sections, where the process won't block.
3060 * Returns false if the fast path is taken:
3061 *
3062 * sk_lock.slock locked, owned = 0, BH disabled
3063 *
3064 * Returns true if the slow path is taken:
3065 *
3066 * sk_lock.slock unlocked, owned = 1, BH enabled
3067 */
3068bool lock_sock_fast(struct sock *sk)
3069{
3070 might_sleep();
3071 spin_lock_bh(&sk->sk_lock.slock);
3072
3073 if (!sk->sk_lock.owned)
3074 /*
3075 * Note : We must disable BH
3076 */
3077 return false;
3078
3079 __lock_sock(sk);
3080 sk->sk_lock.owned = 1;
3081 spin_unlock(&sk->sk_lock.slock);
3082 /*
3083 * The sk_lock has mutex_lock() semantics here:
3084 */
3085 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
3086 local_bh_enable();
3087 return true;
3088}
3089EXPORT_SYMBOL(lock_sock_fast);
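/* Illustrative caller pattern (not from this file): the return value must be
 * handed back to unlock_sock_fast() so it can undo whichever path was taken:
 *
 *	bool slow = lock_sock_fast(sk);
 *	... short, non-blocking critical section ...
 *	unlock_sock_fast(sk, slow);
 */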
3090
3091int sock_gettstamp(struct socket *sock, void __user *userstamp,
3092 bool timeval, bool time32)
3093{
3094 struct sock *sk = sock->sk;
3095 struct timespec64 ts;
3096
3097 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3098 ts = ktime_to_timespec64(sock_read_timestamp(sk));
3099 if (ts.tv_sec == -1)
3100 return -ENOENT;
3101 if (ts.tv_sec == 0) {
3102 ktime_t kt = ktime_get_real();
3103		sock_write_timestamp(sk, kt);
3104 ts = ktime_to_timespec64(kt);
3105 }
3106
3107 if (timeval)
3108 ts.tv_nsec /= 1000;
3109
3110#ifdef CONFIG_COMPAT_32BIT_TIME
3111 if (time32)
3112 return put_old_timespec32(&ts, userstamp);
3113#endif
3114#ifdef CONFIG_SPARC64
3115 /* beware of padding in sparc64 timeval */
3116 if (timeval && !in_compat_syscall()) {
3117 struct __kernel_old_timeval __user tv = {
3118 .tv_sec = ts.tv_sec,
3119 .tv_usec = ts.tv_nsec,
3120 };
3121 if (copy_to_user(userstamp, &tv, sizeof(tv)))
3122 return -EFAULT;
3123 return 0;
3124 }
3125#endif
3126 return put_timespec64(&ts, userstamp);
3127}
3128EXPORT_SYMBOL(sock_gettstamp);
3129
3130void sock_enable_timestamp(struct sock *sk, int flag)
3131{
3132 if (!sock_flag(sk, flag)) {
3133 unsigned long previous_flags = sk->sk_flags;
3134
3135 sock_set_flag(sk, flag);
3136 /*
3137 * we just set one of the two flags which require net
3138 * time stamping, but time stamping might have been on
3139 * already because of the other one
3140 */
3141 if (sock_needs_netstamp(sk) &&
3142 !(previous_flags & SK_FLAGS_TIMESTAMP))
3143 net_enable_timestamp();
3144 }
3145}
3146
3147int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3148 int level, int type)
3149{
3150 struct sock_exterr_skb *serr;
3151 struct sk_buff *skb;
3152 int copied, err;
3153
3154 err = -EAGAIN;
3155 skb = sock_dequeue_err_skb(sk);
3156 if (skb == NULL)
3157 goto out;
3158
3159 copied = skb->len;
3160 if (copied > len) {
3161 msg->msg_flags |= MSG_TRUNC;
3162 copied = len;
3163 }
3164 err = skb_copy_datagram_msg(skb, 0, msg, copied);
3165 if (err)
3166 goto out_free_skb;
3167
3168 sock_recv_timestamp(msg, sk, skb);
3169
3170 serr = SKB_EXT_ERR(skb);
3171 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3172
3173 msg->msg_flags |= MSG_ERRQUEUE;
3174 err = copied;
3175
3176out_free_skb:
3177 kfree_skb(skb);
3178out:
3179 return err;
3180}
3181EXPORT_SYMBOL(sock_recv_errqueue);
3182
3183/*
3184 *	Get a socket option on a socket.
3185 *
3186 * FIX: POSIX 1003.1g is very ambiguous here. It states that
3187 * asynchronous errors should be reported by getsockopt. We assume
3188 *	this means if you specify SO_ERROR (otherwise what's the point of it).
3189 */
3190int sock_common_getsockopt(struct socket *sock, int level, int optname,
3191 char __user *optval, int __user *optlen)
3192{
3193 struct sock *sk = sock->sk;
3194
3195 /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3196 return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3197}
3198EXPORT_SYMBOL(sock_common_getsockopt);
3199
3200#ifdef CONFIG_COMPAT
3201int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
3202 char __user *optval, int __user *optlen)
3203{
3204 struct sock *sk = sock->sk;
3205
3206 if (sk->sk_prot->compat_getsockopt != NULL)
3207 return sk->sk_prot->compat_getsockopt(sk, level, optname,
3208 optval, optlen);
3209 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3210}
3211EXPORT_SYMBOL(compat_sock_common_getsockopt);
3212#endif
3213
3214int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3215 int flags)
3216{
3217 struct sock *sk = sock->sk;
3218 int addr_len = 0;
3219 int err;
3220
3221 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3222 flags & ~MSG_DONTWAIT, &addr_len);
3223 if (err >= 0)
3224 msg->msg_namelen = addr_len;
3225 return err;
3226}
3227EXPORT_SYMBOL(sock_common_recvmsg);
3228
3229/*
3230 * Set socket options on an inet socket.
3231 */
3232int sock_common_setsockopt(struct socket *sock, int level, int optname,
3233 char __user *optval, unsigned int optlen)
3234{
3235 struct sock *sk = sock->sk;
3236
3237 /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3238 return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3239}
3240EXPORT_SYMBOL(sock_common_setsockopt);
3241
3242#ifdef CONFIG_COMPAT
3243int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
3244 char __user *optval, unsigned int optlen)
3245{
3246 struct sock *sk = sock->sk;
3247
3248 if (sk->sk_prot->compat_setsockopt != NULL)
3249 return sk->sk_prot->compat_setsockopt(sk, level, optname,
3250 optval, optlen);
3251 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3252}
3253EXPORT_SYMBOL(compat_sock_common_setsockopt);
3254#endif
3255
3256void sk_common_release(struct sock *sk)
3257{
3258 if (sk->sk_prot->destroy)
3259 sk->sk_prot->destroy(sk);
3260
3261 /*
3262 * Observation: when sock_common_release is called, processes have
3263	 * no access to the socket, but the network stack still does.
3264 * Step one, detach it from networking:
3265 *
3266 * A. Remove from hash tables.
3267 */
3268
3269 sk->sk_prot->unhash(sk);
3270
3271 /*
3272	 * At this point the socket cannot receive new packets, but it is possible
3273	 * that some packets are in flight because some CPU is running the receiver
3274	 * and did the hash table lookup before we unhashed the socket. They will
3275	 * reach the receive queue and will be purged by the socket destructor.
3276	 *
3277	 * Also we still have packets pending on the receive queue and probably
3278	 * our own packets waiting in device queues. sock_destroy will drain the
3279	 * receive queue, but transmitted packets will delay socket destruction
3280	 * until the last reference is released.
3281 */
3282
3283 sock_orphan(sk);
3284
3285 xfrm_sk_free_policy(sk);
3286
3287 sk_refcnt_debug_release(sk);
3288
3289 sock_put(sk);
3290}
3291EXPORT_SYMBOL(sk_common_release);
3292
3293void sk_get_meminfo(const struct sock *sk, u32 *mem)
3294{
3295 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3296
3297 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3298 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3299 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3300 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3301 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3302 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3303 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3304 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3305 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3306}
3307
3308#ifdef CONFIG_PROC_FS
3309#define PROTO_INUSE_NR 64 /* should be enough for the first time */
3310struct prot_inuse {
3311 int val[PROTO_INUSE_NR];
3312};
3313
3314static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3315
3316void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3317{
3318 __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3319}
3320EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3321
3322int sock_prot_inuse_get(struct net *net, struct proto *prot)
3323{
3324 int cpu, idx = prot->inuse_idx;
3325 int res = 0;
3326
3327 for_each_possible_cpu(cpu)
3328 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3329
3330 return res >= 0 ? res : 0;
3331}
3332EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3333
3334static void sock_inuse_add(struct net *net, int val)
3335{
3336 this_cpu_add(*net->core.sock_inuse, val);
3337}
3338
3339int sock_inuse_get(struct net *net)
3340{
3341 int cpu, res = 0;
3342
3343 for_each_possible_cpu(cpu)
3344 res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3345
3346 return res;
3347}
3348
3349EXPORT_SYMBOL_GPL(sock_inuse_get);
3350
3351static int __net_init sock_inuse_init_net(struct net *net)
3352{
3353 net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3354 if (net->core.prot_inuse == NULL)
3355 return -ENOMEM;
3356
3357 net->core.sock_inuse = alloc_percpu(int);
3358 if (net->core.sock_inuse == NULL)
3359 goto out;
3360
3361 return 0;
3362
3363out:
3364 free_percpu(net->core.prot_inuse);
3365 return -ENOMEM;
3366}
3367
3368static void __net_exit sock_inuse_exit_net(struct net *net)
3369{
3370 free_percpu(net->core.prot_inuse);
3371 free_percpu(net->core.sock_inuse);
3372}
3373
3374static struct pernet_operations net_inuse_ops = {
3375 .init = sock_inuse_init_net,
3376 .exit = sock_inuse_exit_net,
3377};
3378
3379static __init int net_inuse_init(void)
3380{
3381 if (register_pernet_subsys(&net_inuse_ops))
3382 panic("Cannot initialize net inuse counters");
3383
3384 return 0;
3385}
3386
3387core_initcall(net_inuse_init);
3388
3389static int assign_proto_idx(struct proto *prot)
3390{
3391 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3392
3393 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3394 pr_err("PROTO_INUSE_NR exhausted\n");
3395 return -ENOSPC;
3396 }
3397
3398 set_bit(prot->inuse_idx, proto_inuse_idx);
3399 return 0;
3400}
3401
3402static void release_proto_idx(struct proto *prot)
3403{
3404 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3405 clear_bit(prot->inuse_idx, proto_inuse_idx);
3406}
3407#else
3408static inline int assign_proto_idx(struct proto *prot)
3409{
3410 return 0;
3411}
3412
3413static inline void release_proto_idx(struct proto *prot)
3414{
3415}
3416
3417static void sock_inuse_add(struct net *net, int val)
3418{
3419}
3420#endif
3421
3422static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3423{
3424 if (!twsk_prot)
3425 return;
3426 kfree(twsk_prot->twsk_slab_name);
3427 twsk_prot->twsk_slab_name = NULL;
3428 kmem_cache_destroy(twsk_prot->twsk_slab);
3429 twsk_prot->twsk_slab = NULL;
3430}
3431
3432static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3433{
3434 if (!rsk_prot)
3435 return;
3436 kfree(rsk_prot->slab_name);
3437 rsk_prot->slab_name = NULL;
3438 kmem_cache_destroy(rsk_prot->slab);
3439 rsk_prot->slab = NULL;
3440}
3441
3442static int req_prot_init(const struct proto *prot)
3443{
3444 struct request_sock_ops *rsk_prot = prot->rsk_prot;
3445
3446 if (!rsk_prot)
3447 return 0;
3448
3449 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3450 prot->name);
3451 if (!rsk_prot->slab_name)
3452 return -ENOMEM;
3453
3454 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3455 rsk_prot->obj_size, 0,
3456 SLAB_ACCOUNT | prot->slab_flags,
3457 NULL);
3458
3459 if (!rsk_prot->slab) {
3460 pr_crit("%s: Can't create request sock SLAB cache!\n",
3461 prot->name);
3462 return -ENOMEM;
3463 }
3464 return 0;
3465}
3466
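/* Register a protocol: optionally create its sock, request_sock and
 * timewait_sock slab caches, assign it an inuse counter index and link it
 * on proto_list.
 */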
3467int proto_register(struct proto *prot, int alloc_slab)
3468{
3469 int ret = -ENOBUFS;
3470
3471 if (alloc_slab) {
3472 prot->slab = kmem_cache_create_usercopy(prot->name,
3473 prot->obj_size, 0,
3474 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3475 prot->slab_flags,
3476 prot->useroffset, prot->usersize,
3477 NULL);
3478
3479 if (prot->slab == NULL) {
3480 pr_crit("%s: Can't create sock SLAB cache!\n",
3481 prot->name);
3482 goto out;
3483 }
3484
3485 if (req_prot_init(prot))
3486 goto out_free_request_sock_slab;
3487
3488 if (prot->twsk_prot != NULL) {
3489 prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3490
3491 if (prot->twsk_prot->twsk_slab_name == NULL)
3492 goto out_free_request_sock_slab;
3493
3494 prot->twsk_prot->twsk_slab =
3495 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3496 prot->twsk_prot->twsk_obj_size,
3497 0,
3498 SLAB_ACCOUNT |
3499 prot->slab_flags,
3500 NULL);
3501 if (prot->twsk_prot->twsk_slab == NULL)
3502 goto out_free_timewait_sock_slab;
3503 }
3504 }
3505
3506 mutex_lock(&proto_list_mutex);
3507 ret = assign_proto_idx(prot);
3508 if (ret) {
3509 mutex_unlock(&proto_list_mutex);
3510 goto out_free_timewait_sock_slab;
3511 }
3512 list_add(&prot->node, &proto_list);
3513 mutex_unlock(&proto_list_mutex);
3514 return ret;
3515
3516out_free_timewait_sock_slab:
3517 if (alloc_slab && prot->twsk_prot)
3518 tw_prot_cleanup(prot->twsk_prot);
3519out_free_request_sock_slab:
3520 if (alloc_slab) {
3521 req_prot_cleanup(prot->rsk_prot);
3522
3523 kmem_cache_destroy(prot->slab);
3524 prot->slab = NULL;
3525 }
3526out:
3527 return ret;
3528}
3529EXPORT_SYMBOL(proto_register);
3530
3531void proto_unregister(struct proto *prot)
3532{
3533 mutex_lock(&proto_list_mutex);
3534 release_proto_idx(prot);
3535 list_del(&prot->node);
3536 mutex_unlock(&proto_list_mutex);
3537
3538 kmem_cache_destroy(prot->slab);
3539 prot->slab = NULL;
3540
3541 req_prot_cleanup(prot->rsk_prot);
3542 tw_prot_cleanup(prot->twsk_prot);
3543}
3544EXPORT_SYMBOL(proto_unregister);
3545
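/* Request the matching NETLINK_SOCK_DIAG module for @family/@protocol, after
 * checking that the family (and, for AF_INET, the protocol) is actually
 * registered.
 */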
3546int sock_load_diag_module(int family, int protocol)
3547{
3548 if (!protocol) {
3549 if (!sock_is_registered(family))
3550 return -ENOENT;
3551
3552 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3553 NETLINK_SOCK_DIAG, family);
3554 }
3555
3556#ifdef CONFIG_INET
3557 if (family == AF_INET &&
3558 protocol != IPPROTO_RAW &&
3559 !rcu_access_pointer(inet_protos[protocol]))
3560 return -ENOENT;
3561#endif
3562
3563 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3564 NETLINK_SOCK_DIAG, family, protocol);
3565}
3566EXPORT_SYMBOL(sock_load_diag_module);
3567
3568#ifdef CONFIG_PROC_FS
3569static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3570 __acquires(proto_list_mutex)
3571{
3572 mutex_lock(&proto_list_mutex);
3573 return seq_list_start_head(&proto_list, *pos);
3574}
3575
3576static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3577{
3578 return seq_list_next(v, &proto_list, pos);
3579}
3580
3581static void proto_seq_stop(struct seq_file *seq, void *v)
3582 __releases(proto_list_mutex)
3583{
3584 mutex_unlock(&proto_list_mutex);
3585}
3586
3587static char proto_method_implemented(const void *method)
3588{
3589 return method == NULL ? 'n' : 'y';
3590}
3591static long sock_prot_memory_allocated(struct proto *proto)
3592{
3593 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3594}
3595
3596static const char *sock_prot_memory_pressure(struct proto *proto)
3597{
3598 return proto->memory_pressure != NULL ?
3599 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3600}
3601
3602static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3603{
3604
3605 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
3606 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3607 proto->name,
3608 proto->obj_size,
3609 sock_prot_inuse_get(seq_file_net(seq), proto),
3610 sock_prot_memory_allocated(proto),
3611 sock_prot_memory_pressure(proto),
3612 proto->max_header,
3613 proto->slab == NULL ? "no" : "yes",
3614 module_name(proto->owner),
3615 proto_method_implemented(proto->close),
3616 proto_method_implemented(proto->connect),
3617 proto_method_implemented(proto->disconnect),
3618 proto_method_implemented(proto->accept),
3619 proto_method_implemented(proto->ioctl),
3620 proto_method_implemented(proto->init),
3621 proto_method_implemented(proto->destroy),
3622 proto_method_implemented(proto->shutdown),
3623 proto_method_implemented(proto->setsockopt),
3624 proto_method_implemented(proto->getsockopt),
3625 proto_method_implemented(proto->sendmsg),
3626 proto_method_implemented(proto->recvmsg),
3627 proto_method_implemented(proto->sendpage),
3628 proto_method_implemented(proto->bind),
3629 proto_method_implemented(proto->backlog_rcv),
3630 proto_method_implemented(proto->hash),
3631 proto_method_implemented(proto->unhash),
3632 proto_method_implemented(proto->get_port),
3633 proto_method_implemented(proto->enter_memory_pressure));
3634}
3635
3636static int proto_seq_show(struct seq_file *seq, void *v)
3637{
3638 if (v == &proto_list)
3639 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3640 "protocol",
3641 "size",
3642 "sockets",
3643 "memory",
3644 "press",
3645 "maxhdr",
3646 "slab",
3647 "module",
3648 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3649 else
3650 proto_seq_printf(seq, list_entry(v, struct proto, node));
3651 return 0;
3652}
3653
3654static const struct seq_operations proto_seq_ops = {
3655 .start = proto_seq_start,
3656 .next = proto_seq_next,
3657 .stop = proto_seq_stop,
3658 .show = proto_seq_show,
3659};
3660
3661static __net_init int proto_init_net(struct net *net)
3662{
3663 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3664 sizeof(struct seq_net_private)))
3665 return -ENOMEM;
3666
3667 return 0;
3668}
3669
3670static __net_exit void proto_exit_net(struct net *net)
3671{
3672 remove_proc_entry("protocols", net->proc_net);
3673}
3674
3675
3676static __net_initdata struct pernet_operations proto_net_ops = {
3677 .init = proto_init_net,
3678 .exit = proto_exit_net,
3679};
3680
3681static int __init proto_init(void)
3682{
3683 if (IS_ENABLED(CONFIG_PROC_STRIPPED))
3684 return 0;
3685 return register_pernet_subsys(&proto_net_ops);
3686}
3687
3688subsys_initcall(proto_init);
3689
3690#endif /* PROC_FS */
3691
3692#ifdef CONFIG_NET_RX_BUSY_POLL
3693bool sk_busy_loop_end(void *p, unsigned long start_time)
3694{
3695 struct sock *sk = p;
3696
3697 return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3698 sk_busy_loop_timeout(sk, start_time);
3699}
3700EXPORT_SYMBOL(sk_busy_loop_end);
3701#endif /* CONFIG_NET_RX_BUSY_POLL */