Blame - marvell/linux/ipc/sem.c - T108

blob: d58c00dd1ed99a4963c786d5d2f166b1fdad3071 [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* linux/ipc/sem.c
				4	* Copyright (C) 1992 Krishna Balasubramanian
				5	* Copyright (C) 1995 Eric Schenk, Bruno Haible
				6	*
				7	* /proc/sysvipc/sem support (c) 1999 Dragos Acostachioaie <dragos@iname.com>
				8	*
				9	* SMP-threaded, sysctl's added
				10	* (c) 1999 Manfred Spraul <manfred@colorfullife.com>
				11	* Enforced range limit on SEM_UNDO
				12	* (c) 2001 Red Hat Inc
				13	* Lockless wakeup
				14	* (c) 2003 Manfred Spraul <manfred@colorfullife.com>
				15	* (c) 2016 Davidlohr Bueso <dave@stgolabs.net>
				16	* Further wakeup optimizations, documentation
				17	* (c) 2010 Manfred Spraul <manfred@colorfullife.com>
				18	*
				19	* support for audit of ipc object properties and permission changes
				20	* Dustin Kirkland <dustin.kirkland@us.ibm.com>
				21	*
				22	* namespaces support
				23	* OpenVZ, SWsoft Inc.
				24	* Pavel Emelianov <xemul@openvz.org>
				25	*
				26	* Implementation notes: (May 2010)
				27	* This file implements System V semaphores.
				28	*
				29	* User space visible behavior:
				30	* - FIFO ordering for semop() operations (just FIFO, not starvation
				31	* protection)
				32	* - multiple semaphore operations that alter the same semaphore in
				33	* one semop() are handled.
				34	* - sem_ctime (time of last semctl()) is updated in the IPC_SET, SETVAL and
				35	* SETALL calls.
				36	* - two Linux specific semctl() commands: SEM_STAT, SEM_INFO.
				37	* - undo adjustments at process exit are limited to 0..SEMVMX.
				38	* - namespaces are supported.
				39	* - SEMMSL, SEMMNS, SEMOPM and SEMMNI can be configured at runtime by writing
				40	* to /proc/sys/kernel/sem.
				41	* - statistics about the usage are reported in /proc/sysvipc/sem.
				42	*
				43	* Internals:
				44	* - scalability:
				45	* - all global variables are read-mostly.
				46	* - semop() calls and semctl(RMID) are synchronized by RCU.
				47	* - most operations do write operations (actually: spin_lock calls) to
				48	* the per-semaphore array structure.
				49	* Thus: Perfect SMP scaling between independent semaphore arrays.
				50	* If multiple semaphores in one array are used, then cache line
				51	* thrashing on the semaphore array spinlock will limit the scaling.
				52	* - semncnt and semzcnt are calculated on demand in count_semcnt()
				53	* - the task that performs a successful semop() scans the list of all
				54	* sleeping tasks and completes any pending operations that can be fulfilled.
				55	* Semaphores are actively given to waiting tasks (necessary for FIFO).
				56	* (see update_queue())
				57	* - To improve the scalability, the actual wake-up calls are performed after
				58	* dropping all locks. (see wake_up_sem_queue_prepare())
				59	* - All work is done by the waker, the woken up task does not have to do
				60	* anything - not even acquiring a lock or dropping a refcount.
				61	* - A woken up task may not even touch the semaphore array anymore, it may
				62	* have been destroyed already by a semctl(RMID).
				63	* - UNDO values are stored in an array (one per process and per
				64	* semaphore array, lazily allocated). For backwards compatibility, multiple
				65	* modes for the UNDO variables are supported (per process, per thread)
				66	* (see copy_semundo, CLONE_SYSVSEM)
				67	* - There are two lists of the pending operations: a per-array list
				68	* and per-semaphore list (stored in the array). This allows to achieve FIFO
				69	* ordering without always scanning all pending operations.
				70	* The worst-case behavior is nevertheless O(N^2) for N wakeups.
				71	*/
				72
				73	#include <linux/compat.h>
				74	#include <linux/slab.h>
				75	#include <linux/spinlock.h>
				76	#include <linux/init.h>
				77	#include <linux/proc_fs.h>
				78	#include <linux/time.h>
				79	#include <linux/security.h>
				80	#include <linux/syscalls.h>
				81	#include <linux/audit.h>
				82	#include <linux/capability.h>
				83	#include <linux/seq_file.h>
				84	#include <linux/rwsem.h>
				85	#include <linux/nsproxy.h>
				86	#include <linux/ipc_namespace.h>
				87	#include <linux/sched/wake_q.h>
				88	#include <linux/nospec.h>
				89	#include <linux/rhashtable.h>
				90
				91	#include <linux/uaccess.h>
				92	#include "util.h"
				93
				94	/* One semaphore structure for each semaphore in the system. */
				95	struct sem {
				96	int semval; /* current value */
				97	/*
				98	* PID of the process that last modified the semaphore. For
				99	* Linux, specifically these are:
				100	* - semop
				101	* - semctl, via SETVAL and SETALL.
				102	* - at task exit when performing undo adjustments (see exit_sem).
				103	*/
				104	struct pid *sempid;
				105	spinlock_t lock; /* spinlock for fine-grained semtimedop */
				106	struct list_head pending_alter; /* pending single-sop operations */
				107	/* that alter the semaphore */
				108	struct list_head pending_const; /* pending single-sop operations */
				109	/* that do not alter the semaphore*/
				110	time64_t sem_otime; /* candidate for sem_otime */
				111	} ____cacheline_aligned_in_smp;
				112
				113	/* One sem_array data structure for each set of semaphores in the system. */
				114	struct sem_array {
				115	struct kern_ipc_perm sem_perm; /* permissions .. see ipc.h */
				116	time64_t sem_ctime; /* create/last semctl() time */
				117	struct list_head pending_alter; /* pending operations */
				118	/* that alter the array */
				119	struct list_head pending_const; /* pending complex operations */
				120	/* that do not alter semvals */
				121	struct list_head list_id; /* undo requests on this array */
				122	int sem_nsems; /* no. of semaphores in array */
				123	int complex_count; /* pending complex operations */
				124	unsigned int use_global_lock;/* >0: global lock required */
				125
				126	struct sem sems[];
				127	} __randomize_layout;
				128
				129	/* One queue for each sleeping process in the system. */
				130	struct sem_queue {
				131	struct list_head list; /* queue of pending operations */
				132	struct task_struct sleeper; / this process */
				133	struct sem_undo undo; / undo structure */
				134	struct pid pid; / process id of requesting process */
				135	int status; /* completion status of operation */
				136	struct sembuf sops; / array of pending operations */
				137	struct sembuf blocking; / the operation that blocked */
				138	int nsops; /* number of operations */
				139	bool alter; /* does sops alter the array? /
				140	bool dupsop; /* sops on more than one sem_num */
				141	};
				142
				143	/* Each task has a list of undo requests. They are executed automatically
				144	* when the process exits.
				145	*/
				146	struct sem_undo {
				147	struct list_head list_proc; /* per-process list: *
				148	* all undos from one process
				149	* rcu protected */
				150	struct rcu_head rcu; /* rcu struct for sem_undo */
				151	struct sem_undo_list ulp; / back ptr to sem_undo_list */
				152	struct list_head list_id; /* per semaphore array list:
				153	* all undos for one array */
				154	int semid; /* semaphore set identifier */
				155	short semadj; / array of adjustments */
				156	/* one per semaphore */
				157	};
				158
				159	/* sem_undo_list controls shared access to the list of sem_undo structures
				160	* that may be shared among all a CLONE_SYSVSEM task group.
				161	*/
				162	struct sem_undo_list {
				163	refcount_t refcnt;
				164	spinlock_t lock;
				165	struct list_head list_proc;
				166	};
				167
				168
				169	#define sem_ids(ns) ((ns)->ids[IPC_SEM_IDS])
				170
				171	static int newary(struct ipc_namespace , struct ipc_params );
				172	static void freeary(struct ipc_namespace , struct kern_ipc_perm );
				173	#ifdef CONFIG_PROC_FS
				174	static int sysvipc_sem_proc_show(struct seq_file s, void it);
				175	#endif
				176
				177	#define SEMMSL_FAST 256 /* 512 bytes on stack */
				178	#define SEMOPM_FAST 64 /* ~ 372 bytes on stack */
				179
				180	/*
				181	* Switching from the mode suitable for simple ops
				182	* to the mode for complex ops is costly. Therefore:
				183	* use some hysteresis
				184	*/
				185	#define USE_GLOBAL_LOCK_HYSTERESIS 10
				186
				187	/*
				188	* Locking:
				189	* a) global sem_lock() for read/write
				190	* sem_undo.id_next,
				191	* sem_array.complex_count,
				192	* sem_array.pending{_alter,_const},
				193	* sem_array.sem_undo
				194	*
				195	* b) global or semaphore sem_lock() for read/write:
				196	* sem_array.sems[i].pending_{const,alter}:
				197	*
				198	* c) special:
				199	* sem_undo_list.list_proc:
				200	* * undo_list->lock for write
				201	* * rcu for read
				202	* use_global_lock:
				203	* * global sem_lock() for write
				204	* * either local or global sem_lock() for read.
				205	*
				206	* Memory ordering:
				207	* Most ordering is enforced by using spin_lock() and spin_unlock().
				208	* The special case is use_global_lock:
				209	* Setting it from non-zero to 0 is a RELEASE, this is ensured by
				210	* using smp_store_release().
				211	* Testing if it is non-zero is an ACQUIRE, this is ensured by using
				212	* smp_load_acquire().
				213	* Setting it from 0 to non-zero must be ordered with regards to
				214	* this smp_load_acquire(), this is guaranteed because the smp_load_acquire()
				215	* is inside a spin_lock() and after a write from 0 to non-zero a
				216	* spin_lock()+spin_unlock() is done.
				217	*/
				218
				219	#define sc_semmsl sem_ctls[0]
				220	#define sc_semmns sem_ctls[1]
				221	#define sc_semopm sem_ctls[2]
				222	#define sc_semmni sem_ctls[3]
				223
				224	void sem_init_ns(struct ipc_namespace *ns)
				225	{
				226	ns->sc_semmsl = SEMMSL;
				227	ns->sc_semmns = SEMMNS;
				228	ns->sc_semopm = SEMOPM;
				229	ns->sc_semmni = SEMMNI;
				230	ns->used_sems = 0;
				231	ipc_init_ids(&ns->ids[IPC_SEM_IDS]);
				232	}
				233
				234	#ifdef CONFIG_IPC_NS
				235	void sem_exit_ns(struct ipc_namespace *ns)
				236	{
				237	free_ipcs(ns, &sem_ids(ns), freeary);
				238	idr_destroy(&ns->ids[IPC_SEM_IDS].ipcs_idr);
				239	rhashtable_destroy(&ns->ids[IPC_SEM_IDS].key_ht);
				240	}
				241	#endif
				242
				243	void __init sem_init(void)
				244	{
				245	sem_init_ns(&init_ipc_ns);
				246	if (IS_ENABLED(CONFIG_PROC_STRIPPED))
				247	return;
				248	ipc_init_proc_interface("sysvipc/sem",
				249	" key semid perms nsems uid gid cuid cgid otime ctime\n",
				250	IPC_SEM_IDS, sysvipc_sem_proc_show);
				251	}
				252
				253	/**
				254	* unmerge_queues - unmerge queues, if possible.
				255	* @sma: semaphore array
				256	*
				257	* The function unmerges the wait queues if complex_count is 0.
				258	* It must be called prior to dropping the global semaphore array lock.
				259	*/
				260	static void unmerge_queues(struct sem_array *sma)
				261	{
				262	struct sem_queue q, tq;
				263
				264	/* complex operations still around? */
				265	if (sma->complex_count)
				266	return;
				267	/*
				268	* We will switch back to simple mode.
				269	* Move all pending operation back into the per-semaphore
				270	* queues.
				271	*/
				272	list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
				273	struct sem *curr;
				274	curr = &sma->sems[q->sops[0].sem_num];
				275
				276	list_add_tail(&q->list, &curr->pending_alter);
				277	}
				278	INIT_LIST_HEAD(&sma->pending_alter);
				279	}
				280
				281	/**
				282	* merge_queues - merge single semop queues into global queue
				283	* @sma: semaphore array
				284	*
				285	* This function merges all per-semaphore queues into the global queue.
				286	* It is necessary to achieve FIFO ordering for the pending single-sop
				287	* operations when a multi-semop operation must sleep.
				288	* Only the alter operations must be moved, the const operations can stay.
				289	*/
				290	static void merge_queues(struct sem_array *sma)
				291	{
				292	int i;
				293	for (i = 0; i < sma->sem_nsems; i++) {
				294	struct sem *sem = &sma->sems[i];
				295
				296	list_splice_init(&sem->pending_alter, &sma->pending_alter);
				297	}
				298	}
				299
				300	static void sem_rcu_free(struct rcu_head *head)
				301	{
				302	struct kern_ipc_perm *p = container_of(head, struct kern_ipc_perm, rcu);
				303	struct sem_array *sma = container_of(p, struct sem_array, sem_perm);
				304
				305	security_sem_free(&sma->sem_perm);
				306	kvfree(sma);
				307	}
				308
				309	/*
				310	* Enter the mode suitable for non-simple operations:
				311	* Caller must own sem_perm.lock.
				312	*/
				313	static void complexmode_enter(struct sem_array *sma)
				314	{
				315	int i;
				316	struct sem *sem;
				317
				318	if (sma->use_global_lock > 0) {
				319	/*
				320	* We are already in global lock mode.
				321	* Nothing to do, just reset the
				322	* counter until we return to simple mode.
				323	*/
				324	sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS;
				325	return;
				326	}
				327	sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS;
				328
				329	for (i = 0; i < sma->sem_nsems; i++) {
				330	sem = &sma->sems[i];
				331	spin_lock(&sem->lock);
				332	spin_unlock(&sem->lock);
				333	}
				334	}
				335
				336	/*
				337	* Try to leave the mode that disallows simple operations:
				338	* Caller must own sem_perm.lock.
				339	*/
				340	static void complexmode_tryleave(struct sem_array *sma)
				341	{
				342	if (sma->complex_count) {
				343	/* Complex ops are sleeping.
				344	* We must stay in complex mode
				345	*/
				346	return;
				347	}
				348	if (sma->use_global_lock == 1) {
				349	/*
				350	* Immediately after setting use_global_lock to 0,
				351	* a simple op can start. Thus: all memory writes
				352	* performed by the current operation must be visible
				353	* before we set use_global_lock to 0.
				354	*/
				355	smp_store_release(&sma->use_global_lock, 0);
				356	} else {
				357	sma->use_global_lock--;
				358	}
				359	}
				360
				361	#define SEM_GLOBAL_LOCK (-1)
				362	/*
				363	* If the request contains only one semaphore operation, and there are
				364	* no complex transactions pending, lock only the semaphore involved.
				365	* Otherwise, lock the entire semaphore array, since we either have
				366	* multiple semaphores in our own semops, or we need to look at
				367	* semaphores from other pending complex operations.
				368	*/
				369	static inline int sem_lock(struct sem_array sma, struct sembuf sops,
				370	int nsops)
				371	{
				372	struct sem *sem;
				373	int idx;
				374
				375	if (nsops != 1) {
				376	/* Complex operation - acquire a full lock */
				377	ipc_lock_object(&sma->sem_perm);
				378
				379	/* Prevent parallel simple ops */
				380	complexmode_enter(sma);
				381	return SEM_GLOBAL_LOCK;
				382	}
				383
				384	/*
				385	* Only one semaphore affected - try to optimize locking.
				386	* Optimized locking is possible if no complex operation
				387	* is either enqueued or processed right now.
				388	*
				389	* Both facts are tracked by use_global_mode.
				390	*/
				391	idx = array_index_nospec(sops->sem_num, sma->sem_nsems);
				392	sem = &sma->sems[idx];
				393
				394	/*
				395	* Initial check for use_global_lock. Just an optimization,
				396	* no locking, no memory barrier.
				397	*/
				398	if (!sma->use_global_lock) {
				399	/*
				400	* It appears that no complex operation is around.
				401	* Acquire the per-semaphore lock.
				402	*/
				403	spin_lock(&sem->lock);
				404
				405	/* pairs with smp_store_release() */
				406	if (!smp_load_acquire(&sma->use_global_lock)) {
				407	/* fast path successful! */
				408	return sops->sem_num;
				409	}
				410	spin_unlock(&sem->lock);
				411	}
				412
				413	/* slow path: acquire the full lock */
				414	ipc_lock_object(&sma->sem_perm);
				415
				416	if (sma->use_global_lock == 0) {
				417	/*
				418	* The use_global_lock mode ended while we waited for
				419	* sma->sem_perm.lock. Thus we must switch to locking
				420	* with sem->lock.
				421	* Unlike in the fast path, there is no need to recheck
				422	* sma->use_global_lock after we have acquired sem->lock:
				423	* We own sma->sem_perm.lock, thus use_global_lock cannot
				424	* change.
				425	*/
				426	spin_lock(&sem->lock);
				427
				428	ipc_unlock_object(&sma->sem_perm);
				429	return sops->sem_num;
				430	} else {
				431	/*
				432	* Not a false alarm, thus continue to use the global lock
				433	* mode. No need for complexmode_enter(), this was done by
				434	* the caller that has set use_global_mode to non-zero.
				435	*/
				436	return SEM_GLOBAL_LOCK;
				437	}
				438	}
				439
				440	static inline void sem_unlock(struct sem_array *sma, int locknum)
				441	{
				442	if (locknum == SEM_GLOBAL_LOCK) {
				443	unmerge_queues(sma);
				444	complexmode_tryleave(sma);
				445	ipc_unlock_object(&sma->sem_perm);
				446	} else {
				447	struct sem *sem = &sma->sems[locknum];
				448	spin_unlock(&sem->lock);
				449	}
				450	}
				451
				452	/*
				453	* sem_lock_(check_) routines are called in the paths where the rwsem
				454	* is not held.
				455	*
				456	* The caller holds the RCU read lock.
				457	*/
				458	static inline struct sem_array sem_obtain_object(struct ipc_namespace ns, int id)
				459	{
				460	struct kern_ipc_perm *ipcp = ipc_obtain_object_idr(&sem_ids(ns), id);
				461
				462	if (IS_ERR(ipcp))
				463	return ERR_CAST(ipcp);
				464
				465	return container_of(ipcp, struct sem_array, sem_perm);
				466	}
				467
				468	static inline struct sem_array sem_obtain_object_check(struct ipc_namespace ns,
				469	int id)
				470	{
				471	struct kern_ipc_perm *ipcp = ipc_obtain_object_check(&sem_ids(ns), id);
				472
				473	if (IS_ERR(ipcp))
				474	return ERR_CAST(ipcp);
				475
				476	return container_of(ipcp, struct sem_array, sem_perm);
				477	}
				478
				479	static inline void sem_lock_and_putref(struct sem_array *sma)
				480	{
				481	sem_lock(sma, NULL, -1);
				482	ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
				483	}
				484
				485	static inline void sem_rmid(struct ipc_namespace ns, struct sem_array s)
				486	{
				487	ipc_rmid(&sem_ids(ns), &s->sem_perm);
				488	}
				489
				490	static struct sem_array *sem_alloc(size_t nsems)
				491	{
				492	struct sem_array *sma;
				493
				494	if (nsems > (INT_MAX - sizeof(*sma)) / sizeof(sma->sems[0]))
				495	return NULL;
				496
				497	sma = kvzalloc(struct_size(sma, sems, nsems), GFP_KERNEL_ACCOUNT);
				498	if (unlikely(!sma))
				499	return NULL;
				500
				501	return sma;
				502	}
				503
				504	/**
				505	* newary - Create a new semaphore set
				506	* @ns: namespace
				507	* @params: ptr to the structure that contains key, semflg and nsems
				508	*
				509	* Called with sem_ids.rwsem held (as a writer)
				510	*/
				511	static int newary(struct ipc_namespace ns, struct ipc_params params)
				512	{
				513	int retval;
				514	struct sem_array *sma;
				515	key_t key = params->key;
				516	int nsems = params->u.nsems;
				517	int semflg = params->flg;
				518	int i;
				519
				520	if (!nsems)
				521	return -EINVAL;
				522	if (ns->used_sems + nsems > ns->sc_semmns)
				523	return -ENOSPC;
				524
				525	sma = sem_alloc(nsems);
				526	if (!sma)
				527	return -ENOMEM;
				528
				529	sma->sem_perm.mode = (semflg & S_IRWXUGO);
				530	sma->sem_perm.key = key;
				531
				532	sma->sem_perm.security = NULL;
				533	retval = security_sem_alloc(&sma->sem_perm);
				534	if (retval) {
				535	kvfree(sma);
				536	return retval;
				537	}
				538
				539	for (i = 0; i < nsems; i++) {
				540	INIT_LIST_HEAD(&sma->sems[i].pending_alter);
				541	INIT_LIST_HEAD(&sma->sems[i].pending_const);
				542	spin_lock_init(&sma->sems[i].lock);
				543	}
				544
				545	sma->complex_count = 0;
				546	sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS;
				547	INIT_LIST_HEAD(&sma->pending_alter);
				548	INIT_LIST_HEAD(&sma->pending_const);
				549	INIT_LIST_HEAD(&sma->list_id);
				550	sma->sem_nsems = nsems;
				551	sma->sem_ctime = ktime_get_real_seconds();
				552
				553	/* ipc_addid() locks sma upon success. */
				554	retval = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni);
				555	if (retval < 0) {
				556	ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
				557	return retval;
				558	}
				559	ns->used_sems += nsems;
				560
				561	sem_unlock(sma, -1);
				562	rcu_read_unlock();
				563
				564	return sma->sem_perm.id;
				565	}
				566
				567
				568	/*
				569	* Called with sem_ids.rwsem and ipcp locked.
				570	*/
				571	static inline int sem_more_checks(struct kern_ipc_perm *ipcp,
				572	struct ipc_params *params)
				573	{
				574	struct sem_array *sma;
				575
				576	sma = container_of(ipcp, struct sem_array, sem_perm);
				577	if (params->u.nsems > sma->sem_nsems)
				578	return -EINVAL;
				579
				580	return 0;
				581	}
				582
				583	long ksys_semget(key_t key, int nsems, int semflg)
				584	{
				585	struct ipc_namespace *ns;
				586	static const struct ipc_ops sem_ops = {
				587	.getnew = newary,
				588	.associate = security_sem_associate,
				589	.more_checks = sem_more_checks,
				590	};
				591	struct ipc_params sem_params;
				592
				593	ns = current->nsproxy->ipc_ns;
				594
				595	if (nsems < 0 \|\| nsems > ns->sc_semmsl)
				596	return -EINVAL;
				597
				598	sem_params.key = key;
				599	sem_params.flg = semflg;
				600	sem_params.u.nsems = nsems;
				601
				602	return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params);
				603	}
				604
				605	SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg)
				606	{
				607	return ksys_semget(key, nsems, semflg);
				608	}
				609
				610	/**
				611	* perform_atomic_semop[_slow] - Attempt to perform semaphore
				612	* operations on a given array.
				613	* @sma: semaphore array
				614	* @q: struct sem_queue that describes the operation
				615	*
				616	* Caller blocking are as follows, based the value
				617	* indicated by the semaphore operation (sem_op):
				618	*
				619	* (1) >0 never blocks.
				620	* (2) 0 (wait-for-zero operation): semval is non-zero.
				621	* (3) <0 attempting to decrement semval to a value smaller than zero.
				622	*
				623	* Returns 0 if the operation was possible.
				624	* Returns 1 if the operation is impossible, the caller must sleep.
				625	* Returns <0 for error codes.
				626	*/
				627	static int perform_atomic_semop_slow(struct sem_array sma, struct sem_queue q)
				628	{
				629	int result, sem_op, nsops;
				630	struct pid *pid;
				631	struct sembuf *sop;
				632	struct sem *curr;
				633	struct sembuf *sops;
				634	struct sem_undo *un;
				635
				636	sops = q->sops;
				637	nsops = q->nsops;
				638	un = q->undo;
				639
				640	for (sop = sops; sop < sops + nsops; sop++) {
				641	int idx = array_index_nospec(sop->sem_num, sma->sem_nsems);
				642	curr = &sma->sems[idx];
				643	sem_op = sop->sem_op;
				644	result = curr->semval;
				645
				646	if (!sem_op && result)
				647	goto would_block;
				648
				649	result += sem_op;
				650	if (result < 0)
				651	goto would_block;
				652	if (result > SEMVMX)
				653	goto out_of_range;
				654
				655	if (sop->sem_flg & SEM_UNDO) {
				656	int undo = un->semadj[sop->sem_num] - sem_op;
				657	/* Exceeding the undo range is an error. */
				658	if (undo < (-SEMAEM - 1) \|\| undo > SEMAEM)
				659	goto out_of_range;
				660	un->semadj[sop->sem_num] = undo;
				661	}
				662
				663	curr->semval = result;
				664	}
				665
				666	sop--;
				667	pid = q->pid;
				668	while (sop >= sops) {
				669	ipc_update_pid(&sma->sems[sop->sem_num].sempid, pid);
				670	sop--;
				671	}
				672
				673	return 0;
				674
				675	out_of_range:
				676	result = -ERANGE;
				677	goto undo;
				678
				679	would_block:
				680	q->blocking = sop;
				681
				682	if (sop->sem_flg & IPC_NOWAIT)
				683	result = -EAGAIN;
				684	else
				685	result = 1;
				686
				687	undo:
				688	sop--;
				689	while (sop >= sops) {
				690	sem_op = sop->sem_op;
				691	sma->sems[sop->sem_num].semval -= sem_op;
				692	if (sop->sem_flg & SEM_UNDO)
				693	un->semadj[sop->sem_num] += sem_op;
				694	sop--;
				695	}
				696
				697	return result;
				698	}
				699
				700	static int perform_atomic_semop(struct sem_array sma, struct sem_queue q)
				701	{
				702	int result, sem_op, nsops;
				703	struct sembuf *sop;
				704	struct sem *curr;
				705	struct sembuf *sops;
				706	struct sem_undo *un;
				707
				708	sops = q->sops;
				709	nsops = q->nsops;
				710	un = q->undo;
				711
				712	if (unlikely(q->dupsop))
				713	return perform_atomic_semop_slow(sma, q);
				714
				715	/*
				716	* We scan the semaphore set twice, first to ensure that the entire
				717	* operation can succeed, therefore avoiding any pointless writes
				718	* to shared memory and having to undo such changes in order to block
				719	* until the operations can go through.
				720	*/
				721	for (sop = sops; sop < sops + nsops; sop++) {
				722	int idx = array_index_nospec(sop->sem_num, sma->sem_nsems);
				723
				724	curr = &sma->sems[idx];
				725	sem_op = sop->sem_op;
				726	result = curr->semval;
				727
				728	if (!sem_op && result)
				729	goto would_block; /* wait-for-zero */
				730
				731	result += sem_op;
				732	if (result < 0)
				733	goto would_block;
				734
				735	if (result > SEMVMX)
				736	return -ERANGE;
				737
				738	if (sop->sem_flg & SEM_UNDO) {
				739	int undo = un->semadj[sop->sem_num] - sem_op;
				740
				741	/* Exceeding the undo range is an error. */
				742	if (undo < (-SEMAEM - 1) \|\| undo > SEMAEM)
				743	return -ERANGE;
				744	}
				745	}
				746
				747	for (sop = sops; sop < sops + nsops; sop++) {
				748	curr = &sma->sems[sop->sem_num];
				749	sem_op = sop->sem_op;
				750	result = curr->semval;
				751
				752	if (sop->sem_flg & SEM_UNDO) {
				753	int undo = un->semadj[sop->sem_num] - sem_op;
				754
				755	un->semadj[sop->sem_num] = undo;
				756	}
				757	curr->semval += sem_op;
				758	ipc_update_pid(&curr->sempid, q->pid);
				759	}
				760
				761	return 0;
				762
				763	would_block:
				764	q->blocking = sop;
				765	return sop->sem_flg & IPC_NOWAIT ? -EAGAIN : 1;
				766	}
				767
				768	static inline void wake_up_sem_queue_prepare(struct sem_queue *q, int error,
				769	struct wake_q_head *wake_q)
				770	{
				771	wake_q_add(wake_q, q->sleeper);
				772	/*
				773	* Rely on the above implicit barrier, such that we can
				774	* ensure that we hold reference to the task before setting
				775	* q->status. Otherwise we could race with do_exit if the
				776	* task is awoken by an external event before calling
				777	* wake_up_process().
				778	*/
				779	WRITE_ONCE(q->status, error);
				780	}
				781
				782	static void unlink_queue(struct sem_array sma, struct sem_queue q)
				783	{
				784	list_del(&q->list);
				785	if (q->nsops > 1)
				786	sma->complex_count--;
				787	}
				788
				789	/** check_restart(sma, q)
				790	* @sma: semaphore array
				791	* @q: the operation that just completed
				792	*
				793	* update_queue is O(N^2) when it restarts scanning the whole queue of
				794	* waiting operations. Therefore this function checks if the restart is
				795	* really necessary. It is called after a previously waiting operation
				796	* modified the array.
				797	* Note that wait-for-zero operations are handled without restart.
				798	*/
				799	static inline int check_restart(struct sem_array sma, struct sem_queue q)
				800	{
				801	/* pending complex alter operations are too difficult to analyse */
				802	if (!list_empty(&sma->pending_alter))
				803	return 1;
				804
				805	/* we were a sleeping complex operation. Too difficult */
				806	if (q->nsops > 1)
				807	return 1;
				808
				809	/* It is impossible that someone waits for the new value:
				810	* - complex operations always restart.
				811	* - wait-for-zero are handled seperately.
				812	* - q is a previously sleeping simple operation that
				813	* altered the array. It must be a decrement, because
				814	* simple increments never sleep.
				815	* - If there are older (higher priority) decrements
				816	* in the queue, then they have observed the original
				817	* semval value and couldn't proceed. The operation
				818	* decremented to value - thus they won't proceed either.
				819	*/
				820	return 0;
				821	}
				822
				823	/**
				824	* wake_const_ops - wake up non-alter tasks
				825	* @sma: semaphore array.
				826	* @semnum: semaphore that was modified.
				827	* @wake_q: lockless wake-queue head.
				828	*
				829	* wake_const_ops must be called after a semaphore in a semaphore array
				830	* was set to 0. If complex const operations are pending, wake_const_ops must
				831	* be called with semnum = -1, as well as with the number of each modified
				832	* semaphore.
				833	* The tasks that must be woken up are added to @wake_q. The return code
				834	* is stored in q->pid.
				835	* The function returns 1 if at least one operation was completed successfully.
				836	*/
				837	static int wake_const_ops(struct sem_array *sma, int semnum,
				838	struct wake_q_head *wake_q)
				839	{
				840	struct sem_queue q, tmp;
				841	struct list_head *pending_list;
				842	int semop_completed = 0;
				843
				844	if (semnum == -1)
				845	pending_list = &sma->pending_const;
				846	else
				847	pending_list = &sma->sems[semnum].pending_const;
				848
				849	list_for_each_entry_safe(q, tmp, pending_list, list) {
				850	int error = perform_atomic_semop(sma, q);
				851
				852	if (error > 0)
				853	continue;
				854	/* operation completed, remove from queue & wakeup */
				855	unlink_queue(sma, q);
				856
				857	wake_up_sem_queue_prepare(q, error, wake_q);
				858	if (error == 0)
				859	semop_completed = 1;
				860	}
				861
				862	return semop_completed;
				863	}
				864
				865	/**
				866	* do_smart_wakeup_zero - wakeup all wait for zero tasks
				867	* @sma: semaphore array
				868	* @sops: operations that were performed
				869	* @nsops: number of operations
				870	* @wake_q: lockless wake-queue head
				871	*
				872	* Checks all required queue for wait-for-zero operations, based
				873	* on the actual changes that were performed on the semaphore array.
				874	* The function returns 1 if at least one operation was completed successfully.
				875	*/
				876	static int do_smart_wakeup_zero(struct sem_array sma, struct sembuf sops,
				877	int nsops, struct wake_q_head *wake_q)
				878	{
				879	int i;
				880	int semop_completed = 0;
				881	int got_zero = 0;
				882
				883	/* first: the per-semaphore queues, if known */
				884	if (sops) {
				885	for (i = 0; i < nsops; i++) {
				886	int num = sops[i].sem_num;
				887
				888	if (sma->sems[num].semval == 0) {
				889	got_zero = 1;
				890	semop_completed \|= wake_const_ops(sma, num, wake_q);
				891	}
				892	}
				893	} else {
				894	/*
				895	* No sops means modified semaphores not known.
				896	* Assume all were changed.
				897	*/
				898	for (i = 0; i < sma->sem_nsems; i++) {
				899	if (sma->sems[i].semval == 0) {
				900	got_zero = 1;
				901	semop_completed \|= wake_const_ops(sma, i, wake_q);
				902	}
				903	}
				904	}
				905	/*
				906	* If one of the modified semaphores got 0,
				907	* then check the global queue, too.
				908	*/
				909	if (got_zero)
				910	semop_completed \|= wake_const_ops(sma, -1, wake_q);
				911
				912	return semop_completed;
				913	}
				914
				915
				916	/**
				917	* update_queue - look for tasks that can be completed.
				918	* @sma: semaphore array.
				919	* @semnum: semaphore that was modified.
				920	* @wake_q: lockless wake-queue head.
				921	*
				922	* update_queue must be called after a semaphore in a semaphore array
				923	* was modified. If multiple semaphores were modified, update_queue must
				924	* be called with semnum = -1, as well as with the number of each modified
				925	* semaphore.
				926	* The tasks that must be woken up are added to @wake_q. The return code
				927	* is stored in q->pid.
				928	* The function internally checks if const operations can now succeed.
				929	*
				930	* The function return 1 if at least one semop was completed successfully.
				931	*/
				932	static int update_queue(struct sem_array sma, int semnum, struct wake_q_head wake_q)
				933	{
				934	struct sem_queue q, tmp;
				935	struct list_head *pending_list;
				936	int semop_completed = 0;
				937
				938	if (semnum == -1)
				939	pending_list = &sma->pending_alter;
				940	else
				941	pending_list = &sma->sems[semnum].pending_alter;
				942
				943	again:
				944	list_for_each_entry_safe(q, tmp, pending_list, list) {
				945	int error, restart;
				946
				947	/* If we are scanning the single sop, per-semaphore list of
				948	* one semaphore and that semaphore is 0, then it is not
				949	* necessary to scan further: simple increments
				950	* that affect only one entry succeed immediately and cannot
				951	* be in the per semaphore pending queue, and decrements
				952	* cannot be successful if the value is already 0.
				953	*/
				954	if (semnum != -1 && sma->sems[semnum].semval == 0)
				955	break;
				956
				957	error = perform_atomic_semop(sma, q);
				958
				959	/* Does q->sleeper still need to sleep? */
				960	if (error > 0)
				961	continue;
				962
				963	unlink_queue(sma, q);
				964
				965	if (error) {
				966	restart = 0;
				967	} else {
				968	semop_completed = 1;
				969	do_smart_wakeup_zero(sma, q->sops, q->nsops, wake_q);
				970	restart = check_restart(sma, q);
				971	}
				972
				973	wake_up_sem_queue_prepare(q, error, wake_q);
				974	if (restart)
				975	goto again;
				976	}
				977	return semop_completed;
				978	}
				979
				980	/**
				981	* set_semotime - set sem_otime
				982	* @sma: semaphore array
				983	* @sops: operations that modified the array, may be NULL
				984	*
				985	* sem_otime is replicated to avoid cache line trashing.
				986	* This function sets one instance to the current time.
				987	*/
				988	static void set_semotime(struct sem_array sma, struct sembuf sops)
				989	{
				990	if (sops == NULL) {
				991	sma->sems[0].sem_otime = ktime_get_real_seconds();
				992	} else {
				993	sma->sems[sops[0].sem_num].sem_otime =
				994	ktime_get_real_seconds();
				995	}
				996	}
				997
				998	/**
				999	* do_smart_update - optimized update_queue
				1000	* @sma: semaphore array
				1001	* @sops: operations that were performed
				1002	* @nsops: number of operations
				1003	* @otime: force setting otime
				1004	* @wake_q: lockless wake-queue head
				1005	*
				1006	* do_smart_update() does the required calls to update_queue and wakeup_zero,
				1007	* based on the actual changes that were performed on the semaphore array.
				1008	* Note that the function does not do the actual wake-up: the caller is
				1009	* responsible for calling wake_up_q().
				1010	* It is safe to perform this call after dropping all locks.
				1011	*/
				1012	static void do_smart_update(struct sem_array sma, struct sembuf sops, int nsops,
				1013	int otime, struct wake_q_head *wake_q)
				1014	{
				1015	int i;
				1016
				1017	otime \|= do_smart_wakeup_zero(sma, sops, nsops, wake_q);
				1018
				1019	if (!list_empty(&sma->pending_alter)) {
				1020	/* semaphore array uses the global queue - just process it. */
				1021	otime \|= update_queue(sma, -1, wake_q);
				1022	} else {
				1023	if (!sops) {
				1024	/*
				1025	* No sops, thus the modified semaphores are not
				1026	* known. Check all.
				1027	*/
				1028	for (i = 0; i < sma->sem_nsems; i++)
				1029	otime \|= update_queue(sma, i, wake_q);
				1030	} else {
				1031	/*
				1032	* Check the semaphores that were increased:
				1033	* - No complex ops, thus all sleeping ops are
				1034	* decrease.
				1035	* - if we decreased the value, then any sleeping
				1036	* semaphore ops wont be able to run: If the
				1037	* previous value was too small, then the new
				1038	* value will be too small, too.
				1039	*/
				1040	for (i = 0; i < nsops; i++) {
				1041	if (sops[i].sem_op > 0) {
				1042	otime \|= update_queue(sma,
				1043	sops[i].sem_num, wake_q);
				1044	}
				1045	}
				1046	}
				1047	}
				1048	if (otime)
				1049	set_semotime(sma, sops);
				1050	}
				1051
				1052	/*
				1053	* check_qop: Test if a queued operation sleeps on the semaphore semnum
				1054	*/
				1055	static int check_qop(struct sem_array sma, int semnum, struct sem_queue q,
				1056	bool count_zero)
				1057	{
				1058	struct sembuf *sop = q->blocking;
				1059
				1060	/*
				1061	* Linux always (since 0.99.10) reported a task as sleeping on all
				1062	* semaphores. This violates SUS, therefore it was changed to the
				1063	* standard compliant behavior.
				1064	* Give the administrators a chance to notice that an application
				1065	* might misbehave because it relies on the Linux behavior.
				1066	*/
				1067	pr_info_once("semctl(GETNCNT/GETZCNT) is since 3.16 Single Unix Specification compliant.\n"
				1068	"The task %s (%d) triggered the difference, watch for misbehavior.\n",
				1069	current->comm, task_pid_nr(current));
				1070
				1071	if (sop->sem_num != semnum)
				1072	return 0;
				1073
				1074	if (count_zero && sop->sem_op == 0)
				1075	return 1;
				1076	if (!count_zero && sop->sem_op < 0)
				1077	return 1;
				1078
				1079	return 0;
				1080	}
				1081
				1082	/* The following counts are associated to each semaphore:
				1083	* semncnt number of tasks waiting on semval being nonzero
				1084	* semzcnt number of tasks waiting on semval being zero
				1085	*
				1086	* Per definition, a task waits only on the semaphore of the first semop
				1087	* that cannot proceed, even if additional operation would block, too.
				1088	*/
				1089	static int count_semcnt(struct sem_array *sma, ushort semnum,
				1090	bool count_zero)
				1091	{
				1092	struct list_head *l;
				1093	struct sem_queue *q;
				1094	int semcnt;
				1095
				1096	semcnt = 0;
				1097	/* First: check the simple operations. They are easy to evaluate */
				1098	if (count_zero)
				1099	l = &sma->sems[semnum].pending_const;
				1100	else
				1101	l = &sma->sems[semnum].pending_alter;
				1102
				1103	list_for_each_entry(q, l, list) {
				1104	/* all task on a per-semaphore list sleep on exactly
				1105	* that semaphore
				1106	*/
				1107	semcnt++;
				1108	}
				1109
				1110	/* Then: check the complex operations. */
				1111	list_for_each_entry(q, &sma->pending_alter, list) {
				1112	semcnt += check_qop(sma, semnum, q, count_zero);
				1113	}
				1114	if (count_zero) {
				1115	list_for_each_entry(q, &sma->pending_const, list) {
				1116	semcnt += check_qop(sma, semnum, q, count_zero);
				1117	}
				1118	}
				1119	return semcnt;
				1120	}
				1121
				1122	/* Free a semaphore set. freeary() is called with sem_ids.rwsem locked
				1123	* as a writer and the spinlock for this semaphore set hold. sem_ids.rwsem
				1124	* remains locked on exit.
				1125	*/
				1126	static void freeary(struct ipc_namespace ns, struct kern_ipc_perm ipcp)
				1127	{
				1128	struct sem_undo un, tu;
				1129	struct sem_queue q, tq;
				1130	struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm);
				1131	int i;
				1132	DEFINE_WAKE_Q(wake_q);
				1133
				1134	/* Free the existing undo structures for this semaphore set. */
				1135	ipc_assert_locked_object(&sma->sem_perm);
				1136	list_for_each_entry_safe(un, tu, &sma->list_id, list_id) {
				1137	list_del(&un->list_id);
				1138	spin_lock(&un->ulp->lock);
				1139	un->semid = -1;
				1140	list_del_rcu(&un->list_proc);
				1141	spin_unlock(&un->ulp->lock);
				1142	kfree_rcu(un, rcu);
				1143	}
				1144
				1145	/* Wake up all pending processes and let them fail with EIDRM. */
				1146	list_for_each_entry_safe(q, tq, &sma->pending_const, list) {
				1147	unlink_queue(sma, q);
				1148	wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
				1149	}
				1150
				1151	list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
				1152	unlink_queue(sma, q);
				1153	wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
				1154	}
				1155	for (i = 0; i < sma->sem_nsems; i++) {
				1156	struct sem *sem = &sma->sems[i];
				1157	list_for_each_entry_safe(q, tq, &sem->pending_const, list) {
				1158	unlink_queue(sma, q);
				1159	wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
				1160	}
				1161	list_for_each_entry_safe(q, tq, &sem->pending_alter, list) {
				1162	unlink_queue(sma, q);
				1163	wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
				1164	}
				1165	ipc_update_pid(&sem->sempid, NULL);
				1166	}
				1167
				1168	/* Remove the semaphore set from the IDR */
				1169	sem_rmid(ns, sma);
				1170	sem_unlock(sma, -1);
				1171	rcu_read_unlock();
				1172
				1173	wake_up_q(&wake_q);
				1174	ns->used_sems -= sma->sem_nsems;
				1175	ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
				1176	}
				1177
				1178	static unsigned long copy_semid_to_user(void __user buf, struct semid64_ds in, int version)
				1179	{
				1180	switch (version) {
				1181	case IPC_64:
				1182	return copy_to_user(buf, in, sizeof(*in));
				1183	case IPC_OLD:
				1184	{
				1185	struct semid_ds out;
				1186
				1187	memset(&out, 0, sizeof(out));
				1188
				1189	ipc64_perm_to_ipc_perm(&in->sem_perm, &out.sem_perm);
				1190
				1191	out.sem_otime = in->sem_otime;
				1192	out.sem_ctime = in->sem_ctime;
				1193	out.sem_nsems = in->sem_nsems;
				1194
				1195	return copy_to_user(buf, &out, sizeof(out));
				1196	}
				1197	default:
				1198	return -EINVAL;
				1199	}
				1200	}
				1201
				1202	static time64_t get_semotime(struct sem_array *sma)
				1203	{
				1204	int i;
				1205	time64_t res;
				1206
				1207	res = sma->sems[0].sem_otime;
				1208	for (i = 1; i < sma->sem_nsems; i++) {
				1209	time64_t to = sma->sems[i].sem_otime;
				1210
				1211	if (to > res)
				1212	res = to;
				1213	}
				1214	return res;
				1215	}
				1216
				1217	static int semctl_stat(struct ipc_namespace *ns, int semid,
				1218	int cmd, struct semid64_ds *semid64)
				1219	{
				1220	struct sem_array *sma;
				1221	time64_t semotime;
				1222	int err;
				1223
				1224	memset(semid64, 0, sizeof(*semid64));
				1225
				1226	rcu_read_lock();
				1227	if (cmd == SEM_STAT \|\| cmd == SEM_STAT_ANY) {
				1228	sma = sem_obtain_object(ns, semid);
				1229	if (IS_ERR(sma)) {
				1230	err = PTR_ERR(sma);
				1231	goto out_unlock;
				1232	}
				1233	} else { /* IPC_STAT */
				1234	sma = sem_obtain_object_check(ns, semid);
				1235	if (IS_ERR(sma)) {
				1236	err = PTR_ERR(sma);
				1237	goto out_unlock;
				1238	}
				1239	}
				1240
				1241	/* see comment for SHM_STAT_ANY */
				1242	if (cmd == SEM_STAT_ANY)
				1243	audit_ipc_obj(&sma->sem_perm);
				1244	else {
				1245	err = -EACCES;
				1246	if (ipcperms(ns, &sma->sem_perm, S_IRUGO))
				1247	goto out_unlock;
				1248	}
				1249
				1250	err = security_sem_semctl(&sma->sem_perm, cmd);
				1251	if (err)
				1252	goto out_unlock;
				1253
				1254	ipc_lock_object(&sma->sem_perm);
				1255
				1256	if (!ipc_valid_object(&sma->sem_perm)) {
				1257	ipc_unlock_object(&sma->sem_perm);
				1258	err = -EIDRM;
				1259	goto out_unlock;
				1260	}
				1261
				1262	kernel_to_ipc64_perm(&sma->sem_perm, &semid64->sem_perm);
				1263	semotime = get_semotime(sma);
				1264	semid64->sem_otime = semotime;
				1265	semid64->sem_ctime = sma->sem_ctime;
				1266	#ifndef CONFIG_64BIT
				1267	semid64->sem_otime_high = semotime >> 32;
				1268	semid64->sem_ctime_high = sma->sem_ctime >> 32;
				1269	#endif
				1270	semid64->sem_nsems = sma->sem_nsems;
				1271
				1272	if (cmd == IPC_STAT) {
				1273	/*
				1274	* As defined in SUS:
				1275	* Return 0 on success
				1276	*/
				1277	err = 0;
				1278	} else {
				1279	/*
				1280	* SEM_STAT and SEM_STAT_ANY (both Linux specific)
				1281	* Return the full id, including the sequence number
				1282	*/
				1283	err = sma->sem_perm.id;
				1284	}
				1285	ipc_unlock_object(&sma->sem_perm);
				1286	out_unlock:
				1287	rcu_read_unlock();
				1288	return err;
				1289	}
				1290
				1291	static int semctl_info(struct ipc_namespace *ns, int semid,
				1292	int cmd, void __user *p)
				1293	{
				1294	struct seminfo seminfo;
				1295	int max_idx;
				1296	int err;
				1297
				1298	err = security_sem_semctl(NULL, cmd);
				1299	if (err)
				1300	return err;
				1301
				1302	memset(&seminfo, 0, sizeof(seminfo));
				1303	seminfo.semmni = ns->sc_semmni;
				1304	seminfo.semmns = ns->sc_semmns;
				1305	seminfo.semmsl = ns->sc_semmsl;
				1306	seminfo.semopm = ns->sc_semopm;
				1307	seminfo.semvmx = SEMVMX;
				1308	seminfo.semmnu = SEMMNU;
				1309	seminfo.semmap = SEMMAP;
				1310	seminfo.semume = SEMUME;
				1311	down_read(&sem_ids(ns).rwsem);
				1312	if (cmd == SEM_INFO) {
				1313	seminfo.semusz = sem_ids(ns).in_use;
				1314	seminfo.semaem = ns->used_sems;
				1315	} else {
				1316	seminfo.semusz = SEMUSZ;
				1317	seminfo.semaem = SEMAEM;
				1318	}
				1319	max_idx = ipc_get_maxidx(&sem_ids(ns));
				1320	up_read(&sem_ids(ns).rwsem);
				1321	if (copy_to_user(p, &seminfo, sizeof(struct seminfo)))
				1322	return -EFAULT;
				1323	return (max_idx < 0) ? 0 : max_idx;
				1324	}
				1325
				1326	static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
				1327	int val)
				1328	{
				1329	struct sem_undo *un;
				1330	struct sem_array *sma;
				1331	struct sem *curr;
				1332	int err;
				1333	DEFINE_WAKE_Q(wake_q);
				1334
				1335	if (val > SEMVMX \|\| val < 0)
				1336	return -ERANGE;
				1337
				1338	rcu_read_lock();
				1339	sma = sem_obtain_object_check(ns, semid);
				1340	if (IS_ERR(sma)) {
				1341	rcu_read_unlock();
				1342	return PTR_ERR(sma);
				1343	}
				1344
				1345	if (semnum < 0 \|\| semnum >= sma->sem_nsems) {
				1346	rcu_read_unlock();
				1347	return -EINVAL;
				1348	}
				1349
				1350
				1351	if (ipcperms(ns, &sma->sem_perm, S_IWUGO)) {
				1352	rcu_read_unlock();
				1353	return -EACCES;
				1354	}
				1355
				1356	err = security_sem_semctl(&sma->sem_perm, SETVAL);
				1357	if (err) {
				1358	rcu_read_unlock();
				1359	return -EACCES;
				1360	}
				1361
				1362	sem_lock(sma, NULL, -1);
				1363
				1364	if (!ipc_valid_object(&sma->sem_perm)) {
				1365	sem_unlock(sma, -1);
				1366	rcu_read_unlock();
				1367	return -EIDRM;
				1368	}
				1369
				1370	semnum = array_index_nospec(semnum, sma->sem_nsems);
				1371	curr = &sma->sems[semnum];
				1372
				1373	ipc_assert_locked_object(&sma->sem_perm);
				1374	list_for_each_entry(un, &sma->list_id, list_id)
				1375	un->semadj[semnum] = 0;
				1376
				1377	curr->semval = val;
				1378	ipc_update_pid(&curr->sempid, task_tgid(current));
				1379	sma->sem_ctime = ktime_get_real_seconds();
				1380	/* maybe some queued-up processes were waiting for this */
				1381	do_smart_update(sma, NULL, 0, 0, &wake_q);
				1382	sem_unlock(sma, -1);
				1383	rcu_read_unlock();
				1384	wake_up_q(&wake_q);
				1385	return 0;
				1386	}
				1387
				1388	static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
				1389	int cmd, void __user *p)
				1390	{
				1391	struct sem_array *sma;
				1392	struct sem *curr;
				1393	int err, nsems;
				1394	ushort fast_sem_io[SEMMSL_FAST];
				1395	ushort *sem_io = fast_sem_io;
				1396	DEFINE_WAKE_Q(wake_q);
				1397
				1398	rcu_read_lock();
				1399	sma = sem_obtain_object_check(ns, semid);
				1400	if (IS_ERR(sma)) {
				1401	rcu_read_unlock();
				1402	return PTR_ERR(sma);
				1403	}
				1404
				1405	nsems = sma->sem_nsems;
				1406
				1407	err = -EACCES;
				1408	if (ipcperms(ns, &sma->sem_perm, cmd == SETALL ? S_IWUGO : S_IRUGO))
				1409	goto out_rcu_wakeup;
				1410
				1411	err = security_sem_semctl(&sma->sem_perm, cmd);
				1412	if (err)
				1413	goto out_rcu_wakeup;
				1414
				1415	err = -EACCES;
				1416	switch (cmd) {
				1417	case GETALL:
				1418	{
				1419	ushort __user *array = p;
				1420	int i;
				1421
				1422	sem_lock(sma, NULL, -1);
				1423	if (!ipc_valid_object(&sma->sem_perm)) {
				1424	err = -EIDRM;
				1425	goto out_unlock;
				1426	}
				1427	if (nsems > SEMMSL_FAST) {
				1428	if (!ipc_rcu_getref(&sma->sem_perm)) {
				1429	err = -EIDRM;
				1430	goto out_unlock;
				1431	}
				1432	sem_unlock(sma, -1);
				1433	rcu_read_unlock();
				1434	sem_io = kvmalloc_array(nsems, sizeof(ushort),
				1435	GFP_KERNEL);
				1436	if (sem_io == NULL) {
				1437	ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
				1438	return -ENOMEM;
				1439	}
				1440
				1441	rcu_read_lock();
				1442	sem_lock_and_putref(sma);
				1443	if (!ipc_valid_object(&sma->sem_perm)) {
				1444	err = -EIDRM;
				1445	goto out_unlock;
				1446	}
				1447	}
				1448	for (i = 0; i < sma->sem_nsems; i++)
				1449	sem_io[i] = sma->sems[i].semval;
				1450	sem_unlock(sma, -1);
				1451	rcu_read_unlock();
				1452	err = 0;
				1453	if (copy_to_user(array, sem_io, nsems*sizeof(ushort)))
				1454	err = -EFAULT;
				1455	goto out_free;
				1456	}
				1457	case SETALL:
				1458	{
				1459	int i;
				1460	struct sem_undo *un;
				1461
				1462	if (!ipc_rcu_getref(&sma->sem_perm)) {
				1463	err = -EIDRM;
				1464	goto out_rcu_wakeup;
				1465	}
				1466	rcu_read_unlock();
				1467
				1468	if (nsems > SEMMSL_FAST) {
				1469	sem_io = kvmalloc_array(nsems, sizeof(ushort),
				1470	GFP_KERNEL);
				1471	if (sem_io == NULL) {
				1472	ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
				1473	return -ENOMEM;
				1474	}
				1475	}
				1476
				1477	if (copy_from_user(sem_io, p, nsems*sizeof(ushort))) {
				1478	ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
				1479	err = -EFAULT;
				1480	goto out_free;
				1481	}
				1482
				1483	for (i = 0; i < nsems; i++) {
				1484	if (sem_io[i] > SEMVMX) {
				1485	ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
				1486	err = -ERANGE;
				1487	goto out_free;
				1488	}
				1489	}
				1490	rcu_read_lock();
				1491	sem_lock_and_putref(sma);
				1492	if (!ipc_valid_object(&sma->sem_perm)) {
				1493	err = -EIDRM;
				1494	goto out_unlock;
				1495	}
				1496
				1497	for (i = 0; i < nsems; i++) {
				1498	sma->sems[i].semval = sem_io[i];
				1499	ipc_update_pid(&sma->sems[i].sempid, task_tgid(current));
				1500	}
				1501
				1502	ipc_assert_locked_object(&sma->sem_perm);
				1503	list_for_each_entry(un, &sma->list_id, list_id) {
				1504	for (i = 0; i < nsems; i++)
				1505	un->semadj[i] = 0;
				1506	}
				1507	sma->sem_ctime = ktime_get_real_seconds();
				1508	/* maybe some queued-up processes were waiting for this */
				1509	do_smart_update(sma, NULL, 0, 0, &wake_q);
				1510	err = 0;
				1511	goto out_unlock;
				1512	}
				1513	/* GETVAL, GETPID, GETNCTN, GETZCNT: fall-through */
				1514	}
				1515	err = -EINVAL;
				1516	if (semnum < 0 \|\| semnum >= nsems)
				1517	goto out_rcu_wakeup;
				1518
				1519	sem_lock(sma, NULL, -1);
				1520	if (!ipc_valid_object(&sma->sem_perm)) {
				1521	err = -EIDRM;
				1522	goto out_unlock;
				1523	}
				1524
				1525	semnum = array_index_nospec(semnum, nsems);
				1526	curr = &sma->sems[semnum];
				1527
				1528	switch (cmd) {
				1529	case GETVAL:
				1530	err = curr->semval;
				1531	goto out_unlock;
				1532	case GETPID:
				1533	err = pid_vnr(curr->sempid);
				1534	goto out_unlock;
				1535	case GETNCNT:
				1536	err = count_semcnt(sma, semnum, 0);
				1537	goto out_unlock;
				1538	case GETZCNT:
				1539	err = count_semcnt(sma, semnum, 1);
				1540	goto out_unlock;
				1541	}
				1542
				1543	out_unlock:
				1544	sem_unlock(sma, -1);
				1545	out_rcu_wakeup:
				1546	rcu_read_unlock();
				1547	wake_up_q(&wake_q);
				1548	out_free:
				1549	if (sem_io != fast_sem_io)
				1550	kvfree(sem_io);
				1551	return err;
				1552	}
				1553
				1554	static inline unsigned long
				1555	copy_semid_from_user(struct semid64_ds out, void __user buf, int version)
				1556	{
				1557	switch (version) {
				1558	case IPC_64:
				1559	if (copy_from_user(out, buf, sizeof(*out)))
				1560	return -EFAULT;
				1561	return 0;
				1562	case IPC_OLD:
				1563	{
				1564	struct semid_ds tbuf_old;
				1565
				1566	if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old)))
				1567	return -EFAULT;
				1568
				1569	out->sem_perm.uid = tbuf_old.sem_perm.uid;
				1570	out->sem_perm.gid = tbuf_old.sem_perm.gid;
				1571	out->sem_perm.mode = tbuf_old.sem_perm.mode;
				1572
				1573	return 0;
				1574	}
				1575	default:
				1576	return -EINVAL;
				1577	}
				1578	}
				1579
				1580	/*
				1581	* This function handles some semctl commands which require the rwsem
				1582	* to be held in write mode.
				1583	* NOTE: no locks must be held, the rwsem is taken inside this function.
				1584	*/
				1585	static int semctl_down(struct ipc_namespace *ns, int semid,
				1586	int cmd, struct semid64_ds *semid64)
				1587	{
				1588	struct sem_array *sma;
				1589	int err;
				1590	struct kern_ipc_perm *ipcp;
				1591
				1592	down_write(&sem_ids(ns).rwsem);
				1593	rcu_read_lock();
				1594
				1595	ipcp = ipcctl_obtain_check(ns, &sem_ids(ns), semid, cmd,
				1596	&semid64->sem_perm, 0);
				1597	if (IS_ERR(ipcp)) {
				1598	err = PTR_ERR(ipcp);
				1599	goto out_unlock1;
				1600	}
				1601
				1602	sma = container_of(ipcp, struct sem_array, sem_perm);
				1603
				1604	err = security_sem_semctl(&sma->sem_perm, cmd);
				1605	if (err)
				1606	goto out_unlock1;
				1607
				1608	switch (cmd) {
				1609	case IPC_RMID:
				1610	sem_lock(sma, NULL, -1);
				1611	/* freeary unlocks the ipc object and rcu */
				1612	freeary(ns, ipcp);
				1613	goto out_up;
				1614	case IPC_SET:
				1615	sem_lock(sma, NULL, -1);
				1616	err = ipc_update_perm(&semid64->sem_perm, ipcp);
				1617	if (err)
				1618	goto out_unlock0;
				1619	sma->sem_ctime = ktime_get_real_seconds();
				1620	break;
				1621	default:
				1622	err = -EINVAL;
				1623	goto out_unlock1;
				1624	}
				1625
				1626	out_unlock0:
				1627	sem_unlock(sma, -1);
				1628	out_unlock1:
				1629	rcu_read_unlock();
				1630	out_up:
				1631	up_write(&sem_ids(ns).rwsem);
				1632	return err;
				1633	}
				1634
				1635	static long ksys_semctl(int semid, int semnum, int cmd, unsigned long arg, int version)
				1636	{
				1637	struct ipc_namespace *ns;
				1638	void __user p = (void __user )arg;
				1639	struct semid64_ds semid64;
				1640	int err;
				1641
				1642	if (semid < 0)
				1643	return -EINVAL;
				1644
				1645	ns = current->nsproxy->ipc_ns;
				1646
				1647	switch (cmd) {
				1648	case IPC_INFO:
				1649	case SEM_INFO:
				1650	return semctl_info(ns, semid, cmd, p);
				1651	case IPC_STAT:
				1652	case SEM_STAT:
				1653	case SEM_STAT_ANY:
				1654	err = semctl_stat(ns, semid, cmd, &semid64);
				1655	if (err < 0)
				1656	return err;
				1657	if (copy_semid_to_user(p, &semid64, version))
				1658	err = -EFAULT;
				1659	return err;
				1660	case GETALL:
				1661	case GETVAL:
				1662	case GETPID:
				1663	case GETNCNT:
				1664	case GETZCNT:
				1665	case SETALL:
				1666	return semctl_main(ns, semid, semnum, cmd, p);
				1667	case SETVAL: {
				1668	int val;
				1669	#if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN)
				1670	/* big-endian 64bit */
				1671	val = arg >> 32;
				1672	#else
				1673	/* 32bit or little-endian 64bit */
				1674	val = arg;
				1675	#endif
				1676	return semctl_setval(ns, semid, semnum, val);
				1677	}
				1678	case IPC_SET:
				1679	if (copy_semid_from_user(&semid64, p, version))
				1680	return -EFAULT;
				1681	/* fall through */
				1682	case IPC_RMID:
				1683	return semctl_down(ns, semid, cmd, &semid64);
				1684	default:
				1685	return -EINVAL;
				1686	}
				1687	}
				1688
				1689	SYSCALL_DEFINE4(semctl, int, semid, int, semnum, int, cmd, unsigned long, arg)
				1690	{
				1691	return ksys_semctl(semid, semnum, cmd, arg, IPC_64);
				1692	}
				1693
				1694	#ifdef CONFIG_ARCH_WANT_IPC_PARSE_VERSION
				1695	long ksys_old_semctl(int semid, int semnum, int cmd, unsigned long arg)
				1696	{
				1697	int version = ipc_parse_version(&cmd);
				1698
				1699	return ksys_semctl(semid, semnum, cmd, arg, version);
				1700	}
				1701
				1702	SYSCALL_DEFINE4(old_semctl, int, semid, int, semnum, int, cmd, unsigned long, arg)
				1703	{
				1704	return ksys_old_semctl(semid, semnum, cmd, arg);
				1705	}
				1706	#endif
				1707
				1708	#ifdef CONFIG_COMPAT
				1709
				1710	struct compat_semid_ds {
				1711	struct compat_ipc_perm sem_perm;
				1712	old_time32_t sem_otime;
				1713	old_time32_t sem_ctime;
				1714	compat_uptr_t sem_base;
				1715	compat_uptr_t sem_pending;
				1716	compat_uptr_t sem_pending_last;
				1717	compat_uptr_t undo;
				1718	unsigned short sem_nsems;
				1719	};
				1720
				1721	static int copy_compat_semid_from_user(struct semid64_ds out, void __user buf,
				1722	int version)
				1723	{
				1724	memset(out, 0, sizeof(*out));
				1725	if (version == IPC_64) {
				1726	struct compat_semid64_ds __user *p = buf;
				1727	return get_compat_ipc64_perm(&out->sem_perm, &p->sem_perm);
				1728	} else {
				1729	struct compat_semid_ds __user *p = buf;
				1730	return get_compat_ipc_perm(&out->sem_perm, &p->sem_perm);
				1731	}
				1732	}
				1733
				1734	static int copy_compat_semid_to_user(void __user buf, struct semid64_ds in,
				1735	int version)
				1736	{
				1737	if (version == IPC_64) {
				1738	struct compat_semid64_ds v;
				1739	memset(&v, 0, sizeof(v));
				1740	to_compat_ipc64_perm(&v.sem_perm, &in->sem_perm);
				1741	v.sem_otime = lower_32_bits(in->sem_otime);
				1742	v.sem_otime_high = upper_32_bits(in->sem_otime);
				1743	v.sem_ctime = lower_32_bits(in->sem_ctime);
				1744	v.sem_ctime_high = upper_32_bits(in->sem_ctime);
				1745	v.sem_nsems = in->sem_nsems;
				1746	return copy_to_user(buf, &v, sizeof(v));
				1747	} else {
				1748	struct compat_semid_ds v;
				1749	memset(&v, 0, sizeof(v));
				1750	to_compat_ipc_perm(&v.sem_perm, &in->sem_perm);
				1751	v.sem_otime = in->sem_otime;
				1752	v.sem_ctime = in->sem_ctime;
				1753	v.sem_nsems = in->sem_nsems;
				1754	return copy_to_user(buf, &v, sizeof(v));
				1755	}
				1756	}
				1757
				1758	static long compat_ksys_semctl(int semid, int semnum, int cmd, int arg, int version)
				1759	{
				1760	void __user *p = compat_ptr(arg);
				1761	struct ipc_namespace *ns;
				1762	struct semid64_ds semid64;
				1763	int err;
				1764
				1765	ns = current->nsproxy->ipc_ns;
				1766
				1767	if (semid < 0)
				1768	return -EINVAL;
				1769
				1770	switch (cmd & (~IPC_64)) {
				1771	case IPC_INFO:
				1772	case SEM_INFO:
				1773	return semctl_info(ns, semid, cmd, p);
				1774	case IPC_STAT:
				1775	case SEM_STAT:
				1776	case SEM_STAT_ANY:
				1777	err = semctl_stat(ns, semid, cmd, &semid64);
				1778	if (err < 0)
				1779	return err;
				1780	if (copy_compat_semid_to_user(p, &semid64, version))
				1781	err = -EFAULT;
				1782	return err;
				1783	case GETVAL:
				1784	case GETPID:
				1785	case GETNCNT:
				1786	case GETZCNT:
				1787	case GETALL:
				1788	case SETALL:
				1789	return semctl_main(ns, semid, semnum, cmd, p);
				1790	case SETVAL:
				1791	return semctl_setval(ns, semid, semnum, arg);
				1792	case IPC_SET:
				1793	if (copy_compat_semid_from_user(&semid64, p, version))
				1794	return -EFAULT;
				1795	/* fallthru */
				1796	case IPC_RMID:
				1797	return semctl_down(ns, semid, cmd, &semid64);
				1798	default:
				1799	return -EINVAL;
				1800	}
				1801	}
				1802
				1803	COMPAT_SYSCALL_DEFINE4(semctl, int, semid, int, semnum, int, cmd, int, arg)
				1804	{
				1805	return compat_ksys_semctl(semid, semnum, cmd, arg, IPC_64);
				1806	}
				1807
				1808	#ifdef CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION
				1809	long compat_ksys_old_semctl(int semid, int semnum, int cmd, int arg)
				1810	{
				1811	int version = compat_ipc_parse_version(&cmd);
				1812
				1813	return compat_ksys_semctl(semid, semnum, cmd, arg, version);
				1814	}
				1815
				1816	COMPAT_SYSCALL_DEFINE4(old_semctl, int, semid, int, semnum, int, cmd, int, arg)
				1817	{
				1818	return compat_ksys_old_semctl(semid, semnum, cmd, arg);
				1819	}
				1820	#endif
				1821	#endif
				1822
				1823	/* If the task doesn't already have a undo_list, then allocate one
				1824	* here. We guarantee there is only one thread using this undo list,
				1825	* and current is THE ONE
				1826	*
				1827	* If this allocation and assignment succeeds, but later
				1828	* portions of this code fail, there is no need to free the sem_undo_list.
				1829	* Just let it stay associated with the task, and it'll be freed later
				1830	* at exit time.
				1831	*
				1832	* This can block, so callers must hold no locks.
				1833	*/
				1834	static inline int get_undo_list(struct sem_undo_list **undo_listp)
				1835	{
				1836	struct sem_undo_list *undo_list;
				1837
				1838	undo_list = current->sysvsem.undo_list;
				1839	if (!undo_list) {
				1840	undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL_ACCOUNT);
				1841	if (undo_list == NULL)
				1842	return -ENOMEM;
				1843	spin_lock_init(&undo_list->lock);
				1844	refcount_set(&undo_list->refcnt, 1);
				1845	INIT_LIST_HEAD(&undo_list->list_proc);
				1846
				1847	current->sysvsem.undo_list = undo_list;
				1848	}
				1849	*undo_listp = undo_list;
				1850	return 0;
				1851	}
				1852
				1853	static struct sem_undo __lookup_undo(struct sem_undo_list ulp, int semid)
				1854	{
				1855	struct sem_undo *un;
				1856
				1857	list_for_each_entry_rcu(un, &ulp->list_proc, list_proc,
				1858	spin_is_locked(&ulp->lock)) {
				1859	if (un->semid == semid)
				1860	return un;
				1861	}
				1862	return NULL;
				1863	}
				1864
				1865	static struct sem_undo lookup_undo(struct sem_undo_list ulp, int semid)
				1866	{
				1867	struct sem_undo *un;
				1868
				1869	assert_spin_locked(&ulp->lock);
				1870
				1871	un = __lookup_undo(ulp, semid);
				1872	if (un) {
				1873	list_del_rcu(&un->list_proc);
				1874	list_add_rcu(&un->list_proc, &ulp->list_proc);
				1875	}
				1876	return un;
				1877	}
				1878
				1879	/**
				1880	* find_alloc_undo - lookup (and if not present create) undo array
				1881	* @ns: namespace
				1882	* @semid: semaphore array id
				1883	*
				1884	* The function looks up (and if not present creates) the undo structure.
				1885	* The size of the undo structure depends on the size of the semaphore
				1886	* array, thus the alloc path is not that straightforward.
				1887	* Lifetime-rules: sem_undo is rcu-protected, on success, the function
				1888	* performs a rcu_read_lock().
				1889	*/
				1890	static struct sem_undo find_alloc_undo(struct ipc_namespace ns, int semid)
				1891	{
				1892	struct sem_array *sma;
				1893	struct sem_undo_list *ulp;
				1894	struct sem_undo un, new;
				1895	int nsems, error;
				1896
				1897	error = get_undo_list(&ulp);
				1898	if (error)
				1899	return ERR_PTR(error);
				1900
				1901	rcu_read_lock();
				1902	spin_lock(&ulp->lock);
				1903	un = lookup_undo(ulp, semid);
				1904	spin_unlock(&ulp->lock);
				1905	if (likely(un != NULL))
				1906	goto out;
				1907
				1908	/* no undo structure around - allocate one. */
				1909	/* step 1: figure out the size of the semaphore array */
				1910	sma = sem_obtain_object_check(ns, semid);
				1911	if (IS_ERR(sma)) {
				1912	rcu_read_unlock();
				1913	return ERR_CAST(sma);
				1914	}
				1915
				1916	nsems = sma->sem_nsems;
				1917	if (!ipc_rcu_getref(&sma->sem_perm)) {
				1918	rcu_read_unlock();
				1919	un = ERR_PTR(-EIDRM);
				1920	goto out;
				1921	}
				1922	rcu_read_unlock();
				1923
				1924	/* step 2: allocate new undo structure */
				1925	new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL_ACCOUNT);
				1926	if (!new) {
				1927	ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
				1928	return ERR_PTR(-ENOMEM);
				1929	}
				1930
				1931	/* step 3: Acquire the lock on semaphore array */
				1932	rcu_read_lock();
				1933	sem_lock_and_putref(sma);
				1934	if (!ipc_valid_object(&sma->sem_perm)) {
				1935	sem_unlock(sma, -1);
				1936	rcu_read_unlock();
				1937	kfree(new);
				1938	un = ERR_PTR(-EIDRM);
				1939	goto out;
				1940	}
				1941	spin_lock(&ulp->lock);
				1942
				1943	/*
				1944	* step 4: check for races: did someone else allocate the undo struct?
				1945	*/
				1946	un = lookup_undo(ulp, semid);
				1947	if (un) {
				1948	kfree(new);
				1949	goto success;
				1950	}
				1951	/* step 5: initialize & link new undo structure */
				1952	new->semadj = (short *) &new[1];
				1953	new->ulp = ulp;
				1954	new->semid = semid;
				1955	assert_spin_locked(&ulp->lock);
				1956	list_add_rcu(&new->list_proc, &ulp->list_proc);
				1957	ipc_assert_locked_object(&sma->sem_perm);
				1958	list_add(&new->list_id, &sma->list_id);
				1959	un = new;
				1960
				1961	success:
				1962	spin_unlock(&ulp->lock);
				1963	sem_unlock(sma, -1);
				1964	out:
				1965	return un;
				1966	}
				1967
				1968	static long do_semtimedop(int semid, struct sembuf __user *tsops,
				1969	unsigned nsops, const struct timespec64 *timeout)
				1970	{
				1971	int error = -EINVAL;
				1972	struct sem_array *sma;
				1973	struct sembuf fast_sops[SEMOPM_FAST];
				1974	struct sembuf sops = fast_sops, sop;
				1975	struct sem_undo *un;
				1976	int max, locknum;
				1977	bool undos = false, alter = false, dupsop = false;
				1978	struct sem_queue queue;
				1979	unsigned long dup = 0, jiffies_left = 0;
				1980	struct ipc_namespace *ns;
				1981
				1982	ns = current->nsproxy->ipc_ns;
				1983
				1984	if (nsops < 1 \|\| semid < 0)
				1985	return -EINVAL;
				1986	if (nsops > ns->sc_semopm)
				1987	return -E2BIG;
				1988	if (nsops > SEMOPM_FAST) {
				1989	sops = kvmalloc_array(nsops, sizeof(*sops), GFP_KERNEL);
				1990	if (sops == NULL)
				1991	return -ENOMEM;
				1992	}
				1993
				1994	if (copy_from_user(sops, tsops, nsops * sizeof(*tsops))) {
				1995	error = -EFAULT;
				1996	goto out_free;
				1997	}
				1998
				1999	if (timeout) {
				2000	if (timeout->tv_sec < 0 \|\| timeout->tv_nsec < 0 \|\|
				2001	timeout->tv_nsec >= 1000000000L) {
				2002	error = -EINVAL;
				2003	goto out_free;
				2004	}
				2005	jiffies_left = timespec64_to_jiffies(timeout);
				2006	}
				2007
				2008	max = 0;
				2009	for (sop = sops; sop < sops + nsops; sop++) {
				2010	unsigned long mask = 1ULL << ((sop->sem_num) % BITS_PER_LONG);
				2011
				2012	if (sop->sem_num >= max)
				2013	max = sop->sem_num;
				2014	if (sop->sem_flg & SEM_UNDO)
				2015	undos = true;
				2016	if (dup & mask) {
				2017	/*
				2018	* There was a previous alter access that appears
				2019	* to have accessed the same semaphore, thus use
				2020	* the dupsop logic. "appears", because the detection
				2021	* can only check % BITS_PER_LONG.
				2022	*/
				2023	dupsop = true;
				2024	}
				2025	if (sop->sem_op != 0) {
				2026	alter = true;
				2027	dup \|= mask;
				2028	}
				2029	}
				2030
				2031	if (undos) {
				2032	/* On success, find_alloc_undo takes the rcu_read_lock */
				2033	un = find_alloc_undo(ns, semid);
				2034	if (IS_ERR(un)) {
				2035	error = PTR_ERR(un);
				2036	goto out_free;
				2037	}
				2038	} else {
				2039	un = NULL;
				2040	rcu_read_lock();
				2041	}
				2042
				2043	sma = sem_obtain_object_check(ns, semid);
				2044	if (IS_ERR(sma)) {
				2045	rcu_read_unlock();
				2046	error = PTR_ERR(sma);
				2047	goto out_free;
				2048	}
				2049
				2050	error = -EFBIG;
				2051	if (max >= sma->sem_nsems) {
				2052	rcu_read_unlock();
				2053	goto out_free;
				2054	}
				2055
				2056	error = -EACCES;
				2057	if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO)) {
				2058	rcu_read_unlock();
				2059	goto out_free;
				2060	}
				2061
				2062	error = security_sem_semop(&sma->sem_perm, sops, nsops, alter);
				2063	if (error) {
				2064	rcu_read_unlock();
				2065	goto out_free;
				2066	}
				2067
				2068	error = -EIDRM;
				2069	locknum = sem_lock(sma, sops, nsops);
				2070	/*
				2071	* We eventually might perform the following check in a lockless
				2072	* fashion, considering ipc_valid_object() locking constraints.
				2073	* If nsops == 1 and there is no contention for sem_perm.lock, then
				2074	* only a per-semaphore lock is held and it's OK to proceed with the
				2075	* check below. More details on the fine grained locking scheme
				2076	* entangled here and why it's RMID race safe on comments at sem_lock()
				2077	*/
				2078	if (!ipc_valid_object(&sma->sem_perm))
				2079	goto out_unlock_free;
				2080	/*
				2081	* semid identifiers are not unique - find_alloc_undo may have
				2082	* allocated an undo structure, it was invalidated by an RMID
				2083	* and now a new array with received the same id. Check and fail.
				2084	* This case can be detected checking un->semid. The existence of
				2085	* "un" itself is guaranteed by rcu.
				2086	*/
				2087	if (un && un->semid == -1)
				2088	goto out_unlock_free;
				2089
				2090	queue.sops = sops;
				2091	queue.nsops = nsops;
				2092	queue.undo = un;
				2093	queue.pid = task_tgid(current);
				2094	queue.alter = alter;
				2095	queue.dupsop = dupsop;
				2096
				2097	error = perform_atomic_semop(sma, &queue);
				2098	if (error == 0) { /* non-blocking succesfull path */
				2099	DEFINE_WAKE_Q(wake_q);
				2100
				2101	/*
				2102	* If the operation was successful, then do
				2103	* the required updates.
				2104	*/
				2105	if (alter)
				2106	do_smart_update(sma, sops, nsops, 1, &wake_q);
				2107	else
				2108	set_semotime(sma, sops);
				2109
				2110	sem_unlock(sma, locknum);
				2111	rcu_read_unlock();
				2112	wake_up_q(&wake_q);
				2113
				2114	goto out_free;
				2115	}
				2116	if (error < 0) /* non-blocking error path */
				2117	goto out_unlock_free;
				2118
				2119	/*
				2120	* We need to sleep on this operation, so we put the current
				2121	* task into the pending queue and go to sleep.
				2122	*/
				2123	if (nsops == 1) {
				2124	struct sem *curr;
				2125	int idx = array_index_nospec(sops->sem_num, sma->sem_nsems);
				2126	curr = &sma->sems[idx];
				2127
				2128	if (alter) {
				2129	if (sma->complex_count) {
				2130	list_add_tail(&queue.list,
				2131	&sma->pending_alter);
				2132	} else {
				2133
				2134	list_add_tail(&queue.list,
				2135	&curr->pending_alter);
				2136	}
				2137	} else {
				2138	list_add_tail(&queue.list, &curr->pending_const);
				2139	}
				2140	} else {
				2141	if (!sma->complex_count)
				2142	merge_queues(sma);
				2143
				2144	if (alter)
				2145	list_add_tail(&queue.list, &sma->pending_alter);
				2146	else
				2147	list_add_tail(&queue.list, &sma->pending_const);
				2148
				2149	sma->complex_count++;
				2150	}
				2151
				2152	do {
				2153	WRITE_ONCE(queue.status, -EINTR);
				2154	queue.sleeper = current;
				2155
				2156	__set_current_state(TASK_INTERRUPTIBLE);
				2157	sem_unlock(sma, locknum);
				2158	rcu_read_unlock();
				2159
				2160	if (timeout)
				2161	jiffies_left = schedule_timeout(jiffies_left);
				2162	else
				2163	schedule();
				2164
				2165	/*
				2166	* fastpath: the semop has completed, either successfully or
				2167	* not, from the syscall pov, is quite irrelevant to us at this
				2168	* point; we're done.
				2169	*
				2170	* We _do_ care, nonetheless, about being awoken by a signal or
				2171	* spuriously. The queue.status is checked again in the
				2172	* slowpath (aka after taking sem_lock), such that we can detect
				2173	* scenarios where we were awakened externally, during the
				2174	* window between wake_q_add() and wake_up_q().
				2175	*/
				2176	rcu_read_lock();
				2177	error = READ_ONCE(queue.status);
				2178	if (error != -EINTR) {
				2179	/*
				2180	* User space could assume that semop() is a memory
				2181	* barrier: Without the mb(), the cpu could
				2182	* speculatively read in userspace stale data that was
				2183	* overwritten by the previous owner of the semaphore.
				2184	*/
				2185	smp_mb();
				2186	rcu_read_unlock();
				2187	goto out_free;
				2188	}
				2189
				2190	locknum = sem_lock(sma, sops, nsops);
				2191
				2192	if (!ipc_valid_object(&sma->sem_perm))
				2193	goto out_unlock_free;
				2194
				2195	error = READ_ONCE(queue.status);
				2196
				2197	/*
				2198	* If queue.status != -EINTR we are woken up by another process.
				2199	* Leave without unlink_queue(), but with sem_unlock().
				2200	*/
				2201	if (error != -EINTR)
				2202	goto out_unlock_free;
				2203
				2204	/*
				2205	* If an interrupt occurred we have to clean up the queue.
				2206	*/
				2207	if (timeout && jiffies_left == 0)
				2208	error = -EAGAIN;
				2209	} while (error == -EINTR && !signal_pending(current)); /* spurious */
				2210
				2211	unlink_queue(sma, &queue);
				2212
				2213	out_unlock_free:
				2214	sem_unlock(sma, locknum);
				2215	rcu_read_unlock();
				2216	out_free:
				2217	if (sops != fast_sops)
				2218	kvfree(sops);
				2219	return error;
				2220	}
				2221
				2222	long ksys_semtimedop(int semid, struct sembuf __user *tsops,
				2223	unsigned int nsops, const struct __kernel_timespec __user *timeout)
				2224	{
				2225	if (timeout) {
				2226	struct timespec64 ts;
				2227	if (get_timespec64(&ts, timeout))
				2228	return -EFAULT;
				2229	return do_semtimedop(semid, tsops, nsops, &ts);
				2230	}
				2231	return do_semtimedop(semid, tsops, nsops, NULL);
				2232	}
				2233
				2234	SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
				2235	unsigned int, nsops, const struct __kernel_timespec __user *, timeout)
				2236	{
				2237	return ksys_semtimedop(semid, tsops, nsops, timeout);
				2238	}
				2239
				#ifdef CONFIG_COMPAT_32BIT_TIME
				/*
				 * compat_ksys_semtimedop - semtimedop() entry taking a user-space
				 * struct old_timespec32.
				 *
				 * A non-NULL @timeout is converted with get_old_timespec32(); -EFAULT is
				 * returned if that fails. Otherwise the result of do_semtimedop() is
				 * returned, called without a timeout when @timeout is NULL.
				 */
				long compat_ksys_semtimedop(int semid, struct sembuf __user *tsems,
							    unsigned int nsops,
							    const struct old_timespec32 __user *timeout)
				{
					if (timeout) {
						struct timespec64 ts;
						if (get_old_timespec32(&ts, timeout))
							return -EFAULT;
						return do_semtimedop(semid, tsems, nsops, &ts);
					}
					return do_semtimedop(semid, tsems, nsops, NULL);
				}

				/* semtimedop_time32 system call: forwards to compat_ksys_semtimedop(). */
				SYSCALL_DEFINE4(semtimedop_time32, int, semid, struct sembuf __user *, tsems,
						       unsigned int, nsops,
						       const struct old_timespec32 __user *, timeout)
				{
					return compat_ksys_semtimedop(semid, tsems, nsops, timeout);
				}
				#endif
				2261
				2262	SYSCALL_DEFINE3(semop, int, semid, struct sembuf __user *, tsops,
				2263	unsigned, nsops)
				2264	{
				2265	return do_semtimedop(semid, tsops, nsops, NULL);
				2266	}
				2267
				2268	/* If CLONE_SYSVSEM is set, establish sharing of SEM_UNDO state between
				2269	* parent and child tasks.
				2270	*/
				2271
				2272	int copy_semundo(unsigned long clone_flags, struct task_struct *tsk)
				2273	{
				2274	struct sem_undo_list *undo_list;
				2275	int error;
				2276
				2277	if (clone_flags & CLONE_SYSVSEM) {
				2278	error = get_undo_list(&undo_list);
				2279	if (error)
				2280	return error;
				2281	refcount_inc(&undo_list->refcnt);
				2282	tsk->sysvsem.undo_list = undo_list;
				2283	} else
				2284	tsk->sysvsem.undo_list = NULL;
				2285
				2286	return 0;
				2287	}
				2288
				2289	/*
				2290	* add semadj values to semaphores, free undo structures.
				2291	* undo structures are not freed when semaphore arrays are destroyed
				2292	* so some of them may be out of date.
				2293	* IMPLEMENTATION NOTE: There is some confusion over whether the
				2294	* set of adjustments that needs to be done should be done in an atomic
				2295	* manner or not. That is, if we are attempting to decrement the semval
				2296	* should we queue up and wait until we can do so legally?
				2297	* The original implementation attempted to do this (queue and wait).
				2298	* The current implementation does not do so. The POSIX standard
				2299	* and SVID should be consulted to determine what behavior is mandated.
				2300	*/
				2301	void exit_sem(struct task_struct *tsk)
				2302	{
				2303	struct sem_undo_list *ulp;
				2304
				2305	ulp = tsk->sysvsem.undo_list;
				2306	if (!ulp)
				2307	return;
				2308	tsk->sysvsem.undo_list = NULL;
				2309
				2310	if (!refcount_dec_and_test(&ulp->refcnt))
				2311	return;
				2312
				2313	for (;;) {
				2314	struct sem_array *sma;
				2315	struct sem_undo *un;
				2316	int semid, i;
				2317	DEFINE_WAKE_Q(wake_q);
				2318
				2319	cond_resched();
				2320
				2321	rcu_read_lock();
				2322	un = list_entry_rcu(ulp->list_proc.next,
				2323	struct sem_undo, list_proc);
				2324	if (&un->list_proc == &ulp->list_proc) {
				2325	/*
				2326	* We must wait for freeary() before freeing this ulp,
				2327	* in case we raced with last sem_undo. There is a small
				2328	* possibility where we exit while freeary() didn't
				2329	* finish unlocking sem_undo_list.
				2330	*/
				2331	spin_lock(&ulp->lock);
				2332	spin_unlock(&ulp->lock);
				2333	rcu_read_unlock();
				2334	break;
				2335	}
				2336	spin_lock(&ulp->lock);
				2337	semid = un->semid;
				2338	spin_unlock(&ulp->lock);
				2339
				2340	/* exit_sem raced with IPC_RMID, nothing to do */
				2341	if (semid == -1) {
				2342	rcu_read_unlock();
				2343	continue;
				2344	}
				2345
				2346	sma = sem_obtain_object_check(tsk->nsproxy->ipc_ns, semid);
				2347	/* exit_sem raced with IPC_RMID, nothing to do */
				2348	if (IS_ERR(sma)) {
				2349	rcu_read_unlock();
				2350	continue;
				2351	}
				2352
				2353	sem_lock(sma, NULL, -1);
				2354	/* exit_sem raced with IPC_RMID, nothing to do */
				2355	if (!ipc_valid_object(&sma->sem_perm)) {
				2356	sem_unlock(sma, -1);
				2357	rcu_read_unlock();
				2358	continue;
				2359	}
				2360	un = __lookup_undo(ulp, semid);
				2361	if (un == NULL) {
				2362	/* exit_sem raced with IPC_RMID+semget() that created
				2363	* exactly the same semid. Nothing to do.
				2364	*/
				2365	sem_unlock(sma, -1);
				2366	rcu_read_unlock();
				2367	continue;
				2368	}
				2369
				2370	/* remove un from the linked lists */
				2371	ipc_assert_locked_object(&sma->sem_perm);
				2372	list_del(&un->list_id);
				2373
				2374	spin_lock(&ulp->lock);
				2375	list_del_rcu(&un->list_proc);
				2376	spin_unlock(&ulp->lock);
				2377
				2378	/* perform adjustments registered in un */
				2379	for (i = 0; i < sma->sem_nsems; i++) {
				2380	struct sem *semaphore = &sma->sems[i];
				2381	if (un->semadj[i]) {
				2382	semaphore->semval += un->semadj[i];
				2383	/*
				2384	* Range checks of the new semaphore value,
				2385	* not defined by sus:
				2386	* - Some unices ignore the undo entirely
				2387	* (e.g. HP UX 11i 11.22, Tru64 V5.1)
				2388	* - some cap the value (e.g. FreeBSD caps
				2389	* at 0, but doesn't enforce SEMVMX)
				2390	*
				2391	* Linux caps the semaphore value, both at 0
				2392	* and at SEMVMX.
				2393	*
				2394	* Manfred <manfred@colorfullife.com>
				2395	*/
				2396	if (semaphore->semval < 0)
				2397	semaphore->semval = 0;
				2398	if (semaphore->semval > SEMVMX)
				2399	semaphore->semval = SEMVMX;
				2400	ipc_update_pid(&semaphore->sempid, task_tgid(current));
				2401	}
				2402	}
				2403	/* maybe some queued-up processes were waiting for this */
				2404	do_smart_update(sma, NULL, 0, 1, &wake_q);
				2405	sem_unlock(sma, -1);
				2406	rcu_read_unlock();
				2407	wake_up_q(&wake_q);
				2408
				2409	kfree_rcu(un, rcu);
				2410	}
				2411	kfree(ulp);
				2412	}
				2413
				#ifdef CONFIG_PROC_FS
				/*
				 * sysvipc_sem_proc_show - print one line describing a semaphore array.
				 * @s: seq_file to print into
				 * @it: the struct kern_ipc_perm embedded in the sem_array to show
				 *
				 * Prints key, id, mode, number of semaphores, the uid/gid and cuid/cgid
				 * (translated for the user namespace of @s), the value returned by
				 * get_semotime() and sem_ctime. Always returns 0.
				 */
				static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
				{
					struct user_namespace *user_ns = seq_user_ns(s);
					struct kern_ipc_perm *ipcp = it;
					struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm);
					time64_t sem_otime;

					/*
					 * The proc interface isn't aware of sem_lock(), it calls
					 * ipc_lock_object() directly (in sysvipc_find_ipc).
					 * In order to stay compatible with sem_lock(), we must
					 * enter / leave complex_mode.
					 */
					complexmode_enter(sma);

					sem_otime = get_semotime(sma);

					seq_printf(s,
						   "%10d %10d  %4o %10u %5u %5u %5u %5u %10llu %10llu\n",
						   sma->sem_perm.key,
						   sma->sem_perm.id,
						   sma->sem_perm.mode,
						   sma->sem_nsems,
						   from_kuid_munged(user_ns, sma->sem_perm.uid),
						   from_kgid_munged(user_ns, sma->sem_perm.gid),
						   from_kuid_munged(user_ns, sma->sem_perm.cuid),
						   from_kgid_munged(user_ns, sma->sem_perm.cgid),
						   sem_otime,
						   sma->sem_ctime);

					complexmode_tryleave(sma);

					return 0;
				}
				#endif