Blame - src/kernel/linux/v4.14/ipc/sem.c - T103

blob: 6adc245f3e02cd17fb9c0502e15e0a2a33e2557e [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* linux/ipc/sem.c
				4	* Copyright (C) 1992 Krishna Balasubramanian
				5	* Copyright (C) 1995 Eric Schenk, Bruno Haible
				6	*
				7	* /proc/sysvipc/sem support (c) 1999 Dragos Acostachioaie <dragos@iname.com>
				8	*
				9	* SMP-threaded, sysctl's added
				10	* (c) 1999 Manfred Spraul <manfred@colorfullife.com>
				11	* Enforced range limit on SEM_UNDO
				12	* (c) 2001 Red Hat Inc
				13	* Lockless wakeup
				14	* (c) 2003 Manfred Spraul <manfred@colorfullife.com>
				15	* (c) 2016 Davidlohr Bueso <dave@stgolabs.net>
				16	* Further wakeup optimizations, documentation
				17	* (c) 2010 Manfred Spraul <manfred@colorfullife.com>
				18	*
				19	* support for audit of ipc object properties and permission changes
				20	* Dustin Kirkland <dustin.kirkland@us.ibm.com>
				21	*
				22	* namespaces support
				23	* OpenVZ, SWsoft Inc.
				24	* Pavel Emelianov <xemul@openvz.org>
				25	*
				26	* Implementation notes: (May 2010)
				27	* This file implements System V semaphores.
				28	*
				29	* User space visible behavior:
				30	* - FIFO ordering for semop() operations (just FIFO, not starvation
				31	* protection)
				32	* - multiple semaphore operations that alter the same semaphore in
				33	* one semop() are handled.
				34	* - sem_ctime (time of last semctl()) is updated in the IPC_SET, SETVAL and
				35	* SETALL calls.
				36	* - two Linux specific semctl() commands: SEM_STAT, SEM_INFO.
				37	* - undo adjustments at process exit are limited to 0..SEMVMX.
				38	* - namespaces are supported.
				39	* - SEMMSL, SEMMNS, SEMOPM and SEMMNI can be configured at runtime by writing
				40	* to /proc/sys/kernel/sem.
				41	* - statistics about the usage are reported in /proc/sysvipc/sem.
				42	*
				43	* Internals:
				44	* - scalability:
				45	* - all global variables are read-mostly.
				46	* - semop() calls and semctl(RMID) are synchronized by RCU.
				47	* - most operations do write operations (actually: spin_lock calls) to
				48	* the per-semaphore array structure.
				49	* Thus: Perfect SMP scaling between independent semaphore arrays.
				50	* If multiple semaphores in one array are used, then cache line
				51	* thrashing on the semaphore array spinlock will limit the scaling.
				52	* - semncnt and semzcnt are calculated on demand in count_semcnt()
				53	* - the task that performs a successful semop() scans the list of all
				54	* sleeping tasks and completes any pending operations that can be fulfilled.
				55	* Semaphores are actively given to waiting tasks (necessary for FIFO).
				56	* (see update_queue())
				57	* - To improve the scalability, the actual wake-up calls are performed after
				58	* dropping all locks. (see wake_up_sem_queue_prepare())
				59	* - All work is done by the waker, the woken up task does not have to do
				60	* anything - not even acquiring a lock or dropping a refcount.
				61	* - A woken up task may not even touch the semaphore array anymore, it may
				62	* have been destroyed already by a semctl(RMID).
				63	* - UNDO values are stored in an array (one per process and per
				64	* semaphore array, lazily allocated). For backwards compatibility, multiple
				65	* modes for the UNDO variables are supported (per process, per thread)
				66	* (see copy_semundo, CLONE_SYSVSEM)
				67	* - There are two lists of the pending operations: a per-array list
				68	* and per-semaphore list (stored in the array). This allows to achieve FIFO
				69	* ordering without always scanning all pending operations.
				70	* The worst-case behavior is nevertheless O(N^2) for N wakeups.
				71	*/
				72
				73	#include <linux/slab.h>
				74	#include <linux/spinlock.h>
				75	#include <linux/init.h>
				76	#include <linux/proc_fs.h>
				77	#include <linux/time.h>
				78	#include <linux/security.h>
				79	#include <linux/syscalls.h>
				80	#include <linux/audit.h>
				81	#include <linux/capability.h>
				82	#include <linux/seq_file.h>
				83	#include <linux/rwsem.h>
				84	#include <linux/nsproxy.h>
				85	#include <linux/ipc_namespace.h>
				86	#include <linux/sched/wake_q.h>
				87
				88	#include <linux/uaccess.h>
				89	#include "util.h"
				90
				91
				92	/* One queue for each sleeping process in the system. */
				93	struct sem_queue {
				94	struct list_head list; /* queue of pending operations */
				95	struct task_struct sleeper; / this process */
				96	struct sem_undo undo; / undo structure */
				97	int pid; /* process id of requesting process */
				98	int status; /* completion status of operation */
				99	struct sembuf sops; / array of pending operations */
				100	struct sembuf blocking; / the operation that blocked */
				101	int nsops; /* number of operations */
				102	bool alter; /* does sops alter the array? /
				103	bool dupsop; /* sops on more than one sem_num */
				104	};
				105
				106	/* Each task has a list of undo requests. They are executed automatically
				107	* when the process exits.
				108	*/
				109	struct sem_undo {
				110	struct list_head list_proc; /* per-process list: *
				111	* all undos from one process
				112	* rcu protected */
				113	struct rcu_head rcu; /* rcu struct for sem_undo */
				114	struct sem_undo_list ulp; / back ptr to sem_undo_list */
				115	struct list_head list_id; /* per semaphore array list:
				116	* all undos for one array */
				117	int semid; /* semaphore set identifier */
				118	short semadj; / array of adjustments */
				119	/* one per semaphore */
				120	};
				121
				122	/* sem_undo_list controls shared access to the list of sem_undo structures
				123	* that may be shared among all a CLONE_SYSVSEM task group.
				124	*/
				125	struct sem_undo_list {
				126	refcount_t refcnt;
				127	spinlock_t lock;
				128	struct list_head list_proc;
				129	};
				130
				131
				/* The semaphore ID table of IPC namespace @ns. */
				#define sem_ids(ns)	((ns)->ids[IPC_SEM_IDS])

				/* Forward declarations for functions referenced before their definition. */
				static int newary(struct ipc_namespace *ns, struct ipc_params *params);
				static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp);
				#ifdef CONFIG_PROC_FS
				static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
				#endif
				139
				/* 512 bytes on stack */
				#define SEMMSL_FAST			256
				/* ~ 372 bytes on stack */
				#define SEMOPM_FAST			64

				/*
				 * Hysteresis for the mode switch: going from the mode suitable for
				 * simple ops to the mode for complex ops is costly, so the array is
				 * kept in global-lock mode for a while before switching back.
				 */
				#define USE_GLOBAL_LOCK_HYSTERESIS	10
				149
				150	/*
				151	* Locking:
				152	* a) global sem_lock() for read/write
				153	* sem_undo.id_next,
				154	* sem_array.complex_count,
				155	* sem_array.pending{_alter,_const},
				156	* sem_array.sem_undo
				157	*
				158	* b) global or semaphore sem_lock() for read/write:
				159	* sem_array.sems[i].pending_{const,alter}:
				160	*
				161	* c) special:
				162	* sem_undo_list.list_proc:
				163	* * undo_list->lock for write
				164	* * rcu for read
				165	* use_global_lock:
				166	* * global sem_lock() for write
				167	* * either local or global sem_lock() for read.
				168	*
				169	* Memory ordering:
				170	* Most ordering is enforced by using spin_lock() and spin_unlock().
				171	* The special case is use_global_lock:
				172	* Setting it from non-zero to 0 is a RELEASE, this is ensured by
				173	* using smp_store_release().
				174	* Testing if it is non-zero is an ACQUIRE, this is ensured by using
				175	* smp_load_acquire().
				176	* Setting it from 0 to non-zero must be ordered with regards to
				177	* this smp_load_acquire(), this is guaranteed because the smp_load_acquire()
				178	* is inside a spin_lock() and after a write from 0 to non-zero a
				179	* spin_lock()+spin_unlock() is done.
				180	*/
				181
				182	#define sc_semmsl sem_ctls[0]
				183	#define sc_semmns sem_ctls[1]
				184	#define sc_semopm sem_ctls[2]
				185	#define sc_semmni sem_ctls[3]
				186
				187	int sem_init_ns(struct ipc_namespace *ns)
				188	{
				189	ns->sc_semmsl = SEMMSL;
				190	ns->sc_semmns = SEMMNS;
				191	ns->sc_semopm = SEMOPM;
				192	ns->sc_semmni = SEMMNI;
				193	ns->used_sems = 0;
				194	return ipc_init_ids(&ns->ids[IPC_SEM_IDS]);
				195	}
				196
				#ifdef CONFIG_IPC_NS
				/*
				 * Tear down the semaphore state of @ns: release every remaining array
				 * through freeary(), then destroy the IDR and the key hash table of the
				 * semaphore ID table.
				 */
				void sem_exit_ns(struct ipc_namespace *ns)
				{
					struct ipc_ids *ids = &sem_ids(ns);

					free_ipcs(ns, ids, freeary);
					idr_destroy(&ids->ipcs_idr);
					rhashtable_destroy(&ids->key_ht);
				}
				#endif
				205
				206	int __init sem_init(void)
				207	{
				208	const int err = sem_init_ns(&init_ipc_ns);
				209
				210	ipc_init_proc_interface("sysvipc/sem",
				211	" key semid perms nsems uid gid cuid cgid otime ctime\n",
				212	IPC_SEM_IDS, sysvipc_sem_proc_show);
				213	return err;
				214	}
				215
				216	/**
				217	* unmerge_queues - unmerge queues, if possible.
				218	* @sma: semaphore array
				219	*
				220	* The function unmerges the wait queues if complex_count is 0.
				221	* It must be called prior to dropping the global semaphore array lock.
				222	*/
				223	static void unmerge_queues(struct sem_array *sma)
				224	{
				225	struct sem_queue q, tq;
				226
				227	/* complex operations still around? */
				228	if (sma->complex_count)
				229	return;
				230	/*
				231	* We will switch back to simple mode.
				232	* Move all pending operation back into the per-semaphore
				233	* queues.
				234	*/
				235	list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
				236	struct sem *curr;
				237	curr = &sma->sems[q->sops[0].sem_num];
				238
				239	list_add_tail(&q->list, &curr->pending_alter);
				240	}
				241	INIT_LIST_HEAD(&sma->pending_alter);
				242	}
				243
				244	/**
				245	* merge_queues - merge single semop queues into global queue
				246	* @sma: semaphore array
				247	*
				248	* This function merges all per-semaphore queues into the global queue.
				249	* It is necessary to achieve FIFO ordering for the pending single-sop
				250	* operations when a multi-semop operation must sleep.
				251	* Only the alter operations must be moved, the const operations can stay.
				252	*/
				253	static void merge_queues(struct sem_array *sma)
				254	{
				255	int i;
				256	for (i = 0; i < sma->sem_nsems; i++) {
				257	struct sem *sem = &sma->sems[i];
				258
				259	list_splice_init(&sem->pending_alter, &sma->pending_alter);
				260	}
				261	}
				262
				263	static void sem_rcu_free(struct rcu_head *head)
				264	{
				265	struct kern_ipc_perm *p = container_of(head, struct kern_ipc_perm, rcu);
				266	struct sem_array *sma = container_of(p, struct sem_array, sem_perm);
				267
				268	security_sem_free(sma);
				269	kvfree(sma);
				270	}
				271
				272	/*
				273	* Enter the mode suitable for non-simple operations:
				274	* Caller must own sem_perm.lock.
				275	*/
				276	static void complexmode_enter(struct sem_array *sma)
				277	{
				278	int i;
				279	struct sem *sem;
				280
				281	if (sma->use_global_lock > 0) {
				282	/*
				283	* We are already in global lock mode.
				284	* Nothing to do, just reset the
				285	* counter until we return to simple mode.
				286	*/
				287	sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS;
				288	return;
				289	}
				290	sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS;
				291
				292	for (i = 0; i < sma->sem_nsems; i++) {
				293	sem = &sma->sems[i];
				294	spin_lock(&sem->lock);
				295	spin_unlock(&sem->lock);
				296	}
				297	}
				298
				299	/*
				300	* Try to leave the mode that disallows simple operations:
				301	* Caller must own sem_perm.lock.
				302	*/
				303	static void complexmode_tryleave(struct sem_array *sma)
				304	{
				305	if (sma->complex_count) {
				306	/* Complex ops are sleeping.
				307	* We must stay in complex mode
				308	*/
				309	return;
				310	}
				311	if (sma->use_global_lock == 1) {
				312	/*
				313	* Immediately after setting use_global_lock to 0,
				314	* a simple op can start. Thus: all memory writes
				315	* performed by the current operation must be visible
				316	* before we set use_global_lock to 0.
				317	*/
				318	smp_store_release(&sma->use_global_lock, 0);
				319	} else {
				320	sma->use_global_lock--;
				321	}
				322	}
				323
				324	#define SEM_GLOBAL_LOCK (-1)
				325	/*
				326	* If the request contains only one semaphore operation, and there are
				327	* no complex transactions pending, lock only the semaphore involved.
				328	* Otherwise, lock the entire semaphore array, since we either have
				329	* multiple semaphores in our own semops, or we need to look at
				330	* semaphores from other pending complex operations.
				331	*/
				332	static inline int sem_lock(struct sem_array sma, struct sembuf sops,
				333	int nsops)
				334	{
				335	struct sem *sem;
				336
				337	if (nsops != 1) {
				338	/* Complex operation - acquire a full lock */
				339	ipc_lock_object(&sma->sem_perm);
				340
				341	/* Prevent parallel simple ops */
				342	complexmode_enter(sma);
				343	return SEM_GLOBAL_LOCK;
				344	}
				345
				346	/*
				347	* Only one semaphore affected - try to optimize locking.
				348	* Optimized locking is possible if no complex operation
				349	* is either enqueued or processed right now.
				350	*
				351	* Both facts are tracked by use_global_mode.
				352	*/
				353	sem = &sma->sems[sops->sem_num];
				354
				355	/*
				356	* Initial check for use_global_lock. Just an optimization,
				357	* no locking, no memory barrier.
				358	*/
				359	if (!sma->use_global_lock) {
				360	/*
				361	* It appears that no complex operation is around.
				362	* Acquire the per-semaphore lock.
				363	*/
				364	spin_lock(&sem->lock);
				365
				366	/* pairs with smp_store_release() */
				367	if (!smp_load_acquire(&sma->use_global_lock)) {
				368	/* fast path successful! */
				369	return sops->sem_num;
				370	}
				371	spin_unlock(&sem->lock);
				372	}
				373
				374	/* slow path: acquire the full lock */
				375	ipc_lock_object(&sma->sem_perm);
				376
				377	if (sma->use_global_lock == 0) {
				378	/*
				379	* The use_global_lock mode ended while we waited for
				380	* sma->sem_perm.lock. Thus we must switch to locking
				381	* with sem->lock.
				382	* Unlike in the fast path, there is no need to recheck
				383	* sma->use_global_lock after we have acquired sem->lock:
				384	* We own sma->sem_perm.lock, thus use_global_lock cannot
				385	* change.
				386	*/
				387	spin_lock(&sem->lock);
				388
				389	ipc_unlock_object(&sma->sem_perm);
				390	return sops->sem_num;
				391	} else {
				392	/*
				393	* Not a false alarm, thus continue to use the global lock
				394	* mode. No need for complexmode_enter(), this was done by
				395	* the caller that has set use_global_mode to non-zero.
				396	*/
				397	return SEM_GLOBAL_LOCK;
				398	}
				399	}
				400
				401	static inline void sem_unlock(struct sem_array *sma, int locknum)
				402	{
				403	if (locknum == SEM_GLOBAL_LOCK) {
				404	unmerge_queues(sma);
				405	complexmode_tryleave(sma);
				406	ipc_unlock_object(&sma->sem_perm);
				407	} else {
				408	struct sem *sem = &sma->sems[locknum];
				409	spin_unlock(&sem->lock);
				410	}
				411	}
				412
				413	/*
				414	* sem_lock_(check_) routines are called in the paths where the rwsem
				415	* is not held.
				416	*
				417	* The caller holds the RCU read lock.
				418	*/
				419	static inline struct sem_array sem_obtain_object(struct ipc_namespace ns, int id)
				420	{
				421	struct kern_ipc_perm *ipcp = ipc_obtain_object_idr(&sem_ids(ns), id);
				422
				423	if (IS_ERR(ipcp))
				424	return ERR_CAST(ipcp);
				425
				426	return container_of(ipcp, struct sem_array, sem_perm);
				427	}
				428
				429	static inline struct sem_array sem_obtain_object_check(struct ipc_namespace ns,
				430	int id)
				431	{
				432	struct kern_ipc_perm *ipcp = ipc_obtain_object_check(&sem_ids(ns), id);
				433
				434	if (IS_ERR(ipcp))
				435	return ERR_CAST(ipcp);
				436
				437	return container_of(ipcp, struct sem_array, sem_perm);
				438	}
				439
				440	static inline void sem_lock_and_putref(struct sem_array *sma)
				441	{
				442	sem_lock(sma, NULL, -1);
				443	ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
				444	}
				445
				446	static inline void sem_rmid(struct ipc_namespace ns, struct sem_array s)
				447	{
				448	ipc_rmid(&sem_ids(ns), &s->sem_perm);
				449	}
				450
				451	static struct sem_array *sem_alloc(size_t nsems)
				452	{
				453	struct sem_array *sma;
				454	size_t size;
				455
				456	if (nsems > (INT_MAX - sizeof(*sma)) / sizeof(sma->sems[0]))
				457	return NULL;
				458
				459	size = sizeof(sma) + nsems sizeof(sma->sems[0]);
				460	sma = kvmalloc(size, GFP_KERNEL);
				461	if (unlikely(!sma))
				462	return NULL;
				463
				464	memset(sma, 0, size);
				465
				466	return sma;
				467	}
				468
				469	/**
				470	* newary - Create a new semaphore set
				471	* @ns: namespace
				472	* @params: ptr to the structure that contains key, semflg and nsems
				473	*
				474	* Called with sem_ids.rwsem held (as a writer)
				475	*/
				476	static int newary(struct ipc_namespace ns, struct ipc_params params)
				477	{
				478	int retval;
				479	struct sem_array *sma;
				480	key_t key = params->key;
				481	int nsems = params->u.nsems;
				482	int semflg = params->flg;
				483	int i;
				484
				485	if (!nsems)
				486	return -EINVAL;
				487	if (ns->used_sems + nsems > ns->sc_semmns)
				488	return -ENOSPC;
				489
				490	sma = sem_alloc(nsems);
				491	if (!sma)
				492	return -ENOMEM;
				493
				494	sma->sem_perm.mode = (semflg & S_IRWXUGO);
				495	sma->sem_perm.key = key;
				496
				497	sma->sem_perm.security = NULL;
				498	retval = security_sem_alloc(sma);
				499	if (retval) {
				500	kvfree(sma);
				501	return retval;
				502	}
				503
				504	for (i = 0; i < nsems; i++) {
				505	INIT_LIST_HEAD(&sma->sems[i].pending_alter);
				506	INIT_LIST_HEAD(&sma->sems[i].pending_const);
				507	spin_lock_init(&sma->sems[i].lock);
				508	}
				509
				510	sma->complex_count = 0;
				511	sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS;
				512	INIT_LIST_HEAD(&sma->pending_alter);
				513	INIT_LIST_HEAD(&sma->pending_const);
				514	INIT_LIST_HEAD(&sma->list_id);
				515	sma->sem_nsems = nsems;
				516	sma->sem_ctime = ktime_get_real_seconds();
				517
				518	retval = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni);
				519	if (retval < 0) {
				520	call_rcu(&sma->sem_perm.rcu, sem_rcu_free);
				521	return retval;
				522	}
				523	ns->used_sems += nsems;
				524
				525	sem_unlock(sma, -1);
				526	rcu_read_unlock();
				527
				528	return sma->sem_perm.id;
				529	}
				530
				531
				532	/*
				533	* Called with sem_ids.rwsem and ipcp locked.
				534	*/
				535	static inline int sem_security(struct kern_ipc_perm *ipcp, int semflg)
				536	{
				537	struct sem_array *sma;
				538
				539	sma = container_of(ipcp, struct sem_array, sem_perm);
				540	return security_sem_associate(sma, semflg);
				541	}
				542
				543	/*
				544	* Called with sem_ids.rwsem and ipcp locked.
				545	*/
				546	static inline int sem_more_checks(struct kern_ipc_perm *ipcp,
				547	struct ipc_params *params)
				548	{
				549	struct sem_array *sma;
				550
				551	sma = container_of(ipcp, struct sem_array, sem_perm);
				552	if (params->u.nsems > sma->sem_nsems)
				553	return -EINVAL;
				554
				555	return 0;
				556	}
				557
				558	SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg)
				559	{
				560	struct ipc_namespace *ns;
				561	static const struct ipc_ops sem_ops = {
				562	.getnew = newary,
				563	.associate = sem_security,
				564	.more_checks = sem_more_checks,
				565	};
				566	struct ipc_params sem_params;
				567
				568	ns = current->nsproxy->ipc_ns;
				569
				570	if (nsems < 0 \|\| nsems > ns->sc_semmsl)
				571	return -EINVAL;
				572
				573	sem_params.key = key;
				574	sem_params.flg = semflg;
				575	sem_params.u.nsems = nsems;
				576
				577	return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params);
				578	}
				579
				580	/**
				581	* perform_atomic_semop[_slow] - Attempt to perform semaphore
				582	* operations on a given array.
				583	* @sma: semaphore array
				584	* @q: struct sem_queue that describes the operation
				585	*
				586	* Caller blocking are as follows, based the value
				587	* indicated by the semaphore operation (sem_op):
				588	*
				589	* (1) >0 never blocks.
				590	* (2) 0 (wait-for-zero operation): semval is non-zero.
				591	* (3) <0 attempting to decrement semval to a value smaller than zero.
				592	*
				593	* Returns 0 if the operation was possible.
				594	* Returns 1 if the operation is impossible, the caller must sleep.
				595	* Returns <0 for error codes.
				596	*/
				597	static int perform_atomic_semop_slow(struct sem_array sma, struct sem_queue q)
				598	{
				599	int result, sem_op, nsops, pid;
				600	struct sembuf *sop;
				601	struct sem *curr;
				602	struct sembuf *sops;
				603	struct sem_undo *un;
				604
				605	sops = q->sops;
				606	nsops = q->nsops;
				607	un = q->undo;
				608
				609	for (sop = sops; sop < sops + nsops; sop++) {
				610	curr = &sma->sems[sop->sem_num];
				611	sem_op = sop->sem_op;
				612	result = curr->semval;
				613
				614	if (!sem_op && result)
				615	goto would_block;
				616
				617	result += sem_op;
				618	if (result < 0)
				619	goto would_block;
				620	if (result > SEMVMX)
				621	goto out_of_range;
				622
				623	if (sop->sem_flg & SEM_UNDO) {
				624	int undo = un->semadj[sop->sem_num] - sem_op;
				625	/* Exceeding the undo range is an error. */
				626	if (undo < (-SEMAEM - 1) \|\| undo > SEMAEM)
				627	goto out_of_range;
				628	un->semadj[sop->sem_num] = undo;
				629	}
				630
				631	curr->semval = result;
				632	}
				633
				634	sop--;
				635	pid = q->pid;
				636	while (sop >= sops) {
				637	sma->sems[sop->sem_num].sempid = pid;
				638	sop--;
				639	}
				640
				641	return 0;
				642
				643	out_of_range:
				644	result = -ERANGE;
				645	goto undo;
				646
				647	would_block:
				648	q->blocking = sop;
				649
				650	if (sop->sem_flg & IPC_NOWAIT)
				651	result = -EAGAIN;
				652	else
				653	result = 1;
				654
				655	undo:
				656	sop--;
				657	while (sop >= sops) {
				658	sem_op = sop->sem_op;
				659	sma->sems[sop->sem_num].semval -= sem_op;
				660	if (sop->sem_flg & SEM_UNDO)
				661	un->semadj[sop->sem_num] += sem_op;
				662	sop--;
				663	}
				664
				665	return result;
				666	}
				667
				668	static int perform_atomic_semop(struct sem_array sma, struct sem_queue q)
				669	{
				670	int result, sem_op, nsops;
				671	struct sembuf *sop;
				672	struct sem *curr;
				673	struct sembuf *sops;
				674	struct sem_undo *un;
				675
				676	sops = q->sops;
				677	nsops = q->nsops;
				678	un = q->undo;
				679
				680	if (unlikely(q->dupsop))
				681	return perform_atomic_semop_slow(sma, q);
				682
				683	/*
				684	* We scan the semaphore set twice, first to ensure that the entire
				685	* operation can succeed, therefore avoiding any pointless writes
				686	* to shared memory and having to undo such changes in order to block
				687	* until the operations can go through.
				688	*/
				689	for (sop = sops; sop < sops + nsops; sop++) {
				690	curr = &sma->sems[sop->sem_num];
				691	sem_op = sop->sem_op;
				692	result = curr->semval;
				693
				694	if (!sem_op && result)
				695	goto would_block; /* wait-for-zero */
				696
				697	result += sem_op;
				698	if (result < 0)
				699	goto would_block;
				700
				701	if (result > SEMVMX)
				702	return -ERANGE;
				703
				704	if (sop->sem_flg & SEM_UNDO) {
				705	int undo = un->semadj[sop->sem_num] - sem_op;
				706
				707	/* Exceeding the undo range is an error. */
				708	if (undo < (-SEMAEM - 1) \|\| undo > SEMAEM)
				709	return -ERANGE;
				710	}
				711	}
				712
				713	for (sop = sops; sop < sops + nsops; sop++) {
				714	curr = &sma->sems[sop->sem_num];
				715	sem_op = sop->sem_op;
				716	result = curr->semval;
				717
				718	if (sop->sem_flg & SEM_UNDO) {
				719	int undo = un->semadj[sop->sem_num] - sem_op;
				720
				721	un->semadj[sop->sem_num] = undo;
				722	}
				723	curr->semval += sem_op;
				724	curr->sempid = q->pid;
				725	}
				726
				727	return 0;
				728
				729	would_block:
				730	q->blocking = sop;
				731	return sop->sem_flg & IPC_NOWAIT ? -EAGAIN : 1;
				732	}
				733
				734	static inline void wake_up_sem_queue_prepare(struct sem_queue *q, int error,
				735	struct wake_q_head *wake_q)
				736	{
				737	wake_q_add(wake_q, q->sleeper);
				738	/*
				739	* Rely on the above implicit barrier, such that we can
				740	* ensure that we hold reference to the task before setting
				741	* q->status. Otherwise we could race with do_exit if the
				742	* task is awoken by an external event before calling
				743	* wake_up_process().
				744	*/
				745	WRITE_ONCE(q->status, error);
				746	}
				747
				748	static void unlink_queue(struct sem_array sma, struct sem_queue q)
				749	{
				750	list_del(&q->list);
				751	if (q->nsops > 1)
				752	sma->complex_count--;
				753	}
				754
				755	/** check_restart(sma, q)
				756	* @sma: semaphore array
				757	* @q: the operation that just completed
				758	*
				759	* update_queue is O(N^2) when it restarts scanning the whole queue of
				760	* waiting operations. Therefore this function checks if the restart is
				761	* really necessary. It is called after a previously waiting operation
				762	* modified the array.
				763	* Note that wait-for-zero operations are handled without restart.
				764	*/
				765	static inline int check_restart(struct sem_array sma, struct sem_queue q)
				766	{
				767	/* pending complex alter operations are too difficult to analyse */
				768	if (!list_empty(&sma->pending_alter))
				769	return 1;
				770
				771	/* we were a sleeping complex operation. Too difficult */
				772	if (q->nsops > 1)
				773	return 1;
				774
				775	/* It is impossible that someone waits for the new value:
				776	* - complex operations always restart.
				777	* - wait-for-zero are handled seperately.
				778	* - q is a previously sleeping simple operation that
				779	* altered the array. It must be a decrement, because
				780	* simple increments never sleep.
				781	* - If there are older (higher priority) decrements
				782	* in the queue, then they have observed the original
				783	* semval value and couldn't proceed. The operation
				784	* decremented to value - thus they won't proceed either.
				785	*/
				786	return 0;
				787	}
				788
				789	/**
				790	* wake_const_ops - wake up non-alter tasks
				791	* @sma: semaphore array.
				792	* @semnum: semaphore that was modified.
				793	* @wake_q: lockless wake-queue head.
				794	*
				795	* wake_const_ops must be called after a semaphore in a semaphore array
				796	* was set to 0. If complex const operations are pending, wake_const_ops must
				797	* be called with semnum = -1, as well as with the number of each modified
				798	* semaphore.
				799	* The tasks that must be woken up are added to @wake_q. The return code
				800	* is stored in q->pid.
				801	* The function returns 1 if at least one operation was completed successfully.
				802	*/
				803	static int wake_const_ops(struct sem_array *sma, int semnum,
				804	struct wake_q_head *wake_q)
				805	{
				806	struct sem_queue q, tmp;
				807	struct list_head *pending_list;
				808	int semop_completed = 0;
				809
				810	if (semnum == -1)
				811	pending_list = &sma->pending_const;
				812	else
				813	pending_list = &sma->sems[semnum].pending_const;
				814
				815	list_for_each_entry_safe(q, tmp, pending_list, list) {
				816	int error = perform_atomic_semop(sma, q);
				817
				818	if (error > 0)
				819	continue;
				820	/* operation completed, remove from queue & wakeup */
				821	unlink_queue(sma, q);
				822
				823	wake_up_sem_queue_prepare(q, error, wake_q);
				824	if (error == 0)
				825	semop_completed = 1;
				826	}
				827
				828	return semop_completed;
				829	}
				830
				831	/**
				832	* do_smart_wakeup_zero - wakeup all wait for zero tasks
				833	* @sma: semaphore array
				834	* @sops: operations that were performed
				835	* @nsops: number of operations
				836	* @wake_q: lockless wake-queue head
				837	*
				838	* Checks all required queue for wait-for-zero operations, based
				839	* on the actual changes that were performed on the semaphore array.
				840	* The function returns 1 if at least one operation was completed successfully.
				841	*/
				842	static int do_smart_wakeup_zero(struct sem_array sma, struct sembuf sops,
				843	int nsops, struct wake_q_head *wake_q)
				844	{
				845	int i;
				846	int semop_completed = 0;
				847	int got_zero = 0;
				848
				849	/* first: the per-semaphore queues, if known */
				850	if (sops) {
				851	for (i = 0; i < nsops; i++) {
				852	int num = sops[i].sem_num;
				853
				854	if (sma->sems[num].semval == 0) {
				855	got_zero = 1;
				856	semop_completed \|= wake_const_ops(sma, num, wake_q);
				857	}
				858	}
				859	} else {
				860	/*
				861	* No sops means modified semaphores not known.
				862	* Assume all were changed.
				863	*/
				864	for (i = 0; i < sma->sem_nsems; i++) {
				865	if (sma->sems[i].semval == 0) {
				866	got_zero = 1;
				867	semop_completed \|= wake_const_ops(sma, i, wake_q);
				868	}
				869	}
				870	}
				871	/*
				872	* If one of the modified semaphores got 0,
				873	* then check the global queue, too.
				874	*/
				875	if (got_zero)
				876	semop_completed \|= wake_const_ops(sma, -1, wake_q);
				877
				878	return semop_completed;
				879	}
				880
				881
				882	/**
				883	* update_queue - look for tasks that can be completed.
				884	* @sma: semaphore array.
				885	* @semnum: semaphore that was modified.
				886	* @wake_q: lockless wake-queue head.
				887	*
				888	* update_queue must be called after a semaphore in a semaphore array
				889	* was modified. If multiple semaphores were modified, update_queue must
				890	* be called with semnum = -1, as well as with the number of each modified
				891	* semaphore.
				892	* The tasks that must be woken up are added to @wake_q. The return code
				893	* is stored in q->pid.
				894	* The function internally checks if const operations can now succeed.
				895	*
				896	* The function return 1 if at least one semop was completed successfully.
				897	*/
				898	static int update_queue(struct sem_array sma, int semnum, struct wake_q_head wake_q)
				899	{
				900	struct sem_queue q, tmp;
				901	struct list_head *pending_list;
				902	int semop_completed = 0;
				903
				904	if (semnum == -1)
				905	pending_list = &sma->pending_alter;
				906	else
				907	pending_list = &sma->sems[semnum].pending_alter;
				908
				909	again:
				910	list_for_each_entry_safe(q, tmp, pending_list, list) {
				911	int error, restart;
				912
				913	/* If we are scanning the single sop, per-semaphore list of
				914	* one semaphore and that semaphore is 0, then it is not
				915	* necessary to scan further: simple increments
				916	* that affect only one entry succeed immediately and cannot
				917	* be in the per semaphore pending queue, and decrements
				918	* cannot be successful if the value is already 0.
				919	*/
				920	if (semnum != -1 && sma->sems[semnum].semval == 0)
				921	break;
				922
				923	error = perform_atomic_semop(sma, q);
				924
				925	/* Does q->sleeper still need to sleep? */
				926	if (error > 0)
				927	continue;
				928
				929	unlink_queue(sma, q);
				930
				931	if (error) {
				932	restart = 0;
				933	} else {
				934	semop_completed = 1;
				935	do_smart_wakeup_zero(sma, q->sops, q->nsops, wake_q);
				936	restart = check_restart(sma, q);
				937	}
				938
				939	wake_up_sem_queue_prepare(q, error, wake_q);
				940	if (restart)
				941	goto again;
				942	}
				943	return semop_completed;
				944	}
				945
				946	/**
				947	* set_semotime - set sem_otime
				948	* @sma: semaphore array
				949	* @sops: operations that modified the array, may be NULL
				950	*
				951	* sem_otime is replicated to avoid cache line trashing.
				952	* This function sets one instance to the current time.
				953	*/
				954	static void set_semotime(struct sem_array sma, struct sembuf sops)
				955	{
				956	if (sops == NULL) {
				957	sma->sems[0].sem_otime = get_seconds();
				958	} else {
				959	sma->sems[sops[0].sem_num].sem_otime =
				960	get_seconds();
				961	}
				962	}
				963
				964	/**
				965	* do_smart_update - optimized update_queue
				966	* @sma: semaphore array
				967	* @sops: operations that were performed
				968	* @nsops: number of operations
				969	* @otime: force setting otime
				970	* @wake_q: lockless wake-queue head
				971	*
				972	* do_smart_update() does the required calls to update_queue and wakeup_zero,
				973	* based on the actual changes that were performed on the semaphore array.
				974	* Note that the function does not do the actual wake-up: the caller is
				975	* responsible for calling wake_up_q().
				976	* It is safe to perform this call after dropping all locks.
				977	*/
				978	static void do_smart_update(struct sem_array sma, struct sembuf sops, int nsops,
				979	int otime, struct wake_q_head *wake_q)
				980	{
				981	int i;
				982
				983	otime \|= do_smart_wakeup_zero(sma, sops, nsops, wake_q);
				984
				985	if (!list_empty(&sma->pending_alter)) {
				986	/* semaphore array uses the global queue - just process it. */
				987	otime \|= update_queue(sma, -1, wake_q);
				988	} else {
				989	if (!sops) {
				990	/*
				991	* No sops, thus the modified semaphores are not
				992	* known. Check all.
				993	*/
				994	for (i = 0; i < sma->sem_nsems; i++)
				995	otime \|= update_queue(sma, i, wake_q);
				996	} else {
				997	/*
				998	* Check the semaphores that were increased:
				999	* - No complex ops, thus all sleeping ops are
				1000	* decrease.
				1001	* - if we decreased the value, then any sleeping
				1002	* semaphore ops wont be able to run: If the
				1003	* previous value was too small, then the new
				1004	* value will be too small, too.
				1005	*/
				1006	for (i = 0; i < nsops; i++) {
				1007	if (sops[i].sem_op > 0) {
				1008	otime \|= update_queue(sma,
				1009	sops[i].sem_num, wake_q);
				1010	}
				1011	}
				1012	}
				1013	}
				1014	if (otime)
				1015	set_semotime(sma, sops);
				1016	}
				1017
				1018	/*
				1019	* check_qop: Test if a queued operation sleeps on the semaphore semnum
				1020	*/
				1021	static int check_qop(struct sem_array sma, int semnum, struct sem_queue q,
				1022	bool count_zero)
				1023	{
				1024	struct sembuf *sop = q->blocking;
				1025
				1026	/*
				1027	* Linux always (since 0.99.10) reported a task as sleeping on all
				1028	* semaphores. This violates SUS, therefore it was changed to the
				1029	* standard compliant behavior.
				1030	* Give the administrators a chance to notice that an application
				1031	* might misbehave because it relies on the Linux behavior.
				1032	*/
				1033	pr_info_once("semctl(GETNCNT/GETZCNT) is since 3.16 Single Unix Specification compliant.\n"
				1034	"The task %s (%d) triggered the difference, watch for misbehavior.\n",
				1035	current->comm, task_pid_nr(current));
				1036
				1037	if (sop->sem_num != semnum)
				1038	return 0;
				1039
				1040	if (count_zero && sop->sem_op == 0)
				1041	return 1;
				1042	if (!count_zero && sop->sem_op < 0)
				1043	return 1;
				1044
				1045	return 0;
				1046	}
				1047
				1048	/* The following counts are associated to each semaphore:
				1049	* semncnt number of tasks waiting on semval being nonzero
				1050	* semzcnt number of tasks waiting on semval being zero
				1051	*
				1052	* Per definition, a task waits only on the semaphore of the first semop
				1053	* that cannot proceed, even if additional operation would block, too.
				1054	*/
				1055	static int count_semcnt(struct sem_array *sma, ushort semnum,
				1056	bool count_zero)
				1057	{
				1058	struct list_head *l;
				1059	struct sem_queue *q;
				1060	int semcnt;
				1061
				1062	semcnt = 0;
				1063	/* First: check the simple operations. They are easy to evaluate */
				1064	if (count_zero)
				1065	l = &sma->sems[semnum].pending_const;
				1066	else
				1067	l = &sma->sems[semnum].pending_alter;
				1068
				1069	list_for_each_entry(q, l, list) {
				1070	/* all task on a per-semaphore list sleep on exactly
				1071	* that semaphore
				1072	*/
				1073	semcnt++;
				1074	}
				1075
				1076	/* Then: check the complex operations. */
				1077	list_for_each_entry(q, &sma->pending_alter, list) {
				1078	semcnt += check_qop(sma, semnum, q, count_zero);
				1079	}
				1080	if (count_zero) {
				1081	list_for_each_entry(q, &sma->pending_const, list) {
				1082	semcnt += check_qop(sma, semnum, q, count_zero);
				1083	}
				1084	}
				1085	return semcnt;
				1086	}
				1087
				1088	/* Free a semaphore set. freeary() is called with sem_ids.rwsem locked
				1089	* as a writer and the spinlock for this semaphore set hold. sem_ids.rwsem
				1090	* remains locked on exit.
				1091	*/
				1092	static void freeary(struct ipc_namespace ns, struct kern_ipc_perm ipcp)
				1093	{
				1094	struct sem_undo un, tu;
				1095	struct sem_queue q, tq;
				1096	struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm);
				1097	int i;
				1098	DEFINE_WAKE_Q(wake_q);
				1099
				1100	/* Free the existing undo structures for this semaphore set. */
				1101	ipc_assert_locked_object(&sma->sem_perm);
				1102	list_for_each_entry_safe(un, tu, &sma->list_id, list_id) {
				1103	list_del(&un->list_id);
				1104	spin_lock(&un->ulp->lock);
				1105	un->semid = -1;
				1106	list_del_rcu(&un->list_proc);
				1107	spin_unlock(&un->ulp->lock);
				1108	kfree_rcu(un, rcu);
				1109	}
				1110
				1111	/* Wake up all pending processes and let them fail with EIDRM. */
				1112	list_for_each_entry_safe(q, tq, &sma->pending_const, list) {
				1113	unlink_queue(sma, q);
				1114	wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
				1115	}
				1116
				1117	list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
				1118	unlink_queue(sma, q);
				1119	wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
				1120	}
				1121	for (i = 0; i < sma->sem_nsems; i++) {
				1122	struct sem *sem = &sma->sems[i];
				1123	list_for_each_entry_safe(q, tq, &sem->pending_const, list) {
				1124	unlink_queue(sma, q);
				1125	wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
				1126	}
				1127	list_for_each_entry_safe(q, tq, &sem->pending_alter, list) {
				1128	unlink_queue(sma, q);
				1129	wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
				1130	}
				1131	}
				1132
				1133	/* Remove the semaphore set from the IDR */
				1134	sem_rmid(ns, sma);
				1135	sem_unlock(sma, -1);
				1136	rcu_read_unlock();
				1137
				1138	wake_up_q(&wake_q);
				1139	ns->used_sems -= sma->sem_nsems;
				1140	ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
				1141	}
				1142
				1143	static unsigned long copy_semid_to_user(void __user buf, struct semid64_ds in, int version)
				1144	{
				1145	switch (version) {
				1146	case IPC_64:
				1147	return copy_to_user(buf, in, sizeof(*in));
				1148	case IPC_OLD:
				1149	{
				1150	struct semid_ds out;
				1151
				1152	memset(&out, 0, sizeof(out));
				1153
				1154	ipc64_perm_to_ipc_perm(&in->sem_perm, &out.sem_perm);
				1155
				1156	out.sem_otime = in->sem_otime;
				1157	out.sem_ctime = in->sem_ctime;
				1158	out.sem_nsems = in->sem_nsems;
				1159
				1160	return copy_to_user(buf, &out, sizeof(out));
				1161	}
				1162	default:
				1163	return -EINVAL;
				1164	}
				1165	}
				1166
				1167	static time64_t get_semotime(struct sem_array *sma)
				1168	{
				1169	int i;
				1170	time64_t res;
				1171
				1172	res = sma->sems[0].sem_otime;
				1173	for (i = 1; i < sma->sem_nsems; i++) {
				1174	time64_t to = sma->sems[i].sem_otime;
				1175
				1176	if (to > res)
				1177	res = to;
				1178	}
				1179	return res;
				1180	}
				1181
				1182	static int semctl_stat(struct ipc_namespace *ns, int semid,
				1183	int cmd, struct semid64_ds *semid64)
				1184	{
				1185	struct sem_array *sma;
				1186	int id = 0;
				1187	int err;
				1188
				1189	memset(semid64, 0, sizeof(*semid64));
				1190
				1191	rcu_read_lock();
				1192	if (cmd == SEM_STAT) {
				1193	sma = sem_obtain_object(ns, semid);
				1194	if (IS_ERR(sma)) {
				1195	err = PTR_ERR(sma);
				1196	goto out_unlock;
				1197	}
				1198	id = sma->sem_perm.id;
				1199	} else {
				1200	sma = sem_obtain_object_check(ns, semid);
				1201	if (IS_ERR(sma)) {
				1202	err = PTR_ERR(sma);
				1203	goto out_unlock;
				1204	}
				1205	}
				1206
				1207	err = -EACCES;
				1208	if (ipcperms(ns, &sma->sem_perm, S_IRUGO))
				1209	goto out_unlock;
				1210
				1211	err = security_sem_semctl(sma, cmd);
				1212	if (err)
				1213	goto out_unlock;
				1214
				1215	kernel_to_ipc64_perm(&sma->sem_perm, &semid64->sem_perm);
				1216	semid64->sem_otime = get_semotime(sma);
				1217	semid64->sem_ctime = sma->sem_ctime;
				1218	semid64->sem_nsems = sma->sem_nsems;
				1219	rcu_read_unlock();
				1220	return id;
				1221
				1222	out_unlock:
				1223	rcu_read_unlock();
				1224	return err;
				1225	}
				1226
				1227	static int semctl_info(struct ipc_namespace *ns, int semid,
				1228	int cmd, void __user *p)
				1229	{
				1230	struct seminfo seminfo;
				1231	int max_id;
				1232	int err;
				1233
				1234	err = security_sem_semctl(NULL, cmd);
				1235	if (err)
				1236	return err;
				1237
				1238	memset(&seminfo, 0, sizeof(seminfo));
				1239	seminfo.semmni = ns->sc_semmni;
				1240	seminfo.semmns = ns->sc_semmns;
				1241	seminfo.semmsl = ns->sc_semmsl;
				1242	seminfo.semopm = ns->sc_semopm;
				1243	seminfo.semvmx = SEMVMX;
				1244	seminfo.semmnu = SEMMNU;
				1245	seminfo.semmap = SEMMAP;
				1246	seminfo.semume = SEMUME;
				1247	down_read(&sem_ids(ns).rwsem);
				1248	if (cmd == SEM_INFO) {
				1249	seminfo.semusz = sem_ids(ns).in_use;
				1250	seminfo.semaem = ns->used_sems;
				1251	} else {
				1252	seminfo.semusz = SEMUSZ;
				1253	seminfo.semaem = SEMAEM;
				1254	}
				1255	max_id = ipc_get_maxid(&sem_ids(ns));
				1256	up_read(&sem_ids(ns).rwsem);
				1257	if (copy_to_user(p, &seminfo, sizeof(struct seminfo)))
				1258	return -EFAULT;
				1259	return (max_id < 0) ? 0 : max_id;
				1260	}
				1261
				1262	static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
				1263	int val)
				1264	{
				1265	struct sem_undo *un;
				1266	struct sem_array *sma;
				1267	struct sem *curr;
				1268	int err;
				1269	DEFINE_WAKE_Q(wake_q);
				1270
				1271	if (val > SEMVMX \|\| val < 0)
				1272	return -ERANGE;
				1273
				1274	rcu_read_lock();
				1275	sma = sem_obtain_object_check(ns, semid);
				1276	if (IS_ERR(sma)) {
				1277	rcu_read_unlock();
				1278	return PTR_ERR(sma);
				1279	}
				1280
				1281	if (semnum < 0 \|\| semnum >= sma->sem_nsems) {
				1282	rcu_read_unlock();
				1283	return -EINVAL;
				1284	}
				1285
				1286
				1287	if (ipcperms(ns, &sma->sem_perm, S_IWUGO)) {
				1288	rcu_read_unlock();
				1289	return -EACCES;
				1290	}
				1291
				1292	err = security_sem_semctl(sma, SETVAL);
				1293	if (err) {
				1294	rcu_read_unlock();
				1295	return -EACCES;
				1296	}
				1297
				1298	sem_lock(sma, NULL, -1);
				1299
				1300	if (!ipc_valid_object(&sma->sem_perm)) {
				1301	sem_unlock(sma, -1);
				1302	rcu_read_unlock();
				1303	return -EIDRM;
				1304	}
				1305
				1306	curr = &sma->sems[semnum];
				1307
				1308	ipc_assert_locked_object(&sma->sem_perm);
				1309	list_for_each_entry(un, &sma->list_id, list_id)
				1310	un->semadj[semnum] = 0;
				1311
				1312	curr->semval = val;
				1313	curr->sempid = task_tgid_vnr(current);
				1314	sma->sem_ctime = ktime_get_real_seconds();
				1315	/* maybe some queued-up processes were waiting for this */
				1316	do_smart_update(sma, NULL, 0, 0, &wake_q);
				1317	sem_unlock(sma, -1);
				1318	rcu_read_unlock();
				1319	wake_up_q(&wake_q);
				1320	return 0;
				1321	}
				1322
				1323	static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
				1324	int cmd, void __user *p)
				1325	{
				1326	struct sem_array *sma;
				1327	struct sem *curr;
				1328	int err, nsems;
				1329	ushort fast_sem_io[SEMMSL_FAST];
				1330	ushort *sem_io = fast_sem_io;
				1331	DEFINE_WAKE_Q(wake_q);
				1332
				1333	rcu_read_lock();
				1334	sma = sem_obtain_object_check(ns, semid);
				1335	if (IS_ERR(sma)) {
				1336	rcu_read_unlock();
				1337	return PTR_ERR(sma);
				1338	}
				1339
				1340	nsems = sma->sem_nsems;
				1341
				1342	err = -EACCES;
				1343	if (ipcperms(ns, &sma->sem_perm, cmd == SETALL ? S_IWUGO : S_IRUGO))
				1344	goto out_rcu_wakeup;
				1345
				1346	err = security_sem_semctl(sma, cmd);
				1347	if (err)
				1348	goto out_rcu_wakeup;
				1349
				1350	err = -EACCES;
				1351	switch (cmd) {
				1352	case GETALL:
				1353	{
				1354	ushort __user *array = p;
				1355	int i;
				1356
				1357	sem_lock(sma, NULL, -1);
				1358	if (!ipc_valid_object(&sma->sem_perm)) {
				1359	err = -EIDRM;
				1360	goto out_unlock;
				1361	}
				1362	if (nsems > SEMMSL_FAST) {
				1363	if (!ipc_rcu_getref(&sma->sem_perm)) {
				1364	err = -EIDRM;
				1365	goto out_unlock;
				1366	}
				1367	sem_unlock(sma, -1);
				1368	rcu_read_unlock();
				1369	sem_io = kvmalloc_array(nsems, sizeof(ushort),
				1370	GFP_KERNEL);
				1371	if (sem_io == NULL) {
				1372	ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
				1373	return -ENOMEM;
				1374	}
				1375
				1376	rcu_read_lock();
				1377	sem_lock_and_putref(sma);
				1378	if (!ipc_valid_object(&sma->sem_perm)) {
				1379	err = -EIDRM;
				1380	goto out_unlock;
				1381	}
				1382	}
				1383	for (i = 0; i < sma->sem_nsems; i++)
				1384	sem_io[i] = sma->sems[i].semval;
				1385	sem_unlock(sma, -1);
				1386	rcu_read_unlock();
				1387	err = 0;
				1388	if (copy_to_user(array, sem_io, nsems*sizeof(ushort)))
				1389	err = -EFAULT;
				1390	goto out_free;
				1391	}
				1392	case SETALL:
				1393	{
				1394	int i;
				1395	struct sem_undo *un;
				1396
				1397	if (!ipc_rcu_getref(&sma->sem_perm)) {
				1398	err = -EIDRM;
				1399	goto out_rcu_wakeup;
				1400	}
				1401	rcu_read_unlock();
				1402
				1403	if (nsems > SEMMSL_FAST) {
				1404	sem_io = kvmalloc_array(nsems, sizeof(ushort),
				1405	GFP_KERNEL);
				1406	if (sem_io == NULL) {
				1407	ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
				1408	return -ENOMEM;
				1409	}
				1410	}
				1411
				1412	if (copy_from_user(sem_io, p, nsems*sizeof(ushort))) {
				1413	ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
				1414	err = -EFAULT;
				1415	goto out_free;
				1416	}
				1417
				1418	for (i = 0; i < nsems; i++) {
				1419	if (sem_io[i] > SEMVMX) {
				1420	ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
				1421	err = -ERANGE;
				1422	goto out_free;
				1423	}
				1424	}
				1425	rcu_read_lock();
				1426	sem_lock_and_putref(sma);
				1427	if (!ipc_valid_object(&sma->sem_perm)) {
				1428	err = -EIDRM;
				1429	goto out_unlock;
				1430	}
				1431
				1432	for (i = 0; i < nsems; i++) {
				1433	sma->sems[i].semval = sem_io[i];
				1434	sma->sems[i].sempid = task_tgid_vnr(current);
				1435	}
				1436
				1437	ipc_assert_locked_object(&sma->sem_perm);
				1438	list_for_each_entry(un, &sma->list_id, list_id) {
				1439	for (i = 0; i < nsems; i++)
				1440	un->semadj[i] = 0;
				1441	}
				1442	sma->sem_ctime = ktime_get_real_seconds();
				1443	/* maybe some queued-up processes were waiting for this */
				1444	do_smart_update(sma, NULL, 0, 0, &wake_q);
				1445	err = 0;
				1446	goto out_unlock;
				1447	}
				1448	/* GETVAL, GETPID, GETNCTN, GETZCNT: fall-through */
				1449	}
				1450	err = -EINVAL;
				1451	if (semnum < 0 \|\| semnum >= nsems)
				1452	goto out_rcu_wakeup;
				1453
				1454	sem_lock(sma, NULL, -1);
				1455	if (!ipc_valid_object(&sma->sem_perm)) {
				1456	err = -EIDRM;
				1457	goto out_unlock;
				1458	}
				1459	curr = &sma->sems[semnum];
				1460
				1461	switch (cmd) {
				1462	case GETVAL:
				1463	err = curr->semval;
				1464	goto out_unlock;
				1465	case GETPID:
				1466	err = curr->sempid;
				1467	goto out_unlock;
				1468	case GETNCNT:
				1469	err = count_semcnt(sma, semnum, 0);
				1470	goto out_unlock;
				1471	case GETZCNT:
				1472	err = count_semcnt(sma, semnum, 1);
				1473	goto out_unlock;
				1474	}
				1475
				1476	out_unlock:
				1477	sem_unlock(sma, -1);
				1478	out_rcu_wakeup:
				1479	rcu_read_unlock();
				1480	wake_up_q(&wake_q);
				1481	out_free:
				1482	if (sem_io != fast_sem_io)
				1483	kvfree(sem_io);
				1484	return err;
				1485	}
				1486
				1487	static inline unsigned long
				1488	copy_semid_from_user(struct semid64_ds out, void __user buf, int version)
				1489	{
				1490	switch (version) {
				1491	case IPC_64:
				1492	if (copy_from_user(out, buf, sizeof(*out)))
				1493	return -EFAULT;
				1494	return 0;
				1495	case IPC_OLD:
				1496	{
				1497	struct semid_ds tbuf_old;
				1498
				1499	if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old)))
				1500	return -EFAULT;
				1501
				1502	out->sem_perm.uid = tbuf_old.sem_perm.uid;
				1503	out->sem_perm.gid = tbuf_old.sem_perm.gid;
				1504	out->sem_perm.mode = tbuf_old.sem_perm.mode;
				1505
				1506	return 0;
				1507	}
				1508	default:
				1509	return -EINVAL;
				1510	}
				1511	}
				1512
				1513	/*
				1514	* This function handles some semctl commands which require the rwsem
				1515	* to be held in write mode.
				1516	* NOTE: no locks must be held, the rwsem is taken inside this function.
				1517	*/
				1518	static int semctl_down(struct ipc_namespace *ns, int semid,
				1519	int cmd, struct semid64_ds *semid64)
				1520	{
				1521	struct sem_array *sma;
				1522	int err;
				1523	struct kern_ipc_perm *ipcp;
				1524
				1525	down_write(&sem_ids(ns).rwsem);
				1526	rcu_read_lock();
				1527
				1528	ipcp = ipcctl_pre_down_nolock(ns, &sem_ids(ns), semid, cmd,
				1529	&semid64->sem_perm, 0);
				1530	if (IS_ERR(ipcp)) {
				1531	err = PTR_ERR(ipcp);
				1532	goto out_unlock1;
				1533	}
				1534
				1535	sma = container_of(ipcp, struct sem_array, sem_perm);
				1536
				1537	err = security_sem_semctl(sma, cmd);
				1538	if (err)
				1539	goto out_unlock1;
				1540
				1541	switch (cmd) {
				1542	case IPC_RMID:
				1543	sem_lock(sma, NULL, -1);
				1544	/* freeary unlocks the ipc object and rcu */
				1545	freeary(ns, ipcp);
				1546	goto out_up;
				1547	case IPC_SET:
				1548	sem_lock(sma, NULL, -1);
				1549	err = ipc_update_perm(&semid64->sem_perm, ipcp);
				1550	if (err)
				1551	goto out_unlock0;
				1552	sma->sem_ctime = ktime_get_real_seconds();
				1553	break;
				1554	default:
				1555	err = -EINVAL;
				1556	goto out_unlock1;
				1557	}
				1558
				1559	out_unlock0:
				1560	sem_unlock(sma, -1);
				1561	out_unlock1:
				1562	rcu_read_unlock();
				1563	out_up:
				1564	up_write(&sem_ids(ns).rwsem);
				1565	return err;
				1566	}
				1567
				1568	SYSCALL_DEFINE4(semctl, int, semid, int, semnum, int, cmd, unsigned long, arg)
				1569	{
				1570	int version;
				1571	struct ipc_namespace *ns;
				1572	void __user p = (void __user )arg;
				1573	struct semid64_ds semid64;
				1574	int err;
				1575
				1576	if (semid < 0)
				1577	return -EINVAL;
				1578
				1579	version = ipc_parse_version(&cmd);
				1580	ns = current->nsproxy->ipc_ns;
				1581
				1582	switch (cmd) {
				1583	case IPC_INFO:
				1584	case SEM_INFO:
				1585	return semctl_info(ns, semid, cmd, p);
				1586	case IPC_STAT:
				1587	case SEM_STAT:
				1588	err = semctl_stat(ns, semid, cmd, &semid64);
				1589	if (err < 0)
				1590	return err;
				1591	if (copy_semid_to_user(p, &semid64, version))
				1592	err = -EFAULT;
				1593	return err;
				1594	case GETALL:
				1595	case GETVAL:
				1596	case GETPID:
				1597	case GETNCNT:
				1598	case GETZCNT:
				1599	case SETALL:
				1600	return semctl_main(ns, semid, semnum, cmd, p);
				1601	case SETVAL: {
				1602	int val;
				1603	#if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN)
				1604	/* big-endian 64bit */
				1605	val = arg >> 32;
				1606	#else
				1607	/* 32bit or little-endian 64bit */
				1608	val = arg;
				1609	#endif
				1610	return semctl_setval(ns, semid, semnum, val);
				1611	}
				1612	case IPC_SET:
				1613	if (copy_semid_from_user(&semid64, p, version))
				1614	return -EFAULT;
				1615	case IPC_RMID:
				1616	return semctl_down(ns, semid, cmd, &semid64);
				1617	default:
				1618	return -EINVAL;
				1619	}
				1620	}
				1621
				1622	#ifdef CONFIG_COMPAT
				1623
				1624	struct compat_semid_ds {
				1625	struct compat_ipc_perm sem_perm;
				1626	compat_time_t sem_otime;
				1627	compat_time_t sem_ctime;
				1628	compat_uptr_t sem_base;
				1629	compat_uptr_t sem_pending;
				1630	compat_uptr_t sem_pending_last;
				1631	compat_uptr_t undo;
				1632	unsigned short sem_nsems;
				1633	};
				1634
				1635	static int copy_compat_semid_from_user(struct semid64_ds out, void __user buf,
				1636	int version)
				1637	{
				1638	memset(out, 0, sizeof(*out));
				1639	if (version == IPC_64) {
				1640	struct compat_semid64_ds *p = buf;
				1641	return get_compat_ipc64_perm(&out->sem_perm, &p->sem_perm);
				1642	} else {
				1643	struct compat_semid_ds *p = buf;
				1644	return get_compat_ipc_perm(&out->sem_perm, &p->sem_perm);
				1645	}
				1646	}
				1647
				1648	static int copy_compat_semid_to_user(void __user buf, struct semid64_ds in,
				1649	int version)
				1650	{
				1651	if (version == IPC_64) {
				1652	struct compat_semid64_ds v;
				1653	memset(&v, 0, sizeof(v));
				1654	to_compat_ipc64_perm(&v.sem_perm, &in->sem_perm);
				1655	v.sem_otime = in->sem_otime;
				1656	v.sem_ctime = in->sem_ctime;
				1657	v.sem_nsems = in->sem_nsems;
				1658	return copy_to_user(buf, &v, sizeof(v));
				1659	} else {
				1660	struct compat_semid_ds v;
				1661	memset(&v, 0, sizeof(v));
				1662	to_compat_ipc_perm(&v.sem_perm, &in->sem_perm);
				1663	v.sem_otime = in->sem_otime;
				1664	v.sem_ctime = in->sem_ctime;
				1665	v.sem_nsems = in->sem_nsems;
				1666	return copy_to_user(buf, &v, sizeof(v));
				1667	}
				1668	}
				1669
				1670	COMPAT_SYSCALL_DEFINE4(semctl, int, semid, int, semnum, int, cmd, int, arg)
				1671	{
				1672	void __user *p = compat_ptr(arg);
				1673	struct ipc_namespace *ns;
				1674	struct semid64_ds semid64;
				1675	int version = compat_ipc_parse_version(&cmd);
				1676	int err;
				1677
				1678	ns = current->nsproxy->ipc_ns;
				1679
				1680	if (semid < 0)
				1681	return -EINVAL;
				1682
				1683	switch (cmd & (~IPC_64)) {
				1684	case IPC_INFO:
				1685	case SEM_INFO:
				1686	return semctl_info(ns, semid, cmd, p);
				1687	case IPC_STAT:
				1688	case SEM_STAT:
				1689	err = semctl_stat(ns, semid, cmd, &semid64);
				1690	if (err < 0)
				1691	return err;
				1692	if (copy_compat_semid_to_user(p, &semid64, version))
				1693	err = -EFAULT;
				1694	return err;
				1695	case GETVAL:
				1696	case GETPID:
				1697	case GETNCNT:
				1698	case GETZCNT:
				1699	case GETALL:
				1700	case SETALL:
				1701	return semctl_main(ns, semid, semnum, cmd, p);
				1702	case SETVAL:
				1703	return semctl_setval(ns, semid, semnum, arg);
				1704	case IPC_SET:
				1705	if (copy_compat_semid_from_user(&semid64, p, version))
				1706	return -EFAULT;
				1707	/* fallthru */
				1708	case IPC_RMID:
				1709	return semctl_down(ns, semid, cmd, &semid64);
				1710	default:
				1711	return -EINVAL;
				1712	}
				1713	}
				1714	#endif
				1715
				1716	/* If the task doesn't already have a undo_list, then allocate one
				1717	* here. We guarantee there is only one thread using this undo list,
				1718	* and current is THE ONE
				1719	*
				1720	* If this allocation and assignment succeeds, but later
				1721	* portions of this code fail, there is no need to free the sem_undo_list.
				1722	* Just let it stay associated with the task, and it'll be freed later
				1723	* at exit time.
				1724	*
				1725	* This can block, so callers must hold no locks.
				1726	*/
				1727	static inline int get_undo_list(struct sem_undo_list **undo_listp)
				1728	{
				1729	struct sem_undo_list *undo_list;
				1730
				1731	undo_list = current->sysvsem.undo_list;
				1732	if (!undo_list) {
				1733	undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL);
				1734	if (undo_list == NULL)
				1735	return -ENOMEM;
				1736	spin_lock_init(&undo_list->lock);
				1737	refcount_set(&undo_list->refcnt, 1);
				1738	INIT_LIST_HEAD(&undo_list->list_proc);
				1739
				1740	current->sysvsem.undo_list = undo_list;
				1741	}
				1742	*undo_listp = undo_list;
				1743	return 0;
				1744	}
				1745
				1746	static struct sem_undo __lookup_undo(struct sem_undo_list ulp, int semid)
				1747	{
				1748	struct sem_undo *un;
				1749
				1750	list_for_each_entry_rcu(un, &ulp->list_proc, list_proc) {
				1751	if (un->semid == semid)
				1752	return un;
				1753	}
				1754	return NULL;
				1755	}
				1756
				1757	static struct sem_undo lookup_undo(struct sem_undo_list ulp, int semid)
				1758	{
				1759	struct sem_undo *un;
				1760
				1761	assert_spin_locked(&ulp->lock);
				1762
				1763	un = __lookup_undo(ulp, semid);
				1764	if (un) {
				1765	list_del_rcu(&un->list_proc);
				1766	list_add_rcu(&un->list_proc, &ulp->list_proc);
				1767	}
				1768	return un;
				1769	}
				1770
				1771	/**
				1772	* find_alloc_undo - lookup (and if not present create) undo array
				1773	* @ns: namespace
				1774	* @semid: semaphore array id
				1775	*
				1776	* The function looks up (and if not present creates) the undo structure.
				1777	* The size of the undo structure depends on the size of the semaphore
				1778	* array, thus the alloc path is not that straightforward.
				1779	* Lifetime-rules: sem_undo is rcu-protected, on success, the function
				1780	* performs a rcu_read_lock().
				1781	*/
				1782	static struct sem_undo find_alloc_undo(struct ipc_namespace ns, int semid)
				1783	{
				1784	struct sem_array *sma;
				1785	struct sem_undo_list *ulp;
				1786	struct sem_undo un, new;
				1787	int nsems, error;
				1788
				1789	error = get_undo_list(&ulp);
				1790	if (error)
				1791	return ERR_PTR(error);
				1792
				1793	rcu_read_lock();
				1794	spin_lock(&ulp->lock);
				1795	un = lookup_undo(ulp, semid);
				1796	spin_unlock(&ulp->lock);
				1797	if (likely(un != NULL))
				1798	goto out;
				1799
				1800	/* no undo structure around - allocate one. */
				1801	/* step 1: figure out the size of the semaphore array */
				1802	sma = sem_obtain_object_check(ns, semid);
				1803	if (IS_ERR(sma)) {
				1804	rcu_read_unlock();
				1805	return ERR_CAST(sma);
				1806	}
				1807
				1808	nsems = sma->sem_nsems;
				1809	if (!ipc_rcu_getref(&sma->sem_perm)) {
				1810	rcu_read_unlock();
				1811	un = ERR_PTR(-EIDRM);
				1812	goto out;
				1813	}
				1814	rcu_read_unlock();
				1815
				1816	/* step 2: allocate new undo structure */
				1817	new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL);
				1818	if (!new) {
				1819	ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
				1820	return ERR_PTR(-ENOMEM);
				1821	}
				1822
				1823	/* step 3: Acquire the lock on semaphore array */
				1824	rcu_read_lock();
				1825	sem_lock_and_putref(sma);
				1826	if (!ipc_valid_object(&sma->sem_perm)) {
				1827	sem_unlock(sma, -1);
				1828	rcu_read_unlock();
				1829	kfree(new);
				1830	un = ERR_PTR(-EIDRM);
				1831	goto out;
				1832	}
				1833	spin_lock(&ulp->lock);
				1834
				1835	/*
				1836	* step 4: check for races: did someone else allocate the undo struct?
				1837	*/
				1838	un = lookup_undo(ulp, semid);
				1839	if (un) {
				1840	kfree(new);
				1841	goto success;
				1842	}
				1843	/* step 5: initialize & link new undo structure */
				1844	new->semadj = (short *) &new[1];
				1845	new->ulp = ulp;
				1846	new->semid = semid;
				1847	assert_spin_locked(&ulp->lock);
				1848	list_add_rcu(&new->list_proc, &ulp->list_proc);
				1849	ipc_assert_locked_object(&sma->sem_perm);
				1850	list_add(&new->list_id, &sma->list_id);
				1851	un = new;
				1852
				1853	success:
				1854	spin_unlock(&ulp->lock);
				1855	sem_unlock(sma, -1);
				1856	out:
				1857	return un;
				1858	}
				1859
				1860	static long do_semtimedop(int semid, struct sembuf __user *tsops,
				1861	unsigned nsops, const struct timespec64 *timeout)
				1862	{
				1863	int error = -EINVAL;
				1864	struct sem_array *sma;
				1865	struct sembuf fast_sops[SEMOPM_FAST];
				1866	struct sembuf sops = fast_sops, sop;
				1867	struct sem_undo *un;
				1868	int max, locknum;
				1869	bool undos = false, alter = false, dupsop = false;
				1870	struct sem_queue queue;
				1871	unsigned long dup = 0, jiffies_left = 0;
				1872	struct ipc_namespace *ns;
				1873
				1874	ns = current->nsproxy->ipc_ns;
				1875
				1876	if (nsops < 1 \|\| semid < 0)
				1877	return -EINVAL;
				1878	if (nsops > ns->sc_semopm)
				1879	return -E2BIG;
				1880	if (nsops > SEMOPM_FAST) {
				1881	sops = kvmalloc(sizeof(sops)nsops, GFP_KERNEL);
				1882	if (sops == NULL)
				1883	return -ENOMEM;
				1884	}
				1885
				1886	if (copy_from_user(sops, tsops, nsops * sizeof(*tsops))) {
				1887	error = -EFAULT;
				1888	goto out_free;
				1889	}
				1890
				1891	if (timeout) {
				1892	if (timeout->tv_sec < 0 \|\| timeout->tv_nsec < 0 \|\|
				1893	timeout->tv_nsec >= 1000000000L) {
				1894	error = -EINVAL;
				1895	goto out_free;
				1896	}
				1897	jiffies_left = timespec64_to_jiffies(timeout);
				1898	}
				1899
				1900	max = 0;
				1901	for (sop = sops; sop < sops + nsops; sop++) {
				1902	unsigned long mask = 1ULL << ((sop->sem_num) % BITS_PER_LONG);
				1903
				1904	if (sop->sem_num >= max)
				1905	max = sop->sem_num;
				1906	if (sop->sem_flg & SEM_UNDO)
				1907	undos = true;
				1908	if (dup & mask) {
				1909	/*
				1910	* There was a previous alter access that appears
				1911	* to have accessed the same semaphore, thus use
				1912	* the dupsop logic. "appears", because the detection
				1913	* can only check % BITS_PER_LONG.
				1914	*/
				1915	dupsop = true;
				1916	}
				1917	if (sop->sem_op != 0) {
				1918	alter = true;
				1919	dup \|= mask;
				1920	}
				1921	}
				1922
				1923	if (undos) {
				1924	/* On success, find_alloc_undo takes the rcu_read_lock */
				1925	un = find_alloc_undo(ns, semid);
				1926	if (IS_ERR(un)) {
				1927	error = PTR_ERR(un);
				1928	goto out_free;
				1929	}
				1930	} else {
				1931	un = NULL;
				1932	rcu_read_lock();
				1933	}
				1934
				1935	sma = sem_obtain_object_check(ns, semid);
				1936	if (IS_ERR(sma)) {
				1937	rcu_read_unlock();
				1938	error = PTR_ERR(sma);
				1939	goto out_free;
				1940	}
				1941
				1942	error = -EFBIG;
				1943	if (max >= sma->sem_nsems) {
				1944	rcu_read_unlock();
				1945	goto out_free;
				1946	}
				1947
				1948	error = -EACCES;
				1949	if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO)) {
				1950	rcu_read_unlock();
				1951	goto out_free;
				1952	}
				1953
				1954	error = security_sem_semop(sma, sops, nsops, alter);
				1955	if (error) {
				1956	rcu_read_unlock();
				1957	goto out_free;
				1958	}
				1959
				1960	error = -EIDRM;
				1961	locknum = sem_lock(sma, sops, nsops);
				1962	/*
				1963	* We eventually might perform the following check in a lockless
				1964	* fashion, considering ipc_valid_object() locking constraints.
				1965	* If nsops == 1 and there is no contention for sem_perm.lock, then
				1966	* only a per-semaphore lock is held and it's OK to proceed with the
				1967	* check below. More details on the fine grained locking scheme
				1968	* entangled here and why it's RMID race safe on comments at sem_lock()
				1969	*/
				1970	if (!ipc_valid_object(&sma->sem_perm))
				1971	goto out_unlock_free;
				1972	/*
				1973	* semid identifiers are not unique - find_alloc_undo may have
				1974	* allocated an undo structure, it was invalidated by an RMID
				1975	* and now a new array has received the same id. Check and fail.
				1976	* This case can be detected checking un->semid. The existence of
				1977	* "un" itself is guaranteed by rcu.
				1978	*/
				1979	if (un && un->semid == -1)
				1980	goto out_unlock_free;
				1981
				1982	queue.sops = sops;
				1983	queue.nsops = nsops;
				1984	queue.undo = un;
				1985	queue.pid = task_tgid_vnr(current);
				1986	queue.alter = alter;
				1987	queue.dupsop = dupsop;
				1988
				1989	error = perform_atomic_semop(sma, &queue);
				1990	if (error == 0) { /* non-blocking succesfull path */
				1991	DEFINE_WAKE_Q(wake_q);
				1992
				1993	/*
				1994	* If the operation was successful, then do
				1995	* the required updates.
				1996	*/
				1997	if (alter)
				1998	do_smart_update(sma, sops, nsops, 1, &wake_q);
				1999	else
				2000	set_semotime(sma, sops);
				2001
				2002	sem_unlock(sma, locknum);
				2003	rcu_read_unlock();
				2004	wake_up_q(&wake_q);
				2005
				2006	goto out_free;
				2007	}
				2008	if (error < 0) /* non-blocking error path */
				2009	goto out_unlock_free;
				2010
				2011	/*
				2012	* We need to sleep on this operation, so we put the current
				2013	* task into the pending queue and go to sleep.
				2014	*/
				2015	if (nsops == 1) {
				2016	struct sem *curr;
				2017	curr = &sma->sems[sops->sem_num];
				2018
				2019	if (alter) {
				2020	if (sma->complex_count) {
				2021	list_add_tail(&queue.list,
				2022	&sma->pending_alter);
				2023	} else {
				2024
				2025	list_add_tail(&queue.list,
				2026	&curr->pending_alter);
				2027	}
				2028	} else {
				2029	list_add_tail(&queue.list, &curr->pending_const);
				2030	}
				2031	} else {
				2032	if (!sma->complex_count)
				2033	merge_queues(sma);
				2034
				2035	if (alter)
				2036	list_add_tail(&queue.list, &sma->pending_alter);
				2037	else
				2038	list_add_tail(&queue.list, &sma->pending_const);
				2039
				2040	sma->complex_count++;
				2041	}
				2042
				2043	do {
				2044	WRITE_ONCE(queue.status, -EINTR);
				2045	queue.sleeper = current;
				2046
				2047	__set_current_state(TASK_INTERRUPTIBLE);
				2048	sem_unlock(sma, locknum);
				2049	rcu_read_unlock();
				2050
				2051	if (timeout)
				2052	jiffies_left = schedule_timeout(jiffies_left);
				2053	else
				2054	schedule();
				2055
				2056	/*
				2057	* fastpath: the semop has completed, either successfully or
				2058	* not, from the syscall pov, is quite irrelevant to us at this
				2059	* point; we're done.
				2060	*
				2061	* We _do_ care, nonetheless, about being awoken by a signal or
				2062	* spuriously. The queue.status is checked again in the
				2063	* slowpath (aka after taking sem_lock), such that we can detect
				2064	* scenarios where we were awakened externally, during the
				2065	* window between wake_q_add() and wake_up_q().
				2066	*/
				2067	error = READ_ONCE(queue.status);
				2068	if (error != -EINTR) {
				2069	/*
				2070	* User space could assume that semop() is a memory
				2071	* barrier: Without the mb(), the cpu could
				2072	* speculatively read in userspace stale data that was
				2073	* overwritten by the previous owner of the semaphore.
				2074	*/
				2075	smp_mb();
				2076	goto out_free;
				2077	}
				2078
				2079	rcu_read_lock();
				2080	locknum = sem_lock(sma, sops, nsops);
				2081
				2082	if (!ipc_valid_object(&sma->sem_perm))
				2083	goto out_unlock_free;
				2084
				2085	error = READ_ONCE(queue.status);
				2086
				2087	/*
				2088	* If queue.status != -EINTR we are woken up by another process.
				2089	* Leave without unlink_queue(), but with sem_unlock().
				2090	*/
				2091	if (error != -EINTR)
				2092	goto out_unlock_free;
				2093
				2094	/*
				2095	* If an interrupt occurred we have to clean up the queue.
				2096	*/
				2097	if (timeout && jiffies_left == 0)
				2098	error = -EAGAIN;
				2099	} while (error == -EINTR && !signal_pending(current)); /* spurious */
				2100
				2101	unlink_queue(sma, &queue);
				2102
				2103	out_unlock_free:
				2104	sem_unlock(sma, locknum);
				2105	rcu_read_unlock();
				2106	out_free:
				2107	if (sops != fast_sops)
				2108	kvfree(sops);
				2109	return error;
				2110	}
				2111
				2112	SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
				2113	unsigned, nsops, const struct timespec __user *, timeout)
				2114	{
				2115	if (timeout) {
				2116	struct timespec64 ts;
				2117	if (get_timespec64(&ts, timeout))
				2118	return -EFAULT;
				2119	return do_semtimedop(semid, tsops, nsops, &ts);
				2120	}
				2121	return do_semtimedop(semid, tsops, nsops, NULL);
				2122	}
				2123
				#ifdef CONFIG_COMPAT
				/*
				 * Compat flavour of semtimedop(): identical to the native entry point
				 * except that the optional timeout arrives as a struct compat_timespec
				 * and is converted with compat_get_timespec64(). A failed conversion
				 * yields -EFAULT; a NULL @timeout means "no timeout".
				 */
				COMPAT_SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsems,
						       unsigned, nsops,
						       const struct compat_timespec __user *, timeout)
				{
					struct timespec64 ts;

					/* No timeout supplied: nothing to convert. */
					if (!timeout)
						return do_semtimedop(semid, tsems, nsops, NULL);

					if (compat_get_timespec64(&ts, timeout))
						return -EFAULT;

					return do_semtimedop(semid, tsems, nsops, &ts);
				}
				#endif
				2138
				2139	SYSCALL_DEFINE3(semop, int, semid, struct sembuf __user *, tsops,
				2140	unsigned, nsops)
				2141	{
				2142	return do_semtimedop(semid, tsops, nsops, NULL);
				2143	}
				2144
				2145	/* If CLONE_SYSVSEM is set, establish sharing of SEM_UNDO state between
				2146	* parent and child tasks.
				2147	*/
				2148
				2149	int copy_semundo(unsigned long clone_flags, struct task_struct *tsk)
				2150	{
				2151	struct sem_undo_list *undo_list;
				2152	int error;
				2153
				2154	if (clone_flags & CLONE_SYSVSEM) {
				2155	error = get_undo_list(&undo_list);
				2156	if (error)
				2157	return error;
				2158	refcount_inc(&undo_list->refcnt);
				2159	tsk->sysvsem.undo_list = undo_list;
				2160	} else
				2161	tsk->sysvsem.undo_list = NULL;
				2162
				2163	return 0;
				2164	}
				2165
				2166	/*
				2167	* add semadj values to semaphores, free undo structures.
				2168	* undo structures are not freed when semaphore arrays are destroyed
				2169	* so some of them may be out of date.
				2170	* IMPLEMENTATION NOTE: There is some confusion over whether the
				2171	* set of adjustments that needs to be done should be done in an atomic
				2172	* manner or not. That is, if we are attempting to decrement the semval
				2173	* should we queue up and wait until we can do so legally?
				2174	* The original implementation attempted to do this (queue and wait).
				2175	* The current implementation does not do so. The POSIX standard
				2176	* and SVID should be consulted to determine what behavior is mandated.
				2177	*/
				2178	void exit_sem(struct task_struct *tsk)
				2179	{
				2180	struct sem_undo_list *ulp;
				2181
				2182	ulp = tsk->sysvsem.undo_list;
				2183	if (!ulp)
				2184	return;
				2185	tsk->sysvsem.undo_list = NULL;
				2186
				2187	if (!refcount_dec_and_test(&ulp->refcnt))
				2188	return;
				2189
				2190	for (;;) {
				2191	struct sem_array *sma;
				2192	struct sem_undo *un;
				2193	int semid, i;
				2194	DEFINE_WAKE_Q(wake_q);
				2195
				2196	cond_resched();
				2197
				2198	rcu_read_lock();
				2199	un = list_entry_rcu(ulp->list_proc.next,
				2200	struct sem_undo, list_proc);
				2201	if (&un->list_proc == &ulp->list_proc) {
				2202	/*
				2203	* We must wait for freeary() before freeing this ulp,
				2204	* in case we raced with last sem_undo. There is a small
				2205	* possibility where we exit while freeary() didn't
				2206	* finish unlocking sem_undo_list.
				2207	*/
				2208	spin_lock(&ulp->lock);
				2209	spin_unlock(&ulp->lock);
				2210	rcu_read_unlock();
				2211	break;
				2212	}
				2213	spin_lock(&ulp->lock);
				2214	semid = un->semid;
				2215	spin_unlock(&ulp->lock);
				2216
				2217	/* exit_sem raced with IPC_RMID, nothing to do */
				2218	if (semid == -1) {
				2219	rcu_read_unlock();
				2220	continue;
				2221	}
				2222
				2223	sma = sem_obtain_object_check(tsk->nsproxy->ipc_ns, semid);
				2224	/* exit_sem raced with IPC_RMID, nothing to do */
				2225	if (IS_ERR(sma)) {
				2226	rcu_read_unlock();
				2227	continue;
				2228	}
				2229
				2230	sem_lock(sma, NULL, -1);
				2231	/* exit_sem raced with IPC_RMID, nothing to do */
				2232	if (!ipc_valid_object(&sma->sem_perm)) {
				2233	sem_unlock(sma, -1);
				2234	rcu_read_unlock();
				2235	continue;
				2236	}
				2237	un = __lookup_undo(ulp, semid);
				2238	if (un == NULL) {
				2239	/* exit_sem raced with IPC_RMID+semget() that created
				2240	* exactly the same semid. Nothing to do.
				2241	*/
				2242	sem_unlock(sma, -1);
				2243	rcu_read_unlock();
				2244	continue;
				2245	}
				2246
				2247	/* remove un from the linked lists */
				2248	ipc_assert_locked_object(&sma->sem_perm);
				2249	list_del(&un->list_id);
				2250
				2251	spin_lock(&ulp->lock);
				2252	list_del_rcu(&un->list_proc);
				2253	spin_unlock(&ulp->lock);
				2254
				2255	/* perform adjustments registered in un */
				2256	for (i = 0; i < sma->sem_nsems; i++) {
				2257	struct sem *semaphore = &sma->sems[i];
				2258	if (un->semadj[i]) {
				2259	semaphore->semval += un->semadj[i];
				2260	/*
				2261	* Range checks of the new semaphore value,
				2262	* not defined by sus:
				2263	* - Some unices ignore the undo entirely
				2264	* (e.g. HP UX 11i 11.22, Tru64 V5.1)
				2265	* - some cap the value (e.g. FreeBSD caps
				2266	* at 0, but doesn't enforce SEMVMX)
				2267	*
				2268	* Linux caps the semaphore value, both at 0
				2269	* and at SEMVMX.
				2270	*
				2271	* Manfred <manfred@colorfullife.com>
				2272	*/
				2273	if (semaphore->semval < 0)
				2274	semaphore->semval = 0;
				2275	if (semaphore->semval > SEMVMX)
				2276	semaphore->semval = SEMVMX;
				2277	semaphore->sempid = task_tgid_vnr(current);
				2278	}
				2279	}
				2280	/* maybe some queued-up processes were waiting for this */
				2281	do_smart_update(sma, NULL, 0, 1, &wake_q);
				2282	sem_unlock(sma, -1);
				2283	rcu_read_unlock();
				2284	wake_up_q(&wake_q);
				2285
				2286	kfree_rcu(un, rcu);
				2287	}
				2288	kfree(ulp);
				2289	}
				2290
				#ifdef CONFIG_PROC_FS
				/*
				 * seq_file show routine for one semaphore array (the file header names
				 * /proc/sysvipc/sem as the place where these statistics are reported).
				 *
				 * @s:  seq_file receiving the output; also supplies the user namespace
				 *      used to translate the uid/gid values.
				 * @it: the array's struct kern_ipc_perm, i.e. the sem_perm member
				 *      embedded in its struct sem_array.
				 *
				 * Prints key, id, mode, number of semaphores, uid, gid, cuid, cgid,
				 * sem_otime and sem_ctime on a single line. Always returns 0.
				 */
				static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
				{
					struct user_namespace *user_ns = seq_user_ns(s);
					struct kern_ipc_perm *ipcp = it;
					struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm);
					time64_t sem_otime;

					/*
					 * The proc interface isn't aware of sem_lock(), it calls
					 * ipc_lock_object() directly (in sysvipc_find_ipc).
					 * In order to stay compatible with sem_lock(), we must
					 * enter / leave complex_mode.
					 */
					complexmode_enter(sma);

					sem_otime = get_semotime(sma);

					seq_printf(s,
						   "%10d %10d %4o %10u %5u %5u %5u %5u %10llu %10llu\n",
						   sma->sem_perm.key,
						   sma->sem_perm.id,
						   sma->sem_perm.mode,
						   sma->sem_nsems,
						   from_kuid_munged(user_ns, sma->sem_perm.uid),
						   from_kgid_munged(user_ns, sma->sem_perm.gid),
						   from_kuid_munged(user_ns, sma->sem_perm.cuid),
						   from_kgid_munged(user_ns, sma->sem_perm.cgid),
						   sem_otime,
						   sma->sem_ctime);

					complexmode_tryleave(sma);

					return 0;
				}
				#endif