Blame - ap/os/linux/linux-3.4.x/ipc/sem.c - T106_DC

blob: 03fe70530a9e40546ec055d3c7ad55bab5563732 [file] [log] [blame]

lh	9ed821d	2023-04-07 01:36:19 -0700	[diff] [blame]	1	/*
				2	* linux/ipc/sem.c
				3	* Copyright (C) 1992 Krishna Balasubramanian
				4	* Copyright (C) 1995 Eric Schenk, Bruno Haible
				5	*
				6	* /proc/sysvipc/sem support (c) 1999 Dragos Acostachioaie <dragos@iname.com>
				7	*
				8	* SMP-threaded, sysctl's added
				9	* (c) 1999 Manfred Spraul <manfred@colorfullife.com>
				10	* Enforced range limit on SEM_UNDO
				11	* (c) 2001 Red Hat Inc
				12	* Lockless wakeup
				13	* (c) 2003 Manfred Spraul <manfred@colorfullife.com>
				14	* Further wakeup optimizations, documentation
				15	* (c) 2010 Manfred Spraul <manfred@colorfullife.com>
				16	*
				17	* support for audit of ipc object properties and permission changes
				18	* Dustin Kirkland <dustin.kirkland@us.ibm.com>
				19	*
				20	* namespaces support
				21	* OpenVZ, SWsoft Inc.
				22	* Pavel Emelianov <xemul@openvz.org>
				23	*
				24	* Implementation notes: (May 2010)
				25	* This file implements System V semaphores.
				26	*
				27	* User space visible behavior:
				28	* - FIFO ordering for semop() operations (just FIFO, not starvation
				29	* protection)
				30	* - multiple semaphore operations that alter the same semaphore in
				31	* one semop() are handled.
				32	* - sem_ctime (time of last semctl()) is updated in the IPC_SET, SETVAL and
				33	* SETALL calls.
				34	* - two Linux specific semctl() commands: SEM_STAT, SEM_INFO.
				35	* - undo adjustments at process exit are limited to 0..SEMVMX.
				36	* - namespace are supported.
				37	* - SEMMSL, SEMMNS, SEMOPM and SEMMNI can be configured at runtime by writing
				38	* to /proc/sys/kernel/sem.
				39	* - statistics about the usage are reported in /proc/sysvipc/sem.
				40	*
				41	* Internals:
				42	* - scalability:
				43	* - all global variables are read-mostly.
				44	* - semop() calls and semctl(RMID) are synchronized by RCU.
				45	* - most operations do write operations (actually: spin_lock calls) to
				46	* the per-semaphore array structure.
				47	* Thus: Perfect SMP scaling between independent semaphore arrays.
				48	* If multiple semaphores in one array are used, then cache line
				49	* thrashing on the semaphore array spinlock will limit the scaling.
				50	* - semncnt and semzcnt are calculated on demand in count_semncnt() and
				51	* count_semzcnt()
				52	* - the task that performs a successful semop() scans the list of all
				53	* sleeping tasks and completes any pending operations that can be fulfilled.
				54	* Semaphores are actively given to waiting tasks (necessary for FIFO).
				55	* (see update_queue())
				56	* - To improve the scalability, the actual wake-up calls are performed after
				57	* dropping all locks. (see wake_up_sem_queue_prepare(),
				58	* wake_up_sem_queue_do())
				59	* - All work is done by the waker, the woken up task does not have to do
				60	* anything - not even acquiring a lock or dropping a refcount.
				61	* - A woken up task may not even touch the semaphore array anymore, it may
				62	* have been destroyed already by a semctl(RMID).
				63	* - The synchronizations between wake-ups due to a timeout/signal and a
				64	* wake-up due to a completed semaphore operation is achieved by using an
				65	* intermediate state (IN_WAKEUP).
				66	* - UNDO values are stored in an array (one per process and per
				67	* semaphore array, lazily allocated). For backwards compatibility, multiple
				68	* modes for the UNDO variables are supported (per process, per thread)
				69	* (see copy_semundo, CLONE_SYSVSEM)
				70	* - There are two lists of the pending operations: a per-array list
				71	* and per-semaphore list (stored in the array). This allows to achieve FIFO
				72	* ordering without always scanning all pending operations.
				73	* The worst-case behavior is nevertheless O(N^2) for N wakeups.
				74	*/
				75
				76	#include <linux/slab.h>
				77	#include <linux/spinlock.h>
				78	#include <linux/init.h>
				79	#include <linux/proc_fs.h>
				80	#include <linux/time.h>
				81	#include <linux/security.h>
				82	#include <linux/syscalls.h>
				83	#include <linux/audit.h>
				84	#include <linux/capability.h>
				85	#include <linux/seq_file.h>
				86	#include <linux/rwsem.h>
				87	#include <linux/nsproxy.h>
				88	#include <linux/ipc_namespace.h>
				89
				90	#include <asm/uaccess.h>
				91	#include "util.h"
				92
				93	/* One semaphore structure for each semaphore in the system. */
				94	struct sem {
				95	int semval; /* current value */
				96	int sempid; /* pid of last operation */
				97	struct list_head sem_pending; /* pending single-sop operations */
				98	};
				99
				100	/* One queue for each sleeping process in the system. */
				101	struct sem_queue {
				102	struct list_head simple_list; /* queue of pending operations */
				103	struct list_head list; /* queue of pending operations */
				104	struct task_struct sleeper; / this process */
				105	struct sem_undo undo; / undo structure */
				106	int pid; /* process id of requesting process */
				107	int status; /* completion status of operation */
				108	struct sembuf sops; / array of pending operations */
				109	int nsops; /* number of operations */
				110	int alter; /* does sops alter the array? /
				111	};
				112
				113	/* Each task has a list of undo requests. They are executed automatically
				114	* when the process exits.
				115	*/
				116	struct sem_undo {
				117	struct list_head list_proc; /* per-process list: *
				118	* all undos from one process
				119	* rcu protected */
				120	struct rcu_head rcu; /* rcu struct for sem_undo */
				121	struct sem_undo_list ulp; / back ptr to sem_undo_list */
				122	struct list_head list_id; /* per semaphore array list:
				123	* all undos for one array */
				124	int semid; /* semaphore set identifier */
				125	short semadj; / array of adjustments */
				126	/* one per semaphore */
				127	};
				128
				129	/* sem_undo_list controls shared access to the list of sem_undo structures
				130	* that may be shared among all a CLONE_SYSVSEM task group.
				131	*/
				132	struct sem_undo_list {
				133	atomic_t refcnt;
				134	spinlock_t lock;
				135	struct list_head list_proc;
				136	};
				137
				138
				/* The semaphore id set of namespace @ns. */
				#define sem_ids(ns)	((ns)->ids[IPC_SEM_IDS])

				/*
				 * Thin wrappers that forward the embedded sem_perm of a sem_array to the
				 * generic ipc helpers. The macro argument is parenthesized so that any
				 * pointer expression (not just a plain identifier) can be passed.
				 */
				#define sem_unlock(sma)		ipc_unlock(&(sma)->sem_perm)
				#define sem_checkid(sma, semid)	ipc_checkid(&(sma)->sem_perm, semid)
				143
				/* Forward declarations for functions that are referenced before their definition. */
				static int newary(struct ipc_namespace *ns, struct ipc_params *params);
				static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp);
				#ifdef CONFIG_PROC_FS
				static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
				#endif
				149
				/*
				 * SEMMSL_FAST is the element count of the on-stack ushort buffer that
				 * semctl_main() uses; sets with more semaphores switch to ipc_alloc().
				 * SEMOPM_FAST has no user in the part of the file shown here --
				 * presumably the matching limit for on-stack semop() buffers (TODO confirm).
				 */
				150	#define SEMMSL_FAST 256 /* 512 bytes on stack */
				151	#define SEMOPM_FAST 64 /* ~ 372 bytes on stack */
				152
				153	/*
				154	* linked list protection:
				155	* sem_undo.id_next,
				156	* sem_array.sem_pending{,last},
				157	* sem_array.sem_undo: sem_lock() for read/write
				158	* sem_undo.proc_next: only "current" is allowed to read/write that field.
				159	*
				160	*/
				161
				/*
				 * Short names for the four slots of the sem_ctls[] array; they are used
				 * as member names on a struct ipc_namespace (e.g. ns->sc_semmsl).
				 */
				162	#define sc_semmsl sem_ctls[0]
				163	#define sc_semmns sem_ctls[1]
				164	#define sc_semopm sem_ctls[2]
				165	#define sc_semmni sem_ctls[3]
				166
				167	void sem_init_ns(struct ipc_namespace *ns)
				168	{
				169	ns->sc_semmsl = SEMMSL;
				170	ns->sc_semmns = SEMMNS;
				171	ns->sc_semopm = SEMOPM;
				172	ns->sc_semmni = SEMMNI;
				173	ns->used_sems = 0;
				174	ipc_init_ids(&ns->ids[IPC_SEM_IDS]);
				175	}
				176
				#ifdef CONFIG_IPC_NS
				/*
				 * Tear down the semaphore part of namespace @ns: every remaining set is
				 * released through freeary(), then the idr of the id set is destroyed.
				 */
				void sem_exit_ns(struct ipc_namespace *ns)
				{
					free_ipcs(ns, &sem_ids(ns), freeary);
					idr_destroy(&sem_ids(ns).ipcs_idr);
				}
				#endif
				184
				185	void __init sem_init (void)
				186	{
				187	sem_init_ns(&init_ipc_ns);
				188	if (IS_ENABLED(CONFIG_PROC_STRIPPED))
				189	return 0;
				190
				191	ipc_init_proc_interface("sysvipc/sem",
				192	" key semid perms nsems uid gid cuid cgid otime ctime\n",
				193	IPC_SEM_IDS, sysvipc_sem_proc_show);
				194	}
				195
				196	/*
				197	* sem_lock_(check_) routines are called in the paths where the rw_mutex
				198	* is not held.
				199	*/
				200	static inline struct sem_array sem_lock(struct ipc_namespace ns, int id)
				201	{
				202	struct kern_ipc_perm *ipcp = ipc_lock(&sem_ids(ns), id);
				203
				204	if (IS_ERR(ipcp))
				205	return (struct sem_array *)ipcp;
				206
				207	return container_of(ipcp, struct sem_array, sem_perm);
				208	}
				209
				210	static inline struct sem_array sem_lock_check(struct ipc_namespace ns,
				211	int id)
				212	{
				213	struct kern_ipc_perm *ipcp = ipc_lock_check(&sem_ids(ns), id);
				214
				215	if (IS_ERR(ipcp))
				216	return (struct sem_array *)ipcp;
				217
				218	return container_of(ipcp, struct sem_array, sem_perm);
				219	}
				220
				221	static inline void sem_lock_and_putref(struct sem_array *sma)
				222	{
				223	ipc_lock_by_ptr(&sma->sem_perm);
				224	ipc_rcu_putref(sma);
				225	}
				226
				/* Take one rcu reference on @sma, then unlock it. */
				static inline void sem_getref_and_unlock(struct sem_array *sma)
				{
					ipc_rcu_getref(sma);
					sem_unlock(sma);
				}
				232
				233	static inline void sem_putref(struct sem_array *sma)
				234	{
				235	ipc_lock_by_ptr(&sma->sem_perm);
				236	ipc_rcu_putref(sma);
				237	ipc_unlock(&(sma)->sem_perm);
				238	}
				239
				240	static inline void sem_rmid(struct ipc_namespace ns, struct sem_array s)
				241	{
				242	ipc_rmid(&sem_ids(ns), &s->sem_perm);
				243	}
				244
				245	/*
				246	* Lockless wakeup algorithm:
				247	* Without the check/retry algorithm a lockless wakeup is possible:
				248	* - queue.status is initialized to -EINTR before blocking.
				249	* - wakeup is performed by
				250	* * unlinking the queue entry from sma->sem_pending
				251	* * setting queue.status to IN_WAKEUP
				252	* This is the notification for the blocked thread that a
				253	* result value is imminent.
				254	* * call wake_up_process
				255	* * set queue.status to the final value.
				256	* - the previously blocked thread checks queue.status:
				257	* * if it's IN_WAKEUP, then it must wait until the value changes
				258	* * if it's not -EINTR, then the operation was completed by
				259	* update_queue. semtimedop can return queue.status without
				260	* performing any operation on the sem array.
				261	* * otherwise it must acquire the spinlock and check what's up.
				262	*
				263	* The two-stage algorithm is necessary to protect against the following
				264	* races:
				265	* - if queue.status is set after wake_up_process, then the woken up idle
				266	* thread could race forward and try (and fail) to acquire sma->lock
				267	* before update_queue had a chance to set queue.status
				268	* - if queue.status is written before wake_up_process and if the
				269	* blocked process is woken up by a signal between writing
				270	* queue.status and the wake_up_process, then the woken up
				271	* process could return from semtimedop and die by calling
				272	* sys_exit before wake_up_process is called. Then wake_up_process
				273	* will oops, because the task structure is already invalid.
				274	* (yes, this happened on s390 with sysv msg).
				275	*
				276	*/
				/*
				 * Transient value of sem_queue.status: in the non-RT build it is stored by
				 * wake_up_sem_queue_prepare() and later overwritten with the final result
				 * by wake_up_sem_queue_do().
				 */
				277	#define IN_WAKEUP 1
				278
				279	/**
				280	* newary - Create a new semaphore set
				281	* @ns: namespace
				282	* @params: ptr to the structure that contains key, semflg and nsems
				283	*
				284	* Called with sem_ids.rw_mutex held (as a writer)
				285	*/
				286
				287	static int newary(struct ipc_namespace ns, struct ipc_params params)
				288	{
				289	int id;
				290	int retval;
				291	struct sem_array *sma;
				292	int size;
				293	key_t key = params->key;
				294	int nsems = params->u.nsems;
				295	int semflg = params->flg;
				296	int i;
				297
				298	if (!nsems)
				299	return -EINVAL;
				300	if (ns->used_sems + nsems > ns->sc_semmns)
				301	return -ENOSPC;
				302
				303	size = sizeof (sma) + nsems sizeof (struct sem);
				304	sma = ipc_rcu_alloc(size);
				305	if (!sma) {
				306	return -ENOMEM;
				307	}
				308	memset (sma, 0, size);
				309
				310	sma->sem_perm.mode = (semflg & S_IRWXUGO);
				311	sma->sem_perm.key = key;
				312
				313	sma->sem_perm.security = NULL;
				314	retval = security_sem_alloc(sma);
				315	if (retval) {
				316	ipc_rcu_putref(sma);
				317	return retval;
				318	}
				319
				320	id = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni);
				321	if (id < 0) {
				322	security_sem_free(sma);
				323	ipc_rcu_putref(sma);
				324	return id;
				325	}
				326	ns->used_sems += nsems;
				327
				328	sma->sem_base = (struct sem *) &sma[1];
				329
				330	for (i = 0; i < nsems; i++)
				331	INIT_LIST_HEAD(&sma->sem_base[i].sem_pending);
				332
				333	sma->complex_count = 0;
				334	INIT_LIST_HEAD(&sma->sem_pending);
				335	INIT_LIST_HEAD(&sma->list_id);
				336	sma->sem_nsems = nsems;
				337	sma->sem_ctime = get_seconds();
				338	sem_unlock(sma);
				339
				340	return sma->sem_perm.id;
				341	}
				342
				343
				344	/*
				345	* Called with sem_ids.rw_mutex and ipcp locked.
				346	*/
				347	static inline int sem_security(struct kern_ipc_perm *ipcp, int semflg)
				348	{
				349	struct sem_array *sma;
				350
				351	sma = container_of(ipcp, struct sem_array, sem_perm);
				352	return security_sem_associate(sma, semflg);
				353	}
				354
				355	/*
				356	* Called with sem_ids.rw_mutex and ipcp locked.
				357	*/
				358	static inline int sem_more_checks(struct kern_ipc_perm *ipcp,
				359	struct ipc_params *params)
				360	{
				361	struct sem_array *sma;
				362
				363	sma = container_of(ipcp, struct sem_array, sem_perm);
				364	if (params->u.nsems > sma->sem_nsems)
				365	return -EINVAL;
				366
				367	return 0;
				368	}
				369
				370	SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg)
				371	{
				372	struct ipc_namespace *ns;
				373	struct ipc_ops sem_ops;
				374	struct ipc_params sem_params;
				375
				376	ns = current->nsproxy->ipc_ns;
				377
				378	if (nsems < 0 \|\| nsems > ns->sc_semmsl)
				379	return -EINVAL;
				380
				381	sem_ops.getnew = newary;
				382	sem_ops.associate = sem_security;
				383	sem_ops.more_checks = sem_more_checks;
				384
				385	sem_params.key = key;
				386	sem_params.flg = semflg;
				387	sem_params.u.nsems = nsems;
				388
				389	return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params);
				390	}
				391
				392	/*
				393	* Determine whether a sequence of semaphore operations would succeed
				394	* all at once. Return 0 if yes, 1 if need to sleep, else return error code.
				395	*/
				396
				397	static int try_atomic_semop (struct sem_array * sma, struct sembuf * sops,
				398	int nsops, struct sem_undo *un, int pid)
				399	{
				400	int result, sem_op;
				401	struct sembuf *sop;
				402	struct sem * curr;
				403
				404	for (sop = sops; sop < sops + nsops; sop++) {
				405	curr = sma->sem_base + sop->sem_num;
				406	sem_op = sop->sem_op;
				407	result = curr->semval;
				408
				409	if (!sem_op && result)
				410	goto would_block;
				411
				412	result += sem_op;
				413	if (result < 0)
				414	goto would_block;
				415	if (result > SEMVMX)
				416	goto out_of_range;
				417	if (sop->sem_flg & SEM_UNDO) {
				418	int undo = un->semadj[sop->sem_num] - sem_op;
				419	/*
				420	* Exceeding the undo range is an error.
				421	*/
				422	if (undo < (-SEMAEM - 1) \|\| undo > SEMAEM)
				423	goto out_of_range;
				424	}
				425	curr->semval = result;
				426	}
				427
				428	sop--;
				429	while (sop >= sops) {
				430	sma->sem_base[sop->sem_num].sempid = pid;
				431	if (sop->sem_flg & SEM_UNDO)
				432	un->semadj[sop->sem_num] -= sop->sem_op;
				433	sop--;
				434	}
				435
				436	return 0;
				437
				438	out_of_range:
				439	result = -ERANGE;
				440	goto undo;
				441
				442	would_block:
				443	if (sop->sem_flg & IPC_NOWAIT)
				444	result = -EAGAIN;
				445	else
				446	result = 1;
				447
				448	undo:
				449	sop--;
				450	while (sop >= sops) {
				451	sma->sem_base[sop->sem_num].semval -= sop->sem_op;
				452	sop--;
				453	}
				454
				455	return result;
				456	}
				457
				458	/** wake_up_sem_queue_prepare(q, error): Prepare wake-up
				459	* @q: queue entry that must be signaled
				460	* @error: Error value for the signal
				461	*
				462	* Prepare the wake-up of the queue entry q.
				463	*/
				464	static void wake_up_sem_queue_prepare(struct list_head *pt,
				465	struct sem_queue *q, int error)
				466	{
				467	#ifdef CONFIG_PREEMPT_RT_BASE
				468	struct task_struct *p = q->sleeper;
				469	get_task_struct(p);
				470	q->status = error;
				471	wake_up_process(p);
				472	put_task_struct(p);
				473	#else
				474	if (list_empty(pt)) {
				475	/*
				476	* Hold preempt off so that we don't get preempted and have the
				477	* wakee busy-wait until we're scheduled back on.
				478	*/
				479	preempt_disable();
				480	}
				481	q->status = IN_WAKEUP;
				482	q->pid = error;
				483
				484	list_add_tail(&q->simple_list, pt);
				485	#endif
				486	}
				487
				488	/**
				489	* wake_up_sem_queue_do(pt) - do the actual wake-up
				490	* @pt: list of tasks to be woken up
				491	*
				492	* Do the actual wake-up.
				493	* The function is called without any locks held, thus the semaphore array
				494	* could be destroyed already and the tasks can disappear as soon as the
				495	* status is set to the actual return code.
				496	*/
				497	static void wake_up_sem_queue_do(struct list_head *pt)
				498	{
				499	#ifndef CONFIG_PREEMPT_RT_BASE
				500	struct sem_queue q, t;
				501	int did_something;
				502
				503	did_something = !list_empty(pt);
				504	list_for_each_entry_safe(q, t, pt, simple_list) {
				505	wake_up_process(q->sleeper);
				506	/* q can disappear immediately after writing q->status. */
				507	smp_wmb();
				508	q->status = q->pid;
				509	}
				510	if (did_something)
				511	preempt_enable();
				512	#endif
				513	}
				514
				515	static void unlink_queue(struct sem_array sma, struct sem_queue q)
				516	{
				517	list_del(&q->list);
				518	if (q->nsops == 1)
				519	list_del(&q->simple_list);
				520	else
				521	sma->complex_count--;
				522	}
				523
				524	/** check_restart(sma, q)
				525	* @sma: semaphore array
				526	* @q: the operation that just completed
				527	*
				528	* update_queue is O(N^2) when it restarts scanning the whole queue of
				529	* waiting operations. Therefore this function checks if the restart is
				530	* really necessary. It is called after a previously waiting operation
				531	* was completed.
				532	*/
				533	static int check_restart(struct sem_array sma, struct sem_queue q)
				534	{
				535	struct sem *curr;
				536	struct sem_queue *h;
				537
				538	/* if the operation didn't modify the array, then no restart */
				539	if (q->alter == 0)
				540	return 0;
				541
				542	/* pending complex operations are too difficult to analyse */
				543	if (sma->complex_count)
				544	return 1;
				545
				546	/* we were a sleeping complex operation. Too difficult */
				547	if (q->nsops > 1)
				548	return 1;
				549
				550	curr = sma->sem_base + q->sops[0].sem_num;
				551
				552	/* No-one waits on this queue */
				553	if (list_empty(&curr->sem_pending))
				554	return 0;
				555
				556	/* the new semaphore value */
				557	if (curr->semval) {
				558	/* It is impossible that someone waits for the new value:
				559	* - q is a previously sleeping simple operation that
				560	* altered the array. It must be a decrement, because
				561	* simple increments never sleep.
				562	* - The value is not 0, thus wait-for-zero won't proceed.
				563	* - If there are older (higher priority) decrements
				564	* in the queue, then they have observed the original
				565	* semval value and couldn't proceed. The operation
				566	* decremented to value - thus they won't proceed either.
				567	*/
				568	BUG_ON(q->sops[0].sem_op >= 0);
				569	return 0;
				570	}
				571	/*
				572	* semval is 0. Check if there are wait-for-zero semops.
				573	* They must be the first entries in the per-semaphore simple queue
				574	*/
				575	h = list_first_entry(&curr->sem_pending, struct sem_queue, simple_list);
				576	BUG_ON(h->nsops != 1);
				577	BUG_ON(h->sops[0].sem_num != q->sops[0].sem_num);
				578
				579	/* Yes, there is a wait-for-zero semop. Restart */
				580	if (h->sops[0].sem_op == 0)
				581	return 1;
				582
				583	/* Again - no-one is waiting for the new value. */
				584	return 0;
				585	}
				586
				587
				588	/**
				589	* update_queue(sma, semnum): Look for tasks that can be completed.
				590	* @sma: semaphore array.
				591	* @semnum: semaphore that was modified.
				592	* @pt: list head for the tasks that must be woken up.
				593	*
				594	* update_queue must be called after a semaphore in a semaphore array
				595	* was modified. If multiple semaphore were modified, then @semnum
				596	* must be set to -1.
				597	* The tasks that must be woken up are added to @pt. The return code
				598	* is stored in q->pid.
				599	* The function return 1 if at least one semop was completed successfully.
				600	*/
				601	static int update_queue(struct sem_array sma, int semnum, struct list_head pt)
				602	{
				603	struct sem_queue *q;
				604	struct list_head *walk;
				605	struct list_head *pending_list;
				606	int offset;
				607	int semop_completed = 0;
				608
				609	/* if there are complex operations around, then knowing the semaphore
				610	* that was modified doesn't help us. Assume that multiple semaphores
				611	* were modified.
				612	*/
				613	if (sma->complex_count)
				614	semnum = -1;
				615
				616	if (semnum == -1) {
				617	pending_list = &sma->sem_pending;
				618	offset = offsetof(struct sem_queue, list);
				619	} else {
				620	pending_list = &sma->sem_base[semnum].sem_pending;
				621	offset = offsetof(struct sem_queue, simple_list);
				622	}
				623
				624	again:
				625	walk = pending_list->next;
				626	while (walk != pending_list) {
				627	int error, restart;
				628
				629	q = (struct sem_queue )((char )walk - offset);
				630	walk = walk->next;
				631
				632	/* If we are scanning the single sop, per-semaphore list of
				633	* one semaphore and that semaphore is 0, then it is not
				634	* necessary to scan the "alter" entries: simple increments
				635	* that affect only one entry succeed immediately and cannot
				636	* be in the per semaphore pending queue, and decrements
				637	* cannot be successful if the value is already 0.
				638	*/
				639	if (semnum != -1 && sma->sem_base[semnum].semval == 0 &&
				640	q->alter)
				641	break;
				642
				643	error = try_atomic_semop(sma, q->sops, q->nsops,
				644	q->undo, q->pid);
				645
				646	/* Does q->sleeper still need to sleep? */
				647	if (error > 0)
				648	continue;
				649
				650	unlink_queue(sma, q);
				651
				652	if (error) {
				653	restart = 0;
				654	} else {
				655	semop_completed = 1;
				656	restart = check_restart(sma, q);
				657	}
				658
				659	wake_up_sem_queue_prepare(pt, q, error);
				660	if (restart)
				661	goto again;
				662	}
				663	return semop_completed;
				664	}
				665
				666	/**
				667	* do_smart_update(sma, sops, nsops, otime, pt) - optimized update_queue
				668	* @sma: semaphore array
				669	* @sops: operations that were performed
				670	* @nsops: number of operations
				671	* @otime: force setting otime
				672	* @pt: list head of the tasks that must be woken up.
				673	*
				674	* do_smart_update() does the required called to update_queue, based on the
				675	* actual changes that were performed on the semaphore array.
				676	* Note that the function does not do the actual wake-up: the caller is
				677	* responsible for calling wake_up_sem_queue_do(@pt).
				678	* It is safe to perform this call after dropping all locks.
				679	*/
				680	static void do_smart_update(struct sem_array sma, struct sembuf sops, int nsops,
				681	int otime, struct list_head *pt)
				682	{
				683	int i;
				684
				685	if (sma->complex_count \|\| sops == NULL) {
				686	if (update_queue(sma, -1, pt))
				687	otime = 1;
				688	goto done;
				689	}
				690
				691	for (i = 0; i < nsops; i++) {
				692	if (sops[i].sem_op > 0 \|\|
				693	(sops[i].sem_op < 0 &&
				694	sma->sem_base[sops[i].sem_num].semval == 0))
				695	if (update_queue(sma, sops[i].sem_num, pt))
				696	otime = 1;
				697	}
				698	done:
				699	if (otime)
				700	sma->sem_otime = get_seconds();
				701	}
				702
				703
				704	/* The following counts are associated to each semaphore:
				705	* semncnt number of tasks waiting on semval being nonzero
				706	* semzcnt number of tasks waiting on semval being zero
				707	* This model assumes that a task waits on exactly one semaphore.
				708	* Since semaphore operations are to be performed atomically, tasks actually
				709	* wait on a whole sequence of semaphores simultaneously.
				710	* The counts we return here are a rough approximation, but still
				711	* warrant that semncnt+semzcnt>0 if the task is on the pending queue.
				712	*/
				713	static int count_semncnt (struct sem_array * sma, ushort semnum)
				714	{
				715	int semncnt;
				716	struct sem_queue * q;
				717
				718	semncnt = 0;
				719	list_for_each_entry(q, &sma->sem_pending, list) {
				720	struct sembuf * sops = q->sops;
				721	int nsops = q->nsops;
				722	int i;
				723	for (i = 0; i < nsops; i++)
				724	if (sops[i].sem_num == semnum
				725	&& (sops[i].sem_op < 0)
				726	&& !(sops[i].sem_flg & IPC_NOWAIT))
				727	semncnt++;
				728	}
				729	return semncnt;
				730	}
				731
				732	static int count_semzcnt (struct sem_array * sma, ushort semnum)
				733	{
				734	int semzcnt;
				735	struct sem_queue * q;
				736
				737	semzcnt = 0;
				738	list_for_each_entry(q, &sma->sem_pending, list) {
				739	struct sembuf * sops = q->sops;
				740	int nsops = q->nsops;
				741	int i;
				742	for (i = 0; i < nsops; i++)
				743	if (sops[i].sem_num == semnum
				744	&& (sops[i].sem_op == 0)
				745	&& !(sops[i].sem_flg & IPC_NOWAIT))
				746	semzcnt++;
				747	}
				748	return semzcnt;
				749	}
				750
				751	/* Free a semaphore set. freeary() is called with sem_ids.rw_mutex locked
				752	* as a writer and the spinlock for this semaphore set hold. sem_ids.rw_mutex
				753	* remains locked on exit.
				754	*/
				755	static void freeary(struct ipc_namespace ns, struct kern_ipc_perm ipcp)
				756	{
				757	struct sem_undo un, tu;
				758	struct sem_queue q, tq;
				759	struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm);
				760	struct list_head tasks;
				761
				762	/* Free the existing undo structures for this semaphore set. */
				763	assert_spin_locked(&sma->sem_perm.lock);
				764	list_for_each_entry_safe(un, tu, &sma->list_id, list_id) {
				765	list_del(&un->list_id);
				766	spin_lock(&un->ulp->lock);
				767	un->semid = -1;
				768	list_del_rcu(&un->list_proc);
				769	spin_unlock(&un->ulp->lock);
				770	kfree_rcu(un, rcu);
				771	}
				772
				773	/* Wake up all pending processes and let them fail with EIDRM. */
				774	INIT_LIST_HEAD(&tasks);
				775	list_for_each_entry_safe(q, tq, &sma->sem_pending, list) {
				776	unlink_queue(sma, q);
				777	wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
				778	}
				779
				780	/* Remove the semaphore set from the IDR */
				781	sem_rmid(ns, sma);
				782	sem_unlock(sma);
				783
				784	wake_up_sem_queue_do(&tasks);
				785	ns->used_sems -= sma->sem_nsems;
				786	security_sem_free(sma);
				787	ipc_rcu_putref(sma);
				788	}
				789
				790	static unsigned long copy_semid_to_user(void __user buf, struct semid64_ds in, int version)
				791	{
				792	switch(version) {
				793	case IPC_64:
				794	return copy_to_user(buf, in, sizeof(*in));
				795	case IPC_OLD:
				796	{
				797	struct semid_ds out;
				798
				799	memset(&out, 0, sizeof(out));
				800
				801	ipc64_perm_to_ipc_perm(&in->sem_perm, &out.sem_perm);
				802
				803	out.sem_otime = in->sem_otime;
				804	out.sem_ctime = in->sem_ctime;
				805	out.sem_nsems = in->sem_nsems;
				806
				807	return copy_to_user(buf, &out, sizeof(out));
				808	}
				809	default:
				810	return -EINVAL;
				811	}
				812	}
				813
				814	static int semctl_nolock(struct ipc_namespace *ns, int semid,
				815	int cmd, int version, union semun arg)
				816	{
				817	int err;
				818	struct sem_array *sma;
				819
				820	switch(cmd) {
				821	case IPC_INFO:
				822	case SEM_INFO:
				823	{
				824	struct seminfo seminfo;
				825	int max_id;
				826
				827	err = security_sem_semctl(NULL, cmd);
				828	if (err)
				829	return err;
				830
				831	memset(&seminfo,0,sizeof(seminfo));
				832	seminfo.semmni = ns->sc_semmni;
				833	seminfo.semmns = ns->sc_semmns;
				834	seminfo.semmsl = ns->sc_semmsl;
				835	seminfo.semopm = ns->sc_semopm;
				836	seminfo.semvmx = SEMVMX;
				837	seminfo.semmnu = SEMMNU;
				838	seminfo.semmap = SEMMAP;
				839	seminfo.semume = SEMUME;
				840	down_read(&sem_ids(ns).rw_mutex);
				841	if (cmd == SEM_INFO) {
				842	seminfo.semusz = sem_ids(ns).in_use;
				843	seminfo.semaem = ns->used_sems;
				844	} else {
				845	seminfo.semusz = SEMUSZ;
				846	seminfo.semaem = SEMAEM;
				847	}
				848	max_id = ipc_get_maxid(&sem_ids(ns));
				849	up_read(&sem_ids(ns).rw_mutex);
				850	if (copy_to_user (arg.__buf, &seminfo, sizeof(struct seminfo)))
				851	return -EFAULT;
				852	return (max_id < 0) ? 0: max_id;
				853	}
				854	case IPC_STAT:
				855	case SEM_STAT:
				856	{
				857	struct semid64_ds tbuf;
				858	int id;
				859
				860	if (cmd == SEM_STAT) {
				861	sma = sem_lock(ns, semid);
				862	if (IS_ERR(sma))
				863	return PTR_ERR(sma);
				864	id = sma->sem_perm.id;
				865	} else {
				866	sma = sem_lock_check(ns, semid);
				867	if (IS_ERR(sma))
				868	return PTR_ERR(sma);
				869	id = 0;
				870	}
				871
				872	err = -EACCES;
				873	if (ipcperms(ns, &sma->sem_perm, S_IRUGO))
				874	goto out_unlock;
				875
				876	err = security_sem_semctl(sma, cmd);
				877	if (err)
				878	goto out_unlock;
				879
				880	memset(&tbuf, 0, sizeof(tbuf));
				881
				882	kernel_to_ipc64_perm(&sma->sem_perm, &tbuf.sem_perm);
				883	tbuf.sem_otime = sma->sem_otime;
				884	tbuf.sem_ctime = sma->sem_ctime;
				885	tbuf.sem_nsems = sma->sem_nsems;
				886	sem_unlock(sma);
				887	if (copy_semid_to_user (arg.buf, &tbuf, version))
				888	return -EFAULT;
				889	return id;
				890	}
				891	default:
				892	return -EINVAL;
				893	}
				894	out_unlock:
				895	sem_unlock(sma);
				896	return err;
				897	}
				898
					/*
					* semctl_main - semctl commands that read or write semaphore values.
					*
					* Handles GETALL, SETALL, GETVAL, GETPID, GETNCNT, GETZCNT and SETVAL.
					* The array is looked up and locked with sem_lock_check(); SETVAL and
					* SETALL require write permission (S_IWUGO), every other command read
					* permission (S_IRUGO).
					*
					* Returns the requested value (GETVAL/GETPID/GETNCNT/GETZCNT), 0 on
					* success for the other commands, or a negative errno.
					*/
				899	static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
				900	int cmd, int version, union semun arg)
				901	{
				902	struct sem_array *sma;
				903	struct sem* curr;
				904	int err;
				905	ushort fast_sem_io[SEMMSL_FAST];
				906	ushort* sem_io = fast_sem_io;
				907	int nsems;
				908	struct list_head tasks;
				909
				910	sma = sem_lock_check(ns, semid);
				911	if (IS_ERR(sma))
				912	return PTR_ERR(sma);
				913
				914	INIT_LIST_HEAD(&tasks);
				915	nsems = sma->sem_nsems;
				916
				917	err = -EACCES;
				918	if (ipcperms(ns, &sma->sem_perm,
				919	(cmd == SETVAL || cmd == SETALL) ? S_IWUGO : S_IRUGO))
				920	goto out_unlock;
				921
				922	err = security_sem_semctl(sma, cmd);
				923	if (err)
				924	goto out_unlock;
				925
				926	err = -EACCES;
				927	switch (cmd) {
				928	case GETALL:
				929	{
				930	ushort __user *array = arg.array;
				931	int i;
				932
					/*
					* Too many semaphores for the on-stack buffer: drop the lock
					* (keeping a reference), allocate, then re-lock and check that
					* the array was not removed in the meantime.
					*/
				933	if(nsems > SEMMSL_FAST) {
				934	sem_getref_and_unlock(sma);
				935
				936	sem_io = ipc_alloc(sizeof(ushort)*nsems);
				937	if(sem_io == NULL) {
				938	sem_putref(sma);
				939	return -ENOMEM;
				940	}
				941
				942	sem_lock_and_putref(sma);
				943	if (sma->sem_perm.deleted) {
				944	sem_unlock(sma);
				945	err = -EIDRM;
				946	goto out_free;
				947	}
				948	}
				949
					/* snapshot the values under the lock, copy to user space after unlocking */
				950	for (i = 0; i < sma->sem_nsems; i++)
				951	sem_io[i] = sma->sem_base[i].semval;
				952	sem_unlock(sma);
				953	err = 0;
				954	if(copy_to_user(array, sem_io, nsems*sizeof(ushort)))
				955	err = -EFAULT;
				956	goto out_free;
				957	}
				958	case SETALL:
				959	{
				960	int i;
				961	struct sem_undo *un;
				962
					/* the user copy and range checks below run without the array lock */
				963	sem_getref_and_unlock(sma);
				964
				965	if(nsems > SEMMSL_FAST) {
				966	sem_io = ipc_alloc(sizeof(ushort)*nsems);
				967	if(sem_io == NULL) {
				968	sem_putref(sma);
				969	return -ENOMEM;
				970	}
				971	}
				972
				973	if (copy_from_user (sem_io, arg.array, nsems*sizeof(ushort))) {
				974	sem_putref(sma);
				975	err = -EFAULT;
				976	goto out_free;
				977	}
				978
				979	for (i = 0; i < nsems; i++) {
				980	if (sem_io[i] > SEMVMX) {
				981	sem_putref(sma);
				982	err = -ERANGE;
				983	goto out_free;
				984	}
				985	}
				986	sem_lock_and_putref(sma);
				987	if (sma->sem_perm.deleted) {
				988	sem_unlock(sma);
				989	err = -EIDRM;
				990	goto out_free;
				991	}
				992
				993	for (i = 0; i < nsems; i++)
				994	sma->sem_base[i].semval = sem_io[i];
				995
					/* setting the values explicitly clears every registered undo adjustment */
				996	assert_spin_locked(&sma->sem_perm.lock);
				997	list_for_each_entry(un, &sma->list_id, list_id) {
				998	for (i = 0; i < nsems; i++)
				999	un->semadj[i] = 0;
				1000	}
				1001	sma->sem_ctime = get_seconds();
				1002	/* maybe some queued-up processes were waiting for this */
				1003	do_smart_update(sma, NULL, 0, 0, &tasks);
				1004	err = 0;
				1005	goto out_unlock;
				1006	}
				1007	/* GETVAL, GETPID, GETNCNT, GETZCNT, SETVAL: fall-through */
				1008	}
					/* the remaining commands operate on the single semaphore semnum */
				1009	err = -EINVAL;
				1010	if(semnum < 0 || semnum >= nsems)
				1011	goto out_unlock;
				1012
				1013	curr = &sma->sem_base[semnum];
				1014
				1015	switch (cmd) {
				1016	case GETVAL:
				1017	err = curr->semval;
				1018	goto out_unlock;
				1019	case GETPID:
				1020	err = curr->sempid;
				1021	goto out_unlock;
				1022	case GETNCNT:
				1023	err = count_semncnt(sma,semnum);
				1024	goto out_unlock;
				1025	case GETZCNT:
				1026	err = count_semzcnt(sma,semnum);
				1027	goto out_unlock;
				1028	case SETVAL:
				1029	{
				1030	int val = arg.val;
				1031	struct sem_undo *un;
				1032
				1033	err = -ERANGE;
				1034	if (val > SEMVMX || val < 0)
				1035	goto out_unlock;
				1036
				1037	assert_spin_locked(&sma->sem_perm.lock);
				1038	list_for_each_entry(un, &sma->list_id, list_id)
				1039	un->semadj[semnum] = 0;
				1040
				1041	curr->semval = val;
				1042	curr->sempid = task_tgid_vnr(current);
				1043	sma->sem_ctime = get_seconds();
				1044	/* maybe some queued-up processes were waiting for this */
				1045	do_smart_update(sma, NULL, 0, 0, &tasks);
				1046	err = 0;
				1047	goto out_unlock;
				1048	}
				1049	}
				1050	out_unlock:
				1051	sem_unlock(sma);
					/* tasks collected by do_smart_update() are woken after the lock is dropped */
				1052	wake_up_sem_queue_do(&tasks);
				1053
				1054	out_free:
				1055	if(sem_io != fast_sem_io)
				1056	ipc_free(sem_io, sizeof(ushort)*nsems);
				1057	return err;
				1058	}
				1059
					/*
					* copy_semid_from_user - fetch a semid64_ds from user space.
					* @out: kernel buffer that receives the data
					* @buf: user-space source buffer
					* @version: IPC_64 or IPC_OLD
					*
					* IPC_64 copies the whole structure. IPC_OLD reads an old-style
					* struct semid_ds and takes over only sem_perm.uid, sem_perm.gid and
					* sem_perm.mode.
					*
					* Returns 0 on success, -EFAULT if the user copy fails and -EINVAL for
					* an unknown version. The return type is unsigned long; the caller in
					* semctl_down() only tests for non-zero.
					*/
				1060	static inline unsigned long
				1061	copy_semid_from_user(struct semid64_ds *out, void __user *buf, int version)
				1062	{
				1063	switch(version) {
				1064	case IPC_64:
				1065	if (copy_from_user(out, buf, sizeof(*out)))
				1066	return -EFAULT;
				1067	return 0;
				1068	case IPC_OLD:
				1069	{
				1070	struct semid_ds tbuf_old;
				1071
				1072	if(copy_from_user(&tbuf_old, buf, sizeof(tbuf_old)))
				1073	return -EFAULT;
				1074
				1075	out->sem_perm.uid = tbuf_old.sem_perm.uid;
				1076	out->sem_perm.gid = tbuf_old.sem_perm.gid;
				1077	out->sem_perm.mode = tbuf_old.sem_perm.mode;
				1078
				1079	return 0;
				1080	}
				1081	default:
				1082	return -EINVAL;
				1083	}
				1084	}
				1085
				1086	/*
				1087	* This function handles some semctl commands which require the rw_mutex
				1088	* to be held in write mode.
				1089	* NOTE: no locks must be held, the rw_mutex is taken inside this function.
					*
					* The commands handled are IPC_RMID and IPC_SET. For IPC_SET the new
					* settings are copied from user space first; any failure of that copy
					* (including an unknown version) is reported as -EFAULT.
					*
					* Returns 0 on success or a negative errno.
				1090	*/
				1091	static int semctl_down(struct ipc_namespace *ns, int semid,
				1092	int cmd, int version, union semun arg)
				1093	{
				1094	struct sem_array *sma;
				1095	int err;
				1096	struct semid64_ds semid64;
				1097	struct kern_ipc_perm *ipcp;
				1098
				1099	if(cmd == IPC_SET) {
				1100	if (copy_semid_from_user(&semid64, arg.buf, version))
				1101	return -EFAULT;
				1102	}
				1103
					/*
					* NOTE(review): presumably ipcctl_pre_down() returns with the
					* rw_mutex held for writing and the object locked, since both are
					* released at out_unlock/out_up below - confirm against ipc/util.c.
					*/
				1104	ipcp = ipcctl_pre_down(ns, &sem_ids(ns), semid, cmd,
				1105	&semid64.sem_perm, 0);
				1106	if (IS_ERR(ipcp))
				1107	return PTR_ERR(ipcp);
				1108
				1109	sma = container_of(ipcp, struct sem_array, sem_perm);
				1110
				1111	err = security_sem_semctl(sma, cmd);
				1112	if (err)
				1113	goto out_unlock;
				1114
				1115	switch(cmd){
				1116	case IPC_RMID:
					/* skips sem_unlock(): the array is handed to freeary() here */
				1117	freeary(ns, ipcp);
				1118	goto out_up;
				1119	case IPC_SET:
				1120	ipc_update_perm(&semid64.sem_perm, ipcp);
				1121	sma->sem_ctime = get_seconds();
				1122	break;
				1123	default:
				1124	err = -EINVAL;
				1125	}
				1126
				1127	out_unlock:
				1128	sem_unlock(sma);
				1129	out_up:
				1130	up_write(&sem_ids(ns).rw_mutex);
				1131	return err;
				1132	}
				1133
					/*
					* semctl system call: validate semid, split the version out of cmd
					* with ipc_parse_version() (which receives &cmd), and dispatch to the
					* helper responsible for the command group:
					* - IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT -> semctl_nolock()
					* - GETALL, GETVAL, GETPID, GETNCNT, GETZCNT, SETVAL, SETALL
					* -> semctl_main()
					* - IPC_RMID, IPC_SET -> semctl_down()
					* A negative semid or any other command yields -EINVAL.
					*/
				1134	SYSCALL_DEFINE(semctl)(int semid, int semnum, int cmd, union semun arg)
				1135	{
				1136	int err = -EINVAL;
				1137	int version;
				1138	struct ipc_namespace *ns;
				1139
				1140	if (semid < 0)
				1141	return -EINVAL;
				1142
				1143	version = ipc_parse_version(&cmd);
				1144	ns = current->nsproxy->ipc_ns;
				1145
				1146	switch(cmd) {
				1147	case IPC_INFO:
				1148	case SEM_INFO:
				1149	case IPC_STAT:
				1150	case SEM_STAT:
				1151	err = semctl_nolock(ns, semid, cmd, version, arg);
				1152	return err;
				1153	case GETALL:
				1154	case GETVAL:
				1155	case GETPID:
				1156	case GETNCNT:
				1157	case GETZCNT:
				1158	case SETVAL:
				1159	case SETALL:
				1160	err = semctl_main(ns,semid,semnum,cmd,version,arg);
				1161	return err;
				1162	case IPC_RMID:
				1163	case IPC_SET:
				1164	err = semctl_down(ns, semid, cmd, version, arg);
				1165	return err;
				1166	default:
				1167	return -EINVAL;
				1168	}
				1169	}
				1170	#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
					/*
					* Entry point used when syscall wrappers are configured: forwards to
					* SYSC_semctl() with the integer arguments cast explicitly to int, and
					* sys_semctl is made an alias of this function.
					*/
				1171	asmlinkage long SyS_semctl(int semid, int semnum, int cmd, union semun arg)
				1172	{
				1173	return SYSC_semctl((int) semid, (int) semnum, (int) cmd, arg);
				1174	}
				1175	SYSCALL_ALIAS(sys_semctl, SyS_semctl);
				1176	#endif
				1177
				1178	/* If the task doesn't already have an undo_list, then allocate one
				1179	* here. We guarantee there is only one thread using this undo list,
				1180	* and current is THE ONE
				1181	*
				1182	* If this allocation and assignment succeeds, but later
				1183	* portions of this code fail, there is no need to free the sem_undo_list.
				1184	* Just let it stay associated with the task, and it'll be freed later
				1185	* at exit time.
				1186	*
				1187	* This can block, so callers must hold no locks.
					*
					* On success *undo_listp is set to current's undo list and 0 is
					* returned; -ENOMEM is returned if a new list cannot be allocated.
				1188	*/
				1189	static inline int get_undo_list(struct sem_undo_list **undo_listp)
				1190	{
				1191	struct sem_undo_list *undo_list;
				1192
				1193	undo_list = current->sysvsem.undo_list;
				1194	if (!undo_list) {
				1195	undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL);
				1196	if (undo_list == NULL)
				1197	return -ENOMEM;
					/* a fresh list starts with one reference and no undo entries */
				1198	spin_lock_init(&undo_list->lock);
				1199	atomic_set(&undo_list->refcnt, 1);
				1200	INIT_LIST_HEAD(&undo_list->list_proc);
				1201
				1202	current->sysvsem.undo_list = undo_list;
				1203	}
				1204	*undo_listp = undo_list;
				1205	return 0;
				1206	}
				1207
					/*
					* __lookup_undo - find the undo structure for semid in an undo list.
					*
					* Walks ulp->list_proc with the RCU list iterator and returns the
					* first entry whose semid matches, or NULL if there is none. The
					* list is not modified.
					*/
				1208	static struct sem_undo *__lookup_undo(struct sem_undo_list *ulp, int semid)
				1209	{
				1210	struct sem_undo *un;
				1211
				1212	list_for_each_entry_rcu(un, &ulp->list_proc, list_proc) {
				1213	if (un->semid == semid)
				1214	return un;
				1215	}
				1216	return NULL;
				1217	}
				1218
					/*
					* lookup_undo - find the undo structure for semid and move it to the
					* head of the list.
					*
					* Must be called with ulp->lock held (asserted below). A matching
					* entry is unlinked and re-inserted directly after the list head
					* before it is returned. Returns NULL if no entry matches.
					*/
				1219	static struct sem_undo *lookup_undo(struct sem_undo_list *ulp, int semid)
				1220	{
				1221	struct sem_undo *un;
				1222
				1223	assert_spin_locked(&ulp->lock);
				1224
				1225	un = __lookup_undo(ulp, semid);
				1226	if (un) {
				1227	list_del_rcu(&un->list_proc);
				1228	list_add_rcu(&un->list_proc, &ulp->list_proc);
				1229	}
				1230	return un;
				1231	}
				1232
				1233	/**
				1234	* find_alloc_undo - Lookup (and if not present create) undo array
				1235	* @ns: namespace
				1236	* @semid: semaphore array id
				1237	*
				1238	* The function looks up (and if not present creates) the undo structure.
				1239	* The size of the undo structure depends on the size of the semaphore
				1240	* array, thus the alloc path is not that straightforward.
				1241	* Lifetime-rules: sem_undo is rcu-protected, on success, the function
				1242	* performs a rcu_read_lock().
					*
					* Returns the undo structure, or an ERR_PTR() value on failure
					* (-ENOMEM, -EIDRM, or the error from get_undo_list() or
					* sem_lock_check()). No rcu_read_lock() is held on the error paths.
				1243	*/
				1244	static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid)
				1245	{
				1246	struct sem_array *sma;
				1247	struct sem_undo_list *ulp;
				1248	struct sem_undo *un, *new;
				1249	int nsems;
				1250	int error;
				1251
				1252	error = get_undo_list(&ulp);
				1253	if (error)
				1254	return ERR_PTR(error);
				1255
					/* fast path: the undo structure already exists; return with rcu_read_lock() held */
				1256	rcu_read_lock();
				1257	spin_lock(&ulp->lock);
				1258	un = lookup_undo(ulp, semid);
				1259	spin_unlock(&ulp->lock);
				1260	if (likely(un!=NULL))
				1261	goto out;
				1262	rcu_read_unlock();
				1263
				1264	/* no undo structure around - allocate one. */
				1265	/* step 1: figure out the size of the semaphore array */
				1266	sma = sem_lock_check(ns, semid);
				1267	if (IS_ERR(sma))
				1268	return ERR_CAST(sma);
				1269
				1270	nsems = sma->sem_nsems;
				1271	sem_getref_and_unlock(sma);
				1272
				1273	/* step 2: allocate new undo structure */
				1274	new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL);
				1275	if (!new) {
				1276	sem_putref(sma);
				1277	return ERR_PTR(-ENOMEM);
				1278	}
				1279
				1280	/* step 3: Acquire the lock on semaphore array */
				1281	sem_lock_and_putref(sma);
				1282	if (sma->sem_perm.deleted) {
				1283	sem_unlock(sma);
				1284	kfree(new);
				1285	un = ERR_PTR(-EIDRM);
				1286	goto out;
				1287	}
				1288	spin_lock(&ulp->lock);
				1289
				1290	/*
				1291	* step 4: check for races: did someone else allocate the undo struct?
				1292	*/
				1293	un = lookup_undo(ulp, semid);
				1294	if (un) {
				1295	kfree(new);
				1296	goto success;
				1297	}
				1298	/* step 5: initialize & link new undo structure */
					/* the semadj array lives directly behind the sem_undo header */
				1299	new->semadj = (short *) &new[1];
				1300	new->ulp = ulp;
				1301	new->semid = semid;
				1302	assert_spin_locked(&ulp->lock);
				1303	list_add_rcu(&new->list_proc, &ulp->list_proc);
				1304	assert_spin_locked(&sma->sem_perm.lock);
				1305	list_add(&new->list_id, &sma->list_id);
				1306	un = new;
				1307
				1308	success:
				1309	spin_unlock(&ulp->lock);
					/* enter the RCU read side before the array lock is dropped */
				1310	rcu_read_lock();
				1311	sem_unlock(sma);
				1312	out:
				1313	return un;
				1314	}
				1315
				1316
				1317	/**
				1318	* get_queue_result - Retrieve the result code from sem_queue
				1319	* @q: Pointer to queue structure
				1320	*
				1321	* Retrieve the return code from the pending queue. If IN_WAKEUP is found in
				1322	* q->status, then we must loop until the value is replaced with the final
				1323	* value: This may happen if a task is woken up by an unrelated event (e.g.
				1324	* signal) and in parallel the task is woken up by another task because it got
				1325	* the requested semaphores.
				1326	*
				1327	* The function can be called with or without holding the semaphore spinlock.
					*
					* Returns the first value of q->status that is not IN_WAKEUP.
				1328	*/
				1329	static int get_queue_result(struct sem_queue *q)
				1330	{
				1331	int error;
				1332
				1333	error = q->status;
					/* busy-wait, re-reading q->status, until it is no longer IN_WAKEUP */
				1334	while (unlikely(error == IN_WAKEUP)) {
				1335	cpu_relax();
				1336	error = q->status;
				1337	}
				1338
				1339	return error;
				1340	}
				1341
				1342
					/*
					* semtimedop system call: perform the nsops operations in tsops on the
					* semaphore array semid, sleeping (optionally bounded by timeout) if
					* they cannot be applied immediately.
					*
					* Outline:
					* - copy the operations (and the timeout, if any) from user space,
					* - scan them for the highest semaphore number, SEM_UNDO and altering
					* operations,
					* - look up or create the undo structure when SEM_UNDO is requested,
					* - lock the array, check permissions and try the operations,
					* - otherwise queue the task on the pending list(s) and sleep.
					*
					* Returns 0 on success or a negative errno; among others -EINVAL,
					* -E2BIG, -ENOMEM, -EFAULT, -EFBIG, -EACCES, -EIDRM, -EINTR, and
					* -EAGAIN when the timeout expired.
					*/
				1343	SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
				1344	unsigned, nsops, const struct timespec __user *, timeout)
				1345	{
				1346	int error = -EINVAL;
				1347	struct sem_array *sma;
				1348	struct sembuf fast_sops[SEMOPM_FAST];
				1349	struct sembuf* sops = fast_sops, *sop;
				1350	struct sem_undo *un;
				1351	int undos = 0, alter = 0, max;
				1352	struct sem_queue queue;
				1353	unsigned long jiffies_left = 0;
				1354	struct ipc_namespace *ns;
				1355	struct list_head tasks;
				1356
				1357	ns = current->nsproxy->ipc_ns;
				1358
				1359	if (nsops < 1 || semid < 0)
				1360	return -EINVAL;
				1361	if (nsops > ns->sc_semopm)
				1362	return -E2BIG;
					/* more operations than the on-stack array holds: use a heap buffer */
				1363	if(nsops > SEMOPM_FAST) {
				1364	sops = kmalloc(sizeof(*sops)*nsops,GFP_KERNEL);
				1365	if(sops==NULL)
				1366	return -ENOMEM;
				1367	}
				1368	if (copy_from_user (sops, tsops, nsops * sizeof(*tsops))) {
				1369	error=-EFAULT;
				1370	goto out_free;
				1371	}
				1372	if (timeout) {
				1373	struct timespec _timeout;
				1374	if (copy_from_user(&_timeout, timeout, sizeof(*timeout))) {
				1375	error = -EFAULT;
				1376	goto out_free;
				1377	}
				1378	if (_timeout.tv_sec < 0 || _timeout.tv_nsec < 0 ||
				1379	_timeout.tv_nsec >= 1000000000L) {
				1380	error = -EINVAL;
				1381	goto out_free;
				1382	}
				1383	jiffies_left = timespec_to_jiffies(&_timeout);
				1384	}
					/*
					* max: highest semaphore number referenced; undos: some operation
					* has SEM_UNDO set; alter: some operation has a non-zero sem_op.
					*/
				1385	max = 0;
				1386	for (sop = sops; sop < sops + nsops; sop++) {
				1387	if (sop->sem_num >= max)
				1388	max = sop->sem_num;
				1389	if (sop->sem_flg & SEM_UNDO)
				1390	undos = 1;
				1391	if (sop->sem_op != 0)
				1392	alter = 1;
				1393	}
				1394
				1395	if (undos) {
				1396	un = find_alloc_undo(ns, semid);
				1397	if (IS_ERR(un)) {
				1398	error = PTR_ERR(un);
				1399	goto out_free;
				1400	}
				1401	} else
				1402	un = NULL;
				1403
				1404	INIT_LIST_HEAD(&tasks);
				1405
				1406	sma = sem_lock_check(ns, semid);
				1407	if (IS_ERR(sma)) {
					/* find_alloc_undo() returned with rcu_read_lock() held */
				1408	if (un)
				1409	rcu_read_unlock();
				1410	error = PTR_ERR(sma);
				1411	goto out_free;
				1412	}
				1413
				1414	/*
				1415	* semid identifiers are not unique - find_alloc_undo may have
				1416	* allocated an undo structure, it was invalidated by an RMID
				1417	* and now a new array received the same id. Check and fail.
				1418	* This case can be detected checking un->semid. The existence of
				1419	* "un" itself is guaranteed by rcu.
				1420	*/
				1421	error = -EIDRM;
				1422	if (un) {
				1423	if (un->semid == -1) {
				1424	rcu_read_unlock();
				1425	goto out_unlock_free;
				1426	} else {
				1427	/*
				1428	* rcu lock can be released, "un" cannot disappear:
				1429	* - sem_lock is acquired, thus IPC_RMID is
				1430	* impossible.
				1431	* - exit_sem is impossible, it always operates on
				1432	* current (or a dead task).
				1433	*/
				1434
				1435	rcu_read_unlock();
				1436	}
				1437	}
				1438
				1439	error = -EFBIG;
				1440	if (max >= sma->sem_nsems)
				1441	goto out_unlock_free;
				1442
				1443	error = -EACCES;
				1444	if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO))
				1445	goto out_unlock_free;
				1446
				1447	error = security_sem_semop(sma, sops, nsops, alter);
				1448	if (error)
				1449	goto out_unlock_free;
				1450
					/* a result <= 0 (done or failed) finishes the call; a positive one means sleep */
				1451	error = try_atomic_semop (sma, sops, nsops, un, task_tgid_vnr(current));
				1452	if (error <= 0) {
				1453	if (alter && error == 0)
				1454	do_smart_update(sma, sops, nsops, 1, &tasks);
				1455
				1456	goto out_unlock_free;
				1457	}
				1458
				1459	/* We need to sleep on this operation, so we put the current
				1460	* task into the pending queue and go to sleep.
				1461	*/
				1462
				1463	queue.sops = sops;
				1464	queue.nsops = nsops;
				1465	queue.undo = un;
				1466	queue.pid = task_tgid_vnr(current);
				1467	queue.alter = alter;
					/* altering operations are queued at the tail, non-altering ones at the head */
				1468	if (alter)
				1469	list_add_tail(&queue.list, &sma->sem_pending);
				1470	else
				1471	list_add(&queue.list, &sma->sem_pending);
				1472
					/* a single operation is also queued on that semaphore's own pending list */
				1473	if (nsops == 1) {
				1474	struct sem *curr;
				1475	curr = &sma->sem_base[sops->sem_num];
				1476
				1477	if (alter)
				1478	list_add_tail(&queue.simple_list, &curr->sem_pending);
				1479	else
				1480	list_add(&queue.simple_list, &curr->sem_pending);
				1481	} else {
				1482	INIT_LIST_HEAD(&queue.simple_list);
				1483	sma->complex_count++;
				1484	}
				1485
				1486	queue.status = -EINTR;
				1487	queue.sleeper = current;
				1488
				1489	sleep_again:
				1490	current->state = TASK_INTERRUPTIBLE;
				1491	sem_unlock(sma);
				1492
				1493	if (timeout)
				1494	jiffies_left = schedule_timeout(jiffies_left);
				1495	else
				1496	schedule();
				1497
				1498	error = get_queue_result(&queue);
				1499
				1500	if (error != -EINTR) {
				1501	/* fast path: update_queue already obtained all requested
				1502	* resources.
				1503	* Perform a smp_mb(): User space could assume that semop()
				1504	* is a memory barrier: Without the mb(), the cpu could
				1505	* speculatively read in user space stale data that was
				1506	* overwritten by the previous owner of the semaphore.
				1507	*/
				1508	smp_mb();
				1509
				1510	goto out_free;
				1511	}
				1512
				1513	sma = sem_lock(ns, semid);
				1514
				1515	/*
				1516	* Wait until it's guaranteed that no wakeup_sem_queue_do() is ongoing.
				1517	*/
				1518	error = get_queue_result(&queue);
				1519
				1520	/*
				1521	* Array removed? If yes, leave without sem_unlock().
				1522	*/
				1523	if (IS_ERR(sma)) {
				1524	goto out_free;
				1525	}
				1526
				1527
				1528	/*
				1529	* If queue.status != -EINTR we are woken up by another process.
				1530	* Leave without unlink_queue(), but with sem_unlock().
				1531	*/
				1532
				1533	if (error != -EINTR) {
				1534	goto out_unlock_free;
				1535	}
				1536
				1537	/*
				1538	* If an interrupt occurred we have to clean up the queue
				1539	*/
				1540	if (timeout && jiffies_left == 0)
				1541	error = -EAGAIN;
				1542
				1543	/*
				1544	* If the wakeup was spurious, just retry
				1545	*/
				1546	if (error == -EINTR && !signal_pending(current))
				1547	goto sleep_again;
				1548
				1549	unlink_queue(sma, &queue);
				1550
				1551	out_unlock_free:
				1552	sem_unlock(sma);
				1553
				1554	wake_up_sem_queue_do(&tasks);
				1555	out_free:
				1556	if(sops != fast_sops)
				1557	kfree(sops);
				1558	return error;
				1559	}
				1560
					/* semop system call: semtimedop with a NULL timeout. */
				1561	SYSCALL_DEFINE3(semop, int, semid, struct sembuf __user *, tsops,
				1562	unsigned, nsops)
				1563	{
				1564	return sys_semtimedop(semid, tsops, nsops, NULL);
				1565	}
				1566
				1567	/* If CLONE_SYSVSEM is set, establish sharing of SEM_UNDO state between
				1568	* parent and child tasks.
					*
					* With CLONE_SYSVSEM, tsk is given current's undo list (allocated on
					* demand by get_undo_list()) and the list's reference count is
					* incremented. Without it, tsk starts with no undo list.
					*
					* Returns 0 on success or the error from get_undo_list().
				1569	*/
				1570
				1571	int copy_semundo(unsigned long clone_flags, struct task_struct *tsk)
				1572	{
				1573	struct sem_undo_list *undo_list;
				1574	int error;
				1575
				1576	if (clone_flags & CLONE_SYSVSEM) {
				1577	error = get_undo_list(&undo_list);
				1578	if (error)
				1579	return error;
				1580	atomic_inc(&undo_list->refcnt);
				1581	tsk->sysvsem.undo_list = undo_list;
				1582	} else
				1583	tsk->sysvsem.undo_list = NULL;
				1584
				1585	return 0;
				1586	}
				1587
				1588	/*
				1589	* add semadj values to semaphores, free undo structures.
				1590	* undo structures are not freed when semaphore arrays are destroyed
				1591	* so some of them may be out of date.
				1592	* IMPLEMENTATION NOTE: There is some confusion over whether the
				1593	* set of adjustments that needs to be done should be done in an atomic
				1594	* manner or not. That is, if we are attempting to decrement the semval
				1595	* should we queue up and wait until we can do so legally?
				1596	* The original implementation attempted to do this (queue and wait).
				1597	* The current implementation does not do so. The POSIX standard
				1598	* and SVID should be consulted to determine what behavior is mandated.
					*
					* Nothing is done unless tsk has an undo list and tsk drops the last
					* reference to it. Each loop iteration takes the first entry of the
					* list, applies its adjustments to the semaphore array and frees it;
					* the list itself is freed once it is empty.
				1599	*/
				1600	void exit_sem(struct task_struct *tsk)
				1601	{
				1602	struct sem_undo_list *ulp;
				1603
				1604	ulp = tsk->sysvsem.undo_list;
				1605	if (!ulp)
				1606	return;
				1607	tsk->sysvsem.undo_list = NULL;
				1608
				1609	if (!atomic_dec_and_test(&ulp->refcnt))
				1610	return;
				1611
				1612	for (;;) {
				1613	struct sem_array *sma;
				1614	struct sem_undo *un;
				1615	struct list_head tasks;
				1616	int semid;
				1617	int i;
				1618
					/* peek at the first entry; semid == -1 stands for "list is empty" */
				1619	rcu_read_lock();
				1620	un = list_entry_rcu(ulp->list_proc.next,
				1621	struct sem_undo, list_proc);
				1622	if (&un->list_proc == &ulp->list_proc)
				1623	semid = -1;
				1624	else
				1625	semid = un->semid;
				1626	rcu_read_unlock();
				1627
				1628	if (semid == -1)
				1629	break;
				1630
					/*
					* Use the id captured under rcu_read_lock(): "un" must not be
					* dereferenced here, it is no longer protected and is looked up
					* again below once the array lock is held.
					*/
				1631	sma = sem_lock_check(tsk->nsproxy->ipc_ns, semid);
				1632
				1633	/* exit_sem raced with IPC_RMID, nothing to do */
				1634	if (IS_ERR(sma))
				1635	continue;
				1636
				1637	un = __lookup_undo(ulp, semid);
				1638	if (un == NULL) {
				1639	/* exit_sem raced with IPC_RMID+semget() that created
				1640	* exactly the same semid. Nothing to do.
				1641	*/
				1642	sem_unlock(sma);
				1643	continue;
				1644	}
				1645
				1646	/* remove un from the linked lists */
				1647	assert_spin_locked(&sma->sem_perm.lock);
				1648	list_del(&un->list_id);
				1649
				1650	spin_lock(&ulp->lock);
				1651	list_del_rcu(&un->list_proc);
				1652	spin_unlock(&ulp->lock);
				1653
				1654	/* perform adjustments registered in un */
				1655	for (i = 0; i < sma->sem_nsems; i++) {
				1656	struct sem * semaphore = &sma->sem_base[i];
				1657	if (un->semadj[i]) {
				1658	semaphore->semval += un->semadj[i];
				1659	/*
				1660	* Range checks of the new semaphore value,
				1661	* not defined by sus:
				1662	* - Some unices ignore the undo entirely
				1663	* (e.g. HP UX 11i 11.22, Tru64 V5.1)
				1664	* - some cap the value (e.g. FreeBSD caps
				1665	* at 0, but doesn't enforce SEMVMX)
				1666	*
				1667	* Linux caps the semaphore value, both at 0
				1668	* and at SEMVMX.
				1669	*
				1670	* Manfred <manfred@colorfullife.com>
				1671	*/
				1672	if (semaphore->semval < 0)
				1673	semaphore->semval = 0;
				1674	if (semaphore->semval > SEMVMX)
				1675	semaphore->semval = SEMVMX;
				1676	semaphore->sempid = task_tgid_vnr(current);
				1677	}
				1678	}
				1679	/* maybe some queued-up processes were waiting for this */
				1680	INIT_LIST_HEAD(&tasks);
				1681	do_smart_update(sma, NULL, 0, 1, &tasks);
				1682	sem_unlock(sma);
				1683	wake_up_sem_queue_do(&tasks);
				1684
				1685	kfree_rcu(un, rcu);
				1686	}
				1687	kfree(ulp);
				1688	}
				1689
				1690	#ifdef CONFIG_PROC_FS
					/*
					* Print one semaphore array as a single line into seq_file s. "it" is
					* the struct sem_array to show. The columns are key, id, mode (octal),
					* nsems, uid, gid, cuid, cgid, otime and ctime. Returns the result of
					* seq_printf().
					*/
				1691	static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
				1692	{
				1693	struct sem_array *sma = it;
				1694
				1695	return seq_printf(s,
				1696	"%10d %10d %4o %10u %5u %5u %5u %5u %10lu %10lu\n",
				1697	sma->sem_perm.key,
				1698	sma->sem_perm.id,
				1699	sma->sem_perm.mode,
				1700	sma->sem_nsems,
				1701	sma->sem_perm.uid,
				1702	sma->sem_perm.gid,
				1703	sma->sem_perm.cuid,
				1704	sma->sem_perm.cgid,
				1705	sma->sem_otime,
				1706	sma->sem_ctime);
				1707	}
				1708	#endif