Blame - marvell/linux/kernel/seccomp.c - T108

blob: 15497dd2fa7d71f6e259bf35d60ed520329f2f6b [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* linux/kernel/seccomp.c
				4	*
				5	* Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com>
				6	*
				7	* Copyright (C) 2012 Google, Inc.
				8	* Will Drewry <wad@chromium.org>
				9	*
				10	* This defines a simple but solid secure-computing facility.
				11	*
				12	* Mode 1 uses a fixed list of allowed system calls.
				13	* Mode 2 allows user-defined system call filters in the form
				14	* of Berkeley Packet Filters/Linux Socket Filters.
				15	*/
				16	#define pr_fmt(fmt) "seccomp: " fmt
				17
				18	#include <linux/refcount.h>
				19	#include <linux/audit.h>
				20	#include <linux/compat.h>
				21	#include <linux/coredump.h>
				22	#include <linux/kmemleak.h>
				23	#include <linux/nospec.h>
				24	#include <linux/prctl.h>
				25	#include <linux/sched.h>
				26	#include <linux/sched/task_stack.h>
				27	#include <linux/seccomp.h>
				28	#include <linux/slab.h>
				29	#include <linux/syscalls.h>
				30	#include <linux/sysctl.h>
				31
				32	/* Not exposed in headers: strictly internal use only. */
				33	#define SECCOMP_MODE_DEAD (SECCOMP_MODE_FILTER + 1)
				34
				35	#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
				36	#include <asm/syscall.h>
				37	#endif
				38
				39	#ifdef CONFIG_SECCOMP_FILTER
				40	#include <linux/file.h>
				41	#include <linux/filter.h>
				42	#include <linux/pid.h>
				43	#include <linux/ptrace.h>
				44	#include <linux/capability.h>
				45	#include <linux/tracehook.h>
				46	#include <linux/uaccess.h>
				47	#include <linux/anon_inodes.h>
				48	#include <linux/lockdep.h>
				49
				50	/*
				51	* When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced, it had the
				52	* wrong direction flag in the ioctl number. This is the broken one,
				53	* which the kernel needs to keep supporting until all userspaces stop
				54	* using the wrong command number.
				55	*/
				56	#define SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR SECCOMP_IOR(2, __u64)
				57
				58	enum notify_state {
				59	SECCOMP_NOTIFY_INIT,
				60	SECCOMP_NOTIFY_SENT,
				61	SECCOMP_NOTIFY_REPLIED,
				62	};
				63
				64	struct seccomp_knotif {
				65	/* The struct pid of the task whose filter triggered the notification */
				66	struct task_struct *task;
				67
				68	/* The "cookie" for this request; this is unique for this filter. */
				69	u64 id;
				70
				71	/*
				72	* The seccomp data. This pointer is valid the entire time this
				73	* notification is active, since it comes from __seccomp_filter which
				74	* eclipses the entire lifecycle here.
				75	*/
				76	const struct seccomp_data *data;
				77
				78	/*
				79	* Notification states. When SECCOMP_RET_USER_NOTIF is returned, a
				80	* struct seccomp_knotif is created and starts out in INIT. Once the
				81	* handler reads the notification off of an FD, it transitions to SENT.
				82	* If a signal is received the state transitions back to INIT and
				83	* another message is sent. When the userspace handler replies, state
				84	* transitions to REPLIED.
				85	*/
				86	enum notify_state state;
				87
				88	/* The return values, only valid when in SECCOMP_NOTIFY_REPLIED */
				89	int error;
				90	long val;
				91	u32 flags;
				92
				93	/* Signals when this has entered SECCOMP_NOTIFY_REPLIED */
				94	struct completion ready;
				95
				96	struct list_head list;
				97	};
				98
				99	/**
				100	* struct notification - container for seccomp userspace notifications. Since
				101	* most seccomp filters will not have notification listeners attached and this
				102	* structure is fairly large, we store the notification-specific stuff in a
				103	* separate structure.
				104	*
				105	* @request: A semaphore that users of this notification can wait on for
				106	* changes. Actual reads and writes are still controlled with
				107	* filter->notify_lock.
				108	* @next_id: The id of the next request.
				109	* @notifications: A list of struct seccomp_knotif elements.
				110	*/
				111	struct notification {
				112	struct semaphore request;
				113	u64 next_id;
				114	struct list_head notifications;
				115	};
				116
				117	#ifdef SECCOMP_ARCH_NATIVE
				118	/**
				119	* struct action_cache - per-filter cache of seccomp actions per
				120	* arch/syscall pair
				121	*
				122	* @allow_native: A bitmap where each bit represents whether the
				123	* filter will always allow the syscall, for the
				124	* native architecture.
				125	* @allow_compat: A bitmap where each bit represents whether the
				126	* filter will always allow the syscall, for the
				127	* compat architecture.
				128	*/
				129	struct action_cache {
				130	DECLARE_BITMAP(allow_native, SECCOMP_ARCH_NATIVE_NR);
				131	#ifdef SECCOMP_ARCH_COMPAT
				132	DECLARE_BITMAP(allow_compat, SECCOMP_ARCH_COMPAT_NR);
				133	#endif
				134	};
				135	#else
				136	struct action_cache { };
				137
				138	static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter,
				139	const struct seccomp_data *sd)
				140	{
				141	return false;
				142	}
				143
				144	static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter)
				145	{
				146	}
				147	#endif /* SECCOMP_ARCH_NATIVE */
				148
				149	/**
				150	* struct seccomp_filter - container for seccomp BPF programs
				151	*
				152	* @refs: Reference count to manage the object lifetime.
				153	* A filter's reference count is incremented for each directly
				154	* attached task, once for the dependent filter, and if
				155	* requested for the user notifier. When @refs reaches zero,
				156	* the filter can be freed.
				157	* @users: A filter's @users count is incremented for each directly
				158	* attached task (filter installation, fork(), thread_sync),
				159	* and once for the dependent filter (tracked in filter->prev).
				160	* When it reaches zero it indicates that no direct or indirect
				161	* users of that filter exist. No new tasks can get associated with
				162	* this filter after reaching 0. The @users count is always smaller
				163	* or equal to @refs. Hence, reaching 0 for @users does not mean
				164	* the filter can be freed.
				165	* @cache: cache of arch/syscall mappings to actions
				166	* @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
				167	* @prev: points to a previously installed, or inherited, filter
				168	* @prog: the BPF program to evaluate
				169	* @notif: the struct that holds all notification related information
				170	* @notify_lock: A lock for all notification-related accesses.
				171	* @wqh: A wait queue for poll if a notifier is in use.
				172	*
				173	* seccomp_filter objects are organized in a tree linked via the @prev
				174	* pointer. For any task, it appears to be a singly-linked list starting
				175	* with current->seccomp.filter, the most recently attached or inherited filter.
				176	* However, multiple filters may share a @prev node, by way of fork(), which
				177	* results in a unidirectional tree existing in memory. This is similar to
				178	* how namespaces work.
				179	*
				180	* seccomp_filter objects should never be modified after being attached
				181	* to a task_struct (other than @refs).
				182	*/
				183	struct seccomp_filter {
				184	refcount_t refs;
				185	refcount_t users;
				186	bool log;
				187	struct action_cache cache;
				188	struct seccomp_filter *prev;
				189	struct bpf_prog *prog;
				190	struct notification *notif;
				191	struct mutex notify_lock;
				192	wait_queue_head_t wqh;
				193	};
				194
				195	/* Limit any path through the tree to 256KB worth of instructions. */
				196	#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))
				197
				198	/*
				199	* Endianness is explicitly ignored and left for BPF program authors to manage
				200	* as per the specific architecture.
				201	*/
				202	static void populate_seccomp_data(struct seccomp_data *sd)
				203	{
				204	/*
				205	* Instead of using current_pt_reg(), we're already doing the work
				206	* to safely fetch "current", so just use "task" everywhere below.
				207	*/
				208	struct task_struct *task = current;
				209	struct pt_regs *regs = task_pt_regs(task);
				210	unsigned long args[6];
				211
				212	sd->nr = syscall_get_nr(task, regs);
				213	sd->arch = syscall_get_arch(task);
				214	syscall_get_arguments(task, regs, args);
				215	sd->args[0] = args[0];
				216	sd->args[1] = args[1];
				217	sd->args[2] = args[2];
				218	sd->args[3] = args[3];
				219	sd->args[4] = args[4];
				220	sd->args[5] = args[5];
				221	sd->instruction_pointer = KSTK_EIP(task);
				222	}
				223
				224	/**
				225	* seccomp_check_filter - verify seccomp filter code
				226	* @filter: filter to verify
				227	* @flen: length of filter
				228	*
				229	* Takes a previously checked filter (by bpf_check_classic) and
				230	* redirects all filter code that loads struct sk_buff data
				231	* and related data through seccomp_bpf_load. It also
				232	* enforces length and alignment checking of those loads.
				233	*
				234	* Returns 0 if the rule set is legal or -EINVAL if not.
				235	*/
				236	static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
				237	{
				238	int pc;
				239	for (pc = 0; pc < flen; pc++) {
				240	struct sock_filter *ftest = &filter[pc];
				241	u16 code = ftest->code;
				242	u32 k = ftest->k;
				243
				244	switch (code) {
				245	case BPF_LD \| BPF_W \| BPF_ABS:
				246	ftest->code = BPF_LDX \| BPF_W \| BPF_ABS;
				247	/* 32-bit aligned and not out of bounds. */
				248	if (k >= sizeof(struct seccomp_data) \|\| k & 3)
				249	return -EINVAL;
				250	continue;
				251	case BPF_LD \| BPF_W \| BPF_LEN:
				252	ftest->code = BPF_LD \| BPF_IMM;
				253	ftest->k = sizeof(struct seccomp_data);
				254	continue;
				255	case BPF_LDX \| BPF_W \| BPF_LEN:
				256	ftest->code = BPF_LDX \| BPF_IMM;
				257	ftest->k = sizeof(struct seccomp_data);
				258	continue;
				259	/* Explicitly include allowed calls. */
				260	case BPF_RET \| BPF_K:
				261	case BPF_RET \| BPF_A:
				262	case BPF_ALU \| BPF_ADD \| BPF_K:
				263	case BPF_ALU \| BPF_ADD \| BPF_X:
				264	case BPF_ALU \| BPF_SUB \| BPF_K:
				265	case BPF_ALU \| BPF_SUB \| BPF_X:
				266	case BPF_ALU \| BPF_MUL \| BPF_K:
				267	case BPF_ALU \| BPF_MUL \| BPF_X:
				268	case BPF_ALU \| BPF_DIV \| BPF_K:
				269	case BPF_ALU \| BPF_DIV \| BPF_X:
				270	case BPF_ALU \| BPF_AND \| BPF_K:
				271	case BPF_ALU \| BPF_AND \| BPF_X:
				272	case BPF_ALU \| BPF_OR \| BPF_K:
				273	case BPF_ALU \| BPF_OR \| BPF_X:
				274	case BPF_ALU \| BPF_XOR \| BPF_K:
				275	case BPF_ALU \| BPF_XOR \| BPF_X:
				276	case BPF_ALU \| BPF_LSH \| BPF_K:
				277	case BPF_ALU \| BPF_LSH \| BPF_X:
				278	case BPF_ALU \| BPF_RSH \| BPF_K:
				279	case BPF_ALU \| BPF_RSH \| BPF_X:
				280	case BPF_ALU \| BPF_NEG:
				281	case BPF_LD \| BPF_IMM:
				282	case BPF_LDX \| BPF_IMM:
				283	case BPF_MISC \| BPF_TAX:
				284	case BPF_MISC \| BPF_TXA:
				285	case BPF_LD \| BPF_MEM:
				286	case BPF_LDX \| BPF_MEM:
				287	case BPF_ST:
				288	case BPF_STX:
				289	case BPF_JMP \| BPF_JA:
				290	case BPF_JMP \| BPF_JEQ \| BPF_K:
				291	case BPF_JMP \| BPF_JEQ \| BPF_X:
				292	case BPF_JMP \| BPF_JGE \| BPF_K:
				293	case BPF_JMP \| BPF_JGE \| BPF_X:
				294	case BPF_JMP \| BPF_JGT \| BPF_K:
				295	case BPF_JMP \| BPF_JGT \| BPF_X:
				296	case BPF_JMP \| BPF_JSET \| BPF_K:
				297	case BPF_JMP \| BPF_JSET \| BPF_X:
				298	continue;
				299	default:
				300	return -EINVAL;
				301	}
				302	}
				303	return 0;
				304	}
				305
				306	#ifdef SECCOMP_ARCH_NATIVE
				307	static inline bool seccomp_cache_check_allow_bitmap(const void *bitmap,
				308	size_t bitmap_size,
				309	int syscall_nr)
				310	{
				311	if (unlikely(syscall_nr < 0 \|\| syscall_nr >= bitmap_size))
				312	return false;
				313	syscall_nr = array_index_nospec(syscall_nr, bitmap_size);
				314
				315	return test_bit(syscall_nr, bitmap);
				316	}
				317
				318	/**
				319	* seccomp_cache_check_allow - lookup seccomp cache
				320	* @sfilter: The seccomp filter
				321	* @sd: The seccomp data to lookup the cache with
				322	*
				323	* Returns true if the seccomp_data is cached and allowed.
				324	*/
				325	static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter,
				326	const struct seccomp_data *sd)
				327	{
				328	int syscall_nr = sd->nr;
				329	const struct action_cache *cache = &sfilter->cache;
				330
				331	#ifndef SECCOMP_ARCH_COMPAT
				332	/* A native-only architecture doesn't need to check sd->arch. */
				333	return seccomp_cache_check_allow_bitmap(cache->allow_native,
				334	SECCOMP_ARCH_NATIVE_NR,
				335	syscall_nr);
				336	#else
				337	if (likely(sd->arch == SECCOMP_ARCH_NATIVE))
				338	return seccomp_cache_check_allow_bitmap(cache->allow_native,
				339	SECCOMP_ARCH_NATIVE_NR,
				340	syscall_nr);
				341	if (likely(sd->arch == SECCOMP_ARCH_COMPAT))
				342	return seccomp_cache_check_allow_bitmap(cache->allow_compat,
				343	SECCOMP_ARCH_COMPAT_NR,
				344	syscall_nr);
				345	#endif /* SECCOMP_ARCH_COMPAT */
				346
				347	WARN_ON_ONCE(true);
				348	return false;
				349	}
				350	#endif /* SECCOMP_ARCH_NATIVE */
				351
				352	/**
				353	* seccomp_run_filters - evaluates all seccomp filters against @sd
				354	* @sd: optional seccomp data to be passed to filters
				355	* @match: stores struct seccomp_filter that resulted in the return value,
				356	* unless filter returned SECCOMP_RET_ALLOW, in which case it will
				357	* be unchanged.
				358	*
				359	* Returns valid seccomp BPF response codes.
				360	*/
				361	#define ACTION_ONLY(ret) ((s32)((ret) & (SECCOMP_RET_ACTION_FULL)))
				362	static u32 seccomp_run_filters(const struct seccomp_data *sd,
				363	struct seccomp_filter **match)
				364	{
				365	u32 ret = SECCOMP_RET_ALLOW;
				366	/* Make sure cross-thread synced filter points somewhere sane. */
				367	struct seccomp_filter *f =
				368	READ_ONCE(current->seccomp.filter);
				369
				370	/* Ensure unexpected behavior doesn't result in failing open. */
				371	if (WARN_ON(f == NULL))
				372	return SECCOMP_RET_KILL_PROCESS;
				373
				374	if (seccomp_cache_check_allow(f, sd))
				375	return SECCOMP_RET_ALLOW;
				376
				377	/*
				378	* All filters in the list are evaluated and the lowest BPF return
				379	* value always takes priority (ignoring the DATA).
				380	*/
				381	preempt_disable();
				382	for (; f; f = f->prev) {
				383	u32 cur_ret = BPF_PROG_RUN(f->prog, sd);
				384
				385	if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) {
				386	ret = cur_ret;
				387	*match = f;
				388	}
				389	}
				390	preempt_enable();
				391	return ret;
				392	}
				393	#endif /* CONFIG_SECCOMP_FILTER */
				394
				395	static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
				396	{
				397	assert_spin_locked(&current->sighand->siglock);
				398
				399	if (current->seccomp.mode && current->seccomp.mode != seccomp_mode)
				400	return false;
				401
				402	return true;
				403	}
				404
				405	void __weak arch_seccomp_spec_mitigate(struct task_struct *task) { }
				406
				407	static inline void seccomp_assign_mode(struct task_struct *task,
				408	unsigned long seccomp_mode,
				409	unsigned long flags)
				410	{
				411	assert_spin_locked(&task->sighand->siglock);
				412
				413	task->seccomp.mode = seccomp_mode;
				414	/*
				415	* Make sure TIF_SECCOMP cannot be set before the mode (and
				416	* filter) is set.
				417	*/
				418	smp_mb__before_atomic();
				419	/* Assume default seccomp processes want spec flaw mitigation. */
				420	if ((flags & SECCOMP_FILTER_FLAG_SPEC_ALLOW) == 0)
				421	arch_seccomp_spec_mitigate(task);
				422	set_tsk_thread_flag(task, TIF_SECCOMP);
				423	}
				424
				425	#ifdef CONFIG_SECCOMP_FILTER
				426	/* Returns 1 if the parent is an ancestor of the child. */
				427	static int is_ancestor(struct seccomp_filter *parent,
				428	struct seccomp_filter *child)
				429	{
				430	/* NULL is the root ancestor. */
				431	if (parent == NULL)
				432	return 1;
				433	for (; child; child = child->prev)
				434	if (child == parent)
				435	return 1;
				436	return 0;
				437	}
				438
				439	/**
				440	* seccomp_can_sync_threads: checks if all threads can be synchronized
				441	*
				442	* Expects sighand and cred_guard_mutex locks to be held.
				443	*
				444	* Returns 0 on success, -ve on error, or the pid of a thread which was
				445	* either not in the correct seccomp mode or did not have an ancestral
				446	* seccomp filter.
				447	*/
				448	static inline pid_t seccomp_can_sync_threads(void)
				449	{
				450	struct task_struct thread, caller;
				451
				452	BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
				453	assert_spin_locked(&current->sighand->siglock);
				454
				455	/* Validate all threads being eligible for synchronization. */
				456	caller = current;
				457	for_each_thread(caller, thread) {
				458	pid_t failed;
				459
				460	/* Skip current, since it is initiating the sync. */
				461	if (thread == caller)
				462	continue;
				463
				464	if (thread->seccomp.mode == SECCOMP_MODE_DISABLED \|\|
				465	(thread->seccomp.mode == SECCOMP_MODE_FILTER &&
				466	is_ancestor(thread->seccomp.filter,
				467	caller->seccomp.filter)))
				468	continue;
				469
				470	/* Return the first thread that cannot be synchronized. */
				471	failed = task_pid_vnr(thread);
				472	/* If the pid cannot be resolved, then return -ESRCH */
				473	if (WARN_ON(failed == 0))
				474	failed = -ESRCH;
				475	return failed;
				476	}
				477
				478	return 0;
				479	}
				480
				481	static inline void seccomp_filter_free(struct seccomp_filter *filter)
				482	{
				483	if (filter) {
				484	bpf_prog_destroy(filter->prog);
				485	kfree(filter);
				486	}
				487	}
				488
				489	static void __seccomp_filter_orphan(struct seccomp_filter *orig)
				490	{
				491	while (orig && refcount_dec_and_test(&orig->users)) {
				492	if (waitqueue_active(&orig->wqh))
				493	wake_up_poll(&orig->wqh, EPOLLHUP);
				494	orig = orig->prev;
				495	}
				496	}
				497
				498	static void __put_seccomp_filter(struct seccomp_filter *orig)
				499	{
				500	/* Clean up single-reference branches iteratively. */
				501	while (orig && refcount_dec_and_test(&orig->refs)) {
				502	struct seccomp_filter *freeme = orig;
				503	orig = orig->prev;
				504	seccomp_filter_free(freeme);
				505	}
				506	}
				507
				508	static void __seccomp_filter_release(struct seccomp_filter *orig)
				509	{
				510	/* Notify about any unused filters in the task's former filter tree. */
				511	__seccomp_filter_orphan(orig);
				512	/* Finally drop all references to the task's former tree. */
				513	__put_seccomp_filter(orig);
				514	}
				515
				516	/**
				517	* seccomp_filter_release - Detach the task from its filter tree,
				518	* drop its reference count, and notify
				519	* about unused filters
				520	*
				521	* This function should only be called when the task is exiting as
				522	* it detaches it from its filter tree. As such, READ_ONCE() and
				523	* barriers are not needed here, as would normally be needed.
				524	*/
				525	void seccomp_filter_release(struct task_struct *tsk)
				526	{
				527	struct seccomp_filter *orig = tsk->seccomp.filter;
				528
				529	/* Detach task from its filter tree. */
				530	tsk->seccomp.filter = NULL;
				531	__seccomp_filter_release(orig);
				532	}
				533
				534	/**
				535	* seccomp_sync_threads: sets all threads to use current's filter
				536	*
				537	* Expects sighand and cred_guard_mutex locks to be held, and for
				538	* seccomp_can_sync_threads() to have returned success already
				539	* without dropping the locks.
				540	*
				541	*/
				542	static inline void seccomp_sync_threads(unsigned long flags)
				543	{
				544	struct task_struct thread, caller;
				545
				546	BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
				547	assert_spin_locked(&current->sighand->siglock);
				548
				549	/* Synchronize all threads. */
				550	caller = current;
				551	for_each_thread(caller, thread) {
				552	/* Skip current, since it needs no changes. */
				553	if (thread == caller)
				554	continue;
				555
				556	/* Get a task reference for the new leaf node. */
				557	get_seccomp_filter(caller);
				558
				559	/*
				560	* Drop the task reference to the shared ancestor since
				561	* current's path will hold a reference. (This also
				562	* allows a put before the assignment.)
				563	*/
				564	__seccomp_filter_release(thread->seccomp.filter);
				565
				566	/* Make our new filter tree visible. */
				567	smp_store_release(&thread->seccomp.filter,
				568	caller->seccomp.filter);
				569	atomic_set(&thread->seccomp.filter_count,
				570	atomic_read(&caller->seccomp.filter_count));
				571
				572	/*
				573	* Don't let an unprivileged task work around
				574	* the no_new_privs restriction by creating
				575	* a thread that sets it up, enters seccomp,
				576	* then dies.
				577	*/
				578	if (task_no_new_privs(caller))
				579	task_set_no_new_privs(thread);
				580
				581	/*
				582	* Opt the other thread into seccomp if needed.
				583	* As threads are considered to be trust-realm
				584	* equivalent (see ptrace_may_access), it is safe to
				585	* allow one thread to transition the other.
				586	*/
				587	if (thread->seccomp.mode == SECCOMP_MODE_DISABLED)
				588	seccomp_assign_mode(thread, SECCOMP_MODE_FILTER,
				589	flags);
				590	}
				591	}
				592
				593	/**
				594	* seccomp_prepare_filter: Prepares a seccomp filter for use.
				595	* @fprog: BPF program to install
				596	*
				597	* Returns filter on success or an ERR_PTR on failure.
				598	*/
				599	static struct seccomp_filter seccomp_prepare_filter(struct sock_fprog fprog)
				600	{
				601	struct seccomp_filter *sfilter;
				602	int ret;
				603	const bool save_orig =
				604	#if defined(CONFIG_CHECKPOINT_RESTORE) \|\| defined(SECCOMP_ARCH_NATIVE)
				605	true;
				606	#else
				607	false;
				608	#endif
				609
				610	if (fprog->len == 0 \|\| fprog->len > BPF_MAXINSNS)
				611	return ERR_PTR(-EINVAL);
				612
				613	BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));
				614
				615	/*
				616	* Installing a seccomp filter requires that the task has
				617	* CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
				618	* This avoids scenarios where unprivileged tasks can affect the
				619	* behavior of privileged children.
				620	*/
				621	if (!task_no_new_privs(current) &&
				622	!ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
				623	return ERR_PTR(-EACCES);
				624
				625	/* Allocate a new seccomp_filter */
				626	sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL \| __GFP_NOWARN);
				627	if (!sfilter)
				628	return ERR_PTR(-ENOMEM);
				629
				630	mutex_init(&sfilter->notify_lock);
				631	ret = bpf_prog_create_from_user(&sfilter->prog, fprog,
				632	seccomp_check_filter, save_orig);
				633	if (ret < 0) {
				634	kfree(sfilter);
				635	return ERR_PTR(ret);
				636	}
				637
				638	refcount_set(&sfilter->refs, 1);
				639	refcount_set(&sfilter->users, 1);
				640	init_waitqueue_head(&sfilter->wqh);
				641
				642	return sfilter;
				643	}
				644
				645	/**
				646	* seccomp_prepare_user_filter - prepares a user-supplied sock_fprog
				647	* @user_filter: pointer to the user data containing a sock_fprog.
				648	*
				649	* Returns 0 on success and non-zero otherwise.
				650	*/
				651	static struct seccomp_filter *
				652	seccomp_prepare_user_filter(const char __user *user_filter)
				653	{
				654	struct sock_fprog fprog;
				655	struct seccomp_filter *filter = ERR_PTR(-EFAULT);
				656
				657	#ifdef CONFIG_COMPAT
				658	if (in_compat_syscall()) {
				659	struct compat_sock_fprog fprog32;
				660	if (copy_from_user(&fprog32, user_filter, sizeof(fprog32)))
				661	goto out;
				662	fprog.len = fprog32.len;
				663	fprog.filter = compat_ptr(fprog32.filter);
				664	} else /* falls through to the if below. */
				665	#endif
				666	if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
				667	goto out;
				668	filter = seccomp_prepare_filter(&fprog);
				669	out:
				670	return filter;
				671	}
				672
				673	#ifdef SECCOMP_ARCH_NATIVE
				674	/**
				675	* seccomp_is_const_allow - check if filter is constant allow with given data
				676	* @fprog: The BPF programs
				677	* @sd: The seccomp data to check against, only syscall number and arch
				678	* number are considered constant.
				679	*/
				680	static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog,
				681	struct seccomp_data *sd)
				682	{
				683	unsigned int reg_value = 0;
				684	unsigned int pc;
				685	bool op_res;
				686
				687	if (WARN_ON_ONCE(!fprog))
				688	return false;
				689
				690	for (pc = 0; pc < fprog->len; pc++) {
				691	struct sock_filter *insn = &fprog->filter[pc];
				692	u16 code = insn->code;
				693	u32 k = insn->k;
				694
				695	switch (code) {
				696	case BPF_LD \| BPF_W \| BPF_ABS:
				697	switch (k) {
				698	case offsetof(struct seccomp_data, nr):
				699	reg_value = sd->nr;
				700	break;
				701	case offsetof(struct seccomp_data, arch):
				702	reg_value = sd->arch;
				703	break;
				704	default:
				705	/* can't optimize (non-constant value load) */
				706	return false;
				707	}
				708	break;
				709	case BPF_RET \| BPF_K:
				710	/* reached return with constant values only, check allow */
				711	return k == SECCOMP_RET_ALLOW;
				712	case BPF_JMP \| BPF_JA:
				713	pc += insn->k;
				714	break;
				715	case BPF_JMP \| BPF_JEQ \| BPF_K:
				716	case BPF_JMP \| BPF_JGE \| BPF_K:
				717	case BPF_JMP \| BPF_JGT \| BPF_K:
				718	case BPF_JMP \| BPF_JSET \| BPF_K:
				719	switch (BPF_OP(code)) {
				720	case BPF_JEQ:
				721	op_res = reg_value == k;
				722	break;
				723	case BPF_JGE:
				724	op_res = reg_value >= k;
				725	break;
				726	case BPF_JGT:
				727	op_res = reg_value > k;
				728	break;
				729	case BPF_JSET:
				730	op_res = !!(reg_value & k);
				731	break;
				732	default:
				733	/* can't optimize (unknown jump) */
				734	return false;
				735	}
				736
				737	pc += op_res ? insn->jt : insn->jf;
				738	break;
				739	case BPF_ALU \| BPF_AND \| BPF_K:
				740	reg_value &= k;
				741	break;
				742	default:
				743	/* can't optimize (unknown insn) */
				744	return false;
				745	}
				746	}
				747
				748	/* ran off the end of the filter?! */
				749	WARN_ON(1);
				750	return false;
				751	}
				752
				753	static void seccomp_cache_prepare_bitmap(struct seccomp_filter *sfilter,
				754	void bitmap, const void bitmap_prev,
				755	size_t bitmap_size, int arch)
				756	{
				757	struct sock_fprog_kern *fprog = sfilter->prog->orig_prog;
				758	struct seccomp_data sd;
				759	int nr;
				760
				761	if (bitmap_prev) {
				762	/* The new filter must be as restrictive as the last. */
				763	bitmap_copy(bitmap, bitmap_prev, bitmap_size);
				764	} else {
				765	/* Before any filters, all syscalls are always allowed. */
				766	bitmap_fill(bitmap, bitmap_size);
				767	}
				768
				769	for (nr = 0; nr < bitmap_size; nr++) {
				770	/* No bitmap change: not a cacheable action. */
				771	if (!test_bit(nr, bitmap))
				772	continue;
				773
				774	sd.nr = nr;
				775	sd.arch = arch;
				776
				777	/* No bitmap change: continue to always allow. */
				778	if (seccomp_is_const_allow(fprog, &sd))
				779	continue;
				780
				781	/*
				782	* Not a cacheable action: always run filters.
				783	* atomic clear_bit() not needed, filter not visible yet.
				784	*/
				785	__clear_bit(nr, bitmap);
				786	}
				787	}
				788
				789	/**
				790	* seccomp_cache_prepare - emulate the filter to find cachable syscalls
				791	* @sfilter: The seccomp filter
				792	*
				793	* Returns 0 if successful or -errno if error occurred.
				794	*/
				795	static void seccomp_cache_prepare(struct seccomp_filter *sfilter)
				796	{
				797	struct action_cache *cache = &sfilter->cache;
				798	const struct action_cache *cache_prev =
				799	sfilter->prev ? &sfilter->prev->cache : NULL;
				800
				801	seccomp_cache_prepare_bitmap(sfilter, cache->allow_native,
				802	cache_prev ? cache_prev->allow_native : NULL,
				803	SECCOMP_ARCH_NATIVE_NR,
				804	SECCOMP_ARCH_NATIVE);
				805
				806	#ifdef SECCOMP_ARCH_COMPAT
				807	seccomp_cache_prepare_bitmap(sfilter, cache->allow_compat,
				808	cache_prev ? cache_prev->allow_compat : NULL,
				809	SECCOMP_ARCH_COMPAT_NR,
				810	SECCOMP_ARCH_COMPAT);
				811	#endif /* SECCOMP_ARCH_COMPAT */
				812	}
				813	#endif /* SECCOMP_ARCH_NATIVE */
				814
				815	/**
				816	* seccomp_attach_filter: validate and attach filter
				817	* @flags: flags to change filter behavior
				818	* @filter: seccomp filter to add to the current process
				819	*
				820	* Caller must be holding current->sighand->siglock lock.
				821	*
				822	* Returns 0 on success, -ve on error, or
				823	* - in TSYNC mode: the pid of a thread which was either not in the correct
				824	* seccomp mode or did not have an ancestral seccomp filter
				825	* - in NEW_LISTENER mode: the fd of the new listener
				826	*/
				827	static long seccomp_attach_filter(unsigned int flags,
				828	struct seccomp_filter *filter)
				829	{
				830	unsigned long total_insns;
				831	struct seccomp_filter *walker;
				832
				833	assert_spin_locked(&current->sighand->siglock);
				834
				835	/* Validate resulting filter length. */
				836	total_insns = filter->prog->len;
				837	for (walker = current->seccomp.filter; walker; walker = walker->prev)
				838	total_insns += walker->prog->len + 4; /* 4 instr penalty */
				839	if (total_insns > MAX_INSNS_PER_PATH)
				840	return -ENOMEM;
				841
				842	/* If thread sync has been requested, check that it is possible. */
				843	if (flags & SECCOMP_FILTER_FLAG_TSYNC) {
				844	int ret;
				845
				846	ret = seccomp_can_sync_threads();
				847	if (ret)
				848	return ret;
				849	}
				850
				851	/* Set log flag, if present. */
				852	if (flags & SECCOMP_FILTER_FLAG_LOG)
				853	filter->log = true;
				854
				855	/*
				856	* If there is an existing filter, make it the prev and don't drop its
				857	* task reference.
				858	*/
				859	filter->prev = current->seccomp.filter;
				860	seccomp_cache_prepare(filter);
				861	current->seccomp.filter = filter;
				862	atomic_inc(&current->seccomp.filter_count);
				863
				864	/* Now that the new filter is in place, synchronize to all threads. */
				865	if (flags & SECCOMP_FILTER_FLAG_TSYNC)
				866	seccomp_sync_threads(flags);
				867
				868	return 0;
				869	}
				870
				871	static void __get_seccomp_filter(struct seccomp_filter *filter)
				872	{
				873	refcount_inc(&filter->refs);
				874	}
				875
				876	/* get_seccomp_filter - increments the reference count of the filter on @tsk */
				877	void get_seccomp_filter(struct task_struct *tsk)
				878	{
				879	struct seccomp_filter *orig = tsk->seccomp.filter;
				880	if (!orig)
				881	return;
				882	__get_seccomp_filter(orig);
				883	refcount_inc(&orig->users);
				884	}
				885
				886	static void seccomp_init_siginfo(kernel_siginfo_t *info, int syscall, int reason)
				887	{
				888	clear_siginfo(info);
				889	info->si_signo = SIGSYS;
				890	info->si_code = SYS_SECCOMP;
				891	info->si_call_addr = (void __user *)KSTK_EIP(current);
				892	info->si_errno = reason;
				893	info->si_arch = syscall_get_arch(current);
				894	info->si_syscall = syscall;
				895	}
				896
				897	/**
				898	* seccomp_send_sigsys - signals the task to allow in-process syscall emulation
				899	* @syscall: syscall number to send to userland
				900	* @reason: filter-supplied reason code to send to userland (via si_errno)
				901	*
				902	* Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info.
				903	*/
				904	static void seccomp_send_sigsys(int syscall, int reason)
				905	{
				906	struct kernel_siginfo info;
				907	seccomp_init_siginfo(&info, syscall, reason);
				908	force_sig_info(&info);
				909	}
				910	#endif /* CONFIG_SECCOMP_FILTER */
				911
				912	/* For use with seccomp_actions_logged */
				913	#define SECCOMP_LOG_KILL_PROCESS (1 << 0)
				914	#define SECCOMP_LOG_KILL_THREAD (1 << 1)
				915	#define SECCOMP_LOG_TRAP (1 << 2)
				916	#define SECCOMP_LOG_ERRNO (1 << 3)
				917	#define SECCOMP_LOG_TRACE (1 << 4)
				918	#define SECCOMP_LOG_LOG (1 << 5)
				919	#define SECCOMP_LOG_ALLOW (1 << 6)
				920	#define SECCOMP_LOG_USER_NOTIF (1 << 7)
				921
				922	static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_PROCESS \|
				923	SECCOMP_LOG_KILL_THREAD \|
				924	SECCOMP_LOG_TRAP \|
				925	SECCOMP_LOG_ERRNO \|
				926	SECCOMP_LOG_USER_NOTIF \|
				927	SECCOMP_LOG_TRACE \|
				928	SECCOMP_LOG_LOG;
				929
				930	static inline void seccomp_log(unsigned long syscall, long signr, u32 action,
				931	bool requested)
				932	{
				933	bool log = false;
				934
				935	switch (action) {
				936	case SECCOMP_RET_ALLOW:
				937	break;
				938	case SECCOMP_RET_TRAP:
				939	log = requested && seccomp_actions_logged & SECCOMP_LOG_TRAP;
				940	break;
				941	case SECCOMP_RET_ERRNO:
				942	log = requested && seccomp_actions_logged & SECCOMP_LOG_ERRNO;
				943	break;
				944	case SECCOMP_RET_TRACE:
				945	log = requested && seccomp_actions_logged & SECCOMP_LOG_TRACE;
				946	break;
				947	case SECCOMP_RET_USER_NOTIF:
				948	log = requested && seccomp_actions_logged & SECCOMP_LOG_USER_NOTIF;
				949	break;
				950	case SECCOMP_RET_LOG:
				951	log = seccomp_actions_logged & SECCOMP_LOG_LOG;
				952	break;
				953	case SECCOMP_RET_KILL_THREAD:
				954	log = seccomp_actions_logged & SECCOMP_LOG_KILL_THREAD;
				955	break;
				956	case SECCOMP_RET_KILL_PROCESS:
				957	default:
				958	log = seccomp_actions_logged & SECCOMP_LOG_KILL_PROCESS;
				959	}
				960
				961	/*
				962	* Emit an audit message when the action is RET_KILL_*, RET_LOG, or the
				963	* FILTER_FLAG_LOG bit was set. The admin has the ability to silence
				964	* any action from being logged by removing the action name from the
				965	* seccomp_actions_logged sysctl.
				966	*/
				967	if (!log)
				968	return;
				969
				970	audit_seccomp(syscall, signr, action);
				971	}
				972
				973	/*
				974	* Secure computing mode 1 allows only read/write/exit/sigreturn.
				975	* To be fully secure this must be combined with rlimit
				976	* to limit the stack allocations too.
				977	*/
				978	static const int mode1_syscalls[] = {
				979	__NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
				980	-1, /* negative terminated */
				981	};
				982
				983	static void __secure_computing_strict(int this_syscall)
				984	{
				985	const int *allowed_syscalls = mode1_syscalls;
				986	#ifdef CONFIG_COMPAT
				987	if (in_compat_syscall())
				988	allowed_syscalls = get_compat_mode1_syscalls();
				989	#endif
				990	do {
				991	if (*allowed_syscalls == this_syscall)
				992	return;
				993	} while (*++allowed_syscalls != -1);
				994
				995	#ifdef SECCOMP_DEBUG
				996	dump_stack();
				997	#endif
				998	current->seccomp.mode = SECCOMP_MODE_DEAD;
				999	seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL_THREAD, true);
				1000	do_exit(SIGKILL);
				1001	}
				1002
				1003	#ifndef CONFIG_HAVE_ARCH_SECCOMP_FILTER
				1004	void secure_computing_strict(int this_syscall)
				1005	{
				1006	int mode = current->seccomp.mode;
				1007
				1008	if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
				1009	unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
				1010	return;
				1011
				1012	if (mode == SECCOMP_MODE_DISABLED)
				1013	return;
				1014	else if (mode == SECCOMP_MODE_STRICT)
				1015	__secure_computing_strict(this_syscall);
				1016	else
				1017	BUG();
				1018	}
				1019	#else
				1020
				1021	#ifdef CONFIG_SECCOMP_FILTER
				1022	static u64 seccomp_next_notify_id(struct seccomp_filter *filter)
				1023	{
				1024	/*
				1025	* Note: overflow is ok here, the id just needs to be unique per
				1026	* filter.
				1027	*/
				1028	lockdep_assert_held(&filter->notify_lock);
				1029	return filter->notif->next_id++;
				1030	}
				1031
				1032	static int seccomp_do_user_notification(int this_syscall,
				1033	struct seccomp_filter *match,
				1034	const struct seccomp_data *sd)
				1035	{
				1036	int err;
				1037	u32 flags = 0;
				1038	long ret = 0;
				1039	struct seccomp_knotif n = {};
				1040
				1041	mutex_lock(&match->notify_lock);
				1042	err = -ENOSYS;
				1043	if (!match->notif)
				1044	goto out;
				1045
				1046	n.task = current;
				1047	n.state = SECCOMP_NOTIFY_INIT;
				1048	n.data = sd;
				1049	n.id = seccomp_next_notify_id(match);
				1050	init_completion(&n.ready);
				1051	list_add(&n.list, &match->notif->notifications);
				1052
				1053	up(&match->notif->request);
				1054	wake_up_poll(&match->wqh, EPOLLIN \| EPOLLRDNORM);
				1055	mutex_unlock(&match->notify_lock);
				1056
				1057	/*
				1058	* This is where we wait for a reply from userspace.
				1059	*/
				1060	err = wait_for_completion_interruptible(&n.ready);
				1061	mutex_lock(&match->notify_lock);
				1062	if (err == 0) {
				1063	ret = n.val;
				1064	err = n.error;
				1065	flags = n.flags;
				1066	}
				1067
				1068	/*
				1069	* Note that it's possible the listener died in between the time when
				1070	* we were notified of a respons (or a signal) and when we were able to
				1071	* re-acquire the lock, so only delete from the list if the
				1072	* notification actually exists.
				1073	*
				1074	* Also note that this test is only valid because there's no way to
				1075	* reattach to a notifier right now. If one is added, we'll need to
				1076	* keep track of the notif itself and make sure they match here.
				1077	*/
				1078	if (match->notif)
				1079	list_del(&n.list);
				1080	out:
				1081	mutex_unlock(&match->notify_lock);
				1082
				1083	/* Userspace requests to continue the syscall. */
				1084	if (flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE)
				1085	return 0;
				1086
				1087	syscall_set_return_value(current, current_pt_regs(),
				1088	err, ret);
				1089	return -1;
				1090	}
				1091
				1092	static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
				1093	const bool recheck_after_trace)
				1094	{
				1095	u32 filter_ret, action;
				1096	struct seccomp_filter *match = NULL;
				1097	int data;
				1098	struct seccomp_data sd_local;
				1099
				1100	/*
				1101	* Make sure that any changes to mode from another thread have
				1102	* been seen after TIF_SECCOMP was seen.
				1103	*/
				1104	rmb();
				1105
				1106	if (!sd) {
				1107	populate_seccomp_data(&sd_local);
				1108	sd = &sd_local;
				1109	}
				1110
				1111	filter_ret = seccomp_run_filters(sd, &match);
				1112	data = filter_ret & SECCOMP_RET_DATA;
				1113	action = filter_ret & SECCOMP_RET_ACTION_FULL;
				1114
				1115	switch (action) {
				1116	case SECCOMP_RET_ERRNO:
				1117	/* Set low-order bits as an errno, capped at MAX_ERRNO. */
				1118	if (data > MAX_ERRNO)
				1119	data = MAX_ERRNO;
				1120	syscall_set_return_value(current, current_pt_regs(),
				1121	-data, 0);
				1122	goto skip;
				1123
				1124	case SECCOMP_RET_TRAP:
				1125	/* Show the handler the original registers. */
				1126	syscall_rollback(current, current_pt_regs());
				1127	/* Let the filter pass back 16 bits of data. */
				1128	seccomp_send_sigsys(this_syscall, data);
				1129	goto skip;
				1130
				1131	case SECCOMP_RET_TRACE:
				1132	/* We've been put in this state by the ptracer already. */
				1133	if (recheck_after_trace)
				1134	return 0;
				1135
				1136	/* ENOSYS these calls if there is no tracer attached. */
				1137	if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
				1138	syscall_set_return_value(current,
				1139	current_pt_regs(),
				1140	-ENOSYS, 0);
				1141	goto skip;
				1142	}
				1143
				1144	/* Allow the BPF to provide the event message */
				1145	ptrace_event(PTRACE_EVENT_SECCOMP, data);
				1146	/*
				1147	* The delivery of a fatal signal during event
				1148	* notification may silently skip tracer notification,
				1149	* which could leave us with a potentially unmodified
				1150	* syscall that the tracer would have liked to have
				1151	* changed. Since the process is about to die, we just
				1152	* force the syscall to be skipped and let the signal
				1153	* kill the process and correctly handle any tracer exit
				1154	* notifications.
				1155	*/
				1156	if (fatal_signal_pending(current))
				1157	goto skip;
				1158	/* Check if the tracer forced the syscall to be skipped. */
				1159	this_syscall = syscall_get_nr(current, current_pt_regs());
				1160	if (this_syscall < 0)
				1161	goto skip;
				1162
				1163	/*
				1164	* Recheck the syscall, since it may have changed. This
				1165	* intentionally uses a NULL struct seccomp_data to force
				1166	* a reload of all registers. This does not goto skip since
				1167	* a skip would have already been reported.
				1168	*/
				1169	if (__seccomp_filter(this_syscall, NULL, true))
				1170	return -1;
				1171
				1172	return 0;
				1173
				1174	case SECCOMP_RET_USER_NOTIF:
				1175	if (seccomp_do_user_notification(this_syscall, match, sd))
				1176	goto skip;
				1177
				1178	return 0;
				1179
				1180	case SECCOMP_RET_LOG:
				1181	seccomp_log(this_syscall, 0, action, true);
				1182	return 0;
				1183
				1184	case SECCOMP_RET_ALLOW:
				1185	/*
				1186	* Note that the "match" filter will always be NULL for
				1187	* this action since SECCOMP_RET_ALLOW is the starting
				1188	* state in seccomp_run_filters().
				1189	*/
				1190	return 0;
				1191
				1192	case SECCOMP_RET_KILL_THREAD:
				1193	case SECCOMP_RET_KILL_PROCESS:
				1194	default:
				1195	current->seccomp.mode = SECCOMP_MODE_DEAD;
				1196	seccomp_log(this_syscall, SIGSYS, action, true);
				1197	/* Dump core only if this is the last remaining thread. */
				1198	if (action != SECCOMP_RET_KILL_THREAD \|\|
				1199	get_nr_threads(current) == 1) {
				1200	kernel_siginfo_t info;
				1201
				1202	/* Show the original registers in the dump. */
				1203	syscall_rollback(current, current_pt_regs());
				1204	/* Trigger a manual coredump since do_exit skips it. */
				1205	seccomp_init_siginfo(&info, this_syscall, data);
				1206	do_coredump(&info);
				1207	}
				1208	if (action == SECCOMP_RET_KILL_THREAD)
				1209	do_exit(SIGSYS);
				1210	else
				1211	do_group_exit(SIGSYS);
				1212	}
				1213
				1214	unreachable();
				1215
				1216	skip:
				1217	seccomp_log(this_syscall, 0, action, match ? match->log : false);
				1218	return -1;
				1219	}
				1220	#else
				1221	static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
				1222	const bool recheck_after_trace)
				1223	{
				1224	BUG();
				1225
				1226	return -1;
				1227	}
				1228	#endif
				1229
				1230	int __secure_computing(const struct seccomp_data *sd)
				1231	{
				1232	int mode = current->seccomp.mode;
				1233	int this_syscall;
				1234
				1235	if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
				1236	unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
				1237	return 0;
				1238
				1239	this_syscall = sd ? sd->nr :
				1240	syscall_get_nr(current, current_pt_regs());
				1241
				1242	switch (mode) {
				1243	case SECCOMP_MODE_STRICT:
				1244	__secure_computing_strict(this_syscall); /* may call do_exit */
				1245	return 0;
				1246	case SECCOMP_MODE_FILTER:
				1247	return __seccomp_filter(this_syscall, sd, false);
				1248	/* Surviving SECCOMP_RET_KILL_* must be proactively impossible. */
				1249	case SECCOMP_MODE_DEAD:
				1250	WARN_ON_ONCE(1);
				1251	do_exit(SIGKILL);
				1252	return -1;
				1253	default:
				1254	BUG();
				1255	}
				1256	}
				1257	#endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */
				1258
				1259	long prctl_get_seccomp(void)
				1260	{
				1261	return current->seccomp.mode;
				1262	}
				1263
				1264	/**
				1265	* seccomp_set_mode_strict: internal function for setting strict seccomp
				1266	*
				1267	* Once current->seccomp.mode is non-zero, it may not be changed.
				1268	*
				1269	* Returns 0 on success or -EINVAL on failure.
				1270	*/
				1271	static long seccomp_set_mode_strict(void)
				1272	{
				1273	const unsigned long seccomp_mode = SECCOMP_MODE_STRICT;
				1274	long ret = -EINVAL;
				1275
				1276	spin_lock_irq(&current->sighand->siglock);
				1277
				1278	if (!seccomp_may_assign_mode(seccomp_mode))
				1279	goto out;
				1280
				1281	#ifdef TIF_NOTSC
				1282	disable_TSC();
				1283	#endif
				1284	seccomp_assign_mode(current, seccomp_mode, 0);
				1285	ret = 0;
				1286
				1287	out:
				1288	spin_unlock_irq(&current->sighand->siglock);
				1289
				1290	return ret;
				1291	}
				1292
				1293	#ifdef CONFIG_SECCOMP_FILTER
				1294	static void seccomp_notify_free(struct seccomp_filter *filter)
				1295	{
				1296	kfree(filter->notif);
				1297	filter->notif = NULL;
				1298	}
				1299
				1300	static void seccomp_notify_detach(struct seccomp_filter *filter)
				1301	{
				1302	struct seccomp_knotif *knotif;
				1303
				1304	if (!filter)
				1305	return;
				1306
				1307	mutex_lock(&filter->notify_lock);
				1308
				1309	/*
				1310	* If this file is being closed because e.g. the task who owned it
				1311	* died, let's wake everyone up who was waiting on us.
				1312	*/
				1313	list_for_each_entry(knotif, &filter->notif->notifications, list) {
				1314	if (knotif->state == SECCOMP_NOTIFY_REPLIED)
				1315	continue;
				1316
				1317	knotif->state = SECCOMP_NOTIFY_REPLIED;
				1318	knotif->error = -ENOSYS;
				1319	knotif->val = 0;
				1320
				1321	complete(&knotif->ready);
				1322	}
				1323
				1324	seccomp_notify_free(filter);
				1325	mutex_unlock(&filter->notify_lock);
				1326	}
				1327
				1328	static int seccomp_notify_release(struct inode inode, struct file file)
				1329	{
				1330	struct seccomp_filter *filter = file->private_data;
				1331
				1332	seccomp_notify_detach(filter);
				1333	__put_seccomp_filter(filter);
				1334	return 0;
				1335	}
				1336
				1337	/* must be called with notif_lock held */
				1338	static inline struct seccomp_knotif *
				1339	find_notification(struct seccomp_filter *filter, u64 id)
				1340	{
				1341	struct seccomp_knotif *cur;
				1342
				1343	lockdep_assert_held(&filter->notify_lock);
				1344
				1345	list_for_each_entry(cur, &filter->notif->notifications, list) {
				1346	if (cur->id == id)
				1347	return cur;
				1348	}
				1349
				1350	return NULL;
				1351	}
				1352
				1353
				1354	static long seccomp_notify_recv(struct seccomp_filter *filter,
				1355	void __user *buf)
				1356	{
				1357	struct seccomp_knotif knotif = NULL, cur;
				1358	struct seccomp_notif unotif;
				1359	ssize_t ret;
				1360
				1361	/* Verify that we're not given garbage to keep struct extensible. */
				1362	ret = check_zeroed_user(buf, sizeof(unotif));
				1363	if (ret < 0)
				1364	return ret;
				1365	if (!ret)
				1366	return -EINVAL;
				1367
				1368	memset(&unotif, 0, sizeof(unotif));
				1369
				1370	ret = down_interruptible(&filter->notif->request);
				1371	if (ret < 0)
				1372	return ret;
				1373
				1374	mutex_lock(&filter->notify_lock);
				1375	list_for_each_entry(cur, &filter->notif->notifications, list) {
				1376	if (cur->state == SECCOMP_NOTIFY_INIT) {
				1377	knotif = cur;
				1378	break;
				1379	}
				1380	}
				1381
				1382	/*
				1383	* If we didn't find a notification, it could be that the task was
				1384	* interrupted by a fatal signal between the time we were woken and
				1385	* when we were able to acquire the rw lock.
				1386	*/
				1387	if (!knotif) {
				1388	ret = -ENOENT;
				1389	goto out;
				1390	}
				1391
				1392	unotif.id = knotif->id;
				1393	unotif.pid = task_pid_vnr(knotif->task);
				1394	unotif.data = *(knotif->data);
				1395
				1396	knotif->state = SECCOMP_NOTIFY_SENT;
				1397	wake_up_poll(&filter->wqh, EPOLLOUT \| EPOLLWRNORM);
				1398	ret = 0;
				1399	out:
				1400	mutex_unlock(&filter->notify_lock);
				1401
				1402	if (ret == 0 && copy_to_user(buf, &unotif, sizeof(unotif))) {
				1403	ret = -EFAULT;
				1404
				1405	/*
				1406	* Userspace screwed up. To make sure that we keep this
				1407	* notification alive, let's reset it back to INIT. It
				1408	* may have died when we released the lock, so we need to make
				1409	* sure it's still around.
				1410	*/
				1411	mutex_lock(&filter->notify_lock);
				1412	knotif = find_notification(filter, unotif.id);
				1413	if (knotif) {
				1414	knotif->state = SECCOMP_NOTIFY_INIT;
				1415	up(&filter->notif->request);
				1416	}
				1417	mutex_unlock(&filter->notify_lock);
				1418	}
				1419
				1420	return ret;
				1421	}
				1422
				1423	static long seccomp_notify_send(struct seccomp_filter *filter,
				1424	void __user *buf)
				1425	{
				1426	struct seccomp_notif_resp resp = {};
				1427	struct seccomp_knotif *knotif;
				1428	long ret;
				1429
				1430	if (copy_from_user(&resp, buf, sizeof(resp)))
				1431	return -EFAULT;
				1432
				1433	if (resp.flags & ~SECCOMP_USER_NOTIF_FLAG_CONTINUE)
				1434	return -EINVAL;
				1435
				1436	if ((resp.flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE) &&
				1437	(resp.error \|\| resp.val))
				1438	return -EINVAL;
				1439
				1440	ret = mutex_lock_interruptible(&filter->notify_lock);
				1441	if (ret < 0)
				1442	return ret;
				1443
				1444	knotif = find_notification(filter, resp.id);
				1445	if (!knotif) {
				1446	ret = -ENOENT;
				1447	goto out;
				1448	}
				1449
				1450	/* Allow exactly one reply. */
				1451	if (knotif->state != SECCOMP_NOTIFY_SENT) {
				1452	ret = -EINPROGRESS;
				1453	goto out;
				1454	}
				1455
				1456	ret = 0;
				1457	knotif->state = SECCOMP_NOTIFY_REPLIED;
				1458	knotif->error = resp.error;
				1459	knotif->val = resp.val;
				1460	knotif->flags = resp.flags;
				1461	complete(&knotif->ready);
				1462	out:
				1463	mutex_unlock(&filter->notify_lock);
				1464	return ret;
				1465	}
				1466
				1467	static long seccomp_notify_id_valid(struct seccomp_filter *filter,
				1468	void __user *buf)
				1469	{
				1470	struct seccomp_knotif *knotif;
				1471	u64 id;
				1472	long ret;
				1473
				1474	if (copy_from_user(&id, buf, sizeof(id)))
				1475	return -EFAULT;
				1476
				1477	ret = mutex_lock_interruptible(&filter->notify_lock);
				1478	if (ret < 0)
				1479	return ret;
				1480
				1481	knotif = find_notification(filter, id);
				1482	if (knotif && knotif->state == SECCOMP_NOTIFY_SENT)
				1483	ret = 0;
				1484	else
				1485	ret = -ENOENT;
				1486
				1487	mutex_unlock(&filter->notify_lock);
				1488	return ret;
				1489	}
				1490
				1491	static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
				1492	unsigned long arg)
				1493	{
				1494	struct seccomp_filter *filter = file->private_data;
				1495	void __user buf = (void __user )arg;
				1496
				1497	switch (cmd) {
				1498	case SECCOMP_IOCTL_NOTIF_RECV:
				1499	return seccomp_notify_recv(filter, buf);
				1500	case SECCOMP_IOCTL_NOTIF_SEND:
				1501	return seccomp_notify_send(filter, buf);
				1502	case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR:
				1503	case SECCOMP_IOCTL_NOTIF_ID_VALID:
				1504	return seccomp_notify_id_valid(filter, buf);
				1505	default:
				1506	return -EINVAL;
				1507	}
				1508	}
				1509
				1510	static __poll_t seccomp_notify_poll(struct file *file,
				1511	struct poll_table_struct *poll_tab)
				1512	{
				1513	struct seccomp_filter *filter = file->private_data;
				1514	__poll_t ret = 0;
				1515	struct seccomp_knotif *cur;
				1516
				1517	poll_wait(file, &filter->wqh, poll_tab);
				1518
				1519	if (mutex_lock_interruptible(&filter->notify_lock) < 0)
				1520	return EPOLLERR;
				1521
				1522	list_for_each_entry(cur, &filter->notif->notifications, list) {
				1523	if (cur->state == SECCOMP_NOTIFY_INIT)
				1524	ret \|= EPOLLIN \| EPOLLRDNORM;
				1525	if (cur->state == SECCOMP_NOTIFY_SENT)
				1526	ret \|= EPOLLOUT \| EPOLLWRNORM;
				1527	if ((ret & EPOLLIN) && (ret & EPOLLOUT))
				1528	break;
				1529	}
				1530
				1531	mutex_unlock(&filter->notify_lock);
				1532
				1533	if (refcount_read(&filter->users) == 0)
				1534	ret \|= EPOLLHUP;
				1535
				1536	return ret;
				1537	}
				1538
				1539	static const struct file_operations seccomp_notify_ops = {
				1540	.poll = seccomp_notify_poll,
				1541	.release = seccomp_notify_release,
				1542	.unlocked_ioctl = seccomp_notify_ioctl,
				1543	.compat_ioctl = seccomp_notify_ioctl,
				1544	};
				1545
				1546	static struct file init_listener(struct seccomp_filter filter)
				1547	{
				1548	struct file *ret;
				1549
				1550	ret = ERR_PTR(-ENOMEM);
				1551	filter->notif = kzalloc(sizeof(*(filter->notif)), GFP_KERNEL);
				1552	if (!filter->notif)
				1553	goto out;
				1554
				1555	sema_init(&filter->notif->request, 0);
				1556	filter->notif->next_id = get_random_u64();
				1557	INIT_LIST_HEAD(&filter->notif->notifications);
				1558
				1559	ret = anon_inode_getfile("seccomp notify", &seccomp_notify_ops,
				1560	filter, O_RDWR);
				1561	if (IS_ERR(ret))
				1562	goto out_notif;
				1563
				1564	/* The file has a reference to it now */
				1565	__get_seccomp_filter(filter);
				1566
				1567	out_notif:
				1568	if (IS_ERR(ret))
				1569	seccomp_notify_free(filter);
				1570	out:
				1571	return ret;
				1572	}
				1573
				1574	/*
				1575	* Does @new_child have a listener while an ancestor also has a listener?
				1576	* If so, we'll want to reject this filter.
				1577	* This only has to be tested for the current process, even in the TSYNC case,
				1578	* because TSYNC installs @child with the same parent on all threads.
				1579	* Note that @new_child is not hooked up to its parent at this point yet, so
				1580	* we use current->seccomp.filter.
				1581	*/
				1582	static bool has_duplicate_listener(struct seccomp_filter *new_child)
				1583	{
				1584	struct seccomp_filter *cur;
				1585
				1586	/* must be protected against concurrent TSYNC */
				1587	lockdep_assert_held(&current->sighand->siglock);
				1588
				1589	if (!new_child->notif)
				1590	return false;
				1591	for (cur = current->seccomp.filter; cur; cur = cur->prev) {
				1592	if (cur->notif)
				1593	return true;
				1594	}
				1595
				1596	return false;
				1597	}
				1598
				1599	/**
				1600	* seccomp_set_mode_filter: internal function for setting seccomp filter
				1601	* @flags: flags to change filter behavior
				1602	* @filter: struct sock_fprog containing filter
				1603	*
				1604	* This function may be called repeatedly to install additional filters.
				1605	* Every filter successfully installed will be evaluated (in reverse order)
				1606	* for each system call the task makes.
				1607	*
				1608	* Once current->seccomp.mode is non-zero, it may not be changed.
				1609	*
				1610	* Returns 0 on success or -EINVAL on failure.
				1611	*/
				1612	static long seccomp_set_mode_filter(unsigned int flags,
				1613	const char __user *filter)
				1614	{
				1615	const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
				1616	struct seccomp_filter *prepared = NULL;
				1617	long ret = -EINVAL;
				1618	int listener = -1;
				1619	struct file *listener_f = NULL;
				1620
				1621	/* Validate flags. */
				1622	if (flags & ~SECCOMP_FILTER_FLAG_MASK)
				1623	return -EINVAL;
				1624
				1625	/*
				1626	* In the successful case, NEW_LISTENER returns the new listener fd.
				1627	* But in the failure case, TSYNC returns the thread that died. If you
				1628	* combine these two flags, there's no way to tell whether something
				1629	* succeeded or failed. So, let's disallow this combination.
				1630	*/
				1631	if ((flags & SECCOMP_FILTER_FLAG_TSYNC) &&
				1632	(flags & SECCOMP_FILTER_FLAG_NEW_LISTENER))
				1633	return -EINVAL;
				1634
				1635	/* Prepare the new filter before holding any locks. */
				1636	prepared = seccomp_prepare_user_filter(filter);
				1637	if (IS_ERR(prepared))
				1638	return PTR_ERR(prepared);
				1639
				1640	if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) {
				1641	listener = get_unused_fd_flags(O_CLOEXEC);
				1642	if (listener < 0) {
				1643	ret = listener;
				1644	goto out_free;
				1645	}
				1646
				1647	listener_f = init_listener(prepared);
				1648	if (IS_ERR(listener_f)) {
				1649	put_unused_fd(listener);
				1650	ret = PTR_ERR(listener_f);
				1651	goto out_free;
				1652	}
				1653	}
				1654
				1655	/*
				1656	* Make sure we cannot change seccomp or nnp state via TSYNC
				1657	* while another thread is in the middle of calling exec.
				1658	*/
				1659	if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
				1660	mutex_lock_killable(&current->signal->cred_guard_mutex))
				1661	goto out_put_fd;
				1662
				1663	spin_lock_irq(&current->sighand->siglock);
				1664
				1665	if (!seccomp_may_assign_mode(seccomp_mode))
				1666	goto out;
				1667
				1668	if (has_duplicate_listener(prepared)) {
				1669	ret = -EBUSY;
				1670	goto out;
				1671	}
				1672
				1673	ret = seccomp_attach_filter(flags, prepared);
				1674	if (ret)
				1675	goto out;
				1676	/* Do not free the successfully attached filter. */
				1677	prepared = NULL;
				1678
				1679	seccomp_assign_mode(current, seccomp_mode, flags);
				1680	out:
				1681	spin_unlock_irq(&current->sighand->siglock);
				1682	if (flags & SECCOMP_FILTER_FLAG_TSYNC)
				1683	mutex_unlock(&current->signal->cred_guard_mutex);
				1684	out_put_fd:
				1685	if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) {
				1686	if (ret) {
				1687	listener_f->private_data = NULL;
				1688	fput(listener_f);
				1689	put_unused_fd(listener);
				1690	seccomp_notify_detach(prepared);
				1691	} else {
				1692	fd_install(listener, listener_f);
				1693	ret = listener;
				1694	}
				1695	}
				1696	out_free:
				1697	seccomp_filter_free(prepared);
				1698	return ret;
				1699	}
				1700	#else
				1701	static inline long seccomp_set_mode_filter(unsigned int flags,
				1702	const char __user *filter)
				1703	{
				1704	return -EINVAL;
				1705	}
				1706	#endif
				1707
				1708	static long seccomp_get_action_avail(const char __user *uaction)
				1709	{
				1710	u32 action;
				1711
				1712	if (copy_from_user(&action, uaction, sizeof(action)))
				1713	return -EFAULT;
				1714
				1715	switch (action) {
				1716	case SECCOMP_RET_KILL_PROCESS:
				1717	case SECCOMP_RET_KILL_THREAD:
				1718	case SECCOMP_RET_TRAP:
				1719	case SECCOMP_RET_ERRNO:
				1720	case SECCOMP_RET_USER_NOTIF:
				1721	case SECCOMP_RET_TRACE:
				1722	case SECCOMP_RET_LOG:
				1723	case SECCOMP_RET_ALLOW:
				1724	break;
				1725	default:
				1726	return -EOPNOTSUPP;
				1727	}
				1728
				1729	return 0;
				1730	}
				1731
				1732	static long seccomp_get_notif_sizes(void __user *usizes)
				1733	{
				1734	struct seccomp_notif_sizes sizes = {
				1735	.seccomp_notif = sizeof(struct seccomp_notif),
				1736	.seccomp_notif_resp = sizeof(struct seccomp_notif_resp),
				1737	.seccomp_data = sizeof(struct seccomp_data),
				1738	};
				1739
				1740	if (copy_to_user(usizes, &sizes, sizeof(sizes)))
				1741	return -EFAULT;
				1742
				1743	return 0;
				1744	}
				1745
				1746	/* Common entry point for both prctl and syscall. */
				1747	static long do_seccomp(unsigned int op, unsigned int flags,
				1748	void __user *uargs)
				1749	{
				1750	switch (op) {
				1751	case SECCOMP_SET_MODE_STRICT:
				1752	if (flags != 0 \|\| uargs != NULL)
				1753	return -EINVAL;
				1754	return seccomp_set_mode_strict();
				1755	case SECCOMP_SET_MODE_FILTER:
				1756	return seccomp_set_mode_filter(flags, uargs);
				1757	case SECCOMP_GET_ACTION_AVAIL:
				1758	if (flags != 0)
				1759	return -EINVAL;
				1760
				1761	return seccomp_get_action_avail(uargs);
				1762	case SECCOMP_GET_NOTIF_SIZES:
				1763	if (flags != 0)
				1764	return -EINVAL;
				1765
				1766	return seccomp_get_notif_sizes(uargs);
				1767	default:
				1768	return -EINVAL;
				1769	}
				1770	}
				1771
				1772	SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags,
				1773	void __user *, uargs)
				1774	{
				1775	return do_seccomp(op, flags, uargs);
				1776	}
				1777
				1778	/**
				1779	* prctl_set_seccomp: configures current->seccomp.mode
				1780	* @seccomp_mode: requested mode to use
				1781	* @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
				1782	*
				1783	* Returns 0 on success or -EINVAL on failure.
				1784	*/
				1785	long prctl_set_seccomp(unsigned long seccomp_mode, void __user *filter)
				1786	{
				1787	unsigned int op;
				1788	void __user *uargs;
				1789
				1790	switch (seccomp_mode) {
				1791	case SECCOMP_MODE_STRICT:
				1792	op = SECCOMP_SET_MODE_STRICT;
				1793	/*
				1794	* Setting strict mode through prctl always ignored filter,
				1795	* so make sure it is always NULL here to pass the internal
				1796	* check in do_seccomp().
				1797	*/
				1798	uargs = NULL;
				1799	break;
				1800	case SECCOMP_MODE_FILTER:
				1801	op = SECCOMP_SET_MODE_FILTER;
				1802	uargs = filter;
				1803	break;
				1804	default:
				1805	return -EINVAL;
				1806	}
				1807
				1808	/* prctl interface doesn't have flags, so they are always zero. */
				1809	return do_seccomp(op, 0, uargs);
				1810	}
				1811
				1812	#if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
				1813	static struct seccomp_filter get_nth_filter(struct task_struct task,
				1814	unsigned long filter_off)
				1815	{
				1816	struct seccomp_filter orig, filter;
				1817	unsigned long count;
				1818
				1819	/*
				1820	* Note: this is only correct because the caller should be the (ptrace)
				1821	* tracer of the task, otherwise lock_task_sighand is needed.
				1822	*/
				1823	spin_lock_irq(&task->sighand->siglock);
				1824
				1825	if (task->seccomp.mode != SECCOMP_MODE_FILTER) {
				1826	spin_unlock_irq(&task->sighand->siglock);
				1827	return ERR_PTR(-EINVAL);
				1828	}
				1829
				1830	orig = task->seccomp.filter;
				1831	__get_seccomp_filter(orig);
				1832	spin_unlock_irq(&task->sighand->siglock);
				1833
				1834	count = 0;
				1835	for (filter = orig; filter; filter = filter->prev)
				1836	count++;
				1837
				1838	if (filter_off >= count) {
				1839	filter = ERR_PTR(-ENOENT);
				1840	goto out;
				1841	}
				1842
				1843	count -= filter_off;
				1844	for (filter = orig; filter && count > 1; filter = filter->prev)
				1845	count--;
				1846
				1847	if (WARN_ON(count != 1 \|\| !filter)) {
				1848	filter = ERR_PTR(-ENOENT);
				1849	goto out;
				1850	}
				1851
				1852	__get_seccomp_filter(filter);
				1853
				1854	out:
				1855	__put_seccomp_filter(orig);
				1856	return filter;
				1857	}
				1858
				1859	long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
				1860	void __user *data)
				1861	{
				1862	struct seccomp_filter *filter;
				1863	struct sock_fprog_kern *fprog;
				1864	long ret;
				1865
				1866	if (!capable(CAP_SYS_ADMIN) \|\|
				1867	current->seccomp.mode != SECCOMP_MODE_DISABLED) {
				1868	return -EACCES;
				1869	}
				1870
				1871	filter = get_nth_filter(task, filter_off);
				1872	if (IS_ERR(filter))
				1873	return PTR_ERR(filter);
				1874
				1875	fprog = filter->prog->orig_prog;
				1876	if (!fprog) {
				1877	/* This must be a new non-cBPF filter, since we save
				1878	* every cBPF filter's orig_prog above when
				1879	* CONFIG_CHECKPOINT_RESTORE is enabled.
				1880	*/
				1881	ret = -EMEDIUMTYPE;
				1882	goto out;
				1883	}
				1884
				1885	ret = fprog->len;
				1886	if (!data)
				1887	goto out;
				1888
				1889	if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog)))
				1890	ret = -EFAULT;
				1891
				1892	out:
				1893	__put_seccomp_filter(filter);
				1894	return ret;
				1895	}
				1896
				1897	long seccomp_get_metadata(struct task_struct *task,
				1898	unsigned long size, void __user *data)
				1899	{
				1900	long ret;
				1901	struct seccomp_filter *filter;
				1902	struct seccomp_metadata kmd = {};
				1903
				1904	if (!capable(CAP_SYS_ADMIN) \|\|
				1905	current->seccomp.mode != SECCOMP_MODE_DISABLED) {
				1906	return -EACCES;
				1907	}
				1908
				1909	size = min_t(unsigned long, size, sizeof(kmd));
				1910
				1911	if (size < sizeof(kmd.filter_off))
				1912	return -EINVAL;
				1913
				1914	if (copy_from_user(&kmd.filter_off, data, sizeof(kmd.filter_off)))
				1915	return -EFAULT;
				1916
				1917	filter = get_nth_filter(task, kmd.filter_off);
				1918	if (IS_ERR(filter))
				1919	return PTR_ERR(filter);
				1920
				1921	if (filter->log)
				1922	kmd.flags \|= SECCOMP_FILTER_FLAG_LOG;
				1923
				1924	ret = size;
				1925	if (copy_to_user(data, &kmd, size))
				1926	ret = -EFAULT;
				1927
				1928	__put_seccomp_filter(filter);
				1929	return ret;
				1930	}
				1931	#endif
				1932
				1933	#ifdef CONFIG_SYSCTL
				1934
				1935	/* Human readable action names for friendly sysctl interaction */
				1936	#define SECCOMP_RET_KILL_PROCESS_NAME "kill_process"
				1937	#define SECCOMP_RET_KILL_THREAD_NAME "kill_thread"
				1938	#define SECCOMP_RET_TRAP_NAME "trap"
				1939	#define SECCOMP_RET_ERRNO_NAME "errno"
				1940	#define SECCOMP_RET_USER_NOTIF_NAME "user_notif"
				1941	#define SECCOMP_RET_TRACE_NAME "trace"
				1942	#define SECCOMP_RET_LOG_NAME "log"
				1943	#define SECCOMP_RET_ALLOW_NAME "allow"
				1944
				1945	static const char seccomp_actions_avail[] =
				1946	SECCOMP_RET_KILL_PROCESS_NAME " "
				1947	SECCOMP_RET_KILL_THREAD_NAME " "
				1948	SECCOMP_RET_TRAP_NAME " "
				1949	SECCOMP_RET_ERRNO_NAME " "
				1950	SECCOMP_RET_USER_NOTIF_NAME " "
				1951	SECCOMP_RET_TRACE_NAME " "
				1952	SECCOMP_RET_LOG_NAME " "
				1953	SECCOMP_RET_ALLOW_NAME;
				1954
				1955	struct seccomp_log_name {
				1956	u32 log;
				1957	const char *name;
				1958	};
				1959
				1960	static const struct seccomp_log_name seccomp_log_names[] = {
				1961	{ SECCOMP_LOG_KILL_PROCESS, SECCOMP_RET_KILL_PROCESS_NAME },
				1962	{ SECCOMP_LOG_KILL_THREAD, SECCOMP_RET_KILL_THREAD_NAME },
				1963	{ SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME },
				1964	{ SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME },
				1965	{ SECCOMP_LOG_USER_NOTIF, SECCOMP_RET_USER_NOTIF_NAME },
				1966	{ SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME },
				1967	{ SECCOMP_LOG_LOG, SECCOMP_RET_LOG_NAME },
				1968	{ SECCOMP_LOG_ALLOW, SECCOMP_RET_ALLOW_NAME },
				1969	{ }
				1970	};
				1971
				1972	static bool seccomp_names_from_actions_logged(char *names, size_t size,
				1973	u32 actions_logged,
				1974	const char *sep)
				1975	{
				1976	const struct seccomp_log_name *cur;
				1977	bool append_sep = false;
				1978
				1979	for (cur = seccomp_log_names; cur->name && size; cur++) {
				1980	ssize_t ret;
				1981
				1982	if (!(actions_logged & cur->log))
				1983	continue;
				1984
				1985	if (append_sep) {
				1986	ret = strscpy(names, sep, size);
				1987	if (ret < 0)
				1988	return false;
				1989
				1990	names += ret;
				1991	size -= ret;
				1992	} else
				1993	append_sep = true;
				1994
				1995	ret = strscpy(names, cur->name, size);
				1996	if (ret < 0)
				1997	return false;
				1998
				1999	names += ret;
				2000	size -= ret;
				2001	}
				2002
				2003	return true;
				2004	}
				2005
				2006	static bool seccomp_action_logged_from_name(u32 *action_logged,
				2007	const char *name)
				2008	{
				2009	const struct seccomp_log_name *cur;
				2010
				2011	for (cur = seccomp_log_names; cur->name; cur++) {
				2012	if (!strcmp(cur->name, name)) {
				2013	*action_logged = cur->log;
				2014	return true;
				2015	}
				2016	}
				2017
				2018	return false;
				2019	}
				2020
				2021	static bool seccomp_actions_logged_from_names(u32 actions_logged, char names)
				2022	{
				2023	char *name;
				2024
				2025	*actions_logged = 0;
				2026	while ((name = strsep(&names, " ")) && *name) {
				2027	u32 action_logged = 0;
				2028
				2029	if (!seccomp_action_logged_from_name(&action_logged, name))
				2030	return false;
				2031
				2032	*actions_logged \|= action_logged;
				2033	}
				2034
				2035	return true;
				2036	}
				2037
				2038	static int read_actions_logged(struct ctl_table ro_table, void buffer,
				2039	size_t lenp, loff_t ppos)
				2040	{
				2041	char names[sizeof(seccomp_actions_avail)];
				2042	struct ctl_table table;
				2043
				2044	memset(names, 0, sizeof(names));
				2045
				2046	if (!seccomp_names_from_actions_logged(names, sizeof(names),
				2047	seccomp_actions_logged, " "))
				2048	return -EINVAL;
				2049
				2050	table = *ro_table;
				2051	table.data = names;
				2052	table.maxlen = sizeof(names);
				2053	return proc_dostring(&table, 0, buffer, lenp, ppos);
				2054	}
				2055
				2056	static int write_actions_logged(struct ctl_table ro_table, void buffer,
				2057	size_t lenp, loff_t ppos, u32 *actions_logged)
				2058	{
				2059	char names[sizeof(seccomp_actions_avail)];
				2060	struct ctl_table table;
				2061	int ret;
				2062
				2063	if (!capable(CAP_SYS_ADMIN))
				2064	return -EPERM;
				2065
				2066	memset(names, 0, sizeof(names));
				2067
				2068	table = *ro_table;
				2069	table.data = names;
				2070	table.maxlen = sizeof(names);
				2071	ret = proc_dostring(&table, 1, buffer, lenp, ppos);
				2072	if (ret)
				2073	return ret;
				2074
				2075	if (!seccomp_actions_logged_from_names(actions_logged, table.data))
				2076	return -EINVAL;
				2077
				2078	if (*actions_logged & SECCOMP_LOG_ALLOW)
				2079	return -EINVAL;
				2080
				2081	seccomp_actions_logged = *actions_logged;
				2082	return 0;
				2083	}
				2084
				2085	static void audit_actions_logged(u32 actions_logged, u32 old_actions_logged,
				2086	int ret)
				2087	{
				2088	char names[sizeof(seccomp_actions_avail)];
				2089	char old_names[sizeof(seccomp_actions_avail)];
				2090	const char *new = names;
				2091	const char *old = old_names;
				2092
				2093	if (!audit_enabled)
				2094	return;
				2095
				2096	memset(names, 0, sizeof(names));
				2097	memset(old_names, 0, sizeof(old_names));
				2098
				2099	if (ret)
				2100	new = "?";
				2101	else if (!actions_logged)
				2102	new = "(none)";
				2103	else if (!seccomp_names_from_actions_logged(names, sizeof(names),
				2104	actions_logged, ","))
				2105	new = "?";
				2106
				2107	if (!old_actions_logged)
				2108	old = "(none)";
				2109	else if (!seccomp_names_from_actions_logged(old_names,
				2110	sizeof(old_names),
				2111	old_actions_logged, ","))
				2112	old = "?";
				2113
				2114	return audit_seccomp_actions_logged(new, old, !ret);
				2115	}
				2116
				2117	static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write,
				2118	void __user buffer, size_t lenp,
				2119	loff_t *ppos)
				2120	{
				2121	int ret;
				2122
				2123	if (write) {
				2124	u32 actions_logged = 0;
				2125	u32 old_actions_logged = seccomp_actions_logged;
				2126
				2127	ret = write_actions_logged(ro_table, buffer, lenp, ppos,
				2128	&actions_logged);
				2129	audit_actions_logged(actions_logged, old_actions_logged, ret);
				2130	} else
				2131	ret = read_actions_logged(ro_table, buffer, lenp, ppos);
				2132
				2133	return ret;
				2134	}
				2135
				2136	static struct ctl_path seccomp_sysctl_path[] = {
				2137	{ .procname = "kernel", },
				2138	{ .procname = "seccomp", },
				2139	{ }
				2140	};
				2141
				2142	static struct ctl_table seccomp_sysctl_table[] = {
				2143	{
				2144	.procname = "actions_avail",
				2145	.data = (void *) &seccomp_actions_avail,
				2146	.maxlen = sizeof(seccomp_actions_avail),
				2147	.mode = 0444,
				2148	.proc_handler = proc_dostring,
				2149	},
				2150	{
				2151	.procname = "actions_logged",
				2152	.mode = 0644,
				2153	.proc_handler = seccomp_actions_logged_handler,
				2154	},
				2155	{ }
				2156	};
				2157
				2158	static int __init seccomp_sysctl_init(void)
				2159	{
				2160	struct ctl_table_header *hdr;
				2161
				2162	hdr = register_sysctl_paths(seccomp_sysctl_path, seccomp_sysctl_table);
				2163	if (!hdr)
				2164	pr_warn("sysctl registration failed\n");
				2165	else
				2166	kmemleak_not_leak(hdr);
				2167
				2168	return 0;
				2169	}
				2170
				2171	device_initcall(seccomp_sysctl_init)
				2172
				2173	#endif /* CONFIG_SYSCTL */