Blame - ap/os/linux/linux-3.4.x/kernel/trace/ring_buffer.c - R306

blob: 04aaae290fecf496bde3be23b9d0f832806f3b51 [file] [log] [blame]

yuezonghe	824eb0c	2024-06-27 02:32:26 -0700	[diff] [blame]	1	/*
				2	* Generic ring buffer
				3	*
				4	* Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
				5	*/
				6	#include <linux/ring_buffer.h>
				7	#include <linux/trace_clock.h>
				8	#include <linux/spinlock.h>
				9	#include <linux/debugfs.h>
				10	#include <linux/uaccess.h>
				11	#include <linux/hardirq.h>
				12	#include <linux/kmemcheck.h>
				13	#include <linux/module.h>
				14	#include <linux/percpu.h>
				15	#include <linux/mutex.h>
				16	#include <linux/slab.h>
				17	#include <linux/init.h>
				18	#include <linux/hash.h>
				19	#include <linux/list.h>
				20	#include <linux/cpu.h>
				21	#include <linux/fs.h>
				22
				23	#include <asm/local.h>
				24	#include "trace.h"
				25
				26	/*
				27	* The ring buffer header is special. We must manually up keep it.
				28	*/
				29	int ring_buffer_print_entry_header(struct trace_seq *s)
				30	{
				31	int ret;
				32
				33	ret = trace_seq_printf(s, "# compressed entry header\n");
				34	ret = trace_seq_printf(s, "\ttype_len : 5 bits\n");
				35	ret = trace_seq_printf(s, "\ttime_delta : 27 bits\n");
				36	ret = trace_seq_printf(s, "\tarray : 32 bits\n");
				37	ret = trace_seq_printf(s, "\n");
				38	ret = trace_seq_printf(s, "\tpadding : type == %d\n",
				39	RINGBUF_TYPE_PADDING);
				40	ret = trace_seq_printf(s, "\ttime_extend : type == %d\n",
				41	RINGBUF_TYPE_TIME_EXTEND);
				42	ret = trace_seq_printf(s, "\tdata max type_len == %d\n",
				43	RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
				44
				45	return ret;
				46	}
				47
				48	/*
				49	* The ring buffer is made up of a list of pages. A separate list of pages is
				50	* allocated for each CPU. A writer may only write to a buffer that is
				51	* associated with the CPU it is currently executing on. A reader may read
				52	* from any per cpu buffer.
				53	*
				54	* The reader is special. For each per cpu buffer, the reader has its own
				55	* reader page. When a reader has read the entire reader page, this reader
				56	* page is swapped with another page in the ring buffer.
				57	*
				58	* Now, as long as the writer is off the reader page, the reader can do what
				59	* ever it wants with that page. The writer will never write to that page
				60	* again (as long as it is out of the ring buffer).
				61	*
				62	* Here's some silly ASCII art.
				63	*
 *   +------+
 *   |reader|          RING BUFFER
 *   |page  |
 *   +------+        +---+   +---+   +---+
 *                   |   |-->|   |-->|   |
 *                   +---+   +---+   +---+
 *                     ^               |
 *                     |               |
 *                     +---------------+
 *
 *
 *   +------+
 *   |reader|          RING BUFFER
 *   |page  |------------------v
 *   +------+        +---+   +---+   +---+
 *                   |   |-->|   |-->|   |
 *                   +---+   +---+   +---+
 *                     ^               |
 *                     |               |
 *                     +---------------+
 *
 *
 *   +------+
 *   |reader|          RING BUFFER
 *   |page  |------------------v
 *   +------+        +---+   +---+   +---+
 *      ^            |   |-->|   |-->|   |
 *      |            +---+   +---+   +---+
 *      |                              |
 *      |                              |
 *      +------------------------------+
 *
 *
 *   +------+
 *   |buffer|          RING BUFFER
 *   |page  |------------------v
 *   +------+        +---+   +---+   +---+
 *      ^            |   |   |   |-->|   |
 *      |   New      +---+   +---+   +---+
 *      |  Reader------^               |
 *      |   page                       |
 *      +------------------------------+
				106	*
				107	*
				108	* After we make this swap, the reader can hand this page off to the splice
				109	* code and be done with it. It can even allocate a new page if it needs to
				110	* and swap that into the ring buffer.
				111	*
				112	* We will be using cmpxchg soon to make all this lockless.
				113	*
				114	*/
				115
				116	/*
				117	* A fast way to enable or disable all ring buffers is to
				118	* call tracing_on or tracing_off. Turning off the ring buffers
				119	* prevents all ring buffers from being recorded to.
				120	* Turning this switch on, makes it OK to write to the
				121	* ring buffer, if the ring buffer is enabled itself.
				122	*
 * There are three layers that must be on in order to write
				124	* to the ring buffer.
				125	*
				126	* 1) This global flag must be set.
				127	* 2) The ring buffer must be enabled for recording.
				128	* 3) The per cpu buffer must be enabled for recording.
				129	*
				130	* In case of an anomaly, this global flag has a bit set that
 * will permanently disable all ring buffers.
				132	*/
				133
				134	/*
				135	* Global flag to disable all recording to ring buffers
				136	* This has two bits: ON, DISABLED
				137	*
				138	* ON DISABLED
				139	* ---- ----------
				140	* 0 0 : ring buffers are off
				141	* 1 0 : ring buffers are on
				142	* X 1 : ring buffers are permanently disabled
				143	*/
				144
				145	enum {
				146	RB_BUFFERS_ON_BIT = 0,
				147	RB_BUFFERS_DISABLED_BIT = 1,
				148	};
				149
				150	enum {
				151	RB_BUFFERS_ON = 1 << RB_BUFFERS_ON_BIT,
				152	RB_BUFFERS_DISABLED = 1 << RB_BUFFERS_DISABLED_BIT,
				153	};
				154
				155	static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON;
				156
				157	/* Used for individual buffers (after the counter) */
				158	#define RB_BUFFER_OFF (1 << 20)
				159
				160	#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data)
				161
				162	/**
				163	* tracing_off_permanent - permanently disable ring buffers
				164	*
				165	* This function, once called, will disable all ring buffers
				166	* permanently.
				167	*/
				168	void tracing_off_permanent(void)
				169	{
				170	set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags);
				171	}
				172
				173	#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
				174	#define RB_ALIGNMENT 4U
				175	#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
				176	#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
				177
				178	#if !defined(CONFIG_64BIT) \|\| defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
				179	# define RB_FORCE_8BYTE_ALIGNMENT 0
				180	# define RB_ARCH_ALIGNMENT RB_ALIGNMENT
				181	#else
				182	# define RB_FORCE_8BYTE_ALIGNMENT 1
				183	# define RB_ARCH_ALIGNMENT 8U
				184	#endif
				185
				186	/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
				187	#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
				188
				189	enum {
				190	RB_LEN_TIME_EXTEND = 8,
				191	RB_LEN_TIME_STAMP = 16,
				192	};
				193
				194	#define skip_time_extend(event) \
				195	((struct ring_buffer_event )((char )event + RB_LEN_TIME_EXTEND))
				196
				197	static inline int rb_null_event(struct ring_buffer_event *event)
				198	{
				199	return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
				200	}
				201
				202	static void rb_event_set_padding(struct ring_buffer_event *event)
				203	{
				204	/* padding has a NULL time_delta */
				205	event->type_len = RINGBUF_TYPE_PADDING;
				206	event->time_delta = 0;
				207	}
				208
				209	static unsigned
				210	rb_event_data_length(struct ring_buffer_event *event)
				211	{
				212	unsigned length;
				213
				214	if (event->type_len)
				215	length = event->type_len * RB_ALIGNMENT;
				216	else
				217	length = event->array[0];
				218	return length + RB_EVNT_HDR_SIZE;
				219	}
				220
				221	/*
				222	* Return the length of the given event. Will return
				223	* the length of the time extend if the event is a
				224	* time extend.
				225	*/
				226	static inline unsigned
				227	rb_event_length(struct ring_buffer_event *event)
				228	{
				229	switch (event->type_len) {
				230	case RINGBUF_TYPE_PADDING:
				231	if (rb_null_event(event))
				232	/* undefined */
				233	return -1;
				234	return event->array[0] + RB_EVNT_HDR_SIZE;
				235
				236	case RINGBUF_TYPE_TIME_EXTEND:
				237	return RB_LEN_TIME_EXTEND;
				238
				239	case RINGBUF_TYPE_TIME_STAMP:
				240	return RB_LEN_TIME_STAMP;
				241
				242	case RINGBUF_TYPE_DATA:
				243	return rb_event_data_length(event);
				244	default:
				245	BUG();
				246	}
				247	/* not hit */
				248	return 0;
				249	}
				250
				251	/*
				252	* Return total length of time extend and data,
				253	* or just the event length for all other events.
				254	*/
				255	static inline unsigned
				256	rb_event_ts_length(struct ring_buffer_event *event)
				257	{
				258	unsigned len = 0;
				259
				260	if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
				261	/* time extends include the data event after it */
				262	len = RB_LEN_TIME_EXTEND;
				263	event = skip_time_extend(event);
				264	}
				265	return len + rb_event_length(event);
				266	}
				267
				268	/**
				269	* ring_buffer_event_length - return the length of the event
				270	* @event: the event to get the length of
				271	*
				272	* Returns the size of the data load of a data event.
				273	* If the event is something other than a data event, it
				274	* returns the size of the event itself. With the exception
				275	* of a TIME EXTEND, where it still returns the size of the
				276	* data load of the data event after it.
				277	*/
				278	unsigned ring_buffer_event_length(struct ring_buffer_event *event)
				279	{
				280	unsigned length;
				281
				282	if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
				283	event = skip_time_extend(event);
				284
				285	length = rb_event_length(event);
				286	if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
				287	return length;
				288	length -= RB_EVNT_HDR_SIZE;
				289	if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0]))
				290	length -= sizeof(event->array[0]);
				291	return length;
				292	}
				293	EXPORT_SYMBOL_GPL(ring_buffer_event_length);
				294
				295	/* inline for ring buffer fast paths */
				296	static void *
				297	rb_event_data(struct ring_buffer_event *event)
				298	{
				299	if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
				300	event = skip_time_extend(event);
				301	BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
				302	/* If length is in len field, then array[0] has the data */
				303	if (event->type_len)
				304	return (void *)&event->array[0];
				305	/* Otherwise length is in array[0] and array[1] has the data */
				306	return (void *)&event->array[1];
				307	}
				308
				309	/**
				310	* ring_buffer_event_data - return the data of the event
				311	* @event: the event to get the data from
				312	*/
				313	void ring_buffer_event_data(struct ring_buffer_event event)
				314	{
				315	return rb_event_data(event);
				316	}
				317	EXPORT_SYMBOL_GPL(ring_buffer_event_data);
				318
				319	#define for_each_buffer_cpu(buffer, cpu) \
				320	for_each_cpu(cpu, buffer->cpumask)
				321
				322	#define TS_SHIFT 27
				323	#define TS_MASK ((1ULL << TS_SHIFT) - 1)
				324	#define TS_DELTA_TEST (~TS_MASK)
				325
				326	/* Flag when events were overwritten */
				327	#define RB_MISSED_EVENTS (1 << 31)
				328	/* Missed count stored at end */
				329	#define RB_MISSED_STORED (1 << 30)
				330
				331	struct buffer_data_page {
				332	u64 time_stamp; /* page time stamp */
				333	local_t commit; /* write committed index */
				334	unsigned char data[]; /* data of buffer page */
				335	};
				336
				337	/*
				338	* Note, the buffer_page list must be first. The buffer pages
				339	* are allocated in cache lines, which means that each buffer
				340	* page will be at the beginning of a cache line, and thus
				341	* the least significant bits will be zero. We use this to
				342	* add flags in the list struct pointers, to make the ring buffer
				343	* lockless.
				344	*/
				345	struct buffer_page {
				346	struct list_head list; /* list of buffer pages */
				347	local_t write; /* index for next write */
				348	unsigned read; /* index for next read */
				349	local_t entries; /* entries on this page */
				350	unsigned long real_end; /* real end of data */
				351	struct buffer_data_page page; / Actual data page */
				352	};
				353
				354	/*
				355	* The buffer page counters, write and entries, must be reset
				356	* atomically when crossing page boundaries. To synchronize this
				357	* update, two counters are inserted into the number. One is
				358	* the actual counter for the write position or count on the page.
				359	*
				360	* The other is a counter of updaters. Before an update happens
				361	* the update partition of the counter is incremented. This will
				362	* allow the updater to update the counter atomically.
				363	*
				364	* The counter is 20 bits, and the state data is 12.
				365	*/
				366	#define RB_WRITE_MASK 0xfffff
				367	#define RB_WRITE_INTCNT (1 << 20)
				368
				369	static void rb_init_page(struct buffer_data_page *bpage)
				370	{
				371	local_set(&bpage->commit, 0);
				372	}
				373
				374	/**
				375	* ring_buffer_page_len - the size of data on the page.
				376	* @page: The page to read
				377	*
				378	* Returns the amount of data on the page, including buffer page header.
				379	*/
				380	size_t ring_buffer_page_len(void *page)
				381	{
				382	return local_read(&((struct buffer_data_page *)page)->commit)
				383	+ BUF_PAGE_HDR_SIZE;
				384	}
				385
				386	/*
				387	* Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing
				388	* this issue out.
				389	*/
				390	static void free_buffer_page(struct buffer_page *bpage)
				391	{
				392	free_page((unsigned long)bpage->page);
				393	kfree(bpage);
				394	}
				395
				396	/*
				397	* We need to fit the time_stamp delta into 27 bits.
				398	*/
				399	static inline int test_time_stamp(u64 delta)
				400	{
				401	if (delta & TS_DELTA_TEST)
				402	return 1;
				403	return 0;
				404	}
				405
				406	#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
				407
				408	/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
				409	#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
				410
				411	int ring_buffer_print_page_header(struct trace_seq *s)
				412	{
				413	struct buffer_data_page field;
				414	int ret;
				415
				416	ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t"
				417	"offset:0;\tsize:%u;\tsigned:%u;\n",
				418	(unsigned int)sizeof(field.time_stamp),
				419	(unsigned int)is_signed_type(u64));
				420
				421	ret = trace_seq_printf(s, "\tfield: local_t commit;\t"
				422	"offset:%u;\tsize:%u;\tsigned:%u;\n",
				423	(unsigned int)offsetof(typeof(field), commit),
				424	(unsigned int)sizeof(field.commit),
				425	(unsigned int)is_signed_type(long));
				426
				427	ret = trace_seq_printf(s, "\tfield: int overwrite;\t"
				428	"offset:%u;\tsize:%u;\tsigned:%u;\n",
				429	(unsigned int)offsetof(typeof(field), commit),
				430	1,
				431	(unsigned int)is_signed_type(long));
				432
				433	ret = trace_seq_printf(s, "\tfield: char data;\t"
				434	"offset:%u;\tsize:%u;\tsigned:%u;\n",
				435	(unsigned int)offsetof(typeof(field), data),
				436	(unsigned int)BUF_PAGE_SIZE,
				437	(unsigned int)is_signed_type(char));
				438
				439	return ret;
				440	}
				441
				442	/*
				443	* head_page == tail_page && head == tail then buffer is empty.
				444	*/
				445	struct ring_buffer_per_cpu {
				446	int cpu;
				447	atomic_t record_disabled;
				448	struct ring_buffer *buffer;
				449	spinlock_t reader_lock; /* serialize readers */
				450	arch_spinlock_t lock;
				451	struct lock_class_key lock_key;
				452	struct list_head *pages;
				453	struct buffer_page head_page; / read from head */
				454	struct buffer_page tail_page; / write to tail */
				455	struct buffer_page commit_page; / committed pages */
				456	struct buffer_page *reader_page;
				457	unsigned long lost_events;
				458	unsigned long last_overrun;
				459	local_t entries_bytes;
				460	local_t commit_overrun;
				461	local_t overrun;
				462	local_t entries;
				463	local_t committing;
				464	local_t commits;
				465	unsigned long read;
				466	unsigned long read_bytes;
				467	u64 write_stamp;
				468	u64 read_stamp;
				469	};
				470
				471	struct ring_buffer {
				472	unsigned pages;
				473	unsigned flags;
				474	int cpus;
				475	atomic_t record_disabled;
				476	cpumask_var_t cpumask;
				477
				478	struct lock_class_key *reader_lock_key;
				479
				480	struct mutex mutex;
				481
				482	struct ring_buffer_per_cpu **buffers;
				483
				484	#ifdef CONFIG_HOTPLUG_CPU
				485	struct notifier_block cpu_notify;
				486	#endif
				487	u64 (*clock)(void);
				488	};
				489
				490	struct ring_buffer_iter {
				491	struct ring_buffer_per_cpu *cpu_buffer;
				492	unsigned long head;
				493	struct buffer_page *head_page;
				494	struct buffer_page *cache_reader_page;
				495	unsigned long cache_read;
				496	u64 read_stamp;
				497	};
				498
				499	/* buffer may be either ring_buffer or ring_buffer_per_cpu */
				500	#define RB_WARN_ON(b, cond) \
				501	({ \
				502	int _____ret = unlikely(cond); \
				503	if (_____ret) { \
				504	if (__same_type(*(b), struct ring_buffer_per_cpu)) { \
				505	struct ring_buffer_per_cpu *__b = \
				506	(void *)b; \
				507	atomic_inc(&__b->buffer->record_disabled); \
				508	} else \
				509	atomic_inc(&b->record_disabled); \
				510	WARN_ON(1); \
				511	} \
				512	_____ret; \
				513	})
				514
				515	/* Up this if you want to test the TIME_EXTENTS and normalization */
				516	#define DEBUG_SHIFT 0
				517
				518	static inline u64 rb_time_stamp(struct ring_buffer *buffer)
				519	{
				520	/* shift to debug/test normalization and TIME_EXTENTS */
				521	return buffer->clock() << DEBUG_SHIFT;
				522	}
				523
				524	u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
				525	{
				526	u64 time;
				527
				528	preempt_disable_notrace();
				529	time = rb_time_stamp(buffer);
				530	preempt_enable_no_resched_notrace();
				531
				532	return time;
				533	}
				534	EXPORT_SYMBOL_GPL(ring_buffer_time_stamp);
				535
				536	void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
				537	int cpu, u64 *ts)
				538	{
				539	/* Just stupid testing the normalize function and deltas */
				540	*ts >>= DEBUG_SHIFT;
				541	}
				542	EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
				543
				544	/*
				545	* Making the ring buffer lockless makes things tricky.
				546	* Although writes only happen on the CPU that they are on,
				547	* and they only need to worry about interrupts. Reads can
				548	* happen on any CPU.
				549	*
				550	* The reader page is always off the ring buffer, but when the
				551	* reader finishes with a page, it needs to swap its page with
				552	* a new one from the buffer. The reader needs to take from
				553	* the head (writes go to the tail). But if a writer is in overwrite
				554	* mode and wraps, it must push the head page forward.
				555	*
				556	* Here lies the problem.
				557	*
				558	* The reader must be careful to replace only the head page, and
				559	* not another one. As described at the top of the file in the
				560	* ASCII art, the reader sets its old page to point to the next
				561	* page after head. It then sets the page after head to point to
				562	* the old reader page. But if the writer moves the head page
				563	* during this operation, the reader could end up with the tail.
				564	*
				565	* We use cmpxchg to help prevent this race. We also do something
				566	* special with the page before head. We set the LSB to 1.
				567	*
				568	* When the writer must push the page forward, it will clear the
				569	* bit that points to the head page, move the head, and then set
				570	* the bit that points to the new head page.
				571	*
				572	* We also don't want an interrupt coming in and moving the head
				573	* page on another writer. Thus we use the second LSB to catch
				574	* that too. Thus:
				575	*
 * head->list->prev->next        bit 1          bit 0
 *                              -------        -------
 * Normal page                     0              0
 * Points to head page             0              1
 * New head page                   1              0
 *
 * Note we can not trust the prev pointer of the head page, because:
 *
 * +----+       +-----+        +-----+
 * |    |------>|  T  |---X--->|  N  |
 * |    |<------|     |        |     |
 * +----+       +-----+        +-----+
 *   ^                           ^ |
 *   |          +-----+          | |
 *   +----------|  R  |----------+ |
 *              |     |<-----------+
 *              +-----+
 *
 * Key:  ---X-->  HEAD flag set in pointer
 *         T      Tail page
 *         R      Reader page
 *         N      Next page
				598	*
				599	* (see __rb_reserve_next() to see where this happens)
				600	*
				601	* What the above shows is that the reader just swapped out
				602	* the reader page with a page in the buffer, but before it
				603	* could make the new header point back to the new page added
				604	* it was preempted by a writer. The writer moved forward onto
				605	* the new page added by the reader and is about to move forward
				606	* again.
				607	*
				608	* You can see, it is legitimate for the previous pointer of
				609	* the head (or any page) not to point back to itself. But only
 * temporarily.
				611	*/
				612
				613	#define RB_PAGE_NORMAL 0UL
				614	#define RB_PAGE_HEAD 1UL
				615	#define RB_PAGE_UPDATE 2UL
				616
				617
				618	#define RB_FLAG_MASK 3UL
				619
				620	/* PAGE_MOVED is not part of the mask */
				621	#define RB_PAGE_MOVED 4UL
				622
				623	/*
				624	* rb_list_head - remove any bit
				625	*/
				626	static struct list_head rb_list_head(struct list_head list)
				627	{
				628	unsigned long val = (unsigned long)list;
				629
				630	return (struct list_head *)(val & ~RB_FLAG_MASK);
				631	}
				632
				633	/*
				634	* rb_is_head_page - test if the given page is the head page
				635	*
				636	* Because the reader may move the head_page pointer, we can
				637	* not trust what the head page is (it may be pointing to
				638	* the reader page). But if the next page is a header page,
				639	* its flags will be non zero.
				640	*/
				641	static inline int
				642	rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer,
				643	struct buffer_page page, struct list_head list)
				644	{
				645	unsigned long val;
				646
				647	val = (unsigned long)list->next;
				648
				649	if ((val & ~RB_FLAG_MASK) != (unsigned long)&page->list)
				650	return RB_PAGE_MOVED;
				651
				652	return val & RB_FLAG_MASK;
				653	}
				654
				655	/*
				656	* rb_is_reader_page
				657	*
				658	* The unique thing about the reader page, is that, if the
				659	* writer is ever on it, the previous pointer never points
				660	* back to the reader page.
				661	*/
				662	static int rb_is_reader_page(struct buffer_page *page)
				663	{
				664	struct list_head *list = page->list.prev;
				665
				666	return rb_list_head(list->next) != &page->list;
				667	}
				668
				669	/*
				670	* rb_set_list_to_head - set a list_head to be pointing to head.
				671	*/
				672	static void rb_set_list_to_head(struct ring_buffer_per_cpu *cpu_buffer,
				673	struct list_head *list)
				674	{
				675	unsigned long *ptr;
				676
				677	ptr = (unsigned long *)&list->next;
				678	*ptr \|= RB_PAGE_HEAD;
				679	*ptr &= ~RB_PAGE_UPDATE;
				680	}
				681
				682	/*
				683	* rb_head_page_activate - sets up head page
				684	*/
				685	static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer)
				686	{
				687	struct buffer_page *head;
				688
				689	head = cpu_buffer->head_page;
				690	if (!head)
				691	return;
				692
				693	/*
				694	* Set the previous list pointer to have the HEAD flag.
				695	*/
				696	rb_set_list_to_head(cpu_buffer, head->list.prev);
				697	}
				698
				699	static void rb_list_head_clear(struct list_head *list)
				700	{
				701	unsigned long ptr = (unsigned long )&list->next;
				702
				703	*ptr &= ~RB_FLAG_MASK;
				704	}
				705
				706	/*
				707	* rb_head_page_dactivate - clears head page ptr (for free list)
				708	*/
				709	static void
				710	rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer)
				711	{
				712	struct list_head *hd;
				713
				714	/* Go through the whole list and clear any pointers found. */
				715	rb_list_head_clear(cpu_buffer->pages);
				716
				717	list_for_each(hd, cpu_buffer->pages)
				718	rb_list_head_clear(hd);
				719	}
				720
				721	static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer,
				722	struct buffer_page *head,
				723	struct buffer_page *prev,
				724	int old_flag, int new_flag)
				725	{
				726	struct list_head *list;
				727	unsigned long val = (unsigned long)&head->list;
				728	unsigned long ret;
				729
				730	list = &prev->list;
				731
				732	val &= ~RB_FLAG_MASK;
				733
				734	ret = cmpxchg((unsigned long *)&list->next,
				735	val \| old_flag, val \| new_flag);
				736
				737	/* check if the reader took the page */
				738	if ((ret & ~RB_FLAG_MASK) != val)
				739	return RB_PAGE_MOVED;
				740
				741	return ret & RB_FLAG_MASK;
				742	}
				743
				744	static int rb_head_page_set_update(struct ring_buffer_per_cpu *cpu_buffer,
				745	struct buffer_page *head,
				746	struct buffer_page *prev,
				747	int old_flag)
				748	{
				749	return rb_head_page_set(cpu_buffer, head, prev,
				750	old_flag, RB_PAGE_UPDATE);
				751	}
				752
				753	static int rb_head_page_set_head(struct ring_buffer_per_cpu *cpu_buffer,
				754	struct buffer_page *head,
				755	struct buffer_page *prev,
				756	int old_flag)
				757	{
				758	return rb_head_page_set(cpu_buffer, head, prev,
				759	old_flag, RB_PAGE_HEAD);
				760	}
				761
				762	static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer,
				763	struct buffer_page *head,
				764	struct buffer_page *prev,
				765	int old_flag)
				766	{
				767	return rb_head_page_set(cpu_buffer, head, prev,
				768	old_flag, RB_PAGE_NORMAL);
				769	}
				770
				771	static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
				772	struct buffer_page **bpage)
				773	{
				774	struct list_head p = rb_list_head((bpage)->list.next);
				775
				776	*bpage = list_entry(p, struct buffer_page, list);
				777	}
				778
				779	static struct buffer_page *
				780	rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
				781	{
				782	struct buffer_page *head;
				783	struct buffer_page *page;
				784	struct list_head *list;
				785	int i;
				786
				787	if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page))
				788	return NULL;
				789
				790	/* sanity check */
				791	list = cpu_buffer->pages;
				792	if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev->next) != list))
				793	return NULL;
				794
				795	page = head = cpu_buffer->head_page;
				796	/*
				797	* It is possible that the writer moves the header behind
				798	* where we started, and we miss in one loop.
				799	* A second loop should grab the header, but we'll do
				800	* three loops just because I'm paranoid.
				801	*/
				802	for (i = 0; i < 3; i++) {
				803	do {
				804	if (rb_is_head_page(cpu_buffer, page, page->list.prev)) {
				805	cpu_buffer->head_page = page;
				806	return page;
				807	}
				808	rb_inc_page(cpu_buffer, &page);
				809	} while (page != head);
				810	}
				811
				812	RB_WARN_ON(cpu_buffer, 1);
				813
				814	return NULL;
				815	}
				816
				817	static int rb_head_page_replace(struct buffer_page *old,
				818	struct buffer_page *new)
				819	{
				820	unsigned long ptr = (unsigned long )&old->list.prev->next;
				821	unsigned long val;
				822	unsigned long ret;
				823
				824	val = *ptr & ~RB_FLAG_MASK;
				825	val \|= RB_PAGE_HEAD;
				826
				827	ret = cmpxchg(ptr, val, (unsigned long)&new->list);
				828
				829	return ret == val;
				830	}
				831
				832	/*
				833	* rb_tail_page_update - move the tail page forward
				834	*
				835	* Returns 1 if moved tail page, 0 if someone else did.
				836	*/
				837	static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
				838	struct buffer_page *tail_page,
				839	struct buffer_page *next_page)
				840	{
				841	struct buffer_page *old_tail;
				842	unsigned long old_entries;
				843	unsigned long old_write;
				844	int ret = 0;
				845
				846	/*
				847	* The tail page now needs to be moved forward.
				848	*
				849	* We need to reset the tail page, but without messing
				850	* with possible erasing of data brought in by interrupts
				851	* that have moved the tail page and are currently on it.
				852	*
				853	* We add a counter to the write field to denote this.
				854	*/
				855	old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write);
				856	old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries);
				857
				858	/*
				859	* Just make sure we have seen our old_write and synchronize
				860	* with any interrupts that come in.
				861	*/
				862	barrier();
				863
				864	/*
				865	* If the tail page is still the same as what we think
				866	* it is, then it is up to us to update the tail
				867	* pointer.
				868	*/
				869	if (tail_page == cpu_buffer->tail_page) {
				870	/* Zero the write counter */
				871	unsigned long val = old_write & ~RB_WRITE_MASK;
				872	unsigned long eval = old_entries & ~RB_WRITE_MASK;
				873
				874	/*
				875	* This will only succeed if an interrupt did
				876	* not come in and change it. In which case, we
				877	* do not want to modify it.
				878	*
				879	* We add (void) to let the compiler know that we do not care
				880	* about the return value of these functions. We use the
				881	* cmpxchg to only update if an interrupt did not already
				882	* do it for us. If the cmpxchg fails, we don't care.
				883	*/
				884	(void)local_cmpxchg(&next_page->write, old_write, val);
				885	(void)local_cmpxchg(&next_page->entries, old_entries, eval);
				886
				887	/*
				888	* No need to worry about races with clearing out the commit.
				889	* it only can increment when a commit takes place. But that
				890	* only happens in the outer most nested commit.
				891	*/
				892	local_set(&next_page->page->commit, 0);
				893
				894	old_tail = cmpxchg(&cpu_buffer->tail_page,
				895	tail_page, next_page);
				896
				897	if (old_tail == tail_page)
				898	ret = 1;
				899	}
				900
				901	return ret;
				902	}
				903
				904	static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
				905	struct buffer_page *bpage)
				906	{
				907	unsigned long val = (unsigned long)bpage;
				908
				909	if (RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK))
				910	return 1;
				911
				912	return 0;
				913	}
				914
				915	/**
				916	* rb_check_list - make sure a pointer to a list has the last bits zero
				917	*/
				918	static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer,
				919	struct list_head *list)
				920	{
				921	if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev) != list->prev))
				922	return 1;
				923	if (RB_WARN_ON(cpu_buffer, rb_list_head(list->next) != list->next))
				924	return 1;
				925	return 0;
				926	}
				927
				928	/**
				929	* check_pages - integrity check of buffer pages
				930	* @cpu_buffer: CPU buffer with pages to test
				931	*
				932	* As a safety measure we check to make sure the data pages have not
				933	* been corrupted.
				934	*/
				935	static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
				936	{
				937	struct list_head *head = cpu_buffer->pages;
				938	struct buffer_page bpage, tmp;
				939
				940	rb_head_page_deactivate(cpu_buffer);
				941
				942	if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
				943	return -1;
				944	if (RB_WARN_ON(cpu_buffer, head->prev->next != head))
				945	return -1;
				946
				947	if (rb_check_list(cpu_buffer, head))
				948	return -1;
				949
				950	list_for_each_entry_safe(bpage, tmp, head, list) {
				951	if (RB_WARN_ON(cpu_buffer,
				952	bpage->list.next->prev != &bpage->list))
				953	return -1;
				954	if (RB_WARN_ON(cpu_buffer,
				955	bpage->list.prev->next != &bpage->list))
				956	return -1;
				957	if (rb_check_list(cpu_buffer, &bpage->list))
				958	return -1;
				959	}
				960
				961	rb_head_page_activate(cpu_buffer);
				962
				963	return 0;
				964	}
				965
				966	static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
				967	unsigned nr_pages)
				968	{
				969	struct buffer_page bpage, tmp;
				970	LIST_HEAD(pages);
				971	unsigned i;
				972
				973	WARN_ON(!nr_pages);
				974
				975	for (i = 0; i < nr_pages; i++) {
				976	struct page *page;
				977	/*
				978	* __GFP_NORETRY flag makes sure that the allocation fails
				979	* gracefully without invoking oom-killer and the system is
				980	* not destabilized.
				981	*/
				982	bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
				983	GFP_KERNEL \| __GFP_NORETRY,
				984	cpu_to_node(cpu_buffer->cpu));
				985	if (!bpage)
				986	goto free_pages;
				987
				988	rb_check_bpage(cpu_buffer, bpage);
				989
				990	list_add(&bpage->list, &pages);
				991
				992	page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
				993	GFP_KERNEL \| __GFP_NORETRY, 0);
				994	if (!page)
				995	goto free_pages;
				996	bpage->page = page_address(page);
				997	rb_init_page(bpage->page);
				998	}
				999
				1000	/*
				1001	* The ring buffer page list is a circular list that does not
				1002	* start and end with a list head. All page list items point to
				1003	* other pages.
				1004	*/
				1005	cpu_buffer->pages = pages.next;
				1006	list_del(&pages);
				1007
				1008	rb_check_pages(cpu_buffer);
				1009
				1010	return 0;
				1011
				1012	free_pages:
				1013	list_for_each_entry_safe(bpage, tmp, &pages, list) {
				1014	list_del_init(&bpage->list);
				1015	free_buffer_page(bpage);
				1016	}
				1017	return -ENOMEM;
				1018	}
				1019
				1020	static inline int ok_to_lock(void)
				1021	{
				1022	if (in_nmi())
				1023	return 0;
				1024	#ifdef CONFIG_PREEMPT_RT_FULL
				1025	if (in_atomic() \|\| irqs_disabled())
				1026	return 0;
				1027	#endif
				1028	return 1;
				1029	}
				1030
				1031	static int
				1032	read_buffer_lock(struct ring_buffer_per_cpu *cpu_buffer,
				1033	unsigned long *flags)
				1034	{
				1035	/*
				1036	* If an NMI die dumps out the content of the ring buffer
				1037	* do not grab locks. We also permanently disable the ring
				1038	* buffer too. A one time deal is all you get from reading
				1039	* the ring buffer from an NMI.
				1040	*/
				1041	if (!ok_to_lock()) {
				1042	if (spin_trylock_irqsave(&cpu_buffer->reader_lock, *flags))
				1043	return 1;
				1044	tracing_off_permanent();
				1045	return 0;
				1046	}
				1047	spin_lock_irqsave(&cpu_buffer->reader_lock, *flags);
				1048	return 1;
				1049	}
				1050
				1051	static void
				1052	read_buffer_unlock(struct ring_buffer_per_cpu *cpu_buffer,
				1053	unsigned long flags, int locked)
				1054	{
				1055	if (locked)
				1056	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
				1057	}
				1058	static struct ring_buffer_per_cpu *
				1059	rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
				1060	{
				1061	struct ring_buffer_per_cpu *cpu_buffer;
				1062	struct buffer_page *bpage;
				1063	struct page *page;
				1064	int ret;
				1065
				1066	cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()),
				1067	GFP_KERNEL, cpu_to_node(cpu));
				1068	if (!cpu_buffer)
				1069	return NULL;
				1070
				1071	cpu_buffer->cpu = cpu;
				1072	cpu_buffer->buffer = buffer;
				1073	spin_lock_init(&cpu_buffer->reader_lock);
				1074	lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
				1075	cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
				1076
				1077	bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
				1078	GFP_KERNEL, cpu_to_node(cpu));
				1079	if (!bpage)
				1080	goto fail_free_buffer;
				1081
				1082	rb_check_bpage(cpu_buffer, bpage);
				1083
				1084	cpu_buffer->reader_page = bpage;
				1085	page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0);
				1086	if (!page)
				1087	goto fail_free_reader;
				1088	bpage->page = page_address(page);
				1089	rb_init_page(bpage->page);
				1090
				1091	INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
				1092
				1093	ret = rb_allocate_pages(cpu_buffer, buffer->pages);
				1094	if (ret < 0)
				1095	goto fail_free_reader;
				1096
				1097	cpu_buffer->head_page
				1098	= list_entry(cpu_buffer->pages, struct buffer_page, list);
				1099	cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
				1100
				1101	rb_head_page_activate(cpu_buffer);
				1102
				1103	return cpu_buffer;
				1104
				1105	fail_free_reader:
				1106	free_buffer_page(cpu_buffer->reader_page);
				1107
				1108	fail_free_buffer:
				1109	kfree(cpu_buffer);
				1110	return NULL;
				1111	}
				1112
				1113	static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
				1114	{
				1115	struct list_head *head = cpu_buffer->pages;
				1116	struct buffer_page bpage, tmp;
				1117
				1118	free_buffer_page(cpu_buffer->reader_page);
				1119
				1120	rb_head_page_deactivate(cpu_buffer);
				1121
				1122	if (head) {
				1123	list_for_each_entry_safe(bpage, tmp, head, list) {
				1124	list_del_init(&bpage->list);
				1125	free_buffer_page(bpage);
				1126	}
				1127	bpage = list_entry(head, struct buffer_page, list);
				1128	free_buffer_page(bpage);
				1129	}
				1130
				1131	kfree(cpu_buffer);
				1132	}
				1133
				1134	#ifdef CONFIG_HOTPLUG_CPU
				1135	static int rb_cpu_notify(struct notifier_block *self,
				1136	unsigned long action, void *hcpu);
				1137	#endif
				1138
				1139	/**
				1140	* ring_buffer_alloc - allocate a new ring_buffer
				1141	* @size: the size in bytes per cpu that is needed.
				1142	* @flags: attributes to set for the ring buffer.
				1143	*
				1144	* Currently the only flag that is available is the RB_FL_OVERWRITE
				1145	* flag. This flag means that the buffer will overwrite old data
				1146	* when the buffer wraps. If this flag is not set, the buffer will
				1147	* drop data when the tail hits the head.
				1148	*/
				1149	struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
				1150	struct lock_class_key *key)
				1151	{
				1152	struct ring_buffer *buffer;
				1153	int bsize;
				1154	int cpu;
				1155
				1156	/* keep it in its own cache line */
				1157	buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
				1158	GFP_KERNEL);
				1159	if (!buffer)
				1160	return NULL;
				1161
				1162	if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
				1163	goto fail_free_buffer;
				1164
				1165	buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
				1166	buffer->flags = flags;
				1167	buffer->clock = trace_clock_local;
				1168	buffer->reader_lock_key = key;
				1169
				1170	/* need at least two pages */
				1171	if (buffer->pages < 2)
				1172	buffer->pages = 2;
				1173
				1174	/*
				1175	* In case of non-hotplug cpu, if the ring-buffer is allocated
				1176	* in early initcall, it will not be notified of secondary cpus.
				1177	* In that off case, we need to allocate for all possible cpus.
				1178	*/
				1179	#ifdef CONFIG_HOTPLUG_CPU
				1180	get_online_cpus();
				1181	cpumask_copy(buffer->cpumask, cpu_online_mask);
				1182	#else
				1183	cpumask_copy(buffer->cpumask, cpu_possible_mask);
				1184	#endif
				1185	buffer->cpus = nr_cpu_ids;
				1186
				1187	bsize = sizeof(void ) nr_cpu_ids;
				1188	buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()),
				1189	GFP_KERNEL);
				1190	if (!buffer->buffers)
				1191	goto fail_free_cpumask;
				1192
				1193	for_each_buffer_cpu(buffer, cpu) {
				1194	buffer->buffers[cpu] =
				1195	rb_allocate_cpu_buffer(buffer, cpu);
				1196	if (!buffer->buffers[cpu])
				1197	goto fail_free_buffers;
				1198	}
				1199
				1200	#ifdef CONFIG_HOTPLUG_CPU
				1201	buffer->cpu_notify.notifier_call = rb_cpu_notify;
				1202	buffer->cpu_notify.priority = 0;
				1203	register_cpu_notifier(&buffer->cpu_notify);
				1204	#endif
				1205
				1206	put_online_cpus();
				1207	mutex_init(&buffer->mutex);
				1208
				1209	return buffer;
				1210
				1211	fail_free_buffers:
				1212	for_each_buffer_cpu(buffer, cpu) {
				1213	if (buffer->buffers[cpu])
				1214	rb_free_cpu_buffer(buffer->buffers[cpu]);
				1215	}
				1216	kfree(buffer->buffers);
				1217
				1218	fail_free_cpumask:
				1219	free_cpumask_var(buffer->cpumask);
				1220	put_online_cpus();
				1221
				1222	fail_free_buffer:
				1223	kfree(buffer);
				1224	return NULL;
				1225	}
				1226	EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
				1227
				1228	/**
				1229	* ring_buffer_free - free a ring buffer.
				1230	* @buffer: the buffer to free.
				1231	*/
				1232	void
				1233	ring_buffer_free(struct ring_buffer *buffer)
				1234	{
				1235	int cpu;
				1236
				1237	get_online_cpus();
				1238
				1239	#ifdef CONFIG_HOTPLUG_CPU
				1240	unregister_cpu_notifier(&buffer->cpu_notify);
				1241	#endif
				1242
				1243	for_each_buffer_cpu(buffer, cpu)
				1244	rb_free_cpu_buffer(buffer->buffers[cpu]);
				1245
				1246	put_online_cpus();
				1247
				1248	kfree(buffer->buffers);
				1249	free_cpumask_var(buffer->cpumask);
				1250
				1251	kfree(buffer);
				1252	}
				1253	EXPORT_SYMBOL_GPL(ring_buffer_free);
				1254
				1255	void ring_buffer_set_clock(struct ring_buffer *buffer,
				1256	u64 (*clock)(void))
				1257	{
				1258	buffer->clock = clock;
				1259	}
				1260
				1261	static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
				1262
				1263	static void
				1264	rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
				1265	{
				1266	struct buffer_page *bpage;
				1267	struct list_head *p;
				1268	unsigned long flags;
				1269	unsigned i;
				1270	int locked;
				1271
				1272	locked = read_buffer_lock(cpu_buffer, &flags);
				1273	rb_head_page_deactivate(cpu_buffer);
				1274
				1275	for (i = 0; i < nr_pages; i++) {
				1276	if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
				1277	goto out;
				1278	p = cpu_buffer->pages->next;
				1279	bpage = list_entry(p, struct buffer_page, list);
				1280	list_del_init(&bpage->list);
				1281	free_buffer_page(bpage);
				1282	}
				1283	if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
				1284	goto out;
				1285
				1286	rb_reset_cpu(cpu_buffer);
				1287	rb_check_pages(cpu_buffer);
				1288
				1289	out:
				1290	read_buffer_unlock(cpu_buffer, flags, locked);
				1291	}
				1292
				1293	static void
				1294	rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
				1295	struct list_head *pages, unsigned nr_pages)
				1296	{
				1297	struct buffer_page *bpage;
				1298	struct list_head *p;
				1299	unsigned long flags;
				1300	unsigned i;
				1301	int locked;
				1302
				1303	locked = read_buffer_lock(cpu_buffer, &flags);
				1304	rb_head_page_deactivate(cpu_buffer);
				1305
				1306	for (i = 0; i < nr_pages; i++) {
				1307	if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
				1308	goto out;
				1309	p = pages->next;
				1310	bpage = list_entry(p, struct buffer_page, list);
				1311	list_del_init(&bpage->list);
				1312	list_add_tail(&bpage->list, cpu_buffer->pages);
				1313	}
				1314	rb_reset_cpu(cpu_buffer);
				1315	rb_check_pages(cpu_buffer);
				1316
				1317	out:
				1318	read_buffer_unlock(cpu_buffer, flags, locked);
				1319	}
				1320
				1321	/**
				1322	* ring_buffer_resize - resize the ring buffer
				1323	* @buffer: the buffer to resize.
				1324	* @size: the new size.
				1325	*
				1326	* Minimum size is 2 * BUF_PAGE_SIZE.
				1327	*
				1328	* Returns -1 on failure.
				1329	*/
				1330	int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
				1331	{
				1332	struct ring_buffer_per_cpu *cpu_buffer;
				1333	unsigned nr_pages, rm_pages, new_pages;
				1334	struct buffer_page bpage, tmp;
				1335	unsigned long buffer_size;
				1336	LIST_HEAD(pages);
				1337	int i, cpu;
				1338
				1339	/*
				1340	* Always succeed at resizing a non-existent buffer:
				1341	*/
				1342	if (!buffer)
				1343	return size;
				1344
				1345	size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
				1346	size *= BUF_PAGE_SIZE;
				1347	buffer_size = buffer->pages * BUF_PAGE_SIZE;
				1348
				1349	/* we need a minimum of two pages */
				1350	if (size < BUF_PAGE_SIZE * 2)
				1351	size = BUF_PAGE_SIZE * 2;
				1352
				1353	if (size == buffer_size)
				1354	return size;
				1355
				1356	atomic_inc(&buffer->record_disabled);
				1357
				1358	/* Make sure all writers are done with this buffer. */
				1359	synchronize_sched();
				1360
				1361	mutex_lock(&buffer->mutex);
				1362	get_online_cpus();
				1363
				1364	nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
				1365
				1366	if (size < buffer_size) {
				1367
				1368	/* easy case, just free pages */
				1369	if (RB_WARN_ON(buffer, nr_pages >= buffer->pages))
				1370	goto out_fail;
				1371
				1372	rm_pages = buffer->pages - nr_pages;
				1373
				1374	for_each_buffer_cpu(buffer, cpu) {
				1375	cpu_buffer = buffer->buffers[cpu];
				1376	rb_remove_pages(cpu_buffer, rm_pages);
				1377	}
				1378	goto out;
				1379	}
				1380
				1381	/*
				1382	* This is a bit more difficult. We only want to add pages
				1383	* when we can allocate enough for all CPUs. We do this
				1384	* by allocating all the pages and storing them on a local
				1385	* link list. If we succeed in our allocation, then we
				1386	* add these pages to the cpu_buffers. Otherwise we just free
				1387	* them all and return -ENOMEM;
				1388	*/
				1389	if (RB_WARN_ON(buffer, nr_pages <= buffer->pages))
				1390	goto out_fail;
				1391
				1392	new_pages = nr_pages - buffer->pages;
				1393
				1394	for_each_buffer_cpu(buffer, cpu) {
				1395	for (i = 0; i < new_pages; i++) {
				1396	struct page *page;
				1397	/*
				1398	* __GFP_NORETRY flag makes sure that the allocation
				1399	* fails gracefully without invoking oom-killer and
				1400	* the system is not destabilized.
				1401	*/
				1402	bpage = kzalloc_node(ALIGN(sizeof(*bpage),
				1403	cache_line_size()),
				1404	GFP_KERNEL \| __GFP_NORETRY,
				1405	cpu_to_node(cpu));
				1406	if (!bpage)
				1407	goto free_pages;
				1408	list_add(&bpage->list, &pages);
				1409	page = alloc_pages_node(cpu_to_node(cpu),
				1410	GFP_KERNEL \| __GFP_NORETRY, 0);
				1411	if (!page)
				1412	goto free_pages;
				1413	bpage->page = page_address(page);
				1414	rb_init_page(bpage->page);
				1415	}
				1416	}
				1417
				1418	for_each_buffer_cpu(buffer, cpu) {
				1419	cpu_buffer = buffer->buffers[cpu];
				1420	rb_insert_pages(cpu_buffer, &pages, new_pages);
				1421	}
				1422
				1423	if (RB_WARN_ON(buffer, !list_empty(&pages)))
				1424	goto out_fail;
				1425
				1426	out:
				1427	buffer->pages = nr_pages;
				1428	put_online_cpus();
				1429	mutex_unlock(&buffer->mutex);
				1430
				1431	atomic_dec(&buffer->record_disabled);
				1432
				1433	return size;
				1434
				1435	free_pages:
				1436	list_for_each_entry_safe(bpage, tmp, &pages, list) {
				1437	list_del_init(&bpage->list);
				1438	free_buffer_page(bpage);
				1439	}
				1440	put_online_cpus();
				1441	mutex_unlock(&buffer->mutex);
				1442	atomic_dec(&buffer->record_disabled);
				1443	return -ENOMEM;
				1444
				1445	/*
				1446	* Something went totally wrong, and we are too paranoid
				1447	* to even clean up the mess.
				1448	*/
				1449	out_fail:
				1450	put_online_cpus();
				1451	mutex_unlock(&buffer->mutex);
				1452	atomic_dec(&buffer->record_disabled);
				1453	return -1;
				1454	}
				1455	EXPORT_SYMBOL_GPL(ring_buffer_resize);
				1456
				1457	void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val)
				1458	{
				1459	mutex_lock(&buffer->mutex);
				1460	if (val)
				1461	buffer->flags \|= RB_FL_OVERWRITE;
				1462	else
				1463	buffer->flags &= ~RB_FL_OVERWRITE;
				1464	mutex_unlock(&buffer->mutex);
				1465	}
				1466	EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite);
				1467
				1468	static inline void *
				1469	__rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
				1470	{
				1471	return bpage->data + index;
				1472	}
				1473
				1474	static inline void __rb_page_index(struct buffer_page bpage, unsigned index)
				1475	{
				1476	return bpage->page->data + index;
				1477	}
				1478
				1479	static inline struct ring_buffer_event *
				1480	rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
				1481	{
				1482	return __rb_page_index(cpu_buffer->reader_page,
				1483	cpu_buffer->reader_page->read);
				1484	}
				1485
				1486	static inline struct ring_buffer_event *
				1487	rb_iter_head_event(struct ring_buffer_iter *iter)
				1488	{
				1489	return __rb_page_index(iter->head_page, iter->head);
				1490	}
				1491
				1492	static inline unsigned long rb_page_write(struct buffer_page *bpage)
				1493	{
				1494	return local_read(&bpage->write) & RB_WRITE_MASK;
				1495	}
				1496
				1497	static inline unsigned rb_page_commit(struct buffer_page *bpage)
				1498	{
				1499	return local_read(&bpage->page->commit);
				1500	}
				1501
				1502	static inline unsigned long rb_page_entries(struct buffer_page *bpage)
				1503	{
				1504	return local_read(&bpage->entries) & RB_WRITE_MASK;
				1505	}
				1506
				1507	/* Size is determined by what has been committed */
				1508	static inline unsigned rb_page_size(struct buffer_page *bpage)
				1509	{
				1510	return rb_page_commit(bpage);
				1511	}
				1512
				1513	static inline unsigned
				1514	rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
				1515	{
				1516	return rb_page_commit(cpu_buffer->commit_page);
				1517	}
				1518
				1519	static inline unsigned
				1520	rb_event_index(struct ring_buffer_event *event)
				1521	{
				1522	unsigned long addr = (unsigned long)event;
				1523
				1524	return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE;
				1525	}
				1526
				1527	static inline int
				1528	rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
				1529	struct ring_buffer_event *event)
				1530	{
				1531	unsigned long addr = (unsigned long)event;
				1532	unsigned long index;
				1533
				1534	index = rb_event_index(event);
				1535	addr &= PAGE_MASK;
				1536
				1537	return cpu_buffer->commit_page->page == (void *)addr &&
				1538	rb_commit_index(cpu_buffer) == index;
				1539	}
				1540
				1541	static void
				1542	rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
				1543	{
				1544	unsigned long max_count;
				1545
				1546	/*
				1547	* We only race with interrupts and NMIs on this CPU.
				1548	* If we own the commit event, then we can commit
				1549	* all others that interrupted us, since the interruptions
				1550	* are in stack format (they finish before they come
				1551	* back to us). This allows us to do a simple loop to
				1552	* assign the commit to the tail.
				1553	*/
				1554	again:
				1555	max_count = cpu_buffer->buffer->pages * 100;
				1556
				1557	while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
				1558	if (RB_WARN_ON(cpu_buffer, !(--max_count)))
				1559	return;
				1560	if (RB_WARN_ON(cpu_buffer,
				1561	rb_is_reader_page(cpu_buffer->tail_page)))
				1562	return;
				1563	local_set(&cpu_buffer->commit_page->page->commit,
				1564	rb_page_write(cpu_buffer->commit_page));
				1565	rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
				1566	cpu_buffer->write_stamp =
				1567	cpu_buffer->commit_page->page->time_stamp;
				1568	/* add barrier to keep gcc from optimizing too much */
				1569	barrier();
				1570	}
				1571	while (rb_commit_index(cpu_buffer) !=
				1572	rb_page_write(cpu_buffer->commit_page)) {
				1573
				1574	local_set(&cpu_buffer->commit_page->page->commit,
				1575	rb_page_write(cpu_buffer->commit_page));
				1576	RB_WARN_ON(cpu_buffer,
				1577	local_read(&cpu_buffer->commit_page->page->commit) &
				1578	~RB_WRITE_MASK);
				1579	barrier();
				1580	}
				1581
				1582	/* again, keep gcc from optimizing */
				1583	barrier();
				1584
				1585	/*
				1586	* If an interrupt came in just after the first while loop
				1587	* and pushed the tail page forward, we will be left with
				1588	* a dangling commit that will never go forward.
				1589	*/
				1590	if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page))
				1591	goto again;
				1592	}
				1593
				1594	static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
				1595	{
				1596	cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp;
				1597	cpu_buffer->reader_page->read = 0;
				1598	}
				1599
				1600	static void rb_inc_iter(struct ring_buffer_iter *iter)
				1601	{
				1602	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
				1603
				1604	/*
				1605	* The iterator could be on the reader page (it starts there).
				1606	* But the head could have moved, since the reader was
				1607	* found. Check for this case and assign the iterator
				1608	* to the head page instead of next.
				1609	*/
				1610	if (iter->head_page == cpu_buffer->reader_page)
				1611	iter->head_page = rb_set_head_page(cpu_buffer);
				1612	else
				1613	rb_inc_page(cpu_buffer, &iter->head_page);
				1614
				1615	iter->read_stamp = iter->head_page->page->time_stamp;
				1616	iter->head = 0;
				1617	}
				1618
				1619	/* Slow path, do not inline */
				1620	static noinline struct ring_buffer_event *
				1621	rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
				1622	{
				1623	event->type_len = RINGBUF_TYPE_TIME_EXTEND;
				1624
				1625	/* Not the first event on the page? */
				1626	if (rb_event_index(event)) {
				1627	event->time_delta = delta & TS_MASK;
				1628	event->array[0] = delta >> TS_SHIFT;
				1629	} else {
				1630	/* nope, just zero it */
				1631	event->time_delta = 0;
				1632	event->array[0] = 0;
				1633	}
				1634
				1635	return skip_time_extend(event);
				1636	}
				1637
				1638	/**
				1639	* ring_buffer_update_event - update event type and data
				1640	* @event: the even to update
				1641	* @type: the type of event
				1642	* @length: the size of the event field in the ring buffer
				1643	*
				1644	* Update the type and data fields of the event. The length
				1645	* is the actual size that is written to the ring buffer,
				1646	* and with this, we can determine what to place into the
				1647	* data field.
				1648	*/
				1649	static void
				1650	rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
				1651	struct ring_buffer_event *event, unsigned length,
				1652	int add_timestamp, u64 delta)
				1653	{
				1654	/* Only a commit updates the timestamp */
				1655	if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
				1656	delta = 0;
				1657
				1658	/*
				1659	* If we need to add a timestamp, then we
				1660	* add it to the start of the resevered space.
				1661	*/
				1662	if (unlikely(add_timestamp)) {
				1663	event = rb_add_time_stamp(event, delta);
				1664	length -= RB_LEN_TIME_EXTEND;
				1665	delta = 0;
				1666	}
				1667
				1668	event->time_delta = delta;
				1669	length -= RB_EVNT_HDR_SIZE;
				1670	if (length > RB_MAX_SMALL_DATA \|\| RB_FORCE_8BYTE_ALIGNMENT) {
				1671	event->type_len = 0;
				1672	event->array[0] = length;
				1673	} else
				1674	event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
				1675	}
				1676
				1677	/*
				1678	* rb_handle_head_page - writer hit the head page
				1679	*
				1680	* Returns: +1 to retry page
				1681	* 0 to continue
				1682	* -1 on error
				1683	*/
				1684	static int
				1685	rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
				1686	struct buffer_page *tail_page,
				1687	struct buffer_page *next_page)
				1688	{
				1689	struct buffer_page *new_head;
				1690	int entries;
				1691	int type;
				1692	int ret;
				1693
				1694	entries = rb_page_entries(next_page);
				1695
				1696	/*
				1697	* The hard part is here. We need to move the head
				1698	* forward, and protect against both readers on
				1699	* other CPUs and writers coming in via interrupts.
				1700	*/
				1701	type = rb_head_page_set_update(cpu_buffer, next_page, tail_page,
				1702	RB_PAGE_HEAD);
				1703
				1704	/*
				1705	* type can be one of four:
				1706	* NORMAL - an interrupt already moved it for us
				1707	* HEAD - we are the first to get here.
				1708	* UPDATE - we are the interrupt interrupting
				1709	* a current move.
				1710	* MOVED - a reader on another CPU moved the next
				1711	* pointer to its reader page. Give up
				1712	* and try again.
				1713	*/
				1714
				1715	switch (type) {
				1716	case RB_PAGE_HEAD:
				1717	/*
				1718	* We changed the head to UPDATE, thus
				1719	* it is our responsibility to update
				1720	* the counters.
				1721	*/
				1722	local_add(entries, &cpu_buffer->overrun);
				1723	local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
				1724
				1725	/*
				1726	* The entries will be zeroed out when we move the
				1727	* tail page.
				1728	*/
				1729
				1730	/* still more to do */
				1731	break;
				1732
				1733	case RB_PAGE_UPDATE:
				1734	/*
				1735	* This is an interrupt that interrupt the
				1736	* previous update. Still more to do.
				1737	*/
				1738	break;
				1739	case RB_PAGE_NORMAL:
				1740	/*
				1741	* An interrupt came in before the update
				1742	* and processed this for us.
				1743	* Nothing left to do.
				1744	*/
				1745	return 1;
				1746	case RB_PAGE_MOVED:
				1747	/*
				1748	* The reader is on another CPU and just did
				1749	* a swap with our next_page.
				1750	* Try again.
				1751	*/
				1752	return 1;
				1753	default:
				1754	RB_WARN_ON(cpu_buffer, 1); /* WTF??? */
				1755	return -1;
				1756	}
				1757
				1758	/*
				1759	* Now that we are here, the old head pointer is
				1760	* set to UPDATE. This will keep the reader from
				1761	* swapping the head page with the reader page.
				1762	* The reader (on another CPU) will spin till
				1763	* we are finished.
				1764	*
				1765	* We just need to protect against interrupts
				1766	* doing the job. We will set the next pointer
				1767	* to HEAD. After that, we set the old pointer
				1768	* to NORMAL, but only if it was HEAD before.
				1769	* otherwise we are an interrupt, and only
				1770	* want the outer most commit to reset it.
				1771	*/
				1772	new_head = next_page;
				1773	rb_inc_page(cpu_buffer, &new_head);
				1774
				1775	ret = rb_head_page_set_head(cpu_buffer, new_head, next_page,
				1776	RB_PAGE_NORMAL);
				1777
				1778	/*
				1779	* Valid returns are:
				1780	* HEAD - an interrupt came in and already set it.
				1781	* NORMAL - One of two things:
				1782	* 1) We really set it.
				1783	* 2) A bunch of interrupts came in and moved
				1784	* the page forward again.
				1785	*/
				1786	switch (ret) {
				1787	case RB_PAGE_HEAD:
				1788	case RB_PAGE_NORMAL:
				1789	/* OK */
				1790	break;
				1791	default:
				1792	RB_WARN_ON(cpu_buffer, 1);
				1793	return -1;
				1794	}
				1795
				1796	/*
				1797	* It is possible that an interrupt came in,
				1798	* set the head up, then more interrupts came in
				1799	* and moved it again. When we get back here,
				1800	* the page would have been set to NORMAL but we
				1801	* just set it back to HEAD.
				1802	*
				1803	* How do you detect this? Well, if that happened
				1804	* the tail page would have moved.
				1805	*/
				1806	if (ret == RB_PAGE_NORMAL) {
				1807	/*
				1808	* If the tail had moved passed next, then we need
				1809	* to reset the pointer.
				1810	*/
				1811	if (cpu_buffer->tail_page != tail_page &&
				1812	cpu_buffer->tail_page != next_page)
				1813	rb_head_page_set_normal(cpu_buffer, new_head,
				1814	next_page,
				1815	RB_PAGE_HEAD);
				1816	}
				1817
				1818	/*
				1819	* If this was the outer most commit (the one that
				1820	* changed the original pointer from HEAD to UPDATE),
				1821	* then it is up to us to reset it to NORMAL.
				1822	*/
				1823	if (type == RB_PAGE_HEAD) {
				1824	ret = rb_head_page_set_normal(cpu_buffer, next_page,
				1825	tail_page,
				1826	RB_PAGE_UPDATE);
				1827	if (RB_WARN_ON(cpu_buffer,
				1828	ret != RB_PAGE_UPDATE))
				1829	return -1;
				1830	}
				1831
				1832	return 0;
				1833	}
				1834
				1835	static unsigned rb_calculate_event_length(unsigned length)
				1836	{
				1837	struct ring_buffer_event event; /* Used only for sizeof array */
				1838
				1839	/* zero length can cause confusions */
				1840	if (!length)
				1841	length = 1;
				1842
				1843	if (length > RB_MAX_SMALL_DATA \|\| RB_FORCE_8BYTE_ALIGNMENT)
				1844	length += sizeof(event.array[0]);
				1845
				1846	length += RB_EVNT_HDR_SIZE;
				1847	length = ALIGN(length, RB_ARCH_ALIGNMENT);
				1848
				1849	return length;
				1850	}
				1851
				1852	static inline void
				1853	rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
				1854	struct buffer_page *tail_page,
				1855	unsigned long tail, unsigned long length)
				1856	{
				1857	struct ring_buffer_event *event;
				1858
				1859	/*
				1860	* Only the event that crossed the page boundary
				1861	* must fill the old tail_page with padding.
				1862	*/
				1863	if (tail >= BUF_PAGE_SIZE) {
				1864	/*
				1865	* If the page was filled, then we still need
				1866	* to update the real_end. Reset it to zero
				1867	* and the reader will ignore it.
				1868	*/
				1869	if (tail == BUF_PAGE_SIZE)
				1870	tail_page->real_end = 0;
				1871
				1872	local_sub(length, &tail_page->write);
				1873	return;
				1874	}
				1875
				1876	event = __rb_page_index(tail_page, tail);
				1877	kmemcheck_annotate_bitfield(event, bitfield);
				1878
				1879	/* account for padding bytes */
				1880	local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
				1881
				1882	/*
				1883	* Save the original length to the meta data.
				1884	* This will be used by the reader to add lost event
				1885	* counter.
				1886	*/
				1887	tail_page->real_end = tail;
				1888
				1889	/*
				1890	* If this event is bigger than the minimum size, then
				1891	* we need to be careful that we don't subtract the
				1892	* write counter enough to allow another writer to slip
				1893	* in on this page.
				1894	* We put in a discarded commit instead, to make sure
				1895	* that this space is not used again.
				1896	*
				1897	* If we are less than the minimum size, we don't need to
				1898	* worry about it.
				1899	*/
				1900	if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) {
				1901	/* No room for any events */
				1902
				1903	/* Mark the rest of the page with padding */
				1904	rb_event_set_padding(event);
				1905
				1906	/* Set the write back to the previous setting */
				1907	local_sub(length, &tail_page->write);
				1908	return;
				1909	}
				1910
				1911	/* Put in a discarded event */
				1912	event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE;
				1913	event->type_len = RINGBUF_TYPE_PADDING;
				1914	/* time delta must be non zero */
				1915	event->time_delta = 1;
				1916
				1917	/* Set write to end of buffer */
				1918	length = (tail + length) - BUF_PAGE_SIZE;
				1919	local_sub(length, &tail_page->write);
				1920	}
				1921
/*
 * This is the slow path, force gcc not to inline it.
 *
 * rb_move_tail - handle a reservation that ran past the end of @tail_page
 * @cpu_buffer: the per CPU buffer being written to
 * @length: number of bytes the caller already added to @tail_page->write
 * @tail: write index on @tail_page before @length was added
 * @tail_page: the page whose write index went past BUF_PAGE_SIZE
 * @ts: time stamp sampled by the caller for this reservation
 *
 * This function never hands back a usable event. It returns
 * ERR_PTR(-EAGAIN) when the caller should retry the reservation, or NULL
 * when the write must be dropped. On every path rb_reset_tail() is called
 * on @tail_page before returning.
 */
static noinline struct ring_buffer_event *
rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
	     unsigned long length, unsigned long tail,
	     struct buffer_page *tail_page, u64 ts)
{
	struct buffer_page *commit_page = cpu_buffer->commit_page;
	struct ring_buffer *buffer = cpu_buffer->buffer;
	struct buffer_page *next_page;
	int ret;

	/* next_page becomes the page that follows tail_page */
	next_page = tail_page;

	rb_inc_page(cpu_buffer, &next_page);

	/*
	 * If for some reason, we had an interrupt storm that made
	 * it all the way around the buffer, bail, and warn
	 * about it.
	 */
	if (unlikely(next_page == commit_page)) {
		local_inc(&cpu_buffer->commit_overrun);
		goto out_reset;
	}

	/*
	 * This is where the fun begins!
	 *
	 * We are fighting against races between a reader that
	 * could be on another CPU trying to swap its reader
	 * page with the buffer head.
	 *
	 * We are also fighting against interrupts coming in and
	 * moving the head or tail on us as well.
	 *
	 * If the next page is the head page then we have filled
	 * the buffer, unless the commit page is still on the
	 * reader page.
	 */
	if (rb_is_head_page(cpu_buffer, next_page, &tail_page->list)) {

		/*
		 * If the commit is not on the reader page, then
		 * move the header page.
		 */
		if (!rb_is_reader_page(cpu_buffer->commit_page)) {
			/*
			 * If we are not in overwrite mode,
			 * this is easy, just stop here.
			 */
			if (!(buffer->flags & RB_FL_OVERWRITE))
				goto out_reset;

			ret = rb_handle_head_page(cpu_buffer,
						  tail_page,
						  next_page);
			/* negative: drop the write */
			if (ret < 0)
				goto out_reset;
			/* positive: skip the tail update and just retry */
			if (ret)
				goto out_again;
		} else {
			/*
			 * We need to be careful here too. The
			 * commit page could still be on the reader
			 * page. We could have a small buffer, and
			 * have filled up the buffer with events
			 * from interrupts and such, and wrapped.
			 *
			 * Note, if the tail page is also on the
			 * reader_page, we let it move out.
			 */
			if (unlikely((cpu_buffer->commit_page !=
				      cpu_buffer->tail_page) &&
				     (cpu_buffer->commit_page ==
				      cpu_buffer->reader_page))) {
				local_inc(&cpu_buffer->commit_overrun);
				goto out_reset;
			}
		}
	}

	ret = rb_tail_page_update(cpu_buffer, tail_page, next_page);
	if (ret) {
		/*
		 * Nested commits always have zero deltas, so
		 * just reread the time stamp
		 */
		ts = rb_time_stamp(buffer);
		next_page->page->time_stamp = ts;
	}

 out_again:

	rb_reset_tail(cpu_buffer, tail_page, tail, length);

	/* fail and let the caller try again */
	return ERR_PTR(-EAGAIN);

 out_reset:
	/* reset write */
	rb_reset_tail(cpu_buffer, tail_page, tail, length);

	return NULL;
}
				2028
/*
 * __rb_reserve_next - reserve @length bytes on the current tail page
 * @cpu_buffer: the per CPU buffer to reserve from
 * @length: event length in bytes (grown by RB_LEN_TIME_EXTEND when
 *          @add_timestamp is set)
 * @ts: time stamp sampled by the caller
 * @delta: time delta to record in the event; forced to 0 when the event
 *         lands at index 0 of the page
 * @add_timestamp: non-zero when a time extend must precede the data event
 *
 * Returns the reserved event on success. When the reservation runs past
 * the end of the page the result of rb_move_tail() is returned instead,
 * i.e. ERR_PTR(-EAGAIN) or NULL.
 */
static struct ring_buffer_event *
__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
		  unsigned long length, u64 ts,
		  u64 delta, int add_timestamp)
{
	struct buffer_page *tail_page;
	struct ring_buffer_event *event;
	unsigned long tail, write;

	/*
	 * If the time delta since the last event is too big to
	 * hold in the time field of the event, then we append a
	 * TIME EXTEND event ahead of the data event.
	 */
	if (unlikely(add_timestamp))
		length += RB_LEN_TIME_EXTEND;

	/* claim the space by bumping the page's write counter */
	tail_page = cpu_buffer->tail_page;
	write = local_add_return(length, &tail_page->write);

	/* set write to only the index of the write */
	write &= RB_WRITE_MASK;
	/* tail is where this reservation starts on the page */
	tail = write - length;

	/*
	 * If this is the first commit on the page, then it has the same
	 * timestamp as the page itself.
	 */
	if (!tail)
		delta = 0;

	/* See if we shot pass the end of this buffer page */
	if (unlikely(write > BUF_PAGE_SIZE))
		return rb_move_tail(cpu_buffer, length, tail,
				    tail_page, ts);

	/* We reserved something on the buffer */

	event = __rb_page_index(tail_page, tail);
	kmemcheck_annotate_bitfield(event, bitfield);
	rb_update_event(cpu_buffer, event, length, add_timestamp, delta);

	local_inc(&tail_page->entries);

	/*
	 * If this is the first commit on the page, then update
	 * its timestamp.
	 */
	if (!tail)
		tail_page->page->time_stamp = ts;

	/* account for these added bytes */
	local_add(length, &cpu_buffer->entries_bytes);

	return event;
}
				2085
/*
 * rb_try_to_discard - try to give back the space of a reserved event
 * @cpu_buffer: the per CPU buffer the event was reserved on
 * @event: the event to remove
 *
 * The space can only be reclaimed when @event sits on the current tail
 * page and is the last thing written there. In that case the page's
 * write counter is moved back to the start of @event with a cmpxchg and
 * the event length is subtracted from entries_bytes.
 *
 * Returns 1 when the write counter was rolled back, 0 otherwise.
 */
static inline int
rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
		  struct ring_buffer_event *event)
{
	unsigned long new_index, old_index;
	struct buffer_page *bpage;
	unsigned long index;
	unsigned long addr;

	/* new_index: start of the event; old_index: end of the event */
	new_index = rb_event_index(event);
	old_index = new_index + rb_event_ts_length(event);
	/* addr: start of the page that holds the event */
	addr = (unsigned long)event;
	addr &= PAGE_MASK;

	bpage = cpu_buffer->tail_page;

	if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
		/* bits of the write counter above the index mask */
		unsigned long write_mask =
			local_read(&bpage->write) & ~RB_WRITE_MASK;
		unsigned long event_length = rb_event_length(event);
		/*
		 * This is on the tail page. It is possible that
		 * a write could come in and move the tail page
		 * and write to the next page. That is fine
		 * because we just shorten what is on this page.
		 */
		old_index += write_mask;
		new_index += write_mask;
		index = local_cmpxchg(&bpage->write, old_index, new_index);
		if (index == old_index) {
			/* update counters */
			local_sub(event_length, &cpu_buffer->entries_bytes);
			return 1;
		}
	}

	/* could not discard */
	return 0;
}
				2125
/*
 * rb_start_commit - mark the start of a commit on @cpu_buffer
 *
 * Bumps both the "committing" nesting counter and the running "commits"
 * counter. rb_end_commit() reads both to decide when the commit pointer
 * may be advanced.
 */
static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
{
	local_inc(&cpu_buffer->committing);
	local_inc(&cpu_buffer->commits);
}
				2131
/*
 * rb_end_commit - finish a commit started with rb_start_commit()
 * @cpu_buffer: the per CPU buffer being committed to
 *
 * When "committing" is 1, rb_set_commit_to_write() is called before the
 * counter is dropped. If "commits" changed while this ran and nobody is
 * committing any more, the sequence is repeated.
 *
 * Warns and returns early if called while "committing" is zero.
 */
static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
{
	unsigned long commits;

	if (RB_WARN_ON(cpu_buffer,
		       !local_read(&cpu_buffer->committing)))
		return;

 again:
	/* snapshot, compared below to detect commits that slipped in */
	commits = local_read(&cpu_buffer->commits);
	/* synchronize with interrupts */
	barrier();
	if (local_read(&cpu_buffer->committing) == 1)
		rb_set_commit_to_write(cpu_buffer);

	local_dec(&cpu_buffer->committing);

	/* synchronize with interrupts */
	barrier();

	/*
	 * Need to account for interrupts coming in between the
	 * updating of the commit page and the clearing of the
	 * committing counter.
	 */
	if (unlikely(local_read(&cpu_buffer->commits) != commits) &&
	    !local_read(&cpu_buffer->committing)) {
		/* re-take the commit so the loop can drop it again */
		local_inc(&cpu_buffer->committing);
		goto again;
	}
}
				2163
/*
 * rb_reserve_next_event - reserve room for the next event
 * @buffer: the ring buffer the caller believes @cpu_buffer belongs to
 * @cpu_buffer: the per CPU buffer to reserve on
 * @length: length of the event data; converted to a full event length
 *          with rb_calculate_event_length()
 *
 * Starts a commit, samples the time stamp, and calls __rb_reserve_next()
 * until it stops returning -EAGAIN (at most 1000 attempts).
 *
 * On success the event is returned with the commit still open; the
 * caller has to end it. On failure NULL is returned and the commit
 * started here has been undone.
 */
static struct ring_buffer_event *
rb_reserve_next_event(struct ring_buffer *buffer,
		      struct ring_buffer_per_cpu *cpu_buffer,
		      unsigned long length)
{
	struct ring_buffer_event *event;
	u64 ts, delta;
	int nr_loops = 0;
	int add_timestamp;
	u64 diff;

	rb_start_commit(cpu_buffer);

#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
	/*
	 * Due to the ability to swap a cpu buffer from a buffer
	 * it is possible it was swapped before we committed.
	 * (committing stops a swap). We check for it here and
	 * if it happened, we have to fail the write.
	 */
	barrier();
	if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) {
		/* undo rb_start_commit() by hand */
		local_dec(&cpu_buffer->committing);
		local_dec(&cpu_buffer->commits);
		return NULL;
	}
#endif

	length = rb_calculate_event_length(length);
 again:
	add_timestamp = 0;
	delta = 0;

	/*
	 * We allow for interrupts to reenter here and do a trace.
	 * If one does, it will cause this original code to loop
	 * back here. Even with heavy interrupts happening, this
	 * should only happen a few times in a row. If this happens
	 * 1000 times in a row, there must be either an interrupt
	 * storm or we have something buggy.
	 * Bail!
	 */
	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
		goto out_fail;

	ts = rb_time_stamp(cpu_buffer->buffer);
	diff = ts - cpu_buffer->write_stamp;

	/* make sure this diff is calculated here */
	barrier();

	/* Did the write stamp get updated already? */
	if (likely(ts >= cpu_buffer->write_stamp)) {
		delta = diff;
		/* delta does not fit in the event: request a time extend */
		if (unlikely(test_time_stamp(delta))) {
			int local_clock_stable = 1;
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
			local_clock_stable = sched_clock_stable;
#endif
			WARN_ONCE(delta > (1ULL << 59),
				  KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
				  (unsigned long long)delta,
				  (unsigned long long)ts,
				  (unsigned long long)cpu_buffer->write_stamp,
				  local_clock_stable ? "" :
				  "If you just came from a suspend/resume,\n"
				  "please switch to the trace global clock:\n"
				  " echo global > /sys/kernel/debug/tracing/trace_clock\n");
			add_timestamp = 1;
		}
	}

	event = __rb_reserve_next(cpu_buffer, length, ts,
				  delta, add_timestamp);
	if (unlikely(PTR_ERR(event) == -EAGAIN))
		goto again;

	if (!event)
		goto out_fail;

	return event;

 out_fail:
	rb_end_commit(cpu_buffer);
	return NULL;
}
				2250
				2251	#ifdef CONFIG_TRACING
				2252
				2253	#define TRACE_RECURSIVE_DEPTH 16
				2254
				2255	/* Keep this code out of the fast path cache */
				2256	static noinline void trace_recursive_fail(void)
				2257	{
				2258	/* Disable all tracing before we do anything else */
				2259	tracing_off_permanent();
				2260
				2261	printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:"
				2262	"HC[%lu]:SC[%lu]:NMI[%lu]\n",
				2263	trace_recursion_buffer(),
				2264	hardirq_count() >> HARDIRQ_SHIFT,
				2265	softirq_count() >> SOFTIRQ_SHIFT,
				2266	in_nmi());
				2267
				2268	WARN_ON_ONCE(1);
				2269	}
				2270
				2271	static inline int trace_recursive_lock(void)
				2272	{
				2273	trace_recursion_inc();
				2274
				2275	if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH))
				2276	return 0;
				2277
				2278	trace_recursive_fail();
				2279
				2280	return -1;
				2281	}
				2282
				2283	static inline void trace_recursive_unlock(void)
				2284	{
				2285	WARN_ON_ONCE(!trace_recursion_buffer());
				2286
				2287	trace_recursion_dec();
				2288	}
				2289
				2290	#else
				2291
				2292	#define trace_recursive_lock() (0)
				2293	#define trace_recursive_unlock() do { } while (0)
				2294
				2295	#endif
				2296
				2297	/**
				2298	* ring_buffer_lock_reserve - reserve a part of the buffer
				2299	* @buffer: the ring buffer to reserve from
				2300	* @length: the length of the data to reserve (excluding event header)
				2301	*
				2302	* Returns a reseverd event on the ring buffer to copy directly to.
				2303	* The user of this interface will need to get the body to write into
				2304	* and can use the ring_buffer_event_data() interface.
				2305	*
				2306	* The length is the length of the data needed, not the event length
				2307	* which also includes the event header.
				2308	*
				2309	* Must be paired with ring_buffer_unlock_commit, unless NULL is returned.
				2310	* If NULL is returned, then nothing has been allocated or locked.
				2311	*/
				2312	struct ring_buffer_event *
				2313	ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
				2314	{
				2315	struct ring_buffer_per_cpu *cpu_buffer;
				2316	struct ring_buffer_event *event;
				2317	int cpu;
				2318
				2319	if (ring_buffer_flags != RB_BUFFERS_ON)
				2320	return NULL;
				2321
				2322	/* If we are tracing schedule, we don't want to recurse */
				2323	preempt_disable_notrace();
				2324
				2325	if (atomic_read(&buffer->record_disabled))
				2326	goto out_nocheck;
				2327
				2328	if (trace_recursive_lock())
				2329	goto out_nocheck;
				2330
				2331	cpu = raw_smp_processor_id();
				2332
				2333	if (!cpumask_test_cpu(cpu, buffer->cpumask))
				2334	goto out;
				2335
				2336	cpu_buffer = buffer->buffers[cpu];
				2337
				2338	if (atomic_read(&cpu_buffer->record_disabled))
				2339	goto out;
				2340
				2341	if (length > BUF_MAX_DATA_SIZE)
				2342	goto out;
				2343
				2344	event = rb_reserve_next_event(buffer, cpu_buffer, length);
				2345	if (!event)
				2346	goto out;
				2347
				2348	return event;
				2349
				2350	out:
				2351	trace_recursive_unlock();
				2352
				2353	out_nocheck:
				2354	preempt_enable_notrace();
				2355	return NULL;
				2356	}
				2357	EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
				2358
				2359	static void
				2360	rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
				2361	struct ring_buffer_event *event)
				2362	{
				2363	u64 delta;
				2364
				2365	/*
				2366	* The event first in the commit queue updates the
				2367	* time stamp.
				2368	*/
				2369	if (rb_event_is_commit(cpu_buffer, event)) {
				2370	/*
				2371	* A commit event that is first on a page
				2372	* updates the write timestamp with the page stamp
				2373	*/
				2374	if (!rb_event_index(event))
				2375	cpu_buffer->write_stamp =
				2376	cpu_buffer->commit_page->page->time_stamp;
				2377	else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
				2378	delta = event->array[0];
				2379	delta <<= TS_SHIFT;
				2380	delta += event->time_delta;
				2381	cpu_buffer->write_stamp += delta;
				2382	} else
				2383	cpu_buffer->write_stamp += event->time_delta;
				2384	}
				2385	}
				2386
/*
 * rb_commit - commit a reserved event
 * @cpu_buffer: the per CPU buffer the event was reserved on
 * @event: the event to commit
 *
 * Counts the entry, updates the write stamp and then ends the commit
 * that was opened when the event was reserved.
 */
static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
		      struct ring_buffer_event *event)
{
	local_inc(&cpu_buffer->entries);
	rb_update_write_stamp(cpu_buffer, event);
	rb_end_commit(cpu_buffer);
}
				2394
				2395	/**
				2396	* ring_buffer_unlock_commit - commit a reserved
				2397	* @buffer: The buffer to commit to
				2398	* @event: The event pointer to commit.
				2399	*
				2400	* This commits the data to the ring buffer, and releases any locks held.
				2401	*
				2402	* Must be paired with ring_buffer_lock_reserve.
				2403	*/
				2404	int ring_buffer_unlock_commit(struct ring_buffer *buffer,
				2405	struct ring_buffer_event *event)
				2406	{
				2407	struct ring_buffer_per_cpu *cpu_buffer;
				2408	int cpu = raw_smp_processor_id();
				2409
				2410	cpu_buffer = buffer->buffers[cpu];
				2411
				2412	rb_commit(cpu_buffer, event);
				2413
				2414	trace_recursive_unlock();
				2415
				2416	preempt_enable_notrace();
				2417
				2418	return 0;
				2419	}
				2420	EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
				2421
/*
 * rb_event_discard - turn an event into padding
 * @event: the event to discard
 *
 * If @event is a time extend, the event that follows it is the one
 * converted. The length is stored before the type is overwritten.
 */
static inline void rb_event_discard(struct ring_buffer_event *event)
{
	if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
		event = skip_time_extend(event);

	/* array[0] holds the actual length for the discarded event */
	event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
	event->type_len = RINGBUF_TYPE_PADDING;
	/* time delta must be non zero */
	if (!event->time_delta)
		event->time_delta = 1;
}
				2434
				2435	/*
				2436	* Decrement the entries to the page that an event is on.
				2437	* The event does not even need to exist, only the pointer
				2438	* to the page it is on. This may only be called before the commit
				2439	* takes place.
				2440	*/
				2441	static inline void
				2442	rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer,
				2443	struct ring_buffer_event *event)
				2444	{
				2445	unsigned long addr = (unsigned long)event;
				2446	struct buffer_page *bpage = cpu_buffer->commit_page;
				2447	struct buffer_page *start;
				2448
				2449	addr &= PAGE_MASK;
				2450
				2451	/* Do the likely case first */
				2452	if (likely(bpage->page == (void *)addr)) {
				2453	local_dec(&bpage->entries);
				2454	return;
				2455	}
				2456
				2457	/*
				2458	* Because the commit page may be on the reader page we
				2459	* start with the next page and check the end loop there.
				2460	*/
				2461	rb_inc_page(cpu_buffer, &bpage);
				2462	start = bpage;
				2463	do {
				2464	if (bpage->page == (void *)addr) {
				2465	local_dec(&bpage->entries);
				2466	return;
				2467	}
				2468	rb_inc_page(cpu_buffer, &bpage);
				2469	} while (bpage != start);
				2470
				2471	/* commit not part of this buffer?? */
				2472	RB_WARN_ON(cpu_buffer, 1);
				2473	}
				2474
/**
 * ring_buffer_discard_commit - discard an event that has not been committed
 * @buffer: the ring buffer
 * @event: non committed event to discard
 *
 * Sometimes an event that is in the ring buffer needs to be ignored.
 * This function lets the user discard an event in the ring buffer
 * and then that event will not be read later.
 *
 * This function only works if it is called before the item has been
 * committed. It will try to free the event from the ring buffer
 * if another event has not been added behind it.
 *
 * If another event has been added behind it, it will set the event
 * up as discarded, and perform the commit.
 *
 * If this function is called, do not call ring_buffer_unlock_commit on
 * the event.
 */
void ring_buffer_discard_commit(struct ring_buffer *buffer,
				struct ring_buffer_event *event)
{
	struct ring_buffer_per_cpu *cpu_buffer;
	int cpu;

	/* The event is discarded regardless */
	rb_event_discard(event);

	cpu = smp_processor_id();
	cpu_buffer = buffer->buffers[cpu];

	/*
	 * This must only be called if the event has not been
	 * committed yet. Thus we can assume that preemption
	 * is still disabled.
	 */
	RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing));

	rb_decrement_entry(cpu_buffer, event);
	/* space reclaimed: nothing is left for a reader to see */
	if (rb_try_to_discard(cpu_buffer, event))
		goto out;

	/*
	 * The commit is still visible by the reader, so we
	 * must still update the timestamp.
	 */
	rb_update_write_stamp(cpu_buffer, event);
 out:
	rb_end_commit(cpu_buffer);

	trace_recursive_unlock();

	preempt_enable_notrace();

}
EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
				2531
				2532	/**
				2533	* ring_buffer_write - write data to the buffer without reserving
				2534	* @buffer: The ring buffer to write to.
				2535	* @length: The length of the data being written (excluding the event header)
				2536	* @data: The data to write to the buffer.
				2537	*
				2538	* This is like ring_buffer_lock_reserve and ring_buffer_unlock_commit as
				2539	* one function. If you already have the data to write to the buffer, it
				2540	* may be easier to simply call this function.
				2541	*
				2542	* Note, like ring_buffer_lock_reserve, the length is the length of the data
				2543	* and not the length of the event which would hold the header.
				2544	*/
				2545	int ring_buffer_write(struct ring_buffer *buffer,
				2546	unsigned long length,
				2547	void *data)
				2548	{
				2549	struct ring_buffer_per_cpu *cpu_buffer;
				2550	struct ring_buffer_event *event;
				2551	void *body;
				2552	int ret = -EBUSY;
				2553	int cpu;
				2554
				2555	if (ring_buffer_flags != RB_BUFFERS_ON)
				2556	return -EBUSY;
				2557
				2558	preempt_disable_notrace();
				2559
				2560	if (atomic_read(&buffer->record_disabled))
				2561	goto out;
				2562
				2563	cpu = raw_smp_processor_id();
				2564
				2565	if (!cpumask_test_cpu(cpu, buffer->cpumask))
				2566	goto out;
				2567
				2568	cpu_buffer = buffer->buffers[cpu];
				2569
				2570	if (atomic_read(&cpu_buffer->record_disabled))
				2571	goto out;
				2572
				2573	if (length > BUF_MAX_DATA_SIZE)
				2574	goto out;
				2575
				2576	event = rb_reserve_next_event(buffer, cpu_buffer, length);
				2577	if (!event)
				2578	goto out;
				2579
				2580	body = rb_event_data(event);
				2581
				2582	memcpy(body, data, length);
				2583
				2584	rb_commit(cpu_buffer, event);
				2585
				2586	ret = 0;
				2587	out:
				2588	preempt_enable_notrace();
				2589
				2590	return ret;
				2591	}
				2592	EXPORT_SYMBOL_GPL(ring_buffer_write);
				2593
				2594	static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
				2595	{
				2596	struct buffer_page *reader = cpu_buffer->reader_page;
				2597	struct buffer_page *head = rb_set_head_page(cpu_buffer);
				2598	struct buffer_page *commit = cpu_buffer->commit_page;
				2599
				2600	/* In case of error, head will be NULL */
				2601	if (unlikely(!head))
				2602	return 1;
				2603
				2604	return reader->read == rb_page_commit(reader) &&
				2605	(commit == reader \|\|
				2606	(commit == head &&
				2607	head->read == rb_page_commit(commit)));
				2608	}
				2609
/**
 * ring_buffer_record_disable - stop all writes into the buffer
 * @buffer: The ring buffer to stop writes to.
 *
 * This prevents all writes to the buffer. Any attempt to write
 * to the buffer after this will fail and return NULL.
 *
 * Implemented as a counter increment, so every call needs a matching
 * ring_buffer_record_enable().
 *
 * The caller should call synchronize_sched() after this.
 */
void ring_buffer_record_disable(struct ring_buffer *buffer)
{
	atomic_inc(&buffer->record_disabled);
}
EXPORT_SYMBOL_GPL(ring_buffer_record_disable);
				2624
/**
 * ring_buffer_record_enable - enable writes to the buffer
 * @buffer: The ring buffer to enable writes
 *
 * Decrements the counter raised by ring_buffer_record_disable().
 *
 * Note, multiple disables will need the same number of enables
 * to truly enable the writing (much like preempt_disable).
 */
void ring_buffer_record_enable(struct ring_buffer *buffer)
{
	atomic_dec(&buffer->record_disabled);
}
EXPORT_SYMBOL_GPL(ring_buffer_record_enable);
				2637
				2638	/**
				2639	* ring_buffer_record_off - stop all writes into the buffer
				2640	* @buffer: The ring buffer to stop writes to.
				2641	*
				2642	* This prevents all writes to the buffer. Any attempt to write
				2643	* to the buffer after this will fail and return NULL.
				2644	*
				2645	* This is different than ring_buffer_record_disable() as
				2646	* it works like an on/off switch, where as the disable() verison
				2647	* must be paired with a enable().
				2648	*/
				2649	void ring_buffer_record_off(struct ring_buffer *buffer)
				2650	{
				2651	unsigned int rd;
				2652	unsigned int new_rd;
				2653
				2654	do {
				2655	rd = atomic_read(&buffer->record_disabled);
				2656	new_rd = rd \| RB_BUFFER_OFF;
				2657	} while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd);
				2658	}
				2659	EXPORT_SYMBOL_GPL(ring_buffer_record_off);
				2660
				2661	/**
				2662	* ring_buffer_record_on - restart writes into the buffer
				2663	* @buffer: The ring buffer to start writes to.
				2664	*
				2665	* This enables all writes to the buffer that was disabled by
				2666	* ring_buffer_record_off().
				2667	*
				2668	* This is different than ring_buffer_record_enable() as
				2669	* it works like an on/off switch, where as the enable() verison
				2670	* must be paired with a disable().
				2671	*/
				2672	void ring_buffer_record_on(struct ring_buffer *buffer)
				2673	{
				2674	unsigned int rd;
				2675	unsigned int new_rd;
				2676
				2677	do {
				2678	rd = atomic_read(&buffer->record_disabled);
				2679	new_rd = rd & ~RB_BUFFER_OFF;
				2680	} while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd);
				2681	}
				2682	EXPORT_SYMBOL_GPL(ring_buffer_record_on);
				2683
/**
 * ring_buffer_record_is_on - return true if the ring buffer can write
 * @buffer: The ring buffer to see if write is enabled
 *
 * Returns true if the ring buffer is in a state that it accepts writes,
 * i.e. when the record_disabled counter reads zero.
 */
int ring_buffer_record_is_on(struct ring_buffer *buffer)
{
	return !atomic_read(&buffer->record_disabled);
}
				2694
				2695	/**
				2696	* ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer
				2697	* @buffer: The ring buffer to stop writes to.
				2698	* @cpu: The CPU buffer to stop
				2699	*
				2700	* This prevents all writes to the buffer. Any attempt to write
				2701	* to the buffer after this will fail and return NULL.
				2702	*
				2703	* The caller should call synchronize_sched() after this.
				2704	*/
				2705	void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu)
				2706	{
				2707	struct ring_buffer_per_cpu *cpu_buffer;
				2708
				2709	if (!cpumask_test_cpu(cpu, buffer->cpumask))
				2710	return;
				2711
				2712	cpu_buffer = buffer->buffers[cpu];
				2713	atomic_inc(&cpu_buffer->record_disabled);
				2714	}
				2715	EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu);
				2716
				2717	/**
				2718	* ring_buffer_record_enable_cpu - enable writes to the buffer
				2719	* @buffer: The ring buffer to enable writes
				2720	* @cpu: The CPU to enable.
				2721	*
				2722	* Note, multiple disables will need the same number of enables
				2723	* to truly enable the writing (much like preempt_disable).
				2724	*/
				2725	void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
				2726	{
				2727	struct ring_buffer_per_cpu *cpu_buffer;
				2728
				2729	if (!cpumask_test_cpu(cpu, buffer->cpumask))
				2730	return;
				2731
				2732	cpu_buffer = buffer->buffers[cpu];
				2733	atomic_dec(&cpu_buffer->record_disabled);
				2734	}
				2735	EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
				2736
				2737	/*
				2738	* The total entries in the ring buffer is the running counter
				2739	* of entries entered into the ring buffer, minus the sum of
				2740	* the entries read from the ring buffer and the number of
				2741	* entries that were overwritten.
				2742	*/
				2743	static inline unsigned long
				2744	rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
				2745	{
				2746	return local_read(&cpu_buffer->entries) -
				2747	(local_read(&cpu_buffer->overrun) + cpu_buffer->read);
				2748	}
				2749
				2750	/**
				2751	* ring_buffer_oldest_event_ts - get the oldest event timestamp from the buffer
				2752	* @buffer: The ring buffer
				2753	* @cpu: The per CPU buffer to read from.
				2754	*/
				2755	unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu)
				2756	{
				2757	unsigned long flags;
				2758	struct ring_buffer_per_cpu *cpu_buffer;
				2759	struct buffer_page *bpage;
				2760	unsigned long ret = 0;
				2761
				2762	if (!cpumask_test_cpu(cpu, buffer->cpumask))
				2763	return 0;
				2764
				2765	cpu_buffer = buffer->buffers[cpu];
				2766	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
				2767	/*
				2768	* if the tail is on reader_page, oldest time stamp is on the reader
				2769	* page
				2770	*/
				2771	if (cpu_buffer->tail_page == cpu_buffer->reader_page)
				2772	bpage = cpu_buffer->reader_page;
				2773	else
				2774	bpage = rb_set_head_page(cpu_buffer);
				2775	if (bpage)
				2776	ret = bpage->page->time_stamp;
				2777	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
				2778
				2779	return ret;
				2780	}
				2781	EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts);
				2782
				2783	/**
				2784	* ring_buffer_bytes_cpu - get the number of bytes consumed in a cpu buffer
				2785	* @buffer: The ring buffer
				2786	* @cpu: The per CPU buffer to read from.
				2787	*/
				2788	unsigned long ring_buffer_bytes_cpu(struct ring_buffer *buffer, int cpu)
				2789	{
				2790	struct ring_buffer_per_cpu *cpu_buffer;
				2791	unsigned long ret;
				2792
				2793	if (!cpumask_test_cpu(cpu, buffer->cpumask))
				2794	return 0;
				2795
				2796	cpu_buffer = buffer->buffers[cpu];
				2797	ret = local_read(&cpu_buffer->entries_bytes) - cpu_buffer->read_bytes;
				2798
				2799	return ret;
				2800	}
				2801	EXPORT_SYMBOL_GPL(ring_buffer_bytes_cpu);
				2802
				2803	/**
				2804	* ring_buffer_entries_cpu - get the number of entries in a cpu buffer
				2805	* @buffer: The ring buffer
				2806	* @cpu: The per CPU buffer to get the entries from.
				2807	*/
				2808	unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
				2809	{
				2810	struct ring_buffer_per_cpu *cpu_buffer;
				2811
				2812	if (!cpumask_test_cpu(cpu, buffer->cpumask))
				2813	return 0;
				2814
				2815	cpu_buffer = buffer->buffers[cpu];
				2816
				2817	return rb_num_of_entries(cpu_buffer);
				2818	}
				2819	EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
				2820
				2821	/**
				2822	* ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer
				2823	* @buffer: The ring buffer
				2824	* @cpu: The per CPU buffer to get the number of overruns from
				2825	*/
				2826	unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
				2827	{
				2828	struct ring_buffer_per_cpu *cpu_buffer;
				2829	unsigned long ret;
				2830
				2831	if (!cpumask_test_cpu(cpu, buffer->cpumask))
				2832	return 0;
				2833
				2834	cpu_buffer = buffer->buffers[cpu];
				2835	ret = local_read(&cpu_buffer->overrun);
				2836
				2837	return ret;
				2838	}
				2839	EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
				2840
				2841	/**
				2842	* ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits
				2843	* @buffer: The ring buffer
				2844	* @cpu: The per CPU buffer to get the number of overruns from
				2845	*/
				2846	unsigned long
				2847	ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
				2848	{
				2849	struct ring_buffer_per_cpu *cpu_buffer;
				2850	unsigned long ret;
				2851
				2852	if (!cpumask_test_cpu(cpu, buffer->cpumask))
				2853	return 0;
				2854
				2855	cpu_buffer = buffer->buffers[cpu];
				2856	ret = local_read(&cpu_buffer->commit_overrun);
				2857
				2858	return ret;
				2859	}
				2860	EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu);
				2861
				2862	/**
				2863	* ring_buffer_entries - get the number of entries in a buffer
				2864	* @buffer: The ring buffer
				2865	*
				2866	* Returns the total number of entries in the ring buffer
				2867	* (all CPU entries)
				2868	*/
				2869	unsigned long ring_buffer_entries(struct ring_buffer *buffer)
				2870	{
				2871	struct ring_buffer_per_cpu *cpu_buffer;
				2872	unsigned long entries = 0;
				2873	int cpu;
				2874
				2875	/* if you care about this being correct, lock the buffer */
				2876	for_each_buffer_cpu(buffer, cpu) {
				2877	cpu_buffer = buffer->buffers[cpu];
				2878	entries += rb_num_of_entries(cpu_buffer);
				2879	}
				2880
				2881	return entries;
				2882	}
				2883	EXPORT_SYMBOL_GPL(ring_buffer_entries);
				2884
				2885	/**
				2886	* ring_buffer_overruns - get the number of overruns in buffer
				2887	* @buffer: The ring buffer
				2888	*
				2889	* Returns the total number of overruns in the ring buffer
				2890	* (all CPU entries)
				2891	*/
				2892	unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
				2893	{
				2894	struct ring_buffer_per_cpu *cpu_buffer;
				2895	unsigned long overruns = 0;
				2896	int cpu;
				2897
				2898	/* if you care about this being correct, lock the buffer */
				2899	for_each_buffer_cpu(buffer, cpu) {
				2900	cpu_buffer = buffer->buffers[cpu];
				2901	overruns += local_read(&cpu_buffer->overrun);
				2902	}
				2903
				2904	return overruns;
				2905	}
				2906	EXPORT_SYMBOL_GPL(ring_buffer_overruns);
				2907
				2908	static void rb_iter_reset(struct ring_buffer_iter *iter)
				2909	{
				2910	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
				2911
				2912	/* Iterator usage is expected to have record disabled */
				2913	if (list_empty(&cpu_buffer->reader_page->list)) {
				2914	iter->head_page = rb_set_head_page(cpu_buffer);
				2915	if (unlikely(!iter->head_page))
				2916	return;
				2917	iter->head = iter->head_page->read;
				2918	} else {
				2919	iter->head_page = cpu_buffer->reader_page;
				2920	iter->head = cpu_buffer->reader_page->read;
				2921	}
				2922	if (iter->head)
				2923	iter->read_stamp = cpu_buffer->read_stamp;
				2924	else
				2925	iter->read_stamp = iter->head_page->page->time_stamp;
				2926	iter->cache_reader_page = cpu_buffer->reader_page;
				2927	iter->cache_read = cpu_buffer->read;
				2928	}
				2929
				2930	/**
				2931	* ring_buffer_iter_reset - reset an iterator
				2932	* @iter: The iterator to reset
				2933	*
				2934	* Resets the iterator, so that it will start from the beginning
				2935	* again.
				2936	*/
				2937	void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
				2938	{
				2939	struct ring_buffer_per_cpu *cpu_buffer;
				2940	unsigned long flags;
				2941	int locked;
				2942
				2943	if (!iter)
				2944	return;
				2945
				2946	cpu_buffer = iter->cpu_buffer;
				2947
				2948	locked = read_buffer_lock(cpu_buffer, &flags);
				2949	rb_iter_reset(iter);
				2950	read_buffer_unlock(cpu_buffer, flags, locked);
				2951	}
				2952	EXPORT_SYMBOL_GPL(ring_buffer_iter_reset);
				2953
				2954	/**
				2955	* ring_buffer_iter_empty - check if an iterator has no more to read
				2956	* @iter: The iterator to check
				2957	*/
				2958	int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
				2959	{
				2960	struct ring_buffer_per_cpu *cpu_buffer;
				2961
				2962	cpu_buffer = iter->cpu_buffer;
				2963
				2964	return iter->head_page == cpu_buffer->commit_page &&
				2965	iter->head == rb_commit_index(cpu_buffer);
				2966	}
				2967	EXPORT_SYMBOL_GPL(ring_buffer_iter_empty);
				2968
				2969	static void
				2970	rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
				2971	struct ring_buffer_event *event)
				2972	{
				2973	u64 delta;
				2974
				2975	switch (event->type_len) {
				2976	case RINGBUF_TYPE_PADDING:
				2977	return;
				2978
				2979	case RINGBUF_TYPE_TIME_EXTEND:
				2980	delta = event->array[0];
				2981	delta <<= TS_SHIFT;
				2982	delta += event->time_delta;
				2983	cpu_buffer->read_stamp += delta;
				2984	return;
				2985
				2986	case RINGBUF_TYPE_TIME_STAMP:
				2987	/* FIXME: not implemented */
				2988	return;
				2989
				2990	case RINGBUF_TYPE_DATA:
				2991	cpu_buffer->read_stamp += event->time_delta;
				2992	return;
				2993
				2994	default:
				2995	BUG();
				2996	}
				2997	return;
				2998	}
				2999
				3000	static void
				3001	rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
				3002	struct ring_buffer_event *event)
				3003	{
				3004	u64 delta;
				3005
				3006	switch (event->type_len) {
				3007	case RINGBUF_TYPE_PADDING:
				3008	return;
				3009
				3010	case RINGBUF_TYPE_TIME_EXTEND:
				3011	delta = event->array[0];
				3012	delta <<= TS_SHIFT;
				3013	delta += event->time_delta;
				3014	iter->read_stamp += delta;
				3015	return;
				3016
				3017	case RINGBUF_TYPE_TIME_STAMP:
				3018	/* FIXME: not implemented */
				3019	return;
				3020
				3021	case RINGBUF_TYPE_DATA:
				3022	iter->read_stamp += event->time_delta;
				3023	return;
				3024
				3025	default:
				3026	BUG();
				3027	}
				3028	return;
				3029	}
				3030
				3031	static struct buffer_page *
				3032	rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
				3033	{
				3034	struct buffer_page *reader = NULL;
				3035	unsigned long overwrite;
				3036	unsigned long flags;
				3037	int nr_loops = 0;
				3038	int ret;
				3039
				3040	local_irq_save(flags);
				3041	arch_spin_lock(&cpu_buffer->lock);
				3042
				3043	again:
				3044	/*
				3045	* This should normally only loop twice. But because the
				3046	* start of the reader inserts an empty page, it causes
				3047	* a case where we will loop three times. There should be no
				3048	* reason to loop four times (that I know of).
				3049	*/
				3050	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) {
				3051	reader = NULL;
				3052	goto out;
				3053	}
				3054
				3055	reader = cpu_buffer->reader_page;
				3056
				3057	/* If there's more to read, return this page */
				3058	if (cpu_buffer->reader_page->read < rb_page_size(reader))
				3059	goto out;
				3060
				3061	/* Never should we have an index greater than the size */
				3062	if (RB_WARN_ON(cpu_buffer,
				3063	cpu_buffer->reader_page->read > rb_page_size(reader)))
				3064	goto out;
				3065
				3066	/* check if we caught up to the tail */
				3067	reader = NULL;
				3068	if (cpu_buffer->commit_page == cpu_buffer->reader_page)
				3069	goto out;
				3070
				3071	/*
				3072	* Reset the reader page to size zero.
				3073	*/
				3074	local_set(&cpu_buffer->reader_page->write, 0);
				3075	local_set(&cpu_buffer->reader_page->entries, 0);
				3076	local_set(&cpu_buffer->reader_page->page->commit, 0);
				3077	cpu_buffer->reader_page->real_end = 0;
				3078
				3079	spin:
				3080	/*
				3081	* Splice the empty reader page into the list around the head.
				3082	*/
				3083	reader = rb_set_head_page(cpu_buffer);
				3084	if (!reader)
				3085	goto out;
				3086	cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next);
				3087	cpu_buffer->reader_page->list.prev = reader->list.prev;
				3088
				3089	/*
				3090	* cpu_buffer->pages just needs to point to the buffer, it
				3091	* has no specific buffer page to point to. Lets move it out
				3092	* of our way so we don't accidentally swap it.
				3093	*/
				3094	cpu_buffer->pages = reader->list.prev;
				3095
				3096	/* The reader page will be pointing to the new head */
				3097	rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
				3098
				3099	/*
				3100	* We want to make sure we read the overruns after we set up our
				3101	* pointers to the next object. The writer side does a
				3102	* cmpxchg to cross pages which acts as the mb on the writer
				3103	* side. Note, the reader will constantly fail the swap
				3104	* while the writer is updating the pointers, so this
				3105	* guarantees that the overwrite recorded here is the one we
				3106	* want to compare with the last_overrun.
				3107	*/
				3108	smp_mb();
				3109	overwrite = local_read(&(cpu_buffer->overrun));
				3110
				3111	/*
				3112	* Here's the tricky part.
				3113	*
				3114	* We need to move the pointer past the header page.
				3115	* But we can only do that if a writer is not currently
				3116	* moving it. The page before the header page has the
				3117	* flag bit '1' set if it is pointing to the page we want.
				3118	* but if the writer is in the process of moving it
				3119	* than it will be '2' or already moved '0'.
				3120	*/
				3121
				3122	ret = rb_head_page_replace(reader, cpu_buffer->reader_page);
				3123
				3124	/*
				3125	* If we did not convert it, then we must try again.
				3126	*/
				3127	if (!ret)
				3128	goto spin;
				3129
				3130	/*
				3131	* Yeah! We succeeded in replacing the page.
				3132	*
				3133	* Now make the new head point back to the reader page.
				3134	*/
				3135	rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list;
				3136	rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
				3137
				3138	/* Finally update the reader page to the new head */
				3139	cpu_buffer->reader_page = reader;
				3140	rb_reset_reader_page(cpu_buffer);
				3141
				3142	if (overwrite != cpu_buffer->last_overrun) {
				3143	cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun;
				3144	cpu_buffer->last_overrun = overwrite;
				3145	}
				3146
				3147	goto again;
				3148
				3149	out:
				3150	arch_spin_unlock(&cpu_buffer->lock);
				3151	local_irq_restore(flags);
				3152
				3153	return reader;
				3154	}
				3155
				3156	static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
				3157	{
				3158	struct ring_buffer_event *event;
				3159	struct buffer_page *reader;
				3160	unsigned length;
				3161
				3162	reader = rb_get_reader_page(cpu_buffer);
				3163
				3164	/* This function should not be called when buffer is empty */
				3165	if (RB_WARN_ON(cpu_buffer, !reader))
				3166	return;
				3167
				3168	event = rb_reader_event(cpu_buffer);
				3169
				3170	if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
				3171	cpu_buffer->read++;
				3172
				3173	rb_update_read_stamp(cpu_buffer, event);
				3174
				3175	length = rb_event_length(event);
				3176	cpu_buffer->reader_page->read += length;
				3177	}
				3178
				3179	static void rb_advance_iter(struct ring_buffer_iter *iter)
				3180	{
				3181	struct ring_buffer_per_cpu *cpu_buffer;
				3182	struct ring_buffer_event *event;
				3183	unsigned length;
				3184
				3185	cpu_buffer = iter->cpu_buffer;
				3186
				3187	/*
				3188	* Check if we are at the end of the buffer.
				3189	*/
				3190	if (iter->head >= rb_page_size(iter->head_page)) {
				3191	/* discarded commits can make the page empty */
				3192	if (iter->head_page == cpu_buffer->commit_page)
				3193	return;
				3194	rb_inc_iter(iter);
				3195	return;
				3196	}
				3197
				3198	event = rb_iter_head_event(iter);
				3199
				3200	length = rb_event_length(event);
				3201
				3202	/*
				3203	* This should not be called to advance the header if we are
				3204	* at the tail of the buffer.
				3205	*/
				3206	if (RB_WARN_ON(cpu_buffer,
				3207	(iter->head_page == cpu_buffer->commit_page) &&
				3208	(iter->head + length > rb_commit_index(cpu_buffer))))
				3209	return;
				3210
				3211	rb_update_iter_read_stamp(iter, event);
				3212
				3213	iter->head += length;
				3214
				3215	/* check for end of page padding */
				3216	if ((iter->head >= rb_page_size(iter->head_page)) &&
				3217	(iter->head_page != cpu_buffer->commit_page))
				3218	rb_advance_iter(iter);
				3219	}
				3220
				3221	static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer)
				3222	{
				3223	return cpu_buffer->lost_events;
				3224	}
				3225
				3226	static struct ring_buffer_event *
				3227	rb_buffer_peek(struct ring_buffer_per_cpu cpu_buffer, u64 ts,
				3228	unsigned long *lost_events)
				3229	{
				3230	struct ring_buffer_event *event;
				3231	struct buffer_page *reader;
				3232	int nr_loops = 0;
				3233
				3234	again:
				3235	/*
				3236	* We repeat when a time extend is encountered.
				3237	* Since the time extend is always attached to a data event,
				3238	* we should never loop more than once.
				3239	* (We never hit the following condition more than twice).
				3240	*/
				3241	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
				3242	return NULL;
				3243
				3244	reader = rb_get_reader_page(cpu_buffer);
				3245	if (!reader)
				3246	return NULL;
				3247
				3248	event = rb_reader_event(cpu_buffer);
				3249
				3250	switch (event->type_len) {
				3251	case RINGBUF_TYPE_PADDING:
				3252	if (rb_null_event(event))
				3253	RB_WARN_ON(cpu_buffer, 1);
				3254	/*
				3255	* Because the writer could be discarding every
				3256	* event it creates (which would probably be bad)
				3257	* if we were to go back to "again" then we may never
				3258	* catch up, and will trigger the warn on, or lock
				3259	* the box. Return the padding, and we will release
				3260	* the current locks, and try again.
				3261	*/
				3262	return event;
				3263
				3264	case RINGBUF_TYPE_TIME_EXTEND:
				3265	/* Internal data, OK to advance */
				3266	rb_advance_reader(cpu_buffer);
				3267	goto again;
				3268
				3269	case RINGBUF_TYPE_TIME_STAMP:
				3270	/* FIXME: not implemented */
				3271	rb_advance_reader(cpu_buffer);
				3272	goto again;
				3273
				3274	case RINGBUF_TYPE_DATA:
				3275	if (ts) {
				3276	*ts = cpu_buffer->read_stamp + event->time_delta;
				3277	ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
				3278	cpu_buffer->cpu, ts);
				3279	}
				3280	if (lost_events)
				3281	*lost_events = rb_lost_events(cpu_buffer);
				3282	return event;
				3283
				3284	default:
				3285	BUG();
				3286	}
				3287
				3288	return NULL;
				3289	}
				3290	EXPORT_SYMBOL_GPL(ring_buffer_peek);
				3291
				3292	static struct ring_buffer_event *
				3293	rb_iter_peek(struct ring_buffer_iter iter, u64 ts)
				3294	{
				3295	struct ring_buffer *buffer;
				3296	struct ring_buffer_per_cpu *cpu_buffer;
				3297	struct ring_buffer_event *event;
				3298	int nr_loops = 0;
				3299
				3300	cpu_buffer = iter->cpu_buffer;
				3301	buffer = cpu_buffer->buffer;
				3302
				3303	/*
				3304	* Check if someone performed a consuming read to
				3305	* the buffer. A consuming read invalidates the iterator
				3306	* and we need to reset the iterator in this case.
				3307	*/
				3308	if (unlikely(iter->cache_read != cpu_buffer->read \|\|
				3309	iter->cache_reader_page != cpu_buffer->reader_page))
				3310	rb_iter_reset(iter);
				3311
				3312	again:
				3313	if (ring_buffer_iter_empty(iter))
				3314	return NULL;
				3315
				3316	/*
				3317	* We repeat when a time extend is encountered.
				3318	* Since the time extend is always attached to a data event,
				3319	* we should never loop more than once.
				3320	* (We never hit the following condition more than twice).
				3321	*/
				3322	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
				3323	return NULL;
				3324
				3325	if (rb_per_cpu_empty(cpu_buffer))
				3326	return NULL;
				3327
				3328	if (iter->head >= local_read(&iter->head_page->page->commit)) {
				3329	rb_inc_iter(iter);
				3330	goto again;
				3331	}
				3332
				3333	event = rb_iter_head_event(iter);
				3334
				3335	switch (event->type_len) {
				3336	case RINGBUF_TYPE_PADDING:
				3337	if (rb_null_event(event)) {
				3338	rb_inc_iter(iter);
				3339	goto again;
				3340	}
				3341	rb_advance_iter(iter);
				3342	return event;
				3343
				3344	case RINGBUF_TYPE_TIME_EXTEND:
				3345	/* Internal data, OK to advance */
				3346	rb_advance_iter(iter);
				3347	goto again;
				3348
				3349	case RINGBUF_TYPE_TIME_STAMP:
				3350	/* FIXME: not implemented */
				3351	rb_advance_iter(iter);
				3352	goto again;
				3353
				3354	case RINGBUF_TYPE_DATA:
				3355	if (ts) {
				3356	*ts = iter->read_stamp + event->time_delta;
				3357	ring_buffer_normalize_time_stamp(buffer,
				3358	cpu_buffer->cpu, ts);
				3359	}
				3360	return event;
				3361
				3362	default:
				3363	BUG();
				3364	}
				3365
				3366	return NULL;
				3367	}
				3368	EXPORT_SYMBOL_GPL(ring_buffer_iter_peek);
				3369
				3370	/**
				3371	* ring_buffer_peek - peek at the next event to be read
				3372	* @buffer: The ring buffer to read
				3373	* @cpu: The cpu to peak at
				3374	* @ts: The timestamp counter of this event.
				3375	* @lost_events: a variable to store if events were lost (may be NULL)
				3376	*
				3377	* This will return the event that will be read next, but does
				3378	* not consume the data.
				3379	*/
				3380	struct ring_buffer_event *
				3381	ring_buffer_peek(struct ring_buffer buffer, int cpu, u64 ts,
				3382	unsigned long *lost_events)
				3383	{
				3384	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
				3385	struct ring_buffer_event *event;
				3386	unsigned long flags;
				3387	int locked;
				3388
				3389	if (!cpumask_test_cpu(cpu, buffer->cpumask))
				3390	return NULL;
				3391
				3392	again:
				3393	locked = read_buffer_lock(cpu_buffer, &flags);
				3394	event = rb_buffer_peek(cpu_buffer, ts, lost_events);
				3395	if (event && event->type_len == RINGBUF_TYPE_PADDING)
				3396	rb_advance_reader(cpu_buffer);
				3397	read_buffer_unlock(cpu_buffer, flags, locked);
				3398
				3399	if (event && event->type_len == RINGBUF_TYPE_PADDING)
				3400	goto again;
				3401
				3402	return event;
				3403	}
				3404
				3405	/**
				3406	* ring_buffer_iter_peek - peek at the next event to be read
				3407	* @iter: The ring buffer iterator
				3408	* @ts: The timestamp counter of this event.
				3409	*
				3410	* This will return the event that will be read next, but does
				3411	* not increment the iterator.
				3412	*/
				3413	struct ring_buffer_event *
				3414	ring_buffer_iter_peek(struct ring_buffer_iter iter, u64 ts)
				3415	{
				3416	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
				3417	struct ring_buffer_event *event;
				3418	unsigned long flags;
				3419	int locked;
				3420
				3421	again:
				3422	locked = read_buffer_lock(cpu_buffer, &flags);
				3423	event = rb_iter_peek(iter, ts);
				3424	read_buffer_unlock(cpu_buffer, flags, locked);
				3425
				3426	if (event && event->type_len == RINGBUF_TYPE_PADDING)
				3427	goto again;
				3428
				3429	return event;
				3430	}
				3431
				3432	/**
				3433	* ring_buffer_consume - return an event and consume it
				3434	* @buffer: The ring buffer to get the next event from
				3435	* @cpu: the cpu to read the buffer from
				3436	* @ts: a variable to store the timestamp (may be NULL)
				3437	* @lost_events: a variable to store if events were lost (may be NULL)
				3438	*
				3439	* Returns the next event in the ring buffer, and that event is consumed.
				3440	* Meaning, that sequential reads will keep returning a different event,
				3441	* and eventually empty the ring buffer if the producer is slower.
				3442	*/
				3443	struct ring_buffer_event *
				3444	ring_buffer_consume(struct ring_buffer buffer, int cpu, u64 ts,
				3445	unsigned long *lost_events)
				3446	{
				3447	struct ring_buffer_per_cpu *cpu_buffer;
				3448	struct ring_buffer_event *event = NULL;
				3449	unsigned long flags;
				3450	int locked;
				3451
				3452	again:
				3453	/* might be called in atomic */
				3454	preempt_disable();
				3455
				3456	if (!cpumask_test_cpu(cpu, buffer->cpumask))
				3457	goto out;
				3458
				3459	cpu_buffer = buffer->buffers[cpu];
				3460	locked = read_buffer_lock(cpu_buffer, &flags);
				3461
				3462	event = rb_buffer_peek(cpu_buffer, ts, lost_events);
				3463	if (event) {
				3464	cpu_buffer->lost_events = 0;
				3465	rb_advance_reader(cpu_buffer);
				3466	}
				3467
				3468	read_buffer_unlock(cpu_buffer, flags, locked);
				3469
				3470
				3471	out:
				3472	preempt_enable();
				3473
				3474	if (event && event->type_len == RINGBUF_TYPE_PADDING)
				3475	goto again;
				3476
				3477	return event;
				3478	}
				3479	EXPORT_SYMBOL_GPL(ring_buffer_consume);
				3480
				3481	/**
				3482	* ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
				3483	* @buffer: The ring buffer to read from
				3484	* @cpu: The cpu buffer to iterate over
				3485	*
				3486	* This performs the initial preparations necessary to iterate
				3487	* through the buffer. Memory is allocated, buffer recording
				3488	* is disabled, and the iterator pointer is returned to the caller.
				3489	*
				3490	* Disabling buffer recordng prevents the reading from being
				3491	* corrupted. This is not a consuming read, so a producer is not
				3492	* expected.
				3493	*
				3494	* After a sequence of ring_buffer_read_prepare calls, the user is
				3495	* expected to make at least one call to ring_buffer_prepare_sync.
				3496	* Afterwards, ring_buffer_read_start is invoked to get things going
				3497	* for real.
				3498	*
				3499	* This overall must be paired with ring_buffer_finish.
				3500	*/
				3501	struct ring_buffer_iter *
				3502	ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
				3503	{
				3504	struct ring_buffer_per_cpu *cpu_buffer;
				3505	struct ring_buffer_iter *iter;
				3506
				3507	if (!cpumask_test_cpu(cpu, buffer->cpumask))
				3508	return NULL;
				3509
				3510	iter = kmalloc(sizeof(*iter), GFP_KERNEL);
				3511	if (!iter)
				3512	return NULL;
				3513
				3514	cpu_buffer = buffer->buffers[cpu];
				3515
				3516	iter->cpu_buffer = cpu_buffer;
				3517
				3518	atomic_inc(&cpu_buffer->record_disabled);
				3519
				3520	return iter;
				3521	}
				3522	EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
				3523
				3524	/**
				3525	* ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
				3526	*
				3527	* All previously invoked ring_buffer_read_prepare calls to prepare
				3528	* iterators will be synchronized. Afterwards, read_buffer_read_start
				3529	* calls on those iterators are allowed.
				3530	*/
				3531	void
				3532	ring_buffer_read_prepare_sync(void)
				3533	{
				3534	synchronize_sched();
				3535	}
				3536	EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
				3537
				3538	/**
				3539	* ring_buffer_read_start - start a non consuming read of the buffer
				3540	* @iter: The iterator returned by ring_buffer_read_prepare
				3541	*
				3542	* This finalizes the startup of an iteration through the buffer.
				3543	* The iterator comes from a call to ring_buffer_read_prepare and
				3544	* an intervening ring_buffer_read_prepare_sync must have been
				3545	* performed.
				3546	*
				3547	* Must be paired with ring_buffer_finish.
				3548	*/
				3549	void
				3550	ring_buffer_read_start(struct ring_buffer_iter *iter)
				3551	{
				3552	struct ring_buffer_per_cpu *cpu_buffer;
				3553	unsigned long flags;
				3554	int locked;
				3555
				3556	if (!iter)
				3557	return;
				3558
				3559	cpu_buffer = iter->cpu_buffer;
				3560
				3561	locked = read_buffer_lock(cpu_buffer, &flags);
				3562	arch_spin_lock(&cpu_buffer->lock);
				3563	rb_iter_reset(iter);
				3564	arch_spin_unlock(&cpu_buffer->lock);
				3565	read_buffer_unlock(cpu_buffer, flags, locked);
				3566	}
				3567	EXPORT_SYMBOL_GPL(ring_buffer_read_start);
				3568
				3569	/**
				3570	* ring_buffer_finish - finish reading the iterator of the buffer
				3571	* @iter: The iterator retrieved by ring_buffer_start
				3572	*
				3573	* This re-enables the recording to the buffer, and frees the
				3574	* iterator.
				3575	*/
				3576	void
				3577	ring_buffer_read_finish(struct ring_buffer_iter *iter)
				3578	{
				3579	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
				3580
				3581	atomic_dec(&cpu_buffer->record_disabled);
				3582	kfree(iter);
				3583	}
				3584	EXPORT_SYMBOL_GPL(ring_buffer_read_finish);
				3585
				3586	/**
				3587	* ring_buffer_read - read the next item in the ring buffer by the iterator
				3588	* @iter: The ring buffer iterator
				3589	* @ts: The time stamp of the event read.
				3590	*
				3591	* This reads the next event in the ring buffer and increments the iterator.
				3592	*/
				3593	struct ring_buffer_event *
				3594	ring_buffer_read(struct ring_buffer_iter iter, u64 ts)
				3595	{
				3596	struct ring_buffer_event *event;
				3597	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
				3598	unsigned long flags;
				3599	int locked;
				3600
				3601	locked = read_buffer_lock(cpu_buffer, &flags);
				3602	again:
				3603	event = rb_iter_peek(iter, ts);
				3604	if (!event)
				3605	goto out;
				3606
				3607	if (event->type_len == RINGBUF_TYPE_PADDING)
				3608	goto again;
				3609
				3610	rb_advance_iter(iter);
				3611	out:
				3612	read_buffer_unlock(cpu_buffer, flags, locked);
				3613
				3614	return event;
				3615	}
				3616	EXPORT_SYMBOL_GPL(ring_buffer_read);
				3617
				3618	/**
				3619	* ring_buffer_size - return the size of the ring buffer (in bytes)
				3620	* @buffer: The ring buffer.
				3621	*/
				3622	unsigned long ring_buffer_size(struct ring_buffer *buffer)
				3623	{
				3624	return BUF_PAGE_SIZE * buffer->pages;
				3625	}
				3626	EXPORT_SYMBOL_GPL(ring_buffer_size);
				3627
				3628	static void
				3629	rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
				3630	{
				3631	rb_head_page_deactivate(cpu_buffer);
				3632
				3633	cpu_buffer->head_page
				3634	= list_entry(cpu_buffer->pages, struct buffer_page, list);
				3635	local_set(&cpu_buffer->head_page->write, 0);
				3636	local_set(&cpu_buffer->head_page->entries, 0);
				3637	local_set(&cpu_buffer->head_page->page->commit, 0);
				3638
				3639	cpu_buffer->head_page->read = 0;
				3640
				3641	cpu_buffer->tail_page = cpu_buffer->head_page;
				3642	cpu_buffer->commit_page = cpu_buffer->head_page;
				3643
				3644	INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
				3645	local_set(&cpu_buffer->reader_page->write, 0);
				3646	local_set(&cpu_buffer->reader_page->entries, 0);
				3647	local_set(&cpu_buffer->reader_page->page->commit, 0);
				3648	cpu_buffer->reader_page->read = 0;
				3649
				3650	local_set(&cpu_buffer->commit_overrun, 0);
				3651	local_set(&cpu_buffer->entries_bytes, 0);
				3652	local_set(&cpu_buffer->overrun, 0);
				3653	local_set(&cpu_buffer->entries, 0);
				3654	local_set(&cpu_buffer->committing, 0);
				3655	local_set(&cpu_buffer->commits, 0);
				3656	cpu_buffer->read = 0;
				3657	cpu_buffer->read_bytes = 0;
				3658
				3659	cpu_buffer->write_stamp = 0;
				3660	cpu_buffer->read_stamp = 0;
				3661
				3662	cpu_buffer->lost_events = 0;
				3663	cpu_buffer->last_overrun = 0;
				3664
				3665	rb_head_page_activate(cpu_buffer);
				3666	}
				3667
				3668	/**
				3669	* ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
				3670	* @buffer: The ring buffer to reset a per cpu buffer of
				3671	* @cpu: The CPU buffer to be reset
				3672	*/
				3673	void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
				3674	{
				3675	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
				3676	unsigned long flags;
				3677	int locked;
				3678
				3679	if (!cpumask_test_cpu(cpu, buffer->cpumask))
				3680	return;
				3681
				3682	atomic_inc(&cpu_buffer->record_disabled);
				3683
				3684	locked = read_buffer_lock(cpu_buffer, &flags);
				3685
				3686	if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
				3687	goto out;
				3688
				3689	arch_spin_lock(&cpu_buffer->lock);
				3690
				3691	rb_reset_cpu(cpu_buffer);
				3692
				3693	arch_spin_unlock(&cpu_buffer->lock);
				3694
				3695	out:
				3696	read_buffer_unlock(cpu_buffer, flags, locked);
				3697
				3698	atomic_dec(&cpu_buffer->record_disabled);
				3699	}
				3700	EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
				3701
				3702	/**
				3703	* ring_buffer_reset - reset a ring buffer
				3704	* @buffer: The ring buffer to reset all cpu buffers
				3705	*/
				3706	void ring_buffer_reset(struct ring_buffer *buffer)
				3707	{
				3708	int cpu;
				3709
				3710	for_each_buffer_cpu(buffer, cpu)
				3711	ring_buffer_reset_cpu(buffer, cpu);
				3712	}
				3713	EXPORT_SYMBOL_GPL(ring_buffer_reset);
				3714
				3715	/**
				3716	* rind_buffer_empty - is the ring buffer empty?
				3717	* @buffer: The ring buffer to test
				3718	*/
				3719	int ring_buffer_empty(struct ring_buffer *buffer)
				3720	{
				3721	struct ring_buffer_per_cpu *cpu_buffer;
				3722	unsigned long flags;
				3723	int locked;
				3724	int cpu;
				3725	int ret;
				3726
				3727	/* yes this is racy, but if you don't like the race, lock the buffer */
				3728	for_each_buffer_cpu(buffer, cpu) {
				3729	cpu_buffer = buffer->buffers[cpu];
				3730	locked = read_buffer_lock(cpu_buffer, &flags);
				3731	ret = rb_per_cpu_empty(cpu_buffer);
				3732	read_buffer_unlock(cpu_buffer, flags, locked);
				3733
				3734	if (!ret)
				3735	return 0;
				3736	}
				3737
				3738	return 1;
				3739	}
				3740	EXPORT_SYMBOL_GPL(ring_buffer_empty);
				3741
				3742	/**
				3743	* ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty?
				3744	* @buffer: The ring buffer
				3745	* @cpu: The CPU buffer to test
				3746	*/
				3747	int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
				3748	{
				3749	struct ring_buffer_per_cpu *cpu_buffer;
				3750	unsigned long flags;
				3751	int locked;
				3752	int ret;
				3753
				3754	if (!cpumask_test_cpu(cpu, buffer->cpumask))
				3755	return 1;
				3756
				3757	cpu_buffer = buffer->buffers[cpu];
				3758	locked = read_buffer_lock(cpu_buffer, &flags);
				3759	ret = rb_per_cpu_empty(cpu_buffer);
				3760	read_buffer_unlock(cpu_buffer, flags, locked);
				3761
				3762	return ret;
				3763	}
				3764	EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu);
				3765
				3766	#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
				3767	/**
				3768	* ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
				3769	* @buffer_a: One buffer to swap with
				3770	* @buffer_b: The other buffer to swap with
				3771	*
				3772	* This function is useful for tracers that want to take a "snapshot"
				3773	* of a CPU buffer and has another back up buffer lying around.
				3774	* it is expected that the tracer handles the cpu buffer not being
				3775	* used at the moment.
				3776	*/
				3777	int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
				3778	struct ring_buffer *buffer_b, int cpu)
				3779	{
				3780	struct ring_buffer_per_cpu *cpu_buffer_a;
				3781	struct ring_buffer_per_cpu *cpu_buffer_b;
				3782	int ret = -EINVAL;
				3783
				3784	if (!cpumask_test_cpu(cpu, buffer_a->cpumask) \|\|
				3785	!cpumask_test_cpu(cpu, buffer_b->cpumask))
				3786	goto out;
				3787
				3788	/* At least make sure the two buffers are somewhat the same */
				3789	if (buffer_a->pages != buffer_b->pages)
				3790	goto out;
				3791
				3792	ret = -EAGAIN;
				3793
				3794	if (ring_buffer_flags != RB_BUFFERS_ON)
				3795	goto out;
				3796
				3797	if (atomic_read(&buffer_a->record_disabled))
				3798	goto out;
				3799
				3800	if (atomic_read(&buffer_b->record_disabled))
				3801	goto out;
				3802
				3803	cpu_buffer_a = buffer_a->buffers[cpu];
				3804	cpu_buffer_b = buffer_b->buffers[cpu];
				3805
				3806	if (atomic_read(&cpu_buffer_a->record_disabled))
				3807	goto out;
				3808
				3809	if (atomic_read(&cpu_buffer_b->record_disabled))
				3810	goto out;
				3811
				3812	/*
				3813	* We can't do a synchronize_sched here because this
				3814	* function can be called in atomic context.
				3815	* Normally this will be called from the same CPU as cpu.
				3816	* If not it's up to the caller to protect this.
				3817	*/
				3818	atomic_inc(&cpu_buffer_a->record_disabled);
				3819	atomic_inc(&cpu_buffer_b->record_disabled);
				3820
				3821	ret = -EBUSY;
				3822	if (local_read(&cpu_buffer_a->committing))
				3823	goto out_dec;
				3824	if (local_read(&cpu_buffer_b->committing))
				3825	goto out_dec;
				3826
				3827	buffer_a->buffers[cpu] = cpu_buffer_b;
				3828	buffer_b->buffers[cpu] = cpu_buffer_a;
				3829
				3830	cpu_buffer_b->buffer = buffer_a;
				3831	cpu_buffer_a->buffer = buffer_b;
				3832
				3833	ret = 0;
				3834
				3835	out_dec:
				3836	atomic_dec(&cpu_buffer_a->record_disabled);
				3837	atomic_dec(&cpu_buffer_b->record_disabled);
				3838	out:
				3839	return ret;
				3840	}
				3841	EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
				3842	#endif /* CONFIG_RING_BUFFER_ALLOW_SWAP */
				3843
				3844	/**
				3845	* ring_buffer_alloc_read_page - allocate a page to read from buffer
				3846	* @buffer: the buffer to allocate for.
				3847	*
				3848	* This function is used in conjunction with ring_buffer_read_page.
				3849	* When reading a full page from the ring buffer, these functions
				3850	* can be used to speed up the process. The calling function should
				3851	* allocate a few pages first with this function. Then when it
				3852	* needs to get pages from the ring buffer, it passes the result
				3853	* of this function into ring_buffer_read_page, which will swap
				3854	* the page that was allocated, with the read page of the buffer.
				3855	*
				3856	* Returns:
				3857	* The page allocated, or NULL on error.
				3858	*/
				3859	void ring_buffer_alloc_read_page(struct ring_buffer buffer, int cpu)
				3860	{
				3861	struct buffer_data_page *bpage;
				3862	struct page *page;
				3863
				3864	page = alloc_pages_node(cpu_to_node(cpu),
				3865	GFP_KERNEL \| __GFP_NORETRY, 0);
				3866	if (!page)
				3867	return NULL;
				3868
				3869	bpage = page_address(page);
				3870
				3871	rb_init_page(bpage);
				3872
				3873	return bpage;
				3874	}
				3875	EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
				3876
				/**
				 * ring_buffer_free_read_page - free an allocated read page
				 * @buffer: the buffer the page was allocated for (not used here)
				 * @data: the page to free
				 *
				 * Free a page allocated from ring_buffer_alloc_read_page.
				 */
				void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
				{
					/* free_page() takes the page's address as an unsigned long. */
					free_page((unsigned long)data);
				}
				EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
				3889
				3890	/**
				3891	* ring_buffer_read_page - extract a page from the ring buffer
				3892	* @buffer: buffer to extract from
				3893	* @data_page: the page to use allocated from ring_buffer_alloc_read_page
				3894	* @len: amount to extract
				3895	* @cpu: the cpu of the buffer to extract
				3896	* @full: should the extraction only happen when the page is full.
				3897	*
				3898	* This function will pull out a page from the ring buffer and consume it.
				3899	* @data_page must be the address of the variable that was returned
				3900	* from ring_buffer_alloc_read_page. This is because the page might be used
				3901	* to swap with a page in the ring buffer.
				3902	*
				3903	* for example:
				3904	* rpage = ring_buffer_alloc_read_page(buffer);
				3905	* if (!rpage)
				3906	* return error;
				3907	* ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);
				3908	* if (ret >= 0)
				3909	* process_page(rpage, ret);
				3910	*
				3911	* When @full is set, the function will not return true unless
				3912	* the writer is off the reader page.
				3913	*
				3914	* Note: it is up to the calling functions to handle sleeps and wakeups.
				3915	* The ring buffer can be used anywhere in the kernel and can not
				3916	* blindly call wake_up. The layer that uses the ring buffer must be
				3917	* responsible for that.
				3918	*
				3919	* Returns:
				3920	* >=0 if data has been transferred, returns the offset of consumed data.
				3921	* <0 if no data has been transferred.
				3922	*/
				3923	int ring_buffer_read_page(struct ring_buffer *buffer,
				3924	void **data_page, size_t len, int cpu, int full)
				3925	{
				3926	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
				3927	struct ring_buffer_event *event;
				3928	struct buffer_data_page *bpage;
				3929	struct buffer_page *reader;
				3930	unsigned long missed_events;
				3931	unsigned long flags;
				3932	unsigned int commit;
				3933	unsigned int read;
				3934	u64 save_timestamp;
				3935	int locked;
				3936	int ret = -1;
				3937
				3938	if (!cpumask_test_cpu(cpu, buffer->cpumask))
				3939	goto out;
				3940
				3941	/*
				3942	* If len is not big enough to hold the page header, then
				3943	* we can not copy anything.
				3944	*/
				3945	if (len <= BUF_PAGE_HDR_SIZE)
				3946	goto out;
				3947
				3948	len -= BUF_PAGE_HDR_SIZE;
				3949
				3950	if (!data_page)
				3951	goto out;
				3952
				3953	bpage = *data_page;
				3954	if (!bpage)
				3955	goto out;
				3956
				3957	locked = read_buffer_lock(cpu_buffer, &flags);
				3958
				3959	reader = rb_get_reader_page(cpu_buffer);
				3960	if (!reader)
				3961	goto out_unlock;
				3962
				3963	event = rb_reader_event(cpu_buffer);
				3964
				3965	read = reader->read;
				3966	commit = rb_page_commit(reader);
				3967
				3968	/* Check if any events were dropped */
				3969	missed_events = cpu_buffer->lost_events;
				3970
				3971	/*
				3972	* If this page has been partially read or
				3973	* if len is not big enough to read the rest of the page or
				3974	* a writer is still on the page, then
				3975	* we must copy the data from the page to the buffer.
				3976	* Otherwise, we can simply swap the page with the one passed in.
				3977	*/
				3978	if (read \|\| (len < (commit - read)) \|\|
				3979	cpu_buffer->reader_page == cpu_buffer->commit_page) {
				3980	struct buffer_data_page *rpage = cpu_buffer->reader_page->page;
				3981	unsigned int rpos = read;
				3982	unsigned int pos = 0;
				3983	unsigned int size;
				3984
				3985	if (full)
				3986	goto out_unlock;
				3987
				3988	if (len > (commit - read))
				3989	len = (commit - read);
				3990
				3991	/* Always keep the time extend and data together */
				3992	size = rb_event_ts_length(event);
				3993
				3994	if (len < size)
				3995	goto out_unlock;
				3996
				3997	/* save the current timestamp, since the user will need it */
				3998	save_timestamp = cpu_buffer->read_stamp;
				3999
				4000	/* Need to copy one event at a time */
				4001	do {
				4002	/* We need the size of one event, because
				4003	* rb_advance_reader only advances by one event,
				4004	* whereas rb_event_ts_length may include the size of
				4005	* one or two events.
				4006	* We have already ensured there's enough space if this
				4007	* is a time extend. */
				4008	size = rb_event_length(event);
				4009	memcpy(bpage->data + pos, rpage->data + rpos, size);
				4010
				4011	len -= size;
				4012
				4013	rb_advance_reader(cpu_buffer);
				4014	rpos = reader->read;
				4015	pos += size;
				4016
				4017	if (rpos >= commit)
				4018	break;
				4019
				4020	event = rb_reader_event(cpu_buffer);
				4021	/* Always keep the time extend and data together */
				4022	size = rb_event_ts_length(event);
				4023	} while (len >= size);
				4024
				4025	/* update bpage */
				4026	local_set(&bpage->commit, pos);
				4027	bpage->time_stamp = save_timestamp;
				4028
				4029	/* we copied everything to the beginning */
				4030	read = 0;
				4031	} else {
				4032	/* update the entry counter */
				4033	cpu_buffer->read += rb_page_entries(reader);
				4034	cpu_buffer->read_bytes += BUF_PAGE_SIZE;
				4035
				4036	/* swap the pages */
				4037	rb_init_page(bpage);
				4038	bpage = reader->page;
				4039	reader->page = *data_page;
				4040	local_set(&reader->write, 0);
				4041	local_set(&reader->entries, 0);
				4042	reader->read = 0;
				4043	*data_page = bpage;
				4044
				4045	/*
				4046	* Use the real_end for the data size,
				4047	* This gives us a chance to store the lost events
				4048	* on the page.
				4049	*/
				4050	if (reader->real_end)
				4051	local_set(&bpage->commit, reader->real_end);
				4052	}
				4053	ret = read;
				4054
				4055	cpu_buffer->lost_events = 0;
				4056
				4057	commit = local_read(&bpage->commit);
				4058	/*
				4059	* Set a flag in the commit field if we lost events
				4060	*/
				4061	if (missed_events) {
				4062	/* If there is room at the end of the page to save the
				4063	* missed events, then record it there.
				4064	*/
				4065	if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) {
				4066	memcpy(&bpage->data[commit], &missed_events,
				4067	sizeof(missed_events));
				4068	local_add(RB_MISSED_STORED, &bpage->commit);
				4069	commit += sizeof(missed_events);
				4070	}
				4071	local_add(RB_MISSED_EVENTS, &bpage->commit);
				4072	}
				4073
				4074	/*
				4075	* This page may be off to user land. Zero it out here.
				4076	*/
				4077	if (commit < BUF_PAGE_SIZE)
				4078	memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit);
				4079
				4080	out_unlock:
				4081	read_buffer_unlock(cpu_buffer, flags, locked);
				4082
				4083	out:
				4084	return ret;
				4085	}
				4086	EXPORT_SYMBOL_GPL(ring_buffer_read_page);
				4087
				#ifdef CONFIG_HOTPLUG_CPU
				/*
				 * CPU hotplug notifier callback for a ring buffer.
				 *
				 * @self:   the notifier block embedded in the ring buffer (cpu_notify member)
				 * @action: the hotplug event being reported
				 * @hcpu:   the cpu number, passed as a pointer-sized value
				 *
				 * Only the CPU_UP_PREPARE events (normal and _FROZEN) do any work: a per
				 * cpu buffer is allocated for a cpu that is not yet in the buffer's cpumask.
				 * Every other event, including CPU_DOWN_PREPARE, is deliberately ignored;
				 * if we were to free the buffer on the way down, then the user would lose
				 * any trace that was in the buffer.
				 *
				 * Always returns NOTIFY_OK, even when the allocation fails.
				 */
				static int rb_cpu_notify(struct notifier_block *self,
							 unsigned long action, void *hcpu)
				{
					struct ring_buffer *buffer =
						container_of(self, struct ring_buffer, cpu_notify);
					long cpu = (long)hcpu;

					/* Nothing to do unless a cpu is about to come up. */
					if (action != CPU_UP_PREPARE && action != CPU_UP_PREPARE_FROZEN)
						return NOTIFY_OK;

					/* This cpu already has a buffer. */
					if (cpumask_test_cpu(cpu, buffer->cpumask))
						return NOTIFY_OK;

					buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, cpu);
					if (buffer->buffers[cpu]) {
						/*
						 * Write barrier between storing the buffer pointer and
						 * setting the cpu's bit in the cpumask.
						 */
						smp_wmb();
						cpumask_set_cpu(cpu, buffer->cpumask);
					} else {
						WARN(1, "failed to allocate ring buffer on CPU %ld\n",
						     cpu);
					}

					return NOTIFY_OK;
				}
				#endif