Blame - src/kernel/linux/v4.14/fs/jbd2/transaction.c - T103

blob: 3311b1e684defbbf9bdc2f7c74b83f5a006171ea [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame^]	1	/*
				2	* linux/fs/jbd2/transaction.c
				3	*
				4	* Written by Stephen C. Tweedie <sct@redhat.com>, 1998
				5	*
				6	* Copyright 1998 Red Hat corp --- All Rights Reserved
				7	*
				8	* This file is part of the Linux kernel and is made available under
				9	* the terms of the GNU General Public License, version 2, or at your
				10	* option, any later version, incorporated herein by reference.
				11	*
				12	* Generic filesystem transaction handling code; part of the ext2fs
				13	* journaling system.
				14	*
				15	* This file manages transactions (compound commits managed by the
				16	* journaling code) and handles (individual atomic operations by the
				17	* filesystem).
				18	*/
				19
				20	#include <linux/time.h>
				21	#include <linux/fs.h>
				22	#include <linux/jbd2.h>
				23	#include <linux/errno.h>
				24	#include <linux/slab.h>
				25	#include <linux/timer.h>
				26	#include <linux/mm.h>
				27	#include <linux/highmem.h>
				28	#include <linux/hrtimer.h>
				29	#include <linux/backing-dev.h>
				30	#include <linux/bug.h>
				31	#include <linux/module.h>
				32	#include <linux/sched/mm.h>
				33
				34	#include <trace/events/jbd2.h>
				35
				36	static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
				37	static void __jbd2_journal_unfile_buffer(struct journal_head *jh);
				38
				39	static struct kmem_cache *transaction_cache;
				40	int __init jbd2_journal_init_transaction_cache(void)
				41	{
				42	J_ASSERT(!transaction_cache);
				43	transaction_cache = kmem_cache_create("jbd2_transaction_s",
				44	sizeof(transaction_t),
				45	0,
				46	SLAB_HWCACHE_ALIGN\|SLAB_TEMPORARY,
				47	NULL);
				48	if (transaction_cache)
				49	return 0;
				50	return -ENOMEM;
				51	}
				52
				53	void jbd2_journal_destroy_transaction_cache(void)
				54	{
				55	if (transaction_cache) {
				56	kmem_cache_destroy(transaction_cache);
				57	transaction_cache = NULL;
				58	}
				59	}
				60
				61	void jbd2_journal_free_transaction(transaction_t *transaction)
				62	{
				63	if (unlikely(ZERO_OR_NULL_PTR(transaction)))
				64	return;
				65	kmem_cache_free(transaction_cache, transaction);
				66	}
				67
				68	/*
				69	* jbd2_get_transaction: obtain a new transaction_t object.
				70	*
				71	* Simply allocate and initialise a new transaction. Create it in
				72	* RUNNING state and add it to the current journal (which should not
				73	* have an existing running transaction: we only make a new transaction
				74	* once we have started to commit the old one).
				75	*
				76	* Preconditions:
				77	* The journal MUST be locked. We don't perform atomic mallocs on the
				78	* new transaction and we can't block without protecting against other
				79	* processes trying to touch the journal while it is in transition.
				80	*
				81	*/
				82
				83	static transaction_t *
				84	jbd2_get_transaction(journal_t journal, transaction_t transaction)
				85	{
				86	transaction->t_journal = journal;
				87	transaction->t_state = T_RUNNING;
				88	transaction->t_start_time = ktime_get();
				89	transaction->t_tid = journal->j_transaction_sequence++;
				90	transaction->t_expires = jiffies + journal->j_commit_interval;
				91	spin_lock_init(&transaction->t_handle_lock);
				92	atomic_set(&transaction->t_updates, 0);
				93	atomic_set(&transaction->t_outstanding_credits,
				94	atomic_read(&journal->j_reserved_credits));
				95	atomic_set(&transaction->t_handle_count, 0);
				96	INIT_LIST_HEAD(&transaction->t_inode_list);
				97	INIT_LIST_HEAD(&transaction->t_private_list);
				98
				99	/* Set up the commit timer for the new transaction. */
				100	journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires);
				101	add_timer(&journal->j_commit_timer);
				102
				103	J_ASSERT(journal->j_running_transaction == NULL);
				104	journal->j_running_transaction = transaction;
				105	transaction->t_max_wait = 0;
				106	transaction->t_start = jiffies;
				107	transaction->t_requested = 0;
				108
				109	return transaction;
				110	}
				111
				112	/*
				113	* Handle management.
				114	*
				115	* A handle_t is an object which represents a single atomic update to a
				116	* filesystem, and which tracks all of the modifications which form part
				117	* of that one update.
				118	*/
				119
				120	/*
				121	* Update transaction's maximum wait time, if debugging is enabled.
				122	*
				123	* In order for t_max_wait to be reliable, it must be protected by a
				124	* lock. But doing so will mean that start_this_handle() can not be
				125	* run in parallel on SMP systems, which limits our scalability. So
				126	* unless debugging is enabled, we no longer update t_max_wait, which
				127	* means that maximum wait time reported by the jbd2_run_stats
				128	* tracepoint will always be zero.
				129	*/
				130	static inline void update_t_max_wait(transaction_t *transaction,
				131	unsigned long ts)
				132	{
				133	#ifdef CONFIG_JBD2_DEBUG
				134	if (jbd2_journal_enable_debug &&
				135	time_after(transaction->t_start, ts)) {
				136	ts = jbd2_time_diff(ts, transaction->t_start);
				137	spin_lock(&transaction->t_handle_lock);
				138	if (ts > transaction->t_max_wait)
				139	transaction->t_max_wait = ts;
				140	spin_unlock(&transaction->t_handle_lock);
				141	}
				142	#endif
				143	}
				144
				145	/*
				146	* Wait until running transaction passes T_LOCKED state. Also starts the commit
				147	* if needed. The function expects running transaction to exist and releases
				148	* j_state_lock.
				149	*/
				150	static void wait_transaction_locked(journal_t *journal)
				151	__releases(journal->j_state_lock)
				152	{
				153	DEFINE_WAIT(wait);
				154	int need_to_start;
				155	tid_t tid = journal->j_running_transaction->t_tid;
				156
				157	prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
				158	TASK_UNINTERRUPTIBLE);
				159	need_to_start = !tid_geq(journal->j_commit_request, tid);
				160	read_unlock(&journal->j_state_lock);
				161	if (need_to_start)
				162	jbd2_log_start_commit(journal, tid);
				163	jbd2_might_wait_for_commit(journal);
				164	schedule();
				165	finish_wait(&journal->j_wait_transaction_locked, &wait);
				166	}
				167
				168	static void sub_reserved_credits(journal_t *journal, int blocks)
				169	{
				170	atomic_sub(blocks, &journal->j_reserved_credits);
				171	wake_up(&journal->j_wait_reserved);
				172	}
				173
				174	/*
				175	* Wait until we can add credits for handle to the running transaction. Called
				176	* with j_state_lock held for reading. Returns 0 if handle joined the running
				177	* transaction. Returns 1 if we had to wait, j_state_lock is dropped, and
				178	* caller must retry.
				179	*/
				180	static int add_transaction_credits(journal_t *journal, int blocks,
				181	int rsv_blocks)
				182	{
				183	transaction_t *t = journal->j_running_transaction;
				184	int needed;
				185	int total = blocks + rsv_blocks;
				186
				187	/*
				188	* If the current transaction is locked down for commit, wait
				189	* for the lock to be released.
				190	*/
				191	if (t->t_state == T_LOCKED) {
				192	wait_transaction_locked(journal);
				193	return 1;
				194	}
				195
				196	/*
				197	* If there is not enough space left in the log to write all
				198	* potential buffers requested by this operation, we need to
				199	* stall pending a log checkpoint to free some more log space.
				200	*/
				201	needed = atomic_add_return(total, &t->t_outstanding_credits);
				202	if (needed > journal->j_max_transaction_buffers) {
				203	/*
				204	* If the current transaction is already too large,
				205	* then start to commit it: we can then go back and
				206	* attach this handle to a new transaction.
				207	*/
				208	atomic_sub(total, &t->t_outstanding_credits);
				209
				210	/*
				211	* Is the number of reserved credits in the current transaction too
				212	* big to fit this handle? Wait until reserved credits are freed.
				213	*/
				214	if (atomic_read(&journal->j_reserved_credits) + total >
				215	journal->j_max_transaction_buffers) {
				216	read_unlock(&journal->j_state_lock);
				217	jbd2_might_wait_for_commit(journal);
				218	wait_event(journal->j_wait_reserved,
				219	atomic_read(&journal->j_reserved_credits) + total <=
				220	journal->j_max_transaction_buffers);
				221	return 1;
				222	}
				223
				224	wait_transaction_locked(journal);
				225	return 1;
				226	}
				227
				228	/*
				229	* The commit code assumes that it can get enough log space
				230	* without forcing a checkpoint. This is critical for
				231	* correctness: a checkpoint of a buffer which is also
				232	* associated with a committing transaction creates a deadlock,
				233	* so commit simply cannot force through checkpoints.
				234	*
				235	* We must therefore ensure the necessary space in the journal
				236	* before starting to dirty potentially checkpointed buffers
				237	* in the new transaction.
				238	*/
				239	if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) {
				240	atomic_sub(total, &t->t_outstanding_credits);
				241	read_unlock(&journal->j_state_lock);
				242	jbd2_might_wait_for_commit(journal);
				243	write_lock(&journal->j_state_lock);
				244	if (jbd2_log_space_left(journal) < jbd2_space_needed(journal))
				245	__jbd2_log_wait_for_space(journal);
				246	write_unlock(&journal->j_state_lock);
				247	return 1;
				248	}
				249
				250	/* No reservation? We are done... */
				251	if (!rsv_blocks)
				252	return 0;
				253
				254	needed = atomic_add_return(rsv_blocks, &journal->j_reserved_credits);
				255	/* We allow at most half of a transaction to be reserved */
				256	if (needed > journal->j_max_transaction_buffers / 2) {
				257	sub_reserved_credits(journal, rsv_blocks);
				258	atomic_sub(total, &t->t_outstanding_credits);
				259	read_unlock(&journal->j_state_lock);
				260	jbd2_might_wait_for_commit(journal);
				261	wait_event(journal->j_wait_reserved,
				262	atomic_read(&journal->j_reserved_credits) + rsv_blocks
				263	<= journal->j_max_transaction_buffers / 2);
				264	return 1;
				265	}
				266	return 0;
				267	}
				268
				269	/*
				270	* start_this_handle: Given a handle, deal with any locking or stalling
				271	* needed to make sure that there is enough journal space for the handle
				272	* to begin. Attach the handle to a transaction and set up the
				273	* transaction's buffer credits.
				274	*/
				275
				276	static int start_this_handle(journal_t journal, handle_t handle,
				277	gfp_t gfp_mask)
				278	{
				279	transaction_t transaction, new_transaction = NULL;
				280	int blocks = handle->h_buffer_credits;
				281	int rsv_blocks = 0;
				282	unsigned long ts = jiffies;
				283
				284	if (handle->h_rsv_handle)
				285	rsv_blocks = handle->h_rsv_handle->h_buffer_credits;
				286
				287	/*
				288	* Limit the number of reserved credits to 1/2 of maximum transaction
				289	* size and limit the number of total credits to not exceed maximum
				290	* transaction size per operation.
				291	*/
				292	if ((rsv_blocks > journal->j_max_transaction_buffers / 2) \|\|
				293	(rsv_blocks + blocks > journal->j_max_transaction_buffers)) {
				294	printk(KERN_ERR "JBD2: %s wants too many credits "
				295	"credits:%d rsv_credits:%d max:%d\n",
				296	current->comm, blocks, rsv_blocks,
				297	journal->j_max_transaction_buffers);
				298	WARN_ON(1);
				299	return -ENOSPC;
				300	}
				301
				302	alloc_transaction:
				303	if (!journal->j_running_transaction) {
				304	/*
				305	* If __GFP_FS is not present, then we may be being called from
				306	* inside the fs writeback layer, so we MUST NOT fail.
				307	*/
				308	if ((gfp_mask & __GFP_FS) == 0)
				309	gfp_mask \|= __GFP_NOFAIL;
				310	new_transaction = kmem_cache_zalloc(transaction_cache,
				311	gfp_mask);
				312	if (!new_transaction)
				313	return -ENOMEM;
				314	}
				315
				316	jbd_debug(3, "New handle %p going live.\n", handle);
				317
				318	/*
				319	* We need to hold j_state_lock until t_updates has been incremented,
				320	* for proper journal barrier handling
				321	*/
				322	repeat:
				323	read_lock(&journal->j_state_lock);
				324	BUG_ON(journal->j_flags & JBD2_UNMOUNT);
				325	if (is_journal_aborted(journal) \|\|
				326	(journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
				327	read_unlock(&journal->j_state_lock);
				328	jbd2_journal_free_transaction(new_transaction);
				329	return -EROFS;
				330	}
				331
				332	/*
				333	* Wait on the journal's transaction barrier if necessary. Specifically
				334	* we allow reserved handles to proceed because otherwise commit could
				335	* deadlock on page writeback not being able to complete.
				336	*/
				337	if (!handle->h_reserved && journal->j_barrier_count) {
				338	read_unlock(&journal->j_state_lock);
				339	wait_event(journal->j_wait_transaction_locked,
				340	journal->j_barrier_count == 0);
				341	goto repeat;
				342	}
				343
				344	if (!journal->j_running_transaction) {
				345	read_unlock(&journal->j_state_lock);
				346	if (!new_transaction)
				347	goto alloc_transaction;
				348	write_lock(&journal->j_state_lock);
				349	if (!journal->j_running_transaction &&
				350	(handle->h_reserved \|\| !journal->j_barrier_count)) {
				351	jbd2_get_transaction(journal, new_transaction);
				352	new_transaction = NULL;
				353	}
				354	write_unlock(&journal->j_state_lock);
				355	goto repeat;
				356	}
				357
				358	transaction = journal->j_running_transaction;
				359
				360	if (!handle->h_reserved) {
				361	/* We may have dropped j_state_lock - restart in that case */
				362	if (add_transaction_credits(journal, blocks, rsv_blocks))
				363	goto repeat;
				364	} else {
				365	/*
				366	* We have handle reserved so we are allowed to join T_LOCKED
				367	* transaction and we don't have to check for transaction size
				368	* and journal space.
				369	*/
				370	sub_reserved_credits(journal, blocks);
				371	handle->h_reserved = 0;
				372	}
				373
				374	/* OK, account for the buffers that this operation expects to
				375	* use and add the handle to the running transaction.
				376	*/
				377	update_t_max_wait(transaction, ts);
				378	handle->h_transaction = transaction;
				379	handle->h_requested_credits = blocks;
				380	handle->h_start_jiffies = jiffies;
				381	atomic_inc(&transaction->t_updates);
				382	atomic_inc(&transaction->t_handle_count);
				383	jbd_debug(4, "Handle %p given %d credits (total %d, free %lu)\n",
				384	handle, blocks,
				385	atomic_read(&transaction->t_outstanding_credits),
				386	jbd2_log_space_left(journal));
				387	read_unlock(&journal->j_state_lock);
				388	current->journal_info = handle;
				389
				390	rwsem_acquire_read(&journal->j_trans_commit_map, 0, 0, _THIS_IP_);
				391	jbd2_journal_free_transaction(new_transaction);
				392	/*
				393	* Ensure that no allocations done while the transaction is open are
				394	* going to recurse back to the fs layer.
				395	*/
				396	handle->saved_alloc_context = memalloc_nofs_save();
				397	return 0;
				398	}
				399
				400	/* Allocate a new handle. This should probably be in a slab... */
				401	static handle_t *new_handle(int nblocks)
				402	{
				403	handle_t *handle = jbd2_alloc_handle(GFP_NOFS);
				404	if (!handle)
				405	return NULL;
				406	handle->h_buffer_credits = nblocks;
				407	handle->h_ref = 1;
				408
				409	return handle;
				410	}
				411
				412	handle_t jbd2__journal_start(journal_t journal, int nblocks, int rsv_blocks,
				413	gfp_t gfp_mask, unsigned int type,
				414	unsigned int line_no)
				415	{
				416	handle_t *handle = journal_current_handle();
				417	int err;
				418
				419	if (!journal)
				420	return ERR_PTR(-EROFS);
				421
				422	if (handle) {
				423	J_ASSERT(handle->h_transaction->t_journal == journal);
				424	handle->h_ref++;
				425	return handle;
				426	}
				427
				428	handle = new_handle(nblocks);
				429	if (!handle)
				430	return ERR_PTR(-ENOMEM);
				431	if (rsv_blocks) {
				432	handle_t *rsv_handle;
				433
				434	rsv_handle = new_handle(rsv_blocks);
				435	if (!rsv_handle) {
				436	jbd2_free_handle(handle);
				437	return ERR_PTR(-ENOMEM);
				438	}
				439	rsv_handle->h_reserved = 1;
				440	rsv_handle->h_journal = journal;
				441	handle->h_rsv_handle = rsv_handle;
				442	}
				443
				444	err = start_this_handle(journal, handle, gfp_mask);
				445	if (err < 0) {
				446	if (handle->h_rsv_handle)
				447	jbd2_free_handle(handle->h_rsv_handle);
				448	jbd2_free_handle(handle);
				449	return ERR_PTR(err);
				450	}
				451	handle->h_type = type;
				452	handle->h_line_no = line_no;
				453	trace_jbd2_handle_start(journal->j_fs_dev->bd_dev,
				454	handle->h_transaction->t_tid, type,
				455	line_no, nblocks);
				456
				457	return handle;
				458	}
				459	EXPORT_SYMBOL(jbd2__journal_start);
				460
				461
				462	/**
				463	* handle_t *jbd2_journal_start() - Obtain a new handle.
				464	* @journal: Journal to start transaction on.
				465	* @nblocks: number of block buffer we might modify
				466	*
				467	* We make sure that the transaction can guarantee at least nblocks of
				468	* modified buffers in the log. We block until the log can guarantee
				469	* that much space. Additionally, if rsv_blocks > 0, we also create another
				470	* handle with rsv_blocks reserved blocks in the journal. This handle is
				471	* is stored in h_rsv_handle. It is not attached to any particular transaction
				472	* and thus doesn't block transaction commit. If the caller uses this reserved
				473	* handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop()
				474	* on the parent handle will dispose the reserved one. Reserved handle has to
				475	* be converted to a normal handle using jbd2_journal_start_reserved() before
				476	* it can be used.
				477	*
				478	* Return a pointer to a newly allocated handle, or an ERR_PTR() value
				479	* on failure.
				480	*/
				481	handle_t jbd2_journal_start(journal_t journal, int nblocks)
				482	{
				483	return jbd2__journal_start(journal, nblocks, 0, GFP_NOFS, 0, 0);
				484	}
				485	EXPORT_SYMBOL(jbd2_journal_start);
				486
				487	void jbd2_journal_free_reserved(handle_t *handle)
				488	{
				489	journal_t *journal = handle->h_journal;
				490
				491	WARN_ON(!handle->h_reserved);
				492	sub_reserved_credits(journal, handle->h_buffer_credits);
				493	jbd2_free_handle(handle);
				494	}
				495	EXPORT_SYMBOL(jbd2_journal_free_reserved);
				496
				497	/**
				498	* int jbd2_journal_start_reserved() - start reserved handle
				499	* @handle: handle to start
				500	* @type: for handle statistics
				501	* @line_no: for handle statistics
				502	*
				503	* Start handle that has been previously reserved with jbd2_journal_reserve().
				504	* This attaches @handle to the running transaction (or creates one if there's
				505	* not transaction running). Unlike jbd2_journal_start() this function cannot
				506	* block on journal commit, checkpointing, or similar stuff. It can block on
				507	* memory allocation or frozen journal though.
				508	*
				509	* Return 0 on success, non-zero on error - handle is freed in that case.
				510	*/
				511	int jbd2_journal_start_reserved(handle_t *handle, unsigned int type,
				512	unsigned int line_no)
				513	{
				514	journal_t *journal = handle->h_journal;
				515	int ret = -EIO;
				516
				517	if (WARN_ON(!handle->h_reserved)) {
				518	/* Someone passed in normal handle? Just stop it. */
				519	jbd2_journal_stop(handle);
				520	return ret;
				521	}
				522	/*
				523	* Usefulness of mixing of reserved and unreserved handles is
				524	* questionable. So far nobody seems to need it so just error out.
				525	*/
				526	if (WARN_ON(current->journal_info)) {
				527	jbd2_journal_free_reserved(handle);
				528	return ret;
				529	}
				530
				531	handle->h_journal = NULL;
				532	/*
				533	* GFP_NOFS is here because callers are likely from writeback or
				534	* similarly constrained call sites
				535	*/
				536	ret = start_this_handle(journal, handle, GFP_NOFS);
				537	if (ret < 0) {
				538	handle->h_journal = journal;
				539	jbd2_journal_free_reserved(handle);
				540	return ret;
				541	}
				542	handle->h_type = type;
				543	handle->h_line_no = line_no;
				544	return 0;
				545	}
				546	EXPORT_SYMBOL(jbd2_journal_start_reserved);
				547
				548	/**
				549	* int jbd2_journal_extend() - extend buffer credits.
				550	* @handle: handle to 'extend'
				551	* @nblocks: nr blocks to try to extend by.
				552	*
				553	* Some transactions, such as large extends and truncates, can be done
				554	* atomically all at once or in several stages. The operation requests
				555	* a credit for a number of buffer modifications in advance, but can
				556	* extend its credit if it needs more.
				557	*
				558	* jbd2_journal_extend tries to give the running handle more buffer credits.
				559	* It does not guarantee that allocation - this is a best-effort only.
				560	* The calling process MUST be able to deal cleanly with a failure to
				561	* extend here.
				562	*
				563	* Return 0 on success, non-zero on failure.
				564	*
				565	* return code < 0 implies an error
				566	* return code > 0 implies normal transaction-full status.
				567	*/
				568	int jbd2_journal_extend(handle_t *handle, int nblocks)
				569	{
				570	transaction_t *transaction = handle->h_transaction;
				571	journal_t *journal;
				572	int result;
				573	int wanted;
				574
				575	if (is_handle_aborted(handle))
				576	return -EROFS;
				577	journal = transaction->t_journal;
				578
				579	result = 1;
				580
				581	read_lock(&journal->j_state_lock);
				582
				583	/* Don't extend a locked-down transaction! */
				584	if (transaction->t_state != T_RUNNING) {
				585	jbd_debug(3, "denied handle %p %d blocks: "
				586	"transaction not running\n", handle, nblocks);
				587	goto error_out;
				588	}
				589
				590	spin_lock(&transaction->t_handle_lock);
				591	wanted = atomic_add_return(nblocks,
				592	&transaction->t_outstanding_credits);
				593
				594	if (wanted > journal->j_max_transaction_buffers) {
				595	jbd_debug(3, "denied handle %p %d blocks: "
				596	"transaction too large\n", handle, nblocks);
				597	atomic_sub(nblocks, &transaction->t_outstanding_credits);
				598	goto unlock;
				599	}
				600
				601	if (wanted + (wanted >> JBD2_CONTROL_BLOCKS_SHIFT) >
				602	jbd2_log_space_left(journal)) {
				603	jbd_debug(3, "denied handle %p %d blocks: "
				604	"insufficient log space\n", handle, nblocks);
				605	atomic_sub(nblocks, &transaction->t_outstanding_credits);
				606	goto unlock;
				607	}
				608
				609	trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev,
				610	transaction->t_tid,
				611	handle->h_type, handle->h_line_no,
				612	handle->h_buffer_credits,
				613	nblocks);
				614
				615	handle->h_buffer_credits += nblocks;
				616	handle->h_requested_credits += nblocks;
				617	result = 0;
				618
				619	jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
				620	unlock:
				621	spin_unlock(&transaction->t_handle_lock);
				622	error_out:
				623	read_unlock(&journal->j_state_lock);
				624	return result;
				625	}
				626
				627
				628	/**
				629	* int jbd2_journal_restart() - restart a handle .
				630	* @handle: handle to restart
				631	* @nblocks: nr credits requested
				632	* @gfp_mask: memory allocation flags (for start_this_handle)
				633	*
				634	* Restart a handle for a multi-transaction filesystem
				635	* operation.
				636	*
				637	* If the jbd2_journal_extend() call above fails to grant new buffer credits
				638	* to a running handle, a call to jbd2_journal_restart will commit the
				639	* handle's transaction so far and reattach the handle to a new
				640	* transaction capable of guaranteeing the requested number of
				641	* credits. We preserve reserved handle if there's any attached to the
				642	* passed in handle.
				643	*/
				644	int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
				645	{
				646	transaction_t *transaction = handle->h_transaction;
				647	journal_t *journal;
				648	tid_t tid;
				649	int need_to_start, ret;
				650
				651	/* If we've had an abort of any type, don't even think about
				652	* actually doing the restart! */
				653	if (is_handle_aborted(handle))
				654	return 0;
				655	journal = transaction->t_journal;
				656
				657	/*
				658	* First unlink the handle from its current transaction, and start the
				659	* commit on that.
				660	*/
				661	J_ASSERT(atomic_read(&transaction->t_updates) > 0);
				662	J_ASSERT(journal_current_handle() == handle);
				663
				664	read_lock(&journal->j_state_lock);
				665	spin_lock(&transaction->t_handle_lock);
				666	atomic_sub(handle->h_buffer_credits,
				667	&transaction->t_outstanding_credits);
				668	if (handle->h_rsv_handle) {
				669	sub_reserved_credits(journal,
				670	handle->h_rsv_handle->h_buffer_credits);
				671	}
				672	if (atomic_dec_and_test(&transaction->t_updates))
				673	wake_up(&journal->j_wait_updates);
				674	tid = transaction->t_tid;
				675	spin_unlock(&transaction->t_handle_lock);
				676	handle->h_transaction = NULL;
				677	current->journal_info = NULL;
				678
				679	jbd_debug(2, "restarting handle %p\n", handle);
				680	need_to_start = !tid_geq(journal->j_commit_request, tid);
				681	read_unlock(&journal->j_state_lock);
				682	if (need_to_start)
				683	jbd2_log_start_commit(journal, tid);
				684
				685	rwsem_release(&journal->j_trans_commit_map, 1, _THIS_IP_);
				686	handle->h_buffer_credits = nblocks;
				687	/*
				688	* Restore the original nofs context because the journal restart
				689	* is basically the same thing as journal stop and start.
				690	* start_this_handle will start a new nofs context.
				691	*/
				692	memalloc_nofs_restore(handle->saved_alloc_context);
				693	ret = start_this_handle(journal, handle, gfp_mask);
				694	return ret;
				695	}
				696	EXPORT_SYMBOL(jbd2__journal_restart);
				697
				698
				699	int jbd2_journal_restart(handle_t *handle, int nblocks)
				700	{
				701	return jbd2__journal_restart(handle, nblocks, GFP_NOFS);
				702	}
				703	EXPORT_SYMBOL(jbd2_journal_restart);
				704
				705	/**
				706	* void jbd2_journal_lock_updates () - establish a transaction barrier.
				707	* @journal: Journal to establish a barrier on.
				708	*
				709	* This locks out any further updates from being started, and blocks
				710	* until all existing updates have completed, returning only once the
				711	* journal is in a quiescent state with no updates running.
				712	*
				713	* The journal lock should not be held on entry.
				714	*/
				715	void jbd2_journal_lock_updates(journal_t *journal)
				716	{
				717	DEFINE_WAIT(wait);
				718
				719	jbd2_might_wait_for_commit(journal);
				720
				721	write_lock(&journal->j_state_lock);
				722	++journal->j_barrier_count;
				723
				724	/* Wait until there are no reserved handles */
				725	if (atomic_read(&journal->j_reserved_credits)) {
				726	write_unlock(&journal->j_state_lock);
				727	wait_event(journal->j_wait_reserved,
				728	atomic_read(&journal->j_reserved_credits) == 0);
				729	write_lock(&journal->j_state_lock);
				730	}
				731
				732	/* Wait until there are no running updates */
				733	while (1) {
				734	transaction_t *transaction = journal->j_running_transaction;
				735
				736	if (!transaction)
				737	break;
				738
				739	spin_lock(&transaction->t_handle_lock);
				740	prepare_to_wait(&journal->j_wait_updates, &wait,
				741	TASK_UNINTERRUPTIBLE);
				742	if (!atomic_read(&transaction->t_updates)) {
				743	spin_unlock(&transaction->t_handle_lock);
				744	finish_wait(&journal->j_wait_updates, &wait);
				745	break;
				746	}
				747	spin_unlock(&transaction->t_handle_lock);
				748	write_unlock(&journal->j_state_lock);
				749	schedule();
				750	finish_wait(&journal->j_wait_updates, &wait);
				751	write_lock(&journal->j_state_lock);
				752	}
				753	write_unlock(&journal->j_state_lock);
				754
				755	/*
				756	* We have now established a barrier against other normal updates, but
				757	* we also need to barrier against other jbd2_journal_lock_updates() calls
				758	* to make sure that we serialise special journal-locked operations
				759	* too.
				760	*/
				761	mutex_lock(&journal->j_barrier);
				762	}
				763
				764	/**
				765	* void jbd2_journal_unlock_updates (journal_t* journal) - release barrier
				766	* @journal: Journal to release the barrier on.
				767	*
				768	* Release a transaction barrier obtained with jbd2_journal_lock_updates().
				769	*
				770	* Should be called without the journal lock held.
				771	*/
				772	void jbd2_journal_unlock_updates (journal_t *journal)
				773	{
				774	J_ASSERT(journal->j_barrier_count != 0);
				775
				776	mutex_unlock(&journal->j_barrier);
				777	write_lock(&journal->j_state_lock);
				778	--journal->j_barrier_count;
				779	write_unlock(&journal->j_state_lock);
				780	wake_up(&journal->j_wait_transaction_locked);
				781	}
				782
				783	static void warn_dirty_buffer(struct buffer_head *bh)
				784	{
				785	printk(KERN_WARNING
				786	"JBD2: Spotted dirty metadata buffer (dev = %pg, blocknr = %llu). "
				787	"There's a risk of filesystem corruption in case of system "
				788	"crash.\n",
				789	bh->b_bdev, (unsigned long long)bh->b_blocknr);
				790	}
				791
				792	/* Call t_frozen trigger and copy buffer data into jh->b_frozen_data. */
				793	static void jbd2_freeze_jh_data(struct journal_head *jh)
				794	{
				795	struct page *page;
				796	int offset;
				797	char *source;
				798	struct buffer_head *bh = jh2bh(jh);
				799
				800	J_EXPECT_JH(jh, buffer_uptodate(bh), "Possible IO failure.\n");
				801	page = bh->b_page;
				802	offset = offset_in_page(bh->b_data);
				803	source = kmap_atomic(page);
				804	/* Fire data frozen trigger just before we copy the data */
				805	jbd2_buffer_frozen_trigger(jh, source + offset, jh->b_triggers);
				806	memcpy(jh->b_frozen_data, source + offset, bh->b_size);
				807	kunmap_atomic(source);
				808
				809	/*
				810	* Now that the frozen data is saved off, we need to store any matching
				811	* triggers.
				812	*/
				813	jh->b_frozen_triggers = jh->b_triggers;
				814	}
				815
				816	/*
				817	* If the buffer is already part of the current transaction, then there
				818	* is nothing we need to do. If it is already part of a prior
				819	* transaction which we are still committing to disk, then we need to
				820	* make sure that we do not overwrite the old copy: we do copy-out to
				821	* preserve the copy going to disk. We also account the buffer against
				822	* the handle's metadata buffer credits (unless the buffer is already
				823	* part of the transaction, that is).
				824	*
				825	*/
				826	static int
				827	do_get_write_access(handle_t handle, struct journal_head jh,
				828	int force_copy)
				829	{
				830	struct buffer_head *bh;
				831	transaction_t *transaction = handle->h_transaction;
				832	journal_t *journal;
				833	int error;
				834	char *frozen_buffer = NULL;
				835	unsigned long start_lock, time_lock;
				836
				837	if (is_handle_aborted(handle))
				838	return -EROFS;
				839	journal = transaction->t_journal;
				840
				841	jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
				842
				843	JBUFFER_TRACE(jh, "entry");
				844	repeat:
				845	bh = jh2bh(jh);
				846
				847	/* @@@ Need to check for errors here at some point. */
				848
				849	start_lock = jiffies;
				850	lock_buffer(bh);
				851	jbd_lock_bh_state(bh);
				852
				853	/* If it takes too long to lock the buffer, trace it */
				854	time_lock = jbd2_time_diff(start_lock, jiffies);
				855	if (time_lock > HZ/10)
				856	trace_jbd2_lock_buffer_stall(bh->b_bdev->bd_dev,
				857	jiffies_to_msecs(time_lock));
				858
				859	/* We now hold the buffer lock so it is safe to query the buffer
				860	* state. Is the buffer dirty?
				861	*
				862	* If so, there are two possibilities. The buffer may be
				863	* non-journaled, and undergoing a quite legitimate writeback.
				864	* Otherwise, it is journaled, and we don't expect dirty buffers
				865	* in that state (the buffers should be marked JBD_Dirty
				866	* instead.) So either the IO is being done under our own
				867	* control and this is a bug, or it's a third party IO such as
				868	* dump(8) (which may leave the buffer scheduled for read ---
				869	* ie. locked but not dirty) or tune2fs (which may actually have
				870	* the buffer dirtied, ugh.) */
				871
				872	if (buffer_dirty(bh)) {
				873	/*
				874	* First question: is this buffer already part of the current
				875	* transaction or the existing committing transaction?
				876	*/
				877	if (jh->b_transaction) {
				878	J_ASSERT_JH(jh,
				879	jh->b_transaction == transaction \|\|
				880	jh->b_transaction ==
				881	journal->j_committing_transaction);
				882	if (jh->b_next_transaction)
				883	J_ASSERT_JH(jh, jh->b_next_transaction ==
				884	transaction);
				885	warn_dirty_buffer(bh);
				886	}
				887	/*
				888	* In any case we need to clean the dirty flag and we must
				889	* do it under the buffer lock to be sure we don't race
				890	* with running write-out.
				891	*/
				892	JBUFFER_TRACE(jh, "Journalling dirty buffer");
				893	clear_buffer_dirty(bh);
				894	set_buffer_jbddirty(bh);
				895	}
				896
				897	unlock_buffer(bh);
				898
				899	error = -EROFS;
				900	if (is_handle_aborted(handle)) {
				901	jbd_unlock_bh_state(bh);
				902	goto out;
				903	}
				904	error = 0;
				905
				906	/*
				907	* The buffer is already part of this transaction if b_transaction or
				908	* b_next_transaction points to it
				909	*/
				910	if (jh->b_transaction == transaction \|\|
				911	jh->b_next_transaction == transaction)
				912	goto done;
				913
				914	/*
				915	* this is the first time this transaction is touching this buffer,
				916	* reset the modified flag
				917	*/
				918	jh->b_modified = 0;
				919
				920	/*
				921	* If the buffer is not journaled right now, we need to make sure it
				922	* doesn't get written to disk before the caller actually commits the
				923	* new data
				924	*/
				925	if (!jh->b_transaction) {
				926	JBUFFER_TRACE(jh, "no transaction");
				927	J_ASSERT_JH(jh, !jh->b_next_transaction);
				928	JBUFFER_TRACE(jh, "file as BJ_Reserved");
				929	/*
				930	* Make sure all stores to jh (b_modified, b_frozen_data) are
				931	* visible before attaching it to the running transaction.
				932	* Paired with barrier in jbd2_write_access_granted()
				933	*/
				934	smp_wmb();
				935	spin_lock(&journal->j_list_lock);
				936	__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
				937	spin_unlock(&journal->j_list_lock);
				938	goto done;
				939	}
				940	/*
				941	* If there is already a copy-out version of this buffer, then we don't
				942	* need to make another one
				943	*/
				944	if (jh->b_frozen_data) {
				945	JBUFFER_TRACE(jh, "has frozen data");
				946	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
				947	goto attach_next;
				948	}
				949
				950	JBUFFER_TRACE(jh, "owned by older transaction");
				951	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
				952	J_ASSERT_JH(jh, jh->b_transaction == journal->j_committing_transaction);
				953
				954	/*
				955	* There is one case we have to be very careful about. If the
				956	* committing transaction is currently writing this buffer out to disk
				957	* and has NOT made a copy-out, then we cannot modify the buffer
				958	* contents at all right now. The essence of copy-out is that it is
				959	* the extra copy, not the primary copy, which gets journaled. If the
				960	* primary copy is already going to disk then we cannot do copy-out
				961	* here.
				962	*/
				963	if (buffer_shadow(bh)) {
				964	JBUFFER_TRACE(jh, "on shadow: sleep");
				965	jbd_unlock_bh_state(bh);
				966	wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE);
				967	goto repeat;
				968	}
				969
				970	/*
				971	* Only do the copy if the currently-owning transaction still needs it.
				972	* If buffer isn't on BJ_Metadata list, the committing transaction is
				973	* past that stage (here we use the fact that BH_Shadow is set under
				974	* bh_state lock together with refiling to BJ_Shadow list and at this
				975	* point we know the buffer doesn't have BH_Shadow set).
				976	*
				977	* Subtle point, though: if this is a get_undo_access, then we will be
				978	* relying on the frozen_data to contain the new value of the
				979	* committed_data record after the transaction, so we HAVE to force the
				980	* frozen_data copy in that case.
				981	*/
				982	if (jh->b_jlist == BJ_Metadata \|\| force_copy) {
				983	JBUFFER_TRACE(jh, "generate frozen data");
				984	if (!frozen_buffer) {
				985	JBUFFER_TRACE(jh, "allocate memory for buffer");
				986	jbd_unlock_bh_state(bh);
				987	frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size,
				988	GFP_NOFS \| __GFP_NOFAIL);
				989	goto repeat;
				990	}
				991	jh->b_frozen_data = frozen_buffer;
				992	frozen_buffer = NULL;
				993	jbd2_freeze_jh_data(jh);
				994	}
				995	attach_next:
				996	/*
				997	* Make sure all stores to jh (b_modified, b_frozen_data) are visible
				998	* before attaching it to the running transaction. Paired with barrier
				999	* in jbd2_write_access_granted()
				1000	*/
				1001	smp_wmb();
				1002	jh->b_next_transaction = transaction;
				1003
				1004	done:
				1005	jbd_unlock_bh_state(bh);
				1006
				1007	/*
				1008	* If we are about to journal a buffer, then any revoke pending on it is
				1009	* no longer valid
				1010	*/
				1011	jbd2_journal_cancel_revoke(handle, jh);
				1012
				1013	out:
				1014	if (unlikely(frozen_buffer)) /* It's usually NULL */
				1015	jbd2_free(frozen_buffer, bh->b_size);
				1016
				1017	JBUFFER_TRACE(jh, "exit");
				1018	return error;
				1019	}
				1020
				1021	/* Fast check whether buffer is already attached to the required transaction */
				1022	static bool jbd2_write_access_granted(handle_t handle, struct buffer_head bh,
				1023	bool undo)
				1024	{
				1025	struct journal_head *jh;
				1026	bool ret = false;
				1027
				1028	/* Dirty buffers require special handling... */
				1029	if (buffer_dirty(bh))
				1030	return false;
				1031
				1032	/*
				1033	* RCU protects us from dereferencing freed pages. So the checks we do
				1034	* are guaranteed not to oops. However the jh slab object can get freed
				1035	* & reallocated while we work with it. So we have to be careful. When
				1036	* we see jh attached to the running transaction, we know it must stay
				1037	* so until the transaction is committed. Thus jh won't be freed and
				1038	* will be attached to the same bh while we run. However it can
				1039	* happen jh gets freed, reallocated, and attached to the transaction
				1040	* just after we get pointer to it from bh. So we have to be careful
				1041	* and recheck jh still belongs to our bh before we return success.
				1042	*/
				1043	rcu_read_lock();
				1044	if (!buffer_jbd(bh))
				1045	goto out;
				1046	/* This should be bh2jh() but that doesn't work with inline functions */
				1047	jh = READ_ONCE(bh->b_private);
				1048	if (!jh)
				1049	goto out;
				1050	/* For undo access buffer must have data copied */
				1051	if (undo && !jh->b_committed_data)
				1052	goto out;
				1053	if (READ_ONCE(jh->b_transaction) != handle->h_transaction &&
				1054	READ_ONCE(jh->b_next_transaction) != handle->h_transaction)
				1055	goto out;
				1056	/*
				1057	* There are two reasons for the barrier here:
				1058	* 1) Make sure to fetch b_bh after we did previous checks so that we
				1059	* detect when jh went through free, realloc, attach to transaction
				1060	* while we were checking. Paired with implicit barrier in that path.
				1061	* 2) So that access to bh done after jbd2_write_access_granted()
				1062	* doesn't get reordered and see inconsistent state of concurrent
				1063	* do_get_write_access().
				1064	*/
				1065	smp_mb();
				1066	if (unlikely(jh->b_bh != bh))
				1067	goto out;
				1068	ret = true;
				1069	out:
				1070	rcu_read_unlock();
				1071	return ret;
				1072	}
				1073
				1074	/**
				1075	* int jbd2_journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
				1076	* @handle: transaction to add buffer modifications to
				1077	* @bh: bh to be used for metadata writes
				1078	*
				1079	* Returns: error code or 0 on success.
				1080	*
				1081	* In full data journalling mode the buffer may be of type BJ_AsyncData,
				1082	* because we're ``write()ing`` a buffer which is also part of a shared mapping.
				1083	*/
				1084
				1085	int jbd2_journal_get_write_access(handle_t handle, struct buffer_head bh)
				1086	{
				1087	struct journal_head *jh;
				1088	int rc;
				1089
				1090	if (jbd2_write_access_granted(handle, bh, false))
				1091	return 0;
				1092
				1093	jh = jbd2_journal_add_journal_head(bh);
				1094	/* We do not want to get caught playing with fields which the
				1095	* log thread also manipulates. Make sure that the buffer
				1096	* completes any outstanding IO before proceeding. */
				1097	rc = do_get_write_access(handle, jh, 0);
				1098	jbd2_journal_put_journal_head(jh);
				1099	return rc;
				1100	}
				1101
				1102
				1103	/*
				1104	* When the user wants to journal a newly created buffer_head
				1105	* (ie. getblk() returned a new buffer and we are going to populate it
				1106	* manually rather than reading off disk), then we need to keep the
				1107	* buffer_head locked until it has been completely filled with new
				1108	* data. In this case, we should be able to make the assertion that
				1109	* the bh is not already part of an existing transaction.
				1110	*
				1111	* The buffer should already be locked by the caller by this point.
				1112	* There is no lock ranking violation: it was a newly created,
				1113	* unlocked buffer beforehand. */
				1114
				1115	/**
				1116	* int jbd2_journal_get_create_access () - notify intent to use newly created bh
				1117	* @handle: transaction to new buffer to
				1118	* @bh: new buffer.
				1119	*
				1120	* Call this if you create a new bh.
				1121	*/
				1122	int jbd2_journal_get_create_access(handle_t handle, struct buffer_head bh)
				1123	{
				1124	transaction_t *transaction = handle->h_transaction;
				1125	journal_t *journal;
				1126	struct journal_head *jh = jbd2_journal_add_journal_head(bh);
				1127	int err;
				1128
				1129	jbd_debug(5, "journal_head %p\n", jh);
				1130	err = -EROFS;
				1131	if (is_handle_aborted(handle))
				1132	goto out;
				1133	journal = transaction->t_journal;
				1134	err = 0;
				1135
				1136	JBUFFER_TRACE(jh, "entry");
				1137	/*
				1138	* The buffer may already belong to this transaction due to pre-zeroing
				1139	* in the filesystem's new_block code. It may also be on the previous,
				1140	* committing transaction's lists, but it HAS to be in Forget state in
				1141	* that case: the transaction must have deleted the buffer for it to be
				1142	* reused here.
				1143	*/
				1144	jbd_lock_bh_state(bh);
				1145	J_ASSERT_JH(jh, (jh->b_transaction == transaction \|\|
				1146	jh->b_transaction == NULL \|\|
				1147	(jh->b_transaction == journal->j_committing_transaction &&
				1148	jh->b_jlist == BJ_Forget)));
				1149
				1150	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
				1151	J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
				1152
				1153	if (jh->b_transaction == NULL) {
				1154	/*
				1155	* Previous jbd2_journal_forget() could have left the buffer
				1156	* with jbddirty bit set because it was being committed. When
				1157	* the commit finished, we've filed the buffer for
				1158	* checkpointing and marked it dirty. Now we are reallocating
				1159	* the buffer so the transaction freeing it must have
				1160	* committed and so it's safe to clear the dirty bit.
				1161	*/
				1162	clear_buffer_dirty(jh2bh(jh));
				1163	/* first access by this transaction */
				1164	jh->b_modified = 0;
				1165
				1166	JBUFFER_TRACE(jh, "file as BJ_Reserved");
				1167	spin_lock(&journal->j_list_lock);
				1168	__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
				1169	spin_unlock(&journal->j_list_lock);
				1170	} else if (jh->b_transaction == journal->j_committing_transaction) {
				1171	/* first access by this transaction */
				1172	jh->b_modified = 0;
				1173
				1174	JBUFFER_TRACE(jh, "set next transaction");
				1175	spin_lock(&journal->j_list_lock);
				1176	jh->b_next_transaction = transaction;
				1177	spin_unlock(&journal->j_list_lock);
				1178	}
				1179	jbd_unlock_bh_state(bh);
				1180
				1181	/*
				1182	* akpm: I added this. ext3_alloc_branch can pick up new indirect
				1183	* blocks which contain freed but then revoked metadata. We need
				1184	* to cancel the revoke in case we end up freeing it yet again
				1185	* and the reallocating as data - this would cause a second revoke,
				1186	* which hits an assertion error.
				1187	*/
				1188	JBUFFER_TRACE(jh, "cancelling revoke");
				1189	jbd2_journal_cancel_revoke(handle, jh);
				1190	out:
				1191	jbd2_journal_put_journal_head(jh);
				1192	return err;
				1193	}
				1194
				1195	/**
				1196	* int jbd2_journal_get_undo_access() - Notify intent to modify metadata with
				1197	* non-rewindable consequences
				1198	* @handle: transaction
				1199	* @bh: buffer to undo
				1200	*
				1201	* Sometimes there is a need to distinguish between metadata which has
				1202	* been committed to disk and that which has not. The ext3fs code uses
				1203	* this for freeing and allocating space, we have to make sure that we
				1204	* do not reuse freed space until the deallocation has been committed,
				1205	* since if we overwrote that space we would make the delete
				1206	* un-rewindable in case of a crash.
				1207	*
				1208	* To deal with that, jbd2_journal_get_undo_access requests write access to a
				1209	* buffer for parts of non-rewindable operations such as delete
				1210	* operations on the bitmaps. The journaling code must keep a copy of
				1211	* the buffer's contents prior to the undo_access call until such time
				1212	* as we know that the buffer has definitely been committed to disk.
				1213	*
				1214	* We never need to know which transaction the committed data is part
				1215	* of, buffers touched here are guaranteed to be dirtied later and so
				1216	* will be committed to a new transaction in due course, at which point
				1217	* we can discard the old committed data pointer.
				1218	*
				1219	* Returns error number or 0 on success.
				1220	*/
				1221	int jbd2_journal_get_undo_access(handle_t handle, struct buffer_head bh)
				1222	{
				1223	int err;
				1224	struct journal_head *jh;
				1225	char *committed_data = NULL;
				1226
				1227	if (jbd2_write_access_granted(handle, bh, true))
				1228	return 0;
				1229
				1230	jh = jbd2_journal_add_journal_head(bh);
				1231	JBUFFER_TRACE(jh, "entry");
				1232
				1233	/*
				1234	* Do this first --- it can drop the journal lock, so we want to
				1235	* make sure that obtaining the committed_data is done
				1236	* atomically wrt. completion of any outstanding commits.
				1237	*/
				1238	err = do_get_write_access(handle, jh, 1);
				1239	if (err)
				1240	goto out;
				1241
				1242	repeat:
				1243	if (!jh->b_committed_data)
				1244	committed_data = jbd2_alloc(jh2bh(jh)->b_size,
				1245	GFP_NOFS\|__GFP_NOFAIL);
				1246
				1247	jbd_lock_bh_state(bh);
				1248	if (!jh->b_committed_data) {
				1249	/* Copy out the current buffer contents into the
				1250	* preserved, committed copy. */
				1251	JBUFFER_TRACE(jh, "generate b_committed data");
				1252	if (!committed_data) {
				1253	jbd_unlock_bh_state(bh);
				1254	goto repeat;
				1255	}
				1256
				1257	jh->b_committed_data = committed_data;
				1258	committed_data = NULL;
				1259	memcpy(jh->b_committed_data, bh->b_data, bh->b_size);
				1260	}
				1261	jbd_unlock_bh_state(bh);
				1262	out:
				1263	jbd2_journal_put_journal_head(jh);
				1264	if (unlikely(committed_data))
				1265	jbd2_free(committed_data, bh->b_size);
				1266	return err;
				1267	}
				1268
				1269	/**
				1270	* void jbd2_journal_set_triggers() - Add triggers for commit writeout
				1271	* @bh: buffer to trigger on
				1272	* @type: struct jbd2_buffer_trigger_type containing the trigger(s).
				1273	*
				1274	* Set any triggers on this journal_head. This is always safe, because
				1275	* triggers for a committing buffer will be saved off, and triggers for
				1276	* a running transaction will match the buffer in that transaction.
				1277	*
				1278	* Call with NULL to clear the triggers.
				1279	*/
				1280	void jbd2_journal_set_triggers(struct buffer_head *bh,
				1281	struct jbd2_buffer_trigger_type *type)
				1282	{
				1283	struct journal_head *jh = jbd2_journal_grab_journal_head(bh);
				1284
				1285	if (WARN_ON(!jh))
				1286	return;
				1287	jh->b_triggers = type;
				1288	jbd2_journal_put_journal_head(jh);
				1289	}
				1290
				1291	void jbd2_buffer_frozen_trigger(struct journal_head jh, void mapped_data,
				1292	struct jbd2_buffer_trigger_type *triggers)
				1293	{
				1294	struct buffer_head *bh = jh2bh(jh);
				1295
				1296	if (!triggers \|\| !triggers->t_frozen)
				1297	return;
				1298
				1299	triggers->t_frozen(triggers, bh, mapped_data, bh->b_size);
				1300	}
				1301
				1302	void jbd2_buffer_abort_trigger(struct journal_head *jh,
				1303	struct jbd2_buffer_trigger_type *triggers)
				1304	{
				1305	if (!triggers \|\| !triggers->t_abort)
				1306	return;
				1307
				1308	triggers->t_abort(triggers, jh2bh(jh));
				1309	}
				1310
				1311	/**
				1312	* int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
				1313	* @handle: transaction to add buffer to.
				1314	* @bh: buffer to mark
				1315	*
				1316	* mark dirty metadata which needs to be journaled as part of the current
				1317	* transaction.
				1318	*
				1319	* The buffer must have previously had jbd2_journal_get_write_access()
				1320	* called so that it has a valid journal_head attached to the buffer
				1321	* head.
				1322	*
				1323	* The buffer is placed on the transaction's metadata list and is marked
				1324	* as belonging to the transaction.
				1325	*
				1326	* Returns error number or 0 on success.
				1327	*
				1328	* Special care needs to be taken if the buffer already belongs to the
				1329	* current committing transaction (in which case we should have frozen
				1330	* data present for that commit). In that case, we don't relink the
				1331	* buffer: that only gets done when the old transaction finally
				1332	* completes its commit.
				1333	*/
				1334	int jbd2_journal_dirty_metadata(handle_t handle, struct buffer_head bh)
				1335	{
				1336	transaction_t *transaction = handle->h_transaction;
				1337	journal_t *journal;
				1338	struct journal_head *jh;
				1339	int ret = 0;
				1340
				1341	if (is_handle_aborted(handle))
				1342	return -EROFS;
				1343	if (!buffer_jbd(bh))
				1344	return -EUCLEAN;
				1345
				1346	/*
				1347	* We don't grab jh reference here since the buffer must be part
				1348	* of the running transaction.
				1349	*/
				1350	jh = bh2jh(bh);
				1351	jbd_debug(5, "journal_head %p\n", jh);
				1352	JBUFFER_TRACE(jh, "entry");
				1353
				1354	/*
				1355	* This and the following assertions are unreliable since we may see jh
				1356	* in inconsistent state unless we grab bh_state lock. But this is
				1357	* crucial to catch bugs so let's do a reliable check until the
				1358	* lockless handling is fully proven.
				1359	*/
				1360	if (jh->b_transaction != transaction &&
				1361	jh->b_next_transaction != transaction) {
				1362	jbd_lock_bh_state(bh);
				1363	J_ASSERT_JH(jh, jh->b_transaction == transaction \|\|
				1364	jh->b_next_transaction == transaction);
				1365	jbd_unlock_bh_state(bh);
				1366	}
				1367	if (jh->b_modified == 1) {
				1368	/* If it's in our transaction it must be in BJ_Metadata list. */
				1369	if (jh->b_transaction == transaction &&
				1370	jh->b_jlist != BJ_Metadata) {
				1371	jbd_lock_bh_state(bh);
				1372	if (jh->b_transaction == transaction &&
				1373	jh->b_jlist != BJ_Metadata)
				1374	pr_err("JBD2: assertion failure: h_type=%u "
				1375	"h_line_no=%u block_no=%llu jlist=%u\n",
				1376	handle->h_type, handle->h_line_no,
				1377	(unsigned long long) bh->b_blocknr,
				1378	jh->b_jlist);
				1379	J_ASSERT_JH(jh, jh->b_transaction != transaction \|\|
				1380	jh->b_jlist == BJ_Metadata);
				1381	jbd_unlock_bh_state(bh);
				1382	}
				1383	goto out;
				1384	}
				1385
				1386	journal = transaction->t_journal;
				1387	jbd_lock_bh_state(bh);
				1388
				1389	if (jh->b_modified == 0) {
				1390	/*
				1391	* This buffer's got modified and becoming part
				1392	* of the transaction. This needs to be done
				1393	* once a transaction -bzzz
				1394	*/
				1395	if (handle->h_buffer_credits <= 0) {
				1396	ret = -ENOSPC;
				1397	goto out_unlock_bh;
				1398	}
				1399	jh->b_modified = 1;
				1400	handle->h_buffer_credits--;
				1401	}
				1402
				1403	/*
				1404	* fastpath, to avoid expensive locking. If this buffer is already
				1405	* on the running transaction's metadata list there is nothing to do.
				1406	* Nobody can take it off again because there is a handle open.
				1407	* I _think_ we're OK here with SMP barriers - a mistaken decision will
				1408	* result in this test being false, so we go in and take the locks.
				1409	*/
				1410	if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) {
				1411	JBUFFER_TRACE(jh, "fastpath");
				1412	if (unlikely(jh->b_transaction !=
				1413	journal->j_running_transaction)) {
				1414	printk(KERN_ERR "JBD2: %s: "
				1415	"jh->b_transaction (%llu, %p, %u) != "
				1416	"journal->j_running_transaction (%p, %u)\n",
				1417	journal->j_devname,
				1418	(unsigned long long) bh->b_blocknr,
				1419	jh->b_transaction,
				1420	jh->b_transaction ? jh->b_transaction->t_tid : 0,
				1421	journal->j_running_transaction,
				1422	journal->j_running_transaction ?
				1423	journal->j_running_transaction->t_tid : 0);
				1424	ret = -EINVAL;
				1425	}
				1426	goto out_unlock_bh;
				1427	}
				1428
				1429	set_buffer_jbddirty(bh);
				1430
				1431	/*
				1432	* Metadata already on the current transaction list doesn't
				1433	* need to be filed. Metadata on another transaction's list must
				1434	* be committing, and will be refiled once the commit completes:
				1435	* leave it alone for now.
				1436	*/
				1437	if (jh->b_transaction != transaction) {
				1438	JBUFFER_TRACE(jh, "already on other transaction");
				1439	if (unlikely(((jh->b_transaction !=
				1440	journal->j_committing_transaction)) \|\|
				1441	(jh->b_next_transaction != transaction))) {
				1442	printk(KERN_ERR "jbd2_journal_dirty_metadata: %s: "
				1443	"bad jh for block %llu: "
				1444	"transaction (%p, %u), "
				1445	"jh->b_transaction (%p, %u), "
				1446	"jh->b_next_transaction (%p, %u), jlist %u\n",
				1447	journal->j_devname,
				1448	(unsigned long long) bh->b_blocknr,
				1449	transaction, transaction->t_tid,
				1450	jh->b_transaction,
				1451	jh->b_transaction ?
				1452	jh->b_transaction->t_tid : 0,
				1453	jh->b_next_transaction,
				1454	jh->b_next_transaction ?
				1455	jh->b_next_transaction->t_tid : 0,
				1456	jh->b_jlist);
				1457	WARN_ON(1);
				1458	ret = -EINVAL;
				1459	}
				1460	/* And this case is illegal: we can't reuse another
				1461	* transaction's data buffer, ever. */
				1462	goto out_unlock_bh;
				1463	}
				1464
				1465	/* That test should have eliminated the following case: */
				1466	J_ASSERT_JH(jh, jh->b_frozen_data == NULL);
				1467
				1468	JBUFFER_TRACE(jh, "file as BJ_Metadata");
				1469	spin_lock(&journal->j_list_lock);
				1470	__jbd2_journal_file_buffer(jh, transaction, BJ_Metadata);
				1471	spin_unlock(&journal->j_list_lock);
				1472	out_unlock_bh:
				1473	jbd_unlock_bh_state(bh);
				1474	out:
				1475	JBUFFER_TRACE(jh, "exit");
				1476	return ret;
				1477	}
				1478
				1479	/**
				1480	* void jbd2_journal_forget() - bforget() for potentially-journaled buffers.
				1481	* @handle: transaction handle
				1482	* @bh: bh to 'forget'
				1483	*
				1484	* We can only do the bforget if there are no commits pending against the
				1485	* buffer. If the buffer is dirty in the current running transaction we
				1486	* can safely unlink it.
				1487	*
				1488	* bh may not be a journalled buffer at all - it may be a non-JBD
				1489	* buffer which came off the hashtable. Check for this.
				1490	*
				1491	* Decrements bh->b_count by one.
				1492	*
				1493	* Allow this call even if the handle has aborted --- it may be part of
				1494	* the caller's cleanup after an abort.
				1495	*/
				1496	int jbd2_journal_forget (handle_t handle, struct buffer_head bh)
				1497	{
				1498	transaction_t *transaction = handle->h_transaction;
				1499	journal_t *journal;
				1500	struct journal_head *jh;
				1501	int drop_reserve = 0;
				1502	int err = 0;
				1503	int was_modified = 0;
				1504
				1505	if (is_handle_aborted(handle))
				1506	return -EROFS;
				1507	journal = transaction->t_journal;
				1508
				1509	BUFFER_TRACE(bh, "entry");
				1510
				1511	jbd_lock_bh_state(bh);
				1512
				1513	if (!buffer_jbd(bh))
				1514	goto not_jbd;
				1515	jh = bh2jh(bh);
				1516
				1517	/* Critical error: attempting to delete a bitmap buffer, maybe?
				1518	* Don't do any jbd operations, and return an error. */
				1519	if (!J_EXPECT_JH(jh, !jh->b_committed_data,
				1520	"inconsistent data on disk")) {
				1521	err = -EIO;
				1522	goto not_jbd;
				1523	}
				1524
				1525	/* keep track of whether or not this transaction modified us */
				1526	was_modified = jh->b_modified;
				1527
				1528	/*
				1529	* The buffer's going from the transaction, we must drop
				1530	* all references -bzzz
				1531	*/
				1532	jh->b_modified = 0;
				1533
				1534	if (jh->b_transaction == transaction) {
				1535	J_ASSERT_JH(jh, !jh->b_frozen_data);
				1536
				1537	/* If we are forgetting a buffer which is already part
				1538	* of this transaction, then we can just drop it from
				1539	* the transaction immediately. */
				1540	clear_buffer_dirty(bh);
				1541	clear_buffer_jbddirty(bh);
				1542
				1543	JBUFFER_TRACE(jh, "belongs to current transaction: unfile");
				1544
				1545	/*
				1546	* we only want to drop a reference if this transaction
				1547	* modified the buffer
				1548	*/
				1549	if (was_modified)
				1550	drop_reserve = 1;
				1551
				1552	/*
				1553	* We are no longer going to journal this buffer.
				1554	* However, the commit of this transaction is still
				1555	* important to the buffer: the delete that we are now
				1556	* processing might obsolete an old log entry, so by
				1557	* committing, we can satisfy the buffer's checkpoint.
				1558	*
				1559	* So, if we have a checkpoint on the buffer, we should
				1560	* now refile the buffer on our BJ_Forget list so that
				1561	* we know to remove the checkpoint after we commit.
				1562	*/
				1563
				1564	spin_lock(&journal->j_list_lock);
				1565	if (jh->b_cp_transaction) {
				1566	__jbd2_journal_temp_unlink_buffer(jh);
				1567	__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
				1568	} else {
				1569	__jbd2_journal_unfile_buffer(jh);
				1570	if (!buffer_jbd(bh)) {
				1571	spin_unlock(&journal->j_list_lock);
				1572	jbd_unlock_bh_state(bh);
				1573	__bforget(bh);
				1574	goto drop;
				1575	}
				1576	}
				1577	spin_unlock(&journal->j_list_lock);
				1578	} else if (jh->b_transaction) {
				1579	J_ASSERT_JH(jh, (jh->b_transaction ==
				1580	journal->j_committing_transaction));
				1581	/* However, if the buffer is still owned by a prior
				1582	* (committing) transaction, we can't drop it yet... */
				1583	JBUFFER_TRACE(jh, "belongs to older transaction");
				1584	/* ... but we CAN drop it from the new transaction through
				1585	* marking the buffer as freed and set j_next_transaction to
				1586	* the new transaction, so that not only the commit code
				1587	* knows it should clear dirty bits when it is done with the
				1588	* buffer, but also the buffer can be checkpointed only
				1589	* after the new transaction commits. */
				1590
				1591	set_buffer_freed(bh);
				1592
				1593	if (!jh->b_next_transaction) {
				1594	spin_lock(&journal->j_list_lock);
				1595	jh->b_next_transaction = transaction;
				1596	spin_unlock(&journal->j_list_lock);
				1597	} else {
				1598	J_ASSERT(jh->b_next_transaction == transaction);
				1599
				1600	/*
				1601	* only drop a reference if this transaction modified
				1602	* the buffer
				1603	*/
				1604	if (was_modified)
				1605	drop_reserve = 1;
				1606	}
				1607	}
				1608
				1609	not_jbd:
				1610	jbd_unlock_bh_state(bh);
				1611	__brelse(bh);
				1612	drop:
				1613	if (drop_reserve) {
				1614	/* no need to reserve log space for this block -bzzz */
				1615	handle->h_buffer_credits++;
				1616	}
				1617	return err;
				1618	}
				1619
				1620	/**
				1621	* int jbd2_journal_stop() - complete a transaction
				1622	* @handle: transaction to complete.
				1623	*
				1624	* All done for a particular handle.
				1625	*
				1626	* There is not much action needed here. We just return any remaining
				1627	* buffer credits to the transaction and remove the handle. The only
				1628	* complication is that we need to start a commit operation if the
				1629	* filesystem is marked for synchronous update.
				1630	*
				1631	* jbd2_journal_stop itself will not usually return an error, but it may
				1632	* do so in unusual circumstances. In particular, expect it to
				1633	* return -EIO if a jbd2_journal_abort has been executed since the
				1634	* transaction began.
				1635	*/
				1636	int jbd2_journal_stop(handle_t *handle)
				1637	{
				1638	transaction_t *transaction = handle->h_transaction;
				1639	journal_t *journal;
				1640	int err = 0, wait_for_commit = 0;
				1641	tid_t tid;
				1642	pid_t pid;
				1643
				1644	if (!transaction) {
				1645	/*
				1646	* Handle is already detached from the transaction so
				1647	* there is nothing to do other than decrease a refcount,
				1648	* or free the handle if refcount drops to zero
				1649	*/
				1650	if (--handle->h_ref > 0) {
				1651	jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
				1652	handle->h_ref);
				1653	return err;
				1654	} else {
				1655	if (handle->h_rsv_handle)
				1656	jbd2_free_handle(handle->h_rsv_handle);
				1657	goto free_and_exit;
				1658	}
				1659	}
				1660	journal = transaction->t_journal;
				1661
				1662	J_ASSERT(journal_current_handle() == handle);
				1663
				1664	if (is_handle_aborted(handle))
				1665	err = -EIO;
				1666	else
				1667	J_ASSERT(atomic_read(&transaction->t_updates) > 0);
				1668
				1669	if (--handle->h_ref > 0) {
				1670	jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
				1671	handle->h_ref);
				1672	return err;
				1673	}
				1674
				1675	jbd_debug(4, "Handle %p going down\n", handle);
				1676	trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev,
				1677	transaction->t_tid,
				1678	handle->h_type, handle->h_line_no,
				1679	jiffies - handle->h_start_jiffies,
				1680	handle->h_sync, handle->h_requested_credits,
				1681	(handle->h_requested_credits -
				1682	handle->h_buffer_credits));
				1683
				1684	/*
				1685	* Implement synchronous transaction batching. If the handle
				1686	* was synchronous, don't force a commit immediately. Let's
				1687	* yield and let another thread piggyback onto this
				1688	* transaction. Keep doing that while new threads continue to
				1689	* arrive. It doesn't cost much - we're about to run a commit
				1690	* and sleep on IO anyway. Speeds up many-threaded, many-dir
				1691	* operations by 30x or more...
				1692	*
				1693	* We try and optimize the sleep time against what the
				1694	* underlying disk can do, instead of having a static sleep
				1695	* time. This is useful for the case where our storage is so
				1696	* fast that it is more optimal to go ahead and force a flush
				1697	* and wait for the transaction to be committed than it is to
				1698	* wait for an arbitrary amount of time for new writers to
				1699	* join the transaction. We achieve this by measuring how
				1700	* long it takes to commit a transaction, and compare it with
				1701	* how long this transaction has been running, and if run time
				1702	* < commit time then we sleep for the delta and commit. This
				1703	* greatly helps super fast disks that would see slowdowns as
				1704	* more threads started doing fsyncs.
				1705	*
				1706	* But don't do this if this process was the most recent one
				1707	* to perform a synchronous write. We do this to detect the
				1708	* case where a single process is doing a stream of sync
				1709	* writes. No point in waiting for joiners in that case.
				1710	*
				1711	* Setting max_batch_time to 0 disables this completely.
				1712	*/
				1713	pid = current->pid;
				1714	if (handle->h_sync && journal->j_last_sync_writer != pid &&
				1715	journal->j_max_batch_time) {
				1716	u64 commit_time, trans_time;
				1717
				1718	journal->j_last_sync_writer = pid;
				1719
				1720	read_lock(&journal->j_state_lock);
				1721	commit_time = journal->j_average_commit_time;
				1722	read_unlock(&journal->j_state_lock);
				1723
				1724	trans_time = ktime_to_ns(ktime_sub(ktime_get(),
				1725	transaction->t_start_time));
				1726
				1727	commit_time = max_t(u64, commit_time,
				1728	1000*journal->j_min_batch_time);
				1729	commit_time = min_t(u64, commit_time,
				1730	1000*journal->j_max_batch_time);
				1731
				1732	if (trans_time < commit_time) {
				1733	ktime_t expires = ktime_add_ns(ktime_get(),
				1734	commit_time);
				1735	set_current_state(TASK_UNINTERRUPTIBLE);
				1736	schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
				1737	}
				1738	}
				1739
				1740	if (handle->h_sync)
				1741	transaction->t_synchronous_commit = 1;
				1742	current->journal_info = NULL;
				1743	atomic_sub(handle->h_buffer_credits,
				1744	&transaction->t_outstanding_credits);
				1745
				1746	/*
				1747	* If the handle is marked SYNC, we need to set another commit
				1748	* going! We also want to force a commit if the current
				1749	* transaction is occupying too much of the log, or if the
				1750	* transaction is too old now.
				1751	*/
				1752	if (handle->h_sync \|\|
				1753	(atomic_read(&transaction->t_outstanding_credits) >
				1754	journal->j_max_transaction_buffers) \|\|
				1755	time_after_eq(jiffies, transaction->t_expires)) {
				1756	/* Do this even for aborted journals: an abort still
				1757	* completes the commit thread, it just doesn't write
				1758	* anything to disk. */
				1759
				1760	jbd_debug(2, "transaction too old, requesting commit for "
				1761	"handle %p\n", handle);
				1762	/* This is non-blocking */
				1763	jbd2_log_start_commit(journal, transaction->t_tid);
				1764
				1765	/*
				1766	* Special case: JBD2_SYNC synchronous updates require us
				1767	* to wait for the commit to complete.
				1768	*/
				1769	if (handle->h_sync && !(current->flags & PF_MEMALLOC))
				1770	wait_for_commit = 1;
				1771	}
				1772
				1773	/*
				1774	* Once we drop t_updates, if it goes to zero the transaction
				1775	* could start committing on us and eventually disappear. So
				1776	* once we do this, we must not dereference transaction
				1777	* pointer again.
				1778	*/
				1779	tid = transaction->t_tid;
				1780	if (atomic_dec_and_test(&transaction->t_updates)) {
				1781	wake_up(&journal->j_wait_updates);
				1782	if (journal->j_barrier_count)
				1783	wake_up(&journal->j_wait_transaction_locked);
				1784	}
				1785
				1786	rwsem_release(&journal->j_trans_commit_map, 1, _THIS_IP_);
				1787
				1788	if (wait_for_commit)
				1789	err = jbd2_log_wait_commit(journal, tid);
				1790
				1791	if (handle->h_rsv_handle)
				1792	jbd2_journal_free_reserved(handle->h_rsv_handle);
				1793	free_and_exit:
				1794	/*
				1795	* Scope of the GFP_NOFS context is over here and so we can restore the
				1796	* original alloc context.
				1797	*/
				1798	memalloc_nofs_restore(handle->saved_alloc_context);
				1799	jbd2_free_handle(handle);
				1800	return err;
				1801	}
				1802
				1803	/*
				1804	*
				1805	* List management code snippets: various functions for manipulating the
				1806	* transaction buffer lists.
				1807	*
				1808	*/
				1809
				1810	/*
				1811	* Append a buffer to a transaction list, given the transaction's list head
				1812	* pointer.
				1813	*
				1814	* j_list_lock is held.
				1815	*
				1816	* jbd_lock_bh_state(jh2bh(jh)) is held.
				1817	*/
				1818
				1819	static inline void
				1820	__blist_add_buffer(struct journal_head *list, struct journal_head jh)
				1821	{
				1822	if (!*list) {
				1823	jh->b_tnext = jh->b_tprev = jh;
				1824	*list = jh;
				1825	} else {
				1826	/* Insert at the tail of the list to preserve order */
				1827	struct journal_head first = list, *last = first->b_tprev;
				1828	jh->b_tprev = last;
				1829	jh->b_tnext = first;
				1830	last->b_tnext = first->b_tprev = jh;
				1831	}
				1832	}
				1833
				1834	/*
				1835	* Remove a buffer from a transaction list, given the transaction's list
				1836	* head pointer.
				1837	*
				1838	* Called with j_list_lock held, and the journal may not be locked.
				1839	*
				1840	* jbd_lock_bh_state(jh2bh(jh)) is held.
				1841	*/
				1842
				1843	static inline void
				1844	__blist_del_buffer(struct journal_head *list, struct journal_head jh)
				1845	{
				1846	if (*list == jh) {
				1847	*list = jh->b_tnext;
				1848	if (*list == jh)
				1849	*list = NULL;
				1850	}
				1851	jh->b_tprev->b_tnext = jh->b_tnext;
				1852	jh->b_tnext->b_tprev = jh->b_tprev;
				1853	}
				1854
				1855	/*
				1856	* Remove a buffer from the appropriate transaction list.
				1857	*
				1858	* Note that this function can change the value of
				1859	* bh->b_transaction->t_buffers, t_forget, t_shadow_list, t_log_list or
				1860	* t_reserved_list. If the caller is holding onto a copy of one of these
				1861	* pointers, it could go bad. Generally the caller needs to re-read the
				1862	* pointer from the transaction_t.
				1863	*
				1864	* Called under j_list_lock.
				1865	*/
				1866	static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
				1867	{
				1868	struct journal_head **list = NULL;
				1869	transaction_t *transaction;
				1870	struct buffer_head *bh = jh2bh(jh);
				1871
				1872	J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
				1873	transaction = jh->b_transaction;
				1874	if (transaction)
				1875	assert_spin_locked(&transaction->t_journal->j_list_lock);
				1876
				1877	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
				1878	if (jh->b_jlist != BJ_None)
				1879	J_ASSERT_JH(jh, transaction != NULL);
				1880
				1881	switch (jh->b_jlist) {
				1882	case BJ_None:
				1883	return;
				1884	case BJ_Metadata:
				1885	transaction->t_nr_buffers--;
				1886	J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
				1887	list = &transaction->t_buffers;
				1888	break;
				1889	case BJ_Forget:
				1890	list = &transaction->t_forget;
				1891	break;
				1892	case BJ_Shadow:
				1893	list = &transaction->t_shadow_list;
				1894	break;
				1895	case BJ_Reserved:
				1896	list = &transaction->t_reserved_list;
				1897	break;
				1898	}
				1899
				1900	__blist_del_buffer(list, jh);
				1901	jh->b_jlist = BJ_None;
				1902	if (transaction && is_journal_aborted(transaction->t_journal))
				1903	clear_buffer_jbddirty(bh);
				1904	else if (test_clear_buffer_jbddirty(bh))
				1905	mark_buffer_dirty(bh); /* Expose it to the VM */
				1906	}
				1907
				1908	/*
				1909	* Remove buffer from all transactions.
				1910	*
				1911	* Called with bh_state lock and j_list_lock
				1912	*
				1913	* jh and bh may be already freed when this function returns.
				1914	*/
				1915	static void __jbd2_journal_unfile_buffer(struct journal_head *jh)
				1916	{
				1917	J_ASSERT_JH(jh, jh->b_transaction != NULL);
				1918	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
				1919
				1920	__jbd2_journal_temp_unlink_buffer(jh);
				1921	jh->b_transaction = NULL;
				1922	jbd2_journal_put_journal_head(jh);
				1923	}
				1924
				1925	void jbd2_journal_unfile_buffer(journal_t journal, struct journal_head jh)
				1926	{
				1927	struct buffer_head *bh = jh2bh(jh);
				1928
				1929	/* Get reference so that buffer cannot be freed before we unlock it */
				1930	get_bh(bh);
				1931	jbd_lock_bh_state(bh);
				1932	spin_lock(&journal->j_list_lock);
				1933	__jbd2_journal_unfile_buffer(jh);
				1934	spin_unlock(&journal->j_list_lock);
				1935	jbd_unlock_bh_state(bh);
				1936	__brelse(bh);
				1937	}
				1938
				1939	/*
				1940	* Called from jbd2_journal_try_to_free_buffers().
				1941	*
				1942	* Called under jbd_lock_bh_state(bh)
				1943	*/
				1944	static void
				1945	__journal_try_to_free_buffer(journal_t journal, struct buffer_head bh)
				1946	{
				1947	struct journal_head *jh;
				1948
				1949	jh = bh2jh(bh);
				1950
				1951	if (buffer_locked(bh) \|\| buffer_dirty(bh))
				1952	goto out;
				1953
				1954	if (jh->b_next_transaction != NULL \|\| jh->b_transaction != NULL)
				1955	goto out;
				1956
				1957	spin_lock(&journal->j_list_lock);
				1958	if (jh->b_cp_transaction != NULL) {
				1959	/* written-back checkpointed metadata buffer */
				1960	JBUFFER_TRACE(jh, "remove from checkpoint list");
				1961	__jbd2_journal_remove_checkpoint(jh);
				1962	}
				1963	spin_unlock(&journal->j_list_lock);
				1964	out:
				1965	return;
				1966	}
				1967
				1968	/**
				1969	* int jbd2_journal_try_to_free_buffers() - try to free page buffers.
				1970	* @journal: journal for operation
				1971	* @page: to try and free
				1972	* @gfp_mask: we use the mask to detect how hard should we try to release
				1973	* buffers. If __GFP_DIRECT_RECLAIM and __GFP_FS is set, we wait for commit
				1974	* code to release the buffers.
				1975	*
				1976	*
				1977	* For all the buffers on this page,
				1978	* if they are fully written out ordered data, move them onto BUF_CLEAN
				1979	* so try_to_free_buffers() can reap them.
				1980	*
				1981	* This function returns non-zero if we wish try_to_free_buffers()
				1982	* to be called. We do this if the page is releasable by try_to_free_buffers().
				1983	* We also do it if the page has locked or dirty buffers and the caller wants
				1984	* us to perform sync or async writeout.
				1985	*
				1986	* This complicates JBD locking somewhat. We aren't protected by the
				1987	* BKL here. We wish to remove the buffer from its committing or
				1988	* running transaction's ->t_datalist via __jbd2_journal_unfile_buffer.
				1989	*
				1990	* This may change the value of transaction_t->t_datalist, so anyone
				1991	* who looks at t_datalist needs to lock against this function.
				1992	*
				1993	* Even worse, someone may be doing a jbd2_journal_dirty_data on this
				1994	* buffer. So we need to lock against that. jbd2_journal_dirty_data()
				1995	* will come out of the lock with the buffer dirty, which makes it
				1996	* ineligible for release here.
				1997	*
				1998	* Who else is affected by this? hmm... Really the only contender
				1999	* is do_get_write_access() - it could be looking at the buffer while
				2000	* journal_try_to_free_buffer() is changing its state. But that
				2001	* cannot happen because we never reallocate freed data as metadata
				2002	* while the data is part of a transaction. Yes?
				2003	*
				2004	* Return 0 on failure, 1 on success
				2005	*/
				2006	int jbd2_journal_try_to_free_buffers(journal_t *journal,
				2007	struct page *page, gfp_t gfp_mask)
				2008	{
				2009	struct buffer_head *head;
				2010	struct buffer_head *bh;
				2011	bool has_write_io_error = false;
				2012	int ret = 0;
				2013
				2014	J_ASSERT(PageLocked(page));
				2015
				2016	head = page_buffers(page);
				2017	bh = head;
				2018	do {
				2019	struct journal_head *jh;
				2020
				2021	/*
				2022	* We take our own ref against the journal_head here to avoid
				2023	* having to add tons of locking around each instance of
				2024	* jbd2_journal_put_journal_head().
				2025	*/
				2026	jh = jbd2_journal_grab_journal_head(bh);
				2027	if (!jh)
				2028	continue;
				2029
				2030	jbd_lock_bh_state(bh);
				2031	__journal_try_to_free_buffer(journal, bh);
				2032	jbd2_journal_put_journal_head(jh);
				2033	jbd_unlock_bh_state(bh);
				2034	if (buffer_jbd(bh))
				2035	goto busy;
				2036
				2037	/*
				2038	* If we free a metadata buffer which has been failed to
				2039	* write out, the jbd2 checkpoint procedure will not detect
				2040	* this failure and may lead to filesystem inconsistency
				2041	* after cleanup journal tail.
				2042	*/
				2043	if (buffer_write_io_error(bh)) {
				2044	pr_err("JBD2: Error while async write back metadata bh %llu.",
				2045	(unsigned long long)bh->b_blocknr);
				2046	has_write_io_error = true;
				2047	}
				2048	} while ((bh = bh->b_this_page) != head);
				2049
				2050	ret = try_to_free_buffers(page);
				2051
				2052	busy:
				2053	if (has_write_io_error)
				2054	jbd2_journal_abort(journal, -EIO);
				2055
				2056	return ret;
				2057	}
				2058
				2059	/*
				2060	* This buffer is no longer needed. If it is on an older transaction's
				2061	* checkpoint list we need to record it on this transaction's forget list
				2062	* to pin this buffer (and hence its checkpointing transaction) down until
				2063	* this transaction commits. If the buffer isn't on a checkpoint list, we
				2064	* release it.
				2065	* Returns non-zero if JBD no longer has an interest in the buffer.
				2066	*
				2067	* Called under j_list_lock.
				2068	*
				2069	* Called under jbd_lock_bh_state(bh).
				2070	*/
				2071	static int __dispose_buffer(struct journal_head jh, transaction_t transaction)
				2072	{
				2073	int may_free = 1;
				2074	struct buffer_head *bh = jh2bh(jh);
				2075
				2076	if (jh->b_cp_transaction) {
				2077	JBUFFER_TRACE(jh, "on running+cp transaction");
				2078	__jbd2_journal_temp_unlink_buffer(jh);
				2079	/*
				2080	* We don't want to write the buffer anymore, clear the
				2081	* bit so that we don't confuse checks in
				2082	* __journal_file_buffer
				2083	*/
				2084	clear_buffer_dirty(bh);
				2085	__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
				2086	may_free = 0;
				2087	} else {
				2088	JBUFFER_TRACE(jh, "on running transaction");
				2089	__jbd2_journal_unfile_buffer(jh);
				2090	}
				2091	return may_free;
				2092	}
				2093
				2094	/*
				2095	* jbd2_journal_invalidatepage
				2096	*
				2097	* This code is tricky. It has a number of cases to deal with.
				2098	*
				2099	* There are two invariants which this code relies on:
				2100	*
				2101	* i_size must be updated on disk before we start calling invalidatepage on the
				2102	* data.
				2103	*
				2104	* This is done in ext3 by defining an ext3_setattr method which
				2105	* updates i_size before truncate gets going. By maintaining this
				2106	* invariant, we can be sure that it is safe to throw away any buffers
				2107	* attached to the current transaction: once the transaction commits,
				2108	* we know that the data will not be needed.
				2109	*
				2110	* Note however that we can not throw away data belonging to the
				2111	* previous, committing transaction!
				2112	*
				2113	* Any disk blocks which are part of the previous, committing
				2114	* transaction (and which therefore cannot be discarded immediately) are
				2115	* not going to be reused in the new running transaction
				2116	*
				2117	* The bitmap committed_data images guarantee this: any block which is
				2118	* allocated in one transaction and removed in the next will be marked
				2119	* as in-use in the committed_data bitmap, so cannot be reused until
				2120	* the next transaction to delete the block commits. This means that
				2121	* leaving committing buffers dirty is quite safe: the disk blocks
				2122	* cannot be reallocated to a different file and so buffer aliasing is
				2123	* not possible.
				2124	*
				2125	*
				2126	* The above applies mainly to ordered data mode. In writeback mode we
				2127	* don't make guarantees about the order in which data hits disk --- in
				2128	* particular we don't guarantee that new dirty data is flushed before
				2129	* transaction commit --- so it is always safe just to discard data
				2130	* immediately in that mode. --sct
				2131	*/
				2132
				2133	/*
				2134	* The journal_unmap_buffer helper function returns zero if the buffer
				2135	* concerned remains pinned as an anonymous buffer belonging to an older
				2136	* transaction.
				2137	*
				2138	* We're outside-transaction here. Either or both of j_running_transaction
				2139	* and j_committing_transaction may be NULL.
				2140	*/
				2141	static int journal_unmap_buffer(journal_t journal, struct buffer_head bh,
				2142	int partial_page)
				2143	{
				2144	transaction_t *transaction;
				2145	struct journal_head *jh;
				2146	int may_free = 1;
				2147
				2148	BUFFER_TRACE(bh, "entry");
				2149
				2150	/*
				2151	* It is safe to proceed here without the j_list_lock because the
				2152	* buffers cannot be stolen by try_to_free_buffers as long as we are
				2153	* holding the page lock. --sct
				2154	*/
				2155
				2156	if (!buffer_jbd(bh))
				2157	goto zap_buffer_unlocked;
				2158
				2159	/* OK, we have data buffer in journaled mode */
				2160	write_lock(&journal->j_state_lock);
				2161	jbd_lock_bh_state(bh);
				2162	spin_lock(&journal->j_list_lock);
				2163
				2164	jh = jbd2_journal_grab_journal_head(bh);
				2165	if (!jh)
				2166	goto zap_buffer_no_jh;
				2167
				2168	/*
				2169	* We cannot remove the buffer from checkpoint lists until the
				2170	* transaction adding inode to orphan list (let's call it T)
				2171	* is committed. Otherwise if the transaction changing the
				2172	* buffer would be cleaned from the journal before T is
				2173	* committed, a crash will cause that the correct contents of
				2174	* the buffer will be lost. On the other hand we have to
				2175	* clear the buffer dirty bit at latest at the moment when the
				2176	* transaction marking the buffer as freed in the filesystem
				2177	* structures is committed because from that moment on the
				2178	* block can be reallocated and used by a different page.
				2179	* Since the block hasn't been freed yet but the inode has
				2180	* already been added to orphan list, it is safe for us to add
				2181	* the buffer to BJ_Forget list of the newest transaction.
				2182	*
				2183	* Also we have to clear buffer_mapped flag of a truncated buffer
				2184	* because the buffer_head may be attached to the page straddling
				2185	* i_size (can happen only when blocksize < pagesize) and thus the
				2186	* buffer_head can be reused when the file is extended again. So we end
				2187	* up keeping around invalidated buffers attached to transactions'
				2188	* BJ_Forget list just to stop checkpointing code from cleaning up
				2189	* the transaction this buffer was modified in.
				2190	*/
				2191	transaction = jh->b_transaction;
				2192	if (transaction == NULL) {
				2193	/* First case: not on any transaction. If it
				2194	* has no checkpoint link, then we can zap it:
				2195	* it's a writeback-mode buffer so we don't care
				2196	* if it hits disk safely. */
				2197	if (!jh->b_cp_transaction) {
				2198	JBUFFER_TRACE(jh, "not on any transaction: zap");
				2199	goto zap_buffer;
				2200	}
				2201
				2202	if (!buffer_dirty(bh)) {
				2203	/* bdflush has written it. We can drop it now */
				2204	__jbd2_journal_remove_checkpoint(jh);
				2205	goto zap_buffer;
				2206	}
				2207
				2208	/* OK, it must be in the journal but still not
				2209	* written fully to disk: it's metadata or
				2210	* journaled data... */
				2211
				2212	if (journal->j_running_transaction) {
				2213	/* ... and once the current transaction has
				2214	* committed, the buffer won't be needed any
				2215	* longer. */
				2216	JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
				2217	may_free = __dispose_buffer(jh,
				2218	journal->j_running_transaction);
				2219	goto zap_buffer;
				2220	} else {
				2221	/* There is no currently-running transaction. So the
				2222	* orphan record which we wrote for this file must have
				2223	* passed into commit. We must attach this buffer to
				2224	* the committing transaction, if it exists. */
				2225	if (journal->j_committing_transaction) {
				2226	JBUFFER_TRACE(jh, "give to committing trans");
				2227	may_free = __dispose_buffer(jh,
				2228	journal->j_committing_transaction);
				2229	goto zap_buffer;
				2230	} else {
				2231	/* The orphan record's transaction has
				2232	* committed. We can cleanse this buffer */
				2233	clear_buffer_jbddirty(bh);
				2234	__jbd2_journal_remove_checkpoint(jh);
				2235	goto zap_buffer;
				2236	}
				2237	}
				2238	} else if (transaction == journal->j_committing_transaction) {
				2239	JBUFFER_TRACE(jh, "on committing transaction");
				2240	/*
				2241	* The buffer is committing, we simply cannot touch
				2242	* it. If the page is straddling i_size we have to wait
				2243	* for commit and try again.
				2244	*/
				2245	if (partial_page) {
				2246	jbd2_journal_put_journal_head(jh);
				2247	spin_unlock(&journal->j_list_lock);
				2248	jbd_unlock_bh_state(bh);
				2249	write_unlock(&journal->j_state_lock);
				2250	return -EBUSY;
				2251	}
				2252	/*
				2253	* OK, buffer won't be reachable after truncate. We just clear
				2254	* b_modified to not confuse transaction credit accounting, and
				2255	* set j_next_transaction to the running transaction (if there
				2256	* is one) and mark buffer as freed so that commit code knows
				2257	* it should clear dirty bits when it is done with the buffer.
				2258	*/
				2259	set_buffer_freed(bh);
				2260	if (journal->j_running_transaction && buffer_jbddirty(bh))
				2261	jh->b_next_transaction = journal->j_running_transaction;
				2262	jh->b_modified = 0;
				2263	jbd2_journal_put_journal_head(jh);
				2264	spin_unlock(&journal->j_list_lock);
				2265	jbd_unlock_bh_state(bh);
				2266	write_unlock(&journal->j_state_lock);
				2267	return 0;
				2268	} else {
				2269	/* Good, the buffer belongs to the running transaction.
				2270	* We are writing our own transaction's data, not any
				2271	* previous one's, so it is safe to throw it away
				2272	* (remember that we expect the filesystem to have set
				2273	* i_size already for this truncate so recovery will not
				2274	* expose the disk blocks we are discarding here.) */
				2275	J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
				2276	JBUFFER_TRACE(jh, "on running transaction");
				2277	may_free = __dispose_buffer(jh, transaction);
				2278	}
				2279
				2280	zap_buffer:
				2281	/*
				2282	* This is tricky. Although the buffer is truncated, it may be reused
				2283	* if blocksize < pagesize and it is attached to the page straddling
				2284	* EOF. Since the buffer might have been added to BJ_Forget list of the
				2285	* running transaction, journal_get_write_access() won't clear
				2286	* b_modified and credit accounting gets confused. So clear b_modified
				2287	* here.
				2288	*/
				2289	jh->b_modified = 0;
				2290	jbd2_journal_put_journal_head(jh);
				2291	zap_buffer_no_jh:
				2292	spin_unlock(&journal->j_list_lock);
				2293	jbd_unlock_bh_state(bh);
				2294	write_unlock(&journal->j_state_lock);
				2295	zap_buffer_unlocked:
				2296	clear_buffer_dirty(bh);
				2297	J_ASSERT_BH(bh, !buffer_jbddirty(bh));
				2298	clear_buffer_mapped(bh);
				2299	clear_buffer_req(bh);
				2300	clear_buffer_new(bh);
				2301	clear_buffer_delay(bh);
				2302	clear_buffer_unwritten(bh);
				2303	bh->b_bdev = NULL;
				2304	return may_free;
				2305	}
				2306
				2307	/**
				2308	* void jbd2_journal_invalidatepage()
				2309	* @journal: journal to use for flush...
				2310	* @page: page to flush
				2311	* @offset: start of the range to invalidate
				2312	* @length: length of the range to invalidate
				2313	*
				2314	* Reap page buffers containing data in the specified range in page.
				2315	* Can return -EBUSY if buffers are part of the committing transaction and
				2316	* the page is straddling i_size. Caller then has to wait for current commit
				2317	* and try again.
				2318	*/
				2319	int jbd2_journal_invalidatepage(journal_t *journal,
				2320	struct page *page,
				2321	unsigned int offset,
				2322	unsigned int length)
				2323	{
				2324	struct buffer_head head, bh, *next;
				2325	unsigned int stop = offset + length;
				2326	unsigned int curr_off = 0;
				2327	int partial_page = (offset \|\| length < PAGE_SIZE);
				2328	int may_free = 1;
				2329	int ret = 0;
				2330
				2331	if (!PageLocked(page))
				2332	BUG();
				2333	if (!page_has_buffers(page))
				2334	return 0;
				2335
				2336	BUG_ON(stop > PAGE_SIZE \|\| stop < length);
				2337
				2338	/* We will potentially be playing with lists other than just the
				2339	* data lists (especially for journaled data mode), so be
				2340	* cautious in our locking. */
				2341
				2342	head = bh = page_buffers(page);
				2343	do {
				2344	unsigned int next_off = curr_off + bh->b_size;
				2345	next = bh->b_this_page;
				2346
				2347	if (next_off > stop)
				2348	return 0;
				2349
				2350	if (offset <= curr_off) {
				2351	/* This block is wholly outside the truncation point */
				2352	lock_buffer(bh);
				2353	ret = journal_unmap_buffer(journal, bh, partial_page);
				2354	unlock_buffer(bh);
				2355	if (ret < 0)
				2356	return ret;
				2357	may_free &= ret;
				2358	}
				2359	curr_off = next_off;
				2360	bh = next;
				2361
				2362	} while (bh != head);
				2363
				2364	if (!partial_page) {
				2365	if (may_free && try_to_free_buffers(page))
				2366	J_ASSERT(!page_has_buffers(page));
				2367	}
				2368	return 0;
				2369	}
				2370
				2371	/*
				2372	* File a buffer on the given transaction list.
				2373	*/
				2374	void __jbd2_journal_file_buffer(struct journal_head *jh,
				2375	transaction_t *transaction, int jlist)
				2376	{
				2377	struct journal_head **list = NULL;
				2378	int was_dirty = 0;
				2379	struct buffer_head *bh = jh2bh(jh);
				2380
				2381	J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
				2382	assert_spin_locked(&transaction->t_journal->j_list_lock);
				2383
				2384	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
				2385	J_ASSERT_JH(jh, jh->b_transaction == transaction \|\|
				2386	jh->b_transaction == NULL);
				2387
				2388	if (jh->b_transaction && jh->b_jlist == jlist)
				2389	return;
				2390
				2391	if (jlist == BJ_Metadata \|\| jlist == BJ_Reserved \|\|
				2392	jlist == BJ_Shadow \|\| jlist == BJ_Forget) {
				2393	/*
				2394	* For metadata buffers, we track dirty bit in buffer_jbddirty
				2395	* instead of buffer_dirty. We should not see a dirty bit set
				2396	* here because we clear it in do_get_write_access but e.g.
				2397	* tune2fs can modify the sb and set the dirty bit at any time
				2398	* so we try to gracefully handle that.
				2399	*/
				2400	if (buffer_dirty(bh))
				2401	warn_dirty_buffer(bh);
				2402	if (test_clear_buffer_dirty(bh) \|\|
				2403	test_clear_buffer_jbddirty(bh))
				2404	was_dirty = 1;
				2405	}
				2406
				2407	if (jh->b_transaction)
				2408	__jbd2_journal_temp_unlink_buffer(jh);
				2409	else
				2410	jbd2_journal_grab_journal_head(bh);
				2411	jh->b_transaction = transaction;
				2412
				2413	switch (jlist) {
				2414	case BJ_None:
				2415	J_ASSERT_JH(jh, !jh->b_committed_data);
				2416	J_ASSERT_JH(jh, !jh->b_frozen_data);
				2417	return;
				2418	case BJ_Metadata:
				2419	transaction->t_nr_buffers++;
				2420	list = &transaction->t_buffers;
				2421	break;
				2422	case BJ_Forget:
				2423	list = &transaction->t_forget;
				2424	break;
				2425	case BJ_Shadow:
				2426	list = &transaction->t_shadow_list;
				2427	break;
				2428	case BJ_Reserved:
				2429	list = &transaction->t_reserved_list;
				2430	break;
				2431	}
				2432
				2433	__blist_add_buffer(list, jh);
				2434	jh->b_jlist = jlist;
				2435
				2436	if (was_dirty)
				2437	set_buffer_jbddirty(bh);
				2438	}
				2439
				2440	void jbd2_journal_file_buffer(struct journal_head *jh,
				2441	transaction_t *transaction, int jlist)
				2442	{
				2443	jbd_lock_bh_state(jh2bh(jh));
				2444	spin_lock(&transaction->t_journal->j_list_lock);
				2445	__jbd2_journal_file_buffer(jh, transaction, jlist);
				2446	spin_unlock(&transaction->t_journal->j_list_lock);
				2447	jbd_unlock_bh_state(jh2bh(jh));
				2448	}
				2449
				2450	/*
				2451	* Remove a buffer from its current buffer list in preparation for
				2452	* dropping it from its current transaction entirely. If the buffer has
				2453	* already started to be used by a subsequent transaction, refile the
				2454	* buffer on that transaction's metadata list.
				2455	*
				2456	* Called under j_list_lock
				2457	* Called under jbd_lock_bh_state(jh2bh(jh))
				2458	*
				2459	* jh and bh may be already free when this function returns
				2460	*/
				2461	void __jbd2_journal_refile_buffer(struct journal_head *jh)
				2462	{
				2463	int was_dirty, jlist;
				2464	struct buffer_head *bh = jh2bh(jh);
				2465
				2466	J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
				2467	if (jh->b_transaction)
				2468	assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);
				2469
				2470	/* If the buffer is now unused, just drop it. */
				2471	if (jh->b_next_transaction == NULL) {
				2472	__jbd2_journal_unfile_buffer(jh);
				2473	return;
				2474	}
				2475
				2476	/*
				2477	* It has been modified by a later transaction: add it to the new
				2478	* transaction's metadata list.
				2479	*/
				2480
				2481	was_dirty = test_clear_buffer_jbddirty(bh);
				2482	__jbd2_journal_temp_unlink_buffer(jh);
				2483
				2484	/*
				2485	* b_transaction must be set, otherwise the new b_transaction won't
				2486	* be holding jh reference
				2487	*/
				2488	J_ASSERT_JH(jh, jh->b_transaction != NULL);
				2489
				2490	/*
				2491	* We set b_transaction here because b_next_transaction will inherit
				2492	* our jh reference and thus __jbd2_journal_file_buffer() must not
				2493	* take a new one.
				2494	*/
				2495	WRITE_ONCE(jh->b_transaction, jh->b_next_transaction);
				2496	WRITE_ONCE(jh->b_next_transaction, NULL);
				2497	if (buffer_freed(bh))
				2498	jlist = BJ_Forget;
				2499	else if (jh->b_modified)
				2500	jlist = BJ_Metadata;
				2501	else
				2502	jlist = BJ_Reserved;
				2503	__jbd2_journal_file_buffer(jh, jh->b_transaction, jlist);
				2504	J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
				2505
				2506	if (was_dirty)
				2507	set_buffer_jbddirty(bh);
				2508	}
				2509
				2510	/*
				2511	* __jbd2_journal_refile_buffer() with necessary locking added. We take our
				2512	* bh reference so that we can safely unlock bh.
				2513	*
				2514	* The jh and bh may be freed by this call.
				2515	*/
				2516	void jbd2_journal_refile_buffer(journal_t journal, struct journal_head jh)
				2517	{
				2518	struct buffer_head *bh = jh2bh(jh);
				2519
				2520	/* Get reference so that buffer cannot be freed before we unlock it */
				2521	get_bh(bh);
				2522	jbd_lock_bh_state(bh);
				2523	spin_lock(&journal->j_list_lock);
				2524	__jbd2_journal_refile_buffer(jh);
				2525	jbd_unlock_bh_state(bh);
				2526	spin_unlock(&journal->j_list_lock);
				2527	__brelse(bh);
				2528	}
				2529
				2530	/*
				2531	* File inode in the inode list of the handle's transaction
				2532	*/
				2533	static int jbd2_journal_file_inode(handle_t handle, struct jbd2_inode jinode,
				2534	unsigned long flags, loff_t start_byte, loff_t end_byte)
				2535	{
				2536	transaction_t *transaction = handle->h_transaction;
				2537	journal_t *journal;
				2538
				2539	if (is_handle_aborted(handle))
				2540	return -EROFS;
				2541	journal = transaction->t_journal;
				2542
				2543	jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
				2544	transaction->t_tid);
				2545
				2546	spin_lock(&journal->j_list_lock);
				2547	jinode->i_flags \|= flags;
				2548
				2549	if (jinode->i_dirty_end) {
				2550	jinode->i_dirty_start = min(jinode->i_dirty_start, start_byte);
				2551	jinode->i_dirty_end = max(jinode->i_dirty_end, end_byte);
				2552	} else {
				2553	jinode->i_dirty_start = start_byte;
				2554	jinode->i_dirty_end = end_byte;
				2555	}
				2556
				2557	/* Is inode already attached where we need it? */
				2558	if (jinode->i_transaction == transaction \|\|
				2559	jinode->i_next_transaction == transaction)
				2560	goto done;
				2561
				2562	/*
				2563	* We only ever set this variable to 1 so the test is safe. Since
				2564	* t_need_data_flush is likely to be set, we do the test to save some
				2565	* cacheline bouncing
				2566	*/
				2567	if (!transaction->t_need_data_flush)
				2568	transaction->t_need_data_flush = 1;
				2569	/* On some different transaction's list - should be
				2570	* the committing one */
				2571	if (jinode->i_transaction) {
				2572	J_ASSERT(jinode->i_next_transaction == NULL);
				2573	J_ASSERT(jinode->i_transaction ==
				2574	journal->j_committing_transaction);
				2575	jinode->i_next_transaction = transaction;
				2576	goto done;
				2577	}
				2578	/* Not on any transaction list... */
				2579	J_ASSERT(!jinode->i_next_transaction);
				2580	jinode->i_transaction = transaction;
				2581	list_add(&jinode->i_list, &transaction->t_inode_list);
				2582	done:
				2583	spin_unlock(&journal->j_list_lock);
				2584
				2585	return 0;
				2586	}
				2587
				2588	int jbd2_journal_inode_add_write(handle_t handle, struct jbd2_inode jinode)
				2589	{
				2590	return jbd2_journal_file_inode(handle, jinode,
				2591	JI_WRITE_DATA \| JI_WAIT_DATA, 0, LLONG_MAX);
				2592	}
				2593
				2594	int jbd2_journal_inode_add_wait(handle_t handle, struct jbd2_inode jinode)
				2595	{
				2596	return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA, 0,
				2597	LLONG_MAX);
				2598	}
				2599
				2600	int jbd2_journal_inode_ranged_write(handle_t *handle,
				2601	struct jbd2_inode *jinode, loff_t start_byte, loff_t length)
				2602	{
				2603	return jbd2_journal_file_inode(handle, jinode,
				2604	JI_WRITE_DATA \| JI_WAIT_DATA, start_byte,
				2605	start_byte + length - 1);
				2606	}
				2607
				2608	int jbd2_journal_inode_ranged_wait(handle_t handle, struct jbd2_inode jinode,
				2609	loff_t start_byte, loff_t length)
				2610	{
				2611	return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA,
				2612	start_byte, start_byte + length - 1);
				2613	}
				2614
				2615	/*
				2616	* File truncate and transaction commit interact with each other in a
				2617	* non-trivial way. If a transaction writing data block A is
				2618	* committing, we cannot discard the data by truncate until we have
				2619	* written them. Otherwise if we crashed after the transaction with
				2620	* write has committed but before the transaction with truncate has
				2621	* committed, we could see stale data in block A. This function is a
				2622	* helper to solve this problem. It starts writeout of the truncated
				2623	* part in case it is in the committing transaction.
				2624	*
				2625	* Filesystem code must call this function when inode is journaled in
				2626	* ordered mode before truncation happens and after the inode has been
				2627	* placed on orphan list with the new inode size. The second condition
				2628	* avoids the race that someone writes new data and we start
				2629	* committing the transaction after this function has been called but
				2630	* before a transaction for truncate is started (and furthermore it
				2631	* allows us to optimize the case where the addition to orphan list
				2632	* happens in the same transaction as write --- we don't have to write
				2633	* any data in such case).
				2634	*/
				2635	int jbd2_journal_begin_ordered_truncate(journal_t *journal,
				2636	struct jbd2_inode *jinode,
				2637	loff_t new_size)
				2638	{
				2639	transaction_t inode_trans, commit_trans;
				2640	int ret = 0;
				2641
				2642	/* This is a quick check to avoid locking if not necessary */
				2643	if (!jinode->i_transaction)
				2644	goto out;
				2645	/* Locks are here just to force reading of recent values, it is
				2646	* enough that the transaction was not committing before we started
				2647	* a transaction adding the inode to orphan list */
				2648	read_lock(&journal->j_state_lock);
				2649	commit_trans = journal->j_committing_transaction;
				2650	read_unlock(&journal->j_state_lock);
				2651	spin_lock(&journal->j_list_lock);
				2652	inode_trans = jinode->i_transaction;
				2653	spin_unlock(&journal->j_list_lock);
				2654	if (inode_trans == commit_trans) {
				2655	ret = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping,
				2656	new_size, LLONG_MAX);
				2657	if (ret)
				2658	jbd2_journal_abort(journal, ret);
				2659	}
				2660	out:
				2661	return ret;
				2662	}