Blame - marvell/linux/fs/jbd2/journal.c - T108

blob: 763f6db659a1cde73765138eb2bd5e443f8cabc6 [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0+
				2	/*
				3	* linux/fs/jbd2/journal.c
				4	*
				5	* Written by Stephen C. Tweedie <sct@redhat.com>, 1998
				6	*
				7	* Copyright 1998 Red Hat corp --- All Rights Reserved
				8	*
				9	* Generic filesystem journal-writing code; part of the ext2fs
				10	* journaling system.
				11	*
				12	* This file manages journals: areas of disk reserved for logging
				13	* transactional updates. This includes the kernel journaling thread
				14	* which is responsible for scheduling updates to the log.
				15	*
				16	* We do not actually manage the physical storage of the journal in this
				17	* file: that is left to a per-journal policy function, which allows us
				18	* to store the journal within a filesystem-specified area for ext2
				19	* journaling (ext2 can use a reserved inode for storing the log).
				20	*/
				21
				22	#include <linux/module.h>
				23	#include <linux/time.h>
				24	#include <linux/fs.h>
				25	#include <linux/jbd2.h>
				26	#include <linux/errno.h>
				27	#include <linux/slab.h>
				28	#include <linux/init.h>
				29	#include <linux/mm.h>
				30	#include <linux/freezer.h>
				31	#include <linux/pagemap.h>
				32	#include <linux/kthread.h>
				33	#include <linux/poison.h>
				34	#include <linux/proc_fs.h>
				35	#include <linux/seq_file.h>
				36	#include <linux/math64.h>
				37	#include <linux/hash.h>
				38	#include <linux/log2.h>
				39	#include <linux/vmalloc.h>
				40	#include <linux/backing-dev.h>
				41	#include <linux/bitops.h>
				42	#include <linux/ratelimit.h>
				43	#include <linux/sched/mm.h>
				44
				45	#define CREATE_TRACE_POINTS
				46	#include <trace/events/jbd2.h>
				47
				48	#include <linux/uaccess.h>
				49	#include <asm/page.h>
				50
				51	#ifdef CONFIG_JBD2_DEBUG
				/*
				 * Debug verbosity threshold. It is exposed as the "jbd2_debug" module
				 * parameter (ushort, mode 0644) and exported for use outside this file.
				 * __jbd2_debug() drops any message whose level is greater than this value.
				 */
				52	ushort jbd2_journal_enable_debug __read_mostly;
				53	EXPORT_SYMBOL(jbd2_journal_enable_debug);
				54
				55	module_param_named(jbd2_debug, jbd2_journal_enable_debug, ushort, 0644);
				56	MODULE_PARM_DESC(jbd2_debug, "Debugging level for jbd2");
				57	#endif
				58
				/*
				 * Symbols exported to other kernel code. Many of the names listed here
				 * are not defined in the part of the file visible alongside this list.
				 */
				59	EXPORT_SYMBOL(jbd2_journal_extend);
				60	EXPORT_SYMBOL(jbd2_journal_stop);
				61	EXPORT_SYMBOL(jbd2_journal_lock_updates);
				62	EXPORT_SYMBOL(jbd2_journal_unlock_updates);
				63	EXPORT_SYMBOL(jbd2_journal_get_write_access);
				64	EXPORT_SYMBOL(jbd2_journal_get_create_access);
				65	EXPORT_SYMBOL(jbd2_journal_get_undo_access);
				66	EXPORT_SYMBOL(jbd2_journal_set_triggers);
				67	EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
				68	EXPORT_SYMBOL(jbd2_journal_forget);
				69	EXPORT_SYMBOL(jbd2_journal_flush);
				70	EXPORT_SYMBOL(jbd2_journal_revoke);
				71
				72	EXPORT_SYMBOL(jbd2_journal_init_dev);
				73	EXPORT_SYMBOL(jbd2_journal_init_inode);
				74	EXPORT_SYMBOL(jbd2_journal_check_used_features);
				75	EXPORT_SYMBOL(jbd2_journal_check_available_features);
				76	EXPORT_SYMBOL(jbd2_journal_set_features);
				77	EXPORT_SYMBOL(jbd2_journal_load);
				78	EXPORT_SYMBOL(jbd2_journal_destroy);
				79	EXPORT_SYMBOL(jbd2_journal_abort);
				80	EXPORT_SYMBOL(jbd2_journal_errno);
				81	EXPORT_SYMBOL(jbd2_journal_ack_err);
				82	EXPORT_SYMBOL(jbd2_journal_clear_err);
				83	EXPORT_SYMBOL(jbd2_log_wait_commit);
				84	EXPORT_SYMBOL(jbd2_log_start_commit);
				85	EXPORT_SYMBOL(jbd2_journal_start_commit);
				86	EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
				87	EXPORT_SYMBOL(jbd2_journal_wipe);
				88	EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
				89	EXPORT_SYMBOL(jbd2_journal_invalidatepage);
				90	EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
				91	EXPORT_SYMBOL(jbd2_journal_force_commit);
				92	EXPORT_SYMBOL(jbd2_journal_inode_ranged_write);
				93	EXPORT_SYMBOL(jbd2_journal_inode_ranged_wait);
				94	EXPORT_SYMBOL(jbd2_journal_submit_inode_data_buffers);
				95	EXPORT_SYMBOL(jbd2_journal_finish_inode_data_buffers);
				96	EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
				97	EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
				98	EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
				99	EXPORT_SYMBOL(jbd2_inode_cache);
				100
				/* Forward declaration; the definition is not in the visible part of the file. */
				101	static int jbd2_journal_create_slab(size_t slab_size);
				102
				103	#ifdef CONFIG_JBD2_DEBUG
				104	void __jbd2_debug(int level, const char file, const char func,
				105	unsigned int line, const char *fmt, ...)
				106	{
				107	struct va_format vaf;
				108	va_list args;
				109
				110	if (level > jbd2_journal_enable_debug)
				111	return;
				112	va_start(args, fmt);
				113	vaf.fmt = fmt;
				114	vaf.va = &args;
				115	printk(KERN_DEBUG "%s: (%s, %u): %pV", file, func, line, &vaf);
				116	va_end(args);
				117	}
				118	EXPORT_SYMBOL(__jbd2_debug);
				119	#endif
				120
				121	/* Checksumming functions */
				122	static int jbd2_verify_csum_type(journal_t j, journal_superblock_t sb)
				123	{
				124	if (!jbd2_journal_has_csum_v2or3_feature(j))
				125	return 1;
				126
				127	return sb->s_checksum_type == JBD2_CRC32C_CHKSUM;
				128	}
				129
				130	static __be32 jbd2_superblock_csum(journal_t j, journal_superblock_t sb)
				131	{
				132	__u32 csum;
				133	__be32 old_csum;
				134
				135	old_csum = sb->s_checksum;
				136	sb->s_checksum = 0;
				137	csum = jbd2_chksum(j, ~0, (char *)sb, sizeof(journal_superblock_t));
				138	sb->s_checksum = old_csum;
				139
				140	return cpu_to_be32(csum);
				141	}
				142
				143	/*
				144	* Helper function used to manage commit timeouts
				145	*/
				146
				147	static void commit_timeout(struct timer_list *t)
				148	{
				149	journal_t *journal = from_timer(journal, t, j_commit_timer);
				150
				151	wake_up_process(journal->j_task);
				152	}
				153
				154	/*
				155	* kjournald2: The main thread function used to manage a logging device
				156	* journal.
				157	*
				158	* This kernel thread is responsible for two things:
				159	*
				160	* 1) COMMIT: Every so often we need to commit the current state of the
				161	* filesystem to disk. The journal thread is responsible for writing
				162	* all of the metadata buffers to disk.
				163	*
				164	* 2) CHECKPOINT: We cannot reuse a used section of the log file until all
				165	* of the data in that part of the log has been rewritten elsewhere on
				166	* the disk. Flushing these old buffers to reclaim space in the log is
				167	* known as checkpointing, and this thread is responsible for that job.
				168	*/
				169
				170	static int kjournald2(void *arg)
				171	{
				172	journal_t *journal = arg;
				173	transaction_t *transaction;
				174
				175	/*
				176	* Set up an interval timer which can be used to trigger a commit wakeup
				177	* after the commit interval expires
				178	*/
				179	timer_setup(&journal->j_commit_timer, commit_timeout, 0);
				180
				181	set_freezable();
				182
				183	/* Record that the journal thread is running */
				184	journal->j_task = current;
				185	wake_up(&journal->j_wait_done_commit);
				186
				187	/*
				188	* Make sure that no allocations from this kernel thread will ever
				189	* recurse to the fs layer because we are responsible for the
				190	* transaction commit and any fs involvement might get stuck waiting for
				191	* the trasn. commit.
				192	*/
				193	memalloc_nofs_save();
				194
				195	/*
				196	* And now, wait forever for commit wakeup events.
				197	*/
				198	write_lock(&journal->j_state_lock);
				199
				200	loop:
				201	if (journal->j_flags & JBD2_UNMOUNT)
				202	goto end_loop;
				203
				204	jbd_debug(1, "commit_sequence=%u, commit_request=%u\n",
				205	journal->j_commit_sequence, journal->j_commit_request);
				206
				207	if (journal->j_commit_sequence != journal->j_commit_request) {
				208	jbd_debug(1, "OK, requests differ\n");
				209	write_unlock(&journal->j_state_lock);
				210	del_timer_sync(&journal->j_commit_timer);
				211	jbd2_journal_commit_transaction(journal);
				212	write_lock(&journal->j_state_lock);
				213	goto loop;
				214	}
				215
				216	wake_up(&journal->j_wait_done_commit);
				217	if (freezing(current)) {
				218	/*
				219	* The simpler the better. Flushing journal isn't a
				220	* good idea, because that depends on threads that may
				221	* be already stopped.
				222	*/
				223	jbd_debug(1, "Now suspending kjournald2\n");
				224	write_unlock(&journal->j_state_lock);
				225	try_to_freeze();
				226	write_lock(&journal->j_state_lock);
				227	} else {
				228	/*
				229	* We assume on resume that commits are already there,
				230	* so we don't sleep
				231	*/
				232	DEFINE_WAIT(wait);
				233	int should_sleep = 1;
				234
				235	prepare_to_wait(&journal->j_wait_commit, &wait,
				236	TASK_INTERRUPTIBLE);
				237	if (journal->j_commit_sequence != journal->j_commit_request)
				238	should_sleep = 0;
				239	transaction = journal->j_running_transaction;
				240	if (transaction && time_after_eq(jiffies,
				241	transaction->t_expires))
				242	should_sleep = 0;
				243	if (journal->j_flags & JBD2_UNMOUNT)
				244	should_sleep = 0;
				245	if (should_sleep) {
				246	write_unlock(&journal->j_state_lock);
				247	schedule();
				248	write_lock(&journal->j_state_lock);
				249	}
				250	finish_wait(&journal->j_wait_commit, &wait);
				251	}
				252
				253	jbd_debug(1, "kjournald2 wakes\n");
				254
				255	/*
				256	* Were we woken up by a commit wakeup event?
				257	*/
				258	transaction = journal->j_running_transaction;
				259	if (transaction && time_after_eq(jiffies, transaction->t_expires)) {
				260	journal->j_commit_request = transaction->t_tid;
				261	jbd_debug(1, "woke because of timeout\n");
				262	}
				263	goto loop;
				264
				265	end_loop:
				266	del_timer_sync(&journal->j_commit_timer);
				267	journal->j_task = NULL;
				268	wake_up(&journal->j_wait_done_commit);
				269	jbd_debug(1, "Journal thread exiting.\n");
				270	write_unlock(&journal->j_state_lock);
				271	return 0;
				272	}
				273
				274	static int jbd2_journal_start_thread(journal_t *journal)
				275	{
				276	struct task_struct *t;
				277
				278	t = kthread_run(kjournald2, journal, "jbd2/%s",
				279	journal->j_devname);
				280	if (IS_ERR(t))
				281	return PTR_ERR(t);
				282
				283	wait_event(journal->j_wait_done_commit, journal->j_task != NULL);
				284	return 0;
				285	}
				286
				287	static void journal_kill_thread(journal_t *journal)
				288	{
				289	write_lock(&journal->j_state_lock);
				290	journal->j_flags \|= JBD2_UNMOUNT;
				291
				292	while (journal->j_task) {
				293	write_unlock(&journal->j_state_lock);
				294	wake_up(&journal->j_wait_commit);
				295	wait_event(journal->j_wait_done_commit, journal->j_task == NULL);
				296	write_lock(&journal->j_state_lock);
				297	}
				298	write_unlock(&journal->j_state_lock);
				299	}
				300
				301	/*
				302	* jbd2_journal_write_metadata_buffer: write a metadata buffer to the journal.
				303	*
				304	* Writes a metadata buffer to a given disk block. The actual IO is not
				305	* performed but a new buffer_head is constructed which labels the data
				306	* to be written with the correct destination disk block.
				307	*
				308	* Any magic-number escaping which needs to be done will cause a
				309	* copy-out here. If the buffer happens to start with the
				310	* JBD2_MAGIC_NUMBER, then we can't write it to the log directly: the
				311	* magic number is only written to the log for descripter blocks. In
				312	* this case, we copy the data and replace the first word with 0, and we
				313	* return a result code which indicates that this buffer needs to be
				314	* marked as an escaped buffer in the corresponding log descriptor
				315	* block. The missing word can then be restored when the block is read
				316	* during recovery.
				317	*
				318	* If the source buffer has already been modified by a new transaction
				319	* since we took the last commit snapshot, we use the frozen copy of
				320	* that data for IO. If we end up using the existing buffer_head's data
				321	* for the write, then we have to make sure nobody modifies it while the
				322	* IO is in progress. do_get_write_access() handles this.
				323	*
				324	* The function returns a pointer to the buffer_head to be used for IO.
				325	*
				326	*
				327	* Return value:
				328	* <0: Error
				329	* >=0: Finished OK
				330	*
				331	* On success:
				332	* Bit 0 set == escape performed on the data
				333	* Bit 1 set == buffer copy-out performed (kfree the data after IO)
				334	*/
				335
				336	int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
				337	struct journal_head *jh_in,
				338	struct buffer_head **bh_out,
				339	sector_t blocknr)
				340	{
				341	int need_copy_out = 0;
				342	int done_copy_out = 0;
				343	int do_escape = 0;
				344	char *mapped_data;
				345	struct buffer_head *new_bh;
				346	struct page *new_page;
				347	unsigned int new_offset;
				348	struct buffer_head *bh_in = jh2bh(jh_in);
				349	journal_t *journal = transaction->t_journal;
				350
				351	/*
				352	* The buffer really shouldn't be locked: only the current committing
				353	* transaction is allowed to write it, so nobody else is allowed
				354	* to do any IO.
				355	*
				356	* akpm: except if we're journalling data, and write() output is
				357	* also part of a shared mapping, and another thread has
				358	* decided to launch a writepage() against this buffer.
				359	*/
				360	J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
				361
				362	new_bh = alloc_buffer_head(GFP_NOFS\|__GFP_NOFAIL);
				363
				364	/* keep subsequent assertions sane */
				365	atomic_set(&new_bh->b_count, 1);
				366
				367	jbd_lock_bh_state(bh_in);
				368	repeat:
				369	/*
				370	* If a new transaction has already done a buffer copy-out, then
				371	* we use that version of the data for the commit.
				372	*/
				373	if (jh_in->b_frozen_data) {
				374	done_copy_out = 1;
				375	new_page = virt_to_page(jh_in->b_frozen_data);
				376	new_offset = offset_in_page(jh_in->b_frozen_data);
				377	} else {
				378	new_page = jh2bh(jh_in)->b_page;
				379	new_offset = offset_in_page(jh2bh(jh_in)->b_data);
				380	}
				381
				382	mapped_data = kmap_atomic(new_page);
				383	/*
				384	* Fire data frozen trigger if data already wasn't frozen. Do this
				385	* before checking for escaping, as the trigger may modify the magic
				386	* offset. If a copy-out happens afterwards, it will have the correct
				387	* data in the buffer.
				388	*/
				389	if (!done_copy_out)
				390	jbd2_buffer_frozen_trigger(jh_in, mapped_data + new_offset,
				391	jh_in->b_triggers);
				392
				393	/*
				394	* Check for escaping
				395	*/
				396	if (((__be32 )(mapped_data + new_offset)) ==
				397	cpu_to_be32(JBD2_MAGIC_NUMBER)) {
				398	need_copy_out = 1;
				399	do_escape = 1;
				400	}
				401	kunmap_atomic(mapped_data);
				402
				403	/*
				404	* Do we need to do a data copy?
				405	*/
				406	if (need_copy_out && !done_copy_out) {
				407	char *tmp;
				408
				409	jbd_unlock_bh_state(bh_in);
				410	tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
				411	if (!tmp) {
				412	brelse(new_bh);
				413	free_buffer_head(new_bh);
				414	return -ENOMEM;
				415	}
				416	jbd_lock_bh_state(bh_in);
				417	if (jh_in->b_frozen_data) {
				418	jbd2_free(tmp, bh_in->b_size);
				419	goto repeat;
				420	}
				421
				422	jh_in->b_frozen_data = tmp;
				423	mapped_data = kmap_atomic(new_page);
				424	memcpy(tmp, mapped_data + new_offset, bh_in->b_size);
				425	kunmap_atomic(mapped_data);
				426
				427	new_page = virt_to_page(tmp);
				428	new_offset = offset_in_page(tmp);
				429	done_copy_out = 1;
				430
				431	/*
				432	* This isn't strictly necessary, as we're using frozen
				433	* data for the escaping, but it keeps consistency with
				434	* b_frozen_data usage.
				435	*/
				436	jh_in->b_frozen_triggers = jh_in->b_triggers;
				437	}
				438
				439	/*
				440	* Did we need to do an escaping? Now we've done all the
				441	* copying, we can finally do so.
				442	*/
				443	if (do_escape) {
				444	mapped_data = kmap_atomic(new_page);
				445	((unsigned int )(mapped_data + new_offset)) = 0;
				446	kunmap_atomic(mapped_data);
				447	}
				448
				449	set_bh_page(new_bh, new_page, new_offset);
				450	new_bh->b_size = bh_in->b_size;
				451	new_bh->b_bdev = journal->j_dev;
				452	new_bh->b_blocknr = blocknr;
				453	new_bh->b_private = bh_in;
				454	set_buffer_mapped(new_bh);
				455	set_buffer_dirty(new_bh);
				456
				457	*bh_out = new_bh;
				458
				459	/*
				460	* The to-be-written buffer needs to get moved to the io queue,
				461	* and the original buffer whose contents we are shadowing or
				462	* copying is moved to the transaction's shadow queue.
				463	*/
				464	JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
				465	spin_lock(&journal->j_list_lock);
				466	__jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
				467	spin_unlock(&journal->j_list_lock);
				468	set_buffer_shadow(bh_in);
				469	jbd_unlock_bh_state(bh_in);
				470
				471	return do_escape \| (done_copy_out << 1);
				472	}
				473
				474	/*
				475	* Allocation code for the journal file. Manage the space left in the
				476	* journal, so that we can begin checkpointing when appropriate.
				477	*/
				478
				479	/*
				480	* Called with j_state_lock locked for writing.
				481	* Returns true if a transaction commit was started.
				482	*/
				483	int __jbd2_log_start_commit(journal_t *journal, tid_t target)
				484	{
				485	/* Return if the txn has already requested to be committed */
				486	if (journal->j_commit_request == target)
				487	return 0;
				488
				489	/*
				490	* The only transaction we can possibly wait upon is the
				491	* currently running transaction (if it exists). Otherwise,
				492	* the target tid must be an old one.
				493	*/
				494	if (journal->j_running_transaction &&
				495	journal->j_running_transaction->t_tid == target) {
				496	/*
				497	* We want a new commit: OK, mark the request and wakeup the
				498	* commit thread. We do _not_ do the commit ourselves.
				499	*/
				500
				501	journal->j_commit_request = target;
				502	jbd_debug(1, "JBD2: requesting commit %u/%u\n",
				503	journal->j_commit_request,
				504	journal->j_commit_sequence);
				505	journal->j_running_transaction->t_requested = jiffies;
				506	wake_up(&journal->j_wait_commit);
				507	return 1;
				508	} else if (!tid_geq(journal->j_commit_request, target))
				509	/* This should never happen, but if it does, preserve
				510	the evidence before kjournald goes into a loop and
				511	increments j_commit_sequence beyond all recognition. */
				512	WARN_ONCE(1, "JBD2: bad log_start_commit: %u %u %u %u\n",
				513	journal->j_commit_request,
				514	journal->j_commit_sequence,
				515	target, journal->j_running_transaction ?
				516	journal->j_running_transaction->t_tid : 0);
				517	return 0;
				518	}
				519
				520	int jbd2_log_start_commit(journal_t *journal, tid_t tid)
				521	{
				522	int ret;
				523
				524	write_lock(&journal->j_state_lock);
				525	ret = __jbd2_log_start_commit(journal, tid);
				526	write_unlock(&journal->j_state_lock);
				527	return ret;
				528	}
				529
				530	/*
				531	* Force and wait any uncommitted transactions. We can only force the running
				532	* transaction if we don't have an active handle, otherwise, we will deadlock.
				533	* Returns: <0 in case of error,
				534	* 0 if nothing to commit,
				535	* 1 if transaction was successfully committed.
				536	*/
				537	static int __jbd2_journal_force_commit(journal_t *journal)
				538	{
				539	transaction_t *transaction = NULL;
				540	tid_t tid;
				541	int need_to_start = 0, ret = 0;
				542
				543	read_lock(&journal->j_state_lock);
				544	if (journal->j_running_transaction && !current->journal_info) {
				545	transaction = journal->j_running_transaction;
				546	if (!tid_geq(journal->j_commit_request, transaction->t_tid))
				547	need_to_start = 1;
				548	} else if (journal->j_committing_transaction)
				549	transaction = journal->j_committing_transaction;
				550
				551	if (!transaction) {
				552	/* Nothing to commit */
				553	read_unlock(&journal->j_state_lock);
				554	return 0;
				555	}
				556	tid = transaction->t_tid;
				557	read_unlock(&journal->j_state_lock);
				558	if (need_to_start)
				559	jbd2_log_start_commit(journal, tid);
				560	ret = jbd2_log_wait_commit(journal, tid);
				561	if (!ret)
				562	ret = 1;
				563
				564	return ret;
				565	}
				566
				567	/**
				568	* jbd2_journal_force_commit_nested - Force and wait upon a commit if the
				569	* calling process is not within transaction.
				570	*
				571	* @journal: journal to force
				572	* Returns true if progress was made.
				573	*
				574	* This is used for forcing out undo-protected data which contains
				575	* bitmaps, when the fs is running out of space.
				576	*/
				577	int jbd2_journal_force_commit_nested(journal_t *journal)
				578	{
				579	int ret;
				580
				581	ret = __jbd2_journal_force_commit(journal);
				582	return ret > 0;
				583	}
				584
				585	/**
				586	* jbd2_journal_force_commit() - force any uncommitted transactions
				587	* @journal: journal to force
				588	*
				589	* Caller want unconditional commit. We can only force the running transaction
				590	* if we don't have an active handle, otherwise, we will deadlock.
				591	*/
				592	int jbd2_journal_force_commit(journal_t *journal)
				593	{
				594	int ret;
				595
				596	J_ASSERT(!current->journal_info);
				597	ret = __jbd2_journal_force_commit(journal);
				598	if (ret > 0)
				599	ret = 0;
				600	return ret;
				601	}
				602
				603	/*
				604	* Start a commit of the current running transaction (if any). Returns true
				605	* if a transaction is going to be committed (or is currently already
				606	* committing), and fills its tid in at *ptid
				607	*/
				608	int jbd2_journal_start_commit(journal_t journal, tid_t ptid)
				609	{
				610	int ret = 0;
				611
				612	write_lock(&journal->j_state_lock);
				613	if (journal->j_running_transaction) {
				614	tid_t tid = journal->j_running_transaction->t_tid;
				615
				616	__jbd2_log_start_commit(journal, tid);
				617	/* There's a running transaction and we've just made sure
				618	* it's commit has been scheduled. */
				619	if (ptid)
				620	*ptid = tid;
				621	ret = 1;
				622	} else if (journal->j_committing_transaction) {
				623	/*
				624	* If commit has been started, then we have to wait for
				625	* completion of that transaction.
				626	*/
				627	if (ptid)
				628	*ptid = journal->j_committing_transaction->t_tid;
				629	ret = 1;
				630	}
				631	write_unlock(&journal->j_state_lock);
				632	return ret;
				633	}
				634
				635	/*
				636	* Return 1 if a given transaction has not yet sent barrier request
				637	* connected with a transaction commit. If 0 is returned, transaction
				638	* may or may not have sent the barrier. Used to avoid sending barrier
				639	* twice in common cases.
				640	*/
				641	int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
				642	{
				643	int ret = 0;
				644	transaction_t *commit_trans;
				645
				646	if (!(journal->j_flags & JBD2_BARRIER))
				647	return 0;
				648	read_lock(&journal->j_state_lock);
				649	/* Transaction already committed? */
				650	if (tid_geq(journal->j_commit_sequence, tid))
				651	goto out;
				652	commit_trans = journal->j_committing_transaction;
				653	if (!commit_trans \|\| commit_trans->t_tid != tid) {
				654	ret = 1;
				655	goto out;
				656	}
				657	/*
				658	* Transaction is being committed and we already proceeded to
				659	* submitting a flush to fs partition?
				660	*/
				661	if (journal->j_fs_dev != journal->j_dev) {
				662	if (!commit_trans->t_need_data_flush \|\|
				663	commit_trans->t_state >= T_COMMIT_DFLUSH)
				664	goto out;
				665	} else {
				666	if (commit_trans->t_state >= T_COMMIT_JFLUSH)
				667	goto out;
				668	}
				669	ret = 1;
				670	out:
				671	read_unlock(&journal->j_state_lock);
				672	return ret;
				673	}
				674	EXPORT_SYMBOL(jbd2_trans_will_send_data_barrier);
				675
				676	/*
				677	* Wait for a specified commit to complete.
				678	* The caller may not hold the journal lock.
				679	*/
				680	int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
				681	{
				682	int err = 0;
				683
				684	read_lock(&journal->j_state_lock);
				685	#ifdef CONFIG_PROVE_LOCKING
				686	/*
				687	* Some callers make sure transaction is already committing and in that
				688	* case we cannot block on open handles anymore. So don't warn in that
				689	* case.
				690	*/
				691	if (tid_gt(tid, journal->j_commit_sequence) &&
				692	(!journal->j_committing_transaction \|\|
				693	journal->j_committing_transaction->t_tid != tid)) {
				694	read_unlock(&journal->j_state_lock);
				695	jbd2_might_wait_for_commit(journal);
				696	read_lock(&journal->j_state_lock);
				697	}
				698	#endif
				699	#ifdef CONFIG_JBD2_DEBUG
				700	if (!tid_geq(journal->j_commit_request, tid)) {
				701	printk(KERN_ERR
				702	"%s: error: j_commit_request=%u, tid=%u\n",
				703	__func__, journal->j_commit_request, tid);
				704	}
				705	#endif
				706	while (tid_gt(tid, journal->j_commit_sequence)) {
				707	jbd_debug(1, "JBD2: want %u, j_commit_sequence=%u\n",
				708	tid, journal->j_commit_sequence);
				709	read_unlock(&journal->j_state_lock);
				710	wake_up(&journal->j_wait_commit);
				711	wait_event(journal->j_wait_done_commit,
				712	!tid_gt(tid, journal->j_commit_sequence));
				713	read_lock(&journal->j_state_lock);
				714	}
				715	read_unlock(&journal->j_state_lock);
				716
				717	if (unlikely(is_journal_aborted(journal)))
				718	err = -EIO;
				719	return err;
				720	}
				721
				722	/* Return 1 when transaction with given tid has already committed. */
				723	int jbd2_transaction_committed(journal_t *journal, tid_t tid)
				724	{
				725	int ret = 1;
				726
				727	read_lock(&journal->j_state_lock);
				728	if (journal->j_running_transaction &&
				729	journal->j_running_transaction->t_tid == tid)
				730	ret = 0;
				731	if (journal->j_committing_transaction &&
				732	journal->j_committing_transaction->t_tid == tid)
				733	ret = 0;
				734	read_unlock(&journal->j_state_lock);
				735	return ret;
				736	}
				737	EXPORT_SYMBOL(jbd2_transaction_committed);
				738
				739	/*
				740	* When this function returns the transaction corresponding to tid
				741	* will be completed. If the transaction has currently running, start
				742	* committing that transaction before waiting for it to complete. If
				743	* the transaction id is stale, it is by definition already completed,
				744	* so just return SUCCESS.
				745	*/
				746	int jbd2_complete_transaction(journal_t *journal, tid_t tid)
				747	{
				748	int need_to_wait = 1;
				749
				750	read_lock(&journal->j_state_lock);
				751	if (journal->j_running_transaction &&
				752	journal->j_running_transaction->t_tid == tid) {
				753	if (journal->j_commit_request != tid) {
				754	/* transaction not yet started, so request it */
				755	read_unlock(&journal->j_state_lock);
				756	jbd2_log_start_commit(journal, tid);
				757	goto wait_commit;
				758	}
				759	} else if (!(journal->j_committing_transaction &&
				760	journal->j_committing_transaction->t_tid == tid))
				761	need_to_wait = 0;
				762	read_unlock(&journal->j_state_lock);
				763	if (!need_to_wait)
				764	return 0;
				765	wait_commit:
				766	return jbd2_log_wait_commit(journal, tid);
				767	}
				768	EXPORT_SYMBOL(jbd2_complete_transaction);
				769
				770	/*
				771	* Log buffer allocation routines:
				772	*/
				773
				774	int jbd2_journal_next_log_block(journal_t journal, unsigned long long retp)
				775	{
				776	unsigned long blocknr;
				777
				778	write_lock(&journal->j_state_lock);
				779	J_ASSERT(journal->j_free > 1);
				780
				781	blocknr = journal->j_head;
				782	journal->j_head++;
				783	journal->j_free--;
				784	if (journal->j_head == journal->j_last)
				785	journal->j_head = journal->j_first;
				786	write_unlock(&journal->j_state_lock);
				787	return jbd2_journal_bmap(journal, blocknr, retp);
				788	}
				789
				790	/*
				791	* Conversion of logical to physical block numbers for the journal
				792	*
				793	* On external journals the journal blocks are identity-mapped, so
				794	* this is a no-op. If needed, we can use j_blk_offset - everything is
				795	* ready.
				796	*/
				797	int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
				798	unsigned long long *retp)
				799	{
				800	int err = 0;
				801	unsigned long long ret;
				802	sector_t block = 0;
				803
				804	if (journal->j_inode) {
				805	block = blocknr;
				806	ret = bmap(journal->j_inode, &block);
				807
				808	if (ret \|\| !block) {
				809	printk(KERN_ALERT "%s: journal block not found "
				810	"at offset %lu on %s\n",
				811	__func__, blocknr, journal->j_devname);
				812	err = -EIO;
				813	jbd2_journal_abort(journal, err);
				814
				815	} else {
				816	*retp = block;
				817	}
				818
				819	} else {
				820	retp = blocknr; / +journal->j_blk_offset */
				821	}
				822	return err;
				823	}
				824
				825	/*
				826	* We play buffer_head aliasing tricks to write data/metadata blocks to
				827	* the journal without copying their contents, but for journal
				828	* descriptor blocks we do need to generate bona fide buffers.
				829	*
				830	* After the caller of jbd2_journal_get_descriptor_buffer() has finished modifying
				831	* the buffer's contents they really should run flush_dcache_page(bh->b_page).
				832	* But we don't bother doing that, so there will be coherency problems with
				833	* mmaps of blockdevs which hold live JBD-controlled filesystems.
				834	*/
				835	struct buffer_head *
				836	jbd2_journal_get_descriptor_buffer(transaction_t *transaction, int type)
				837	{
				838	journal_t *journal = transaction->t_journal;
				839	struct buffer_head *bh;
				840	unsigned long long blocknr;
				841	journal_header_t *header;
				842	int err;
				843
				844	err = jbd2_journal_next_log_block(journal, &blocknr);
				845
				846	if (err)
				847	return NULL;
				848
				849	bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
				850	if (!bh)
				851	return NULL;
				852	lock_buffer(bh);
				853	memset(bh->b_data, 0, journal->j_blocksize);
				854	header = (journal_header_t *)bh->b_data;
				855	header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
				856	header->h_blocktype = cpu_to_be32(type);
				857	header->h_sequence = cpu_to_be32(transaction->t_tid);
				858	set_buffer_uptodate(bh);
				859	unlock_buffer(bh);
				860	BUFFER_TRACE(bh, "return this buffer");
				861	return bh;
				862	}
				863
				864	void jbd2_descriptor_block_csum_set(journal_t j, struct buffer_head bh)
				865	{
				866	struct jbd2_journal_block_tail *tail;
				867	__u32 csum;
				868
				869	if (!jbd2_journal_has_csum_v2or3(j))
				870	return;
				871
				872	tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
				873	sizeof(struct jbd2_journal_block_tail));
				874	tail->t_checksum = 0;
				875	csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
				876	tail->t_checksum = cpu_to_be32(csum);
				877	}
				878
				879	/*
				880	* Return tid of the oldest transaction in the journal and block in the journal
				881	* where the transaction starts.
				882	*
				883	* If the journal is now empty, return which will be the next transaction ID
				884	* we will write and where will that transaction start.
				885	*
				886	* The return value is 0 if journal tail cannot be pushed any further, 1 if
				887	* it can.
				888	*/
				889	int jbd2_journal_get_log_tail(journal_t journal, tid_t tid,
				890	unsigned long *block)
				891	{
				892	transaction_t *transaction;
				893	int ret;
				894
				895	read_lock(&journal->j_state_lock);
				896	spin_lock(&journal->j_list_lock);
				897	transaction = journal->j_checkpoint_transactions;
				898	if (transaction) {
				899	*tid = transaction->t_tid;
				900	*block = transaction->t_log_start;
				901	} else if ((transaction = journal->j_committing_transaction) != NULL) {
				902	*tid = transaction->t_tid;
				903	*block = transaction->t_log_start;
				904	} else if ((transaction = journal->j_running_transaction) != NULL) {
				905	*tid = transaction->t_tid;
				906	*block = journal->j_head;
				907	} else {
				908	*tid = journal->j_transaction_sequence;
				909	*block = journal->j_head;
				910	}
				911	ret = tid_gt(*tid, journal->j_tail_sequence);
				912	spin_unlock(&journal->j_list_lock);
				913	read_unlock(&journal->j_state_lock);
				914
				915	return ret;
				916	}
				917
				918	/*
				919	* Update information in journal structure and in on disk journal superblock
				920	* about log tail. This function does not check whether information passed in
				921	* really pushes log tail further. It's responsibility of the caller to make
				922	* sure provided log tail information is valid (e.g. by holding
				923	* j_checkpoint_mutex all the time between computing log tail and calling this
				924	* function as is the case with jbd2_cleanup_journal_tail()).
				925	*
				926	* Requires j_checkpoint_mutex
				927	*/
				928	int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
				929	{
				930	unsigned long freed;
				931	int ret;
				932
				933	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
				934
				935	/*
				936	* We cannot afford for write to remain in drive's caches since as
				937	* soon as we update j_tail, next transaction can start reusing journal
				938	* space and if we lose sb update during power failure we'd replay
				939	* old transaction with possibly newly overwritten data.
				940	*/
				941	ret = jbd2_journal_update_sb_log_tail(journal, tid, block,
				942	REQ_SYNC \| REQ_FUA);
				943	if (ret)
				944	goto out;
				945
				946	write_lock(&journal->j_state_lock);
				947	freed = block - journal->j_tail;
				948	if (block < journal->j_tail)
				949	freed += journal->j_last - journal->j_first;
				950
				951	trace_jbd2_update_log_tail(journal, tid, block, freed);
				952	jbd_debug(1,
				953	"Cleaning journal tail from %u to %u (offset %lu), "
				954	"freeing %lu\n",
				955	journal->j_tail_sequence, tid, block, freed);
				956
				957	journal->j_free += freed;
				958	journal->j_tail_sequence = tid;
				959	journal->j_tail = block;
				960	write_unlock(&journal->j_state_lock);
				961
				962	out:
				963	return ret;
				964	}
				965
				966	/*
				967	* This is a variation of __jbd2_update_log_tail which checks for validity of
				968	* provided log tail and locks j_checkpoint_mutex. So it is safe against races
				969	* with other threads updating log tail.
				970	*/
				971	void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
				972	{
				973	mutex_lock_io(&journal->j_checkpoint_mutex);
				974	if (tid_gt(tid, journal->j_tail_sequence))
				975	__jbd2_update_log_tail(journal, tid, block);
				976	mutex_unlock(&journal->j_checkpoint_mutex);
				977	}
				978
				979	struct jbd2_stats_proc_session {
				980	journal_t *journal;
				981	struct transaction_stats_s *stats;
				982	int start;
				983	int max;
				984	};
				985
				986	static void jbd2_seq_info_start(struct seq_file seq, loff_t *pos)
				987	{
				988	return *pos ? NULL : SEQ_START_TOKEN;
				989	}
				990
				991	static void jbd2_seq_info_next(struct seq_file seq, void v, loff_t pos)
				992	{
				993	(*pos)++;
				994	return NULL;
				995	}
				996
				997	static int jbd2_seq_info_show(struct seq_file seq, void v)
				998	{
				999	struct jbd2_stats_proc_session *s = seq->private;
				1000
				1001	if (v != SEQ_START_TOKEN)
				1002	return 0;
				1003	seq_printf(seq, "%lu transactions (%lu requested), "
				1004	"each up to %u blocks\n",
				1005	s->stats->ts_tid, s->stats->ts_requested,
				1006	s->journal->j_max_transaction_buffers);
				1007	if (s->stats->ts_tid == 0)
				1008	return 0;
				1009	seq_printf(seq, "average: \n %ums waiting for transaction\n",
				1010	jiffies_to_msecs(s->stats->run.rs_wait / s->stats->ts_tid));
				1011	seq_printf(seq, " %ums request delay\n",
				1012	(s->stats->ts_requested == 0) ? 0 :
				1013	jiffies_to_msecs(s->stats->run.rs_request_delay /
				1014	s->stats->ts_requested));
				1015	seq_printf(seq, " %ums running transaction\n",
				1016	jiffies_to_msecs(s->stats->run.rs_running / s->stats->ts_tid));
				1017	seq_printf(seq, " %ums transaction was being locked\n",
				1018	jiffies_to_msecs(s->stats->run.rs_locked / s->stats->ts_tid));
				1019	seq_printf(seq, " %ums flushing data (in ordered mode)\n",
				1020	jiffies_to_msecs(s->stats->run.rs_flushing / s->stats->ts_tid));
				1021	seq_printf(seq, " %ums logging transaction\n",
				1022	jiffies_to_msecs(s->stats->run.rs_logging / s->stats->ts_tid));
				1023	seq_printf(seq, " %lluus average transaction commit time\n",
				1024	div_u64(s->journal->j_average_commit_time, 1000));
				1025	seq_printf(seq, " %lu handles per transaction\n",
				1026	s->stats->run.rs_handle_count / s->stats->ts_tid);
				1027	seq_printf(seq, " %lu blocks per transaction\n",
				1028	s->stats->run.rs_blocks / s->stats->ts_tid);
				1029	seq_printf(seq, " %lu logged blocks per transaction\n",
				1030	s->stats->run.rs_blocks_logged / s->stats->ts_tid);
				1031	return 0;
				1032	}
				1033
				/* seq_file ->stop callback: intentionally empty, there is nothing to undo. */
				static void jbd2_seq_info_stop(struct seq_file *seq, void *v)
				{
				}
				1037
				1038	static const struct seq_operations jbd2_seq_info_ops = {
				1039	.start = jbd2_seq_info_start,
				1040	.next = jbd2_seq_info_next,
				1041	.stop = jbd2_seq_info_stop,
				1042	.show = jbd2_seq_info_show,
				1043	};
				1044
				1045	static int jbd2_seq_info_open(struct inode inode, struct file file)
				1046	{
				1047	journal_t *journal = PDE_DATA(inode);
				1048	struct jbd2_stats_proc_session *s;
				1049	int rc, size;
				1050
				1051	s = kmalloc(sizeof(*s), GFP_KERNEL);
				1052	if (s == NULL)
				1053	return -ENOMEM;
				1054	size = sizeof(struct transaction_stats_s);
				1055	s->stats = kmalloc(size, GFP_KERNEL);
				1056	if (s->stats == NULL) {
				1057	kfree(s);
				1058	return -ENOMEM;
				1059	}
				1060	spin_lock(&journal->j_history_lock);
				1061	memcpy(s->stats, &journal->j_stats, size);
				1062	s->journal = journal;
				1063	spin_unlock(&journal->j_history_lock);
				1064
				1065	rc = seq_open(file, &jbd2_seq_info_ops);
				1066	if (rc == 0) {
				1067	struct seq_file *m = file->private_data;
				1068	m->private = s;
				1069	} else {
				1070	kfree(s->stats);
				1071	kfree(s);
				1072	}
				1073	return rc;
				1074
				1075	}
				1076
				1077	static int jbd2_seq_info_release(struct inode inode, struct file file)
				1078	{
				1079	struct seq_file *seq = file->private_data;
				1080	struct jbd2_stats_proc_session *s = seq->private;
				1081	kfree(s->stats);
				1082	kfree(s);
				1083	return seq_release(inode, file);
				1084	}
				1085
				1086	static const struct file_operations jbd2_seq_info_fops = {
				1087	.owner = THIS_MODULE,
				1088	.open = jbd2_seq_info_open,
				1089	.read = seq_read,
				1090	.llseek = seq_lseek,
				1091	.release = jbd2_seq_info_release,
				1092	};
				1093
				1094	static struct proc_dir_entry *proc_jbd2_stats;
				1095
				1096	static void jbd2_stats_proc_init(journal_t *journal)
				1097	{
				1098	journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats);
				1099	if (journal->j_proc_entry) {
				1100	proc_create_data("info", S_IRUGO, journal->j_proc_entry,
				1101	&jbd2_seq_info_fops, journal);
				1102	}
				1103	}
				1104
				1105	static void jbd2_stats_proc_exit(journal_t *journal)
				1106	{
				1107	remove_proc_entry("info", journal->j_proc_entry);
				1108	remove_proc_entry(journal->j_devname, proc_jbd2_stats);
				1109	}
				1110
				1111	/*
				1112	* Management for journal control blocks: functions to create and
				1113	* destroy journal_t structures, and to initialise and read existing
				1114	* journal blocks from disk. */
				1115
				1116	/* First: create and setup a journal_t object in memory. We initialise
				1117	* very few fields yet: that has to wait until we have created the
				1118	* journal structures from scratch, or loaded them from disk. */
				1119
				1120	static journal_t journal_init_common(struct block_device bdev,
				1121	struct block_device *fs_dev,
				1122	unsigned long long start, int len, int blocksize)
				1123	{
				1124	static struct lock_class_key jbd2_trans_commit_key;
				1125	journal_t *journal;
				1126	int err;
				1127	struct buffer_head *bh;
				1128	int n;
				1129
				1130	journal = kzalloc(sizeof(*journal), GFP_KERNEL);
				1131	if (!journal)
				1132	return NULL;
				1133
				1134	init_waitqueue_head(&journal->j_wait_transaction_locked);
				1135	init_waitqueue_head(&journal->j_wait_done_commit);
				1136	init_waitqueue_head(&journal->j_wait_commit);
				1137	init_waitqueue_head(&journal->j_wait_updates);
				1138	init_waitqueue_head(&journal->j_wait_reserved);
				1139	mutex_init(&journal->j_barrier);
				1140	mutex_init(&journal->j_checkpoint_mutex);
				1141	spin_lock_init(&journal->j_revoke_lock);
				1142	spin_lock_init(&journal->j_list_lock);
				1143	rwlock_init(&journal->j_state_lock);
				1144
				1145	journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
				1146	journal->j_min_batch_time = 0;
				1147	journal->j_max_batch_time = 15000; /* 15ms */
				1148	atomic_set(&journal->j_reserved_credits, 0);
				1149
				1150	/* The journal is marked for error until we succeed with recovery! */
				1151	journal->j_flags = JBD2_ABORT;
				1152
				1153	/* Set up a default-sized revoke table for the new mount. */
				1154	err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
				1155	if (err)
				1156	goto err_cleanup;
				1157
				1158	spin_lock_init(&journal->j_history_lock);
				1159
				1160	lockdep_init_map(&journal->j_trans_commit_map, "jbd2_handle",
				1161	&jbd2_trans_commit_key, 0);
				1162
				1163	/* journal descriptor can store up to n blocks -bzzz */
				1164	journal->j_blocksize = blocksize;
				1165	journal->j_dev = bdev;
				1166	journal->j_fs_dev = fs_dev;
				1167	journal->j_blk_offset = start;
				1168	journal->j_maxlen = len;
				1169	n = journal->j_blocksize / sizeof(journal_block_tag_t);
				1170	journal->j_wbufsize = n;
				1171	journal->j_wbuf = kmalloc_array(n, sizeof(struct buffer_head *),
				1172	GFP_KERNEL);
				1173	if (!journal->j_wbuf)
				1174	goto err_cleanup;
				1175
				1176	bh = getblk_unmovable(journal->j_dev, start, journal->j_blocksize);
				1177	if (!bh) {
				1178	pr_err("%s: Cannot get buffer for journal superblock\n",
				1179	__func__);
				1180	goto err_cleanup;
				1181	}
				1182	journal->j_sb_buffer = bh;
				1183	journal->j_superblock = (journal_superblock_t *)bh->b_data;
				1184
				1185	return journal;
				1186
				1187	err_cleanup:
				1188	kfree(journal->j_wbuf);
				1189	jbd2_journal_destroy_revoke(journal);
				1190	kfree(journal);
				1191	return NULL;
				1192	}
				1193
				1194	/* jbd2_journal_init_dev and jbd2_journal_init_inode:
				1195	*
				1196	* Create a journal structure assigned some fixed set of disk blocks to
				1197	* the journal. We don't actually touch those disk blocks yet, but we
				1198	* need to set up all of the mapping information to tell the journaling
				1199	* system where the journal blocks are.
				1200	*
				1201	*/
				1202
				1203	/**
				1204	* journal_t * jbd2_journal_init_dev() - creates and initialises a journal structure
				1205	* @bdev: Block device on which to create the journal
				1206	* @fs_dev: Device which hold journalled filesystem for this journal.
				1207	* @start: Block nr Start of journal.
				1208	* @len: Length of the journal in blocks.
				1209	* @blocksize: blocksize of journalling device
				1210	*
				1211	* Returns: a newly created journal_t *
				1212	*
				1213	* jbd2_journal_init_dev creates a journal which maps a fixed contiguous
				1214	* range of blocks on an arbitrary block device.
				1215	*
				1216	*/
				1217	journal_t jbd2_journal_init_dev(struct block_device bdev,
				1218	struct block_device *fs_dev,
				1219	unsigned long long start, int len, int blocksize)
				1220	{
				1221	journal_t *journal;
				1222
				1223	journal = journal_init_common(bdev, fs_dev, start, len, blocksize);
				1224	if (!journal)
				1225	return NULL;
				1226
				1227	bdevname(journal->j_dev, journal->j_devname);
				1228	strreplace(journal->j_devname, '/', '!');
				1229	jbd2_stats_proc_init(journal);
				1230
				1231	return journal;
				1232	}
				1233
				1234	/**
				1235	* journal_t * jbd2_journal_init_inode () - creates a journal which maps to a inode.
				1236	* @inode: An inode to create the journal in
				1237	*
				1238	* jbd2_journal_init_inode creates a journal which maps an on-disk inode as
				1239	* the journal. The inode must exist already, must support bmap() and
				1240	* must have all data blocks preallocated.
				1241	*/
				1242	journal_t jbd2_journal_init_inode(struct inode inode)
				1243	{
				1244	journal_t *journal;
				1245	sector_t blocknr;
				1246	char *p;
				1247	int err = 0;
				1248
				1249	blocknr = 0;
				1250	err = bmap(inode, &blocknr);
				1251
				1252	if (err \|\| !blocknr) {
				1253	pr_err("%s: Cannot locate journal superblock\n",
				1254	__func__);
				1255	return NULL;
				1256	}
				1257
				1258	jbd_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n",
				1259	inode->i_sb->s_id, inode->i_ino, (long long) inode->i_size,
				1260	inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
				1261
				1262	journal = journal_init_common(inode->i_sb->s_bdev, inode->i_sb->s_bdev,
				1263	blocknr, inode->i_size >> inode->i_sb->s_blocksize_bits,
				1264	inode->i_sb->s_blocksize);
				1265	if (!journal)
				1266	return NULL;
				1267
				1268	journal->j_inode = inode;
				1269	bdevname(journal->j_dev, journal->j_devname);
				1270	p = strreplace(journal->j_devname, '/', '!');
				1271	sprintf(p, "-%lu", journal->j_inode->i_ino);
				1272	jbd2_stats_proc_init(journal);
				1273
				1274	return journal;
				1275	}
				1276
				1277	/*
				1278	* If the journal init or create aborts, we need to mark the journal
				1279	* superblock as being NULL to prevent the journal destroy from writing
				1280	* back a bogus superblock.
				1281	*/
				1282	static void journal_fail_superblock(journal_t *journal)
				1283	{
				1284	struct buffer_head *bh = journal->j_sb_buffer;
				1285	brelse(bh);
				1286	journal->j_sb_buffer = NULL;
				1287	}
				1288
				1289	/*
				1290	* Given a journal_t structure, initialise the various fields for
				1291	* startup of a new journaling session. We use this both when creating
				1292	* a journal, and after recovering an old journal to reset it for
				1293	* subsequent use.
				1294	*/
				1295
				1296	static int journal_reset(journal_t *journal)
				1297	{
				1298	journal_superblock_t *sb = journal->j_superblock;
				1299	unsigned long long first, last;
				1300
				1301	first = be32_to_cpu(sb->s_first);
				1302	last = be32_to_cpu(sb->s_maxlen);
				1303	if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) {
				1304	printk(KERN_ERR "JBD2: Journal too short (blocks %llu-%llu).\n",
				1305	first, last);
				1306	journal_fail_superblock(journal);
				1307	return -EINVAL;
				1308	}
				1309
				1310	journal->j_first = first;
				1311	journal->j_last = last;
				1312
				1313	journal->j_head = first;
				1314	journal->j_tail = first;
				1315	journal->j_free = last - first;
				1316
				1317	journal->j_tail_sequence = journal->j_transaction_sequence;
				1318	journal->j_commit_sequence = journal->j_transaction_sequence - 1;
				1319	journal->j_commit_request = journal->j_commit_sequence;
				1320
				1321	journal->j_max_transaction_buffers = journal->j_maxlen / 4;
				1322
				1323	/*
				1324	* As a special case, if the on-disk copy is already marked as needing
				1325	* no recovery (s_start == 0), then we can safely defer the superblock
				1326	* update until the next commit by setting JBD2_FLUSHED. This avoids
				1327	* attempting a write to a potential-readonly device.
				1328	*/
				1329	if (sb->s_start == 0) {
				1330	jbd_debug(1, "JBD2: Skipping superblock update on recovered sb "
				1331	"(start %ld, seq %u, errno %d)\n",
				1332	journal->j_tail, journal->j_tail_sequence,
				1333	journal->j_errno);
				1334	journal->j_flags \|= JBD2_FLUSHED;
				1335	} else {
				1336	/* Lock here to make assertions happy... */
				1337	mutex_lock_io(&journal->j_checkpoint_mutex);
				1338	/*
				1339	* Update log tail information. We use REQ_FUA since new
				1340	* transaction will start reusing journal space and so we
				1341	* must make sure information about current log tail is on
				1342	* disk before that.
				1343	*/
				1344	jbd2_journal_update_sb_log_tail(journal,
				1345	journal->j_tail_sequence,
				1346	journal->j_tail,
				1347	REQ_SYNC \| REQ_FUA);
				1348	mutex_unlock(&journal->j_checkpoint_mutex);
				1349	}
				1350	return jbd2_journal_start_thread(journal);
				1351	}
				1352
				1353	/*
				1354	* This function expects that the caller will have locked the journal
				1355	* buffer head, and will return with it unlocked
				1356	*/
				1357	static int jbd2_write_superblock(journal_t *journal, int write_flags)
				1358	{
				1359	struct buffer_head *bh = journal->j_sb_buffer;
				1360	journal_superblock_t *sb = journal->j_superblock;
				1361	int ret;
				1362
				1363	/* Buffer got discarded which means block device got invalidated */
				1364	if (!buffer_mapped(bh)) {
				1365	unlock_buffer(bh);
				1366	return -EIO;
				1367	}
				1368
				1369	if (!(journal->j_flags & JBD2_BARRIER))
				1370	write_flags &= ~(REQ_FUA \| REQ_PREFLUSH);
				1371
				1372	trace_jbd2_write_superblock(journal, write_flags);
				1373
				1374	if (buffer_write_io_error(bh)) {
				1375	/*
				1376	* Oh, dear. A previous attempt to write the journal
				1377	* superblock failed. This could happen because the
				1378	* USB device was yanked out. Or it could happen to
				1379	* be a transient write error and maybe the block will
				1380	* be remapped. Nothing we can do but to retry the
				1381	* write and hope for the best.
				1382	*/
				1383	printk(KERN_ERR "JBD2: previous I/O error detected "
				1384	"for journal superblock update for %s.\n",
				1385	journal->j_devname);
				1386	clear_buffer_write_io_error(bh);
				1387	set_buffer_uptodate(bh);
				1388	}
				1389	if (jbd2_journal_has_csum_v2or3(journal))
				1390	sb->s_checksum = jbd2_superblock_csum(journal, sb);
				1391	get_bh(bh);
				1392	bh->b_end_io = end_buffer_write_sync;
				1393	ret = submit_bh(REQ_OP_WRITE, write_flags, bh);
				1394	wait_on_buffer(bh);
				1395	if (buffer_write_io_error(bh)) {
				1396	clear_buffer_write_io_error(bh);
				1397	set_buffer_uptodate(bh);
				1398	ret = -EIO;
				1399	}
				1400	if (ret) {
				1401	printk(KERN_ERR "JBD2: Error %d detected when updating "
				1402	"journal superblock for %s.\n", ret,
				1403	journal->j_devname);
				1404	jbd2_journal_abort(journal, ret);
				1405	}
				1406
				1407	return ret;
				1408	}
				1409
				1410	/**
				1411	* jbd2_journal_update_sb_log_tail() - Update log tail in journal sb on disk.
				1412	* @journal: The journal to update.
				1413	* @tail_tid: TID of the new transaction at the tail of the log
				1414	* @tail_block: The first block of the transaction at the tail of the log
				1415	* @write_op: With which operation should we write the journal sb
				1416	*
				1417	* Update a journal's superblock information about log tail and write it to
				1418	* disk, waiting for the IO to complete.
				1419	*/
				1420	int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
				1421	unsigned long tail_block, int write_op)
				1422	{
				1423	journal_superblock_t *sb = journal->j_superblock;
				1424	int ret;
				1425
				1426	if (is_journal_aborted(journal))
				1427	return -EIO;
				1428
				1429	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
				1430	jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
				1431	tail_block, tail_tid);
				1432
				1433	lock_buffer(journal->j_sb_buffer);
				1434	sb->s_sequence = cpu_to_be32(tail_tid);
				1435	sb->s_start = cpu_to_be32(tail_block);
				1436
				1437	ret = jbd2_write_superblock(journal, write_op);
				1438	if (ret)
				1439	goto out;
				1440
				1441	/* Log is no longer empty */
				1442	write_lock(&journal->j_state_lock);
				1443	WARN_ON(!sb->s_sequence);
				1444	journal->j_flags &= ~JBD2_FLUSHED;
				1445	write_unlock(&journal->j_state_lock);
				1446
				1447	out:
				1448	return ret;
				1449	}
				1450
				1451	/**
				1452	* jbd2_mark_journal_empty() - Mark on disk journal as empty.
				1453	* @journal: The journal to update.
				1454	* @write_op: With which operation should we write the journal sb
				1455	*
				1456	* Update a journal's dynamic superblock fields to show that journal is empty.
				1457	* Write updated superblock to disk waiting for IO to complete.
				1458	*/
				1459	static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
				1460	{
				1461	journal_superblock_t *sb = journal->j_superblock;
				1462
				1463	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
				1464	lock_buffer(journal->j_sb_buffer);
				1465	if (sb->s_start == 0) { /* Is it already empty? */
				1466	unlock_buffer(journal->j_sb_buffer);
				1467	return;
				1468	}
				1469
				1470	jbd_debug(1, "JBD2: Marking journal as empty (seq %u)\n",
				1471	journal->j_tail_sequence);
				1472
				1473	sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
				1474	sb->s_start = cpu_to_be32(0);
				1475
				1476	jbd2_write_superblock(journal, write_op);
				1477
				1478	/* Log is no longer empty */
				1479	write_lock(&journal->j_state_lock);
				1480	journal->j_flags \|= JBD2_FLUSHED;
				1481	write_unlock(&journal->j_state_lock);
				1482	}
				1483
				1484
				1485	/**
				1486	* jbd2_journal_update_sb_errno() - Update error in the journal.
				1487	* @journal: The journal to update.
				1488	*
				1489	* Update a journal's errno. Write updated superblock to disk waiting for IO
				1490	* to complete.
				1491	*/
				1492	void jbd2_journal_update_sb_errno(journal_t *journal)
				1493	{
				1494	journal_superblock_t *sb = journal->j_superblock;
				1495	int errcode;
				1496
				1497	lock_buffer(journal->j_sb_buffer);
				1498	errcode = journal->j_errno;
				1499	if (errcode == -ESHUTDOWN)
				1500	errcode = 0;
				1501	jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode);
				1502	sb->s_errno = cpu_to_be32(errcode);
				1503
				1504	jbd2_write_superblock(journal, REQ_SYNC \| REQ_FUA);
				1505	}
				1506	EXPORT_SYMBOL(jbd2_journal_update_sb_errno);
				1507
				1508	/*
				1509	* Read the superblock for a given journal, performing initial
				1510	* validation of the format.
				1511	*/
				1512	static int journal_get_superblock(journal_t *journal)
				1513	{
				1514	struct buffer_head *bh;
				1515	journal_superblock_t *sb;
				1516	int err = -EIO;
				1517
				1518	bh = journal->j_sb_buffer;
				1519
				1520	J_ASSERT(bh != NULL);
				1521	if (!buffer_uptodate(bh)) {
				1522	ll_rw_block(REQ_OP_READ, 0, 1, &bh);
				1523	wait_on_buffer(bh);
				1524	if (!buffer_uptodate(bh)) {
				1525	printk(KERN_ERR
				1526	"JBD2: IO error reading journal superblock\n");
				1527	goto out;
				1528	}
				1529	}
				1530
				1531	if (buffer_verified(bh))
				1532	return 0;
				1533
				1534	sb = journal->j_superblock;
				1535
				1536	err = -EINVAL;
				1537
				1538	if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) \|\|
				1539	sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) {
				1540	printk(KERN_WARNING "JBD2: no valid journal superblock found\n");
				1541	goto out;
				1542	}
				1543
				1544	switch(be32_to_cpu(sb->s_header.h_blocktype)) {
				1545	case JBD2_SUPERBLOCK_V1:
				1546	journal->j_format_version = 1;
				1547	break;
				1548	case JBD2_SUPERBLOCK_V2:
				1549	journal->j_format_version = 2;
				1550	break;
				1551	default:
				1552	printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n");
				1553	goto out;
				1554	}
				1555
				1556	if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen)
				1557	journal->j_maxlen = be32_to_cpu(sb->s_maxlen);
				1558	else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) {
				1559	printk(KERN_WARNING "JBD2: journal file too short\n");
				1560	goto out;
				1561	}
				1562
				1563	if (be32_to_cpu(sb->s_first) == 0 \|\|
				1564	be32_to_cpu(sb->s_first) >= journal->j_maxlen) {
				1565	printk(KERN_WARNING
				1566	"JBD2: Invalid start block of journal: %u\n",
				1567	be32_to_cpu(sb->s_first));
				1568	goto out;
				1569	}
				1570
				1571	if (jbd2_has_feature_csum2(journal) &&
				1572	jbd2_has_feature_csum3(journal)) {
				1573	/* Can't have checksum v2 and v3 at the same time! */
				1574	printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 "
				1575	"at the same time!\n");
				1576	goto out;
				1577	}
				1578
				1579	if (jbd2_journal_has_csum_v2or3_feature(journal) &&
				1580	jbd2_has_feature_checksum(journal)) {
				1581	/* Can't have checksum v1 and v2 on at the same time! */
				1582	printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 "
				1583	"at the same time!\n");
				1584	goto out;
				1585	}
				1586
				1587	if (!jbd2_verify_csum_type(journal, sb)) {
				1588	printk(KERN_ERR "JBD2: Unknown checksum type\n");
				1589	goto out;
				1590	}
				1591
				1592	/* Load the checksum driver */
				1593	if (jbd2_journal_has_csum_v2or3_feature(journal)) {
				1594	journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
				1595	if (IS_ERR(journal->j_chksum_driver)) {
				1596	printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
				1597	err = PTR_ERR(journal->j_chksum_driver);
				1598	journal->j_chksum_driver = NULL;
				1599	goto out;
				1600	}
				1601	}
				1602
				1603	if (jbd2_journal_has_csum_v2or3(journal)) {
				1604	/* Check superblock checksum */
				1605	if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) {
				1606	printk(KERN_ERR "JBD2: journal checksum error\n");
				1607	err = -EFSBADCRC;
				1608	goto out;
				1609	}
				1610
				1611	/* Precompute checksum seed for all metadata */
				1612	journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
				1613	sizeof(sb->s_uuid));
				1614	}
				1615
				1616	set_buffer_verified(bh);
				1617
				1618	return 0;
				1619
				1620	out:
				1621	journal_fail_superblock(journal);
				1622	return err;
				1623	}
				1624
				1625	/*
				1626	* Load the on-disk journal superblock and read the key fields into the
				1627	* journal_t.
				1628	*/
				1629
				1630	static int load_superblock(journal_t *journal)
				1631	{
				1632	int err;
				1633	journal_superblock_t *sb;
				1634
				1635	err = journal_get_superblock(journal);
				1636	if (err)
				1637	return err;
				1638
				1639	sb = journal->j_superblock;
				1640
				1641	journal->j_tail_sequence = be32_to_cpu(sb->s_sequence);
				1642	journal->j_tail = be32_to_cpu(sb->s_start);
				1643	journal->j_first = be32_to_cpu(sb->s_first);
				1644	journal->j_last = be32_to_cpu(sb->s_maxlen);
				1645	journal->j_errno = be32_to_cpu(sb->s_errno);
				1646
				1647	return 0;
				1648	}
				1649
				1650
				1651	/**
				1652	* jbd2_journal_load() - Read journal from disk.
				1653	* @journal: Journal to act on.
				1654	*
				1655	* Given a journal_t structure which tells us which disk blocks contain
				1656	* a journal, read the journal from disk to initialise the in-memory
				1657	* structures.
				1658	*/
				1659	int jbd2_journal_load(journal_t *journal)
				1660	{
				1661	int err;
				1662	journal_superblock_t *sb;
				1663
				1664	err = load_superblock(journal);
				1665	if (err)
				1666	return err;
				1667
				1668	sb = journal->j_superblock;
				1669	/* If this is a V2 superblock, then we have to check the
				1670	* features flags on it. */
				1671
				1672	if (journal->j_format_version >= 2) {
				1673	if ((sb->s_feature_ro_compat &
				1674	~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) \|\|
				1675	(sb->s_feature_incompat &
				1676	~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) {
				1677	printk(KERN_WARNING
				1678	"JBD2: Unrecognised features on journal\n");
				1679	return -EINVAL;
				1680	}
				1681	}
				1682
				1683	/*
				1684	* Create a slab for this blocksize
				1685	*/
				1686	err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize));
				1687	if (err)
				1688	return err;
				1689
				1690	/* Let the recovery code check whether it needs to recover any
				1691	* data from the journal. */
				1692	if (jbd2_journal_recover(journal))
				1693	goto recovery_error;
				1694
				1695	if (journal->j_failed_commit) {
				1696	printk(KERN_ERR "JBD2: journal transaction %u on %s "
				1697	"is corrupt.\n", journal->j_failed_commit,
				1698	journal->j_devname);
				1699	return -EFSCORRUPTED;
				1700	}
				1701	/*
				1702	* clear JBD2_ABORT flag initialized in journal_init_common
				1703	* here to update log tail information with the newest seq.
				1704	*/
				1705	journal->j_flags &= ~JBD2_ABORT;
				1706
				1707	/* OK, we've finished with the dynamic journal bits:
				1708	* reinitialise the dynamic contents of the superblock in memory
				1709	* and reset them on disk. */
				1710	if (journal_reset(journal))
				1711	goto recovery_error;
				1712
				1713	journal->j_flags \|= JBD2_LOADED;
				1714	return 0;
				1715
				1716	recovery_error:
				1717	printk(KERN_WARNING "JBD2: recovery failed\n");
				1718	return -EIO;
				1719	}
				1720
				1721	/**
				1722	* jbd2_journal_destroy() - Release a journal_t structure.
				1723	* @journal: Journal to act on.
				1724	*
				1725	* Release a journal_t structure once it is no longer in use by the
				1726	* journaled object.
				1727	* Return <0 if we couldn't clean up the journal.
				1728	*/
				1729	int jbd2_journal_destroy(journal_t *journal)
				1730	{
				1731	int err = 0;
				1732
				1733	/* Wait for the commit thread to wake up and die. */
				1734	journal_kill_thread(journal);
				1735
				1736	/* Force a final log commit */
				1737	if (journal->j_running_transaction)
				1738	jbd2_journal_commit_transaction(journal);
				1739
				1740	/* Force any old transactions to disk */
				1741
				1742	/* Totally anal locking here... */
				1743	spin_lock(&journal->j_list_lock);
				1744	while (journal->j_checkpoint_transactions != NULL) {
				1745	spin_unlock(&journal->j_list_lock);
				1746	mutex_lock_io(&journal->j_checkpoint_mutex);
				1747	err = jbd2_log_do_checkpoint(journal);
				1748	mutex_unlock(&journal->j_checkpoint_mutex);
				1749	/*
				1750	* If checkpointing failed, just free the buffers to avoid
				1751	* looping forever
				1752	*/
				1753	if (err) {
				1754	jbd2_journal_destroy_checkpoint(journal);
				1755	spin_lock(&journal->j_list_lock);
				1756	break;
				1757	}
				1758	spin_lock(&journal->j_list_lock);
				1759	}
				1760
				1761	J_ASSERT(journal->j_running_transaction == NULL);
				1762	J_ASSERT(journal->j_committing_transaction == NULL);
				1763	J_ASSERT(journal->j_checkpoint_transactions == NULL);
				1764	spin_unlock(&journal->j_list_lock);
				1765
				1766	if (journal->j_sb_buffer) {
				1767	if (!is_journal_aborted(journal)) {
				1768	mutex_lock_io(&journal->j_checkpoint_mutex);
				1769
				1770	write_lock(&journal->j_state_lock);
				1771	journal->j_tail_sequence =
				1772	++journal->j_transaction_sequence;
				1773	write_unlock(&journal->j_state_lock);
				1774
				1775	jbd2_mark_journal_empty(journal,
				1776	REQ_SYNC \| REQ_PREFLUSH \| REQ_FUA);
				1777	mutex_unlock(&journal->j_checkpoint_mutex);
				1778	} else
				1779	err = -EIO;
				1780	brelse(journal->j_sb_buffer);
				1781	}
				1782
				1783	if (journal->j_proc_entry)
				1784	jbd2_stats_proc_exit(journal);
				1785	iput(journal->j_inode);
				1786	if (journal->j_revoke)
				1787	jbd2_journal_destroy_revoke(journal);
				1788	if (journal->j_chksum_driver)
				1789	crypto_free_shash(journal->j_chksum_driver);
				1790	kfree(journal->j_wbuf);
				1791	kfree(journal);
				1792
				1793	return err;
				1794	}
				1795
				1796
				1797	/**
				1798	* jbd2_journal_check_used_features() - Check if features specified are used.
				1799	* @journal: Journal to check.
				1800	* @compat: bitmask of compatible features
				1801	* @ro: bitmask of features that force read-only mount
				1802	* @incompat: bitmask of incompatible features
				1803	*
				1804	* Check whether the journal uses all of a given set of
				1805	* features. Return true (non-zero) if it does.
				1806	**/
				1807
				1808	int jbd2_journal_check_used_features(journal_t *journal, unsigned long compat,
				1809	unsigned long ro, unsigned long incompat)
				1810	{
				1811	journal_superblock_t *sb;
				1812
				1813	if (!compat && !ro && !incompat)
				1814	return 1;
				1815	/* Load journal superblock if it is not loaded yet. */
				1816	if (journal->j_format_version == 0 &&
				1817	journal_get_superblock(journal) != 0)
				1818	return 0;
				1819	if (journal->j_format_version == 1)
				1820	return 0;
				1821
				1822	sb = journal->j_superblock;
				1823
				1824	if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) &&
				1825	((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) &&
				1826	((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat))
				1827	return 1;
				1828
				1829	return 0;
				1830	}
				1831
				1832	/**
				1833	* jbd2_journal_check_available_features() - Check feature set in journalling layer
				1834	* @journal: Journal to check.
				1835	* @compat: bitmask of compatible features
				1836	* @ro: bitmask of features that force read-only mount
				1837	* @incompat: bitmask of incompatible features
				1838	*
				1839	* Check whether the journaling code supports the use of
				1840	* all of a given set of features on this journal. Return true
				1841	* (non-zero) if it can. */
				1842
				1843	int jbd2_journal_check_available_features(journal_t *journal, unsigned long compat,
				1844	unsigned long ro, unsigned long incompat)
				1845	{
				1846	if (!compat && !ro && !incompat)
				1847	return 1;
				1848
				1849	/* We can support any known requested features iff the
				1850	* superblock is in version 2. Otherwise we fail to support any
				1851	* extended sb features. */
				1852
				1853	if (journal->j_format_version != 2)
				1854	return 0;
				1855
				1856	if ((compat & JBD2_KNOWN_COMPAT_FEATURES) == compat &&
				1857	(ro & JBD2_KNOWN_ROCOMPAT_FEATURES) == ro &&
				1858	(incompat & JBD2_KNOWN_INCOMPAT_FEATURES) == incompat)
				1859	return 1;
				1860
				1861	return 0;
				1862	}
				1863
				1864	/**
				1865	* jbd2_journal_set_features() - Mark a given journal feature in the superblock
				1866	* @journal: Journal to act on.
				1867	* @compat: bitmask of compatible features
				1868	* @ro: bitmask of features that force read-only mount
				1869	* @incompat: bitmask of incompatible features
				1870	*
				1871	* Mark a given journal feature as present on the
				1872	* superblock. Returns true if the requested features could be set.
				1873	*
				1874	*/
				1875
				1876	int jbd2_journal_set_features(journal_t *journal, unsigned long compat,
				1877	unsigned long ro, unsigned long incompat)
				1878	{
				1879	#define INCOMPAT_FEATURE_ON(f) \
				1880	((incompat & (f)) && !(sb->s_feature_incompat & cpu_to_be32(f)))
				1881	#define COMPAT_FEATURE_ON(f) \
				1882	((compat & (f)) && !(sb->s_feature_compat & cpu_to_be32(f)))
				1883	journal_superblock_t *sb;
				1884
				1885	if (jbd2_journal_check_used_features(journal, compat, ro, incompat))
				1886	return 1;
				1887
				1888	if (!jbd2_journal_check_available_features(journal, compat, ro, incompat))
				1889	return 0;
				1890
				1891	/* If enabling v2 checksums, turn on v3 instead */
				1892	if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2) {
				1893	incompat &= ~JBD2_FEATURE_INCOMPAT_CSUM_V2;
				1894	incompat \|= JBD2_FEATURE_INCOMPAT_CSUM_V3;
				1895	}
				1896
				1897	/* Asking for checksumming v3 and v1? Only give them v3. */
				1898	if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V3 &&
				1899	compat & JBD2_FEATURE_COMPAT_CHECKSUM)
				1900	compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM;
				1901
				1902	jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
				1903	compat, ro, incompat);
				1904
				1905	sb = journal->j_superblock;
				1906
				1907	/* Load the checksum driver if necessary */
				1908	if ((journal->j_chksum_driver == NULL) &&
				1909	INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
				1910	journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
				1911	if (IS_ERR(journal->j_chksum_driver)) {
				1912	printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
				1913	journal->j_chksum_driver = NULL;
				1914	return 0;
				1915	}
				1916	/* Precompute checksum seed for all metadata */
				1917	journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
				1918	sizeof(sb->s_uuid));
				1919	}
				1920
				1921	lock_buffer(journal->j_sb_buffer);
				1922
				1923	/* If enabling v3 checksums, update superblock */
				1924	if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
				1925	sb->s_checksum_type = JBD2_CRC32C_CHKSUM;
				1926	sb->s_feature_compat &=
				1927	~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM);
				1928	}
				1929
				1930	/* If enabling v1 checksums, downgrade superblock */
				1931	if (COMPAT_FEATURE_ON(JBD2_FEATURE_COMPAT_CHECKSUM))
				1932	sb->s_feature_incompat &=
				1933	~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2 \|
				1934	JBD2_FEATURE_INCOMPAT_CSUM_V3);
				1935
				1936	sb->s_feature_compat \|= cpu_to_be32(compat);
				1937	sb->s_feature_ro_compat \|= cpu_to_be32(ro);
				1938	sb->s_feature_incompat \|= cpu_to_be32(incompat);
				1939	unlock_buffer(journal->j_sb_buffer);
				1940
				1941	return 1;
				1942	#undef COMPAT_FEATURE_ON
				1943	#undef INCOMPAT_FEATURE_ON
				1944	}
				1945
				1946	/*
				1947	* jbd2_journal_clear_features() - Clear a given journal feature in the
				1948	* superblock
				1949	* @journal: Journal to act on.
				1950	* @compat: bitmask of compatible features
				1951	* @ro: bitmask of features that force read-only mount
				1952	* @incompat: bitmask of incompatible features
				1953	*
				1954	* Clear a given journal feature as present on the
				1955	* superblock.
				1956	*/
				1957	void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
				1958	unsigned long ro, unsigned long incompat)
				1959	{
				1960	journal_superblock_t *sb;
				1961
				1962	jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n",
				1963	compat, ro, incompat);
				1964
				1965	sb = journal->j_superblock;
				1966
				1967	sb->s_feature_compat &= ~cpu_to_be32(compat);
				1968	sb->s_feature_ro_compat &= ~cpu_to_be32(ro);
				1969	sb->s_feature_incompat &= ~cpu_to_be32(incompat);
				1970	}
				1971	EXPORT_SYMBOL(jbd2_journal_clear_features);
				1972
				1973	/**
				1974	* jbd2_journal_flush() - Flush journal
				1975	* @journal: Journal to act on.
				1976	*
				1977	* Flush all data for a given journal to disk and empty the journal.
				1978	* Filesystems can use this when remounting readonly to ensure that
				1979	* recovery does not need to happen on remount.
				1980	*/
				1981
				1982	int jbd2_journal_flush(journal_t *journal)
				1983	{
				1984	int err = 0;
				1985	transaction_t *transaction = NULL;
				1986
				1987	write_lock(&journal->j_state_lock);
				1988
				1989	/* Force everything buffered to the log... */
				1990	if (journal->j_running_transaction) {
				1991	transaction = journal->j_running_transaction;
				1992	__jbd2_log_start_commit(journal, transaction->t_tid);
				1993	} else if (journal->j_committing_transaction)
				1994	transaction = journal->j_committing_transaction;
				1995
				1996	/* Wait for the log commit to complete... */
				1997	if (transaction) {
				1998	tid_t tid = transaction->t_tid;
				1999
				2000	write_unlock(&journal->j_state_lock);
				2001	jbd2_log_wait_commit(journal, tid);
				2002	} else {
				2003	write_unlock(&journal->j_state_lock);
				2004	}
				2005
				2006	/* ...and flush everything in the log out to disk. */
				2007	spin_lock(&journal->j_list_lock);
				2008	while (!err && journal->j_checkpoint_transactions != NULL) {
				2009	spin_unlock(&journal->j_list_lock);
				2010	mutex_lock_io(&journal->j_checkpoint_mutex);
				2011	err = jbd2_log_do_checkpoint(journal);
				2012	mutex_unlock(&journal->j_checkpoint_mutex);
				2013	spin_lock(&journal->j_list_lock);
				2014	}
				2015	spin_unlock(&journal->j_list_lock);
				2016
				2017	if (is_journal_aborted(journal))
				2018	return -EIO;
				2019
				2020	mutex_lock_io(&journal->j_checkpoint_mutex);
				2021	if (!err) {
				2022	err = jbd2_cleanup_journal_tail(journal);
				2023	if (err < 0) {
				2024	mutex_unlock(&journal->j_checkpoint_mutex);
				2025	goto out;
				2026	}
				2027	err = 0;
				2028	}
				2029
				2030	/* Finally, mark the journal as really needing no recovery.
				2031	* This sets s_start==0 in the underlying superblock, which is
				2032	* the magic code for a fully-recovered superblock. Any future
				2033	* commits of data to the journal will restore the current
				2034	* s_start value. */
				2035	jbd2_mark_journal_empty(journal, REQ_SYNC \| REQ_FUA);
				2036	mutex_unlock(&journal->j_checkpoint_mutex);
				2037	write_lock(&journal->j_state_lock);
				2038	J_ASSERT(!journal->j_running_transaction);
				2039	J_ASSERT(!journal->j_committing_transaction);
				2040	J_ASSERT(!journal->j_checkpoint_transactions);
				2041	J_ASSERT(journal->j_head == journal->j_tail);
				2042	J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
				2043	write_unlock(&journal->j_state_lock);
				2044	out:
				2045	return err;
				2046	}
				2047
				2048	/**
				2049	* jbd2_journal_wipe() - Wipe journal contents
				2050	* @journal: Journal to act on.
				2051	* @write: flag (see below)
				2052	*
				2053	* Wipe out all of the contents of a journal, safely. This will produce
				2054	* a warning if the journal contains any valid recovery information.
				2055	* Must be called between journal_init_*() and jbd2_journal_load().
				2056	*
				2057	* If 'write' is non-zero, then we wipe out the journal on disk; otherwise
				2058	* we merely suppress recovery.
				2059	*/
				2060
				2061	int jbd2_journal_wipe(journal_t *journal, int write)
				2062	{
				2063	int err = 0;
				2064
				2065	J_ASSERT (!(journal->j_flags & JBD2_LOADED));
				2066
				2067	err = load_superblock(journal);
				2068	if (err)
				2069	return err;
				2070
				2071	if (!journal->j_tail)
				2072	goto no_recovery;
				2073
				2074	printk(KERN_WARNING "JBD2: %s recovery information on journal\n",
				2075	write ? "Clearing" : "Ignoring");
				2076
				2077	err = jbd2_journal_skip_recovery(journal);
				2078	if (write) {
				2079	/* Lock to make assertions happy... */
				2080	mutex_lock_io(&journal->j_checkpoint_mutex);
				2081	jbd2_mark_journal_empty(journal, REQ_SYNC \| REQ_FUA);
				2082	mutex_unlock(&journal->j_checkpoint_mutex);
				2083	}
				2084
				2085	no_recovery:
				2086	return err;
				2087	}
				2088
				2089	/**
				2090	* jbd2_journal_abort () - Shutdown the journal immediately.
				2091	* @journal: the journal to shutdown.
				2092	* @errno: an error number to record in the journal indicating
				2093	* the reason for the shutdown.
				2094	*
				2095	* Perform a complete, immediate shutdown of the ENTIRE
				2096	* journal (not of a single transaction). This operation cannot be
				2097	* undone without closing and reopening the journal.
				2098	*
				2099	* The jbd2_journal_abort function is intended to support higher level error
				2100	* recovery mechanisms such as the ext2/ext3 remount-readonly error
				2101	* mode.
				2102	*
				2103	* Journal abort has very specific semantics. Any existing dirty,
				2104	* unjournaled buffers in the main filesystem will still be written to
				2105	* disk by bdflush, but the journaling mechanism will be suspended
				2106	* immediately and no further transaction commits will be honoured.
				2107	*
				2108	* Any dirty, journaled buffers will be written back to disk without
				2109	* hitting the journal. Atomicity cannot be guaranteed on an aborted
				2110	* filesystem, but we _do_ attempt to leave as much data as possible
				2111	* behind for fsck to use for cleanup.
				2112	*
				2113	* Any attempt to get a new transaction handle on a journal which is in
				2114	* ABORT state will just result in an -EROFS error return. A
				2115	* jbd2_journal_stop on an existing handle will return -EIO if we have
				2116	* entered abort state during the update.
				2117	*
				2118	* Recursive transactions are not disturbed by journal abort until the
				2119	* final jbd2_journal_stop, which will receive the -EIO error.
				2120	*
				2121	* Finally, the jbd2_journal_abort call allows the caller to supply an errno
				2122	* which will be recorded (if possible) in the journal superblock. This
				2123	* allows a client to record failure conditions in the middle of a
				2124	* transaction without having to complete the transaction to record the
				2125	* failure to disk. ext3_error, for example, now uses this
				2126	* functionality.
				2127	*
				2128	*/
				2129
				2130	void jbd2_journal_abort(journal_t *journal, int errno)
				2131	{
				2132	transaction_t *transaction;
				2133
				2134	/*
				2135	* ESHUTDOWN always takes precedence because a file system check
				2136	* caused by any other journal abort error is not required after
				2137	* a shutdown triggered.
				2138	*/
				2139	write_lock(&journal->j_state_lock);
				2140	if (journal->j_flags & JBD2_ABORT) {
				2141	int old_errno = journal->j_errno;
				2142
				2143	write_unlock(&journal->j_state_lock);
				2144	if (old_errno != -ESHUTDOWN && errno == -ESHUTDOWN) {
				2145	journal->j_errno = errno;
				2146	jbd2_journal_update_sb_errno(journal);
				2147	}
				2148	return;
				2149	}
				2150
				2151	/*
				2152	* Mark the abort as occurred and start current running transaction
				2153	* to release all journaled buffer.
				2154	*/
				2155	pr_err("Aborting journal on device %s.\n", journal->j_devname);
				2156
				2157	journal->j_flags \|= JBD2_ABORT;
				2158	journal->j_errno = errno;
				2159	transaction = journal->j_running_transaction;
				2160	if (transaction)
				2161	__jbd2_log_start_commit(journal, transaction->t_tid);
				2162	write_unlock(&journal->j_state_lock);
				2163
				2164	/*
				2165	* Record errno to the journal super block, so that fsck and jbd2
				2166	* layer could realise that a filesystem check is needed.
				2167	*/
				2168	jbd2_journal_update_sb_errno(journal);
				2169
				2170	write_lock(&journal->j_state_lock);
				2171	journal->j_flags \|= JBD2_REC_ERR;
				2172	write_unlock(&journal->j_state_lock);
				2173	}
				2174
				2175	/**
				2176	* jbd2_journal_errno() - returns the journal's error state.
				2177	* @journal: journal to examine.
				2178	*
				2179	* This is the errno number set with jbd2_journal_abort(), the last
				2180	* time the journal was mounted - if the journal was stopped
				2181	* without calling abort this will be 0.
				2182	*
				2183	* If the journal has been aborted on this mount time -EROFS will
				2184	* be returned.
				2185	*/
				2186	int jbd2_journal_errno(journal_t *journal)
				2187	{
				2188	int err;
				2189
				2190	read_lock(&journal->j_state_lock);
				2191	if (journal->j_flags & JBD2_ABORT)
				2192	err = -EROFS;
				2193	else
				2194	err = journal->j_errno;
				2195	read_unlock(&journal->j_state_lock);
				2196	return err;
				2197	}
				2198
				2199	/**
				2200	* jbd2_journal_clear_err() - clears the journal's error state
				2201	* @journal: journal to act on.
				2202	*
				2203	* An error must be cleared or acked to take a FS out of readonly
				2204	* mode.
				2205	*/
				2206	int jbd2_journal_clear_err(journal_t *journal)
				2207	{
				2208	int err = 0;
				2209
				2210	write_lock(&journal->j_state_lock);
				2211	if (journal->j_flags & JBD2_ABORT)
				2212	err = -EROFS;
				2213	else
				2214	journal->j_errno = 0;
				2215	write_unlock(&journal->j_state_lock);
				2216	return err;
				2217	}
				2218
				2219	/**
				2220	* jbd2_journal_ack_err() - Ack journal err.
				2221	* @journal: journal to act on.
				2222	*
				2223	* An error must be cleared or acked to take a FS out of readonly
				2224	* mode.
				2225	*/
				2226	void jbd2_journal_ack_err(journal_t *journal)
				2227	{
				2228	write_lock(&journal->j_state_lock);
				2229	if (journal->j_errno)
				2230	journal->j_flags \|= JBD2_ACK_ERR;
				2231	write_unlock(&journal->j_state_lock);
				2232	}
				2233
				2234	int jbd2_journal_blocks_per_page(struct inode *inode)
				2235	{
				2236	return 1 << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
				2237	}
				2238
				2239	/*
				2240	* helper functions to deal with 32 or 64bit block numbers.
				2241	*/
				2242	size_t journal_tag_bytes(journal_t *journal)
				2243	{
				2244	size_t sz;
				2245
				2246	if (jbd2_has_feature_csum3(journal))
				2247	return sizeof(journal_block_tag3_t);
				2248
				2249	sz = sizeof(journal_block_tag_t);
				2250
				2251	if (jbd2_has_feature_csum2(journal))
				2252	sz += sizeof(__u16);
				2253
				2254	if (jbd2_has_feature_64bit(journal))
				2255	return sz;
				2256	else
				2257	return sz - sizeof(__u32);
				2258	}
				2259
				2260	/*
				2261	* JBD memory management
				2262	*
				2263	* These functions are used to allocate block-sized chunks of memory
				2264	* used for making copies of buffer_head data. Very often it will be
				2265	* page-sized chunks of data, but sometimes it will be in
				2266	* sub-page-size chunks. (For example, 16k pages on Power systems
				2267	* with a 4k block file system.) For blocks smaller than a page, we
				2268	* use a SLAB allocator. There are slab caches for each block size,
				2269	* which are allocated at mount time, if necessary, and we only free
				2270	* (all of) the slab caches when/if the jbd2 module is unloaded. For
				2271	* this reason we don't need a mutex to protect access to
				2272	* jbd2_slab[] when allocating or releasing memory; only in
				2273	* jbd2_journal_create_slab().
				2274	*/
				/* Number of slab caches: one per power-of-two block size, 1k through 128k. */
				#define JBD2_MAX_SLABS 8
				/* Slot i holds the cache for (1 << (i + 10))-byte blocks, or NULL. */
				static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];

				/* Cache names, indexed like jbd2_slab[]; the table is never modified. */
				static const char * const jbd2_slab_names[JBD2_MAX_SLABS] = {
					"jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
					"jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k"
				};
				2282
				2283
				2284	static void jbd2_journal_destroy_slabs(void)
				2285	{
				2286	int i;
				2287
				2288	for (i = 0; i < JBD2_MAX_SLABS; i++) {
				2289	kmem_cache_destroy(jbd2_slab[i]);
				2290	jbd2_slab[i] = NULL;
				2291	}
				2292	}
				2293
				2294	static int jbd2_journal_create_slab(size_t size)
				2295	{
				2296	static DEFINE_MUTEX(jbd2_slab_create_mutex);
				2297	int i = order_base_2(size) - 10;
				2298	size_t slab_size;
				2299
				2300	if (size == PAGE_SIZE)
				2301	return 0;
				2302
				2303	if (i >= JBD2_MAX_SLABS)
				2304	return -EINVAL;
				2305
				2306	if (unlikely(i < 0))
				2307	i = 0;
				2308	mutex_lock(&jbd2_slab_create_mutex);
				2309	if (jbd2_slab[i]) {
				2310	mutex_unlock(&jbd2_slab_create_mutex);
				2311	return 0; /* Already created */
				2312	}
				2313
				2314	slab_size = 1 << (i+10);
				2315	jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size,
				2316	slab_size, 0, NULL);
				2317	mutex_unlock(&jbd2_slab_create_mutex);
				2318	if (!jbd2_slab[i]) {
				2319	printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
				2320	return -ENOMEM;
				2321	}
				2322	return 0;
				2323	}
				2324
				2325	static struct kmem_cache *get_slab(size_t size)
				2326	{
				2327	int i = order_base_2(size) - 10;
				2328
				2329	BUG_ON(i >= JBD2_MAX_SLABS);
				2330	if (unlikely(i < 0))
				2331	i = 0;
				2332	BUG_ON(jbd2_slab[i] == NULL);
				2333	return jbd2_slab[i];
				2334	}
				2335
				2336	void *jbd2_alloc(size_t size, gfp_t flags)
				2337	{
				2338	void *ptr;
				2339
				2340	BUG_ON(size & (size-1)); /* Must be a power of 2 */
				2341
				2342	if (size < PAGE_SIZE)
				2343	ptr = kmem_cache_alloc(get_slab(size), flags);
				2344	else
				2345	ptr = (void *)__get_free_pages(flags, get_order(size));
				2346
				2347	/* Check alignment; SLUB has gotten this wrong in the past,
				2348	* and this can lead to user data corruption! */
				2349	BUG_ON(((unsigned long) ptr) & (size-1));
				2350
				2351	return ptr;
				2352	}
				2353
				2354	void jbd2_free(void *ptr, size_t size)
				2355	{
				2356	if (size < PAGE_SIZE)
				2357	kmem_cache_free(get_slab(size), ptr);
				2358	else
				2359	free_pages((unsigned long)ptr, get_order(size));
				2360	};
				2361
				2362	/*
				2363	* Journal_head storage management
				2364	*/
				2365	static struct kmem_cache *jbd2_journal_head_cache;
				2366	#ifdef CONFIG_JBD2_DEBUG
				2367	static atomic_t nr_journal_heads = ATOMIC_INIT(0);
				2368	#endif
				2369
				2370	static int __init jbd2_journal_init_journal_head_cache(void)
				2371	{
				2372	J_ASSERT(!jbd2_journal_head_cache);
				2373	jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head",
				2374	sizeof(struct journal_head),
				2375	0, /* offset */
				2376	SLAB_TEMPORARY \| SLAB_TYPESAFE_BY_RCU,
				2377	NULL); /* ctor */
				2378	if (!jbd2_journal_head_cache) {
				2379	printk(KERN_EMERG "JBD2: no memory for journal_head cache\n");
				2380	return -ENOMEM;
				2381	}
				2382	return 0;
				2383	}
				2384
				2385	static void jbd2_journal_destroy_journal_head_cache(void)
				2386	{
				2387	kmem_cache_destroy(jbd2_journal_head_cache);
				2388	jbd2_journal_head_cache = NULL;
				2389	}
				2390
				2391	/*
				2392	* journal_head splicing and dicing
				2393	*/
				2394	static struct journal_head *journal_alloc_journal_head(void)
				2395	{
				2396	struct journal_head *ret;
				2397
				2398	#ifdef CONFIG_JBD2_DEBUG
				2399	atomic_inc(&nr_journal_heads);
				2400	#endif
				2401	ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
				2402	if (!ret) {
				2403	jbd_debug(1, "out of memory for journal_head\n");
				2404	pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
				2405	ret = kmem_cache_zalloc(jbd2_journal_head_cache,
				2406	GFP_NOFS \| __GFP_NOFAIL);
				2407	}
				2408	return ret;
				2409	}
				2410
				2411	static void journal_free_journal_head(struct journal_head *jh)
				2412	{
				2413	#ifdef CONFIG_JBD2_DEBUG
				2414	atomic_dec(&nr_journal_heads);
				2415	memset(jh, JBD2_POISON_FREE, sizeof(*jh));
				2416	#endif
				2417	kmem_cache_free(jbd2_journal_head_cache, jh);
				2418	}
				2419
				2420	/*
				2421	* A journal_head is attached to a buffer_head whenever JBD has an
				2422	* interest in the buffer.
				2423	*
				2424	* Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit
				2425	* is set. This bit is tested in core kernel code where we need to take
				2426	* JBD-specific actions. Testing the zeroness of ->b_private is not reliable
				2427	* there.
				2428	*
				2429	* When a buffer has its BH_JBD bit set, its ->b_count is elevated by one.
				2430	*
				2431	* When a buffer has its BH_JBD bit set it is immune from being released by
				2432	* core kernel code, mainly via ->b_count.
				2433	*
				2434	* A journal_head is detached from its buffer_head when the journal_head's
				2435	* b_jcount reaches zero. Running transaction (b_transaction) and checkpoint
				2436	* transaction (b_cp_transaction) hold their references to b_jcount.
				2437	*
				2438	* Various places in the kernel want to attach a journal_head to a buffer_head
				2439	* _before_ attaching the journal_head to a transaction. To protect the
				2440	* journal_head in this situation, jbd2_journal_add_journal_head elevates the
				2441	* journal_head's b_jcount refcount by one. The caller must call
				2442	* jbd2_journal_put_journal_head() to undo this.
				2443	*
				2444	* So the typical usage would be:
				2445	*
				2446	* (Attach a journal_head if needed. Increments b_jcount)
				2447	* struct journal_head *jh = jbd2_journal_add_journal_head(bh);
				2448	* ...
				2449	* (Get another reference for transaction)
				2450	* jbd2_journal_grab_journal_head(bh);
				2451	* jh->b_transaction = xxx;
				2452	* (Put original reference)
				2453	* jbd2_journal_put_journal_head(jh);
				2454	*/
				2455
				2456	/*
				2457	* Give a buffer_head a journal_head.
				2458	*
				2459	* May sleep.
				2460	*/
				2461	struct journal_head jbd2_journal_add_journal_head(struct buffer_head bh)
				2462	{
				2463	struct journal_head *jh;
				2464	struct journal_head *new_jh = NULL;
				2465
				2466	repeat:
				2467	if (!buffer_jbd(bh))
				2468	new_jh = journal_alloc_journal_head();
				2469
				2470	jbd_lock_bh_journal_head(bh);
				2471	if (buffer_jbd(bh)) {
				2472	jh = bh2jh(bh);
				2473	} else {
				2474	J_ASSERT_BH(bh,
				2475	(atomic_read(&bh->b_count) > 0) \|\|
				2476	(bh->b_page && bh->b_page->mapping));
				2477
				2478	if (!new_jh) {
				2479	jbd_unlock_bh_journal_head(bh);
				2480	goto repeat;
				2481	}
				2482
				2483	jh = new_jh;
				2484	new_jh = NULL; /* We consumed it */
				2485	set_buffer_jbd(bh);
				2486	bh->b_private = jh;
				2487	jh->b_bh = bh;
				2488	get_bh(bh);
				2489	BUFFER_TRACE(bh, "added journal_head");
				2490	}
				2491	jh->b_jcount++;
				2492	jbd_unlock_bh_journal_head(bh);
				2493	if (new_jh)
				2494	journal_free_journal_head(new_jh);
				2495	return bh->b_private;
				2496	}
				2497
				2498	/*
				2499	* Grab a ref against this buffer_head's journal_head. If it ended up not
				2500	* having a journal_head, return NULL
				2501	*/
				2502	struct journal_head jbd2_journal_grab_journal_head(struct buffer_head bh)
				2503	{
				2504	struct journal_head *jh = NULL;
				2505
				2506	jbd_lock_bh_journal_head(bh);
				2507	if (buffer_jbd(bh)) {
				2508	jh = bh2jh(bh);
				2509	jh->b_jcount++;
				2510	}
				2511	jbd_unlock_bh_journal_head(bh);
				2512	return jh;
				2513	}
				2514
				2515	static void __journal_remove_journal_head(struct buffer_head *bh)
				2516	{
				2517	struct journal_head *jh = bh2jh(bh);
				2518
				2519	J_ASSERT_JH(jh, jh->b_jcount >= 0);
				2520	J_ASSERT_JH(jh, jh->b_transaction == NULL);
				2521	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
				2522	J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
				2523	J_ASSERT_JH(jh, jh->b_jlist == BJ_None);
				2524	J_ASSERT_BH(bh, buffer_jbd(bh));
				2525	J_ASSERT_BH(bh, jh2bh(jh) == bh);
				2526	BUFFER_TRACE(bh, "remove journal_head");
				2527	if (jh->b_frozen_data) {
				2528	printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__);
				2529	jbd2_free(jh->b_frozen_data, bh->b_size);
				2530	}
				2531	if (jh->b_committed_data) {
				2532	printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__);
				2533	jbd2_free(jh->b_committed_data, bh->b_size);
				2534	}
				2535	bh->b_private = NULL;
				2536	jh->b_bh = NULL; /* debug, really */
				2537	clear_buffer_jbd(bh);
				2538	journal_free_journal_head(jh);
				2539	}
				2540
				2541	/*
				2542	* Drop a reference on the passed journal_head. If it fell to zero then
				2543	* release the journal_head from the buffer_head.
				2544	*/
				2545	void jbd2_journal_put_journal_head(struct journal_head *jh)
				2546	{
				2547	struct buffer_head *bh = jh2bh(jh);
				2548
				2549	jbd_lock_bh_journal_head(bh);
				2550	J_ASSERT_JH(jh, jh->b_jcount > 0);
				2551	--jh->b_jcount;
				2552	if (!jh->b_jcount) {
				2553	__journal_remove_journal_head(bh);
				2554	jbd_unlock_bh_journal_head(bh);
				2555	__brelse(bh);
				2556	} else
				2557	jbd_unlock_bh_journal_head(bh);
				2558	}
				2559
				2560	/*
				2561	* Initialize jbd inode head
				2562	*/
				2563	void jbd2_journal_init_jbd_inode(struct jbd2_inode jinode, struct inode inode)
				2564	{
				2565	jinode->i_transaction = NULL;
				2566	jinode->i_next_transaction = NULL;
				2567	jinode->i_vfs_inode = inode;
				2568	jinode->i_flags = 0;
				2569	jinode->i_dirty_start = 0;
				2570	jinode->i_dirty_end = 0;
				2571	INIT_LIST_HEAD(&jinode->i_list);
				2572	}
				2573
				2574	/*
				2575	* Function to be called before we start removing inode from memory (i.e.,
				2576	* clear_inode() is a fine place to be called from). It removes inode from
				2577	* transaction's lists.
				2578	*/
				2579	void jbd2_journal_release_jbd_inode(journal_t *journal,
				2580	struct jbd2_inode *jinode)
				2581	{
				2582	if (!journal)
				2583	return;
				2584	restart:
				2585	spin_lock(&journal->j_list_lock);
				2586	/* Is commit writing out inode - we have to wait */
				2587	if (jinode->i_flags & JI_COMMIT_RUNNING) {
				2588	wait_queue_head_t *wq;
				2589	DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
				2590	wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
				2591	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
				2592	spin_unlock(&journal->j_list_lock);
				2593	schedule();
				2594	finish_wait(wq, &wait.wq_entry);
				2595	goto restart;
				2596	}
				2597
				2598	if (jinode->i_transaction) {
				2599	list_del(&jinode->i_list);
				2600	jinode->i_transaction = NULL;
				2601	}
				2602	spin_unlock(&journal->j_list_lock);
				2603	}
				2604
				2605
				#ifdef CONFIG_PROC_FS

				#define JBD2_STATS_PROC_NAME "fs/jbd2"

				/* Create the JBD2_STATS_PROC_NAME proc directory and keep its handle. */
				static void __init jbd2_create_jbd_stats_proc_entry(void)
				{
					proc_jbd2_stats = proc_mkdir(JBD2_STATS_PROC_NAME, NULL);
				}

				/* Remove the proc directory, but only when proc_jbd2_stats is set. */
				static void __exit jbd2_remove_jbd_stats_proc_entry(void)
				{
					if (!proc_jbd2_stats)
						return;

					remove_proc_entry(JBD2_STATS_PROC_NAME, NULL);
				}

				#else

				/* Without CONFIG_PROC_FS both helpers compile away to nothing. */
				#define jbd2_create_jbd_stats_proc_entry() do {} while (0)
				#define jbd2_remove_jbd_stats_proc_entry() do {} while (0)

				#endif
				2627
				/* Slab caches for journal handles and for struct jbd2_inode objects. */
				struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache;
				2629
				2630	static int __init jbd2_journal_init_inode_cache(void)
				2631	{
				2632	J_ASSERT(!jbd2_inode_cache);
				2633	jbd2_inode_cache = KMEM_CACHE(jbd2_inode, 0);
				2634	if (!jbd2_inode_cache) {
				2635	pr_emerg("JBD2: failed to create inode cache\n");
				2636	return -ENOMEM;
				2637	}
				2638	return 0;
				2639	}
				2640
				2641	static int __init jbd2_journal_init_handle_cache(void)
				2642	{
				2643	J_ASSERT(!jbd2_handle_cache);
				2644	jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY);
				2645	if (!jbd2_handle_cache) {
				2646	printk(KERN_EMERG "JBD2: failed to create handle cache\n");
				2647	return -ENOMEM;
				2648	}
				2649	return 0;
				2650	}
				2651
				2652	static void jbd2_journal_destroy_inode_cache(void)
				2653	{
				2654	kmem_cache_destroy(jbd2_inode_cache);
				2655	jbd2_inode_cache = NULL;
				2656	}
				2657
				2658	static void jbd2_journal_destroy_handle_cache(void)
				2659	{
				2660	kmem_cache_destroy(jbd2_handle_cache);
				2661	jbd2_handle_cache = NULL;
				2662	}
				2663
				2664	/*
				2665	* Module startup and shutdown
				2666	*/
				2667
				2668	static int __init journal_init_caches(void)
				2669	{
				2670	int ret;
				2671
				2672	ret = jbd2_journal_init_revoke_record_cache();
				2673	if (ret == 0)
				2674	ret = jbd2_journal_init_revoke_table_cache();
				2675	if (ret == 0)
				2676	ret = jbd2_journal_init_journal_head_cache();
				2677	if (ret == 0)
				2678	ret = jbd2_journal_init_handle_cache();
				2679	if (ret == 0)
				2680	ret = jbd2_journal_init_inode_cache();
				2681	if (ret == 0)
				2682	ret = jbd2_journal_init_transaction_cache();
				2683	return ret;
				2684	}
				2685
				/* Destroy every jbd2 slab cache, including the per-block-size slabs. */
				static void jbd2_journal_destroy_caches(void)
				{
					jbd2_journal_destroy_revoke_record_cache();
					jbd2_journal_destroy_revoke_table_cache();
					jbd2_journal_destroy_journal_head_cache();
					jbd2_journal_destroy_handle_cache();
					jbd2_journal_destroy_inode_cache();
					jbd2_journal_destroy_transaction_cache();
					jbd2_journal_destroy_slabs();
				}
				2696
				2697	static int __init journal_init(void)
				2698	{
				2699	int ret;
				2700
				2701	BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024);
				2702
				2703	ret = journal_init_caches();
				2704	if (ret == 0) {
				2705	jbd2_create_jbd_stats_proc_entry();
				2706	} else {
				2707	jbd2_journal_destroy_caches();
				2708	}
				2709	return ret;
				2710	}
				2711
				2712	static void __exit journal_exit(void)
				2713	{
				2714	#ifdef CONFIG_JBD2_DEBUG
				2715	int n = atomic_read(&nr_journal_heads);
				2716	if (n)
				2717	printk(KERN_ERR "JBD2: leaked %d journal_heads!\n", n);
				2718	#endif
				2719	jbd2_remove_jbd_stats_proc_entry();
				2720	jbd2_journal_destroy_caches();
				2721	}
				2722
				2723	MODULE_LICENSE("GPL");
				2724	module_init(journal_init);
				2725	module_exit(journal_exit);
				2726