Blame - src/kernel/linux/v4.14/fs/jbd2/commit.c - T103

blob: 6870103a0f59cfdcc6f649ba910fe1767e89f42c [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame^]	1	/*
				2	* linux/fs/jbd2/commit.c
				3	*
				4	* Written by Stephen C. Tweedie <sct@redhat.com>, 1998
				5	*
				6	* Copyright 1998 Red Hat corp --- All Rights Reserved
				7	*
				8	* This file is part of the Linux kernel and is made available under
				9	* the terms of the GNU General Public License, version 2, or at your
				10	* option, any later version, incorporated herein by reference.
				11	*
				12	* Journal commit routines for the generic filesystem journaling code;
				13	* part of the ext2fs journaling system.
				14	*/
				15
				16	#include <linux/time.h>
				17	#include <linux/fs.h>
				18	#include <linux/jbd2.h>
				19	#include <linux/errno.h>
				20	#include <linux/slab.h>
				21	#include <linux/mm.h>
				22	#include <linux/pagemap.h>
				23	#include <linux/jiffies.h>
				24	#include <linux/crc32.h>
				25	#include <linux/writeback.h>
				26	#include <linux/backing-dev.h>
				27	#include <linux/bio.h>
				28	#include <linux/blkdev.h>
				29	#include <linux/bitops.h>
				30	#include <trace/events/jbd2.h>
				31
				32	/*
				33	* IO end handler for temporary buffer_heads handling writes to the journal.
				34	*/
				35	static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
				36	{
				37	struct buffer_head *orig_bh = bh->b_private;
				38
				39	BUFFER_TRACE(bh, "");
				40	if (uptodate)
				41	set_buffer_uptodate(bh);
				42	else
				43	clear_buffer_uptodate(bh);
				44	if (orig_bh) {
				45	clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
				46	smp_mb__after_atomic();
				47	wake_up_bit(&orig_bh->b_state, BH_Shadow);
				48	}
				49	unlock_buffer(bh);
				50	}
				51
				52	/*
				53	* When an ext4 file is truncated, it is possible that some pages are not
				54	* successfully freed, because they are attached to a committing transaction.
				55	* After the transaction commits, these pages are left on the LRU, with no
				56	* ->mapping, and with attached buffers. These pages are trivially reclaimable
				57	* by the VM, but their apparent absence upsets the VM accounting, and it makes
				58	* the numbers in /proc/meminfo look odd.
				59	*
				60	* So here, we have a buffer which has just come off the forget list. Look to
				61	* see if we can strip all buffers from the backing page.
				62	*
				63	* Called under lock_journal(), and possibly under journal_datalist_lock. The
				64	* caller provided us with a ref against the buffer, and we drop that here.
				65	*/
				66	static void release_buffer_page(struct buffer_head *bh)
				67	{
				68	struct page *page;
				69
				70	if (buffer_dirty(bh))
				71	goto nope;
				72	if (atomic_read(&bh->b_count) != 1)
				73	goto nope;
				74	page = bh->b_page;
				75	if (!page)
				76	goto nope;
				77	if (page->mapping)
				78	goto nope;
				79
				80	/* OK, it's a truncated page */
				81	if (!trylock_page(page))
				82	goto nope;
				83
				84	get_page(page);
				85	__brelse(bh);
				86	try_to_free_buffers(page);
				87	unlock_page(page);
				88	put_page(page);
				89	return;
				90
				91	nope:
				92	__brelse(bh);
				93	}
				94
				95	static void jbd2_commit_block_csum_set(journal_t j, struct buffer_head bh)
				96	{
				97	struct commit_header *h;
				98	__u32 csum;
				99
				100	if (!jbd2_journal_has_csum_v2or3(j))
				101	return;
				102
				103	h = (struct commit_header *)(bh->b_data);
				104	h->h_chksum_type = 0;
				105	h->h_chksum_size = 0;
				106	h->h_chksum[0] = 0;
				107	csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
				108	h->h_chksum[0] = cpu_to_be32(csum);
				109	}
				110
				111	/*
				112	* Done it all: now submit the commit record. We should have
				113	* cleaned up our previous buffers by now, so if we are in abort
				114	* mode we can now just skip the rest of the journal write
				115	* entirely.
				116	*
				117	* Returns 1 if the journal needs to be aborted or 0 on success
				118	*/
				119	static int journal_submit_commit_record(journal_t *journal,
				120	transaction_t *commit_transaction,
				121	struct buffer_head **cbh,
				122	__u32 crc32_sum)
				123	{
				124	struct commit_header *tmp;
				125	struct buffer_head *bh;
				126	int ret;
				127	struct timespec64 now = current_kernel_time64();
				128
				129	*cbh = NULL;
				130
				131	if (is_journal_aborted(journal))
				132	return 0;
				133
				134	bh = jbd2_journal_get_descriptor_buffer(commit_transaction,
				135	JBD2_COMMIT_BLOCK);
				136	if (!bh)
				137	return 1;
				138
				139	tmp = (struct commit_header *)bh->b_data;
				140	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
				141	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
				142
				143	if (jbd2_has_feature_checksum(journal)) {
				144	tmp->h_chksum_type = JBD2_CRC32_CHKSUM;
				145	tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
				146	tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
				147	}
				148	jbd2_commit_block_csum_set(journal, bh);
				149
				150	BUFFER_TRACE(bh, "submit commit block");
				151	lock_buffer(bh);
				152	clear_buffer_dirty(bh);
				153	set_buffer_uptodate(bh);
				154	bh->b_end_io = journal_end_buffer_io_sync;
				155
				156	if (journal->j_flags & JBD2_BARRIER &&
				157	!jbd2_has_feature_async_commit(journal))
				158	ret = submit_bh(REQ_OP_WRITE,
				159	REQ_SYNC \| REQ_PREFLUSH \| REQ_FUA, bh);
				160	else
				161	ret = submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
				162
				163	*cbh = bh;
				164	return ret;
				165	}
				166
				167	/*
				168	* This function along with journal_submit_commit_record
				169	* allows to write the commit record asynchronously.
				170	*/
				171	static int journal_wait_on_commit_record(journal_t *journal,
				172	struct buffer_head *bh)
				173	{
				174	int ret = 0;
				175
				176	clear_buffer_dirty(bh);
				177	wait_on_buffer(bh);
				178
				179	if (unlikely(!buffer_uptodate(bh)))
				180	ret = -EIO;
				181	put_bh(bh); /* One for getblk() */
				182
				183	return ret;
				184	}
				185
				186	/*
				187	* write the filemap data using writepage() address_space_operations.
				188	* We don't do block allocation here even for delalloc. We don't
				189	* use writepages() because with dealyed allocation we may be doing
				190	* block allocation in writepages().
				191	*/
				192	static int journal_submit_inode_data_buffers(struct address_space *mapping,
				193	loff_t dirty_start, loff_t dirty_end)
				194	{
				195	int ret;
				196	struct writeback_control wbc = {
				197	.sync_mode = WB_SYNC_ALL,
				198	.nr_to_write = mapping->nrpages * 2,
				199	.range_start = dirty_start,
				200	.range_end = dirty_end,
				201	};
				202
				203	ret = generic_writepages(mapping, &wbc);
				204	return ret;
				205	}
				206
				207	/*
				208	* Submit all the data buffers of inode associated with the transaction to
				209	* disk.
				210	*
				211	* We are in a committing transaction. Therefore no new inode can be added to
				212	* our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
				213	* operate on from being released while we write out pages.
				214	*/
				215	static int journal_submit_data_buffers(journal_t *journal,
				216	transaction_t *commit_transaction)
				217	{
				218	struct jbd2_inode *jinode;
				219	int err, ret = 0;
				220	struct address_space *mapping;
				221
				222	spin_lock(&journal->j_list_lock);
				223	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
				224	loff_t dirty_start = jinode->i_dirty_start;
				225	loff_t dirty_end = jinode->i_dirty_end;
				226
				227	if (!(jinode->i_flags & JI_WRITE_DATA))
				228	continue;
				229	mapping = jinode->i_vfs_inode->i_mapping;
				230	jinode->i_flags \|= JI_COMMIT_RUNNING;
				231	spin_unlock(&journal->j_list_lock);
				232	/*
				233	* submit the inode data buffers. We use writepage
				234	* instead of writepages. Because writepages can do
				235	* block allocation with delalloc. We need to write
				236	* only allocated blocks here.
				237	*/
				238	trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
				239	err = journal_submit_inode_data_buffers(mapping, dirty_start,
				240	dirty_end);
				241	if (!ret)
				242	ret = err;
				243	spin_lock(&journal->j_list_lock);
				244	J_ASSERT(jinode->i_transaction == commit_transaction);
				245	jinode->i_flags &= ~JI_COMMIT_RUNNING;
				246	smp_mb();
				247	wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
				248	}
				249	spin_unlock(&journal->j_list_lock);
				250	return ret;
				251	}
				252
				253	/*
				254	* Wait for data submitted for writeout, refile inodes to proper
				255	* transaction if needed.
				256	*
				257	*/
				258	static int journal_finish_inode_data_buffers(journal_t *journal,
				259	transaction_t *commit_transaction)
				260	{
				261	struct jbd2_inode jinode, next_i;
				262	int err, ret = 0;
				263
				264	/* For locking, see the comment in journal_submit_data_buffers() */
				265	spin_lock(&journal->j_list_lock);
				266	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
				267	loff_t dirty_start = jinode->i_dirty_start;
				268	loff_t dirty_end = jinode->i_dirty_end;
				269
				270	if (!(jinode->i_flags & JI_WAIT_DATA))
				271	continue;
				272	jinode->i_flags \|= JI_COMMIT_RUNNING;
				273	spin_unlock(&journal->j_list_lock);
				274	err = filemap_fdatawait_range_keep_errors(
				275	jinode->i_vfs_inode->i_mapping, dirty_start,
				276	dirty_end);
				277	if (!ret)
				278	ret = err;
				279	spin_lock(&journal->j_list_lock);
				280	jinode->i_flags &= ~JI_COMMIT_RUNNING;
				281	smp_mb();
				282	wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
				283	}
				284
				285	/* Now refile inode to proper lists */
				286	list_for_each_entry_safe(jinode, next_i,
				287	&commit_transaction->t_inode_list, i_list) {
				288	list_del(&jinode->i_list);
				289	if (jinode->i_next_transaction) {
				290	jinode->i_transaction = jinode->i_next_transaction;
				291	jinode->i_next_transaction = NULL;
				292	list_add(&jinode->i_list,
				293	&jinode->i_transaction->t_inode_list);
				294	} else {
				295	jinode->i_transaction = NULL;
				296	jinode->i_dirty_start = 0;
				297	jinode->i_dirty_end = 0;
				298	}
				299	}
				300	spin_unlock(&journal->j_list_lock);
				301
				302	return ret;
				303	}
				304
				305	static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
				306	{
				307	struct page *page = bh->b_page;
				308	char *addr;
				309	__u32 checksum;
				310
				311	addr = kmap_atomic(page);
				312	checksum = crc32_be(crc32_sum,
				313	(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
				314	kunmap_atomic(addr);
				315
				316	return checksum;
				317	}
				318
				319	static void write_tag_block(journal_t j, journal_block_tag_t tag,
				320	unsigned long long block)
				321	{
				322	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
				323	if (jbd2_has_feature_64bit(j))
				324	tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
				325	}
				326
				327	static void jbd2_block_tag_csum_set(journal_t j, journal_block_tag_t tag,
				328	struct buffer_head *bh, __u32 sequence)
				329	{
				330	journal_block_tag3_t tag3 = (journal_block_tag3_t )tag;
				331	struct page *page = bh->b_page;
				332	__u8 *addr;
				333	__u32 csum32;
				334	__be32 seq;
				335
				336	if (!jbd2_journal_has_csum_v2or3(j))
				337	return;
				338
				339	seq = cpu_to_be32(sequence);
				340	addr = kmap_atomic(page);
				341	csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
				342	csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
				343	bh->b_size);
				344	kunmap_atomic(addr);
				345
				346	if (jbd2_has_feature_csum3(j))
				347	tag3->t_checksum = cpu_to_be32(csum32);
				348	else
				349	tag->t_checksum = cpu_to_be16(csum32);
				350	}
				351	/*
				352	* jbd2_journal_commit_transaction
				353	*
				354	* The primary function for committing a transaction to the log. This
				355	* function is called by the journal thread to begin a complete commit.
				356	*/
				357	void jbd2_journal_commit_transaction(journal_t *journal)
				358	{
				359	struct transaction_stats_s stats;
				360	transaction_t *commit_transaction;
				361	struct journal_head *jh;
				362	struct buffer_head *descriptor;
				363	struct buffer_head **wbuf = journal->j_wbuf;
				364	int bufs;
				365	int flags;
				366	int err;
				367	unsigned long long blocknr;
				368	ktime_t start_time;
				369	u64 commit_time;
				370	char *tagp = NULL;
				371	journal_block_tag_t *tag = NULL;
				372	int space_left = 0;
				373	int first_tag = 0;
				374	int tag_flag;
				375	int i;
				376	int tag_bytes = journal_tag_bytes(journal);
				377	struct buffer_head cbh = NULL; / For transactional checksums */
				378	__u32 crc32_sum = ~0;
				379	struct blk_plug plug;
				380	/* Tail of the journal */
				381	unsigned long first_block;
				382	tid_t first_tid;
				383	int update_tail;
				384	int csum_size = 0;
				385	LIST_HEAD(io_bufs);
				386	LIST_HEAD(log_bufs);
				387
				388	if (jbd2_journal_has_csum_v2or3(journal))
				389	csum_size = sizeof(struct jbd2_journal_block_tail);
				390
				391	/*
				392	* First job: lock down the current transaction and wait for
				393	* all outstanding updates to complete.
				394	*/
				395
				396	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
				397	if (journal->j_flags & JBD2_FLUSHED) {
				398	jbd_debug(3, "super block updated\n");
				399	mutex_lock_io(&journal->j_checkpoint_mutex);
				400	/*
				401	* We hold j_checkpoint_mutex so tail cannot change under us.
				402	* We don't need any special data guarantees for writing sb
				403	* since journal is empty and it is ok for write to be
				404	* flushed only with transaction commit.
				405	*/
				406	jbd2_journal_update_sb_log_tail(journal,
				407	journal->j_tail_sequence,
				408	journal->j_tail,
				409	REQ_SYNC);
				410	mutex_unlock(&journal->j_checkpoint_mutex);
				411	} else {
				412	jbd_debug(3, "superblock not updated\n");
				413	}
				414
				415	J_ASSERT(journal->j_running_transaction != NULL);
				416	J_ASSERT(journal->j_committing_transaction == NULL);
				417
				418	commit_transaction = journal->j_running_transaction;
				419
				420	trace_jbd2_start_commit(journal, commit_transaction);
				421	jbd_debug(1, "JBD2: starting commit of transaction %d\n",
				422	commit_transaction->t_tid);
				423
				424	write_lock(&journal->j_state_lock);
				425	J_ASSERT(commit_transaction->t_state == T_RUNNING);
				426	commit_transaction->t_state = T_LOCKED;
				427
				428	trace_jbd2_commit_locking(journal, commit_transaction);
				429	stats.run.rs_wait = commit_transaction->t_max_wait;
				430	stats.run.rs_request_delay = 0;
				431	stats.run.rs_locked = jiffies;
				432	if (commit_transaction->t_requested)
				433	stats.run.rs_request_delay =
				434	jbd2_time_diff(commit_transaction->t_requested,
				435	stats.run.rs_locked);
				436	stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
				437	stats.run.rs_locked);
				438
				439	spin_lock(&commit_transaction->t_handle_lock);
				440	while (atomic_read(&commit_transaction->t_updates)) {
				441	DEFINE_WAIT(wait);
				442
				443	prepare_to_wait(&journal->j_wait_updates, &wait,
				444	TASK_UNINTERRUPTIBLE);
				445	if (atomic_read(&commit_transaction->t_updates)) {
				446	spin_unlock(&commit_transaction->t_handle_lock);
				447	write_unlock(&journal->j_state_lock);
				448	schedule();
				449	write_lock(&journal->j_state_lock);
				450	spin_lock(&commit_transaction->t_handle_lock);
				451	}
				452	finish_wait(&journal->j_wait_updates, &wait);
				453	}
				454	spin_unlock(&commit_transaction->t_handle_lock);
				455
				456	J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
				457	journal->j_max_transaction_buffers);
				458
				459	/*
				460	* First thing we are allowed to do is to discard any remaining
				461	* BJ_Reserved buffers. Note, it is _not_ permissible to assume
				462	* that there are no such buffers: if a large filesystem
				463	* operation like a truncate needs to split itself over multiple
				464	* transactions, then it may try to do a jbd2_journal_restart() while
				465	* there are still BJ_Reserved buffers outstanding. These must
				466	* be released cleanly from the current transaction.
				467	*
				468	* In this case, the filesystem must still reserve write access
				469	* again before modifying the buffer in the new transaction, but
				470	* we do not require it to remember exactly which old buffers it
				471	* has reserved. This is consistent with the existing behaviour
				472	* that multiple jbd2_journal_get_write_access() calls to the same
				473	* buffer are perfectly permissible.
				474	*/
				475	while (commit_transaction->t_reserved_list) {
				476	jh = commit_transaction->t_reserved_list;
				477	JBUFFER_TRACE(jh, "reserved, unused: refile");
				478	/*
				479	* A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
				480	* leave undo-committed data.
				481	*/
				482	if (jh->b_committed_data) {
				483	struct buffer_head *bh = jh2bh(jh);
				484
				485	jbd_lock_bh_state(bh);
				486	jbd2_free(jh->b_committed_data, bh->b_size);
				487	jh->b_committed_data = NULL;
				488	jbd_unlock_bh_state(bh);
				489	}
				490	jbd2_journal_refile_buffer(journal, jh);
				491	}
				492
				493	/*
				494	* Now try to drop any written-back buffers from the journal's
				495	* checkpoint lists. We do this before commit because it potentially
				496	* frees some memory
				497	*/
				498	spin_lock(&journal->j_list_lock);
				499	__jbd2_journal_clean_checkpoint_list(journal, false);
				500	spin_unlock(&journal->j_list_lock);
				501
				502	jbd_debug(3, "JBD2: commit phase 1\n");
				503
				504	/*
				505	* Clear revoked flag to reflect there is no revoked buffers
				506	* in the next transaction which is going to be started.
				507	*/
				508	jbd2_clear_buffer_revoked_flags(journal);
				509
				510	/*
				511	* Switch to a new revoke table.
				512	*/
				513	jbd2_journal_switch_revoke_table(journal);
				514
				515	/*
				516	* Reserved credits cannot be claimed anymore, free them
				517	*/
				518	atomic_sub(atomic_read(&journal->j_reserved_credits),
				519	&commit_transaction->t_outstanding_credits);
				520
				521	trace_jbd2_commit_flushing(journal, commit_transaction);
				522	stats.run.rs_flushing = jiffies;
				523	stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
				524	stats.run.rs_flushing);
				525
				526	commit_transaction->t_state = T_FLUSH;
				527	journal->j_committing_transaction = commit_transaction;
				528	journal->j_running_transaction = NULL;
				529	start_time = ktime_get();
				530	commit_transaction->t_log_start = journal->j_head;
				531	wake_up(&journal->j_wait_transaction_locked);
				532	write_unlock(&journal->j_state_lock);
				533
				534	jbd_debug(3, "JBD2: commit phase 2a\n");
				535
				536	/*
				537	* Now start flushing things to disk, in the order they appear
				538	* on the transaction lists. Data blocks go first.
				539	*/
				540	err = journal_submit_data_buffers(journal, commit_transaction);
				541	if (err)
				542	jbd2_journal_abort(journal, err);
				543
				544	blk_start_plug(&plug);
				545	jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);
				546
				547	jbd_debug(3, "JBD2: commit phase 2b\n");
				548
				549	/*
				550	* Way to go: we have now written out all of the data for a
				551	* transaction! Now comes the tricky part: we need to write out
				552	* metadata. Loop over the transaction's entire buffer list:
				553	*/
				554	write_lock(&journal->j_state_lock);
				555	commit_transaction->t_state = T_COMMIT;
				556	write_unlock(&journal->j_state_lock);
				557
				558	trace_jbd2_commit_logging(journal, commit_transaction);
				559	stats.run.rs_logging = jiffies;
				560	stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
				561	stats.run.rs_logging);
				562	stats.run.rs_blocks =
				563	atomic_read(&commit_transaction->t_outstanding_credits);
				564	stats.run.rs_blocks_logged = 0;
				565
				566	J_ASSERT(commit_transaction->t_nr_buffers <=
				567	atomic_read(&commit_transaction->t_outstanding_credits));
				568
				569	err = 0;
				570	bufs = 0;
				571	descriptor = NULL;
				572	while (commit_transaction->t_buffers) {
				573
				574	/* Find the next buffer to be journaled... */
				575
				576	jh = commit_transaction->t_buffers;
				577
				578	/* If we're in abort mode, we just un-journal the buffer and
				579	release it. */
				580
				581	if (is_journal_aborted(journal)) {
				582	clear_buffer_jbddirty(jh2bh(jh));
				583	JBUFFER_TRACE(jh, "journal is aborting: refile");
				584	jbd2_buffer_abort_trigger(jh,
				585	jh->b_frozen_data ?
				586	jh->b_frozen_triggers :
				587	jh->b_triggers);
				588	jbd2_journal_refile_buffer(journal, jh);
				589	/* If that was the last one, we need to clean up
				590	* any descriptor buffers which may have been
				591	* already allocated, even if we are now
				592	* aborting. */
				593	if (!commit_transaction->t_buffers)
				594	goto start_journal_io;
				595	continue;
				596	}
				597
				598	/* Make sure we have a descriptor block in which to
				599	record the metadata buffer. */
				600
				601	if (!descriptor) {
				602	J_ASSERT (bufs == 0);
				603
				604	jbd_debug(4, "JBD2: get descriptor\n");
				605
				606	descriptor = jbd2_journal_get_descriptor_buffer(
				607	commit_transaction,
				608	JBD2_DESCRIPTOR_BLOCK);
				609	if (!descriptor) {
				610	jbd2_journal_abort(journal, -EIO);
				611	continue;
				612	}
				613
				614	jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
				615	(unsigned long long)descriptor->b_blocknr,
				616	descriptor->b_data);
				617	tagp = &descriptor->b_data[sizeof(journal_header_t)];
				618	space_left = descriptor->b_size -
				619	sizeof(journal_header_t);
				620	first_tag = 1;
				621	set_buffer_jwrite(descriptor);
				622	set_buffer_dirty(descriptor);
				623	wbuf[bufs++] = descriptor;
				624
				625	/* Record it so that we can wait for IO
				626	completion later */
				627	BUFFER_TRACE(descriptor, "ph3: file as descriptor");
				628	jbd2_file_log_bh(&log_bufs, descriptor);
				629	}
				630
				631	/* Where is the buffer to be written? */
				632
				633	err = jbd2_journal_next_log_block(journal, &blocknr);
				634	/* If the block mapping failed, just abandon the buffer
				635	and repeat this loop: we'll fall into the
				636	refile-on-abort condition above. */
				637	if (err) {
				638	jbd2_journal_abort(journal, err);
				639	continue;
				640	}
				641
				642	/*
				643	* start_this_handle() uses t_outstanding_credits to determine
				644	* the free space in the log, but this counter is changed
				645	* by jbd2_journal_next_log_block() also.
				646	*/
				647	atomic_dec(&commit_transaction->t_outstanding_credits);
				648
				649	/* Bump b_count to prevent truncate from stumbling over
				650	the shadowed buffer! @@@ This can go if we ever get
				651	rid of the shadow pairing of buffers. */
				652	atomic_inc(&jh2bh(jh)->b_count);
				653
				654	/*
				655	* Make a temporary IO buffer with which to write it out
				656	* (this will requeue the metadata buffer to BJ_Shadow).
				657	*/
				658	set_bit(BH_JWrite, &jh2bh(jh)->b_state);
				659	JBUFFER_TRACE(jh, "ph3: write metadata");
				660	flags = jbd2_journal_write_metadata_buffer(commit_transaction,
				661	jh, &wbuf[bufs], blocknr);
				662	if (flags < 0) {
				663	jbd2_journal_abort(journal, flags);
				664	continue;
				665	}
				666	jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
				667
				668	/* Record the new block's tag in the current descriptor
				669	buffer */
				670
				671	tag_flag = 0;
				672	if (flags & 1)
				673	tag_flag \|= JBD2_FLAG_ESCAPE;
				674	if (!first_tag)
				675	tag_flag \|= JBD2_FLAG_SAME_UUID;
				676
				677	tag = (journal_block_tag_t *) tagp;
				678	write_tag_block(journal, tag, jh2bh(jh)->b_blocknr);
				679	tag->t_flags = cpu_to_be16(tag_flag);
				680	jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
				681	commit_transaction->t_tid);
				682	tagp += tag_bytes;
				683	space_left -= tag_bytes;
				684	bufs++;
				685
				686	if (first_tag) {
				687	memcpy (tagp, journal->j_uuid, 16);
				688	tagp += 16;
				689	space_left -= 16;
				690	first_tag = 0;
				691	}
				692
				693	/* If there's no more to do, or if the descriptor is full,
				694	let the IO rip! */
				695
				696	if (bufs == journal->j_wbufsize \|\|
				697	commit_transaction->t_buffers == NULL \|\|
				698	space_left < tag_bytes + 16 + csum_size) {
				699
				700	jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);
				701
				702	/* Write an end-of-descriptor marker before
				703	submitting the IOs. "tag" still points to
				704	the last tag we set up. */
				705
				706	tag->t_flags \|= cpu_to_be16(JBD2_FLAG_LAST_TAG);
				707	start_journal_io:
				708	if (descriptor)
				709	jbd2_descriptor_block_csum_set(journal,
				710	descriptor);
				711
				712	for (i = 0; i < bufs; i++) {
				713	struct buffer_head *bh = wbuf[i];
				714	/*
				715	* Compute checksum.
				716	*/
				717	if (jbd2_has_feature_checksum(journal)) {
				718	crc32_sum =
				719	jbd2_checksum_data(crc32_sum, bh);
				720	}
				721
				722	lock_buffer(bh);
				723	clear_buffer_dirty(bh);
				724	set_buffer_uptodate(bh);
				725	bh->b_end_io = journal_end_buffer_io_sync;
				726	submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
				727	}
				728	cond_resched();
				729
				730	/* Force a new descriptor to be generated next
				731	time round the loop. */
				732	descriptor = NULL;
				733	bufs = 0;
				734	}
				735	}
				736
				737	err = journal_finish_inode_data_buffers(journal, commit_transaction);
				738	if (err) {
				739	printk(KERN_WARNING
				740	"JBD2: Detected IO errors while flushing file data "
				741	"on %s\n", journal->j_devname);
				742	if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
				743	jbd2_journal_abort(journal, err);
				744	err = 0;
				745	}
				746
				747	/*
				748	* Get current oldest transaction in the log before we issue flush
				749	* to the filesystem device. After the flush we can be sure that
				750	* blocks of all older transactions are checkpointed to persistent
				751	* storage and we will be safe to update journal start in the
				752	* superblock with the numbers we get here.
				753	*/
				754	update_tail =
				755	jbd2_journal_get_log_tail(journal, &first_tid, &first_block);
				756
				757	write_lock(&journal->j_state_lock);
				758	if (update_tail) {
				759	long freed = first_block - journal->j_tail;
				760
				761	if (first_block < journal->j_tail)
				762	freed += journal->j_last - journal->j_first;
				763	/* Update tail only if we free significant amount of space */
				764	if (freed < journal->j_maxlen / 4)
				765	update_tail = 0;
				766	}
				767	J_ASSERT(commit_transaction->t_state == T_COMMIT);
				768	commit_transaction->t_state = T_COMMIT_DFLUSH;
				769	write_unlock(&journal->j_state_lock);
				770
				771	/*
				772	* If the journal is not located on the file system device,
				773	* then we must flush the file system device before we issue
				774	* the commit record
				775	*/
				776	if (commit_transaction->t_need_data_flush &&
				777	(journal->j_fs_dev != journal->j_dev) &&
				778	(journal->j_flags & JBD2_BARRIER))
				779	blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);
				780
				781	/* Done it all: now write the commit record asynchronously. */
				782	if (jbd2_has_feature_async_commit(journal)) {
				783	err = journal_submit_commit_record(journal, commit_transaction,
				784	&cbh, crc32_sum);
				785	if (err)
				786	jbd2_journal_abort(journal, err);
				787	}
				788
				789	blk_finish_plug(&plug);
				790
				791	/* Lo and behold: we have just managed to send a transaction to
				792	the log. Before we can commit it, wait for the IO so far to
				793	complete. Control buffers being written are on the
				794	transaction's t_log_list queue, and metadata buffers are on
				795	the io_bufs list.
				796
				797	Wait for the buffers in reverse order. That way we are
				798	less likely to be woken up until all IOs have completed, and
				799	so we incur less scheduling load.
				800	*/
				801
				802	jbd_debug(3, "JBD2: commit phase 3\n");
				803
				804	while (!list_empty(&io_bufs)) {
				805	struct buffer_head *bh = list_entry(io_bufs.prev,
				806	struct buffer_head,
				807	b_assoc_buffers);
				808
				809	wait_on_buffer(bh);
				810	cond_resched();
				811
				812	if (unlikely(!buffer_uptodate(bh)))
				813	err = -EIO;
				814	jbd2_unfile_log_bh(bh);
				815	stats.run.rs_blocks_logged++;
				816
				817	/*
				818	* The list contains temporary buffer heads created by
				819	* jbd2_journal_write_metadata_buffer().
				820	*/
				821	BUFFER_TRACE(bh, "dumping temporary bh");
				822	__brelse(bh);
				823	J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
				824	free_buffer_head(bh);
				825
				826	/* We also have to refile the corresponding shadowed buffer */
				827	jh = commit_transaction->t_shadow_list->b_tprev;
				828	bh = jh2bh(jh);
				829	clear_buffer_jwrite(bh);
				830	J_ASSERT_BH(bh, buffer_jbddirty(bh));
				831	J_ASSERT_BH(bh, !buffer_shadow(bh));
				832
				833	/* The metadata is now released for reuse, but we need
				834	to remember it against this transaction so that when
				835	we finally commit, we can do any checkpointing
				836	required. */
				837	JBUFFER_TRACE(jh, "file as BJ_Forget");
				838	jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
				839	JBUFFER_TRACE(jh, "brelse shadowed buffer");
				840	__brelse(bh);
				841	}
				842
				843	J_ASSERT (commit_transaction->t_shadow_list == NULL);
				844
				845	jbd_debug(3, "JBD2: commit phase 4\n");
				846
				847	/* Here we wait for the revoke record and descriptor record buffers */
				848	while (!list_empty(&log_bufs)) {
				849	struct buffer_head *bh;
				850
				851	bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
				852	wait_on_buffer(bh);
				853	cond_resched();
				854
				855	if (unlikely(!buffer_uptodate(bh)))
				856	err = -EIO;
				857
				858	BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
				859	clear_buffer_jwrite(bh);
				860	jbd2_unfile_log_bh(bh);
				861	stats.run.rs_blocks_logged++;
				862	__brelse(bh); /* One for getblk */
				863	/* AKPM: bforget here */
				864	}
				865
				866	if (err)
				867	jbd2_journal_abort(journal, err);
				868
				869	jbd_debug(3, "JBD2: commit phase 5\n");
				870	write_lock(&journal->j_state_lock);
				871	J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
				872	commit_transaction->t_state = T_COMMIT_JFLUSH;
				873	write_unlock(&journal->j_state_lock);
				874
				875	if (!jbd2_has_feature_async_commit(journal)) {
				876	err = journal_submit_commit_record(journal, commit_transaction,
				877	&cbh, crc32_sum);
				878	if (err)
				879	jbd2_journal_abort(journal, err);
				880	}
				881	if (cbh)
				882	err = journal_wait_on_commit_record(journal, cbh);
				883	stats.run.rs_blocks_logged++;
				884	if (jbd2_has_feature_async_commit(journal) &&
				885	journal->j_flags & JBD2_BARRIER) {
				886	blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
				887	}
				888
				889	if (err)
				890	jbd2_journal_abort(journal, err);
				891
				892	/*
				893	* Now disk caches for filesystem device are flushed so we are safe to
				894	* erase checkpointed transactions from the log by updating journal
				895	* superblock.
				896	*/
				897	if (update_tail)
				898	jbd2_update_log_tail(journal, first_tid, first_block);
				899
				900	/* End of a transaction! Finally, we can do checkpoint
				901	processing: any buffers committed as a result of this
				902	transaction can be removed from any checkpoint list it was on
				903	before. */
				904
				905	jbd_debug(3, "JBD2: commit phase 6\n");
				906
				907	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
				908	J_ASSERT(commit_transaction->t_buffers == NULL);
				909	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
				910	J_ASSERT(commit_transaction->t_shadow_list == NULL);
				911
				912	restart_loop:
				913	/*
				914	* As there are other places (journal_unmap_buffer()) adding buffers
				915	* to this list we have to be careful and hold the j_list_lock.
				916	*/
				917	spin_lock(&journal->j_list_lock);
				918	while (commit_transaction->t_forget) {
				919	transaction_t *cp_transaction;
				920	struct buffer_head *bh;
				921	int try_to_free = 0;
				922
				923	jh = commit_transaction->t_forget;
				924	spin_unlock(&journal->j_list_lock);
				925	bh = jh2bh(jh);
				926	/*
				927	* Get a reference so that bh cannot be freed before we are
				928	* done with it.
				929	*/
				930	get_bh(bh);
				931	jbd_lock_bh_state(bh);
				932	J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
				933
				934	/*
				935	* If there is undo-protected committed data against
				936	* this buffer, then we can remove it now. If it is a
				937	* buffer needing such protection, the old frozen_data
				938	* field now points to a committed version of the
				939	* buffer, so rotate that field to the new committed
				940	* data.
				941	*
				942	* Otherwise, we can just throw away the frozen data now.
				943	*
				944	* We also know that the frozen data has already fired
				945	* its triggers if they exist, so we can clear that too.
				946	*/
				947	if (jh->b_committed_data) {
				948	jbd2_free(jh->b_committed_data, bh->b_size);
				949	jh->b_committed_data = NULL;
				950	if (jh->b_frozen_data) {
				951	jh->b_committed_data = jh->b_frozen_data;
				952	jh->b_frozen_data = NULL;
				953	jh->b_frozen_triggers = NULL;
				954	}
				955	} else if (jh->b_frozen_data) {
				956	jbd2_free(jh->b_frozen_data, bh->b_size);
				957	jh->b_frozen_data = NULL;
				958	jh->b_frozen_triggers = NULL;
				959	}
				960
				961	spin_lock(&journal->j_list_lock);
				962	cp_transaction = jh->b_cp_transaction;
				963	if (cp_transaction) {
				964	JBUFFER_TRACE(jh, "remove from old cp transaction");
				965	cp_transaction->t_chp_stats.cs_dropped++;
				966	__jbd2_journal_remove_checkpoint(jh);
				967	}
				968
				969	/* Only re-checkpoint the buffer_head if it is marked
				970	* dirty. If the buffer was added to the BJ_Forget list
				971	* by jbd2_journal_forget, it may no longer be dirty and
				972	* there's no point in keeping a checkpoint record for
				973	* it. */
				974
				975	/*
				976	* A buffer which has been freed while still being journaled
				977	* by a previous transaction, refile the buffer to BJ_Forget of
				978	* the running transaction. If the just committed transaction
				979	* contains "add to orphan" operation, we can completely
				980	* invalidate the buffer now. We are rather thorough in that
				981	* since the buffer may be still accessible when blocksize <
				982	* pagesize and it is attached to the last partial page.
				983	*/
				984	if (buffer_freed(bh) && !jh->b_next_transaction) {
				985	struct address_space *mapping;
				986
				987	clear_buffer_freed(bh);
				988	clear_buffer_jbddirty(bh);
				989
				990	/*
				991	* Block device buffers need to stay mapped all the
				992	* time, so it is enough to clear buffer_jbddirty and
				993	* buffer_freed bits. For the file mapping buffers (i.e.
				994	* journalled data) we need to unmap buffer and clear
				995	* more bits. We also need to be careful about the check
				996	* because the data page mapping can get cleared under
				997	* our hands. Note that if mapping == NULL, we don't
				998	* need to make buffer unmapped because the page is
				999	* already detached from the mapping and buffers cannot
				1000	* get reused.
				1001	*/
				1002	mapping = READ_ONCE(bh->b_page->mapping);
				1003	if (mapping && !sb_is_blkdev_sb(mapping->host->i_sb)) {
				1004	clear_buffer_mapped(bh);
				1005	clear_buffer_new(bh);
				1006	clear_buffer_req(bh);
				1007	bh->b_bdev = NULL;
				1008	}
				1009	}
				1010
				1011	if (buffer_jbddirty(bh)) {
				1012	JBUFFER_TRACE(jh, "add to new checkpointing trans");
				1013	__jbd2_journal_insert_checkpoint(jh, commit_transaction);
				1014	if (is_journal_aborted(journal))
				1015	clear_buffer_jbddirty(bh);
				1016	} else {
				1017	J_ASSERT_BH(bh, !buffer_dirty(bh));
				1018	/*
				1019	* The buffer on BJ_Forget list and not jbddirty means
				1020	* it has been freed by this transaction and hence it
				1021	* could not have been reallocated until this
				1022	* transaction has committed. BUT it could be
				1023	* reallocated once we have written all the data to
				1024	* disk and before we process the buffer on BJ_Forget
				1025	* list.
				1026	*/
				1027	if (!jh->b_next_transaction)
				1028	try_to_free = 1;
				1029	}
				1030	JBUFFER_TRACE(jh, "refile or unfile buffer");
				1031	__jbd2_journal_refile_buffer(jh);
				1032	jbd_unlock_bh_state(bh);
				1033	if (try_to_free)
				1034	release_buffer_page(bh); /* Drops bh reference */
				1035	else
				1036	__brelse(bh);
				1037	cond_resched_lock(&journal->j_list_lock);
				1038	}
				1039	spin_unlock(&journal->j_list_lock);
				1040	/*
				1041	* This is a bit sleazy. We use j_list_lock to protect transition
				1042	* of a transaction into T_FINISHED state and calling
				1043	* __jbd2_journal_drop_transaction(). Otherwise we could race with
				1044	* other checkpointing code processing the transaction...
				1045	*/
				1046	write_lock(&journal->j_state_lock);
				1047	spin_lock(&journal->j_list_lock);
				1048	/*
				1049	* Now recheck if some buffers did not get attached to the transaction
				1050	* while the lock was dropped...
				1051	*/
				1052	if (commit_transaction->t_forget) {
				1053	spin_unlock(&journal->j_list_lock);
				1054	write_unlock(&journal->j_state_lock);
				1055	goto restart_loop;
				1056	}
				1057
				1058	/* Add the transaction to the checkpoint list.
				1059	* __jbd2_journal_remove_checkpoint() cannot destroy the transaction
				1060	* under us because it is not marked as T_FINISHED yet. */
				1061	if (journal->j_checkpoint_transactions == NULL) {
				1062	journal->j_checkpoint_transactions = commit_transaction;
				1063	commit_transaction->t_cpnext = commit_transaction;
				1064	commit_transaction->t_cpprev = commit_transaction;
				1065	} else {
				1066	commit_transaction->t_cpnext =
				1067	journal->j_checkpoint_transactions;
				1068	commit_transaction->t_cpprev =
				1069	commit_transaction->t_cpnext->t_cpprev;
				1070	commit_transaction->t_cpnext->t_cpprev =
				1071	commit_transaction;
				1072	commit_transaction->t_cpprev->t_cpnext =
				1073	commit_transaction;
				1074	}
				1075	spin_unlock(&journal->j_list_lock);
				1076
				1077	/* Done with this transaction! */
				1078
				1079	jbd_debug(3, "JBD2: commit phase 7\n");
				1080
				1081	J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
				1082
				1083	commit_transaction->t_start = jiffies;
				1084	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
				1085	commit_transaction->t_start);
				1086
				1087	/*
				1088	* File the transaction statistics
				1089	*/
				1090	stats.ts_tid = commit_transaction->t_tid;
				1091	stats.run.rs_handle_count =
				1092	atomic_read(&commit_transaction->t_handle_count);
				1093	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
				1094	commit_transaction->t_tid, &stats.run);
				1095	stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;
				1096
				1097	commit_transaction->t_state = T_COMMIT_CALLBACK;
				1098	J_ASSERT(commit_transaction == journal->j_committing_transaction);
				1099	journal->j_commit_sequence = commit_transaction->t_tid;
				1100	journal->j_committing_transaction = NULL;
				1101	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
				1102
				1103	/*
				1104	* weight the commit time higher than the average time so we don't
				1105	* react too strongly to vast changes in the commit time
				1106	*/
				1107	if (likely(journal->j_average_commit_time))
				1108	journal->j_average_commit_time = (commit_time +
				1109	journal->j_average_commit_time*3) / 4;
				1110	else
				1111	journal->j_average_commit_time = commit_time;
				1112
				1113	write_unlock(&journal->j_state_lock);
				1114
				1115	if (journal->j_commit_callback)
				1116	journal->j_commit_callback(journal, commit_transaction);
				1117
				1118	trace_jbd2_end_commit(journal, commit_transaction);
				1119	jbd_debug(1, "JBD2: commit %d complete, head %d\n",
				1120	journal->j_commit_sequence, journal->j_tail_sequence);
				1121
				1122	write_lock(&journal->j_state_lock);
				1123	spin_lock(&journal->j_list_lock);
				1124	commit_transaction->t_state = T_FINISHED;
				1125	/* Check if the transaction can be dropped now that we are finished */
				1126	if (commit_transaction->t_checkpoint_list == NULL &&
				1127	commit_transaction->t_checkpoint_io_list == NULL) {
				1128	__jbd2_journal_drop_transaction(journal, commit_transaction);
				1129	jbd2_journal_free_transaction(commit_transaction);
				1130	}
				1131	spin_unlock(&journal->j_list_lock);
				1132	write_unlock(&journal->j_state_lock);
				1133	wake_up(&journal->j_wait_done_commit);
				1134
				1135	/*
				1136	* Calculate overall stats
				1137	*/
				1138	spin_lock(&journal->j_history_lock);
				1139	journal->j_stats.ts_tid++;
				1140	journal->j_stats.ts_requested += stats.ts_requested;
				1141	journal->j_stats.run.rs_wait += stats.run.rs_wait;
				1142	journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
				1143	journal->j_stats.run.rs_running += stats.run.rs_running;
				1144	journal->j_stats.run.rs_locked += stats.run.rs_locked;
				1145	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
				1146	journal->j_stats.run.rs_logging += stats.run.rs_logging;
				1147	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
				1148	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
				1149	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
				1150	spin_unlock(&journal->j_history_lock);
				1151	}