/*
 * Partial Parity Log for closing the RAID5 write hole
 * Copyright (c) 2017, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/crc32c.h>
#include <linux/flex_array.h>
#include <linux/async_tx.h>
#include <linux/raid/md_p.h>
#include "md.h"
#include "raid5.h"

/*
 * PPL consists of a 4KB header (struct ppl_header) and at least 128KB for
 * partial parity data. The header contains an array of entries
 * (struct ppl_header_entry) which describe the logged write requests.
 * Partial parity for the entries comes after the header, written in the same
 * sequence as the entries:
 *
 * Header
 *   entry0
 *   ...
 *   entryN
 * PP data
 *   PP for entry0
 *   ...
 *   PP for entryN
 *
 * An entry describes one or more consecutive stripe_heads, up to a full
 * stripe. The modified raid data chunks form an m-by-n matrix, where m is the
 * number of stripe_heads in the entry and n is the number of modified data
 * disks. Every stripe_head in the entry must write to the same data disks.
 * An example of a valid case described by a single entry (writes to the first
 * stripe of a 4 disk array, 16k chunk size):
 *
 * sh->sector   dd0   dd1   dd2    ppl
 *            +-----+-----+-----+
 * 0          | --- | --- | --- | +----+
 * 8          | -W- | -W- | --- | | pp |   data_sector = 8
 * 16         | -W- | -W- | --- | | pp |   data_size = 3 * 2 * 4k
 * 24         | -W- | -W- | --- | | pp |   pp_size = 3 * 4k
 *            +-----+-----+-----+ +----+
 *
 * data_sector is the first raid sector of the modified data, data_size is the
 * total size of modified data and pp_size is the size of partial parity for
 * this entry. Entries for full stripe writes contain no partial parity
 * (pp_size = 0); they only mark the stripes for which parity should be
 * recalculated after an unclean shutdown. Every entry holds a checksum of its
 * partial parity, and the header also has a checksum of the header itself.
 *
 * A write request is always logged to the PPL instance stored on the parity
 * disk of the corresponding stripe. For each member disk there is one ppl_log
 * used to handle logging for this disk, independently from others. They are
 * grouped in the child_logs array in struct ppl_conf, which is assigned to
 * r5conf->log_private.
 *
 * ppl_io_unit represents a full PPL write; its header_page contains the
 * ppl_header. PPL entries for logged stripes are added in ppl_log_stripe().
 * A stripe_head can be appended to the last entry if it meets the conditions
 * for a valid entry described above; otherwise a new entry is added. Checksums
 * of entries are calculated incrementally as stripes containing partial parity
 * are being added. ppl_submit_iounit() calculates the checksum of the header
 * and submits a bio containing the header page and partial parity pages
 * (sh->ppl_page) for all stripes of the io_unit. When the PPL write completes,
 * the stripes associated with the io_unit are released and raid5d starts
 * writing their data and parity. When all stripes are written, the io_unit is
 * freed and the next one can be submitted.
 *
 * An io_unit is used to gather stripes until it is submitted or becomes full
 * (if the maximum number of entries or size of PPL is reached). Another io_unit
 * can't be submitted until the previous one has completed (PPL and stripe
 * data+parity is written). The log->io_list tracks all io_units of a log
 * (for a single member disk). New io_units are added to the end of the list
 * and the first io_unit is submitted, if it is not submitted already.
 * The current io_unit accepting new stripes is always at the end of the list.
 *
 * If write-back cache is enabled for any of the disks in the array, its data
 * must be flushed before the next io_unit is submitted.
 */

#define PPL_SPACE_SIZE (128 * 1024)

struct ppl_conf {
	struct mddev *mddev;

	/* array of child logs, one for each raid disk */
	struct ppl_log *child_logs;
	int count;

	int block_size;		/* the logical block size used for data_sector
				 * in ppl_header_entry */
	u32 signature;		/* raid array identifier */
	atomic64_t seq;		/* current log write sequence number */

	struct kmem_cache *io_kc;
	mempool_t io_pool;
	struct bio_set bs;
	struct bio_set flush_bs;

	/* used only for recovery */
	int recovered_entries;
	int mismatch_count;

	/* stripes to retry if failed to allocate io_unit */
	struct list_head no_mem_stripes;
	spinlock_t no_mem_stripes_lock;
};

struct ppl_log {
	struct ppl_conf *ppl_conf;	/* shared between all log instances */

	struct md_rdev *rdev;		/* array member disk associated with
					 * this log instance */
	struct mutex io_mutex;
	struct ppl_io_unit *current_io;	/* current io_unit accepting new data
					 * always at the end of io_list */
	spinlock_t io_list_lock;
	struct list_head io_list;	/* all io_units of this log */

	sector_t next_io_sector;
	unsigned int entry_space;
	bool use_multippl;
	bool wb_cache_on;
	unsigned long disk_flush_bitmap;
};

#define PPL_IO_INLINE_BVECS 32

struct ppl_io_unit {
	struct ppl_log *log;

	struct page *header_page;	/* for ppl_header */

	unsigned int entries_count;	/* number of entries in ppl_header */
	unsigned int pp_size;		/* current total size of partial parity */

	u64 seq;			/* sequence number of this log write */
	struct list_head log_sibling;	/* log->io_list */

	struct list_head stripe_list;	/* stripes added to the io_unit */
	atomic_t pending_stripes;	/* how many stripes not written to raid */
	atomic_t pending_flushes;	/* how many disk flushes are in progress */

	bool submitted;			/* true if write to log started */

	/* inline bio and its biovec for submitting the iounit */
	struct bio bio;
	struct bio_vec biovec[PPL_IO_INLINE_BVECS];
};

struct dma_async_tx_descriptor *
ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
		       struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	struct page **srcs = flex_array_get(percpu->scribble, 0);
	int count = 0, pd_idx = sh->pd_idx, i;
	struct async_submit_ctl submit;

	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);

	/*
	 * Partial parity is the XOR of stripe data chunks that are not changed
	 * during the write request. Depending on available data
	 * (read-modify-write vs. reconstruct-write case) we calculate it
	 * differently.
	 */
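	/*
	 * A sketch of why both paths agree (assuming single-parity RAID5):
	 * parity is the XOR of all data chunks, so XOR-ing the old parity
	 * with the old data of the updated disks (the rmw prexor result)
	 * leaves exactly the XOR of the unmodified chunks, the same value
	 * the rcw path computes directly from the not-updated disks.
	 */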
	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
		/*
		 * rmw: xor old data and parity from updated disks
		 * This is calculated earlier by ops_run_prexor5() so just copy
		 * the parity dev page.
		 */
		srcs[count++] = sh->dev[pd_idx].page;
	} else if (sh->reconstruct_state == reconstruct_state_drain_run) {
		/* rcw: xor data from all not updated disks */
		for (i = disks; i--;) {
			struct r5dev *dev = &sh->dev[i];
			if (test_bit(R5_UPTODATE, &dev->flags))
				srcs[count++] = dev->page;
		}
	} else {
		return tx;
	}

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, tx,
			  NULL, sh, flex_array_get(percpu->scribble, 0)
			  + sizeof(struct page *) * (sh->disks + 2));

	if (count == 1)
		tx = async_memcpy(sh->ppl_page, srcs[0], 0, 0, PAGE_SIZE,
				  &submit);
	else
		tx = async_xor(sh->ppl_page, srcs, 0, count, PAGE_SIZE,
			       &submit);

	return tx;
}

static void *ppl_io_pool_alloc(gfp_t gfp_mask, void *pool_data)
{
	struct kmem_cache *kc = pool_data;
	struct ppl_io_unit *io;

	io = kmem_cache_alloc(kc, gfp_mask);
	if (!io)
		return NULL;

	io->header_page = alloc_page(gfp_mask);
	if (!io->header_page) {
		kmem_cache_free(kc, io);
		return NULL;
	}

	return io;
}

static void ppl_io_pool_free(void *element, void *pool_data)
{
	struct kmem_cache *kc = pool_data;
	struct ppl_io_unit *io = element;

	__free_page(io->header_page);
	kmem_cache_free(kc, io);
}

static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log,
					  struct stripe_head *sh)
{
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct ppl_io_unit *io;
	struct ppl_header *pplhdr;
	struct page *header_page;

	io = mempool_alloc(&ppl_conf->io_pool, GFP_NOWAIT);
	if (!io)
		return NULL;

	header_page = io->header_page;
	memset(io, 0, sizeof(*io));
	io->header_page = header_page;

	io->log = log;
	INIT_LIST_HEAD(&io->log_sibling);
	INIT_LIST_HEAD(&io->stripe_list);
	atomic_set(&io->pending_stripes, 0);
	atomic_set(&io->pending_flushes, 0);
	bio_init(&io->bio, io->biovec, PPL_IO_INLINE_BVECS);

	pplhdr = page_address(io->header_page);
	clear_page(pplhdr);
	memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED);
	pplhdr->signature = cpu_to_le32(ppl_conf->signature);

	io->seq = atomic64_add_return(1, &ppl_conf->seq);
	pplhdr->generation = cpu_to_le64(io->seq);

	return io;
}

static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh)
{
	struct ppl_io_unit *io = log->current_io;
	struct ppl_header_entry *e = NULL;
	struct ppl_header *pplhdr;
	int i;
	sector_t data_sector = 0;
	int data_disks = 0;
	struct r5conf *conf = sh->raid_conf;

	pr_debug("%s: stripe: %llu\n", __func__, (unsigned long long)sh->sector);

	/* check if current io_unit is full */
	if (io && (io->pp_size == log->entry_space ||
		   io->entries_count == PPL_HDR_MAX_ENTRIES)) {
		pr_debug("%s: add io_unit blocked by seq: %llu\n",
			 __func__, io->seq);
		io = NULL;
	}

	/* add a new unit if there is none or the current is full */
	if (!io) {
		io = ppl_new_iounit(log, sh);
		if (!io)
			return -ENOMEM;
		spin_lock_irq(&log->io_list_lock);
		list_add_tail(&io->log_sibling, &log->io_list);
		spin_unlock_irq(&log->io_list_lock);

		log->current_io = io;
	}

	for (i = 0; i < sh->disks; i++) {
		struct r5dev *dev = &sh->dev[i];

		if (i != sh->pd_idx && test_bit(R5_Wantwrite, &dev->flags)) {
			if (!data_disks || dev->sector < data_sector)
				data_sector = dev->sector;
			data_disks++;
		}
	}
	BUG_ON(!data_disks);

	pr_debug("%s: seq: %llu data_sector: %llu data_disks: %d\n", __func__,
		 io->seq, (unsigned long long)data_sector, data_disks);

	pplhdr = page_address(io->header_page);

	if (io->entries_count > 0) {
		struct ppl_header_entry *last =
				&pplhdr->entries[io->entries_count - 1];
		struct stripe_head *sh_last = list_last_entry(
				&io->stripe_list, struct stripe_head, log_list);
		u64 data_sector_last = le64_to_cpu(last->data_sector);
		u32 data_size_last = le32_to_cpu(last->data_size);

		/*
		 * Check if we can append the stripe to the last entry. It must
		 * be just after the last logged stripe and write to the same
		 * disks. Use bit shift and logarithm to avoid 64-bit division.
		 */
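		/*
		 * Worked example with hypothetical numbers: chunk_sectors = 32
		 * and two modified data disks per stripe_head. If the last
		 * entry has data_sector = 8 and data_size = 2 * 4k, a stripe
		 * with data_sector = 16 can be appended: both sectors fall in
		 * the same chunk (8 >> 5 == 16 >> 5) and
		 * (16 - 8) * 2 == (2 * 4096) >> 9.
		 */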
		if ((sh->sector == sh_last->sector + STRIPE_SECTORS) &&
		    (data_sector >> ilog2(conf->chunk_sectors) ==
		     data_sector_last >> ilog2(conf->chunk_sectors)) &&
		    ((data_sector - data_sector_last) * data_disks ==
		     data_size_last >> 9))
			e = last;
	}

	if (!e) {
		e = &pplhdr->entries[io->entries_count++];
		e->data_sector = cpu_to_le64(data_sector);
		e->parity_disk = cpu_to_le32(sh->pd_idx);
		e->checksum = cpu_to_le32(~0);
	}

	le32_add_cpu(&e->data_size, data_disks << PAGE_SHIFT);

	/* don't write any PP if full stripe write */
	if (!test_bit(STRIPE_FULL_WRITE, &sh->state)) {
		le32_add_cpu(&e->pp_size, PAGE_SIZE);
		io->pp_size += PAGE_SIZE;
		e->checksum = cpu_to_le32(crc32c_le(le32_to_cpu(e->checksum),
						    page_address(sh->ppl_page),
						    PAGE_SIZE));
	}

	list_add_tail(&sh->log_list, &io->stripe_list);
	atomic_inc(&io->pending_stripes);
	sh->ppl_io = io;

	return 0;
}

int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh)
{
	struct ppl_conf *ppl_conf = conf->log_private;
	struct ppl_io_unit *io = sh->ppl_io;
	struct ppl_log *log;

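	/*
	 * Bail out with -EAGAIN if this stripe cannot or need not be logged:
	 * it is already part of an io_unit, it is being synced, it has no
	 * ppl_page, or its parity disk is not being written or not in sync.
	 * The caller then handles the stripe without PPL.
	 */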
	if (io || test_bit(STRIPE_SYNCING, &sh->state) || !sh->ppl_page ||
	    !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
	    !test_bit(R5_Insync, &sh->dev[sh->pd_idx].flags)) {
		clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
		return -EAGAIN;
	}

	log = &ppl_conf->child_logs[sh->pd_idx];

	mutex_lock(&log->io_mutex);

	if (!log->rdev || test_bit(Faulty, &log->rdev->flags)) {
		mutex_unlock(&log->io_mutex);
		return -EAGAIN;
	}

	set_bit(STRIPE_LOG_TRAPPED, &sh->state);
	clear_bit(STRIPE_DELAYED, &sh->state);
	atomic_inc(&sh->count);

	if (ppl_log_stripe(log, sh)) {
		spin_lock_irq(&ppl_conf->no_mem_stripes_lock);
		list_add_tail(&sh->log_list, &ppl_conf->no_mem_stripes);
		spin_unlock_irq(&ppl_conf->no_mem_stripes_lock);
	}

	mutex_unlock(&log->io_mutex);

	return 0;
}

static void ppl_log_endio(struct bio *bio)
{
	struct ppl_io_unit *io = bio->bi_private;
	struct ppl_log *log = io->log;
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct stripe_head *sh, *next;

	pr_debug("%s: seq: %llu\n", __func__, io->seq);

	if (bio->bi_status)
		md_error(ppl_conf->mddev, log->rdev);

	list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
		list_del_init(&sh->log_list);

		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
}

static void ppl_submit_iounit_bio(struct ppl_io_unit *io, struct bio *bio)
{
	char b[BDEVNAME_SIZE];

	pr_debug("%s: seq: %llu size: %u sector: %llu dev: %s\n",
		 __func__, io->seq, bio->bi_iter.bi_size,
		 (unsigned long long)bio->bi_iter.bi_sector,
		 bio_devname(bio, b));

	submit_bio(bio);
}

static void ppl_submit_iounit(struct ppl_io_unit *io)
{
	struct ppl_log *log = io->log;
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct ppl_header *pplhdr = page_address(io->header_page);
	struct bio *bio = &io->bio;
	struct stripe_head *sh;
	int i;

	bio->bi_private = io;

	if (!log->rdev || test_bit(Faulty, &log->rdev->flags)) {
		ppl_log_endio(bio);
		return;
	}

	for (i = 0; i < io->entries_count; i++) {
		struct ppl_header_entry *e = &pplhdr->entries[i];

		pr_debug("%s: seq: %llu entry: %d data_sector: %llu pp_size: %u data_size: %u\n",
			 __func__, io->seq, i, le64_to_cpu(e->data_sector),
			 le32_to_cpu(e->pp_size), le32_to_cpu(e->data_size));
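		/*
		 * data_sector is stored on disk in units of block_size, and
		 * the entry checksum was accumulated with crc32c seeded by ~0
		 * as stripes were added, so invert it here to produce the
		 * final crc32c value.
		 */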
		e->data_sector = cpu_to_le64(le64_to_cpu(e->data_sector) >>
					     ilog2(ppl_conf->block_size >> 9));
		e->checksum = cpu_to_le32(~le32_to_cpu(e->checksum));
	}

	pplhdr->entries_count = cpu_to_le32(io->entries_count);
	pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PPL_HEADER_SIZE));

	/* Rewind the buffer if the current PPL is larger than the remaining space */
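	/*
	 * For example (hypothetical numbers): if the 4k header plus the
	 * partial parity needs 264 sectors but only 200 sectors remain
	 * before the end of the PPL area, the write wraps around and is
	 * placed back at rdev->ppl.sector.
	 */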
	if (log->use_multippl &&
	    log->rdev->ppl.sector + log->rdev->ppl.size - log->next_io_sector <
	    (PPL_HEADER_SIZE + io->pp_size) >> 9)
		log->next_io_sector = log->rdev->ppl.sector;

	bio->bi_end_io = ppl_log_endio;
	bio->bi_opf = REQ_OP_WRITE | REQ_FUA;
	bio_set_dev(bio, log->rdev->bdev);
	bio->bi_iter.bi_sector = log->next_io_sector;
	bio_add_page(bio, io->header_page, PAGE_SIZE, 0);

	pr_debug("%s: log->current_io_sector: %llu\n", __func__,
		 (unsigned long long)log->next_io_sector);

	if (log->use_multippl)
		log->next_io_sector += (PPL_HEADER_SIZE + io->pp_size) >> 9;

	WARN_ON(log->disk_flush_bitmap != 0);

	list_for_each_entry(sh, &io->stripe_list, log_list) {
		for (i = 0; i < sh->disks; i++) {
			struct r5dev *dev = &sh->dev[i];

			if ((ppl_conf->child_logs[i].wb_cache_on) &&
			    (test_bit(R5_Wantwrite, &dev->flags))) {
				set_bit(i, &log->disk_flush_bitmap);
			}
		}

		/* entries for full stripe writes have no partial parity */
		if (test_bit(STRIPE_FULL_WRITE, &sh->state))
			continue;

		if (!bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0)) {
			struct bio *prev = bio;

			bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES,
					       &ppl_conf->bs);
			bio->bi_opf = prev->bi_opf;
			bio_copy_dev(bio, prev);
			bio->bi_iter.bi_sector = bio_end_sector(prev);
			bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0);

			bio_chain(bio, prev);
			ppl_submit_iounit_bio(io, prev);
		}
	}

	ppl_submit_iounit_bio(io, bio);
}

static void ppl_submit_current_io(struct ppl_log *log)
{
	struct ppl_io_unit *io;

	spin_lock_irq(&log->io_list_lock);

	io = list_first_entry_or_null(&log->io_list, struct ppl_io_unit,
				      log_sibling);
	if (io && io->submitted)
		io = NULL;

	spin_unlock_irq(&log->io_list_lock);

	if (io) {
		io->submitted = true;

		if (io == log->current_io)
			log->current_io = NULL;

		ppl_submit_iounit(io);
	}
}

void ppl_write_stripe_run(struct r5conf *conf)
{
	struct ppl_conf *ppl_conf = conf->log_private;
	struct ppl_log *log;
	int i;

	for (i = 0; i < ppl_conf->count; i++) {
		log = &ppl_conf->child_logs[i];

		mutex_lock(&log->io_mutex);
		ppl_submit_current_io(log);
		mutex_unlock(&log->io_mutex);
	}
}

static void ppl_io_unit_finished(struct ppl_io_unit *io)
{
	struct ppl_log *log = io->log;
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct r5conf *conf = ppl_conf->mddev->private;
	unsigned long flags;

	pr_debug("%s: seq: %llu\n", __func__, io->seq);

	local_irq_save(flags);

	spin_lock(&log->io_list_lock);
	list_del(&io->log_sibling);
	spin_unlock(&log->io_list_lock);

	mempool_free(io, &ppl_conf->io_pool);

	spin_lock(&ppl_conf->no_mem_stripes_lock);
	if (!list_empty(&ppl_conf->no_mem_stripes)) {
		struct stripe_head *sh;

		sh = list_first_entry(&ppl_conf->no_mem_stripes,
				      struct stripe_head, log_list);
		list_del_init(&sh->log_list);
		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
	spin_unlock(&ppl_conf->no_mem_stripes_lock);

	local_irq_restore(flags);

	wake_up(&conf->wait_for_quiescent);
}

static void ppl_flush_endio(struct bio *bio)
{
	struct ppl_io_unit *io = bio->bi_private;
	struct ppl_log *log = io->log;
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct r5conf *conf = ppl_conf->mddev->private;
	char b[BDEVNAME_SIZE];

	pr_debug("%s: dev: %s\n", __func__, bio_devname(bio, b));

	if (bio->bi_status) {
		struct md_rdev *rdev;

		rcu_read_lock();
		rdev = md_find_rdev_rcu(conf->mddev, bio_dev(bio));
		if (rdev)
			md_error(rdev->mddev, rdev);
		rcu_read_unlock();
	}

	bio_put(bio);

	if (atomic_dec_and_test(&io->pending_flushes)) {
		ppl_io_unit_finished(io);
		md_wakeup_thread(conf->mddev->thread);
	}
}

static void ppl_do_flush(struct ppl_io_unit *io)
{
	struct ppl_log *log = io->log;
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct r5conf *conf = ppl_conf->mddev->private;
	int raid_disks = conf->raid_disks;
	int flushed_disks = 0;
	int i;

	atomic_set(&io->pending_flushes, raid_disks);

	for_each_set_bit(i, &log->disk_flush_bitmap, raid_disks) {
		struct md_rdev *rdev;
		struct block_device *bdev = NULL;

		rcu_read_lock();
		rdev = rcu_dereference(conf->disks[i].rdev);
		if (rdev && !test_bit(Faulty, &rdev->flags))
			bdev = rdev->bdev;
		rcu_read_unlock();

		if (bdev) {
			struct bio *bio;
			char b[BDEVNAME_SIZE];

			bio = bio_alloc_bioset(GFP_NOIO, 0, &ppl_conf->flush_bs);
			bio_set_dev(bio, bdev);
			bio->bi_private = io;
			bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
			bio->bi_end_io = ppl_flush_endio;

			pr_debug("%s: dev: %s\n", __func__,
				 bio_devname(bio, b));

			submit_bio(bio);
			flushed_disks++;
		}
	}

	log->disk_flush_bitmap = 0;

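	/*
	 * pending_flushes was primed with raid_disks; drop the references for
	 * the disks that were not flushed, so the io_unit is finished once
	 * the submitted flush bios complete (or immediately if none were
	 * submitted).
	 */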
	for (i = flushed_disks; i < raid_disks; i++) {
		if (atomic_dec_and_test(&io->pending_flushes))
			ppl_io_unit_finished(io);
	}
}

static inline bool ppl_no_io_unit_submitted(struct r5conf *conf,
					    struct ppl_log *log)
{
	struct ppl_io_unit *io;

	io = list_first_entry_or_null(&log->io_list, struct ppl_io_unit,
				      log_sibling);

	return !io || !io->submitted;
}

void ppl_quiesce(struct r5conf *conf, int quiesce)
{
	struct ppl_conf *ppl_conf = conf->log_private;
	int i;

	if (quiesce) {
		for (i = 0; i < ppl_conf->count; i++) {
			struct ppl_log *log = &ppl_conf->child_logs[i];

			spin_lock_irq(&log->io_list_lock);
			wait_event_lock_irq(conf->wait_for_quiescent,
					    ppl_no_io_unit_submitted(conf, log),
					    log->io_list_lock);
			spin_unlock_irq(&log->io_list_lock);
		}
	}
}

int ppl_handle_flush_request(struct r5l_log *log, struct bio *bio)
{
	if (bio->bi_iter.bi_size == 0) {
		bio_endio(bio);
		return 0;
	}
	bio->bi_opf &= ~REQ_PREFLUSH;
	return -EAGAIN;
}

void ppl_stripe_write_finished(struct stripe_head *sh)
{
	struct ppl_io_unit *io;

	io = sh->ppl_io;
	sh->ppl_io = NULL;

	if (io && atomic_dec_and_test(&io->pending_stripes)) {
		if (io->log->disk_flush_bitmap)
			ppl_do_flush(io);
		else
			ppl_io_unit_finished(io);
	}
}

static void ppl_xor(int size, struct page *page1, struct page *page2)
{
	struct async_submit_ctl submit;
	struct dma_async_tx_descriptor *tx;
	struct page *xor_srcs[] = { page1, page2 };

	init_async_submit(&submit, ASYNC_TX_ACK|ASYNC_TX_XOR_DROP_DST,
			  NULL, NULL, NULL, NULL);
	tx = async_xor(page1, xor_srcs, 0, 2, size, &submit);

	async_tx_quiesce(&tx);
}

/*
 * PPL recovery strategy: xor partial parity and data from all modified data
 * disks within a stripe and write the result as the new stripe parity. If all
 * stripe data disks are modified (full stripe write), no partial parity is
 * available, so just xor the data disks.
 *
 * Recovery of a PPL entry shall occur only if all modified data disks are
 * available and read from all of them succeeds.
 *
 * A PPL entry applies to a stripe; the partial parity size for an entry is at
 * most the size of the chunk. Examples of possible cases for a single entry:
 *
 * case 0: single data disk write:
 *   data0    data1    data2     ppl        parity
 * +--------+--------+--------+           +--------------------+
 * | ------ | ------ | ------ | +----+    | (no change)        |
 * | ------ | -data- | ------ | | pp | -> | data1 ^ pp         |
 * | ------ | -data- | ------ | | pp | -> | data1 ^ pp         |
 * | ------ | ------ | ------ | +----+    | (no change)        |
 * +--------+--------+--------+           +--------------------+
 * pp_size = data_size
 *
 * case 1: more than one data disk write:
 *   data0    data1    data2     ppl        parity
 * +--------+--------+--------+           +--------------------+
 * | ------ | ------ | ------ | +----+    | (no change)        |
 * | -data- | -data- | ------ | | pp | -> | data0 ^ data1 ^ pp |
 * | -data- | -data- | ------ | | pp | -> | data0 ^ data1 ^ pp |
 * | ------ | ------ | ------ | +----+    | (no change)        |
 * +--------+--------+--------+           +--------------------+
 * pp_size = data_size / modified_data_disks
 *
 * case 2: write to all data disks (also full stripe write):
 *   data0    data1    data2                parity
 * +--------+--------+--------+           +--------------------+
 * | ------ | ------ | ------ |           | (no change)        |
 * | -data- | -data- | -data- | --------> | xor all data       |
 * | ------ | ------ | ------ | --------> | (no change)        |
 * | ------ | ------ | ------ |           | (no change)        |
 * +--------+--------+--------+           +--------------------+
 * pp_size = 0
 *
 * The following cases are possible only in other implementations. The recovery
 * code can handle them, but they are not generated at runtime because they can
 * be reduced to cases 0, 1 and 2:
 *
 * case 3:
 *   data0    data1    data2     ppl        parity
 * +--------+--------+--------+ +----+    +--------------------+
 * | ------ | -data- | -data- | | pp |    | data1 ^ data2 ^ pp |
 * | ------ | -data- | -data- | | pp | -> | data1 ^ data2 ^ pp |
 * | -data- | -data- | -data- | | -- | -> | xor all data       |
 * | -data- | -data- | ------ | | pp |    | data0 ^ data1 ^ pp |
 * +--------+--------+--------+ +----+    +--------------------+
 * pp_size = chunk_size
 *
 * case 4:
 *   data0    data1    data2     ppl        parity
 * +--------+--------+--------+ +----+    +--------------------+
 * | ------ | -data- | ------ | | pp |    | data1 ^ pp         |
 * | ------ | ------ | ------ | | -- | -> | (no change)        |
 * | ------ | ------ | ------ | | -- | -> | (no change)        |
 * | -data- | ------ | ------ | | pp |    | data0 ^ pp         |
 * +--------+--------+--------+ +----+    +--------------------+
 * pp_size = chunk_size
 */
static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e,
			     sector_t ppl_sector)
{
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct mddev *mddev = ppl_conf->mddev;
	struct r5conf *conf = mddev->private;
	int block_size = ppl_conf->block_size;
	struct page *page1;
	struct page *page2;
	sector_t r_sector_first;
	sector_t r_sector_last;
	int strip_sectors;
	int data_disks;
	int i;
	int ret = 0;
	char b[BDEVNAME_SIZE];
	unsigned int pp_size = le32_to_cpu(e->pp_size);
	unsigned int data_size = le32_to_cpu(e->data_size);

	page1 = alloc_page(GFP_KERNEL);
	page2 = alloc_page(GFP_KERNEL);

	if (!page1 || !page2) {
		ret = -ENOMEM;
		goto out;
	}

	r_sector_first = le64_to_cpu(e->data_sector) * (block_size >> 9);

	if ((pp_size >> 9) < conf->chunk_sectors) {
		if (pp_size > 0) {
			data_disks = data_size / pp_size;
			strip_sectors = pp_size >> 9;
		} else {
			data_disks = conf->raid_disks - conf->max_degraded;
			strip_sectors = (data_size >> 9) / data_disks;
		}
		r_sector_last = r_sector_first +
				(data_disks - 1) * conf->chunk_sectors +
				strip_sectors;
	} else {
		data_disks = conf->raid_disks - conf->max_degraded;
		strip_sectors = conf->chunk_sectors;
		r_sector_last = r_sector_first + (data_size >> 9);
	}
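	/*
	 * Worked example with hypothetical numbers: pp_size = 8k,
	 * data_size = 16k, chunk_sectors = 128. Then pp_size >> 9 = 16 < 128,
	 * so this is a sub-chunk write: data_disks = 16k / 8k = 2,
	 * strip_sectors = 16 and r_sector_last = r_sector_first +
	 * 1 * 128 + 16.
	 */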

	pr_debug("%s: array sector first: %llu last: %llu\n", __func__,
		 (unsigned long long)r_sector_first,
		 (unsigned long long)r_sector_last);

	/* if start and end are 4k aligned, use a 4k block */
	if (block_size == 512 &&
	    (r_sector_first & (STRIPE_SECTORS - 1)) == 0 &&
	    (r_sector_last & (STRIPE_SECTORS - 1)) == 0)
		block_size = STRIPE_SIZE;

	/* iterate through blocks in strip */
	for (i = 0; i < strip_sectors; i += (block_size >> 9)) {
		bool update_parity = false;
		sector_t parity_sector;
		struct md_rdev *parity_rdev;
		struct stripe_head sh;
		int disk;
		int indent = 0;

		pr_debug("%s:%*s iter %d start\n", __func__, indent, "", i);
		indent += 2;

		memset(page_address(page1), 0, PAGE_SIZE);

		/* iterate through data member disks */
		for (disk = 0; disk < data_disks; disk++) {
			int dd_idx;
			struct md_rdev *rdev;
			sector_t sector;
			sector_t r_sector = r_sector_first + i +
					    (disk * conf->chunk_sectors);

			pr_debug("%s:%*s data member disk %d start\n",
				 __func__, indent, "", disk);
			indent += 2;

			if (r_sector >= r_sector_last) {
				pr_debug("%s:%*s array sector %llu doesn't need parity update\n",
					 __func__, indent, "",
					 (unsigned long long)r_sector);
				indent -= 2;
				continue;
			}

			update_parity = true;

			/* map raid sector to member disk */
			sector = raid5_compute_sector(conf, r_sector, 0,
						      &dd_idx, NULL);
			pr_debug("%s:%*s processing array sector %llu => data member disk %d, sector %llu\n",
				 __func__, indent, "",
				 (unsigned long long)r_sector, dd_idx,
				 (unsigned long long)sector);

			rdev = conf->disks[dd_idx].rdev;
			if (!rdev || (!test_bit(In_sync, &rdev->flags) &&
				      sector >= rdev->recovery_offset)) {
				pr_debug("%s:%*s data member disk %d missing\n",
					 __func__, indent, "", dd_idx);
				update_parity = false;
				break;
			}

			pr_debug("%s:%*s reading data member disk %s sector %llu\n",
				 __func__, indent, "", bdevname(rdev->bdev, b),
				 (unsigned long long)sector);
			if (!sync_page_io(rdev, sector, block_size, page2,
					  REQ_OP_READ, 0, false)) {
				md_error(mddev, rdev);
				pr_debug("%s:%*s read failed!\n", __func__,
					 indent, "");
				ret = -EIO;
				goto out;
			}

			ppl_xor(block_size, page1, page2);

			indent -= 2;
		}

		if (!update_parity)
			continue;

		if (pp_size > 0) {
			pr_debug("%s:%*s reading pp disk sector %llu\n",
				 __func__, indent, "",
				 (unsigned long long)(ppl_sector + i));
			if (!sync_page_io(log->rdev,
					  ppl_sector - log->rdev->data_offset + i,
					  block_size, page2, REQ_OP_READ, 0,
					  false)) {
				pr_debug("%s:%*s read failed!\n", __func__,
					 indent, "");
				md_error(mddev, log->rdev);
				ret = -EIO;
				goto out;
			}

			ppl_xor(block_size, page1, page2);
		}

		/* map raid sector to parity disk */
		parity_sector = raid5_compute_sector(conf, r_sector_first + i,
						     0, &disk, &sh);
		BUG_ON(sh.pd_idx != le32_to_cpu(e->parity_disk));
		parity_rdev = conf->disks[sh.pd_idx].rdev;

		BUG_ON(parity_rdev->bdev->bd_dev != log->rdev->bdev->bd_dev);
		pr_debug("%s:%*s write parity at sector %llu, disk %s\n",
			 __func__, indent, "",
			 (unsigned long long)parity_sector,
			 bdevname(parity_rdev->bdev, b));
		if (!sync_page_io(parity_rdev, parity_sector, block_size,
				  page1, REQ_OP_WRITE, 0, false)) {
			pr_debug("%s:%*s parity write error!\n", __func__,
				 indent, "");
			md_error(mddev, parity_rdev);
			ret = -EIO;
			goto out;
		}
	}
out:
	if (page1)
		__free_page(page1);
	if (page2)
		__free_page(page2);
	return ret;
}

static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr,
		       sector_t offset)
{
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct md_rdev *rdev = log->rdev;
	struct mddev *mddev = rdev->mddev;
	sector_t ppl_sector = rdev->ppl.sector + offset +
			      (PPL_HEADER_SIZE >> 9);
	struct page *page;
	int i;
	int ret = 0;

	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	/* iterate through all PPL entries saved */
	for (i = 0; i < le32_to_cpu(pplhdr->entries_count); i++) {
		struct ppl_header_entry *e = &pplhdr->entries[i];
		u32 pp_size = le32_to_cpu(e->pp_size);
		sector_t sector = ppl_sector;
		int ppl_entry_sectors = pp_size >> 9;
		u32 crc, crc_stored;

		pr_debug("%s: disk: %d entry: %d ppl_sector: %llu pp_size: %u\n",
			 __func__, rdev->raid_disk, i,
			 (unsigned long long)ppl_sector, pp_size);

		crc = ~0;
		crc_stored = le32_to_cpu(e->checksum);

		/* read partial parity for this entry and calculate its checksum */
		while (pp_size) {
			int s = pp_size > PAGE_SIZE ? PAGE_SIZE : pp_size;

			if (!sync_page_io(rdev, sector - rdev->data_offset,
					  s, page, REQ_OP_READ, 0, false)) {
				md_error(mddev, rdev);
				ret = -EIO;
				goto out;
			}

			crc = crc32c_le(crc, page_address(page), s);

			pp_size -= s;
			sector += s >> 9;
		}

		crc = ~crc;

		if (crc != crc_stored) {
			/*
			 * Don't recover this entry if the checksum does not
			 * match, but keep going and try to recover other
			 * entries.
			 */
			pr_debug("%s: ppl entry crc does not match: stored: 0x%x calculated: 0x%x\n",
				 __func__, crc_stored, crc);
			ppl_conf->mismatch_count++;
		} else {
			ret = ppl_recover_entry(log, e, ppl_sector);
			if (ret)
				goto out;
			ppl_conf->recovered_entries++;
		}

		ppl_sector += ppl_entry_sectors;
	}

	/* flush the disk cache after recovery if necessary */
	ret = blkdev_issue_flush(rdev->bdev, GFP_KERNEL, NULL);
out:
	__free_page(page);
	return ret;
}

static int ppl_write_empty_header(struct ppl_log *log)
{
	struct page *page;
	struct ppl_header *pplhdr;
	struct md_rdev *rdev = log->rdev;
	int ret = 0;

	pr_debug("%s: disk: %d ppl_sector: %llu\n", __func__,
		 rdev->raid_disk, (unsigned long long)rdev->ppl.sector);

	page = alloc_page(GFP_NOIO | __GFP_ZERO);
	if (!page)
		return -ENOMEM;

	pplhdr = page_address(page);
	/* zero out PPL space to avoid collision with old PPLs */
	blkdev_issue_zeroout(rdev->bdev, rdev->ppl.sector,
			     log->rdev->ppl.size, GFP_NOIO, 0);
	memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED);
	pplhdr->signature = cpu_to_le32(log->ppl_conf->signature);
	pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PAGE_SIZE));

	if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset,
			  PPL_HEADER_SIZE, page, REQ_OP_WRITE | REQ_SYNC |
			  REQ_FUA, 0, false)) {
		md_error(rdev->mddev, rdev);
		ret = -EIO;
	}

	__free_page(page);
	return ret;
}

static int ppl_load_distributed(struct ppl_log *log)
{
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct md_rdev *rdev = log->rdev;
	struct mddev *mddev = rdev->mddev;
	struct page *page, *page2, *tmp;
	struct ppl_header *pplhdr = NULL, *prev_pplhdr = NULL;
	u32 crc, crc_stored;
	u32 signature;
	int ret = 0, i;
	sector_t pplhdr_offset = 0, prev_pplhdr_offset = 0;

	pr_debug("%s: disk: %d\n", __func__, rdev->raid_disk);
	/* read PPL headers, find the most recent one */
	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	page2 = alloc_page(GFP_KERNEL);
	if (!page2) {
		__free_page(page);
		return -ENOMEM;
	}

	/* search the PPL area for the latest PPL */
	while (pplhdr_offset < rdev->ppl.size - (PPL_HEADER_SIZE >> 9)) {
		if (!sync_page_io(rdev,
				  rdev->ppl.sector - rdev->data_offset +
				  pplhdr_offset, PAGE_SIZE, page, REQ_OP_READ,
				  0, false)) {
			md_error(mddev, rdev);
			ret = -EIO;
			/* if not able to read - don't recover any PPL */
			pplhdr = NULL;
			break;
		}
		pplhdr = page_address(page);

		/* check header validity */
		crc_stored = le32_to_cpu(pplhdr->checksum);
		pplhdr->checksum = 0;
		crc = ~crc32c_le(~0, pplhdr, PAGE_SIZE);

		if (crc_stored != crc) {
			pr_debug("%s: ppl header crc does not match: stored: 0x%x calculated: 0x%x (offset: %llu)\n",
				 __func__, crc_stored, crc,
				 (unsigned long long)pplhdr_offset);
			pplhdr = prev_pplhdr;
			pplhdr_offset = prev_pplhdr_offset;
			break;
		}

		signature = le32_to_cpu(pplhdr->signature);

		if (mddev->external) {
			/*
			 * For external metadata the header signature is set and
			 * validated in userspace.
			 */
			ppl_conf->signature = signature;
		} else if (ppl_conf->signature != signature) {
			pr_debug("%s: ppl header signature does not match: stored: 0x%x configured: 0x%x (offset: %llu)\n",
				 __func__, signature, ppl_conf->signature,
				 (unsigned long long)pplhdr_offset);
			pplhdr = prev_pplhdr;
			pplhdr_offset = prev_pplhdr_offset;
			break;
		}

		if (prev_pplhdr && le64_to_cpu(prev_pplhdr->generation) >
		    le64_to_cpu(pplhdr->generation)) {
			/* previous was newest */
			pplhdr = prev_pplhdr;
			pplhdr_offset = prev_pplhdr_offset;
			break;
		}

		prev_pplhdr_offset = pplhdr_offset;
		prev_pplhdr = pplhdr;

		tmp = page;
		page = page2;
		page2 = tmp;

		/* calculate the next potential ppl offset */
		for (i = 0; i < le32_to_cpu(pplhdr->entries_count); i++)
			pplhdr_offset +=
			    le32_to_cpu(pplhdr->entries[i].pp_size) >> 9;
		pplhdr_offset += PPL_HEADER_SIZE >> 9;
	}

	/* no valid ppl found */
	if (!pplhdr)
		ppl_conf->mismatch_count++;
	else
		pr_debug("%s: latest PPL found at offset: %llu, with generation: %llu\n",
			 __func__, (unsigned long long)pplhdr_offset,
			 le64_to_cpu(pplhdr->generation));

	/* attempt to recover from log if we are starting a dirty array */
	if (pplhdr && !mddev->pers && mddev->recovery_cp != MaxSector)
		ret = ppl_recover(log, pplhdr, pplhdr_offset);

	/* write empty header if we are starting the array */
	if (!ret && !mddev->pers)
		ret = ppl_write_empty_header(log);

	__free_page(page);
	__free_page(page2);

	pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n",
		 __func__, ret, ppl_conf->mismatch_count,
		 ppl_conf->recovered_entries);
	return ret;
}

static int ppl_load(struct ppl_conf *ppl_conf)
{
	int ret = 0;
	u32 signature = 0;
	bool signature_set = false;
	int i;

	for (i = 0; i < ppl_conf->count; i++) {
		struct ppl_log *log = &ppl_conf->child_logs[i];

		/* skip missing drive */
		if (!log->rdev)
			continue;

		ret = ppl_load_distributed(log);
		if (ret)
			break;

		/*
		 * For external metadata we can't check if the signature is
		 * correct on a single drive, but we can check if it is the same
		 * on all drives.
		 */
		if (ppl_conf->mddev->external) {
			if (!signature_set) {
				signature = ppl_conf->signature;
				signature_set = true;
			} else if (signature != ppl_conf->signature) {
				pr_warn("md/raid:%s: PPL header signature does not match on all member drives\n",
					mdname(ppl_conf->mddev));
				ret = -EINVAL;
				break;
			}
		}
	}

	pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n",
		 __func__, ret, ppl_conf->mismatch_count,
		 ppl_conf->recovered_entries);
	return ret;
}

static void __ppl_exit_log(struct ppl_conf *ppl_conf)
{
	clear_bit(MD_HAS_PPL, &ppl_conf->mddev->flags);
	clear_bit(MD_HAS_MULTIPLE_PPLS, &ppl_conf->mddev->flags);

	kfree(ppl_conf->child_logs);

	bioset_exit(&ppl_conf->bs);
	bioset_exit(&ppl_conf->flush_bs);
	mempool_exit(&ppl_conf->io_pool);
	kmem_cache_destroy(ppl_conf->io_kc);

	kfree(ppl_conf);
}

void ppl_exit_log(struct r5conf *conf)
{
	struct ppl_conf *ppl_conf = conf->log_private;

	if (ppl_conf) {
		__ppl_exit_log(ppl_conf);
		conf->log_private = NULL;
	}
}

static int ppl_validate_rdev(struct md_rdev *rdev)
{
	char b[BDEVNAME_SIZE];
	int ppl_data_sectors;
	int ppl_size_new;

	/*
	 * The configured PPL size must be enough to store
	 * the header and (at the very least) partial parity
	 * for one stripe. Round it down to ensure the data
	 * space is cleanly divisible by stripe size.
	 */
	ppl_data_sectors = rdev->ppl.size - (PPL_HEADER_SIZE >> 9);

	if (ppl_data_sectors > 0)
		ppl_data_sectors = rounddown(ppl_data_sectors, STRIPE_SECTORS);

	if (ppl_data_sectors <= 0) {
		pr_warn("md/raid:%s: PPL space too small on %s\n",
			mdname(rdev->mddev), bdevname(rdev->bdev, b));
		return -ENOSPC;
	}

	ppl_size_new = ppl_data_sectors + (PPL_HEADER_SIZE >> 9);

	if ((rdev->ppl.sector < rdev->data_offset &&
	     rdev->ppl.sector + ppl_size_new > rdev->data_offset) ||
	    (rdev->ppl.sector >= rdev->data_offset &&
	     rdev->data_offset + rdev->sectors > rdev->ppl.sector)) {
		pr_warn("md/raid:%s: PPL space overlaps with data on %s\n",
			mdname(rdev->mddev), bdevname(rdev->bdev, b));
		return -EINVAL;
	}

	if (!rdev->mddev->external &&
	    ((rdev->ppl.offset > 0 && rdev->ppl.offset < (rdev->sb_size >> 9)) ||
	     (rdev->ppl.offset <= 0 && rdev->ppl.offset + ppl_size_new > 0))) {
		pr_warn("md/raid:%s: PPL space overlaps with superblock on %s\n",
			mdname(rdev->mddev), bdevname(rdev->bdev, b));
		return -EINVAL;
	}

	rdev->ppl.size = ppl_size_new;

	return 0;
}

static void ppl_init_child_log(struct ppl_log *log, struct md_rdev *rdev)
{
	struct request_queue *q;

	if ((rdev->ppl.size << 9) >= (PPL_SPACE_SIZE +
				      PPL_HEADER_SIZE) * 2) {
		log->use_multippl = true;
		set_bit(MD_HAS_MULTIPLE_PPLS,
			&log->ppl_conf->mddev->flags);
		log->entry_space = PPL_SPACE_SIZE;
	} else {
		log->use_multippl = false;
		log->entry_space = (log->rdev->ppl.size << 9) -
				   PPL_HEADER_SIZE;
	}
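	/*
	 * Example with hypothetical sizes: a 264-sector (132k) PPL area fits
	 * only one 128k slot plus the 4k header, so a single rewritten PPL is
	 * used; a 528-sector (264k) area fits two such slots and enables
	 * multippl.
	 */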
	log->next_io_sector = rdev->ppl.sector;

	q = bdev_get_queue(rdev->bdev);
	if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
		log->wb_cache_on = true;
}

int ppl_init_log(struct r5conf *conf)
{
	struct ppl_conf *ppl_conf;
	struct mddev *mddev = conf->mddev;
	int ret = 0;
	int max_disks;
	int i;

	pr_debug("md/raid:%s: enabling distributed Partial Parity Log\n",
		 mdname(conf->mddev));

	if (PAGE_SIZE != 4096)
		return -EINVAL;

	if (mddev->level != 5) {
		pr_warn("md/raid:%s PPL is not compatible with raid level %d\n",
			mdname(mddev), mddev->level);
		return -EINVAL;
	}

	if (mddev->bitmap_info.file || mddev->bitmap_info.offset) {
		pr_warn("md/raid:%s PPL is not compatible with bitmap\n",
			mdname(mddev));
		return -EINVAL;
	}

	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
		pr_warn("md/raid:%s PPL is not compatible with journal\n",
			mdname(mddev));
		return -EINVAL;
	}

	max_disks = FIELD_SIZEOF(struct ppl_log, disk_flush_bitmap) *
		BITS_PER_BYTE;
	if (conf->raid_disks > max_disks) {
		pr_warn("md/raid:%s PPL doesn't support over %d disks in the array\n",
			mdname(mddev), max_disks);
		return -EINVAL;
	}

	ppl_conf = kzalloc(sizeof(struct ppl_conf), GFP_KERNEL);
	if (!ppl_conf)
		return -ENOMEM;

	ppl_conf->mddev = mddev;

	ppl_conf->io_kc = KMEM_CACHE(ppl_io_unit, 0);
	if (!ppl_conf->io_kc) {
		ret = -ENOMEM;
		goto err;
	}

	ret = mempool_init(&ppl_conf->io_pool, conf->raid_disks, ppl_io_pool_alloc,
			   ppl_io_pool_free, ppl_conf->io_kc);
	if (ret)
		goto err;

	ret = bioset_init(&ppl_conf->bs, conf->raid_disks, 0, BIOSET_NEED_BVECS);
	if (ret)
		goto err;

	ret = bioset_init(&ppl_conf->flush_bs, conf->raid_disks, 0, 0);
	if (ret)
		goto err;

	ppl_conf->count = conf->raid_disks;
	ppl_conf->child_logs = kcalloc(ppl_conf->count, sizeof(struct ppl_log),
				       GFP_KERNEL);
	if (!ppl_conf->child_logs) {
		ret = -ENOMEM;
		goto err;
	}

	atomic64_set(&ppl_conf->seq, 0);
	INIT_LIST_HEAD(&ppl_conf->no_mem_stripes);
	spin_lock_init(&ppl_conf->no_mem_stripes_lock);

	if (!mddev->external) {
		ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid));
		ppl_conf->block_size = 512;
	} else {
		ppl_conf->block_size = queue_logical_block_size(mddev->queue);
	}

	for (i = 0; i < ppl_conf->count; i++) {
		struct ppl_log *log = &ppl_conf->child_logs[i];
		struct md_rdev *rdev = conf->disks[i].rdev;

		mutex_init(&log->io_mutex);
		spin_lock_init(&log->io_list_lock);
		INIT_LIST_HEAD(&log->io_list);

		log->ppl_conf = ppl_conf;
		log->rdev = rdev;

		if (rdev) {
			ret = ppl_validate_rdev(rdev);
			if (ret)
				goto err;

			ppl_init_child_log(log, rdev);
		}
	}

	/* load and possibly recover the logs from the member disks */
	ret = ppl_load(ppl_conf);

	if (ret) {
		goto err;
	} else if (!mddev->pers && mddev->recovery_cp == 0 &&
		   ppl_conf->recovered_entries > 0 &&
		   ppl_conf->mismatch_count == 0) {
		/*
		 * If we are starting a dirty array and the recovery succeeds
		 * without any issues, set the array as clean.
		 */
		mddev->recovery_cp = MaxSector;
		set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
	} else if (mddev->pers && ppl_conf->mismatch_count > 0) {
		/* no mismatch allowed when enabling PPL for a running array */
		ret = -EINVAL;
		goto err;
	}

	conf->log_private = ppl_conf;
	set_bit(MD_HAS_PPL, &ppl_conf->mddev->flags);

	return 0;
err:
	__ppl_exit_log(ppl_conf);
	return ret;
}

int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add)
{
	struct ppl_conf *ppl_conf = conf->log_private;
	struct ppl_log *log;
	int ret = 0;
	char b[BDEVNAME_SIZE];

	if (!rdev)
		return -EINVAL;

	pr_debug("%s: disk: %d operation: %s dev: %s\n",
		 __func__, rdev->raid_disk, add ? "add" : "remove",
		 bdevname(rdev->bdev, b));

	if (rdev->raid_disk < 0)
		return 0;

	if (rdev->raid_disk >= ppl_conf->count)
		return -ENODEV;

	log = &ppl_conf->child_logs[rdev->raid_disk];

	mutex_lock(&log->io_mutex);
	if (add) {
		ret = ppl_validate_rdev(rdev);
		if (!ret) {
			log->rdev = rdev;
			ret = ppl_write_empty_header(log);
			ppl_init_child_log(log, rdev);
		}
	} else {
		log->rdev = NULL;
	}
	mutex_unlock(&log->io_mutex);

	return ret;
}