Blame - marvell/linux/drivers/md/raid1.c - T108

blob: c40237cfdcb0fe979d8d8d5abd80d65ceb5ff5b0 [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0-or-later
				2	/*
				3	* raid1.c : Multiple Devices driver for Linux
				4	*
				5	* Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
				6	*
				7	* Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
				8	*
				9	* RAID-1 management functions.
				10	*
				11	* Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
				12	*
				13	* Fixes to reconstruction by Jakob Østergaard" <jakob@ostenfeld.dk>
				14	* Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
				15	*
				16	* Changes by Peter T. Breuer <ptb@it.uc3m.es> 31/1/2003 to support
				17	* bitmapped intelligence in resync:
				18	*
				19	* - bitmap marked during normal i/o
				20	* - bitmap used to skip nondirty blocks during sync
				21	*
				22	* Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology:
				23	* - persistent bitmap code
				24	*/
				25
				26	#include <linux/slab.h>
				27	#include <linux/delay.h>
				28	#include <linux/blkdev.h>
				29	#include <linux/module.h>
				30	#include <linux/seq_file.h>
				31	#include <linux/ratelimit.h>
				32
				33	#include <trace/events/block.h>
				34
				35	#include "md.h"
				36	#include "raid1.h"
				37	#include "md-bitmap.h"
				38
				39	#define UNSUPPORTED_MDDEV_FLAGS \
				40	((1L << MD_HAS_JOURNAL) \| \
				41	(1L << MD_JOURNAL_CLEAN) \| \
				42	(1L << MD_HAS_PPL) \| \
				43	(1L << MD_HAS_MULTIPLE_PPLS))
				44
				45	static void allow_barrier(struct r1conf *conf, sector_t sector_nr);
				46	static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
				47
				48	#define raid1_log(md, fmt, args...) \
				49	do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0)
				50
				51	#include "raid1-10.c"
				52
				53	static int check_and_add_wb(struct md_rdev *rdev, sector_t lo, sector_t hi)
				54	{
				55	struct wb_info wi, temp_wi;
				56	unsigned long flags;
				57	int ret = 0;
				58	struct mddev *mddev = rdev->mddev;
				59
				60	wi = mempool_alloc(mddev->wb_info_pool, GFP_NOIO);
				61
				62	spin_lock_irqsave(&rdev->wb_list_lock, flags);
				63	list_for_each_entry(temp_wi, &rdev->wb_list, list) {
				64	/* collision happened */
				65	if (hi > temp_wi->lo && lo < temp_wi->hi) {
				66	ret = -EBUSY;
				67	break;
				68	}
				69	}
				70
				71	if (!ret) {
				72	wi->lo = lo;
				73	wi->hi = hi;
				74	list_add(&wi->list, &rdev->wb_list);
				75	} else
				76	mempool_free(wi, mddev->wb_info_pool);
				77	spin_unlock_irqrestore(&rdev->wb_list_lock, flags);
				78
				79	return ret;
				80	}
				81
				82	static void remove_wb(struct md_rdev *rdev, sector_t lo, sector_t hi)
				83	{
				84	struct wb_info *wi;
				85	unsigned long flags;
				86	int found = 0;
				87	struct mddev *mddev = rdev->mddev;
				88
				89	spin_lock_irqsave(&rdev->wb_list_lock, flags);
				90	list_for_each_entry(wi, &rdev->wb_list, list)
				91	if (hi == wi->hi && lo == wi->lo) {
				92	list_del(&wi->list);
				93	mempool_free(wi, mddev->wb_info_pool);
				94	found = 1;
				95	break;
				96	}
				97
				98	if (!found)
				99	WARN(1, "The write behind IO is not recorded\n");
				100	spin_unlock_irqrestore(&rdev->wb_list_lock, flags);
				101	wake_up(&rdev->wb_io_wait);
				102	}
				103
				104	/*
				105	* for resync bio, r1bio pointer can be retrieved from the per-bio
				106	* 'struct resync_pages'.
				107	*/
				108	static inline struct r1bio get_resync_r1bio(struct bio bio)
				109	{
				110	return get_resync_pages(bio)->raid_bio;
				111	}
				112
				113	static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
				114	{
				115	struct pool_info *pi = data;
				116	int size = offsetof(struct r1bio, bios[pi->raid_disks]);
				117
				118	/* allocate a r1bio with room for raid_disks entries in the bios array */
				119	return kzalloc(size, gfp_flags);
				120	}
				121
				122	#define RESYNC_DEPTH 32
				123	#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
				124	#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
				125	#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
				126	#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
				127	#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
				128
				129	static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
				130	{
				131	struct pool_info *pi = data;
				132	struct r1bio *r1_bio;
				133	struct bio *bio;
				134	int need_pages;
				135	int j;
				136	struct resync_pages *rps;
				137
				138	r1_bio = r1bio_pool_alloc(gfp_flags, pi);
				139	if (!r1_bio)
				140	return NULL;
				141
				142	rps = kmalloc_array(pi->raid_disks, sizeof(struct resync_pages),
				143	gfp_flags);
				144	if (!rps)
				145	goto out_free_r1bio;
				146
				147	/*
				148	* Allocate bios : 1 for reading, n-1 for writing
				149	*/
				150	for (j = pi->raid_disks ; j-- ; ) {
				151	bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
				152	if (!bio)
				153	goto out_free_bio;
				154	r1_bio->bios[j] = bio;
				155	}
				156	/*
				157	* Allocate RESYNC_PAGES data pages and attach them to
				158	* the first bio.
				159	* If this is a user-requested check/repair, allocate
				160	* RESYNC_PAGES for each bio.
				161	*/
				162	if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery))
				163	need_pages = pi->raid_disks;
				164	else
				165	need_pages = 1;
				166	for (j = 0; j < pi->raid_disks; j++) {
				167	struct resync_pages *rp = &rps[j];
				168
				169	bio = r1_bio->bios[j];
				170
				171	if (j < need_pages) {
				172	if (resync_alloc_pages(rp, gfp_flags))
				173	goto out_free_pages;
				174	} else {
				175	memcpy(rp, &rps[0], sizeof(*rp));
				176	resync_get_all_pages(rp);
				177	}
				178
				179	rp->raid_bio = r1_bio;
				180	bio->bi_private = rp;
				181	}
				182
				183	r1_bio->master_bio = NULL;
				184
				185	return r1_bio;
				186
				187	out_free_pages:
				188	while (--j >= 0)
				189	resync_free_pages(&rps[j]);
				190
				191	out_free_bio:
				192	while (++j < pi->raid_disks)
				193	bio_put(r1_bio->bios[j]);
				194	kfree(rps);
				195
				196	out_free_r1bio:
				197	rbio_pool_free(r1_bio, data);
				198	return NULL;
				199	}
				200
				201	static void r1buf_pool_free(void __r1_bio, void data)
				202	{
				203	struct pool_info *pi = data;
				204	int i;
				205	struct r1bio *r1bio = __r1_bio;
				206	struct resync_pages *rp = NULL;
				207
				208	for (i = pi->raid_disks; i--; ) {
				209	rp = get_resync_pages(r1bio->bios[i]);
				210	resync_free_pages(rp);
				211	bio_put(r1bio->bios[i]);
				212	}
				213
				214	/* resync pages array stored in the 1st bio's .bi_private */
				215	kfree(rp);
				216
				217	rbio_pool_free(r1bio, data);
				218	}
				219
				220	static void put_all_bios(struct r1conf conf, struct r1bio r1_bio)
				221	{
				222	int i;
				223
				224	for (i = 0; i < conf->raid_disks * 2; i++) {
				225	struct bio **bio = r1_bio->bios + i;
				226	if (!BIO_SPECIAL(*bio))
				227	bio_put(*bio);
				228	*bio = NULL;
				229	}
				230	}
				231
				232	static void free_r1bio(struct r1bio *r1_bio)
				233	{
				234	struct r1conf *conf = r1_bio->mddev->private;
				235
				236	put_all_bios(conf, r1_bio);
				237	mempool_free(r1_bio, &conf->r1bio_pool);
				238	}
				239
				240	static void put_buf(struct r1bio *r1_bio)
				241	{
				242	struct r1conf *conf = r1_bio->mddev->private;
				243	sector_t sect = r1_bio->sector;
				244	int i;
				245
				246	for (i = 0; i < conf->raid_disks * 2; i++) {
				247	struct bio *bio = r1_bio->bios[i];
				248	if (bio->bi_end_io)
				249	rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev);
				250	}
				251
				252	mempool_free(r1_bio, &conf->r1buf_pool);
				253
				254	lower_barrier(conf, sect);
				255	}
				256
				257	static void reschedule_retry(struct r1bio *r1_bio)
				258	{
				259	unsigned long flags;
				260	struct mddev *mddev = r1_bio->mddev;
				261	struct r1conf *conf = mddev->private;
				262	int idx;
				263
				264	idx = sector_to_idx(r1_bio->sector);
				265	spin_lock_irqsave(&conf->device_lock, flags);
				266	list_add(&r1_bio->retry_list, &conf->retry_list);
				267	atomic_inc(&conf->nr_queued[idx]);
				268	spin_unlock_irqrestore(&conf->device_lock, flags);
				269
				270	wake_up(&conf->wait_barrier);
				271	md_wakeup_thread(mddev->thread);
				272	}
				273
				274	/*
				275	* raid_end_bio_io() is called when we have finished servicing a mirrored
				276	* operation and are ready to return a success/failure code to the buffer
				277	* cache layer.
				278	*/
				279	static void call_bio_endio(struct r1bio *r1_bio)
				280	{
				281	struct bio *bio = r1_bio->master_bio;
				282	struct r1conf *conf = r1_bio->mddev->private;
				283
				284	if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
				285	bio->bi_status = BLK_STS_IOERR;
				286
				287	bio_endio(bio);
				288	/*
				289	* Wake up any possible resync thread that waits for the device
				290	* to go idle.
				291	*/
				292	allow_barrier(conf, r1_bio->sector);
				293	}
				294
				295	static void raid_end_bio_io(struct r1bio *r1_bio)
				296	{
				297	struct bio *bio = r1_bio->master_bio;
				298
				299	/* if nobody has done the final endio yet, do it now */
				300	if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
				301	pr_debug("raid1: sync end %s on sectors %llu-%llu\n",
				302	(bio_data_dir(bio) == WRITE) ? "write" : "read",
				303	(unsigned long long) bio->bi_iter.bi_sector,
				304	(unsigned long long) bio_end_sector(bio) - 1);
				305
				306	call_bio_endio(r1_bio);
				307	}
				308	free_r1bio(r1_bio);
				309	}
				310
				311	/*
				312	* Update disk head position estimator based on IRQ completion info.
				313	*/
				314	static inline void update_head_pos(int disk, struct r1bio *r1_bio)
				315	{
				316	struct r1conf *conf = r1_bio->mddev->private;
				317
				318	conf->mirrors[disk].head_position =
				319	r1_bio->sector + (r1_bio->sectors);
				320	}
				321
				322	/*
				323	* Find the disk number which triggered given bio
				324	*/
				325	static int find_bio_disk(struct r1bio r1_bio, struct bio bio)
				326	{
				327	int mirror;
				328	struct r1conf *conf = r1_bio->mddev->private;
				329	int raid_disks = conf->raid_disks;
				330
				331	for (mirror = 0; mirror < raid_disks * 2; mirror++)
				332	if (r1_bio->bios[mirror] == bio)
				333	break;
				334
				335	BUG_ON(mirror == raid_disks * 2);
				336	update_head_pos(mirror, r1_bio);
				337
				338	return mirror;
				339	}
				340
				341	static void raid1_end_read_request(struct bio *bio)
				342	{
				343	int uptodate = !bio->bi_status;
				344	struct r1bio *r1_bio = bio->bi_private;
				345	struct r1conf *conf = r1_bio->mddev->private;
				346	struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev;
				347
				348	/*
				349	* this branch is our 'one mirror IO has finished' event handler:
				350	*/
				351	update_head_pos(r1_bio->read_disk, r1_bio);
				352
				353	if (uptodate)
				354	set_bit(R1BIO_Uptodate, &r1_bio->state);
				355	else if (test_bit(FailFast, &rdev->flags) &&
				356	test_bit(R1BIO_FailFast, &r1_bio->state))
				357	/* This was a fail-fast read so we definitely
				358	* want to retry */
				359	;
				360	else {
				361	/* If all other devices have failed, we want to return
				362	* the error upwards rather than fail the last device.
				363	* Here we redefine "uptodate" to mean "Don't want to retry"
				364	*/
				365	unsigned long flags;
				366	spin_lock_irqsave(&conf->device_lock, flags);
				367	if (r1_bio->mddev->degraded == conf->raid_disks \|\|
				368	(r1_bio->mddev->degraded == conf->raid_disks-1 &&
				369	test_bit(In_sync, &rdev->flags)))
				370	uptodate = 1;
				371	spin_unlock_irqrestore(&conf->device_lock, flags);
				372	}
				373
				374	if (uptodate) {
				375	raid_end_bio_io(r1_bio);
				376	rdev_dec_pending(rdev, conf->mddev);
				377	} else {
				378	/*
				379	* oops, read error:
				380	*/
				381	char b[BDEVNAME_SIZE];
				382	pr_err_ratelimited("md/raid1:%s: %s: rescheduling sector %llu\n",
				383	mdname(conf->mddev),
				384	bdevname(rdev->bdev, b),
				385	(unsigned long long)r1_bio->sector);
				386	set_bit(R1BIO_ReadError, &r1_bio->state);
				387	reschedule_retry(r1_bio);
				388	/* don't drop the reference on read_disk yet */
				389	}
				390	}
				391
				392	static void close_write(struct r1bio *r1_bio)
				393	{
				394	/* it really is the end of this request */
				395	if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
				396	bio_free_pages(r1_bio->behind_master_bio);
				397	bio_put(r1_bio->behind_master_bio);
				398	r1_bio->behind_master_bio = NULL;
				399	}
				400	/* clear the bitmap if all writes complete successfully */
				401	md_bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
				402	r1_bio->sectors,
				403	!test_bit(R1BIO_Degraded, &r1_bio->state),
				404	test_bit(R1BIO_BehindIO, &r1_bio->state));
				405	md_write_end(r1_bio->mddev);
				406	}
				407
				408	static void r1_bio_write_done(struct r1bio *r1_bio)
				409	{
				410	if (!atomic_dec_and_test(&r1_bio->remaining))
				411	return;
				412
				413	if (test_bit(R1BIO_WriteError, &r1_bio->state))
				414	reschedule_retry(r1_bio);
				415	else {
				416	close_write(r1_bio);
				417	if (test_bit(R1BIO_MadeGood, &r1_bio->state))
				418	reschedule_retry(r1_bio);
				419	else
				420	raid_end_bio_io(r1_bio);
				421	}
				422	}
				423
				424	static void raid1_end_write_request(struct bio *bio)
				425	{
				426	struct r1bio *r1_bio = bio->bi_private;
				427	int behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
				428	struct r1conf *conf = r1_bio->mddev->private;
				429	struct bio *to_put = NULL;
				430	int mirror = find_bio_disk(r1_bio, bio);
				431	struct md_rdev *rdev = conf->mirrors[mirror].rdev;
				432	bool discard_error;
				433
				434	discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;
				435
				436	/*
				437	* 'one mirror IO has finished' event handler:
				438	*/
				439	if (bio->bi_status && !discard_error) {
				440	set_bit(WriteErrorSeen, &rdev->flags);
				441	if (!test_and_set_bit(WantReplacement, &rdev->flags))
				442	set_bit(MD_RECOVERY_NEEDED, &
				443	conf->mddev->recovery);
				444
				445	if (test_bit(FailFast, &rdev->flags) &&
				446	(bio->bi_opf & MD_FAILFAST) &&
				447	/* We never try FailFast to WriteMostly devices */
				448	!test_bit(WriteMostly, &rdev->flags)) {
				449	md_error(r1_bio->mddev, rdev);
				450	}
				451
				452	/*
				453	* When the device is faulty, it is not necessary to
				454	* handle write error.
				455	*/
				456	if (!test_bit(Faulty, &rdev->flags))
				457	set_bit(R1BIO_WriteError, &r1_bio->state);
				458	else {
				459	/* Fail the request */
				460	set_bit(R1BIO_Degraded, &r1_bio->state);
				461	/* Finished with this branch */
				462	r1_bio->bios[mirror] = NULL;
				463	to_put = bio;
				464	}
				465	} else {
				466	/*
				467	* Set R1BIO_Uptodate in our master bio, so that we
				468	* will return a good error code for to the higher
				469	* levels even if IO on some other mirrored buffer
				470	* fails.
				471	*
				472	* The 'master' represents the composite IO operation
				473	* to user-side. So if something waits for IO, then it
				474	* will wait for the 'master' bio.
				475	*/
				476	sector_t first_bad;
				477	int bad_sectors;
				478
				479	r1_bio->bios[mirror] = NULL;
				480	to_put = bio;
				481	/*
				482	* Do not set R1BIO_Uptodate if the current device is
				483	* rebuilding or Faulty. This is because we cannot use
				484	* such device for properly reading the data back (we could
				485	* potentially use it, if the current write would have felt
				486	* before rdev->recovery_offset, but for simplicity we don't
				487	* check this here.
				488	*/
				489	if (test_bit(In_sync, &rdev->flags) &&
				490	!test_bit(Faulty, &rdev->flags))
				491	set_bit(R1BIO_Uptodate, &r1_bio->state);
				492
				493	/* Maybe we can clear some bad blocks. */
				494	if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors,
				495	&first_bad, &bad_sectors) && !discard_error) {
				496	r1_bio->bios[mirror] = IO_MADE_GOOD;
				497	set_bit(R1BIO_MadeGood, &r1_bio->state);
				498	}
				499	}
				500
				501	if (behind) {
				502	if (test_bit(WBCollisionCheck, &rdev->flags)) {
				503	sector_t lo = r1_bio->sector;
				504	sector_t hi = r1_bio->sector + r1_bio->sectors;
				505
				506	remove_wb(rdev, lo, hi);
				507	}
				508	if (test_bit(WriteMostly, &rdev->flags))
				509	atomic_dec(&r1_bio->behind_remaining);
				510
				511	/*
				512	* In behind mode, we ACK the master bio once the I/O
				513	* has safely reached all non-writemostly
				514	* disks. Setting the Returned bit ensures that this
				515	* gets done only once -- we don't ever want to return
				516	* -EIO here, instead we'll wait
				517	*/
				518	if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
				519	test_bit(R1BIO_Uptodate, &r1_bio->state)) {
				520	/* Maybe we can return now */
				521	if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
				522	struct bio *mbio = r1_bio->master_bio;
				523	pr_debug("raid1: behind end write sectors"
				524	" %llu-%llu\n",
				525	(unsigned long long) mbio->bi_iter.bi_sector,
				526	(unsigned long long) bio_end_sector(mbio) - 1);
				527	call_bio_endio(r1_bio);
				528	}
				529	}
				530	}
				531	if (r1_bio->bios[mirror] == NULL)
				532	rdev_dec_pending(rdev, conf->mddev);
				533
				534	/*
				535	* Let's see if all mirrored write operations have finished
				536	* already.
				537	*/
				538	r1_bio_write_done(r1_bio);
				539
				540	if (to_put)
				541	bio_put(to_put);
				542	}
				543
				544	static sector_t align_to_barrier_unit_end(sector_t start_sector,
				545	sector_t sectors)
				546	{
				547	sector_t len;
				548
				549	WARN_ON(sectors == 0);
				550	/*
				551	* len is the number of sectors from start_sector to end of the
				552	* barrier unit which start_sector belongs to.
				553	*/
				554	len = round_up(start_sector + 1, BARRIER_UNIT_SECTOR_SIZE) -
				555	start_sector;
				556
				557	if (len > sectors)
				558	len = sectors;
				559
				560	return len;
				561	}
				562
				563	/*
				564	* This routine returns the disk from which the requested read should
				565	* be done. There is a per-array 'next expected sequential IO' sector
				566	* number - if this matches on the next IO then we use the last disk.
				567	* There is also a per-disk 'last know head position' sector that is
				568	* maintained from IRQ contexts, both the normal and the resync IO
				569	* completion handlers update this position correctly. If there is no
				570	* perfect sequential match then we pick the disk whose head is closest.
				571	*
				572	* If there are 2 mirrors in the same 2 devices, performance degrades
				573	* because position is mirror, not device based.
				574	*
				575	* The rdev for the device selected will have nr_pending incremented.
				576	*/
				577	static int read_balance(struct r1conf conf, struct r1bio r1_bio, int *max_sectors)
				578	{
				579	const sector_t this_sector = r1_bio->sector;
				580	int sectors;
				581	int best_good_sectors;
				582	int best_disk, best_dist_disk, best_pending_disk;
				583	int has_nonrot_disk;
				584	int disk;
				585	sector_t best_dist;
				586	unsigned int min_pending;
				587	struct md_rdev *rdev;
				588	int choose_first;
				589	int choose_next_idle;
				590
				591	rcu_read_lock();
				592	/*
				593	* Check if we can balance. We can balance on the whole
				594	* device if no resync is going on, or below the resync window.
				595	* We take the first readable disk when above the resync window.
				596	*/
				597	retry:
				598	sectors = r1_bio->sectors;
				599	best_disk = -1;
				600	best_dist_disk = -1;
				601	best_dist = MaxSector;
				602	best_pending_disk = -1;
				603	min_pending = UINT_MAX;
				604	best_good_sectors = 0;
				605	has_nonrot_disk = 0;
				606	choose_next_idle = 0;
				607	clear_bit(R1BIO_FailFast, &r1_bio->state);
				608
				609	if ((conf->mddev->recovery_cp < this_sector + sectors) \|\|
				610	(mddev_is_clustered(conf->mddev) &&
				611	md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
				612	this_sector + sectors)))
				613	choose_first = 1;
				614	else
				615	choose_first = 0;
				616
				617	for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
				618	sector_t dist;
				619	sector_t first_bad;
				620	int bad_sectors;
				621	unsigned int pending;
				622	bool nonrot;
				623
				624	rdev = rcu_dereference(conf->mirrors[disk].rdev);
				625	if (r1_bio->bios[disk] == IO_BLOCKED
				626	\|\| rdev == NULL
				627	\|\| test_bit(Faulty, &rdev->flags))
				628	continue;
				629	if (!test_bit(In_sync, &rdev->flags) &&
				630	rdev->recovery_offset < this_sector + sectors)
				631	continue;
				632	if (test_bit(WriteMostly, &rdev->flags)) {
				633	/* Don't balance among write-mostly, just
				634	* use the first as a last resort */
				635	if (best_dist_disk < 0) {
				636	if (is_badblock(rdev, this_sector, sectors,
				637	&first_bad, &bad_sectors)) {
				638	if (first_bad <= this_sector)
				639	/* Cannot use this */
				640	continue;
				641	best_good_sectors = first_bad - this_sector;
				642	} else
				643	best_good_sectors = sectors;
				644	best_dist_disk = disk;
				645	best_pending_disk = disk;
				646	}
				647	continue;
				648	}
				649	/* This is a reasonable device to use. It might
				650	* even be best.
				651	*/
				652	if (is_badblock(rdev, this_sector, sectors,
				653	&first_bad, &bad_sectors)) {
				654	if (best_dist < MaxSector)
				655	/* already have a better device */
				656	continue;
				657	if (first_bad <= this_sector) {
				658	/* cannot read here. If this is the 'primary'
				659	* device, then we must not read beyond
				660	* bad_sectors from another device..
				661	*/
				662	bad_sectors -= (this_sector - first_bad);
				663	if (choose_first && sectors > bad_sectors)
				664	sectors = bad_sectors;
				665	if (best_good_sectors > sectors)
				666	best_good_sectors = sectors;
				667
				668	} else {
				669	sector_t good_sectors = first_bad - this_sector;
				670	if (good_sectors > best_good_sectors) {
				671	best_good_sectors = good_sectors;
				672	best_disk = disk;
				673	}
				674	if (choose_first)
				675	break;
				676	}
				677	continue;
				678	} else {
				679	if ((sectors > best_good_sectors) && (best_disk >= 0))
				680	best_disk = -1;
				681	best_good_sectors = sectors;
				682	}
				683
				684	if (best_disk >= 0)
				685	/* At least two disks to choose from so failfast is OK */
				686	set_bit(R1BIO_FailFast, &r1_bio->state);
				687
				688	nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
				689	has_nonrot_disk \|= nonrot;
				690	pending = atomic_read(&rdev->nr_pending);
				691	dist = abs(this_sector - conf->mirrors[disk].head_position);
				692	if (choose_first) {
				693	best_disk = disk;
				694	break;
				695	}
				696	/* Don't change to another disk for sequential reads */
				697	if (conf->mirrors[disk].next_seq_sect == this_sector
				698	\|\| dist == 0) {
				699	int opt_iosize = bdev_io_opt(rdev->bdev) >> 9;
				700	struct raid1_info *mirror = &conf->mirrors[disk];
				701
				702	best_disk = disk;
				703	/*
				704	* If buffered sequential IO size exceeds optimal
				705	* iosize, check if there is idle disk. If yes, choose
				706	* the idle disk. read_balance could already choose an
				707	* idle disk before noticing it's a sequential IO in
				708	* this disk. This doesn't matter because this disk
				709	* will idle, next time it will be utilized after the
				710	* first disk has IO size exceeds optimal iosize. In
				711	* this way, iosize of the first disk will be optimal
				712	* iosize at least. iosize of the second disk might be
				713	* small, but not a big deal since when the second disk
				714	* starts IO, the first disk is likely still busy.
				715	*/
				716	if (nonrot && opt_iosize > 0 &&
				717	mirror->seq_start != MaxSector &&
				718	mirror->next_seq_sect > opt_iosize &&
				719	mirror->next_seq_sect - opt_iosize >=
				720	mirror->seq_start) {
				721	choose_next_idle = 1;
				722	continue;
				723	}
				724	break;
				725	}
				726
				727	if (choose_next_idle)
				728	continue;
				729
				730	if (min_pending > pending) {
				731	min_pending = pending;
				732	best_pending_disk = disk;
				733	}
				734
				735	if (dist < best_dist) {
				736	best_dist = dist;
				737	best_dist_disk = disk;
				738	}
				739	}
				740
				741	/*
				742	* If all disks are rotational, choose the closest disk. If any disk is
				743	* non-rotational, choose the disk with less pending request even the
				744	* disk is rotational, which might/might not be optimal for raids with
				745	* mixed ratation/non-rotational disks depending on workload.
				746	*/
				747	if (best_disk == -1) {
				748	if (has_nonrot_disk \|\| min_pending == 0)
				749	best_disk = best_pending_disk;
				750	else
				751	best_disk = best_dist_disk;
				752	}
				753
				754	if (best_disk >= 0) {
				755	rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
				756	if (!rdev)
				757	goto retry;
				758	atomic_inc(&rdev->nr_pending);
				759	sectors = best_good_sectors;
				760
				761	if (conf->mirrors[best_disk].next_seq_sect != this_sector)
				762	conf->mirrors[best_disk].seq_start = this_sector;
				763
				764	conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
				765	}
				766	rcu_read_unlock();
				767	*max_sectors = sectors;
				768
				769	return best_disk;
				770	}
				771
				772	static int raid1_congested(struct mddev *mddev, int bits)
				773	{
				774	struct r1conf *conf = mddev->private;
				775	int i, ret = 0;
				776
				777	if ((bits & (1 << WB_async_congested)) &&
				778	conf->pending_count >= max_queued_requests)
				779	return 1;
				780
				781	rcu_read_lock();
				782	for (i = 0; i < conf->raid_disks * 2; i++) {
				783	struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
				784	if (rdev && !test_bit(Faulty, &rdev->flags)) {
				785	struct request_queue *q = bdev_get_queue(rdev->bdev);
				786
				787	BUG_ON(!q);
				788
				789	/* Note the '\|\| 1' - when read_balance prefers
				790	* non-congested targets, it can be removed
				791	*/
				792	if ((bits & (1 << WB_async_congested)) \|\| 1)
				793	ret \|= bdi_congested(q->backing_dev_info, bits);
				794	else
				795	ret &= bdi_congested(q->backing_dev_info, bits);
				796	}
				797	}
				798	rcu_read_unlock();
				799	return ret;
				800	}
				801
				802	static void flush_bio_list(struct r1conf conf, struct bio bio)
				803	{
				804	/* flush any pending bitmap writes to disk before proceeding w/ I/O */
				805	md_bitmap_unplug(conf->mddev->bitmap);
				806	wake_up(&conf->wait_barrier);
				807
				808	while (bio) { /* submit pending writes */
				809	struct bio *next = bio->bi_next;
				810	struct md_rdev rdev = (void )bio->bi_disk;
				811	bio->bi_next = NULL;
				812	bio_set_dev(bio, rdev->bdev);
				813	if (test_bit(Faulty, &rdev->flags)) {
				814	bio_io_error(bio);
				815	} else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
				816	!blk_queue_discard(bio->bi_disk->queue)))
				817	/* Just ignore it */
				818	bio_endio(bio);
				819	else
				820	generic_make_request(bio);
				821	bio = next;
				822	}
				823	}
				824
				825	static void flush_pending_writes(struct r1conf *conf)
				826	{
				827	/* Any writes that have been queued but are awaiting
				828	* bitmap updates get flushed here.
				829	*/
				830	spin_lock_irq(&conf->device_lock);
				831
				832	if (conf->pending_bio_list.head) {
				833	struct blk_plug plug;
				834	struct bio *bio;
				835
				836	bio = bio_list_get(&conf->pending_bio_list);
				837	conf->pending_count = 0;
				838	spin_unlock_irq(&conf->device_lock);
				839
				840	/*
				841	* As this is called in a wait_event() loop (see freeze_array),
				842	* current->state might be TASK_UNINTERRUPTIBLE which will
				843	* cause a warning when we prepare to wait again. As it is
				844	* rare that this path is taken, it is perfectly safe to force
				845	* us to go around the wait_event() loop again, so the warning
				846	* is a false-positive. Silence the warning by resetting
				847	* thread state
				848	*/
				849	__set_current_state(TASK_RUNNING);
				850	blk_start_plug(&plug);
				851	flush_bio_list(conf, bio);
				852	blk_finish_plug(&plug);
				853	} else
				854	spin_unlock_irq(&conf->device_lock);
				855	}
				856
				857	/* Barriers....
				858	* Sometimes we need to suspend IO while we do something else,
				859	* either some resync/recovery, or reconfigure the array.
				860	* To do this we raise a 'barrier'.
				861	* The 'barrier' is a counter that can be raised multiple times
				862	* to count how many activities are happening which preclude
				863	* normal IO.
				864	* We can only raise the barrier if there is no pending IO.
				865	* i.e. if nr_pending == 0.
				866	* We choose only to raise the barrier if no-one is waiting for the
				867	* barrier to go down. This means that as soon as an IO request
				868	* is ready, no other operations which require a barrier will start
				869	* until the IO request has had a chance.
				870	*
				871	* So: regular IO calls 'wait_barrier'. When that returns there
				872	* is no backgroup IO happening, It must arrange to call
				873	* allow_barrier when it has finished its IO.
				874	* backgroup IO calls must call raise_barrier. Once that returns
				875	* there is no normal IO happeing. It must arrange to call
				876	* lower_barrier when the particular background IO completes.
				877	*
				878	* If resync/recovery is interrupted, returns -EINTR;
				879	* Otherwise, returns 0.
				880	*/
				881	static int raise_barrier(struct r1conf *conf, sector_t sector_nr)
				882	{
				883	int idx = sector_to_idx(sector_nr);
				884
				885	spin_lock_irq(&conf->resync_lock);
				886
				887	/* Wait until no block IO is waiting */
				888	wait_event_lock_irq(conf->wait_barrier,
				889	!atomic_read(&conf->nr_waiting[idx]),
				890	conf->resync_lock);
				891
				892	/* block any new IO from starting */
				893	atomic_inc(&conf->barrier[idx]);
				894	/*
				895	* In raise_barrier() we firstly increase conf->barrier[idx] then
				896	* check conf->nr_pending[idx]. In _wait_barrier() we firstly
				897	* increase conf->nr_pending[idx] then check conf->barrier[idx].
				898	* A memory barrier here to make sure conf->nr_pending[idx] won't
				899	* be fetched before conf->barrier[idx] is increased. Otherwise
				900	* there will be a race between raise_barrier() and _wait_barrier().
				901	*/
				902	smp_mb__after_atomic();
				903
				904	/* For these conditions we must wait:
				905	* A: while the array is in frozen state
				906	* B: while conf->nr_pending[idx] is not 0, meaning regular I/O
				907	* existing in corresponding I/O barrier bucket.
				908	* C: while conf->barrier[idx] >= RESYNC_DEPTH, meaning reaches
				909	* max resync count which allowed on current I/O barrier bucket.
				910	*/
				911	wait_event_lock_irq(conf->wait_barrier,
				912	(!conf->array_frozen &&
				913	!atomic_read(&conf->nr_pending[idx]) &&
				914	atomic_read(&conf->barrier[idx]) < RESYNC_DEPTH) \|\|
				915	test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery),
				916	conf->resync_lock);
				917
				918	if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
				919	atomic_dec(&conf->barrier[idx]);
				920	spin_unlock_irq(&conf->resync_lock);
				921	wake_up(&conf->wait_barrier);
				922	return -EINTR;
				923	}
				924
				925	atomic_inc(&conf->nr_sync_pending);
				926	spin_unlock_irq(&conf->resync_lock);
				927
				928	return 0;
				929	}
				930
				931	static void lower_barrier(struct r1conf *conf, sector_t sector_nr)
				932	{
				933	int idx = sector_to_idx(sector_nr);
				934
				935	BUG_ON(atomic_read(&conf->barrier[idx]) <= 0);
				936
				937	atomic_dec(&conf->barrier[idx]);
				938	atomic_dec(&conf->nr_sync_pending);
				939	wake_up(&conf->wait_barrier);
				940	}
				941
				942	static void _wait_barrier(struct r1conf *conf, int idx)
				943	{
				944	/*
				945	* We need to increase conf->nr_pending[idx] very early here,
				946	* then raise_barrier() can be blocked when it waits for
				947	* conf->nr_pending[idx] to be 0. Then we can avoid holding
				948	* conf->resync_lock when there is no barrier raised in same
				949	* barrier unit bucket. Also if the array is frozen, I/O
				950	* should be blocked until array is unfrozen.
				951	*/
				952	atomic_inc(&conf->nr_pending[idx]);
				953	/*
				954	* In _wait_barrier() we firstly increase conf->nr_pending[idx], then
				955	* check conf->barrier[idx]. In raise_barrier() we firstly increase
				956	* conf->barrier[idx], then check conf->nr_pending[idx]. A memory
				957	* barrier is necessary here to make sure conf->barrier[idx] won't be
				958	* fetched before conf->nr_pending[idx] is increased. Otherwise there
				959	* will be a race between _wait_barrier() and raise_barrier().
				960	*/
				961	smp_mb__after_atomic();
				962
				963	/*
				964	* Don't worry about checking two atomic_t variables at same time
				965	* here. If during we check conf->barrier[idx], the array is
				966	* frozen (conf->array_frozen is 1), and chonf->barrier[idx] is
				967	* 0, it is safe to return and make the I/O continue. Because the
				968	* array is frozen, all I/O returned here will eventually complete
				969	* or be queued, no race will happen. See code comment in
				970	* frozen_array().
				971	*/
				972	if (!READ_ONCE(conf->array_frozen) &&
				973	!atomic_read(&conf->barrier[idx]))
				974	return;
				975
				976	/*
				977	* After holding conf->resync_lock, conf->nr_pending[idx]
				978	* should be decreased before waiting for barrier to drop.
				979	* Otherwise, we may encounter a race condition because
				980	* raise_barrer() might be waiting for conf->nr_pending[idx]
				981	* to be 0 at same time.
				982	*/
				983	spin_lock_irq(&conf->resync_lock);
				984	atomic_inc(&conf->nr_waiting[idx]);
				985	atomic_dec(&conf->nr_pending[idx]);
				986	/*
				987	* In case freeze_array() is waiting for
				988	* get_unqueued_pending() == extra
				989	*/
				990	wake_up(&conf->wait_barrier);
				991	/* Wait for the barrier in same barrier unit bucket to drop. */
				992	wait_event_lock_irq(conf->wait_barrier,
				993	!conf->array_frozen &&
				994	!atomic_read(&conf->barrier[idx]),
				995	conf->resync_lock);
				996	atomic_inc(&conf->nr_pending[idx]);
				997	atomic_dec(&conf->nr_waiting[idx]);
				998	spin_unlock_irq(&conf->resync_lock);
				999	}
				1000
				1001	static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
				1002	{
				1003	int idx = sector_to_idx(sector_nr);
				1004
				1005	/*
				1006	* Very similar to _wait_barrier(). The difference is, for read
				1007	* I/O we don't need wait for sync I/O, but if the whole array
				1008	* is frozen, the read I/O still has to wait until the array is
				1009	* unfrozen. Since there is no ordering requirement with
				1010	* conf->barrier[idx] here, memory barrier is unnecessary as well.
				1011	*/
				1012	atomic_inc(&conf->nr_pending[idx]);
				1013
				1014	if (!READ_ONCE(conf->array_frozen))
				1015	return;
				1016
				1017	spin_lock_irq(&conf->resync_lock);
				1018	atomic_inc(&conf->nr_waiting[idx]);
				1019	atomic_dec(&conf->nr_pending[idx]);
				1020	/*
				1021	* In case freeze_array() is waiting for
				1022	* get_unqueued_pending() == extra
				1023	*/
				1024	wake_up(&conf->wait_barrier);
				1025	/* Wait for array to be unfrozen */
				1026	wait_event_lock_irq(conf->wait_barrier,
				1027	!conf->array_frozen,
				1028	conf->resync_lock);
				1029	atomic_inc(&conf->nr_pending[idx]);
				1030	atomic_dec(&conf->nr_waiting[idx]);
				1031	spin_unlock_irq(&conf->resync_lock);
				1032	}
				1033
				1034	static void wait_barrier(struct r1conf *conf, sector_t sector_nr)
				1035	{
				1036	int idx = sector_to_idx(sector_nr);
				1037
				1038	_wait_barrier(conf, idx);
				1039	}
				1040
				1041	static void _allow_barrier(struct r1conf *conf, int idx)
				1042	{
				1043	atomic_dec(&conf->nr_pending[idx]);
				1044	wake_up(&conf->wait_barrier);
				1045	}
				1046
				1047	static void allow_barrier(struct r1conf *conf, sector_t sector_nr)
				1048	{
				1049	int idx = sector_to_idx(sector_nr);
				1050
				1051	_allow_barrier(conf, idx);
				1052	}
				1053
				1054	/* conf->resync_lock should be held */
				1055	static int get_unqueued_pending(struct r1conf *conf)
				1056	{
				1057	int idx, ret;
				1058
				1059	ret = atomic_read(&conf->nr_sync_pending);
				1060	for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
				1061	ret += atomic_read(&conf->nr_pending[idx]) -
				1062	atomic_read(&conf->nr_queued[idx]);
				1063
				1064	return ret;
				1065	}
				1066
				1067	static void freeze_array(struct r1conf *conf, int extra)
				1068	{
				1069	/* Stop sync I/O and normal I/O and wait for everything to
				1070	* go quiet.
				1071	* This is called in two situations:
				1072	* 1) management command handlers (reshape, remove disk, quiesce).
				1073	* 2) one normal I/O request failed.
				1074
				1075	* After array_frozen is set to 1, new sync IO will be blocked at
				1076	* raise_barrier(), and new normal I/O will blocked at _wait_barrier()
				1077	* or wait_read_barrier(). The flying I/Os will either complete or be
				1078	* queued. When everything goes quite, there are only queued I/Os left.
				1079
				1080	* Every flying I/O contributes to a conf->nr_pending[idx], idx is the
				1081	* barrier bucket index which this I/O request hits. When all sync and
				1082	* normal I/O are queued, sum of all conf->nr_pending[] will match sum
				1083	* of all conf->nr_queued[]. But normal I/O failure is an exception,
				1084	* in handle_read_error(), we may call freeze_array() before trying to
				1085	* fix the read error. In this case, the error read I/O is not queued,
				1086	* so get_unqueued_pending() == 1.
				1087	*
				1088	* Therefore before this function returns, we need to wait until
				1089	* get_unqueued_pendings(conf) gets equal to extra. For
				1090	* normal I/O context, extra is 1, in rested situations extra is 0.
				1091	*/
				1092	spin_lock_irq(&conf->resync_lock);
				1093	conf->array_frozen = 1;
				1094	raid1_log(conf->mddev, "wait freeze");
				1095	wait_event_lock_irq_cmd(
				1096	conf->wait_barrier,
				1097	get_unqueued_pending(conf) == extra,
				1098	conf->resync_lock,
				1099	flush_pending_writes(conf));
				1100	spin_unlock_irq(&conf->resync_lock);
				1101	}
				1102	static void unfreeze_array(struct r1conf *conf)
				1103	{
				1104	/* reverse the effect of the freeze */
				1105	spin_lock_irq(&conf->resync_lock);
				1106	conf->array_frozen = 0;
				1107	spin_unlock_irq(&conf->resync_lock);
				1108	wake_up(&conf->wait_barrier);
				1109	}
				1110
				1111	static void alloc_behind_master_bio(struct r1bio *r1_bio,
				1112	struct bio *bio)
				1113	{
				1114	int size = bio->bi_iter.bi_size;
				1115	unsigned vcnt = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
				1116	int i = 0;
				1117	struct bio *behind_bio = NULL;
				1118
				1119	behind_bio = bio_alloc_mddev(GFP_NOIO, vcnt, r1_bio->mddev);
				1120	if (!behind_bio)
				1121	return;
				1122
				1123	/* discard op, we don't support writezero/writesame yet */
				1124	if (!bio_has_data(bio)) {
				1125	behind_bio->bi_iter.bi_size = size;
				1126	goto skip_copy;
				1127	}
				1128
				1129	behind_bio->bi_write_hint = bio->bi_write_hint;
				1130
				1131	while (i < vcnt && size) {
				1132	struct page *page;
				1133	int len = min_t(int, PAGE_SIZE, size);
				1134
				1135	page = alloc_page(GFP_NOIO);
				1136	if (unlikely(!page))
				1137	goto free_pages;
				1138
				1139	bio_add_page(behind_bio, page, len, 0);
				1140
				1141	size -= len;
				1142	i++;
				1143	}
				1144
				1145	bio_copy_data(behind_bio, bio);
				1146	skip_copy:
				1147	r1_bio->behind_master_bio = behind_bio;
				1148	set_bit(R1BIO_BehindIO, &r1_bio->state);
				1149
				1150	return;
				1151
				1152	free_pages:
				1153	pr_debug("%dB behind alloc failed, doing sync I/O\n",
				1154	bio->bi_iter.bi_size);
				1155	bio_free_pages(behind_bio);
				1156	bio_put(behind_bio);
				1157	}
				1158
/*
 * Per-plug state for raid1 writes: raid1_write_request() parks cloned
 * write bios here while a block plug is active, and raid1_unplug() later
 * hands them on.
 */
struct raid1_plug_cb {
	/*
	 * Embedded plug callback; raid1_unplug() recovers the container
	 * from it with container_of().
	 * NOTE(review): presumably has to stay the first member because
	 * blk_check_plugged() allocates the whole structure - confirm.
	 */
	struct blk_plug_cb cb;
	/* write bios collected while plugged */
	struct bio_list pending;
	/* number of bios added to @pending */
	int pending_cnt;
};
				1164
				1165	static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
				1166	{
				1167	struct raid1_plug_cb *plug = container_of(cb, struct raid1_plug_cb,
				1168	cb);
				1169	struct mddev *mddev = plug->cb.data;
				1170	struct r1conf *conf = mddev->private;
				1171	struct bio *bio;
				1172
				1173	if (from_schedule \|\| current->bio_list) {
				1174	spin_lock_irq(&conf->device_lock);
				1175	bio_list_merge(&conf->pending_bio_list, &plug->pending);
				1176	conf->pending_count += plug->pending_cnt;
				1177	spin_unlock_irq(&conf->device_lock);
				1178	wake_up(&conf->wait_barrier);
				1179	md_wakeup_thread(mddev->thread);
				1180	kfree(plug);
				1181	return;
				1182	}
				1183
				1184	/* we aren't scheduling, so we can do the write-out directly. */
				1185	bio = bio_list_get(&plug->pending);
				1186	flush_bio_list(conf, bio);
				1187	kfree(plug);
				1188	}
				1189
				1190	static void init_r1bio(struct r1bio r1_bio, struct mddev mddev, struct bio *bio)
				1191	{
				1192	r1_bio->master_bio = bio;
				1193	r1_bio->sectors = bio_sectors(bio);
				1194	r1_bio->state = 0;
				1195	r1_bio->mddev = mddev;
				1196	r1_bio->sector = bio->bi_iter.bi_sector;
				1197	}
				1198
				1199	static inline struct r1bio *
				1200	alloc_r1bio(struct mddev mddev, struct bio bio)
				1201	{
				1202	struct r1conf *conf = mddev->private;
				1203	struct r1bio *r1_bio;
				1204
				1205	r1_bio = mempool_alloc(&conf->r1bio_pool, GFP_NOIO);
				1206	/* Ensure no bio records IO_BLOCKED */
				1207	memset(r1_bio->bios, 0, conf->raid_disks * sizeof(r1_bio->bios[0]));
				1208	init_r1bio(r1_bio, mddev, bio);
				1209	return r1_bio;
				1210	}
				1211
				1212	static void raid1_read_request(struct mddev mddev, struct bio bio,
				1213	int max_read_sectors, struct r1bio *r1_bio)
				1214	{
				1215	struct r1conf *conf = mddev->private;
				1216	struct raid1_info *mirror;
				1217	struct bio *read_bio;
				1218	struct bitmap *bitmap = mddev->bitmap;
				1219	const int op = bio_op(bio);
				1220	const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
				1221	int max_sectors;
				1222	int rdisk;
				1223	bool print_msg = !!r1_bio;
				1224	char b[BDEVNAME_SIZE];
				1225
				1226	/*
				1227	* If r1_bio is set, we are blocking the raid1d thread
				1228	* so there is a tiny risk of deadlock. So ask for
				1229	* emergency memory if needed.
				1230	*/
				1231	gfp_t gfp = r1_bio ? (GFP_NOIO \| __GFP_HIGH) : GFP_NOIO;
				1232
				1233	if (print_msg) {
				1234	/* Need to get the block device name carefully */
				1235	struct md_rdev *rdev;
				1236	rcu_read_lock();
				1237	rdev = rcu_dereference(conf->mirrors[r1_bio->read_disk].rdev);
				1238	if (rdev)
				1239	bdevname(rdev->bdev, b);
				1240	else
				1241	strcpy(b, "???");
				1242	rcu_read_unlock();
				1243	}
				1244
				1245	/*
				1246	* Still need barrier for READ in case that whole
				1247	* array is frozen.
				1248	*/
				1249	wait_read_barrier(conf, bio->bi_iter.bi_sector);
				1250
				1251	if (!r1_bio)
				1252	r1_bio = alloc_r1bio(mddev, bio);
				1253	else
				1254	init_r1bio(r1_bio, mddev, bio);
				1255	r1_bio->sectors = max_read_sectors;
				1256
				1257	/*
				1258	* make_request() can abort the operation when read-ahead is being
				1259	* used and no empty request is available.
				1260	*/
				1261	rdisk = read_balance(conf, r1_bio, &max_sectors);
				1262
				1263	if (rdisk < 0) {
				1264	/* couldn't find anywhere to read from */
				1265	if (print_msg) {
				1266	pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n",
				1267	mdname(mddev),
				1268	b,
				1269	(unsigned long long)r1_bio->sector);
				1270	}
				1271	raid_end_bio_io(r1_bio);
				1272	return;
				1273	}
				1274	mirror = conf->mirrors + rdisk;
				1275
				1276	if (print_msg)
				1277	pr_info_ratelimited("md/raid1:%s: redirecting sector %llu to other mirror: %s\n",
				1278	mdname(mddev),
				1279	(unsigned long long)r1_bio->sector,
				1280	bdevname(mirror->rdev->bdev, b));
				1281
				1282	if (test_bit(WriteMostly, &mirror->rdev->flags) &&
				1283	bitmap) {
				1284	/*
				1285	* Reading from a write-mostly device must take care not to
				1286	* over-take any writes that are 'behind'
				1287	*/
				1288	raid1_log(mddev, "wait behind writes");
				1289	wait_event(bitmap->behind_wait,
				1290	atomic_read(&bitmap->behind_writes) == 0);
				1291	}
				1292
				1293	if (max_sectors < bio_sectors(bio)) {
				1294	struct bio *split = bio_split(bio, max_sectors,
				1295	gfp, &conf->bio_split);
				1296	bio_chain(split, bio);
				1297	generic_make_request(bio);
				1298	bio = split;
				1299	r1_bio->master_bio = bio;
				1300	r1_bio->sectors = max_sectors;
				1301	}
				1302
				1303	r1_bio->read_disk = rdisk;
				1304
				1305	read_bio = bio_clone_fast(bio, gfp, &mddev->bio_set);
				1306
				1307	r1_bio->bios[rdisk] = read_bio;
				1308
				1309	read_bio->bi_iter.bi_sector = r1_bio->sector +
				1310	mirror->rdev->data_offset;
				1311	bio_set_dev(read_bio, mirror->rdev->bdev);
				1312	read_bio->bi_end_io = raid1_end_read_request;
				1313	bio_set_op_attrs(read_bio, op, do_sync);
				1314	if (test_bit(FailFast, &mirror->rdev->flags) &&
				1315	test_bit(R1BIO_FailFast, &r1_bio->state))
				1316	read_bio->bi_opf \|= MD_FAILFAST;
				1317	read_bio->bi_private = r1_bio;
				1318
				1319	if (mddev->gendisk)
				1320	trace_block_bio_remap(read_bio->bi_disk->queue, read_bio,
				1321	disk_devt(mddev->gendisk), r1_bio->sector);
				1322
				1323	generic_make_request(read_bio);
				1324	}
				1325
				1326	static void raid1_write_request(struct mddev mddev, struct bio bio,
				1327	int max_write_sectors)
				1328	{
				1329	struct r1conf *conf = mddev->private;
				1330	struct r1bio *r1_bio;
				1331	int i, disks;
				1332	struct bitmap *bitmap = mddev->bitmap;
				1333	unsigned long flags;
				1334	struct md_rdev *blocked_rdev;
				1335	struct blk_plug_cb *cb;
				1336	struct raid1_plug_cb *plug = NULL;
				1337	int first_clone;
				1338	int max_sectors;
				1339
				1340	if (mddev_is_clustered(mddev) &&
				1341	md_cluster_ops->area_resyncing(mddev, WRITE,
				1342	bio->bi_iter.bi_sector, bio_end_sector(bio))) {
				1343
				1344	DEFINE_WAIT(w);
				1345	for (;;) {
				1346	prepare_to_wait(&conf->wait_barrier,
				1347	&w, TASK_IDLE);
				1348	if (!md_cluster_ops->area_resyncing(mddev, WRITE,
				1349	bio->bi_iter.bi_sector,
				1350	bio_end_sector(bio)))
				1351	break;
				1352	schedule();
				1353	}
				1354	finish_wait(&conf->wait_barrier, &w);
				1355	}
				1356
				1357	/*
				1358	* Register the new request and wait if the reconstruction
				1359	* thread has put up a bar for new requests.
				1360	* Continue immediately if no resync is active currently.
				1361	*/
				1362	wait_barrier(conf, bio->bi_iter.bi_sector);
				1363
				1364	r1_bio = alloc_r1bio(mddev, bio);
				1365	r1_bio->sectors = max_write_sectors;
				1366
				1367	if (conf->pending_count >= max_queued_requests) {
				1368	md_wakeup_thread(mddev->thread);
				1369	raid1_log(mddev, "wait queued");
				1370	wait_event(conf->wait_barrier,
				1371	conf->pending_count < max_queued_requests);
				1372	}
				1373	/* first select target devices under rcu_lock and
				1374	* inc refcount on their rdev. Record them by setting
				1375	* bios[x] to bio
				1376	* If there are known/acknowledged bad blocks on any device on
				1377	* which we have seen a write error, we want to avoid writing those
				1378	* blocks.
				1379	* This potentially requires several writes to write around
				1380	* the bad blocks. Each set of writes gets it's own r1bio
				1381	* with a set of bios attached.
				1382	*/
				1383
				1384	disks = conf->raid_disks * 2;
				1385	retry_write:
				1386	blocked_rdev = NULL;
				1387	rcu_read_lock();
				1388	max_sectors = r1_bio->sectors;
				1389	for (i = 0; i < disks; i++) {
				1390	struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
				1391	if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
				1392	atomic_inc(&rdev->nr_pending);
				1393	blocked_rdev = rdev;
				1394	break;
				1395	}
				1396	r1_bio->bios[i] = NULL;
				1397	if (!rdev \|\| test_bit(Faulty, &rdev->flags)) {
				1398	if (i < conf->raid_disks)
				1399	set_bit(R1BIO_Degraded, &r1_bio->state);
				1400	continue;
				1401	}
				1402
				1403	atomic_inc(&rdev->nr_pending);
				1404	if (test_bit(WriteErrorSeen, &rdev->flags)) {
				1405	sector_t first_bad;
				1406	int bad_sectors;
				1407	int is_bad;
				1408
				1409	is_bad = is_badblock(rdev, r1_bio->sector, max_sectors,
				1410	&first_bad, &bad_sectors);
				1411	if (is_bad < 0) {
				1412	/* mustn't write here until the bad block is
				1413	* acknowledged*/
				1414	set_bit(BlockedBadBlocks, &rdev->flags);
				1415	blocked_rdev = rdev;
				1416	break;
				1417	}
				1418	if (is_bad && first_bad <= r1_bio->sector) {
				1419	/* Cannot write here at all */
				1420	bad_sectors -= (r1_bio->sector - first_bad);
				1421	if (bad_sectors < max_sectors)
				1422	/* mustn't write more than bad_sectors
				1423	* to other devices yet
				1424	*/
				1425	max_sectors = bad_sectors;
				1426	rdev_dec_pending(rdev, mddev);
				1427	/* We don't set R1BIO_Degraded as that
				1428	* only applies if the disk is
				1429	* missing, so it might be re-added,
				1430	* and we want to know to recover this
				1431	* chunk.
				1432	* In this case the device is here,
				1433	* and the fact that this chunk is not
				1434	* in-sync is recorded in the bad
				1435	* block log
				1436	*/
				1437	continue;
				1438	}
				1439	if (is_bad) {
				1440	int good_sectors = first_bad - r1_bio->sector;
				1441	if (good_sectors < max_sectors)
				1442	max_sectors = good_sectors;
				1443	}
				1444	}
				1445	r1_bio->bios[i] = bio;
				1446	}
				1447	rcu_read_unlock();
				1448
				1449	if (unlikely(blocked_rdev)) {
				1450	/* Wait for this device to become unblocked */
				1451	int j;
				1452
				1453	for (j = 0; j < i; j++)
				1454	if (r1_bio->bios[j])
				1455	rdev_dec_pending(conf->mirrors[j].rdev, mddev);
				1456	r1_bio->state = 0;
				1457	allow_barrier(conf, bio->bi_iter.bi_sector);
				1458	raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
				1459	md_wait_for_blocked_rdev(blocked_rdev, mddev);
				1460	wait_barrier(conf, bio->bi_iter.bi_sector);
				1461	goto retry_write;
				1462	}
				1463
				1464	if (max_sectors < bio_sectors(bio)) {
				1465	struct bio *split = bio_split(bio, max_sectors,
				1466	GFP_NOIO, &conf->bio_split);
				1467	bio_chain(split, bio);
				1468	generic_make_request(bio);
				1469	bio = split;
				1470	r1_bio->master_bio = bio;
				1471	r1_bio->sectors = max_sectors;
				1472	}
				1473
				1474	atomic_set(&r1_bio->remaining, 1);
				1475	atomic_set(&r1_bio->behind_remaining, 0);
				1476
				1477	first_clone = 1;
				1478
				1479	for (i = 0; i < disks; i++) {
				1480	struct bio *mbio = NULL;
				1481	if (!r1_bio->bios[i])
				1482	continue;
				1483
				1484	if (first_clone) {
				1485	/* do behind I/O ?
				1486	* Not if there are too many, or cannot
				1487	* allocate memory, or a reader on WriteMostly
				1488	* is waiting for behind writes to flush */
				1489	if (bitmap &&
				1490	(atomic_read(&bitmap->behind_writes)
				1491	< mddev->bitmap_info.max_write_behind) &&
				1492	!waitqueue_active(&bitmap->behind_wait)) {
				1493	alloc_behind_master_bio(r1_bio, bio);
				1494	}
				1495
				1496	md_bitmap_startwrite(bitmap, r1_bio->sector, r1_bio->sectors,
				1497	test_bit(R1BIO_BehindIO, &r1_bio->state));
				1498	first_clone = 0;
				1499	}
				1500
				1501	if (r1_bio->behind_master_bio)
				1502	mbio = bio_clone_fast(r1_bio->behind_master_bio,
				1503	GFP_NOIO, &mddev->bio_set);
				1504	else
				1505	mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
				1506
				1507	if (r1_bio->behind_master_bio) {
				1508	struct md_rdev *rdev = conf->mirrors[i].rdev;
				1509
				1510	if (test_bit(WBCollisionCheck, &rdev->flags)) {
				1511	sector_t lo = r1_bio->sector;
				1512	sector_t hi = r1_bio->sector + r1_bio->sectors;
				1513
				1514	wait_event(rdev->wb_io_wait,
				1515	check_and_add_wb(rdev, lo, hi) == 0);
				1516	}
				1517	if (test_bit(WriteMostly, &rdev->flags))
				1518	atomic_inc(&r1_bio->behind_remaining);
				1519	}
				1520
				1521	r1_bio->bios[i] = mbio;
				1522
				1523	mbio->bi_iter.bi_sector = (r1_bio->sector +
				1524	conf->mirrors[i].rdev->data_offset);
				1525	bio_set_dev(mbio, conf->mirrors[i].rdev->bdev);
				1526	mbio->bi_end_io = raid1_end_write_request;
				1527	mbio->bi_opf = bio_op(bio) \| (bio->bi_opf & (REQ_SYNC \| REQ_FUA));
				1528	if (test_bit(FailFast, &conf->mirrors[i].rdev->flags) &&
				1529	!test_bit(WriteMostly, &conf->mirrors[i].rdev->flags) &&
				1530	conf->raid_disks - mddev->degraded > 1)
				1531	mbio->bi_opf \|= MD_FAILFAST;
				1532	mbio->bi_private = r1_bio;
				1533
				1534	atomic_inc(&r1_bio->remaining);
				1535
				1536	if (mddev->gendisk)
				1537	trace_block_bio_remap(mbio->bi_disk->queue,
				1538	mbio, disk_devt(mddev->gendisk),
				1539	r1_bio->sector);
				1540	/* flush_pending_writes() needs access to the rdev so...*/
				1541	mbio->bi_disk = (void *)conf->mirrors[i].rdev;
				1542
				1543	cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug));
				1544	if (cb)
				1545	plug = container_of(cb, struct raid1_plug_cb, cb);
				1546	else
				1547	plug = NULL;
				1548	if (plug) {
				1549	bio_list_add(&plug->pending, mbio);
				1550	plug->pending_cnt++;
				1551	} else {
				1552	spin_lock_irqsave(&conf->device_lock, flags);
				1553	bio_list_add(&conf->pending_bio_list, mbio);
				1554	conf->pending_count++;
				1555	spin_unlock_irqrestore(&conf->device_lock, flags);
				1556	md_wakeup_thread(mddev->thread);
				1557	}
				1558	}
				1559
				1560	r1_bio_write_done(r1_bio);
				1561
				1562	/* In case raid1d snuck in to freeze_array */
				1563	wake_up(&conf->wait_barrier);
				1564	}
				1565
				1566	static bool raid1_make_request(struct mddev mddev, struct bio bio)
				1567	{
				1568	sector_t sectors;
				1569
				1570	if (unlikely(bio->bi_opf & REQ_PREFLUSH)
				1571	&& md_flush_request(mddev, bio))
				1572	return true;
				1573
				1574	/*
				1575	* There is a limit to the maximum size, but
				1576	* the read/write handler might find a lower limit
				1577	* due to bad blocks. To avoid multiple splits,
				1578	* we pass the maximum number of sectors down
				1579	* and let the lower level perform the split.
				1580	*/
				1581	sectors = align_to_barrier_unit_end(
				1582	bio->bi_iter.bi_sector, bio_sectors(bio));
				1583
				1584	if (bio_data_dir(bio) == READ)
				1585	raid1_read_request(mddev, bio, sectors, NULL);
				1586	else {
				1587	if (!md_write_start(mddev,bio))
				1588	return false;
				1589	raid1_write_request(mddev, bio, sectors);
				1590	}
				1591	return true;
				1592	}
				1593
				1594	static void raid1_status(struct seq_file seq, struct mddev mddev)
				1595	{
				1596	struct r1conf *conf = mddev->private;
				1597	int i;
				1598
				1599	seq_printf(seq, " [%d/%d] [", conf->raid_disks,
				1600	conf->raid_disks - mddev->degraded);
				1601	rcu_read_lock();
				1602	for (i = 0; i < conf->raid_disks; i++) {
				1603	struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
				1604	seq_printf(seq, "%s",
				1605	rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
				1606	}
				1607	rcu_read_unlock();
				1608	seq_printf(seq, "]");
				1609	}
				1610
				1611	static void raid1_error(struct mddev mddev, struct md_rdev rdev)
				1612	{
				1613	char b[BDEVNAME_SIZE];
				1614	struct r1conf *conf = mddev->private;
				1615	unsigned long flags;
				1616
				1617	/*
				1618	* If it is not operational, then we have already marked it as dead
				1619	* else if it is the last working disks with "fail_last_dev == false",
				1620	* ignore the error, let the next level up know.
				1621	* else mark the drive as failed
				1622	*/
				1623	spin_lock_irqsave(&conf->device_lock, flags);
				1624	if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev
				1625	&& (conf->raid_disks - mddev->degraded) == 1) {
				1626	/*
				1627	* Don't fail the drive, act as though we were just a
				1628	* normal single drive.
				1629	* However don't try a recovery from this drive as
				1630	* it is very likely to fail.
				1631	*/
				1632	conf->recovery_disabled = mddev->recovery_disabled;
				1633	spin_unlock_irqrestore(&conf->device_lock, flags);
				1634	return;
				1635	}
				1636	set_bit(Blocked, &rdev->flags);
				1637	if (test_and_clear_bit(In_sync, &rdev->flags))
				1638	mddev->degraded++;
				1639	set_bit(Faulty, &rdev->flags);
				1640	spin_unlock_irqrestore(&conf->device_lock, flags);
				1641	/*
				1642	* if recovery is running, make sure it aborts.
				1643	*/
				1644	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
				1645	set_mask_bits(&mddev->sb_flags, 0,
				1646	BIT(MD_SB_CHANGE_DEVS) \| BIT(MD_SB_CHANGE_PENDING));
				1647	pr_crit("md/raid1:%s: Disk failure on %s, disabling device.\n"
				1648	"md/raid1:%s: Operation continuing on %d devices.\n",
				1649	mdname(mddev), bdevname(rdev->bdev, b),
				1650	mdname(mddev), conf->raid_disks - mddev->degraded);
				1651	}
				1652
				1653	static void print_conf(struct r1conf *conf)
				1654	{
				1655	int i;
				1656
				1657	pr_debug("RAID1 conf printout:\n");
				1658	if (!conf) {
				1659	pr_debug("(!conf)\n");
				1660	return;
				1661	}
				1662	pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
				1663	conf->raid_disks);
				1664
				1665	rcu_read_lock();
				1666	for (i = 0; i < conf->raid_disks; i++) {
				1667	char b[BDEVNAME_SIZE];
				1668	struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
				1669	if (rdev)
				1670	pr_debug(" disk %d, wo:%d, o:%d, dev:%s\n",
				1671	i, !test_bit(In_sync, &rdev->flags),
				1672	!test_bit(Faulty, &rdev->flags),
				1673	bdevname(rdev->bdev,b));
				1674	}
				1675	rcu_read_unlock();
				1676	}
				1677
				1678	static void close_sync(struct r1conf *conf)
				1679	{
				1680	int idx;
				1681
				1682	for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) {
				1683	_wait_barrier(conf, idx);
				1684	_allow_barrier(conf, idx);
				1685	}
				1686
				1687	mempool_exit(&conf->r1buf_pool);
				1688	}
				1689
				1690	static int raid1_spare_active(struct mddev *mddev)
				1691	{
				1692	int i;
				1693	struct r1conf *conf = mddev->private;
				1694	int count = 0;
				1695	unsigned long flags;
				1696
				1697	/*
				1698	* Find all failed disks within the RAID1 configuration
				1699	* and mark them readable.
				1700	* Called under mddev lock, so rcu protection not needed.
				1701	* device_lock used to avoid races with raid1_end_read_request
				1702	* which expects 'In_sync' flags and ->degraded to be consistent.
				1703	*/
				1704	spin_lock_irqsave(&conf->device_lock, flags);
				1705	for (i = 0; i < conf->raid_disks; i++) {
				1706	struct md_rdev *rdev = conf->mirrors[i].rdev;
				1707	struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev;
				1708	if (repl
				1709	&& !test_bit(Candidate, &repl->flags)
				1710	&& repl->recovery_offset == MaxSector
				1711	&& !test_bit(Faulty, &repl->flags)
				1712	&& !test_and_set_bit(In_sync, &repl->flags)) {
				1713	/* replacement has just become active */
				1714	if (!rdev \|\|
				1715	!test_and_clear_bit(In_sync, &rdev->flags))
				1716	count++;
				1717	if (rdev) {
				1718	/* Replaced device not technically
				1719	* faulty, but we need to be sure
				1720	* it gets removed and never re-added
				1721	*/
				1722	set_bit(Faulty, &rdev->flags);
				1723	sysfs_notify_dirent_safe(
				1724	rdev->sysfs_state);
				1725	}
				1726	}
				1727	if (rdev
				1728	&& rdev->recovery_offset == MaxSector
				1729	&& !test_bit(Faulty, &rdev->flags)
				1730	&& !test_and_set_bit(In_sync, &rdev->flags)) {
				1731	count++;
				1732	sysfs_notify_dirent_safe(rdev->sysfs_state);
				1733	}
				1734	}
				1735	mddev->degraded -= count;
				1736	spin_unlock_irqrestore(&conf->device_lock, flags);
				1737
				1738	print_conf(conf);
				1739	return count;
				1740	}
				1741
				1742	static int raid1_add_disk(struct mddev mddev, struct md_rdev rdev)
				1743	{
				1744	struct r1conf *conf = mddev->private;
				1745	int err = -EEXIST;
				1746	int mirror = 0;
				1747	struct raid1_info *p;
				1748	int first = 0;
				1749	int last = conf->raid_disks - 1;
				1750
				1751	if (mddev->recovery_disabled == conf->recovery_disabled)
				1752	return -EBUSY;
				1753
				1754	if (md_integrity_add_rdev(rdev, mddev))
				1755	return -ENXIO;
				1756
				1757	if (rdev->raid_disk >= 0)
				1758	first = last = rdev->raid_disk;
				1759
				1760	/*
				1761	* find the disk ... but prefer rdev->saved_raid_disk
				1762	* if possible.
				1763	*/
				1764	if (rdev->saved_raid_disk >= 0 &&
				1765	rdev->saved_raid_disk >= first &&
				1766	rdev->saved_raid_disk < conf->raid_disks &&
				1767	conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
				1768	first = last = rdev->saved_raid_disk;
				1769
				1770	for (mirror = first; mirror <= last; mirror++) {
				1771	p = conf->mirrors + mirror;
				1772	if (!p->rdev) {
				1773	if (mddev->gendisk)
				1774	disk_stack_limits(mddev->gendisk, rdev->bdev,
				1775	rdev->data_offset << 9);
				1776
				1777	p->head_position = 0;
				1778	rdev->raid_disk = mirror;
				1779	err = 0;
				1780	/* As all devices are equivalent, we don't need a full recovery
				1781	* if this was recently any drive of the array
				1782	*/
				1783	if (rdev->saved_raid_disk < 0)
				1784	conf->fullsync = 1;
				1785	rcu_assign_pointer(p->rdev, rdev);
				1786	break;
				1787	}
				1788	if (test_bit(WantReplacement, &p->rdev->flags) &&
				1789	p[conf->raid_disks].rdev == NULL) {
				1790	/* Add this device as a replacement */
				1791	clear_bit(In_sync, &rdev->flags);
				1792	set_bit(Replacement, &rdev->flags);
				1793	rdev->raid_disk = mirror;
				1794	err = 0;
				1795	conf->fullsync = 1;
				1796	rcu_assign_pointer(p[conf->raid_disks].rdev, rdev);
				1797	break;
				1798	}
				1799	}
				1800	if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
				1801	blk_queue_flag_set(QUEUE_FLAG_DISCARD, mddev->queue);
				1802	print_conf(conf);
				1803	return err;
				1804	}
				1805
				1806	static int raid1_remove_disk(struct mddev mddev, struct md_rdev rdev)
				1807	{
				1808	struct r1conf *conf = mddev->private;
				1809	int err = 0;
				1810	int number = rdev->raid_disk;
				1811	struct raid1_info *p = conf->mirrors + number;
				1812
				1813	if (unlikely(number >= conf->raid_disks))
				1814	goto abort;
				1815
				1816	if (rdev != p->rdev)
				1817	p = conf->mirrors + conf->raid_disks + number;
				1818
				1819	print_conf(conf);
				1820	if (rdev == p->rdev) {
				1821	if (test_bit(In_sync, &rdev->flags) \|\|
				1822	atomic_read(&rdev->nr_pending)) {
				1823	err = -EBUSY;
				1824	goto abort;
				1825	}
				1826	/* Only remove non-faulty devices if recovery
				1827	* is not possible.
				1828	*/
				1829	if (!test_bit(Faulty, &rdev->flags) &&
				1830	mddev->recovery_disabled != conf->recovery_disabled &&
				1831	mddev->degraded < conf->raid_disks) {
				1832	err = -EBUSY;
				1833	goto abort;
				1834	}
				1835	p->rdev = NULL;
				1836	if (!test_bit(RemoveSynchronized, &rdev->flags)) {
				1837	synchronize_rcu();
				1838	if (atomic_read(&rdev->nr_pending)) {
				1839	/* lost the race, try later */
				1840	err = -EBUSY;
				1841	p->rdev = rdev;
				1842	goto abort;
				1843	}
				1844	}
				1845	if (conf->mirrors[conf->raid_disks + number].rdev) {
				1846	/* We just removed a device that is being replaced.
				1847	* Move down the replacement. We drain all IO before
				1848	* doing this to avoid confusion.
				1849	*/
				1850	struct md_rdev *repl =
				1851	conf->mirrors[conf->raid_disks + number].rdev;
				1852	freeze_array(conf, 0);
				1853	if (atomic_read(&repl->nr_pending)) {
				1854	/* It means that some queued IO of retry_list
				1855	* hold repl. Thus, we cannot set replacement
				1856	* as NULL, avoiding rdev NULL pointer
				1857	* dereference in sync_request_write and
				1858	* handle_write_finished.
				1859	*/
				1860	err = -EBUSY;
				1861	unfreeze_array(conf);
				1862	goto abort;
				1863	}
				1864	clear_bit(Replacement, &repl->flags);
				1865	p->rdev = repl;
				1866	conf->mirrors[conf->raid_disks + number].rdev = NULL;
				1867	unfreeze_array(conf);
				1868	}
				1869
				1870	clear_bit(WantReplacement, &rdev->flags);
				1871	err = md_integrity_register(mddev);
				1872	}
				1873	abort:
				1874
				1875	print_conf(conf);
				1876	return err;
				1877	}
				1878
				1879	static void end_sync_read(struct bio *bio)
				1880	{
				1881	struct r1bio *r1_bio = get_resync_r1bio(bio);
				1882
				1883	update_head_pos(r1_bio->read_disk, r1_bio);
				1884
				1885	/*
				1886	* we have read a block, now it needs to be re-written,
				1887	* or re-read if the read failed.
				1888	* We don't do much here, just schedule handling by raid1d
				1889	*/
				1890	if (!bio->bi_status)
				1891	set_bit(R1BIO_Uptodate, &r1_bio->state);
				1892
				1893	if (atomic_dec_and_test(&r1_bio->remaining))
				1894	reschedule_retry(r1_bio);
				1895	}
				1896
				1897	static void abort_sync_write(struct mddev mddev, struct r1bio r1_bio)
				1898	{
				1899	sector_t sync_blocks = 0;
				1900	sector_t s = r1_bio->sector;
				1901	long sectors_to_go = r1_bio->sectors;
				1902
				1903	/* make sure these bits don't get cleared. */
				1904	do {
				1905	md_bitmap_end_sync(mddev->bitmap, s, &sync_blocks, 1);
				1906	s += sync_blocks;
				1907	sectors_to_go -= sync_blocks;
				1908	} while (sectors_to_go > 0);
				1909	}
				1910
				1911	static void put_sync_write_buf(struct r1bio *r1_bio, int uptodate)
				1912	{
				1913	if (atomic_dec_and_test(&r1_bio->remaining)) {
				1914	struct mddev *mddev = r1_bio->mddev;
				1915	int s = r1_bio->sectors;
				1916
				1917	if (test_bit(R1BIO_MadeGood, &r1_bio->state) \|\|
				1918	test_bit(R1BIO_WriteError, &r1_bio->state))
				1919	reschedule_retry(r1_bio);
				1920	else {
				1921	put_buf(r1_bio);
				1922	md_done_sync(mddev, s, uptodate);
				1923	}
				1924	}
				1925	}
				1926
				1927	static void end_sync_write(struct bio *bio)
				1928	{
				1929	int uptodate = !bio->bi_status;
				1930	struct r1bio *r1_bio = get_resync_r1bio(bio);
				1931	struct mddev *mddev = r1_bio->mddev;
				1932	struct r1conf *conf = mddev->private;
				1933	sector_t first_bad;
				1934	int bad_sectors;
				1935	struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev;
				1936
				1937	if (!uptodate) {
				1938	abort_sync_write(mddev, r1_bio);
				1939	set_bit(WriteErrorSeen, &rdev->flags);
				1940	if (!test_and_set_bit(WantReplacement, &rdev->flags))
				1941	set_bit(MD_RECOVERY_NEEDED, &
				1942	mddev->recovery);
				1943	set_bit(R1BIO_WriteError, &r1_bio->state);
				1944	} else if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors,
				1945	&first_bad, &bad_sectors) &&
				1946	!is_badblock(conf->mirrors[r1_bio->read_disk].rdev,
				1947	r1_bio->sector,
				1948	r1_bio->sectors,
				1949	&first_bad, &bad_sectors)
				1950	)
				1951	set_bit(R1BIO_MadeGood, &r1_bio->state);
				1952
				1953	put_sync_write_buf(r1_bio, uptodate);
				1954	}
				1955
				1956	static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector,
				1957	int sectors, struct page *page, int rw)
				1958	{
				1959	if (sync_page_io(rdev, sector, sectors << 9, page, rw, 0, false))
				1960	/* success */
				1961	return 1;
				1962	if (rw == WRITE) {
				1963	set_bit(WriteErrorSeen, &rdev->flags);
				1964	if (!test_and_set_bit(WantReplacement,
				1965	&rdev->flags))
				1966	set_bit(MD_RECOVERY_NEEDED, &
				1967	rdev->mddev->recovery);
				1968	}
				1969	/* need to record an error - either for the block or the device */
				1970	if (!rdev_set_badblocks(rdev, sector, sectors, 0))
				1971	md_error(rdev->mddev, rdev);
				1972	return 0;
				1973	}
				1974
				1975	static int fix_sync_read_error(struct r1bio *r1_bio)
				1976	{
				1977	/* Try some synchronous reads of other devices to get
				1978	* good data, much like with normal read errors. Only
				1979	* read into the pages we already have so we don't
				1980	* need to re-issue the read request.
				1981	* We don't need to freeze the array, because being in an
				1982	* active sync request, there is no normal IO, and
				1983	* no overlapping syncs.
				1984	* We don't need to check is_badblock() again as we
				1985	* made sure that anything with a bad block in range
				1986	* will have bi_end_io clear.
				1987	*/
				1988	struct mddev *mddev = r1_bio->mddev;
				1989	struct r1conf *conf = mddev->private;
				1990	struct bio *bio = r1_bio->bios[r1_bio->read_disk];
				1991	struct page **pages = get_resync_pages(bio)->pages;
				1992	sector_t sect = r1_bio->sector;
				1993	int sectors = r1_bio->sectors;
				1994	int idx = 0;
				1995	struct md_rdev *rdev;
				1996
				1997	rdev = conf->mirrors[r1_bio->read_disk].rdev;
				1998	if (test_bit(FailFast, &rdev->flags)) {
				1999	/* Don't try recovering from here - just fail it
				2000	* ... unless it is the last working device of course */
				2001	md_error(mddev, rdev);
				2002	if (test_bit(Faulty, &rdev->flags))
				2003	/* Don't try to read from here, but make sure
				2004	* put_buf does it's thing
				2005	*/
				2006	bio->bi_end_io = end_sync_write;
				2007	}
				2008
				2009	while(sectors) {
				2010	int s = sectors;
				2011	int d = r1_bio->read_disk;
				2012	int success = 0;
				2013	int start;
				2014
				2015	if (s > (PAGE_SIZE>>9))
				2016	s = PAGE_SIZE >> 9;
				2017	do {
				2018	if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
				2019	/* No rcu protection needed here devices
				2020	* can only be removed when no resync is
				2021	* active, and resync is currently active
				2022	*/
				2023	rdev = conf->mirrors[d].rdev;
				2024	if (sync_page_io(rdev, sect, s<<9,
				2025	pages[idx],
				2026	REQ_OP_READ, 0, false)) {
				2027	success = 1;
				2028	break;
				2029	}
				2030	}
				2031	d++;
				2032	if (d == conf->raid_disks * 2)
				2033	d = 0;
				2034	} while (!success && d != r1_bio->read_disk);
				2035
				2036	if (!success) {
				2037	char b[BDEVNAME_SIZE];
				2038	int abort = 0;
				2039	/* Cannot read from anywhere, this block is lost.
				2040	* Record a bad block on each device. If that doesn't
				2041	* work just disable and interrupt the recovery.
				2042	* Don't fail devices as that won't really help.
				2043	*/
				2044	pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n",
				2045	mdname(mddev), bio_devname(bio, b),
				2046	(unsigned long long)r1_bio->sector);
				2047	for (d = 0; d < conf->raid_disks * 2; d++) {
				2048	rdev = conf->mirrors[d].rdev;
				2049	if (!rdev \|\| test_bit(Faulty, &rdev->flags))
				2050	continue;
				2051	if (!rdev_set_badblocks(rdev, sect, s, 0))
				2052	abort = 1;
				2053	}
				2054	if (abort) {
				2055	conf->recovery_disabled =
				2056	mddev->recovery_disabled;
				2057	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
				2058	md_done_sync(mddev, r1_bio->sectors, 0);
				2059	put_buf(r1_bio);
				2060	return 0;
				2061	}
				2062	/* Try next page */
				2063	sectors -= s;
				2064	sect += s;
				2065	idx++;
				2066	continue;
				2067	}
				2068
				2069	start = d;
				2070	/* write it back and re-read */
				2071	while (d != r1_bio->read_disk) {
				2072	if (d == 0)
				2073	d = conf->raid_disks * 2;
				2074	d--;
				2075	if (r1_bio->bios[d]->bi_end_io != end_sync_read)
				2076	continue;
				2077	rdev = conf->mirrors[d].rdev;
				2078	if (r1_sync_page_io(rdev, sect, s,
				2079	pages[idx],
				2080	WRITE) == 0) {
				2081	r1_bio->bios[d]->bi_end_io = NULL;
				2082	rdev_dec_pending(rdev, mddev);
				2083	}
				2084	}
				2085	d = start;
				2086	while (d != r1_bio->read_disk) {
				2087	if (d == 0)
				2088	d = conf->raid_disks * 2;
				2089	d--;
				2090	if (r1_bio->bios[d]->bi_end_io != end_sync_read)
				2091	continue;
				2092	rdev = conf->mirrors[d].rdev;
				2093	if (r1_sync_page_io(rdev, sect, s,
				2094	pages[idx],
				2095	READ) != 0)
				2096	atomic_add(s, &rdev->corrected_errors);
				2097	}
				2098	sectors -= s;
				2099	sect += s;
				2100	idx ++;
				2101	}
				2102	set_bit(R1BIO_Uptodate, &r1_bio->state);
				2103	bio->bi_status = 0;
				2104	return 1;
				2105	}
				2106
				2107	static void process_checks(struct r1bio *r1_bio)
				2108	{
				2109	/* We have read all readable devices. If we haven't
				2110	* got the block, then there is no hope left.
				2111	* If we have, then we want to do a comparison
				2112	* and skip the write if everything is the same.
				2113	* If any blocks failed to read, then we need to
				2114	* attempt an over-write
				2115	*/
				2116	struct mddev *mddev = r1_bio->mddev;
				2117	struct r1conf *conf = mddev->private;
				2118	int primary;
				2119	int i;
				2120	int vcnt;
				2121
				2122	/* Fix variable parts of all bios */
				2123	vcnt = (r1_bio->sectors + PAGE_SIZE / 512 - 1) >> (PAGE_SHIFT - 9);
				2124	for (i = 0; i < conf->raid_disks * 2; i++) {
				2125	blk_status_t status;
				2126	struct bio *b = r1_bio->bios[i];
				2127	struct resync_pages *rp = get_resync_pages(b);
				2128	if (b->bi_end_io != end_sync_read)
				2129	continue;
				2130	/* fixup the bio for reuse, but preserve errno */
				2131	status = b->bi_status;
				2132	bio_reset(b);
				2133	b->bi_status = status;
				2134	b->bi_iter.bi_sector = r1_bio->sector +
				2135	conf->mirrors[i].rdev->data_offset;
				2136	bio_set_dev(b, conf->mirrors[i].rdev->bdev);
				2137	b->bi_end_io = end_sync_read;
				2138	rp->raid_bio = r1_bio;
				2139	b->bi_private = rp;
				2140
				2141	/* initialize bvec table again */
				2142	md_bio_reset_resync_pages(b, rp, r1_bio->sectors << 9);
				2143	}
				2144	for (primary = 0; primary < conf->raid_disks * 2; primary++)
				2145	if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
				2146	!r1_bio->bios[primary]->bi_status) {
				2147	r1_bio->bios[primary]->bi_end_io = NULL;
				2148	rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
				2149	break;
				2150	}
				2151	r1_bio->read_disk = primary;
				2152	for (i = 0; i < conf->raid_disks * 2; i++) {
				2153	int j = 0;
				2154	struct bio *pbio = r1_bio->bios[primary];
				2155	struct bio *sbio = r1_bio->bios[i];
				2156	blk_status_t status = sbio->bi_status;
				2157	struct page **ppages = get_resync_pages(pbio)->pages;
				2158	struct page **spages = get_resync_pages(sbio)->pages;
				2159	struct bio_vec *bi;
				2160	int page_len[RESYNC_PAGES] = { 0 };
				2161	struct bvec_iter_all iter_all;
				2162
				2163	if (sbio->bi_end_io != end_sync_read)
				2164	continue;
				2165	/* Now we can 'fixup' the error value */
				2166	sbio->bi_status = 0;
				2167
				2168	bio_for_each_segment_all(bi, sbio, iter_all)
				2169	page_len[j++] = bi->bv_len;
				2170
				2171	if (!status) {
				2172	for (j = vcnt; j-- ; ) {
				2173	if (memcmp(page_address(ppages[j]),
				2174	page_address(spages[j]),
				2175	page_len[j]))
				2176	break;
				2177	}
				2178	} else
				2179	j = 0;
				2180	if (j >= 0)
				2181	atomic64_add(r1_bio->sectors, &mddev->resync_mismatches);
				2182	if (j < 0 \|\| (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
				2183	&& !status)) {
				2184	/* No need to write to this device. */
				2185	sbio->bi_end_io = NULL;
				2186	rdev_dec_pending(conf->mirrors[i].rdev, mddev);
				2187	continue;
				2188	}
				2189
				2190	bio_copy_data(sbio, pbio);
				2191	}
				2192	}
				2193
				2194	static void sync_request_write(struct mddev mddev, struct r1bio r1_bio)
				2195	{
				2196	struct r1conf *conf = mddev->private;
				2197	int i;
				2198	int disks = conf->raid_disks * 2;
				2199	struct bio *wbio;
				2200
				2201	if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
				2202	/* ouch - failed to read all of that. */
				2203	if (!fix_sync_read_error(r1_bio))
				2204	return;
				2205
				2206	if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
				2207	process_checks(r1_bio);
				2208
				2209	/*
				2210	* schedule writes
				2211	*/
				2212	atomic_set(&r1_bio->remaining, 1);
				2213	for (i = 0; i < disks ; i++) {
				2214	wbio = r1_bio->bios[i];
				2215	if (wbio->bi_end_io == NULL \|\|
				2216	(wbio->bi_end_io == end_sync_read &&
				2217	(i == r1_bio->read_disk \|\|
				2218	!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
				2219	continue;
				2220	if (test_bit(Faulty, &conf->mirrors[i].rdev->flags)) {
				2221	abort_sync_write(mddev, r1_bio);
				2222	continue;
				2223	}
				2224
				2225	bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
				2226	if (test_bit(FailFast, &conf->mirrors[i].rdev->flags))
				2227	wbio->bi_opf \|= MD_FAILFAST;
				2228
				2229	wbio->bi_end_io = end_sync_write;
				2230	atomic_inc(&r1_bio->remaining);
				2231	md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio));
				2232
				2233	generic_make_request(wbio);
				2234	}
				2235
				2236	put_sync_write_buf(r1_bio, 1);
				2237	}
				2238
/*
 * This is a kernel thread which:
 *
 *	1. Retries failed read operations on working mirrors.
 *	2. Updates the raid superblock when problems are encountered.
 *	3. Performs writes following reads for array synchronising.
 */
				2246
				2247	static void fix_read_error(struct r1conf *conf, int read_disk,
				2248	sector_t sect, int sectors)
				2249	{
				2250	struct mddev *mddev = conf->mddev;
				2251	while(sectors) {
				2252	int s = sectors;
				2253	int d = read_disk;
				2254	int success = 0;
				2255	int start;
				2256	struct md_rdev *rdev;
				2257
				2258	if (s > (PAGE_SIZE>>9))
				2259	s = PAGE_SIZE >> 9;
				2260
				2261	do {
				2262	sector_t first_bad;
				2263	int bad_sectors;
				2264
				2265	rcu_read_lock();
				2266	rdev = rcu_dereference(conf->mirrors[d].rdev);
				2267	if (rdev &&
				2268	(test_bit(In_sync, &rdev->flags) \|\|
				2269	(!test_bit(Faulty, &rdev->flags) &&
				2270	rdev->recovery_offset >= sect + s)) &&
				2271	is_badblock(rdev, sect, s,
				2272	&first_bad, &bad_sectors) == 0) {
				2273	atomic_inc(&rdev->nr_pending);
				2274	rcu_read_unlock();
				2275	if (sync_page_io(rdev, sect, s<<9,
				2276	conf->tmppage, REQ_OP_READ, 0, false))
				2277	success = 1;
				2278	rdev_dec_pending(rdev, mddev);
				2279	if (success)
				2280	break;
				2281	} else
				2282	rcu_read_unlock();
				2283	d++;
				2284	if (d == conf->raid_disks * 2)
				2285	d = 0;
				2286	} while (!success && d != read_disk);
				2287
				2288	if (!success) {
				2289	/* Cannot read from anywhere - mark it bad */
				2290	struct md_rdev *rdev = conf->mirrors[read_disk].rdev;
				2291	if (!rdev_set_badblocks(rdev, sect, s, 0))
				2292	md_error(mddev, rdev);
				2293	break;
				2294	}
				2295	/* write it back and re-read */
				2296	start = d;
				2297	while (d != read_disk) {
				2298	if (d==0)
				2299	d = conf->raid_disks * 2;
				2300	d--;
				2301	rcu_read_lock();
				2302	rdev = rcu_dereference(conf->mirrors[d].rdev);
				2303	if (rdev &&
				2304	!test_bit(Faulty, &rdev->flags)) {
				2305	atomic_inc(&rdev->nr_pending);
				2306	rcu_read_unlock();
				2307	r1_sync_page_io(rdev, sect, s,
				2308	conf->tmppage, WRITE);
				2309	rdev_dec_pending(rdev, mddev);
				2310	} else
				2311	rcu_read_unlock();
				2312	}
				2313	d = start;
				2314	while (d != read_disk) {
				2315	char b[BDEVNAME_SIZE];
				2316	if (d==0)
				2317	d = conf->raid_disks * 2;
				2318	d--;
				2319	rcu_read_lock();
				2320	rdev = rcu_dereference(conf->mirrors[d].rdev);
				2321	if (rdev &&
				2322	!test_bit(Faulty, &rdev->flags)) {
				2323	atomic_inc(&rdev->nr_pending);
				2324	rcu_read_unlock();
				2325	if (r1_sync_page_io(rdev, sect, s,
				2326	conf->tmppage, READ)) {
				2327	atomic_add(s, &rdev->corrected_errors);
				2328	pr_info("md/raid1:%s: read error corrected (%d sectors at %llu on %s)\n",
				2329	mdname(mddev), s,
				2330	(unsigned long long)(sect +
				2331	rdev->data_offset),
				2332	bdevname(rdev->bdev, b));
				2333	}
				2334	rdev_dec_pending(rdev, mddev);
				2335	} else
				2336	rcu_read_unlock();
				2337	}
				2338	sectors -= s;
				2339	sect += s;
				2340	}
				2341	}
				2342
				2343	static int narrow_write_error(struct r1bio *r1_bio, int i)
				2344	{
				2345	struct mddev *mddev = r1_bio->mddev;
				2346	struct r1conf *conf = mddev->private;
				2347	struct md_rdev *rdev = conf->mirrors[i].rdev;
				2348
				2349	/* bio has the data to be written to device 'i' where
				2350	* we just recently had a write error.
				2351	* We repeatedly clone the bio and trim down to one block,
				2352	* then try the write. Where the write fails we record
				2353	* a bad block.
				2354	* It is conceivable that the bio doesn't exactly align with
				2355	* blocks. We must handle this somehow.
				2356	*
				2357	* We currently own a reference on the rdev.
				2358	*/
				2359
				2360	int block_sectors;
				2361	sector_t sector;
				2362	int sectors;
				2363	int sect_to_write = r1_bio->sectors;
				2364	int ok = 1;
				2365
				2366	if (rdev->badblocks.shift < 0)
				2367	return 0;
				2368
				2369	block_sectors = roundup(1 << rdev->badblocks.shift,
				2370	bdev_logical_block_size(rdev->bdev) >> 9);
				2371	sector = r1_bio->sector;
				2372	sectors = ((sector + block_sectors)
				2373	& ~(sector_t)(block_sectors - 1))
				2374	- sector;
				2375
				2376	while (sect_to_write) {
				2377	struct bio *wbio;
				2378	if (sectors > sect_to_write)
				2379	sectors = sect_to_write;
				2380	/* Write at 'sector' for 'sectors'*/
				2381
				2382	if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
				2383	wbio = bio_clone_fast(r1_bio->behind_master_bio,
				2384	GFP_NOIO,
				2385	&mddev->bio_set);
				2386	} else {
				2387	wbio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO,
				2388	&mddev->bio_set);
				2389	}
				2390
				2391	bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
				2392	wbio->bi_iter.bi_sector = r1_bio->sector;
				2393	wbio->bi_iter.bi_size = r1_bio->sectors << 9;
				2394
				2395	bio_trim(wbio, sector - r1_bio->sector, sectors);
				2396	wbio->bi_iter.bi_sector += rdev->data_offset;
				2397	bio_set_dev(wbio, rdev->bdev);
				2398
				2399	if (submit_bio_wait(wbio) < 0)
				2400	/* failure! */
				2401	ok = rdev_set_badblocks(rdev, sector,
				2402	sectors, 0)
				2403	&& ok;
				2404
				2405	bio_put(wbio);
				2406	sect_to_write -= sectors;
				2407	sector += sectors;
				2408	sectors = block_sectors;
				2409	}
				2410	return ok;
				2411	}
				2412
				2413	static void handle_sync_write_finished(struct r1conf conf, struct r1bio r1_bio)
				2414	{
				2415	int m;
				2416	int s = r1_bio->sectors;
				2417	for (m = 0; m < conf->raid_disks * 2 ; m++) {
				2418	struct md_rdev *rdev = conf->mirrors[m].rdev;
				2419	struct bio *bio = r1_bio->bios[m];
				2420	if (bio->bi_end_io == NULL)
				2421	continue;
				2422	if (!bio->bi_status &&
				2423	test_bit(R1BIO_MadeGood, &r1_bio->state)) {
				2424	rdev_clear_badblocks(rdev, r1_bio->sector, s, 0);
				2425	}
				2426	if (bio->bi_status &&
				2427	test_bit(R1BIO_WriteError, &r1_bio->state)) {
				2428	if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0))
				2429	md_error(conf->mddev, rdev);
				2430	}
				2431	}
				2432	put_buf(r1_bio);
				2433	md_done_sync(conf->mddev, s, 1);
				2434	}
				2435
				2436	static void handle_write_finished(struct r1conf conf, struct r1bio r1_bio)
				2437	{
				2438	int m, idx;
				2439	bool fail = false;
				2440
				2441	for (m = 0; m < conf->raid_disks * 2 ; m++)
				2442	if (r1_bio->bios[m] == IO_MADE_GOOD) {
				2443	struct md_rdev *rdev = conf->mirrors[m].rdev;
				2444	rdev_clear_badblocks(rdev,
				2445	r1_bio->sector,
				2446	r1_bio->sectors, 0);
				2447	rdev_dec_pending(rdev, conf->mddev);
				2448	} else if (r1_bio->bios[m] != NULL) {
				2449	/* This drive got a write error. We need to
				2450	* narrow down and record precise write
				2451	* errors.
				2452	*/
				2453	fail = true;
				2454	if (!narrow_write_error(r1_bio, m)) {
				2455	md_error(conf->mddev,
				2456	conf->mirrors[m].rdev);
				2457	/* an I/O failed, we can't clear the bitmap */
				2458	set_bit(R1BIO_Degraded, &r1_bio->state);
				2459	}
				2460	rdev_dec_pending(conf->mirrors[m].rdev,
				2461	conf->mddev);
				2462	}
				2463	if (fail) {
				2464	spin_lock_irq(&conf->device_lock);
				2465	list_add(&r1_bio->retry_list, &conf->bio_end_io_list);
				2466	idx = sector_to_idx(r1_bio->sector);
				2467	atomic_inc(&conf->nr_queued[idx]);
				2468	spin_unlock_irq(&conf->device_lock);
				2469	/*
				2470	* In case freeze_array() is waiting for condition
				2471	* get_unqueued_pending() == extra to be true.
				2472	*/
				2473	wake_up(&conf->wait_barrier);
				2474	md_wakeup_thread(conf->mddev->thread);
				2475	} else {
				2476	if (test_bit(R1BIO_WriteError, &r1_bio->state))
				2477	close_write(r1_bio);
				2478	raid_end_bio_io(r1_bio);
				2479	}
				2480	}
				2481
				2482	static void handle_read_error(struct r1conf conf, struct r1bio r1_bio)
				2483	{
				2484	struct mddev *mddev = conf->mddev;
				2485	struct bio *bio;
				2486	struct md_rdev *rdev;
				2487
				2488	clear_bit(R1BIO_ReadError, &r1_bio->state);
				2489	/* we got a read error. Maybe the drive is bad. Maybe just
				2490	* the block and we can fix it.
				2491	* We freeze all other IO, and try reading the block from
				2492	* other devices. When we find one, we re-write
				2493	* and check it that fixes the read error.
				2494	* This is all done synchronously while the array is
				2495	* frozen
				2496	*/
				2497
				2498	bio = r1_bio->bios[r1_bio->read_disk];
				2499	bio_put(bio);
				2500	r1_bio->bios[r1_bio->read_disk] = NULL;
				2501
				2502	rdev = conf->mirrors[r1_bio->read_disk].rdev;
				2503	if (mddev->ro == 0
				2504	&& !test_bit(FailFast, &rdev->flags)) {
				2505	freeze_array(conf, 1);
				2506	fix_read_error(conf, r1_bio->read_disk,
				2507	r1_bio->sector, r1_bio->sectors);
				2508	unfreeze_array(conf);
				2509	} else if (mddev->ro == 0 && test_bit(FailFast, &rdev->flags)) {
				2510	md_error(mddev, rdev);
				2511	} else {
				2512	r1_bio->bios[r1_bio->read_disk] = IO_BLOCKED;
				2513	}
				2514
				2515	rdev_dec_pending(rdev, conf->mddev);
				2516	allow_barrier(conf, r1_bio->sector);
				2517	bio = r1_bio->master_bio;
				2518
				2519	/* Reuse the old r1_bio so that the IO_BLOCKED settings are preserved */
				2520	r1_bio->state = 0;
				2521	raid1_read_request(mddev, bio, r1_bio->sectors, r1_bio);
				2522	}
				2523
				2524	static void raid1d(struct md_thread *thread)
				2525	{
				2526	struct mddev *mddev = thread->mddev;
				2527	struct r1bio *r1_bio;
				2528	unsigned long flags;
				2529	struct r1conf *conf = mddev->private;
				2530	struct list_head *head = &conf->retry_list;
				2531	struct blk_plug plug;
				2532	int idx;
				2533
				2534	md_check_recovery(mddev);
				2535
				2536	if (!list_empty_careful(&conf->bio_end_io_list) &&
				2537	!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
				2538	LIST_HEAD(tmp);
				2539	spin_lock_irqsave(&conf->device_lock, flags);
				2540	if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
				2541	list_splice_init(&conf->bio_end_io_list, &tmp);
				2542	spin_unlock_irqrestore(&conf->device_lock, flags);
				2543	while (!list_empty(&tmp)) {
				2544	r1_bio = list_first_entry(&tmp, struct r1bio,
				2545	retry_list);
				2546	list_del(&r1_bio->retry_list);
				2547	idx = sector_to_idx(r1_bio->sector);
				2548	atomic_dec(&conf->nr_queued[idx]);
				2549	if (mddev->degraded)
				2550	set_bit(R1BIO_Degraded, &r1_bio->state);
				2551	if (test_bit(R1BIO_WriteError, &r1_bio->state))
				2552	close_write(r1_bio);
				2553	raid_end_bio_io(r1_bio);
				2554	}
				2555	}
				2556
				2557	blk_start_plug(&plug);
				2558	for (;;) {
				2559
				2560	flush_pending_writes(conf);
				2561
				2562	spin_lock_irqsave(&conf->device_lock, flags);
				2563	if (list_empty(head)) {
				2564	spin_unlock_irqrestore(&conf->device_lock, flags);
				2565	break;
				2566	}
				2567	r1_bio = list_entry(head->prev, struct r1bio, retry_list);
				2568	list_del(head->prev);
				2569	idx = sector_to_idx(r1_bio->sector);
				2570	atomic_dec(&conf->nr_queued[idx]);
				2571	spin_unlock_irqrestore(&conf->device_lock, flags);
				2572
				2573	mddev = r1_bio->mddev;
				2574	conf = mddev->private;
				2575	if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
				2576	if (test_bit(R1BIO_MadeGood, &r1_bio->state) \|\|
				2577	test_bit(R1BIO_WriteError, &r1_bio->state))
				2578	handle_sync_write_finished(conf, r1_bio);
				2579	else
				2580	sync_request_write(mddev, r1_bio);
				2581	} else if (test_bit(R1BIO_MadeGood, &r1_bio->state) \|\|
				2582	test_bit(R1BIO_WriteError, &r1_bio->state))
				2583	handle_write_finished(conf, r1_bio);
				2584	else if (test_bit(R1BIO_ReadError, &r1_bio->state))
				2585	handle_read_error(conf, r1_bio);
				2586	else
				2587	WARN_ON_ONCE(1);
				2588
				2589	cond_resched();
				2590	if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
				2591	md_check_recovery(mddev);
				2592	}
				2593	blk_finish_plug(&plug);
				2594	}
				2595
				2596	static int init_resync(struct r1conf *conf)
				2597	{
				2598	int buffs;
				2599
				2600	buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
				2601	BUG_ON(mempool_initialized(&conf->r1buf_pool));
				2602
				2603	return mempool_init(&conf->r1buf_pool, buffs, r1buf_pool_alloc,
				2604	r1buf_pool_free, conf->poolinfo);
				2605	}
				2606
				2607	static struct r1bio raid1_alloc_init_r1buf(struct r1conf conf)
				2608	{
				2609	struct r1bio *r1bio = mempool_alloc(&conf->r1buf_pool, GFP_NOIO);
				2610	struct resync_pages *rps;
				2611	struct bio *bio;
				2612	int i;
				2613
				2614	for (i = conf->poolinfo->raid_disks; i--; ) {
				2615	bio = r1bio->bios[i];
				2616	rps = bio->bi_private;
				2617	bio_reset(bio);
				2618	bio->bi_private = rps;
				2619	}
				2620	r1bio->master_bio = NULL;
				2621	return r1bio;
				2622	}
				2623
				2624	/*
				2625	* perform a "sync" on one "block"
				2626	*
				2627	* We need to make sure that no normal I/O request - particularly write
				2628	* requests - conflict with active sync requests.
				2629	*
				2630	* This is achieved by tracking pending requests and a 'barrier' concept
				2631	* that can be installed to exclude normal IO requests.
				2632	*/
				2633
				2634	static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
				2635	int *skipped)
				2636	{
				2637	struct r1conf *conf = mddev->private;
				2638	struct r1bio *r1_bio;
				2639	struct bio *bio;
				2640	sector_t max_sector, nr_sectors;
				2641	int disk = -1;
				2642	int i;
				2643	int wonly = -1;
				2644	int write_targets = 0, read_targets = 0;
				2645	sector_t sync_blocks;
				2646	int still_degraded = 0;
				2647	int good_sectors = RESYNC_SECTORS;
				2648	int min_bad = 0; /* number of sectors that are bad in all devices */
				2649	int idx = sector_to_idx(sector_nr);
				2650	int page_idx = 0;
				2651
				2652	if (!mempool_initialized(&conf->r1buf_pool))
				2653	if (init_resync(conf))
				2654	return 0;
				2655
				2656	max_sector = mddev->dev_sectors;
				2657	if (sector_nr >= max_sector) {
				2658	/* If we aborted, we need to abort the
				2659	* sync on the 'current' bitmap chunk (there will
				2660	* only be one in raid1 resync.
				2661	* We can find the current addess in mddev->curr_resync
				2662	*/
				2663	if (mddev->curr_resync < max_sector) /* aborted */
				2664	md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
				2665	&sync_blocks, 1);
				2666	else /* completed sync */
				2667	conf->fullsync = 0;
				2668
				2669	md_bitmap_close_sync(mddev->bitmap);
				2670	close_sync(conf);
				2671
				2672	if (mddev_is_clustered(mddev)) {
				2673	conf->cluster_sync_low = 0;
				2674	conf->cluster_sync_high = 0;
				2675	}
				2676	return 0;
				2677	}
				2678
				2679	if (mddev->bitmap == NULL &&
				2680	mddev->recovery_cp == MaxSector &&
				2681	!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
				2682	conf->fullsync == 0) {
				2683	*skipped = 1;
				2684	return max_sector - sector_nr;
				2685	}
				2686	/* before building a request, check if we can skip these blocks..
				2687	* This call the bitmap_start_sync doesn't actually record anything
				2688	*/
				2689	if (!md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
				2690	!conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
				2691	/* We can skip this block, and probably several more */
				2692	*skipped = 1;
				2693	return sync_blocks;
				2694	}
				2695
				2696	/*
				2697	* If there is non-resync activity waiting for a turn, then let it
				2698	* though before starting on this new sync request.
				2699	*/
				2700	if (atomic_read(&conf->nr_waiting[idx]))
				2701	schedule_timeout_uninterruptible(1);
				2702
				2703	/* we are incrementing sector_nr below. To be safe, we check against
				2704	* sector_nr + two times RESYNC_SECTORS
				2705	*/
				2706
				2707	md_bitmap_cond_end_sync(mddev->bitmap, sector_nr,
				2708	mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
				2709
				2710
				2711	if (raise_barrier(conf, sector_nr))
				2712	return 0;
				2713
				2714	r1_bio = raid1_alloc_init_r1buf(conf);
				2715
				2716	rcu_read_lock();
				2717	/*
				2718	* If we get a correctably read error during resync or recovery,
				2719	* we might want to read from a different device. So we
				2720	* flag all drives that could conceivably be read from for READ,
				2721	* and any others (which will be non-In_sync devices) for WRITE.
				2722	* If a read fails, we try reading from something else for which READ
				2723	* is OK.
				2724	*/
				2725
				2726	r1_bio->mddev = mddev;
				2727	r1_bio->sector = sector_nr;
				2728	r1_bio->state = 0;
				2729	set_bit(R1BIO_IsSync, &r1_bio->state);
				2730	/* make sure good_sectors won't go across barrier unit boundary */
				2731	good_sectors = align_to_barrier_unit_end(sector_nr, good_sectors);
				2732
				2733	for (i = 0; i < conf->raid_disks * 2; i++) {
				2734	struct md_rdev *rdev;
				2735	bio = r1_bio->bios[i];
				2736
				2737	rdev = rcu_dereference(conf->mirrors[i].rdev);
				2738	if (rdev == NULL \|\|
				2739	test_bit(Faulty, &rdev->flags)) {
				2740	if (i < conf->raid_disks)
				2741	still_degraded = 1;
				2742	} else if (!test_bit(In_sync, &rdev->flags)) {
				2743	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
				2744	bio->bi_end_io = end_sync_write;
				2745	write_targets ++;
				2746	} else {
				2747	/* may need to read from here */
				2748	sector_t first_bad = MaxSector;
				2749	int bad_sectors;
				2750
				2751	if (is_badblock(rdev, sector_nr, good_sectors,
				2752	&first_bad, &bad_sectors)) {
				2753	if (first_bad > sector_nr)
				2754	good_sectors = first_bad - sector_nr;
				2755	else {
				2756	bad_sectors -= (sector_nr - first_bad);
				2757	if (min_bad == 0 \|\|
				2758	min_bad > bad_sectors)
				2759	min_bad = bad_sectors;
				2760	}
				2761	}
				2762	if (sector_nr < first_bad) {
				2763	if (test_bit(WriteMostly, &rdev->flags)) {
				2764	if (wonly < 0)
				2765	wonly = i;
				2766	} else {
				2767	if (disk < 0)
				2768	disk = i;
				2769	}
				2770	bio_set_op_attrs(bio, REQ_OP_READ, 0);
				2771	bio->bi_end_io = end_sync_read;
				2772	read_targets++;
				2773	} else if (!test_bit(WriteErrorSeen, &rdev->flags) &&
				2774	test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
				2775	!test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
				2776	/*
				2777	* The device is suitable for reading (InSync),
				2778	* but has bad block(s) here. Let's try to correct them,
				2779	* if we are doing resync or repair. Otherwise, leave
				2780	* this device alone for this sync request.
				2781	*/
				2782	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
				2783	bio->bi_end_io = end_sync_write;
				2784	write_targets++;
				2785	}
				2786	}
				2787	if (rdev && bio->bi_end_io) {
				2788	atomic_inc(&rdev->nr_pending);
				2789	bio->bi_iter.bi_sector = sector_nr + rdev->data_offset;
				2790	bio_set_dev(bio, rdev->bdev);
				2791	if (test_bit(FailFast, &rdev->flags))
				2792	bio->bi_opf \|= MD_FAILFAST;
				2793	}
				2794	}
				2795	rcu_read_unlock();
				2796	if (disk < 0)
				2797	disk = wonly;
				2798	r1_bio->read_disk = disk;
				2799
				2800	if (read_targets == 0 && min_bad > 0) {
				2801	/* These sectors are bad on all InSync devices, so we
				2802	* need to mark them bad on all write targets
				2803	*/
				2804	int ok = 1;
				2805	for (i = 0 ; i < conf->raid_disks * 2 ; i++)
				2806	if (r1_bio->bios[i]->bi_end_io == end_sync_write) {
				2807	struct md_rdev *rdev = conf->mirrors[i].rdev;
				2808	ok = rdev_set_badblocks(rdev, sector_nr,
				2809	min_bad, 0
				2810	) && ok;
				2811	}
				2812	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
				2813	*skipped = 1;
				2814	put_buf(r1_bio);
				2815
				2816	if (!ok) {
				2817	/* Cannot record the badblocks, so need to
				2818	* abort the resync.
				2819	* If there are multiple read targets, could just
				2820	* fail the really bad ones ???
				2821	*/
				2822	conf->recovery_disabled = mddev->recovery_disabled;
				2823	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
				2824	return 0;
				2825	} else
				2826	return min_bad;
				2827
				2828	}
				2829	if (min_bad > 0 && min_bad < good_sectors) {
				2830	/* only resync enough to reach the next bad->good
				2831	* transition */
				2832	good_sectors = min_bad;
				2833	}
				2834
				2835	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0)
				2836	/* extra read targets are also write targets */
				2837	write_targets += read_targets-1;
				2838
				2839	if (write_targets == 0 \|\| read_targets == 0) {
				2840	/* There is nowhere to write, so all non-sync
				2841	* drives must be failed - so we are finished
				2842	*/
				2843	sector_t rv;
				2844	if (min_bad > 0)
				2845	max_sector = sector_nr + min_bad;
				2846	rv = max_sector - sector_nr;
				2847	*skipped = 1;
				2848	put_buf(r1_bio);
				2849	return rv;
				2850	}
				2851
				2852	if (max_sector > mddev->resync_max)
				2853	max_sector = mddev->resync_max; /* Don't do IO beyond here */
				2854	if (max_sector > sector_nr + good_sectors)
				2855	max_sector = sector_nr + good_sectors;
				2856	nr_sectors = 0;
				2857	sync_blocks = 0;
				2858	do {
				2859	struct page *page;
				2860	int len = PAGE_SIZE;
				2861	if (sector_nr + (len>>9) > max_sector)
				2862	len = (max_sector - sector_nr) << 9;
				2863	if (len == 0)
				2864	break;
				2865	if (sync_blocks == 0) {
				2866	if (!md_bitmap_start_sync(mddev->bitmap, sector_nr,
				2867	&sync_blocks, still_degraded) &&
				2868	!conf->fullsync &&
				2869	!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
				2870	break;
				2871	if ((len >> 9) > sync_blocks)
				2872	len = sync_blocks<<9;
				2873	}
				2874
				2875	for (i = 0 ; i < conf->raid_disks * 2; i++) {
				2876	struct resync_pages *rp;
				2877
				2878	bio = r1_bio->bios[i];
				2879	rp = get_resync_pages(bio);
				2880	if (bio->bi_end_io) {
				2881	page = resync_fetch_page(rp, page_idx);
				2882
				2883	/*
				2884	* won't fail because the vec table is big
				2885	* enough to hold all these pages
				2886	*/
				2887	bio_add_page(bio, page, len, 0);
				2888	}
				2889	}
				2890	nr_sectors += len>>9;
				2891	sector_nr += len>>9;
				2892	sync_blocks -= (len>>9);
				2893	} while (++page_idx < RESYNC_PAGES);
				2894
				2895	r1_bio->sectors = nr_sectors;
				2896
				2897	if (mddev_is_clustered(mddev) &&
				2898	conf->cluster_sync_high < sector_nr + nr_sectors) {
				2899	conf->cluster_sync_low = mddev->curr_resync_completed;
				2900	conf->cluster_sync_high = conf->cluster_sync_low + CLUSTER_RESYNC_WINDOW_SECTORS;
				2901	/* Send resync message */
				2902	md_cluster_ops->resync_info_update(mddev,
				2903	conf->cluster_sync_low,
				2904	conf->cluster_sync_high);
				2905	}
				2906
				2907	/* For a user-requested sync, we read all readable devices and do a
				2908	* compare
				2909	*/
				2910	if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
				2911	atomic_set(&r1_bio->remaining, read_targets);
				2912	for (i = 0; i < conf->raid_disks * 2 && read_targets; i++) {
				2913	bio = r1_bio->bios[i];
				2914	if (bio->bi_end_io == end_sync_read) {
				2915	read_targets--;
				2916	md_sync_acct_bio(bio, nr_sectors);
				2917	if (read_targets == 1)
				2918	bio->bi_opf &= ~MD_FAILFAST;
				2919	generic_make_request(bio);
				2920	}
				2921	}
				2922	} else {
				2923	atomic_set(&r1_bio->remaining, 1);
				2924	bio = r1_bio->bios[r1_bio->read_disk];
				2925	md_sync_acct_bio(bio, nr_sectors);
				2926	if (read_targets == 1)
				2927	bio->bi_opf &= ~MD_FAILFAST;
				2928	generic_make_request(bio);
				2929	}
				2930	return nr_sectors;
				2931	}
				2932
				2933	static sector_t raid1_size(struct mddev *mddev, sector_t sectors, int raid_disks)
				2934	{
				2935	if (sectors)
				2936	return sectors;
				2937
				2938	return mddev->dev_sectors;
				2939	}
				2940
				2941	static struct r1conf setup_conf(struct mddev mddev)
				2942	{
				2943	struct r1conf *conf;
				2944	int i;
				2945	struct raid1_info *disk;
				2946	struct md_rdev *rdev;
				2947	int err = -ENOMEM;
				2948
				2949	conf = kzalloc(sizeof(struct r1conf), GFP_KERNEL);
				2950	if (!conf)
				2951	goto abort;
				2952
				2953	conf->nr_pending = kcalloc(BARRIER_BUCKETS_NR,
				2954	sizeof(atomic_t), GFP_KERNEL);
				2955	if (!conf->nr_pending)
				2956	goto abort;
				2957
				2958	conf->nr_waiting = kcalloc(BARRIER_BUCKETS_NR,
				2959	sizeof(atomic_t), GFP_KERNEL);
				2960	if (!conf->nr_waiting)
				2961	goto abort;
				2962
				2963	conf->nr_queued = kcalloc(BARRIER_BUCKETS_NR,
				2964	sizeof(atomic_t), GFP_KERNEL);
				2965	if (!conf->nr_queued)
				2966	goto abort;
				2967
				2968	conf->barrier = kcalloc(BARRIER_BUCKETS_NR,
				2969	sizeof(atomic_t), GFP_KERNEL);
				2970	if (!conf->barrier)
				2971	goto abort;
				2972
				2973	conf->mirrors = kzalloc(array3_size(sizeof(struct raid1_info),
				2974	mddev->raid_disks, 2),
				2975	GFP_KERNEL);
				2976	if (!conf->mirrors)
				2977	goto abort;
				2978
				2979	conf->tmppage = alloc_page(GFP_KERNEL);
				2980	if (!conf->tmppage)
				2981	goto abort;
				2982
				2983	conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
				2984	if (!conf->poolinfo)
				2985	goto abort;
				2986	conf->poolinfo->raid_disks = mddev->raid_disks * 2;
				2987	err = mempool_init(&conf->r1bio_pool, NR_RAID_BIOS, r1bio_pool_alloc,
				2988	rbio_pool_free, conf->poolinfo);
				2989	if (err)
				2990	goto abort;
				2991
				2992	err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
				2993	if (err)
				2994	goto abort;
				2995
				2996	conf->poolinfo->mddev = mddev;
				2997
				2998	err = -EINVAL;
				2999	spin_lock_init(&conf->device_lock);
				3000	rdev_for_each(rdev, mddev) {
				3001	int disk_idx = rdev->raid_disk;
				3002	if (disk_idx >= mddev->raid_disks
				3003	\|\| disk_idx < 0)
				3004	continue;
				3005	if (test_bit(Replacement, &rdev->flags))
				3006	disk = conf->mirrors + mddev->raid_disks + disk_idx;
				3007	else
				3008	disk = conf->mirrors + disk_idx;
				3009
				3010	if (disk->rdev)
				3011	goto abort;
				3012	disk->rdev = rdev;
				3013	disk->head_position = 0;
				3014	disk->seq_start = MaxSector;
				3015	}
				3016	conf->raid_disks = mddev->raid_disks;
				3017	conf->mddev = mddev;
				3018	INIT_LIST_HEAD(&conf->retry_list);
				3019	INIT_LIST_HEAD(&conf->bio_end_io_list);
				3020
				3021	spin_lock_init(&conf->resync_lock);
				3022	init_waitqueue_head(&conf->wait_barrier);
				3023
				3024	bio_list_init(&conf->pending_bio_list);
				3025	conf->pending_count = 0;
				3026	conf->recovery_disabled = mddev->recovery_disabled - 1;
				3027
				3028	err = -EIO;
				3029	for (i = 0; i < conf->raid_disks * 2; i++) {
				3030
				3031	disk = conf->mirrors + i;
				3032
				3033	if (i < conf->raid_disks &&
				3034	disk[conf->raid_disks].rdev) {
				3035	/* This slot has a replacement. */
				3036	if (!disk->rdev) {
				3037	/* No original, just make the replacement
				3038	* a recovering spare
				3039	*/
				3040	disk->rdev =
				3041	disk[conf->raid_disks].rdev;
				3042	disk[conf->raid_disks].rdev = NULL;
				3043	} else if (!test_bit(In_sync, &disk->rdev->flags))
				3044	/* Original is not in_sync - bad */
				3045	goto abort;
				3046	}
				3047
				3048	if (!disk->rdev \|\|
				3049	!test_bit(In_sync, &disk->rdev->flags)) {
				3050	disk->head_position = 0;
				3051	if (disk->rdev &&
				3052	(disk->rdev->saved_raid_disk < 0))
				3053	conf->fullsync = 1;
				3054	}
				3055	}
				3056
				3057	err = -ENOMEM;
				3058	conf->thread = md_register_thread(raid1d, mddev, "raid1");
				3059	if (!conf->thread)
				3060	goto abort;
				3061
				3062	return conf;
				3063
				3064	abort:
				3065	if (conf) {
				3066	mempool_exit(&conf->r1bio_pool);
				3067	kfree(conf->mirrors);
				3068	safe_put_page(conf->tmppage);
				3069	kfree(conf->poolinfo);
				3070	kfree(conf->nr_pending);
				3071	kfree(conf->nr_waiting);
				3072	kfree(conf->nr_queued);
				3073	kfree(conf->barrier);
				3074	bioset_exit(&conf->bio_split);
				3075	kfree(conf);
				3076	}
				3077	return ERR_PTR(err);
				3078	}
				3079
				3080	static void raid1_free(struct mddev mddev, void priv);
				3081	static int raid1_run(struct mddev *mddev)
				3082	{
				3083	struct r1conf *conf;
				3084	int i;
				3085	struct md_rdev *rdev;
				3086	int ret;
				3087	bool discard_supported = false;
				3088
				3089	if (mddev->level != 1) {
				3090	pr_warn("md/raid1:%s: raid level not set to mirroring (%d)\n",
				3091	mdname(mddev), mddev->level);
				3092	return -EIO;
				3093	}
				3094	if (mddev->reshape_position != MaxSector) {
				3095	pr_warn("md/raid1:%s: reshape_position set but not supported\n",
				3096	mdname(mddev));
				3097	return -EIO;
				3098	}
				3099	if (mddev_init_writes_pending(mddev) < 0)
				3100	return -ENOMEM;
				3101	/*
				3102	* copy the already verified devices into our private RAID1
				3103	* bookkeeping area. [whatever we allocate in run(),
				3104	* should be freed in raid1_free()]
				3105	*/
				3106	if (mddev->private == NULL)
				3107	conf = setup_conf(mddev);
				3108	else
				3109	conf = mddev->private;
				3110
				3111	if (IS_ERR(conf))
				3112	return PTR_ERR(conf);
				3113
				3114	if (mddev->queue) {
				3115	blk_queue_max_write_same_sectors(mddev->queue, 0);
				3116	blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
				3117	}
				3118
				3119	rdev_for_each(rdev, mddev) {
				3120	if (!mddev->gendisk)
				3121	continue;
				3122	disk_stack_limits(mddev->gendisk, rdev->bdev,
				3123	rdev->data_offset << 9);
				3124	if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
				3125	discard_supported = true;
				3126	}
				3127
				3128	mddev->degraded = 0;
				3129	for (i = 0; i < conf->raid_disks; i++)
				3130	if (conf->mirrors[i].rdev == NULL \|\|
				3131	!test_bit(In_sync, &conf->mirrors[i].rdev->flags) \|\|
				3132	test_bit(Faulty, &conf->mirrors[i].rdev->flags))
				3133	mddev->degraded++;
				3134	/*
				3135	* RAID1 needs at least one disk in active
				3136	*/
				3137	if (conf->raid_disks - mddev->degraded < 1) {
				3138	md_unregister_thread(&conf->thread);
				3139	ret = -EINVAL;
				3140	goto abort;
				3141	}
				3142
				3143	if (conf->raid_disks - mddev->degraded == 1)
				3144	mddev->recovery_cp = MaxSector;
				3145
				3146	if (mddev->recovery_cp != MaxSector)
				3147	pr_info("md/raid1:%s: not clean -- starting background reconstruction\n",
				3148	mdname(mddev));
				3149	pr_info("md/raid1:%s: active with %d out of %d mirrors\n",
				3150	mdname(mddev), mddev->raid_disks - mddev->degraded,
				3151	mddev->raid_disks);
				3152
				3153	/*
				3154	* Ok, everything is just fine now
				3155	*/
				3156	mddev->thread = conf->thread;
				3157	conf->thread = NULL;
				3158	mddev->private = conf;
				3159	set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
				3160
				3161	md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
				3162
				3163	if (mddev->queue) {
				3164	if (discard_supported)
				3165	blk_queue_flag_set(QUEUE_FLAG_DISCARD,
				3166	mddev->queue);
				3167	else
				3168	blk_queue_flag_clear(QUEUE_FLAG_DISCARD,
				3169	mddev->queue);
				3170	}
				3171
				3172	ret = md_integrity_register(mddev);
				3173	if (ret) {
				3174	md_unregister_thread(&mddev->thread);
				3175	goto abort;
				3176	}
				3177	return 0;
				3178
				3179	abort:
				3180	raid1_free(mddev, conf);
				3181	return ret;
				3182	}
				3183
				3184	static void raid1_free(struct mddev mddev, void priv)
				3185	{
				3186	struct r1conf *conf = priv;
				3187
				3188	mempool_exit(&conf->r1bio_pool);
				3189	kfree(conf->mirrors);
				3190	safe_put_page(conf->tmppage);
				3191	kfree(conf->poolinfo);
				3192	kfree(conf->nr_pending);
				3193	kfree(conf->nr_waiting);
				3194	kfree(conf->nr_queued);
				3195	kfree(conf->barrier);
				3196	bioset_exit(&conf->bio_split);
				3197	kfree(conf);
				3198	}
				3199
				3200	static int raid1_resize(struct mddev *mddev, sector_t sectors)
				3201	{
				3202	/* no resync is happening, and there is enough space
				3203	* on all devices, so we can resize.
				3204	* We need to make sure resync covers any new space.
				3205	* If the array is shrinking we should possibly wait until
				3206	* any io in the removed space completes, but it hardly seems
				3207	* worth it.
				3208	*/
				3209	sector_t newsize = raid1_size(mddev, sectors, 0);
				3210	if (mddev->external_size &&
				3211	mddev->array_sectors > newsize)
				3212	return -EINVAL;
				3213	if (mddev->bitmap) {
				3214	int ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
				3215	if (ret)
				3216	return ret;
				3217	}
				3218	md_set_array_sectors(mddev, newsize);
				3219	if (sectors > mddev->dev_sectors &&
				3220	mddev->recovery_cp > mddev->dev_sectors) {
				3221	mddev->recovery_cp = mddev->dev_sectors;
				3222	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
				3223	}
				3224	mddev->dev_sectors = sectors;
				3225	mddev->resync_max_sectors = sectors;
				3226	return 0;
				3227	}
				3228
				3229	static int raid1_reshape(struct mddev *mddev)
				3230	{
				3231	/* We need to:
				3232	* 1/ resize the r1bio_pool
				3233	* 2/ resize conf->mirrors
				3234	*
				3235	* We allocate a new r1bio_pool if we can.
				3236	* Then raise a device barrier and wait until all IO stops.
				3237	* Then resize conf->mirrors and swap in the new r1bio pool.
				3238	*
				3239	* At the same time, we "pack" the devices so that all the missing
				3240	* devices have the higher raid_disk numbers.
				3241	*/
				3242	mempool_t newpool, oldpool;
				3243	struct pool_info *newpoolinfo;
				3244	struct raid1_info *newmirrors;
				3245	struct r1conf *conf = mddev->private;
				3246	int cnt, raid_disks;
				3247	unsigned long flags;
				3248	int d, d2;
				3249	int ret;
				3250
				3251	memset(&newpool, 0, sizeof(newpool));
				3252	memset(&oldpool, 0, sizeof(oldpool));
				3253
				3254	/* Cannot change chunk_size, layout, or level */
				3255	if (mddev->chunk_sectors != mddev->new_chunk_sectors \|\|
				3256	mddev->layout != mddev->new_layout \|\|
				3257	mddev->level != mddev->new_level) {
				3258	mddev->new_chunk_sectors = mddev->chunk_sectors;
				3259	mddev->new_layout = mddev->layout;
				3260	mddev->new_level = mddev->level;
				3261	return -EINVAL;
				3262	}
				3263
				3264	if (!mddev_is_clustered(mddev))
				3265	md_allow_write(mddev);
				3266
				3267	raid_disks = mddev->raid_disks + mddev->delta_disks;
				3268
				3269	if (raid_disks < conf->raid_disks) {
				3270	cnt=0;
				3271	for (d= 0; d < conf->raid_disks; d++)
				3272	if (conf->mirrors[d].rdev)
				3273	cnt++;
				3274	if (cnt > raid_disks)
				3275	return -EBUSY;
				3276	}
				3277
				3278	newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL);
				3279	if (!newpoolinfo)
				3280	return -ENOMEM;
				3281	newpoolinfo->mddev = mddev;
				3282	newpoolinfo->raid_disks = raid_disks * 2;
				3283
				3284	ret = mempool_init(&newpool, NR_RAID_BIOS, r1bio_pool_alloc,
				3285	rbio_pool_free, newpoolinfo);
				3286	if (ret) {
				3287	kfree(newpoolinfo);
				3288	return ret;
				3289	}
				3290	newmirrors = kzalloc(array3_size(sizeof(struct raid1_info),
				3291	raid_disks, 2),
				3292	GFP_KERNEL);
				3293	if (!newmirrors) {
				3294	kfree(newpoolinfo);
				3295	mempool_exit(&newpool);
				3296	return -ENOMEM;
				3297	}
				3298
				3299	freeze_array(conf, 0);
				3300
				3301	/* ok, everything is stopped */
				3302	oldpool = conf->r1bio_pool;
				3303	conf->r1bio_pool = newpool;
				3304
				3305	for (d = d2 = 0; d < conf->raid_disks; d++) {
				3306	struct md_rdev *rdev = conf->mirrors[d].rdev;
				3307	if (rdev && rdev->raid_disk != d2) {
				3308	sysfs_unlink_rdev(mddev, rdev);
				3309	rdev->raid_disk = d2;
				3310	sysfs_unlink_rdev(mddev, rdev);
				3311	if (sysfs_link_rdev(mddev, rdev))
				3312	pr_warn("md/raid1:%s: cannot register rd%d\n",
				3313	mdname(mddev), rdev->raid_disk);
				3314	}
				3315	if (rdev)
				3316	newmirrors[d2++].rdev = rdev;
				3317	}
				3318	kfree(conf->mirrors);
				3319	conf->mirrors = newmirrors;
				3320	kfree(conf->poolinfo);
				3321	conf->poolinfo = newpoolinfo;
				3322
				3323	spin_lock_irqsave(&conf->device_lock, flags);
				3324	mddev->degraded += (raid_disks - conf->raid_disks);
				3325	spin_unlock_irqrestore(&conf->device_lock, flags);
				3326	conf->raid_disks = mddev->raid_disks = raid_disks;
				3327	mddev->delta_disks = 0;
				3328
				3329	unfreeze_array(conf);
				3330
				3331	set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
				3332	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
				3333	md_wakeup_thread(mddev->thread);
				3334
				3335	mempool_exit(&oldpool);
				3336	return 0;
				3337	}
				3338
				3339	static void raid1_quiesce(struct mddev *mddev, int quiesce)
				3340	{
				3341	struct r1conf *conf = mddev->private;
				3342
				3343	if (quiesce)
				3344	freeze_array(conf, 0);
				3345	else
				3346	unfreeze_array(conf);
				3347	}
				3348
				3349	static void raid1_takeover(struct mddev mddev)
				3350	{
				3351	/* raid1 can take over:
				3352	* raid5 with 2 devices, any layout or chunk size
				3353	*/
				3354	if (mddev->level == 5 && mddev->raid_disks == 2) {
				3355	struct r1conf *conf;
				3356	mddev->new_level = 1;
				3357	mddev->new_layout = 0;
				3358	mddev->new_chunk_sectors = 0;
				3359	conf = setup_conf(mddev);
				3360	if (!IS_ERR(conf)) {
				3361	/* Array must appear to be quiesced */
				3362	conf->array_frozen = 1;
				3363	mddev_clear_unsupported_flags(mddev,
				3364	UNSUPPORTED_MDDEV_FLAGS);
				3365	}
				3366	return conf;
				3367	}
				3368	return ERR_PTR(-EINVAL);
				3369	}
				3370
				3371	static struct md_personality raid1_personality =
				3372	{
				3373	.name = "raid1",
				3374	.level = 1,
				3375	.owner = THIS_MODULE,
				3376	.make_request = raid1_make_request,
				3377	.run = raid1_run,
				3378	.free = raid1_free,
				3379	.status = raid1_status,
				3380	.error_handler = raid1_error,
				3381	.hot_add_disk = raid1_add_disk,
				3382	.hot_remove_disk= raid1_remove_disk,
				3383	.spare_active = raid1_spare_active,
				3384	.sync_request = raid1_sync_request,
				3385	.resize = raid1_resize,
				3386	.size = raid1_size,
				3387	.check_reshape = raid1_reshape,
				3388	.quiesce = raid1_quiesce,
				3389	.takeover = raid1_takeover,
				3390	.congested = raid1_congested,
				3391	};
				3392
				3393	static int __init raid_init(void)
				3394	{
				3395	return register_md_personality(&raid1_personality);
				3396	}
				3397
				3398	static void raid_exit(void)
				3399	{
				3400	unregister_md_personality(&raid1_personality);
				3401	}
				3402
				3403	module_init(raid_init);
				3404	module_exit(raid_exit);
				3405	MODULE_LICENSE("GPL");
				3406	MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD");
				3407	MODULE_ALIAS("md-personality-3"); /* RAID1 */
				3408	MODULE_ALIAS("md-raid1");
				3409	MODULE_ALIAS("md-level-1");
				3410
				3411	module_param(max_queued_requests, int, S_IRUGO\|S_IWUSR);