/*
 * raid5.c : Multiple Devices driver for Linux
 *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *	   Copyright (C) 1999, 2000 Ingo Molnar
 *	   Copyright (C) 2002, 2003 H. Peter Anvin
 *
 * RAID-4/5/6 management functions.
 * Thanks to Penguin Computing for making the RAID-6 development possible
 * by donating a test server!
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * BITMAP UNPLUGGING:
 *
 * The sequencing for updating the bitmap reliably is a little
 * subtle (and I got it wrong the first time) so it deserves some
 * explanation.
 *
 * We group bitmap updates into batches.  Each batch has a number.
 * We may write out several batches at once, but that isn't very important.
 * conf->seq_write is the number of the last batch successfully written.
 * conf->seq_flush is the number of the last batch that was closed to
 *    new additions.
 * When we discover that we will need to write to any block in a stripe
 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
 * the number of the batch it will be in. This is seq_flush+1.
 * When we are ready to do a write, if that batch hasn't been written yet,
 *   we plug the array and queue the stripe for later.
 * When an unplug happens, we increment seq_flush, thus closing the current
 *   batch.
 * When we notice that seq_flush > seq_write, we write out all pending updates
 * to the bitmap, and advance seq_write to where seq_flush was.
 * This may occasionally write a bit out twice, but is sure never to
 * miss any bits.
 */
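
/*
 * Illustrative sketch only (hypothetical numbers, not driver logic): with
 * conf->seq_flush == 5 and conf->seq_write == 3, a stripe queued now records
 * sh->bm_seq = 6, and its write is deferred until bitmap batch 6 has been
 * written out, which is exactly the test used in __release_stripe() below:
 *
 *	if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
 *	    sh->bm_seq - conf->seq_write > 0)
 *		list_add_tail(&sh->lru, &conf->bitmap_list);
 */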

#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/async_tx.h>
#include <linux/async.h>
#include <linux/seq_file.h>
#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include "md.h"
#include "raid5.h"
#include "raid0.h"
#include "bitmap.h"

/*
 * Stripe cache
 */

#define NR_STRIPES		256
#define STRIPE_SIZE		PAGE_SIZE
#define STRIPE_SHIFT		(PAGE_SHIFT - 9)
#define STRIPE_SECTORS		(STRIPE_SIZE>>9)
#define	IO_THRESHOLD		1
#define BYPASS_THRESHOLD	1
#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
#define HASH_MASK		(NR_HASH - 1)

static inline struct hlist_head *stripe_hash(raid5_conf_t *conf, sector_t sect)
{
	int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
	return &conf->stripe_hashtbl[hash];
}
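
/*
 * For orientation, an example assuming 4 KiB pages on a 64-bit build (so
 * sizeof(struct hlist_head) == 8): STRIPE_SIZE is 4096 bytes, STRIPE_SECTORS
 * is 8 and NR_HASH is 512, so stripe_hash() keys on bits 3..11 of the
 * stripe's sector number.
 */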

/* bios attached to a stripe+device for I/O are linked together in bi_sector
 * order without overlap.  There may be several bios per stripe+device, and
 * a bio could span several devices.
 * When walking this list for a particular stripe+device, we must never proceed
 * beyond a bio that extends past this device, as the next bio might no longer
 * be valid.
 * This function is used to determine the 'next' bio in the list, given the sector
 * of the current stripe+device
 */
static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
{
	int sectors = bio->bi_size >> 9;
	if (bio->bi_sector + sectors < sector + STRIPE_SECTORS)
		return bio->bi_next;
	else
		return NULL;
}
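
/*
 * Typical walk (a sketch mirroring the loops used later in this file; 'rbi'
 * and 'dev' are placeholders): visit every bio queued against one
 * stripe+device, stopping once r5_next_bio() reports that the current bio
 * extends beyond this stripe's STRIPE_SECTORS window:
 *
 *	while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
 *		... process rbi ...
 *		rbi = r5_next_bio(rbi, dev->sector);
 *	}
 */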

/*
 * We maintain a biased count of active stripes in the bottom 16 bits of
 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
 */
static inline int raid5_bi_phys_segments(struct bio *bio)
{
	return bio->bi_phys_segments & 0xffff;
}

static inline int raid5_bi_hw_segments(struct bio *bio)
{
	return (bio->bi_phys_segments >> 16) & 0xffff;
}

static inline int raid5_dec_bi_phys_segments(struct bio *bio)
{
	--bio->bi_phys_segments;
	return raid5_bi_phys_segments(bio);
}

static inline int raid5_dec_bi_hw_segments(struct bio *bio)
{
	unsigned short val = raid5_bi_hw_segments(bio);

	--val;
	bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio);
	return val;
}

static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
{
	bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16);
}
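
/*
 * Worked example (assumed raw value, for illustration only): if
 * bio->bi_phys_segments == 0x00030002 then raid5_bi_phys_segments() returns
 * 2 (active stripe references) and raid5_bi_hw_segments() returns 3
 * (processed stripes); raid5_dec_bi_phys_segments() leaves 0x00030001
 * behind and returns 1.
 */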

/* Find first data disk in a raid6 stripe */
static inline int raid6_d0(struct stripe_head *sh)
{
	if (sh->ddf_layout)
		/* ddf always starts from the first device */
		return 0;
	/* md starts just after Q block */
	if (sh->qd_idx == sh->disks - 1)
		return 0;
	else
		return sh->qd_idx + 1;
}
static inline int raid6_next_disk(int disk, int raid_disks)
{
	disk++;
	return (disk < raid_disks) ? disk : 0;
}

/* When walking through the disks in a raid6 stripe, starting at raid6_d0,
 * we need to map each disk to a 'slot', where the data disks are slot
 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
 * is raid_disks-1.  This helper does that mapping.
 */
static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
			     int *count, int syndrome_disks)
{
	int slot = *count;

	if (sh->ddf_layout)
		(*count)++;
	if (idx == sh->pd_idx)
		return syndrome_disks;
	if (idx == sh->qd_idx)
		return syndrome_disks + 1;
	if (!sh->ddf_layout)
		(*count)++;
	return slot;
}
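
/*
 * Worked example (hypothetical md-layout RAID-6 with sh->disks == 6, so
 * syndrome_disks == 4): walking from raid6_d0(), the four data devices are
 * handed slots 0..3 in walk order, while pd_idx always maps to slot 4
 * (== syndrome_disks) and qd_idx to slot 5, regardless of where P and Q
 * physically sit in this particular stripe.
 */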

static void return_io(struct bio *return_bi)
{
	struct bio *bi = return_bi;
	while (bi) {

		return_bi = bi->bi_next;
		bi->bi_next = NULL;
		bi->bi_size = 0;
		bio_endio(bi, 0);
		bi = return_bi;
	}
}

static void print_raid5_conf (raid5_conf_t *conf);

static int stripe_operations_active(struct stripe_head *sh)
{
	return sh->check_state || sh->reconstruct_state ||
	       test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
}

static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
{
	if (atomic_dec_and_test(&sh->count)) {
		BUG_ON(!list_empty(&sh->lru));
		BUG_ON(atomic_read(&conf->active_stripes)==0);
		if (test_bit(STRIPE_HANDLE, &sh->state)) {
			if (test_bit(STRIPE_DELAYED, &sh->state))
				list_add_tail(&sh->lru, &conf->delayed_list);
			else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
				   sh->bm_seq - conf->seq_write > 0)
				list_add_tail(&sh->lru, &conf->bitmap_list);
			else {
				clear_bit(STRIPE_BIT_DELAY, &sh->state);
				list_add_tail(&sh->lru, &conf->handle_list);
			}
			md_wakeup_thread(conf->mddev->thread);
		} else {
			BUG_ON(stripe_operations_active(sh));
			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
				atomic_dec(&conf->preread_active_stripes);
				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
					md_wakeup_thread(conf->mddev->thread);
			}
			atomic_dec(&conf->active_stripes);
			if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
				list_add_tail(&sh->lru, &conf->inactive_list);
				wake_up(&conf->wait_for_stripe);
				if (conf->retry_read_aligned)
					md_wakeup_thread(conf->mddev->thread);
			}
		}
	}
}

static void release_stripe(struct stripe_head *sh)
{
	raid5_conf_t *conf = sh->raid_conf;
	unsigned long flags;

	spin_lock_irqsave(&conf->device_lock, flags);
	__release_stripe(conf, sh);
	spin_unlock_irqrestore(&conf->device_lock, flags);
}

static inline void remove_hash(struct stripe_head *sh)
{
	pr_debug("remove_hash(), stripe %llu\n",
		(unsigned long long)sh->sector);

	hlist_del_init(&sh->hash);
}

static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
{
	struct hlist_head *hp = stripe_hash(conf, sh->sector);

	pr_debug("insert_hash(), stripe %llu\n",
		(unsigned long long)sh->sector);

	hlist_add_head(&sh->hash, hp);
}


/* find an idle stripe, make sure it is unhashed, and return it. */
static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
{
	struct stripe_head *sh = NULL;
	struct list_head *first;

	if (list_empty(&conf->inactive_list))
		goto out;
	first = conf->inactive_list.next;
	sh = list_entry(first, struct stripe_head, lru);
	list_del_init(first);
	remove_hash(sh);
	atomic_inc(&conf->active_stripes);
out:
	return sh;
}

static void shrink_buffers(struct stripe_head *sh)
{
	struct page *p;
	int i;
	int num = sh->raid_conf->pool_size;

	for (i = 0; i < num ; i++) {
		p = sh->dev[i].page;
		if (!p)
			continue;
		sh->dev[i].page = NULL;
		put_page(p);
	}
}

static int grow_buffers(struct stripe_head *sh)
{
	int i;
	int num = sh->raid_conf->pool_size;

	for (i = 0; i < num; i++) {
		struct page *page;

		if (!(page = alloc_page(GFP_KERNEL))) {
			return 1;
		}
		sh->dev[i].page = page;
	}
	return 0;
}

static void raid5_build_block(struct stripe_head *sh, int i, int previous);
static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
			    struct stripe_head *sh);

static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
{
	raid5_conf_t *conf = sh->raid_conf;
	int i;

	BUG_ON(atomic_read(&sh->count) != 0);
	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
	BUG_ON(stripe_operations_active(sh));

	pr_debug("init_stripe called, stripe %llu\n",
		(unsigned long long)sh->sector);

	remove_hash(sh);

	sh->generation = conf->generation - previous;
	sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
	sh->sector = sector;
	stripe_set_idx(sector, conf, previous, sh);
	sh->state = 0;


	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		if (dev->toread || dev->read || dev->towrite || dev->written ||
		    test_bit(R5_LOCKED, &dev->flags)) {
			printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n",
			       (unsigned long long)sh->sector, i, dev->toread,
			       dev->read, dev->towrite, dev->written,
			       test_bit(R5_LOCKED, &dev->flags));
			WARN_ON(1);
		}
		dev->flags = 0;
		raid5_build_block(sh, i, previous);
	}
	insert_hash(conf, sh);
}

static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector,
					 short generation)
{
	struct stripe_head *sh;
	struct hlist_node *hn;

	pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
	hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
		if (sh->sector == sector && sh->generation == generation)
			return sh;
	pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
	return NULL;
}

/*
 * Need to check if array has failed when deciding whether to:
 *  - start an array
 *  - remove non-faulty devices
 *  - add a spare
 *  - allow a reshape
 * This determination is simple when no reshape is happening.
 * However if there is a reshape, we need to carefully check
 * both the before and after sections.
 * This is because some failed devices may only affect one
 * of the two sections, and some non-in_sync devices may
 * be insync in the section most affected by failed devices.
 */
static int has_failed(raid5_conf_t *conf)
{
	int degraded;
	int i;
	if (conf->mddev->reshape_position == MaxSector)
		return conf->mddev->degraded > conf->max_degraded;

	rcu_read_lock();
	degraded = 0;
	for (i = 0; i < conf->previous_raid_disks; i++) {
		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
		if (!rdev || test_bit(Faulty, &rdev->flags))
			degraded++;
		else if (test_bit(In_sync, &rdev->flags))
			;
		else
			/* not in-sync or faulty.
			 * If the reshape increases the number of devices,
			 * this is being recovered by the reshape, so
			 * this 'previous' section is not in_sync.
			 * If the number of devices is being reduced however,
			 * the device can only be part of the array if
			 * we are reverting a reshape, so this section will
			 * be in-sync.
			 */
			if (conf->raid_disks >= conf->previous_raid_disks)
				degraded++;
	}
	rcu_read_unlock();
	if (degraded > conf->max_degraded)
		return 1;
	rcu_read_lock();
	degraded = 0;
	for (i = 0; i < conf->raid_disks; i++) {
		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
		if (!rdev || test_bit(Faulty, &rdev->flags))
			degraded++;
		else if (test_bit(In_sync, &rdev->flags))
			;
		else
			/* not in-sync or faulty.
			 * If reshape increases the number of devices, this
			 * section has already been recovered, else it
			 * almost certainly hasn't.
			 */
			if (conf->raid_disks <= conf->previous_raid_disks)
				degraded++;
	}
	rcu_read_unlock();
	if (degraded > conf->max_degraded)
		return 1;
	return 0;
}
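
/*
 * Example of the reasoning above (hypothetical numbers): growing a RAID-5
 * from 4 to 5 devices with max_degraded == 1.  A device that is present but
 * not In_sync counts as degraded only in the 'previous' section (the reshape
 * is still rebuilding it), so with no other failures the two counts are 1
 * and 0, neither exceeds max_degraded, and has_failed() returns 0.
 */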

static struct stripe_head *
get_active_stripe(raid5_conf_t *conf, sector_t sector,
		  int previous, int noblock, int noquiesce)
{
	struct stripe_head *sh;

	pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);

	spin_lock_irq(&conf->device_lock);

	do {
		wait_event_lock_irq(conf->wait_for_stripe,
				    conf->quiesce == 0 || noquiesce,
				    conf->device_lock, /* nothing */);
		sh = __find_stripe(conf, sector, conf->generation - previous);
		if (!sh) {
			if (!conf->inactive_blocked)
				sh = get_free_stripe(conf);
			if (noblock && sh == NULL)
				break;
			if (!sh) {
				conf->inactive_blocked = 1;
				wait_event_lock_irq(conf->wait_for_stripe,
						    !list_empty(&conf->inactive_list) &&
						    (atomic_read(&conf->active_stripes)
						     < (conf->max_nr_stripes *3/4)
						     || !conf->inactive_blocked),
						    conf->device_lock,
						    );
				conf->inactive_blocked = 0;
			} else
				init_stripe(sh, sector, previous);
		} else {
			if (atomic_read(&sh->count)) {
				BUG_ON(!list_empty(&sh->lru)
				    && !test_bit(STRIPE_EXPANDING, &sh->state));
			} else {
				if (!test_bit(STRIPE_HANDLE, &sh->state))
					atomic_inc(&conf->active_stripes);
				if (list_empty(&sh->lru) &&
				    !test_bit(STRIPE_EXPANDING, &sh->state))
					BUG();
				list_del_init(&sh->lru);
			}
		}
	} while (sh == NULL);

	if (sh)
		atomic_inc(&sh->count);

	spin_unlock_irq(&conf->device_lock);
	return sh;
}

static void
raid5_end_read_request(struct bio *bi, int error);
static void
raid5_end_write_request(struct bio *bi, int error);

static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
{
	raid5_conf_t *conf = sh->raid_conf;
	int i, disks = sh->disks;

	might_sleep();

	for (i = disks; i--; ) {
		int rw;
		struct bio *bi;
		mdk_rdev_t *rdev;
		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
			if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
				rw = WRITE_FUA;
			else
				rw = WRITE;
		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
			rw = READ;
		else
			continue;

		bi = &sh->dev[i].req;

		bi->bi_rw = rw;
		if (rw & WRITE)
			bi->bi_end_io = raid5_end_write_request;
		else
			bi->bi_end_io = raid5_end_read_request;

		rcu_read_lock();
		rdev = rcu_dereference(conf->disks[i].rdev);
		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = NULL;
		if (rdev)
			atomic_inc(&rdev->nr_pending);
		rcu_read_unlock();

		/* We have already checked bad blocks for reads.  Now
		 * need to check for writes.
		 */
		while ((rw & WRITE) && rdev &&
		       test_bit(WriteErrorSeen, &rdev->flags)) {
			sector_t first_bad;
			int bad_sectors;
			int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
					      &first_bad, &bad_sectors);
			if (!bad)
				break;

			if (bad < 0) {
				set_bit(BlockedBadBlocks, &rdev->flags);
				if (!conf->mddev->external &&
				    conf->mddev->flags) {
					/* It is very unlikely, but we might
					 * still need to write out the
					 * bad block log - better give it
					 * a chance*/
					md_check_recovery(conf->mddev);
				}
				md_wait_for_blocked_rdev(rdev, conf->mddev);
			} else {
				/* Acknowledged bad block - skip the write */
				rdev_dec_pending(rdev, conf->mddev);
				rdev = NULL;
			}
		}

		if (rdev) {
			if (s->syncing || s->expanding || s->expanded)
				md_sync_acct(rdev->bdev, STRIPE_SECTORS);

			set_bit(STRIPE_IO_STARTED, &sh->state);

			bi->bi_bdev = rdev->bdev;
			pr_debug("%s: for %llu schedule op %ld on disc %d\n",
				__func__, (unsigned long long)sh->sector,
				bi->bi_rw, i);
			atomic_inc(&sh->count);
			bi->bi_sector = sh->sector + rdev->data_offset;
			bi->bi_flags = 1 << BIO_UPTODATE;
			bi->bi_vcnt = 1;
			bi->bi_max_vecs = 1;
			bi->bi_idx = 0;
			bi->bi_io_vec = &sh->dev[i].vec;
			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
			bi->bi_io_vec[0].bv_offset = 0;
			bi->bi_size = STRIPE_SIZE;
			bi->bi_next = NULL;
			generic_make_request(bi);
		} else {
			if (rw & WRITE)
				set_bit(STRIPE_DEGRADED, &sh->state);
			pr_debug("skip op %ld on disc %d for sector %llu\n",
				bi->bi_rw, i, (unsigned long long)sh->sector);
			clear_bit(R5_LOCKED, &sh->dev[i].flags);
			set_bit(STRIPE_HANDLE, &sh->state);
		}
	}
}

static struct dma_async_tx_descriptor *
async_copy_data(int frombio, struct bio *bio, struct page *page,
	sector_t sector, struct dma_async_tx_descriptor *tx)
{
	struct bio_vec *bvl;
	struct page *bio_page;
	int i;
	int page_offset;
	struct async_submit_ctl submit;
	enum async_tx_flags flags = 0;

	if (bio->bi_sector >= sector)
		page_offset = (signed)(bio->bi_sector - sector) * 512;
	else
		page_offset = (signed)(sector - bio->bi_sector) * -512;

	if (frombio)
		flags |= ASYNC_TX_FENCE;
	init_async_submit(&submit, flags, tx, NULL, NULL, NULL);

	bio_for_each_segment(bvl, bio, i) {
		int len = bvl->bv_len;
		int clen;
		int b_offset = 0;

		if (page_offset < 0) {
			b_offset = -page_offset;
			page_offset += b_offset;
			len -= b_offset;
		}

		if (len > 0 && page_offset + len > STRIPE_SIZE)
			clen = STRIPE_SIZE - page_offset;
		else
			clen = len;

		if (clen > 0) {
			b_offset += bvl->bv_offset;
			bio_page = bvl->bv_page;
			if (frombio)
				tx = async_memcpy(page, bio_page, page_offset,
						  b_offset, clen, &submit);
			else
				tx = async_memcpy(bio_page, page, b_offset,
						  page_offset, clen, &submit);
		}
		/* chain the operations */
		submit.depend_tx = tx;

		if (clen < len) /* hit end of page */
			break;
		page_offset +=  len;
	}

	return tx;
}

static void ops_complete_biofill(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;
	struct bio *return_bi = NULL;
	raid5_conf_t *conf = sh->raid_conf;
	int i;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	/* clear completed biofills */
	spin_lock_irq(&conf->device_lock);
	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		/* acknowledge completion of a biofill operation */
		/* and check if we need to reply to a read request;
		 * new R5_Wantfill requests are held off until
		 * !STRIPE_BIOFILL_RUN
		 */
		if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
			struct bio *rbi, *rbi2;

			BUG_ON(!dev->read);
			rbi = dev->read;
			dev->read = NULL;
			while (rbi && rbi->bi_sector <
				dev->sector + STRIPE_SECTORS) {
				rbi2 = r5_next_bio(rbi, dev->sector);
				if (!raid5_dec_bi_phys_segments(rbi)) {
					rbi->bi_next = return_bi;
					return_bi = rbi;
				}
				rbi = rbi2;
			}
		}
	}
	spin_unlock_irq(&conf->device_lock);
	clear_bit(STRIPE_BIOFILL_RUN, &sh->state);

	return_io(return_bi);

	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}

static void ops_run_biofill(struct stripe_head *sh)
{
	struct dma_async_tx_descriptor *tx = NULL;
	raid5_conf_t *conf = sh->raid_conf;
	struct async_submit_ctl submit;
	int i;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];
		if (test_bit(R5_Wantfill, &dev->flags)) {
			struct bio *rbi;
			spin_lock_irq(&conf->device_lock);
			dev->read = rbi = dev->toread;
			dev->toread = NULL;
			spin_unlock_irq(&conf->device_lock);
			while (rbi && rbi->bi_sector <
				dev->sector + STRIPE_SECTORS) {
				tx = async_copy_data(0, rbi, dev->page,
					dev->sector, tx);
				rbi = r5_next_bio(rbi, dev->sector);
			}
		}
	}

	atomic_inc(&sh->count);
	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
	async_trigger_callback(&submit);
}

static void mark_target_uptodate(struct stripe_head *sh, int target)
{
	struct r5dev *tgt;

	if (target < 0)
		return;

	tgt = &sh->dev[target];
	set_bit(R5_UPTODATE, &tgt->flags);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
	clear_bit(R5_Wantcompute, &tgt->flags);
}

static void ops_complete_compute(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	/* mark the computed target(s) as uptodate */
	mark_target_uptodate(sh, sh->ops.target);
	mark_target_uptodate(sh, sh->ops.target2);

	clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
	if (sh->check_state == check_state_compute_run)
		sh->check_state = check_state_compute_result;
	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}

/* return a pointer to the address conversion region of the scribble buffer */
static addr_conv_t *to_addr_conv(struct stripe_head *sh,
				 struct raid5_percpu *percpu)
{
	return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);
}

static struct dma_async_tx_descriptor *
ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int disks = sh->disks;
	struct page **xor_srcs = percpu->scribble;
	int target = sh->ops.target;
	struct r5dev *tgt = &sh->dev[target];
	struct page *xor_dest = tgt->page;
	int count = 0;
	struct dma_async_tx_descriptor *tx;
	struct async_submit_ctl submit;
	int i;

	pr_debug("%s: stripe %llu block: %d\n",
		__func__, (unsigned long long)sh->sector, target);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));

	for (i = disks; i--; )
		if (i != target)
			xor_srcs[count++] = sh->dev[i].page;

	atomic_inc(&sh->count);

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
			  ops_complete_compute, sh, to_addr_conv(sh, percpu));
	if (unlikely(count == 1))
		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
	else
		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);

	return tx;
}

/* set_syndrome_sources - populate source buffers for gen_syndrome
 * @srcs - (struct page *) array of size sh->disks
 * @sh - stripe_head to parse
 *
 * Populates srcs in proper layout order for the stripe and returns the
 * 'count' of sources to be used in a call to async_gen_syndrome.  The P
 * destination buffer is recorded in srcs[count] and the Q destination
 * is recorded in srcs[count+1].
 */
static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
{
	int disks = sh->disks;
	int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
	int d0_idx = raid6_d0(sh);
	int count;
	int i;

	for (i = 0; i < disks; i++)
		srcs[i] = NULL;

	count = 0;
	i = d0_idx;
	do {
		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);

		srcs[slot] = sh->dev[i].page;
		i = raid6_next_disk(i, disks);
	} while (i != d0_idx);

	return syndrome_disks;
}
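
/*
 * Sketch of the resulting layout (hypothetical md-layout array with
 * sh->disks == 6, so syndrome_disks == 4):
 *
 *	srcs[0..3] = data pages in raid6_d0() walk order
 *	srcs[4]    = sh->dev[pd_idx].page	(P destination)
 *	srcs[5]    = sh->dev[qd_idx].page	(Q destination)
 *
 * which matches the srcs[count]/srcs[count+1] convention described above and
 * the count+2 blocks later passed to async_gen_syndrome().
 */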

static struct dma_async_tx_descriptor *
ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int disks = sh->disks;
	struct page **blocks = percpu->scribble;
	int target;
	int qd_idx = sh->qd_idx;
	struct dma_async_tx_descriptor *tx;
	struct async_submit_ctl submit;
	struct r5dev *tgt;
	struct page *dest;
	int i;
	int count;

	if (sh->ops.target < 0)
		target = sh->ops.target2;
	else if (sh->ops.target2 < 0)
		target = sh->ops.target;
	else
		/* we should only have one valid target */
		BUG();
	BUG_ON(target < 0);
	pr_debug("%s: stripe %llu block: %d\n",
		__func__, (unsigned long long)sh->sector, target);

	tgt = &sh->dev[target];
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
	dest = tgt->page;

	atomic_inc(&sh->count);

	if (target == qd_idx) {
		count = set_syndrome_sources(blocks, sh);
		blocks[count] = NULL; /* regenerating p is not necessary */
		BUG_ON(blocks[count+1] != dest); /* q should already be set */
		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
				  ops_complete_compute, sh,
				  to_addr_conv(sh, percpu));
		tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
	} else {
		/* Compute any data- or p-drive using XOR */
		count = 0;
		for (i = disks; i-- ; ) {
			if (i == target || i == qd_idx)
				continue;
			blocks[count++] = sh->dev[i].page;
		}

		init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
				  NULL, ops_complete_compute, sh,
				  to_addr_conv(sh, percpu));
		tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
	}

	return tx;
}

static struct dma_async_tx_descriptor *
ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int i, count, disks = sh->disks;
	int syndrome_disks = sh->ddf_layout ? disks : disks-2;
	int d0_idx = raid6_d0(sh);
	int faila = -1, failb = -1;
	int target = sh->ops.target;
	int target2 = sh->ops.target2;
	struct r5dev *tgt = &sh->dev[target];
	struct r5dev *tgt2 = &sh->dev[target2];
	struct dma_async_tx_descriptor *tx;
	struct page **blocks = percpu->scribble;
	struct async_submit_ctl submit;

	pr_debug("%s: stripe %llu block1: %d block2: %d\n",
		 __func__, (unsigned long long)sh->sector, target, target2);
	BUG_ON(target < 0 || target2 < 0);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
	BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));

	/* we need to open-code set_syndrome_sources to handle the
	 * slot number conversion for 'faila' and 'failb'
	 */
	for (i = 0; i < disks ; i++)
		blocks[i] = NULL;
	count = 0;
	i = d0_idx;
	do {
		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);

		blocks[slot] = sh->dev[i].page;

		if (i == target)
			faila = slot;
		if (i == target2)
			failb = slot;
		i = raid6_next_disk(i, disks);
	} while (i != d0_idx);

	BUG_ON(faila == failb);
	if (failb < faila)
		swap(faila, failb);
	pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
		 __func__, (unsigned long long)sh->sector, faila, failb);

	atomic_inc(&sh->count);

	if (failb == syndrome_disks+1) {
		/* Q disk is one of the missing disks */
		if (faila == syndrome_disks) {
			/* Missing P+Q, just recompute */
			init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
					  ops_complete_compute, sh,
					  to_addr_conv(sh, percpu));
			return async_gen_syndrome(blocks, 0, syndrome_disks+2,
						  STRIPE_SIZE, &submit);
		} else {
			struct page *dest;
			int data_target;
			int qd_idx = sh->qd_idx;

			/* Missing D+Q: recompute D from P, then recompute Q */
			if (target == qd_idx)
				data_target = target2;
			else
				data_target = target;

			count = 0;
			for (i = disks; i-- ; ) {
				if (i == data_target || i == qd_idx)
					continue;
				blocks[count++] = sh->dev[i].page;
			}
			dest = sh->dev[data_target].page;
			init_async_submit(&submit,
					  ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
					  NULL, NULL, NULL,
					  to_addr_conv(sh, percpu));
			tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
				       &submit);

			count = set_syndrome_sources(blocks, sh);
			init_async_submit(&submit, ASYNC_TX_FENCE, tx,
					  ops_complete_compute, sh,
					  to_addr_conv(sh, percpu));
			return async_gen_syndrome(blocks, 0, count+2,
						  STRIPE_SIZE, &submit);
		}
	} else {
		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
				  ops_complete_compute, sh,
				  to_addr_conv(sh, percpu));
		if (failb == syndrome_disks) {
			/* We're missing D+P. */
			return async_raid6_datap_recov(syndrome_disks+2,
						       STRIPE_SIZE, faila,
						       blocks, &submit);
		} else {
			/* We're missing D+D. */
			return async_raid6_2data_recov(syndrome_disks+2,
						       STRIPE_SIZE, faila, failb,
						       blocks, &submit);
		}
	}
}


static void ops_complete_prexor(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);
}

static struct dma_async_tx_descriptor *
ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
	       struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	struct page **xor_srcs = percpu->scribble;
	int count = 0, pd_idx = sh->pd_idx, i;
	struct async_submit_ctl submit;

	/* existing parity data subtracted */
	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];
		/* Only process blocks that are known to be uptodate */
		if (test_bit(R5_Wantdrain, &dev->flags))
			xor_srcs[count++] = dev->page;
	}

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
			  ops_complete_prexor, sh, to_addr_conv(sh, percpu));
	tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);

	return tx;
}

static struct dma_async_tx_descriptor *
ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	int i;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];
		struct bio *chosen;

		if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
			struct bio *wbi;

			spin_lock_irq(&sh->raid_conf->device_lock);
			chosen = dev->towrite;
			dev->towrite = NULL;
			BUG_ON(dev->written);
			wbi = dev->written = chosen;
			spin_unlock_irq(&sh->raid_conf->device_lock);

			while (wbi && wbi->bi_sector <
				dev->sector + STRIPE_SECTORS) {
				if (wbi->bi_rw & REQ_FUA)
					set_bit(R5_WantFUA, &dev->flags);
				tx = async_copy_data(1, wbi, dev->page,
					dev->sector, tx);
				wbi = r5_next_bio(wbi, dev->sector);
			}
		}
	}

	return tx;
}

static void ops_complete_reconstruct(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;
	int disks = sh->disks;
	int pd_idx = sh->pd_idx;
	int qd_idx = sh->qd_idx;
	int i;
	bool fua = false;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = disks; i--; )
		fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);

	for (i = disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		if (dev->written || i == pd_idx || i == qd_idx) {
			set_bit(R5_UPTODATE, &dev->flags);
			if (fua)
				set_bit(R5_WantFUA, &dev->flags);
		}
	}

	if (sh->reconstruct_state == reconstruct_state_drain_run)
		sh->reconstruct_state = reconstruct_state_drain_result;
	else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
		sh->reconstruct_state = reconstruct_state_prexor_drain_result;
	else {
		BUG_ON(sh->reconstruct_state != reconstruct_state_run);
		sh->reconstruct_state = reconstruct_state_result;
	}

	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}

static void
ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
		     struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	struct page **xor_srcs = percpu->scribble;
	struct async_submit_ctl submit;
	int count = 0, pd_idx = sh->pd_idx, i;
	struct page *xor_dest;
	int prexor = 0;
	unsigned long flags;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	/* check if prexor is active which means only process blocks
	 * that are part of a read-modify-write (written)
	 */
	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
		prexor = 1;
		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if (dev->written)
				xor_srcs[count++] = dev->page;
		}
	} else {
		xor_dest = sh->dev[pd_idx].page;
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if (i != pd_idx)
				xor_srcs[count++] = dev->page;
		}
	}

	/* 1/ if we prexor'd then the dest is reused as a source
	 * 2/ if we did not prexor then we are redoing the parity
	 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
	 * for the synchronous xor case
	 */
	flags = ASYNC_TX_ACK |
		(prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);

	atomic_inc(&sh->count);