/*
 * Copyright (C) 2011-2012 Red Hat UK.
 *
 * This file is released under the GPL.
 */

#include "dm-thin-metadata.h"
#include "dm-bio-prison.h"
#include "dm.h"

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/jiffies.h>
#include <linux/log2.h>
#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/sort.h>
#include <linux/rbtree.h>

#define	DM_MSG_PREFIX	"thin"

/*
 * Tunable constants
 */
#define ENDIO_HOOK_POOL_SIZE 1024
#define MAPPING_POOL_SIZE 1024
#define COMMIT_PERIOD HZ
#define NO_SPACE_TIMEOUT_SECS 60

static unsigned no_space_timeout_secs = NO_SPACE_TIMEOUT_SECS;

DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
		"A percentage of time allocated for copy on write");

/*
 * The block size of the device holding pool data must be
 * between 64KB and 1GB.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

/*
 * Device id is restricted to 24 bits.
 */
#define MAX_DEV_ID ((1 << 24) - 1)

/*
 * How do we handle breaking sharing of data blocks?
 * =================================================
 *
 * We use a standard copy-on-write btree to store the mappings for the
 * devices (note I'm talking about copy-on-write of the metadata here, not
 * the data).  When you take an internal snapshot you clone the root node
 * of the origin btree.  After this there is no concept of an origin or a
 * snapshot.  They are just two device trees that happen to point to the
 * same data blocks.
 *
 * When we get a write in we decide if it's to a shared data block using
 * some timestamp magic.  If it is, we have to break sharing.
 *
 * Let's say we write to a shared block in what was the origin.  The
 * steps are:
 *
 * i) plug io further to this physical block. (see bio_prison code).
 *
 * ii) quiesce any read io to that shared data block.  Obviously
 * including all devices that share this block.  (see dm_deferred_set code)
 *
 * iii) copy the data block to a newly allocate block.  This step can be
 * missed out if the io covers the block. (schedule_copy).
 *
 * iv) insert the new mapping into the origin's btree
 * (process_prepared_mapping).  This act of inserting breaks some
 * sharing of btree nodes between the two devices.  Breaking sharing only
 * effects the btree of that specific device.  Btrees for the other
 * devices that share the block never change.  The btree for the origin
 * device as it was after the last commit is untouched, ie. we're using
 * persistent data structures in the functional programming sense.
 *
 * v) unplug io to this physical block, including the io that triggered
 * the breaking of sharing.
 *
 * Steps (ii) and (iii) occur in parallel.
 *
 * The metadata _doesn't_ need to be committed before the io continues.  We
 * get away with this because the io is always written to a _new_ block.
 * If there's a crash, then:
 *
 * - The origin mapping will point to the old origin block (the shared
 * one).  This will contain the data as it was before the io that triggered
 * the breaking of sharing came in.
 *
 * - The snap mapping still points to the old block.  As it would after
 * the commit.
 *
 * The downside of this scheme is the timestamp magic isn't perfect, and
 * will continue to think that data block in the snapshot device is shared
 * even after the write to the origin has broken sharing.  I suspect data
 * blocks will typically be shared by many different devices, so we're
 * breaking sharing n + 1 times, rather than n, where n is the number of
 * devices that reference this data block.  At the moment I think the
 * benefits far, far outweigh the disadvantages.
 */

/*----------------------------------------------------------------*/

/*
 * Key building.
 */
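/*
 * Cells may be locked in one of two key spaces: VIRTUAL keys describe a
 * range of blocks in a thin device's logical address space, PHYSICAL
 * keys describe a range of blocks on the pool's data device.
 */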
enum lock_space {
	VIRTUAL,
	PHYSICAL
};

static void build_key(struct dm_thin_device *td, enum lock_space ls,
		      dm_block_t b, dm_block_t e, struct dm_cell_key *key)
{
	key->virtual = (ls == VIRTUAL);
	key->dev = dm_thin_dev_id(td);
	key->block_begin = b;
	key->block_end = e;
}

static void build_data_key(struct dm_thin_device *td, dm_block_t b,
			   struct dm_cell_key *key)
{
	build_key(td, PHYSICAL, b, b + 1llu, key);
}

static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
			      struct dm_cell_key *key)
{
	build_key(td, VIRTUAL, b, b + 1llu, key);
}

/*----------------------------------------------------------------*/

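/*
 * Throttle the pool's worker thread: if a single pass of the worker runs
 * for longer than THROTTLE_THRESHOLD, the worker takes the rw_semaphore
 * for write, stalling submitters (which take it for read via
 * throttle_lock()) until throttle_work_complete() releases it at the end
 * of the pass.
 */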
#define THROTTLE_THRESHOLD (1 * HZ)

struct throttle {
	struct rw_semaphore lock;
	unsigned long threshold;
	bool throttle_applied;
};

static void throttle_init(struct throttle *t)
{
	init_rwsem(&t->lock);
	t->throttle_applied = false;
}

static void throttle_work_start(struct throttle *t)
{
	t->threshold = jiffies + THROTTLE_THRESHOLD;
}

static void throttle_work_update(struct throttle *t)
{
	if (!t->throttle_applied && jiffies > t->threshold) {
		down_write(&t->lock);
		t->throttle_applied = true;
	}
}

static void throttle_work_complete(struct throttle *t)
{
	if (t->throttle_applied) {
		t->throttle_applied = false;
		up_write(&t->lock);
	}
}

static void throttle_lock(struct throttle *t)
{
	down_read(&t->lock);
}

static void throttle_unlock(struct throttle *t)
{
	up_read(&t->lock);
}

/*----------------------------------------------------------------*/

/*
 * A pool device ties together a metadata device and a data device.  It
 * also provides the interface for creating and destroying internal
 * devices.
 */
struct dm_thin_new_mapping;

/*
 * The pool runs in 4 modes.  Ordered in degraded order for comparisons.
 */
enum pool_mode {
	PM_WRITE,		/* metadata may be changed */
	PM_OUT_OF_DATA_SPACE,	/* metadata may be changed, though data may not be allocated */
	PM_READ_ONLY,		/* metadata may not be changed */
	PM_FAIL,		/* all I/O fails */
};

struct pool_features {
	enum pool_mode mode;

	bool zero_new_blocks:1;
	bool discard_enabled:1;
	bool discard_passdown:1;
	bool error_if_no_space:1;
};

struct thin_c;
typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
typedef void (*process_cell_fn)(struct thin_c *tc, struct dm_bio_prison_cell *cell);
typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);

#define CELL_SORT_ARRAY_SIZE 8192

struct pool {
	struct list_head list;
	struct dm_target *ti;	/* Only set if a pool target is bound */

	struct mapped_device *pool_md;
	struct block_device *md_dev;
	struct dm_pool_metadata *pmd;

	dm_block_t low_water_blocks;
	uint32_t sectors_per_block;
	int sectors_per_block_shift;

	struct pool_features pf;
	bool low_water_triggered:1;	/* A dm event has been sent */
	bool suspended:1;
	bool out_of_data_space:1;

	struct dm_bio_prison *prison;
	struct dm_kcopyd_client *copier;

	struct workqueue_struct *wq;
	struct throttle throttle;
	struct work_struct worker;
	struct delayed_work waker;
	struct delayed_work no_space_timeout;

	unsigned long last_commit_jiffies;
	unsigned ref_count;

	spinlock_t lock;
	struct bio_list deferred_flush_bios;
	struct list_head prepared_mappings;
	struct list_head prepared_discards;
	struct list_head prepared_discards_pt2;
	struct list_head active_thins;

	struct dm_deferred_set *shared_read_ds;
	struct dm_deferred_set *all_io_ds;

	struct dm_thin_new_mapping *next_mapping;
	mempool_t *mapping_pool;

	process_bio_fn process_bio;
	process_bio_fn process_discard;

	process_cell_fn process_cell;
	process_cell_fn process_discard_cell;

	process_mapping_fn process_prepared_mapping;
	process_mapping_fn process_prepared_discard;
	process_mapping_fn process_prepared_discard_pt2;

	struct dm_bio_prison_cell **cell_sort_array;
};

static enum pool_mode get_pool_mode(struct pool *pool);
static void metadata_operation_failed(struct pool *pool, const char *op, int r);

/*
 * Target context for a pool.
 */
struct pool_c {
	struct dm_target *ti;
	struct pool *pool;
	struct dm_dev *data_dev;
	struct dm_dev *metadata_dev;
	struct dm_target_callbacks callbacks;

	dm_block_t low_water_blocks;
	struct pool_features requested_pf; /* Features requested during table load */
	struct pool_features adjusted_pf;  /* Features used after adjusting for constituent devices */
};

/*
 * Target context for a thin.
 */
struct thin_c {
	struct list_head list;
	struct dm_dev *pool_dev;
	struct dm_dev *origin_dev;
	sector_t origin_size;
	dm_thin_id dev_id;

	struct pool *pool;
	struct dm_thin_device *td;
	struct mapped_device *thin_md;

	bool requeue_mode:1;
	spinlock_t lock;
	struct list_head deferred_cells;
	struct bio_list deferred_bio_list;
	struct bio_list retry_on_resume_list;
	struct rb_root sort_bio_list; /* sorted list of deferred bios */

	/*
	 * Ensures the thin is not destroyed until the worker has finished
	 * iterating the active_thins list.
	 */
	atomic_t refcount;
	struct completion can_destroy;
};

/*----------------------------------------------------------------*/

static bool block_size_is_power_of_two(struct pool *pool)
{
	return pool->sectors_per_block_shift >= 0;
}

static sector_t block_to_sectors(struct pool *pool, dm_block_t b)
{
	return block_size_is_power_of_two(pool) ?
		(b << pool->sectors_per_block_shift) :
		(b * pool->sectors_per_block);
}

/*----------------------------------------------------------------*/

struct discard_op {
	struct thin_c *tc;
	struct blk_plug plug;
	struct bio *parent_bio;
	struct bio *bio;
};

static void begin_discard(struct discard_op *op, struct thin_c *tc, struct bio *parent)
{
	BUG_ON(!parent);

	op->tc = tc;
	blk_start_plug(&op->plug);
	op->parent_bio = parent;
	op->bio = NULL;
}

static int issue_discard(struct discard_op *op, dm_block_t data_b, dm_block_t data_e)
{
	struct thin_c *tc = op->tc;
	sector_t s = block_to_sectors(tc->pool, data_b);
	sector_t len = block_to_sectors(tc->pool, data_e - data_b);

	return __blkdev_issue_discard(tc->pool_dev->bdev, s, len,
				      GFP_NOWAIT, 0, &op->bio);
}

static void end_discard(struct discard_op *op, int r)
{
	if (op->bio) {
		/*
		 * Even if one of the calls to issue_discard failed, we
		 * need to wait for the chain to complete.
		 */
		bio_chain(op->bio, op->parent_bio);
		bio_set_op_attrs(op->bio, REQ_OP_DISCARD, 0);
		submit_bio(op->bio);
	}

	blk_finish_plug(&op->plug);

	/*
	 * Even if r is set, there could be sub discards in flight that we
	 * need to wait for.
	 */
	if (r && !op->parent_bio->bi_error)
		op->parent_bio->bi_error = r;
	bio_endio(op->parent_bio);
}

/*----------------------------------------------------------------*/

/*
 * wake_worker() is used when new work is queued and when pool_resume is
 * ready to continue deferred IO processing.
 */
static void wake_worker(struct pool *pool)
{
	queue_work(pool->wq, &pool->worker);
}

/*----------------------------------------------------------------*/

static int bio_detain(struct pool *pool, struct dm_cell_key *key, struct bio *bio,
		      struct dm_bio_prison_cell **cell_result)
{
	int r;
	struct dm_bio_prison_cell *cell_prealloc;

	/*
	 * Allocate a cell from the prison's mempool.
	 * This might block but it can't fail.
	 */
	cell_prealloc = dm_bio_prison_alloc_cell(pool->prison, GFP_NOIO);

	r = dm_bio_detain(pool->prison, key, bio, cell_prealloc, cell_result);
	if (r)
		/*
		 * We reused an old cell; we can get rid of
		 * the new one.
		 */
		dm_bio_prison_free_cell(pool->prison, cell_prealloc);

	return r;
}

static void cell_release(struct pool *pool,
			 struct dm_bio_prison_cell *cell,
			 struct bio_list *bios)
{
	dm_cell_release(pool->prison, cell, bios);
	dm_bio_prison_free_cell(pool->prison, cell);
}

static void cell_visit_release(struct pool *pool,
			       void (*fn)(void *, struct dm_bio_prison_cell *),
			       void *context,
			       struct dm_bio_prison_cell *cell)
{
	dm_cell_visit_release(pool->prison, fn, context, cell);
	dm_bio_prison_free_cell(pool->prison, cell);
}

static void cell_release_no_holder(struct pool *pool,
				   struct dm_bio_prison_cell *cell,
				   struct bio_list *bios)
{
	dm_cell_release_no_holder(pool->prison, cell, bios);
	dm_bio_prison_free_cell(pool->prison, cell);
}

static void cell_error_with_code(struct pool *pool,
				 struct dm_bio_prison_cell *cell, int error_code)
{
	dm_cell_error(pool->prison, cell, error_code);
	dm_bio_prison_free_cell(pool->prison, cell);
}

static int get_pool_io_error_code(struct pool *pool)
{
	return pool->out_of_data_space ? -ENOSPC : -EIO;
}

static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell)
{
	int error = get_pool_io_error_code(pool);

	cell_error_with_code(pool, cell, error);
}

static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell)
{
	cell_error_with_code(pool, cell, 0);
}

static void cell_requeue(struct pool *pool, struct dm_bio_prison_cell *cell)
{
	cell_error_with_code(pool, cell, DM_ENDIO_REQUEUE);
}

/*----------------------------------------------------------------*/

/*
 * A global list of pools that uses a struct mapped_device as a key.
 */
static struct dm_thin_pool_table {
	struct mutex mutex;
	struct list_head pools;
} dm_thin_pool_table;

static void pool_table_init(void)
{
	mutex_init(&dm_thin_pool_table.mutex);
	INIT_LIST_HEAD(&dm_thin_pool_table.pools);
}

static void __pool_table_insert(struct pool *pool)
{
	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	list_add(&pool->list, &dm_thin_pool_table.pools);
}

static void __pool_table_remove(struct pool *pool)
{
	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	list_del(&pool->list);
}

static struct pool *__pool_table_lookup(struct mapped_device *md)
{
	struct pool *pool = NULL, *tmp;

	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));

	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
		if (tmp->pool_md == md) {
			pool = tmp;
			break;
		}
	}

	return pool;
}

static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
{
	struct pool *pool = NULL, *tmp;

	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));

	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
		if (tmp->md_dev == md_dev) {
			pool = tmp;
			break;
		}
	}

	return pool;
}

/*----------------------------------------------------------------*/

struct dm_thin_endio_hook {
	struct thin_c *tc;
	struct dm_deferred_entry *shared_read_entry;
	struct dm_deferred_entry *all_io_entry;
	struct dm_thin_new_mapping *overwrite_mapping;
	struct rb_node rb_node;
	struct dm_bio_prison_cell *cell;
};

static void __merge_bio_list(struct bio_list *bios, struct bio_list *master)
{
	bio_list_merge(bios, master);
	bio_list_init(master);
}

static void error_bio_list(struct bio_list *bios, int error)
{
	struct bio *bio;

	while ((bio = bio_list_pop(bios))) {
		bio->bi_error = error;
		bio_endio(bio);
	}
}

static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, int error)
{
	struct bio_list bios;
	unsigned long flags;

	bio_list_init(&bios);

	spin_lock_irqsave(&tc->lock, flags);
	__merge_bio_list(&bios, master);
	spin_unlock_irqrestore(&tc->lock, flags);

	error_bio_list(&bios, error);
}

static void requeue_deferred_cells(struct thin_c *tc)
{
	struct pool *pool = tc->pool;
	unsigned long flags;
	struct list_head cells;
	struct dm_bio_prison_cell *cell, *tmp;

	INIT_LIST_HEAD(&cells);

	spin_lock_irqsave(&tc->lock, flags);
	list_splice_init(&tc->deferred_cells, &cells);
	spin_unlock_irqrestore(&tc->lock, flags);

	list_for_each_entry_safe(cell, tmp, &cells, user_list)
		cell_requeue(pool, cell);
}

static void requeue_io(struct thin_c *tc)
{
	struct bio_list bios;
	unsigned long flags;

	bio_list_init(&bios);

	spin_lock_irqsave(&tc->lock, flags);
	__merge_bio_list(&bios, &tc->deferred_bio_list);
	__merge_bio_list(&bios, &tc->retry_on_resume_list);
	spin_unlock_irqrestore(&tc->lock, flags);

	error_bio_list(&bios, DM_ENDIO_REQUEUE);
	requeue_deferred_cells(tc);
}

static void error_retry_list_with_code(struct pool *pool, int error)
{
	struct thin_c *tc;

	rcu_read_lock();
	list_for_each_entry_rcu(tc, &pool->active_thins, list)
		error_thin_bio_list(tc, &tc->retry_on_resume_list, error);
	rcu_read_unlock();
}

static void error_retry_list(struct pool *pool)
{
	int error = get_pool_io_error_code(pool);

	error_retry_list_with_code(pool, error);
}

/*
 * This section of code contains the logic for processing a thin device's IO.
 * Much of the code depends on pool object resources (lists, workqueues, etc)
 * but most is exclusively called from the thin target rather than the thin-pool
 * target.
 */

static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
{
	struct pool *pool = tc->pool;
	sector_t block_nr = bio->bi_iter.bi_sector;

	if (block_size_is_power_of_two(pool))
		block_nr >>= pool->sectors_per_block_shift;
	else
		(void) sector_div(block_nr, pool->sectors_per_block);

	return block_nr;
}

/*
 * Returns the _complete_ blocks that this bio covers.
 */
static void get_bio_block_range(struct thin_c *tc, struct bio *bio,
				dm_block_t *begin, dm_block_t *end)
{
	struct pool *pool = tc->pool;
	sector_t b = bio->bi_iter.bi_sector;
	sector_t e = b + (bio->bi_iter.bi_size >> SECTOR_SHIFT);

	b += pool->sectors_per_block - 1ull; /* so we round up */

	if (block_size_is_power_of_two(pool)) {
		b >>= pool->sectors_per_block_shift;
		e >>= pool->sectors_per_block_shift;
	} else {
		(void) sector_div(b, pool->sectors_per_block);
		(void) sector_div(e, pool->sectors_per_block);
	}

	if (e < b)
		/* Can happen if the bio is within a single block. */
		e = b;

	*begin = b;
	*end = e;
}

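/*
 * Remap a bio to the pool's data device: the data block number is
 * translated into a sector and the bio's offset within the block is
 * preserved.
 */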
static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
{
	struct pool *pool = tc->pool;
	sector_t bi_sector = bio->bi_iter.bi_sector;

	bio->bi_bdev = tc->pool_dev->bdev;
	if (block_size_is_power_of_two(pool))
		bio->bi_iter.bi_sector =
			(block << pool->sectors_per_block_shift) |
			(bi_sector & (pool->sectors_per_block - 1));
	else
		bio->bi_iter.bi_sector = (block * pool->sectors_per_block) +
				 sector_div(bi_sector, pool->sectors_per_block);
}

static void remap_to_origin(struct thin_c *tc, struct bio *bio)
{
	bio->bi_bdev = tc->origin_dev->bdev;
}

static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
{
	return (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)) &&
		dm_thin_changed_this_transaction(tc->td);
}

static void inc_all_io_entry(struct pool *pool, struct bio *bio)
{
	struct dm_thin_endio_hook *h;

	if (bio_op(bio) == REQ_OP_DISCARD)
		return;

	h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
	h->all_io_entry = dm_deferred_entry_inc(pool->all_io_ds);
}

static void issue(struct thin_c *tc, struct bio *bio)
{
	struct pool *pool = tc->pool;
	unsigned long flags;

	if (!bio_triggers_commit(tc, bio)) {
		generic_make_request(bio);
		return;
	}

	/*
	 * Complete bio with an error if earlier I/O caused changes to
	 * the metadata that can't be committed e.g, due to I/O errors
	 * on the metadata device.
	 */
	if (dm_thin_aborted_changes(tc->td)) {
		bio_io_error(bio);
		return;
	}

	/*
	 * Batch together any bios that trigger commits and then issue a
	 * single commit for them in process_deferred_bios().
	 */
	spin_lock_irqsave(&pool->lock, flags);
	bio_list_add(&pool->deferred_flush_bios, bio);
	spin_unlock_irqrestore(&pool->lock, flags);
}

static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
{
	remap_to_origin(tc, bio);
	issue(tc, bio);
}

static void remap_and_issue(struct thin_c *tc, struct bio *bio,
			    dm_block_t block)
{
	remap(tc, bio, block);
	issue(tc, bio);
}

/*----------------------------------------------------------------*/

/*
 * Bio endio functions.
 */
struct dm_thin_new_mapping {
	struct list_head list;

	bool pass_discard:1;
	bool maybe_shared:1;

	/*
	 * Track quiescing, copying and zeroing preparation actions.  When this
	 * counter hits zero the block is prepared and can be inserted into the
	 * btree.
	 */
	atomic_t prepare_actions;

	int err;
	struct thin_c *tc;
	dm_block_t virt_begin, virt_end;
	dm_block_t data_block;
	struct dm_bio_prison_cell *cell;

	/*
	 * If the bio covers the whole area of a block then we can avoid
	 * zeroing or copying.  Instead this bio is hooked.  The bio will
	 * still be in the cell, so care has to be taken to avoid issuing
	 * the bio twice.
	 */
	struct bio *bio;
	bio_end_io_t *saved_bi_end_io;
};

static void __complete_mapping_preparation(struct dm_thin_new_mapping *m)
{
	struct pool *pool = m->tc->pool;

	if (atomic_dec_and_test(&m->prepare_actions)) {
		list_add_tail(&m->list, &pool->prepared_mappings);
		wake_worker(pool);
	}
}

static void complete_mapping_preparation(struct dm_thin_new_mapping *m)
{
	unsigned long flags;
	struct pool *pool = m->tc->pool;

	spin_lock_irqsave(&pool->lock, flags);
	__complete_mapping_preparation(m);
	spin_unlock_irqrestore(&pool->lock, flags);
}

static void copy_complete(int read_err, unsigned long write_err, void *context)
{
	struct dm_thin_new_mapping *m = context;

	m->err = read_err || write_err ? -EIO : 0;
	complete_mapping_preparation(m);
}

static void overwrite_endio(struct bio *bio)
{
	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
	struct dm_thin_new_mapping *m = h->overwrite_mapping;

	bio->bi_end_io = m->saved_bi_end_io;

	m->err = bio->bi_error;
	complete_mapping_preparation(m);
}

/*----------------------------------------------------------------*/

/*
 * Workqueue.
 */

/*
 * Prepared mapping jobs.
 */

/*
 * This sends the bios in the cell, except the original holder, back
 * to the deferred_bios list.
 */
static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{
	struct pool *pool = tc->pool;
	unsigned long flags;

	spin_lock_irqsave(&tc->lock, flags);
	cell_release_no_holder(pool, cell, &tc->deferred_bio_list);
	spin_unlock_irqrestore(&tc->lock, flags);

	wake_worker(pool);
}

static void thin_defer_bio(struct thin_c *tc, struct bio *bio);

struct remap_info {
	struct thin_c *tc;
	struct bio_list defer_bios;
	struct bio_list issue_bios;
};

static void __inc_remap_and_issue_cell(void *context,
				       struct dm_bio_prison_cell *cell)
{
	struct remap_info *info = context;
	struct bio *bio;

	while ((bio = bio_list_pop(&cell->bios))) {
		if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA) ||
		    bio_op(bio) == REQ_OP_DISCARD)
			bio_list_add(&info->defer_bios, bio);
		else {
			inc_all_io_entry(info->tc->pool, bio);

			/*
			 * We can't issue the bios with the bio prison lock
			 * held, so we add them to a list to issue on
			 * return from this function.
			 */
			bio_list_add(&info->issue_bios, bio);
		}
	}
}

static void inc_remap_and_issue_cell(struct thin_c *tc,
				     struct dm_bio_prison_cell *cell,
				     dm_block_t block)
{
	struct bio *bio;
	struct remap_info info;

	info.tc = tc;
	bio_list_init(&info.defer_bios);
	bio_list_init(&info.issue_bios);

	/*
	 * We have to be careful to inc any bios we're about to issue
	 * before the cell is released, and avoid a race with new bios
	 * being added to the cell.
	 */
	cell_visit_release(tc->pool, __inc_remap_and_issue_cell,
			   &info, cell);

	while ((bio = bio_list_pop(&info.defer_bios)))
		thin_defer_bio(tc, bio);

	while ((bio = bio_list_pop(&info.issue_bios)))
		remap_and_issue(info.tc, bio, block);
}

static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
{
	cell_error(m->tc->pool, m->cell);
	list_del(&m->list);
	mempool_free(m, m->tc->pool->mapping_pool);
}

static void process_prepared_mapping(struct dm_thin_new_mapping *m)
{
	struct thin_c *tc = m->tc;
	struct pool *pool = tc->pool;
	struct bio *bio = m->bio;
	int r;

	if (m->err) {
		cell_error(pool, m->cell);
		goto out;
	}

	/*
	 * Commit the prepared block into the mapping btree.
	 * Any I/O for this block arriving after this point will get
	 * remapped to it directly.
	 */
	r = dm_thin_insert_block(tc->td, m->virt_begin, m->data_block);
	if (r) {
		metadata_operation_failed(pool, "dm_thin_insert_block", r);
		cell_error(pool, m->cell);
		goto out;
	}

	/*
	 * Release any bios held while the block was being provisioned.
	 * If we are processing a write bio that completely covers the block,
	 * we already processed it so can ignore it now when processing
	 * the bios in the cell.
	 */
	if (bio) {
		inc_remap_and_issue_cell(tc, m->cell, m->data_block);
		bio_endio(bio);
	} else {
		inc_all_io_entry(tc->pool, m->cell->holder);
		remap_and_issue(tc, m->cell->holder, m->data_block);
		inc_remap_and_issue_cell(tc, m->cell, m->data_block);
	}

out:
	list_del(&m->list);
	mempool_free(m, pool->mapping_pool);
}

/*----------------------------------------------------------------*/

static void free_discard_mapping(struct dm_thin_new_mapping *m)
{
	struct thin_c *tc = m->tc;
	if (m->cell)
		cell_defer_no_holder(tc, m->cell);
	mempool_free(m, tc->pool->mapping_pool);
}

static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
{
	bio_io_error(m->bio);
	free_discard_mapping(m);
}

static void process_prepared_discard_success(struct dm_thin_new_mapping *m)
{
	bio_endio(m->bio);
	free_discard_mapping(m);
}

static void process_prepared_discard_no_passdown(struct dm_thin_new_mapping *m)
{
	int r;
	struct thin_c *tc = m->tc;

	r = dm_thin_remove_range(tc->td, m->cell->key.block_begin, m->cell->key.block_end);
	if (r) {
		metadata_operation_failed(tc->pool, "dm_thin_remove_range", r);
		bio_io_error(m->bio);
	} else
		bio_endio(m->bio);

	cell_defer_no_holder(tc, m->cell);
	mempool_free(m, tc->pool->mapping_pool);
}

/*----------------------------------------------------------------*/

static void passdown_double_checking_shared_status(struct dm_thin_new_mapping *m,
						   struct bio *discard_parent)
{
	/*
	 * We've already unmapped this range of blocks, but before we
	 * passdown we have to check that these blocks are now unused.
	 */
	int r = 0;
	bool used = true;
	struct thin_c *tc = m->tc;
	struct pool *pool = tc->pool;
	dm_block_t b = m->data_block, e, end = m->data_block + m->virt_end - m->virt_begin;
	struct discard_op op;

	begin_discard(&op, tc, discard_parent);
	while (b != end) {
		/* find start of unmapped run */
		for (; b < end; b++) {
			r = dm_pool_block_is_used(pool->pmd, b, &used);
			if (r)
				goto out;

			if (!used)
				break;
		}

		if (b == end)
			break;

		/* find end of run */
		for (e = b + 1; e != end; e++) {
			r = dm_pool_block_is_used(pool->pmd, e, &used);
			if (r)
				goto out;

			if (used)
				break;
		}

		r = issue_discard(&op, b, e);
		if (r)
			goto out;

		b = e;
	}
out:
	end_discard(&op, r);
}

static void queue_passdown_pt2(struct dm_thin_new_mapping *m)
{
	unsigned long flags;
	struct pool *pool = m->tc->pool;

	spin_lock_irqsave(&pool->lock, flags);
	list_add_tail(&m->list, &pool->prepared_discards_pt2);
	spin_unlock_irqrestore(&pool->lock, flags);
	wake_worker(pool);
}

static void passdown_endio(struct bio *bio)
{
	/*
	 * It doesn't matter if the passdown discard failed, we still want
	 * to unmap (we ignore err).
	 */
	queue_passdown_pt2(bio->bi_private);
}

static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
{
	int r;
	struct thin_c *tc = m->tc;
	struct pool *pool = tc->pool;
	struct bio *discard_parent;
	dm_block_t data_end = m->data_block + (m->virt_end - m->virt_begin);

	/*
	 * Only this thread allocates blocks, so we can be sure that the
	 * newly unmapped blocks will not be allocated before the end of
	 * the function.
	 */
	r = dm_thin_remove_range(tc->td, m->virt_begin, m->virt_end);
	if (r) {
		metadata_operation_failed(pool, "dm_thin_remove_range", r);
		bio_io_error(m->bio);
		cell_defer_no_holder(tc, m->cell);
		mempool_free(m, pool->mapping_pool);
		return;
	}

	discard_parent = bio_alloc(GFP_NOIO, 1);
	if (!discard_parent) {
		DMWARN("%s: unable to allocate top level discard bio for passdown. Skipping passdown.",
		       dm_device_name(tc->pool->pool_md));
		queue_passdown_pt2(m);

	} else {
		discard_parent->bi_end_io = passdown_endio;
		discard_parent->bi_private = m;

		if (m->maybe_shared)
			passdown_double_checking_shared_status(m, discard_parent);
		else {
			struct discard_op op;

			begin_discard(&op, tc, discard_parent);
			r = issue_discard(&op, m->data_block, data_end);
			end_discard(&op, r);
		}
	}

	/*
	 * Increment the unmapped blocks.  This prevents a race between the
	 * passdown io and reallocation of freed blocks.
	 */
	r = dm_pool_inc_data_range(pool->pmd, m->data_block, data_end);
	if (r) {
		metadata_operation_failed(pool, "dm_pool_inc_data_range", r);
		bio_io_error(m->bio);
		cell_defer_no_holder(tc, m->cell);
		mempool_free(m, pool->mapping_pool);
		return;
	}
}

static void process_prepared_discard_passdown_pt2(struct dm_thin_new_mapping *m)
{
	int r;
	struct thin_c *tc = m->tc;
	struct pool *pool = tc->pool;

	/*
	 * The passdown has completed, so now we can decrement all those
	 * unmapped blocks.
	 */
	r = dm_pool_dec_data_range(pool->pmd, m->data_block,
				   m->data_block + (m->virt_end - m->virt_begin));
	if (r) {
		metadata_operation_failed(pool, "dm_pool_dec_data_range", r);
		bio_io_error(m->bio);
	} else
		bio_endio(m->bio);

	cell_defer_no_holder(tc, m->cell);
	mempool_free(m, pool->mapping_pool);
}

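/*
 * Take the prepared mappings/discards off the given pool list (under the
 * pool lock) and run each one through the supplied handler.
 */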
static void process_prepared(struct pool *pool, struct list_head *head,
			     process_mapping_fn *fn)
{
	unsigned long flags;
	struct list_head maps;
	struct dm_thin_new_mapping *m, *tmp;

	INIT_LIST_HEAD(&maps);
	spin_lock_irqsave(&pool->lock, flags);
	list_splice_init(head, &maps);
	spin_unlock_irqrestore(&pool->lock, flags);

	list_for_each_entry_safe(m, tmp, &maps, list)
		(*fn)(m);
}

/*
 * Deferred bio jobs.
 */
static int io_overlaps_block(struct pool *pool, struct bio *bio)
{
	return bio->bi_iter.bi_size ==
		(pool->sectors_per_block << SECTOR_SHIFT);
}

static int io_overwrites_block(struct pool *pool, struct bio *bio)
{
	return (bio_data_dir(bio) == WRITE) &&
		io_overlaps_block(pool, bio);
}

static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
			       bio_end_io_t *fn)
{
	*save = bio->bi_end_io;
	bio->bi_end_io = fn;
}

static int ensure_next_mapping(struct pool *pool)
{
	if (pool->next_mapping)
		return 0;

	pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);

	return pool->next_mapping ? 0 : -ENOMEM;
}

static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
{
	struct dm_thin_new_mapping *m = pool->next_mapping;

	BUG_ON(!pool->next_mapping);

	memset(m, 0, sizeof(struct dm_thin_new_mapping));
	INIT_LIST_HEAD(&m->list);
	m->bio = NULL;

	pool->next_mapping = NULL;

	return m;
}

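/*
 * Zero a range of sectors on the pool's data device using dm_kcopyd_zero();
 * completion, or a submission failure, is reported via copy_complete().
 */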
static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m,
		    sector_t begin, sector_t end)
{
	int r;
	struct dm_io_region to;

	to.bdev = tc->pool_dev->bdev;
	to.sector = begin;
	to.count = end - begin;

	r = dm_kcopyd_zero(tc->pool->copier, 1, &to, 0, copy_complete, m);
	if (r < 0) {
		DMERR_LIMIT("dm_kcopyd_zero() failed");
		copy_complete(1, 1, m);
	}
}

static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
				      dm_block_t data_begin,
				      struct dm_thin_new_mapping *m)
{
	struct pool *pool = tc->pool;
	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));

	h->overwrite_mapping = m;
	m->bio = bio;
	save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
	inc_all_io_entry(pool, bio);
	remap_and_issue(tc, bio, data_begin);
}

/*
 * A partial copy also needs to zero the uncopied region.
 */
static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
			  struct dm_dev *origin, dm_block_t data_origin,
			  dm_block_t data_dest,
			  struct dm_bio_prison_cell *cell, struct bio *bio,
			  sector_t len)
{
	int r;
	struct pool *pool = tc->pool;
	struct dm_thin_new_mapping *m = get_next_mapping(pool);

	m->tc = tc;
	m->virt_begin = virt_block;
	m->virt_end = virt_block + 1u;
	m->data_block = data_dest;
	m->cell = cell;

	/*
	 * quiesce action + copy action + an extra reference held for the
	 * duration of this function (we may need to inc later for a
	 * partial zero).
	 */
	atomic_set(&m->prepare_actions, 3);

	if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
		complete_mapping_preparation(m); /* already quiesced */

	/*
	 * IO to pool_dev remaps to the pool target's data_dev.
	 *
	 * If the whole block of data is being overwritten, we can issue the
	 * bio immediately. Otherwise we use kcopyd to clone the data first.
	 */
	if (io_overwrites_block(pool, bio))
		remap_and_issue_overwrite(tc, bio, data_dest, m);
	else {
		struct dm_io_region from, to;

		from.bdev = origin->bdev;
		from.sector = data_origin * pool->sectors_per_block;
		from.count = len;

		to.bdev = tc->pool_dev->bdev;
		to.sector = data_dest * pool->sectors_per_block;
		to.count = len;

		r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
				   0, copy_complete, m);
		if (r < 0) {
			DMERR_LIMIT("dm_kcopyd_copy() failed");
			copy_complete(1, 1, m);

			/*
			 * We allow the zero to be issued, to simplify the
			 * error path.  Otherwise we'd need to start
			 * worrying about decrementing the prepare_actions
			 * counter.
			 */
		}

		/*
		 * Do we need to zero a tail region?
		 */
		if (len < pool->sectors_per_block && pool->pf.zero_new_blocks) {
			atomic_inc(&m->prepare_actions);
			ll_zero(tc, m,
				data_dest * pool->sectors_per_block + len,
				(data_dest + 1) * pool->sectors_per_block);
1318 1319
		}
	}
1320 1321

	complete_mapping_preparation(m); /* drop our ref */
1322 1323
}

1324 1325
static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
				   dm_block_t data_origin, dm_block_t data_dest,
Mike Snitzer's avatar
1327 1328
{
	schedule_copy(tc, virt_block, tc->pool_dev,
1329 1330
		      data_origin, data_dest, cell, bio,
		      tc->pool->sectors_per_block);
1331 1332
}

1333
static void schedule_zero(struct thin_c