/*
 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
 *
 * Uses a block device as cache for other block devices; optimized for SSDs.
 * All allocation is done in buckets, which should match the erase block size
 * of the device.
 *
 * Buckets containing cached data are kept on a heap sorted by priority;
 * bucket priority is increased on cache hit, and periodically all the buckets
 * on the heap have their priority scaled down. This currently is just used as
 * an LRU but in the future should allow for more intelligent heuristics.
 *
 * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
 * counter. Garbage collection is used to remove stale pointers.
 *
 * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
 * as keys are inserted we only sort the pages that have not yet been written.
 * When garbage collection is run, we resort the entire node.
 *
 * All configuration is done via sysfs; see Documentation/bcache.txt.
 */
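
/*
 * Roughly how stale pointers are detected: a pointer embeds the bucket
 * generation it was created with, so a staleness check boils down to a
 * generation comparison along the lines of
 *
 *	stale = gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i));
 *
 * which is what ptr_stale() does; __bch_btree_mark_key() below uses it while
 * marking keys during garbage collection.
 */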

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "extents.h"

#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/hash.h>
#include <linux/kthread.h>
#include <linux/prefetch.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <trace/events/bcache.h>

/*
 * Todo:
 * register_bcache: Return errors out to userspace correctly
 *
 * Writeback: don't undirty key until after a cache flush
 *
 * Create an iterator for key pointers
 *
 * On btree write error, mark bucket such that it won't be freed from the cache
 *
 * Journalling:
 *   Check for bad keys in replay
 *   Propagate barriers
 *   Refcount journal entries in journal_replay
 *
 * Garbage collection:
 *   Finish incremental gc
 *   Gc should free old UUIDs, data for invalid UUIDs
 *
 * Provide a way to list backing device UUIDs we have data cached for, and
 * probably how long it's been since we've seen them, and a way to invalidate
 * dirty data for devices that will never be attached again
 *
 * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so
 * that based on that and how much dirty data we have we can keep writeback
 * from being starved
 *
 * Add a tracepoint or somesuch to watch for writeback starvation
 *
 * When btree depth > 1 and splitting an interior node, we have to make sure
 * alloc_bucket() cannot fail. This should be true but is not completely
 * obvious.
 *
 * Plugging?
 *
 * If data write is less than hard sector size of ssd, round up offset in open
 * bucket to the next whole sector
 *
 * Superblock needs to be fleshed out for multiple cache devices
 *
 * Add a sysfs tunable for the number of writeback IOs in flight
 *
 * Add a sysfs tunable for the number of open data buckets
 *
 * IO tracking: Can we track when one process is doing io on behalf of another?
 * IO tracking: Don't use just an average, weigh more recent stuff higher
 *
 * Test module load/unload
 */

#define MAX_NEED_GC		64
#define MAX_SAVE_PRIO		72

#define PTR_DIRTY_BIT		(((uint64_t) 1 << 36))

#define PTR_HASH(c, k)							\
	(((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))

#define insert_lock(s, b)	((b)->level <= (s)->lock)

/*
 * These macros are for recursing down the btree - they handle the details of
 * locking and looking up nodes in the cache for you. They're best treated as
 * mere syntax when reading code that uses them.
 *
 * op->lock determines whether we take a read or a write lock at a given depth.
 * If you've got a read lock and find that you need a write lock (i.e. you're
 * going to have to split), set op->lock and return -EINTR; btree_root() will
 * call you again and you'll have the correct lock.
 */

/**
 * btree - recurse down the btree on a specified key
 * @fn:		function to call, which will be passed the child node
 * @key:	key to recurse on
 * @b:		parent btree node
 * @op:		pointer to struct btree_op
 */
#define btree(fn, key, b, op, ...)					\
({									\
	int _r, l = (b)->level - 1;					\
	bool _w = l <= (op)->lock;					\
	struct btree *_child = bch_btree_node_get((b)->c, op, key, l,	\
						  _w, b);		\
	if (!IS_ERR(_child)) {						\
		_r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__);	\
		rw_unlock(_w, _child);					\
	} else								\
		_r = PTR_ERR(_child);					\
	_r;								\
})

/**
 * btree_root - call a function on the root of the btree
 * @fn:		function to call, which will be passed the root node
 * @c:		cache set
 * @op:		pointer to struct btree_op
 */
#define btree_root(fn, c, op, ...)					\
({									\
	int _r = -EINTR;						\
	do {								\
		struct btree *_b = (c)->root;				\
		bool _w = insert_lock(op, _b);				\
		rw_lock(_w, _b, _b->level);				\
		if (_b == (c)->root &&					\
		    _w == insert_lock(op, _b)) {			\
			_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);	\
		}							\
		rw_unlock(_w, _b);					\
		bch_cannibalize_unlock(c);				\
		if (_r == -EINTR)					\
			schedule();					\
	} while (_r == -EINTR);						\
									\
	finish_wait(&(c)->btree_cache_wait, &(op)->wait);		\
	_r;								\
})
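
/*
 * A minimal usage sketch of the two macros above (the function name below is
 * hypothetical, not defined in this file): a recursive helper receives the
 * child node, the op, and any extra arguments, and descends via btree():
 *
 *	static int bch_btree_example(struct btree *b, struct btree_op *op)
 *	{
 *		struct btree_iter iter;
 *		struct bkey *k;
 *		int ret = 0;
 *
 *		bch_btree_iter_init(&b->keys, &iter, NULL);
 *
 *		while (!ret &&
 *		       (k = bch_btree_iter_next_filter(&iter, &b->keys,
 *						       bch_ptr_bad)))
 *			if (b->level)
 *				ret = btree(example, k, b, op);
 *
 *		return ret;
 *	}
 *
 * The top level then calls btree_root(example, c, op); whenever a child
 * returns -EINTR (typically after bumping op->lock because it turned out to
 * need a write lock), btree_root() drops its locks, may schedule(), and
 * retries from the current root.
 */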

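/*
 * write_block() points at the first block of the node that has not been
 * written out yet, i.e. where the bset currently being appended to lives.
 */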
static inline struct bset *write_block(struct btree *b)
{
	return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c);
}

static void bch_btree_init_next(struct btree *b)
{
	/* If not a leaf node, always sort */
	if (b->level && b->keys.nsets)
		bch_btree_sort(&b->keys, &b->c->sort);
	else
		bch_btree_sort_lazy(&b->keys, &b->c->sort);

	if (b->written < btree_blocks(b))
		bch_bset_init_next(&b->keys, write_block(b),
				   bset_magic(&b->c->sb));

}

/* Btree key manipulation */

void bkey_put(struct cache_set *c, struct bkey *k)
{
	unsigned i;

	for (i = 0; i < KEY_PTRS(k); i++)
		if (ptr_available(c, k, i))
			atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
}

/* Btree IO */

static uint64_t btree_csum_set(struct btree *b, struct bset *i)
{
	uint64_t crc = b->key.ptr[0];
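	/*
	 * The checksum covers everything past the 8 byte csum field itself,
	 * up to the end of the last key in the set.
	 */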
	void *data = (void *) i + 8, *end = bset_bkey_last(i);

	crc = bch_crc64_update(crc, data, end - data);
	return crc ^ 0xffffffffffffffffULL;
}

void bch_btree_node_read_done(struct btree *b)
{
	const char *err = "bad btree header";
	struct bset *i = btree_bset_first(b);
	struct btree_iter *iter;

	iter = mempool_alloc(b->c->fill_iter, GFP_NOIO);
	iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
	iter->used = 0;

#ifdef CONFIG_BCACHE_DEBUG
	iter->b = &b->keys;
#endif

	if (!i->seq)
		goto err;

	for (;
	     b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq;
	     i = write_block(b)) {
		err = "unsupported bset version";
		if (i->version > BCACHE_BSET_VERSION)
			goto err;

		err = "bad btree header";
		if (b->written + set_blocks(i, block_bytes(b->c)) >
		    btree_blocks(b))
			goto err;

		err = "bad magic";
		if (i->magic != bset_magic(&b->c->sb))
			goto err;

		err = "bad checksum";
		switch (i->version) {
		case 0:
			if (i->csum != csum_set(i))
				goto err;
			break;
		case BCACHE_BSET_VERSION:
			if (i->csum != btree_csum_set(b, i))
				goto err;
			break;
		}

		err = "empty set";
		if (i != b->keys.set[0].data && !i->keys)
			goto err;

		bch_btree_iter_push(iter, i->start, bset_bkey_last(i));

		b->written += set_blocks(i, block_bytes(b->c));
	}

	err = "corrupted btree";
	for (i = write_block(b);
	     bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key);
	     i = ((void *) i) + block_bytes(b->c))
		if (i->seq == b->keys.set[0].data->seq)
			goto err;

	bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort);

	i = b->keys.set[0].data;
	err = "short btree key";
	if (b->keys.set[0].size &&
	    bkey_cmp(&b->key, &b->keys.set[0].end) < 0)
		goto err;

	if (b->written < btree_blocks(b))
		bch_bset_init_next(&b->keys, write_block(b),
				   bset_magic(&b->c->sb));
out:
	mempool_free(iter, b->c->fill_iter);
	return;
err:
	set_btree_node_io_error(b);
	bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys",
			    err, PTR_BUCKET_NR(b->c, &b->key, 0),
			    bset_block_offset(b, i), i->keys);
	goto out;
}

static void btree_node_read_endio(struct bio *bio)
{
	struct closure *cl = bio->bi_private;
	closure_put(cl);
}

static void bch_btree_node_read(struct btree *b)
{
	uint64_t start_time = local_clock();
	struct closure cl;
	struct bio *bio;

	trace_bcache_btree_read(b);

	closure_init_stack(&cl);

	bio = bch_bbio_alloc(b->c);
	bio->bi_iter.bi_size = KEY_SIZE(&b->key) << 9;
	bio->bi_end_io	= btree_node_read_endio;
	bio->bi_private	= &cl;
	bio_set_op_attrs(bio, REQ_OP_READ, REQ_META|READ_SYNC);

	bch_bio_map(bio, b->keys.set[0].data);

	bch_submit_bbio(bio, b->c, &b->key, 0);
	closure_sync(&cl);

	if (bio->bi_error)
		set_btree_node_io_error(b);

	bch_bbio_free(bio, b->c);

	if (btree_node_io_error(b))
		goto err;

	bch_btree_node_read_done(b);
	bch_time_stats_update(&b->c->btree_read_time, start_time);

	return;
err:
	bch_cache_set_error(b->c, "io error reading bucket %zu",
			    PTR_BUCKET_NR(b->c, &b->key, 0));
}

static void btree_complete_write(struct btree *b, struct btree_write *w)
{
	if (w->prio_blocked &&
	    !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
		wake_up_allocators(b->c);

	if (w->journal) {
		atomic_dec_bug(w->journal);
		__closure_wake_up(&b->c->journal.wait);
	}

	w->prio_blocked	= 0;
	w->journal	= NULL;
}

static void btree_node_write_unlock(struct closure *cl)
{
	struct btree *b = container_of(cl, struct btree, io);

	up(&b->io_mutex);
}

static void __btree_node_write_done(struct closure *cl)
{
	struct btree *b = container_of(cl, struct btree, io);
	struct btree_write *w = btree_prev_write(b);

	bch_bbio_free(b->bio, b->c);
	b->bio = NULL;
	btree_complete_write(b, w);

	if (btree_node_dirty(b))
		schedule_delayed_work(&b->work, 30 * HZ);

	closure_return_with_destructor(cl, btree_node_write_unlock);
}

static void btree_node_write_done(struct closure *cl)
{
	struct btree *b = container_of(cl, struct btree, io);
	struct bio_vec *bv;
	int n;

	bio_for_each_segment_all(bv, b->bio, n)
		__free_page(bv->bv_page);

	__btree_node_write_done(cl);
}

static void btree_node_write_endio(struct bio *bio)
{
	struct closure *cl = bio->bi_private;
	struct btree *b = container_of(cl, struct btree, io);

	if (bio->bi_error)
		set_btree_node_io_error(b);

	bch_bbio_count_io_errors(b->c, bio, bio->bi_error, "writing btree");
	closure_put(cl);
}

static void do_btree_node_write(struct btree *b)
{
	struct closure *cl = &b->io;
	struct bset *i = btree_bset_last(b);
	BKEY_PADDED(key) k;

	i->version	= BCACHE_BSET_VERSION;
	i->csum		= btree_csum_set(b, i);

	BUG_ON(b->bio);
	b->bio = bch_bbio_alloc(b->c);

	b->bio->bi_end_io	= btree_node_write_endio;
	b->bio->bi_private	= cl;
	b->bio->bi_iter.bi_size	= roundup(set_bytes(i), block_bytes(b->c));
	bio_set_op_attrs(b->bio, REQ_OP_WRITE, REQ_META|WRITE_SYNC|REQ_FUA);
	bch_bio_map(b->bio, i);

	/*
	 * If we're appending to a leaf node, we don't technically need FUA -
	 * this write just needs to be persisted before the next journal write,
	 * which will be marked FLUSH|FUA.
	 *
	 * Similarly if we're writing a new btree root - the pointer is going to
	 * be in the next journal entry.
	 *
	 * But if we're writing a new btree node (that isn't a root) or
	 * appending to a non leaf btree node, we need either FUA or a flush
	 * when we write the parent with the new pointer. FUA is cheaper than a
	 * flush, and writes appending to leaf nodes aren't blocking anything so
	 * just make all btree node writes FUA to keep things sane.
	 */

	bkey_copy(&k.key, &b->key);
	SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
		       bset_sector_offset(&b->keys, i));

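	/*
	 * Copy the bset into freshly allocated pages before submitting, so
	 * concurrent inserts can't modify the data while the write is in
	 * flight; if the allocation fails, fall back to writing the bset in
	 * place and waiting for the write synchronously.
	 */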
	if (!bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
		int j;
		struct bio_vec *bv;
		void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));

		bio_for_each_segment_all(bv, b->bio, j)
			memcpy(page_address(bv->bv_page),
			       base + j * PAGE_SIZE, PAGE_SIZE);

		bch_submit_bbio(b->bio, b->c, &k.key, 0);

		continue_at(cl, btree_node_write_done, NULL);
	} else {
		b->bio->bi_vcnt = 0;
		bch_bio_map(b->bio, i);

		bch_submit_bbio(b->bio, b->c, &k.key, 0);

		closure_sync(cl);
		continue_at_nobarrier(cl, __btree_node_write_done, NULL);
	}
}

void __bch_btree_node_write(struct btree *b, struct closure *parent)
{
	struct bset *i = btree_bset_last(b);

	lockdep_assert_held(&b->write_lock);

	trace_bcache_btree_write(b);

	BUG_ON(current->bio_list);
	BUG_ON(b->written >= btree_blocks(b));
	BUG_ON(b->written && !i->keys);
	BUG_ON(btree_bset_first(b)->seq != i->seq);
	bch_check_keys(&b->keys, "writing");

	cancel_delayed_work(&b->work);

	/* If caller isn't waiting for write, parent refcount is cache set */
	down(&b->io_mutex);
	closure_init(&b->io, parent ?: &b->c->cl);

	clear_bit(BTREE_NODE_dirty,	 &b->flags);
	change_bit(BTREE_NODE_write_idx, &b->flags);

	do_btree_node_write(b);

	atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size,
			&PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);

	b->written += set_blocks(i, block_bytes(b->c));
}

void bch_btree_node_write(struct btree *b, struct closure *parent)
{
	unsigned nsets = b->keys.nsets;

	lockdep_assert_held(&b->lock);

	__bch_btree_node_write(b, parent);

	/*
	 * do verify if there was more than one set initially (i.e. we did a
	 * sort) and we sorted down to a single set:
	 */
	if (nsets && !b->keys.nsets)
		bch_btree_verify(b);

	bch_btree_init_next(b);
}

static void bch_btree_node_write_sync(struct btree *b)
{
	struct closure cl;

	closure_init_stack(&cl);

	mutex_lock(&b->write_lock);
	bch_btree_node_write(b, &cl);
	mutex_unlock(&b->write_lock);

	closure_sync(&cl);
}

static void btree_node_write_work(struct work_struct *w)
{
	struct btree *b = container_of(to_delayed_work(w), struct btree, work);

	mutex_lock(&b->write_lock);
	if (btree_node_dirty(b))
		__bch_btree_node_write(b, NULL);
	mutex_unlock(&b->write_lock);
}

static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
{
	struct bset *i = btree_bset_last(b);
	struct btree_write *w = btree_current_write(b);

	lockdep_assert_held(&b->write_lock);

	BUG_ON(!b->written);
	BUG_ON(!i->keys);

	if (!btree_node_dirty(b))
		schedule_delayed_work(&b->work, 30 * HZ);

	set_btree_node_dirty(b);

	if (journal_ref) {
		if (w->journal &&
		    journal_pin_cmp(b->c, w->journal, journal_ref)) {
			atomic_dec_bug(w->journal);
			w->journal = NULL;
		}

		if (!w->journal) {
			w->journal = journal_ref;
			atomic_inc(w->journal);
		}
	}

	/* Force write if set is too big */
	if (set_bytes(i) > PAGE_SIZE - 48 &&
	    !current->bio_list)
		bch_btree_node_write(b, NULL);
}

/*
 * Btree in memory cache - allocation/freeing
 * mca -> memory cache
 */

#define mca_reserve(c)	(((c->root && c->root->level)		\
			  ? c->root->level : 1) * 8 + 16)
#define mca_can_free(c)						\
	max_t(int, 0, c->btree_cache_used - mca_reserve(c))

static void mca_data_free(struct btree *b)
{
	BUG_ON(b->io_mutex.count != 1);

	bch_btree_keys_free(&b->keys);

	b->c->btree_cache_used--;
	list_move(&b->list, &b->c->btree_cache_freed);
}

static void mca_bucket_free(struct btree *b)
{
	BUG_ON(btree_node_dirty(b));

	b->key.ptr[0] = 0;
	hlist_del_init_rcu(&b->hash);
	list_move(&b->list, &b->c->btree_cache_freeable);
}

static unsigned btree_order(struct bkey *k)
{
	return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1);
}

static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
{
	if (!bch_btree_keys_alloc(&b->keys,
				  max_t(unsigned,
					ilog2(b->c->btree_pages),
					btree_order(k)),
				  gfp)) {
		b->c->btree_cache_used++;
		list_move(&b->list, &b->c->btree_cache);
	} else {
		list_move(&b->list, &b->c->btree_cache_freed);
	}
}

static struct btree *mca_bucket_alloc(struct cache_set *c,
				      struct bkey *k, gfp_t gfp)
{
	struct btree *b = kzalloc(sizeof(struct btree), gfp);
	if (!b)
		return NULL;

	init_rwsem(&b->lock);
	lockdep_set_novalidate_class(&b->lock);
	mutex_init(&b->write_lock);
	lockdep_set_novalidate_class(&b->write_lock);
	INIT_LIST_HEAD(&b->list);
	INIT_DELAYED_WORK(&b->work, btree_node_write_work);
	b->c = c;
	sema_init(&b->io_mutex, 1);

	mca_data_alloc(b, k, gfp);
	return b;
}

static int mca_reap(struct btree *b, unsigned min_order, bool flush)
{
	struct closure cl;

	closure_init_stack(&cl);
	lockdep_assert_held(&b->c->bucket_lock);

	if (!down_write_trylock(&b->lock))
		return -ENOMEM;

	BUG_ON(btree_node_dirty(b) && !b->keys.set[0].data);

	if (b->keys.page_order < min_order)
		goto out_unlock;

	if (!flush) {
		if (btree_node_dirty(b))
			goto out_unlock;

		if (down_trylock(&b->io_mutex))
			goto out_unlock;
		up(&b->io_mutex);
	}

	mutex_lock(&b->write_lock);
	if (btree_node_dirty(b))
		__bch_btree_node_write(b, &cl);
	mutex_unlock(&b->write_lock);

	closure_sync(&cl);

	/* wait for any in flight btree write */
	down(&b->io_mutex);
	up(&b->io_mutex);

	return 0;
out_unlock:
	rw_unlock(true, b);
	return -ENOMEM;
}

static unsigned long bch_mca_scan(struct shrinker *shrink,
				  struct shrink_control *sc)
{
	struct cache_set *c = container_of(shrink, struct cache_set, shrink);
	struct btree *b, *t;
	unsigned long i, nr = sc->nr_to_scan;
	unsigned long freed = 0;

	if (c->shrinker_disabled)
		return SHRINK_STOP;

	if (c->btree_cache_alloc_lock)
		return SHRINK_STOP;

	/* Return -1 if we can't do anything right now */
	if (sc->gfp_mask & __GFP_IO)
		mutex_lock(&c->bucket_lock);
	else if (!mutex_trylock(&c->bucket_lock))
		return -1;

	/*
	 * It's _really_ critical that we don't free too many btree nodes - we
	 * have to always leave ourselves a reserve. The reserve is how we
	 * guarantee that allocating memory for a new btree node can always
	 * succeed, so that inserting keys into the btree can always succeed and
	 * IO can always make forward progress:
	 */
	nr /= c->btree_pages;
	nr = min_t(unsigned long, nr, mca_can_free(c));

	i = 0;
	list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
		if (freed >= nr)
			break;

		if (++i > 3 &&
		    !mca_reap(b, 0, false)) {
			mca_data_free(b);
			rw_unlock(true, b);
			freed++;
		}
	}

	for (i = 0; (nr--) && i < c->btree_cache_used; i++) {
		if (list_empty(&c->btree_cache))
			goto out;

		b = list_first_entry(&c->btree_cache, struct btree, list);
		list_rotate_left(&c->btree_cache);

		if (!b->accessed &&
		    !mca_reap(b, 0, false)) {
			mca_bucket_free(b);
			mca_data_free(b);
			rw_unlock(true, b);
			freed++;
		} else
			b->accessed = 0;
	}
out:
	mutex_unlock(&c->bucket_lock);
	return freed;
}

static unsigned long bch_mca_count(struct shrinker *shrink,
				   struct shrink_control *sc)
{
	struct cache_set *c = container_of(shrink, struct cache_set, shrink);

	if (c->shrinker_disabled)
		return 0;

	if (c->btree_cache_alloc_lock)
		return 0;

	return mca_can_free(c) * c->btree_pages;
}

void bch_btree_cache_free(struct cache_set *c)
{
	struct btree *b;
	struct closure cl;
	closure_init_stack(&cl);

	if (c->shrink.list.next)
		unregister_shrinker(&c->shrink);

	mutex_lock(&c->bucket_lock);

#ifdef CONFIG_BCACHE_DEBUG
	if (c->verify_data)
		list_move(&c->verify_data->list, &c->btree_cache);

	free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c)));
#endif

	list_splice(&c->btree_cache_freeable,
		    &c->btree_cache);

	while (!list_empty(&c->btree_cache)) {
		b = list_first_entry(&c->btree_cache, struct btree, list);

		if (btree_node_dirty(b))
			btree_complete_write(b, btree_current_write(b));
		clear_bit(BTREE_NODE_dirty, &b->flags);

		mca_data_free(b);
	}

	while (!list_empty(&c->btree_cache_freed)) {
		b = list_first_entry(&c->btree_cache_freed,
				     struct btree, list);
		list_del(&b->list);
		cancel_delayed_work_sync(&b->work);
		kfree(b);
	}

	mutex_unlock(&c->bucket_lock);
}

int bch_btree_cache_alloc(struct cache_set *c)
{
	unsigned i;

	for (i = 0; i < mca_reserve(c); i++)
		if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL))
			return -ENOMEM;

	list_splice_init(&c->btree_cache,
			 &c->btree_cache_freeable);

#ifdef CONFIG_BCACHE_DEBUG
	mutex_init(&c->verify_lock);

	c->verify_ondisk = (void *)
		__get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c)));

	c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);

	if (c->verify_data &&
	    c->verify_data->keys.set->data)
		list_del_init(&c->verify_data->list);
	else
		c->verify_data = NULL;
#endif

	c->shrink.count_objects = bch_mca_count;
	c->shrink.scan_objects = bch_mca_scan;
	c->shrink.seeks = 4;
	c->shrink.batch = c->btree_pages * 2;
	register_shrinker(&c->shrink);

	return 0;
}

/* Btree in memory cache - hash table */

static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k)
{
	return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)];
}

static struct btree *mca_find(struct cache_set *c, struct bkey *k)
{
	struct btree *b;

	rcu_read_lock();
	hlist_for_each_entry_rcu(b, mca_hash(c, k), hash)
		if (PTR_HASH(c, &b->key) == PTR_HASH(c, k))
			goto out;
	b = NULL;
out:
	rcu_read_unlock();
	return b;
}

static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op)
{
	struct task_struct *old;

	old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current);
	if (old && old != current) {
		if (op)
			prepare_to_wait(&c->btree_cache_wait, &op->wait,
					TASK_UNINTERRUPTIBLE);
		return -EINTR;
	}

	return 0;
}

static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op,
				     struct bkey *k)
{
	struct btree *b;

	trace_bcache_btree_cache_cannibalize(c);

	if (mca_cannibalize_lock(c, op))
		return ERR_PTR(-EINTR);

	list_for_each_entry_reverse(b, &c->btree_cache, list)
		if (!mca_reap(b, btree_order(k), false))
			return b;

	list_for_each_entry_reverse(b, &c->btree_cache, list)
		if (!mca_reap(b, btree_order(k), true))
			return b;

	WARN(1, "btree cache cannibalize failed\n");
	return ERR_PTR(-ENOMEM);
}

/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which a
 * cannibalize_bucket() will take. This means every time we unlock the root of
 * the btree, we need to release this lock if we have it held.
 */
static void bch_cannibalize_unlock(struct cache_set *c)
{
	if (c->btree_cache_alloc_lock == current) {
		c->btree_cache_alloc_lock = NULL;
		wake_up(&c->btree_cache_wait);
	}
}

static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op,
			       struct bkey *k, int level)
{
	struct btree *b;

	BUG_ON(current->bio_list);

	lockdep_assert_held(&c->bucket_lock);

	if (mca_find(c, k))
		return NULL;

	/* btree_free() doesn't free memory; it sticks the node on the end of
	 * the list. Check if there are any freed nodes there:
	 */
	list_for_each_entry(b, &c->btree_cache_freeable, list)
		if (!mca_reap(b, btree_order(k), false))
			goto out;

	/* We never free struct btree itself, just the memory that holds the on
	 * disk node. Check the freed list before allocating a new one:
	 */
	list_for_each_entry(b, &c->btree_cache_freed, list)
		if (!mca_reap(b, 0, false)) {
			mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
			if (!b->keys.set[0].data)
				goto err;
			else
				goto out;
		}

	b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO);
	if (!b)
		goto err;

	BUG_ON(!down_write_trylock(&b->lock));
	if (!b->keys.set->data)
		goto err;
out:
	BUG_ON(b->io_mutex.count != 1);

	bkey_copy(&b->key, k);
	list_move(&b->list, &c->btree_cache);
	hlist_del_init_rcu(&b->hash);
	hlist_add_head_rcu(&b->hash, mca_hash(c, k));

	lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
	b->parent	= (void *) ~0UL;
	b->flags	= 0;
	b->written	= 0;
	b->level	= level;

	if (!b->level)
		bch_btree_keys_init(&b->keys, &bch_extent_keys_ops,
				    &b->c->expensive_debug_checks);
	else
		bch_btree_keys_init(&b->keys, &bch_btree_keys_ops,
				    &b->c->expensive_debug_checks);

	return b;
err:
	if (b)
		rw_unlock(true, b);

	b = mca_cannibalize(c, op, k);
	if (!IS_ERR(b))
		goto out;

	return b;
}

/**
 * bch_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * If IO is necessary and running under generic_make_request, returns -EAGAIN.
 *
 * The btree node will have either a read or a write lock held, depending on
 * level and op->lock.
 */
struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
				 struct bkey *k, int level, bool write,
				 struct btree *parent)
{
	int i = 0;
	struct btree *b;

	BUG_ON(level < 0);
retry:
	b = mca_find(c, k);

	if (!b) {
		if (current->bio_list)
			return ERR_PTR(-EAGAIN);

		mutex_lock(&c->bucket_lock);
		b = mca_alloc(c, op, k, level);
		mutex_unlock(&c->bucket_lock);

		if (!b)
			goto retry;
		if (IS_ERR(b))
			return b;

		bch_btree_node_read(b);

		if (!write)
			downgrade_write(&b->lock);
	} else {
		rw_lock(write, b, level);
		if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) {
			rw_unlock(write, b);
			goto retry;
		}
		BUG_ON(b->level != level);
	}

	b->parent = parent;
	b->accessed = 1;

	for (; i <= b->keys.nsets && b->keys.set[i].size; i++) {
		prefetch(b->keys.set[i].tree);
		prefetch(b->keys.set[i].data);
	}

	for (; i <= b->keys.nsets; i++)
		prefetch(b->keys.set[i].data);

	if (btree_node_io_error(b)) {
		rw_unlock(write, b);
		return ERR_PTR(-EIO);
	}

	BUG_ON(!b->written);

	return b;
}

static void btree_node_prefetch(struct btree *parent, struct bkey *k)
{
	struct btree *b;

	mutex_lock(&parent->c->bucket_lock);
	b = mca_alloc(parent->c, NULL, k, parent->level - 1);
	mutex_unlock(&parent->c->bucket_lock);

	if (!IS_ERR_OR_NULL(b)) {
		b->parent = parent;
		bch_btree_node_read(b);
		rw_unlock(true, b);
	}
}

/* Btree alloc */

static void btree_node_free(struct btree *b)
{
	trace_bcache_btree_node_free(b);

	BUG_ON(b == b->c->root);

	mutex_lock(&b->write_lock);

	if (btree_node_dirty(b))
		btree_complete_write(b, btree_current_write(b));
	clear_bit(BTREE_NODE_dirty, &b->flags);

	mutex_unlock(&b->write_lock);

	cancel_delayed_work(&b->work);

	mutex_lock(&b->c->bucket_lock);
	bch_bucket_free(b->c, &b->key);
	mca_bucket_free(b);
	mutex_unlock(&b->c->bucket_lock);
}

struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
				     int level, bool wait,
				     struct btree *parent)
{
	BKEY_PADDED(key) k;
	struct btree *b = ERR_PTR(-EAGAIN);

	mutex_lock(&c->bucket_lock);
retry:
	if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait))
		goto err;

	bkey_put(c, &k.key);
	SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);

	b = mca_alloc(c, op, &k.key, level);
	if (IS_ERR(b))
		goto err_free;

	if (!b) {
		cache_bug(c,
			"Tried to allocate bucket that was in btree cache");
		goto retry;
	}

	b->accessed = 1;
	b->parent = parent;
	bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb));

	mutex_unlock(&c->bucket_lock);

	trace_bcache_btree_node_alloc(b);
	return b;
err_free:
	bch_bucket_free(c, &k.key);
err:
	mutex_unlock(&c->bucket_lock);

	trace_bcache_btree_node_alloc_fail(c);
	return b;
}

static struct btree *bch_btree_node_alloc(struct cache_set *c,
					  struct btree_op *op, int level,
					  struct btree *parent)
{
	return __bch_btree_node_alloc(c, op, level, op != NULL, parent);
}

static struct btree *btree_node_alloc_replacement(struct btree *b,
						  struct btree_op *op)
{
	struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent);
	if (!IS_ERR_OR_NULL(n)) {
		mutex_lock(&n->write_lock);
		bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
		bkey_copy_key(&n->key, &b->key);
		mutex_unlock(&n->write_lock);
	}

	return n;
}

static void make_btree_freeing_key(struct btree *b, struct bkey *k)
{
	unsigned i;

	mutex_lock(&b->c->bucket_lock);

	atomic_inc(&b->c->prio_blocked);

	bkey_copy(k, &b->key);
	bkey_copy_key(k, &ZERO_KEY);

	for (i = 0; i < KEY_PTRS(k); i++)
		SET_PTR_GEN(k, i,
			    bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
					PTR_BUCKET(b->c, &b->key, i)));

	mutex_unlock(&b->c->bucket_lock);
}

static int btree_check_reserve(struct btree *b, struct btree_op *op)
{
	struct cache_set *c = b->c;
	struct cache *ca;
	unsigned i, reserve = (c->root->level - b->level) * 2 + 1;

	mutex_lock(&c->bucket_lock);

	for_each_cache(ca, c, i)
		if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
			if (op)
				prepare_to_wait(&c->btree_cache_wait, &op->wait,
						TASK_UNINTERRUPTIBLE);
			mutex_unlock(&c->bucket_lock);
			return -EINTR;
		}

	mutex_unlock(&c->bucket_lock);

	return mca_cannibalize_lock(b->c, op);
}

/* Garbage collection */

static uint8_t __bch_btree_mark_key(struct cache_set *c, int level,
				    struct bkey *k)
{
	uint8_t stale = 0;
	unsigned i;
	struct bucket *g;

	/*
	 * ptr_invalid() can't return true for the keys that mark btree nodes as
	 * freed, but since ptr_bad() returns true we'll never actually use them
	 * for anything and thus we don't want to mark their pointers here
	 */
	if (!bkey_cmp(k, &ZERO_KEY))
		return stale;

	for (i = 0; i < KEY_PTRS(k); i++) {
		if (!ptr_available(c, k, i))
			continue;

		g = PTR_BUCKET(c, k, i);

		if (gen_after(g->last_gc, PTR_GEN(k, i)))
			g->last_gc = PTR_GEN(k, i);

		if (ptr_stale(c, k, i)) {
			stale = max(stale, ptr_stale(c, k, i));
			continue;
		}

		cache_bug_on(GC_MARK(g) &&
			     (GC_MARK(g) == GC_MARK_METADATA) != (level != 0),
			     c, "inconsistent ptrs: mark = %llu, level = %i",
			     GC_MARK(g), level);

		if (level)
			SET_GC_MARK(g, GC_MARK_METADATA);
		else if (KEY_DIRTY(k))
			SET_GC_MARK(g, GC_MARK_DIRTY);
		else if (!GC_MARK(g))
			SET_GC_MARK(g, GC_MARK_RECLAIMABLE);

		/* guard against overflow */
		SET_GC_SECTORS_USED(g, min_t(unsigned,
					     GC_SECTORS_USED(g) + KEY_SIZE(k),
					     MAX_GC_SECTORS_USED));

		BUG_ON(!GC_SECTORS_USED(g));
	}

	return stale;
}

#define btree_mark_key(b, k)	__bch_btree_mark_key(b->c, b->level, k)

void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k)
{
	unsigned i;

	for (i = 0; i < KEY_PTRS(k); i++)
		if (ptr_available(c, k, i) &&
		    !ptr_stale(c,