#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/bio.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/cleancache.h>
#include "extent_io.h"
#include "extent_map.h"
#include "compat.h"
#include "ctree.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "check-integrity.h"
#include "locking.h"
#include "rcu-string.h"

static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;

static LIST_HEAD(buffers);
static LIST_HEAD(states);

#define LEAK_DEBUG 0
#if LEAK_DEBUG
static DEFINE_SPINLOCK(leak_lock);
#endif

#define BUFFER_LRU_MAX 64

struct tree_entry {
	u64 start;
	u64 end;
	struct rb_node rb_node;
};

struct extent_page_data {
	struct bio *bio;
	struct extent_io_tree *tree;
	get_extent_t *get_extent;

	/* tells writepage not to lock the state bits for this range
	 * it still does the unlocking
	 */
	unsigned int extent_locked:1;

	/* tells the submit_bio code to use a WRITE_SYNC */
	unsigned int sync_io:1;
};

static noinline void flush_write_bio(void *data);
static inline struct btrfs_fs_info *
tree_fs_info(struct extent_io_tree *tree)
{
	return btrfs_sb(tree->mapping->host->i_sb);
}

int __init extent_io_init(void)
{
	extent_state_cache = kmem_cache_create("extent_state",
			sizeof(struct extent_state), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
	if (!extent_state_cache)
		return -ENOMEM;

	extent_buffer_cache = kmem_cache_create("extent_buffers",
			sizeof(struct extent_buffer), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
	if (!extent_buffer_cache)
		goto free_state_cache;
	return 0;

free_state_cache:
	kmem_cache_destroy(extent_state_cache);
	return -ENOMEM;
}

void extent_io_exit(void)
{
	struct extent_state *state;
	struct extent_buffer *eb;

	while (!list_empty(&states)) {
		state = list_entry(states.next, struct extent_state, leak_list);
		printk(KERN_ERR "btrfs state leak: start %llu end %llu "
		       "state %lu in tree %p refs %d\n",
		       (unsigned long long)state->start,
		       (unsigned long long)state->end,
		       state->state, state->tree, atomic_read(&state->refs));
		list_del(&state->leak_list);
		kmem_cache_free(extent_state_cache, state);

	}

	while (!list_empty(&buffers)) {
		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
		printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
		       "refs %d\n", (unsigned long long)eb->start,
		       eb->len, atomic_read(&eb->refs));
		list_del(&eb->leak_list);
		kmem_cache_free(extent_buffer_cache, eb);
	}
	if (extent_state_cache)
		kmem_cache_destroy(extent_state_cache);
	if (extent_buffer_cache)
		kmem_cache_destroy(extent_buffer_cache);
}

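/*
 * Illustrative usage (hypothetical call site, not from this file): callers
 * embed an extent_io_tree in their in-memory inode and point it at the
 * inode's address_space, roughly:
 *
 *	extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode->i_mapping);
 */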
void extent_io_tree_init(struct extent_io_tree *tree,
			 struct address_space *mapping)
{
	tree->state = RB_ROOT;
	INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
	tree->ops = NULL;
	tree->dirty_bytes = 0;
	spin_lock_init(&tree->lock);
	spin_lock_init(&tree->buffer_lock);
	tree->mapping = mapping;
}

static struct extent_state *alloc_extent_state(gfp_t mask)
{
	struct extent_state *state;
#if LEAK_DEBUG
	unsigned long flags;
#endif

	state = kmem_cache_alloc(extent_state_cache, mask);
	if (!state)
		return state;
	state->state = 0;
	state->private = 0;
	state->tree = NULL;
#if LEAK_DEBUG
	spin_lock_irqsave(&leak_lock, flags);
	list_add(&state->leak_list, &states);
	spin_unlock_irqrestore(&leak_lock, flags);
#endif
	atomic_set(&state->refs, 1);
	init_waitqueue_head(&state->wq);
	trace_alloc_extent_state(state, mask, _RET_IP_);
	return state;
}

void free_extent_state(struct extent_state *state)
{
	if (!state)
		return;
	if (atomic_dec_and_test(&state->refs)) {
#if LEAK_DEBUG
		unsigned long flags;
#endif
		WARN_ON(state->tree);
#if LEAK_DEBUG
		spin_lock_irqsave(&leak_lock, flags);
		list_del(&state->leak_list);
		spin_unlock_irqrestore(&leak_lock, flags);
#endif
		trace_free_extent_state(state, _RET_IP_);
		kmem_cache_free(extent_state_cache, state);
	}
}

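/*
 * Insert 'node' into the rb-tree rooted at 'root', keyed by 'offset'.
 * Returns NULL on success.  If an existing entry already covers 'offset'
 * (offset falls inside its [start, end] range), nothing is inserted and
 * that entry's rb_node is returned instead.
 */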
static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
				   struct rb_node *node)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct tree_entry *entry;

	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct tree_entry, rb_node);

		if (offset < entry->start)
			p = &(*p)->rb_left;
		else if (offset > entry->end)
			p = &(*p)->rb_right;
		else
			return parent;
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
				     struct rb_node **prev_ret,
				     struct rb_node **next_ret)
{
	struct rb_root *root = &tree->state;
	struct rb_node *n = root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *orig_prev = NULL;
	struct tree_entry *entry;
	struct tree_entry *prev_entry = NULL;

	while (n) {
		entry = rb_entry(n, struct tree_entry, rb_node);
		prev = n;
		prev_entry = entry;

		if (offset < entry->start)
			n = n->rb_left;
		else if (offset > entry->end)
			n = n->rb_right;
		else
			return n;
	}

	if (prev_ret) {
		orig_prev = prev;
		while (prev && offset > prev_entry->end) {
			prev = rb_next(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*prev_ret = prev;
		prev = orig_prev;
	}

	if (next_ret) {
		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		while (prev && offset < prev_entry->start) {
			prev = rb_prev(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*next_ret = prev;
	}
	return NULL;
}

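/*
 * Simplified front end for __etree_search(): return the extent_state node
 * that contains 'offset', or, if there is no exact match, the first node
 * that ends at or after 'offset' (the callers describe this as "the
 * extents that end after our range starts").  Returns NULL when no such
 * node exists.
 */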
static inline struct rb_node *tree_search(struct extent_io_tree *tree,
					  u64 offset)
{
	struct rb_node *prev = NULL;
	struct rb_node *ret;

	ret = __etree_search(tree, offset, &prev, NULL);
	if (!ret)
		return prev;
	return ret;
}

static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
		     struct extent_state *other)
{
	if (tree->ops && tree->ops->merge_extent_hook)
		tree->ops->merge_extent_hook(tree->mapping->host, new,
					     other);
}

/*
 * utility function to look for merge candidates inside a given range.
 * Any extents with matching state are merged together into a single
 * extent in the tree.  Extents with EXTENT_IO in their state field
 * are not merged because the end_io handlers need to be able to do
 * operations on them without sleeping (or doing allocations/splits).
 *
 * This should be called with the tree lock held.
 */
static void merge_state(struct extent_io_tree *tree,
		        struct extent_state *state)
{
	struct extent_state *other;
	struct rb_node *other_node;

	if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
		return;

	other_node = rb_prev(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->end == state->start - 1 &&
		    other->state == state->state) {
			merge_cb(tree, state, other);
			state->start = other->start;
			other->tree = NULL;
			rb_erase(&other->rb_node, &tree->state);
			free_extent_state(other);
		}
	}
	other_node = rb_next(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->start == state->end + 1 &&
		    other->state == state->state) {
			merge_cb(tree, state, other);
			state->end = other->end;
			other->tree = NULL;
			rb_erase(&other->rb_node, &tree->state);
			free_extent_state(other);
		}
	}
}

static void set_state_cb(struct extent_io_tree *tree,
			 struct extent_state *state, int *bits)
{
	if (tree->ops && tree->ops->set_bit_hook)
		tree->ops->set_bit_hook(tree->mapping->host, state, bits);
}

static void clear_state_cb(struct extent_io_tree *tree,
			   struct extent_state *state, int *bits)
{
	if (tree->ops && tree->ops->clear_bit_hook)
		tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
}

static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state, int *bits);

/*
 * insert an extent_state struct into the tree.  'bits' are set on the
 * struct before it is inserted.
 *
 * This may return -EEXIST if the extent is already there, in which case the
 * state struct is freed.
 *
 * The tree lock is not taken internally.  This is a utility function and
 * probably isn't what you want to call (see set/clear_extent_bit).
 */
static int insert_state(struct extent_io_tree *tree,
			struct extent_state *state, u64 start, u64 end,
			int *bits)
{
	struct rb_node *node;

	if (end < start) {
		printk(KERN_ERR "btrfs end < start %llu %llu\n",
		       (unsigned long long)end,
		       (unsigned long long)start);
		WARN_ON(1);
	}
	state->start = start;
	state->end = end;

	set_state_bits(tree, state, bits);

	node = tree_insert(&tree->state, end, &state->rb_node);
	if (node) {
		struct extent_state *found;
		found = rb_entry(node, struct extent_state, rb_node);
		printk(KERN_ERR "btrfs found node %llu %llu on insert of "
		       "%llu %llu\n", (unsigned long long)found->start,
		       (unsigned long long)found->end,
		       (unsigned long long)start, (unsigned long long)end);
		return -EEXIST;
	}
	state->tree = tree;
	merge_state(tree, state);
	return 0;
}

static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
		     u64 split)
{
	if (tree->ops && tree->ops->split_extent_hook)
		tree->ops->split_extent_hook(tree->mapping->host, orig, split);
}

/*
 * split a given extent state struct in two, inserting the preallocated
 * struct 'prealloc' as the newly created second half.  'split' indicates an
 * offset inside 'orig' where it should be split.
 *
 * Before calling,
 * the tree has 'orig' at [orig->start, orig->end].  After calling, there
 * are two extent state structs in the tree:
 * prealloc: [orig->start, split - 1]
 * orig: [ split, orig->end ]
 *
 * The tree locks are not taken by this function. They need to be held
 * by the caller.
 */
static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
		       struct extent_state *prealloc, u64 split)
{
	struct rb_node *node;

	split_cb(tree, orig, split);

	prealloc->start = orig->start;
	prealloc->end = split - 1;
	prealloc->state = orig->state;
	orig->start = split;

	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
	if (node) {
		free_extent_state(prealloc);
		return -EEXIST;
	}
	prealloc->tree = tree;
	return 0;
}

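/* return the state record that follows 'state' in the tree, or NULL */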
static struct extent_state *next_state(struct extent_state *state)
{
	struct rb_node *next = rb_next(&state->rb_node);
	if (next)
		return rb_entry(next, struct extent_state, rb_node);
	else
		return NULL;
}

/*
 * utility function to clear some bits in an extent state struct.
 * it will optionally wake up any one waiting on this state (wake == 1).
 *
 * If no bits are set on the state struct after clearing things, the
 * struct is freed and removed from the tree
 */
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
					    struct extent_state *state,
					    int *bits, int wake)
{
	struct extent_state *next;
	int bits_to_clear = *bits & ~EXTENT_CTLBITS;

	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		WARN_ON(range > tree->dirty_bytes);
		tree->dirty_bytes -= range;
	}
	clear_state_cb(tree, state, bits);
	state->state &= ~bits_to_clear;
	if (wake)
		wake_up(&state->wq);
	if (state->state == 0) {
		next = next_state(state);
		if (state->tree) {
			rb_erase(&state->rb_node, &tree->state);
			state->tree = NULL;
			free_extent_state(state);
		} else {
			WARN_ON(1);
		}
	} else {
		merge_state(tree, state);
		next = next_state(state);
	}
	return next;
}

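/*
 * Make sure we have a preallocated extent_state to hand to split/insert.
 * GFP_ATOMIC is used because the callers already hold the tree spinlock
 * at this point.
 */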
static struct extent_state *
alloc_extent_state_atomic(struct extent_state *prealloc)
{
	if (!prealloc)
		prealloc = alloc_extent_state(GFP_ATOMIC);

	return prealloc;
}
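
/*
 * The extent tree was modified underneath us even though we held the tree
 * lock; that is an unrecoverable inconsistency, so report it through
 * btrfs_panic().
 */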

462
463
464
465
466
467
468
void extent_io_tree_panic(struct extent_io_tree *tree, int err)
{
	btrfs_panic(tree_fs_info(tree), err, "Locking error: "
		    "Extent tree was modified by another "
		    "thread while locked.");
}

/*
 * clear some bits on a range in the tree.  This may require splitting
 * or inserting elements in the tree, so the gfp mask is used to
 * indicate which allocations or sleeping are allowed.
 *
 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 * the given range from the tree regardless of state (ie for truncate).
 *
 * the range [start, end] is inclusive.
 *
 * This takes the tree lock, and returns 0 on success and < 0 on error.
 */
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		     int bits, int wake, int delete,
		     struct extent_state **cached_state,
		     gfp_t mask)
{
	struct extent_state *state;
	struct extent_state *cached;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	u64 last_end;
	int err;
	int clear = 0;

	if (delete)
		bits |= ~EXTENT_CTLBITS;
	bits |= EXTENT_FIRST_DELALLOC;

	if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
		clear = 1;
again:
	if (!prealloc && (mask & __GFP_WAIT)) {
		prealloc = alloc_extent_state(mask);
		if (!prealloc)
			return -ENOMEM;
	}

	spin_lock(&tree->lock);
	if (cached_state) {
		cached = *cached_state;

		if (clear) {
			*cached_state = NULL;
			cached_state = NULL;
		}

		if (cached && cached->tree && cached->start <= start &&
		    cached->end > start) {
			if (clear)
				atomic_dec(&cached->refs);
			state = cached;
			goto hit_next;
		}
		if (clear)
			free_extent_state(cached);
	}
	/*
	 * this search will find the extents that end after
	 * our range starts
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	if (state->start > end)
		goto out;
	WARN_ON(state->end < start);
	last_end = state->end;

	/* the state doesn't have the wanted bits, go ahead */
	if (!(state->state & bits)) {
		state = next_state(state);
		goto next;
	}

	/*
	 *     | ---- desired range ---- |
	 *  | state | or
	 *  | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip
	 * bits on second half.
	 *
	 * If the extent we found extends past our range, we
	 * just split and search again.  It'll get split again
	 * the next time though.
	 *
	 * If the extent we found is inside our range, we clear
	 * the desired bit on it.
	 */

	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			state = clear_state_bit(tree, state, &bits, wake);
			goto next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and clear the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		if (wake)
			wake_up(&state->wq);

		clear_state_bit(tree, prealloc, &bits, wake);

		prealloc = NULL;
		goto out;
	}

	state = clear_state_bit(tree, state, &bits, wake);
next:
	if (last_end == (u64)-1)
		goto out;
	start = last_end + 1;
	if (start <= end && state && !need_resched())
		goto hit_next;
	goto search_again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return 0;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (mask & __GFP_WAIT)
		cond_resched();
	goto again;
}
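
/*
 * Illustrative only (not part of the original file): truncate-style callers
 * typically drop every bit in a range by passing wake == 1 and delete == 1,
 * for example
 *
 *	clear_extent_bit(tree, start, end,
 *			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
 *			 1, 1, &cached_state, GFP_NOFS);
 *
 * while the unlock helpers clear only EXTENT_LOCKED (wake == 1, delete == 0)
 * so the remaining state records survive.
 */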

static void wait_on_state(struct extent_io_tree *tree,
			  struct extent_state *state)
		__releases(tree->lock)
		__acquires(tree->lock)
{
	DEFINE_WAIT(wait);
	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
	spin_unlock(&tree->lock);
	schedule();
	spin_lock(&tree->lock);
	finish_wait(&state->wq, &wait);
}

/*
 * waits for one or more bits to clear on a range in the state tree.
 * The range [start, end] is inclusive.
 * The tree lock is taken by this function
 */
void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
{
	struct extent_state *state;
	struct rb_node *node;

	spin_lock(&tree->lock);
again:
	while (1) {
		/*
		 * this search will find all the extents that end after
		 * our range starts
		 */
		node = tree_search(tree, start);
		if (!node)
			break;

		state = rb_entry(node, struct extent_state, rb_node);

		if (state->start > end)
			goto out;

		if (state->state & bits) {
			start = state->start;
			atomic_inc(&state->refs);
			wait_on_state(tree, state);
			free_extent_state(state);
			goto again;
		}
		start = state->end + 1;

		if (start > end)
			break;

		cond_resched_lock(&tree->lock);
	}
out:
	spin_unlock(&tree->lock);
}

static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state,
			   int *bits)
{
	int bits_to_set = *bits & ~EXTENT_CTLBITS;

	set_state_cb(tree, state, bits);
	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		tree->dirty_bytes += range;
	}
	state->state |= bits_to_set;
}

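/*
 * Remember 'state' in *cached_ptr (taking a reference) so that a later
 * set/clear call can skip the rb-tree search.  Only EXTENT_IOBITS or
 * EXTENT_BOUNDARY states are cached, since anything else may be merged
 * away before the caller comes back.
 */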
static void cache_state(struct extent_state *state,
			struct extent_state **cached_ptr)
{
	if (cached_ptr && !(*cached_ptr)) {
		if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
			*cached_ptr = state;
			atomic_inc(&state->refs);
		}
	}
}

static void uncache_state(struct extent_state **cached_ptr)
{
	if (cached_ptr && (*cached_ptr)) {
		struct extent_state *state = *cached_ptr;
		*cached_ptr = NULL;
		free_extent_state(state);
	}
}

/*
 * set some bits on a range in the tree.  This may require allocations or
 * sleeping, so the gfp mask is used to indicate what is allowed.
 *
 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 * part of the range already has the desired bits set.  The start of the
 * existing range is returned in failed_start in this case.
 *
 * [start, end] is inclusive This takes the tree lock.
 */

static int __must_check
__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		 int bits, int exclusive_bits, u64 *failed_start,
		 struct extent_state **cached_state, gfp_t mask)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	int err = 0;
	u64 last_start;
	u64 last_end;

	bits |= EXTENT_FIRST_DELALLOC;
again:
	if (!prealloc && (mask & __GFP_WAIT)) {
		prealloc = alloc_extent_state(mask);
		BUG_ON(!prealloc);
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start <= start && state->end > start &&
		    state->tree) {
			node = &state->rb_node;
			goto hit_next;
		}
	}
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = insert_state(tree, prealloc, start, end, &bits);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		if (state->state & exclusive_bits) {
			*failed_start = state->start;
			err = -EEXIST;
			goto out;
		}

		set_state_bits(tree, state, &bits);
		cache_state(state, cached_state);
		merge_state(tree, state);
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		state = next_state(state);
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again.  It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, &bits);
			cache_state(state, cached_state);
			merge_state(tree, state);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			state = next_state(state);
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);

		/*
		 * Avoid to free 'prealloc' if it can be merged with
		 * the later extent.
		 */
		err = insert_state(tree, prealloc, start, this_end,
				   &bits);
		if (err)
			extent_io_tree_panic(tree, err);

		cache_state(prealloc, cached_state);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		set_state_bits(tree, prealloc, &bits);
		cache_state(prealloc, cached_state);
		merge_state(tree, prealloc);
		prealloc = NULL;
		goto out;
	}

	goto search_again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (mask & __GFP_WAIT)
		cond_resched();
	goto again;
}

int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
		   u64 *failed_start, struct extent_state **cached_state,
		   gfp_t mask)
{
	return __set_extent_bit(tree, start, end, bits, 0, failed_start,
				cached_state, mask);
}


/**
 * convert_extent_bit - convert all bits in a given range from one bit to another
 * @tree:	the io tree to search
 * @start:	the start offset in bytes
 * @end:	the end offset in bytes (inclusive)
 * @bits:	the bits to set in this range
 * @clear_bits:	the bits to clear in this range
 * @mask:	the allocation mask
 *
 * This will go through and set bits for the given range.  If any states exist
 * already in this range they are set with the given bit and cleared of the
 * clear_bits.  This is only meant to be used by things that are mergeable, ie
 * converting from say DELALLOC to DIRTY.  This is not meant to be used with
 * boundary bits like LOCK.
 */
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		       int bits, int clear_bits, gfp_t mask)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	int err = 0;
	u64 last_start;
	u64 last_end;

again:
	if (!prealloc && (mask & __GFP_WAIT)) {
		prealloc = alloc_extent_state(mask);
		if (!prealloc)
			return -ENOMEM;
	}

	spin_lock(&tree->lock);
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		err = insert_state(tree, prealloc, start, end, &bits);
		prealloc = NULL;
		if (err)
			extent_io_tree_panic(tree, err);
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		set_state_bits(tree, state, &bits);
		state = clear_state_bit(tree, state, &clear_bits, 0);
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again.  It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, &bits);
			state = clear_state_bit(tree, state, &clear_bits, 0);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		/*
		 * Avoid to free 'prealloc' if it can be merged with
		 * the later extent.
		 */
		err = insert_state(tree, prealloc, start, this_end,
				   &bits);
		if (err)
			extent_io_tree_panic(tree, err);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		set_state_bits(tree, prealloc, &bits);
		clear_state_bit(tree, prealloc, &clear_bits, 0);
		prealloc = NULL;
		goto out;
	}

	goto search_again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (mask & __GFP_WAIT)
		cond_resched();
	goto again;
}

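/*
 * Illustrative only: following the comment above, a caller that wants to
 * move a range from delalloc tracking to plain dirty tracking could use
 *
 *	convert_extent_bit(tree, start, end, EXTENT_DIRTY,
 *			   EXTENT_DELALLOC, GFP_NOFS);
 *
 * Every state record overlapping [start, end] ends up with EXTENT_DIRTY set
 * and EXTENT_DELALLOC cleared, with records split at the range boundaries
 * as needed.
 */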
/* wrappers around set/clear extent bit */
int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
		     gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
			      NULL, mask);
}

int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		    int bits, gfp_t mask)
{
	return set_extent_bit(tree, start, end, bits, NULL,
			      NULL, mask);
}

int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		      int bits, gfp_t mask)
{
	return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
}

int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
			struct extent_state **cached_state, gfp_t mask)
{
	return set_extent_bit(tree, start, end,
			      EXTENT_DELALLOC | EXTENT_UPTODATE,
			      NULL, cached_state, mask);
}

int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
		       gfp_t mask)
{
	return clear_extent_bit(tree, start, end,
				EXTENT_DIRTY | EXTENT_DELALLOC |
				EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
}

int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
		     gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_NEW, NULL,
			      NULL, mask);
}

int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
			struct extent_state **cached_state, gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
			      cached_state, mask);
}

int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
			  struct extent_state **cached_state, gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
				cached_state, mask);
}

/*
 * either insert or lock state struct between start and end use mask to tell
 * us if waiting is desired.
 */
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		     int bits, struct extent_state **cached_state)
{
	int err;
	u64 failed_start;
	while (1) {
		err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
				       EXTENT_LOCKED, &failed_start,
				       cached_state, GFP_NOFS);
		if (err == -EEXIST) {
			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
			start = failed_start;
		} else
			break;
		WARN_ON(start > end);
	}
	return err;
}

int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
	return lock_extent_bits(tree, start, end, 0, NULL);
}

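/*
 * Illustrative only (hypothetical caller, not from this file): a typical
 * read or write path locks the byte range covering a page, does its work,
 * and then drops the lock again:
 *
 *	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
 *	u64 start = page_offset(page);
 *	u64 end = start + PAGE_CACHE_SIZE - 1;
 *
 *	lock_extent(tree, start, end);
 *	... read or write the range ...
 *	unlock_extent(tree, start, end);
 *
 * unlock_extent() is defined later in this file and simply clears
 * EXTENT_LOCKED with wake == 1.
 */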