ctree.c 109 KB
Newer Older
Chris Mason's avatar
Chris Mason committed
1
/*
Chris Mason's avatar
Chris Mason committed
2
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
Chris Mason's avatar
Chris Mason committed
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

19
#include <linux/sched.h>
20
21
#include "ctree.h"
#include "disk-io.h"
22
#include "transaction.h"
23
#include "print-tree.h"
24
#include "locking.h"
25

26
27
28
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
29
		      *root, struct btrfs_key *ins_key,
30
		      struct btrfs_path *path, int data_size, int extend);
31
32
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *dst,
33
			  struct extent_buffer *src, int empty);
34
35
36
37
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
38
39
static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		   struct btrfs_path *path, int level, int slot);
40

41
struct btrfs_path *btrfs_alloc_path(void)
Chris Mason's avatar
Chris Mason committed
42
{
43
	struct btrfs_path *path;
Jeff Mahoney's avatar
Jeff Mahoney committed
44
45
	path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
	if (path)
46
		path->reada = 1;
47
	return path;
Chris Mason's avatar
Chris Mason committed
48
49
}

50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
/*
 * convert every lock held by this path into a blocking lock.  Must be
 * done before the holder can schedule (spinning locks may not be held
 * across a context switch).
 */
noinline void btrfs_set_path_blocking(struct btrfs_path *p)
{
	int level;

	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
		if (p->nodes[level] && p->locks[level])
			btrfs_set_lock_blocking(p->nodes[level]);
	}
}

/*
 * reset all the locked nodes in the patch to spinning locks.
65
66
67
68
69
 *
 * held is used to keep lockdep happy, when lockdep is enabled
 * we set held to a blocking lock before we go around and
 * retake all the spinlocks in the path.  You can safely use NULL
 * for held
70
 */
71
72
noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
					struct extent_buffer *held)
73
74
{
	int i;
75
76
77
78
79
80
81
82
83
84
85
86
87
88

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	/* lockdep really cares that we take all of these spinlocks
	 * in the right order.  If any of the locks in the path are not
	 * currently blocking, it is going to complain.  So, make really
	 * really sure by forcing the path to blocking before we clear
	 * the path blocking.
	 */
	if (held)
		btrfs_set_lock_blocking(held);
	btrfs_set_path_blocking(p);
#endif

	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
89
90
91
		if (p->nodes[i] && p->locks[i])
			btrfs_clear_lock_blocking(p->nodes[i]);
	}
92
93
94
95
96

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	if (held)
		btrfs_clear_lock_blocking(held);
#endif
97
98
}

Chris Mason's avatar
Chris Mason committed
99
/* this also releases the path */
100
void btrfs_free_path(struct btrfs_path *p)
101
{
102
103
	btrfs_release_path(NULL, p);
	kmem_cache_free(btrfs_path_cachep, p);
104
105
}

Chris Mason's avatar
Chris Mason committed
106
107
108
109
110
111
/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that no locks or extent buffers held.
 */
112
noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
113
114
{
	int i;
115

116
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
117
		p->slots[i] = 0;
118
		if (!p->nodes[i])
119
120
121
122
123
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock(p->nodes[i]);
			p->locks[i] = 0;
		}
124
		free_extent_buffer(p->nodes[i]);
125
		p->nodes[i] = NULL;
126
127
128
	}
}

Chris Mason's avatar
Chris Mason committed
129
130
131
132
133
134
135
136
137
138
/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
139
140
141
142
143
144
145
146
147
148
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;
	spin_lock(&root->node_lock);
	eb = root->node;
	extent_buffer_get(eb);
	spin_unlock(&root->node_lock);
	return eb;
}

Chris Mason's avatar
Chris Mason committed
149
150
151
152
/* loop around taking references on and locking the root node of the
 * tree until you end up with a lock on the root.  A locked buffer
 * is returned, with a reference held.
 */
153
154
155
156
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

157
	while (1) {
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
		eb = btrfs_root_node(root);
		btrfs_tree_lock(eb);

		spin_lock(&root->node_lock);
		if (eb == root->node) {
			spin_unlock(&root->node_lock);
			break;
		}
		spin_unlock(&root->node_lock);

		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

Chris Mason's avatar
Chris Mason committed
174
175
176
177
/* cowonly root (everything not a reference counted cow subvolume), just get
 * put onto a simple dirty list.  transaction.c walks this to make sure they
 * get properly updated on disk.
 */
178
179
180
181
182
183
184
185
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	if (root->track_dirty && list_empty(&root->dirty_list)) {
		list_add(&root->dirty_list,
			 &root->fs_info->dirty_cowonly_roots);
	}
}

Chris Mason's avatar
Chris Mason committed
186
187
188
189
190
/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
191
192
193
194
195
196
197
198
199
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct extent_buffer *cow;
	u32 nritems;
	int ret = 0;
	int level;
200
	struct btrfs_disk_key disk_key;
201
202
203
204
205
206
207

	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	nritems = btrfs_header_nritems(buf);
208
209
210
211
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);
Zheng Yan's avatar
Zheng Yan committed
212

213
214
215
216
	cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
				     new_root_objectid, &disk_key, level,
				     buf->start, 0);
	if (IS_ERR(cow))
217
218
219
220
221
		return PTR_ERR(cow);

	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
222
223
224
225
226
227
228
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);
229

Yan Zheng's avatar
Yan Zheng committed
230
231
232
233
	write_extent_buffer(cow, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(cow),
			    BTRFS_FSID_SIZE);

234
	WARN_ON(btrfs_header_generation(buf) > trans->transid);
235
236
237
238
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);
239

240
241
242
243
244
245
246
247
	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in reference counted trees and tree roots
	 * are never shared.  If a block was allocated after the last
	 * snapshot and the block was not allocated by tree relocation,
	 * we know the block is not shared.
	 */
	if (root->ref_cows &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;

#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	/* v0 backref blocks in ref-counted trees may always be shared */
	if (root->ref_cows &&
	    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
		return 1;
#endif

	return 0;
}

/*
 * fix up extent backrefs after 'buf' has been copied into 'cow'.
 *
 * Backrefs update rules:
 *
 * Always use full backrefs for extent pointers in tree blocks
 * allocated by tree relocation.
 *
 * If a shared tree block is no longer referenced by its owner
 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
 * use full backrefs for extent pointers in tree block.
 *
 * If a tree block is being relocated
 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
 * use full backrefs for extent pointers in tree block.
 * The reason for this is some operations (such as drop tree)
 * are only allowed for blocks that use full backrefs.
 */
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow)
{
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int in_reloc_tree;
	int ret;

	if (btrfs_block_can_be_shared(root, buf)) {
		ret = btrfs_lookup_extent_info(trans, root, buf->start,
					       buf->len, &refs, &flags);
		BUG_ON(ret);
		BUG_ON(refs == 0);
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	in_reloc_tree = (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);

	if (refs > 1) {
		if ((owner == root->root_key.objectid || in_reloc_tree) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			/* convert the old block to full backrefs */
			ret = btrfs_inc_ref(trans, root, buf, 1);
			BUG_ON(ret);

			if (in_reloc_tree) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				BUG_ON(ret);
				ret = btrfs_inc_ref(trans, root, cow, 1);
				BUG_ON(ret);
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {
			/* reloc trees always use full backrefs on the copy */
			ret = btrfs_inc_ref(trans, root, cow, in_reloc_tree);
			BUG_ON(ret);
		}
		if (new_flags != 0) {
			ret = btrfs_set_disk_extent_flags(trans, root,
							  buf->start,
							  buf->len,
							  new_flags, 0);
			BUG_ON(ret);
		}
	} else {
		/* last reference: move the refs over and retire buf */
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			ret = btrfs_inc_ref(trans, root, cow, in_reloc_tree);
			BUG_ON(ret);
			ret = btrfs_dec_ref(trans, root, buf, 1);
			BUG_ON(ret);
		}
		clean_tree_block(trans, root, buf);
	}
	return 0;
}

Chris Mason's avatar
Chris Mason committed
367
/*
368
369
370
371
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
Chris Mason's avatar
Chris Mason committed
372
373
374
 *
 * search_start -- an allocation hint for the new block
 *
375
376
377
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
Chris Mason's avatar
Chris Mason committed
378
 */
379
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
380
381
382
383
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
384
			     u64 search_start, u64 empty_size)
Chris Mason's avatar
Chris Mason committed
385
{
386
	struct btrfs_disk_key disk_key;
387
	struct extent_buffer *cow;
388
	int level;
389
	int unlock_orig = 0;
390
	u64 parent_start;
391

392
393
394
	if (*cow_ret == buf)
		unlock_orig = 1;

395
	btrfs_assert_tree_locked(buf);
396

397
398
	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
399
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);
400

401
	level = btrfs_header_level(buf);
Zheng Yan's avatar
Zheng Yan committed
402

403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
		if (parent)
			parent_start = parent->start;
		else
			parent_start = 0;
	} else
		parent_start = 0;

	cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
				     root->root_key.objectid, &disk_key,
				     level, search_start, empty_size);
419
420
	if (IS_ERR(cow))
		return PTR_ERR(cow);
421

422
423
	/* cow is set to blocking by btrfs_init_new_buffer */

424
	copy_extent_buffer(cow, buf, 0, 0, cow->len);
425
	btrfs_set_header_bytenr(cow, cow->start);
426
	btrfs_set_header_generation(cow, trans->transid);
427
428
429
430
431
432
433
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);
434

Yan Zheng's avatar
Yan Zheng committed
435
436
437
438
	write_extent_buffer(cow, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(cow),
			    BTRFS_FSID_SIZE);

439
	update_ref_for_cow(trans, root, buf, cow);
Zheng Yan's avatar
Zheng Yan committed
440

Chris Mason's avatar
Chris Mason committed
441
	if (buf == root->node) {
442
		WARN_ON(parent && parent != buf);
443
444
445
446
447
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;
		else
			parent_start = 0;
448
449

		spin_lock(&root->node_lock);
Chris Mason's avatar
Chris Mason committed
450
		root->node = cow;
451
		extent_buffer_get(cow);
452
453
		spin_unlock(&root->node_lock);

454
455
456
		btrfs_free_extent(trans, root, buf->start, buf->len,
				  parent_start, root->root_key.objectid,
				  level, 0);
457
		free_extent_buffer(buf);
458
		add_root_to_dirty_list(root);
Chris Mason's avatar
Chris Mason committed
459
	} else {
460
461
462
463
464
465
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
			parent_start = parent->start;
		else
			parent_start = 0;

		WARN_ON(trans->transid != btrfs_header_generation(parent));
466
		btrfs_set_node_blockptr(parent, parent_slot,
467
					cow->start);
468
469
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
Chris Mason's avatar
Chris Mason committed
470
		btrfs_mark_buffer_dirty(parent);
471
		btrfs_free_extent(trans, root, buf->start, buf->len,
472
473
				  parent_start, root->root_key.objectid,
				  level, 0);
Chris Mason's avatar
Chris Mason committed
474
	}
475
476
	if (unlock_orig)
		btrfs_tree_unlock(buf);
477
	free_extent_buffer(buf);
Chris Mason's avatar
Chris Mason committed
478
	btrfs_mark_buffer_dirty(cow);
Chris Mason's avatar
Chris Mason committed
479
	*cow_ret = cow;
Chris Mason's avatar
Chris Mason committed
480
481
482
	return 0;
}

483
484
485
486
487
488
489
490
491
492
493
494
/*
 * decide whether 'buf' must be copied before it can be modified in this
 * transaction.  A block may be reused in place only when it was already
 * cowed this transaction, has not been written out yet, and is not a
 * reloc-flagged block living in a non-reloc tree.
 */
static inline int should_cow_block(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf)
{
	if (btrfs_header_generation(buf) != trans->transid)
		return 1;
	if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN))
		return 1;
	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
	    btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))
		return 1;
	return 0;
}

Chris Mason's avatar
Chris Mason committed
495
496
497
498
499
/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't cow'd more than
 * once per transaction, as long as it hasn't been written yet
 */
500
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
501
502
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
503
		    struct extent_buffer **cow_ret)
504
505
{
	u64 search_start;
506
	int ret;
Chris Mason's avatar
Chris Mason committed
507

508
	if (trans->transaction != root->fs_info->running_transaction) {
509
510
511
		printk(KERN_CRIT "trans %llu running %llu\n",
		       (unsigned long long)trans->transid,
		       (unsigned long long)
512
513
514
515
		       root->fs_info->running_transaction->transid);
		WARN_ON(1);
	}
	if (trans->transid != root->fs_info->generation) {
516
517
518
		printk(KERN_CRIT "trans %llu running %llu\n",
		       (unsigned long long)trans->transid,
		       (unsigned long long)root->fs_info->generation);
519
520
		WARN_ON(1);
	}
Chris Mason's avatar
Chris Mason committed
521

522
	if (!should_cow_block(trans, root, buf)) {
523
524
525
		*cow_ret = buf;
		return 0;
	}
526

527
	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
528
529
530
531
532

	if (parent)
		btrfs_set_lock_blocking(parent);
	btrfs_set_lock_blocking(buf);

533
	ret = __btrfs_cow_block(trans, root, buf, parent,
534
				 parent_slot, cow_ret, search_start, 0);
535
	return ret;
536
537
}

Chris Mason's avatar
Chris Mason committed
538
539
540
541
/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by
 */
542
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
543
{
544
	if (blocknr < other && other - (blocknr + blocksize) < 32768)
545
		return 1;
546
	if (blocknr > other && blocknr - (other + blocksize) < 32768)
547
548
549
550
		return 1;
	return 0;
}

551
552
553
554
555
556
557
558
559
/*
 * compare two keys in a memcmp fashion
 */
static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
{
	struct btrfs_key k1;

	btrfs_disk_key_to_cpu(&k1, disk);

560
	return btrfs_comp_cpu_keys(&k1, k2);
561
562
}

563
564
565
/*
 * same as comp_keys only with two btrfs_key's
 */
566
int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}
582

Chris Mason's avatar
Chris Mason committed
583
584
585
586
587
/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
588
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
589
		       struct btrfs_root *root, struct extent_buffer *parent,
590
591
		       int start_slot, int cache_only, u64 *last_ret,
		       struct btrfs_key *progress)
592
{
593
	struct extent_buffer *cur;
594
	u64 blocknr;
595
	u64 gen;
596
597
	u64 search_start = *last_ret;
	u64 last_block = 0;
598
599
600
601
602
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
603
	int parent_level;
604
605
	int uptodate;
	u32 blocksize;
606
607
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;
608

609
610
611
612
	parent_level = btrfs_header_level(parent);
	if (cache_only && parent_level != 1)
		return 0;

613
	if (trans->transaction != root->fs_info->running_transaction)
614
		WARN_ON(1);
615
	if (trans->transid != root->fs_info->generation)
616
		WARN_ON(1);
617

618
619
	parent_nritems = btrfs_header_nritems(parent);
	blocksize = btrfs_level_size(root, parent_level - 1);
620
621
622
623
624
	end_slot = parent_nritems;

	if (parent_nritems == 1)
		return 0;

625
626
	btrfs_set_lock_blocking(parent);

627
628
	for (i = start_slot; i < end_slot; i++) {
		int close = 1;
629

630
631
632
633
634
635
636
637
		if (!parent->map_token) {
			map_extent_buffer(parent,
					btrfs_node_key_ptr_offset(i),
					sizeof(struct btrfs_key_ptr),
					&parent->map_token, &parent->kaddr,
					&parent->map_start, &parent->map_len,
					KM_USER1);
		}
638
639
640
641
642
		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
643
		blocknr = btrfs_node_blockptr(parent, i);
644
		gen = btrfs_node_ptr_generation(parent, i);
645
646
		if (last_block == 0)
			last_block = blocknr;
647

648
		if (i > 0) {
649
650
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
651
		}
652
		if (!close && i < end_slot - 2) {
653
654
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
655
		}
656
657
		if (close) {
			last_block = blocknr;
658
			continue;
659
		}
660
661
662
663
664
		if (parent->map_token) {
			unmap_extent_buffer(parent, parent->map_token,
					    KM_USER1);
			parent->map_token = NULL;
		}
665

666
667
		cur = btrfs_find_tree_block(root, blocknr, blocksize);
		if (cur)
668
			uptodate = btrfs_buffer_uptodate(cur, gen);
669
670
		else
			uptodate = 0;
671
		if (!cur || !uptodate) {
672
			if (cache_only) {
673
				free_extent_buffer(cur);
674
675
				continue;
			}
676
677
			if (!cur) {
				cur = read_tree_block(root, blocknr,
678
							 blocksize, gen);
679
			} else if (!uptodate) {
680
				btrfs_read_buffer(cur, gen);
681
			}
682
		}
683
		if (search_start == 0)
684
			search_start = last_block;
685

686
		btrfs_tree_lock(cur);
687
		btrfs_set_lock_blocking(cur);
688
		err = __btrfs_cow_block(trans, root, cur, parent, i,
689
					&cur, search_start,
690
					min(16 * blocksize,
691
					    (end_slot - i) * blocksize));
Yan's avatar
Yan committed
692
		if (err) {
693
			btrfs_tree_unlock(cur);
694
			free_extent_buffer(cur);
695
			break;
Yan's avatar
Yan committed
696
		}
697
698
		search_start = cur->start;
		last_block = cur->start;
699
		*last_ret = search_start;
700
701
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
702
	}
703
704
705
706
707
	if (parent->map_token) {
		unmap_extent_buffer(parent, parent->map_token,
				    KM_USER1);
		parent->map_token = NULL;
	}
708
709
710
	return err;
}

Chris Mason's avatar
Chris Mason committed
711
712
713
714
715
/*
 * The leaf data grows from end-to-front in the node.
 * this returns the address of the start of the last item,
 * which is the stop of the leaf data stack
 */
716
static inline unsigned int leaf_data_end(struct btrfs_root *root,
717
					 struct extent_buffer *leaf)
718
{
719
	u32 nr = btrfs_header_nritems(leaf);
720
	if (nr == 0)
721
		return BTRFS_LEAF_DATA_SIZE(root);
722
	return btrfs_item_offset_nr(leaf, nr - 1);
723
724
}

Chris Mason's avatar
Chris Mason committed
725
726
727
728
/*
 * extra debugging checks to make sure all the items in a key are
 * well formed and in the proper order
 */
729
730
static int check_node(struct btrfs_root *root, struct btrfs_path *path,
		      int level)
Chris Mason's avatar
Chris Mason committed
731
{
732
733
734
735
	struct extent_buffer *parent = NULL;
	struct extent_buffer *node = path->nodes[level];
	struct btrfs_disk_key parent_key;
	struct btrfs_disk_key node_key;
Chris Mason's avatar
Chris Mason committed
736
	int parent_slot;
737
738
	int slot;
	struct btrfs_key cpukey;
739
	u32 nritems = btrfs_header_nritems(node);
Chris Mason's avatar
Chris Mason committed
740
741

	if (path->nodes[level + 1])
742
		parent = path->nodes[level + 1];
Aneesh's avatar
Aneesh committed
743

744
	slot = path->slots[level];
745
746
	BUG_ON(nritems == 0);
	if (parent) {
Aneesh's avatar
Aneesh committed
747
		parent_slot = path->slots[level + 1];
748
749
750
		btrfs_node_key(parent, &parent_key, parent_slot);
		btrfs_node_key(node, &node_key, 0);
		BUG_ON(memcmp(&parent_key, &node_key,
Chris Mason's avatar
Chris Mason committed
751
			      sizeof(struct btrfs_disk_key)));
752
		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
753
		       btrfs_header_bytenr(node));
Chris Mason's avatar
Chris Mason committed
754
	}
755
	BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
756
	if (slot != 0) {
757
758
759
		btrfs_node_key_to_cpu(node, &cpukey, slot - 1);
		btrfs_node_key(node, &node_key, slot);
		BUG_ON(comp_keys(&node_key, &cpukey) <= 0);
760
761
	}
	if (slot < nritems - 1) {
762
763
764
		btrfs_node_key_to_cpu(node, &cpukey, slot + 1);
		btrfs_node_key(node, &node_key, slot);
		BUG_ON(comp_keys(&node_key, &cpukey) >= 0);
Chris Mason's avatar
Chris Mason committed
765
766
767
768
	}
	return 0;
}

Chris Mason's avatar
Chris Mason committed
769
770
771
772
/*
 * extra checking to make sure all the items in a leaf are
 * well formed and in the proper order
 */
773
774
static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
		      int level)
Chris Mason's avatar
Chris Mason committed
775
{
776
777
	struct extent_buffer *leaf = path->nodes[level];
	struct extent_buffer *parent = NULL;
Chris Mason's avatar
Chris Mason committed
778
	int parent_slot;
779
	struct btrfs_key cpukey;
780
781
782
	struct btrfs_disk_key parent_key;
	struct btrfs_disk_key leaf_key;
	int slot = path->slots[0];
783

784
	u32 nritems = btrfs_header_nritems(leaf);
Chris Mason's avatar
Chris Mason committed
785
786

	if (path->nodes[level + 1])
787
		parent = path->nodes[level + 1];
788
789
790
791
792

	if (nritems == 0)
		return 0;

	if (parent) {
Aneesh's avatar
Aneesh committed
793
		parent_slot = path->slots[level + 1];
794
795
		btrfs_node_key(parent, &parent_key, parent_slot);
		btrfs_item_key(leaf, &leaf_key, 0);
796

797
		BUG_ON(memcmp(&parent_key, &leaf_key,
Chris Mason's avatar
Chris Mason committed
798
		       sizeof(struct btrfs_disk_key)));
799
		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
800
		       btrfs_header_bytenr(leaf));
801
802
803
804
805
806
	}
	if (slot != 0 && slot < nritems - 1) {
		btrfs_item_key(leaf, &leaf_key, slot);
		btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
		if (comp_keys(&leaf_key, &cpukey) <= 0) {
			btrfs_print_leaf(root, leaf);
807
			printk(KERN_CRIT "slot %d offset bad key\n", slot);
808
809
810
811
812
			BUG_ON(1);
		}
		if (btrfs_item_offset_nr(leaf, slot - 1) !=
		       btrfs_item_end_nr(leaf, slot)) {
			btrfs_print_leaf(root, leaf);
813
			printk(KERN_CRIT "slot %d offset bad\n", slot);
814
815
			BUG_ON(1);
		}
816
817
	}
	if (slot < nritems - 1) {
818
819
820
821
822
823
		btrfs_item_key(leaf, &leaf_key, slot);
		btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1);
		BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0);
		if (btrfs_item_offset_nr(leaf, slot) !=
			btrfs_item_end_nr(leaf, slot + 1)) {
			btrfs_print_leaf(root, leaf);
824
			printk(KERN_CRIT "slot %d offset bad\n", slot);
825
826
			BUG_ON(1);
		}
Chris Mason's avatar
Chris Mason committed
827
	}
828
829
	BUG_ON(btrfs_item_offset_nr(leaf, 0) +
	       btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root));
Chris Mason's avatar
Chris Mason committed
830
831
832
	return 0;
}

833
static noinline int check_block(struct btrfs_root *root,
				struct btrfs_path *path, int level)
{
	/*
	 * The expensive per-block sanity checks are intentionally
	 * disabled; the early return leaves check_leaf/check_node
	 * unreachable but keeps them compiling.
	 */
	return 0;
	if (level == 0)
		return check_leaf(root, path, level);
	return check_node(root, path, level);
}

Chris Mason's avatar
Chris Mason committed
842
/*
843
844
845
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.  There are 'max' items in p.
 *
Chris Mason's avatar
Chris Mason committed
846
847
848
849
850
851
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 */
852
853
854
855
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p,
				       int item_size, struct btrfs_key *key,
				       int max, int *slot)
856
857
858
859
860
{
	int low = 0;
	int high = max;
	int mid;
	int ret;
861
	struct btrfs_disk_key *tmp = NULL;
862
863
864
865
866
867
	struct btrfs_disk_key unaligned;
	unsigned long offset;
	char *map_token = NULL;
	char *kaddr = NULL;
	unsigned long map_start = 0;
	unsigned long map_len = 0;
868
	int err;
869

870
	while (low < high) {
871
		mid = (low + high) / 2;
872
873
874
875
876
		offset = p + mid * item_size;

		if (!map_token || offset < map_start ||
		    (offset + sizeof(struct btrfs_disk_key)) >
		    map_start + map_len) {
877
			if (map_token) {
878
				unmap_extent_buffer(eb, map_token, KM_USER0);
879
880
				map_token = NULL;
			}
881
882

			err = map_private_extent_buffer(eb, offset,
883
884
885
886
887
888
889
890
891
892
893
894
						sizeof(struct btrfs_disk_key),
						&map_token, &kaddr,
						&map_start, &map_len, KM_USER0);

			if (!err) {
				tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
			} else {
				read_extent_buffer(eb, &unaligned,
						   offset, sizeof(unaligned));
				tmp = &unaligned;
			}
895
896
897
898
899

		} else {
			tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
		}
900
901
902
903
904
905
906
907
		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
908
909
			if (map_token)
				unmap_extent_buffer(eb, map_token, KM_USER0);
910
911
912
913
			return 0;
		}
	}
	*slot = low;
914
915
	if (map_token)
		unmap_extent_buffer(eb, map_token, KM_USER0);
916
917
918
	return 1;
}

Chris Mason's avatar
Chris Mason committed
919
920
921
922
/*
 * simple bin_search frontend that does the right thing for
 * leaves vs nodes
 */
923
924
static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
		      int level, int *slot)
925
{
926
927
928
	if (level == 0) {
		return generic_bin_search(eb,
					  offsetof(struct btrfs_leaf, items),
Chris Mason's avatar
Chris Mason committed
929
					  sizeof(struct btrfs_item),
930
					  key, btrfs_header_nritems(eb),
931
					  slot);
932
	} else {
933
934
		return generic_bin_search(eb,
					  offsetof(struct btrfs_node, ptrs),
935
					  sizeof(struct btrfs_key_ptr),
936
					  key, btrfs_header_nritems(eb),
937
					  slot);
938
939
940
941
	}
	return -1;
}

942
943
944
945
946
947
/* exported wrapper around bin_search for use outside this file */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
		     int level, int *slot)
{
	return bin_search(eb, key, level, slot);
}

Chris Mason's avatar
Chris Mason committed
948
949
950
951
/* given a node and slot number, this reads the blocks it points to.  The
 * extent buffer is returned with a reference taken (but unlocked).
 * NULL is returned on error.
 */
952
static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
953
				   struct extent_buffer *parent, int slot)
954
{
955
	int level = btrfs_header_level(parent);
956
957
	if (slot < 0)
		return NULL;
958
	if (slot >= btrfs_header_nritems(parent))
959
		return NULL;
960
961
962

	BUG_ON(level == 0);

963
	return read_tree_block(root, btrfs_node_blockptr(parent, slot),
964
965
		       btrfs_level_size(root, level - 1),
		       btrfs_node_ptr_generation(parent, slot));
966
967
}

Chris Mason's avatar
Chris Mason committed
968
969
970
971
972
/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave an node completely empty later on.
 */
973
static noinline int balance_level(struct btrfs_trans_handle *trans,
974
975
			 struct btrfs_root *root,
			 struct btrfs_path *path, int level)
976
{
977
978
979
980
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
981
982
983
984
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
985
	int err_on_enospc = 0;
986
	u64 orig_ptr;
987
988
989
990

	if (level == 0)
		return 0;

991
	mid = path->nodes[level];
992

993
	WARN_ON(!path->locks[level]);
994
995
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

996
	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
997

998
	if (level < BTRFS_MAX_LEVEL - 1)
999
		parent = path->nodes[level + 1];
1000
	pslot = path->slots[level + 1];