/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/scatterlist.h>
#include <linux/swap.h>
#include <linux/radix-tree.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/crc32c.h>
#include <linux/slab.h>
#include <linux/migrate.h>
#include "compat.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "print-tree.h"
#include "async-thread.h"
#include "locking.h"
#include "tree-log.h"
#include "free-space-cache.h"

static struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
static void free_fs_root(struct btrfs_root *root);

static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
				    int read_only);
static int btrfs_destroy_ordered_operations(struct btrfs_root *root);
static int btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
				      struct btrfs_root *root);
static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t);
static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
static int btrfs_destroy_marked_extents(struct btrfs_root *root,
					struct extent_io_tree *dirty_pages,
					int mark);
static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
				       struct extent_io_tree *pinned_extents);
static int btrfs_cleanup_transaction(struct btrfs_root *root);
/*
 * end_io_wq structs are used to do processing in task context when an IO is
 * complete.  This is used during reads to verify checksums, and it is used
 * by writes to insert metadata for new file extents after IO is complete.
 */
struct end_io_wq {
	struct bio *bio;
	bio_end_io_t *end_io;
	void *private;
	struct btrfs_fs_info *info;
	int error;
	int metadata;
	struct list_head list;
	struct btrfs_work work;
};

/*
 * async submit bios are used to offload expensive checksumming
 * onto the worker threads.  They checksum file and metadata bios
 * just before they are sent down the IO stack.
 */
struct async_submit_bio {
	struct inode *inode;
	struct bio *bio;
	struct list_head list;
	extent_submit_bio_hook_t *submit_bio_start;
	extent_submit_bio_hook_t *submit_bio_done;
	int rw;
	int mirror_num;
	unsigned long bio_flags;
	/*
	 * bio_offset is optional, can be used if the pages in the bio
	 * can't tell us where in the file the bio should go
	 */
	u64 bio_offset;
	struct btrfs_work work;
};

/* These are used to set the lockdep class on the extent buffer locks.
 * The class is set by the readpage_end_io_hook after the buffer has
 * passed csum validation but before the pages are unlocked.
 *
 * The lockdep class is also set by btrfs_init_new_buffer on freshly
 * allocated blocks.
 *
 * The class is based on the level in the tree block, which allows lockdep
 * to know that lower nodes nest inside the locks of higher nodes.
 *
 * We also add a check to make sure the highest level of the tree is
 * the same as our lockdep setup here.  If BTRFS_MAX_LEVEL changes, this
 * code needs update as well.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# if BTRFS_MAX_LEVEL != 8
#  error
# endif
static struct lock_class_key btrfs_eb_class[BTRFS_MAX_LEVEL + 1];
static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = {
	/* leaf */
	"btrfs-extent-00",
	"btrfs-extent-01",
	"btrfs-extent-02",
	"btrfs-extent-03",
	"btrfs-extent-04",
	"btrfs-extent-05",
	"btrfs-extent-06",
	"btrfs-extent-07",
	/* highest possible level */
	"btrfs-extent-08",
};
#endif

/*
 * extents on the btree inode are pretty simple, there's one extent
 * that covers the entire device
 */
static struct extent_map *btree_get_extent(struct inode *inode,
		struct page *page, size_t page_offset, u64 start, u64 len,
		int create)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_map *em;
	int ret;

	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, start, len);
	if (em) {
		em->bdev =
			BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
		read_unlock(&em_tree->lock);
		goto out;
	}
	read_unlock(&em_tree->lock);

	em = alloc_extent_map(GFP_NOFS);
	if (!em) {
		em = ERR_PTR(-ENOMEM);
		goto out;
	}
	em->start = 0;
	em->len = (u64)-1;
	em->block_len = (u64)-1;
	em->block_start = 0;
	em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;

	write_lock(&em_tree->lock);
	ret = add_extent_mapping(em_tree, em);
	if (ret == -EEXIST) {
		u64 failed_start = em->start;
		u64 failed_len = em->len;

		free_extent_map(em);
		em = lookup_extent_mapping(em_tree, start, len);
		if (em) {
			ret = 0;
		} else {
			em = lookup_extent_mapping(em_tree, failed_start,
						   failed_len);
			ret = -EIO;
		}
	} else if (ret) {
		free_extent_map(em);
		em = NULL;
	}
	write_unlock(&em_tree->lock);

	if (ret)
		em = ERR_PTR(ret);
out:
	return em;
}

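/*
 * checksum helpers: btrfs_csum_data runs crc32c over a buffer and
 * btrfs_csum_final stores the inverted crc in little endian form
 */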
u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
{
	return crc32c(seed, data, len);
}

void btrfs_csum_final(u32 crc, char *result)
{
	*(__le32 *)result = ~cpu_to_le32(crc);
}

/*
 * compute the csum for a btree block, and either verify it or write it
 * into the csum field of the block.
 */
static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
			   int verify)
{
	u16 csum_size =
		btrfs_super_csum_size(&root->fs_info->super_copy);
	char *result = NULL;
	unsigned long len;
	unsigned long cur_len;
	unsigned long offset = BTRFS_CSUM_SIZE;
	char *map_token = NULL;
	char *kaddr;
	unsigned long map_start;
	unsigned long map_len;
	int err;
	u32 crc = ~(u32)0;
	unsigned long inline_result;

	len = buf->len - offset;
	while (len > 0) {
		err = map_private_extent_buffer(buf, offset, 32,
					&map_token, &kaddr,
					&map_start, &map_len, KM_USER0);
		if (err)
			return 1;
		cur_len = min(len, map_len - (offset - map_start));
		crc = btrfs_csum_data(root, kaddr + offset - map_start,
				      crc, cur_len);
		len -= cur_len;
		offset += cur_len;
		unmap_extent_buffer(buf, map_token, KM_USER0);
	}
	if (csum_size > sizeof(inline_result)) {
		result = kzalloc(csum_size * sizeof(char), GFP_NOFS);
		if (!result)
			return 1;
	} else {
		result = (char *)&inline_result;
	}

	btrfs_csum_final(crc, result);

	if (verify) {
		if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
			u32 val;
			u32 found = 0;
			memcpy(&found, result, csum_size);

			read_extent_buffer(buf, &val, 0, csum_size);
			if (printk_ratelimit()) {
				printk(KERN_INFO "btrfs: %s checksum verify "
				       "failed on %llu wanted %X found %X "
				       "level %d\n",
				       root->fs_info->sb->s_id,
				       (unsigned long long)buf->start, val, found,
				       btrfs_header_level(buf));
			}
			if (result != (char *)&inline_result)
				kfree(result);
			return 1;
		}
	} else {
		write_extent_buffer(buf, result, 0, csum_size);
	}
	if (result != (char *)&inline_result)
		kfree(result);
	return 0;
}

/*
 * we can't consider a given block up to date unless the transid of the
 * block matches the transid in the parent node's pointer.  This is how we
 * detect blocks that either didn't get written at all or got written
 * in the wrong place.
 */
static int verify_parent_transid(struct extent_io_tree *io_tree,
				 struct extent_buffer *eb, u64 parent_transid)
{
	struct extent_state *cached_state = NULL;
	int ret;

	if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
		return 0;

	lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
			 0, &cached_state, GFP_NOFS);
	if (extent_buffer_uptodate(io_tree, eb, cached_state) &&
	    btrfs_header_generation(eb) == parent_transid) {
		ret = 0;
		goto out;
	}
	if (printk_ratelimit()) {
		printk("parent transid verify failed on %llu wanted %llu "
		       "found %llu\n",
		       (unsigned long long)eb->start,
		       (unsigned long long)parent_transid,
		       (unsigned long long)btrfs_header_generation(eb));
	}
	ret = 1;
	clear_extent_buffer_uptodate(io_tree, eb, &cached_state);
out:
	unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
			     &cached_state, GFP_NOFS);
	return ret;
}

/*
 * helper to read a given tree block, doing retries as required when
 * the checksums don't match and we have alternate mirrors to try.
 */
static int btree_read_extent_buffer_pages(struct btrfs_root *root,
					  struct extent_buffer *eb,
					  u64 start, u64 parent_transid)
{
	struct extent_io_tree *io_tree;
	int ret;
	int num_copies = 0;
	int mirror_num = 0;

	io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
	while (1) {
		ret = read_extent_buffer_pages(io_tree, eb, start, 1,
					       btree_get_extent, mirror_num);
		if (!ret &&
		    !verify_parent_transid(io_tree, eb, parent_transid))
			return ret;

		num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
					      eb->start, eb->len);
		if (num_copies == 1)
			return ret;

		mirror_num++;
		if (mirror_num > num_copies)
			return ret;
	}
	return -EIO;
}

/*
 * checksum a dirty tree block before IO.  This has extra checks to make sure
 * we only fill in the checksum field in the first page of a multi-page block
 */

static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
{
	struct extent_io_tree *tree;
	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
	u64 found_start;
	unsigned long len;
	struct extent_buffer *eb;
	int ret;

	tree = &BTRFS_I(page->mapping->host)->io_tree;

	if (page->private == EXTENT_PAGE_PRIVATE)
		goto out;
	if (!page->private)
		goto out;
	len = page->private >> 2;
	WARN_ON(len == 0);

	eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
	if (eb == NULL) {
		WARN_ON(1);
		goto out;
	}
	ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
					     btrfs_header_generation(eb));
	BUG_ON(ret);
	WARN_ON(!btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN));

	found_start = btrfs_header_bytenr(eb);
	if (found_start != start) {
		WARN_ON(1);
		goto err;
	}
	if (eb->first_page != page) {
		WARN_ON(1);
		goto err;
	}
	if (!PageUptodate(page)) {
		WARN_ON(1);
		goto err;
	}
	csum_tree_block(root, eb, 0);
err:
	free_extent_buffer(eb);
out:
	return 0;
}

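/*
 * returns 0 if the fsid in the tree block header matches one of the
 * filesystem's devices (including seed devices), 1 if it matches none
 */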
static int check_tree_block_fsid(struct btrfs_root *root,
				 struct extent_buffer *eb)
{
	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
	u8 fsid[BTRFS_UUID_SIZE];
	int ret = 1;

	read_extent_buffer(eb, fsid, (unsigned long)btrfs_header_fsid(eb),
			   BTRFS_FSID_SIZE);
	while (fs_devices) {
		if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
			ret = 0;
			break;
		}
		fs_devices = fs_devices->seed;
	}
	return ret;
}

#ifdef CONFIG_DEBUG_LOCK_ALLOC
void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
{
	lockdep_set_class_and_name(&eb->lock,
			   &btrfs_eb_class[level],
			   btrfs_eb_name[level]);
}
#endif

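/*
 * called when a metadata block read completes: verify that the bytenr in
 * the header matches where we read it from, that the fsid belongs to this
 * filesystem, and that the checksum is correct
 */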
static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
			       struct extent_state *state)
{
	struct extent_io_tree *tree;
	u64 found_start;
	int found_level;
	unsigned long len;
	struct extent_buffer *eb;
	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
	int ret = 0;

	tree = &BTRFS_I(page->mapping->host)->io_tree;
	if (page->private == EXTENT_PAGE_PRIVATE)
		goto out;
	if (!page->private)
		goto out;

	len = page->private >> 2;
	WARN_ON(len == 0);

	eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
	if (eb == NULL) {
		ret = -EIO;
		goto out;
	}

	found_start = btrfs_header_bytenr(eb);
	if (found_start != start) {
		if (printk_ratelimit()) {
			printk(KERN_INFO "btrfs bad tree block start "
			       "%llu %llu\n",
			       (unsigned long long)found_start,
			       (unsigned long long)eb->start);
		}
		ret = -EIO;
		goto err;
	}
	if (eb->first_page != page) {
		printk(KERN_INFO "btrfs bad first page %lu %lu\n",
		       eb->first_page->index, page->index);
		WARN_ON(1);
		ret = -EIO;
		goto err;
	}
	if (check_tree_block_fsid(root, eb)) {
		if (printk_ratelimit()) {
			printk(KERN_INFO "btrfs bad fsid on block %llu\n",
			       (unsigned long long)eb->start);
		}
		ret = -EIO;
		goto err;
	}
	found_level = btrfs_header_level(eb);

	btrfs_set_buffer_lockdep_class(eb, found_level);

	ret = csum_tree_block(root, eb, 1);
	if (ret)
		ret = -EIO;

	end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
	end = eb->start + end - 1;
err:
	free_extent_buffer(eb);
out:
	return ret;
}

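/*
 * bio end_io callback: queue the completed bio for end_workqueue_fn so the
 * rest of the processing happens in task context.  The worker pool is
 * picked based on the IO direction and the metadata value
 */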
static void end_workqueue_bio(struct bio *bio, int err)
{
	struct end_io_wq *end_io_wq = bio->bi_private;
	struct btrfs_fs_info *fs_info;

	fs_info = end_io_wq->info;
	end_io_wq->error = err;
	end_io_wq->work.func = end_workqueue_fn;
	end_io_wq->work.flags = 0;

	if (bio->bi_rw & REQ_WRITE) {
		if (end_io_wq->metadata == 1)
			btrfs_queue_worker(&fs_info->endio_meta_write_workers,
					   &end_io_wq->work);
		else if (end_io_wq->metadata == 2)
			btrfs_queue_worker(&fs_info->endio_freespace_worker,
					   &end_io_wq->work);
		else
			btrfs_queue_worker(&fs_info->endio_write_workers,
					   &end_io_wq->work);
	} else {
		if (end_io_wq->metadata)
			btrfs_queue_worker(&fs_info->endio_meta_workers,
					   &end_io_wq->work);
		else
			btrfs_queue_worker(&fs_info->endio_workers,
					   &end_io_wq->work);
	}
}

/*
 * For the metadata arg you want
 *
 * 0 - if data
 * 1 - if normal metadata
 * 2 - if writing to the free space cache area
 */
int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
			int metadata)
{
	struct end_io_wq *end_io_wq;
	end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS);
	if (!end_io_wq)
		return -ENOMEM;

	end_io_wq->private = bio->bi_private;
	end_io_wq->end_io = bio->bi_end_io;
	end_io_wq->info = info;
	end_io_wq->error = 0;
	end_io_wq->bio = bio;
	end_io_wq->metadata = metadata;

	bio->bi_private = end_io_wq;
	bio->bi_end_io = end_workqueue_bio;
	return 0;
}

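/*
 * limit on the number of async submits in flight, scaled by the number of
 * worker threads and open devices
 */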
unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
{
	unsigned long limit = min_t(unsigned long,
				    info->workers.max_workers,
				    info->fs_devices->open_devices);
	return 256 * limit;
}

int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
{
	return atomic_read(&info->nr_async_bios) >
		btrfs_async_submit_limit(info);
}

static void run_one_async_start(struct btrfs_work *work)
{
	struct async_submit_bio *async;

	async = container_of(work, struct  async_submit_bio, work);
	async->submit_bio_start(async->inode, async->rw, async->bio,
			       async->mirror_num, async->bio_flags,
			       async->bio_offset);
}

static void run_one_async_done(struct btrfs_work *work)
{
	struct btrfs_fs_info *fs_info;
	struct async_submit_bio *async;
	int limit;

	async = container_of(work, struct  async_submit_bio, work);
	fs_info = BTRFS_I(async->inode)->root->fs_info;

	limit = btrfs_async_submit_limit(fs_info);
	limit = limit * 2 / 3;

	atomic_dec(&fs_info->nr_async_submits);

	if (atomic_read(&fs_info->nr_async_submits) < limit &&
	    waitqueue_active(&fs_info->async_submit_wait))
		wake_up(&fs_info->async_submit_wait);

	async->submit_bio_done(async->inode, async->rw, async->bio,
			       async->mirror_num, async->bio_flags,
			       async->bio_offset);
}

static void run_one_async_free(struct btrfs_work *work)
{
	struct async_submit_bio *async;

	async = container_of(work, struct  async_submit_bio, work);
	kfree(async);
}

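/*
 * queue a bio for async submission: submit_bio_start runs on a worker
 * thread to do the expensive checksumming, then submit_bio_done sends the
 * bio down the IO stack
 */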
int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
			int rw, struct bio *bio, int mirror_num,
			unsigned long bio_flags,
			u64 bio_offset,
			extent_submit_bio_hook_t *submit_bio_start,
			extent_submit_bio_hook_t *submit_bio_done)
{
	struct async_submit_bio *async;

	async = kmalloc(sizeof(*async), GFP_NOFS);
	if (!async)
		return -ENOMEM;

	async->inode = inode;
	async->rw = rw;
	async->bio = bio;
	async->mirror_num = mirror_num;
	async->submit_bio_start = submit_bio_start;
	async->submit_bio_done = submit_bio_done;

	async->work.func = run_one_async_start;
	async->work.ordered_func = run_one_async_done;
	async->work.ordered_free = run_one_async_free;

	async->work.flags = 0;
	async->bio_flags = bio_flags;
	async->bio_offset = bio_offset;

	atomic_inc(&fs_info->nr_async_submits);

	if (rw & REQ_SYNC)
		btrfs_set_work_high_prio(&async->work);

	btrfs_queue_worker(&fs_info->workers, &async->work);

	while (atomic_read(&fs_info->async_submit_draining) &&
	      atomic_read(&fs_info->nr_async_submits)) {
		wait_event(fs_info->async_submit_wait,
			   (atomic_read(&fs_info->nr_async_submits) == 0));
	}

	return 0;
}

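/*
 * checksum every tree block page attached to this bio before it is written
 */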
static int btree_csum_one_bio(struct bio *bio)
{
	struct bio_vec *bvec = bio->bi_io_vec;
	int bio_index = 0;
	struct btrfs_root *root;

	WARN_ON(bio->bi_vcnt <= 0);
	while (bio_index < bio->bi_vcnt) {
		root = BTRFS_I(bvec->bv_page->mapping->host)->root;
		csum_dirty_buffer(root, bvec->bv_page);
		bio_index++;
		bvec++;
	}
	return 0;
}

static int __btree_submit_bio_start(struct inode *inode, int rw,
				    struct bio *bio, int mirror_num,
				    unsigned long bio_flags,
				    u64 bio_offset)
{
	/*
	 * when we're called for a write, we're already in the async
	 * submission context.  Just jump into btrfs_map_bio
	 */
	btree_csum_one_bio(bio);
	return 0;
}

static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
				 int mirror_num, unsigned long bio_flags,
				 u64 bio_offset)
{
	/*
	 * when we're called for a write, we're already in the async
	 * submission context.  Just jump into btrfs_map_bio
	 */
	return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
}

static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
				 int mirror_num, unsigned long bio_flags,
				 u64 bio_offset)
{
	int ret;

	ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
					  bio, 1);
	BUG_ON(ret);

	if (!(rw & REQ_WRITE)) {
		/*
		 * called for a read, do the setup so that checksum validation
		 * can happen in the async kernel threads
		 */
		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
				     mirror_num, 0);
	}

	/*
	 * kthread helpers are used to submit writes so that checksumming
	 * can happen in parallel across all CPUs
	 */
	return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
				   inode, rw, bio, mirror_num, 0,
				   bio_offset,
				   __btree_submit_bio_start,
				   __btree_submit_bio_done);
}

#ifdef CONFIG_MIGRATION
static int btree_migratepage(struct address_space *mapping,
			struct page *newpage, struct page *page)
{
	/*
	 * we can't safely write a btree page from here,
	 * we haven't done the locking hook
	 */
	if (PageDirty(page))
		return -EAGAIN;
	/*
	 * Buffers may be managed in a filesystem specific way.
	 * We must have no buffers or drop them.
	 */
	if (page_has_private(page) &&
	    !try_to_release_page(page, GFP_KERNEL))
		return -EAGAIN;
	return migrate_page(mapping, newpage, page);
}
#endif

static int btree_writepage(struct page *page, struct writeback_control *wbc)
{
	struct extent_io_tree *tree;
	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
	struct extent_buffer *eb;
	int was_dirty;

	tree = &BTRFS_I(page->mapping->host)->io_tree;
	if (!(current->flags & PF_MEMALLOC)) {
		return extent_write_full_page(tree, page,
					      btree_get_extent, wbc);
	}

	redirty_page_for_writepage(wbc, page);
	eb = btrfs_find_tree_block(root, page_offset(page), PAGE_CACHE_SIZE);
	WARN_ON(!eb);

	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
	if (!was_dirty) {
		spin_lock(&root->fs_info->delalloc_lock);
		root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE;
		spin_unlock(&root->fs_info->delalloc_lock);
	}
	free_extent_buffer(eb);

	unlock_page(page);
	return 0;
}

static int btree_writepages(struct address_space *mapping,
			    struct writeback_control *wbc)
{
	struct extent_io_tree *tree;
	tree = &BTRFS_I(mapping->host)->io_tree;
	if (wbc->sync_mode == WB_SYNC_NONE) {
		struct btrfs_root *root = BTRFS_I(mapping->host)->root;
		u64 num_dirty;
		unsigned long thresh = 32 * 1024 * 1024;

		if (wbc->for_kupdate)
			return 0;

		/* this is a bit racy, but that's ok */
		num_dirty = root->fs_info->dirty_metadata_bytes;
		if (num_dirty < thresh)
			return 0;
	}
	return extent_writepages(tree, mapping, btree_get_extent, wbc);
}

static int btree_readpage(struct file *file, struct page *page)
{
	struct extent_io_tree *tree;
	tree = &BTRFS_I(page->mapping->host)->io_tree;
	return extent_read_full_page(tree, page, btree_get_extent);
}

static int btree_releasepage(struct page *page, gfp_t gfp_flags)
{
	struct extent_io_tree *tree;
	struct extent_map_tree *map;
	int ret;

	if (PageWriteback(page) || PageDirty(page))
		return 0;

	tree = &BTRFS_I(page->mapping->host)->io_tree;
	map = &BTRFS_I(page->mapping->host)->extent_tree;

	ret = try_release_extent_state(map, tree, page, gfp_flags);
	if (!ret)
		return 0;

	ret = try_release_extent_buffer(tree, page);
	if (ret == 1) {
		ClearPagePrivate(page);
		set_page_private(page, 0);
		page_cache_release(page);
	}

	return ret;
}

static void btree_invalidatepage(struct page *page, unsigned long offset)
{
	struct extent_io_tree *tree;
	tree = &BTRFS_I(page->mapping->host)->io_tree;
	extent_invalidatepage(tree, page, offset);
	btree_releasepage(page, GFP_NOFS);
	if (PagePrivate(page)) {
		printk(KERN_WARNING "btrfs warning page private not zero "
		       "on page %llu\n", (unsigned long long)page_offset(page));
		ClearPagePrivate(page);
		set_page_private(page, 0);
		page_cache_release(page);
	}
}

static const struct address_space_operations btree_aops = {
	.readpage	= btree_readpage,
	.writepage	= btree_writepage,
	.writepages	= btree_writepages,
	.releasepage	= btree_releasepage,
	.invalidatepage = btree_invalidatepage,
	.sync_page	= block_sync_page,
#ifdef CONFIG_MIGRATION
	.migratepage	= btree_migratepage,
#endif
};

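/*
 * start readahead of a tree block: the pages are read without waiting for
 * completion and any errors are ignored
 */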
int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
			 u64 parent_transid)
{
	struct extent_buffer *buf = NULL;
	struct inode *btree_inode = root->fs_info->btree_inode;
	int ret = 0;

	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
	if (!buf)
		return 0;
	read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
				 buf, 0, 0, btree_get_extent, 0);
	free_extent_buffer(buf);
	return ret;
}

struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
					    u64 bytenr, u32 blocksize)
{
	struct inode *btree_inode = root->fs_info->btree_inode;
	struct extent_buffer *eb;
	eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
				bytenr, blocksize, GFP_NOFS);
	return eb;
}

struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
						 u64 bytenr, u32 blocksize)
{
	struct inode *btree_inode = root->fs_info->btree_inode;
	struct extent_buffer *eb;

	eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
				 bytenr, blocksize, NULL, GFP_NOFS);
	return eb;
}


int btrfs_write_tree_block(struct extent_buffer *buf)
{
	return filemap_fdatawrite_range(buf->first_page->mapping, buf->start,
					buf->start + buf->len - 1);
}

int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
{
	return filemap_fdatawait_range(buf->first_page->mapping,
				       buf->start, buf->start + buf->len - 1);
}

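/*
 * read a tree block, retrying other mirrors when the checksum or transid
 * checks fail, and mark the buffer uptodate only on success
 */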
struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
				      u32 blocksize, u64 parent_transid)
{
	struct extent_buffer *buf = NULL;
	int ret;

	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
	if (!buf)
		return NULL;

	ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);

	if (ret == 0)
		set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
	return buf;

}

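/*
 * clear the dirty state of a tree block from the running transaction and
 * adjust the dirty_metadata_bytes counter to match
 */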
int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		     struct extent_buffer *buf)
{
	struct inode *btree_inode = root->fs_info->btree_inode;
	if (btrfs_header_generation(buf) ==
	    root->fs_info->running_transaction->transid) {
		btrfs_assert_tree_locked(buf);

		if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
			spin_lock(&root->fs_info->delalloc_lock);
			if (root->fs_info->dirty_metadata_bytes >= buf->len)
				root->fs_info->dirty_metadata_bytes -= buf->len;
			else
				WARN_ON(1);
			spin_unlock(&root->fs_info->delalloc_lock);
		}

		/* ugh, clear_extent_buffer_dirty needs to lock the page */
		btrfs_set_lock_blocking(buf);
		clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
					  buf);
	}
	return 0;
}

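/*
 * initialize the in-memory btrfs_root: block size parameters, locks, lists,
 * log tree state and the anonymous super block fields
 */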
static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
			u32 stripesize, struct btrfs_root *root,
			struct btrfs_fs_info *fs_info,
			u64 objectid)
{
	root->node = NULL;
	root->commit_root = NULL;
	root->sectorsize = sectorsize;
	root->nodesize = nodesize;
	root->leafsize = leafsize;
	root->stripesize = stripesize;
	root->ref_cows = 0;
	root->track_dirty = 0;
	root->in_radix = 0;
	root->orphan_item_inserted = 0;
	root->orphan_cleanup_state = 0;

	root->fs_info = fs_info;
	root->objectid = objectid;
	root->last_trans = 0;
	root->highest_objectid = 0;
	root->name = NULL;
	root->in_sysfs = 0;
	root->inode_tree = RB_ROOT;
	root->block_rsv = NULL;
	root->orphan_block_rsv = NULL;

	INIT_LIST_HEAD(&root->dirty_list);
	INIT_LIST_HEAD(&root->orphan_list);
	INIT_LIST_HEAD(&root->root_list);
	spin_lock_init(&root->node_lock);
	spin_lock_init(&root->orphan_lock);
	spin_lock_init(&root->inode_lock);
	spin_lock_init(&root->accounting_lock);
	mutex_init(&root->objectid_mutex);
	mutex_init(&root->log_mutex);
	init_waitqueue_head(&root->log_writer_wait);
	init_waitqueue_head(&root->log_commit_wait[0]);
	init_waitqueue_head(&root->log_commit_wait[1]);
	atomic_set(&root->log_commit[0], 0);
	atomic_set(&root->log_commit[1], 0);
	atomic_set(&root->log_writers, 0);
	root->log_batch = 0;
	root->log_transid = 0;
	root->last_log_commit = 0;
	extent_io_tree_init(&root->dirty_log_pages,
			     fs_info->btree_inode->i_mapping, GFP_NOFS);

	memset(&root->root_key, 0, sizeof(root->root_key));
	memset(&root->root_item, 0, sizeof(root->root_item));
	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
	memset(&root->root_kobj, 0, sizeof(root->root_kobj));
	root->defrag_trans_start = fs_info->generation;
	init_completion(&root->kobj_unregister);
	root->defrag_running = 0;
	root->root_key.objectid = objectid;
	root->anon_super.s_root = NULL;
	root->anon_super.s_dev = 0;
	INIT_LIST_HEAD(&root->anon_super.s_list);
	INIT_LIST_HEAD(&root->anon_super.s_instances);
	init_rwsem(&root->anon_super.s_umount);

	return 0;
}

static int find_and_setup_root(struct btrfs_root *tree_root,