/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
#include <asm/div64.h>
#include "compat.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "async-thread.h"

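/*
 * in memory description of how a logical chunk maps onto the
 * physical stripes that back it
 */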
struct map_lookup {
	u64 type;
	int io_align;
	int io_width;
	int stripe_len;
	int sector_size;
	int num_stripes;
	int sub_stripes;
	struct btrfs_bio_stripe stripes[];
};

static int init_first_rw_device(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct btrfs_device *device);
static int btrfs_relocate_sys_chunks(struct btrfs_root *root);

#define map_lookup_size(n) (sizeof(struct map_lookup) + \
			    (sizeof(struct btrfs_bio_stripe) * (n)))

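/*
 * every scanned filesystem gets an entry on fs_uuids; uuid_mutex
 * protects the list and the device lists hanging off each entry
 */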
static DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);

void btrfs_lock_volumes(void)
{
	mutex_lock(&uuid_mutex);
}

void btrfs_unlock_volumes(void)
{
	mutex_unlock(&uuid_mutex);
}

static void lock_chunks(struct btrfs_root *root)
{
	mutex_lock(&root->fs_info->chunk_mutex);
}

static void unlock_chunks(struct btrfs_root *root)
{
	mutex_unlock(&root->fs_info->chunk_mutex);
}

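/*
 * free a closed fs_devices along with every device record
 * still on its list
 */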
static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;
	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		kfree(device->name);
		kfree(device);
	}
	kfree(fs_devices);
}

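/* empty the fs_uuids list, freeing every filesystem we have scanned */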
int btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *fs_devices;

	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, list);
		list_del(&fs_devices->list);
		free_fs_devices(fs_devices);
	}
	return 0;
}

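/*
 * find a device on the list by devid, and by uuid too when
 * one is supplied
 */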
static noinline struct btrfs_device *__find_device(struct list_head *head,
						   u64 devid, u8 *uuid)
{
	struct btrfs_device *dev;

	list_for_each_entry(dev, head, dev_list) {
		if (dev->devid == devid &&
		    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
			return dev;
		}
	}
	return NULL;
}

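/* find the fs_devices entry for a given filesystem uuid */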
static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, list) {
		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}
	return NULL;
}

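/*
 * splice a chain of bios back onto the front of the device's pending
 * list, preserving the existing tail
 */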
static void requeue_list(struct btrfs_pending_bios *pending_bios,
			struct bio *head, struct bio *tail)
{

	struct bio *old_head;

	old_head = pending_bios->head;
	pending_bios->head = head;
	if (pending_bios->tail)
		tail->bi_next = old_head;
	else
		pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline int run_scheduled_bios(struct btrfs_device *device)
{
	struct bio *pending;
	struct backing_dev_info *bdi;
	struct btrfs_fs_info *fs_info;
	struct btrfs_pending_bios *pending_bios;
	struct bio *tail;
	struct bio *cur;
	int again = 0;
	unsigned long num_run;
	unsigned long num_sync_run;
	unsigned long batch_run = 0;
	unsigned long limit;
	unsigned long last_waited = 0;
	int force_reg = 0;

	bdi = blk_get_backing_dev_info(device->bdev);
	fs_info = device->dev_root->fs_info;
	limit = btrfs_async_submit_limit(fs_info);
	limit = limit * 2 / 3;

	/* we want to make sure that every time we switch from the sync
	 * list to the normal list, we unplug
	 */
	num_sync_run = 0;

loop:
	spin_lock(&device->io_lock);

loop_lock:
	num_run = 0;

	/* take all the bios off the list at once and process them
	 * later on (without the lock held).  But, remember the
	 * tail and other pointers so the bios can be properly reinserted
	 * into the list if we hit congestion
	 */
	if (!force_reg && device->pending_sync_bios.head) {
		pending_bios = &device->pending_sync_bios;
		force_reg = 1;
	} else {
		pending_bios = &device->pending_bios;
		force_reg = 0;
	}

	pending = pending_bios->head;
	tail = pending_bios->tail;
	WARN_ON(pending && !tail);

	/*
	 * if pending was null this time around, no bios need processing
	 * at all and we can stop.  Otherwise it'll loop back up again
	 * and do an additional check so no bios are missed.
	 *
	 * device->running_pending is used to synchronize with the
	 * schedule_bio code.
	 */
	if (device->pending_sync_bios.head == NULL &&
	    device->pending_bios.head == NULL) {
		again = 0;
		device->running_pending = 0;
	} else {
		again = 1;
		device->running_pending = 1;
	}

	pending_bios->head = NULL;
	pending_bios->tail = NULL;

	spin_unlock(&device->io_lock);

	/*
	 * if we're doing the regular priority list, make sure we unplug
	 * for any high prio bios we've sent down
	 */
	if (pending_bios == &device->pending_bios && num_sync_run > 0) {
		num_sync_run = 0;
		blk_run_backing_dev(bdi, NULL);
	}

	while (pending) {

		rmb();
		/* we want to work on both lists, but do more bios on the
		 * sync list than the regular list
		 */
		if ((num_run > 32 &&
		    pending_bios != &device->pending_sync_bios &&
		    device->pending_sync_bios.head) ||
		   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
		    device->pending_bios.head)) {
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			goto loop_lock;
		}

		cur = pending;
		pending = pending->bi_next;
		cur->bi_next = NULL;
		atomic_dec(&fs_info->nr_async_bios);

		if (atomic_read(&fs_info->nr_async_bios) < limit &&
		    waitqueue_active(&fs_info->async_submit_wait))
			wake_up(&fs_info->async_submit_wait);

		BUG_ON(atomic_read(&cur->bi_cnt) == 0);
		submit_bio(cur->bi_rw, cur);
		num_run++;
		batch_run++;

		if (bio_rw_flagged(cur, BIO_RW_SYNCIO))
			num_sync_run++;

		if (need_resched()) {
			if (num_sync_run) {
				blk_run_backing_dev(bdi, NULL);
				num_sync_run = 0;
			}
			cond_resched();
		}

		/*
		 * we made progress, there is more work to do and the bdi
		 * is now congested.  Back off and let other work structs
		 * run instead
		 */
		if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
		    fs_info->fs_devices->open_devices > 1) {
			struct io_context *ioc;

			ioc = current->io_context;

			/*
			 * the main goal here is that we don't want to
			 * block if we're going to be able to submit
			 * more requests without blocking.
			 *
			 * This code does two great things, it pokes into
			 * the elevator code from a filesystem _and_
			 * it makes assumptions about how batching works.
			 */
			if (ioc && ioc->nr_batch_requests > 0 &&
			    time_before(jiffies, ioc->last_waited + HZ/50UL) &&
			    (last_waited == 0 ||
			     ioc->last_waited == last_waited)) {
				/*
				 * we want to go through our batch of
				 * requests and stop.  So, we copy out
				 * the ioc->last_waited time and test
				 * against it before looping
				 */
				last_waited = ioc->last_waited;
				if (need_resched()) {
					if (num_sync_run) {
						blk_run_backing_dev(bdi, NULL);
						num_sync_run = 0;
					}
					cond_resched();
				}
				continue;
			}
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			device->running_pending = 1;

			spin_unlock(&device->io_lock);
			btrfs_requeue_work(&device->work);
			goto done;
		}
	}

	if (num_sync_run) {
		num_sync_run = 0;
		blk_run_backing_dev(bdi, NULL);
	}
	/*
	 * IO has already been through a long path to get here.  Checksumming,
	 * async helper threads, perhaps compression.  We've done a pretty
	 * good job of collecting a batch of IO and should just unplug
	 * the device right away.
	 *
	 * This will help anyone who is waiting on the IO, they might have
	 * already unplugged, but managed to do so before the bio they
	 * cared about found its way down here.
	 */
	blk_run_backing_dev(bdi, NULL);

	cond_resched();
	if (again)
		goto loop;

	spin_lock(&device->io_lock);
	if (device->pending_bios.head || device->pending_sync_bios.head)
		goto loop_lock;
	spin_unlock(&device->io_lock);

done:
	return 0;
}

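/* worker thread entry point, just hands the device to run_scheduled_bios */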
static void pending_bios_fn(struct btrfs_work *work)
{
	struct btrfs_device *device;

	device = container_of(work, struct btrfs_device, work);
	run_scheduled_bios(device);
}

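/*
 * record a device found by scanning, creating the fs_devices entry for
 * its fsid the first time it is seen.  If a known device shows up again
 * under a new path, the stored name is updated.
 */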
static noinline int device_list_add(const char *path,
			   struct btrfs_super_block *disk_super,
			   u64 devid, struct btrfs_fs_devices **fs_devices_ret)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices;
	u64 found_transid = btrfs_super_generation(disk_super);
	char *name;

	fs_devices = find_fsid(disk_super->fsid);
	if (!fs_devices) {
		fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
		if (!fs_devices)
			return -ENOMEM;
		INIT_LIST_HEAD(&fs_devices->devices);
		INIT_LIST_HEAD(&fs_devices->alloc_list);
		list_add(&fs_devices->list, &fs_uuids);
		memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
		fs_devices->latest_devid = devid;
		fs_devices->latest_trans = found_transid;
		mutex_init(&fs_devices->device_list_mutex);
		device = NULL;
	} else {
		device = __find_device(&fs_devices->devices, devid,
				       disk_super->dev_item.uuid);
	}
	if (!device) {
		if (fs_devices->opened)
			return -EBUSY;

		device = kzalloc(sizeof(*device), GFP_NOFS);
		if (!device) {
			/* we can safely leave the fs_devices entry around */
			return -ENOMEM;
		}
		device->devid = devid;
		device->work.func = pending_bios_fn;
		memcpy(device->uuid, disk_super->dev_item.uuid,
		       BTRFS_UUID_SIZE);
		device->barriers = 1;
		spin_lock_init(&device->io_lock);
		device->name = kstrdup(path, GFP_NOFS);
		if (!device->name) {
			kfree(device);
			return -ENOMEM;
		}
		INIT_LIST_HEAD(&device->dev_alloc_list);

		mutex_lock(&fs_devices->device_list_mutex);
		list_add(&device->dev_list, &fs_devices->devices);
		mutex_unlock(&fs_devices->device_list_mutex);

		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	} else if (strcmp(device->name, path)) {
		name = kstrdup(path, GFP_NOFS);
		if (!name)
			return -ENOMEM;
		kfree(device->name);
		device->name = name;
	}

	if (found_transid > fs_devices->latest_trans) {
		fs_devices->latest_devid = devid;
		fs_devices->latest_trans = found_transid;
	}
	*fs_devices_ret = fs_devices;
	return 0;
}

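/*
 * make a private copy of an fs_devices struct and every device
 * record on its list
 */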
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;

	fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
	if (!fs_devices)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&fs_devices->devices);
	INIT_LIST_HEAD(&fs_devices->alloc_list);
	INIT_LIST_HEAD(&fs_devices->list);
	mutex_init(&fs_devices->device_list_mutex);
	fs_devices->latest_devid = orig->latest_devid;
	fs_devices->latest_trans = orig->latest_trans;
	memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));

	mutex_lock(&orig->device_list_mutex);
	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
		device = kzalloc(sizeof(*device), GFP_NOFS);
		if (!device)
			goto error;

		device->name = kstrdup(orig_dev->name, GFP_NOFS);
		if (!device->name) {
			kfree(device);
			goto error;
		}

		device->devid = orig_dev->devid;
		device->work.func = pending_bios_fn;
		memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
		device->barriers = 1;
		spin_lock_init(&device->io_lock);
		INIT_LIST_HEAD(&device->dev_list);
		INIT_LIST_HEAD(&device->dev_alloc_list);

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
	mutex_unlock(&orig->device_list_mutex);
	return fs_devices;
error:
	mutex_unlock(&orig->device_list_mutex);
	free_fs_devices(fs_devices);
	return ERR_PTR(-ENOMEM);
}

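/*
 * drop any scanned device that never made it into the filesystem
 * metadata, closing its block device if we opened one
 */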
int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device, *next;

	mutex_lock(&uuid_mutex);
again:
	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
		if (device->in_fs_metadata)
			continue;

		if (device->bdev) {
			close_bdev_exclusive(device->bdev, device->mode);
501
			device->bdev = NULL;
			fs_devices->open_devices--;
		}
		if (device->writeable) {
			list_del_init(&device->dev_alloc_list);
			device->writeable = 0;
			fs_devices->rw_devices--;
		}
		list_del_init(&device->dev_list);
		fs_devices->num_devices--;
		kfree(device->name);
		kfree(device);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	if (fs_devices->seed) {
		fs_devices = fs_devices->seed;
		goto again;
	}

	mutex_unlock(&uuid_mutex);
	return 0;
}

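/*
 * drop one open reference; the last holder closes every block device
 * on the list
 */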
static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;

	if (--fs_devices->opened > 0)
		return 0;

	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		if (device->bdev) {
			close_bdev_exclusive(device->bdev, device->mode);
			fs_devices->open_devices--;
		}
		if (device->writeable) {
			list_del_init(&device->dev_alloc_list);
			fs_devices->rw_devices--;
		}

		device->bdev = NULL;
		device->writeable = 0;
		device->in_fs_metadata = 0;
	}
	WARN_ON(fs_devices->open_devices);
	WARN_ON(fs_devices->rw_devices);
	fs_devices->opened = 0;
	fs_devices->seeding = 0;

	return 0;
}

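/*
 * close the filesystem's devices and tear down any seed fs_devices
 * chained behind it once nobody has it open
 */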
int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_fs_devices *seed_devices = NULL;
	int ret;

	mutex_lock(&uuid_mutex);
	ret = __btrfs_close_devices(fs_devices);
	if (!fs_devices->opened) {
		seed_devices = fs_devices->seed;
		fs_devices->seed = NULL;
	}
	mutex_unlock(&uuid_mutex);

	while (seed_devices) {
		fs_devices = seed_devices;
		seed_devices = fs_devices->seed;
		__btrfs_close_devices(fs_devices);
		free_fs_devices(fs_devices);
	}
	return ret;
}

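/*
 * open every device on the list and read its super block, remembering
 * the device with the highest generation as the latest bdev.  Devices
 * that fail to open or don't match their recorded devid/uuid are
 * skipped.
 */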
static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
				fmode_t flags, void *holder)
{
	struct block_device *bdev;
	struct list_head *head = &fs_devices->devices;
	struct btrfs_device *device;
	struct block_device *latest_bdev = NULL;
	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;
	u64 latest_devid = 0;
	u64 latest_transid = 0;
	u64 devid;
	int seeding = 1;
	int ret = 0;

	list_for_each_entry(device, head, dev_list) {
		if (device->bdev)
			continue;
		if (!device->name)
			continue;

		bdev = open_bdev_exclusive(device->name, flags, holder);
		if (IS_ERR(bdev)) {
			printk(KERN_INFO "open %s failed\n", device->name);
			goto error;
		}
		set_blocksize(bdev, 4096);

		bh = btrfs_read_dev_super(bdev);
		if (!bh)
			goto error_close;

		disk_super = (struct btrfs_super_block *)bh->b_data;
		devid = btrfs_stack_device_id(&disk_super->dev_item);
		if (devid != device->devid)
			goto error_brelse;

		if (memcmp(device->uuid, disk_super->dev_item.uuid,
			   BTRFS_UUID_SIZE))
			goto error_brelse;

		device->generation = btrfs_super_generation(disk_super);
		if (!latest_transid || device->generation > latest_transid) {
			latest_devid = devid;
			latest_transid = device->generation;
			latest_bdev = bdev;
		}

		if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
			device->writeable = 0;
		} else {
			device->writeable = !bdev_read_only(bdev);
			seeding = 0;
		}

		device->bdev = bdev;
		device->in_fs_metadata = 0;
		device->mode = flags;

		if (!blk_queue_nonrot(bdev_get_queue(bdev)))
			fs_devices->rotating = 1;

		fs_devices->open_devices++;
		if (device->writeable) {
			fs_devices->rw_devices++;
			list_add(&device->dev_alloc_list,
				 &fs_devices->alloc_list);
		}
		continue;

error_brelse:
		brelse(bh);
error_close:
		close_bdev_exclusive(bdev, FMODE_READ);
error:
		continue;
	}
	if (fs_devices->open_devices == 0) {
		ret = -EIO;
		goto out;
	}
	fs_devices->seeding = seeding;
	fs_devices->opened = 1;
	fs_devices->latest_bdev = latest_bdev;
	fs_devices->latest_devid = latest_devid;
	fs_devices->latest_trans = latest_transid;
	fs_devices->total_rw_bytes = 0;
out:
	return ret;
}

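/* take another reference on an already open fs_devices, or do the first open */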
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
		       fmode_t flags, void *holder)
{
	int ret;

	mutex_lock(&uuid_mutex);
	if (fs_devices->opened) {
		fs_devices->opened++;
		ret = 0;
	} else {
		ret = __btrfs_open_devices(fs_devices, flags, holder);
	}
	mutex_unlock(&uuid_mutex);
	return ret;
}

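/*
 * read the super block off a single device and register what we find
 * on the list of known filesystems
 */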
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
			  struct btrfs_fs_devices **fs_devices_ret)
{
	struct btrfs_super_block *disk_super;
	struct block_device *bdev;
	struct buffer_head *bh;
	int ret;
	u64 devid;
	u64 transid;

	mutex_lock(&uuid_mutex);

	bdev = open_bdev_exclusive(path, flags, holder);

	if (IS_ERR(bdev)) {
		ret = PTR_ERR(bdev);
		goto error;
	}

	ret = set_blocksize(bdev, 4096);
	if (ret)
		goto error_close;
	bh = btrfs_read_dev_super(bdev);
	if (!bh) {
		ret = -EIO;
		goto error_close;
	}
	disk_super = (struct btrfs_super_block *)bh->b_data;
	devid = btrfs_stack_device_id(&disk_super->dev_item);
	transid = btrfs_super_generation(disk_super);
	if (disk_super->label[0])
		printk(KERN_INFO "device label %s ", disk_super->label);
	else {
		/* FIXME, make a real uuid parser */
		printk(KERN_INFO "device fsid %llx-%llx ",
		       *(unsigned long long *)disk_super->fsid,
		       *(unsigned long long *)(disk_super->fsid + 8));
	}
	printk(KERN_CONT "devid %llu transid %llu %s\n",
	       (unsigned long long)devid, (unsigned long long)transid, path);
	ret = device_list_add(path, disk_super, devid, fs_devices_ret);

	brelse(bh);
error_close:
	close_bdev_exclusive(bdev, flags);
error:
	mutex_unlock(&uuid_mutex);
	return ret;
}

/*
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 */
int find_free_dev_extent(struct btrfs_trans_handle *trans,
			 struct btrfs_device *device, u64 num_bytes,
			 u64 *start, u64 *max_avail)
{
	struct btrfs_key key;
	struct btrfs_root *root = device->dev_root;
	struct btrfs_dev_extent *dev_extent = NULL;
	struct btrfs_path *path;
	u64 hole_size = 0;
	u64 last_byte = 0;
	u64 search_start = 0;
	u64 search_end = device->total_bytes;
	int ret;
	int slot = 0;
	int start_found;
	struct extent_buffer *l;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = 2;
	start_found = 0;

	/* FIXME use last free of some kind */

	/* we don't want to overwrite the superblock on the drive,
	 * so we make sure to start at an offset of at least 1MB
	 */
	search_start = max((u64)1024 * 1024, search_start);

	if (root->fs_info->alloc_start + num_bytes <= device->total_bytes)
		search_start = max(root->fs_info->alloc_start, search_start);

	key.objectid = device->devid;
	key.offset = search_start;
	key.type = BTRFS_DEV_EXTENT_KEY;
	ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
	if (ret < 0)
		goto error;
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid, key.type);
		if (ret < 0)
			goto error;
		if (ret > 0)
			start_found = 1;
	}
	l = path->nodes[0];
	btrfs_item_key_to_cpu(l, &key, path->slots[0]);
	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto error;
no_more_items:
			if (!start_found) {
				if (search_start >= search_end) {
					ret = -ENOSPC;
					goto error;
				}
				*start = search_start;
				start_found = 1;
				goto check_pending;
			}
			*start = last_byte > search_start ?
				last_byte : search_start;
			if (search_end <= *start) {
				ret = -ENOSPC;
				goto error;
			}
			goto check_pending;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			goto no_more_items;

		if (key.offset >= search_start && key.offset > last_byte &&
		    start_found) {
			if (last_byte < search_start)
				last_byte = search_start;
			hole_size = key.offset - last_byte;

			if (hole_size > *max_avail)
				*max_avail = hole_size;

			if (key.offset > last_byte &&
			    hole_size >= num_bytes) {
				*start = last_byte;
				goto check_pending;
			}
		}
		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
			goto next;

		start_found = 1;
		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent);
next:
		path->slots[0]++;
		cond_resched();
	}
check_pending:
	/* we have to make sure we didn't find an extent that has already
	 * been allocated by the map tree or the original allocation
	 */
	BUG_ON(*start < search_start);

	if (*start + num_bytes > search_end) {
		ret = -ENOSPC;
		goto error;
	}
	/* check for pending inserts here */
	ret = 0;

error:
	btrfs_free_path(path);
	return ret;
}

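/*
 * remove the dev extent covering @start from the device tree and give
 * the space back to the device's bytes_used accounting
 */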
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
			  struct btrfs_device *device,
			  u64 start)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_root *root = device->dev_root;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf = NULL;
	struct btrfs_dev_extent *extent = NULL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid,
					  BTRFS_DEV_EXTENT_KEY);
		BUG_ON(ret);
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
		BUG_ON(found_key.offset > start || found_key.offset +
		       btrfs_dev_extent_length(leaf, extent) < start);
		ret = 0;
	} else if (ret == 0) {
		leaf = path->nodes[0];
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
	}
	BUG_ON(ret);

	if (device->bytes_used > 0)
		device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
	ret = btrfs_del_item(trans, root, path);
	BUG_ON(ret);

	btrfs_free_path(path);
	return ret;
}

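/*
 * insert a dev extent item recording that [start, start + num_bytes)
 * on this device now backs the given chunk
 */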
int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
			   struct btrfs_device *device,
			   u64 chunk_tree, u64 chunk_objectid,
			   u64 chunk_offset, u64 start, u64 num_bytes)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_root *root = device->dev_root;
	struct btrfs_dev_extent *extent;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	WARN_ON(!device->in_fs_metadata);
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*extent));
	BUG_ON(ret);

	leaf = path->nodes[0];
	extent = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_dev_extent);
	btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
	btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);

	write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
		    (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
		    BTRFS_UUID_SIZE);

	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_free_path(path);
	return ret;
}

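/*
 * find the logical offset just past the last chunk with this objectid,
 * which is where a new chunk can be placed
 */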
static noinline int find_next_chunk(struct btrfs_root *root,
				    u64 objectid, u64 *offset)
{
	struct btrfs_path *path;
	int ret;
	struct btrfs_key key;
	struct btrfs_chunk *chunk;
	struct btrfs_key found_key;

	path = btrfs_alloc_path();
	BUG_ON(!path);

	key.objectid = objectid;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto error;

	BUG_ON(ret == 0);

	ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
	if (ret) {
		*offset = 0;
	} else {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		if (found_key.objectid != objectid)
			*offset = 0;
		else {
			chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
					       struct btrfs_chunk);
			*offset = found_key.offset +
				btrfs_chunk_length(path->nodes[0], chunk);
		}
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

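/* pick the next devid to assign to a newly added device */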
static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;

	root = root->fs_info->chunk_root;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;