/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <asm/div64.h>
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "math.h"
#include "dev-replace.h"
#include "sysfs.h"

const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = {
		.sub_stripes	= 2,
		.dev_stripes	= 1,
		.devs_max	= 0,	/* 0 == as many as possible */
		.devs_min	= 4,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
	},
	[BTRFS_RAID_RAID1] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 2,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
	},
	[BTRFS_RAID_DUP] = {
		.sub_stripes	= 1,
		.dev_stripes	= 2,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 2,
	},
	[BTRFS_RAID_RAID0] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
	},
	[BTRFS_RAID_SINGLE] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
	},
	[BTRFS_RAID_RAID5] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 1,
		.ncopies	= 2,
	},
	[BTRFS_RAID_RAID6] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 1,
		.ncopies	= 3,
	},
};
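
/*
 * Example (illustrative): the table above is indexed by the BTRFS_RAID_*
 * enum, so a profile's parameters can be read directly, e.g.
 * btrfs_raid_array[BTRFS_RAID_RAID10].ncopies is 2 and
 * btrfs_raid_array[BTRFS_RAID_RAID10].devs_min is 4.
 */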

const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10,
	[BTRFS_RAID_RAID1]  = BTRFS_BLOCK_GROUP_RAID1,
	[BTRFS_RAID_DUP]    = BTRFS_BLOCK_GROUP_DUP,
	[BTRFS_RAID_RAID0]  = BTRFS_BLOCK_GROUP_RAID0,
	[BTRFS_RAID_SINGLE] = 0,
	[BTRFS_RAID_RAID5]  = BTRFS_BLOCK_GROUP_RAID5,
	[BTRFS_RAID_RAID6]  = BTRFS_BLOCK_GROUP_RAID6,
};

/*
 * Table to convert BTRFS_RAID_* to the error code if the minimum number of
 * devices condition is not met. Zero means there's no corresponding
 * BTRFS_ERROR_DEV_*_NOT_MET value.
 */
const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
	[BTRFS_RAID_RAID1]  = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
	[BTRFS_RAID_DUP]    = 0,
	[BTRFS_RAID_RAID0]  = 0,
	[BTRFS_RAID_SINGLE] = 0,
	[BTRFS_RAID_RAID5]  = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
	[BTRFS_RAID_RAID6]  = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
};
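
/*
 * Example (illustrative): when a RAID10 array cannot meet its
 * devs_min == 4 requirement, the caller reports
 * btrfs_raid_mindev_error[BTRFS_RAID_RAID10], i.e.
 * BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET; profiles mapped to zero have no
 * dedicated error code.
 */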

static int init_first_rw_device(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct btrfs_device *device);
static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static void btrfs_close_one_device(struct btrfs_device *device);

DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);

struct list_head *btrfs_get_fs_uuids(void)
{
	return &fs_uuids;
}

static struct btrfs_fs_devices *__alloc_fs_devices(void)
{
	struct btrfs_fs_devices *fs_devs;

	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
	if (!fs_devs)
		return ERR_PTR(-ENOMEM);

	mutex_init(&fs_devs->device_list_mutex);

	INIT_LIST_HEAD(&fs_devs->devices);
	INIT_LIST_HEAD(&fs_devs->resized_devices);
	INIT_LIST_HEAD(&fs_devs->alloc_list);
	INIT_LIST_HEAD(&fs_devs->list);

	return fs_devs;
}

/**
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid:	a pointer to UUID for this FS.  If NULL a new UUID is
 *		generated.
 *
 * Return: a pointer to a new &struct btrfs_fs_devices on success;
 * ERR_PTR() on error.  Returned struct is not linked onto any lists and
 * can be destroyed with kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
{
	struct btrfs_fs_devices *fs_devs;

	fs_devs = __alloc_fs_devices();
	if (IS_ERR(fs_devs))
		return fs_devs;

	if (fsid)
		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
	else
		generate_random_uuid(fs_devs->fsid);

	return fs_devs;
}
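
/*
 * Illustrative use (matches device_list_add() below): registering a newly
 * scanned fsid boils down to
 *
 *	fs_devices = alloc_fs_devices(disk_super->fsid);
 *	if (IS_ERR(fs_devices))
 *		return PTR_ERR(fs_devices);
 */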

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;
	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		rcu_string_free(device->name);
		kfree(device);
	}
	kfree(fs_devices);
}

static void btrfs_kobject_uevent(struct block_device *bdev,
				 enum kobject_action action)
{
	int ret;

	ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
	if (ret)
		pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
			action,
			kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
			&disk_to_dev(bdev->bd_disk)->kobj);
}

void btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *fs_devices;

	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, list);
		list_del(&fs_devices->list);
		free_fs_devices(fs_devices);
	}
}

static struct btrfs_device *__alloc_device(void)
{
	struct btrfs_device *dev;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&dev->dev_list);
	INIT_LIST_HEAD(&dev->dev_alloc_list);
	INIT_LIST_HEAD(&dev->resized_list);

	spin_lock_init(&dev->io_lock);

	spin_lock_init(&dev->reada_lock);
	atomic_set(&dev->reada_in_flight, 0);
	atomic_set(&dev->dev_stats_ccnt, 0);
	btrfs_device_data_ordered_init(dev);
	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);

	return dev;
}

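/*
 * Search a device list for the device with a matching devid and, when
 * @uuid is non-NULL, a matching device uuid.  Returns NULL if no such
 * device is registered.
 */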
static noinline struct btrfs_device *__find_device(struct list_head *head,
						   u64 devid, u8 *uuid)
{
	struct btrfs_device *dev;

	list_for_each_entry(dev, head, dev_list) {
		if (dev->devid == devid &&
		    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
			return dev;
		}
	}
	return NULL;
}

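/*
 * Look up the btrfs_fs_devices whose fsid matches @fsid on the global
 * fs_uuids list; returns NULL if this filesystem has not been seen yet.
 */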
static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, list) {
		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}
	return NULL;
}

static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
		      int flush, struct block_device **bdev,
		      struct buffer_head **bh)
{
	int ret;

	*bdev = blkdev_get_by_path(device_path, flags, holder);

	if (IS_ERR(*bdev)) {
		ret = PTR_ERR(*bdev);
		goto error;
	}

	if (flush)
		filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
	ret = set_blocksize(*bdev, 4096);
	if (ret) {
		blkdev_put(*bdev, flags);
		goto error;
	}
	invalidate_bdev(*bdev);
	*bh = btrfs_read_dev_super(*bdev);
	if (IS_ERR(*bh)) {
		ret = PTR_ERR(*bh);
		blkdev_put(*bdev, flags);
		goto error;
	}

	return 0;

error:
	*bdev = NULL;
	*bh = NULL;
	return ret;
}
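
/*
 * Illustrative use (see __btrfs_open_devices() below): callers open the
 * device and request a flush so the superblock is read from a clean page
 * cache:
 *
 *	if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
 *				  &bdev, &bh))
 *		continue;
 */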

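/*
 * Put a partially processed list of bios (head..tail) back on the front of
 * the device's pending list so they are retried before anything newer.
 */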
static void requeue_list(struct btrfs_pending_bios *pending_bios,
			struct bio *head, struct bio *tail)
{

	struct bio *old_head;

	old_head = pending_bios->head;
	pending_bios->head = head;
	if (pending_bios->tail)
		tail->bi_next = old_head;
	else
		pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
	struct bio *pending;
	struct backing_dev_info *bdi;
	struct btrfs_fs_info *fs_info;
	struct btrfs_pending_bios *pending_bios;
	struct bio *tail;
	struct bio *cur;
	int again = 0;
	unsigned long num_run;
	unsigned long batch_run = 0;
	unsigned long limit;
	unsigned long last_waited = 0;
	int force_reg = 0;
	int sync_pending = 0;
	struct blk_plug plug;

	/*
	 * this function runs all the bios we've collected for
	 * a particular device.  We don't want to wander off to
	 * another device without first sending all of these down.
	 * So, setup a plug here and finish it off before we return
	 */
	blk_start_plug(&plug);

	bdi = blk_get_backing_dev_info(device->bdev);
	fs_info = device->dev_root->fs_info;
	limit = btrfs_async_submit_limit(fs_info);
	limit = limit * 2 / 3;

loop:
	spin_lock(&device->io_lock);

loop_lock:
	num_run = 0;

	/* take all the bios off the list at once and process them
	 * later on (without the lock held).  But, remember the
	 * tail and other pointers so the bios can be properly reinserted
	 * into the list if we hit congestion
	 */
	if (!force_reg && device->pending_sync_bios.head) {
		pending_bios = &device->pending_sync_bios;
		force_reg = 1;
	} else {
		pending_bios = &device->pending_bios;
		force_reg = 0;
	}

	pending = pending_bios->head;
	tail = pending_bios->tail;
	WARN_ON(pending && !tail);

	/*
	 * if pending was null this time around, no bios need processing
	 * at all and we can stop.  Otherwise it'll loop back up again
	 * and do an additional check so no bios are missed.
	 *
	 * device->running_pending is used to synchronize with the
	 * schedule_bio code.
	 */
	if (device->pending_sync_bios.head == NULL &&
	    device->pending_bios.head == NULL) {
		again = 0;
		device->running_pending = 0;
	} else {
		again = 1;
		device->running_pending = 1;
	}

	pending_bios->head = NULL;
	pending_bios->tail = NULL;

	spin_unlock(&device->io_lock);

	while (pending) {

		rmb();
		/* we want to work on both lists, but do more bios on the
		 * sync list than the regular list
		 */
		if ((num_run > 32 &&
		    pending_bios != &device->pending_sync_bios &&
		    device->pending_sync_bios.head) ||
		   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
		    device->pending_bios.head)) {
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			goto loop_lock;
		}

		cur = pending;
		pending = pending->bi_next;
		cur->bi_next = NULL;

		/*
		 * atomic_dec_return implies a barrier for waitqueue_active
		 */
		if (atomic_dec_return(&fs_info->nr_async_bios) < limit &&
		    waitqueue_active(&fs_info->async_submit_wait))
			wake_up(&fs_info->async_submit_wait);

		BUG_ON(atomic_read(&cur->__bi_cnt) == 0);

		/*
		 * if we're doing the sync list, record that our
		 * plug has some sync requests on it
		 *
		 * If we're doing the regular list and there are
		 * sync requests sitting around, unplug before
		 * we add more
		 */
		if (pending_bios == &device->pending_sync_bios) {
			sync_pending = 1;
		} else if (sync_pending) {
			blk_finish_plug(&plug);
			blk_start_plug(&plug);
			sync_pending = 0;
		}

		btrfsic_submit_bio(cur->bi_rw, cur);
		num_run++;
		batch_run++;

		cond_resched();

		/*
		 * we made progress, there is more work to do and the bdi
		 * is now congested.  Back off and let other work structs
		 * run instead
		 */
		if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
		    fs_info->fs_devices->open_devices > 1) {
			struct io_context *ioc;

			ioc = current->io_context;

			/*
			 * the main goal here is that we don't want to
			 * block if we're going to be able to submit
			 * more requests without blocking.
			 *
			 * This code does two great things, it pokes into
			 * the elevator code from a filesystem _and_
			 * it makes assumptions about how batching works.
			 */
			if (ioc && ioc->nr_batch_requests > 0 &&
			    time_before(jiffies, ioc->last_waited + HZ/50UL) &&
			    (last_waited == 0 ||
			     ioc->last_waited == last_waited)) {
				/*
				 * we want to go through our batch of
				 * requests and stop.  So, we copy out
				 * the ioc->last_waited time and test
				 * against it before looping
				 */
				last_waited = ioc->last_waited;
				cond_resched();
				continue;
			}
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			device->running_pending = 1;

			spin_unlock(&device->io_lock);
			btrfs_queue_work(fs_info->submit_workers,
					 &device->work);
			goto done;
		}
		/* unplug every 64 requests just for good measure */
		if (batch_run % 64 == 0) {
			blk_finish_plug(&plug);
			blk_start_plug(&plug);
			sync_pending = 0;
		}
	}

	cond_resched();
	if (again)
		goto loop;

	spin_lock(&device->io_lock);
	if (device->pending_bios.head || device->pending_sync_bios.head)
		goto loop_lock;
	spin_unlock(&device->io_lock);

done:
	blk_finish_plug(&plug);
}

static void pending_bios_fn(struct btrfs_work *work)
{
	struct btrfs_device *device;

	device = container_of(work, struct btrfs_device, work);
	run_scheduled_bios(device);
}


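/*
 * Drop stale entries sharing @cur_dev's path from other unmounted,
 * non-seed fs_devices so a device path stays registered under a single
 * filesystem at a time.
 */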
void btrfs_free_stale_device(struct btrfs_device *cur_dev)
{
	struct btrfs_fs_devices *fs_devs;
	struct btrfs_device *dev;

	if (!cur_dev->name)
		return;

	list_for_each_entry(fs_devs, &fs_uuids, list) {
		int del = 1;

		if (fs_devs->opened)
			continue;
		if (fs_devs->seeding)
			continue;

		list_for_each_entry(dev, &fs_devs->devices, dev_list) {

			if (dev == cur_dev)
				continue;
			if (!dev->name)
				continue;

			/*
			 * Todo: This won't be enough. What if the same device
			 * comes back (with new uuid and) with its mapper path?
			 * But for now, this does help as mostly an admin will
			 * either use mapper or non mapper path throughout.
			 */
			rcu_read_lock();
			del = strcmp(rcu_str_deref(dev->name),
						rcu_str_deref(cur_dev->name));
			rcu_read_unlock();
			if (!del)
				break;
		}

		if (!del) {
			/* delete the stale device */
			if (fs_devs->num_devices == 1) {
				btrfs_sysfs_remove_fsid(fs_devs);
				list_del(&fs_devs->list);
				free_fs_devices(fs_devs);
			} else {
				fs_devs->num_devices--;
				list_del(&dev->dev_list);
				rcu_string_free(dev->name);
				kfree(dev);
			}
			break;
		}
	}
}

/*
 * Add new device to list of registered devices
 *
 * Returns:
 * 1   - first time device is seen
 * 0   - device already known
 * < 0 - error
 */
static noinline int device_list_add(const char *path,
			   struct btrfs_super_block *disk_super,
			   u64 devid, struct btrfs_fs_devices **fs_devices_ret)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices;
	struct rcu_string *name;
	int ret = 0;
	u64 found_transid = btrfs_super_generation(disk_super);

	fs_devices = find_fsid(disk_super->fsid);
	if (!fs_devices) {
		fs_devices = alloc_fs_devices(disk_super->fsid);
		if (IS_ERR(fs_devices))
			return PTR_ERR(fs_devices);

		list_add(&fs_devices->list, &fs_uuids);

		device = NULL;
	} else {
		device = __find_device(&fs_devices->devices, devid,
				       disk_super->dev_item.uuid);
	}

	if (!device) {
		if (fs_devices->opened)
			return -EBUSY;

		device = btrfs_alloc_device(NULL, &devid,
					    disk_super->dev_item.uuid);
		if (IS_ERR(device)) {
			/* we can safely leave the fs_devices entry around */
			return PTR_ERR(device);
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			kfree(device);
			return -ENOMEM;
		}
		rcu_assign_pointer(device->name, name);

		mutex_lock(&fs_devices->device_list_mutex);
		list_add_rcu(&device->dev_list, &fs_devices->devices);
		fs_devices->num_devices++;
		mutex_unlock(&fs_devices->device_list_mutex);

		ret = 1;
		device->fs_devices = fs_devices;
	} else if (!device->name || strcmp(device->name->str, path)) {
		/*
		 * The FS is already mounted.
		 * 1. If you are here and if the device->name is NULL, that
		 *    means this device was missing at the time of FS mount.
		 * 2. If you are here and if the device->name is different
		 *    from 'path', that means either
		 *      a. The same device disappeared and reappeared with a
		 *         different name, or
		 *      b. The missing-disk-which-was-replaced has
		 *         reappeared now.
		 *
		 * We must allow 1 and 2a above. But 2b would be spurious
		 * and unintentional.
		 *
		 * Further, in case of 1 and 2a above, the disk at 'path'
		 * would have missed some transactions while it was away,
		 * and in case of 2a the stale bdev has to be updated as
		 * well.  2b must not be allowed at any time.
		 */

		/*
		 * For now, we do allow update to btrfs_fs_device through the
		 * btrfs dev scan cli after FS has been mounted.  We're still
		 * tracking a problem where systems fail mount by subvolume id
		 * when we reject replacement on a mounted FS.
		 */
		if (!fs_devices->opened && found_transid < device->generation) {
			/*
			 * That is, if the FS is _not_ mounted and if you
			 * are here, that means there is more than one
			 * disk with the same uuid and devid.  We keep the
			 * one with the larger generation number or the
			 * last-in if the generations are equal.
			 */
			return -EEXIST;
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name)
			return -ENOMEM;
		rcu_string_free(device->name);
		rcu_assign_pointer(device->name, name);
		if (device->missing) {
			fs_devices->missing_devices--;
			device->missing = 0;
		}
	}

	/*
	 * Unmount does not free the btrfs_device struct but would zero
	 * generation along with most of the other members. So just update
	 * it back. We need it to pick the disk with largest generation
	 * (as above).
	 */
	if (!fs_devices->opened)
		device->generation = found_transid;

	/*
	 * if there is new btrfs on an already registered device,
	 * then remove the stale device entry.
	 */
	if (ret > 0)
		btrfs_free_stale_device(device);

	*fs_devices_ret = fs_devices;

	return ret;
}
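
/*
 * Illustrative call sequence (hypothetical caller), assuming a superblock
 * was just read off a scanned device:
 *
 *	devid = btrfs_stack_device_id(&disk_super->dev_item);
 *	ret = device_list_add(path, disk_super, devid, &fs_devices);
 *
 * where ret > 0 means the device was seen for the first time.
 */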

static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;

	fs_devices = alloc_fs_devices(orig->fsid);
	if (IS_ERR(fs_devices))
		return fs_devices;

	mutex_lock(&orig->device_list_mutex);
	fs_devices->total_devices = orig->total_devices;

	/* We have held the volume lock, it is safe to get the devices. */
	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
		struct rcu_string *name;

		device = btrfs_alloc_device(NULL, &orig_dev->devid,
					    orig_dev->uuid);
		if (IS_ERR(device))
			goto error;

		/*
		 * This is ok to do without the rcu read lock held because we
		 * hold the uuid mutex so nothing we touch in here is going
		 * to disappear.
		 */
		if (orig_dev->name) {
			name = rcu_string_strdup(orig_dev->name->str,
					GFP_KERNEL);
			if (!name) {
				kfree(device);
				goto error;
			}
			rcu_assign_pointer(device->name, name);
		}

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
	mutex_unlock(&orig->device_list_mutex);
	return fs_devices;
error:
	mutex_unlock(&orig->device_list_mutex);
	free_fs_devices(fs_devices);
	return ERR_PTR(-ENOMEM);
}

void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step)
{
	struct btrfs_device *device, *next;
	struct btrfs_device *latest_dev = NULL;

	mutex_lock(&uuid_mutex);
again:
	/* This is the initialized path, it is safe to release the devices. */
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
		if (device->in_fs_metadata) {
			if (!device->is_tgtdev_for_dev_replace &&
			    (!latest_dev ||
			     device->generation > latest_dev->generation)) {
				latest_dev = device;
			}
			continue;
		}

		if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
			/*
			 * In the first step, keep the device which has
			 * the correct fsid and the devid that is used
			 * for the dev_replace procedure.
			 * In the second step, the dev_replace state is
			 * read from the device tree and it is known
			 * whether the procedure is really active or
			 * not, which means whether this device is
			 * used or whether it should be removed.
			 */
			if (step == 0 || device->is_tgtdev_for_dev_replace) {
				continue;
			}
		}
		if (device->bdev) {
			blkdev_put(device->bdev, device->mode);
			device->bdev = NULL;
			fs_devices->open_devices--;
		}
		if (device->writeable) {
			list_del_init(&device->dev_alloc_list);
			device->writeable = 0;
			if (!device->is_tgtdev_for_dev_replace)
				fs_devices->rw_devices--;
		}
		list_del_init(&device->dev_list);
		fs_devices->num_devices--;
		rcu_string_free(device->name);
		kfree(device);
	}

	if (fs_devices->seed) {
		fs_devices = fs_devices->seed;
		goto again;
	}

	fs_devices->latest_bdev = latest_dev->bdev;

	mutex_unlock(&uuid_mutex);
}

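/*
 * Device teardown is split in two stages: free_device() below runs as the
 * RCU callback and only schedules __free_device() on a workqueue, so that
 * blkdev_put(), which may sleep, is never called from atomic RCU callback
 * context.
 */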
static void __free_device(struct work_struct *work)
{
	struct btrfs_device *device;

	device = container_of(work, struct btrfs_device, rcu_work);

	if (device->bdev)
		blkdev_put(device->bdev, device->mode);

	rcu_string_free(device->name);
	kfree(device);
}

static void free_device(struct rcu_head *head)
{
	struct btrfs_device *device;

	device = container_of(head, struct btrfs_device, rcu);

	INIT_WORK(&device->rcu_work, __free_device);
	schedule_work(&device->rcu_work);
}

static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device, *tmp;

	if (--fs_devices->opened > 0)
		return 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
		btrfs_close_one_device(device);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	WARN_ON(fs_devices->open_devices);
	WARN_ON(fs_devices->rw_devices);
	fs_devices->opened = 0;
	fs_devices->seeding = 0;

	return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_fs_devices *seed_devices = NULL;
	int ret;

	mutex_lock(&uuid_mutex);
	ret = __btrfs_close_devices(fs_devices);
	if (!fs_devices->opened) {
		seed_devices = fs_devices->seed;
		fs_devices->seed = NULL;
	}
	mutex_unlock(&uuid_mutex);

	while (seed_devices) {
		fs_devices = seed_devices;
		seed_devices = fs_devices->seed;
		__btrfs_close_devices(fs_devices);
		free_fs_devices(fs_devices);
	}
	/*
	 * Wait for rcu kworkers under __btrfs_close_devices
	 * to finish all blkdev_puts so device is really
	 * free when umount is done.
	 */
	rcu_barrier();
	return ret;
}

static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
				fmode_t flags, void *holder)
{
	struct request_queue *q;
	struct block_device *bdev;
	struct list_head *head = &fs_devices->devices;
	struct btrfs_device *device;
	struct btrfs_device *latest_dev = NULL;
	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;
	u64 devid;
	int seeding = 1;
	int ret = 0;

	flags |= FMODE_EXCL;

	list_for_each_entry(device, head, dev_list) {
		if (device->bdev)
			continue;
		if (!device->name)
			continue;

		/* Just open everything we can; ignore failures here */
		if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
					    &bdev, &bh))
			continue;

		disk_super = (struct btrfs_super_block *)bh->b_data;
		devid = btrfs_stack_device_id(&disk_super->dev_item);
		if (devid != device->devid)
			goto error_brelse;

		if (memcmp(device->uuid, disk_super->dev_item.uuid,
			   BTRFS_UUID_SIZE))
			goto error_brelse;

		device->generation = btrfs_super_generation(disk_super);
		if (!latest_dev ||
		    device->generation > latest_dev->generation)
			latest_dev = device;

		if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
			device->writeable = 0;
		} else {
			device->writeable = !bdev_read_only(bdev);
			seeding = 0;
		}

		q = bdev_get_queue(bdev);
		if (blk_queue_discard(q))
			device->can_discard = 1;

		device->bdev = bdev;
		device->in_fs_metadata = 0;
		device->mode = flags;

		if (!blk_queue_nonrot(bdev_get_queue(bdev)))
			fs_devices->rotating = 1;

		fs_devices->open_devices++;
		if (device->writeable &&
		    device->devid != BTRFS_DEV_REPLACE_DEVID) {
			fs_devices->rw_devices++;
			list_add(&device->dev_alloc_list,
				 &fs_devices->alloc_list);
		}
		brelse(bh);
		continue;

error_brelse:
		brelse(bh);
		blkdev_put(bdev, flags);
		continue;
	}
	if (fs_devices->open_devices == 0) {
		ret = -EINVAL;
		goto out;
	}
	fs_devices->seeding = seeding;
	fs_devices->opened = 1;
	fs_devices->latest_bdev = latest_dev->bdev;
	fs_devices->total_rw_bytes = 0;
out:
	return ret;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
		       fmode_t flags, void *holder)
{
	int ret;

	mutex_lock(&uuid_mutex);
	if (fs_devices->opened) {
		fs_devices->opened++;
		ret = 0;
	} else {