/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

Chris Mason's avatar
Chris Mason committed
19
#include <linux/fs.h>
20
#include <linux/slab.h>
Chris Mason's avatar
Chris Mason committed
21
#include <linux/sched.h>
22
#include <linux/writeback.h>
23
#include <linux/pagemap.h>
24
#include <linux/blkdev.h>
25
#include <linux/uuid.h>
Chris Mason's avatar
Chris Mason committed
26
27
28
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
29
#include "locking.h"
30
#include "tree-log.h"
31
#include "inode-map.h"
32
#include "volumes.h"
33
#include "dev-replace.h"
Josef Bacik's avatar
Josef Bacik committed
34
#include "qgroup.h"
Chris Mason's avatar
Chris Mason committed
35

36
37
#define BTRFS_ROOT_TRANS_TAG 0

38
static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
	[TRANS_STATE_RUNNING]		= 0U,
	[TRANS_STATE_BLOCKED]		= (__TRANS_USERSPACE |
					   __TRANS_START),
	[TRANS_STATE_COMMIT_START]	= (__TRANS_USERSPACE |
					   __TRANS_START |
					   __TRANS_ATTACH),
	[TRANS_STATE_COMMIT_DOING]	= (__TRANS_USERSPACE |
					   __TRANS_START |
					   __TRANS_ATTACH |
					   __TRANS_JOIN),
	[TRANS_STATE_UNBLOCKED]		= (__TRANS_USERSPACE |
					   __TRANS_START |
					   __TRANS_ATTACH |
					   __TRANS_JOIN |
					   __TRANS_JOIN_NOLOCK),
	[TRANS_STATE_COMPLETED]		= (__TRANS_USERSPACE |
					   __TRANS_START |
					   __TRANS_ATTACH |
					   __TRANS_JOIN |
					   __TRANS_JOIN_NOLOCK),
};

61
void btrfs_put_transaction(struct btrfs_transaction *transaction)
Chris Mason's avatar
Chris Mason committed
62
{
63
64
	WARN_ON(atomic_read(&transaction->use_count) == 0);
	if (atomic_dec_and_test(&transaction->use_count)) {
Josef Bacik's avatar
Josef Bacik committed
65
		BUG_ON(!list_empty(&transaction->list));
Liu Bo's avatar
Liu Bo committed
66
		WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root));
67
68
69
		if (transaction->delayed_refs.pending_csums)
			printk(KERN_ERR "pending csums is %llu\n",
			       transaction->delayed_refs.pending_csums);
70
71
72
73
74
75
76
77
		while (!list_empty(&transaction->pending_chunks)) {
			struct extent_map *em;

			em = list_first_entry(&transaction->pending_chunks,
					      struct extent_map, list);
			list_del_init(&em->list);
			free_extent_map(em);
		}
Chris Mason's avatar
Chris Mason committed
78
		kmem_cache_free(btrfs_transaction_cachep, transaction);
Chris Mason's avatar
Chris Mason committed
79
	}
Chris Mason's avatar
Chris Mason committed
80
81
}

82
83
84
static void clear_btree_io_tree(struct extent_io_tree *tree)
{
	spin_lock(&tree->lock);
85
86
87
88
89
90
	/*
	 * Do a single barrier for the waitqueue_active check here, the state
	 * of the waitqueue should not change once clear_btree_io_tree is
	 * called.
	 */
	smp_mb();
91
92
93
94
95
96
97
98
99
100
101
102
103
104
	while (!RB_EMPTY_ROOT(&tree->state)) {
		struct rb_node *node;
		struct extent_state *state;

		node = rb_first(&tree->state);
		state = rb_entry(node, struct extent_state, rb_node);
		rb_erase(&state->rb_node, &tree->state);
		RB_CLEAR_NODE(&state->rb_node);
		/*
		 * btree io trees aren't supposed to have tasks waiting for
		 * changes in the flags of extent states ever.
		 */
		ASSERT(!waitqueue_active(&state->wq));
		free_extent_state(state);
105
106

		cond_resched_lock(&tree->lock);
107
108
109
110
	}
	spin_unlock(&tree->lock);
}

111
112
static noinline void switch_commit_roots(struct btrfs_transaction *trans,
					 struct btrfs_fs_info *fs_info)
Josef Bacik's avatar
Josef Bacik committed
113
{
114
115
116
117
118
119
120
121
122
123
	struct btrfs_root *root, *tmp;

	down_write(&fs_info->commit_root_sem);
	list_for_each_entry_safe(root, tmp, &trans->switch_commits,
				 dirty_list) {
		list_del_init(&root->dirty_list);
		free_extent_buffer(root->commit_root);
		root->commit_root = btrfs_root_node(root);
		if (is_fstree(root->objectid))
			btrfs_unpin_free_ino(root);
124
		clear_btree_io_tree(&root->dirty_log_pages);
125
	}
126
127
128
129
130
131
132
133
134
135
136
137

	/* We can free old roots now. */
	spin_lock(&trans->dropped_roots_lock);
	while (!list_empty(&trans->dropped_roots)) {
		root = list_first_entry(&trans->dropped_roots,
					struct btrfs_root, root_list);
		list_del_init(&root->root_list);
		spin_unlock(&trans->dropped_roots_lock);
		btrfs_drop_and_free_fs_root(fs_info, root);
		spin_lock(&trans->dropped_roots_lock);
	}
	spin_unlock(&trans->dropped_roots_lock);
138
	up_write(&fs_info->commit_root_sem);
Josef Bacik's avatar
Josef Bacik committed
139
140
}

141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
/*
 * Bump trans->num_extwriters, but only for handle types that are part of
 * TRANS_EXTWRITERS.
 */
static inline void extwriter_counter_inc(struct btrfs_transaction *trans,
					 unsigned int type)
{
	if (!(type & TRANS_EXTWRITERS))
		return;

	atomic_inc(&trans->num_extwriters);
}

/*
 * Lower trans->num_extwriters, but only for handle types that are part of
 * TRANS_EXTWRITERS.
 */
static inline void extwriter_counter_dec(struct btrfs_transaction *trans,
					 unsigned int type)
{
	if (!(type & TRANS_EXTWRITERS))
		return;

	atomic_dec(&trans->num_extwriters);
}

/*
 * Set trans->num_extwriters to its starting value: 1 when @type is part of
 * TRANS_EXTWRITERS, 0 otherwise.
 */
static inline void extwriter_counter_init(struct btrfs_transaction *trans,
					  unsigned int type)
{
	int first_writer = 0;

	if (type & TRANS_EXTWRITERS)
		first_writer = 1;

	atomic_set(&trans->num_extwriters, first_writer);
}

/* Return the current value of trans->num_extwriters. */
static inline int extwriter_counter_read(struct btrfs_transaction *trans)
{
	return atomic_read(&trans->num_extwriters);
}

Chris Mason's avatar
Chris Mason committed
166
167
168
/*
 * either allocate a new transaction or hop into the existing one
 */
169
static noinline int join_transaction(struct btrfs_root *root, unsigned int type)
Chris Mason's avatar
Chris Mason committed
170
171
{
	struct btrfs_transaction *cur_trans;
172
	struct btrfs_fs_info *fs_info = root->fs_info;
Josef Bacik's avatar
Josef Bacik committed
173

174
	spin_lock(&fs_info->trans_lock);
175
loop:
176
	/* The file system has been taken offline. No new transactions. */
177
	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
178
		spin_unlock(&fs_info->trans_lock);
179
180
181
		return -EROFS;
	}

182
	cur_trans = fs_info->running_transaction;
Josef Bacik's avatar
Josef Bacik committed
183
	if (cur_trans) {
184
		if (cur_trans->aborted) {
185
			spin_unlock(&fs_info->trans_lock);
186
			return cur_trans->aborted;
187
		}
188
		if (btrfs_blocked_trans_types[cur_trans->state] & type) {
189
190
191
			spin_unlock(&fs_info->trans_lock);
			return -EBUSY;
		}
Josef Bacik's avatar
Josef Bacik committed
192
		atomic_inc(&cur_trans->use_count);
193
		atomic_inc(&cur_trans->num_writers);
194
		extwriter_counter_inc(cur_trans, type);
195
		spin_unlock(&fs_info->trans_lock);
Josef Bacik's avatar
Josef Bacik committed
196
		return 0;
Chris Mason's avatar
Chris Mason committed
197
	}
198
	spin_unlock(&fs_info->trans_lock);
Josef Bacik's avatar
Josef Bacik committed
199

200
201
202
203
204
205
206
	/*
	 * If we are ATTACH, we just want to catch the current transaction,
	 * and commit it. If there is no transaction, just return ENOENT.
	 */
	if (type == TRANS_ATTACH)
		return -ENOENT;

207
208
209
210
211
212
	/*
	 * JOIN_NOLOCK only happens during the transaction commit, so
	 * it is impossible that ->running_transaction is NULL
	 */
	BUG_ON(type == TRANS_JOIN_NOLOCK);

Josef Bacik's avatar
Josef Bacik committed
213
214
215
	cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
	if (!cur_trans)
		return -ENOMEM;
216

217
218
	spin_lock(&fs_info->trans_lock);
	if (fs_info->running_transaction) {
219
220
		/*
		 * someone started a transaction after we unlocked.  Make sure
221
		 * to redo the checks above
222
		 */
Josef Bacik's avatar
Josef Bacik committed
223
		kmem_cache_free(btrfs_transaction_cachep, cur_trans);
224
		goto loop;
225
	} else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
226
		spin_unlock(&fs_info->trans_lock);
227
228
		kmem_cache_free(btrfs_transaction_cachep, cur_trans);
		return -EROFS;
Chris Mason's avatar
Chris Mason committed
229
	}
230

Josef Bacik's avatar
Josef Bacik committed
231
	atomic_set(&cur_trans->num_writers, 1);
232
	extwriter_counter_init(cur_trans, type);
Josef Bacik's avatar
Josef Bacik committed
233
234
	init_waitqueue_head(&cur_trans->writer_wait);
	init_waitqueue_head(&cur_trans->commit_wait);
235
	init_waitqueue_head(&cur_trans->pending_wait);
236
	cur_trans->state = TRANS_STATE_RUNNING;
Josef Bacik's avatar
Josef Bacik committed
237
238
239
240
241
	/*
	 * One for this trans handle, one so it will live on until we
	 * commit the transaction.
	 */
	atomic_set(&cur_trans->use_count, 2);
Zhao Lei's avatar
Zhao Lei committed
242
	cur_trans->have_free_bgs = 0;
243
	atomic_set(&cur_trans->pending_ordered, 0);
Josef Bacik's avatar
Josef Bacik committed
244
	cur_trans->start_time = get_seconds();
245
	cur_trans->dirty_bg_run = 0;
Josef Bacik's avatar
Josef Bacik committed
246

247
248
	memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs));

Liu Bo's avatar
Liu Bo committed
249
	cur_trans->delayed_refs.href_root = RB_ROOT;
250
	cur_trans->delayed_refs.dirty_extent_root = RB_ROOT;
251
	atomic_set(&cur_trans->delayed_refs.num_entries, 0);
252
253
254
255
256
257

	/*
	 * although the tree mod log is per file system and not per transaction,
	 * the log must never go across transaction boundaries.
	 */
	smp_mb();
Julia Lawall's avatar
Julia Lawall committed
258
	if (!list_empty(&fs_info->tree_mod_seq_list))
259
		WARN(1, KERN_ERR "BTRFS: tree_mod_seq_list not empty when "
260
			"creating a fresh transaction\n");
Julia Lawall's avatar
Julia Lawall committed
261
	if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
262
		WARN(1, KERN_ERR "BTRFS: tree_mod_log rb tree not empty when "
263
			"creating a fresh transaction\n");
264
	atomic64_set(&fs_info->tree_mod_seq, 0);
265

Josef Bacik's avatar
Josef Bacik committed
266
267
268
	spin_lock_init(&cur_trans->delayed_refs.lock);

	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
269
	INIT_LIST_HEAD(&cur_trans->pending_chunks);
270
	INIT_LIST_HEAD(&cur_trans->switch_commits);
271
	INIT_LIST_HEAD(&cur_trans->dirty_bgs);
272
	INIT_LIST_HEAD(&cur_trans->io_bgs);
273
	INIT_LIST_HEAD(&cur_trans->dropped_roots);
274
	mutex_init(&cur_trans->cache_write_mutex);
275
	cur_trans->num_dirty_bgs = 0;
276
	spin_lock_init(&cur_trans->dirty_bgs_lock);
277
278
	INIT_LIST_HEAD(&cur_trans->deleted_bgs);
	spin_lock_init(&cur_trans->deleted_bgs_lock);
279
	spin_lock_init(&cur_trans->dropped_roots_lock);
280
	list_add_tail(&cur_trans->list, &fs_info->trans_list);
Josef Bacik's avatar
Josef Bacik committed
281
	extent_io_tree_init(&cur_trans->dirty_pages,
282
283
284
285
			     fs_info->btree_inode->i_mapping);
	fs_info->generation++;
	cur_trans->transid = fs_info->generation;
	fs_info->running_transaction = cur_trans;
286
	cur_trans->aborted = 0;
287
	spin_unlock(&fs_info->trans_lock);
288

Chris Mason's avatar
Chris Mason committed
289
290
291
	return 0;
}

Chris Mason's avatar
Chris Mason committed
292
/*
293
294
295
296
 * this does all the record keeping required to make sure that a reference
 * counted root is properly recorded in a given transaction.  This is required
 * to make sure the old root from before we joined the transaction is deleted
 * when the transaction commits
Chris Mason's avatar
Chris Mason committed
297
 */
Chris Mason's avatar
Chris Mason committed
298
static int record_root_in_trans(struct btrfs_trans_handle *trans,
Josef Bacik's avatar
Josef Bacik committed
299
			       struct btrfs_root *root)
300
{
301
302
	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
	    root->last_trans < trans->transid) {
303
		WARN_ON(root == root->fs_info->extent_root);
304
305
		WARN_ON(root->commit_root != root->node);

Chris Mason's avatar
Chris Mason committed
306
		/*
307
		 * see below for IN_TRANS_SETUP usage rules
Chris Mason's avatar
Chris Mason committed
308
309
310
		 * we have the reloc mutex held now, so there
		 * is only one writer in this function
		 */
311
		set_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
Chris Mason's avatar
Chris Mason committed
312

313
		/* make sure readers find IN_TRANS_SETUP before
Chris Mason's avatar
Chris Mason committed
314
315
316
317
		 * they find our root->last_trans update
		 */
		smp_wmb();

Josef Bacik's avatar
Josef Bacik committed
318
319
320
321
322
		spin_lock(&root->fs_info->fs_roots_radix_lock);
		if (root->last_trans == trans->transid) {
			spin_unlock(&root->fs_info->fs_roots_radix_lock);
			return 0;
		}
323
324
325
		radix_tree_tag_set(&root->fs_info->fs_roots_radix,
			   (unsigned long)root->root_key.objectid,
			   BTRFS_ROOT_TRANS_TAG);
Josef Bacik's avatar
Josef Bacik committed
326
		spin_unlock(&root->fs_info->fs_roots_radix_lock);
Chris Mason's avatar
Chris Mason committed
327
328
329
330
331
332
333
334
335
336
337
338
339
		root->last_trans = trans->transid;

		/* this is pretty tricky.  We don't want to
		 * take the relocation lock in btrfs_record_root_in_trans
		 * unless we're really doing the first setup for this root in
		 * this transaction.
		 *
		 * Normally we'd use root->last_trans as a flag to decide
		 * if we want to take the expensive mutex.
		 *
		 * But, we have to set root->last_trans before we
		 * init the relocation root, otherwise, we trip over warnings
		 * in ctree.c.  The solution used here is to flag ourselves
340
		 * with root IN_TRANS_SETUP.  When this is 1, we're still
Chris Mason's avatar
Chris Mason committed
341
342
343
344
345
346
347
		 * fixing up the reloc trees and everyone must wait.
		 *
		 * When this is zero, they can trust root->last_trans and fly
		 * through btrfs_record_root_in_trans without having to take the
		 * lock.  smp_wmb() makes sure that all the writes above are
		 * done before we pop in the zero below
		 */
348
		btrfs_init_reloc_root(trans, root);
349
		smp_mb__before_atomic();
350
		clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
351
352
353
	}
	return 0;
}
354

Chris Mason's avatar
Chris Mason committed
355

356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
/*
 * Queue @root on the dropped_roots list of the transaction behind @trans and
 * clear its BTRFS_ROOT_TRANS_TAG in the fs_roots radix tree.
 */
void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root)
{
	struct btrfs_transaction *running = trans->transaction;

	/* Step 1: hang the root off the transaction's dropped list. */
	spin_lock(&running->dropped_roots_lock);
	list_add_tail(&root->root_list, &running->dropped_roots);
	spin_unlock(&running->dropped_roots_lock);

	/* Step 2: untag it so commit time does not try to update it. */
	spin_lock(&root->fs_info->fs_roots_radix_lock);
	radix_tree_tag_clear(&root->fs_info->fs_roots_radix,
			     (unsigned long)root->root_key.objectid,
			     BTRFS_ROOT_TRANS_TAG);
	spin_unlock(&root->fs_info->fs_roots_radix_lock);
}

Chris Mason's avatar
Chris Mason committed
374
375
376
int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
377
	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
Chris Mason's avatar
Chris Mason committed
378
379
380
		return 0;

	/*
381
	 * see record_root_in_trans for comments about IN_TRANS_SETUP usage
Chris Mason's avatar
Chris Mason committed
382
383
384
385
	 * and barriers
	 */
	smp_rmb();
	if (root->last_trans == trans->transid &&
386
	    !test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state))
Chris Mason's avatar
Chris Mason committed
387
388
389
390
391
392
393
394
395
		return 0;

	mutex_lock(&root->fs_info->reloc_mutex);
	record_root_in_trans(trans, root);
	mutex_unlock(&root->fs_info->reloc_mutex);

	return 0;
}

396
397
398
static inline int is_transaction_blocked(struct btrfs_transaction *trans)
{
	return (trans->state >= TRANS_STATE_BLOCKED &&
399
400
		trans->state < TRANS_STATE_UNBLOCKED &&
		!trans->aborted);
401
402
}

Chris Mason's avatar
Chris Mason committed
403
404
405
406
/* wait for commit against the current transaction to become unblocked
 * when this is done, it is safe to start a new transaction, but the current
 * transaction might not be fully on disk.
 */
Chris Mason's avatar
Chris Mason committed
407
static void wait_current_trans(struct btrfs_root *root)
Chris Mason's avatar
Chris Mason committed
408
{
409
	struct btrfs_transaction *cur_trans;
Chris Mason's avatar
Chris Mason committed
410

Josef Bacik's avatar
Josef Bacik committed
411
	spin_lock(&root->fs_info->trans_lock);
412
	cur_trans = root->fs_info->running_transaction;
413
	if (cur_trans && is_transaction_blocked(cur_trans)) {
414
		atomic_inc(&cur_trans->use_count);
Josef Bacik's avatar
Josef Bacik committed
415
		spin_unlock(&root->fs_info->trans_lock);
Li Zefan's avatar
Li Zefan committed
416
417

		wait_event(root->fs_info->transaction_wait,
418
419
			   cur_trans->state >= TRANS_STATE_UNBLOCKED ||
			   cur_trans->aborted);
420
		btrfs_put_transaction(cur_trans);
Josef Bacik's avatar
Josef Bacik committed
421
422
	} else {
		spin_unlock(&root->fs_info->trans_lock);
423
	}
Chris Mason's avatar
Chris Mason committed
424
425
}

426
427
/*
 * Decide whether a handle of @type may wait for a blocked transaction.
 *
 * Returns 0 while fs_info->log_root_recovering is set.  Otherwise returns 1
 * for TRANS_USERSPACE, and for TRANS_START when open_ioctl_trans is zero;
 * every other case returns 0.
 */
static int may_wait_transaction(struct btrfs_root *root, int type)
{
	if (root->fs_info->log_root_recovering)
		return 0;

	if (type == TRANS_USERSPACE)
		return 1;

	if (type == TRANS_START &&
	    !atomic_read(&root->fs_info->open_ioctl_trans))
		return 1;

	return 0;
}

441
442
443
static inline bool need_reserve_reloc_root(struct btrfs_root *root)
{
	if (!root->fs_info->reloc_ctl ||
444
	    !test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
445
446
447
448
449
450
451
	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
	    root->reloc_root)
		return false;

	return true;
}

452
static struct btrfs_trans_handle *
453
454
start_transaction(struct btrfs_root *root, unsigned int num_items,
		  unsigned int type, enum btrfs_reserve_flush_enum flush)
Chris Mason's avatar
Chris Mason committed
455
{
456
457
	struct btrfs_trans_handle *h;
	struct btrfs_transaction *cur_trans;
458
	u64 num_bytes = 0;
459
	u64 qgroup_reserved = 0;
460
461
	bool reloc_reserved = false;
	int ret;
462

463
	/* Send isn't supposed to start transactions. */
464
	ASSERT(current->journal_info != BTRFS_SEND_TRANS_STUB);
465

466
	if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
467
		return ERR_PTR(-EROFS);
468

469
	if (current->journal_info) {
470
		WARN_ON(type & TRANS_EXTWRITERS);
471
472
		h = current->journal_info;
		h->use_count++;
473
		WARN_ON(h->use_count > 2);
474
475
476
477
		h->orig_rsv = h->block_rsv;
		h->block_rsv = NULL;
		goto got_it;
	}
478
479
480
481
482
483

	/*
	 * Do the reservation before we join the transaction so we can do all
	 * the appropriate flushing if need be.
	 */
	if (num_items > 0 && root != root->fs_info->chunk_root) {
484
485
		if (root->fs_info->quota_enabled &&
		    is_fstree(root->root_key.objectid)) {
486
			qgroup_reserved = num_items * root->nodesize;
487
488
489
490
491
			ret = btrfs_qgroup_reserve(root, qgroup_reserved);
			if (ret)
				return ERR_PTR(ret);
		}

492
		num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
493
494
495
		/*
		 * Do the reservation for the relocation root creation
		 */
496
		if (need_reserve_reloc_root(root)) {
497
498
499
500
			num_bytes += root->nodesize;
			reloc_reserved = true;
		}

501
502
503
		ret = btrfs_block_rsv_add(root,
					  &root->fs_info->trans_block_rsv,
					  num_bytes, flush);
504
		if (ret)
505
			goto reserve_fail;
506
	}
507
again:
508
	h = kmem_cache_zalloc(btrfs_trans_handle_cachep, GFP_NOFS);
509
510
511
512
	if (!h) {
		ret = -ENOMEM;
		goto alloc_fail;
	}
Chris Mason's avatar
Chris Mason committed
513

514
515
516
517
518
519
	/*
	 * If we are JOIN_NOLOCK we're already committing a transaction and
	 * waiting on this guy, so we don't need to do the sb_start_intwrite
	 * because we're already holding a ref.  We need this because we could
	 * have raced in and did an fsync() on a file which can kick a commit
	 * and then we deadlock with somebody doing a freeze.
520
521
522
	 *
	 * If we are ATTACH, it means we just want to catch the current
	 * transaction and commit it, so we needn't do sb_start_intwrite(). 
523
	 */
524
	if (type & __TRANS_FREEZABLE)
525
		sb_start_intwrite(root->fs_info->sb);
526

527
	if (may_wait_transaction(root, type))
Chris Mason's avatar
Chris Mason committed
528
		wait_current_trans(root);
529

Josef Bacik's avatar
Josef Bacik committed
530
	do {
531
		ret = join_transaction(root, type);
532
		if (ret == -EBUSY) {
Josef Bacik's avatar
Josef Bacik committed
533
			wait_current_trans(root);
534
535
536
			if (unlikely(type == TRANS_ATTACH))
				ret = -ENOENT;
		}
Josef Bacik's avatar
Josef Bacik committed
537
538
	} while (ret == -EBUSY);

Tsutomu Itoh's avatar
Tsutomu Itoh committed
539
	if (ret < 0) {
540
541
		/* We must get the transaction if we are JOIN_NOLOCK. */
		BUG_ON(type == TRANS_JOIN_NOLOCK);
542
		goto join_fail;
Tsutomu Itoh's avatar
Tsutomu Itoh committed
543
	}
544

545
546
547
548
	cur_trans = root->fs_info->running_transaction;

	h->transid = cur_trans->transid;
	h->transaction = cur_trans;
549
	h->root = root;
550
	h->use_count = 1;
551
	h->type = type;
552
	h->can_flush_pending_bgs = true;
553
	INIT_LIST_HEAD(&h->qgroup_ref_list);
554
	INIT_LIST_HEAD(&h->new_bgs);
555

556
	smp_mb();
557
558
	if (cur_trans->state >= TRANS_STATE_BLOCKED &&
	    may_wait_transaction(root, type)) {
559
		current->journal_info = h;
560
561
562
563
		btrfs_commit_transaction(h, root);
		goto again;
	}

564
	if (num_bytes) {
Josef Bacik's avatar
Josef Bacik committed
565
		trace_btrfs_space_reservation(root->fs_info, "transaction",
566
					      h->transid, num_bytes, 1);
567
568
		h->block_rsv = &root->fs_info->trans_block_rsv;
		h->bytes_reserved = num_bytes;
569
		h->reloc_reserved = reloc_reserved;
570
	}
571
	h->qgroup_reserved = qgroup_reserved;
Josef Bacik's avatar
Josef Bacik committed
572

573
got_it:
Josef Bacik's avatar
Josef Bacik committed
574
	btrfs_record_root_in_trans(h, root);
575
576
577

	if (!current->journal_info && type != TRANS_USERSPACE)
		current->journal_info = h;
Chris Mason's avatar
Chris Mason committed
578
	return h;
579
580

join_fail:
581
	if (type & __TRANS_FREEZABLE)
582
583
584
585
586
587
588
589
590
591
		sb_end_intwrite(root->fs_info->sb);
	kmem_cache_free(btrfs_trans_handle_cachep, h);
alloc_fail:
	if (num_bytes)
		btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
					num_bytes);
reserve_fail:
	if (qgroup_reserved)
		btrfs_qgroup_free(root, qgroup_reserved);
	return ERR_PTR(ret);
Chris Mason's avatar
Chris Mason committed
592
593
}

594
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
595
						   unsigned int num_items)
596
{
597
598
	return start_transaction(root, num_items, TRANS_START,
				 BTRFS_RESERVE_FLUSH_ALL);
599
}
600

601
struct btrfs_trans_handle *btrfs_start_transaction_lflush(
602
603
					struct btrfs_root *root,
					unsigned int num_items)
604
{
605
606
	return start_transaction(root, num_items, TRANS_START,
				 BTRFS_RESERVE_FLUSH_LIMIT);
607
608
}

609
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
610
{
611
	return start_transaction(root, 0, TRANS_JOIN, 0);
612
613
}

614
struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
615
{
616
	return start_transaction(root, 0, TRANS_JOIN_NOLOCK, 0);
617
618
}

619
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
620
{
621
	return start_transaction(root, 0, TRANS_USERSPACE, 0);
622
623
}

Miao Xie's avatar
Miao Xie committed
624
625
626
627
628
629
630
631
632
633
634
635
636
/*
 * btrfs_attach_transaction() - catch the running transaction
 *
 * It is used when we want to commit the current the transaction, but
 * don't want to start a new one.
 *
 * Note: If this function return -ENOENT, it just means there is no
 * running transaction. But it is possible that the inactive transaction
 * is still in the memory, not fully on disk. If you hope there is no
 * inactive transaction in the fs when -ENOENT is returned, you should
 * invoke
 *     btrfs_attach_transaction_barrier()
 */
637
struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
638
{
639
	return start_transaction(root, 0, TRANS_ATTACH, 0);
640
641
}

Miao Xie's avatar
Miao Xie committed
642
/*
 * btrfs_attach_transaction_barrier() - catch the running transaction
 *
 * It is similar to btrfs_attach_transaction(); the difference is that this
 * one will wait for all the inactive transactions until they fully
 * complete.
 *
 * The handle or ERR_PTR() from the attach attempt is returned unchanged; on
 * -ENOENT, btrfs_wait_for_commit(root, 0) is called before returning it.
 */
struct btrfs_trans_handle *
btrfs_attach_transaction_barrier(struct btrfs_root *root)
{
	struct btrfs_trans_handle *trans;

	trans = start_transaction(root, 0, TRANS_ATTACH, 0);
	if (IS_ERR(trans) && PTR_ERR(trans) == -ENOENT)
		btrfs_wait_for_commit(root, 0);

	return trans;
}

Chris Mason's avatar
Chris Mason committed
661
/* wait for a transaction commit to be fully complete */
662
static noinline void wait_for_commit(struct btrfs_root *root,
663
664
				    struct btrfs_transaction *commit)
{
665
	wait_event(commit->commit_wait, commit->state == TRANS_STATE_COMPLETED);
666
667
}

668
669
670
/*
 * Wait until a transaction has been completely committed.
 *
 * @transid != 0: wait for the transaction with that id.  Returns 0 at once
 *   if it is not newer than last_trans_committed, and -EINVAL if it is not
 *   on fs_info->trans_list and still newer than last_trans_committed.
 * @transid == 0: wait for the newest transaction on the list that has
 *   reached TRANS_STATE_COMMIT_START, unless it is already completed.
 *
 * Returns 0 otherwise.
 */
int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
{
	struct btrfs_transaction *cur_trans = NULL, *t;
	int ret = 0;

	if (transid) {
		if (transid <= root->fs_info->last_trans_committed)
			goto out;

		/* find specified transaction */
		spin_lock(&root->fs_info->trans_lock);
		list_for_each_entry(t, &root->fs_info->trans_list, list) {
			if (t->transid == transid) {
				cur_trans = t;
				atomic_inc(&cur_trans->use_count);
				break;
			}
			/* First newer id seen: stop, @transid was not found. */
			if (t->transid > transid)
				break;
		}
		spin_unlock(&root->fs_info->trans_lock);

		/*
		 * The specified transaction doesn't exist, or we
		 * raced with btrfs_commit_transaction
		 */
		if (!cur_trans) {
			if (transid > root->fs_info->last_trans_committed)
				ret = -EINVAL;
			goto out;
		}
	} else {
		/* find newest transaction that is committing | committed */
		spin_lock(&root->fs_info->trans_lock);
		list_for_each_entry_reverse(t, &root->fs_info->trans_list,
					    list) {
			if (t->state >= TRANS_STATE_COMMIT_START) {
				if (t->state == TRANS_STATE_COMPLETED)
					break;
				cur_trans = t;
				atomic_inc(&cur_trans->use_count);
				break;
			}
		}
		spin_unlock(&root->fs_info->trans_lock);
		if (!cur_trans)
			goto out;  /* nothing committing|committed */
	}

	/* We hold a use_count reference across the wait; drop it afterwards. */
	wait_for_commit(root, cur_trans);
	btrfs_put_transaction(cur_trans);
out:
	return ret;
}

Chris Mason's avatar
Chris Mason committed
726
727
void btrfs_throttle(struct btrfs_root *root)
{
Josef Bacik's avatar
Josef Bacik committed
728
	if (!atomic_read(&root->fs_info->open_ioctl_trans))
729
		wait_current_trans(root);
Chris Mason's avatar
Chris Mason committed
730
731
}

732
733
734
static int should_end_transaction(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root)
{
735
	if (root->fs_info->global_block_rsv.space_info->full &&
736
	    btrfs_check_space_for_delayed_refs(trans, root))
737
		return 1;
738

739
	return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
740
741
742
743
744
745
746
}

int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root)
{
	struct btrfs_transaction *cur_trans = trans->transaction;
	int updates;
747
	int err;
748

Josef Bacik's avatar
Josef Bacik committed
749
	smp_mb();
750
751
	if (cur_trans->state >= TRANS_STATE_BLOCKED ||
	    cur_trans->delayed_refs.flushing)
752
753
754
755
		return 1;

	updates = trans->delayed_ref_updates;
	trans->delayed_ref_updates = 0;
756
	if (updates) {
757
		err = btrfs_run_delayed_refs(trans, root, updates * 2);
758
759
760
		if (err) /* Error code will also eval true */
			return err;
	}
761
762
763
764

	return should_end_transaction(trans, root);
}

765
static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
766
			  struct btrfs_root *root, int throttle)
Chris Mason's avatar
Chris Mason committed
767
{
768
	struct btrfs_transaction *cur_trans = trans->transaction;
769
	struct btrfs_fs_info *info = root->fs_info;
770
	unsigned long cur = trans->delayed_ref_updates;
771
	int lock = (trans->type != TRANS_JOIN_NOLOCK);
772
	int err = 0;
Chris Mason's avatar
Chris Mason committed
773
	int must_run_delayed_refs = 0;
774

775
776
	if (trans->use_count > 1) {
		trans->use_count--;
777
778
779
780
		trans->block_rsv = trans->orig_rsv;
		return 0;
	}

781
	btrfs_trans_release_metadata(trans, root);
782
	trans->block_rsv = NULL;
783

784
785
786
	if (!list_empty(&trans->new_bgs))
		btrfs_create_pending_block_groups(trans, root);

787
	trans->delayed_ref_updates = 0;
Chris Mason's avatar
Chris Mason committed
788
789
790
	if (!trans->sync) {
		must_run_delayed_refs =
			btrfs_should_throttle_delayed_refs(trans, root);
791
		cur = max_t(unsigned long, cur, 32);
Chris Mason's avatar
Chris Mason committed
792
793
794
795
796
797
798
799

		/*
		 * don't make the caller wait if they are from a NOLOCK
		 * or ATTACH transaction, it will deadlock with commit
		 */
		if (must_run_delayed_refs == 1 &&
		    (trans->type & (__TRANS_JOIN_NOLOCK | __TRANS_ATTACH)))
			must_run_delayed_refs = 2;
800
	}
801

Josef Bacik's avatar
Josef Bacik committed
802
803
804
805
806
807
808
809
810
	if (trans->qgroup_reserved) {
		/*
		 * the same root has to be passed here between start_transaction
		 * and end_transaction. Subvolume quota depends on this.
		 */
		btrfs_qgroup_free(trans->root, trans->qgroup_reserved);
		trans->qgroup_reserved = 0;
	}

811
812
	btrfs_trans_release_metadata(trans, root);
	trans->block_rsv = NULL;
813

814
815
816
	if (!list_empty(&trans->new_bgs))
		btrfs_create_pending_block_groups(trans, root);

817
818
	btrfs_trans_release_chunk_metadata(trans);

Josef Bacik's avatar
Josef Bacik committed
819
	if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
820
821
822
823
824
825
	    should_end_transaction(trans, root) &&
	    ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) {
		spin_lock(&info->trans_lock);
		if (cur_trans->state == TRANS_STATE_RUNNING)
			cur_trans->state = TRANS_STATE_BLOCKED;
		spin_unlock(&info->trans_lock);
Josef Bacik's avatar
Josef Bacik committed
826
	}
827

828
	if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
829
		if (throttle)
830
			return btrfs_commit_transaction(trans, root);
831
		else
832
833
834
			wake_up_process(info->transaction_kthread);
	}

835
	if (trans->type & __TRANS_FREEZABLE)
836
		sb_end_intwrite(root->fs_info->sb);
837

838
	WARN_ON(cur_trans != info->running_transaction);
839
840
	WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
	atomic_dec(&cur_trans->num_writers);
841
	extwriter_counter_dec(cur_trans, trans->type);
842

843
844
845
	/*
	 * Make sure counter is updated before we wake up waiters.
	 */
846
	smp_mb();
Chris Mason's avatar
Chris Mason committed
847
848
	if (waitqueue_active(&cur_trans->writer_wait))
		wake_up(&cur_trans->writer_wait);
849
	btrfs_put_transaction(cur_trans);
Josef Bacik's avatar
Josef Bacik committed
850
851
852

	if (current->journal_info == trans)
		current->journal_info = NULL;
853

Yan, Zheng's avatar
Yan, Zheng committed
854
855
856
	if (throttle)
		btrfs_run_delayed_iputs(root);

857
	if (trans->aborted ||
858
859
	    test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
		wake_up_process(info->transaction_kthread);
860
		err = -EIO;
861
	}
862
	assert_qgroups_uptodate(trans);
863

864
	kmem_cache_free(btrfs_trans_handle_cachep, trans);
Chris Mason's avatar
Chris Mason committed
865
866
867
868
	if (must_run_delayed_refs) {
		btrfs_async_run_delayed_refs(root, cur,
					     must_run_delayed_refs == 1);
	}
869
	return err;
Chris Mason's avatar
Chris Mason committed
870
871
}

872
873
874
/*
 * End @trans without throttling (calls __btrfs_end_transaction() with
 * throttle == 0) and return its result.
 */
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 0);
}

/*
 * End @trans with throttling (calls __btrfs_end_transaction() with
 * throttle == 1) and return its result.
 */
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 1);
}

Chris Mason's avatar
Chris Mason committed
884
885
886
/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
887
 * those extents are sent to disk but does not wait on them
Chris Mason's avatar
Chris Mason committed
888
 */
889
int btrfs_write_marked_extents(struct btrfs_root *root,
890
			       struct extent_io_tree *dirty_pages, int mark)
Chris Mason's avatar
Chris Mason committed
891
{
892
	int err = 0;
893
	int werr = 0;
Josef Bacik's avatar
Josef Bacik committed
894
	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
895
	struct extent_state *cached_state = NULL;
896
	u64 start = 0;
897
	u64 end;
898

Josef Bacik's avatar
Josef Bacik committed
899
	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
900
				      mark, &cached_state)) {
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
		bool wait_writeback = false;

		err = convert_extent_bit(dirty_pages, start, end,
					 EXTENT_NEED_WAIT,
					 mark, &cached_state, GFP_NOFS);
		/*
		 * convert_extent_bit can return -ENOMEM, which is most of the
		 * time a temporary error. So when it happens, ignore the error
		 * and wait for writeback of this range to finish - because we
		 * failed to set the bit EXTENT_NEED_WAIT for the range, a call
		 * to btrfs_wait_marked_extents() would not know that writeback
		 * for this range started and therefore wouldn't wait for it to
		 * finish - we don't want to commit a superblock that points to
		 * btree nodes/leafs for which writeback hasn't finished yet
		 * (and without errors).
		 * We cleanup any entries left in the io tree when committing
		 * the transaction (through clear_btree_io_tree()).
		 */
		if (err == -ENOMEM) {
			err = 0;
			wait_writeback = true;
		}
		if (!err)
			err = filemap_fdatawrite_range(mapping, start, end);
Josef Bacik's avatar
Josef Bacik committed
925
926
		if (err)
			werr = err;
927
928
		else if (wait_writeback)
			werr = filemap_fdatawait_range(mapping, start, end);