transaction.c 59.4 KB
Newer Older
Chris Mason's avatar
Chris Mason committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

Chris Mason's avatar
Chris Mason committed
19
#include <linux/fs.h>
20
#include <linux/slab.h>
Chris Mason's avatar
Chris Mason committed
21
#include <linux/sched.h>
22
#include <linux/writeback.h>
23
#include <linux/pagemap.h>
24
#include <linux/blkdev.h>
25
#include <linux/uuid.h>
Chris Mason's avatar
Chris Mason committed
26
27
28
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
29
#include "locking.h"
30
#include "tree-log.h"
31
#include "inode-map.h"
32
#include "volumes.h"
33
#include "dev-replace.h"
Josef Bacik's avatar
Josef Bacik committed
34
#include "qgroup.h"
Chris Mason's avatar
Chris Mason committed
35

36
37
#define BTRFS_ROOT_TRANS_TAG 0

38
static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
	[TRANS_STATE_RUNNING]		= 0U,
	[TRANS_STATE_BLOCKED]		= (__TRANS_USERSPACE |
					   __TRANS_START),
	[TRANS_STATE_COMMIT_START]	= (__TRANS_USERSPACE |
					   __TRANS_START |
					   __TRANS_ATTACH),
	[TRANS_STATE_COMMIT_DOING]	= (__TRANS_USERSPACE |
					   __TRANS_START |
					   __TRANS_ATTACH |
					   __TRANS_JOIN),
	[TRANS_STATE_UNBLOCKED]		= (__TRANS_USERSPACE |
					   __TRANS_START |
					   __TRANS_ATTACH |
					   __TRANS_JOIN |
					   __TRANS_JOIN_NOLOCK),
	[TRANS_STATE_COMPLETED]		= (__TRANS_USERSPACE |
					   __TRANS_START |
					   __TRANS_ATTACH |
					   __TRANS_JOIN |
					   __TRANS_JOIN_NOLOCK),
};

61
void btrfs_put_transaction(struct btrfs_transaction *transaction)
Chris Mason's avatar
Chris Mason committed
62
{
63
64
	WARN_ON(atomic_read(&transaction->use_count) == 0);
	if (atomic_dec_and_test(&transaction->use_count)) {
Josef Bacik's avatar
Josef Bacik committed
65
		BUG_ON(!list_empty(&transaction->list));
Liu Bo's avatar
Liu Bo committed
66
		WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root));
67
68
69
		if (transaction->delayed_refs.pending_csums)
			printk(KERN_ERR "pending csums is %llu\n",
			       transaction->delayed_refs.pending_csums);
70
71
72
73
74
75
76
77
		while (!list_empty(&transaction->pending_chunks)) {
			struct extent_map *em;

			em = list_first_entry(&transaction->pending_chunks,
					      struct extent_map, list);
			list_del_init(&em->list);
			free_extent_map(em);
		}
Chris Mason's avatar
Chris Mason committed
78
		kmem_cache_free(btrfs_transaction_cachep, transaction);
Chris Mason's avatar
Chris Mason committed
79
	}
Chris Mason's avatar
Chris Mason committed
80
81
}

82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
/*
 * Remove and free every extent_state left in @tree.  The walk runs under
 * tree->lock; cond_resched_lock() gives the lock up between entries when
 * a reschedule is needed.
 */
static void clear_btree_io_tree(struct extent_io_tree *tree)
{
	spin_lock(&tree->lock);
	while (!RB_EMPTY_ROOT(&tree->state)) {
		struct rb_node *node;
		struct extent_state *state;

		node = rb_first(&tree->state);
		state = rb_entry(node, struct extent_state, rb_node);
		rb_erase(&state->rb_node, &tree->state);
		RB_CLEAR_NODE(&state->rb_node);
		/*
		 * btree io trees aren't supposed to have tasks waiting for
		 * changes in the flags of extent states ever.
		 */
		ASSERT(!waitqueue_active(&state->wq));
		free_extent_state(state);

		cond_resched_lock(&tree->lock);
	}
	spin_unlock(&tree->lock);
}

105
106
static noinline void switch_commit_roots(struct btrfs_transaction *trans,
					 struct btrfs_fs_info *fs_info)
Josef Bacik's avatar
Josef Bacik committed
107
{
108
109
110
111
112
113
114
115
116
117
	struct btrfs_root *root, *tmp;

	down_write(&fs_info->commit_root_sem);
	list_for_each_entry_safe(root, tmp, &trans->switch_commits,
				 dirty_list) {
		list_del_init(&root->dirty_list);
		free_extent_buffer(root->commit_root);
		root->commit_root = btrfs_root_node(root);
		if (is_fstree(root->objectid))
			btrfs_unpin_free_ino(root);
118
		clear_btree_io_tree(&root->dirty_log_pages);
119
120
	}
	up_write(&fs_info->commit_root_sem);
Josef Bacik's avatar
Josef Bacik committed
121
122
}

123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
/*
 * Account one more external writer on @trans, but only for handle types
 * that are part of TRANS_EXTWRITERS.
 */
static inline void extwriter_counter_inc(struct btrfs_transaction *trans,
					 unsigned int type)
{
	if (!(type & TRANS_EXTWRITERS))
		return;

	atomic_inc(&trans->num_extwriters);
}

/*
 * Undo extwriter_counter_inc(): drop one external writer from @trans when
 * @type is part of TRANS_EXTWRITERS, otherwise do nothing.
 */
static inline void extwriter_counter_dec(struct btrfs_transaction *trans,
					 unsigned int type)
{
	if (!(type & TRANS_EXTWRITERS))
		return;

	atomic_dec(&trans->num_extwriters);
}

/*
 * Set the external writer count of a fresh transaction: 1 when the handle
 * creating it is an external writer type, 0 otherwise.
 */
static inline void extwriter_counter_init(struct btrfs_transaction *trans,
					  unsigned int type)
{
	int first_writer = 0;

	if (type & TRANS_EXTWRITERS)
		first_writer = 1;

	atomic_set(&trans->num_extwriters, first_writer);
}

/* Return the current number of external writers attached to @trans. */
static inline int extwriter_counter_read(struct btrfs_transaction *trans)
{
	return atomic_read(&trans->num_extwriters);
}

Chris Mason's avatar
Chris Mason committed
148
149
150
/*
 * either allocate a new transaction or hop into the existing one
 */
151
static noinline int join_transaction(struct btrfs_root *root, unsigned int type)
Chris Mason's avatar
Chris Mason committed
152
153
{
	struct btrfs_transaction *cur_trans;
154
	struct btrfs_fs_info *fs_info = root->fs_info;
Josef Bacik's avatar
Josef Bacik committed
155

156
	spin_lock(&fs_info->trans_lock);
157
loop:
158
	/* The file system has been taken offline. No new transactions. */
159
	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
160
		spin_unlock(&fs_info->trans_lock);
161
162
163
		return -EROFS;
	}

164
	cur_trans = fs_info->running_transaction;
Josef Bacik's avatar
Josef Bacik committed
165
	if (cur_trans) {
166
		if (cur_trans->aborted) {
167
			spin_unlock(&fs_info->trans_lock);
168
			return cur_trans->aborted;
169
		}
170
		if (btrfs_blocked_trans_types[cur_trans->state] & type) {
171
172
173
			spin_unlock(&fs_info->trans_lock);
			return -EBUSY;
		}
Josef Bacik's avatar
Josef Bacik committed
174
		atomic_inc(&cur_trans->use_count);
175
		atomic_inc(&cur_trans->num_writers);
176
		extwriter_counter_inc(cur_trans, type);
177
		spin_unlock(&fs_info->trans_lock);
Josef Bacik's avatar
Josef Bacik committed
178
		return 0;
Chris Mason's avatar
Chris Mason committed
179
	}
180
	spin_unlock(&fs_info->trans_lock);
Josef Bacik's avatar
Josef Bacik committed
181

182
183
184
185
186
187
188
	/*
	 * If we are ATTACH, we just want to catch the current transaction,
	 * and commit it. If there is no transaction, just return ENOENT.
	 */
	if (type == TRANS_ATTACH)
		return -ENOENT;

189
190
191
192
193
194
	/*
	 * JOIN_NOLOCK only happens during the transaction commit, so
	 * it is impossible that ->running_transaction is NULL
	 */
	BUG_ON(type == TRANS_JOIN_NOLOCK);

Josef Bacik's avatar
Josef Bacik committed
195
196
197
	cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
	if (!cur_trans)
		return -ENOMEM;
198

199
200
	spin_lock(&fs_info->trans_lock);
	if (fs_info->running_transaction) {
201
202
		/*
		 * someone started a transaction after we unlocked.  Make sure
203
		 * to redo the checks above
204
		 */
Josef Bacik's avatar
Josef Bacik committed
205
		kmem_cache_free(btrfs_transaction_cachep, cur_trans);
206
		goto loop;
207
	} else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
208
		spin_unlock(&fs_info->trans_lock);
209
210
		kmem_cache_free(btrfs_transaction_cachep, cur_trans);
		return -EROFS;
Chris Mason's avatar
Chris Mason committed
211
	}
212

Josef Bacik's avatar
Josef Bacik committed
213
	atomic_set(&cur_trans->num_writers, 1);
214
	extwriter_counter_init(cur_trans, type);
Josef Bacik's avatar
Josef Bacik committed
215
216
	init_waitqueue_head(&cur_trans->writer_wait);
	init_waitqueue_head(&cur_trans->commit_wait);
217
	cur_trans->state = TRANS_STATE_RUNNING;
Josef Bacik's avatar
Josef Bacik committed
218
219
220
221
222
	/*
	 * One for this trans handle, one so it will live on until we
	 * commit the transaction.
	 */
	atomic_set(&cur_trans->use_count, 2);
Zhao Lei's avatar
Zhao Lei committed
223
	cur_trans->have_free_bgs = 0;
Josef Bacik's avatar
Josef Bacik committed
224
225
	cur_trans->start_time = get_seconds();

Liu Bo's avatar
Liu Bo committed
226
	cur_trans->delayed_refs.href_root = RB_ROOT;
227
	atomic_set(&cur_trans->delayed_refs.num_entries, 0);
Josef Bacik's avatar
Josef Bacik committed
228
	cur_trans->delayed_refs.num_heads_ready = 0;
229
	cur_trans->delayed_refs.pending_csums = 0;
Josef Bacik's avatar
Josef Bacik committed
230
231
232
	cur_trans->delayed_refs.num_heads = 0;
	cur_trans->delayed_refs.flushing = 0;
	cur_trans->delayed_refs.run_delayed_start = 0;
233
234
235
236
237
238

	/*
	 * although the tree mod log is per file system and not per transaction,
	 * the log must never go across transaction boundaries.
	 */
	smp_mb();
Julia Lawall's avatar
Julia Lawall committed
239
	if (!list_empty(&fs_info->tree_mod_seq_list))
240
		WARN(1, KERN_ERR "BTRFS: tree_mod_seq_list not empty when "
241
			"creating a fresh transaction\n");
Julia Lawall's avatar
Julia Lawall committed
242
	if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
243
		WARN(1, KERN_ERR "BTRFS: tree_mod_log rb tree not empty when "
244
			"creating a fresh transaction\n");
245
	atomic64_set(&fs_info->tree_mod_seq, 0);
246

Josef Bacik's avatar
Josef Bacik committed
247
248
249
	spin_lock_init(&cur_trans->delayed_refs.lock);

	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
250
	INIT_LIST_HEAD(&cur_trans->pending_chunks);
251
	INIT_LIST_HEAD(&cur_trans->switch_commits);
252
	INIT_LIST_HEAD(&cur_trans->pending_ordered);
253
254
	INIT_LIST_HEAD(&cur_trans->dirty_bgs);
	spin_lock_init(&cur_trans->dirty_bgs_lock);
255
	list_add_tail(&cur_trans->list, &fs_info->trans_list);
Josef Bacik's avatar
Josef Bacik committed
256
	extent_io_tree_init(&cur_trans->dirty_pages,
257
258
259
260
			     fs_info->btree_inode->i_mapping);
	fs_info->generation++;
	cur_trans->transid = fs_info->generation;
	fs_info->running_transaction = cur_trans;
261
	cur_trans->aborted = 0;
262
	spin_unlock(&fs_info->trans_lock);
263

Chris Mason's avatar
Chris Mason committed
264
265
266
	return 0;
}

Chris Mason's avatar
Chris Mason committed
267
/*
268
269
270
271
 * this does all the record keeping required to make sure that a reference
 * counted root is properly recorded in a given transaction.  This is required
 * to make sure the old root from before we joined the transaction is deleted
 * when the transaction commits
Chris Mason's avatar
Chris Mason committed
272
 */
Chris Mason's avatar
Chris Mason committed
273
static int record_root_in_trans(struct btrfs_trans_handle *trans,
Josef Bacik's avatar
Josef Bacik committed
274
			       struct btrfs_root *root)
275
{
276
277
	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
	    root->last_trans < trans->transid) {
278
		WARN_ON(root == root->fs_info->extent_root);
279
280
		WARN_ON(root->commit_root != root->node);

Chris Mason's avatar
Chris Mason committed
281
		/*
282
		 * see below for IN_TRANS_SETUP usage rules
Chris Mason's avatar
Chris Mason committed
283
284
285
		 * we have the reloc mutex held now, so there
		 * is only one writer in this function
		 */
286
		set_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
Chris Mason's avatar
Chris Mason committed
287

288
		/* make sure readers find IN_TRANS_SETUP before
Chris Mason's avatar
Chris Mason committed
289
290
291
292
		 * they find our root->last_trans update
		 */
		smp_wmb();

Josef Bacik's avatar
Josef Bacik committed
293
294
295
296
297
		spin_lock(&root->fs_info->fs_roots_radix_lock);
		if (root->last_trans == trans->transid) {
			spin_unlock(&root->fs_info->fs_roots_radix_lock);
			return 0;
		}
298
299
300
		radix_tree_tag_set(&root->fs_info->fs_roots_radix,
			   (unsigned long)root->root_key.objectid,
			   BTRFS_ROOT_TRANS_TAG);
Josef Bacik's avatar
Josef Bacik committed
301
		spin_unlock(&root->fs_info->fs_roots_radix_lock);
Chris Mason's avatar
Chris Mason committed
302
303
304
305
306
307
308
309
310
311
312
313
314
		root->last_trans = trans->transid;

		/* this is pretty tricky.  We don't want to
		 * take the relocation lock in btrfs_record_root_in_trans
		 * unless we're really doing the first setup for this root in
		 * this transaction.
		 *
		 * Normally we'd use root->last_trans as a flag to decide
		 * if we want to take the expensive mutex.
		 *
		 * But, we have to set root->last_trans before we
		 * init the relocation root, otherwise, we trip over warnings
		 * in ctree.c.  The solution used here is to flag ourselves
315
		 * with root IN_TRANS_SETUP.  When this is 1, we're still
Chris Mason's avatar
Chris Mason committed
316
317
318
319
320
321
322
		 * fixing up the reloc trees and everyone must wait.
		 *
		 * When this is zero, they can trust root->last_trans and fly
		 * through btrfs_record_root_in_trans without having to take the
		 * lock.  smp_wmb() makes sure that all the writes above are
		 * done before we pop in the zero below
		 */
323
		btrfs_init_reloc_root(trans, root);
324
		smp_mb__before_atomic();
325
		clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
326
327
328
	}
	return 0;
}
329

Chris Mason's avatar
Chris Mason committed
330
331
332
333

int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
334
	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
Chris Mason's avatar
Chris Mason committed
335
336
337
		return 0;

	/*
338
	 * see record_root_in_trans for comments about IN_TRANS_SETUP usage
Chris Mason's avatar
Chris Mason committed
339
340
341
342
	 * and barriers
	 */
	smp_rmb();
	if (root->last_trans == trans->transid &&
343
	    !test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state))
Chris Mason's avatar
Chris Mason committed
344
345
346
347
348
349
350
351
352
		return 0;

	mutex_lock(&root->fs_info->reloc_mutex);
	record_root_in_trans(trans, root);
	mutex_unlock(&root->fs_info->reloc_mutex);

	return 0;
}

353
354
355
static inline int is_transaction_blocked(struct btrfs_transaction *trans)
{
	return (trans->state >= TRANS_STATE_BLOCKED &&
356
357
		trans->state < TRANS_STATE_UNBLOCKED &&
		!trans->aborted);
358
359
}

Chris Mason's avatar
Chris Mason committed
360
361
362
363
/* wait for commit against the current transaction to become unblocked
 * when this is done, it is safe to start a new transaction, but the current
 * transaction might not be fully on disk.
 */
Chris Mason's avatar
Chris Mason committed
364
static void wait_current_trans(struct btrfs_root *root)
Chris Mason's avatar
Chris Mason committed
365
{
366
	struct btrfs_transaction *cur_trans;
Chris Mason's avatar
Chris Mason committed
367

Josef Bacik's avatar
Josef Bacik committed
368
	spin_lock(&root->fs_info->trans_lock);
369
	cur_trans = root->fs_info->running_transaction;
370
	if (cur_trans && is_transaction_blocked(cur_trans)) {
371
		atomic_inc(&cur_trans->use_count);
Josef Bacik's avatar
Josef Bacik committed
372
		spin_unlock(&root->fs_info->trans_lock);
Li Zefan's avatar
Li Zefan committed
373
374

		wait_event(root->fs_info->transaction_wait,
375
376
			   cur_trans->state >= TRANS_STATE_UNBLOCKED ||
			   cur_trans->aborted);
377
		btrfs_put_transaction(cur_trans);
Josef Bacik's avatar
Josef Bacik committed
378
379
	} else {
		spin_unlock(&root->fs_info->trans_lock);
380
	}
Chris Mason's avatar
Chris Mason committed
381
382
}

383
384
/*
 * Tell whether a handle of @type is allowed to wait on a blocked
 * transaction.
 *
 * Returns 0 while the log root is being recovered.  Otherwise returns 1
 * for TRANS_USERSPACE, 1 for TRANS_START when no ioctl transaction is
 * open, and 0 for everything else.
 */
static int may_wait_transaction(struct btrfs_root *root, int type)
{
	if (root->fs_info->log_root_recovering)
		return 0;

	if (type == TRANS_USERSPACE)
		return 1;

	if (type == TRANS_START &&
	    !atomic_read(&root->fs_info->open_ioctl_trans))
		return 1;

	return 0;
}

398
399
400
static inline bool need_reserve_reloc_root(struct btrfs_root *root)
{
	if (!root->fs_info->reloc_ctl ||
401
	    !test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
402
403
404
405
406
407
408
	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
	    root->reloc_root)
		return false;

	return true;
}

409
static struct btrfs_trans_handle *
410
start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
411
		  enum btrfs_reserve_flush_enum flush)
Chris Mason's avatar
Chris Mason committed
412
{
413
414
	struct btrfs_trans_handle *h;
	struct btrfs_transaction *cur_trans;
415
	u64 num_bytes = 0;
416
	u64 qgroup_reserved = 0;
417
418
	bool reloc_reserved = false;
	int ret;
419

420
	/* Send isn't supposed to start transactions. */
421
	ASSERT(current->journal_info != BTRFS_SEND_TRANS_STUB);
422

423
	if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
424
		return ERR_PTR(-EROFS);
425

426
	if (current->journal_info) {
427
		WARN_ON(type & TRANS_EXTWRITERS);
428
429
		h = current->journal_info;
		h->use_count++;
430
		WARN_ON(h->use_count > 2);
431
432
433
434
		h->orig_rsv = h->block_rsv;
		h->block_rsv = NULL;
		goto got_it;
	}
435
436
437
438
439
440

	/*
	 * Do the reservation before we join the transaction so we can do all
	 * the appropriate flushing if need be.
	 */
	if (num_items > 0 && root != root->fs_info->chunk_root) {
441
442
		if (root->fs_info->quota_enabled &&
		    is_fstree(root->root_key.objectid)) {
443
			qgroup_reserved = num_items * root->nodesize;
444
445
446
447
448
			ret = btrfs_qgroup_reserve(root, qgroup_reserved);
			if (ret)
				return ERR_PTR(ret);
		}

449
		num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
450
451
452
		/*
		 * Do the reservation for the relocation root creation
		 */
453
		if (need_reserve_reloc_root(root)) {
454
455
456
457
			num_bytes += root->nodesize;
			reloc_reserved = true;
		}

458
459
460
		ret = btrfs_block_rsv_add(root,
					  &root->fs_info->trans_block_rsv,
					  num_bytes, flush);
461
		if (ret)
462
			goto reserve_fail;
463
	}
464
465
again:
	h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
466
467
468
469
	if (!h) {
		ret = -ENOMEM;
		goto alloc_fail;
	}
Chris Mason's avatar
Chris Mason committed
470

471
472
473
474
475
476
	/*
	 * If we are JOIN_NOLOCK we're already committing a transaction and
	 * waiting on this guy, so we don't need to do the sb_start_intwrite
	 * because we're already holding a ref.  We need this because we could
	 * have raced in and did an fsync() on a file which can kick a commit
	 * and then we deadlock with somebody doing a freeze.
477
478
479
	 *
	 * If we are ATTACH, it means we just want to catch the current
	 * transaction and commit it, so we needn't do sb_start_intwrite(). 
480
	 */
481
	if (type & __TRANS_FREEZABLE)
482
		sb_start_intwrite(root->fs_info->sb);
483

484
	if (may_wait_transaction(root, type))
Chris Mason's avatar
Chris Mason committed
485
		wait_current_trans(root);
486

Josef Bacik's avatar
Josef Bacik committed
487
	do {
488
		ret = join_transaction(root, type);
489
		if (ret == -EBUSY) {
Josef Bacik's avatar
Josef Bacik committed
490
			wait_current_trans(root);
491
492
493
			if (unlikely(type == TRANS_ATTACH))
				ret = -ENOENT;
		}
Josef Bacik's avatar
Josef Bacik committed
494
495
	} while (ret == -EBUSY);

Tsutomu Itoh's avatar
Tsutomu Itoh committed
496
	if (ret < 0) {
497
498
		/* We must get the transaction if we are JOIN_NOLOCK. */
		BUG_ON(type == TRANS_JOIN_NOLOCK);
499
		goto join_fail;
Tsutomu Itoh's avatar
Tsutomu Itoh committed
500
	}
501

502
503
504
505
	cur_trans = root->fs_info->running_transaction;

	h->transid = cur_trans->transid;
	h->transaction = cur_trans;
Chris Mason's avatar
Chris Mason committed
506
	h->blocks_used = 0;
507
	h->bytes_reserved = 0;
508
	h->root = root;
509
	h->delayed_ref_updates = 0;
510
	h->use_count = 1;
511
	h->adding_csums = 0;
512
	h->block_rsv = NULL;
513
	h->orig_rsv = NULL;
514
	h->aborted = 0;
515
	h->qgroup_reserved = 0;
516
	h->delayed_ref_elem.seq = 0;
517
	h->type = type;
518
	h->allocating_chunk = false;
519
	h->reloc_reserved = false;
520
	h->sync = false;
521
	INIT_LIST_HEAD(&h->qgroup_ref_list);
522
	INIT_LIST_HEAD(&h->new_bgs);
523
	INIT_LIST_HEAD(&h->ordered);
524

525
	smp_mb();
526
527
	if (cur_trans->state >= TRANS_STATE_BLOCKED &&
	    may_wait_transaction(root, type)) {
528
		current->journal_info = h;
529
530
531
532
		btrfs_commit_transaction(h, root);
		goto again;
	}

533
	if (num_bytes) {
Josef Bacik's avatar
Josef Bacik committed
534
		trace_btrfs_space_reservation(root->fs_info, "transaction",
535
					      h->transid, num_bytes, 1);
536
537
		h->block_rsv = &root->fs_info->trans_block_rsv;
		h->bytes_reserved = num_bytes;
538
		h->reloc_reserved = reloc_reserved;
539
	}
540
	h->qgroup_reserved = qgroup_reserved;
Josef Bacik's avatar
Josef Bacik committed
541

542
got_it:
Josef Bacik's avatar
Josef Bacik committed
543
	btrfs_record_root_in_trans(h, root);
544
545
546

	if (!current->journal_info && type != TRANS_USERSPACE)
		current->journal_info = h;
Chris Mason's avatar
Chris Mason committed
547
	return h;
548
549

join_fail:
550
	if (type & __TRANS_FREEZABLE)
551
552
553
554
555
556
557
558
559
560
		sb_end_intwrite(root->fs_info->sb);
	kmem_cache_free(btrfs_trans_handle_cachep, h);
alloc_fail:
	if (num_bytes)
		btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
					num_bytes);
reserve_fail:
	if (qgroup_reserved)
		btrfs_qgroup_free(root, qgroup_reserved);
	return ERR_PTR(ret);
Chris Mason's avatar
Chris Mason committed
561
562
}

563
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
564
						   int num_items)
565
{
566
567
	return start_transaction(root, num_items, TRANS_START,
				 BTRFS_RESERVE_FLUSH_ALL);
568
}
569

570
struct btrfs_trans_handle *btrfs_start_transaction_lflush(
571
572
					struct btrfs_root *root, int num_items)
{
573
574
	return start_transaction(root, num_items, TRANS_START,
				 BTRFS_RESERVE_FLUSH_LIMIT);
575
576
}

577
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
578
{
579
	return start_transaction(root, 0, TRANS_JOIN, 0);
580
581
}

582
struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
583
{
584
	return start_transaction(root, 0, TRANS_JOIN_NOLOCK, 0);
585
586
}

587
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
588
{
589
	return start_transaction(root, 0, TRANS_USERSPACE, 0);
590
591
}

Miao Xie's avatar
Miao Xie committed
592
593
594
595
596
597
598
599
600
601
602
603
604
/*
 * btrfs_attach_transaction() - catch the running transaction
 *
 * It is used when we want to commit the current the transaction, but
 * don't want to start a new one.
 *
 * Note: If this function return -ENOENT, it just means there is no
 * running transaction. But it is possible that the inactive transaction
 * is still in the memory, not fully on disk. If you hope there is no
 * inactive transaction in the fs when -ENOENT is returned, you should
 * invoke
 *     btrfs_attach_transaction_barrier()
 */
605
struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
606
{
607
	return start_transaction(root, 0, TRANS_ATTACH, 0);
608
609
}

Miao Xie's avatar
Miao Xie committed
610
/*
 * btrfs_attach_transaction_barrier() - catch the running transaction
 *
 * It is similar to the above function; the difference is that this one
 * will wait for all the inactive transactions until they fully
 * complete.
 */
struct btrfs_trans_handle *
btrfs_attach_transaction_barrier(struct btrfs_root *root)
{
	struct btrfs_trans_handle *handle;

	handle = start_transaction(root, 0, TRANS_ATTACH, 0);
	if (!IS_ERR(handle))
		return handle;

	/* nothing was running: wait out any commit still in flight */
	if (PTR_ERR(handle) == -ENOENT)
		btrfs_wait_for_commit(root, 0);

	return handle;
}

Chris Mason's avatar
Chris Mason committed
629
/* wait for a transaction commit to be fully complete */
630
static noinline void wait_for_commit(struct btrfs_root *root,
631
632
				    struct btrfs_transaction *commit)
{
633
	wait_event(commit->commit_wait, commit->state == TRANS_STATE_COMPLETED);
634
635
}

636
637
638
/*
 * Wait for a transaction to be completely committed.
 *
 * @transid: id of the transaction to wait for, or 0 to wait for the
 *           newest transaction that has started committing.
 *
 * Returns 0 when the transaction is already committed, was waited for, or
 * (for @transid == 0) nothing is committing.  Returns -EINVAL when a
 * non-zero @transid is not on the transaction list and is newer than
 * last_trans_committed.
 */
int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
{
	struct btrfs_transaction *cur_trans = NULL, *t;
	int ret = 0;

	if (transid) {
		if (transid <= root->fs_info->last_trans_committed)
			goto out;

		/* find specified transaction */
		spin_lock(&root->fs_info->trans_lock);
		list_for_each_entry(t, &root->fs_info->trans_list, list) {
			if (t->transid == transid) {
				cur_trans = t;
				atomic_inc(&cur_trans->use_count);
				break;
			}
			/* walked past the requested id: it is not listed */
			if (t->transid > transid)
				break;
		}
		spin_unlock(&root->fs_info->trans_lock);

		/*
		 * The specified transaction doesn't exist, or we
		 * raced with btrfs_commit_transaction
		 */
		if (!cur_trans) {
			if (transid > root->fs_info->last_trans_committed)
				ret = -EINVAL;
			goto out;
		}
	} else {
		/* find newest transaction that is committing | committed */
		spin_lock(&root->fs_info->trans_lock);
		list_for_each_entry_reverse(t, &root->fs_info->trans_list,
					    list) {
			if (t->state >= TRANS_STATE_COMMIT_START) {
				if (t->state == TRANS_STATE_COMPLETED)
					break;
				cur_trans = t;
				atomic_inc(&cur_trans->use_count);
				break;
			}
		}
		spin_unlock(&root->fs_info->trans_lock);
		if (!cur_trans)
			goto out;  /* nothing committing|committed */
	}

	wait_for_commit(root, cur_trans);
	btrfs_put_transaction(cur_trans);
out:
	return ret;
}

Chris Mason's avatar
Chris Mason committed
694
695
void btrfs_throttle(struct btrfs_root *root)
{
Josef Bacik's avatar
Josef Bacik committed
696
	if (!atomic_read(&root->fs_info->open_ioctl_trans))
697
		wait_current_trans(root);
Chris Mason's avatar
Chris Mason committed
698
699
}

700
701
702
static int should_end_transaction(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root)
{
703
	if (root->fs_info->global_block_rsv.space_info->full &&
704
	    btrfs_check_space_for_delayed_refs(trans, root))
705
		return 1;
706

707
	return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
708
709
710
711
712
713
714
}

int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root)
{
	struct btrfs_transaction *cur_trans = trans->transaction;
	int updates;
715
	int err;
716

Josef Bacik's avatar
Josef Bacik committed
717
	smp_mb();
718
719
	if (cur_trans->state >= TRANS_STATE_BLOCKED ||
	    cur_trans->delayed_refs.flushing)
720
721
722
723
		return 1;

	updates = trans->delayed_ref_updates;
	trans->delayed_ref_updates = 0;
724
	if (updates) {
725
		err = btrfs_run_delayed_refs(trans, root, updates * 2);
726
727
728
		if (err) /* Error code will also eval true */
			return err;
	}
729
730
731
732

	return should_end_transaction(trans, root);
}

733
static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
734
			  struct btrfs_root *root, int throttle)
Chris Mason's avatar
Chris Mason committed
735
{
736
	struct btrfs_transaction *cur_trans = trans->transaction;
737
	struct btrfs_fs_info *info = root->fs_info;
738
	unsigned long cur = trans->delayed_ref_updates;
739
	int lock = (trans->type != TRANS_JOIN_NOLOCK);
740
	int err = 0;
Chris Mason's avatar
Chris Mason committed
741
	int must_run_delayed_refs = 0;
742

743
744
	if (trans->use_count > 1) {
		trans->use_count--;
745
746
747
748
		trans->block_rsv = trans->orig_rsv;
		return 0;
	}

749
	btrfs_trans_release_metadata(trans, root);
750
	trans->block_rsv = NULL;
751

752
753
754
	if (!list_empty(&trans->new_bgs))
		btrfs_create_pending_block_groups(trans, root);

755
756
757
758
759
760
	if (!list_empty(&trans->ordered)) {
		spin_lock(&info->trans_lock);
		list_splice(&trans->ordered, &cur_trans->pending_ordered);
		spin_unlock(&info->trans_lock);
	}

761
	trans->delayed_ref_updates = 0;
Chris Mason's avatar
Chris Mason committed
762
763
764
	if (!trans->sync) {
		must_run_delayed_refs =
			btrfs_should_throttle_delayed_refs(trans, root);
765
		cur = max_t(unsigned long, cur, 32);
Chris Mason's avatar
Chris Mason committed
766
767
768
769
770
771
772
773

		/*
		 * don't make the caller wait if they are from a NOLOCK
		 * or ATTACH transaction, it will deadlock with commit
		 */
		if (must_run_delayed_refs == 1 &&
		    (trans->type & (__TRANS_JOIN_NOLOCK | __TRANS_ATTACH)))
			must_run_delayed_refs = 2;
774
	}
775

Josef Bacik's avatar
Josef Bacik committed
776
777
778
779
780
781
782
783
784
	if (trans->qgroup_reserved) {
		/*
		 * the same root has to be passed here between start_transaction
		 * and end_transaction. Subvolume quota depends on this.
		 */
		btrfs_qgroup_free(trans->root, trans->qgroup_reserved);
		trans->qgroup_reserved = 0;
	}

785
786
	btrfs_trans_release_metadata(trans, root);
	trans->block_rsv = NULL;
787

788
789
790
	if (!list_empty(&trans->new_bgs))
		btrfs_create_pending_block_groups(trans, root);

Josef Bacik's avatar
Josef Bacik committed
791
	if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
792
793
794
795
796
797
	    should_end_transaction(trans, root) &&
	    ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) {
		spin_lock(&info->trans_lock);
		if (cur_trans->state == TRANS_STATE_RUNNING)
			cur_trans->state = TRANS_STATE_BLOCKED;
		spin_unlock(&info->trans_lock);
Josef Bacik's avatar
Josef Bacik committed
798
	}
799

800
	if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
801
		if (throttle)
802
			return btrfs_commit_transaction(trans, root);
803
		else
804
805
806
			wake_up_process(info->transaction_kthread);
	}

807
	if (trans->type & __TRANS_FREEZABLE)
808
		sb_end_intwrite(root->fs_info->sb);
809

810
	WARN_ON(cur_trans != info->running_transaction);
811
812
	WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
	atomic_dec(&cur_trans->num_writers);
813
	extwriter_counter_dec(cur_trans, trans->type);
814

815
	smp_mb();
Chris Mason's avatar
Chris Mason committed
816
817
	if (waitqueue_active(&cur_trans->writer_wait))
		wake_up(&cur_trans->writer_wait);
818
	btrfs_put_transaction(cur_trans);
Josef Bacik's avatar
Josef Bacik committed
819
820
821

	if (current->journal_info == trans)
		current->journal_info = NULL;
822

Yan, Zheng's avatar
Yan, Zheng committed
823
824
825
	if (throttle)
		btrfs_run_delayed_iputs(root);

826
	if (trans->aborted ||
827
828
	    test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
		wake_up_process(info->transaction_kthread);
829
		err = -EIO;
830
	}
831
	assert_qgroups_uptodate(trans);
832

833
	kmem_cache_free(btrfs_trans_handle_cachep, trans);
Chris Mason's avatar
Chris Mason committed
834
835
836
837
	if (must_run_delayed_refs) {
		btrfs_async_run_delayed_refs(root, cur,
					     must_run_delayed_refs == 1);
	}
838
	return err;
Chris Mason's avatar
Chris Mason committed
839
840
}

841
842
843
/*
 * End the handle @trans without throttling.  See
 * __btrfs_end_transaction() for the return value.
 */
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 0);
}

/*
 * End the handle @trans with throttling enabled.  See
 * __btrfs_end_transaction() for the return value.
 */
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 1);
}

Chris Mason's avatar
Chris Mason committed
853
854
855
/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
856
 * those extents are sent to disk but does not wait on them
Chris Mason's avatar
Chris Mason committed
857
 */
858
int btrfs_write_marked_extents(struct btrfs_root *root,
859
			       struct extent_io_tree *dirty_pages, int mark)
Chris Mason's avatar
Chris Mason committed
860
{
861
	int err = 0;
862
	int werr = 0;
Josef Bacik's avatar
Josef Bacik committed
863
	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
864
	struct extent_state *cached_state = NULL;
865
	u64 start = 0;
866
	u64 end;
867

Josef Bacik's avatar
Josef Bacik committed
868
	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
869
				      mark, &cached_state)) {
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
		bool wait_writeback = false;

		err = convert_extent_bit(dirty_pages, start, end,
					 EXTENT_NEED_WAIT,
					 mark, &cached_state, GFP_NOFS);
		/*
		 * convert_extent_bit can return -ENOMEM, which is most of the
		 * time a temporary error. So when it happens, ignore the error
		 * and wait for writeback of this range to finish - because we
		 * failed to set the bit EXTENT_NEED_WAIT for the range, a call
		 * to btrfs_wait_marked_extents() would not know that writeback
		 * for this range started and therefore wouldn't wait for it to
		 * finish - we don't want to commit a superblock that points to
		 * btree nodes/leafs for which writeback hasn't finished yet
		 * (and without errors).
		 * We cleanup any entries left in the io tree when committing
		 * the transaction (through clear_btree_io_tree()).
		 */
		if (err == -ENOMEM) {
			err = 0;
			wait_writeback = true;
		}
		if (!err)
			err = filemap_fdatawrite_range(mapping, start, end);
Josef Bacik's avatar
Josef Bacik committed
894
895
		if (err)
			werr = err;
896
897
		else if (wait_writeback)
			werr = filemap_fdatawait_range(mapping, start, end);
898
		free_extent_state(cached_state);
899
		cached_state = NULL;
Josef Bacik's avatar
Josef Bacik committed
900
901
		cond_resched();
		start = end + 1;
902
	}
903
904
905
906
907
908
909
910
911
912
	return werr;
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit.  We wait
 * on all the pages and clear them from the dirty pages state tree
 */
int btrfs_wait_marked_extents(struct btrfs_root *root,
913
			      struct extent_io_tree *dirty_pages, int mark)
914
915
916
{
	int err = 0;
	int werr = 0;
Josef Bacik's avatar
Josef Bacik committed
917
	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;