Commit e64df3eb authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'for-linus2' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs

Pull btrfs updates from Chris Mason:
 "These are all fixes I'd like to get out to a broader audience.

  The biggest of the bunch is Mark's quota fix, which is also in the
  SUSE kernel, and makes our subvolume quotas dramatically more
  accurate.

  I've been running xfstests with these against your current git
  overnight, but I'm queueing up longer tests as well"

* 'for-linus2' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs:
  btrfs: disable strict file flushes for renames and truncates
  Btrfs: fix csum tree corruption, duplicate and outdated checksums
  Btrfs: Fix memory corruption by ulist_add_merge() on 32bit arch
  Btrfs: fix compressed write corruption on enospc
  btrfs: correctly handle return from ulist_add
  btrfs: qgroup: account shared subtrees during snapshot delete
  Btrfs: read lock extent buffer while walking backrefs
  Btrfs: __btrfs_mod_ref should always use no_quota
  btrfs: adjust statfs calculations according to raid profiles
parents 53b95d63 8d875f95
......@@ -276,9 +276,8 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
}
if (ret > 0)
goto next;
ret = ulist_add_merge(parents, eb->start,
(uintptr_t)eie,
(u64 *)&old, GFP_NOFS);
ret = ulist_add_merge_ptr(parents, eb->start,
eie, (void **)&old, GFP_NOFS);
if (ret < 0)
break;
if (!ret && extent_item_pos) {
......@@ -1001,16 +1000,19 @@ again:
ret = -EIO;
goto out;
}
btrfs_tree_read_lock(eb);
btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
ret = find_extent_in_eb(eb, bytenr,
*extent_item_pos, &eie);
btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
if (ret < 0)
goto out;
ref->inode_list = eie;
}
ret = ulist_add_merge(refs, ref->parent,
(uintptr_t)ref->inode_list,
(u64 *)&eie, GFP_NOFS);
ret = ulist_add_merge_ptr(refs, ref->parent,
ref->inode_list,
(void **)&eie, GFP_NOFS);
if (ret < 0)
goto out;
if (!ret && extent_item_pos) {
......
......@@ -84,12 +84,6 @@ struct btrfs_inode {
*/
struct list_head delalloc_inodes;
/*
* list for tracking inodes that must be sent to disk before a
* rename or truncate commit
*/
struct list_head ordered_operations;
/* node for the red-black tree that links inodes in subvolume root */
struct rb_node rb_node;
......
......@@ -280,9 +280,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
WARN_ON(btrfs_header_generation(buf) > trans->transid);
if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
ret = btrfs_inc_ref(trans, root, cow, 1, 1);
ret = btrfs_inc_ref(trans, root, cow, 1);
else
ret = btrfs_inc_ref(trans, root, cow, 0, 1);
ret = btrfs_inc_ref(trans, root, cow, 0);
if (ret)
return ret;
......@@ -1035,14 +1035,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if ((owner == root->root_key.objectid ||
root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
ret = btrfs_inc_ref(trans, root, buf, 1, 1);
ret = btrfs_inc_ref(trans, root, buf, 1);
BUG_ON(ret); /* -ENOMEM */
if (root->root_key.objectid ==
BTRFS_TREE_RELOC_OBJECTID) {
ret = btrfs_dec_ref(trans, root, buf, 0, 1);
ret = btrfs_dec_ref(trans, root, buf, 0);
BUG_ON(ret); /* -ENOMEM */
ret = btrfs_inc_ref(trans, root, cow, 1, 1);
ret = btrfs_inc_ref(trans, root, cow, 1);
BUG_ON(ret); /* -ENOMEM */
}
new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
......@@ -1050,9 +1050,9 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if (root->root_key.objectid ==
BTRFS_TREE_RELOC_OBJECTID)
ret = btrfs_inc_ref(trans, root, cow, 1, 1);
ret = btrfs_inc_ref(trans, root, cow, 1);
else
ret = btrfs_inc_ref(trans, root, cow, 0, 1);
ret = btrfs_inc_ref(trans, root, cow, 0);
BUG_ON(ret); /* -ENOMEM */
}
if (new_flags != 0) {
......@@ -1069,11 +1069,11 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
if (root->root_key.objectid ==
BTRFS_TREE_RELOC_OBJECTID)
ret = btrfs_inc_ref(trans, root, cow, 1, 1);
ret = btrfs_inc_ref(trans, root, cow, 1);
else
ret = btrfs_inc_ref(trans, root, cow, 0, 1);
ret = btrfs_inc_ref(trans, root, cow, 0);
BUG_ON(ret); /* -ENOMEM */
ret = btrfs_dec_ref(trans, root, buf, 1, 1);
ret = btrfs_dec_ref(trans, root, buf, 1);
BUG_ON(ret); /* -ENOMEM */
}
clean_tree_block(trans, root, buf);
......
......@@ -3326,9 +3326,9 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
u64 min_alloc_size, u64 empty_size, u64 hint_byte,
struct btrfs_key *ins, int is_data, int delalloc);
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *buf, int full_backref, int no_quota);
struct extent_buffer *buf, int full_backref);
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *buf, int full_backref, int no_quota);
struct extent_buffer *buf, int full_backref);
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 flags,
......
......@@ -60,8 +60,6 @@ static void end_workqueue_fn(struct btrfs_work *work);
static void free_fs_root(struct btrfs_root *root);
static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
int read_only);
static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
struct btrfs_root *root);
static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
struct btrfs_root *root);
......@@ -3829,34 +3827,6 @@ static void btrfs_error_commit_super(struct btrfs_root *root)
btrfs_cleanup_transaction(root);
}
static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
struct btrfs_root *root)
{
struct btrfs_inode *btrfs_inode;
struct list_head splice;
INIT_LIST_HEAD(&splice);
mutex_lock(&root->fs_info->ordered_operations_mutex);
spin_lock(&root->fs_info->ordered_root_lock);
list_splice_init(&t->ordered_operations, &splice);
while (!list_empty(&splice)) {
btrfs_inode = list_entry(splice.next, struct btrfs_inode,
ordered_operations);
list_del_init(&btrfs_inode->ordered_operations);
spin_unlock(&root->fs_info->ordered_root_lock);
btrfs_invalidate_inodes(btrfs_inode->root);
spin_lock(&root->fs_info->ordered_root_lock);
}
spin_unlock(&root->fs_info->ordered_root_lock);
mutex_unlock(&root->fs_info->ordered_operations_mutex);
}
static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
{
struct btrfs_ordered_extent *ordered;
......@@ -4093,8 +4063,6 @@ again:
void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
struct btrfs_root *root)
{
btrfs_destroy_ordered_operations(cur_trans, root);
btrfs_destroy_delayed_refs(cur_trans, root);
cur_trans->state = TRANS_STATE_COMMIT_START;
......
......@@ -3057,7 +3057,7 @@ out:
static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
int full_backref, int inc, int no_quota)
int full_backref, int inc)
{
u64 bytenr;
u64 num_bytes;
......@@ -3111,7 +3111,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
key.offset -= btrfs_file_extent_offset(buf, fi);
ret = process_func(trans, root, bytenr, num_bytes,
parent, ref_root, key.objectid,
key.offset, no_quota);
key.offset, 1);
if (ret)
goto fail;
} else {
......@@ -3119,7 +3119,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
num_bytes = btrfs_level_size(root, level - 1);
ret = process_func(trans, root, bytenr, num_bytes,
parent, ref_root, level - 1, 0,
no_quota);
1);
if (ret)
goto fail;
}
......@@ -3130,15 +3130,15 @@ fail:
}
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *buf, int full_backref, int no_quota)
struct extent_buffer *buf, int full_backref)
{
return __btrfs_mod_ref(trans, root, buf, full_backref, 1, no_quota);
return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
}
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *buf, int full_backref, int no_quota)
struct extent_buffer *buf, int full_backref)
{
return __btrfs_mod_ref(trans, root, buf, full_backref, 0, no_quota);
return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
}
static int write_one_cache_group(struct btrfs_trans_handle *trans,
......@@ -7478,6 +7478,220 @@ reada:
wc->reada_slot = slot;
}
static int account_leaf_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *eb)
{
int nr = btrfs_header_nritems(eb);
int i, extent_type, ret;
struct btrfs_key key;
struct btrfs_file_extent_item *fi;
u64 bytenr, num_bytes;
for (i = 0; i < nr; i++) {
btrfs_item_key_to_cpu(eb, &key, i);
if (key.type != BTRFS_EXTENT_DATA_KEY)
continue;
fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
/* filter out non qgroup-accountable extents */
extent_type = btrfs_file_extent_type(eb, fi);
if (extent_type == BTRFS_FILE_EXTENT_INLINE)
continue;
bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
if (!bytenr)
continue;
num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
ret = btrfs_qgroup_record_ref(trans, root->fs_info,
root->objectid,
bytenr, num_bytes,
BTRFS_QGROUP_OPER_SUB_SUBTREE, 0);
if (ret)
return ret;
}
return 0;
}
/*
* Walk up the tree from the bottom, freeing leaves and any interior
* nodes which have had all slots visited. If a node (leaf or
* interior) is freed, the node above it will have it's slot
* incremented. The root node will never be freed.
*
* At the end of this function, we should have a path which has all
* slots incremented to the next position for a search. If we need to
* read a new node it will be NULL and the node above it will have the
* correct slot selected for a later read.
*
* If we increment the root nodes slot counter past the number of
* elements, 1 is returned to signal completion of the search.
*/
static int adjust_slots_upwards(struct btrfs_root *root,
struct btrfs_path *path, int root_level)
{
int level = 0;
int nr, slot;
struct extent_buffer *eb;
if (root_level == 0)
return 1;
while (level <= root_level) {
eb = path->nodes[level];
nr = btrfs_header_nritems(eb);
path->slots[level]++;
slot = path->slots[level];
if (slot >= nr || level == 0) {
/*
* Don't free the root - we will detect this
* condition after our loop and return a
* positive value for caller to stop walking the tree.
*/
if (level != root_level) {
btrfs_tree_unlock_rw(eb, path->locks[level]);
path->locks[level] = 0;
free_extent_buffer(eb);
path->nodes[level] = NULL;
path->slots[level] = 0;
}
} else {
/*
* We have a valid slot to walk back down
* from. Stop here so caller can process these
* new nodes.
*/
break;
}
level++;
}
eb = path->nodes[root_level];
if (path->slots[root_level] >= btrfs_header_nritems(eb))
return 1;
return 0;
}
/*
* root_eb is the subtree root and is locked before this function is called.
*/
static int account_shared_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *root_eb,
u64 root_gen,
int root_level)
{
int ret = 0;
int level;
struct extent_buffer *eb = root_eb;
struct btrfs_path *path = NULL;
BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL);
BUG_ON(root_eb == NULL);
if (!root->fs_info->quota_enabled)
return 0;
if (!extent_buffer_uptodate(root_eb)) {
ret = btrfs_read_buffer(root_eb, root_gen);
if (ret)
goto out;
}
if (root_level == 0) {
ret = account_leaf_items(trans, root, root_eb);
goto out;
}
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
/*
* Walk down the tree. Missing extent blocks are filled in as
* we go. Metadata is accounted every time we read a new
* extent block.
*
* When we reach a leaf, we account for file extent items in it,
* walk back up the tree (adjusting slot pointers as we go)
* and restart the search process.
*/
extent_buffer_get(root_eb); /* For path */
path->nodes[root_level] = root_eb;
path->slots[root_level] = 0;
path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
walk_down:
level = root_level;
while (level >= 0) {
if (path->nodes[level] == NULL) {
int child_bsize = root->nodesize;
int parent_slot;
u64 child_gen;
u64 child_bytenr;
/* We need to get child blockptr/gen from
* parent before we can read it. */
eb = path->nodes[level + 1];
parent_slot = path->slots[level + 1];
child_bytenr = btrfs_node_blockptr(eb, parent_slot);
child_gen = btrfs_node_ptr_generation(eb, parent_slot);
eb = read_tree_block(root, child_bytenr, child_bsize,
child_gen);
if (!eb || !extent_buffer_uptodate(eb)) {
ret = -EIO;
goto out;
}
path->nodes[level] = eb;
path->slots[level] = 0;
btrfs_tree_read_lock(eb);
btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
ret = btrfs_qgroup_record_ref(trans, root->fs_info,
root->objectid,
child_bytenr,
child_bsize,
BTRFS_QGROUP_OPER_SUB_SUBTREE,
0);
if (ret)
goto out;
}
if (level == 0) {
ret = account_leaf_items(trans, root, path->nodes[level]);
if (ret)
goto out;
/* Nonzero return here means we completed our search */
ret = adjust_slots_upwards(root, path, root_level);
if (ret)
break;
/* Restart search with new slots */
goto walk_down;
}
level--;
}
ret = 0;
out:
btrfs_free_path(path);
return ret;
}
/*
* helper to process tree block while walking down the tree.
*
......@@ -7532,9 +7746,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
/* wc->stage == UPDATE_BACKREF */
if (!(wc->flags[level] & flag)) {
BUG_ON(!path->locks[level]);
ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
ret = btrfs_inc_ref(trans, root, eb, 1);
BUG_ON(ret); /* -ENOMEM */
ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
ret = btrfs_dec_ref(trans, root, eb, 0);
BUG_ON(ret); /* -ENOMEM */
ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
eb->len, flag,
......@@ -7581,6 +7795,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
int level = wc->level;
int reada = 0;
int ret = 0;
bool need_account = false;
generation = btrfs_node_ptr_generation(path->nodes[level],
path->slots[level]);
......@@ -7626,6 +7841,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
if (wc->stage == DROP_REFERENCE) {
if (wc->refs[level - 1] > 1) {
need_account = true;
if (level == 1 &&
(wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
goto skip;
......@@ -7689,6 +7905,16 @@ skip:
parent = 0;
}
if (need_account) {
ret = account_shared_subtree(trans, root, next,
generation, level - 1);
if (ret) {
printk_ratelimited(KERN_ERR "BTRFS: %s Error "
"%d accounting shared subtree. Quota "
"is out of sync, rescan required.\n",
root->fs_info->sb->s_id, ret);
}
}
ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
root->root_key.objectid, level - 1, 0, 0);
BUG_ON(ret); /* -ENOMEM */
......@@ -7769,12 +7995,17 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (wc->refs[level] == 1) {
if (level == 0) {
if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
ret = btrfs_dec_ref(trans, root, eb, 1,
wc->for_reloc);
ret = btrfs_dec_ref(trans, root, eb, 1);
else
ret = btrfs_dec_ref(trans, root, eb, 0,
wc->for_reloc);
ret = btrfs_dec_ref(trans, root, eb, 0);
BUG_ON(ret); /* -ENOMEM */
ret = account_leaf_items(trans, root, eb);
if (ret) {
printk_ratelimited(KERN_ERR "BTRFS: %s Error "
"%d accounting leaf items. Quota "
"is out of sync, rescan required.\n",
root->fs_info->sb->s_id, ret);
}
}
/* make block locked assertion in clean_tree_block happy */
if (!path->locks[level] &&
......@@ -7900,6 +8131,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
int level;
bool root_dropped = false;
btrfs_debug(root->fs_info, "Drop subvolume %llu", root->objectid);
path = btrfs_alloc_path();
if (!path) {
err = -ENOMEM;
......@@ -8025,6 +8258,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
goto out_end_trans;
}
/*
* Qgroup update accounting is run from
* delayed ref handling. This usually works
* out because delayed refs are normally the
* only way qgroup updates are added. However,
* we may have added updates during our tree
* walk so run qgroups here to make sure we
* don't lose any updates.
*/
ret = btrfs_delayed_qgroup_accounting(trans,
root->fs_info);
if (ret)
printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
"running qgroup updates "
"during snapshot delete. "
"Quota is out of sync, "
"rescan required.\n", ret);
btrfs_end_transaction_throttle(trans, tree_root);
if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
pr_debug("BTRFS: drop snapshot early exit\n");
......@@ -8078,6 +8329,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
root_dropped = true;
out_end_trans:
ret = btrfs_delayed_qgroup_accounting(trans, tree_root->fs_info);
if (ret)
printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
"running qgroup updates "
"during snapshot delete. "
"Quota is out of sync, "
"rescan required.\n", ret);
btrfs_end_transaction_throttle(trans, tree_root);
out_free:
kfree(wc);
......
......@@ -756,7 +756,7 @@ again:
found_next = 1;
if (ret != 0)
goto insert;
slot = 0;
slot = path->slots[0];
}
btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
......
......@@ -1838,33 +1838,9 @@ out:
int btrfs_release_file(struct inode *inode, struct file *filp)
{
/*
* ordered_data_close is set by settattr when we are about to truncate
* a file from a non-zero size to a zero size. This tries to
* flush down new bytes that may have been written if the
* application were using truncate to replace a file in place.
*/
if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
&BTRFS_I(inode)->runtime_flags)) {
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
/*
* We need to block on a committing transaction to keep us from
* throwing a ordered operation on to the list and causing
* something like sync to deadlock trying to flush out this
* inode.
*/
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans))
return PTR_ERR(trans);
btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode);
btrfs_end_transaction(trans, root);
if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
filemap_flush(inode->i_mapping);
}
if (filp->private_data)
btrfs_ioctl_trans_end(filp);
filemap_flush(inode->i_mapping);
return 0;
}
......
......@@ -709,6 +709,18 @@ retry:
unlock_extent(io_tree, async_extent->start,
async_extent->start +
async_extent->ram_size - 1);
/*
* we need to redirty the pages if we decide to
* fallback to uncompressed IO, otherwise we
* will not submit these pages down to lower
* layers.
*/
extent_range_redirty_for_io(inode,
async_extent->start,
async_extent->start +
async_extent->ram_size - 1);
goto retry;