Commit 839a3f76 authored by Linus Torvalds
Browse files

Merge branch 'for-linus-4.6' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs

Pull btrfs fixes from Chris Mason:
 "These are bug fixes, including a really old fsync bug, and a few trace
  points to help us track down problems in the quota code"

* 'for-linus-4.6' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs:
  Btrfs: fix file/data loss caused by fsync after rename and new inode
  btrfs: Reset IO error counters before start of device replacing
  btrfs: Add qgroup tracing
  Btrfs: don't use src fd for printk
  btrfs: fallback to vmalloc in btrfs_compare_tree
  btrfs: handle non-fatal errors in btrfs_qgroup_inherit()
  btrfs: Output more info for enospc_debug mount option
  Btrfs: fix invalid reference in replace_path
  Btrfs: Improve FL_KEEP_SIZE handling in fallocate
parents 67592126 56f23fdb
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include <linux/sched.h> #include <linux/sched.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/rbtree.h> #include <linux/rbtree.h>
#include <linux/vmalloc.h>
#include "ctree.h" #include "ctree.h"
#include "disk-io.h" #include "disk-io.h"
#include "transaction.h" #include "transaction.h"
...@@ -5361,10 +5362,13 @@ int btrfs_compare_trees(struct btrfs_root *left_root, ...@@ -5361,10 +5362,13 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
goto out; goto out;
} }
tmp_buf = kmalloc(left_root->nodesize, GFP_KERNEL); tmp_buf = kmalloc(left_root->nodesize, GFP_KERNEL | __GFP_NOWARN);
if (!tmp_buf) { if (!tmp_buf) {
ret = -ENOMEM; tmp_buf = vmalloc(left_root->nodesize);
goto out; if (!tmp_buf) {
ret = -ENOMEM;
goto out;
}
} }
left_path->search_commit_root = 1; left_path->search_commit_root = 1;
...@@ -5565,7 +5569,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, ...@@ -5565,7 +5569,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
out: out:
btrfs_free_path(left_path); btrfs_free_path(left_path);
btrfs_free_path(right_path); btrfs_free_path(right_path);
kfree(tmp_buf); kvfree(tmp_buf);
return ret; return ret;
} }
......
...@@ -394,6 +394,8 @@ int btrfs_dev_replace_start(struct btrfs_root *root, ...@@ -394,6 +394,8 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
dev_replace->cursor_right = 0; dev_replace->cursor_right = 0;
dev_replace->is_valid = 1; dev_replace->is_valid = 1;
dev_replace->item_needs_writeback = 1; dev_replace->item_needs_writeback = 1;
atomic64_set(&dev_replace->num_write_errors, 0);
atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
btrfs_dev_replace_unlock(dev_replace, 1); btrfs_dev_replace_unlock(dev_replace, 1);
......
...@@ -9386,15 +9386,23 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) ...@@ -9386,15 +9386,23 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
u64 dev_min = 1; u64 dev_min = 1;
u64 dev_nr = 0; u64 dev_nr = 0;
u64 target; u64 target;
int debug;
int index; int index;
int full = 0; int full = 0;
int ret = 0; int ret = 0;
debug = btrfs_test_opt(root, ENOSPC_DEBUG);
block_group = btrfs_lookup_block_group(root->fs_info, bytenr); block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
/* odd, couldn't find the block group, leave it alone */ /* odd, couldn't find the block group, leave it alone */
if (!block_group) if (!block_group) {
if (debug)
btrfs_warn(root->fs_info,
"can't find block group for bytenr %llu",
bytenr);
return -1; return -1;
}
min_free = btrfs_block_group_used(&block_group->item); min_free = btrfs_block_group_used(&block_group->item);
...@@ -9448,8 +9456,13 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) ...@@ -9448,8 +9456,13 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
* this is just a balance, so if we were marked as full * this is just a balance, so if we were marked as full
* we know there is no space for a new chunk * we know there is no space for a new chunk
*/ */
if (full) if (full) {
if (debug)
btrfs_warn(root->fs_info,
"no space to alloc new chunk for block group %llu",
block_group->key.objectid);
goto out; goto out;
}
index = get_block_group_index(block_group); index = get_block_group_index(block_group);
} }
...@@ -9496,6 +9509,10 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) ...@@ -9496,6 +9509,10 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
ret = -1; ret = -1;
} }
} }
if (debug && ret == -1)
btrfs_warn(root->fs_info,
"no space to allocate a new chunk for block group %llu",
block_group->key.objectid);
mutex_unlock(&root->fs_info->chunk_mutex); mutex_unlock(&root->fs_info->chunk_mutex);
btrfs_end_transaction(trans, root); btrfs_end_transaction(trans, root);
out: out:
......
...@@ -2682,9 +2682,12 @@ static long btrfs_fallocate(struct file *file, int mode, ...@@ -2682,9 +2682,12 @@ static long btrfs_fallocate(struct file *file, int mode,
return ret; return ret;
inode_lock(inode); inode_lock(inode);
ret = inode_newsize_ok(inode, alloc_end);
if (ret) if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
goto out; ret = inode_newsize_ok(inode, offset + len);
if (ret)
goto out;
}
/* /*
* TODO: Move these two operations after we have checked * TODO: Move these two operations after we have checked
......
...@@ -1654,7 +1654,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, ...@@ -1654,7 +1654,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
src_inode = file_inode(src.file); src_inode = file_inode(src.file);
if (src_inode->i_sb != file_inode(file)->i_sb) { if (src_inode->i_sb != file_inode(file)->i_sb) {
btrfs_info(BTRFS_I(src_inode)->root->fs_info, btrfs_info(BTRFS_I(file_inode(file))->root->fs_info,
"Snapshot src from another FS"); "Snapshot src from another FS");
ret = -EXDEV; ret = -EXDEV;
} else if (!inode_owner_or_capable(src_inode)) { } else if (!inode_owner_or_capable(src_inode)) {
......
...@@ -1463,6 +1463,7 @@ struct btrfs_qgroup_extent_record ...@@ -1463,6 +1463,7 @@ struct btrfs_qgroup_extent_record
u64 bytenr = record->bytenr; u64 bytenr = record->bytenr;
assert_spin_locked(&delayed_refs->lock); assert_spin_locked(&delayed_refs->lock);
trace_btrfs_qgroup_insert_dirty_extent(record);
while (*p) { while (*p) {
parent_node = *p; parent_node = *p;
...@@ -1594,6 +1595,9 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info, ...@@ -1594,6 +1595,9 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
trace_qgroup_update_counters(qg->qgroupid, cur_old_count,
cur_new_count);
/* Rfer update part */ /* Rfer update part */
if (cur_old_count == 0 && cur_new_count > 0) { if (cur_old_count == 0 && cur_new_count > 0) {
qg->rfer += num_bytes; qg->rfer += num_bytes;
...@@ -1683,6 +1687,9 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, ...@@ -1683,6 +1687,9 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
goto out_free; goto out_free;
BUG_ON(!fs_info->quota_root); BUG_ON(!fs_info->quota_root);
trace_btrfs_qgroup_account_extent(bytenr, num_bytes, nr_old_roots,
nr_new_roots);
qgroups = ulist_alloc(GFP_NOFS); qgroups = ulist_alloc(GFP_NOFS);
if (!qgroups) { if (!qgroups) {
ret = -ENOMEM; ret = -ENOMEM;
...@@ -1752,6 +1759,8 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, ...@@ -1752,6 +1759,8 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
record = rb_entry(node, struct btrfs_qgroup_extent_record, record = rb_entry(node, struct btrfs_qgroup_extent_record,
node); node);
trace_btrfs_qgroup_account_extents(record);
if (!ret) { if (!ret) {
/* /*
* Use (u64)-1 as time_seq to do special search, which * Use (u64)-1 as time_seq to do special search, which
...@@ -1842,8 +1851,10 @@ out: ...@@ -1842,8 +1851,10 @@ out:
} }
/* /*
* copy the acounting information between qgroups. This is necessary when a * Copy the acounting information between qgroups. This is necessary
* snapshot or a subvolume is created * when a snapshot or a subvolume is created. Throwing an error will
* cause a transaction abort so we take extra care here to only error
* when a readonly fs is a reasonable outcome.
*/ */
int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid, struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
...@@ -1873,15 +1884,15 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, ...@@ -1873,15 +1884,15 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
2 * inherit->num_excl_copies; 2 * inherit->num_excl_copies;
for (i = 0; i < nums; ++i) { for (i = 0; i < nums; ++i) {
srcgroup = find_qgroup_rb(fs_info, *i_qgroups); srcgroup = find_qgroup_rb(fs_info, *i_qgroups);
if (!srcgroup) {
ret = -EINVAL;
goto out;
}
if ((srcgroup->qgroupid >> 48) <= (objectid >> 48)) { /*
ret = -EINVAL; * Zero out invalid groups so we can ignore
goto out; * them later.
} */
if (!srcgroup ||
((srcgroup->qgroupid >> 48) <= (objectid >> 48)))
*i_qgroups = 0ULL;
++i_qgroups; ++i_qgroups;
} }
} }
...@@ -1916,17 +1927,19 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, ...@@ -1916,17 +1927,19 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
*/ */
if (inherit) { if (inherit) {
i_qgroups = (u64 *)(inherit + 1); i_qgroups = (u64 *)(inherit + 1);
for (i = 0; i < inherit->num_qgroups; ++i) { for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) {
if (*i_qgroups == 0)
continue;
ret = add_qgroup_relation_item(trans, quota_root, ret = add_qgroup_relation_item(trans, quota_root,
objectid, *i_qgroups); objectid, *i_qgroups);
if (ret) if (ret && ret != -EEXIST)
goto out; goto out;
ret = add_qgroup_relation_item(trans, quota_root, ret = add_qgroup_relation_item(trans, quota_root,
*i_qgroups, objectid); *i_qgroups, objectid);
if (ret) if (ret && ret != -EEXIST)
goto out; goto out;
++i_qgroups;
} }
ret = 0;
} }
...@@ -1987,17 +2000,22 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, ...@@ -1987,17 +2000,22 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
i_qgroups = (u64 *)(inherit + 1); i_qgroups = (u64 *)(inherit + 1);
for (i = 0; i < inherit->num_qgroups; ++i) { for (i = 0; i < inherit->num_qgroups; ++i) {
ret = add_relation_rb(quota_root->fs_info, objectid, if (*i_qgroups) {
*i_qgroups); ret = add_relation_rb(quota_root->fs_info, objectid,
if (ret) *i_qgroups);
goto unlock; if (ret)
goto unlock;
}
++i_qgroups; ++i_qgroups;
} }
for (i = 0; i < inherit->num_ref_copies; ++i) { for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) {
struct btrfs_qgroup *src; struct btrfs_qgroup *src;
struct btrfs_qgroup *dst; struct btrfs_qgroup *dst;
if (!i_qgroups[0] || !i_qgroups[1])
continue;
src = find_qgroup_rb(fs_info, i_qgroups[0]); src = find_qgroup_rb(fs_info, i_qgroups[0]);
dst = find_qgroup_rb(fs_info, i_qgroups[1]); dst = find_qgroup_rb(fs_info, i_qgroups[1]);
...@@ -2008,12 +2026,14 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, ...@@ -2008,12 +2026,14 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
dst->rfer = src->rfer - level_size; dst->rfer = src->rfer - level_size;
dst->rfer_cmpr = src->rfer_cmpr - level_size; dst->rfer_cmpr = src->rfer_cmpr - level_size;
i_qgroups += 2;
} }
for (i = 0; i < inherit->num_excl_copies; ++i) { for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) {
struct btrfs_qgroup *src; struct btrfs_qgroup *src;
struct btrfs_qgroup *dst; struct btrfs_qgroup *dst;
if (!i_qgroups[0] || !i_qgroups[1])
continue;
src = find_qgroup_rb(fs_info, i_qgroups[0]); src = find_qgroup_rb(fs_info, i_qgroups[0]);
dst = find_qgroup_rb(fs_info, i_qgroups[1]); dst = find_qgroup_rb(fs_info, i_qgroups[1]);
...@@ -2024,7 +2044,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, ...@@ -2024,7 +2044,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
dst->excl = src->excl + level_size; dst->excl = src->excl + level_size;
dst->excl_cmpr = src->excl_cmpr + level_size; dst->excl_cmpr = src->excl_cmpr + level_size;
i_qgroups += 2;
} }
unlock: unlock:
......
...@@ -1850,6 +1850,7 @@ again: ...@@ -1850,6 +1850,7 @@ again:
eb = read_tree_block(dest, old_bytenr, old_ptr_gen); eb = read_tree_block(dest, old_bytenr, old_ptr_gen);
if (IS_ERR(eb)) { if (IS_ERR(eb)) {
ret = PTR_ERR(eb); ret = PTR_ERR(eb);
break;
} else if (!extent_buffer_uptodate(eb)) { } else if (!extent_buffer_uptodate(eb)) {
ret = -EIO; ret = -EIO;
free_extent_buffer(eb); free_extent_buffer(eb);
......
...@@ -4415,6 +4415,127 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, ...@@ -4415,6 +4415,127 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
return ret; return ret;
} }
/*
* When we are logging a new inode X, check if it doesn't have a reference that
* matches the reference from some other inode Y created in a past transaction
* and that was renamed in the current transaction. If we don't do this, then at
* log replay time we can lose inode Y (and all its files if it's a directory):
*
* mkdir /mnt/x
* echo "hello world" > /mnt/x/foobar
* sync
* mv /mnt/x /mnt/y
* mkdir /mnt/x # or touch /mnt/x
* xfs_io -c fsync /mnt/x
* <power fail>
* mount fs, trigger log replay
*
* After the log replay procedure, we would lose the first directory and all its
* files (file foobar).
* For the case where inode Y is not a directory we simply end up losing it:
*
* echo "123" > /mnt/foo
* sync
* mv /mnt/foo /mnt/bar
* echo "abc" > /mnt/foo
* xfs_io -c fsync /mnt/foo
* <power fail>
*
* We also need this for cases where a snapshot entry is replaced by some other
* entry (file or directory) otherwise we end up with an unreplayable log due to
* attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
* if it were a regular entry:
*
* mkdir /mnt/x
* btrfs subvolume snapshot /mnt /mnt/x/snap
* btrfs subvolume delete /mnt/x/snap
* rmdir /mnt/x
* mkdir /mnt/x
* fsync /mnt/x or fsync some new file inside it
* <power fail>
*
* The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
* the same transaction.
*/
/*
 * Walk every name stored in the INODE_REF / INODE_EXTREF item at @slot of @eb
 * and look each one up as a dir item, searching the commit root of @inode's
 * root without taking locks.
 *
 * Returns 1 as soon as one of the names is found, 0 if none of them is found,
 * or a negative errno (-ENOMEM, or the error reported by the dir item lookup).
 */
static int btrfs_check_ref_name_override(struct extent_buffer *eb,
					 const int slot,
					 const struct btrfs_key *key,
					 struct inode *inode)
{
	const u32 item_len = btrfs_item_size_nr(eb, slot);
	const unsigned long item_start = btrfs_item_ptr_offset(eb, slot);
	struct btrfs_path *commit_path;
	char *name_buf = NULL;
	u32 name_buf_len = 0;
	u32 pos;
	u32 step;
	int ret = 0;

	commit_path = btrfs_alloc_path();
	if (!commit_path)
		return -ENOMEM;
	commit_path->search_commit_root = 1;
	commit_path->skip_locking = 1;

	/* One iteration per (parent, name) entry packed into the item. */
	for (pos = 0; pos < item_len; pos += step) {
		struct btrfs_dir_item *found;
		unsigned long name_off;
		u32 cur_name_len;
		u64 dir_ino;

		if (key->type != BTRFS_INODE_REF_KEY) {
			struct btrfs_inode_extref *extref;

			/* Extended refs carry the parent inside each entry. */
			extref = (struct btrfs_inode_extref *)(item_start +
							       pos);
			dir_ino = btrfs_inode_extref_parent(eb, extref);
			cur_name_len = btrfs_inode_extref_name_len(eb, extref);
			name_off = (unsigned long)&extref->name;
			step = sizeof(*extref) + cur_name_len;
		} else {
			struct btrfs_inode_ref *iref;

			/* Plain refs take the parent from the key offset. */
			iref = (struct btrfs_inode_ref *)(item_start + pos);
			dir_ino = key->offset;
			cur_name_len = btrfs_inode_ref_name_len(eb, iref);
			name_off = (unsigned long)(iref + 1);
			step = sizeof(*iref) + cur_name_len;
		}

		/* Grow the scratch buffer only when this name is longer. */
		if (name_buf_len < cur_name_len) {
			char *grown;

			grown = krealloc(name_buf, cur_name_len, GFP_NOFS);
			if (!grown) {
				ret = -ENOMEM;
				goto out;
			}
			name_buf = grown;
			name_buf_len = cur_name_len;
		}

		read_extent_buffer(eb, name_buf, name_off, cur_name_len);
		found = btrfs_lookup_dir_item(NULL, BTRFS_I(inode)->root,
					      commit_path, dir_ino,
					      name_buf, cur_name_len, 0);
		if (IS_ERR(found)) {
			ret = PTR_ERR(found);
			goto out;
		}
		if (found) {
			ret = 1;
			goto out;
		}
		/* Not found: drop the path so it can be reused next round. */
		btrfs_release_path(commit_path);
	}
out:
	btrfs_free_path(commit_path);
	kfree(name_buf);
	return ret;
}
/* log a single inode in the tree log. /* log a single inode in the tree log.
* At least one parent directory for this inode must exist in the tree * At least one parent directory for this inode must exist in the tree
* or be logged already. * or be logged already.
...@@ -4602,6 +4723,22 @@ again: ...@@ -4602,6 +4723,22 @@ again:
if (min_key.type == BTRFS_INODE_ITEM_KEY) if (min_key.type == BTRFS_INODE_ITEM_KEY)
need_log_inode_item = false; need_log_inode_item = false;
if ((min_key.type == BTRFS_INODE_REF_KEY ||
min_key.type == BTRFS_INODE_EXTREF_KEY) &&
BTRFS_I(inode)->generation == trans->transid) {
ret = btrfs_check_ref_name_override(path->nodes[0],
path->slots[0],
&min_key, inode);
if (ret < 0) {
err = ret;
goto out_unlock;
} else if (ret > 0) {
err = 1;
btrfs_set_log_full_commit(root->fs_info, trans);
goto out_unlock;
}
}
/* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
if (min_key.type == BTRFS_XATTR_ITEM_KEY) { if (min_key.type == BTRFS_XATTR_ITEM_KEY) {
if (ins_nr == 0) if (ins_nr == 0)
......
...@@ -23,7 +23,7 @@ struct map_lookup; ...@@ -23,7 +23,7 @@ struct map_lookup;
struct extent_buffer; struct extent_buffer;
struct btrfs_work; struct btrfs_work;
struct __btrfs_workqueue; struct __btrfs_workqueue;
struct btrfs_qgroup_operation; struct btrfs_qgroup_extent_record;
#define show_ref_type(type) \ #define show_ref_type(type) \
__print_symbolic(type, \ __print_symbolic(type, \
...@@ -1231,6 +1231,93 @@ DEFINE_EVENT(btrfs__qgroup_delayed_ref, btrfs_qgroup_free_delayed_ref, ...@@ -1231,6 +1231,93 @@ DEFINE_EVENT(btrfs__qgroup_delayed_ref, btrfs_qgroup_free_delayed_ref,
TP_ARGS(ref_root, reserved) TP_ARGS(ref_root, reserved)
); );
/*
 * Event class for qgroup tracepoints that take a single
 * struct btrfs_qgroup_extent_record; it captures the record's bytenr and
 * num_bytes and prints both as unsigned 64-bit values.
 */
DECLARE_EVENT_CLASS(btrfs_qgroup_extent,
	TP_PROTO(struct btrfs_qgroup_extent_record *rec),
	TP_ARGS(rec),
	TP_STRUCT__entry(
		__field(	u64,  bytenr		)
		__field(	u64,  num_bytes		)
	),
	TP_fast_assign(
		/* Each assignment is its own statement. */
		__entry->bytenr		= rec->bytenr;
		__entry->num_bytes	= rec->num_bytes;
	),
	TP_printk("bytenr = %llu, num_bytes = %llu",
		  (unsigned long long)__entry->bytenr,
		  (unsigned long long)__entry->num_bytes)
);
/*
 * Instance of the btrfs_qgroup_extent class; takes the extent record being
 * accounted and logs its bytenr and num_bytes.
 */
DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_account_extents,
TP_PROTO(struct btrfs_qgroup_extent_record *rec),
TP_ARGS(rec)
);
/*
 * Instance of the btrfs_qgroup_extent class; takes the dirty extent record
 * being inserted and logs its bytenr and num_bytes.
 */
DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_insert_dirty_extent,
TP_PROTO(struct btrfs_qgroup_extent_record *rec),
TP_ARGS(rec)
);
TRACE_EVENT(btrfs_qgroup_account_extent,
TP_PROTO(u64 bytenr, u64 num_bytes, u64 nr_old_roots, u64 nr_new_roots),
TP_ARGS(bytenr, num_bytes, nr_old_roots, nr_new_roots),
TP_STRUCT__entry(
__field( u64, bytenr )
__field( u64, num_bytes )
__field( u64, nr_old_roots )
__field( u64, nr_new_roots )
),
TP_fast_assign(
__entry->