Commit 17d217fe authored by Yan Zheng's avatar Yan Zheng Committed by Chris Mason
Browse files

Btrfs: fix nodatasum handling in balancing code



Checksums on data can be disabled by mount option, so it's
possible some data extents don't have checksums or have
invalid checksums. This causes trouble for data relocation.
This patch contains following things to make data relocation
work.

1) make nodatasum/nodatacow mount option only affects new
files. Checksums and COW on data are only controlled by the
inode flags.

2) check the existence of checksum in the nodatacow checker.
If checksums exist, force COW the data extent. This ensure that
checksum for a given block is either valid or does not exist.

3) update data relocation code to properly handle the case
of checksum missing.
Signed-off-by: default avatarYan Zheng <zheng.yan@oracle.com>
parent e4404d6e
......@@ -124,8 +124,7 @@ static int check_compressed_csum(struct inode *inode,
u32 csum;
u32 *cb_sum = &cb->sums;
if (btrfs_test_opt(root, NODATASUM) ||
btrfs_test_flag(inode, NODATASUM))
if (btrfs_test_flag(inode, NODATASUM))
return 0;
for (i = 0; i < cb->nr_pages; i++) {
......@@ -671,8 +670,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
*/
atomic_inc(&cb->pending_bios);
if (!btrfs_test_opt(root, NODATASUM) &&
!btrfs_test_flag(inode, NODATASUM)) {
if (!btrfs_test_flag(inode, NODATASUM)) {
btrfs_lookup_bio_sums(root, inode, comp_bio,
sums);
}
......@@ -699,8 +697,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
BUG_ON(ret);
if (!btrfs_test_opt(root, NODATASUM) &&
!btrfs_test_flag(inode, NODATASUM)) {
if (!btrfs_test_flag(inode, NODATASUM)) {
btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
}
......
......@@ -1702,7 +1702,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *leaf);
int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr);
struct btrfs_root *root, u64 objectid, u64 bytenr);
int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
......@@ -1789,6 +1789,7 @@ int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
struct extent_buffer *buf, u64 orig_start);
int btrfs_add_dead_reloc_root(struct btrfs_root *root);
int btrfs_cleanup_reloc_trees(struct btrfs_root *root);
int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
/* ctree.c */
int btrfs_previous_item(struct btrfs_root *root,
......@@ -1994,6 +1995,8 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_path *path,
u64 isize);
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start,
u64 end, struct list_head *list);
/* inode.c */
/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
......
......@@ -1359,7 +1359,7 @@ out:
}
int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr)
struct btrfs_root *root, u64 objectid, u64 bytenr)
{
struct btrfs_root *extent_root = root->fs_info->extent_root;
struct btrfs_path *path;
......@@ -1418,8 +1418,9 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
ref_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_extent_ref);
ref_root = btrfs_ref_root(leaf, ref_item);
if (ref_root != root->root_key.objectid &&
ref_root != BTRFS_TREE_LOG_OBJECTID) {
if ((ref_root != root->root_key.objectid &&
ref_root != BTRFS_TREE_LOG_OBJECTID) ||
objectid != btrfs_ref_objectid(leaf, ref_item)) {
ret = 1;
goto out;
}
......@@ -5367,7 +5368,6 @@ static int noinline relocate_one_extent(struct btrfs_root *extent_root,
if (ret)
goto out;
}
btrfs_record_root_in_trans(found_root);
ret = replace_one_extent(trans, found_root,
path, extent_key,
&first_key, ref_path,
......@@ -5534,6 +5534,7 @@ static struct inode noinline *create_reloc_inode(struct btrfs_fs_info *fs_info,
} else {
BUG_ON(1);
}
BTRFS_I(inode)->index_cnt = group->key.objectid;
err = btrfs_orphan_add(trans, inode);
out:
......@@ -5546,6 +5547,47 @@ out:
return inode;
}
int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
{
struct btrfs_ordered_sum *sums;
struct btrfs_sector_sum *sector_sum;
struct btrfs_ordered_extent *ordered;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct list_head list;
size_t offset;
int ret;
u64 disk_bytenr;
INIT_LIST_HEAD(&list);
ordered = btrfs_lookup_ordered_extent(inode, file_pos);
BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
ret = btrfs_lookup_csums_range(root, disk_bytenr,
disk_bytenr + len - 1, &list);
while (!list_empty(&list)) {
sums = list_entry(list.next, struct btrfs_ordered_sum, list);
list_del_init(&sums->list);
sector_sum = sums->sums;
sums->bytenr = ordered->start;
offset = 0;
while (offset < sums->len) {
sector_sum->bytenr += ordered->start - disk_bytenr;
sector_sum++;
offset += root->sectorsize;
}
btrfs_add_ordered_sum(inode, ordered, sums);
}
btrfs_put_ordered_extent(ordered);
return 0;
}
int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
{
struct btrfs_trans_handle *trans;
......
......@@ -16,6 +16,7 @@
#define EXTENT_ORDERED (1 << 9)
#define EXTENT_ORDERED_METADATA (1 << 10)
#define EXTENT_BOUNDARY (1 << 11)
#define EXTENT_NODATASUM (1 << 12)
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
/* flags for bio submission */
......
......@@ -140,6 +140,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
return ret;
}
int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
struct bio *bio, u32 *dst)
{
......@@ -185,9 +186,16 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
if (ret == -ENOENT || ret == -EFBIG)
ret = 0;
sum = 0;
printk("no csum found for inode %lu start "
"%llu\n", inode->i_ino,
(unsigned long long)offset);
if (BTRFS_I(inode)->root->root_key.objectid ==
BTRFS_DATA_RELOC_TREE_OBJECTID) {
set_extent_bits(io_tree, offset,
offset + bvec->bv_len - 1,
EXTENT_NODATASUM, GFP_NOFS);
} else {
printk("no csum found for inode %lu "
"start %llu\n", inode->i_ino,
(unsigned long long)offset);
}
item = NULL;
btrfs_release_path(root, path);
goto found;
......@@ -228,6 +236,106 @@ found:
return 0;
}
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list)
{
struct btrfs_key key;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_ordered_sum *sums;
struct btrfs_sector_sum *sector_sum;
struct btrfs_csum_item *item;
unsigned long offset;
int ret;
size_t size;
u64 csum_end;
u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
path = btrfs_alloc_path();
BUG_ON(!path);
key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
key.offset = start;
key.type = BTRFS_EXTENT_CSUM_KEY;
ret = btrfs_search_slot(NULL, root->fs_info->csum_root,
&key, path, 0, 0);
if (ret < 0)
goto fail;
if (ret > 0 && path->slots[0] > 0) {
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID &&
key.type == BTRFS_EXTENT_CSUM_KEY) {
offset = (start - key.offset) >>
root->fs_info->sb->s_blocksize_bits;
if (offset * csum_size <
btrfs_item_size_nr(leaf, path->slots[0] - 1))
path->slots[0]--;
}
}
while (start <= end) {
leaf = path->nodes[0];
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root->fs_info->csum_root, path);
if (ret < 0)
goto fail;
if (ret > 0)
break;
leaf = path->nodes[0];
}
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
key.type != BTRFS_EXTENT_CSUM_KEY)
break;
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.offset > end)
break;
if (key.offset > start)
start = key.offset;
size = btrfs_item_size_nr(leaf, path->slots[0]);
csum_end = key.offset + (size / csum_size) * root->sectorsize;
size = min(csum_end, end + 1) - start;
sums = kzalloc(btrfs_ordered_sum_size(root, size), GFP_NOFS);
BUG_ON(!sums);
sector_sum = sums->sums;
sums->bytenr = start;
sums->len = size;
offset = (start - key.offset) >>
root->fs_info->sb->s_blocksize_bits;
offset *= csum_size;
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_csum_item);
while (size > 0) {
read_extent_buffer(path->nodes[0], &sector_sum->sum,
((unsigned long)item) + offset,
csum_size);
sector_sum->bytenr = start;
size -= root->sectorsize;
start += root->sectorsize;
offset += csum_size;
sector_sum++;
}
list_add_tail(&sums->list, list);
path->slots[0]++;
}
ret = 0;
fail:
btrfs_free_path(path);
return ret;
}
int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
struct bio *bio, u64 file_start, int contig)
{
......
......@@ -1059,14 +1059,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
first_index = pos >> PAGE_CACHE_SHIFT;
last_index = (pos + count) >> PAGE_CACHE_SHIFT;
/*
* if this is a nodatasum mount, force summing off for the inode
* all the time. That way a later mount with summing on won't
* get confused
*/
if (btrfs_test_opt(root, NODATASUM))
btrfs_set_flag(inode, NODATASUM);
/*
* there are lots of better ways to do this, but this code
* makes sure the first and last page in the file range are
......
......@@ -771,6 +771,13 @@ static noinline int cow_file_range(struct inode *inode,
ram_size, cur_alloc_size, 0);
BUG_ON(ret);
if (root->root_key.objectid ==
BTRFS_DATA_RELOC_TREE_OBJECTID) {
ret = btrfs_reloc_clone_csums(inode, start,
cur_alloc_size);
BUG_ON(ret);
}
if (disk_num_bytes < cur_alloc_size) {
printk("num_bytes %Lu cur_alloc %Lu\n", disk_num_bytes,
cur_alloc_size);
......@@ -910,6 +917,26 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
return 0;
}
static int noinline csum_exist_in_range(struct btrfs_root *root,
u64 bytenr, u64 num_bytes)
{
int ret;
struct btrfs_ordered_sum *sums;
LIST_HEAD(list);
ret = btrfs_lookup_csums_range(root, bytenr, bytenr + num_bytes - 1,
&list);
if (ret == 0 && list_empty(&list))
return 0;
while (!list_empty(&list)) {
sums = list_entry(list.next, struct btrfs_ordered_sum, list);
list_del(&sums->list);
kfree(sums);
}
return 1;
}
/*
* when nowcow writeback call back. This checks for snapshots or COW copies
* of the extents that exist in the file, and COWs the file as required.
......@@ -971,6 +998,7 @@ next_slot:
nocow = 0;
disk_bytenr = 0;
num_bytes = 0;
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
if (found_key.objectid > inode->i_ino ||
......@@ -996,19 +1024,29 @@ next_slot:
path->slots[0]++;
goto next_slot;
}
if (disk_bytenr == 0)
goto out_check;
if (btrfs_file_extent_compression(leaf, fi) ||
btrfs_file_extent_encryption(leaf, fi) ||
btrfs_file_extent_other_encoding(leaf, fi))
goto out_check;
if (disk_bytenr == 0)
goto out_check;
if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
goto out_check;
if (btrfs_cross_ref_exist(trans, root, disk_bytenr))
goto out_check;
if (btrfs_extent_readonly(root, disk_bytenr))
goto out_check;
if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
disk_bytenr))
goto out_check;
disk_bytenr += btrfs_file_extent_offset(leaf, fi);
disk_bytenr += cur_offset - found_key.offset;
num_bytes = min(end + 1, extent_end) - cur_offset;
/*
* force cow if csum exists in the range.
* this ensure that csum for a given extent are
* either valid or do not exist.
*/
if (csum_exist_in_range(root, disk_bytenr, num_bytes))
goto out_check;
nocow = 1;
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
extent_end = found_key.offset +
......@@ -1041,8 +1079,6 @@ out_check:
cow_start = (u64)-1;
}
disk_bytenr += cur_offset - found_key.offset;
num_bytes = min(end + 1, extent_end) - cur_offset;
if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
struct extent_map *em;
struct extent_map_tree *em_tree;
......@@ -1105,11 +1141,9 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
u64 start, u64 end, int *page_started,
unsigned long *nr_written)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
if (btrfs_test_opt(root, NODATACOW) ||
btrfs_test_flag(inode, NODATACOW))
if (btrfs_test_flag(inode, NODATACOW))
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 1, nr_written);
else if (btrfs_test_flag(inode, PREALLOC))
......@@ -1252,8 +1286,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
BUG_ON(ret);
skip_sum = btrfs_test_opt(root, NODATASUM) ||
btrfs_test_flag(inode, NODATASUM);
skip_sum = btrfs_test_flag(inode, NODATASUM);
if (!(rw & (1 << BIO_RW))) {
if (bio_flags & EXTENT_BIO_COMPRESSED) {
......@@ -1263,6 +1296,9 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
btrfs_lookup_bio_sums(root, inode, bio, NULL);
goto mapit;
} else if (!skip_sum) {
/* csum items have already been cloned */
if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
goto mapit;
/* we're doing a write, do the async checksumming */
return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
inode, rw, bio, mirror_num,
......@@ -1692,9 +1728,15 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
ClearPageChecked(page);
goto good;
}
if (btrfs_test_opt(root, NODATASUM) ||
btrfs_test_flag(inode, NODATASUM))
if (btrfs_test_flag(inode, NODATASUM))
return 0;
if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) {
clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
GFP_NOFS);
return 0;
}
if (state && state->start == start) {
private = state->private;
......@@ -3391,6 +3433,12 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
owner = 1;
BTRFS_I(inode)->block_group =
btrfs_find_block_group(root, 0, alloc_hint, owner);
if ((mode & S_IFREG)) {
if (btrfs_test_opt(root, NODATASUM))
btrfs_set_flag(inode, NODATASUM);
if (btrfs_test_opt(root, NODATACOW))
btrfs_set_flag(inode, NODATACOW);
}
key[0].objectid = objectid;
btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment