Commit 1a40e23b authored by Zheng Yan's avatar Zheng Yan Committed by Chris Mason
Browse files

Btrfs: update space balancing code



This patch updates the space balancing code to utilize the new
backref format.  Before, btrfs-vol -b would break any COW links
on data blocks or metadata.  This was slow and caused the amount
of space used to explode if a large number of snapshots were present.

The new code can keeps the sharing of all data extents and
most of the tree blocks.

To maintain the sharing of data extents, the space balance code uses
a seperate inode hold data extent pointers, then updates the references
to point to the new location.

To maintain the sharing of tree blocks, the space balance code uses
reloc trees to relocate tree blocks in reference counted roots.
There is one reloc tree for each subvol, and all reloc trees share
same root key objectid. Reloc trees are snapshots of the latest
committed roots of subvols (root->commit_root).

To relocate a tree block referenced by a subvol, there are two steps.
COW the block through subvol's reloc tree, then update block pointer in
the subvol to point to the new block. Since all reloc trees share
same root key objectid, doing special handing for tree blocks
owned by them is easy. Once a tree block has been COWed in one
reloc tree, we can use the resulting new block directly when the
same block is required to COW again through other reloc trees.
In this way, relocated tree blocks are shared between reloc trees,
so they are also shared between subvols.
Signed-off-by: default avatarChris Mason <chris.mason@oracle.com>
parent 5b21f2ed
......@@ -179,7 +179,6 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
struct extent_buffer *cow;
u32 nritems;
int ret = 0;
int different_trans = 0;
int level;
int unlock_orig = 0;
......@@ -233,13 +232,33 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
WARN_ON(btrfs_header_generation(buf) > trans->transid);
if (btrfs_header_generation(buf) != trans->transid) {
u32 nr_extents;
different_trans = 1;
ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents);
if (ret)
return ret;
ret = btrfs_cache_ref(trans, root, buf, nr_extents);
WARN_ON(ret);
} else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) {
/*
* There are only two places that can drop reference to
* tree blocks owned by living reloc trees, one is here,
* the other place is btrfs_merge_path. In both places,
* we check reference count while tree block is locked.
* Furthermore, if reference count is one, it won't get
* increased by someone else.
*/
u32 refs;
ret = btrfs_lookup_extent_ref(trans, root, buf->start,
buf->len, &refs);
BUG_ON(ret);
if (refs == 1) {
ret = btrfs_update_ref(trans, root, buf, cow,
0, nritems);
clean_tree_block(trans, root, buf);
} else {
ret = btrfs_inc_ref(trans, root, buf, cow, NULL);
}
BUG_ON(ret);
} else {
ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems);
if (ret)
......@@ -247,6 +266,14 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
clean_tree_block(trans, root, buf);
}
if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
ret = btrfs_add_reloc_mapping(root, buf->start,
buf->len, cow->start);
BUG_ON(ret);
ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start);
WARN_ON(ret);
}
if (buf == root->node) {
WARN_ON(parent && parent != buf);
......@@ -1466,6 +1493,130 @@ done:
return ret;
}
int btrfs_merge_path(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_key *node_keys,
u64 *nodes, int lowest_level)
{
struct extent_buffer *eb;
struct extent_buffer *parent;
struct btrfs_key key;
u64 bytenr;
u64 generation;
u32 blocksize;
int level;
int slot;
int key_match;
int ret;
eb = btrfs_lock_root_node(root);
ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
BUG_ON(ret);
parent = eb;
while (1) {
level = btrfs_header_level(parent);
if (level == 0 || level <= lowest_level)
break;
ret = bin_search(parent, &node_keys[lowest_level], level,
&slot);
if (ret && slot > 0)
slot--;
bytenr = btrfs_node_blockptr(parent, slot);
if (nodes[level - 1] == bytenr)
break;
blocksize = btrfs_level_size(root, level - 1);
generation = btrfs_node_ptr_generation(parent, slot);
btrfs_node_key_to_cpu(eb, &key, slot);
key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key));
/*
* if node keys match and node pointer hasn't been modified
* in the running transaction, we can merge the path. for
* reloc trees, the node pointer check is skipped, this is
* because the reloc trees are fully controlled by the space
* balance code, no one else can modify them.
*/
if (!nodes[level - 1] || !key_match ||
(generation == trans->transid &&
root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)) {
next_level:
if (level == 1 || level == lowest_level + 1)
break;
eb = read_tree_block(root, bytenr, blocksize,
generation);
btrfs_tree_lock(eb);
ret = btrfs_cow_block(trans, root, eb, parent, slot,
&eb, 0);
BUG_ON(ret);
btrfs_tree_unlock(parent);
free_extent_buffer(parent);
parent = eb;
continue;
}
if (generation == trans->transid) {
u32 refs;
BUG_ON(btrfs_header_owner(eb) !=
BTRFS_TREE_RELOC_OBJECTID);
/*
* lock the block to keep __btrfs_cow_block from
* changing the reference count.
*/
eb = read_tree_block(root, bytenr, blocksize,
generation);
btrfs_tree_lock(eb);
ret = btrfs_lookup_extent_ref(trans, root, bytenr,
blocksize, &refs);
BUG_ON(ret);
/*
* if replace block whose reference count is one,
* we have to "drop the subtree". so skip it for
* simplicity
*/
if (refs == 1) {
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
goto next_level;
}
}
btrfs_set_node_blockptr(parent, slot, nodes[level - 1]);
btrfs_set_node_ptr_generation(parent, slot, trans->transid);
btrfs_mark_buffer_dirty(parent);
ret = btrfs_inc_extent_ref(trans, root,
nodes[level - 1],
blocksize, parent->start,
btrfs_header_owner(parent),
btrfs_header_generation(parent),
level - 1, 0);
BUG_ON(ret);
ret = btrfs_free_extent(trans, root, bytenr,
blocksize, parent->start,
btrfs_header_owner(parent),
btrfs_header_generation(parent),
level - 1, 0, 1);
BUG_ON(ret);
if (generation == trans->transid) {
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
}
break;
}
btrfs_tree_unlock(parent);
free_extent_buffer(parent);
return 0;
}
/*
* adjust the pointers going up the tree, starting at level
* making sure the right key of each node is points to 'key'.
......
......@@ -604,6 +604,7 @@ struct btrfs_fs_info {
struct mutex chunk_mutex;
struct mutex drop_mutex;
struct mutex volume_mutex;
struct mutex tree_reloc_mutex;
struct list_head trans_list;
struct list_head hashers;
struct list_head dead_roots;
......@@ -647,6 +648,10 @@ struct btrfs_fs_info {
struct task_struct *cleaner_kthread;
int thread_pool_size;
/* tree relocation relocated fields */
struct extent_io_tree reloc_mapping_tree;
struct list_head dead_reloc_roots;
struct btrfs_leaf_ref_tree reloc_ref_tree;
struct btrfs_leaf_ref_tree shared_ref_tree;
struct kobject super_kobj;
......@@ -698,6 +703,7 @@ struct btrfs_root {
struct btrfs_leaf_ref_tree ref_tree_struct;
struct btrfs_dirty_root *dirty_root;
struct btrfs_root *log_root;
struct btrfs_root *reloc_root;
struct btrfs_root_item root_item;
struct btrfs_key root_key;
......@@ -1517,7 +1523,6 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u32 blocksize);
int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 new_size);
int btrfs_insert_extent_backref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
......@@ -1582,10 +1587,29 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytes_used,
u64 type, u64 chunk_objectid, u64 chunk_offset,
u64 size);
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 group_start);
int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
int btrfs_free_reloc_root(struct btrfs_root *root);
int btrfs_drop_dead_reloc_roots(struct btrfs_root *root);
int btrfs_add_reloc_mapping(struct btrfs_root *root, u64 orig_bytenr,
u64 num_bytes, u64 new_bytenr);
int btrfs_get_reloc_mapping(struct btrfs_root *root, u64 orig_bytenr,
u64 num_bytes, u64 *new_bytenr);
void btrfs_free_reloc_mappings(struct btrfs_root *root);
int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf, u64 orig_start);
int btrfs_add_dead_reloc_root(struct btrfs_root *root);
int btrfs_cleanup_reloc_trees(struct btrfs_root *root);
/* ctree.c */
int btrfs_previous_item(struct btrfs_root *root,
struct btrfs_path *path, u64 min_objectid,
int type);
int btrfs_merge_path(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_key *node_keys,
u64 *nodes, int lowest_level);
int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *new_key);
......
......@@ -1406,6 +1406,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->btree_inode->i_mapping, GFP_NOFS);
fs_info->do_barriers = 1;
extent_io_tree_init(&fs_info->reloc_mapping_tree,
fs_info->btree_inode->i_mapping, GFP_NOFS);
INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
btrfs_leaf_ref_tree_init(&fs_info->reloc_ref_tree);
btrfs_leaf_ref_tree_init(&fs_info->shared_ref_tree);
BTRFS_I(fs_info->btree_inode)->root = tree_root;
......@@ -1421,6 +1425,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
mutex_init(&fs_info->transaction_kthread_mutex);
mutex_init(&fs_info->cleaner_mutex);
mutex_init(&fs_info->volume_mutex);
mutex_init(&fs_info->tree_reloc_mutex);
init_waitqueue_head(&fs_info->transaction_throttle);
init_waitqueue_head(&fs_info->transaction_wait);
init_waitqueue_head(&fs_info->async_submit_wait);
......@@ -1627,6 +1632,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
ret = btrfs_recover_log_trees(log_tree_root);
BUG_ON(ret);
}
ret = btrfs_cleanup_reloc_trees(tree_root);
BUG_ON(ret);
fs_info->last_trans_committed = btrfs_super_generation(disk_super);
return tree_root;
......
This diff is collapsed.
......@@ -210,7 +210,10 @@ again:
goto err;
}
ret = btrfs_add_dead_root(dead_root, latest);
if (objectid == BTRFS_TREE_RELOC_OBJECTID)
ret = btrfs_add_dead_reloc_root(dead_root);
else
ret = btrfs_add_dead_root(dead_root, latest);
if (ret)
goto err;
goto again;
......
......@@ -477,6 +477,7 @@ static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
dirty = root->dirty_root;
btrfs_free_log(trans, root);
btrfs_free_reloc_root(root);
if (root->commit_root == root->node) {
WARN_ON(root->node->start !=
......@@ -855,6 +856,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
* with the tree-log code.
*/
mutex_lock(&root->fs_info->tree_log_mutex);
/*
* keep tree reloc code from adding new reloc trees
*/
mutex_lock(&root->fs_info->tree_reloc_mutex);
ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
&dirty_fs_roots);
......@@ -865,6 +871,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
*/
btrfs_free_log_root_tree(trans, root->fs_info);
btrfs_free_reloc_mappings(root);
ret = btrfs_commit_tree_roots(trans, root);
BUG_ON(ret);
......@@ -910,10 +918,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
mutex_unlock(&root->fs_info->tree_log_mutex);
btrfs_finish_extent_commit(trans, root, pinned_copy);
mutex_lock(&root->fs_info->trans_mutex);
kfree(pinned_copy);
btrfs_drop_dead_reloc_roots(root);
mutex_unlock(&root->fs_info->tree_reloc_mutex);
mutex_lock(&root->fs_info->trans_mutex);
cur_trans->commit_done = 1;
root->fs_info->last_trans_committed = cur_trans->transid;
wake_up(&cur_trans->commit_wait);
......
......@@ -1268,7 +1268,7 @@ int btrfs_relocate_chunk(struct btrfs_root *root,
em_tree = &root->fs_info->mapping_tree.map_tree;
/* step one, relocate all the extents inside this chunk */
ret = btrfs_shrink_extent_tree(extent_root, chunk_offset);
ret = btrfs_relocate_block_group(extent_root, chunk_offset);
BUG_ON(ret);
trans = btrfs_start_transaction(root, 1);
......@@ -1308,15 +1308,18 @@ int btrfs_relocate_chunk(struct btrfs_root *root,
BUG_ON(ret);
}
ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
BUG_ON(ret);
spin_lock(&em_tree->lock);
remove_extent_mapping(em_tree, em);
spin_unlock(&em_tree->lock);
kfree(map);
em->bdev = NULL;
/* once for the tree */
free_extent_map(em);
spin_unlock(&em_tree->lock);
/* once for us */
free_extent_map(em);
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment