Commit fa9c0d79 authored by Chris Mason's avatar Chris Mason
Browse files

Btrfs: rework allocation clustering



Because btrfs is copy-on-write, we end up picking new locations for
blocks very often.  This makes it fairly difficult to maintain perfect
read patterns over time, but we can at least do some optimizations
for writes.

This is done today by remembering the last place we allocated and
trying to find a free space hole big enough to hold more than just one
allocation.  The end result is that we tend to write sequentially to
the drive.

This happens all the time for metadata and it happens for data
when mounted -o ssd.  But, the way we record it is fairly racey
and it tends to fragment the free space over time because we are trying
to allocate fairly large areas at once.

This commit gets rid of the races by adding a free space cluster object
with dedicated locking to make sure that only one process at a time
is out replacing the cluster.

The free space fragmentation is somewhat solved by allowing a cluster
to be comprised of smaller free space extents.  This part definitely
adds some CPU time to the cluster allocations, but it allows the allocator
to consume the small holes left behind by cow.

Signed-off-by: default avatarChris Mason <chris.mason@oracle.com>
parent 8e73f275
......@@ -633,11 +633,29 @@ struct btrfs_space_info {
struct rw_semaphore groups_sem;
};
struct btrfs_free_space {
struct rb_node bytes_index;
struct rb_node offset_index;
u64 offset;
u64 bytes;
/*
* free clusters are used to claim free space in relatively large chunks,
* allowing us to do less seeky writes. They are used for all metadata
* allocations and data allocations in ssd mode.
*/
struct btrfs_free_cluster {
spinlock_t lock;
spinlock_t refill_lock;
struct rb_root root;
/* largest extent in this cluster */
u64 max_size;
/* first extent starting offset */
u64 window_start;
struct btrfs_block_group_cache *block_group;
/*
* when a cluster is allocated from a block group, we put the
* cluster onto a list in the block group so that it can
* be freed before the block group is freed.
*/
struct list_head block_group_list;
};
struct btrfs_block_group_cache {
......@@ -667,6 +685,11 @@ struct btrfs_block_group_cache {
/* usage count */
atomic_t count;
/* List of struct btrfs_free_clusters for this block group.
* Today it will only have one thing on it, but that may change
*/
struct list_head cluster_list;
};
struct btrfs_leaf_ref_tree {
......@@ -838,8 +861,12 @@ struct btrfs_fs_info {
spinlock_t delalloc_lock;
spinlock_t new_trans_lock;
u64 delalloc_bytes;
u64 last_alloc;
u64 last_data_alloc;
/* data_alloc_cluster is only used in ssd mode */
struct btrfs_free_cluster data_alloc_cluster;
/* all metadata allocations go through this cluster */
struct btrfs_free_cluster meta_alloc_cluster;
spinlock_t ref_cache_lock;
u64 total_ref_cache_size;
......@@ -1747,6 +1774,7 @@ static inline struct dentry *fdentry(struct file *file)
}
/* extent-tree.c */
void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root, unsigned long count);
int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
......@@ -2173,16 +2201,4 @@ int btrfs_check_acl(struct inode *inode, int mask);
int btrfs_init_acl(struct inode *inode, struct inode *dir);
int btrfs_acl_chmod(struct inode *inode);
/* free-space-cache.c */
int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
u64 bytenr, u64 size);
int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
u64 bytenr, u64 size);
void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
*block_group);
u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
u64 offset, u64 bytes, u64 empty_size);
void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
u64 bytes);
u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
#endif
......@@ -38,6 +38,7 @@
#include "locking.h"
#include "ref-cache.h"
#include "tree-log.h"
#include "free-space-cache.h"
static struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
......@@ -1652,6 +1653,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
mutex_init(&fs_info->cleaner_mutex);
mutex_init(&fs_info->volume_mutex);
mutex_init(&fs_info->tree_reloc_mutex);
btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
init_waitqueue_head(&fs_info->transaction_throttle);
init_waitqueue_head(&fs_info->transaction_wait);
init_waitqueue_head(&fs_info->async_submit_wait);
......
......@@ -31,6 +31,7 @@
#include "volumes.h"
#include "locking.h"
#include "ref-cache.h"
#include "free-space-cache.h"
#define PENDING_EXTENT_INSERT 0
#define PENDING_EXTENT_DELETE 1
......@@ -324,7 +325,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(
return cache;
}
static inline void put_block_group(struct btrfs_block_group_cache *cache)
void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
{
if (atomic_dec_and_test(&cache->count))
kfree(cache);
......@@ -397,12 +398,12 @@ again:
div_factor(cache->key.offset, factor)) {
group_start = cache->key.objectid;
spin_unlock(&cache->lock);
put_block_group(cache);
btrfs_put_block_group(cache);
goto found;
}
}
spin_unlock(&cache->lock);
put_block_group(cache);
btrfs_put_block_group(cache);
cond_resched();
}
if (!wrapped) {
......@@ -1592,7 +1593,7 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
if (!block_group || block_group->ro)
readonly = 1;
if (block_group)
put_block_group(block_group);
btrfs_put_block_group(block_group);
return readonly;
}
......@@ -2016,7 +2017,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
WARN_ON(ret);
}
}
put_block_group(cache);
btrfs_put_block_group(cache);
total -= num_bytes;
bytenr += num_bytes;
}
......@@ -2033,7 +2034,7 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
return 0;
bytenr = cache->key.objectid;
put_block_group(cache);
btrfs_put_block_group(cache);
return bytenr;
}
......@@ -2077,7 +2078,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
if (cache->cached)
btrfs_add_free_space(cache, bytenr, len);
}
put_block_group(cache);
btrfs_put_block_group(cache);
bytenr += len;
num -= len;
}
......@@ -2108,7 +2109,7 @@ static int update_reserved_extents(struct btrfs_root *root,
}
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
put_block_group(cache);
btrfs_put_block_group(cache);
bytenr += len;
num -= len;
}
......@@ -2543,12 +2544,13 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
{
int ret = 0;
struct btrfs_root *root = orig_root->fs_info->extent_root;
u64 *last_ptr = NULL;
struct btrfs_free_cluster *last_ptr = NULL;
struct btrfs_block_group_cache *block_group = NULL;
int empty_cluster = 2 * 1024 * 1024;
int allowed_chunk_alloc = 0;
int using_hint = 0;
struct btrfs_space_info *space_info;
int last_ptr_loop = 0;
int loop = 0;
WARN_ON(num_bytes < root->sectorsize);
btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
......@@ -2561,38 +2563,39 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
allowed_chunk_alloc = 1;
if (data & BTRFS_BLOCK_GROUP_METADATA) {
last_ptr = &root->fs_info->last_alloc;
last_ptr = &root->fs_info->meta_alloc_cluster;
if (!btrfs_test_opt(root, SSD))
empty_cluster = 64 * 1024;
}
if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
last_ptr = &root->fs_info->last_data_alloc;
if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
last_ptr = &root->fs_info->data_alloc_cluster;
}
if (last_ptr) {
if (*last_ptr)
hint_byte = *last_ptr;
else
empty_size += empty_cluster;
} else {
empty_cluster = 0;
spin_lock(&last_ptr->lock);
if (last_ptr->block_group)
hint_byte = last_ptr->window_start;
spin_unlock(&last_ptr->lock);
}
search_start = max(search_start, first_logical_byte(root, 0));
search_start = max(search_start, hint_byte);
if (!last_ptr) {
empty_cluster = 0;
loop = 1;
}
if (search_start == hint_byte) {
using_hint = 1;
block_group = btrfs_lookup_block_group(root->fs_info,
search_start);
if (block_group && block_group_bits(block_group, data)) {
down_read(&space_info->groups_sem);
goto have_block_group;
} else if (block_group) {
put_block_group(block_group);
btrfs_put_block_group(block_group);
}
empty_size += empty_cluster;
using_hint = 0;
}
search:
......@@ -2609,7 +2612,7 @@ have_block_group:
ret = cache_block_group(root, block_group);
mutex_unlock(&block_group->cache_mutex);
if (ret) {
put_block_group(block_group);
btrfs_put_block_group(block_group);
break;
}
}
......@@ -2617,11 +2620,88 @@ have_block_group:
if (unlikely(block_group->ro))
goto loop;
if (last_ptr) {
/*
* the refill lock keeps out other
* people trying to start a new cluster
*/
spin_lock(&last_ptr->refill_lock);
offset = btrfs_alloc_from_cluster(block_group, last_ptr,
num_bytes, search_start);
if (offset) {
/* we have a block, we're done */
spin_unlock(&last_ptr->refill_lock);
goto checks;
}
spin_lock(&last_ptr->lock);
/*
* whoops, this cluster doesn't actually point to
* this block group. Get a ref on the block
* group is does point to and try again
*/
if (!last_ptr_loop && last_ptr->block_group &&
last_ptr->block_group != block_group) {
btrfs_put_block_group(block_group);
block_group = last_ptr->block_group;
atomic_inc(&block_group->count);
spin_unlock(&last_ptr->lock);
spin_unlock(&last_ptr->refill_lock);
last_ptr_loop = 1;
search_start = block_group->key.objectid;
goto have_block_group;
}
spin_unlock(&last_ptr->lock);
/*
* this cluster didn't work out, free it and
* start over
*/
btrfs_return_cluster_to_free_space(NULL, last_ptr);
last_ptr_loop = 0;
/* allocate a cluster in this block group */
ret = btrfs_find_space_cluster(trans,
block_group, last_ptr,
offset, num_bytes,
empty_cluster + empty_size);
if (ret == 0) {
/*
* now pull our allocation out of this
* cluster
*/
offset = btrfs_alloc_from_cluster(block_group,
last_ptr, num_bytes,
search_start);
if (offset) {
/* we found one, proceed */
spin_unlock(&last_ptr->refill_lock);
goto checks;
}
}
/*
* at this point we either didn't find a cluster
* or we weren't able to allocate a block from our
* cluster. Free the cluster we've been trying
* to use, and go to the next block group
*/
if (loop < 2) {
btrfs_return_cluster_to_free_space(NULL,
last_ptr);
spin_unlock(&last_ptr->refill_lock);
goto loop;
}
spin_unlock(&last_ptr->refill_lock);
}
offset = btrfs_find_space_for_alloc(block_group, search_start,
num_bytes, empty_size);
if (!offset)
goto loop;
checks:
search_start = stripe_align(root, offset);
/* move on to the next group */
......@@ -2637,11 +2717,6 @@ have_block_group:
goto loop;
}
if (using_hint && search_start > hint_byte) {
btrfs_add_free_space(block_group, offset, num_bytes);
goto loop;
}
if (exclude_nr > 0 &&
(search_start + num_bytes > exclude_start &&
search_start < exclude_start + exclude_nr)) {
......@@ -2670,33 +2745,33 @@ have_block_group:
/* we are all good, lets return */
break;
loop:
put_block_group(block_group);
if (using_hint) {
empty_size += empty_cluster;
using_hint = 0;
up_read(&space_info->groups_sem);
goto search;
}
btrfs_put_block_group(block_group);
}
up_read(&space_info->groups_sem);
if (!ins->objectid && (empty_size || allowed_chunk_alloc)) {
int try_again = empty_size;
empty_size = 0;
/* loop == 0, try to find a clustered alloc in every block group
* loop == 1, try again after forcing a chunk allocation
* loop == 2, set empty_size and empty_cluster to 0 and try again
*/
if (!ins->objectid && loop < 3 &&
(empty_size || empty_cluster || allowed_chunk_alloc)) {
if (loop >= 2) {
empty_size = 0;
empty_cluster = 0;
}
if (allowed_chunk_alloc) {
ret = do_chunk_alloc(trans, root, num_bytes +
2 * 1024 * 1024, data, 1);
if (!ret)
try_again = 1;
allowed_chunk_alloc = 0;
} else {
space_info->force_alloc = 1;
}
if (try_again)
if (loop < 3) {
loop++;
goto search;
}
ret = -ENOSPC;
} else if (!ins->objectid) {
ret = -ENOSPC;
......@@ -2707,9 +2782,7 @@ loop:
if (!(data & BTRFS_BLOCK_GROUP_DATA))
trans->block_group = block_group->key.objectid;
if (last_ptr)
*last_ptr = ins->objectid + ins->offset;
put_block_group(block_group);
btrfs_put_block_group(block_group);
ret = 0;
}
......@@ -2817,7 +2890,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
ret = btrfs_discard_extent(root, start, len);
btrfs_add_free_space(cache, start, len);
put_block_group(cache);
btrfs_put_block_group(cache);
update_reserved_extents(root, start, len, 0);
return ret;
......@@ -2955,7 +3028,7 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
ret = btrfs_remove_free_space(block_group, ins->objectid,
ins->offset);
BUG_ON(ret);
put_block_group(block_group);
btrfs_put_block_group(block_group);
ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
ref_generation, owner, ins, 1);
return ret;
......@@ -5644,7 +5717,7 @@ next:
WARN_ON(block_group->reserved > 0);
WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
spin_unlock(&block_group->lock);
put_block_group(block_group);
btrfs_put_block_group(block_group);
ret = 0;
out:
btrfs_free_path(path);
......@@ -5774,6 +5847,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
spin_lock_init(&cache->tree_lock);
mutex_init(&cache->cache_mutex);
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
read_extent_buffer(leaf, &cache->item,
btrfs_item_ptr_offset(leaf, path->slots[0]),
sizeof(cache->item));
......@@ -5830,6 +5904,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
spin_lock_init(&cache->tree_lock);
mutex_init(&cache->cache_mutex);
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
btrfs_set_block_group_used(&cache->item, bytes_used);
btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
......@@ -5889,8 +5964,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&block_group->space_info->lock);
block_group->space_info->full = 0;
put_block_group(block_group);
put_block_group(block_group);
btrfs_put_block_group(block_group);
btrfs_put_block_group(block_group);
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0)
......
......@@ -18,6 +18,15 @@
#include <linux/sched.h>
#include "ctree.h"
#include "free-space-cache.h"
#include "transaction.h"
struct btrfs_free_space {
struct rb_node bytes_index;
struct rb_node offset_index;
u64 offset;
u64 bytes;
};
static int tree_insert_offset(struct rb_root *root, u64 offset,
struct rb_node *node)
......@@ -371,12 +380,58 @@ u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
return ret;
}
/*
* for a given cluster, put all of its extents back into the free
* space cache. If the block group passed doesn't match the block group
* pointed to by the cluster, someone else raced in and freed the
* cluster already. In that case, we just return without changing anything
*/
static int
__btrfs_return_cluster_to_free_space(
struct btrfs_block_group_cache *block_group,
struct btrfs_free_cluster *cluster)
{
struct btrfs_free_space *entry;
struct rb_node *node;
spin_lock(&cluster->lock);
if (cluster->block_group != block_group)
goto out;
cluster->window_start = 0;
node = rb_first(&cluster->root);
while(node) {
entry = rb_entry(node, struct btrfs_free_space, offset_index);
node = rb_next(&entry->offset_index);
rb_erase(&entry->offset_index, &cluster->root);
link_free_space(block_group, entry);
}
list_del_init(&cluster->block_group_list);
btrfs_put_block_group(cluster->block_group);
cluster->block_group = NULL;
cluster->root.rb_node = NULL;
out:
spin_unlock(&cluster->lock);
return 0;
}
void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
{
struct btrfs_free_space *info;
struct rb_node *node;
struct btrfs_free_cluster *cluster;
struct btrfs_free_cluster *safe;
spin_lock(&block_group->tree_lock);
list_for_each_entry_safe(cluster, safe, &block_group->cluster_list,
block_group_list) {
WARN_ON(cluster->block_group != block_group);
__btrfs_return_cluster_to_free_space(block_group, cluster);
}
while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
info = rb_entry(node, struct btrfs_free_space, bytes_index);
unlink_free_space(block_group, info);
......@@ -417,3 +472,245 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
return ret;
}
/*
* given a cluster, put all of its extents back into the free space
* cache. If a block group is passed, this function will only free
* a cluster that belongs to the passed block group.
*
* Otherwise, it'll get a reference on the block group pointed to by the
* cluster and remove the cluster from it.
*/
int btrfs_return_cluster_to_free_space(
struct btrfs_block_group_cache *block_group,
struct btrfs_free_cluster *cluster)
{
int ret;
/* first, get a safe pointer to the block group */
spin_lock(&cluster->lock);
if (!block_group) {
block_group = cluster->block_group;
if (!block_group) {
spin_unlock(&cluster->lock);
return 0;
}
} else if (cluster->block_group != block_group) {
/* someone else has already freed it don't redo their work */
spin_unlock(&cluster->lock);
return 0;
}
atomic_inc(&block_group->count);
spin_unlock(&cluster->lock);
/* now return any extents the cluster had on it */
spin_lock(&block_group->tree_lock);
ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
spin_unlock(&block_group->tree_lock);
/* finally drop our ref */
btrfs_put_block_group(block_group);
return ret;
}
/*
* given a cluster, try to allocate 'bytes' from it, returns 0
* if it couldn't find anything suitably large, or a logical disk offset
* if things worked out
*/
u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
struct btrfs_free_cluster *cluster, u64 bytes,
u64 min_start)
{
struct btrfs_free_space *entry = NULL;
struct rb_node *node;
u64 ret = 0;
spin_lock(&cluster->lock);
if (bytes > cluster->max_size)
goto out;
if (cluster->block_group != block_group)
goto out;
node = rb_first(&cluster->root);
if (!node)
goto out;
entry = rb_entry(node, struct btrfs_free_space, offset_index);
while(1) {
if (entry->bytes < bytes || entry->offset < min_start) {
struct rb_node *node;
node = rb_next(&entry->offset_index);
if (!node)
break;
entry = rb_entry(node, struct btrfs_free_space,
offset_index);
continue;
}
ret = entry->offset;
entry->offset += bytes;
entry->bytes -= bytes;
if (entry->bytes == 0) {