Commit 9ed74f2d authored by Josef Bacik, committed by Chris Mason
Browse files

Btrfs: proper -ENOSPC handling



At the start of a transaction we do a btrfs_reserve_metadata_space() and
specify how many items we plan on modifying.  Then once we've done our
modifications and such, just call btrfs_unreserve_metadata_space() for
the same number of items we reserved.

For keeping track of metadata needed for data I've had to add an extent_io op
for when we merge extents.  This lets us track space properly when we are doing
sequential writes, so we don't end up reserving way more metadata space than
what we need.

The only place where the metadata space accounting is not done is in the
relocation code.  This is because Yan is going to be reworking that code in the
near future, so running btrfs-vol -b could still possibly result in an
ENOSPC-related panic.  This patch also turns off the metadata_ratio stuff in order to
allow users to more efficiently use their disk space.

This patch makes it so we track how much metadata we need for an inode's
delayed allocation extents by tracking how many extents are currently
waiting for allocation.  It introduces two new callbacks for the
extent_io trees, merge_extent_hook and split_extent_hook.  These help
us keep track of when we merge delalloc extents together and split them
up.  Reservations are made before any actual dirtying occurs,
and then we unreserve after we dirty.

btrfs_unreserve_metadata_for_delalloc() will make the appropriate
unreservations as needed based on the number of reservations we
currently have and the number of extents we currently have.  Doing the
reservation outside of doing any of the actual dirty'ing lets us do
things like filemap_flush() the inode to try and force delalloc to
happen, or as a last resort actually start allocation on all delalloc
inodes in the fs.  This has survived dbench, fs_mark and an fsx torture
test.
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
parent c65ddb52
......@@ -127,6 +127,14 @@ struct btrfs_inode {
*/
u64 last_unlink_trans;
/*
* These two counters are for delalloc metadata reservations. We keep
* track of how many extents we've accounted for vs how many extents we
* have.
*/
int delalloc_reserved_extents;
int delalloc_extents;
/*
* ordered_data_close is set by truncate when a file that used
* to have good data has been truncated to zero. When it is set
......
......@@ -675,18 +675,19 @@ struct btrfs_space_info {
current allocations */
u64 bytes_readonly; /* total bytes that are read only */
u64 bytes_super; /* total bytes reserved for the super blocks */
/* delalloc accounting */
u64 bytes_delalloc; /* number of bytes reserved for allocation,
this space is not necessarily reserved yet
by the allocator */
u64 bytes_root; /* the number of bytes needed to commit a
transaction */
u64 bytes_may_use; /* number of bytes that may be used for
delalloc */
delalloc/allocations */
u64 bytes_delalloc; /* number of bytes currently reserved for
delayed allocation */
int full; /* indicates that we cannot allocate any more
chunks for this space */
int force_alloc; /* set if we need to force a chunk alloc for
this space */
int force_delalloc; /* make people start doing filemap_flush until
we're under a threshold */
struct list_head list;
......@@ -695,6 +696,9 @@ struct btrfs_space_info {
spinlock_t lock;
struct rw_semaphore groups_sem;
atomic_t caching_threads;
int allocating_chunk;
wait_queue_head_t wait;
};
/*
......@@ -2022,7 +2026,12 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
int btrfs_check_metadata_free_space(struct btrfs_root *root);
int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items);
int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items);
int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
struct inode *inode, int num_items);
int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
struct inode *inode, int num_items);
int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
u64 bytes);
void btrfs_free_reserved_data_space(struct btrfs_root *root,
......
......@@ -1629,7 +1629,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->sb = sb;
fs_info->max_extent = (u64)-1;
fs_info->max_inline = 8192 * 1024;
fs_info->metadata_ratio = 8;
fs_info->metadata_ratio = 0;
fs_info->thread_pool_size = min_t(unsigned long,
num_online_cpus() + 2, 8);
......
......@@ -68,6 +68,8 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
struct extent_buffer **must_clean);
static int find_next_key(struct btrfs_path *path, int level,
struct btrfs_key *key);
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
int dump_block_groups);
static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
......@@ -2764,67 +2766,346 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
alloc_target);
}
/*
 * Worst-case estimate of the metadata bytes needed to modify num_items items
 * in the given root.
 *
 * The estimate deliberately errs on the high side: it assumes every item
 * lands in its own leaf and that the tree is as tall as it can get.
 *
 * Leaf count:
 *  - extent root: two leaves per item (the extent item plus its ref);
 *  - any other root: each item may add an extent item and an extent ref item
 *    as well (3 * num_items), and each of those may require cow'ing the
 *    target leaf plus its left and right neighbours (* 3).
 *
 * Every leaf is then charged one leafsize plus two nodes for each of the
 * (BTRFS_MAX_LEVEL - 2) levels counted here.
 *
 * Returns the estimated number of bytes.
 */
static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
{
	int depth = BTRFS_MAX_LEVEL - 2;
	u64 leaves;

	if (root != root->fs_info->extent_root)
		leaves = (num_items + (2 * num_items)) * 3;
	else
		leaves = num_items * 2;

	/* leaf bytes plus two cow'ed nodes per level for every leaf */
	return (leaves * root->leafsize) +
	       (leaves * (depth * 2)) * root->nodesize;
}
/*
* for now this just makes sure we have at least 5% of our metadata space free
* for use.
* Unreserve metadata space for delalloc. If we do not have more reserved
* credits than we have extents, this function does nothing.
*/
int btrfs_check_metadata_free_space(struct btrfs_root *root)
int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
struct inode *inode, int num_items)
{
struct btrfs_fs_info *info = root->fs_info;
struct btrfs_space_info *meta_sinfo;
u64 alloc_target, thresh;
int committed = 0, ret;
u64 num_bytes;
u64 alloc_target;
bool bug = false;
/* get the space info for where the metadata will live */
alloc_target = btrfs_get_alloc_profile(root, 0);
meta_sinfo = __find_space_info(info, alloc_target);
if (!meta_sinfo)
goto alloc;
again:
num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
num_items);
spin_lock(&meta_sinfo->lock);
if (!meta_sinfo->full)
thresh = meta_sinfo->total_bytes * 80;
else
thresh = meta_sinfo->total_bytes * 95;
if (BTRFS_I(inode)->delalloc_reserved_extents <=
BTRFS_I(inode)->delalloc_extents) {
spin_unlock(&meta_sinfo->lock);
return 0;
}
BTRFS_I(inode)->delalloc_reserved_extents--;
BUG_ON(BTRFS_I(inode)->delalloc_reserved_extents < 0);
if (meta_sinfo->bytes_delalloc < num_bytes) {
bug = true;
meta_sinfo->bytes_delalloc = 0;
} else {
meta_sinfo->bytes_delalloc -= num_bytes;
}
spin_unlock(&meta_sinfo->lock);
BUG_ON(bug);
return 0;
}
/*
 * Recompute meta_sinfo->force_delalloc.
 *
 * The flag is set when the bytes currently reserved for delalloc reach 80% of
 * the space left once every other counter (used, reserved, pinned, readonly,
 * super, root and may_use) has been subtracted from total_bytes; otherwise it
 * is cleared.  Both callers visible here invoke this with meta_sinfo->lock
 * held.
 */
static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
{
	u64 committed;
	u64 limit;

	committed = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
		    meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
		    meta_sinfo->bytes_super + meta_sinfo->bytes_root +
		    meta_sinfo->bytes_may_use;

	/* 80% of whatever is not already spoken for */
	limit = meta_sinfo->total_bytes - committed;
	limit *= 80;
	do_div(limit, 100);

	meta_sinfo->force_delalloc =
		(limit <= meta_sinfo->bytes_delalloc) ? 1 : 0;
}
if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
meta_sinfo->bytes_super > thresh) {
struct btrfs_trans_handle *trans;
if (!meta_sinfo->full) {
meta_sinfo->force_alloc = 1;
/*
 * Try to make room in a space info by allocating a new chunk for it.
 *
 * Locking: every path through this function drops info->lock without taking
 * it first, so it must be entered with info->lock held; it always returns
 * with the lock released.
 *
 * Nothing is allocated when the space info has already reached the smaller of
 * 5GB and 5% of the super block's total bytes, or when it is marked full.
 * Only one task allocates at a time: a task that finds allocating_chunk set
 * sleeps until the allocator clears it.
 *
 * Returns 1 when the caller should re-check its reservation (a chunk was
 * allocated, or we waited for another task's allocation), 0 when no
 * allocation was attempted or the attempt failed.
 */
static int maybe_allocate_chunk(struct btrfs_root *root,
				struct btrfs_space_info *info)
{
	struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
	struct btrfs_trans_handle *trans;
	u64 fs_bytes;
	u64 metadata_cap;
	int must_wait;
	int err;

	fs_bytes = btrfs_super_total_bytes(disk_super);
	metadata_cap = min((u64)5 * 1024 * 1024 * 1024,
			   div64_u64(fs_bytes * 5, 100));

	/* already at the cap, or no more chunks can be had */
	if (info->total_bytes >= metadata_cap || info->full) {
		spin_unlock(&info->lock);
		return 0;
	}

	must_wait = info->allocating_chunk;
	if (!must_wait) {
		/* we are the allocator: claim the job before unlocking */
		info->force_alloc = 1;
		info->allocating_chunk = 1;
		init_waitqueue_head(&info->wait);
	}
	spin_unlock(&info->lock);

	if (must_wait) {
		wait_event(info->wait,
			   !info->allocating_chunk);
		return 1;
	}

	trans = btrfs_start_transaction(root, 1);
	if (trans) {
		err = do_chunk_alloc(trans, root->fs_info->extent_root,
				     4096 + 2 * 1024 * 1024,
				     info->flags, 0);
		btrfs_end_transaction(trans, root);
	} else {
		err = -ENOMEM;
	}

	/* release the claim and let any sleepers re-check */
	spin_lock(&info->lock);
	info->allocating_chunk = 0;
	spin_unlock(&info->lock);
	wake_up(&info->wait);

	return err ? 0 : 1;
}
/*
* Reserve metadata space for delalloc.
*/
int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
struct inode *inode, int num_items)
{
struct btrfs_fs_info *info = root->fs_info;
struct btrfs_space_info *meta_sinfo;
u64 num_bytes;
u64 used;
u64 alloc_target;
int flushed = 0;
int force_delalloc;
/* get the space info for where the metadata will live */
alloc_target = btrfs_get_alloc_profile(root, 0);
meta_sinfo = __find_space_info(info, alloc_target);
num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
num_items);
again:
spin_lock(&meta_sinfo->lock);
force_delalloc = meta_sinfo->force_delalloc;
if (unlikely(!meta_sinfo->bytes_root))
meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
if (!flushed)
meta_sinfo->bytes_delalloc += num_bytes;
used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
meta_sinfo->bytes_super + meta_sinfo->bytes_root +
meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
if (used > meta_sinfo->total_bytes) {
flushed++;
if (flushed == 1) {
if (maybe_allocate_chunk(root, meta_sinfo))
goto again;
flushed++;
} else {
spin_unlock(&meta_sinfo->lock);
alloc:
trans = btrfs_start_transaction(root, 1);
if (!trans)
return -ENOMEM;
}
ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2 * 1024 * 1024, alloc_target, 0);
btrfs_end_transaction(trans, root);
if (!meta_sinfo) {
meta_sinfo = __find_space_info(info,
alloc_target);
}
if (flushed == 2) {
filemap_flush(inode->i_mapping);
goto again;
} else if (flushed == 3) {
btrfs_start_delalloc_inodes(root);
btrfs_wait_ordered_extents(root, 0);
goto again;
}
spin_lock(&meta_sinfo->lock);
meta_sinfo->bytes_delalloc -= num_bytes;
spin_unlock(&meta_sinfo->lock);
printk(KERN_ERR "enospc, has %d, reserved %d\n",
BTRFS_I(inode)->delalloc_extents,
BTRFS_I(inode)->delalloc_reserved_extents);
dump_space_info(meta_sinfo, 0, 0);
return -ENOSPC;
}
if (!committed) {
committed = 1;
trans = btrfs_join_transaction(root, 1);
if (!trans)
return -ENOMEM;
ret = btrfs_commit_transaction(trans, root);
if (ret)
return ret;
BTRFS_I(inode)->delalloc_reserved_extents++;
check_force_delalloc(meta_sinfo);
spin_unlock(&meta_sinfo->lock);
if (!flushed && force_delalloc)
filemap_flush(inode->i_mapping);
return 0;
}
/*
 * Release the metadata reservation for num_items items.  Every call must
 * match an earlier btrfs_reserve_metadata_space() call.
 *
 * The byte count is recomputed with calculate_bytes_needed() and subtracted
 * from bytes_may_use of the metadata space info.  Releasing more than is
 * currently accounted clamps the counter to zero and then hits BUG_ON() once
 * the lock has been dropped.
 *
 * NOTE: when possible, call this _AFTER_ btrfs_end_transaction.  Ending the
 * transaction runs delayed ref operations that consume more metadata, and the
 * reservation should still be in place while that happens.
 *
 * Always returns 0.
 */
int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
{
	struct btrfs_space_info *sinfo;
	u64 release;
	bool underflow;

	/* find the space info the metadata lives in */
	sinfo = __find_space_info(root->fs_info,
				  btrfs_get_alloc_profile(root, 0));
	release = calculate_bytes_needed(root, num_items);

	spin_lock(&sinfo->lock);
	underflow = sinfo->bytes_may_use < release;
	sinfo->bytes_may_use = underflow ? 0 : sinfo->bytes_may_use - release;
	spin_unlock(&sinfo->lock);

	/* complain only after the spinlock has been released */
	BUG_ON(underflow);
	return 0;
}
/*
* Reserve some metadata space for use. We'll calculate the worst case number
* of bytes that would be needed to modify num_items number of items. If we
* have space, fantastic, if not, you get -ENOSPC. Please call
* btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
* items you reserved, since whatever metadata you needed should have already
* been allocated.
*
* This will commit the transaction to make more space if we don't have enough
* metadata space. The only time we don't do this is if we're reserving space
* inside of a transaction, then we will just return -ENOSPC and it is the
* caller's responsibility to handle it properly.
*/
int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
{
struct btrfs_fs_info *info = root->fs_info;
struct btrfs_space_info *meta_sinfo;
u64 num_bytes;
u64 used;
u64 alloc_target;
int retries = 0;
/* get the space info for where the metadata will live */
alloc_target = btrfs_get_alloc_profile(root, 0);
meta_sinfo = __find_space_info(info, alloc_target);
num_bytes = calculate_bytes_needed(root, num_items);
again:
spin_lock(&meta_sinfo->lock);
if (unlikely(!meta_sinfo->bytes_root))
meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
if (!retries)
meta_sinfo->bytes_may_use += num_bytes;
used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
meta_sinfo->bytes_super + meta_sinfo->bytes_root +
meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
if (used > meta_sinfo->total_bytes) {
retries++;
if (retries == 1) {
if (maybe_allocate_chunk(root, meta_sinfo))
goto again;
retries++;
} else {
spin_unlock(&meta_sinfo->lock);
}
if (retries == 2) {
btrfs_start_delalloc_inodes(root);
btrfs_wait_ordered_extents(root, 0);
goto again;
}
spin_lock(&meta_sinfo->lock);
meta_sinfo->bytes_may_use -= num_bytes;
spin_unlock(&meta_sinfo->lock);
dump_space_info(meta_sinfo, 0, 0);
return -ENOSPC;
}
check_force_delalloc(meta_sinfo);
spin_unlock(&meta_sinfo->lock);
return 0;
......@@ -2915,7 +3196,7 @@ alloc:
BTRFS_I(inode)->reserved_bytes += bytes;
spin_unlock(&data_sinfo->lock);
return btrfs_check_metadata_free_space(root);
return 0;
}
/*
......@@ -3014,17 +3295,15 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
BUG_ON(!space_info);
spin_lock(&space_info->lock);
if (space_info->force_alloc) {
if (space_info->force_alloc)
force = 1;
space_info->force_alloc = 0;
}
if (space_info->full) {
spin_unlock(&space_info->lock);
goto out;
}
thresh = space_info->total_bytes - space_info->bytes_readonly;
thresh = div_factor(thresh, 6);
thresh = div_factor(thresh, 8);
if (!force &&
(space_info->bytes_used + space_info->bytes_pinned +
space_info->bytes_reserved + alloc_bytes) < thresh) {
......@@ -3038,7 +3317,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
* we keep a reasonable number of metadata chunks allocated in the
* FS as well.
*/
if (flags & BTRFS_BLOCK_GROUP_DATA) {
if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
fs_info->data_chunk_allocations++;
if (!(fs_info->data_chunk_allocations %
fs_info->metadata_ratio))
......@@ -3046,8 +3325,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
}
ret = btrfs_alloc_chunk(trans, extent_root, flags);
spin_lock(&space_info->lock);
if (ret)
space_info->full = 1;
space_info->force_alloc = 0;
spin_unlock(&space_info->lock);
out:
mutex_unlock(&extent_root->fs_info->chunk_mutex);
return ret;
......@@ -4062,21 +4344,32 @@ loop:
return ret;
}
static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
int dump_block_groups)
{
struct btrfs_block_group_cache *cache;
spin_lock(&info->lock);
printk(KERN_INFO "space_info has %llu free, is %sfull\n",
(unsigned long long)(info->total_bytes - info->bytes_used -
info->bytes_pinned - info->bytes_reserved),
info->bytes_pinned - info->bytes_reserved -
info->bytes_super),
(info->full) ? "" : "not ");
printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
" may_use=%llu, used=%llu\n",
" may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu"
"\n",
(unsigned long long)info->total_bytes,
(unsigned long long)info->bytes_pinned,
(unsigned long long)info->bytes_delalloc,
(unsigned long long)info->bytes_may_use,
(unsigned long long)info->bytes_used);
(unsigned long long)info->bytes_used,
(unsigned long long)info->bytes_root,
(unsigned long long)info->bytes_super,
(unsigned long long)info->bytes_reserved);
spin_unlock(&info->lock);
if (!dump_block_groups)
return;
down_read(&info->groups_sem);
list_for_each_entry(cache, &info->block_groups, list) {
......@@ -4144,7 +4437,7 @@ again:
printk(KERN_ERR "btrfs allocation failed flags %llu, "
"wanted %llu\n", (unsigned long long)data,
(unsigned long long)num_bytes);
dump_space_info(sinfo, num_bytes);
dump_space_info(sinfo, num_bytes, 1);
}
return ret;
......
......@@ -280,6 +280,14 @@ static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
return NULL;
}
/*
 * Call the tree's merge_extent_hook, if the tree has ops and the hook is set,
 * passing it the mapping's host together with the two extent states.
 */
static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
		     struct extent_state *other)
{
	if (!tree->ops || !tree->ops->merge_extent_hook)
		return;

	tree->ops->merge_extent_hook(tree->mapping->host, new, other);
}
/*
* utility function to look for merge candidates inside a given range.
* Any extents with matching state are merged together into a single
......@@ -303,6 +311,7 @@ static int merge_state(struct extent_io_tree *tree,
other = rb_entry(other_node, struct extent_state, rb_node);
if (other->end == state->start - 1 &&
other->state == state->state) {
merge_cb(tree, state, other);
state->start = other->start;
other->tree = NULL;
rb_erase(&other->rb_node, &tree->state);
......@@ -314,33 +323,37 @@ static int merge_state(struct extent_io_tree *tree,
other = rb_entry(other_node, struct extent_state, rb_node);
if (other->start == state->end + 1 &&
other->state == state->state) {
merge_cb(tree, state, other);
other->start = state->start;
state->tree = NULL;
rb_erase(&state->rb_node, &tree->state);
free_extent_state(state);
state = NULL;
}
}
return 0;
}
static void set_state_cb(struct extent_io_tree *tree,
static int set_state_cb(struct extent_io_tree *tree,
struct extent_state *state,
unsigned long bits)
{
if (tree->ops && tree->ops->set_bit_hook) {
tree->ops->set_bit_hook(tree->mapping->host, state->start,
state->end, state->state, bits);
return tree->ops->set_bit_hook(tree->mapping->host,
state->start, state->end,
state->state, bits);
}