Commit d899e052, authored by Yan Zheng, committed by Chris Mason
Browse files

Btrfs: Add fallocate support v2


This patch updates btrfs-progs for fallocate support.

fallocate is a little different in Btrfs because we need to tell the
COW system that a given preallocated extent doesn't need to be
cow'd as long as there are no snapshots of it.  This leverages the
-o nodatacow checks.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
parent 80ff3856
......@@ -462,8 +462,9 @@ struct btrfs_root_item {
u8 level;
} __attribute__ ((__packed__));
#define BTRFS_FILE_EXTENT_REG 0
#define BTRFS_FILE_EXTENT_INLINE 1
#define BTRFS_FILE_EXTENT_INLINE 0
#define BTRFS_FILE_EXTENT_REG 1
#define BTRFS_FILE_EXTENT_PREALLOC 2
struct btrfs_file_extent_item {
/*
......@@ -868,6 +869,7 @@ struct btrfs_root {
#define BTRFS_INODE_NODATACOW (1 << 1)
#define BTRFS_INODE_READONLY (1 << 2)
#define BTRFS_INODE_NOCOMPRESS (1 << 3)
#define BTRFS_INODE_PREALLOC (1 << 4)
#define btrfs_clear_flag(inode, flag) (BTRFS_I(inode)->flags &= \
~BTRFS_INODE_##flag)
#define btrfs_set_flag(inode, flag) (BTRFS_I(inode)->flags |= \
......@@ -1924,6 +1926,9 @@ extern struct file_operations btrfs_file_operations;
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
u64 start, u64 end, u64 inline_limit, u64 *hint_block);
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode, u64 start, u64 end);
int btrfs_release_file(struct inode *inode, struct file *file);
/* tree-defrag.c */
......
......@@ -2147,6 +2147,9 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
total_needed += empty_size;
block_group = btrfs_lookup_block_group(root->fs_info, search_start);
if (!block_group)
block_group = btrfs_lookup_first_block_group(root->fs_info,
search_start);
space_info = __find_space_info(root->fs_info, data);
down_read(&space_info->groups_sem);
......@@ -3426,9 +3429,7 @@ walk_down:
next:
level--;
btrfs_release_path(extent_root, path);
if (need_resched()) {
cond_resched();
}
cond_resched();
}
/* reached lowest level */
ret = 1;
......@@ -3539,9 +3540,7 @@ found:
}
btrfs_release_path(extent_root, path);
if (need_resched()) {
cond_resched();
}
cond_resched();
}
/* reached max tree level, but no tree root found. */
BUG();
......@@ -3654,8 +3653,9 @@ static int noinline get_new_locations(struct inode *reloc_inode,
exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
fi);
WARN_ON(exts[nr].offset > 0);
WARN_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
BUG_ON(exts[nr].offset > 0);
BUG_ON(exts[nr].compression || exts[nr].encryption);
BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
cur_pos += exts[nr].num_bytes;
nr++;
......@@ -3709,6 +3709,7 @@ static int noinline replace_one_extent(struct btrfs_trans_handle *trans,
u32 nritems;
int nr_scaned = 0;
int extent_locked = 0;
int extent_type;
int ret;
memcpy(&key, leaf_key, sizeof(key));
......@@ -3781,8 +3782,9 @@ next:
}
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
if ((btrfs_file_extent_type(leaf, fi) !=
BTRFS_FILE_EXTENT_REG) ||
extent_type = btrfs_file_extent_type(leaf, fi);
if ((extent_type != BTRFS_FILE_EXTENT_REG &&
extent_type != BTRFS_FILE_EXTENT_PREALLOC) ||
(btrfs_file_extent_disk_bytenr(leaf, fi) !=
extent_key->objectid)) {
path->slots[0]++;
......@@ -3865,16 +3867,10 @@ next:
if (nr_extents == 1) {
/* update extent pointer in place */
btrfs_set_file_extent_generation(leaf, fi,
trans->transid);
btrfs_set_file_extent_disk_bytenr(leaf, fi,
new_extents[0].disk_bytenr);
btrfs_set_file_extent_disk_num_bytes(leaf, fi,
new_extents[0].disk_num_bytes);
btrfs_set_file_extent_ram_bytes(leaf, fi,
new_extents[0].ram_bytes);
ext_offset += new_extents[0].offset;
btrfs_set_file_extent_offset(leaf, fi, ext_offset);
btrfs_mark_buffer_dirty(leaf);
btrfs_drop_extent_cache(inode, key.offset,
......@@ -3901,6 +3897,8 @@ next:
btrfs_release_path(root, path);
key.offset += num_bytes;
} else {
BUG_ON(1);
#if 0
u64 alloc_hint;
u64 extent_len;
int i;
......@@ -3977,6 +3975,7 @@ next:
break;
}
BUG_ON(i >= nr_extents);
#endif
}
if (extent_locked) {
......@@ -4156,15 +4155,10 @@ static int noinline replace_extents_in_leaf(struct btrfs_trans_handle *trans,
ref->extents[ext_index].bytenr = new_extent->disk_bytenr;
ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
btrfs_set_file_extent_generation(leaf, fi, trans->transid);
btrfs_set_file_extent_ram_bytes(leaf, fi,
new_extent->ram_bytes);
btrfs_set_file_extent_disk_bytenr(leaf, fi,
new_extent->disk_bytenr);
btrfs_set_file_extent_disk_num_bytes(leaf, fi,
new_extent->disk_num_bytes);
new_extent->offset += btrfs_file_extent_offset(leaf, fi);
btrfs_set_file_extent_offset(leaf, fi, new_extent->offset);
btrfs_mark_buffer_dirty(leaf);
ret = btrfs_inc_extent_ref(trans, root,
......@@ -4625,12 +4619,15 @@ static int noinline relocate_one_extent(struct btrfs_root *extent_root,
*/
if (!new_extents) {
u64 group_start = group->key.objectid;
new_extents = kmalloc(sizeof(*new_extents),
GFP_NOFS);
nr_extents = 1;
ret = get_new_locations(reloc_inode,
extent_key,
group_start, 0,
group_start, 1,
&new_extents,
&nr_extents);
if (ret < 0)
if (ret)
goto out;
}
btrfs_record_root_in_trans(found_root);
......@@ -4762,7 +4759,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
btrfs_set_inode_generation(leaf, item, 1);
btrfs_set_inode_size(leaf, item, size);
btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NODATASUM);
btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NODATASUM |
BTRFS_INODE_NOCOMPRESS);
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(root, path);
out:
......@@ -4835,6 +4833,7 @@ int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
struct inode *reloc_inode;
struct btrfs_block_group_cache *block_group;
struct btrfs_key key;
u64 skipped;
u64 cur_byte;
u64 total_found;
u32 nritems;
......@@ -4864,6 +4863,7 @@ int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
btrfs_start_delalloc_inodes(info->tree_root);
btrfs_wait_ordered_extents(info->tree_root, 0);
again:
skipped = 0;
total_found = 0;
progress = 0;
key.objectid = block_group->key.objectid;
......@@ -4926,6 +4926,8 @@ next:
ret = relocate_one_extent(root, path, &key, block_group,
reloc_inode, pass);
BUG_ON(ret < 0);
if (ret > 0)
skipped++;
key.objectid = cur_byte;
key.type = 0;
......@@ -4944,6 +4946,11 @@ next:
printk("btrfs found %llu extents in pass %d\n",
(unsigned long long)total_found, pass);
pass++;
if (total_found == skipped && pass > 2) {
iput(reloc_inode);
reloc_inode = create_reloc_inode(info, block_group);
pass = 0;
}
goto again;
}
......@@ -5011,17 +5018,17 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
block_group = rb_entry(n, struct btrfs_block_group_cache,
cache_node);
spin_unlock(&info->block_group_cache_lock);
btrfs_remove_free_space_cache(block_group);
spin_lock(&info->block_group_cache_lock);
rb_erase(&block_group->cache_node,
&info->block_group_cache_tree);
spin_unlock(&info->block_group_cache_lock);
btrfs_remove_free_space_cache(block_group);
down_write(&block_group->space_info->groups_sem);
list_del(&block_group->list);
up_write(&block_group->space_info->groups_sem);
kfree(block_group);
spin_lock(&info->block_group_cache_lock);
}
spin_unlock(&info->block_group_cache_lock);
return 0;
......
......@@ -2015,6 +2015,8 @@ printk("2bad mapping end %Lu cur %Lu\n", end, cur);
}
bdev = em->bdev;
block_start = em->block_start;
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
block_start = EXTENT_MAP_HOLE;
free_extent_map(em);
em = NULL;
......@@ -2769,14 +2771,18 @@ sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
struct inode *inode = mapping->host;
u64 start = iblock << inode->i_blkbits;
sector_t sector = 0;
size_t blksize = (1 << inode->i_blkbits);
struct extent_map *em;
em = get_extent(inode, NULL, 0, start, (1 << inode->i_blkbits), 0);
lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
GFP_NOFS);
em = get_extent(inode, NULL, 0, start, blksize, 0);
unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
GFP_NOFS);
if (!em || IS_ERR(em))
return 0;
if (em->block_start == EXTENT_MAP_INLINE ||
em->block_start == EXTENT_MAP_HOLE)
if (em->block_start > EXTENT_MAP_LAST_BYTE)
goto out;
sector = (em->block_start + start - em->start) >> inode->i_blkbits;
......
......@@ -12,6 +12,7 @@
#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
#define EXTENT_FLAG_COMPRESSED 1
#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
struct extent_map {
struct rb_node rb_node;
......
......@@ -381,7 +381,7 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
int keep;
int slot;
int bookend;
int found_type;
int found_type = 0;
int found_extent;
int found_inline;
int recow;
......@@ -442,7 +442,8 @@ next_slot:
extent);
other_encoding = btrfs_file_extent_other_encoding(leaf,
extent);
if (found_type == BTRFS_FILE_EXTENT_REG) {
if (found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
extent_end =
btrfs_file_extent_disk_bytenr(leaf,
extent);
......@@ -609,8 +610,7 @@ next_slot:
*/
btrfs_set_file_extent_ram_bytes(leaf, extent,
ram_bytes);
btrfs_set_file_extent_type(leaf, extent,
BTRFS_FILE_EXTENT_REG);
btrfs_set_file_extent_type(leaf, extent, found_type);
btrfs_mark_buffer_dirty(path->nodes[0]);
......@@ -661,6 +661,243 @@ out:
return ret;
}
/*
 * Check whether the item at @slot in @leaf is a plain regular file extent
 * of inode @objectid that points at the disk extent starting at @bytenr and
 * lines up with the range the caller is interested in.
 *
 * @start and @end are in/out.  A non-zero value on entry is a constraint:
 * the item's file range has to begin (respectively end) exactly there.  A
 * zero value means "no constraint on this side".  On success both are
 * overwritten with the file range actually covered by the item.
 *
 * Returns 1 if the item qualifies, 0 otherwise (this includes @slot being
 * outside of the leaf, and compressed/encrypted/otherwise-encoded extents).
 */
static int extent_mergeable(struct extent_buffer *leaf, int slot,
			    u64 objectid, u64 bytenr, u64 *start, u64 *end)
{
	struct btrfs_file_extent_item *item;
	struct btrfs_key found;
	u64 item_end;

	/* the neighbour slot may simply not exist in this leaf */
	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
		return 0;

	btrfs_item_key_to_cpu(leaf, &found, slot);
	if (found.objectid != objectid)
		return 0;
	if (found.type != BTRFS_EXTENT_DATA_KEY)
		return 0;

	item = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
	if (btrfs_file_extent_type(leaf, item) != BTRFS_FILE_EXTENT_REG)
		return 0;
	if (btrfs_file_extent_disk_bytenr(leaf, item) != bytenr)
		return 0;
	if (btrfs_file_extent_compression(leaf, item) ||
	    btrfs_file_extent_encryption(leaf, item) ||
	    btrfs_file_extent_other_encoding(leaf, item))
		return 0;

	item_end = found.offset + btrfs_file_extent_num_bytes(leaf, item);

	/* zero means the caller did not pin that side of the range */
	if (*start != 0 && *start != found.offset)
		return 0;
	if (*end != 0 && *end != item_end)
		return 0;

	*start = found.offset;
	*end = item_end;
	return 1;
}
/*
 * Mark extent in the range start - end as written.
 *
 * This changes extent type from 'pre-allocated' to 'regular'. If only
 * part of extent is marked as written, the extent will be split into
 * two or three.
 *
 * @start is inclusive and @end is exclusive.  The whole range has to sit
 * inside one file extent item of type BTRFS_FILE_EXTENT_PREALLOC that
 * belongs to @inode; anything else trips a BUG_ON.  When a neighbouring
 * item in the same leaf is a regular extent on the same disk extent (see
 * extent_mergeable()), the written range is folded into that neighbour
 * instead of getting an item of its own.
 *
 * The function may run up to two passes over the tree ("again" label):
 * the first pass cuts at @start, the second one (split == end) cuts at
 * @end when the pre-allocated extent continues past the written range.
 *
 * NOTE(review): presumably the caller already holds the io_tree extent
 * lock for [start, end); this function only takes and drops the lock for
 * the part of the extent beyond @end -- confirm against the callers.
 *
 * Returns 0.  Every failure, including a failed tree search, is a BUG_ON.
 */
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct inode *inode, u64 start, u64 end)
{
	struct extent_buffer *leaf;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	u64 bytenr;
	u64 num_bytes;
	u64 extent_end;
	u64 extent_offset;
	u64 other_start;
	u64 other_end;
	u64 split = start;
	u64 locked_end = end;
	int extent_type;
	int split_end = 1;
	int ret;

	btrfs_drop_extent_cache(inode, start, end - 1, 0);

	path = btrfs_alloc_path();
	BUG_ON(!path);
again:
	key.objectid = inode->i_ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	/*
	 * First pass: look up the item covering 'start'.  Second pass
	 * (split == end): look up the item covering the last written byte.
	 */
	if (split == start)
		key.offset = split;
	else
		key.offset = split - 1;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	/*
	 * A negative return is a search failure; the path must not be
	 * dereferenced in that case.  Fail loudly like every other error
	 * in this function instead of walking an invalid leaf.
	 */
	BUG_ON(ret < 0);
	/* no exact key match: the covering item is the one just before */
	if (ret > 0 && path->slots[0] > 0)
		path->slots[0]--;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	BUG_ON(key.objectid != inode->i_ino ||
	       key.type != BTRFS_EXTENT_DATA_KEY);
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	extent_type = btrfs_file_extent_type(leaf, fi);
	BUG_ON(extent_type != BTRFS_FILE_EXTENT_PREALLOC);
	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	BUG_ON(key.offset > start || extent_end < end);

	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
	extent_offset = btrfs_file_extent_offset(leaf, fi);

	/* nothing to cut off in front of 'start': go straight to 'end' */
	if (key.offset == start)
		split = end;

	if (key.offset == start && extent_end == end) {
		/*
		 * The written range is exactly the whole item.  Either flip
		 * its type in place, or merge it with the next and/or the
		 * previous item and delete the items that became redundant.
		 */
		int del_nr = 0;
		int del_slot = 0;
		u64 leaf_owner = btrfs_header_owner(leaf);
		u64 leaf_gen = btrfs_header_generation(leaf);

		/* try the following item: it must start at 'end' */
		other_start = end;
		other_end = 0;
		if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
				     bytenr, &other_start, &other_end)) {
			extent_end = other_end;
			del_slot = path->slots[0] + 1;
			del_nr++;
			/*
			 * presumably: one item referencing the disk extent
			 * disappears, so one reference is dropped
			 */
			ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
						leaf->start, leaf_owner,
						leaf_gen, inode->i_ino, 0);
			BUG_ON(ret);
		}
		/* try the preceding item: it must end at 'start' */
		other_start = 0;
		other_end = start;
		if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino,
				     bytenr, &other_start, &other_end)) {
			key.offset = other_start;
			del_slot = path->slots[0];
			del_nr++;
			ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
						leaf->start, leaf_owner,
						leaf_gen, inode->i_ino, 0);
			BUG_ON(ret);
		}
		split_end = 0;
		if (del_nr == 0) {
			btrfs_set_file_extent_type(leaf, fi,
						   BTRFS_FILE_EXTENT_REG);
			goto done;
		}

		/*
		 * The item right before the first deleted slot survives and
		 * is grown to cover [key.offset, extent_end).
		 */
		fi = btrfs_item_ptr(leaf, del_slot - 1,
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						extent_end - key.offset);
		btrfs_mark_buffer_dirty(leaf);

		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
		BUG_ON(ret);
		goto done;
	} else if (split == start) {
		/*
		 * The item begins before 'start'.  Lock the rest of the
		 * extent past what is locked so far; if that cannot be done
		 * without blocking, drop the path, block on the lock and
		 * redo the search.
		 */
		if (locked_end < extent_end) {
			ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
					locked_end, extent_end - 1, GFP_NOFS);
			if (!ret) {
				btrfs_release_path(root, path);
				lock_extent(&BTRFS_I(inode)->io_tree,
					locked_end, extent_end - 1, GFP_NOFS);
				locked_end = extent_end;
				goto again;
			}
			locked_end = extent_end;
		}
		/* shrink the existing item to [key.offset, start) */
		btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset);
		/* extent_offset now is the offset of 'start' in the extent */
		extent_offset += split - key.offset;
	} else {
		/*
		 * split == end: the item starts at 'start' and continues
		 * past 'end'.  Turn it into the tail [end, extent_end) by
		 * moving its key and offset forward.
		 */
		BUG_ON(key.offset != start);
		btrfs_set_file_extent_offset(leaf, fi, extent_offset +
					     split - key.offset);
		btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split);
		key.offset = split;
		btrfs_set_item_key_safe(trans, root, path, &key);
		extent_end = split;
	}

	/*
	 * The piece [start, extent_end) handled below ends exactly at 'end':
	 * it is the written range itself, so it becomes regular and no
	 * further pass is needed.  Otherwise it keeps the pre-allocated type
	 * and the next pass cuts it at 'end'.
	 */
	if (extent_end == end) {
		split_end = 0;
		extent_type = BTRFS_FILE_EXTENT_REG;
	}
	if (extent_end == end && split == start) {
		/* written range is the tail of the extent: try the next item */
		other_start = end;
		other_end = 0;
		if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
				     bytenr, &other_start, &other_end)) {
			/* grow the next item backwards so it starts at 'start' */
			path->slots[0]++;
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			key.offset = split;
			btrfs_set_item_key_safe(trans, root, path, &key);
			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							other_end - split);
			goto done;
		}
	}
	if (extent_end == end && split == end) {
		/* written range is the head of the extent: try the prev item */
		other_start = 0;
		other_end = start;
		if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino,
				     bytenr, &other_start, &other_end)) {
			/* grow the previous item forwards up to 'end' */
			path->slots[0]--;
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi, extent_end -
							other_start);
			goto done;
		}
	}

	/*
	 * No neighbour to merge with: insert a new item for
	 * [start, extent_end) that points into the same disk extent.
	 */
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(root, path);

	key.offset = start;
	ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*fi));
	BUG_ON(ret);

	leaf = path->nodes[0];
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, fi, trans->transid);
	btrfs_set_file_extent_type(leaf, fi, extent_type);
	btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr);
	btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes);
	btrfs_set_file_extent_offset(leaf, fi, extent_offset);
	btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset);
	btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
	btrfs_set_file_extent_compression(leaf, fi, 0);
	btrfs_set_file_extent_encryption(leaf, fi, 0);
	btrfs_set_file_extent_other_encoding(leaf, fi, 0);

	/* presumably: the additional item takes its own extent reference */
	ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes,
				   leaf->start, root->root_key.objectid,
				   trans->transid, inode->i_ino);
	BUG_ON(ret);
done:
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(root, path);
	/* the front was cut off, but the piece still runs past 'end' */
	if (split_end && split == start) {
		split = end;
		goto again;
	}
	/* drop the extra lock taken above for the part beyond 'end' */
	if (locked_end > end) {
		unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1,
			      GFP_NOFS);
	}
	btrfs_free_path(path);
	return 0;
}
/*
* this gets pages into the page cache and locks them down, it also properly
* waits for data=ordered extents to finish before allowing the pages to be
......
......@@ -37,6 +37,7 @@
#include <linux/version.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
......@@ -587,7 +588,7 @@ free_pages_out:
* blocks on disk
*/
static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
u64 start, u64 end, int *page_started)
u64 start, u64 end, int *page_started, int force)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
......@@ -602,6 +603,7 @@ static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
u64 num_bytes;
int extent_type;
int ret;
int type;
int nocow;
int check_prev = 1;
......@@ -654,7 +656,8 @@ next_slot:
struct btrfs_file_extent_item);
extent_type = btrfs_file_extent_type(leaf, fi);
if (extent_type == BTRFS_FILE_EXTENT_REG) {
if (extent_type == BTRFS_FILE_EXTENT_REG ||
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
struct btrfs_block_group_cache *block_group;
disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
extent_end = found_key.offset +
......@@ -669,6 +672,8 @@ next_slot:
goto out_check;
if (disk_bytenr == 0)
goto out_check;
if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
goto out_check;
if (btrfs_cross_ref_exist(trans, root, disk_bytenr))
goto out_check;
block_group = btrfs_lookup_block_group(root->fs_info,
......@@ -709,10 +714,39 @@ out_check:
disk_bytenr += cur_offset - found_key.offset;
num_bytes = min(end + 1, extent_end) - cur_offset;
if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
struct extent_map *em;
struct extent_map_tree *em_tree;
em_tree = &BTRFS_I(inode)->extent_tree;
em = alloc_extent_map(GFP_NOFS);
em->start = cur_offset;
em->len = num_bytes;
em->block_len = num_bytes;
em->block_start = disk_bytenr;
em->bdev = root->fs_info->fs_devices->latest_bdev;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
while (1) {
spin_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
spin_unlock(&em_tree->lock);
if (ret != -EEXIST) {
free_extent_map(em);
break;
}
btrfs_drop_extent_cache(inode, em->start,
em->start + em->len - 1, 0);
}
type = BTRFS_ORDERED_PREALLOC;
} else {
type = BTRFS_ORDERED_NOCOW;
}
ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
num_bytes, num_bytes,
BTRFS_ORDERED_NOCOW);
num_bytes, num_bytes, type);
BUG_ON(ret);
extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
cur_offset, cur_offset + num_bytes - 1,
locked_page, 0, 0, 0);
cur_offset = extent_end;
if (cur_offset > end)