Commit 797b4cff authored by Vladimir Saveliev's avatar Vladimir Saveliev Committed by Linus Torvalds
Browse files

reiserfs: use generic write



Make reiserfs write via the generic routines.
The original reiserfs write path, optimized for big writes, is deadlock prone.
Signed-off-by: default avatarVladimir Saveliev <vs@namesys.com>
Signed-off-by: default avatarNick Piggin <npiggin@suse.de>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent f8706184
......@@ -153,608 +153,6 @@ static int reiserfs_sync_file(struct file *p_s_filp,
return (n_err < 0) ? -EIO : 0;
}
/*
 * To keep the code simple in the face of memory shortage, we never write
 * more than this many pages at a time.  This should still considerably
 * improve performance compared to the 4k-at-a-time case.  The limit is
 * 128KB worth of pages, i.e. 32 pages when the page size is 4k.
 *
 * The whole expansion is parenthesized so the macro behaves as a single
 * value inside larger expressions (e.g. "x / REISERFS_WRITE_PAGES_AT_A_TIME").
 */
#define REISERFS_WRITE_PAGES_AT_A_TIME ((128 * 1024) / PAGE_CACHE_SIZE)
/*
 * Allocates blocks for a file to fulfil a write request.
 * Maps all unmapped but prepared pages from the list.
 * Updates metadata with newly allocated block numbers as needed.
 *
 * The work is done in these steps:
 *   1. open a transaction and ask the block allocator for
 *      blocks_to_allocate block numbers;
 *   2. if the write starts past the end of the file, append zero block
 *      pointers (a hole) up to the write position;
 *   3. walk the existing indirect items and replace zero pointers (holes)
 *      with newly allocated block numbers;
 *   4. append whatever block numbers are left to the last indirect item,
 *      or create the first indirect item if only the stat data exists;
 *   5. map every unmapped buffer of the prepared pages that lies inside
 *      the written range to the allocated blocks.
 *
 * On success 0 is returned and the transaction is left open: the caller is
 * responsible for closing it and for logging the inode.  On failure a
 * negative errno is returned; the transaction is ended here if
 * th->t_trans_id is set.  The reiserfs write lock is taken and dropped
 * inside this function.
 */
static int reiserfs_allocate_blocks_for_region(struct reiserfs_transaction_handle *th,
					       struct inode *inode,	/* Inode we work with */
					       loff_t pos,	/* Writing position */
					       int num_pages,	/* number of pages the write is going to touch */
					       int write_bytes,	/* amount of bytes to write */
					       struct page **prepared_pages,	/* array of prepared pages */
					       int blocks_to_allocate	/* Amount of blocks we need to allocate to fit the data into the file */
    )
{
	struct cpu_key key;	/* cpu key of the item we are going to deal with */
	struct item_head *ih;	/* item head we are going to deal with */
	struct buffer_head *bh;	/* buffer head that contains the items we deal with */
	__le32 *item;		/* the item we are going to deal with */
	INITIALIZE_PATH(path);	/* path to the item we are going to deal with */
	b_blocknr_t *allocated_blocks;	/* where allocated block numbers are stored */
	reiserfs_blocknr_hint_t hint;	/* hint structure for the block allocator */
	int res;		/* return value of the various functions we call;
				   signed because it carries negative errnos and
				   is what this function returns */
	int curr_block;		/* next entry of allocated_blocks[] to hand out */
	int i;			/* loop counter */
	int itempos;		/* position in item */
	/* writing position in the first page */
	unsigned int from = (pos & (PAGE_CACHE_SIZE - 1));
	/* last modified byte offset in the last page */
	unsigned int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1;
	__u64 hole_size;	/* amount of blocks for a file hole, if one needs to be created */
	/* Flag for the item traversal code: set once the current buffer has
	   been prepared for the journal */
	int modifying_this_item = 0;
	int will_prealloc = 0;

	RFALSE(!blocks_to_allocate,
	       "green-9004: tried to allocate zero blocks?");

	/* only preallocate if this is a small write */
	if (REISERFS_I(inode)->i_prealloc_count ||
	    (!(write_bytes & (inode->i_sb->s_blocksize - 1)) &&
	     blocks_to_allocate <
	     REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize))
		will_prealloc =
		    REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize;

	allocated_blocks = kmalloc((blocks_to_allocate + will_prealloc) *
				   sizeof(b_blocknr_t), GFP_NOFS);
	if (!allocated_blocks)
		return -ENOMEM;

	/* First we compose a key to point at the writing position, we want to
	   do that outside of any locking region. */
	make_cpu_key(&key, inode, pos + 1, TYPE_ANY, 3 /*key length */ );

	/* If we came here, it means we absolutely need to open a transaction,
	   since we need to allocate some blocks */
	reiserfs_write_lock(inode->i_sb);	/* Journaling stuff and we need that. */
	/* NOTE(review): original author was unsure whether this block count
	   is large enough */
	res = journal_begin(th, inode->i_sb,
			    JOURNAL_PER_BALANCE_CNT * 3 + 1 +
			    2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb));
	if (res)
		goto error_exit;
	reiserfs_update_inode_transaction(inode);

	/* Look for the in-tree position of our write, need path for block allocator */
	res = search_for_position_by_key(inode->i_sb, &key, &path);
	if (res == IO_ERROR) {
		res = -EIO;
		goto error_exit;
	}

	/* Allocate blocks */
	/* First fill in "hint" structure for block allocator */
	hint.th = th;		/* transaction handle. */
	hint.path = &path;	/* Path, so that the block allocator can determine packing locality or whatever it needs. */
	hint.inode = inode;	/* Inode is needed by the block allocator too. */
	hint.search_start = 0;	/* We have no hint on where to search for free blocks. */
	hint.key = key.on_disk_key;	/* on disk key of file. */
	hint.block = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);	/* Number of disk blocks this file occupies already. */
	hint.formatted_node = 0;	/* We are allocating blocks for unformatted nodes. */
	hint.preallocate = will_prealloc;

	/* Call block allocator to allocate blocks */
	res = reiserfs_allocate_blocknrs(&hint, allocated_blocks,
					 blocks_to_allocate, blocks_to_allocate);
	if (res != CARRY_ON) {
		if (res == NO_DISK_SPACE) {
			/* We flush the transaction in case of no space. This way some
			   blocks might become free */
			SB_JOURNAL(inode->i_sb)->j_must_wait = 1;
			res = restart_transaction(th, inode, &path);
			if (res)
				goto error_exit;

			/* We might have scheduled, so search again */
			res = search_for_position_by_key(inode->i_sb, &key,
							 &path);
			if (res == IO_ERROR) {
				res = -EIO;
				goto error_exit;
			}

			/* Retry the allocation once after the flush. */
			res = reiserfs_allocate_blocknrs(&hint, allocated_blocks,
							 blocks_to_allocate,
							 blocks_to_allocate);
			if (res != CARRY_ON) {
				res = res == QUOTA_EXCEEDED ? -EDQUOT : -ENOSPC;
				pathrelse(&path);
				goto error_exit;
			}
		} else {
			res = res == QUOTA_EXCEEDED ? -EDQUOT : -ENOSPC;
			pathrelse(&path);
			goto error_exit;
		}
	}
#ifdef __BIG_ENDIAN
	/* Too bad, I have not found any way to convert a given region from
	   cpu format to little endian format */
	for (i = 0; i < blocks_to_allocate; i++)
		allocated_blocks[i] = cpu_to_le32(allocated_blocks[i]);
#endif

	/* Block allocation might well have scheduled and the tree might have
	   changed, let's search the tree again */
	/* find where in the tree our write should go */
	res = search_for_position_by_key(inode->i_sb, &key, &path);
	if (res == IO_ERROR) {
		res = -EIO;
		goto error_exit_free_blocks;
	}

	bh = get_last_bh(&path);	/* Get a bufferhead for last element in path. */
	ih = get_ih(&path);	/* Get a pointer to last item head in path. */
	item = get_item(&path);	/* Get a pointer to last item in path */

	/* Let's see what we have found */
	if (res != POSITION_FOUND) {
		/* Position not found, this means that we might need to append
		   the file with holes first.

		   Since we are writing past the file's end, we need to find out
		   if there is a hole that needs to be inserted before our
		   writing position, and how many blocks it is going to cover
		   (we need to populate pointers to file blocks representing the
		   hole with zeros) */
		{
			int item_offset = 1;
			/*
			 * if ih is stat data, its offset is 0 and we don't want to
			 * add 1 to pos in the hole_size calculation
			 */
			if (is_statdata_le_ih(ih))
				item_offset = 0;
			hole_size = (pos + item_offset -
				     (le_key_k_offset
				      (get_inode_item_key_version(inode),
				       &(ih->ih_key)) +
				      op_bytes_number(ih,
						      inode->i_sb->s_blocksize)))
			    >> inode->i_sb->s_blocksize_bits;
		}

		if (hole_size > 0) {
			/* How much data to insert first time. */
			int to_paste = min_t(__u64, hole_size,
					     MAX_ITEM_LEN(inode->i_sb->s_blocksize) / UNFM_P_SIZE);
			/* Area filled with zeroes, to supply as a list of zero
			   blocknumbers.  We allocate it outside of the loop just
			   in case the loop spins for several iterations.  We
			   cannot insert more than MAX_ITEM_LEN bytes anyway. */
			char *zeros = kzalloc(to_paste * UNFM_P_SIZE, GFP_ATOMIC);
			if (!zeros) {
				res = -ENOMEM;
				goto error_exit_free_blocks;
			}
			do {
				to_paste = min_t(__u64, hole_size,
						 MAX_ITEM_LEN(inode->i_sb->s_blocksize) /
						 UNFM_P_SIZE);
				if (is_indirect_le_ih(ih)) {
					/* Ok, there is an existing indirect item already. Need to append it */
					/* Calculate position past inserted item */
					make_cpu_key(&key, inode,
						     le_key_k_offset
						     (get_inode_item_key_version(inode),
						      &(ih->ih_key)) +
						     op_bytes_number(ih,
								     inode->i_sb->s_blocksize),
						     TYPE_INDIRECT, 3);
					res = reiserfs_paste_into_item(th, &path,
								       &key,
								       inode,
								       (char *)zeros,
								       UNFM_P_SIZE *
								       to_paste);
					if (res) {
						kfree(zeros);
						goto error_exit_free_blocks;
					}
				} else if (is_statdata_le_ih(ih)) {
					/* No existing item, create it */
					/* item head for new item */
					struct item_head ins_ih;

					/* create a key for our new item */
					make_cpu_key(&key, inode, 1,
						     TYPE_INDIRECT, 3);

					/* Create new item head for our new item */
					make_le_item_head(&ins_ih, &key,
							  key.version, 1,
							  TYPE_INDIRECT,
							  to_paste * UNFM_P_SIZE,
							  0 /* free space */ );

					/* Find where such item should live in the tree */
					res = search_item(inode->i_sb, &key,
							  &path);
					if (res != ITEM_NOT_FOUND) {
						/* item should not exist, otherwise we have error */
						if (res != -ENOSPC) {
							reiserfs_warning(inode->i_sb,
									 "green-9008: search_by_key (%K) returned %d",
									 &key,
									 res);
						}
						res = -EIO;
						kfree(zeros);
						goto error_exit_free_blocks;
					}
					res = reiserfs_insert_item(th, &path,
								   &key, &ins_ih,
								   inode,
								   (char *)zeros);
				} else {
					reiserfs_panic(inode->i_sb,
						       "green-9011: Unexpected key type %K\n",
						       &key);
				}
				if (res) {
					kfree(zeros);
					goto error_exit_free_blocks;
				}
				/* Now we want to check if the transaction is too full, and if it is
				   we restart it. This will also free the path. */
				if (journal_transaction_should_end
				    (th, th->t_blocks_allocated)) {
					inode->i_size = cpu_key_k_offset(&key) +
					    (to_paste << inode->i_blkbits);
					res = restart_transaction(th, inode,
								  &path);
					if (res) {
						pathrelse(&path);
						kfree(zeros);
						goto error_exit;
					}
				}

				/* Well, need to recalculate path and stuff */
				set_cpu_key_k_offset(&key,
						     cpu_key_k_offset(&key) +
						     (to_paste << inode->i_blkbits));
				res = search_for_position_by_key(inode->i_sb,
								 &key, &path);
				if (res == IO_ERROR) {
					res = -EIO;
					kfree(zeros);
					goto error_exit_free_blocks;
				}
				bh = get_last_bh(&path);
				ih = get_ih(&path);
				item = get_item(&path);
				hole_size -= to_paste;
			} while (hole_size);
			kfree(zeros);
		}
	}

	/* Go through existing indirect items first and replace all zeroes with
	   blocknumbers from the list.  Note that if no corresponding item was
	   found by the previous search, it means there is no existing in-tree
	   representation for the file area we are going to overwrite, so there
	   is nothing to scan through for holes. */
	for (curr_block = 0, itempos = path.pos_in_item;
	     curr_block < blocks_to_allocate && res == POSITION_FOUND;) {
	      retry:
		if (itempos >= ih_item_len(ih) / UNFM_P_SIZE) {
			/* We ran out of data in this indirect item, let's look for another
			   one. */
			/* First if we are already modifying current item, log it */
			if (modifying_this_item) {
				journal_mark_dirty(th, inode->i_sb, bh);
				modifying_this_item = 0;
			}
			/* Then set the key to look for a new indirect item (offset of old
			   item is added to old item length) */
			set_cpu_key_k_offset(&key,
					     le_key_k_offset
					     (get_inode_item_key_version(inode),
					      &(ih->ih_key)) +
					     op_bytes_number(ih,
							     inode->i_sb->s_blocksize));
			/* Search for position of new key in the tree. */
			res = search_for_position_by_key(inode->i_sb, &key,
							 &path);
			if (res == IO_ERROR) {
				res = -EIO;
				goto error_exit_free_blocks;
			}
			bh = get_last_bh(&path);
			ih = get_ih(&path);
			item = get_item(&path);
			itempos = path.pos_in_item;
			continue;	/* loop to check all kinds of conditions and so on. */
		}
		/* Ok, we have correct position in item now, so let's see if it is
		   representing file hole (blocknumber is zero) and fill it if needed */
		if (!item[itempos]) {
			/* Ok, a hole. Now we need to check if we already prepared this
			   block to be journaled */
			while (!modifying_this_item) {	/* loop until succeed */
				/* Well, this item is not journaled yet, so we must prepare
				   it for journal first, before we can change it */
				struct item_head tmp_ih;	/* Copy of the found item head, used to
								   detect if the fs changed under us
								   while we were preparing for journal. */
				int fs_gen;	/* fs generation, to find out if someone
						   changes the fs under our feet */

				copy_item_head(&tmp_ih, ih);	/* Remember itemhead */
				fs_gen = get_generation(inode->i_sb);	/* remember fs generation */
				reiserfs_prepare_for_journal(inode->i_sb, bh, 1);	/* Prepare the buffer holding the indirect item for changing. */
				if (fs_changed(fs_gen, inode->i_sb)
				    && item_moved(&tmp_ih, &path)) {
					/* Sigh, fs was changed under us, we need to look for the new
					   location of the item we are working with */

					/* unmark prepared area as journaled and search for its
					   new position */
					reiserfs_restore_prepared_buffer(inode->i_sb,
									 bh);
					res = search_for_position_by_key(inode->i_sb,
									 &key,
									 &path);
					if (res == IO_ERROR) {
						res = -EIO;
						goto error_exit_free_blocks;
					}
					bh = get_last_bh(&path);
					ih = get_ih(&path);
					item = get_item(&path);
					itempos = path.pos_in_item;
					goto retry;
				}
				modifying_this_item = 1;
			}
			item[itempos] = allocated_blocks[curr_block];	/* Assign new block */
			curr_block++;
		}
		itempos++;
	}

	if (modifying_this_item) {
		/* We need to log the last-accessed block, if it was modified,
		   but not logged yet. */
		journal_mark_dirty(th, inode->i_sb, bh);
	}

	if (curr_block < blocks_to_allocate) {
		/* Oh, well need to append to indirect item, or to create indirect item
		   if there weren't any */
		if (is_indirect_le_ih(ih)) {
			/* Existing indirect item - append. First calculate key for append
			   position. We do not need to recalculate path as it should
			   already point to correct place. */
			make_cpu_key(&key, inode,
				     le_key_k_offset(get_inode_item_key_version
						     (inode),
						     &(ih->ih_key)) +
				     op_bytes_number(ih,
						     inode->i_sb->s_blocksize),
				     TYPE_INDIRECT, 3);
			res = reiserfs_paste_into_item(th, &path, &key, inode,
						       (char *)(allocated_blocks +
								curr_block),
						       UNFM_P_SIZE *
						       (blocks_to_allocate -
							curr_block));
			if (res) {
				goto error_exit_free_blocks;
			}
		} else if (is_statdata_le_ih(ih)) {
			/* Last found item was statdata. That means we need to create indirect item. */
			struct item_head ins_ih;	/* itemhead for new item */

			/* create a key for our new item; position one, because
			   that's where the first indirect item begins */
			make_cpu_key(&key, inode, 1, TYPE_INDIRECT, 3);
			/* Create new item head for our new item */
			make_le_item_head(&ins_ih, &key, key.version, 1,
					  TYPE_INDIRECT,
					  (blocks_to_allocate -
					   curr_block) * UNFM_P_SIZE,
					  0 /* free space */ );
			/* Find where such item should live in the tree */
			res = search_item(inode->i_sb, &key, &path);
			if (res != ITEM_NOT_FOUND) {
				/* Well, if we have found such item already, or some error
				   occurred, we need to warn user and return error */
				if (res != -ENOSPC) {
					reiserfs_warning(inode->i_sb,
							 "green-9009: search_by_key (%K) "
							 "returned %d", &key,
							 res);
				}
				res = -EIO;
				goto error_exit_free_blocks;
			}
			/* Insert item into the tree with the data as its body */
			res = reiserfs_insert_item(th, &path, &key, &ins_ih,
						   inode,
						   (char *)(allocated_blocks +
							    curr_block));
			/* A failed insert must not be reported as success: the
			   buffers below would be mapped to blocks the tree does
			   not reference.  Handle it like the paste path above. */
			if (res)
				goto error_exit_free_blocks;
		} else {
			reiserfs_panic(inode->i_sb,
				       "green-9010: unexpected item type for key %K\n",
				       &key);
		}
	}
	/* the caller is responsible for closing the transaction
	   unless we return an error, they are also responsible for logging
	   the inode. */
	pathrelse(&path);
	/*
	 * cleanup preallocation from previous writes
	 * if this is a partial block write
	 */
	if (write_bytes & (inode->i_sb->s_blocksize - 1))
		reiserfs_discard_prealloc(th, inode);
	reiserfs_write_unlock(inode->i_sb);

	/* go through all the pages/buffers and map the buffers to newly allocated
	   blocks (so that system knows where to write these pages later). */
	curr_block = 0;
	for (i = 0; i < num_pages; i++) {
		struct page *page = prepared_pages[i];	/* current page */
		struct buffer_head *head = page_buffers(page);	/* first buffer for a page */
		int block_start, block_end;	/* in-page offsets for buffers. */

		if (!page_buffers(page))
			reiserfs_panic(inode->i_sb,
				       "green-9005: No buffers for prepared page???");

		/* For each buffer in page; "!block_start" lets the loop enter
		   for the first buffer, where bh == head */
		for (bh = head, block_start = 0; bh != head || !block_start;
		     block_start = block_end, bh = bh->b_this_page) {
			if (!bh)
				reiserfs_panic(inode->i_sb,
					       "green-9006: Allocated but absent buffer for a page?");
			block_end = block_start + inode->i_sb->s_blocksize;
			if (i == 0 && block_end <= from)
				/* if this buffer is before requested data to map, skip it */
				continue;
			if (i == num_pages - 1 && block_start >= to)
				/* If this buffer is after requested data to map, abort
				   processing of current page */
				break;

			if (!buffer_mapped(bh)) {	/* Ok, unmapped buffer, need to map it */
				map_bh(bh, inode->i_sb,
				       le32_to_cpu(allocated_blocks
						   [curr_block]));
				curr_block++;
				set_buffer_new(bh);
			}
		}
	}

	RFALSE(curr_block > blocks_to_allocate,
	       "green-9007: Used too many blocks? weird");

	kfree(allocated_blocks);
	return 0;

	/* Need to deal with transaction here. */
      error_exit_free_blocks:
	pathrelse(&path);
	/* free blocks */
	for (i = 0; i < blocks_to_allocate; i++)
		reiserfs_free_block(th, inode, le32_to_cpu(allocated_blocks[i]),
				    1);

      error_exit:
	if (th->t_trans_id) {
		int err;
		/* update any changes we made to blk count */
		mark_inode_dirty(inode);
		err = journal_end(th, inode->i_sb,
				  JOURNAL_PER_BALANCE_CNT * 3 + 1 +
				  2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb));
		if (err)
			res = err;
	}
	reiserfs_write_unlock(inode->i_sb);
	kfree(allocated_blocks);

	return res;
}
/*
 * Unlock pages prepared by reiserfs_prepare_file_region_for_write.
 *
 * For every page in the array this calls try_to_free_buffers(), then
 * unlock_page(), then page_cache_release(), in that order.
 */
static void reiserfs_unprepare_pages(struct page **prepared_pages,	/* list of locked pages */
				     size_t num_pages	/* amount of pages */ )
{
	size_t i;		/* loop counter; same type as num_pages so the
				   comparison below is not signed vs. unsigned */

	for (i = 0; i < num_pages; i++) {
		struct page *page = prepared_pages[i];

		try_to_free_buffers(page);
		unlock_page(page);
		page_cache_release(page);
	}
}
/*
 * Copy user data into the supplied, already prepared pages.
 *
 * Copying starts at the in-page offset of @pos inside the first page and
 * at offset 0 in every following page.  Each page receives at most what
 * fits into the rest of that page, capped by the bytes still to be
 * written.  The loop stops at the first page for which
 * __copy_from_user() reports a non-zero result.
 *
 * Returns 0 when everything was copied, -EFAULT otherwise.
 */
static int reiserfs_copy_from_user_to_file_region(loff_t pos,	/* In-file position */
						  int num_pages,	/* Number of pages affected */
						  int write_bytes,	/* Amount of bytes to write */
						  struct page **prepared_pages,	/* pointer to array of prepared pages */
						  const char __user * buf	/* Pointer to user-supplied data */
    )
{
	/* offset inside the page currently being filled */
	int in_page = (pos & (PAGE_CACHE_SIZE - 1));
	int idx;

	for (idx = 0; idx < num_pages; idx++) {
		struct page *target = prepared_pages[idx];
		/* bytes that go into this page */
		size_t chunk = min_t(size_t, PAGE_CACHE_SIZE - in_page, write_bytes);
		long uncopied;

		fault_in_pages_readable(buf, chunk);

		/* Copy data from userspace to the current page */
		kmap(target);
		uncopied = __copy_from_user(page_address(target) + in_page, buf, chunk);
		/* Flush processor's dcache for this page */
		flush_dcache_page(target);
		kunmap(target);

		/* Was there a fault? abort. */
		if (uncopied)
			return -EFAULT;

		buf += chunk;
		write_bytes -= chunk;
		/* every page after the first is written from its start */
		in_page = 0;
	}

	return 0;
}
/* taken fs/buffer.c:__block_commit_write */
int reiserfs_commit_page(struct inode *inode, struct page *page,
unsigned from, unsigned to)
......@@ -824,432 +222,6 @@ int reiserfs_commit_page(struct inode *inode, struct page *page,
return ret;
}
/* Submit pages for write. This was separated from actual file copying
because we might want to allocate block numbers in-between.
This function assumes that caller will adjust file size to correct value. */
static int reiserfs_submit_file_region_for_write(struct reiserfs_transaction_handle *th, struct inode *inode, loff_t pos, /* Writing position offset */
size_t num_pages, /* Number of pages to write */
size_t write_bytes, /* number of bytes to write */
struct page **prepared_pages /* list of pages */
)
<