Commit 4142e0d1 authored by Linus Torvalds
Browse files

Merge branch 'osync_cleanup' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs-2.6

* 'osync_cleanup' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs-2.6:
  fsync: wait for data writeout completion before calling ->fsync
  vfs: Remove generic_osync_inode() and sync_page_range{_nolock}()
  fat: Opencode sync_page_range_nolock()
  pohmelfs: Use new syncing helper
  xfs: Convert sync_page_range() to simple filemap_write_and_wait_range()
  ocfs2: Update syncing after splicing to match generic version
  ntfs: Use new syncing helpers and update comments
  ext4: Remove syncing logic from ext4_file_write
  ext3: Remove syncing logic from ext3_file_write
  ext2: Update comment about generic_osync_inode
  vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode
  vfs: Rename generic_file_aio_write_nolock
  ocfs2: Use __generic_file_aio_write instead of generic_file_aio_write_nolock
  pohmelfs: Use __generic_file_aio_write instead of generic_file_aio_write_nolock
  vfs: Remove syncing from generic_file_direct_write() and generic_file_buffered_write()
  vfs: Export __generic_file_aio_write() and add some comments
  vfs: Introduce filemap_fdatawait_range
parents 33f1de69 2daea67e
......@@ -246,7 +246,7 @@ static const struct file_operations raw_fops = {
.read = do_sync_read,
.aio_read = generic_file_aio_read,
.write = do_sync_write,
.aio_write = generic_file_aio_write_nolock,
.aio_write = blkdev_aio_write,
.open = raw_open,
.release= raw_release,
.ioctl = raw_ioctl,
......
......@@ -921,16 +921,16 @@ ssize_t pohmelfs_write(struct file *file, const char __user *buf,
if (ret)
goto err_out_unlock;
ret = generic_file_aio_write_nolock(&kiocb, &iov, 1, pos);
ret = __generic_file_aio_write(&kiocb, &iov, 1, &kiocb.ki_pos);
*ppos = kiocb.ki_pos;
mutex_unlock(&inode->i_mutex);
WARN_ON(ret < 0);
if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
if (ret > 0) {
ssize_t err;
err = sync_page_range(inode, mapping, pos, ret);
err = generic_write_sync(file, pos, ret);
if (err < 0)
ret = err;
WARN_ON(ret < 0);
......
......@@ -1404,6 +1404,33 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
return blkdev_ioctl(bdev, mode, cmd, arg);
}
/*
 * blkdev_aio_write - write path for the block device node itself.
 *
 * Also used by the raw driver, which is essentially a fake block device.
 * The write is performed without taking i_mutex, so this helper is not
 * meant for general purpose use.
 *
 * Returns whatever __generic_file_aio_write() returned, unless a positive
 * number of bytes was written and the follow-up generic_write_sync() failed,
 * in which case the sync error is returned instead.
 */
ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
			 unsigned long nr_segs, loff_t pos)
{
	/*
	 * Grab the file before submitting the write.
	 * NOTE(review): presumably the iocb should not be dereferenced again
	 * once the write has returned -EIOCBQUEUED -- confirm.
	 */
	struct file *filp = iocb->ki_filp;
	ssize_t written;
	ssize_t sync_err;

	BUG_ON(iocb->ki_pos != pos);

	written = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);

	/* Nothing to sync on a plain failure or a zero-length write. */
	if (written <= 0 && written != -EIOCBQUEUED)
		return written;

	sync_err = generic_write_sync(filp, pos, written);

	/* A sync failure only overrides a successful (positive) byte count. */
	if (sync_err < 0 && written > 0)
		return sync_err;

	return written;
}
EXPORT_SYMBOL_GPL(blkdev_aio_write);
/*
* Try to release a page associated with block device when the system
* is under memory pressure.
......@@ -1436,7 +1463,7 @@ const struct file_operations def_blk_fops = {
.read = do_sync_read,
.write = do_sync_write,
.aio_read = generic_file_aio_read,
.aio_write = generic_file_aio_write_nolock,
.aio_write = blkdev_aio_write,
.mmap = generic_file_mmap,
.fsync = block_fsync,
.unlocked_ioctl = block_ioctl,
......
......@@ -482,7 +482,7 @@ static int ext2_alloc_branch(struct inode *inode,
unlock_buffer(bh);
mark_buffer_dirty_inode(bh, inode);
/* We used to sync bh here if IS_SYNC(inode).
* But we now rely upon generic_osync_inode()
* But we now rely upon generic_write_sync()
* and b_inode_buffers. But not for directories.
*/
if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
......
......@@ -51,71 +51,12 @@ static int ext3_release_file (struct inode * inode, struct file * filp)
return 0;
}
/*
 * ext3_file_write - aio write entry point for ext3 regular files.
 *
 * Delegates the write to generic_file_aio_write() and then decides whether
 * the journal has to be force-committed:
 *
 *  - error or nothing written: return the result untouched;
 *  - O_SYNC file: commit only when data journalling is in use (the original
 *    note here says that otherwise generic_osync_inode has already flushed
 *    the dirty data, and the inode too if anything beyond timestamps
 *    changed);
 *  - no O_SYNC: commit only when the inode is IS_SYNC, since no forced data
 *    flush has happened in that case.
 *
 * Open questions carried over from the original comments: whether timestamps
 * should be flushed for IS_SYNC inodes, and whether data should be forced to
 * disk here too (data=writeback filesystems only get metadata flushed on
 * IS_SYNC, which is historically what ext2 has done).
 *
 * Returns the byte count from the generic write, or the error from
 * ext3_force_commit() if the commit fails.
 */
static ssize_t
ext3_file_write(struct kiocb *iocb, const struct iovec *iov,
		unsigned long nr_segs, loff_t pos)
{
	struct file *filp = iocb->ki_filp;
	struct inode *ino = filp->f_path.dentry->d_inode;
	ssize_t written;
	int commit_err;

	written = generic_file_aio_write(iocb, iov, nr_segs, pos);

	/* Skip flushing if there was an error, or if nothing was written. */
	if (written <= 0)
		return written;

	if (filp->f_flags & O_SYNC) {
		/* Non-data-journalled O_SYNC writes need no extra commit. */
		if (!ext3_should_journal_data(ino))
			return written;
	} else if (!IS_SYNC(ino)) {
		/* Neither O_SYNC nor IS_SYNC: nothing to force. */
		return written;
	}

	commit_err = ext3_force_commit(ino->i_sb);
	if (commit_err)
		return commit_err;

	return written;
}
const struct file_operations ext3_file_operations = {
.llseek = generic_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
.aio_read = generic_file_aio_read,
.aio_write = ext3_file_write,
.aio_write = generic_file_aio_write,
.unlocked_ioctl = ext3_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext3_compat_ioctl,
......
......@@ -58,10 +58,7 @@ static ssize_t
ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_path.dentry->d_inode;
ssize_t ret;
int err;
struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
/*
* If we have encountered a bitmap-format file, the size limit
......@@ -81,53 +78,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
}
}
ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
/*
* Skip flushing if there was an error, or if nothing was written.
*/
if (ret <= 0)
return ret;
/*
* If the inode is IS_SYNC, or is O_SYNC and we are doing data
* journalling then we need to make sure that we force the transaction
* to disk to keep all metadata uptodate synchronously.
*/
if (file->f_flags & O_SYNC) {
/*
* If we are non-data-journaled, then the dirty data has
* already been flushed to backing store by generic_osync_inode,
* and the inode has been flushed too if there have been any
* modifications other than mere timestamp updates.
*
* Open question --- do we care about flushing timestamps too
* if the inode is IS_SYNC?
*/
if (!ext4_should_journal_data(inode))
return ret;
goto force_commit;
}
/*
* So we know that there has been no forced data flush. If the inode
* is marked IS_SYNC, we need to force one ourselves.
*/
if (!IS_SYNC(inode))
return ret;
/*
* Open question #2 --- should we force data to disk here too? If we
* don't, the only impact is that data=writeback filesystems won't
* flush data to disk automatically on IS_SYNC, only metadata (but
* historically, that is what ext2 has done.)
*/
force_commit:
err = ext4_force_commit(inode->i_sb);
if (err)
return err;
return ret;
return generic_file_aio_write(iocb, iov, nr_segs, pos);
}
static struct vm_operations_struct ext4_file_vm_ops = {
......
......@@ -176,8 +176,26 @@ static int fat_cont_expand(struct inode *inode, loff_t size)
inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
mark_inode_dirty(inode);
if (IS_SYNC(inode))
err = sync_page_range_nolock(inode, mapping, start, count);
if (IS_SYNC(inode)) {
int err2;
/*
* Opencode syncing since we don't have a file open to use
* standard fsync path.
*/
err = filemap_fdatawrite_range(mapping, start,
start + count - 1);
err2 = sync_mapping_buffers(mapping);
if (!err)
err = err2;
err2 = write_inode_now(inode, 1);
if (!err)
err = err2;
if (!err) {
err = filemap_fdatawait_range(mapping, start,
start + count - 1);
}
}
out:
return err;
}
......
......@@ -119,8 +119,8 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster)
MSDOS_I(inode)->i_start = new_dclus;
MSDOS_I(inode)->i_logstart = new_dclus;
/*
* Since generic_osync_inode() synchronize later if
* this is not directory, we don't here.
* Since generic_write_sync() synchronizes regular files later,
* we sync here only directories.
*/
if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) {
ret = fat_sync_inode(inode);
......
......@@ -1242,57 +1242,3 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
return ret;
}
EXPORT_SYMBOL(sync_inode);
/**
 * generic_osync_inode - flush all dirty data for a given inode to disk
 * @inode: inode to write
 * @mapping: the address_space that should be flushed
 * @what: what to write and wait upon
 *
 * Intended for file_write functions handling files with the O_SYNC flag
 * set, so that dirty writes get flushed to disk.
 *
 * @what is a bitmask selecting which parts of the inode's data are
 * written and waited upon:
 *
 * OSYNC_DATA:     i_mapping's dirty data
 * OSYNC_METADATA: the buffers at i_mapping->private_list
 * OSYNC_INODE:    the inode itself
 *
 * Every step runs regardless of earlier failures; the first error seen
 * is the one returned.
 */
int generic_osync_inode(struct inode *inode, struct address_space *mapping, int what)
{
	const int sync_data = what & OSYNC_DATA;
	int first_err = 0;
	int rc;
	int write_inode;

	/* Start data writeout first ... */
	if (sync_data)
		first_err = filemap_fdatawrite(mapping);

	/* ... then the mapping's buffers (wanted for data and metadata) ... */
	if (what & (OSYNC_METADATA | OSYNC_DATA)) {
		rc = sync_mapping_buffers(mapping);
		if (!first_err)
			first_err = rc;
	}

	/* ... and finally wait for the data writeout started above. */
	if (sync_data) {
		rc = filemap_fdatawait(mapping);
		if (!first_err)
			first_err = rc;
	}

	/*
	 * The inode itself is written only if it is dirty and either the
	 * caller asked for OSYNC_INODE or it carries I_DIRTY_DATASYNC.
	 */
	spin_lock(&inode_lock);
	write_inode = (inode->i_state & I_DIRTY) &&
		      ((what & OSYNC_INODE) ||
		       (inode->i_state & I_DIRTY_DATASYNC));
	spin_unlock(&inode_lock);

	if (!write_inode) {
		inode_sync_wait(inode);
		return first_err;
	}

	rc = write_inode_now(inode, 1);
	if (!first_err)
		first_err = rc;

	return first_err;
}
EXPORT_SYMBOL(generic_osync_inode);
......@@ -2076,14 +2076,6 @@ err_out:
*ppos = pos;
if (cached_page)
page_cache_release(cached_page);
/* For now, when the user asks for O_SYNC, we actually give O_DSYNC. */
if (likely(!status)) {
if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(vi))) {
if (!mapping->a_ops->writepage || !is_sync_kiocb(iocb))
status = generic_osync_inode(vi, mapping,
OSYNC_METADATA|OSYNC_DATA);
}
}
pagevec_lru_add_file(&lru_pvec);
ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
written ? "written" : "status", (unsigned long)written,
......@@ -2145,8 +2137,8 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
mutex_lock(&inode->i_mutex);
ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos);
mutex_unlock(&inode->i_mutex);
if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
int err = sync_page_range(inode, mapping, pos, ret);
if (ret > 0) {
int err = generic_write_sync(file, pos, ret);
if (err < 0)
ret = err;
}
......@@ -2173,8 +2165,8 @@ static ssize_t ntfs_file_writev(struct file *file, const struct iovec *iov,
if (ret == -EIOCBQUEUED)
ret = wait_on_sync_kiocb(&kiocb);
mutex_unlock(&inode->i_mutex);
if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
int err = sync_page_range(inode, mapping, *ppos - ret, ret);
if (ret > 0) {
int err = generic_write_sync(file, *ppos - ret, ret);
if (err < 0)
ret = err;
}
......
......@@ -384,13 +384,12 @@ unm_err_out:
* it is dirty in the inode meta data rather than the data page cache of the
* inode, and thus there are no data pages that need writing out. Therefore, a
* full mark_inode_dirty() is overkill. A mark_inode_dirty_sync(), on the
* other hand, is not sufficient, because I_DIRTY_DATASYNC needs to be set to
* ensure ->write_inode is called from generic_osync_inode() and this needs to
* happen or the file data would not necessarily hit the device synchronously,
* even though the vfs inode has the O_SYNC flag set. Also, I_DIRTY_DATASYNC
* simply "feels" better than just I_DIRTY_SYNC, since the file data has not
* actually hit the block device yet, which is not what I_DIRTY_SYNC on its own
* would suggest.
* other hand, is not sufficient, because ->write_inode needs to be called even
* in case of fdatasync. This needs to happen or the file data would not
* necessarily hit the device synchronously, even though the vfs inode has the
* O_SYNC flag set. Also, I_DIRTY_DATASYNC simply "feels" better than just
* I_DIRTY_SYNC, since the file data has not actually hit the block device yet,
* which is not what I_DIRTY_SYNC on its own would suggest.
*/
void __mark_mft_record_dirty(ntfs_inode *ni)
{
......
......@@ -1871,8 +1871,7 @@ relock:
goto out_dio;
}
} else {
written = generic_file_aio_write_nolock(iocb, iov, nr_segs,
*ppos);
written = __generic_file_aio_write(iocb, iov, nr_segs, ppos);
}
out_dio:
......@@ -1880,18 +1879,21 @@ out_dio:
BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) {
/*
* The generic write paths have handled getting data
* to disk, but since we don't make use of the dirty
* inode list, a manual journal commit is necessary
* here.
*/
if (old_size != i_size_read(inode) ||
old_clusters != OCFS2_I(inode)->ip_clusters) {
ret = filemap_fdatawrite_range(file->f_mapping, pos,
pos + count - 1);
if (ret < 0)
written = ret;
if (!ret && (old_size != i_size_read(inode) ||
old_clusters != OCFS2_I(inode)->ip_clusters)) {
ret = jbd2_journal_force_commit(osb->journal->j_journal);
if (ret < 0)
written = ret;
}
if (!ret)
ret = filemap_fdatawait_range(file->f_mapping, pos,
pos + count - 1);
}
/*
......@@ -1991,31 +1993,16 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
if (ret > 0) {
unsigned long nr_pages;
int err;
*ppos += ret;
nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
/*
* If file or inode is SYNC and we actually wrote some data,
* sync it.
*/
if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
int err;
mutex_lock(&inode->i_mutex);
err = ocfs2_rw_lock(inode, 1);
if (err < 0) {
mlog_errno(err);
} else {
err = generic_osync_inode(inode, mapping,
OSYNC_METADATA|OSYNC_DATA);
ocfs2_rw_unlock(inode, 1);
}
mutex_unlock(&inode->i_mutex);
err = generic_write_sync(out, *ppos, ret);
if (err)
ret = err;
else
*ppos += ret;
if (err)
ret = err;
}
balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
}
......
......@@ -976,25 +976,15 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
if (ret > 0) {
unsigned long nr_pages;
int err;
*ppos += ret;
nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
/*
* If file or inode is SYNC and we actually wrote some data,
* sync it.
*/
if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
int err;
mutex_lock(&inode->i_mutex);
err = generic_osync_inode(inode, mapping,
OSYNC_METADATA|OSYNC_DATA);
mutex_unlock(&inode->i_mutex);
if (err)
ret = err;
}
err = generic_write_sync(out, *ppos, ret);
if (err)
ret = err;
else
*ppos += ret;
balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
}
......
......@@ -178,19 +178,23 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
}
/**
* vfs_fsync - perform a fsync or fdatasync on a file
* vfs_fsync_range - helper to sync a range of data & metadata to disk
* @file: file to sync
* @dentry: dentry of @file
* @data: only perform a fdatasync operation
* @start: offset in bytes of the beginning of data range to sync
* @end: offset in bytes of the end of data range (inclusive)
* @datasync: perform only datasync
*
* Write back data and metadata for @file to disk. If @datasync is
* set only metadata needed to access modified file data is written.
* Write back data in range @start..@end and metadata for @file to disk. If
* @datasync is set only metadata needed to access modified file data is
* written.
*
* In case this function is called from nfsd @file may be %NULL and
* only @dentry is set. This can only happen when the filesystem
* implements the export_operations API.
*/
int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start,
loff_t end, int datasync)
{
const struct file_operations *fop;
struct address_space *mapping;
......@@ -214,7 +218,7 @@ int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
goto out;
}
ret = filemap_fdatawrite(mapping);
ret = filemap_write_and_wait_range(mapping, start, end);
/*
* We need to protect against concurrent writers, which could cause
......@@ -225,12 +229,29 @@ int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
if (!ret)
ret = err;
mutex_unlock(&mapping->host->i_mutex);
err = filemap_fdatawait(mapping);
if (!ret)
ret = err;
out:
return ret;
}
EXPORT_SYMBOL(vfs_fsync_range);
/**
* vfs_fsync - perform a fsync or fdatasync on a file
* @file: file to sync
* @dentry: dentry of @file
* @datasync: only perform a fdatasync operation
*
* Write back data and metadata for @file to disk. If @datasync is
* set only metadata needed to access modified file data is written.
*
* In case this function is called from nfsd @file may be %NULL and
* only @dentry is set. This can only happen when the filesystem
* implements the export_operations API.
*/
int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
{
return vfs_fsync_range(file, dentry, 0, LLONG_MAX, datasync);
}
EXPORT_SYMBOL(vfs_fsync);
static int do_fsync(unsigned int fd, int datasync)
......@@ -256,6 +277,23 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
return do_fsync(fd, 1);
}
/**
 * generic_write_sync - perform syncing after a write if file / inode is sync
 * @file: file to which the write happened
 * @pos: offset where the write started
 * @count: length of the write
 *
 * A thin wrapper around vfs_fsync_range(). It returns 0 without doing
 * anything unless @file was opened with O_SYNC or its inode is IS_SYNC;
 * otherwise it requests a datasync of the inclusive byte range
 * @pos .. @pos + @count - 1 and returns that call's result.
 */
int generic_write_sync(struct file *file, loff_t pos, loff_t count)
{
	if ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host))
		return vfs_fsync_range(file, file->f_path.dentry, pos,
				       pos + count - 1, 1);

	return 0;
}
EXPORT_SYMBOL(generic_write_sync);
/*
* sys_sync_file_range() permits finely controlled syncing over a segment of
* a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is
......
......@@ -817,7 +817,8 @@ write_retry:
xfs_iunlock(xip, iolock);
if (need_i_mutex)
mutex_unlock(&inode->i_mutex);
error2 = sync_page_range(inode, mapping, pos, ret);
error2 = filemap_write_and_wait_range(mapping, pos,
pos + ret - 1);
if (!error)
error = error2;
if (need_i_mutex)
......
......@@ -1455,11 +1455,6 @@ int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags);
#define DT_SOCK 12
#define DT_WHT 14
#define OSYNC_METADATA (1<<0)
#define OSYNC_DATA (1<<1)
#define OSYNC_INODE (1<<2)
int generic_osync_inode(struct inode *, struct address_space *, int);
/*
* This is the "filldir" function type, used by readdir() to let
* the kernel specify what kind of dirent layout it wants to have.
......@@ -2086,6 +2081,8 @@ extern int write_inode_now(struct inode *, int);
extern int filemap_fdatawrite(struct address_space *);
extern int filemap_flush(struct address_space *);
extern int filemap_fdatawait(struct address_space *);
extern int filemap_fdatawait_range(struct address_space *, loff_t lstart,
loff_t lend);
extern int filemap_write_and_wait(struct address_space *mapping);
extern int filemap_write_and_wait_range(struct address_space *mapping,
loff_t lstart, loff_t lend);
......@@ -2096,7 +2093,10 @@ extern int __filemap_fdatawrite_range(struct address_space *mapping,
extern int filemap_fdatawrite_range(struct address_space *mapping,