Commit b31dc66a, authored and committed by Jens Axboe

[PATCH] Kill PF_SYNCWRITE flag

A process flag to indicate whether we are doing sync I/O is incredibly
ugly. It also causes performance problems when one does a lot of async
I/O and then proceeds to sync it: part of the I/O will go out as async,
and the other part as sync. This causes a disconnect between the
previously submitted I/O and the synced I/O. For I/O schedulers such as
CFQ, this results in lost merges and suboptimal scheduling behaviour.

Remove PF_SYNCWRITE completely from the fsync/msync paths, and let
the O_DIRECT path just directly indicate that the writes are sync
by using WRITE_SYNC instead.
Signed-off-by: Jens Axboe <axboe@suse.de>
parent 271f18f1
...@@ -1339,7 +1339,7 @@ static void as_add_request(request_queue_t *q, struct request *rq) ...@@ -1339,7 +1339,7 @@ static void as_add_request(request_queue_t *q, struct request *rq)
arq->state = AS_RQ_NEW; arq->state = AS_RQ_NEW;
if (rq_data_dir(arq->request) == READ if (rq_data_dir(arq->request) == READ
|| current->flags&PF_SYNCWRITE) || (arq->request->flags & REQ_RW_SYNC))
arq->is_sync = 1; arq->is_sync = 1;
else else
arq->is_sync = 0; arq->is_sync = 0;
......
...@@ -277,8 +277,6 @@ static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsi ...@@ -277,8 +277,6 @@ static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsi
static void cfq_dispatch_insert(request_queue_t *, struct cfq_rq *); static void cfq_dispatch_insert(request_queue_t *, struct cfq_rq *);
static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, gfp_t gfp_mask); static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, gfp_t gfp_mask);
#define process_sync(tsk) ((tsk)->flags & PF_SYNCWRITE)
/* /*
* lots of deadline iosched dupes, can be abstracted later... * lots of deadline iosched dupes, can be abstracted later...
*/ */
...@@ -334,7 +332,7 @@ static int cfq_queue_empty(request_queue_t *q) ...@@ -334,7 +332,7 @@ static int cfq_queue_empty(request_queue_t *q)
static inline pid_t cfq_queue_pid(struct task_struct *task, int rw) static inline pid_t cfq_queue_pid(struct task_struct *task, int rw)
{ {
if (rw == READ || process_sync(task)) if (rw == READ || rw == WRITE_SYNC)
return task->pid; return task->pid;
return CFQ_KEY_ASYNC; return CFQ_KEY_ASYNC;
......
...@@ -2827,6 +2827,9 @@ static void init_request_from_bio(struct request *req, struct bio *bio) ...@@ -2827,6 +2827,9 @@ static void init_request_from_bio(struct request *req, struct bio *bio)
if (unlikely(bio_barrier(bio))) if (unlikely(bio_barrier(bio)))
req->flags |= (REQ_HARDBARRIER | REQ_NOMERGE); req->flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
if (bio_sync(bio))
req->flags |= REQ_RW_SYNC;
req->errors = 0; req->errors = 0;
req->hard_sector = req->sector = bio->bi_sector; req->hard_sector = req->sector = bio->bi_sector;
req->hard_nr_sectors = req->nr_sectors = bio_sectors(bio); req->hard_nr_sectors = req->nr_sectors = bio_sectors(bio);
......
...@@ -1906,7 +1906,6 @@ static int fsync_sub(struct lun *curlun) ...@@ -1906,7 +1906,6 @@ static int fsync_sub(struct lun *curlun)
inode = filp->f_dentry->d_inode; inode = filp->f_dentry->d_inode;
mutex_lock(&inode->i_mutex); mutex_lock(&inode->i_mutex);
current->flags |= PF_SYNCWRITE;
rc = filemap_fdatawrite(inode->i_mapping); rc = filemap_fdatawrite(inode->i_mapping);
err = filp->f_op->fsync(filp, filp->f_dentry, 1); err = filp->f_op->fsync(filp, filp->f_dentry, 1);
if (!rc) if (!rc)
...@@ -1914,7 +1913,6 @@ static int fsync_sub(struct lun *curlun) ...@@ -1914,7 +1913,6 @@ static int fsync_sub(struct lun *curlun)
err = filemap_fdatawait(inode->i_mapping); err = filemap_fdatawait(inode->i_mapping);
if (!rc) if (!rc)
rc = err; rc = err;
current->flags &= ~PF_SYNCWRITE;
mutex_unlock(&inode->i_mutex); mutex_unlock(&inode->i_mutex);
VLDBG(curlun, "fdatasync -> %d\n", rc); VLDBG(curlun, "fdatasync -> %d\n", rc);
return rc; return rc;
......
...@@ -331,7 +331,6 @@ long do_fsync(struct file *file, int datasync) ...@@ -331,7 +331,6 @@ long do_fsync(struct file *file, int datasync)
goto out; goto out;
} }
current->flags |= PF_SYNCWRITE;
ret = filemap_fdatawrite(mapping); ret = filemap_fdatawrite(mapping);
/* /*
...@@ -346,7 +345,6 @@ long do_fsync(struct file *file, int datasync) ...@@ -346,7 +345,6 @@ long do_fsync(struct file *file, int datasync)
err = filemap_fdatawait(mapping); err = filemap_fdatawait(mapping);
if (!ret) if (!ret)
ret = err; ret = err;
current->flags &= ~PF_SYNCWRITE;
out: out:
return ret; return ret;
} }
......
...@@ -162,7 +162,7 @@ static int dio_refill_pages(struct dio *dio) ...@@ -162,7 +162,7 @@ static int dio_refill_pages(struct dio *dio)
NULL); /* vmas */ NULL); /* vmas */
up_read(&current->mm->mmap_sem); up_read(&current->mm->mmap_sem);
if (ret < 0 && dio->blocks_available && (dio->rw == WRITE)) { if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) {
struct page *page = ZERO_PAGE(dio->curr_user_address); struct page *page = ZERO_PAGE(dio->curr_user_address);
/* /*
* A memory fault, but the filesystem has some outstanding * A memory fault, but the filesystem has some outstanding
...@@ -535,7 +535,7 @@ static int get_more_blocks(struct dio *dio) ...@@ -535,7 +535,7 @@ static int get_more_blocks(struct dio *dio)
map_bh->b_state = 0; map_bh->b_state = 0;
map_bh->b_size = fs_count << dio->inode->i_blkbits; map_bh->b_size = fs_count << dio->inode->i_blkbits;
create = dio->rw == WRITE; create = dio->rw & WRITE;
if (dio->lock_type == DIO_LOCKING) { if (dio->lock_type == DIO_LOCKING) {
if (dio->block_in_file < (i_size_read(dio->inode) >> if (dio->block_in_file < (i_size_read(dio->inode) >>
dio->blkbits)) dio->blkbits))
...@@ -867,7 +867,7 @@ do_holes: ...@@ -867,7 +867,7 @@ do_holes:
loff_t i_size_aligned; loff_t i_size_aligned;
/* AKPM: eargh, -ENOTBLK is a hack */ /* AKPM: eargh, -ENOTBLK is a hack */
if (dio->rw == WRITE) { if (dio->rw & WRITE) {
page_cache_release(page); page_cache_release(page);
return -ENOTBLK; return -ENOTBLK;
} }
...@@ -1045,7 +1045,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, ...@@ -1045,7 +1045,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
} }
} /* end iovec loop */ } /* end iovec loop */
if (ret == -ENOTBLK && rw == WRITE) { if (ret == -ENOTBLK && (rw & WRITE)) {
/* /*
* The remaining part of the request will be * The remaining part of the request will be
* be handled by buffered I/O when we return * be handled by buffered I/O when we return
...@@ -1089,7 +1089,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, ...@@ -1089,7 +1089,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
if (dio->is_async) { if (dio->is_async) {
int should_wait = 0; int should_wait = 0;
if (dio->result < dio->size && rw == WRITE) { if (dio->result < dio->size && (rw & WRITE)) {
dio->waiter = current; dio->waiter = current;
should_wait = 1; should_wait = 1;
} }
...@@ -1142,7 +1142,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, ...@@ -1142,7 +1142,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
ret = transferred; ret = transferred;
/* We could have also come here on an AIO file extend */ /* We could have also come here on an AIO file extend */
if (!is_sync_kiocb(iocb) && rw == WRITE && if (!is_sync_kiocb(iocb) && (rw & WRITE) &&
ret >= 0 && dio->result == dio->size) ret >= 0 && dio->result == dio->size)
/* /*
* For AIO writes where we have completed the * For AIO writes where we have completed the
...@@ -1194,7 +1194,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, ...@@ -1194,7 +1194,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
int acquire_i_mutex = 0; int acquire_i_mutex = 0;
if (rw & WRITE) if (rw & WRITE)
current->flags |= PF_SYNCWRITE; rw = WRITE_SYNC;
if (bdev) if (bdev)
bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev)); bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
...@@ -1270,7 +1270,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, ...@@ -1270,7 +1270,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
* even for AIO, we need to wait for i/o to complete before * even for AIO, we need to wait for i/o to complete before
* returning in this case. * returning in this case.
*/ */
dio->is_async = !is_sync_kiocb(iocb) && !((rw == WRITE) && dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
(end > i_size_read(inode))); (end > i_size_read(inode)));
retval = direct_io_worker(rw, iocb, inode, iov, offset, retval = direct_io_worker(rw, iocb, inode, iov, offset,
...@@ -1284,8 +1284,6 @@ out: ...@@ -1284,8 +1284,6 @@ out:
mutex_unlock(&inode->i_mutex); mutex_unlock(&inode->i_mutex);
else if (acquire_i_mutex) else if (acquire_i_mutex)
mutex_lock(&inode->i_mutex); mutex_lock(&inode->i_mutex);
if (rw & WRITE)
current->flags &= ~PF_SYNCWRITE;
return retval; return retval;
} }
EXPORT_SYMBOL(__blockdev_direct_IO); EXPORT_SYMBOL(__blockdev_direct_IO);
...@@ -623,7 +623,6 @@ int generic_osync_inode(struct inode *inode, struct address_space *mapping, int ...@@ -623,7 +623,6 @@ int generic_osync_inode(struct inode *inode, struct address_space *mapping, int
int need_write_inode_now = 0; int need_write_inode_now = 0;
int err2; int err2;
current->flags |= PF_SYNCWRITE;
if (what & OSYNC_DATA) if (what & OSYNC_DATA)
err = filemap_fdatawrite(mapping); err = filemap_fdatawrite(mapping);
if (what & (OSYNC_METADATA|OSYNC_DATA)) { if (what & (OSYNC_METADATA|OSYNC_DATA)) {
...@@ -636,7 +635,6 @@ int generic_osync_inode(struct inode *inode, struct address_space *mapping, int ...@@ -636,7 +635,6 @@ int generic_osync_inode(struct inode *inode, struct address_space *mapping, int
if (!err) if (!err)
err = err2; err = err2;
} }
current->flags &= ~PF_SYNCWRITE;
spin_lock(&inode_lock); spin_lock(&inode_lock);
if ((inode->i_state & I_DIRTY) && if ((inode->i_state & I_DIRTY) &&
......
...@@ -241,6 +241,7 @@ enum rq_flag_bits { ...@@ -241,6 +241,7 @@ enum rq_flag_bits {
__REQ_PM_RESUME, /* resume request */ __REQ_PM_RESUME, /* resume request */
__REQ_PM_SHUTDOWN, /* shutdown request */ __REQ_PM_SHUTDOWN, /* shutdown request */
__REQ_ORDERED_COLOR, /* is before or after barrier */ __REQ_ORDERED_COLOR, /* is before or after barrier */
__REQ_RW_SYNC, /* request is sync (O_DIRECT) */
__REQ_NR_BITS, /* stops here */ __REQ_NR_BITS, /* stops here */
}; };
...@@ -270,6 +271,7 @@ enum rq_flag_bits { ...@@ -270,6 +271,7 @@ enum rq_flag_bits {
#define REQ_PM_RESUME (1 << __REQ_PM_RESUME) #define REQ_PM_RESUME (1 << __REQ_PM_RESUME)
#define REQ_PM_SHUTDOWN (1 << __REQ_PM_SHUTDOWN) #define REQ_PM_SHUTDOWN (1 << __REQ_PM_SHUTDOWN)
#define REQ_ORDERED_COLOR (1 << __REQ_ORDERED_COLOR) #define REQ_ORDERED_COLOR (1 << __REQ_ORDERED_COLOR)
#define REQ_RW_SYNC (1 << __REQ_RW_SYNC)
/* /*
* State information carried for REQ_PM_SUSPEND and REQ_PM_RESUME * State information carried for REQ_PM_SUSPEND and REQ_PM_RESUME
......
...@@ -941,12 +941,11 @@ static inline void put_task_struct(struct task_struct *t) ...@@ -941,12 +941,11 @@ static inline void put_task_struct(struct task_struct *t)
#define PF_KSWAPD 0x00040000 /* I am kswapd */ #define PF_KSWAPD 0x00040000 /* I am kswapd */
#define PF_SWAPOFF 0x00080000 /* I am in swapoff */ #define PF_SWAPOFF 0x00080000 /* I am in swapoff */
#define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */
#define PF_SYNCWRITE 0x00200000 /* I am doing a sync write */ #define PF_BORROWED_MM 0x00200000 /* I am a kthread doing use_mm */
#define PF_BORROWED_MM 0x00400000 /* I am a kthread doing use_mm */ #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */
#define PF_RANDOMIZE 0x00800000 /* randomize virtual address space */ #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
#define PF_SWAPWRITE 0x01000000 /* Allowed to write to swap */ #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */
#define PF_SPREAD_PAGE 0x04000000 /* Spread page cache over cpuset */ #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */
#define PF_SPREAD_SLAB 0x08000000 /* Spread some slab caches over cpuset */
#define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */
/* /*
......
...@@ -170,8 +170,6 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) ...@@ -170,8 +170,6 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
* just ignore them, but return -ENOMEM at the end. * just ignore them, but return -ENOMEM at the end.
*/ */
down_read(&current->mm->mmap_sem); down_read(&current->mm->mmap_sem);
if (flags & MS_SYNC)
current->flags |= PF_SYNCWRITE;
vma = find_vma(current->mm, start); vma = find_vma(current->mm, start);
if (!vma) { if (!vma) {
error = -ENOMEM; error = -ENOMEM;
...@@ -228,7 +226,6 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) ...@@ -228,7 +226,6 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
} }
} while (vma && !done); } while (vma && !done);
out_unlock: out_unlock:
current->flags &= ~PF_SYNCWRITE;
up_read(&current->mm->mmap_sem); up_read(&current->mm->mmap_sem);
out: out:
return error; return error;
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment