/*
 *	An async IO implementation for Linux
 *	Written by Benjamin LaHaise <bcrl@kvack.org>
 *
 *	Implements an efficient asynchronous io interface.
 *
 *	Copyright 2000, 2001, 2002 Red Hat, Inc.  All Rights Reserved.
 *
 *	See ../COPYING for licensing terms.
 */
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/aio_abi.h>
#include <linux/module.h>
#include <linux/syscalls.h>
#include <linux/backing-dev.h>
#include <linux/uio.h>
#define DEBUG 0
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/mmu_context.h>
#include <linux/slab.h>
#include <linux/timer.h>
#include <linux/aio.h>
#include <linux/highmem.h>
#include <linux/workqueue.h>
#include <linux/security.h>
#include <linux/eventfd.h>
#include <asm/kmap_types.h>
#include <asm/uaccess.h>
/*
 * Debug tracing helper: dprintk() is an alias for printk() only when
 * DEBUG is greater than 1; otherwise it expands to an empty statement.
 */
#if DEBUG > 1
#define dprintk printk
#else
#define dprintk(x...) do { ; } while (0)
#endif
/*------ sysctl variables----*/
static DEFINE_SPINLOCK(aio_nr_lock);	/* taken around every update of aio_nr */
unsigned long aio_nr;		/* current system wide number of aio requests */
unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
/*----end sysctl variables---*/

/* Slab caches for struct kiocb and struct kioctx, created in aio_setup(). */
static struct kmem_cache	*kiocb_cachep;
static struct kmem_cache	*kioctx_cachep;

/* Workqueue on which the per-context retry work (ctx->wq) is queued. */
static struct workqueue_struct *aio_wq;

/* Used for rare fput completion. */
static void aio_fput_routine(struct work_struct *);
static DECLARE_WORK(fput_work, aio_fput_routine);

/*
 * Requests whose final file put has to be deferred are parked on fput_head
 * under fput_lock until aio_fput_routine() drains them.
 */
static DEFINE_SPINLOCK(fput_lock);
static LIST_HEAD(fput_head);

static void aio_kick_handler(struct work_struct *);
static void aio_queue_work(struct kioctx *);
/* aio_setup
 *	Creates the slab caches and the workqueue used by the aio routines.
 *	Failure is fatal, as this is done early during the boot sequence:
 *	the caches are created with SLAB_PANIC and a failed workqueue
 *	allocation trips BUG_ON().
 *
 *	Always returns 0.
 */
static int __init aio_setup(void)
{
	kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
	kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);

	aio_wq = alloc_workqueue("aio", 0, 1);	/* used to limit concurrency */
	/* every later queue_delayed_work(aio_wq, ...) would oops on NULL */
	BUG_ON(!aio_wq);

	pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));

	return 0;
}
__initcall(aio_setup);
/* aio_free_ring
 *	Releases what ctx->ring_info holds: the references on the ring pages,
 *	the user mapping (when mmap_size is non-zero) and the page pointer
 *	array when it is not the embedded internal_pages array.
 */
static void aio_free_ring(struct kioctx *ctx)
{
	struct aio_ring_info *ring_info = &ctx->ring_info;
	long idx = 0;

	while (idx < ring_info->nr_pages) {
		put_page(ring_info->ring_pages[idx]);
		idx++;
	}

	if (ring_info->mmap_size != 0) {
		down_write(&ctx->mm->mmap_sem);
		do_munmap(ctx->mm, ring_info->mmap_base, ring_info->mmap_size);
		up_write(&ctx->mm->mmap_sem);
	}

	/* only a separately allocated array may be handed to kfree() */
	if (ring_info->ring_pages != NULL &&
	    ring_info->ring_pages != ring_info->internal_pages)
		kfree(ring_info->ring_pages);

	ring_info->ring_pages = NULL;
	ring_info->nr = 0;
}
static int aio_setup_ring(struct kioctx *ctx)
{
struct aio_ring *ring;
struct aio_ring_info *info = &ctx->ring_info;
unsigned nr_events = ctx->max_reqs;
unsigned long size;
int nr_pages;
/* Compensate for the ring buffer's head/tail overlap entry */
nr_events += 2; /* 1 is required, 2 for good luck */
size = sizeof(struct aio_ring);
size += sizeof(struct io_event) * nr_events;
nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT;
if (nr_pages < 0)
return -EINVAL;
nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event);
info->nr = 0;
info->ring_pages = info->internal_pages;
if (nr_pages > AIO_RING_PAGES) {
info->ring_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
if (!info->ring_pages)
return -ENOMEM;
}
info->mmap_size = nr_pages * PAGE_SIZE;
dprintk("attempting mmap of %lu bytes\n", info->mmap_size);
down_write(&ctx->mm->mmap_sem);
info->mmap_base = do_mmap(NULL, 0, info->mmap_size,
Robert P. J. Day
committed
PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE,
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
0);
if (IS_ERR((void *)info->mmap_base)) {
up_write(&ctx->mm->mmap_sem);
info->mmap_size = 0;
aio_free_ring(ctx);
return -EAGAIN;
}
dprintk("mmap address: 0x%08lx\n", info->mmap_base);
info->nr_pages = get_user_pages(current, ctx->mm,
info->mmap_base, nr_pages,
1, 0, info->ring_pages, NULL);
up_write(&ctx->mm->mmap_sem);
if (unlikely(info->nr_pages != nr_pages)) {
aio_free_ring(ctx);
return -EAGAIN;
}
ctx->user_id = info->mmap_base;
info->nr = nr_events; /* trusted copy */
ring = kmap_atomic(info->ring_pages[0], KM_USER0);
ring->nr = nr_events; /* user copy */
ring->id = ctx->user_id;
ring->head = ring->tail = 0;
ring->magic = AIO_RING_MAGIC;
ring->compat_features = AIO_RING_COMPAT_FEATURES;
ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;
ring->header_length = sizeof(struct aio_ring);
kunmap_atomic(ring, KM_USER0);
return 0;
}
/* aio_ring_event: maps the ring page that holds event number nr with
 * kmap_atomic(, km) and returns a pointer to that event. Release the
 * pointer with put_aio_ring_event().
 *
 * Events are located as if every page held AIO_EVENTS_PER_PAGE of them;
 * AIO_EVENTS_OFFSET shifts the index by the number of event slots that
 * the struct aio_ring header takes away from the first page.
 */
#define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event))
#define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
#define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
#define aio_ring_event(info, nr, km) ({ \
unsigned pos = (nr) + AIO_EVENTS_OFFSET; \
struct io_event *__event; \
__event = kmap_atomic( \
(info)->ring_pages[pos / AIO_EVENTS_PER_PAGE], km); \
__event += pos % AIO_EVENTS_PER_PAGE; \
__event; \
})
/* put_aio_ring_event: rounds the event pointer down with PAGE_MASK to
 * recover the address of the page mapping, then drops it with
 * kunmap_atomic(, km).
 */
#define put_aio_ring_event(event, km) do { \
struct io_event *__event = (event); \
(void)__event; \
kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \
} while(0)
/* ctx_rcu_free
 *	RCU callback passed to call_rcu() by __put_ioctx(). Frees the kioctx
 *	and subtracts its max_reqs from the system wide aio_nr count.
 */
static void ctx_rcu_free(struct rcu_head *head)
{
	struct kioctx *dying = container_of(head, struct kioctx, rcu_head);
	/* sampled first: the context is gone after the free below */
	unsigned reserved = dying->max_reqs;

	kmem_cache_free(kioctx_cachep, dying);

	if (!reserved)
		return;

	spin_lock(&aio_nr_lock);
	BUG_ON(aio_nr - reserved > aio_nr);
	aio_nr -= reserved;
	spin_unlock(&aio_nr_lock);
}
/* __put_ioctx
 *	Called when the last user of an aio context has gone away,
 *	and the struct needs to be freed.
 *
 *	Cancels the context's pending retry work, releases the ring and the
 *	mm reference, then hands the structure to ctx_rcu_free() via
 *	call_rcu(). No requests may still be active.
 */
static void __put_ioctx(struct kioctx *ctx)
{
	BUG_ON(ctx->reqs_active);

	/* cancel both the delayed timer and any already running work */
	cancel_delayed_work(&ctx->wq);
	cancel_work_sync(&ctx->wq.work);
	aio_free_ring(ctx);
	mmdrop(ctx->mm);
	ctx->mm = NULL;
	pr_debug("__put_ioctx: freeing %p\n", ctx);
	call_rcu(&ctx->rcu_head, ctx_rcu_free);
}
static inline void get_ioctx(struct kioctx *kioctx)
{
BUG_ON(atomic_read(&kioctx->users) <= 0);
atomic_inc(&kioctx->users);
}
static inline int try_get_ioctx(struct kioctx *kioctx)
{
return atomic_inc_not_zero(&kioctx->users);
}
/* put_ioctx
 *	Drops one reference; the final put tears the context down through
 *	__put_ioctx().
 */
static inline void put_ioctx(struct kioctx *kioctx)
{
	BUG_ON(atomic_read(&kioctx->users) <= 0);

	if (likely(!atomic_dec_and_test(&kioctx->users)))
		return;

	__put_ioctx(kioctx);
}
/* ioctx_alloc
* Allocates and initializes an ioctx. Returns an ERR_PTR if it failed.
*/
static struct kioctx *ioctx_alloc(unsigned nr_events)
{
struct mm_struct *mm;
struct kioctx *ctx;
/* Prevent overflows */
if ((nr_events > (0x10000000U / sizeof(struct io_event))) ||
(nr_events > (0x10000000U / sizeof(struct kiocb)))) {
pr_debug("ENOMEM: nr_events too high\n");
return ERR_PTR(-EINVAL);
}
if ((unsigned long)nr_events > aio_max_nr)
ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
if (!ctx)
return ERR_PTR(-ENOMEM);
ctx->max_reqs = nr_events;
mm = ctx->mm = current->mm;
atomic_inc(&mm->mm_count);
atomic_set(&ctx->users, 1);
spin_lock_init(&ctx->ctx_lock);
spin_lock_init(&ctx->ring_info.ring_lock);
init_waitqueue_head(&ctx->wait);
INIT_LIST_HEAD(&ctx->active_reqs);
INIT_LIST_HEAD(&ctx->run_list);
INIT_DELAYED_WORK(&ctx->wq, aio_kick_handler);
if (aio_setup_ring(ctx) < 0)
goto out_freectx;
/* limit the number of system wide aios */
do {
spin_lock_bh(&aio_nr_lock);
if (aio_nr + nr_events > aio_max_nr ||
aio_nr + nr_events < aio_nr)
ctx->max_reqs = 0;
else
aio_nr += ctx->max_reqs;
spin_unlock_bh(&aio_nr_lock);
if (ctx->max_reqs || did_sync)
break;
/* wait for rcu callbacks to have completed before giving up */
synchronize_rcu();
did_sync = 1;
ctx->max_reqs = nr_events;
} while (1);
spin_lock(&mm->ioctx_lock);
hlist_add_head_rcu(&ctx->list, &mm->ioctx_list);
spin_unlock(&mm->ioctx_lock);
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
ctx, ctx->user_id, current->mm, ctx->ring_info.nr);
return ctx;
out_cleanup:
__put_ioctx(ctx);
return ERR_PTR(-EAGAIN);
out_freectx:
mmdrop(mm);
kmem_cache_free(kioctx_cachep, ctx);
ctx = ERR_PTR(-ENOMEM);
dprintk("aio: error allocating ioctx %p\n", ctx);
return ctx;
}
/* aio_cancel_all
* Cancels all outstanding aio requests on an aio context. Used
* when the processes owning a context have all exited to encourage
* the rapid destruction of the kioctx.
*/
static void aio_cancel_all(struct kioctx *ctx)
{
int (*cancel)(struct kiocb *, struct io_event *);
struct io_event res;
spin_lock_irq(&ctx->ctx_lock);
ctx->dead = 1;
while (!list_empty(&ctx->active_reqs)) {
struct list_head *pos = ctx->active_reqs.next;
struct kiocb *iocb = list_kiocb(pos);
list_del_init(&iocb->ki_list);
cancel = iocb->ki_cancel;
kiocbSetCancelled(iocb);
if (cancel) {
iocb->ki_users++;
spin_unlock_irq(&ctx->ctx_lock);
cancel(iocb, &res);
spin_lock_irq(&ctx->ctx_lock);
}
}
spin_unlock_irq(&ctx->ctx_lock);
}
/* wait_for_all_aios
 *	Sleeps uninterruptibly on ctx->wait until ctx->reqs_active drops to
 *	zero. Returns immediately when no request is active.
 */
static void wait_for_all_aios(struct kioctx *ctx)
{
	struct task_struct *tsk = current;
	DECLARE_WAITQUEUE(wait, tsk);

	spin_lock_irq(&ctx->ctx_lock);
	if (!ctx->reqs_active)
		goto out;

	add_wait_queue(&ctx->wait, &wait);
	set_task_state(tsk, TASK_UNINTERRUPTIBLE);
	while (ctx->reqs_active) {
		spin_unlock_irq(&ctx->ctx_lock);
		/* actually give up the CPU instead of spinning on the lock */
		io_schedule();
		/* re-arm before re-checking so a wakeup in between is not lost */
		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
		spin_lock_irq(&ctx->ctx_lock);
	}
	__set_task_state(tsk, TASK_RUNNING);
	remove_wait_queue(&ctx->wait, &wait);

out:
	spin_unlock_irq(&ctx->ctx_lock);
}
/* wait_on_sync_kiocb:
 *	Waits on the given sync kiocb to complete, i.e. until its ki_users
 *	count reaches zero, sleeping uninterruptibly in between checks.
 *
 *	Returns iocb->ki_user_data.
 */
ssize_t wait_on_sync_kiocb(struct kiocb *iocb)
{
	while (iocb->ki_users) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		/* re-check after changing state so a wakeup is not missed */
		if (!iocb->ki_users)
			break;
		io_schedule();
	}
	__set_current_state(TASK_RUNNING);
	return iocb->ki_user_data;
}
EXPORT_SYMBOL(wait_on_sync_kiocb);
/* exit_aio: called when the last user of mm goes away. At this point,
* there is no way for any new requests to be submited or any of the
* io_* syscalls to be called on the context. However, there may be
* outstanding requests which hold references to the context; as they
* go away, they will call put_ioctx and release any pinned memory
* associated with the request (held via struct page * references).
*/
void exit_aio(struct mm_struct *mm)
struct kioctx *ctx;
while (!hlist_empty(&mm->ioctx_list)) {
ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list);
hlist_del_rcu(&ctx->list);
aio_cancel_all(ctx);
wait_for_all_aios(ctx);
/*
cancel_work_sync(&ctx->wq.work);
if (1 != atomic_read(&ctx->users))
printk(KERN_DEBUG
"exit_aio:ioctx still alive: %d %d %d\n",
atomic_read(&ctx->users), ctx->dead,
ctx->reqs_active);
put_ioctx(ctx);
}
}
/* aio_get_req
 *	Allocate a slot for an aio request and put it on the context's
 *	active list.  Returns NULL if the allocation fails or the completion
 *	ring has no room for another event.
 *
 * Returns with kiocb->users set to 2.  The io submit code path holds
 * an extra reference while submitting the i/o.
 * This prevents races between the aio code path referencing the
 * req (after submitting it) and aio_complete() freeing the req.
 */
static struct kiocb *__aio_get_req(struct kioctx *ctx)
{
	struct kiocb *req = NULL;
	struct aio_ring *ring;
	int okay = 0;

	req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
	if (unlikely(!req))
		return NULL;

	/*
	 * kmem_cache_alloc() does not zero the object, so every field that
	 * is read before being written elsewhere must be set here.
	 */
	req->ki_flags = 0;
	req->ki_users = 2;
	req->ki_key = 0;
	req->ki_ctx = ctx;
	req->ki_cancel = NULL;
	req->ki_retry = NULL;
	req->ki_dtor = NULL;
	req->private = NULL;
	req->ki_iovec = NULL;		/* really_put_req() kfree()s this */
	INIT_LIST_HEAD(&req->ki_run_list);	/* tested with list_empty() */
	req->ki_eventfd = NULL;

	/* Check if the completion queue has enough free space to
	 * accept an event from this io.
	 */
	spin_lock_irq(&ctx->ctx_lock);
	ring = kmap_atomic(ctx->ring_info.ring_pages[0], KM_USER0);
	if (ctx->reqs_active < aio_ring_avail(&ctx->ring_info, ring)) {
		list_add(&req->ki_list, &ctx->active_reqs);
		ctx->reqs_active++;
		okay = 1;
	}
	kunmap_atomic(ring, KM_USER0);
	spin_unlock_irq(&ctx->ctx_lock);

	if (!okay) {
		kmem_cache_free(kiocb_cachep, req);
		req = NULL;
	}

	return req;
}
/* aio_get_req
 *	Wrapper around __aio_get_req() that, on failure, drains the deferred
 *	fput list once by calling aio_fput_routine() directly and then
 *	retries.  Returns the request, or NULL if the retry also failed.
 */
static inline struct kiocb *aio_get_req(struct kioctx *ctx)
{
	struct kiocb *req = __aio_get_req(ctx);

	if (likely(req != NULL))
		return req;

	/* Handle a potential starvation case -- should be exceedingly rare as
	 * requests will be stuck on fput_head only if the aio_fput_routine is
	 * delayed and the requests were the last user of the struct file.
	 */
	aio_fput_routine(NULL);
	return __aio_get_req(ctx);
}
/* really_put_req
 *	Final disposal of a request: drops its eventfd context, runs its
 *	destructor if one is set, frees a separately allocated iovec and the
 *	kiocb itself, and decrements ctx->reqs_active.  Must be called with
 *	ctx->ctx_lock held.
 *
 *	When the last request of a dead context goes away, sleepers on
 *	ctx->wait are woken.
 */
static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
{
	assert_spin_locked(&ctx->ctx_lock);

	if (req->ki_eventfd != NULL)
		eventfd_ctx_put(req->ki_eventfd);
	if (req->ki_dtor)
		req->ki_dtor(req);
	/* the inline vector lives inside the kiocb and is freed with it */
	if (req->ki_iovec != &req->ki_inline_vec)
		kfree(req->ki_iovec);
	kmem_cache_free(kiocb_cachep, req);
	ctx->reqs_active--;

	if (unlikely(!ctx->reqs_active && ctx->dead))
		wake_up(&ctx->wait);
}
/* aio_fput_routine
 *	Work handler for fput_work (also called directly by aio_get_req()
 *	with a NULL argument, which is not used).  Drains fput_head: for
 *	each parked request it performs the deferred fput(), disposes of the
 *	request and drops the context reference taken when it was parked.
 */
static void aio_fput_routine(struct work_struct *data)
{
	spin_lock_irq(&fput_lock);
	while (likely(!list_empty(&fput_head))) {
		struct kiocb *req = list_kiocb(fput_head.next);
		struct kioctx *ctx = req->ki_ctx;

		list_del(&req->ki_list);
		/* fput_lock is dropped while the request is torn down */
		spin_unlock_irq(&fput_lock);

		/* Complete the fput(s) */
		if (req->ki_filp != NULL)
			fput(req->ki_filp);

		/* Link the iocb into the context's free list */
		spin_lock_irq(&ctx->ctx_lock);
		really_put_req(ctx, req);
		spin_unlock_irq(&ctx->ctx_lock);

		put_ioctx(ctx);
		spin_lock_irq(&fput_lock);
	}
	spin_unlock_irq(&fput_lock);
}
/* __aio_put_req
* Returns true if this put was the last user of the request.
*/
static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
{
dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n",
req, atomic_long_read(&req->ki_filp->f_count));
assert_spin_locked(&ctx->ctx_lock);
if (likely(req->ki_users))
return 0;
list_del(&req->ki_list); /* remove from active_reqs */
req->ki_cancel = NULL;
req->ki_retry = NULL;
/*
* Try to optimize the aio and eventfd file* puts, by avoiding to
* schedule work in case it is not final fput() time. In normal cases,
* we would not be holding the last reference to the file*, so
* this function will be executed w/out any aio kthread wakeup.
if (unlikely(!fput_atomic(req->ki_filp))) {
get_ioctx(ctx);
spin_lock(&fput_lock);
list_add(&req->ki_list, &fput_head);
spin_unlock(&fput_lock);
} else {
req->ki_filp = NULL;
return 1;
}
/* aio_put_req
 *	Locked wrapper around __aio_put_req(): takes the owning context's
 *	ctx_lock (with interrupts disabled) around the put.
 *	Returns true if this put was the last user of the kiocb,
 *	false if the request is still in use.
 */
int aio_put_req(struct kiocb *req)
{
	struct kioctx *owner = req->ki_ctx;
	int was_last;

	spin_lock_irq(&owner->ctx_lock);
	was_last = __aio_put_req(owner, req);
	spin_unlock_irq(&owner->ctx_lock);

	return was_last;
}
EXPORT_SYMBOL(aio_put_req);
static struct kioctx *lookup_ioctx(unsigned long ctx_id)
struct kioctx *ctx, *ret = NULL;
rcu_read_lock();
hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) {
/*
* RCU protects us against accessing freed memory but
* we have to be careful not to get a reference when the
* reference count already dropped to 0 (ctx->dead test
* is unreliable because of races).
*/
if (ctx->user_id == ctx_id && !ctx->dead && try_get_ioctx(ctx)){
ret = ctx;
return ret;
}
/*
 * Put a kicked kiocb on its context's retry run list unless it is
 * already queued there.  The kiocb is expected to have been marked as
 * kicked already, and ctx->ctx_lock must be held.
 *
 * Returns 1 if the kiocb was queued (the caller should then activate
 * the work queue to process it), or 0 if it was already on the list.
 */
static inline int __queue_kicked_iocb(struct kiocb *iocb)
{
	struct kioctx *owner = iocb->ki_ctx;

	assert_spin_locked(&owner->ctx_lock);

	if (!list_empty(&iocb->ki_run_list))
		return 0;

	list_add_tail(&iocb->ki_run_list, &owner->run_list);
	return 1;
}
/* aio_run_iocb
* This is the core aio execution routine. It is
* invoked both for initial i/o submission and
* subsequent retries via the aio_kick_handler.
* Expects to be invoked with iocb->ki_ctx->lock
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
* as needed during processing.
*
* Calls the iocb retry method (already setup for the
* iocb on initial submission) for operation specific
* handling, but takes care of most of common retry
* execution details for a given iocb. The retry method
* needs to be non-blocking as far as possible, to avoid
* holding up other iocbs waiting to be serviced by the
* retry kernel thread.
*
* The trickier parts in this code have to do with
* ensuring that only one retry instance is in progress
* for a given iocb at any time. Providing that guarantee
* simplifies the coding of individual aio operations as
* it avoids various potential races.
*/
static ssize_t aio_run_iocb(struct kiocb *iocb)
{
struct kioctx *ctx = iocb->ki_ctx;
ssize_t (*retry)(struct kiocb *);
ssize_t ret;
if (!(retry = iocb->ki_retry)) {
printk("aio_run_iocb: iocb->ki_retry = NULL\n");
return 0;
}
/*
* We don't want the next retry iteration for this
* operation to start until this one has returned and
* updated the iocb state. However, wait_queue functions
* can trigger a kick_iocb from interrupt context in the
* meantime, indicating that data is available for the next
* iteration. We want to remember that and enable the
* next retry iteration _after_ we are through with
* this one.
*
* So, in order to be able to register a "kick", but
* prevent it from being queued now, we clear the kick
* flag, but make the kick code *think* that the iocb is
* still on the run list until we are actually done.
* When we are done with this iteration, we check if
* the iocb was kicked in the meantime and if so, queue
* it up afresh.
*/
kiocbClearKicked(iocb);
/*
* This is so that aio_complete knows it doesn't need to
* pull the iocb off the run list (We can't just call
* INIT_LIST_HEAD because we don't want a kick_iocb to
* queue this on the run list yet)
*/
iocb->ki_run_list.next = iocb->ki_run_list.prev = NULL;
spin_unlock_irq(&ctx->ctx_lock);
/* Quit retrying if the i/o has been cancelled */
if (kiocbIsCancelled(iocb)) {
ret = -EINTR;
aio_complete(iocb, ret, 0);
/* must not access the iocb after this */
goto out;
}
/*
* Now we are all set to call the retry method in async
if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) {
/*
* There's no easy way to restart the syscall since other AIO's
* may be already running. Just fail this IO with EINTR.
*/
if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK))
ret = -EINTR;
aio_complete(iocb, ret, 0);
out:
spin_lock_irq(&ctx->ctx_lock);
if (-EIOCBRETRY == ret) {
/*
* OK, now that we are done with this iteration
* and know that there is more left to go,
* this is where we let go so that a subsequent
* "kick" can start the next iteration
*/
/* will make __queue_kicked_iocb succeed from here on */
INIT_LIST_HEAD(&iocb->ki_run_list);
/* we must queue the next iteration ourselves, if it
* has already been kicked */
if (kiocbIsKicked(iocb)) {
__queue_kicked_iocb(iocb);
/*
* __queue_kicked_iocb will always return 1 here, because
* iocb->ki_run_list is empty at this point so it should
* be safe to unconditionally queue the context into the
* work queue.
*/
aio_queue_work(ctx);
}
}
return ret;
}
/*
* __aio_run_iocbs:
* Process all pending retries queued on the ioctx
* run list.
* Assumes it is operating within the aio issuer's mm
*/
static int __aio_run_iocbs(struct kioctx *ctx)
{
struct kiocb *iocb;
struct list_head run_list;
assert_spin_locked(&ctx->ctx_lock);
list_replace_init(&ctx->run_list, &run_list);
while (!list_empty(&run_list)) {
iocb = list_entry(run_list.next, struct kiocb,
ki_run_list);
list_del(&iocb->ki_run_list);
/*
* Hold an extra reference while retrying i/o.
*/
iocb->ki_users++; /* grab extra reference */
aio_run_iocb(iocb);
__aio_put_req(ctx, iocb);
}
if (!list_empty(&ctx->run_list))
return 1;
return 0;
}
/* aio_queue_work
 *	Queues the context's retry work on aio_wq.  The delay is one jiffy
 *	when somebody is sleeping on ctx->wait and HZ/10 otherwise.
 */
static void aio_queue_work(struct kioctx * ctx)
{
	/* nobody waiting: a longer delay is acceptable */
	unsigned long delay = HZ/10;

	/*
	 * if someone is waiting, get the work started right
	 * away
	 */
	smp_mb();
	if (waitqueue_active(&ctx->wait))
		delay = 1;

	queue_delayed_work(aio_wq, &ctx->wq, delay);
}
/*
 * aio_run_all_iocbs:
 *	Repeats __aio_run_iocbs() under ctx->ctx_lock until it reports
 *	that the run list stayed empty.
 *	Assumes it is operating within the aio issuer's mm context.
 */
static inline void aio_run_all_iocbs(struct kioctx *ctx)
{
	int more;

	spin_lock_irq(&ctx->ctx_lock);
	do {
		more = __aio_run_iocbs(ctx);
	} while (more);
	spin_unlock_irq(&ctx->ctx_lock);
}
/*
* aio_kick_handler:
* Work queue handler triggered to process pending
* retries on an ioctx. Takes on the aio issuer's
* mm context before running the iocbs, so that
* copy_xxx_user operates on the issuer's address
* space.
* Run on aiod's context.
*/
static void aio_kick_handler(struct work_struct *work)
struct kioctx *ctx = container_of(work, struct kioctx, wq.work);
int requeue;
set_fs(USER_DS);
use_mm(ctx->mm);
spin_lock_irq(&ctx->ctx_lock);
requeue =__aio_run_iocbs(ctx);
set_fs(oldfs);
/*
* we're in a worker thread already, don't use queue_delayed_work,
*/
if (requeue)
queue_delayed_work(aio_wq, &ctx->wq, 0);
}
/*
 * Called by kick_iocb to queue the kiocb for retry
 * and if required activate the aio work queue to process
 * it
 */
static void try_queue_kicked_iocb(struct kiocb *iocb)
{
	struct kioctx	*ctx = iocb->ki_ctx;
	unsigned long flags;
	int run = 0;

	spin_lock_irqsave(&ctx->ctx_lock, flags);
	/* set this inside the lock so that we can't race with aio_run_iocb()
	 * testing it and putting the iocb on the run list under the lock */
	if (!kiocbTryKick(iocb))
		run = __queue_kicked_iocb(iocb);
	spin_unlock_irqrestore(&ctx->ctx_lock, flags);

	/* only start the worker when the iocb was newly queued */
	if (run)
		aio_queue_work(ctx);
}
/*
 * kick_iocb:
 *      Called typically from a wait queue callback context
 *      to trigger a retry of the iocb.
 *      The retry is usually executed by aio workqueue
 *      threads (See aio_kick_handler).
 */
void kick_iocb(struct kiocb *iocb)
{
	/* sync iocbs are easy: they can only ever be executing from a
	 * single context. */
	if (is_sync_kiocb(iocb)) {
		kiocbSetKicked(iocb);
		wake_up_process(iocb->ki_obj.tsk);
		return;
	}

	/* async iocbs go onto their context's run list */
	try_queue_kicked_iocb(iocb);
}
EXPORT_SYMBOL(kick_iocb);
/* aio_complete
 *	Called when the io request on the given iocb is complete.
 *	For an async iocb, res and res2 are stored in a new event at the
 *	tail of the completion ring, unless the request was cancelled.
 *	Returns true if this is the last user of the request.  The
 *	only other user of the request can be the cancellation code.
 */
int aio_complete(struct kiocb *iocb, long res, long res2)
{
	struct kioctx	*ctx = iocb->ki_ctx;
	struct aio_ring_info	*info;
	struct aio_ring	*ring;
	struct io_event	*event;
	unsigned long	flags;
	unsigned long	tail;
	int		ret;

	/*
	 * Special case handling for sync iocbs:
	 *  - events go directly into the iocb for fast handling
	 *  - the sync task with the iocb in its stack holds the single iocb
	 *    ref, no other paths have a way to get another ref
	 *  - the sync task helpfully left a reference to itself in the iocb
	 */
	if (is_sync_kiocb(iocb)) {
		BUG_ON(iocb->ki_users != 1);
		iocb->ki_user_data = res;
		/* wait_on_sync_kiocb() sleeps until ki_users reaches zero */
		iocb->ki_users = 0;
		wake_up_process(iocb->ki_obj.tsk);
		return 1;
	}

	info = &ctx->ring_info;

	/* add a completion event to the ring buffer.
	 * must be done holding ctx->ctx_lock to prevent
	 * other code from messing with the tail
	 * pointer since we might be called from irq
	 * context.
	 */
	spin_lock_irqsave(&ctx->ctx_lock, flags);

	if (iocb->ki_run_list.prev && !list_empty(&iocb->ki_run_list))
		list_del_init(&iocb->ki_run_list);

	/*
	 * cancelled requests don't get events, userland was given one
	 * when the event got cancelled.
	 */
	if (kiocbIsCancelled(iocb))
		goto put_rq;

	ring = kmap_atomic(info->ring_pages[0], KM_IRQ1);

	tail = info->tail;
	event = aio_ring_event(info, tail, KM_IRQ0);
	if (++tail >= info->nr)
		tail = 0;

	event->obj = (u64)(unsigned long)iocb->ki_obj.user;
	event->data = iocb->ki_user_data;
	event->res = res;
	event->res2 = res2;

	dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n",
		ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
		res, res2);

	/* after flagging the request as done, we
	 * must never even look at it again
	 */
	smp_wmb();	/* make event visible before updating tail */

	info->tail = tail;
	ring->tail = tail;

	put_aio_ring_event(event, KM_IRQ0);
	kunmap_atomic(ring, KM_IRQ1);

	pr_debug("added to ring %p at [%lu]\n", iocb, tail);

	/*
	 * Check if the user asked us to deliver the result through an
	 * eventfd. The eventfd_signal() function is safe to be called
	 * from IRQ context.
	 */
	if (iocb->ki_eventfd != NULL)
		eventfd_signal(iocb->ki_eventfd, 1);

put_rq:
	/* everything turned out well, dispose of the aiocb. */
	ret = __aio_put_req(ctx, iocb);

	/*
	 * We have to order our ring_info tail store above and test
	 * of the wait list below outside the wait lock.  This is
	 * like in wake_up_bit() where clearing a bit has to be
	 * ordered with the unlocked test.
	 */
	smp_mb();

	if (waitqueue_active(&ctx->wait))
		wake_up(&ctx->wait);

	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
	return ret;
}
EXPORT_SYMBOL(aio_complete);
/* aio_read_evt
* Pull an event off of the ioctx's event ring. Returns the number of