direct.c 24.8 KB
Newer Older
Linus Torvalds's avatar
Linus Torvalds committed
1
2
3
4
5
6
7
8
9
/*
 * linux/fs/nfs/direct.c
 *
 * Copyright (C) 2003 by Chuck Lever <cel@netapp.com>
 *
 * High-performance uncached I/O for the Linux NFS client
 *
 * There are important applications whose performance or correctness
 * depends on uncached access to file data.  Database clusters
 * (multiple copies of the same instance running on separate hosts)
 * implement their own cache coherency protocol that subsumes file
 * system cache protocols.  Applications that process datasets
 * considerably larger than the client's memory do not always benefit
 * from a local cache.  A streaming video server, for instance, has no
 * need to cache the contents of a file.
 *
 * When an application requests uncached I/O, all read and write requests
 * are made directly to the server; data stored or fetched via these
 * requests is not cached in the Linux page cache.  The client does not
 * correct unaligned requests from applications.  All requested bytes are
 * held on permanent storage before a direct write system call returns to
 * an application.
 *
 * Solaris implements an uncached I/O facility called directio() that
 * is used for backups and sequential I/O to very large files.  Solaris
 * also supports uncaching whole NFS partitions with "-o forcedirectio,"
 * an undocumented mount option.
 *
 * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with
 * help from Andrew Morton.
 *
 * 18 Dec 2001	Initial implementation for 2.4  --cel
 * 08 Jul 2002	Version for 2.4.19, with bug fixes --trondmy
 * 08 Jun 2003	Port to 2.5 APIs  --cel
 * 31 Mar 2004	Handle direct I/O without VFS support  --cel
 * 15 Sep 2004	Parallel async reads  --cel
 * 04 May 2005	support O_DIRECT with aio  --cel
 *
 */

#include <linux/config.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/smp_lock.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/kref.h>

#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/sunrpc/clnt.h>

#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/atomic.h>

58
59
#include "iostat.h"

Linus Torvalds's avatar
Linus Torvalds committed
60
61
62
63
64
65
66
67
68
#define NFSDBG_FACILITY		NFSDBG_VFS

static kmem_cache_t *nfs_direct_cachep;

/*
 * This represents a set of asynchronous requests that we're waiting on
 */
struct nfs_direct_req {
	struct kref		kref;		/* release manager */
69
70

	/* I/O parameters */
71
72
	struct list_head	list,		/* nfs_read/write_data structs */
				rewrite_list;	/* saved nfs_write_data structs */
73
	struct nfs_open_context	*ctx;		/* file open context info */
74
	struct kiocb *		iocb;		/* controlling i/o request */
75
	struct inode *		inode;		/* target file of i/o */
76
77
78
	unsigned long		user_addr;	/* location of user's buffer */
	size_t			user_count;	/* total bytes to move */
	loff_t			pos;		/* starting offset in file */
Linus Torvalds's avatar
Linus Torvalds committed
79
80
	struct page **		pages;		/* pages in our buffer */
	unsigned int		npages;		/* count of pages */
81
82
83
84
85

	/* completion state */
	spinlock_t		lock;		/* protect completion state */
	int			outstanding;	/* i/os we're waiting for */
	ssize_t			count,		/* bytes actually processed */
Linus Torvalds's avatar
Linus Torvalds committed
86
				error;		/* any reported error */
87
	struct completion	completion;	/* wait for i/o completion */
88
89
90
91
92
93
94

	/* commit state */
	struct nfs_write_data *	commit_data;	/* special write_data for commits */
	int			flags;
#define NFS_ODIRECT_DO_COMMIT		(1)	/* an unstable reply was received */
#define NFS_ODIRECT_RESCHED_WRITES	(2)	/* write verification failed */
	struct nfs_writeverf	verf;		/* unstable write verifier */
Linus Torvalds's avatar
Linus Torvalds committed
95
96
};

97
98
static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, int sync);
static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode);
Linus Torvalds's avatar
Linus Torvalds committed
99
100

/**
101
102
103
104
105
106
107
108
109
110
111
 * nfs_direct_IO - NFS address space operation for direct I/O
 * @rw: direction (read or write)
 * @iocb: target I/O control block
 * @iov: array of vectors that define I/O buffer
 * @pos: offset in file to begin the operation
 * @nr_segs: size of iovec array
 *
 * The presence of this routine in the address space ops vector means
 * the NFS client supports direct I/O.  However, we shunt off direct
 * read and write requests before the VFS gets them, so this method
 * should never be called.
Linus Torvalds's avatar
Linus Torvalds committed
112
 */
113
114
115
ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs)
{
	dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n",
116
117
			iocb->ki_filp->f_dentry->d_name.name,
			(long long) pos, nr_segs);
118
119
120
121

	return -EINVAL;
}

122
123
124
125
126
127
128
129
130
131
132
133
/*
 * Drop the reference held on each of the first @npages entries of
 * @pages, then free the @pages array itself.  When @do_dirty is
 * non-zero, every non-compound page is marked dirty before its
 * reference is dropped.
 */
static void nfs_free_user_pages(struct page **pages, int npages, int do_dirty)
{
	int idx = 0;

	while (idx < npages) {
		struct page *pg = pages[idx++];

		if (do_dirty && !PageCompound(pg))
			set_page_dirty_lock(pg);
		page_cache_release(pg);
	}

	kfree(pages);
}

134
/*
 * Pin the user pages that back the buffer [user_addr, user_addr + size).
 *
 * On success, *pages points to a kmalloc'ed array of page pointers and
 * the number of pages in it is returned.  On failure *pages is NULL and
 * a negative errno is returned: -ENOMEM when the array cannot be
 * allocated, -EFAULT when fewer pages than needed could be pinned, or
 * the error from get_user_pages() itself.
 */
static inline int nfs_get_user_pages(int rw, unsigned long user_addr, size_t size, struct page ***pages)
{
	unsigned long first = user_addr >> PAGE_SHIFT;
	unsigned long last = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	unsigned long needed = last - first;
	struct page **vec;
	int got;

	vec = kmalloc(needed * sizeof(struct page *), GFP_KERNEL);
	*pages = vec;
	if (!vec)
		return -ENOMEM;

	down_read(&current->mm->mmap_sem);
	got = get_user_pages(current, current->mm, user_addr, needed,
				(rw == READ), 0, vec, NULL);
	up_read(&current->mm->mmap_sem);

	if (got == needed)
		return got;

	/*
	 * A short count means the user buffer runs off the end of a
	 * mapping: release what was pinned and report EFAULT.  A
	 * negative count pinned nothing, so only the array is freed.
	 */
	if (got >= 0) {
		nfs_free_user_pages(vec, got, 0);
		got = -EFAULT;
	} else
		kfree(vec);

	*pages = NULL;
	return got;
}

168
static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
Linus Torvalds's avatar
Linus Torvalds committed
169
{
170
171
172
173
174
175
176
	struct nfs_direct_req *dreq;

	dreq = kmem_cache_alloc(nfs_direct_cachep, SLAB_KERNEL);
	if (!dreq)
		return NULL;

	kref_init(&dreq->kref);
177
	init_completion(&dreq->completion);
178
	INIT_LIST_HEAD(&dreq->list);
179
	INIT_LIST_HEAD(&dreq->rewrite_list);
180
	dreq->iocb = NULL;
181
	dreq->ctx = NULL;
182
183
184
185
	spin_lock_init(&dreq->lock);
	dreq->outstanding = 0;
	dreq->count = 0;
	dreq->error = 0;
186
	dreq->flags = 0;
187
188

	return dreq;
Linus Torvalds's avatar
Linus Torvalds committed
189
190
191
192
193
}

static void nfs_direct_req_release(struct kref *kref)
{
	struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
194
195
196

	if (dreq->ctx != NULL)
		put_nfs_open_context(dreq->ctx);
Linus Torvalds's avatar
Linus Torvalds committed
197
198
199
	kmem_cache_free(nfs_direct_cachep, dreq);
}

200
201
202
203
204
/*
 * Collects and returns the final error value/byte-count.
 */
static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq)
{
205
	ssize_t result = -EIOCBQUEUED;
206
207
208
209
210

	/* Async requests don't wait here */
	if (dreq->iocb)
		goto out;

211
	result = wait_for_completion_interruptible(&dreq->completion);
212
213

	if (!result)
214
		result = dreq->error;
215
	if (!result)
216
		result = dreq->count;
217
218
219
220
221
222

out:
	kref_put(&dreq->kref, nfs_direct_req_release);
	return (ssize_t) result;
}

223
224
225
226
/*
 * We must hold a reference to all the pages in this direct read request
 * until the RPCs complete.  This could be long *after* we are woken up in
 * nfs_direct_wait (for instance, if someone hits ^C on a slow server).
Linus Torvalds's avatar
Linus Torvalds committed
227
 *
228
229
230
231
232
233
234
235
236
 * In addition, synchronous I/O uses a stack-allocated iocb.  Thus we
 * can't trust the iocb is still valid here if this is a synchronous
 * request.  If the waiter is woken prematurely, the iocb is long gone.
 */
static void nfs_direct_complete(struct nfs_direct_req *dreq)
{
	nfs_free_user_pages(dreq->pages, dreq->npages, 1);

	if (dreq->iocb) {
237
		long res = (long) dreq->error;
238
		if (!res)
239
			res = (long) dreq->count;
240
		aio_complete(dreq->iocb, res, 0);
241
242
	}
	complete_all(&dreq->completion);
243
244
245
246

	kref_put(&dreq->kref, nfs_direct_req_release);
}

247
/*
Linus Torvalds's avatar
Linus Torvalds committed
248
249
250
251
 * Note we also set the number of requests we have in the dreq when we are
 * done.  This prevents races with I/O completion so we will always wait
 * until all requests have been dispatched and completed.
 */
252
static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, size_t rsize)
Linus Torvalds's avatar
Linus Torvalds committed
253
254
255
{
	struct list_head *list;
	struct nfs_direct_req *dreq;
256
	unsigned int rpages = (rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
Linus Torvalds's avatar
Linus Torvalds committed
257

258
	dreq = nfs_direct_req_alloc();
Linus Torvalds's avatar
Linus Torvalds committed
259
260
261
262
263
	if (!dreq)
		return NULL;

	list = &dreq->list;
	for(;;) {
264
		struct nfs_read_data *data = nfs_readdata_alloc(rpages);
Linus Torvalds's avatar
Linus Torvalds committed
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280

		if (unlikely(!data)) {
			while (!list_empty(list)) {
				data = list_entry(list->next,
						  struct nfs_read_data, pages);
				list_del(&data->pages);
				nfs_readdata_free(data);
			}
			kref_put(&dreq->kref, nfs_direct_req_release);
			return NULL;
		}

		INIT_LIST_HEAD(&data->pages);
		list_add(&data->pages, list);

		data->req = (struct nfs_page *) dreq;
281
		dreq->outstanding++;
Linus Torvalds's avatar
Linus Torvalds committed
282
283
284
285
286
287
288
289
		if (nbytes <= rsize)
			break;
		nbytes -= rsize;
	}
	kref_get(&dreq->kref);
	return dreq;
}

290
static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
Linus Torvalds's avatar
Linus Torvalds committed
291
{
292
	struct nfs_read_data *data = calldata;
Linus Torvalds's avatar
Linus Torvalds committed
293
294
	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;

295
296
	if (nfs_readpage_result(task, data) != 0)
		return;
297
298
299

	spin_lock(&dreq->lock);

300
	if (likely(task->tk_status >= 0))
301
		dreq->count += data->res.count;
Linus Torvalds's avatar
Linus Torvalds committed
302
	else
303
		dreq->error = task->tk_status;
Linus Torvalds's avatar
Linus Torvalds committed
304

305
306
307
	if (--dreq->outstanding) {
		spin_unlock(&dreq->lock);
		return;
Linus Torvalds's avatar
Linus Torvalds committed
308
309
	}

310
311
	spin_unlock(&dreq->lock);
	nfs_direct_complete(dreq);
Linus Torvalds's avatar
Linus Torvalds committed
312
313
}

314
315
316
317
318
/* RPC callbacks attached to every direct READ task by nfs_direct_read_schedule() */
static const struct rpc_call_ops nfs_read_direct_ops = {
	.rpc_call_done = nfs_direct_read_result,	/* accounts the reply against the dreq */
	.rpc_release = nfs_readdata_release,		/* disposes of the nfs_read_data */
};

319
/*
Linus Torvalds's avatar
Linus Torvalds committed
320
321
322
 * For each nfs_read_data struct that was allocated on the list, dispatch
 * an NFS READ operation
 */
323
static void nfs_direct_read_schedule(struct nfs_direct_req *dreq)
Linus Torvalds's avatar
Linus Torvalds committed
324
{
325
326
	struct nfs_open_context *ctx = dreq->ctx;
	struct inode *inode = ctx->dentry->d_inode;
Linus Torvalds's avatar
Linus Torvalds committed
327
328
	struct list_head *list = &dreq->list;
	struct page **pages = dreq->pages;
329
330
	size_t count = dreq->user_count;
	loff_t pos = dreq->pos;
331
	size_t rsize = NFS_SERVER(inode)->rsize;
Linus Torvalds's avatar
Linus Torvalds committed
332
333
334
	unsigned int curpage, pgbase;

	curpage = 0;
335
	pgbase = dreq->user_addr & ~PAGE_MASK;
Linus Torvalds's avatar
Linus Torvalds committed
336
337
	do {
		struct nfs_read_data *data;
338
		size_t bytes;
Linus Torvalds's avatar
Linus Torvalds committed
339
340
341
342
343

		bytes = rsize;
		if (count < rsize)
			bytes = count;

344
		BUG_ON(list_empty(list));
Linus Torvalds's avatar
Linus Torvalds committed
345
346
347
348
349
350
351
		data = list_entry(list->next, struct nfs_read_data, pages);
		list_del_init(&data->pages);

		data->inode = inode;
		data->cred = ctx->cred;
		data->args.fh = NFS_FH(inode);
		data->args.context = ctx;
352
		data->args.offset = pos;
Linus Torvalds's avatar
Linus Torvalds committed
353
354
355
356
357
358
359
		data->args.pgbase = pgbase;
		data->args.pages = &pages[curpage];
		data->args.count = bytes;
		data->res.fattr = &data->fattr;
		data->res.eof = 0;
		data->res.count = bytes;

360
361
		rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC,
				&nfs_read_direct_ops, data);
Linus Torvalds's avatar
Linus Torvalds committed
362
363
364
365
366
367
368
369
		NFS_PROTO(inode)->read_setup(data);

		data->task.tk_cookie = (unsigned long) inode;

		lock_kernel();
		rpc_execute(&data->task);
		unlock_kernel();

370
		dfprintk(VFS, "NFS: %5u initiated direct read call (req %s/%Ld, %zu bytes @ offset %Lu)\n",
Linus Torvalds's avatar
Linus Torvalds committed
371
372
373
374
375
376
				data->task.tk_pid,
				inode->i_sb->s_id,
				(long long)NFS_FILEID(inode),
				bytes,
				(unsigned long long)data->args.offset);

377
		pos += bytes;
Linus Torvalds's avatar
Linus Torvalds committed
378
379
380
381
382
383
		pgbase += bytes;
		curpage += pgbase >> PAGE_SHIFT;
		pgbase &= ~PAGE_MASK;

		count -= bytes;
	} while (count != 0);
384
	BUG_ON(!list_empty(list));
Linus Torvalds's avatar
Linus Torvalds committed
385
386
}

387
/*
 * Set up and run one direct read.
 *
 * @iocb: controlling I/O control block
 * @user_addr: start address of the user's buffer
 * @count: number of bytes to read
 * @pos: file offset at which to start
 * @pages: array of pinned pages backing the user's buffer
 * @nr_pages: number of entries in @pages
 *
 * This function takes over @pages: on success they are released by
 * nfs_direct_complete() when the last READ finishes, and on allocation
 * failure they are released here.
 *
 * Returns the value from nfs_direct_wait() (byte count, error, or
 * -EIOCBQUEUED for an asynchronous iocb), or -ENOMEM if the request
 * could not be allocated.
 */
static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos, struct page **pages, unsigned int nr_pages)
{
	ssize_t result;
	sigset_t oldset;
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	struct rpc_clnt *clnt = NFS_CLIENT(inode);
	struct nfs_direct_req *dreq;

	dreq = nfs_direct_read_alloc(count, NFS_SERVER(inode)->rsize);
	if (!dreq) {
		/*
		 * No request exists, so nfs_direct_complete() will never
		 * run; release the pinned pages here or they leak.  No
		 * data was read into them, so they are not dirtied.
		 */
		nfs_free_user_pages(pages, nr_pages, 0);
		return -ENOMEM;
	}

	dreq->user_addr = user_addr;
	dreq->user_count = count;
	dreq->pos = pos;
	dreq->pages = pages;
	dreq->npages = nr_pages;
	dreq->inode = inode;
	dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data);
	/* only remember the iocb for aio; see nfs_direct_complete() */
	if (!is_sync_kiocb(iocb))
		dreq->iocb = iocb;

	nfs_add_stats(inode, NFSIOS_DIRECTREADBYTES, count);
	rpc_clnt_sigmask(clnt, &oldset);
	nfs_direct_read_schedule(dreq);
	result = nfs_direct_wait(dreq);
	rpc_clnt_sigunmask(clnt, &oldset);

	return result;
}

418
static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
Linus Torvalds's avatar
Linus Torvalds committed
419
{
420
421
422
423
424
425
426
	list_splice_init(&dreq->rewrite_list, &dreq->list);
	while (!list_empty(&dreq->list)) {
		struct nfs_write_data *data = list_entry(dreq->list.next, struct nfs_write_data, pages);
		list_del(&data->pages);
		nfs_writedata_release(data);
	}
}
Linus Torvalds's avatar
Linus Torvalds committed
427

428
429
430
431
#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
{
	struct list_head *pos;
Linus Torvalds's avatar
Linus Torvalds committed
432

433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
	list_splice_init(&dreq->rewrite_list, &dreq->list);
	list_for_each(pos, &dreq->list)
		dreq->outstanding++;
	dreq->count = 0;

	nfs_direct_write_schedule(dreq, FLUSH_STABLE);
}

/*
 * rpc_call_done callback for the COMMIT that follows unstable writes.
 *
 * If the COMMIT failed, or the verifier it returned differs from the
 * one saved from the writes, the request is flagged so that
 * nfs_direct_write_complete() resends the writes.
 */
static void nfs_direct_commit_result(struct rpc_task *task, void *calldata)
{
	struct nfs_write_data *cdata = calldata;
	struct nfs_direct_req *dreq = (struct nfs_direct_req *) cdata->req;

	/* Call the NFS version-specific code */
	if (NFS_PROTO(cdata->inode)->commit_done(task, cdata) != 0)
		return;

	if (unlikely(task->tk_status < 0)) {
		dreq->error = task->tk_status;
		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
	}

	if (memcmp(&dreq->verf, &cdata->verf, sizeof(cdata->verf)) != 0) {
		dprintk("NFS: %5u commit verify failed\n", task->tk_pid);
		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
	}

	dprintk("NFS: %5u commit returned %d\n", task->tk_pid, task->tk_status);
	nfs_direct_write_complete(dreq, cdata->inode);
}

462
463
464
465
466
467
/* RPC callbacks attached to the COMMIT task by nfs_direct_commit_schedule() */
static const struct rpc_call_ops nfs_commit_direct_ops = {
	.rpc_call_done = nfs_direct_commit_result,	/* checks status and write verifier */
	.rpc_release = nfs_commit_release,		/* disposes of the commit write_data */
};

static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
Linus Torvalds's avatar
Linus Torvalds committed
468
{
469
	struct nfs_write_data *data = dreq->commit_data;
Linus Torvalds's avatar
Linus Torvalds committed
470

471
	data->inode = dreq->inode;
472
	data->cred = dreq->ctx->cred;
Linus Torvalds's avatar
Linus Torvalds committed
473

474
475
476
477
478
479
	data->args.fh = NFS_FH(data->inode);
	data->args.offset = dreq->pos;
	data->args.count = dreq->user_count;
	data->res.count = 0;
	data->res.fattr = &data->fattr;
	data->res.verf = &data->verf;
Linus Torvalds's avatar
Linus Torvalds committed
480

481
482
483
	rpc_init_task(&data->task, NFS_CLIENT(dreq->inode), RPC_TASK_ASYNC,
				&nfs_commit_direct_ops, data);
	NFS_PROTO(data->inode)->commit_setup(data, 0);
Linus Torvalds's avatar
Linus Torvalds committed
484

485
486
487
488
	data->task.tk_priority = RPC_PRIORITY_NORMAL;
	data->task.tk_cookie = (unsigned long)data->inode;
	/* Note: task.tk_ops->rpc_release will free dreq->commit_data */
	dreq->commit_data = NULL;
Linus Torvalds's avatar
Linus Torvalds committed
489

490
	dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
Linus Torvalds's avatar
Linus Torvalds committed
491

492
493
494
495
	lock_kernel();
	rpc_execute(&data->task);
	unlock_kernel();
}
Linus Torvalds's avatar
Linus Torvalds committed
496

497
498
499
static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
{
	int flags = dreq->flags;
Linus Torvalds's avatar
Linus Torvalds committed
500

501
502
503
504
	dreq->flags = 0;
	switch (flags) {
		case NFS_ODIRECT_DO_COMMIT:
			nfs_direct_commit_schedule(dreq);
Linus Torvalds's avatar
Linus Torvalds committed
505
			break;
506
507
508
509
510
511
512
513
514
515
516
		case NFS_ODIRECT_RESCHED_WRITES:
			nfs_direct_write_reschedule(dreq);
			break;
		default:
			nfs_end_data_update(inode);
			if (dreq->commit_data != NULL)
				nfs_commit_free(dreq->commit_data);
			nfs_direct_free_writedata(dreq);
			nfs_direct_complete(dreq);
	}
}
Linus Torvalds's avatar
Linus Torvalds committed
517

518
519
520
521
522
523
524
525
526
527
528
static void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
{
	dreq->commit_data = nfs_commit_alloc(0);
	if (dreq->commit_data != NULL)
		dreq->commit_data->req = (struct nfs_page *) dreq;
}
#else
/*
 * Variant used when neither CONFIG_NFS_V3 nor CONFIG_NFS_V4 is defined:
 * no commit data is allocated, so dreq->commit_data is always NULL.
 */
static inline void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
{
	dreq->commit_data = NULL;
}
Linus Torvalds's avatar
Linus Torvalds committed
529

530
531
532
533
534
535
536
/*
 * Variant used when neither CONFIG_NFS_V3 nor CONFIG_NFS_V4 is defined:
 * there is no commit or resend step, so the request is finished as soon
 * as all WRITEs have been released.
 */
static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
{
	nfs_end_data_update(inode);
	nfs_direct_free_writedata(dreq);	/* release all queued and saved write_data */
	nfs_direct_complete(dreq);		/* release pages, signal completion, drop a reference */
}
#endif
Linus Torvalds's avatar
Linus Torvalds committed
537

538
/*
 * Allocate a direct write request together with one nfs_write_data
 * struct per wsize-sized piece of @nbytes, queued on dreq->list, and
 * try to preallocate the commit data.
 *
 * dreq->outstanding is set to the number of queued structs.  Returns the
 * request holding an extra reference taken with kref_get(), or NULL if
 * a write_data allocation fails (everything allocated so far is freed).
 */
static struct nfs_direct_req *nfs_direct_write_alloc(size_t nbytes, size_t wsize)
{
	unsigned int wpages = (wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	struct nfs_direct_req *dreq = nfs_direct_req_alloc();
	struct nfs_write_data *wdata;

	if (dreq == NULL)
		return NULL;

	for (;;) {
		wdata = nfs_writedata_alloc(wpages);
		if (unlikely(wdata == NULL))
			goto out_nomem;

		INIT_LIST_HEAD(&wdata->pages);
		list_add(&wdata->pages, &dreq->list);

		/* data->req is reused to carry the owning direct request */
		wdata->req = (struct nfs_page *) dreq;
		dreq->outstanding++;

		if (nbytes <= wsize)
			break;
		nbytes -= wsize;
	}

	nfs_alloc_commit_data(dreq);

	kref_get(&dreq->kref);
	return dreq;

out_nomem:
	while (!list_empty(&dreq->list)) {
		wdata = list_entry(dreq->list.next, struct nfs_write_data, pages);
		list_del(&wdata->pages);
		nfs_writedata_free(wdata);
	}
	kref_put(&dreq->kref, nfs_direct_req_release);
	return NULL;
}

579
static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
Linus Torvalds's avatar
Linus Torvalds committed
580
{
581
582
583
584
585
586
587
	struct nfs_write_data *data = calldata;
	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
	int status = task->tk_status;

	if (nfs_writeback_done(task, data) != 0)
		return;

588
	spin_lock(&dreq->lock);
Linus Torvalds's avatar
Linus Torvalds committed
589

590
	if (likely(status >= 0))
591
		dreq->count += data->res.count;
592
	else
593
		dreq->error = task->tk_status;
Linus Torvalds's avatar
Linus Torvalds committed
594

595
596
597
598
599
	if (data->res.verf->committed != NFS_FILE_SYNC) {
		switch (dreq->flags) {
			case 0:
				memcpy(&dreq->verf, &data->verf, sizeof(dreq->verf));
				dreq->flags = NFS_ODIRECT_DO_COMMIT;
Linus Torvalds's avatar
Linus Torvalds committed
600
				break;
601
602
603
604
605
			case NFS_ODIRECT_DO_COMMIT:
				if (memcmp(&dreq->verf, &data->verf, sizeof(dreq->verf))) {
					dprintk("NFS: %5u write verify failed\n", task->tk_pid);
					dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
				}
Linus Torvalds's avatar
Linus Torvalds committed
606
607
		}
	}
608
609
610
611
	/* In case we have to resend */
	data->args.stable = NFS_FILE_SYNC;

	spin_unlock(&dreq->lock);
Linus Torvalds's avatar
Linus Torvalds committed
612
613
}

614
615
616
/*
 * NB: Return the value of the first error return code.  Subsequent
 *     errors after the first one are ignored.
Linus Torvalds's avatar
Linus Torvalds committed
617
 */
618
static void nfs_direct_write_release(void *calldata)
Linus Torvalds's avatar
Linus Torvalds committed
619
{
620
621
	struct nfs_write_data *data = calldata;
	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
Linus Torvalds's avatar
Linus Torvalds committed
622

623
	spin_lock(&dreq->lock);
624
625
626
	if (--dreq->outstanding) {
		spin_unlock(&dreq->lock);
		return;
Linus Torvalds's avatar
Linus Torvalds committed
627
	}
628
629
	spin_unlock(&dreq->lock);

630
	nfs_direct_write_complete(dreq, data->inode);
631
632
633
634
}

static const struct rpc_call_ops nfs_write_direct_ops = {
	.rpc_call_done = nfs_direct_write_result,
635
	.rpc_release = nfs_direct_write_release,
636
637
638
639
640
641
};

/*
 * For each nfs_write_data struct that was allocated on the list, dispatch
 * an NFS WRITE operation
 */
642
static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, int sync)
643
{
644
645
	struct nfs_open_context *ctx = dreq->ctx;
	struct inode *inode = ctx->dentry->d_inode;
646
647
	struct list_head *list = &dreq->list;
	struct page **pages = dreq->pages;
648
649
	size_t count = dreq->user_count;
	loff_t pos = dreq->pos;
650
651
	size_t wsize = NFS_SERVER(inode)->wsize;
	unsigned int curpage, pgbase;
Linus Torvalds's avatar
Linus Torvalds committed
652
653

	curpage = 0;
654
	pgbase = dreq->user_addr & ~PAGE_MASK;
Linus Torvalds's avatar
Linus Torvalds committed
655
	do {
656
657
658
659
660
661
662
		struct nfs_write_data *data;
		size_t bytes;

		bytes = wsize;
		if (count < wsize)
			bytes = count;

663
		BUG_ON(list_empty(list));
664
		data = list_entry(list->next, struct nfs_write_data, pages);
665
		list_move_tail(&data->pages, &dreq->rewrite_list);
666
667
668
669
670

		data->inode = inode;
		data->cred = ctx->cred;
		data->args.fh = NFS_FH(inode);
		data->args.context = ctx;
671
		data->args.offset = pos;
672
673
674
675
676
		data->args.pgbase = pgbase;
		data->args.pages = &pages[curpage];
		data->args.count = bytes;
		data->res.fattr = &data->fattr;
		data->res.count = bytes;
677
		data->res.verf = &data->verf;
678
679
680

		rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC,
				&nfs_write_direct_ops, data);
681
		NFS_PROTO(inode)->write_setup(data, sync);
Linus Torvalds's avatar
Linus Torvalds committed
682

683
684
		data->task.tk_priority = RPC_PRIORITY_NORMAL;
		data->task.tk_cookie = (unsigned long) inode;
Linus Torvalds's avatar
Linus Torvalds committed
685
686

		lock_kernel();
687
		rpc_execute(&data->task);
Linus Torvalds's avatar
Linus Torvalds committed
688
689
		unlock_kernel();

690
		dfprintk(VFS, "NFS: %5u initiated direct write call (req %s/%Ld, %zu bytes @ offset %Lu)\n",
691
692
693
694
695
				data->task.tk_pid,
				inode->i_sb->s_id,
				(long long)NFS_FILEID(inode),
				bytes,
				(unsigned long long)data->args.offset);
Linus Torvalds's avatar
Linus Torvalds committed
696

697
		pos += bytes;
698
699
700
		pgbase += bytes;
		curpage += pgbase >> PAGE_SHIFT;
		pgbase &= ~PAGE_MASK;
Linus Torvalds's avatar
Linus Torvalds committed
701

702
703
		count -= bytes;
	} while (count != 0);
704
	BUG_ON(!list_empty(list));
705
}
Linus Torvalds's avatar
Linus Torvalds committed
706

707
/*
 * Set up and run one direct write.
 *
 * @iocb: controlling I/O control block
 * @user_addr: start address of the user's buffer
 * @count: number of bytes to write
 * @pos: file offset at which to start
 * @pages: array of pinned pages backing the user's buffer
 * @nr_pages: number of entries in @pages
 *
 * This function takes over @pages: on success they are released by
 * nfs_direct_complete() when the request finishes, and on allocation
 * failure they are released here.
 *
 * The writes are sent with FLUSH_STABLE when no commit data could be
 * allocated or when the request is smaller than wsize.
 *
 * Returns the value from nfs_direct_wait() (byte count, error, or
 * -EIOCBQUEUED for an asynchronous iocb), or -ENOMEM if the request
 * could not be allocated.
 */
static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos, struct page **pages, int nr_pages)
{
	ssize_t result;
	sigset_t oldset;
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	struct rpc_clnt *clnt = NFS_CLIENT(inode);
	struct nfs_direct_req *dreq;
	size_t wsize = NFS_SERVER(inode)->wsize;
	int sync = 0;

	dreq = nfs_direct_write_alloc(count, wsize);
	if (!dreq) {
		/*
		 * No request exists, so nfs_direct_complete() will never
		 * run; release the pinned pages here or they leak.
		 */
		nfs_free_user_pages(pages, nr_pages, 0);
		return -ENOMEM;
	}
	if (dreq->commit_data == NULL || count < wsize)
		sync = FLUSH_STABLE;

	dreq->user_addr = user_addr;
	dreq->user_count = count;
	dreq->pos = pos;
	dreq->pages = pages;
	dreq->npages = nr_pages;
	dreq->inode = inode;
	dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data);
	/* only remember the iocb for aio; see nfs_direct_complete() */
	if (!is_sync_kiocb(iocb))
		dreq->iocb = iocb;

	nfs_add_stats(inode, NFSIOS_DIRECTWRITTENBYTES, count);

	/* paired with nfs_end_data_update() in nfs_direct_write_complete() */
	nfs_begin_data_update(inode);

	rpc_clnt_sigmask(clnt, &oldset);
	nfs_direct_write_schedule(dreq, sync);
	result = nfs_direct_wait(dreq);
	rpc_clnt_sigunmask(clnt, &oldset);

	return result;
}

/**
 * nfs_file_direct_read - file direct read operation for NFS files
 * @iocb: target I/O control block
 * @buf: user's buffer into which to read data
749
750
 * @count: number of bytes to read
 * @pos: byte offset in file where reading starts
Linus Torvalds's avatar
Linus Torvalds committed
751
752
753
754
755
756
 *
 * We use this function for direct reads instead of calling
 * generic_file_aio_read() in order to avoid gfar's check to see if
 * the request starts before the end of the file.  For that check
 * to work, we must generate a GETATTR before each direct read, and
 * even then there is a window between the GETATTR and the subsequent
757
 * READ where the file size could change.  Our preference is simply
Linus Torvalds's avatar
Linus Torvalds committed
758
759
 * to do all reads the application wants, and the server will take
 * care of managing the end of file boundary.
760
 *
Linus Torvalds's avatar
Linus Torvalds committed
761
762
763
764
765
 * This function also eliminates unnecessarily updating the file's
 * atime locally, as the NFS server sets the file's atime, and this
 * client must read the updated atime from the server back into its
 * cache.
 */
766
ssize_t nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
Linus Torvalds's avatar
Linus Torvalds committed
767
768
{
	ssize_t retval = -EINVAL;
769
770
	int page_count;
	struct page **pages;
Linus Torvalds's avatar
Linus Torvalds committed
771
772
773
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;

774
	dprintk("nfs: direct read(%s/%s, %lu@%Ld)\n",
775
776
		file->f_dentry->d_parent->d_name.name,
		file->f_dentry->d_name.name,
777
		(unsigned long) count, (long long) pos);
Linus Torvalds's avatar
Linus Torvalds committed
778
779
780
781

	if (count < 0)
		goto out;
	retval = -EFAULT;
782
	if (!access_ok(VERIFY_WRITE, buf, count))
Linus Torvalds's avatar
Linus Torvalds committed
783
784
785
786
787
		goto out;
	retval = 0;
	if (!count)
		goto out;

788
789
790
	retval = nfs_sync_mapping(mapping);
	if (retval)
		goto out;
Linus Torvalds's avatar
Linus Torvalds committed
791

792
	retval = nfs_get_user_pages(READ, (unsigned long) buf,
793
						count, &pages);
794
	if (retval < 0)
795
		goto out;
796
	page_count = retval;
797

798
	retval = nfs_direct_read(iocb, (unsigned long) buf, count, pos,
799
						pages, page_count);
Linus Torvalds's avatar
Linus Torvalds committed
800
	if (retval > 0)
801
		iocb->ki_pos = pos + retval;
Linus Torvalds's avatar
Linus Torvalds committed
802
803
804
805
806
807
808
809
810

out:
	return retval;
}

/**
 * nfs_file_direct_write - file direct write operation for NFS files
 * @iocb: target I/O control block
 * @buf: user's buffer from which to write data
811
812
 * @count: number of bytes to write
 * @pos: byte offset in file where writing starts
Linus Torvalds's avatar
Linus Torvalds committed
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
 *
 * We use this function for direct writes instead of calling
 * generic_file_aio_write() in order to avoid taking the inode
 * semaphore and updating the i_size.  The NFS server will set
 * the new i_size and this client must read the updated size
 * back into its cache.  We let the server do generic write
 * parameter checking and report problems.
 *
 * We also avoid an unnecessary invocation of generic_osync_inode(),
 * as it is fairly meaningless to sync the metadata of an NFS file.
 *
 * We eliminate local atime updates, see direct read above.
 *
 * We avoid unnecessary page cache invalidations for normal cached
 * readers of this file.
 *
 * Note that O_APPEND is not supported for NFS direct writes, as there
 * is no atomic O_APPEND write facility in the NFS protocol.
 */
832
ssize_t nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos)
Linus Torvalds's avatar
Linus Torvalds committed
833
{
834
	ssize_t retval;
835
836
	int page_count;
	struct page **pages;
Linus Torvalds's avatar
Linus Torvalds committed
837
838
839
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;

840
	dfprintk(VFS, "nfs: direct write(%s/%s, %lu@%Ld)\n",
841
		file->f_dentry->d_parent->d_name.name,
842
843
		file->f_dentry->d_name.name,
		(unsigned long) count, (long long) pos);
Linus Torvalds's avatar
Linus Torvalds committed
844

845
846
	retval = generic_write_checks(file, &pos, &count, 0);
	if (retval)
Linus Torvalds's avatar
Linus Torvalds committed
847
		goto out;
848
849
850

	retval = -EINVAL;
	if ((ssize_t) count < 0)
Linus Torvalds's avatar
Linus Torvalds committed
851
852
853
854
		goto out;
	retval = 0;
	if (!count)
		goto out;
855
856

	retval = -EFAULT;
857
	if (!access_ok(VERIFY_READ, buf, count))
858
		goto out;
Linus Torvalds's avatar
Linus Torvalds committed
859

860
861
862
	retval = nfs_sync_mapping(mapping);
	if (retval)
		goto out;
Linus Torvalds's avatar
Linus Torvalds committed
863

864
	retval = nfs_get_user_pages(WRITE, (unsigned long) buf,
865
						count, &pages);
866
	if (retval < 0)
867
		goto out;
868
	page_count = retval;
869

870
	retval = nfs_direct_write(iocb, (unsigned long) buf, count,
871
					pos, pages, page_count);
872
873
874
875
876
877
878
879
880

	/*
	 * XXX: nfs_end_data_update() already ensures this file's
	 *      cached data is subsequently invalidated.  Do we really
	 *      need to call invalidate_inode_pages2() again here?
	 *
	 *      For aio writes, this invalidation will almost certainly
	 *      occur before the writes complete.  Kind of racey.
	 */
Linus Torvalds's avatar
Linus Torvalds committed
881
882
	if (mapping->nrpages)
		invalidate_inode_pages2(mapping);
883

Linus Torvalds's avatar
Linus Torvalds committed
884
	if (retval > 0)
885
		iocb->ki_pos = pos + retval;
Linus Torvalds's avatar
Linus Torvalds committed
886
887
888
889
890

out:
	return retval;
}

891
892
893
894
/**
 * nfs_init_directcache - create a slab cache for nfs_direct_req structures
 *
 */
Linus Torvalds's avatar
Linus Torvalds committed
895
896
897
898
int nfs_init_directcache(void)
{
	nfs_direct_cachep = kmem_cache_create("nfs_direct_cache",
						sizeof(struct nfs_direct_req),
899
900
						0, (SLAB_RECLAIM_ACCOUNT|
							SLAB_MEM_SPREAD),
Linus Torvalds's avatar
Linus Torvalds committed
901
902
903
904
905
906
907
						NULL, NULL);
	if (nfs_direct_cachep == NULL)
		return -ENOMEM;

	return 0;
}

908
909
910
911
/**
 * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures
 *
 */
Linus Torvalds's avatar
Linus Torvalds committed
912
913
914
915
916
void nfs_destroy_directcache(void)
{
	/* a non-zero result from kmem_cache_destroy() is reported, not acted on */
	int leftover = kmem_cache_destroy(nfs_direct_cachep);

	if (leftover)
		printk(KERN_INFO "nfs_direct_cache: not all structures were freed\n");
}