direct.c 20.1 KB
Newer Older
Linus Torvalds's avatar
Linus Torvalds committed
1
2
3
4
5
6
7
8
9
/*
 * linux/fs/nfs/direct.c
 *
 * Copyright (C) 2003 by Chuck Lever <cel@netapp.com>
 *
 * High-performance uncached I/O for the Linux NFS client
 *
 * There are important applications whose performance or correctness
 * depends on uncached access to file data.  Database clusters
 * (multiple copies of the same instance running on separate hosts)
 * implement their own cache coherency protocol that subsumes file
 * system cache protocols.  Applications that process datasets
 * considerably larger than the client's memory do not always benefit
 * from a local cache.  A streaming video server, for instance, has no
 * need to cache the contents of a file.
 *
 * When an application requests uncached I/O, all read and write requests
 * are made directly to the server; data stored or fetched via these
 * requests is not cached in the Linux page cache.  The client does not
 * correct unaligned requests from applications.  All requested bytes are
 * held on permanent storage before a direct write system call returns to
 * an application.
 *
 * Solaris implements an uncached I/O facility called directio() that
 * is used for backups and sequential I/O to very large files.  Solaris
 * also supports uncaching whole NFS partitions with "-o forcedirectio,"
 * an undocumented mount option.
 *
 * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with
 * help from Andrew Morton.
 *
 * 18 Dec 2001	Initial implementation for 2.4  --cel
 * 08 Jul 2002	Version for 2.4.19, with bug fixes --trondmy
 * 08 Jun 2003	Port to 2.5 APIs  --cel
 * 31 Mar 2004	Handle direct I/O without VFS support  --cel
 * 15 Sep 2004	Parallel async reads  --cel
 * 04 May 2005	support O_DIRECT with aio  --cel
 *
 */

#include <linux/config.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/smp_lock.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/kref.h>

#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/sunrpc/clnt.h>

#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/atomic.h>

58
59
#include "iostat.h"

Linus Torvalds's avatar
Linus Torvalds committed
60
61
#define NFSDBG_FACILITY		NFSDBG_VFS

62
static void nfs_free_user_pages(struct page **pages, int npages, int do_dirty);
Linus Torvalds's avatar
Linus Torvalds committed
63
64
65
66
67
68
69
static kmem_cache_t *nfs_direct_cachep;

/*
 * This represents a set of asynchronous requests that we're waiting on
 */
struct nfs_direct_req {
	struct kref		kref;		/* release manager */
70
71

	/* I/O parameters */
72
	struct list_head	list;		/* nfs_read/write_data structs */
73
74
	struct file *		filp;		/* file descriptor */
	struct kiocb *		iocb;		/* controlling i/o request */
Linus Torvalds's avatar
Linus Torvalds committed
75
	wait_queue_head_t	wait;		/* wait for i/o completion */
76
	struct inode *		inode;		/* target file of i/o */
Linus Torvalds's avatar
Linus Torvalds committed
77
78
	struct page **		pages;		/* pages in our buffer */
	unsigned int		npages;		/* count of pages */
79
80
81
82
83

	/* completion state */
	spinlock_t		lock;		/* protect completion state */
	int			outstanding;	/* i/os we're waiting for */
	ssize_t			count,		/* bytes actually processed */
Linus Torvalds's avatar
Linus Torvalds committed
84
85
86
				error;		/* any reported error */
};

87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
/**
 * nfs_direct_IO - NFS address space operation for direct I/O
 * @rw: direction (read or write)
 * @iocb: target I/O control block
 * @iov: array of vectors that define I/O buffer
 * @pos: offset in file to begin the operation
 * @nr_segs: size of iovec array
 *
 * Placeholder only: it performs no I/O, logs the attempt through dprintk()
 * and always fails with -EINVAL.  Having it in the address space ops vector
 * signals that the NFS client supports direct I/O; direct reads and writes
 * are shunted off before the VFS gets them, so this method should never
 * be called.
 */
ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs)
{
	struct file *filp = iocb->ki_filp;

	dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n",
			filp->f_dentry->d_name.name, (long long) pos, nr_segs);

	return -EINVAL;
}

110
/*
 * Pin the user pages spanned by [user_addr, user_addr + size).
 *
 * @rw:        READ or WRITE; pages are requested writable when rw == READ
 *             (a read from the server stores into the user buffer).
 * @user_addr: start of the user buffer
 * @size:      length of the user buffer in bytes
 * @pages:     out: kmalloc'ed array of the pinned pages
 *
 * Returns the number of pages pinned, or a negative errno:
 *   -ENOMEM  the page array could not be allocated (*pages is NULL)
 *   -EFAULT  fewer pages than requested were pinned; those pages and the
 *            array have already been released (*pages is NULL)
 *   other    whatever get_user_pages() returned; the array is still
 *            allocated in *pages and is left for the caller to free.
 */
static inline int nfs_get_user_pages(int rw, unsigned long user_addr, size_t size, struct page ***pages)
{
	unsigned long first_page = user_addr >> PAGE_SHIFT;
	unsigned long end_page = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	unsigned long nr_wanted = end_page - first_page;
	int pinned;

	*pages = kmalloc(nr_wanted * sizeof(struct page *), GFP_KERNEL);
	if (!*pages)
		return -ENOMEM;

	down_read(&current->mm->mmap_sem);
	pinned = get_user_pages(current, current->mm, user_addr,
				nr_wanted, (rw == READ), 0,
				*pages, NULL);
	up_read(&current->mm->mmap_sem);

	/*
	 * A short count means the user buffer runs off the end of a
	 * mapping: drop what we did pin and report EFAULT.
	 */
	if (pinned >= 0 && pinned < nr_wanted) {
		nfs_free_user_pages(*pages, pinned, 0);
		*pages = NULL;
		return -EFAULT;
	}

	return pinned;
}

140
/*
 * Release @npages entries of @pages and then kfree() the array itself.
 * When @do_dirty is non-zero, every page that is not a compound page is
 * marked dirty before its reference is dropped.
 */
static void nfs_free_user_pages(struct page **pages, int npages, int do_dirty)
{
	int idx = 0;

	while (idx < npages) {
		struct page *pg = pages[idx++];

		if (do_dirty && !PageCompound(pg))
			set_page_dirty_lock(pg);
		page_cache_release(pg);
	}

	kfree(pages);
}

152
153
154
155
156
157
158
159
160
161
162
163
static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
{
	struct nfs_direct_req *dreq;

	dreq = kmem_cache_alloc(nfs_direct_cachep, SLAB_KERNEL);
	if (!dreq)
		return NULL;

	kref_init(&dreq->kref);
	init_waitqueue_head(&dreq->wait);
	INIT_LIST_HEAD(&dreq->list);
	dreq->iocb = NULL;
164
165
166
167
	spin_lock_init(&dreq->lock);
	dreq->outstanding = 0;
	dreq->count = 0;
	dreq->error = 0;
168
169
170
171

	return dreq;
}

Linus Torvalds's avatar
Linus Torvalds committed
172
173
174
175
176
177
static void nfs_direct_req_release(struct kref *kref)
{
	struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
	kmem_cache_free(nfs_direct_cachep, dreq);
}

178
179
180
181
182
/*
 * Collects and returns the final error value/byte-count.
 */
static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq)
{
183
	ssize_t result = -EIOCBQUEUED;
184
185
186
187
188

	/* Async requests don't wait here */
	if (dreq->iocb)
		goto out;

189
	result = wait_event_interruptible(dreq->wait, (dreq->outstanding == 0));
190
191

	if (!result)
192
		result = dreq->error;
193
	if (!result)
194
		result = dreq->count;
195
196
197
198
199
200

out:
	kref_put(&dreq->kref, nfs_direct_req_release);
	return (ssize_t) result;
}

201
202
203
204
205
206
207
208
209
210
211
212
213
214
/*
 * We must hold a reference to all the pages in this direct read request
 * until the RPCs complete.  This could be long *after* we are woken up in
 * nfs_direct_wait (for instance, if someone hits ^C on a slow server).
 *
 * In addition, synchronous I/O uses a stack-allocated iocb.  Thus we
 * can't trust the iocb is still valid here if this is a synchronous
 * request.  If the waiter is woken prematurely, the iocb is long gone.
 */
static void nfs_direct_complete(struct nfs_direct_req *dreq)
{
	nfs_free_user_pages(dreq->pages, dreq->npages, 1);

	if (dreq->iocb) {
215
		long res = (long) dreq->error;
216
		if (!res)
217
			res = (long) dreq->count;
218
219
220
221
		aio_complete(dreq->iocb, res, 0);
	} else
		wake_up(&dreq->wait);

222
	iput(dreq->inode);
223
224
225
	kref_put(&dreq->kref, nfs_direct_req_release);
}

226
/*
Linus Torvalds's avatar
Linus Torvalds committed
227
228
229
230
 * Note we also set the number of requests we have in the dreq when we are
 * done.  This prevents races with I/O completion so we will always wait
 * until all requests have been dispatched and completed.
 */
231
static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, size_t rsize)
Linus Torvalds's avatar
Linus Torvalds committed
232
233
234
{
	struct list_head *list;
	struct nfs_direct_req *dreq;
235
	unsigned int rpages = (rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
Linus Torvalds's avatar
Linus Torvalds committed
236

237
	dreq = nfs_direct_req_alloc();
Linus Torvalds's avatar
Linus Torvalds committed
238
239
240
241
242
	if (!dreq)
		return NULL;

	list = &dreq->list;
	for(;;) {
243
		struct nfs_read_data *data = nfs_readdata_alloc(rpages);
Linus Torvalds's avatar
Linus Torvalds committed
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259

		if (unlikely(!data)) {
			while (!list_empty(list)) {
				data = list_entry(list->next,
						  struct nfs_read_data, pages);
				list_del(&data->pages);
				nfs_readdata_free(data);
			}
			kref_put(&dreq->kref, nfs_direct_req_release);
			return NULL;
		}

		INIT_LIST_HEAD(&data->pages);
		list_add(&data->pages, list);

		data->req = (struct nfs_page *) dreq;
260
		dreq->outstanding++;
Linus Torvalds's avatar
Linus Torvalds committed
261
262
263
264
265
266
267
268
		if (nbytes <= rsize)
			break;
		nbytes -= rsize;
	}
	kref_get(&dreq->kref);
	return dreq;
}

269
static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
Linus Torvalds's avatar
Linus Torvalds committed
270
{
271
	struct nfs_read_data *data = calldata;
Linus Torvalds's avatar
Linus Torvalds committed
272
273
	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;

274
275
	if (nfs_readpage_result(task, data) != 0)
		return;
276
277
278

	spin_lock(&dreq->lock);

279
	if (likely(task->tk_status >= 0))
280
		dreq->count += data->res.count;
Linus Torvalds's avatar
Linus Torvalds committed
281
	else
282
283
284
285
286
287
		dreq->error = task->tk_status;

	if (--dreq->outstanding) {
		spin_unlock(&dreq->lock);
		return;
	}
Linus Torvalds's avatar
Linus Torvalds committed
288

289
290
	spin_unlock(&dreq->lock);
	nfs_direct_complete(dreq);
Linus Torvalds's avatar
Linus Torvalds committed
291
292
}

293
294
295
296
297
/* RPC callbacks attached to every direct READ task */
static const struct rpc_call_ops nfs_read_direct_ops = {
	.rpc_release = nfs_readdata_release,
	.rpc_call_done = nfs_direct_read_result,
};

298
/*
Linus Torvalds's avatar
Linus Torvalds committed
299
300
301
 * For each nfs_read_data struct that was allocated on the list, dispatch
 * an NFS READ operation
 */
302
static void nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos)
Linus Torvalds's avatar
Linus Torvalds committed
303
{
304
305
306
307
	struct file *file = dreq->filp;
	struct inode *inode = file->f_mapping->host;
	struct nfs_open_context *ctx = (struct nfs_open_context *)
							file->private_data;
Linus Torvalds's avatar
Linus Torvalds committed
308
309
	struct list_head *list = &dreq->list;
	struct page **pages = dreq->pages;
310
	size_t rsize = NFS_SERVER(inode)->rsize;
Linus Torvalds's avatar
Linus Torvalds committed
311
312
313
314
315
316
	unsigned int curpage, pgbase;

	curpage = 0;
	pgbase = user_addr & ~PAGE_MASK;
	do {
		struct nfs_read_data *data;
317
		size_t bytes;
Linus Torvalds's avatar
Linus Torvalds committed
318
319
320
321
322
323
324
325
326
327
328
329

		bytes = rsize;
		if (count < rsize)
			bytes = count;

		data = list_entry(list->next, struct nfs_read_data, pages);
		list_del_init(&data->pages);

		data->inode = inode;
		data->cred = ctx->cred;
		data->args.fh = NFS_FH(inode);
		data->args.context = ctx;
330
		data->args.offset = pos;
Linus Torvalds's avatar
Linus Torvalds committed
331
332
333
334
335
336
337
		data->args.pgbase = pgbase;
		data->args.pages = &pages[curpage];
		data->args.count = bytes;
		data->res.fattr = &data->fattr;
		data->res.eof = 0;
		data->res.count = bytes;

338
339
		rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC,
				&nfs_read_direct_ops, data);
Linus Torvalds's avatar
Linus Torvalds committed
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
		NFS_PROTO(inode)->read_setup(data);

		data->task.tk_cookie = (unsigned long) inode;

		lock_kernel();
		rpc_execute(&data->task);
		unlock_kernel();

		dfprintk(VFS, "NFS: %4d initiated direct read call (req %s/%Ld, %u bytes @ offset %Lu)\n",
				data->task.tk_pid,
				inode->i_sb->s_id,
				(long long)NFS_FILEID(inode),
				bytes,
				(unsigned long long)data->args.offset);

355
		pos += bytes;
Linus Torvalds's avatar
Linus Torvalds committed
356
357
358
359
360
361
362
363
		pgbase += bytes;
		curpage += pgbase >> PAGE_SHIFT;
		pgbase &= ~PAGE_MASK;

		count -= bytes;
	} while (count != 0);
}

364
/*
 * Run one direct read over an already pinned user buffer.
 *
 * @iocb:      controlling I/O request; a non-sync iocb makes this an aio read
 * @user_addr: start of the user buffer
 * @count:     number of bytes to read
 * @pos:       file offset to read from
 * @pages:     pinned user pages, as produced by nfs_get_user_pages()
 * @nr_pages:  number of entries in @pages
 *
 * This function takes over @pages: on the normal path they are released by
 * nfs_direct_complete() once all RPCs have finished, and if the request
 * cannot be allocated they are released here, since the caller does not
 * touch them again.
 *
 * Returns the result of nfs_direct_wait() (byte count, error, or
 * -EIOCBQUEUED for aio), or -ENOMEM if the request could not be allocated.
 */
static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos, struct page **pages, unsigned int nr_pages)
{
	ssize_t result;
	sigset_t oldset;
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	struct rpc_clnt *clnt = NFS_CLIENT(inode);
	struct nfs_direct_req *dreq;

	dreq = nfs_direct_read_alloc(count, NFS_SERVER(inode)->rsize);
	if (!dreq) {
		/* nothing was read into the pages, so no need to dirty them */
		nfs_free_user_pages(pages, nr_pages, 0);
		return -ENOMEM;
	}

	dreq->pages = pages;
	dreq->npages = nr_pages;
	igrab(inode);		/* dropped by iput() in nfs_direct_complete */
	dreq->inode = inode;
	dreq->filp = iocb->ki_filp;
	if (!is_sync_kiocb(iocb))
		dreq->iocb = iocb;

	nfs_add_stats(inode, NFSIOS_DIRECTREADBYTES, count);
	rpc_clnt_sigmask(clnt, &oldset);
	nfs_direct_read_schedule(dreq, user_addr, count, pos);
	result = nfs_direct_wait(dreq);
	rpc_clnt_sigunmask(clnt, &oldset);

	return result;
}

393
/*
 * Allocate a dreq plus one nfs_write_data for every wsize-sized piece of
 * an nbytes-long write (always at least one), queued on dreq->list.
 * dreq->outstanding ends up equal to the number of queued structures.
 *
 * On success an extra reference is taken on the dreq before it is
 * returned.  On any allocation failure everything allocated so far is
 * freed and NULL is returned.
 */
static struct nfs_direct_req *nfs_direct_write_alloc(size_t nbytes, size_t wsize)
{
	unsigned int pages_per_write = (wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	struct nfs_direct_req *dreq = nfs_direct_req_alloc();
	struct nfs_write_data *wdata;

	if (!dreq)
		return NULL;

	while (1) {
		wdata = nfs_writedata_alloc(pages_per_write);
		if (unlikely(!wdata))
			goto out_nomem;

		/* the otherwise unused ->pages list head links it to the dreq */
		INIT_LIST_HEAD(&wdata->pages);
		list_add(&wdata->pages, &dreq->list);

		wdata->req = (struct nfs_page *) dreq;
		dreq->outstanding++;

		if (nbytes <= wsize)
			break;
		nbytes -= wsize;
	}

	kref_get(&dreq->kref);
	return dreq;

out_nomem:
	while (!list_empty(&dreq->list)) {
		wdata = list_entry(dreq->list.next, struct nfs_write_data, pages);
		list_del(&wdata->pages);
		nfs_writedata_free(wdata);
	}
	kref_put(&dreq->kref, nfs_direct_req_release);
	return NULL;
}

431
432
433
434
/*
 * NB: Return the value of the first error return code.  Subsequent
 *     errors after the first one are ignored.
 */
435
436
437
438
439
440
441
442
443
444
445
446
static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
{
	struct nfs_write_data *data = calldata;
	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
	int status = task->tk_status;

	if (nfs_writeback_done(task, data) != 0)
		return;
	/* If the server fell back to an UNSTABLE write, it's an error. */
	if (unlikely(data->res.verf->committed != NFS_FILE_SYNC))
		status = -EIO;

447
448
	spin_lock(&dreq->lock);

449
	if (likely(status >= 0))
450
		dreq->count += data->res.count;
451
	else
452
		dreq->error = status;
453

454
455
456
	if (--dreq->outstanding) {
		spin_unlock(&dreq->lock);
		return;
457
	}
458
459
460
461
462

	spin_unlock(&dreq->lock);

	nfs_end_data_update(data->inode);
	nfs_direct_complete(dreq);
463
464
465
466
467
468
469
470
471
472
473
474
475
476
}

/* RPC callbacks attached to every direct WRITE task */
static const struct rpc_call_ops nfs_write_direct_ops = {
	.rpc_release = nfs_writedata_release,
	.rpc_call_done = nfs_direct_write_result,
};

/*
 * For each nfs_write_data struct that was allocated on the list, dispatch
 * an NFS WRITE operation
 *
 * XXX: For now, support only FILE_SYNC writes.  Later we may add
 *      support for UNSTABLE + COMMIT.
 */
477
static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos)
478
{
479
480
481
482
	struct file *file = dreq->filp;
	struct inode *inode = file->f_mapping->host;
	struct nfs_open_context *ctx = (struct nfs_open_context *)
							file->private_data;
483
484
485
486
	struct list_head *list = &dreq->list;
	struct page **pages = dreq->pages;
	size_t wsize = NFS_SERVER(inode)->wsize;
	unsigned int curpage, pgbase;
Linus Torvalds's avatar
Linus Torvalds committed
487
488

	curpage = 0;
489
	pgbase = user_addr & ~PAGE_MASK;
Linus Torvalds's avatar
Linus Torvalds committed
490
	do {
491
492
493
494
495
496
497
498
499
500
501
502
503
504
		struct nfs_write_data *data;
		size_t bytes;

		bytes = wsize;
		if (count < wsize)
			bytes = count;

		data = list_entry(list->next, struct nfs_write_data, pages);
		list_del_init(&data->pages);

		data->inode = inode;
		data->cred = ctx->cred;
		data->args.fh = NFS_FH(inode);
		data->args.context = ctx;
505
		data->args.offset = pos;
506
507
508
509
510
		data->args.pgbase = pgbase;
		data->args.pages = &pages[curpage];
		data->args.count = bytes;
		data->res.fattr = &data->fattr;
		data->res.count = bytes;
511
		data->res.verf = &data->verf;
512
513
514
515

		rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC,
				&nfs_write_direct_ops, data);
		NFS_PROTO(inode)->write_setup(data, FLUSH_STABLE);
Linus Torvalds's avatar
Linus Torvalds committed
516

517
518
		data->task.tk_priority = RPC_PRIORITY_NORMAL;
		data->task.tk_cookie = (unsigned long) inode;
Linus Torvalds's avatar
Linus Torvalds committed
519
520

		lock_kernel();
521
		rpc_execute(&data->task);
Linus Torvalds's avatar
Linus Torvalds committed
522
523
		unlock_kernel();

524
525
526
527
528
529
		dfprintk(VFS, "NFS: %4d initiated direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n",
				data->task.tk_pid,
				inode->i_sb->s_id,
				(long long)NFS_FILEID(inode),
				bytes,
				(unsigned long long)data->args.offset);
Linus Torvalds's avatar
Linus Torvalds committed
530

531
		pos += bytes;
532
533
534
		pgbase += bytes;
		curpage += pgbase >> PAGE_SHIFT;
		pgbase &= ~PAGE_MASK;
Linus Torvalds's avatar
Linus Torvalds committed
535

536
537
538
		count -= bytes;
	} while (count != 0);
}
Linus Torvalds's avatar
Linus Torvalds committed
539

540
/*
 * Run one direct write from an already pinned user buffer.
 *
 * @iocb:      controlling I/O request; a non-sync iocb makes this an aio write
 * @user_addr: start of the user buffer
 * @count:     number of bytes to write
 * @pos:       file offset to write at
 * @pages:     pinned user pages, as produced by nfs_get_user_pages()
 * @nr_pages:  number of entries in @pages
 *
 * This function takes over @pages: on the normal path they are released by
 * nfs_direct_complete() once all RPCs have finished, and if the request
 * cannot be allocated they are released here, since the caller does not
 * touch them again.
 *
 * Returns the result of nfs_direct_wait() (byte count, error, or
 * -EIOCBQUEUED for aio), or -ENOMEM if the request could not be allocated.
 */
static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos, struct page **pages, int nr_pages)
{
	ssize_t result;
	sigset_t oldset;
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	struct rpc_clnt *clnt = NFS_CLIENT(inode);
	struct nfs_direct_req *dreq;

	dreq = nfs_direct_write_alloc(count, NFS_SERVER(inode)->wsize);
	if (!dreq) {
		/* the pages were only going to be read from; don't dirty them */
		nfs_free_user_pages(pages, nr_pages, 0);
		return -ENOMEM;
	}

	dreq->pages = pages;
	dreq->npages = nr_pages;
	igrab(inode);		/* dropped by iput() in nfs_direct_complete */
	dreq->inode = inode;
	dreq->filp = iocb->ki_filp;
	if (!is_sync_kiocb(iocb))
		dreq->iocb = iocb;

	nfs_add_stats(inode, NFSIOS_DIRECTWRITTENBYTES, count);

	/* paired with nfs_end_data_update() in nfs_direct_write_result */
	nfs_begin_data_update(inode);

	rpc_clnt_sigmask(clnt, &oldset);
	nfs_direct_write_schedule(dreq, user_addr, count, pos);
	result = nfs_direct_wait(dreq);
	rpc_clnt_sigunmask(clnt, &oldset);

	return result;
}

/**
 * nfs_file_direct_read - file direct read operation for NFS files
 * @iocb: target I/O control block
 * @buf: user's buffer into which to read data
576
577
 * @count: number of bytes to read
 * @pos: byte offset in file where reading starts
Linus Torvalds's avatar
Linus Torvalds committed
578
579
580
581
582
583
 *
 * We use this function for direct reads instead of calling
 * generic_file_aio_read() in order to avoid gfar's check to see if
 * the request starts before the end of the file.  For that check
 * to work, we must generate a GETATTR before each direct read, and
 * even then there is a window between the GETATTR and the subsequent
584
 * READ where the file size could change.  Our preference is simply
Linus Torvalds's avatar
Linus Torvalds committed
585
586
 * to do all reads the application wants, and the server will take
 * care of managing the end of file boundary.
587
 *
Linus Torvalds's avatar
Linus Torvalds committed
588
589
590
591
592
 * This function also eliminates unnecessarily updating the file's
 * atime locally, as the NFS server sets the file's atime, and this
 * client must read the updated atime from the server back into its
 * cache.
 */
593
ssize_t nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
Linus Torvalds's avatar
Linus Torvalds committed
594
595
{
	ssize_t retval = -EINVAL;
596
597
	int page_count;
	struct page **pages;
Linus Torvalds's avatar
Linus Torvalds committed
598
599
600
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;

601
	dprintk("nfs: direct read(%s/%s, %lu@%Ld)\n",
602
603
		file->f_dentry->d_parent->d_name.name,
		file->f_dentry->d_name.name,
604
		(unsigned long) count, (long long) pos);
Linus Torvalds's avatar
Linus Torvalds committed
605
606
607
608

	if (count < 0)
		goto out;
	retval = -EFAULT;
609
	if (!access_ok(VERIFY_WRITE, buf, count))
Linus Torvalds's avatar
Linus Torvalds committed
610
611
612
613
614
		goto out;
	retval = 0;
	if (!count)
		goto out;

615
616
617
	retval = nfs_sync_mapping(mapping);
	if (retval)
		goto out;
Linus Torvalds's avatar
Linus Torvalds committed
618

619
620
621
622
623
624
625
626
	page_count = nfs_get_user_pages(READ, (unsigned long) buf,
						count, &pages);
	if (page_count < 0) {
		nfs_free_user_pages(pages, 0, 0);
		retval = page_count;
		goto out;
	}

627
	retval = nfs_direct_read(iocb, (unsigned long) buf, count, pos,
628
						pages, page_count);
Linus Torvalds's avatar
Linus Torvalds committed
629
	if (retval > 0)
630
		iocb->ki_pos = pos + retval;
Linus Torvalds's avatar
Linus Torvalds committed
631
632
633
634
635
636
637
638
639

out:
	return retval;
}

/**
 * nfs_file_direct_write - file direct write operation for NFS files
 * @iocb: target I/O control block
 * @buf: user's buffer from which to write data
640
641
 * @count: number of bytes to write
 * @pos: byte offset in file where writing starts
Linus Torvalds's avatar
Linus Torvalds committed
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
 *
 * We use this function for direct writes instead of calling
 * generic_file_aio_write() in order to avoid taking the inode
 * semaphore and updating the i_size.  The NFS server will set
 * the new i_size and this client must read the updated size
 * back into its cache.  We let the server do generic write
 * parameter checking and report problems.
 *
 * We also avoid an unnecessary invocation of generic_osync_inode(),
 * as it is fairly meaningless to sync the metadata of an NFS file.
 *
 * We eliminate local atime updates, see direct read above.
 *
 * We avoid unnecessary page cache invalidations for normal cached
 * readers of this file.
 *
 * Note that O_APPEND is not supported for NFS direct writes, as there
 * is no atomic O_APPEND write facility in the NFS protocol.
 */
661
ssize_t nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos)
Linus Torvalds's avatar
Linus Torvalds committed
662
{
663
	ssize_t retval;
664
665
	int page_count;
	struct page **pages;
Linus Torvalds's avatar
Linus Torvalds committed
666
667
668
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;

669
	dfprintk(VFS, "nfs: direct write(%s/%s, %lu@%Ld)\n",
670
		file->f_dentry->d_parent->d_name.name,
671
672
		file->f_dentry->d_name.name,
		(unsigned long) count, (long long) pos);
Linus Torvalds's avatar
Linus Torvalds committed
673

674
675
	retval = generic_write_checks(file, &pos, &count, 0);
	if (retval)
Linus Torvalds's avatar
Linus Torvalds committed
676
		goto out;
677
678
679

	retval = -EINVAL;
	if ((ssize_t) count < 0)
Linus Torvalds's avatar
Linus Torvalds committed
680
681
682
683
		goto out;
	retval = 0;
	if (!count)
		goto out;
684
685

	retval = -EFAULT;
686
	if (!access_ok(VERIFY_READ, buf, count))
687
		goto out;
Linus Torvalds's avatar
Linus Torvalds committed
688

689
690
691
	retval = nfs_sync_mapping(mapping);
	if (retval)
		goto out;
Linus Torvalds's avatar
Linus Torvalds committed
692

693
694
695
696
697
698
699
700
	page_count = nfs_get_user_pages(WRITE, (unsigned long) buf,
						count, &pages);
	if (page_count < 0) {
		nfs_free_user_pages(pages, 0, 0);
		retval = page_count;
		goto out;
	}

701
	retval = nfs_direct_write(iocb, (unsigned long) buf, count,
702
					pos, pages, page_count);
703
704
705
706
707
708
709
710
711

	/*
	 * XXX: nfs_end_data_update() already ensures this file's
	 *      cached data is subsequently invalidated.  Do we really
	 *      need to call invalidate_inode_pages2() again here?
	 *
	 *      For aio writes, this invalidation will almost certainly
	 *      occur before the writes complete.  Kind of racey.
	 */
Linus Torvalds's avatar
Linus Torvalds committed
712
713
	if (mapping->nrpages)
		invalidate_inode_pages2(mapping);
714

Linus Torvalds's avatar
Linus Torvalds committed
715
	if (retval > 0)
716
		iocb->ki_pos = pos + retval;
Linus Torvalds's avatar
Linus Torvalds committed
717
718
719
720
721

out:
	return retval;
}

722
723
724
725
/**
 * nfs_init_directcache - create a slab cache for nfs_direct_req structures
 *
 */
Linus Torvalds's avatar
Linus Torvalds committed
726
727
728
729
730
731
732
733
734
735
736
737
int nfs_init_directcache(void)
{
	nfs_direct_cachep = kmem_cache_create("nfs_direct_cache",
						sizeof(struct nfs_direct_req),
						0, SLAB_RECLAIM_ACCOUNT,
						NULL, NULL);
	if (nfs_direct_cachep == NULL)
		return -ENOMEM;

	return 0;
}

738
739
740
741
/**
 * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures
 *
 */
Linus Torvalds's avatar
Linus Torvalds committed
742
743
744
745
746
void nfs_destroy_directcache(void)
{
	int leftover = kmem_cache_destroy(nfs_direct_cachep);

	/* a non-zero return is only reported, not acted upon */
	if (leftover)
		printk(KERN_INFO "nfs_direct_cache: not all structures were freed\n");
}