/*
 *	linux/mm/filemap.c
 *
 * Copyright (C) 1994-1999  Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <linux/aio.h>
#include <linux/capability.h>
#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/hash.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/cpuset.h>
#include "filemap.h"
#include "internal.h"

/*
 * FIXME: remove all knowledge of the buffer layer from the core VM
 */
#include <linux/buffer_head.h> /* for generic_osync_inode */

#include <asm/mman.h>

static ssize_t
generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
	loff_t offset, unsigned long nr_segs);

/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet,
 * though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 *
 * finished 'unifying' the page and buffer cache and SMP-threaded the
 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
 *
 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
 */

/*
 * Lock ordering:
 *
 *  ->i_mmap_lock		(vmtruncate)
 *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
 *      ->swap_lock		(exclusive_swap_page, others)
 *        ->mapping->tree_lock
 *
 *  ->i_mutex
 *    ->i_mmap_lock		(truncate->unmap_mapping_range)
 *
 *  ->mmap_sem
 *    ->i_mmap_lock
 *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
 *        ->mapping->tree_lock	(arch-dependent flush_dcache_mmap_lock)
 *
 *  ->mmap_sem
 *    ->lock_page		(access_process_vm)
 *
 *  ->i_mutex			(generic_file_buffered_write)
 *    ->mmap_sem		(fault_in_pages_readable->do_page_fault)
 *
 *  ->i_mutex
 *    ->i_alloc_sem             (various)
 *
 *  ->inode_lock
 *    ->sb_lock			(fs/fs-writeback.c)
 *    ->mapping->tree_lock	(__sync_single_inode)
 *
 *  ->i_mmap_lock
 *    ->anon_vma.lock		(vma_adjust)
 *
 *  ->anon_vma.lock
 *    ->page_table_lock or pte_lock	(anon_vma_prepare and various)
 *
 *  ->page_table_lock or pte_lock
 *    ->swap_lock		(try_to_unmap_one)
 *    ->private_lock		(try_to_unmap_one)
 *    ->tree_lock		(try_to_unmap_one)
 *    ->zone.lru_lock		(follow_page->mark_page_accessed)
 *    ->zone.lru_lock		(check_pte_range->isolate_lru_page)
 *    ->private_lock		(page_remove_rmap->set_page_dirty)
 *    ->tree_lock		(page_remove_rmap->set_page_dirty)
 *    ->inode_lock		(page_remove_rmap->set_page_dirty)
 *    ->inode_lock		(zap_pte_range->set_page_dirty)
 *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
 *
 *  ->task->proc_lock
 *    ->dcache_lock		(proc_pid_lookup)
 */

/*
 * Remove a page from the page cache and free it. Caller has to make
 * sure the page is locked and that nobody else uses it - or that usage
 * is safe.  The caller must hold a write_lock on the mapping's tree_lock.
 */
void __remove_from_page_cache(struct page *page)
{
	struct address_space *mapping = page->mapping;

	radix_tree_delete(&mapping->page_tree, page->index);
	page->mapping = NULL;
	mapping->nrpages--;
	__dec_zone_page_state(page, NR_FILE_PAGES);
}

void remove_from_page_cache(struct page *page)
{
	struct address_space *mapping = page->mapping;

	BUG_ON(!PageLocked(page));

	write_lock_irq(&mapping->tree_lock);
	__remove_from_page_cache(page);
	write_unlock_irq(&mapping->tree_lock);
}

static int sync_page(void *word)
{
	struct address_space *mapping;
	struct page *page;

	page = container_of((unsigned long *)word, struct page, flags);

	/*
	 * page_mapping() is being called without PG_locked held.
	 * Some knowledge of the state and use of the page is used to
	 * reduce the requirements down to a memory barrier.
	 * The danger here is of a stale page_mapping() return value
	 * indicating a struct address_space different from the one it's
	 * associated with when it is associated with one.
	 * After smp_mb(), it's either the correct page_mapping() for
	 * the page, or an old page_mapping() and the page's own
	 * page_mapping() has gone NULL.
	 * The ->sync_page() address_space operation must tolerate
	 * page_mapping() going NULL. By an amazing coincidence,
	 * this comes about because none of the users of the page
	 * in the ->sync_page() methods make essential use of the
	 * page_mapping(), merely passing the page down to the backing
	 * device's unplug functions when it's non-NULL, which in turn
	 * ignore it for all cases but swap, where only page_private(page) is
	 * of interest. When page_mapping() does go NULL, the entire
	 * call stack gracefully ignores the page and returns.
	 * -- wli
	 */
	smp_mb();
	mapping = page_mapping(page);
	if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
		mapping->a_ops->sync_page(page);
	io_schedule();
	return 0;
}

/**
 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
 * @mapping:	address space structure to write
 * @start:	offset in bytes where the range starts
 * @end:	offset in bytes where the range ends (inclusive)
 * @sync_mode:	enable synchronous operation
 *
 * Start writeback against all of a mapping's dirty pages that lie
 * within the byte offsets <start, end> inclusive.
 *
 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
 * opposed to a regular memory cleansing writeback.  The difference between
 * these two operations is that if a dirty page/buffer is encountered, it must
 * be waited upon, and not just skipped over.
 */
int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
				loff_t end, int sync_mode)
{
	int ret;
	struct writeback_control wbc = {
		.sync_mode = sync_mode,
		.nr_to_write = mapping->nrpages * 2,
		.range_start = start,
		.range_end = end,
	};

	if (!mapping_cap_writeback_dirty(mapping))
		return 0;

	ret = do_writepages(mapping, &wbc);
	return ret;
}

static inline int __filemap_fdatawrite(struct address_space *mapping,
	int sync_mode)
{
	return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
}

int filemap_fdatawrite(struct address_space *mapping)
{
	return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
}
EXPORT_SYMBOL(filemap_fdatawrite);

static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
				loff_t end)
{
	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
}

/**
 * filemap_flush - mostly a non-blocking flush
 * @mapping:	target address_space
 *
 * This is a mostly non-blocking flush.  Not suitable for data-integrity
 * purposes - I/O may not be started against all dirty pages.
 */
int filemap_flush(struct address_space *mapping)
{
	return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
}
EXPORT_SYMBOL(filemap_flush);

/**
 * wait_on_page_writeback_range - wait for writeback to complete
 * @mapping:	target address_space
 * @start:	beginning page index
 * @end:	ending page index
 *
 * Wait for writeback to complete against pages indexed by start->end
 * inclusive
 */
int wait_on_page_writeback_range(struct address_space *mapping,
				pgoff_t start, pgoff_t end)
{
	struct pagevec pvec;
	int nr_pages;
	int ret = 0;
	pgoff_t index;

	if (end < start)
		return 0;

	pagevec_init(&pvec, 0);
	index = start;
	while ((index <= end) &&
			(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
			PAGECACHE_TAG_WRITEBACK,
			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
		unsigned i;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			/* until radix tree lookup accepts end_index */
			if (page->index > end)
				continue;

			wait_on_page_writeback(page);
			if (PageError(page))
				ret = -EIO;
		}
		pagevec_release(&pvec);
		cond_resched();
	}

	/* Check for outstanding write errors */
	if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
		ret = -ENOSPC;
	if (test_and_clear_bit(AS_EIO, &mapping->flags))
		ret = -EIO;

	return ret;
}

/**
 * sync_page_range - write and wait on all pages in the passed range
 * @inode:	target inode
 * @mapping:	target address_space
 * @pos:	beginning offset in bytes to write
 * @count:	number of bytes to write
 *
 * Write and wait upon all the pages in the passed range.  This is a "data
 * integrity" operation.  It waits upon in-flight writeout before starting and
 * waiting upon new writeout.  If there was an IO error, return it.
 *
 * We need to re-take i_mutex during the generic_osync_inode list walk because
 * it is otherwise livelockable.
 */
int sync_page_range(struct inode *inode, struct address_space *mapping,
			loff_t pos, loff_t count)
{
	pgoff_t start = pos >> PAGE_CACHE_SHIFT;
	pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
	int ret;

	if (!mapping_cap_writeback_dirty(mapping) || !count)
		return 0;
	ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
	if (ret == 0) {
		mutex_lock(&inode->i_mutex);
		ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
		mutex_unlock(&inode->i_mutex);
	}
	if (ret == 0)
		ret = wait_on_page_writeback_range(mapping, start, end);
	return ret;
}
EXPORT_SYMBOL(sync_page_range);
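
/*
 * A minimal caller sketch (the helper name and its arguments are assumed,
 * not part of this file): an O_SYNC-style write path flushing the bytes it
 * has just written.  i_mutex must not be held here, because
 * sync_page_range() takes it itself around the generic_osync_inode() walk.
 *
 *	ssize_t example_osync_after_write(struct file *file, loff_t pos,
 *					  ssize_t written)
 *	{
 *		struct address_space *mapping = file->f_mapping;
 *		int err;
 *
 *		if (written <= 0)
 *			return written;
 *		err = sync_page_range(mapping->host, mapping, pos, written);
 *		return err ? err : written;
 *	}
 */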

/**
 * sync_page_range_nolock - write & wait on all pages in the passed range without i_mutex
 * @inode:	target inode
 * @mapping:	target address_space
 * @pos:	beginning offset in bytes to write
 * @count:	number of bytes to write
 *
 * Note: Holding i_mutex across sync_page_range_nolock() is not a good idea
 * as it forces O_SYNC writers to different parts of the same file
 * to be serialised right until io completion.
 */
int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
			   loff_t pos, loff_t count)
{
	pgoff_t start = pos >> PAGE_CACHE_SHIFT;
	pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
	int ret;

	if (!mapping_cap_writeback_dirty(mapping) || !count)
		return 0;
	ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
	if (ret == 0)
		ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
	if (ret == 0)
		ret = wait_on_page_writeback_range(mapping, start, end);
	return ret;
}
EXPORT_SYMBOL(sync_page_range_nolock);

/**
 * filemap_fdatawait - wait for all under-writeback pages to complete
 * @mapping: address space structure to wait for
 *
 * Walk the list of under-writeback pages of the given address space
 * and wait for all of them.
 */
int filemap_fdatawait(struct address_space *mapping)
{
	loff_t i_size = i_size_read(mapping->host);

	if (i_size == 0)
		return 0;

	return wait_on_page_writeback_range(mapping, 0,
				(i_size - 1) >> PAGE_CACHE_SHIFT);
}
EXPORT_SYMBOL(filemap_fdatawait);

int filemap_write_and_wait(struct address_space *mapping)
{
	int err = 0;

	if (mapping->nrpages) {
		err = filemap_fdatawrite(mapping);
		/*
		 * Even if the above returned error, the pages may be
		 * written partially (e.g. -ENOSPC), so we wait for it.
		 * But the -EIO is special case, it may indicate the worst
		 * thing (e.g. bug) happened, so we avoid waiting for it.
		 */
		if (err != -EIO) {
			int err2 = filemap_fdatawait(mapping);
			if (!err)
				err = err2;
		}
	}
	return err;
}
EXPORT_SYMBOL(filemap_write_and_wait);

/**
 * filemap_write_and_wait_range - write out & wait on a file range
 * @mapping:	the address_space for the pages
 * @lstart:	offset in bytes where the range starts
 * @lend:	offset in bytes where the range ends (inclusive)
 *
 * Write out and wait upon file offsets lstart->lend, inclusive.
 *
 * Note that `lend' is inclusive (describes the last byte to be written) so
 * that this function can be used to write to the very end-of-file (lend = -1).
 */
int filemap_write_and_wait_range(struct address_space *mapping,
				 loff_t lstart, loff_t lend)
{
	int err = 0;

	if (mapping->nrpages) {
		err = __filemap_fdatawrite_range(mapping, lstart, lend,
						 WB_SYNC_ALL);
		/* See comment of filemap_write_and_wait() */
		if (err != -EIO) {
			int err2 = wait_on_page_writeback_range(mapping,
						lstart >> PAGE_CACHE_SHIFT,
						lend >> PAGE_CACHE_SHIFT);
			if (!err)
				err = err2;
		}
	}
	return err;
}
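
/*
 * Two illustrative calls (the @mapping pointer is assumed to be valid):
 * because @lend names the last byte rather than one past it, a whole-file
 * flush passes -1 and a flush of only the first page passes
 * PAGE_CACHE_SIZE - 1, not PAGE_CACHE_SIZE.
 *
 *	err = filemap_write_and_wait_range(mapping, 0, -1);
 *	err = filemap_write_and_wait_range(mapping, 0, PAGE_CACHE_SIZE - 1);
 */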

/**
 * add_to_page_cache - add newly allocated pagecache pages
 * @page:	page to add
 * @mapping:	the page's address_space
 * @offset:	page index
 * @gfp_mask:	page allocation mode
 *
 * This function is used to add newly allocated pagecache pages;
 * the page is new, so we can just run SetPageLocked() against it.
 * The other page state flags were set by rmqueue().
 *
 * This function does not add the page to the LRU.  The caller must do that.
 */
int add_to_page_cache(struct page *page, struct address_space *mapping,
		pgoff_t offset, gfp_t gfp_mask)
{
	int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);

	if (error == 0) {
		write_lock_irq(&mapping->tree_lock);
		error = radix_tree_insert(&mapping->page_tree, offset, page);
		if (!error) {
			page_cache_get(page);
			SetPageLocked(page);
			page->mapping = mapping;
			page->index = offset;
			mapping->nrpages++;
			__inc_zone_page_state(page, NR_FILE_PAGES);
		}
		write_unlock_irq(&mapping->tree_lock);
		radix_tree_preload_end();
	}
	return error;
}
EXPORT_SYMBOL(add_to_page_cache);

int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
				pgoff_t offset, gfp_t gfp_mask)
{
	int ret = add_to_page_cache(page, mapping, offset, gfp_mask);
	if (ret == 0)
		lru_cache_add(page);
	return ret;
}
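
/*
 * The usual read-side insertion pattern, as a sketch (the @mapping, @index
 * and retry_find label are assumed from a hypothetical caller): allocate a
 * fresh page, try to insert it, and on -EEXIST retry the lookup because
 * another thread instantiated the same index first.  On success the new
 * page is locked, referenced and on the LRU, ready to be read in.
 *
 *	struct page *page = page_cache_alloc_cold(mapping);
 *	int error;
 *
 *	if (!page)
 *		return -ENOMEM;
 *	error = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
 *	if (error) {
 *		page_cache_release(page);
 *		if (error == -EEXIST)
 *			goto retry_find;
 *		return error;
 *	}
 */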

#ifdef CONFIG_NUMA
struct page *__page_cache_alloc(gfp_t gfp)
{
	if (cpuset_do_page_mem_spread()) {
		int n = cpuset_mem_spread_node();
		return alloc_pages_node(n, gfp, 0);
	}
	return alloc_pages(gfp, 0);
}
EXPORT_SYMBOL(__page_cache_alloc);
#endif

static int __sleep_on_page_lock(void *word)
{
	io_schedule();
	return 0;
}

/*
 * In order to wait for pages to become available there must be
 * waitqueues associated with pages. By using a hash table of
 * waitqueues where the bucket discipline is to maintain all
 * waiters on the same queue and wake all when any of the pages
 * become available, and for the woken contexts to check to be
 * sure the appropriate page became available, this saves space
 * at a cost of "thundering herd" phenomena during rare hash
 * collisions.
 */
static wait_queue_head_t *page_waitqueue(struct page *page)
{
	const struct zone *zone = page_zone(page);

	return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
}

static inline void wake_up_page(struct page *page, int bit)
{
	__wake_up_bit(page_waitqueue(page), &page->flags, bit);
}

void fastcall wait_on_page_bit(struct page *page, int bit_nr)
{
	DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);

	if (test_bit(bit_nr, &page->flags))
		__wait_on_bit(page_waitqueue(page), &wait, sync_page,
							TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_on_page_bit);

/**
 * unlock_page - unlock a locked page
 * @page: the page
 *
 * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
 * Also wakes sleepers in wait_on_page_writeback() because the wakeup
 * mechanism between PageLocked pages and PageWriteback pages is shared.
 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
 *
 * The first mb is necessary to safely close the critical section opened by the
 * TestSetPageLocked(), the second mb is necessary to enforce ordering between
 * the clear_bit and the read of the waitqueue (to avoid SMP races with a
 * parallel wait_on_page_locked()).
 */
void fastcall unlock_page(struct page *page)
{
	smp_mb__before_clear_bit();
	if (!TestClearPageLocked(page))
		BUG();
	smp_mb__after_clear_bit(); 
	wake_up_page(page, PG_locked);
}
EXPORT_SYMBOL(unlock_page);

/**
 * end_page_writeback - end writeback against a page
 * @page: the page
 */
void end_page_writeback(struct page *page)
{
	if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) {
		if (!test_clear_page_writeback(page))
			BUG();
	}
	smp_mb__after_clear_bit();
	wake_up_page(page, PG_writeback);
}
EXPORT_SYMBOL(end_page_writeback);

/**
 * __lock_page - get a lock on the page, assuming we need to sleep to get it
 * @page: the page to lock
 *
 * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary.  If some
 * random driver's requestfn sets TASK_RUNNING, we could busywait.  However
 * chances are that on the second loop, the block layer's plug list is empty,
 * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
 */
void fastcall __lock_page(struct page *page)
{
	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);

	__wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
							TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_page);

/*
 * Variant of lock_page that does not require the caller to hold a reference
 * on the page's mapping.
 */
void fastcall __lock_page_nosync(struct page *page)
{
	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
	__wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
							TASK_UNINTERRUPTIBLE);
}

/**
 * find_get_page - find and get a page reference
 * @mapping: the address_space to search
 * @offset: the page index
 *
 * Is there a pagecache struct page at the given (mapping, offset) tuple?
 * If yes, increment its refcount and return it; if no, return NULL.
 */
struct page * find_get_page(struct address_space *mapping, unsigned long offset)
{
	struct page *page;

	read_lock_irq(&mapping->tree_lock);
	page = radix_tree_lookup(&mapping->page_tree, offset);
	if (page)
		page_cache_get(page);
	read_unlock_irq(&mapping->tree_lock);
	return page;
}
EXPORT_SYMBOL(find_get_page);

/**
 * find_lock_page - locate, pin and lock a pagecache page
 * @mapping: the address_space to search
 * @offset: the page index
 *
 * Locates the desired pagecache page, locks it, increments its reference
 * count and returns its address.
 *
 * Returns %NULL if the page was not present. find_lock_page() may sleep.
 */
struct page *find_lock_page(struct address_space *mapping,
				unsigned long offset)
{
	struct page *page;

	read_lock_irq(&mapping->tree_lock);
repeat:
	page = radix_tree_lookup(&mapping->page_tree, offset);
	if (page) {
		page_cache_get(page);
		if (TestSetPageLocked(page)) {
			read_unlock_irq(&mapping->tree_lock);
			__lock_page(page);
			read_lock_irq(&mapping->tree_lock);

			/* Has the page been truncated while we slept? */
			if (unlikely(page->mapping != mapping ||
				     page->index != offset)) {
				unlock_page(page);
				page_cache_release(page);
				goto repeat;
			}
		}
	}
	read_unlock_irq(&mapping->tree_lock);
	return page;
}
EXPORT_SYMBOL(find_lock_page);

/**
 * find_or_create_page - locate or add a pagecache page
 * @mapping: the page's address_space
 * @index: the page's index into the mapping
 * @gfp_mask: page allocation mode
 *
 * Locates a page in the pagecache.  If the page is not present, a new page
 * is allocated using @gfp_mask and is added to the pagecache and to the VM's
 * LRU list.  The returned page is locked and has its reference count
 * incremented.
 *
 * find_or_create_page() may sleep, even if @gfp_mask specifies an atomic
 * allocation!
 *
 * find_or_create_page() returns the desired page's address, or %NULL on
 * memory exhaustion.
 */
struct page *find_or_create_page(struct address_space *mapping,
		unsigned long index, gfp_t gfp_mask)
{
	struct page *page, *cached_page = NULL;
	int err;
repeat:
	page = find_lock_page(mapping, index);
	if (!page) {
		if (!cached_page) {
			cached_page = alloc_page(gfp_mask);
			if (!cached_page)
				return NULL;
		}
		err = add_to_page_cache_lru(cached_page, mapping,
					index, gfp_mask);
		if (!err) {
			page = cached_page;
			cached_page = NULL;
		} else if (err == -EEXIST)
			goto repeat;
	}
	if (cached_page)
		page_cache_release(cached_page);
	return page;
}
EXPORT_SYMBOL(find_or_create_page);
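
/*
 * A usage sketch for find_or_create_page() (the helper name is assumed,
 * not defined anywhere else): instantiate a zero-filled, up-to-date page at
 * @index.  The page comes back locked with an elevated refcount, so the
 * caller drops both before returning.
 *
 *	static int example_zero_fill_page(struct address_space *mapping,
 *					  unsigned long index)
 *	{
 *		struct page *page;
 *
 *		page = find_or_create_page(mapping, index,
 *					   mapping_gfp_mask(mapping));
 *		if (!page)
 *			return -ENOMEM;
 *		if (!PageUptodate(page)) {
 *			clear_highpage(page);
 *			SetPageUptodate(page);
 *		}
 *		unlock_page(page);
 *		page_cache_release(page);
 *		return 0;
 *	}
 */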

/**
 * find_get_pages - gang pagecache lookup
 * @mapping:	The address_space to search
 * @start:	The starting page index
 * @nr_pages:	The maximum number of pages
 * @pages:	Where the resulting pages are placed
 *
 * find_get_pages() will search for and return a group of up to
 * @nr_pages pages in the mapping.  The pages are placed at @pages.
 * find_get_pages() takes a reference against the returned pages.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages.
 *
 * find_get_pages() returns the number of pages which were found.
 */
unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
			    unsigned int nr_pages, struct page **pages)
{
	unsigned int i;
	unsigned int ret;

	read_lock_irq(&mapping->tree_lock);
	ret = radix_tree_gang_lookup(&mapping->page_tree,
				(void **)pages, start, nr_pages);
	for (i = 0; i < ret; i++)
		page_cache_get(pages[i]);
	read_unlock_irq(&mapping->tree_lock);
	return ret;
}

/**
 * find_get_pages_contig - gang contiguous pagecache lookup
 * @mapping:	The address_space to search
 * @index:	The starting page index
 * @nr_pages:	The maximum number of pages
 * @pages:	Where the resulting pages are placed
 *
 * find_get_pages_contig() works exactly like find_get_pages(), except
 * that the returned number of pages are guaranteed to be contiguous.
 *
 * find_get_pages_contig() returns the number of pages which were found.
 */
unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
			       unsigned int nr_pages, struct page **pages)
{
	unsigned int i;
	unsigned int ret;

	read_lock_irq(&mapping->tree_lock);
	ret = radix_tree_gang_lookup(&mapping->page_tree,
				(void **)pages, index, nr_pages);
	for (i = 0; i < ret; i++) {
		if (pages[i]->mapping == NULL || pages[i]->index != index)
			break;

		page_cache_get(pages[i]);
		index++;
	}
	read_unlock_irq(&mapping->tree_lock);
	return i;
}
EXPORT_SYMBOL(find_get_pages_contig);

/**
 * find_get_pages_tag - find and return pages that match @tag
 * @mapping:	the address_space to search
 * @index:	the starting page index
 * @tag:	the tag index
 * @nr_pages:	the maximum number of pages
 * @pages:	where the resulting pages are placed
 *
 * Like find_get_pages, except we only return pages which are tagged with
 * @tag.   We update @index to index the next page for the traversal.
 */
unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
			int tag, unsigned int nr_pages, struct page **pages)
{
	unsigned int i;
	unsigned int ret;

	read_lock_irq(&mapping->tree_lock);
	ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
				(void **)pages, *index, nr_pages, tag);
	for (i = 0; i < ret; i++)
		page_cache_get(pages[i]);
	if (ret)
		*index = pages[ret - 1]->index + 1;
	read_unlock_irq(&mapping->tree_lock);
	return ret;
}
EXPORT_SYMBOL(find_get_pages_tag);
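
/*
 * A lookup-loop sketch for find_get_pages_tag() (the process_page() hook is
 * an assumption): @index is advanced by the call itself, so the loop simply
 * repeats until nothing more is returned, dropping the reference taken on
 * every page it was handed.
 *
 *	struct page *pages[PAGEVEC_SIZE];
 *	pgoff_t index = 0;
 *	unsigned nr, i;
 *
 *	while ((nr = find_get_pages_tag(mapping, &index, PAGECACHE_TAG_DIRTY,
 *					PAGEVEC_SIZE, pages)) != 0) {
 *		for (i = 0; i < nr; i++) {
 *			process_page(pages[i]);
 *			page_cache_release(pages[i]);
 *		}
 *		cond_resched();
 *	}
 */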

/**
 * grab_cache_page_nowait - returns locked page at given index in given cache
 * @mapping: target address_space
 * @index: the page index
 *
 * Same as grab_cache_page(), but do not wait if the page is unavailable.
 * This is intended for speculative data generators, where the data can
 * be regenerated if the page couldn't be grabbed.  This routine should
 * be safe to call while holding the lock for another page.
 *
 * Clear __GFP_FS when allocating the page to avoid recursion into the fs
 * and deadlock against the caller's locked page.
 */
struct page *
grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
{
	struct page *page = find_get_page(mapping, index);

	if (page) {
		if (!TestSetPageLocked(page))
			return page;
		page_cache_release(page);
		return NULL;
	}
	page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
	if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) {
		page_cache_release(page);
		page = NULL;
	}
	return page;
}
EXPORT_SYMBOL(grab_cache_page_nowait);
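
/*
 * A speculative-generator sketch for grab_cache_page_nowait() (the
 * example_generate() hook is an assumption): if the page cannot be grabbed
 * immediately, nothing is lost - the data is simply regenerated on a later
 * attempt, which is the use case the comment above describes.
 *
 *	struct page *page = grab_cache_page_nowait(mapping, index);
 *
 *	if (page) {
 *		example_generate(page);
 *		SetPageUptodate(page);
 *		unlock_page(page);
 *		page_cache_release(page);
 *	}
 */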

/*
 * CD/DVDs are error prone. When a medium error occurs, the driver may fail
 * a _large_ part of the i/o request. Imagine the worst scenario:
 *
 *      ---R__________________________________________B__________
 *         ^ reading here                             ^ bad block(assume 4k)
 *
 * read(R) => miss => readahead(R...B) => media error => frustrating retries
 * => failing the whole request => read(R) => read(R+1) =>
 * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
 * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
 * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
 *
 * It is going insane. Fix it by quickly scaling down the readahead size.
 */
static void shrink_readahead_size_eio(struct file *filp,
					struct file_ra_state *ra)
{
	if (!ra->ra_pages)
		return;

	ra->ra_pages /= 4;
}

/**
 * do_generic_mapping_read - generic file read routine
 * @mapping:	address_space to be read
 * @_ra:	file's readahead state
 * @filp:	the file to read
 * @ppos:	current file position
 * @desc:	read_descriptor
 * @actor:	read method
 *
 * This is a generic file read routine, and uses the
 * mapping->a_ops->readpage() function for the actual low-level stuff.
 *
 * This is really ugly. But the gotos actually try to clarify some
 * of the logic when it comes to error handling etc.
 *
 * Note the struct file* is only passed for the use of readpage.
 * It may be NULL.
 */
void do_generic_mapping_read(struct address_space *mapping,
			     struct file_ra_state *_ra,
			     struct file *filp,
			     loff_t *ppos,
			     read_descriptor_t *desc,
			     read_actor_t actor)
{
	struct inode *inode = mapping->host;
	unsigned long index;
	unsigned long end_index;
	unsigned long offset;
	unsigned long last_index;
	unsigned long next_index;
	unsigned long prev_index;
	unsigned int prev_offset;
	loff_t isize;
	struct page *cached_page;
	int error;
	struct file_ra_state ra = *_ra;

	cached_page = NULL;
	index = *ppos >> PAGE_CACHE_SHIFT;
	next_index = index;
	prev_index = ra.prev_index;
	prev_offset = ra.prev_offset;
	last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
	offset = *ppos & ~PAGE_CACHE_MASK;

	isize = i_size_read(inode);
	if (!isize)
		goto out;

	end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
	for (;;) {
		struct page *page;
		unsigned long nr, ret;

		/* nr is the maximum number of bytes to copy from this page */
		nr = PAGE_CACHE_SIZE;
		if (index >= end_index) {
			if (index > end_index)
				goto out;
			nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
			if (nr <= offset) {
				goto out;
			}
		}
		nr = nr - offset;

		cond_resched();
		if (index == next_index)
			next_index = page_cache_readahead(mapping, &ra, filp,
					index, last_index - index);

find_page:
		page = find_get_page(mapping, index);
		if (unlikely(page == NULL)) {
			handle_ra_miss(mapping, &ra, index);
			goto no_cached_page;
		}
		if (!PageUptodate(page))
			goto page_not_up_to_date;
page_ok:

		/* If users can be writing to this page using arbitrary
		 * virtual addresses, take care about potential aliasing
		 * before reading the page on the kernel side.
		 */
		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		/*
		 * When a sequential read accesses a page several times,
		 * only mark it as accessed the first time.
		 */
		if (prev_index != index || offset != prev_offset)
			mark_page_accessed(page);
		prev_index = index;

		/*
		 * Ok, we have the page, and it's up-to-date, so
		 * now we can copy it to user space...
		 *
		 * The actor routine returns how many bytes were actually used..
		 * NOTE! This may not be the same as how much of a user buffer
		 * we filled up (we may be padding etc), so we can only update
		 * "pos" here (the actor routine has to update the user buffer
		 * pointers and the remaining count).
		 */
		ret = actor(desc, page, offset, nr);
		offset += ret;
		index += offset >> PAGE_CACHE_SHIFT;
		offset &= ~PAGE_CACHE_MASK;
		prev_offset = offset;
		ra.prev_offset = offset;

		page_cache_release(page);
		if (ret == nr && desc->count)
			continue;
		goto out;

page_not_up_to_date:
		/* Get exclusive access to the page ... */
		lock_page(page);

		/* Did it get truncated before we got the lock? */
		if (!page->mapping) {
			unlock_page(page);
			page_cache_release(page);
			continue;
		}

		/* Did somebody else fill it already? */
		if (PageUptodate(page)) {
			unlock_page(page);
			goto page_ok;
		}

readpage:
		/* Start the actual read. The read will unlock the page. */
		error = mapping->a_ops->readpage(filp, page);

		if (unlikely(error)) {
			if (error == AOP_TRUNCATED_PAGE) {
				page_cache_release(page);
				goto find_page;
			}
			goto readpage_error;
		}

		if (!PageUptodate(page)) {
			lock_page(page);
			if (!PageUptodate(page)) {
				if (page->mapping == NULL) {
					/*
					 * invalidate_inode_pages got it
					 */
					unlock_page(page);
					page_cache_release(page);
					goto find_page;
				}
				unlock_page(page);