/*
 * mm/truncate.c - code for taking down pages from address_spaces
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 10Sep2002	Andrew Morton
 *		Initial version.
 */

#include <linux/kernel.h>
#include <linux/backing-dev.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/pagevec.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/buffer_head.h>	/* grr. try_to_release_page,
				   do_invalidatepage */
#include "internal.h"


/**
 * do_invalidatepage - invalidate part or all of a page
 * @page: the page which is affected
 * @offset: the byte offset within the page of the truncation point
 *
 * do_invalidatepage() is called when all or part of the page has become
 * invalidated by a truncate operation.
 *
 * do_invalidatepage() does not have to release all buffers, but it must
 * ensure that no dirty buffer is left outside @offset and that no I/O
 * is underway against any of the blocks which are outside the truncation
 * point, because the caller is about to free (and possibly reuse) those
 * blocks on-disk.
 */
void do_invalidatepage(struct page *page, unsigned long offset)
{
	void (*invalidatepage)(struct page *, unsigned long);
	invalidatepage = page->mapping->a_ops->invalidatepage;
#ifdef CONFIG_BLOCK
	if (!invalidatepage)
		invalidatepage = block_invalidatepage;
#endif
	if (invalidatepage)
		(*invalidatepage)(page, offset);
}
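
/*
 * Illustrative sketch (not from this file): a block-backed filesystem can
 * rely on the CONFIG_BLOCK fallback above by leaving ->invalidatepage
 * NULL, or can wire up block_invalidatepage explicitly in its
 * address_space_operations.  "foo_aops" and the foo_* methods here are
 * hypothetical:
 *
 *	static const struct address_space_operations foo_aops = {
 *		.readpage	= foo_readpage,
 *		.writepage	= foo_writepage,
 *		.invalidatepage	= block_invalidatepage,
 *	};
 */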

static inline void truncate_partial_page(struct page *page, unsigned partial)
{
	zero_user_segment(page, partial, PAGE_CACHE_SIZE);
	if (page_has_private(page))
		do_invalidatepage(page, partial);
}

/*
 * This cancels just the dirty bit on the kernel page itself, it
 * does NOT actually remove dirty bits on any mmap's that may be
 * around. It also leaves the page tagged dirty, so any sync
 * activity will still find it on the dirty lists, and in particular,
 * clear_page_dirty_for_io() will still look at the dirty bits in
 * the VM.
 *
 * Doing this should *normally* only ever be done when a page
 * is truncated, and is not actually mapped anywhere at all. However,
 * fs/buffer.c does this when it notices that somebody has cleaned
 * out all the buffers on a page without actually doing it through
 * the VM. Can you say "ext3 is horribly ugly"? Thought you could.
 */
void cancel_dirty_page(struct page *page, unsigned int account_size)
{
	if (TestClearPageDirty(page)) {
		struct address_space *mapping = page->mapping;
		if (mapping && mapping_cap_account_dirty(mapping)) {
			dec_zone_page_state(page, NR_FILE_DIRTY);
			dec_bdi_stat(mapping->backing_dev_info,
					BDI_RECLAIMABLE);
			if (account_size)
				task_io_account_cancelled_write(account_size);
		}
	}
}
EXPORT_SYMBOL(cancel_dirty_page);
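
/*
 * Usage sketch (illustrative): the fs/buffer.c case described above is
 * try_to_free_buffers(), which, roughly, cancels the page's dirty bit once
 * every buffer on the page has been cleaned and freed:
 *
 *	if (all_buffers_freed)
 *		cancel_dirty_page(page, PAGE_CACHE_SIZE);
 *
 * "all_buffers_freed" stands in for the real success condition there.
 */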

/*
 * If truncate cannot remove the fs-private metadata from the page, the page
 * becomes orphaned.  It will be left on the LRU and may even be mapped into
 * user pagetables if we're racing with filemap_fault().
 *
 * We need to bail out if page->mapping is no longer equal to the original
 * mapping.  This happens a) when the VM reclaimed the page while we waited on
 * its lock, b) when a concurrent invalidate_mapping_pages got there first and
 * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
 */
static int
truncate_complete_page(struct address_space *mapping, struct page *page)
{
	if (page->mapping != mapping)
		return -EIO;

	if (page_has_private(page))
		do_invalidatepage(page, 0);

	cancel_dirty_page(page, PAGE_CACHE_SIZE);

	clear_page_mlock(page);
	remove_from_page_cache(page);
	ClearPageMappedToDisk(page);
	page_cache_release(page);	/* pagecache ref */
	return 0;
}

/*
 * This is for invalidate_mapping_pages().  That function can be called at
 * any time, and is not supposed to throw away dirty pages.  But pages can
 * be marked dirty at any time too, so use remove_mapping which safely
 * discards clean, unused pages.
 *
 * Returns non-zero if the page was successfully invalidated.
 */
static int
invalidate_complete_page(struct address_space *mapping, struct page *page)
{
	int ret;

	if (page->mapping != mapping)
		return 0;

	if (page_has_private(page) && !try_to_release_page(page, 0))
		return 0;

	clear_page_mlock(page);
	ret = remove_mapping(mapping, page);

	return ret;
}

int truncate_inode_page(struct address_space *mapping, struct page *page)
{
	if (page_mapped(page)) {
		unmap_mapping_range(mapping,
				   (loff_t)page->index << PAGE_CACHE_SHIFT,
				   PAGE_CACHE_SIZE, 0);
	}
	return truncate_complete_page(mapping, page);
}

/*
 * Used to get rid of pages on hardware memory corruption.
 */
int generic_error_remove_page(struct address_space *mapping, struct page *page)
{
	if (!mapping)
		return -EINVAL;
	/*
	 * Only punch for normal data pages for now.
	 * Handling other types like directories would need more auditing.
	 */
	if (!S_ISREG(mapping->host->i_mode))
		return -EIO;
	return truncate_inode_page(mapping, page);
}
EXPORT_SYMBOL(generic_error_remove_page);
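
/*
 * Illustrative sketch: filesystems opt in to hwpoison recovery by pointing
 * ->error_remove_page at this helper from their address_space_operations
 * ("foo_aops" is hypothetical):
 *
 *	static const struct address_space_operations foo_aops = {
 *		.error_remove_page	= generic_error_remove_page,
 *	};
 *
 * The memory-failure code then invokes it to drop the poisoned page.
 */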

/*
 * Safely invalidate one page from its pagecache mapping.
 * It only drops clean, unused pages. The page must be locked.
 *
 * Returns 1 if the page is successfully invalidated, otherwise 0.
 */
int invalidate_inode_page(struct page *page)
{
	struct address_space *mapping = page_mapping(page);
	if (!mapping)
		return 0;
	if (PageDirty(page) || PageWriteback(page))
		return 0;
	if (page_mapped(page))
		return 0;
	return invalidate_complete_page(mapping, page);
}

/**
 * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 * @lend: offset to which to truncate
 *
 * Truncate the page cache, removing the pages that lie between the
 * specified offsets (and zeroing out the partial page if lstart is
 * not page-aligned).
 *
 * Truncate takes two passes - the first pass is nonblocking.  It will not
 * block on page locks and it will not block on writeback.  The second pass
 * will wait.  This is to prevent as much IO as possible in the affected region.
 * The first pass will remove most pages, so the search cost of the second pass
 * is low.
 *
 * When looking at page->index outside the page lock we need to be careful to
 * copy it into a local to avoid races (it could change at any time).
 *
 * We pass down the cache-hot hint to the page freeing code.  Even if the
 * mapping is large, it is probably the case that the final pages are the most
 * recently touched, and freeing happens in ascending file offset order.
 */
void truncate_inode_pages_range(struct address_space *mapping,
				loff_t lstart, loff_t lend)
{
	const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
	pgoff_t end;
	const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
	struct pagevec pvec;
	pgoff_t next;
	int i;

	if (mapping->nrpages == 0)
		return;

	BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
	end = (lend >> PAGE_CACHE_SHIFT);

	pagevec_init(&pvec, 0);
	next = start;
	while (next <= end &&
	       pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];
			pgoff_t page_index = page->index;

			if (page_index > end) {
				next = page_index;
				break;
			}

			if (page_index > next)
				next = page_index;
			next++;
			if (!trylock_page(page))
				continue;
			if (PageWriteback(page)) {
				unlock_page(page);
				continue;
			}
			truncate_inode_page(mapping, page);
			unlock_page(page);
		}
		pagevec_release(&pvec);
		cond_resched();
	}

	if (partial) {
		struct page *page = find_lock_page(mapping, start - 1);
		if (page) {
			wait_on_page_writeback(page);
			truncate_partial_page(page, partial);
			unlock_page(page);
			page_cache_release(page);
		}
	}

	next = start;
	for ( ; ; ) {
		cond_resched();
		if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
			if (next == start)
				break;
			next = start;
			continue;
		}
		if (pvec.pages[0]->index > end) {
			pagevec_release(&pvec);
			break;
		}
		mem_cgroup_uncharge_start();
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			if (page->index > end)
				break;
			lock_page(page);
			wait_on_page_writeback(page);
			truncate_inode_page(mapping, page);
			if (page->index > next)
				next = page->index;
			next++;
			unlock_page(page);
		}
		pagevec_release(&pvec);
		mem_cgroup_uncharge_end();
	}
}
EXPORT_SYMBOL(truncate_inode_pages_range);
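
/*
 * Call sketch (illustrative): the BUG_ON above requires lend to point at
 * the last byte of a page, so lend == (loff_t)-1 (all bits set) means
 * "to EOF", and punching the cache over [start, start + len) rounds the
 * end down to one byte before a page boundary:
 *
 *	loff_t lstart = start;
 *	loff_t lend = ((start + len) & PAGE_CACHE_MASK) - 1;
 *
 *	truncate_inode_pages_range(inode->i_mapping, lstart, lend);
 */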

/**
 * truncate_inode_pages - truncate *all* the pages from an offset
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 *
 * Called under (and serialised by) inode->i_mutex.
 */
void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
{
	truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
}
EXPORT_SYMBOL(truncate_inode_pages);
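
/*
 * Typical call (illustrative): filesystems flush an inode's entire
 * pagecache when the inode is torn down, e.g. in their eviction path:
 *
 *	truncate_inode_pages(&inode->i_data, 0);
 */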

/**
 * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
 * @mapping: the address_space which holds the pages to invalidate
 * @start: the offset 'from' which to invalidate
 * @end: the offset 'to' which to invalidate (inclusive)
 *
 * This function only removes the unlocked pages; if you want to
 * remove all the pages of one inode, you must call truncate_inode_pages.
 *
 * invalidate_mapping_pages() will not block on IO activity. It will not
 * invalidate pages which are dirty, locked, under writeback or mapped into
 * pagetables.
 */
unsigned long invalidate_mapping_pages(struct address_space *mapping,
				       pgoff_t start, pgoff_t end)
{
	struct pagevec pvec;
	pgoff_t next = start;
	unsigned long ret = 0;
	int i;

	pagevec_init(&pvec, 0);
	while (next <= end &&
			pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
		mem_cgroup_uncharge_start();
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];
			pgoff_t index;
			int lock_failed;

			lock_failed = !trylock_page(page);

			/*
			 * We really shouldn't be looking at the ->index of an
			 * unlocked page.  But we're not allowed to lock these
			 * pages.  So we rely upon nobody altering the ->index
			 * of this (pinned-by-us) page.
			 */
			index = page->index;
			if (index > next)
				next = index;
			next++;
			if (lock_failed)
				continue;

			ret += invalidate_inode_page(page);

			unlock_page(page);
			if (next > end)
				break;
		}
		pagevec_release(&pvec);
		mem_cgroup_uncharge_end();
		cond_resched();
	}
	return ret;
}
EXPORT_SYMBOL(invalidate_mapping_pages);
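
/*
 * Usage sketch (illustrative): a gentle, drop-caches style sweep of one
 * mapping.  Dirty, locked, writeback and mapped pages all survive; the
 * return value is the number of pages actually dropped:
 *
 *	unsigned long nr_dropped = invalidate_mapping_pages(mapping, 0, -1);
 */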

/*
 * This is like invalidate_complete_page(), except it ignores the page's
 * refcount.  We do this because invalidate_inode_pages2() needs stronger
 * invalidation guarantees, and cannot afford to leave pages behind because
 * shrink_page_list() has a temp ref on them, or because they're transiently
 * sitting in the lru_cache_add() pagevecs.
 */
static int
invalidate_complete_page2(struct address_space *mapping, struct page *page)
{
	if (page->mapping != mapping)
		return 0;

	if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
		return 0;

	spin_lock_irq(&mapping->tree_lock);
	if (PageDirty(page))
		goto failed;

	clear_page_mlock(page);
	BUG_ON(page_has_private(page));
	__remove_from_page_cache(page);
	spin_unlock_irq(&mapping->tree_lock);
	mem_cgroup_uncharge_cache_page(page);
	page_cache_release(page);	/* pagecache ref */
	return 1;
failed:
	spin_unlock_irq(&mapping->tree_lock);
	return 0;
}

static int do_launder_page(struct address_space *mapping, struct page *page)
{
	if (!PageDirty(page))
		return 0;
	if (page->mapping != mapping || mapping->a_ops->launder_page == NULL)
		return 0;
	return mapping->a_ops->launder_page(page);
}

/**
 * invalidate_inode_pages2_range - remove range of pages from an address_space
 * @mapping: the address_space
 * @start: the page offset 'from' which to invalidate
 * @end: the page offset 'to' which to invalidate (inclusive)
 *
 * Any pages which are found to be mapped into pagetables are unmapped prior to
 * invalidation.
 *
 * Returns -EBUSY if any pages could not be invalidated.
 */
int invalidate_inode_pages2_range(struct address_space *mapping,
				  pgoff_t start, pgoff_t end)
{
	struct pagevec pvec;
	pgoff_t next;
	int i;
	int ret = 0;
	int ret2 = 0;
	int did_range_unmap = 0;
	int wrapped = 0;

	pagevec_init(&pvec, 0);
	next = start;
	while (next <= end && !wrapped &&
		pagevec_lookup(&pvec, mapping, next,
			min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
		mem_cgroup_uncharge_start();
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];
			pgoff_t page_index;

			lock_page(page);
			if (page->mapping != mapping) {
				unlock_page(page);
				continue;
			}
			page_index = page->index;
			next = page_index + 1;
			if (next == 0)
				wrapped = 1;
			if (page_index > end) {
				unlock_page(page);
				break;
			}
			wait_on_page_writeback(page);
			if (page_mapped(page)) {
				if (!did_range_unmap) {
					/*
					 * Zap the rest of the file in one hit.
					 */
					unmap_mapping_range(mapping,
					   (loff_t)page_index<<PAGE_CACHE_SHIFT,
					   (loff_t)(end - page_index + 1)
							<< PAGE_CACHE_SHIFT,
					    0);
					did_range_unmap = 1;
				} else {
					/*
					 * Just zap this page
					 */
					unmap_mapping_range(mapping,
					  (loff_t)page_index<<PAGE_CACHE_SHIFT,
					  PAGE_CACHE_SIZE, 0);
				}
			}
			BUG_ON(page_mapped(page));
			ret2 = do_launder_page(mapping, page);
			if (ret2 == 0) {
				if (!invalidate_complete_page2(mapping, page))
					ret2 = -EBUSY;
			}
			if (ret2 < 0)
				ret = ret2;
			unlock_page(page);
		}
		pagevec_release(&pvec);
		mem_cgroup_uncharge_end();
		cond_resched();
	}
	return ret;
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);

/**
 * invalidate_inode_pages2 - remove all pages from an address_space
 * @mapping: the address_space
 *
 * Any pages which are found to be mapped into pagetables are unmapped prior to
 * invalidation.
 *
 * Returns -EBUSY if any pages could not be invalidated.
 */
int invalidate_inode_pages2(struct address_space *mapping)
{
	return invalidate_inode_pages2_range(mapping, 0, -1);
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
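
/*
 * Usage sketch (illustrative): callers that must make pagecache coherent
 * with storage that may have changed underneath it (direct IO, network
 * filesystems revalidating an inode) use the pages2 variants and must be
 * prepared for -EBUSY:
 *
 *	int err = invalidate_inode_pages2(inode->i_mapping);
 *	if (err)
 *		goto out_busy;
 *
 * where "out_busy" is a hypothetical label for the caller's retry or
 * fallback path.
 */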

/**
 * truncate_pagecache - unmap and remove pagecache that has been truncated
 * @inode: inode
 * @old: old file offset
 * @new: new file offset
 *
 * inode's new i_size must already be written before truncate_pagecache
 * is called.
 *
 * This function should typically be called before the filesystem
 * releases resources associated with the freed range (eg. deallocates
 * blocks). This way, pagecache will always stay logically coherent
 * with on-disk format, and the filesystem would not have to deal with
 * situations such as writepage being called for a page that has already
 * had its underlying blocks deallocated.
 */
void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
{
	struct address_space *mapping = inode->i_mapping;

	/*
	 * unmap_mapping_range is called twice, first simply for
	 * efficiency so that truncate_inode_pages does fewer
	 * single-page unmaps.  However after this first call, and
	 * before truncate_inode_pages finishes, it is possible for
	 * private pages to be COWed, which remain after
	 * truncate_inode_pages finishes, hence the second
	 * unmap_mapping_range call must be made for correctness.
	 */
	unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
	truncate_inode_pages(mapping, new);
	unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
}
EXPORT_SYMBOL(truncate_pagecache);
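
/*
 * Ordering sketch (illustrative): as the comment above says, i_size must
 * be updated first.  A filesystem shrinking a file from its setattr path
 * would do roughly the following ("foo_free_blocks" is a hypothetical fs
 * hook for releasing the now-unused on-disk blocks):
 *
 *	loff_t oldsize = inode->i_size;
 *
 *	i_size_write(inode, newsize);
 *	truncate_pagecache(inode, oldsize, newsize);
 *	foo_free_blocks(inode, newsize);
 */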

/**
 * vmtruncate - unmap mappings "freed" by truncate() syscall
 * @inode: inode of the file used
 * @offset: file offset to start truncating
 *
 * NOTE! We have to be ready to update the memory sharing
 * between the file and the memory map for a potential last
 * incomplete page.  Ugly, but necessary.
551 552 553
 *
 * This function is deprecated and simple_setsize or truncate_pagecache
 * should be used instead.
npiggin@suse.de's avatar
npiggin@suse.de committed
554 555 556 557 558
 */
int vmtruncate(struct inode *inode, loff_t offset)
{
	int error;

559
	error = simple_setsize(inode, offset);
npiggin@suse.de's avatar
npiggin@suse.de committed
560 561
	if (error)
		return error;
562

npiggin@suse.de's avatar
npiggin@suse.de committed
563 564 565 566 567 568
	if (inode->i_op->truncate)
		inode->i_op->truncate(inode);

	return error;
}
EXPORT_SYMBOL(vmtruncate);