shmem.c
/*
 * Resizable virtual memory filesystem for Linux.
 *
 * Copyright (C) 2000 Linus Torvalds.
 *		 2000 Transmeta Corp.
 *		 2000-2001 Christoph Rohland
 *		 2000-2001 SAP AG
 *		 2002 Red Hat Inc.
 * Copyright (C) 2002-2011 Hugh Dickins.
 * Copyright (C) 2011 Google Inc.
 * Copyright (C) 2002-2005 VERITAS Software Corporation.
 * Copyright (C) 2004 Andi Kleen, SuSE Labs
 *
 * Extended attribute support for tmpfs:
 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
 *
 * tiny-shmem:
 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
 *
 * This file is released under the GPL.
 */

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/vfs.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/export.h>
#include <linux/swap.h>

static struct vfsmount *shm_mnt;

#ifdef CONFIG_SHMEM
/*
 * This virtual memory filesystem is heavily based on the ramfs. It
 * extends ramfs by the ability to use swap and honor resource limits
 * which makes it a completely usable filesystem.
 */

#include <linux/xattr.h>
#include <linux/exportfs.h>
#include <linux/posix_acl.h>
#include <linux/generic_acl.h>
#include <linux/mman.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/shmem_fs.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/percpu_counter.h>
#include <linux/falloc.h>
#include <linux/splice.h>
#include <linux/security.h>
#include <linux/swapops.h>
#include <linux/mempolicy.h>
#include <linux/namei.h>
#include <linux/ctype.h>
#include <linux/migrate.h>
#include <linux/highmem.h>
#include <linux/seq_file.h>
#include <linux/magic.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>

#define BLOCKS_PER_PAGE  (PAGE_CACHE_SIZE/512)
#define VM_ACCT(size)    (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)

/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20

/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
#define SHORT_SYMLINK_LEN 128

struct shmem_xattr {
	struct list_head list;	/* anchored by shmem_inode_info->xattr_list */
	char *name;		/* xattr name */
	size_t size;		/* length of value in bytes */
	char value[0];		/* value bytes follow the struct inline */
};

/*
 * shmem_fallocate and shmem_writepage communicate via inode->i_private
 * (with i_mutex making sure that it has only one user at a time):
 * we would prefer not to enlarge the shmem inode just for that.
 */
struct shmem_falloc {
	pgoff_t start;		/* start of range currently being fallocated */
	pgoff_t next;		/* the next page offset to be fallocated */
	pgoff_t nr_falloced;	/* how many new pages have been fallocated */
	pgoff_t nr_unswapped;	/* how often writepage refused to swap out */
};

/* Flag allocation requirements to shmem_getpage */
enum sgp_type {
	SGP_READ,	/* don't exceed i_size, don't allocate page */
	SGP_CACHE,	/* don't exceed i_size, may allocate page */
	SGP_DIRTY,	/* like SGP_CACHE, but set new page dirty */
	SGP_WRITE,	/* may exceed i_size, may allocate !Uptodate page */
	SGP_FALLOC,	/* like SGP_WRITE, but make existing page Uptodate */
};

#ifdef CONFIG_TMPFS
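/*
 * Default mount limits, used when no size= or nr_inodes= option is given:
 * cap blocks at half of RAM, and inodes at the smaller of half of RAM or
 * the amount of lowmem (the inodes themselves are allocated from lowmem).
 */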
static unsigned long shmem_default_max_blocks(void)
{
	return totalram_pages / 2;
}

static unsigned long shmem_default_max_inodes(void)
{
	return min(totalram_pages - totalhigh_pages, totalram_pages / 2);
}
#endif

static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
static int shmem_replace_page(struct page **pagep, gfp_t gfp,
				struct shmem_inode_info *info, pgoff_t index);
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
	struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);

static inline int shmem_getpage(struct inode *inode, pgoff_t index,
	struct page **pagep, enum sgp_type sgp, int *fault_type)
{
	return shmem_getpage_gfp(inode, index, pagep, sgp,
			mapping_gfp_mask(inode->i_mapping), fault_type);
}

static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
{
	return sb->s_fs_info;
}

/*
 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
 * for shared memory and for shared anonymous (/dev/zero) mappings
 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
 * consistent with the pre-accounting of private mappings ...
 */
static inline int shmem_acct_size(unsigned long flags, loff_t size)
{
	return (flags & VM_NORESERVE) ?
		0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
}

static inline void shmem_unacct_size(unsigned long flags, loff_t size)
{
	if (!(flags & VM_NORESERVE))
		vm_unacct_memory(VM_ACCT(size));
}

/*
 * ... whereas tmpfs objects are accounted incrementally as
 * pages are allocated, in order to allow huge sparse files.
 * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
 */
static inline int shmem_acct_block(unsigned long flags)
{
	return (flags & VM_NORESERVE) ?
		security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_CACHE_SIZE)) : 0;
}

static inline void shmem_unacct_blocks(unsigned long flags, long pages)
{
	if (flags & VM_NORESERVE)
		vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE));
}

static const struct super_operations shmem_ops;
static const struct address_space_operations shmem_aops;
static const struct file_operations shmem_file_operations;
static const struct inode_operations shmem_inode_operations;
static const struct inode_operations shmem_dir_inode_operations;
static const struct inode_operations shmem_special_inode_operations;
static const struct vm_operations_struct shmem_vm_ops;

static struct backing_dev_info shmem_backing_dev_info  __read_mostly = {
	.ra_pages	= 0,	/* No readahead */
	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
};

static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);

static int shmem_reserve_inode(struct super_block *sb)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	if (sbinfo->max_inodes) {
		spin_lock(&sbinfo->stat_lock);
		if (!sbinfo->free_inodes) {
			spin_unlock(&sbinfo->stat_lock);
			return -ENOSPC;
		}
		sbinfo->free_inodes--;
		spin_unlock(&sbinfo->stat_lock);
	}
	return 0;
}

static void shmem_free_inode(struct super_block *sb)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	if (sbinfo->max_inodes) {
		spin_lock(&sbinfo->stat_lock);
		sbinfo->free_inodes++;
		spin_unlock(&sbinfo->stat_lock);
	}
}

/**
 * shmem_recalc_inode - recalculate the block usage of an inode
 * @inode: inode to recalc
 *
 * We have to calculate the free blocks since the mm can drop
 * undirtied hole pages behind our back.
 *
 * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
 *
 * It has to be called with the spinlock held.
 */
static void shmem_recalc_inode(struct inode *inode)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	long freed;

	freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
	if (freed > 0) {
		struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
		if (sbinfo->max_blocks)
			percpu_counter_add(&sbinfo->used_blocks, -freed);
		info->alloced -= freed;
		inode->i_blocks -= freed * BLOCKS_PER_PAGE;
		shmem_unacct_blocks(info->flags, freed);
	}
}

/*
 * Replace item expected in radix tree by a new item, while holding tree lock.
 */
static int shmem_radix_tree_replace(struct address_space *mapping,
			pgoff_t index, void *expected, void *replacement)
{
	void **pslot;
	void *item = NULL;

	VM_BUG_ON(!expected);
	pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
	if (pslot)
		item = radix_tree_deref_slot_protected(pslot,
							&mapping->tree_lock);
	if (item != expected)
		return -ENOENT;
	if (replacement)
		radix_tree_replace_slot(pslot, replacement);
	else
		radix_tree_delete(&mapping->page_tree, index);
	return 0;
}

/*
 * Sometimes, before we decide whether to proceed or to fail, we must check
 * that an entry was not already brought back from swap by a racing thread.
 *
 * Checking page is not enough: by the time a SwapCache page is locked, it
 * might be reused, and again be SwapCache, using the same swap as before.
 */
static bool shmem_confirm_swap(struct address_space *mapping,
			       pgoff_t index, swp_entry_t swap)
{
	void *item;

	rcu_read_lock();
	item = radix_tree_lookup(&mapping->page_tree, index);
	rcu_read_unlock();
	return item == swp_to_radix_entry(swap);
}

/*
 * Like add_to_page_cache_locked, but error if expected item has gone.
 */
static int shmem_add_to_page_cache(struct page *page,
				   struct address_space *mapping,
				   pgoff_t index, gfp_t gfp, void *expected)
{
	int error = 0;

	VM_BUG_ON(!PageLocked(page));
	VM_BUG_ON(!PageSwapBacked(page));

	if (!expected)
		error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
	if (!error) {
		page_cache_get(page);
		page->mapping = mapping;
		page->index = index;

		spin_lock_irq(&mapping->tree_lock);
		if (!expected)
			error = radix_tree_insert(&mapping->page_tree,
							index, page);
		else
			error = shmem_radix_tree_replace(mapping, index,
							expected, page);
		if (!error) {
			mapping->nrpages++;
			__inc_zone_page_state(page, NR_FILE_PAGES);
			__inc_zone_page_state(page, NR_SHMEM);
			spin_unlock_irq(&mapping->tree_lock);
		} else {
			page->mapping = NULL;
			spin_unlock_irq(&mapping->tree_lock);
			page_cache_release(page);
		}
		if (!expected)
			radix_tree_preload_end();
	}
	if (error)
		mem_cgroup_uncharge_cache_page(page);
	return error;
}

/*
 * Like delete_from_page_cache, but substitutes swap for page.
 */
static void shmem_delete_from_page_cache(struct page *page, void *radswap)
{
	struct address_space *mapping = page->mapping;
	int error;

	spin_lock_irq(&mapping->tree_lock);
	error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
	page->mapping = NULL;
	mapping->nrpages--;
	__dec_zone_page_state(page, NR_FILE_PAGES);
	__dec_zone_page_state(page, NR_SHMEM);
	spin_unlock_irq(&mapping->tree_lock);
	page_cache_release(page);
	BUG_ON(error);
}

347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369
/*
 * Like find_get_pages, but collecting swap entries as well as pages.
 */
static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping,
					pgoff_t start, unsigned int nr_pages,
					struct page **pages, pgoff_t *indices)
{
	unsigned int i;
	unsigned int ret;
	unsigned int nr_found;

	rcu_read_lock();
restart:
	nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
				(void ***)pages, indices, start, nr_pages);
	ret = 0;
	for (i = 0; i < nr_found; i++) {
		struct page *page;
repeat:
		page = radix_tree_deref_slot((void **)pages[i]);
		if (unlikely(!page))
			continue;
		if (radix_tree_exception(page)) {
			if (radix_tree_deref_retry(page))
				goto restart;
			/*
			 * Otherwise, we must be storing a swap entry
			 * here as an exceptional entry: so return it
			 * without attempting to raise page count.
			 */
			goto export;
		}
		if (!page_cache_get_speculative(page))
			goto repeat;

		/* Has the page moved? */
		if (unlikely(page != *((void **)pages[i]))) {
			page_cache_release(page);
			goto repeat;
		}
export:
		indices[ret] = indices[i];
		pages[ret] = page;
		ret++;
	}
	if (unlikely(!ret && nr_found))
		goto restart;
	rcu_read_unlock();
	return ret;
}

/*
 * Remove swap entry from radix tree, free the swap and its page cache.
 */
static int shmem_free_swap(struct address_space *mapping,
			   pgoff_t index, void *radswap)
{
	int error;

	spin_lock_irq(&mapping->tree_lock);
	error = shmem_radix_tree_replace(mapping, index, radswap, NULL);
	spin_unlock_irq(&mapping->tree_lock);
	if (!error)
		free_swap_and_cache(radix_to_swp_entry(radswap));
	return error;
}

/*
 * Pagevec may contain swap entries, so shuffle up pages before releasing.
 */
static void shmem_deswap_pagevec(struct pagevec *pvec)
{
	int i, j;

	for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		if (!radix_tree_exceptional_entry(page))
			pvec->pages[j++] = page;
	}
	pvec->nr = j;
}

/*
 * SysV IPC SHM_UNLOCK restores Unevictable pages to their evictable lists.
 */
void shmem_unlock_mapping(struct address_space *mapping)
{
	struct pagevec pvec;
	pgoff_t indices[PAGEVEC_SIZE];
	pgoff_t index = 0;

	pagevec_init(&pvec, 0);
	/*
	 * Minor point, but we might as well stop if someone else SHM_LOCKs it.
	 */
	while (!mapping_unevictable(mapping)) {
		/*
		 * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it
		 * has finished, if it hits a row of PAGEVEC_SIZE swap entries.
		 */
		pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
					PAGEVEC_SIZE, pvec.pages, indices);
		if (!pvec.nr)
			break;
		index = indices[pvec.nr - 1] + 1;
		shmem_deswap_pagevec(&pvec);
		check_move_unevictable_pages(pvec.pages, pvec.nr);
		pagevec_release(&pvec);
		cond_resched();
	}
}

/*
 * Remove range of pages and swap entries from radix tree, and free them.
 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
 */
static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
								 bool unfalloc)
{
	struct address_space *mapping = inode->i_mapping;
	struct shmem_inode_info *info = SHMEM_I(inode);
	pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT;
	unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1);
	unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
	struct pagevec pvec;
	pgoff_t indices[PAGEVEC_SIZE];
	long nr_swaps_freed = 0;
	pgoff_t index;
	int i;

	if (lend == -1)
		end = -1;	/* unsigned, so actually very big */

	pagevec_init(&pvec, 0);
	index = start;
	while (index < end) {
		pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
				min(end - index, (pgoff_t)PAGEVEC_SIZE),
							pvec.pages, indices);
		if (!pvec.nr)
			break;
		mem_cgroup_uncharge_start();
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			index = indices[i];
			if (index >= end)
				break;

			if (radix_tree_exceptional_entry(page)) {
				if (unfalloc)
					continue;
				nr_swaps_freed += !shmem_free_swap(mapping,
								index, page);
				continue;
			}

			if (!trylock_page(page))
				continue;
			if (!unfalloc || !PageUptodate(page)) {
				if (page->mapping == mapping) {
					VM_BUG_ON(PageWriteback(page));
					truncate_inode_page(mapping, page);
				}
			}
			unlock_page(page);
		}
		shmem_deswap_pagevec(&pvec);
		pagevec_release(&pvec);
		mem_cgroup_uncharge_end();
		cond_resched();
		index++;
	}

	if (partial_start) {
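		/*
		 * The hole starts part-way through a page: zero that page from
		 * partial_start to its end, or only up to partial_end when the
		 * whole range falls within a single page.
		 */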
		struct page *page = NULL;
		shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
		if (page) {
			unsigned int top = PAGE_CACHE_SIZE;
			if (start > end) {
				top = partial_end;
				partial_end = 0;
			}
			zero_user_segment(page, partial_start, top);
			set_page_dirty(page);
			unlock_page(page);
			page_cache_release(page);
		}
	}
	if (partial_end) {
		struct page *page = NULL;
		shmem_getpage(inode, end, &page, SGP_READ, NULL);
		if (page) {
			zero_user_segment(page, 0, partial_end);
			set_page_dirty(page);
			unlock_page(page);
			page_cache_release(page);
		}
	}
	if (start >= end)
		return;

	index = start;
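	/*
	 * Second pass: unlike the scan above, sleep on each page lock, and
	 * restart from 'start' whenever entries were missed, so the whole
	 * range is guaranteed to end up empty.
	 */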
	for ( ; ; ) {
		cond_resched();
		pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
				min(end - index, (pgoff_t)PAGEVEC_SIZE),
							pvec.pages, indices);
		if (!pvec.nr) {
			if (index == start || unfalloc)
				break;
			index = start;
			continue;
		}
		if ((index == start || unfalloc) && indices[0] >= end) {
			shmem_deswap_pagevec(&pvec);
			pagevec_release(&pvec);
			break;
		}
		mem_cgroup_uncharge_start();
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			index = indices[i];
			if (index >= end)
				break;

			if (radix_tree_exceptional_entry(page)) {
				if (unfalloc)
					continue;
				nr_swaps_freed += !shmem_free_swap(mapping,
								index, page);
				continue;
			}

			lock_page(page);
			if (!unfalloc || !PageUptodate(page)) {
				if (page->mapping == mapping) {
					VM_BUG_ON(PageWriteback(page));
					truncate_inode_page(mapping, page);
				}
			}
			unlock_page(page);
		}
		shmem_deswap_pagevec(&pvec);
		pagevec_release(&pvec);
		mem_cgroup_uncharge_end();
		index++;
	}

	spin_lock(&info->lock);
	info->swapped -= nr_swaps_freed;
	shmem_recalc_inode(inode);
	spin_unlock(&info->lock);
}

void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
	shmem_undo_range(inode, lstart, lend, false);
	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);

static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error;

	error = inode_change_ok(inode, attr);
	if (error)
		return error;

	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
		loff_t oldsize = inode->i_size;
		loff_t newsize = attr->ia_size;

		if (newsize != oldsize) {
			i_size_write(inode, newsize);
			inode->i_ctime = inode->i_mtime = CURRENT_TIME;
		}
		if (newsize < oldsize) {
			loff_t holebegin = round_up(newsize, PAGE_SIZE);
			unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
			shmem_truncate_range(inode, newsize, (loff_t)-1);
			/* unmap again to remove racily COWed private pages */
			unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
		}
	}

	setattr_copy(inode, attr);
#ifdef CONFIG_TMPFS_POSIX_ACL
	if (attr->ia_valid & ATTR_MODE)
		error = generic_acl_chmod(inode);
#endif
	return error;
}

static void shmem_evict_inode(struct inode *inode)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_xattr *xattr, *nxattr;

	if (inode->i_mapping->a_ops == &shmem_aops) {
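		/* Shmem-backed inode: undo size accounting, free pages and swap */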
		shmem_unacct_size(info->flags, inode->i_size);
		inode->i_size = 0;
		shmem_truncate_range(inode, 0, (loff_t)-1);
		if (!list_empty(&info->swaplist)) {
			mutex_lock(&shmem_swaplist_mutex);
			list_del_init(&info->swaplist);
			mutex_unlock(&shmem_swaplist_mutex);
		}
	} else
		kfree(info->symlink);

	list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) {
		kfree(xattr->name);
		kfree(xattr);
	}
	BUG_ON(inode->i_blocks);
	shmem_free_inode(inode->i_sb);
	clear_inode(inode);
}

/*
 * If swap found in inode, free it and move page from swapcache to filecache.
 */
static int shmem_unuse_inode(struct shmem_inode_info *info,
			     swp_entry_t swap, struct page **pagep)
{
	struct address_space *mapping = info->vfs_inode.i_mapping;
	void *radswap;
	pgoff_t index;
	gfp_t gfp;
	int error = 0;

	radswap = swp_to_radix_entry(swap);
	index = radix_tree_locate_item(&mapping->page_tree, radswap);
	if (index == -1)
		return 0;

	/*
	 * Move _head_ to start search for next from here.
	 * But be careful: shmem_evict_inode checks list_empty without taking
	 * mutex, and there's an instant in list_move_tail when info->swaplist
	 * would appear empty, if it were the only one on shmem_swaplist.
	 */
	if (shmem_swaplist.next != &info->swaplist)
		list_move_tail(&shmem_swaplist, &info->swaplist);

	gfp = mapping_gfp_mask(mapping);
	if (shmem_should_replace_page(*pagep, gfp)) {
		mutex_unlock(&shmem_swaplist_mutex);
		error = shmem_replace_page(pagep, gfp, info, index);
		mutex_lock(&shmem_swaplist_mutex);
		/*
		 * We needed to drop mutex to make that restrictive page
		 * allocation, but the inode might have been freed while we
		 * dropped it: although a racing shmem_evict_inode() cannot
		 * complete without emptying the radix_tree, our page lock
		 * on this swapcache page is not enough to prevent that -
		 * free_swap_and_cache() of our swap entry will only
		 * trylock_page(), removing swap from radix_tree whatever.
		 *
		 * We must not proceed to shmem_add_to_page_cache() if the
		 * inode has been freed, but of course we cannot rely on
		 * inode or mapping or info to check that.  However, we can
		 * safely check if our swap entry is still in use (and here
		 * it can't have got reused for another page): if it's still
		 * in use, then the inode cannot have been freed yet, and we
		 * can safely proceed (if it's no longer in use, that tells
		 * nothing about the inode, but we don't need to unuse swap).
		 */
		if (!page_swapcount(*pagep))
			error = -ENOENT;
	}

	/*
	 * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
	 * but also to hold up shmem_evict_inode(): so inode cannot be freed
	 * beneath us (pagelock doesn't help until the page is in pagecache).
	 */
	if (!error)
		error = shmem_add_to_page_cache(*pagep, mapping, index,
						GFP_NOWAIT, radswap);
	if (error != -ENOMEM) {
		/*
		 * Truncation and eviction use free_swap_and_cache(), which
		 * only does trylock page: if we raced, best clean up here.
		 */
		delete_from_swap_cache(*pagep);
		set_page_dirty(*pagep);
		if (!error) {
			spin_lock(&info->lock);
			info->swapped--;
			spin_unlock(&info->lock);
			swap_free(swap);
		}
		error = 1;	/* not an error, but entry was found */
	}
	return error;
}

/*
 * Search through swapped inodes to find and replace swap by page.
 */
int shmem_unuse(swp_entry_t swap, struct page *page)
{
	struct list_head *this, *next;
	struct shmem_inode_info *info;
	int found = 0;
	int error = 0;

	/*
	 * There's a faint possibility that swap page was replaced before
	 * caller locked it: caller will come back later with the right page.
	 */
	if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val))
		goto out;

	/*
	 * Charge page using GFP_KERNEL while we can wait, before taking
	 * the shmem_swaplist_mutex which might hold up shmem_writepage().
	 * Charged back to the user (not to caller) when swap account is used.
	 */
	error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
	if (error)
		goto out;
	/* No radix_tree_preload: swap entry keeps a place for page in tree */

	mutex_lock(&shmem_swaplist_mutex);
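	/*
	 * Walk the swapped-out shmem inodes until one claims this entry:
	 * shmem_unuse_inode() returns 1 once it has replaced the swap entry
	 * with the page, 0 if the entry is not in that inode, or a -errno.
	 */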
	list_for_each_safe(this, next, &shmem_swaplist) {
		info = list_entry(this, struct shmem_inode_info, swaplist);
		if (info->swapped)
			found = shmem_unuse_inode(info, swap, &page);
		else
			list_del_init(&info->swaplist);
		cond_resched();
		if (found)
			break;
	}
	mutex_unlock(&shmem_swaplist_mutex);

	if (found < 0)
		error = found;
out:
	unlock_page(page);
	page_cache_release(page);
	return error;
}

/*
 * Move the page from the page cache to the swap cache.
 */
static int shmem_writepage(struct page *page, struct writeback_control *wbc)
{
	struct shmem_inode_info *info;
	struct address_space *mapping;
	struct inode *inode;
	swp_entry_t swap;
	pgoff_t index;

	BUG_ON(!PageLocked(page));
	mapping = page->mapping;
	index = page->index;
	inode = mapping->host;
	info = SHMEM_I(inode);
	if (info->flags & VM_LOCKED)
		goto redirty;
	if (!total_swap_pages)
		goto redirty;

	/*
	 * shmem_backing_dev_info's capabilities prevent regular writeback or
	 * sync from ever calling shmem_writepage; but a stacking filesystem
	 * might use ->writepage of its underlying filesystem, in which case
	 * tmpfs should write out to swap only in response to memory pressure,
	 * and not for the writeback threads or sync.
	 */
	if (!wbc->for_reclaim) {
		WARN_ON_ONCE(1);	/* Still happens? Tell us about it! */
		goto redirty;
	}

	/*
	 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
	 * value into swapfile.c, the only way we can correctly account for a
	 * fallocated page arriving here is now to initialize it and write it.
	 *
	 * That's okay for a page already fallocated earlier, but if we have
	 * not yet completed the fallocation, then (a) we want to keep track
	 * of this page in case we have to undo it, and (b) it may not be a
	 * good idea to continue anyway, once we're pushing into swap.  So
	 * reactivate the page, and let shmem_fallocate() quit when too many.
	 */
	if (!PageUptodate(page)) {
		if (inode->i_private) {
			struct shmem_falloc *shmem_falloc;
			spin_lock(&inode->i_lock);
			shmem_falloc = inode->i_private;
			if (shmem_falloc &&
			    index >= shmem_falloc->start &&
			    index < shmem_falloc->next)
				shmem_falloc->nr_unswapped++;
			else
				shmem_falloc = NULL;
			spin_unlock(&inode->i_lock);
			if (shmem_falloc)
				goto redirty;
		}
		clear_highpage(page);
		flush_dcache_page(page);
		SetPageUptodate(page);
	}

	swap = get_swap_page();
	if (!swap.val)
		goto redirty;

	/*
	 * Add inode to shmem_unuse()'s list of swapped-out inodes,
	 * if it's not already there.  Do it now before the page is
	 * moved to swap cache, when its pagelock no longer protects
	 * the inode from eviction.  But don't unlock the mutex until
	 * we've incremented swapped, because shmem_unuse_inode() will
	 * prune a !swapped inode from the swaplist under this mutex.
	 */
	mutex_lock(&shmem_swaplist_mutex);
	if (list_empty(&info->swaplist))
		list_add_tail(&info->swaplist, &shmem_swaplist);

	if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
		swap_shmem_alloc(swap);
		shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));

		spin_lock(&info->lock);
		info->swapped++;
		shmem_recalc_inode(inode);
		spin_unlock(&info->lock);

		mutex_unlock(&shmem_swaplist_mutex);
		BUG_ON(page_mapped(page));
		swap_writepage(page, wbc);
		return 0;
	}

	mutex_unlock(&shmem_swaplist_mutex);
	swapcache_free(swap, NULL);
redirty:
	set_page_dirty(page);
	if (wbc->for_reclaim)
		return AOP_WRITEPAGE_ACTIVATE;	/* Return with page locked */
	unlock_page(page);
	return 0;
}

#ifdef CONFIG_NUMA
#ifdef CONFIG_TMPFS
static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
	char buffer[64];

	if (!mpol || mpol->mode == MPOL_DEFAULT)
		return;		/* show nothing */

	mpol_to_str(buffer, sizeof(buffer), mpol, 1);

	seq_printf(seq, ",mpol=%s", buffer);
}

static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
{
	struct mempolicy *mpol = NULL;
	if (sbinfo->mpol) {
		spin_lock(&sbinfo->stat_lock);	/* prevent replace/use races */
		mpol = sbinfo->mpol;
		mpol_get(mpol);
		spin_unlock(&sbinfo->stat_lock);
	}
	return mpol;
}
#endif /* CONFIG_TMPFS */

static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
			struct shmem_inode_info *info, pgoff_t index)
{
	struct mempolicy mpol, *spol;
	struct vm_area_struct pvma;

	spol = mpol_cond_copy(&mpol,
			mpol_shared_policy_lookup(&info->policy, index));

	/* Create a pseudo vma that just contains the policy */
	pvma.vm_start = 0;
	pvma.vm_pgoff = index;
	pvma.vm_ops = NULL;
	pvma.vm_policy = spol;
	return swapin_readahead(swap, gfp, &pvma, 0);
}

static struct page *shmem_alloc_page(gfp_t gfp,
			struct shmem_inode_info *info, pgoff_t index)
{
	struct vm_area_struct pvma;

	/* Create a pseudo vma that just contains the policy */
	pvma.vm_start = 0;
	pvma.vm_pgoff = index;
	pvma.vm_ops = NULL;
	pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);

	/*
	 * alloc_page_vma() will drop the shared policy reference
	 */
	return alloc_page_vma(gfp, &pvma, 0);
}
#else /* !CONFIG_NUMA */
#ifdef CONFIG_TMPFS
static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
}
#endif /* CONFIG_TMPFS */

static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
			struct shmem_inode_info *info, pgoff_t index)
{
	return swapin_readahead(swap, gfp, NULL, 0);
}

static inline struct page *shmem_alloc_page(gfp_t gfp,
			struct shmem_inode_info *info, pgoff_t index)
{
	return alloc_page(gfp);
}
#endif /* CONFIG_NUMA */

#if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS)
static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
{
	return NULL;
}
#endif

/*
 * When a page is moved from swapcache to shmem filecache (either by the
 * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of
 * shmem_unuse_inode()), it may have been read in earlier from swap, in
 * ignorance of the mapping it belongs to.  If that mapping has special
 * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
 * we may need to copy to a suitable page before moving to filecache.
 *
 * In a future release, this may well be extended to respect cpuset and
 * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
 * but for now it is a simple matter of zone.
 */
static bool shmem_should_replace_page(struct page *page, gfp_t gfp)
{
	return page_zonenum(page) > gfp_zone(gfp);
}

static int shmem_replace_page(struct page **pagep, gfp_t gfp,
				struct shmem_inode_info *info, pgoff_t index)
{
	struct page *oldpage, *newpage;
	struct address_space *swap_mapping;
	pgoff_t swap_index;
	int error;

	oldpage = *pagep;
	swap_index = page_private(oldpage);
	swap_mapping = page_mapping(oldpage);

	/*
	 * We have arrived here because our zones are constrained, so don't
	 * limit chance of success by further cpuset and node constraints.
	 */
	gfp &= ~GFP_CONSTRAINT_MASK;
	newpage = shmem_alloc_page(gfp, info, index);
	if (!newpage)
		return -ENOMEM;

	page_cache_get(newpage);
	copy_highpage(newpage, oldpage);
	flush_dcache_page(newpage);

	__set_page_locked(newpage);
	SetPageUptodate(newpage);
	SetPageSwapBacked(newpage);
	set_page_private(newpage, swap_index);
	SetPageSwapCache(newpage);

	/*
	 * Our caller will very soon move newpage out of swapcache, but it's
	 * a nice clean interface for us to replace oldpage by newpage there.
	 */
	spin_lock_irq(&swap_mapping->tree_lock);
	error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
								   newpage);
	if (!error) {
		__inc_zone_page_state(newpage, NR_FILE_PAGES);
		__dec_zone_page_state(oldpage, NR_FILE_PAGES);
	}
	spin_unlock_irq(&swap_mapping->tree_lock);

	if (unlikely(error)) {
		/*
		 * Is this possible?  I think not, now that our callers check
		 * both PageSwapCache and page_private after getting page lock;
		 * but be defensive.  Reverse old to newpage for clear and free.
		 */
		oldpage = newpage;
	} else {
		mem_cgroup_replace_page_cache(oldpage, newpage);
		lru_cache_add_anon(newpage);
		*pagep = newpage;
	}

	ClearPageSwapCache(oldpage);
	set_page_private(oldpage, 0);

	unlock_page(oldpage);
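	/*
	 * Two puts on oldpage: one for the swap-cache reference taken over by
	 * the radix-tree replacement above, and one for the caller's reference,
	 * which has been transferred to newpage via *pagep.
	 */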
	page_cache_release(oldpage);
	page_cache_release(oldpage);
	return error;
}

/*
 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
 *
 * If we allocate a new one we do not mark it dirty. That's up to the
 * vm. If we swap it in we mark it dirty since we also free the swap
 * entry since a page cannot live in both the swap and page cache
 */
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
	struct page **pagep, enum sgp_type