/*
 * Resizable virtual memory filesystem for Linux.
 *
 * Copyright (C) 2000 Linus Torvalds.
 *		 2000 Transmeta Corp.
 *		 2000-2001 Christoph Rohland
 *		 2000-2001 SAP AG
 *		 2002 Red Hat Inc.
 * Copyright (C) 2002-2005 Hugh Dickins.
 * Copyright (C) 2002-2005 VERITAS Software Corporation.
 * Copyright (C) 2004 Andi Kleen, SuSE Labs
 *
 * Extended attribute support for tmpfs:
 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
 *
 * tiny-shmem:
 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
 *
 * This file is released under the GPL.
 */

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/vfs.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/percpu_counter.h>
#include <linux/swap.h>

static struct vfsmount *shm_mnt;

#ifdef CONFIG_SHMEM
/*
 * This virtual memory filesystem is heavily based on the ramfs. It
 * extends ramfs by the ability to use swap and honor resource limits
 * which makes it a completely usable filesystem.
 */

#include <linux/xattr.h>
#include <linux/exportfs.h>
#include <linux/posix_acl.h>
#include <linux/generic_acl.h>
#include <linux/mman.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/shmem_fs.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/security.h>
#include <linux/swapops.h>
#include <linux/mempolicy.h>
#include <linux/namei.h>
#include <linux/ctype.h>
#include <linux/migrate.h>
#include <linux/highmem.h>
#include <linux/seq_file.h>
#include <linux/magic.h>

#include <asm/uaccess.h>
#include <asm/div64.h>
#include <asm/pgtable.h>

/*
 * The maximum size of a shmem/tmpfs file is limited by the maximum size of
 * its triple-indirect swap vector - see illustration at shmem_swp_entry().
 *
 * With 4kB page size, maximum file size is just over 2TB on a 32-bit kernel,
 * but one eighth of that on a 64-bit kernel.  With 8kB page size, maximum
 * file size is just over 4TB on a 64-bit kernel, but 16TB on a 32-bit kernel,
 * MAX_LFS_FILESIZE being then more restrictive than swap vector layout.
 *
 * We use / and * instead of shifts in the definitions below, so that the swap
 * vector can be tested with small even values (e.g. 20) for ENTRIES_PER_PAGE.
 */
#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
#define ENTRIES_PER_PAGEPAGE ((unsigned long long)ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)

#define SHMSWP_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1))
#define SHMSWP_MAX_BYTES (SHMSWP_MAX_INDEX << PAGE_CACHE_SHIFT)

#define SHMEM_MAX_BYTES  min_t(unsigned long long, SHMSWP_MAX_BYTES, MAX_LFS_FILESIZE)
#define SHMEM_MAX_INDEX  ((unsigned long)((SHMEM_MAX_BYTES+1) >> PAGE_CACHE_SHIFT))
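/*
 * A rough worked example of the limits above, assuming PAGE_CACHE_SIZE is
 * 4kB, unsigned long is 4 bytes (32-bit), and SHMEM_NR_DIRECT is 16:
 *
 *   ENTRIES_PER_PAGE     = 4096 / 4                  = 1024
 *   ENTRIES_PER_PAGEPAGE = 1024 * 1024               = 1048576
 *   SHMSWP_MAX_INDEX     = 16 + (1048576/2) * 1025   ~= 537 million pages
 *   SHMSWP_MAX_BYTES     ~= 537 million * 4kB        ~= just over 2TB
 *
 * On a 64-bit kernel unsigned long doubles to 8 bytes, halving
 * ENTRIES_PER_PAGE and so cutting SHMSWP_MAX_BYTES to roughly one eighth.
 */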

#define BLOCKS_PER_PAGE  (PAGE_CACHE_SIZE/512)
#define VM_ACCT(size)    (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)

/* info->flags needs VM_flags to handle pagein/truncate races efficiently */
#define SHMEM_PAGEIN	 VM_READ
#define SHMEM_TRUNCATE	 VM_WRITE

/* Definition to limit shmem_truncate's steps between cond_rescheds */
#define LATENCY_LIMIT	 64

/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20

struct shmem_xattr {
	struct list_head list;	/* anchored by shmem_inode_info->xattr_list */
	char *name;		/* xattr name */
	size_t size;		/* xattr value length in bytes */
	char value[0];		/* value stored inline after the struct */
};

/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
enum sgp_type {
	SGP_READ,	/* don't exceed i_size, don't allocate page */
	SGP_CACHE,	/* don't exceed i_size, may allocate page */
	SGP_DIRTY,	/* like SGP_CACHE, but set new page dirty */
	SGP_WRITE,	/* may exceed i_size, may allocate page */
};

#ifdef CONFIG_TMPFS
static unsigned long shmem_default_max_blocks(void)
{
	return totalram_pages / 2;
}

static unsigned long shmem_default_max_inodes(void)
{
	return min(totalram_pages - totalhigh_pages, totalram_pages / 2);
}
#endif

static int shmem_getpage(struct inode *inode, unsigned long idx,
			 struct page **pagep, enum sgp_type sgp, int *type);

static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
{
	/*
	 * The above definition of ENTRIES_PER_PAGE, and the use of
	 * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
	 * might be reconsidered if it ever diverges from PAGE_SIZE.
	 *
	 * Mobility flags are masked out as swap vectors cannot move
	 */
	return alloc_pages((gfp_mask & ~GFP_MOVABLE_MASK) | __GFP_ZERO,
				PAGE_CACHE_SHIFT-PAGE_SHIFT);
}

static inline void shmem_dir_free(struct page *page)
{
	__free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT);
}

static struct page **shmem_dir_map(struct page *page)
{
	return (struct page **)kmap_atomic(page, KM_USER0);
}

static inline void shmem_dir_unmap(struct page **dir)
{
	kunmap_atomic(dir, KM_USER0);
}

static swp_entry_t *shmem_swp_map(struct page *page)
{
	return (swp_entry_t *)kmap_atomic(page, KM_USER1);
}

static inline void shmem_swp_balance_unmap(void)
{
	/*
	 * When passing a pointer to an i_direct entry, to code which
	 * also handles indirect entries and so will shmem_swp_unmap,
	 * we must arrange for the preempt count to remain in balance.
	 * What kmap_atomic of a lowmem page does depends on config
	 * and architecture, so pretend to kmap_atomic some lowmem page.
	 */
	(void) kmap_atomic(ZERO_PAGE(0), KM_USER1);
}

static inline void shmem_swp_unmap(swp_entry_t *entry)
{
	kunmap_atomic(entry, KM_USER1);
}

static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
{
	return sb->s_fs_info;
}

/*
 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
 * for shared memory and for shared anonymous (/dev/zero) mappings
 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
 * consistent with the pre-accounting of private mappings ...
 */
static inline int shmem_acct_size(unsigned long flags, loff_t size)
{
	return (flags & VM_NORESERVE) ?
		0 : security_vm_enough_memory_kern(VM_ACCT(size));
}

static inline void shmem_unacct_size(unsigned long flags, loff_t size)
{
	if (!(flags & VM_NORESERVE))
		vm_unacct_memory(VM_ACCT(size));
}

/*
 * ... whereas tmpfs objects are accounted incrementally as
 * pages are allocated, in order to allow huge sparse files.
 * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
 */
static inline int shmem_acct_block(unsigned long flags)
{
	return (flags & VM_NORESERVE) ?
		security_vm_enough_memory_kern(VM_ACCT(PAGE_CACHE_SIZE)) : 0;
}

static inline void shmem_unacct_blocks(unsigned long flags, long pages)
{
	if (flags & VM_NORESERVE)
		vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE));
}
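
/*
 * Illustrative sketch of the two accounting paths above (figures assumed,
 * with a 4kB PAGE_CACHE_SIZE):
 *
 *   - A 1GB shared memory object set up without VM_NORESERVE is charged
 *     VM_ACCT(1GB) (about 256k pages) up front by shmem_acct_size(), and
 *     shmem_acct_block() then charges nothing per page.
 *
 *   - A 1GB sparse tmpfs file carrying VM_NORESERVE is charged nothing up
 *     front; each page actually instantiated costs one
 *     VM_ACCT(PAGE_CACHE_SIZE), so touching only 10 pages accounts just
 *     10 pages, and a failure surfaces as -ENOSPC (SIGBUS), not OOM.
 */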

static const struct super_operations shmem_ops;
static const struct address_space_operations shmem_aops;
static const struct file_operations shmem_file_operations;
static const struct inode_operations shmem_inode_operations;
static const struct inode_operations shmem_dir_inode_operations;
static const struct inode_operations shmem_special_inode_operations;
static const struct vm_operations_struct shmem_vm_ops;

static struct backing_dev_info shmem_backing_dev_info  __read_mostly = {
	.ra_pages	= 0,	/* No readahead */
	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
};

static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);

static void shmem_free_blocks(struct inode *inode, long pages)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
	if (sbinfo->max_blocks) {
		percpu_counter_add(&sbinfo->used_blocks, -pages);
		spin_lock(&inode->i_lock);
		inode->i_blocks -= pages*BLOCKS_PER_PAGE;
		spin_unlock(&inode->i_lock);
	}
}

static int shmem_reserve_inode(struct super_block *sb)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	if (sbinfo->max_inodes) {
		spin_lock(&sbinfo->stat_lock);
		if (!sbinfo->free_inodes) {
			spin_unlock(&sbinfo->stat_lock);
			return -ENOSPC;
		}
		sbinfo->free_inodes--;
		spin_unlock(&sbinfo->stat_lock);
	}
	return 0;
}

static void shmem_free_inode(struct super_block *sb)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	if (sbinfo->max_inodes) {
		spin_lock(&sbinfo->stat_lock);
		sbinfo->free_inodes++;
		spin_unlock(&sbinfo->stat_lock);
	}
}

/**
 * shmem_recalc_inode - recalculate the size of an inode
 * @inode: inode to recalc
 *
 * We have to calculate the free blocks since the mm can drop
 * undirtied hole pages behind our back.
 *
 * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
 *
 * It has to be called with the spinlock held.
 */
static void shmem_recalc_inode(struct inode *inode)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	long freed;

	freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
	if (freed > 0) {
		info->alloced -= freed;
		shmem_unacct_blocks(info->flags, freed);
		shmem_free_blocks(inode, freed);
	}
}
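
/*
 * Example of the identity above (figures assumed): if info->alloced is 100,
 * info->swapped is 20 and i_mapping->nrpages is 70, then 10 pages were
 * reclaimed behind our back; alloced drops to 90, those 10 blocks are
 * unaccounted, and 10 is subtracted from the superblock's used_blocks.
 */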

/**
 * shmem_swp_entry - find the swap vector position in the info structure
 * @info:  info structure for the inode
 * @index: index of the page to find
 * @page:  optional page to add to the structure. Has to be preset to
 *         all zeros
 *
 * If there is no space allocated yet it will return NULL when
 * page is NULL, else it will use the page for the needed block,
 * setting it to NULL on return to indicate that it has been used.
 *
 * The swap vector is organized the following way:
 *
 * There are SHMEM_NR_DIRECT entries directly stored in the
 * shmem_inode_info structure. So small files do not need an additional
 * allocation.
 *
 * For pages with index > SHMEM_NR_DIRECT there is the pointer
 * i_indirect which points to a page which holds in the first half
 * doubly indirect blocks, in the second half triple indirect blocks:
 *
 * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the
 * following layout (for SHMEM_NR_DIRECT == 16):
 *
 * i_indirect -> dir --> 16-19
 * 	      |	     +-> 20-23
 * 	      |
 * 	      +-->dir2 --> 24-27
 * 	      |	       +-> 28-31
 * 	      |	       +-> 32-35
 * 	      |	       +-> 36-39
 * 	      |
 * 	      +-->dir3 --> 40-43
 * 	       	       +-> 44-47
 * 	      	       +-> 48-51
 * 	      	       +-> 52-55
 */
static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page)
{
	unsigned long offset;
	struct page **dir;
	struct page *subdir;

	if (index < SHMEM_NR_DIRECT) {
		shmem_swp_balance_unmap();
		return info->i_direct+index;
	}
	if (!info->i_indirect) {
		if (page) {
			info->i_indirect = *page;
			*page = NULL;
		}
		return NULL;			/* need another page */
	}

	index -= SHMEM_NR_DIRECT;
	offset = index % ENTRIES_PER_PAGE;
	index /= ENTRIES_PER_PAGE;
	dir = shmem_dir_map(info->i_indirect);

	if (index >= ENTRIES_PER_PAGE/2) {
		index -= ENTRIES_PER_PAGE/2;
		dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE;
		index %= ENTRIES_PER_PAGE;
		subdir = *dir;
		if (!subdir) {
			if (page) {
				*dir = *page;
				*page = NULL;
			}
			shmem_dir_unmap(dir);
			return NULL;		/* need another page */
		}
		shmem_dir_unmap(dir);
		dir = shmem_dir_map(subdir);
	}

	dir += index;
	subdir = *dir;
	if (!subdir) {
		if (!page || !(subdir = *page)) {
			shmem_dir_unmap(dir);
			return NULL;		/* need a page */
		}
		*dir = subdir;
		*page = NULL;
	}
	shmem_dir_unmap(dir);
	return shmem_swp_map(subdir) + offset;
}
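
/*
 * A worked lookup, under the same assumptions as the layout sketch above
 * but with ENTRIES_PER_PAGE = 1024 and SHMEM_NR_DIRECT = 16 (32-bit, 4kB
 * pages assumed; the index value is chosen arbitrarily):
 *
 *   index = 40000:  40000 - 16 = 39984; offset = 39984 % 1024 = 48;
 *   index = 39984 / 1024 = 39, which is < ENTRIES_PER_PAGE/2, so slot 39
 *   of i_indirect points (doubly indirect) to the swap-vector page, and
 *   the wanted entry is slot 48 of that page.
 *
 *   An index more than SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2 pages in
 *   lands in the second half of i_indirect instead, which adds one more
 *   level (triple indirection) before reaching the swap-vector page.
 */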

static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value)
{
	long incdec = value? 1: -1;

	entry->val = value;
	info->swapped += incdec;
	if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) {
		struct page *page = kmap_atomic_to_page(entry);
		set_page_private(page, page_private(page) + incdec);
	}
}

/**
 * shmem_swp_alloc - get the position of the swap entry for the page.
 * @info:	info structure for the inode
 * @index:	index of the page to find
 * @sgp:	check and recheck i_size? skip allocation?
 *
 * If the entry does not exist, allocate it.
 */
static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp)
{
	struct inode *inode = &info->vfs_inode;
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
	struct page *page = NULL;
	swp_entry_t *entry;

	if (sgp != SGP_WRITE &&
	    ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode))
		return ERR_PTR(-EINVAL);

	while (!(entry = shmem_swp_entry(info, index, &page))) {
		if (sgp == SGP_READ)
			return shmem_swp_map(ZERO_PAGE(0));
		/*
		 * Test used_blocks against 1 less than max_blocks, since we have 1 data
		 * page (and perhaps indirect index pages) yet to allocate:
		 * a waste to allocate index if we cannot allocate data.
		 */
		if (sbinfo->max_blocks) {
			if (percpu_counter_compare(&sbinfo->used_blocks,
						sbinfo->max_blocks - 1) >= 0)
				return ERR_PTR(-ENOSPC);
			percpu_counter_inc(&sbinfo->used_blocks);
			spin_lock(&inode->i_lock);
			inode->i_blocks += BLOCKS_PER_PAGE;
			spin_unlock(&inode->i_lock);
		}

		spin_unlock(&info->lock);
		page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping));
		spin_lock(&info->lock);

		if (!page) {
			shmem_free_blocks(inode, 1);
			return ERR_PTR(-ENOMEM);
		}
		if (sgp != SGP_WRITE &&
		    ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
			entry = ERR_PTR(-EINVAL);
			break;
		}
		if (info->next_index <= index)
			info->next_index = index + 1;
	}
	if (page) {
		/* another task gave its page, or truncated the file */
		shmem_free_blocks(inode, 1);
		shmem_dir_free(page);
	}
	if (info->next_index <= index && !IS_ERR(entry))
		info->next_index = index + 1;
	return entry;
}

/**
 * shmem_free_swp - free some swap entries in a directory
 * @dir:        pointer to the directory
 * @edir:       pointer after last entry of the directory
 * @punch_lock: pointer to spinlock when needed for the holepunch case
 */
static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir,
						spinlock_t *punch_lock)
{
	spinlock_t *punch_unlock = NULL;
	swp_entry_t *ptr;
	int freed = 0;

	for (ptr = dir; ptr < edir; ptr++) {
		if (ptr->val) {
			if (unlikely(punch_lock)) {
				punch_unlock = punch_lock;
				punch_lock = NULL;
				spin_lock(punch_unlock);
				if (!ptr->val)
					continue;
			}
			free_swap_and_cache(*ptr);
			*ptr = (swp_entry_t){0};
			freed++;
		}
	}
	if (punch_unlock)
		spin_unlock(punch_unlock);
	return freed;
}

static int shmem_map_and_free_swp(struct page *subdir, int offset,
		int limit, struct page ***dir, spinlock_t *punch_lock)
{
	swp_entry_t *ptr;
	int freed = 0;

	ptr = shmem_swp_map(subdir);
	for (; offset < limit; offset += LATENCY_LIMIT) {
		int size = limit - offset;
		if (size > LATENCY_LIMIT)
			size = LATENCY_LIMIT;
		freed += shmem_free_swp(ptr+offset, ptr+offset+size,
							punch_lock);
		if (need_resched()) {
			shmem_swp_unmap(ptr);
			if (*dir) {
				shmem_dir_unmap(*dir);
				*dir = NULL;
			}
			cond_resched();
			ptr = shmem_swp_map(subdir);
		}
	}
	shmem_swp_unmap(ptr);
	return freed;
}

static void shmem_free_pages(struct list_head *next)
{
	struct page *page;
	int freed = 0;

	do {
		page = container_of(next, struct page, lru);
		next = next->next;
		shmem_dir_free(page);
		freed++;
		if (freed >= LATENCY_LIMIT) {
			cond_resched();
			freed = 0;
		}
	} while (next);
}

static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	unsigned long idx;
	unsigned long size;
	unsigned long limit;
	unsigned long stage;
	unsigned long diroff;
	struct page **dir;
	struct page *topdir;
	struct page *middir;
	struct page *subdir;
	swp_entry_t *ptr;
	LIST_HEAD(pages_to_free);
	long nr_pages_to_free = 0;
	long nr_swaps_freed = 0;
	int offset;
	int freed;
	int punch_hole;
	spinlock_t *needs_lock;
	spinlock_t *punch_lock;
	unsigned long upper_limit;

	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
	idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	if (idx >= info->next_index)
		return;

	spin_lock(&info->lock);
	info->flags |= SHMEM_TRUNCATE;
	if (likely(end == (loff_t) -1)) {
		limit = info->next_index;
		upper_limit = SHMEM_MAX_INDEX;
		info->next_index = idx;
		needs_lock = NULL;
		punch_hole = 0;
	} else {
		if (end + 1 >= inode->i_size) {	/* we may free a little more */
			limit = (inode->i_size + PAGE_CACHE_SIZE - 1) >>
							PAGE_CACHE_SHIFT;
			upper_limit = SHMEM_MAX_INDEX;
		} else {
			limit = (end + 1) >> PAGE_CACHE_SHIFT;
			upper_limit = limit;
		}
		needs_lock = &info->lock;
		punch_hole = 1;
	}

	topdir = info->i_indirect;
	if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) {
		info->i_indirect = NULL;
		nr_pages_to_free++;
		list_add(&topdir->lru, &pages_to_free);
	}
	spin_unlock(&info->lock);

	if (info->swapped && idx < SHMEM_NR_DIRECT) {
		ptr = info->i_direct;
		size = limit;
		if (size > SHMEM_NR_DIRECT)
			size = SHMEM_NR_DIRECT;
		nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size, needs_lock);
	}

	/*
	 * If there are no indirect blocks or we are punching a hole
	 * below indirect blocks, nothing to be done.
	 */
	if (!topdir || limit <= SHMEM_NR_DIRECT)
		goto done2;

	/*
	 * The truncation case has already dropped info->lock, and we're safe
	 * because i_size and next_index have already been lowered, preventing
	 * access beyond.  But in the punch_hole case, we still need to take
	 * the lock when updating the swap directory, because there might be
	 * racing accesses by shmem_getpage(SGP_CACHE), shmem_unuse_inode or
	 * shmem_writepage.  However, whenever we find we can remove a whole
	 * directory page (not at the misaligned start or end of the range),
	 * we first NULLify its pointer in the level above, and then have no
	 * need to take the lock when updating its contents: needs_lock and
	 * punch_lock (either pointing to info->lock or NULL) manage this.
	 */

	upper_limit -= SHMEM_NR_DIRECT;
	limit -= SHMEM_NR_DIRECT;
	idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0;
	offset = idx % ENTRIES_PER_PAGE;
	idx -= offset;

	dir = shmem_dir_map(topdir);
	stage = ENTRIES_PER_PAGEPAGE/2;
	if (idx < ENTRIES_PER_PAGEPAGE/2) {
		middir = topdir;
		diroff = idx/ENTRIES_PER_PAGE;
	} else {
		dir += ENTRIES_PER_PAGE/2;
		dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE;
		while (stage <= idx)
			stage += ENTRIES_PER_PAGEPAGE;
		middir = *dir;
		if (*dir) {
			diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) %
				ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE;
			if (!diroff && !offset && upper_limit >= stage) {
				if (needs_lock) {
					spin_lock(needs_lock);
					*dir = NULL;
					spin_unlock(needs_lock);
					needs_lock = NULL;
				} else
					*dir = NULL;
				nr_pages_to_free++;
				list_add(&middir->lru, &pages_to_free);
			}
			shmem_dir_unmap(dir);
			dir = shmem_dir_map(middir);
		} else {
			diroff = 0;
			offset = 0;
			idx = stage;
		}
	}

	for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) {
		if (unlikely(idx == stage)) {
			shmem_dir_unmap(dir);
			dir = shmem_dir_map(topdir) +
			    ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
			while (!*dir) {
				dir++;
				idx += ENTRIES_PER_PAGEPAGE;
				if (idx >= limit)
					goto done1;
			}
			stage = idx + ENTRIES_PER_PAGEPAGE;
			middir = *dir;
			if (punch_hole)
				needs_lock = &info->lock;
			if (upper_limit >= stage) {
				if (needs_lock) {
					spin_lock(needs_lock);
					*dir = NULL;
					spin_unlock(needs_lock);
					needs_lock = NULL;
				} else
					*dir = NULL;
				nr_pages_to_free++;
				list_add(&middir->lru, &pages_to_free);
			}
			shmem_dir_unmap(dir);
			cond_resched();
			dir = shmem_dir_map(middir);
			diroff = 0;
		}
		punch_lock = needs_lock;
		subdir = dir[diroff];
		if (subdir && !offset && upper_limit-idx >= ENTRIES_PER_PAGE) {
			if (needs_lock) {
				spin_lock(needs_lock);
				dir[diroff] = NULL;
				spin_unlock(needs_lock);
				punch_lock = NULL;
			} else
				dir[diroff] = NULL;
			nr_pages_to_free++;
			list_add(&subdir->lru, &pages_to_free);
		}
		if (subdir && page_private(subdir) /* has swap entries */) {
			size = limit - idx;
			if (size > ENTRIES_PER_PAGE)
				size = ENTRIES_PER_PAGE;
			freed = shmem_map_and_free_swp(subdir,
					offset, size, &dir, punch_lock);
			if (!dir)
				dir = shmem_dir_map(middir);
			nr_swaps_freed += freed;
			if (offset || punch_lock) {
				spin_lock(&info->lock);
				set_page_private(subdir,
					page_private(subdir) - freed);
				spin_unlock(&info->lock);
			} else
				BUG_ON(page_private(subdir) != freed);
		}
		offset = 0;
	}
done1:
	shmem_dir_unmap(dir);
done2:
	if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) {
		/*
		 * Call truncate_inode_pages again: racing shmem_unuse_inode
		 * may have swizzled a page in from swap since
		 * truncate_pagecache or generic_delete_inode did it, before we
		 * lowered next_index.  Also, though shmem_getpage checks
		 * i_size before adding to cache, no recheck after: so fix the
		 * narrow window there too.
		 *
		 * Recalling truncate_inode_pages_range and unmap_mapping_range
		 * every time for punch_hole (which never got a chance to clear
		 * SHMEM_PAGEIN at the start of vmtruncate_range) is expensive,
		 * yet hardly ever necessary: try to optimize them out later.
		 */
		truncate_inode_pages_range(inode->i_mapping, start, end);
		if (punch_hole)
			unmap_mapping_range(inode->i_mapping, start,
							end - start, 1);
	}

	spin_lock(&info->lock);
	info->flags &= ~SHMEM_TRUNCATE;
	info->swapped -= nr_swaps_freed;
	if (nr_pages_to_free)
		shmem_free_blocks(inode, nr_pages_to_free);
	shmem_recalc_inode(inode);
	spin_unlock(&info->lock);

	/*
	 * Empty swap vector directory pages to be freed?
	 */
	if (!list_empty(&pages_to_free)) {
		pages_to_free.prev->next = NULL;
		shmem_free_pages(pages_to_free.next);
	}
}

static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	loff_t newsize = attr->ia_size;
	int error;

	error = inode_change_ok(inode, attr);
	if (error)
		return error;

	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)
					&& newsize != inode->i_size) {
		struct page *page = NULL;

		if (newsize < inode->i_size) {
			/*
			 * If truncating down to a partial page, then
			 * if that page is already allocated, hold it
			 * in memory until the truncation is over, so
			 * truncate_partial_page cannot miss it were
			 * it assigned to swap.
			 */
			if (newsize & (PAGE_CACHE_SIZE-1)) {
				(void) shmem_getpage(inode,
					newsize >> PAGE_CACHE_SHIFT,
						&page, SGP_READ, NULL);
				if (page)
					unlock_page(page);
			}
			/*
			 * Reset SHMEM_PAGEIN flag so that shmem_truncate can
			 * detect if any pages might have been added to cache
			 * after truncate_inode_pages.  But we needn't bother
			 * if it's being fully truncated to zero-length: the
			 * nrpages check is efficient enough in that case.
			 */
			if (newsize) {
				struct shmem_inode_info *info = SHMEM_I(inode);
				spin_lock(&info->lock);
				info->flags &= ~SHMEM_PAGEIN;
				spin_unlock(&info->lock);
			}
		}

		/* XXX(truncate): truncate_setsize should be called last */
		truncate_setsize(inode, newsize);
		if (page)
			page_cache_release(page);
		shmem_truncate_range(inode, newsize, (loff_t)-1);
	}

	setattr_copy(inode, attr);
#ifdef CONFIG_TMPFS_POSIX_ACL
	if (attr->ia_valid & ATTR_MODE)
		error = generic_acl_chmod(inode);
#endif
	return error;
}

static void shmem_evict_inode(struct inode *inode)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_xattr *xattr, *nxattr;

	if (inode->i_mapping->a_ops == &shmem_aops) {
		truncate_inode_pages(inode->i_mapping, 0);
		shmem_unacct_size(info->flags, inode->i_size);
		inode->i_size = 0;
		shmem_truncate_range(inode, 0, (loff_t)-1);
		if (!list_empty(&info->swaplist)) {
			mutex_lock(&shmem_swaplist_mutex);
			list_del_init(&info->swaplist);
			mutex_unlock(&shmem_swaplist_mutex);
		}
	}

	list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) {
		kfree(xattr->name);
		kfree(xattr);
	}
	BUG_ON(inode->i_blocks);
	shmem_free_inode(inode->i_sb);
	end_writeback(inode);
}

static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir)
{
	swp_entry_t *ptr;

	for (ptr = dir; ptr < edir; ptr++) {
		if (ptr->val == entry.val)
			return ptr - dir;
	}
	return -1;
}

static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
{
	struct address_space *mapping;
	unsigned long idx;
	unsigned long size;
	unsigned long limit;
	unsigned long stage;
	struct page **dir;
	struct page *subdir;
	swp_entry_t *ptr;
	int offset;
	int error;

	idx = 0;
	ptr = info->i_direct;
	spin_lock(&info->lock);
	if (!info->swapped) {
		list_del_init(&info->swaplist);
		goto lost2;
	}
	limit = info->next_index;
	size = limit;
	if (size > SHMEM_NR_DIRECT)
		size = SHMEM_NR_DIRECT;
	offset = shmem_find_swp(entry, ptr, ptr+size);
	if (offset >= 0) {
		shmem_swp_balance_unmap();
		goto found;
	}
	if (!info->i_indirect)
		goto lost2;

	dir = shmem_dir_map(info->i_indirect);
	stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2;

	for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
		if (unlikely(idx == stage)) {
			shmem_dir_unmap(dir-1);
			if (cond_resched_lock(&info->lock)) {
				/* check it has not been truncated */
				if (limit > info->next_index) {
					limit = info->next_index;
					if (idx >= limit)
						goto lost2;
				}
			}
			dir = shmem_dir_map(info->i_indirect) +
			    ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
			while (!*dir) {
				dir++;
				idx += ENTRIES_PER_PAGEPAGE;
				if (idx >= limit)
					goto lost1;
			}
			stage = idx + ENTRIES_PER_PAGEPAGE;
			subdir = *dir;
			shmem_dir_unmap(dir);
			dir = shmem_dir_map(subdir);
		}
		subdir = *dir;
		if (subdir && page_private(subdir)) {
			ptr = shmem_swp_map(subdir);
			size = limit - idx;
			if (size > ENTRIES_PER_PAGE)
				size = ENTRIES_PER_PAGE;
			offset = shmem_find_swp(entry, ptr, ptr+size);
			shmem_swp_unmap(ptr);
			if (offset >= 0) {
				shmem_dir_unmap(dir);
				ptr = shmem_swp_map(subdir);
				goto found;
			}
		}
	}
lost1:
	shmem_dir_unmap(dir-1);
lost2:
	spin_unlock(&info->lock);
	return 0;
found:
	idx += offset;
	ptr += offset;

	/*
	 * Move _head_ to start search for next from here.
	 * But be careful: shmem_evict_inode checks list_empty without taking
	 * mutex, and there's an instant in list_move_tail when info->swaplist
	 * would appear empty, if it were the only one on shmem_swaplist.  We
	 * could avoid doing it if inode NULL; or use this minor optimization.
	 */
	if (shmem_swaplist.next != &info->swaplist)
		list_move_tail(&shmem_swaplist, &info->swaplist);

	/*
	 * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
	 * but also to hold up shmem_evict_inode(): so inode cannot be freed
	 * beneath us (pagelock doesn't help until the page is in pagecache).
	 */
	mapping = info->vfs_inode.i_mapping;
	error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT);
	/* which does mem_cgroup_uncharge_cache_page on error */

	if (error == -EEXIST) {
		struct page *filepage = find_get_page(mapping, idx);
		error = 1;
		if (filepage) {
			/*
			 * There might be a more uptodate page coming down
			 * from a stacked writepage: forget our swappage if so.
			 */
			if (PageUptodate(filepage))
				error = 0;
			page_cache_release(filepage);
		}
	}
	if (!error) {
		delete_from_swap_cache(page);
		set_page_dirty(page);
		info->flags |= SHMEM_PAGEIN;
		shmem_swp_set(info, ptr, 0);
		swap_free(entry);
		error = 1;	/* not an error, but entry was found */
	}
	shmem_swp_unmap(ptr);
	spin_unlock(&info->lock);
	return error;
}

/*
 * shmem_unuse() searches for a possibly swapped-out shmem page.
 */
int shmem_unuse(swp_entry_t entry, struct page *page)
{
	struct list_head *p, *next;
	struct shmem_inode_info *info;
	int found = 0;
	int error;

	/*
	 * Charge page using GFP_KERNEL while we can wait, before taking
	 * the shmem_swaplist_mutex which might hold up shmem_writepage().
	 * Charged back to the user (not to caller) when swap account is used.
	 * add_to_page_cache() will be called with GFP_NOWAIT.
	 */
	error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
	if (error)
		goto out;
	/*
	 * Try to preload while we can wait, to not make a habit of
	 * draining atomic reserves; but don't latch on to this cpu,
	 * it's okay if sometimes we get rescheduled after this.
	 */
	error = radix_tree_preload(GFP_KERNEL);
	if (error)
		goto uncharge;
	radix_tree_preload_end();

	mutex_lock(&shmem_swaplist_mutex);
	list_for_each_safe(p, next, &shmem_swaplist) {
		info = list_entry(p, struct shmem_inode_info, swaplist);
		found = shmem_unuse_inode(info, entry, page);
		cond_resched();
		if (found)
			break;
	}
	mutex_unlock(&shmem_swaplist_mutex);

uncharge:
	if (!found)
		mem_cgroup_uncharge_cache_page(page);
	if (found < 0)
		error = found;
out:
	unlock_page(page);
	page_cache_release(page);
	return error;
}

/*
 * Move the page from the page cache to the swap cache.
 */
static int shmem_writepage(struct page *page, struct writeback_control *wbc)
{
	struct shmem_inode_info *info;
	swp_entry_t *entry, swap;
	struct address_space *mapping;
	unsigned long index;
	struct inode *inode;

	BUG_ON(!PageLocked(page));
	mapping = page->mapping;
	index = page->index;
	inode = mapping->host;
	info = SHMEM_I(inode);
	if (info->flags & VM_LOCKED)
		goto redirty;
	if (!total_swap_pages)
		goto redirty;

	/*
	 * shmem_backing_dev_info's capabilities prevent regular writeback or
	 * sync from ever calling shmem_writepage; but a stacking filesystem
	 * may use the ->writepage of its underlying filesystem, in which case
	 * tmpfs should write out to swap only in response to memory pressure,
	 * and not for the writeback threads or sync.  However, in those cases,
	 * we do still want to check if there's a redundant swappage to be
	 * discarded.
	 */
	if (wbc->for_reclaim)
		swap = get_swap_page();
	else
		swap.val = 0;

	/*
	 * Add inode to shmem_unuse()'s list of swapped-out inodes,
	 * if it's not already there.  Do it now because we cannot take
	 * mutex while holding spinlock, and must do so before the page
	 * is moved to swap cache, when its pagelock no longer protects
	 * the inode from eviction.  But don't unlock the mutex until
	 * we've taken the spinlock, because shmem_unuse_inode() will
	 * prune a !swapped inode from the swaplist under both locks.
	 */
	if (swap.val) {
		mutex_lock(&shmem_swaplist_mutex);
		if (list_empty(&info->swaplist))
			list_add_tail(&info->swaplist, &shmem_swaplist);
	}

	spin_lock(&info->lock);
	if (swap.val)
		mutex_unlock(&shmem_swaplist_mutex);

	if (index >= info->next_index) {
		BUG_ON(!(info->flags & SHMEM_TRUNCATE));
		goto unlock;
	}
	entry = shmem_swp_entry(info, index, NULL);
	if (entry->val) {
		/*
		 * The more uptodate page coming down from a stacked
		 * writepage should replace our old swappage.
		 */
		free_swap_and_cache(*entry);
		shmem_swp_set(info, entry, 0);
	}
	shmem_recalc_inode(inode);

	if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
		delete_from_page_cache(page);
		shmem_swp_set(info, entry, swap.val);
		shmem_swp_unmap(entry);
		swap_shmem_alloc(swap);
		spin_unlock(&info->lock);
		BUG_ON(page_mapped(page));
		swap_writepage(page, wbc);
		return 0;
	}

	shmem_swp_unmap(entry);
unlock:
	spin_unlock(&info->lock);
	/*
	 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
	 * clear SWAP_HAS_CACHE flag.
	 */
	swapcache_free(swap, NULL);
redirty:
	set_page_dirty(page);
	if (wbc->for_reclaim)
		return AOP_WRITEPAGE_ACTIVATE;	/* Return with page locked */
	unlock_page(page);
	return 0;
}

#ifdef CONFIG_NUMA
#ifdef CONFIG_TMPFS
static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
	char buffer[64];

	if (!mpol || mpol->mode == MPOL_DEFAULT)
		return;		/* show nothing */

	mpol_to_str(buffer, sizeof(buffer), mpol, 1);

	seq_printf(seq, ",mpol=%s", buffer);
}

static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
{
	struct mempolicy *mpol = NULL;
	if (sbinfo->mpol) {
		spin_lock(&sbinfo->stat_lock);	/* prevent replace/use races */
		mpol = sbinfo->mpol;
		mpol_get(mpol);
		spin_unlock(&sbinfo->stat_lock);
	}
	return mpol;
}
#endif /* CONFIG_TMPFS */

static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
			struct shmem_inode_info *info, unsigned long idx)
{
	struct mempolicy mpol, *spol;
	struct vm_area_struct pvma;
	struct page *page;

	spol = mpol_cond_copy(&mpol,
				mpol_shared_policy_lookup(&info->policy, idx));

	/* Create a pseudo vma that just contains the policy */
	pvma.vm_start = 0;
	pvma.vm_pgoff = idx;
	pvma.vm_ops = NULL;
	pvma.vm_policy = spol;
	page = swapin_readahead(entry, gfp, &pvma, 0);
	return page;
}

static struct page *shmem_alloc_page(gfp_t gfp,
			struct shmem_inode_info *info, unsigned long idx)
{
	struct vm_area_struct pvma;

	/* Create a pseudo vma that just contains the policy */
	pvma.vm_start = 0;
	pvma.vm_pgoff = idx;
	pvma.vm_ops = NULL;
	pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);

	/*
	 * alloc_page_vma() will drop the shared policy reference
	 */
	return alloc_page_vma(gfp, &pvma, 0);
}
#else /* !CONFIG_NUMA */
#ifdef CONFIG_TMPFS
static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *p)