/*
 * Resizable virtual memory filesystem for Linux.
 *
 * Copyright (C) 2000 Linus Torvalds.
 *		 2000 Transmeta Corp.
 *		 2000-2001 Christoph Rohland
 *		 2000-2001 SAP AG
 *		 2002 Red Hat Inc.
 * Copyright (C) 2002-2005 Hugh Dickins.
 * Copyright (C) 2002-2005 VERITAS Software Corporation.
 * Copyright (C) 2004 Andi Kleen, SuSE Labs
 *
 * Extended attribute support for tmpfs:
 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
 *
 * tiny-shmem:
 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
 *
 * This file is released under the GPL.
 */

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/vfs.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/percpu_counter.h>
#include <linux/swap.h>

static struct vfsmount *shm_mnt;

#ifdef CONFIG_SHMEM
/*
 * This virtual memory filesystem is heavily based on the ramfs. It
 * extends ramfs by the ability to use swap and honor resource limits,
 * which makes it a completely usable filesystem.
 */

#include <linux/xattr.h>
#include <linux/exportfs.h>
#include <linux/posix_acl.h>
#include <linux/generic_acl.h>
#include <linux/mman.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/shmem_fs.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/splice.h>
#include <linux/security.h>
#include <linux/swapops.h>
#include <linux/mempolicy.h>
#include <linux/namei.h>
#include <linux/ctype.h>
#include <linux/migrate.h>
#include <linux/highmem.h>
#include <linux/seq_file.h>
#include <linux/magic.h>

#include <asm/uaccess.h>
#include <asm/div64.h>
#include <asm/pgtable.h>

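/* inode->i_blocks is accounted in 512-byte blocks */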
#define BLOCKS_PER_PAGE  (PAGE_CACHE_SIZE/512)
#define VM_ACCT(size)    (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)

/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20

struct shmem_xattr {
	struct list_head list;	/* anchored by shmem_inode_info->xattr_list */
	char *name;		/* xattr name */
	size_t size;		/* length of value */
	char value[0];		/* value bytes, stored inline */
};

/* Flag allocation requirements to shmem_getpage */
enum sgp_type {
	SGP_READ,	/* don't exceed i_size, don't allocate page */
	SGP_CACHE,	/* don't exceed i_size, may allocate page */
	SGP_DIRTY,	/* like SGP_CACHE, but set new page dirty */
	SGP_WRITE,	/* may exceed i_size, may allocate page */
};

#ifdef CONFIG_TMPFS
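/*
 * Mount defaults when size= and nr_inodes= are not given: half of RAM
 * for blocks, and inodes bounded by lowmem (each inode costs lowmem).
 */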
static unsigned long shmem_default_max_blocks(void)
{
	return totalram_pages / 2;
}

static unsigned long shmem_default_max_inodes(void)
{
	return min(totalram_pages - totalhigh_pages, totalram_pages / 2);
}
#endif

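/*
 * shmem_getpage() is the usual entry point: it calls shmem_getpage_gfp()
 * with the mapping's own gfp mask for any page allocation needed.
 */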
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
	struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);

static inline int shmem_getpage(struct inode *inode, pgoff_t index,
	struct page **pagep, enum sgp_type sgp, int *fault_type)
{
	return shmem_getpage_gfp(inode, index, pagep, sgp,
			mapping_gfp_mask(inode->i_mapping), fault_type);
}

static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
{
	return sb->s_fs_info;
}

/*
 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
 * for shared memory and for shared anonymous (/dev/zero) mappings
 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
 * consistent with the pre-accounting of private mappings ...
 */
static inline int shmem_acct_size(unsigned long flags, loff_t size)
{
	return (flags & VM_NORESERVE) ?
		0 : security_vm_enough_memory_kern(VM_ACCT(size));
}

static inline void shmem_unacct_size(unsigned long flags, loff_t size)
{
	if (!(flags & VM_NORESERVE))
		vm_unacct_memory(VM_ACCT(size));
}

/*
 * ... whereas tmpfs objects are accounted incrementally as
 * pages are allocated, in order to allow huge sparse files.
 * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
 */
static inline int shmem_acct_block(unsigned long flags)
{
	return (flags & VM_NORESERVE) ?
		security_vm_enough_memory_kern(VM_ACCT(PAGE_CACHE_SIZE)) : 0;
}

static inline void shmem_unacct_blocks(unsigned long flags, long pages)
{
	if (flags & VM_NORESERVE)
		vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE));
}

static const struct super_operations shmem_ops;
static const struct address_space_operations shmem_aops;
static const struct file_operations shmem_file_operations;
static const struct inode_operations shmem_inode_operations;
static const struct inode_operations shmem_dir_inode_operations;
static const struct inode_operations shmem_special_inode_operations;
static const struct vm_operations_struct shmem_vm_ops;

static struct backing_dev_info shmem_backing_dev_info  __read_mostly = {
	.ra_pages	= 0,	/* No readahead */
	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
};

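/* Inodes with pages out on swap, scanned by shmem_unuse() at swapoff time */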
static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);

static void shmem_free_blocks(struct inode *inode, long pages)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
	if (sbinfo->max_blocks) {
		percpu_counter_add(&sbinfo->used_blocks, -pages);
		inode->i_blocks -= pages*BLOCKS_PER_PAGE;
	}
}

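/* Inode accounting: max_inodes == 0 means "no limit", so nothing is counted */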
static int shmem_reserve_inode(struct super_block *sb)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	if (sbinfo->max_inodes) {
		spin_lock(&sbinfo->stat_lock);
		if (!sbinfo->free_inodes) {
			spin_unlock(&sbinfo->stat_lock);
			return -ENOSPC;
		}
		sbinfo->free_inodes--;
		spin_unlock(&sbinfo->stat_lock);
	}
	return 0;
}

static void shmem_free_inode(struct super_block *sb)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	if (sbinfo->max_inodes) {
		spin_lock(&sbinfo->stat_lock);
		sbinfo->free_inodes++;
		spin_unlock(&sbinfo->stat_lock);
	}
}

/**
 * shmem_recalc_inode - recalculate the size of an inode
 * @inode: inode to recalc
 *
 * We have to calculate the free blocks since the mm can drop
 * undirtied hole pages behind our back.
 *
 * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
 *
 * It has to be called with the spinlock held.
 */
static void shmem_recalc_inode(struct inode *inode)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	long freed;

	freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
	if (freed > 0) {
		info->alloced -= freed;
		shmem_unacct_blocks(info->flags, freed);
		shmem_free_blocks(inode, freed);
	}
}

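/*
 * At this toy stage of the rework, swap entries are kept only for the
 * first SHMEM_NR_DIRECT pages of a file, directly in info->i_direct[].
 */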
static void shmem_put_swap(struct shmem_inode_info *info, pgoff_t index,
			   swp_entry_t swap)
{
	if (index < SHMEM_NR_DIRECT)
		info->i_direct[index] = swap;
}

static swp_entry_t shmem_get_swap(struct shmem_inode_info *info, pgoff_t index)
{
	return (index < SHMEM_NR_DIRECT) ?
		info->i_direct[index] : (swp_entry_t){0};
}

void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
	struct address_space *mapping = inode->i_mapping;
	struct shmem_inode_info *info = SHMEM_I(inode);
	pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	pgoff_t end = (lend >> PAGE_CACHE_SHIFT);
	pgoff_t index;
	swp_entry_t swap;

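	/*
	 * First pass: drop whatever the page cache holds right now;
	 * swap entries in the direct range are freed below under
	 * info->lock.
	 */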
	truncate_inode_pages_range(mapping, lstart, lend);

	if (end > SHMEM_NR_DIRECT)
		end = SHMEM_NR_DIRECT;

	spin_lock(&info->lock);
	for (index = start; index < end; index++) {
		swap = shmem_get_swap(info, index);
		if (swap.val) {
			free_swap_and_cache(swap);
			shmem_put_swap(info, index, (swp_entry_t){0});
			info->swapped--;
		}
	}

	if (mapping->nrpages) {
		spin_unlock(&info->lock);
		/*
		 * A page may have meanwhile sneaked in from swap.
		 */
		truncate_inode_pages_range(mapping, lstart, lend);
		spin_lock(&info->lock);
	}

	shmem_recalc_inode(inode);
	spin_unlock(&info->lock);

	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);

static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error;

	error = inode_change_ok(inode, attr);
	if (error)
		return error;

	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
		loff_t oldsize = inode->i_size;
		loff_t newsize = attr->ia_size;
		struct page *page = NULL;

		if (newsize < oldsize) {
			/*
			 * If truncating down to a partial page, then
			 * if that page is already allocated, hold it
			 * in memory until the truncation is over, so
			 * truncate_partial_page cannot miss it, even
			 * if it had been assigned to swap.
			 */
			if (newsize & (PAGE_CACHE_SIZE-1)) {
				(void) shmem_getpage(inode,
					newsize >> PAGE_CACHE_SHIFT,
						&page, SGP_READ, NULL);
				if (page)
					unlock_page(page);
			}
		}
		if (newsize != oldsize) {
			i_size_write(inode, newsize);
			inode->i_ctime = inode->i_mtime = CURRENT_TIME;
		}
		if (newsize < oldsize) {
			loff_t holebegin = round_up(newsize, PAGE_SIZE);
			unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
			shmem_truncate_range(inode, newsize, (loff_t)-1);
			/* unmap again to remove racily COWed private pages */
			unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
		}
		if (page)
			page_cache_release(page);
	}

	setattr_copy(inode, attr);
#ifdef CONFIG_TMPFS_POSIX_ACL
	if (attr->ia_valid & ATTR_MODE)
		error = generic_acl_chmod(inode);
#endif
	return error;
}

static void shmem_evict_inode(struct inode *inode)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_xattr *xattr, *nxattr;

	if (inode->i_mapping->a_ops == &shmem_aops) {
		shmem_unacct_size(info->flags, inode->i_size);
		inode->i_size = 0;
		shmem_truncate_range(inode, 0, (loff_t)-1);
		if (!list_empty(&info->swaplist)) {
			mutex_lock(&shmem_swaplist_mutex);
			list_del_init(&info->swaplist);
			mutex_unlock(&shmem_swaplist_mutex);
		}
	}

	list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) {
		kfree(xattr->name);
		kfree(xattr);
	}
	BUG_ON(inode->i_blocks);
	shmem_free_inode(inode->i_sb);
	end_writeback(inode);
}

static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
{
	struct address_space *mapping = info->vfs_inode.i_mapping;
	unsigned long idx;
	int error;

	for (idx = 0; idx < SHMEM_NR_DIRECT; idx++)
		if (shmem_get_swap(info, idx).val == entry.val)
			goto found;
	return 0;
found:
	spin_lock(&info->lock);
	if (shmem_get_swap(info, idx).val != entry.val) {
		spin_unlock(&info->lock);
		return 0;
	}

	/*
	 * Move _head_ to start search for next from here.
	 * But be careful: shmem_evict_inode checks list_empty without taking
	 * mutex, and there's an instant in list_move_tail when info->swaplist
	 * would appear empty, if it were the only one on shmem_swaplist.
	 */
	if (shmem_swaplist.next != &info->swaplist)
		list_move_tail(&shmem_swaplist, &info->swaplist);

	/*
	 * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
	 * but also to hold up shmem_evict_inode(): so inode cannot be freed
	 * beneath us (pagelock doesn't help until the page is in pagecache).
	 */
	error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT);
	/* which does mem_cgroup_uncharge_cache_page on error */

	if (error != -ENOMEM) {
		delete_from_swap_cache(page);
		set_page_dirty(page);
		shmem_put_swap(info, idx, (swp_entry_t){0});
		info->swapped--;
		swap_free(entry);
		error = 1;	/* not an error, but entry was found */
	}
	spin_unlock(&info->lock);
	return error;
}

/*
 * shmem_unuse() searches for a possibly swapped-out shmem page.
 */
int shmem_unuse(swp_entry_t entry, struct page *page)
{
	struct list_head *p, *next;
	struct shmem_inode_info *info;
	int found = 0;
	int error;

	/*
	 * Charge page using GFP_KERNEL while we can wait, before taking
	 * the shmem_swaplist_mutex which might hold up shmem_writepage().
	 * Charged back to the user (not to caller) when swap account is used.
	 * add_to_page_cache() will be called with GFP_NOWAIT.
	 */
	error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
	if (error)
		goto out;
	/*
	 * Try to preload while we can wait, to not make a habit of
	 * draining atomic reserves; but don't latch on to this cpu,
	 * it's okay if sometimes we get rescheduled after this.
	 */
	error = radix_tree_preload(GFP_KERNEL);
	if (error)
		goto uncharge;
	radix_tree_preload_end();

	mutex_lock(&shmem_swaplist_mutex);
	list_for_each_safe(p, next, &shmem_swaplist) {
		info = list_entry(p, struct shmem_inode_info, swaplist);
		if (!info->swapped) {
			spin_lock(&info->lock);
			if (!info->swapped)
				list_del_init(&info->swaplist);
			spin_unlock(&info->lock);
		}
		if (info->swapped)
			found = shmem_unuse_inode(info, entry, page);
		cond_resched();
		if (found)
			break;
	}
	mutex_unlock(&shmem_swaplist_mutex);

uncharge:
	if (!found)
		mem_cgroup_uncharge_cache_page(page);
	if (found < 0)
		error = found;
out:
	unlock_page(page);
	page_cache_release(page);
	return error;
}

/*
 * Move the page from the page cache to the swap cache.
 */
static int shmem_writepage(struct page *page, struct writeback_control *wbc)
{
	struct shmem_inode_info *info;
	swp_entry_t swap, oswap;
	struct address_space *mapping;
	unsigned long index;
	struct inode *inode;

	BUG_ON(!PageLocked(page));
	mapping = page->mapping;
	index = page->index;
	inode = mapping->host;
	info = SHMEM_I(inode);
	if (info->flags & VM_LOCKED)
		goto redirty;
	if (!total_swap_pages)
		goto redirty;

	/*
	 * shmem_backing_dev_info's capabilities prevent regular writeback or
	 * sync from ever calling shmem_writepage; but a stacking filesystem
	 * might use ->writepage of its underlying filesystem, in which case
	 * tmpfs should write out to swap only in response to memory pressure,
	 * and not for the writeback threads or sync.
	 */
	if (!wbc->for_reclaim) {
		WARN_ON_ONCE(1);	/* Still happens? Tell us about it! */
		goto redirty;
	}

	/*
	 * Just for this patch, we have a toy implementation,
	 * which can swap out only the first SHMEM_NR_DIRECT pages:
	 * for simple demonstration of where we need to think about swap.
	 */
	if (index >= SHMEM_NR_DIRECT)
		goto redirty;

	swap = get_swap_page();
	if (!swap.val)
		goto redirty;

	/*
	 * Add inode to shmem_unuse()'s list of swapped-out inodes,
	 * if it's not already there.  Do it now because we cannot take
	 * mutex while holding spinlock, and must do so before the page
	 * is moved to swap cache, when its pagelock no longer protects
	 * the inode from eviction.  But don't unlock the mutex until
	 * we've taken the spinlock, because shmem_unuse_inode() will
	 * prune a !swapped inode from the swaplist under both locks.
	 */
	mutex_lock(&shmem_swaplist_mutex);
	if (list_empty(&info->swaplist))
		list_add_tail(&info->swaplist, &shmem_swaplist);

	spin_lock(&info->lock);
	mutex_unlock(&shmem_swaplist_mutex);

	oswap = shmem_get_swap(info, index);
	if (oswap.val) {
		WARN_ON_ONCE(1);	/* Still happens? Tell us about it! */
		free_swap_and_cache(oswap);
		shmem_put_swap(info, index, (swp_entry_t){0});
		info->swapped--;
	}
	shmem_recalc_inode(inode);

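	/* info->lock is still held, so the swap cache insertion cannot sleep */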
	if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
		delete_from_page_cache(page);
		shmem_put_swap(info, index, swap);
		info->swapped++;
		swap_shmem_alloc(swap);
		spin_unlock(&info->lock);
		BUG_ON(page_mapped(page));
		swap_writepage(page, wbc);
		return 0;
	}

	spin_unlock(&info->lock);
	swapcache_free(swap, NULL);
redirty:
	set_page_dirty(page);
	if (wbc->for_reclaim)
		return AOP_WRITEPAGE_ACTIVATE;	/* Return with page locked */
	unlock_page(page);
	return 0;
}

#ifdef CONFIG_NUMA
#ifdef CONFIG_TMPFS
static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
	char buffer[64];

	if (!mpol || mpol->mode == MPOL_DEFAULT)
		return;		/* show nothing */

	mpol_to_str(buffer, sizeof(buffer), mpol, 1);

	seq_printf(seq, ",mpol=%s", buffer);
}

static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
{
	struct mempolicy *mpol = NULL;
	if (sbinfo->mpol) {
		spin_lock(&sbinfo->stat_lock);	/* prevent replace/use races */
		mpol = sbinfo->mpol;
		mpol_get(mpol);
		spin_unlock(&sbinfo->stat_lock);
	}
	return mpol;
}
#endif /* CONFIG_TMPFS */

static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
			struct shmem_inode_info *info, unsigned long idx)
{
	struct mempolicy mpol, *spol;
	struct vm_area_struct pvma;
	struct page *page;

	spol = mpol_cond_copy(&mpol,
				mpol_shared_policy_lookup(&info->policy, idx));

	/* Create a pseudo vma that just contains the policy */
	pvma.vm_start = 0;
	pvma.vm_pgoff = idx;
	pvma.vm_ops = NULL;
	pvma.vm_policy = spol;
	page = swapin_readahead(entry, gfp, &pvma, 0);
	return page;
}

static struct page *shmem_alloc_page(gfp_t gfp,
			struct shmem_inode_info *info, unsigned long idx)
{
	struct vm_area_struct pvma;

	/* Create a pseudo vma that just contains the policy */
	pvma.vm_start = 0;
	pvma.vm_pgoff = idx;
	pvma.vm_ops = NULL;
	pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);

	/*
	 * alloc_page_vma() will drop the shared policy reference
	 */
	return alloc_page_vma(gfp, &pvma, 0);
}
#else /* !CONFIG_NUMA */
#ifdef CONFIG_TMPFS
static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *p)
{
}
#endif /* CONFIG_TMPFS */

static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
			struct shmem_inode_info *info, unsigned long idx)
{
	return swapin_readahead(entry, gfp, NULL, 0);
}

static inline struct page *shmem_alloc_page(gfp_t gfp,
			struct shmem_inode_info *info, unsigned long idx)
{
	return alloc_page(gfp);
}
#endif /* CONFIG_NUMA */

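/* Fallback for when the NUMA+TMPFS code above does not define this helper */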
#if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS)
static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
{
	return NULL;
}
#endif

/*
 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
 *
 * If we allocate a new one we do not mark it dirty. That's up to the
 * vm. If we swap it in we mark it dirty, since we also free the swap
 * entry: a page cannot live in both the swap cache and page cache.
 */
static int shmem_getpage_gfp(struct inode *inode, pgoff_t idx,
	struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)
{
	struct address_space *mapping = inode->i_mapping;
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo;
	struct page *page;
	struct page *prealloc_page = NULL;
	swp_entry_t swap;
	int error;

	if (idx > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
		return -EFBIG;
repeat:
	page = find_lock_page(mapping, idx);
	if (page) {
		/*
		 * Once we can get the page lock, it must be uptodate:
		 * if there were an error in reading back from swap,
		 * the page would not be inserted into the filecache.
		 */
		BUG_ON(!PageUptodate(page));
		goto done;
	}

	/*
	 * Try to preload while we can wait, to not make a habit of
	 * draining atomic reserves; but don't latch on to this cpu.
	 */
	error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
	if (error)
		goto out;
	radix_tree_preload_end();

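	/*
	 * Preallocate and charge a page while we may still sleep; if it
	 * turns out not to be needed, it is released at the "out" label.
	 */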
	if (sgp != SGP_READ && !prealloc_page) {
		prealloc_page = shmem_alloc_page(gfp, info, idx);
		if (prealloc_page) {
			SetPageSwapBacked(prealloc_page);
			if (mem_cgroup_cache_charge(prealloc_page,
					current->mm, GFP_KERNEL)) {
				page_cache_release(prealloc_page);
				prealloc_page = NULL;
			}
		}
	}

	spin_lock(&info->lock);
	shmem_recalc_inode(inode);
	swap = shmem_get_swap(info, idx);
	if (swap.val) {
		/* Look it up and read it in.. */
		page = lookup_swap_cache(swap);
		if (!page) {
			spin_unlock(&info->lock);
			/* here we actually do the io */
			if (fault_type)
				*fault_type |= VM_FAULT_MAJOR;
			page = shmem_swapin(swap, gfp, info, idx);
			if (!page) {
				swp_entry_t nswap = shmem_get_swap(info, idx);
				if (nswap.val == swap.val) {
					error = -ENOMEM;
					goto out;
				}
				goto repeat;
			}
			wait_on_page_locked(page);
			page_cache_release(page);
			goto repeat;
		}

		/* We have to do this with page locked to prevent races */
		if (!trylock_page(page)) {
			spin_unlock(&info->lock);
			wait_on_page_locked(page);
			page_cache_release(page);
			goto repeat;
		}
		if (PageWriteback(page)) {
			spin_unlock(&info->lock);
			wait_on_page_writeback(page);
			unlock_page(page);
			page_cache_release(page);
			goto repeat;
		}
		if (!PageUptodate(page)) {
			spin_unlock(&info->lock);
			unlock_page(page);
			page_cache_release(page);
			error = -EIO;
			goto out;
		}

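		/*
		 * Still under info->lock: the earlier radix_tree_preload
		 * makes this GFP_NOWAIT insertion unlikely to fail.
		 */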
		error = add_to_page_cache_locked(page, mapping,
						 idx, GFP_NOWAIT);
		if (error) {
			spin_unlock(&info->lock);
			if (error == -ENOMEM) {
				/*
				 * reclaim from proper memory cgroup and
				 * call memcg's OOM if needed.
				 */
				error = mem_cgroup_shmem_charge_fallback(
						page, current->mm, gfp);
				if (error) {
					unlock_page(page);
					page_cache_release(page);
					goto out;
				}
			}
			unlock_page(page);
			page_cache_release(page);
			goto repeat;
		}

		delete_from_swap_cache(page);
		shmem_put_swap(info, idx, (swp_entry_t){0});
		info->swapped--;
		spin_unlock(&info->lock);
		set_page_dirty(page);
		swap_free(swap);

	} else if (sgp == SGP_READ) {
		page = find_get_page(mapping, idx);
		if (page && !trylock_page(page)) {
			spin_unlock(&info->lock);
			wait_on_page_locked(page);
			page_cache_release(page);
			goto repeat;
		}
		spin_unlock(&info->lock);

	} else if (prealloc_page) {
		sbinfo = SHMEM_SB(inode->i_sb);
		if (sbinfo->max_blocks) {
			if (percpu_counter_compare(&sbinfo->used_blocks,
						sbinfo->max_blocks) >= 0 ||
			    shmem_acct_block(info->flags))
				goto nospace;
			percpu_counter_inc(&sbinfo->used_blocks);
			inode->i_blocks += BLOCKS_PER_PAGE;
		} else if (shmem_acct_block(info->flags))
			goto nospace;

		page = prealloc_page;
		prealloc_page = NULL;

		swap = shmem_get_swap(info, idx);
		if (swap.val)
			mem_cgroup_uncharge_cache_page(page);
		else
			error = add_to_page_cache_lru(page, mapping,
						idx, GFP_NOWAIT);
		/*
		 * At add_to_page_cache_lru() failure,
		 * uncharge will be done automatically.
		 */
		if (swap.val || error) {
			shmem_unacct_blocks(info->flags, 1);
			shmem_free_blocks(inode, 1);
			spin_unlock(&info->lock);
			page_cache_release(page);
			goto repeat;
		}

		info->alloced++;
		spin_unlock(&info->lock);
		clear_highpage(page);
		flush_dcache_page(page);
		SetPageUptodate(page);
		if (sgp == SGP_DIRTY)
			set_page_dirty(page);

	} else {
		spin_unlock(&info->lock);
		error = -ENOMEM;
		goto out;
	}
done:
	*pagep = page;
	error = 0;
out:
	if (prealloc_page) {
		mem_cgroup_uncharge_cache_page(prealloc_page);
		page_cache_release(prealloc_page);
	}
	return error;

nospace:
	/*
	 * Perhaps the page was brought in from swap between find_lock_page
	 * and taking info->lock?  We allow for that at add_to_page_cache_lru,
	 * but must also avoid reporting a spurious ENOSPC while working on a
	 * full tmpfs.
	 */
	page = find_get_page(mapping, idx);
	spin_unlock(&info->lock);
	if (page) {
		page_cache_release(page);
		goto repeat;
	}
	error = -ENOSPC;
	goto out;
}

static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
	int error;
	int ret = VM_FAULT_LOCKED;

	if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
		return VM_FAULT_SIGBUS;

	error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
	if (error)
		return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);

	if (ret & VM_FAULT_MAJOR) {
		count_vm_event(PGMAJFAULT);
		mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
	}
	return ret;
}

#ifdef CONFIG_NUMA
static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
{
	struct inode *i = vma->vm_file->f_path.dentry->d_inode;
	return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new);
}

static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
					  unsigned long addr)
{
	struct inode *i = vma->vm_file->f_path.dentry->d_inode;
	unsigned long idx;

	idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
	return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx);
}
#endif

int shmem_lock(struct file *file, int lock, struct user_struct *user)
{
	struct inode *inode = file->f_path.dentry->d_inode;
	struct shmem_inode_info *info = SHMEM_I(inode);
	int retval = -ENOMEM;

	spin_lock(&info->lock);
	if (lock && !(info->flags & VM_LOCKED)) {
		if (!user_shm_lock(inode->i_size, user))
			goto out_nomem;
		info->flags |= VM_LOCKED;
		mapping_set_unevictable(file->f_mapping);
	}
	if (!lock && (info->flags & VM_LOCKED) && user) {
		user_shm_unlock(inode->i_size, user);
		info->flags &= ~VM_LOCKED;
		mapping_clear_unevictable(file->f_mapping);
		scan_mapping_unevictable_pages(file->f_mapping);
	}
	retval = 0;

out_nomem:
	spin_unlock(&info->lock);
	return retval;
}

static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
{
	file_accessed(file);
	vma->vm_ops = &shmem_vm_ops;
	vma->vm_flags |= VM_CAN_NONLINEAR;
	return 0;
}

static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir,
				     int mode, dev_t dev, unsigned long flags)
{
	struct inode *inode;
	struct shmem_inode_info *info;
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);

	if (shmem_reserve_inode(sb))
		return NULL;

	inode = new_inode(sb);
	if (inode) {
		inode->i_ino = get_next_ino();
		inode_init_owner(inode, dir, mode);
		inode->i_blocks = 0;
		inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
		inode->i_generation = get_seconds();
		info = SHMEM_I(inode);
		memset(info, 0, (char *)inode - (char *)info);
		spin_lock_init(&info->lock);
		info->flags = flags & VM_NORESERVE;
		INIT_LIST_HEAD(&info->swaplist);
		INIT_LIST_HEAD(&info->xattr_list);
		cache_no_acl(inode);

		switch (mode & S_IFMT) {
		default:
			inode->i_op = &shmem_special_inode_operations;
			init_special_inode(inode, mode, dev);
			break;
		case S_IFREG:
			inode->i_mapping->a_ops = &shmem_aops;