/* task_mmu.c */
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/huge_mm.h>
#include <linux/mount.h>
#include <linux/seq_file.h>
#include <linux/highmem.h>
#include <linux/ptrace.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>

#include <asm/elf.h>
#include <asm/uaccess.h>
#include <asm/tlbflush.h>
#include "internal.h"

void task_mem(struct seq_file *m, struct mm_struct *mm)
{
	unsigned long data, text, lib, swap;
	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;

	/*
	 * Note: to minimize their overhead, mm maintains hiwater_vm and
	 * hiwater_rss only when about to *lower* total_vm or rss.  Any
	 * collector of these hiwater stats must therefore get total_vm
	 * and rss too, which will usually be the higher.  Barriers? not
	 * worth the effort, such snapshots can always be inconsistent.
	 */
	hiwater_vm = total_vm = mm->total_vm;
	if (hiwater_vm < mm->hiwater_vm)
		hiwater_vm = mm->hiwater_vm;
	hiwater_rss = total_rss = get_mm_rss(mm);
	if (hiwater_rss < mm->hiwater_rss)
		hiwater_rss = mm->hiwater_rss;

	data = mm->total_vm - mm->shared_vm - mm->stack_vm;
	text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
	lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
	swap = get_mm_counter(mm, MM_SWAPENTS);
	seq_printf(m,
		"VmPeak:\t%8lu kB\n"
		"VmSize:\t%8lu kB\n"
		"VmLck:\t%8lu kB\n"
		"VmPin:\t%8lu kB\n"
		"VmHWM:\t%8lu kB\n"
		"VmRSS:\t%8lu kB\n"
		"VmData:\t%8lu kB\n"
		"VmStk:\t%8lu kB\n"
		"VmExe:\t%8lu kB\n"
		"VmLib:\t%8lu kB\n"
		"VmPTE:\t%8lu kB\n"
		"VmSwap:\t%8lu kB\n",
		hiwater_vm << (PAGE_SHIFT-10),
		total_vm << (PAGE_SHIFT-10),
		mm->locked_vm << (PAGE_SHIFT-10),
		mm->pinned_vm << (PAGE_SHIFT-10),
		hiwater_rss << (PAGE_SHIFT-10),
		total_rss << (PAGE_SHIFT-10),
		data << (PAGE_SHIFT-10),
		mm->stack_vm << (PAGE_SHIFT-10), text, lib,
		(PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10,
		swap << (PAGE_SHIFT-10));
}

unsigned long task_vsize(struct mm_struct *mm)
{
	return PAGE_SIZE * mm->total_vm;
}

unsigned long task_statm(struct mm_struct *mm,
			 unsigned long *shared, unsigned long *text,
			 unsigned long *data, unsigned long *resident)
{
	*shared = get_mm_counter(mm, MM_FILEPAGES);
	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
								>> PAGE_SHIFT;
	*data = mm->total_vm - mm->shared_vm;
	*resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
	return mm->total_vm;
}

static void pad_len_spaces(struct seq_file *m, int len)
{
	len = 25 + sizeof(void*) * 6 - len;
	if (len < 1)
		len = 1;
	seq_printf(m, "%*c", len, ' ');
}

#ifdef CONFIG_NUMA
/*
 * These functions are for numa_maps but called in generic **maps seq_file
 * ->start(), ->stop() ops.
 *
 * numa_maps scans all vmas under mmap_sem and checks their mempolicy.
 * Each mempolicy object is controlled by reference counting. The problem here
 * is how to avoid accessing dead mempolicy object.
 *
 * Because we're holding mmap_sem while reading seq_file, it's safe to access
 * each vma's mempolicy: no vma will drop its reference to a mempolicy while
 * we hold the semaphore.
 *
 * A task's mempolicy (task->mempolicy) has different behavior. task->mempolicy
 * is set and replaced under mmap_sem but unrefed and cleared under task_lock().
 * So, without task_lock(), we cannot trust get_vma_policy() because we cannot
 * guarantee the task never exits under us. But taking task_lock() around
 * get_vma_policy() causes a lock-order problem.
 *
 * To access task->mempolicy without the lock, we take a reference on the
 * object pointed to by task->mempolicy and remember it. This guarantees that
 * task->mempolicy points to a live object or NULL during numa_maps accesses.
 */
static void hold_task_mempolicy(struct proc_maps_private *priv)
{
	struct task_struct *task = priv->task;

	task_lock(task);
	priv->task_mempolicy = task->mempolicy;
	mpol_get(priv->task_mempolicy);
	task_unlock(task);
}
static void release_task_mempolicy(struct proc_maps_private *priv)
{
	mpol_put(priv->task_mempolicy);
}
#else
static void hold_task_mempolicy(struct proc_maps_private *priv)
{
}
static void release_task_mempolicy(struct proc_maps_private *priv)
{
}
#endif

static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma)
{
	if (vma && vma != priv->tail_vma) {
		struct mm_struct *mm = vma->vm_mm;
		release_task_mempolicy(priv);
		up_read(&mm->mmap_sem);
		mmput(mm);
	}
}

static void *m_start(struct seq_file *m, loff_t *pos)
{
	struct proc_maps_private *priv = m->private;
	unsigned long last_addr = m->version;
	struct mm_struct *mm;
	struct vm_area_struct *vma, *tail_vma = NULL;
	loff_t l = *pos;

	/* Clear the per syscall fields in priv */
	priv->task = NULL;
	priv->tail_vma = NULL;

	/*
	 * We remember last_addr rather than next_addr to hit with
	 * mmap_cache most of the time. We have zero last_addr at
	 * the beginning and also after lseek. We will have -1 last_addr
	 * after the end of the vmas.
	 */

	if (last_addr == -1UL)
		return NULL;

	priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
	if (!priv->task)
		return ERR_PTR(-ESRCH);

	mm = mm_access(priv->task, PTRACE_MODE_READ);
	if (!mm || IS_ERR(mm))
		return mm;
	down_read(&mm->mmap_sem);

	tail_vma = get_gate_vma(priv->task->mm);
	priv->tail_vma = tail_vma;
	hold_task_mempolicy(priv);
	/* Start with last addr hint */
	vma = find_vma(mm, last_addr);
	if (last_addr && vma) {
		vma = vma->vm_next;
		goto out;
	}

	/*
	 * Check the vma index is within the range and do
	 * sequential scan until m_index.
	 */
	vma = NULL;
	if ((unsigned long)l < mm->map_count) {
		vma = mm->mmap;
		while (l-- && vma)
			vma = vma->vm_next;
		goto out;
	}

	if (l != mm->map_count)
		tail_vma = NULL; /* After gate vma */

out:
	if (vma)
		return vma;

	release_task_mempolicy(priv);
	/* End of vmas has been reached */
	m->version = (tail_vma != NULL)? 0: -1UL;
	up_read(&mm->mmap_sem);
	mmput(mm);
	return tail_vma;
}

static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct proc_maps_private *priv = m->private;
	struct vm_area_struct *vma = v;
	struct vm_area_struct *tail_vma = priv->tail_vma;

	(*pos)++;
	if (vma && (vma != tail_vma) && vma->vm_next)
		return vma->vm_next;
	vma_stop(priv, vma);
	return (vma != tail_vma)? tail_vma: NULL;
}

static void m_stop(struct seq_file *m, void *v)
{
	struct proc_maps_private *priv = m->private;
	struct vm_area_struct *vma = v;

	if (!IS_ERR(vma))
		vma_stop(priv, vma);
	if (priv->task)
		put_task_struct(priv->task);
}

static int do_maps_open(struct inode *inode, struct file *file,
			const struct seq_operations *ops)
{
	struct proc_maps_private *priv;
	int ret = -ENOMEM;
	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
	if (priv) {
		priv->pid = proc_pid(inode);
		ret = seq_open(file, ops);
		if (!ret) {
			struct seq_file *m = file->private_data;
			m->private = priv;
		} else {
			kfree(priv);
		}
	}
	return ret;
}

static void
show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
{
	struct mm_struct *mm = vma->vm_mm;
	struct file *file = vma->vm_file;
	struct proc_maps_private *priv = m->private;
	struct task_struct *task = priv->task;
	vm_flags_t flags = vma->vm_flags;
	unsigned long ino = 0;
	unsigned long long pgoff = 0;
	unsigned long start, end;
	dev_t dev = 0;
	int len;
	const char *name = NULL;

	if (file) {
		struct inode *inode = file_inode(vma->vm_file);
		dev = inode->i_sb->s_dev;
		ino = inode->i_ino;
		pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
	}

	/* We don't show the stack guard page in /proc/maps */
	start = vma->vm_start;
	if (stack_guard_page_start(vma, start))
		start += PAGE_SIZE;
	end = vma->vm_end;
	if (stack_guard_page_end(vma, end))
		end -= PAGE_SIZE;

	seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
			start,
			end,
			flags & VM_READ ? 'r' : '-',
			flags & VM_WRITE ? 'w' : '-',
			flags & VM_EXEC ? 'x' : '-',
			flags & VM_MAYSHARE ? 's' : 'p',
			pgoff,
			MAJOR(dev), MINOR(dev), ino, &len);

	/*
	 * Print the dentry name for named mappings, and a
	 * special [heap] marker for the heap:
	 */
	if (file) {
		pad_len_spaces(m, len);
		seq_path(m, &file->f_path, "\n");
		goto done;
	}

	name = arch_vma_name(vma);
	if (!name) {
		pid_t tid;

		if (!mm) {
			name = "[vdso]";
			goto done;
		}

		if (vma->vm_start <= mm->brk &&
		    vma->vm_end >= mm->start_brk) {
			name = "[heap]";
			goto done;
		}

		tid = vm_is_stack(task, vma, is_pid);

		if (tid != 0) {
			/*
			 * Thread stack in /proc/PID/task/TID/maps or
			 * the main process stack.
			 */
			if (!is_pid || (vma->vm_start <= mm->start_stack &&
			    vma->vm_end >= mm->start_stack)) {
				name = "[stack]";
			} else {
				/* Thread stack in /proc/PID/maps */
				pad_len_spaces(m, len);
				seq_printf(m, "[stack:%d]", tid);
			}
		}
	}

done:
	if (name) {
		pad_len_spaces(m, len);
		seq_puts(m, name);
	}
	seq_putc(m, '\n');
}

static int show_map(struct seq_file *m, void *v, int is_pid)
{
	struct vm_area_struct *vma = v;
	struct proc_maps_private *priv = m->private;
	struct task_struct *task = priv->task;

	show_map_vma(m, vma, is_pid);

	if (m->count < m->size)  /* vma is copied successfully */
		m->version = (vma != get_gate_vma(task->mm))
			? vma->vm_start : 0;
	return 0;
}

static int show_pid_map(struct seq_file *m, void *v)
{
	return show_map(m, v, 1);
}

static int show_tid_map(struct seq_file *m, void *v)
{
	return show_map(m, v, 0);
}

static const struct seq_operations proc_pid_maps_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_pid_map
};

static const struct seq_operations proc_tid_maps_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_tid_map
};

static int pid_maps_open(struct inode *inode, struct file *file)
{
	return do_maps_open(inode, file, &proc_pid_maps_op);
}

static int tid_maps_open(struct inode *inode, struct file *file)
{
	return do_maps_open(inode, file, &proc_tid_maps_op);
}

const struct file_operations proc_pid_maps_operations = {
	.open		= pid_maps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_private,
};

const struct file_operations proc_tid_maps_operations = {
	.open		= tid_maps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_private,
};

/*
 * Proportional Set Size (PSS): my share of RSS.
 *
 * PSS of a process is the count of pages it has in memory, where each
 * page is divided by the number of processes sharing it.  So if a
 * process has 1000 pages all to itself, and 1000 shared with one other
 * process, its PSS will be 1500.
 *
 * To keep accumulated division errors low, we adopt a 64-bit fixed-point
 * pss counter: (pss >> PSS_SHIFT) is the real byte count.
 *
 * A shift of 12 before division means (assuming 4K page size):
 * 	- 1M 3-user-pages add up to 8KB errors;
 * 	- supports mapcount up to 2^24, or 16M;
 * 	- supports PSS up to 2^52 bytes, or 4PB.
 */
#define PSS_SHIFT 12
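
/*
 * Worked example (illustrative figures, not from the original source): with
 * a 4 kB page and PSS_SHIFT == 12, a page whose mapcount is 3 adds
 * (4096 << 12) / 3 = 5592405 to pss; after the final shift by
 * (10 + PSS_SHIFT) in show_smap() this contributes roughly a third of the
 * page, i.e. about 1.3 kB, to the reported Pss value.
 */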

#ifdef CONFIG_PROC_PAGE_MONITOR
struct mem_size_stats {
	struct vm_area_struct *vma;
	unsigned long resident;
	unsigned long shared_clean;
	unsigned long shared_dirty;
	unsigned long private_clean;
	unsigned long private_dirty;
	unsigned long referenced;
	unsigned long anonymous;
	unsigned long anonymous_thp;
	unsigned long swap;
	unsigned long nonlinear;
	u64 pss;
};

static void smaps_pte_entry(pte_t ptent, unsigned long addr,
		unsigned long ptent_size, struct mm_walk *walk)
{
	struct mem_size_stats *mss = walk->private;
	struct vm_area_struct *vma = mss->vma;
	pgoff_t pgoff = linear_page_index(vma, addr);
	struct page *page = NULL;
	int mapcount;

	if (pte_present(ptent)) {
		page = vm_normal_page(vma, addr, ptent);
	} else if (is_swap_pte(ptent)) {
		swp_entry_t swpent = pte_to_swp_entry(ptent);

		if (!non_swap_entry(swpent))
			mss->swap += ptent_size;
		else if (is_migration_entry(swpent))
			page = migration_entry_to_page(swpent);
	} else if (pte_file(ptent)) {
		if (pte_to_pgoff(ptent) != pgoff)
			mss->nonlinear += ptent_size;
	}

	if (!page)
		return;

	if (PageAnon(page))
		mss->anonymous += ptent_size;

	if (page->index != pgoff)
		mss->nonlinear += ptent_size;

	mss->resident += ptent_size;
	/* Accumulate the size in pages that have been accessed. */
	if (pte_young(ptent) || PageReferenced(page))
		mss->referenced += ptent_size;
	mapcount = page_mapcount(page);
	if (mapcount >= 2) {
		if (pte_dirty(ptent) || PageDirty(page))
			mss->shared_dirty += ptent_size;
		else
			mss->shared_clean += ptent_size;
		mss->pss += (ptent_size << PSS_SHIFT) / mapcount;
	} else {
		if (pte_dirty(ptent) || PageDirty(page))
			mss->private_dirty += ptent_size;
		else
			mss->private_clean += ptent_size;
		mss->pss += (ptent_size << PSS_SHIFT);
	}
}

static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			   struct mm_walk *walk)
{
	struct mem_size_stats *mss = walk->private;
	struct vm_area_struct *vma = mss->vma;
	pte_t *pte;
	spinlock_t *ptl;

	if (pmd_trans_huge_lock(pmd, vma) == 1) {
		smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk);
		spin_unlock(&walk->mm->page_table_lock);
		mss->anonymous_thp += HPAGE_PMD_SIZE;
		return 0;
	}

	if (pmd_trans_unstable(pmd))
		return 0;
	/*
	 * The mmap_sem held all the way back in m_start() is what
	 * keeps khugepaged out of here and from collapsing things
	 * in here.
	 */
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	for (; addr != end; pte++, addr += PAGE_SIZE)
		smaps_pte_entry(*pte, addr, PAGE_SIZE, walk);
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();
	return 0;
}

static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
{
	/*
	 * Don't forget to update Documentation/ on changes.
	 */
	static const char mnemonics[BITS_PER_LONG][2] = {
		/*
		 * In case we meet a flag we don't know about.
		 */
		[0 ... (BITS_PER_LONG-1)] = "??",

		[ilog2(VM_READ)]	= "rd",
		[ilog2(VM_WRITE)]	= "wr",
		[ilog2(VM_EXEC)]	= "ex",
		[ilog2(VM_SHARED)]	= "sh",
		[ilog2(VM_MAYREAD)]	= "mr",
		[ilog2(VM_MAYWRITE)]	= "mw",
		[ilog2(VM_MAYEXEC)]	= "me",
		[ilog2(VM_MAYSHARE)]	= "ms",
		[ilog2(VM_GROWSDOWN)]	= "gd",
		[ilog2(VM_PFNMAP)]	= "pf",
		[ilog2(VM_DENYWRITE)]	= "dw",
		[ilog2(VM_LOCKED)]	= "lo",
		[ilog2(VM_IO)]		= "io",
		[ilog2(VM_SEQ_READ)]	= "sr",
		[ilog2(VM_RAND_READ)]	= "rr",
		[ilog2(VM_DONTCOPY)]	= "dc",
		[ilog2(VM_DONTEXPAND)]	= "de",
		[ilog2(VM_ACCOUNT)]	= "ac",
		[ilog2(VM_NORESERVE)]	= "nr",
		[ilog2(VM_HUGETLB)]	= "ht",
		[ilog2(VM_NONLINEAR)]	= "nl",
		[ilog2(VM_ARCH_1)]	= "ar",
		[ilog2(VM_DONTDUMP)]	= "dd",
		[ilog2(VM_MIXEDMAP)]	= "mm",
		[ilog2(VM_HUGEPAGE)]	= "hg",
		[ilog2(VM_NOHUGEPAGE)]	= "nh",
		[ilog2(VM_MERGEABLE)]	= "mg",
	};
	size_t i;

	seq_puts(m, "VmFlags: ");
	for (i = 0; i < BITS_PER_LONG; i++) {
		if (vma->vm_flags & (1UL << i)) {
			seq_printf(m, "%c%c ",
				   mnemonics[i][0], mnemonics[i][1]);
		}
	}
	seq_putc(m, '\n');
}
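
/*
 * Illustrative output line (the flag combination is only an example, chosen
 * from the mnemonics table above): a typical file-backed text mapping might
 * be reported as
 *	VmFlags: rd ex mr mw me dw
 */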

static int show_smap(struct seq_file *m, void *v, int is_pid)
{
	struct proc_maps_private *priv = m->private;
	struct task_struct *task = priv->task;
	struct vm_area_struct *vma = v;
	struct mem_size_stats mss;
	struct mm_walk smaps_walk = {
		.pmd_entry = smaps_pte_range,
		.mm = vma->vm_mm,
		.private = &mss,
	};

	memset(&mss, 0, sizeof mss);
	mss.vma = vma;
	/* mmap_sem is held in m_start */
	if (vma->vm_mm && !is_vm_hugetlb_page(vma))
		walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);

	show_map_vma(m, vma, is_pid);

	seq_printf(m,
		   "Size:           %8lu kB\n"
		   "Rss:            %8lu kB\n"
		   "Pss:            %8lu kB\n"
		   "Shared_Clean:   %8lu kB\n"
		   "Shared_Dirty:   %8lu kB\n"
		   "Private_Clean:  %8lu kB\n"
		   "Private_Dirty:  %8lu kB\n"
		   "Referenced:     %8lu kB\n"
		   "Anonymous:      %8lu kB\n"
		   "AnonHugePages:  %8lu kB\n"
		   "Swap:           %8lu kB\n"
		   "KernelPageSize: %8lu kB\n"
		   "MMUPageSize:    %8lu kB\n"
		   "Locked:         %8lu kB\n",
		   (vma->vm_end - vma->vm_start) >> 10,
		   mss.resident >> 10,
		   (unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
		   mss.shared_clean  >> 10,
		   mss.shared_dirty  >> 10,
		   mss.private_clean >> 10,
		   mss.private_dirty >> 10,
		   mss.referenced >> 10,
		   mss.anonymous >> 10,
		   mss.anonymous_thp >> 10,
		   mss.swap >> 10,
		   vma_kernel_pagesize(vma) >> 10,
		   vma_mmu_pagesize(vma) >> 10,
		   (vma->vm_flags & VM_LOCKED) ?
			(unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);

	if (vma->vm_flags & VM_NONLINEAR)
		seq_printf(m, "Nonlinear:      %8lu kB\n",
				mss.nonlinear >> 10);

	show_smap_vma_flags(m, vma);

	if (m->count < m->size)  /* vma is copied successfully */
		m->version = (vma != get_gate_vma(task->mm))
			? vma->vm_start : 0;
	return 0;
}

static int show_pid_smap(struct seq_file *m, void *v)
{
	return show_smap(m, v, 1);
}

static int show_tid_smap(struct seq_file *m, void *v)
{
	return show_smap(m, v, 0);
}

static const struct seq_operations proc_pid_smaps_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_pid_smap
};

static const struct seq_operations proc_tid_smaps_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_tid_smap
};

static int pid_smaps_open(struct inode *inode, struct file *file)
{
	return do_maps_open(inode, file, &proc_pid_smaps_op);
}

static int tid_smaps_open(struct inode *inode, struct file *file)
{
	return do_maps_open(inode, file, &proc_tid_smaps_op);
}

const struct file_operations proc_pid_smaps_operations = {
	.open		= pid_smaps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_private,
};

const struct file_operations proc_tid_smaps_operations = {
	.open		= tid_smaps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_private,
};

/*
 * We do not want to have constant page-shift bits sitting in
 * pagemap entries and are about to reuse them some time soon.
 *
 * Here's the "migration strategy":
 * 1. when the system boots these bits remain what they are,
 *    but a warning about the future change is printed in the log;
 * 2. once anyone clears soft-dirty bits via the clear_refs file,
 *    this flag is set to denote that the user is aware of the
 *    new API and the page-shift bits change their meaning.
 *    The respective warning is printed in dmesg;
 * 3. In a couple of releases we will remove all the mentions
 *    of page-shift in pagemap entries.
 */

static bool soft_dirty_cleared __read_mostly;

enum clear_refs_types {
	CLEAR_REFS_ALL = 1,
	CLEAR_REFS_ANON,
	CLEAR_REFS_MAPPED,
	CLEAR_REFS_SOFT_DIRTY,
	CLEAR_REFS_LAST,
};

struct clear_refs_private {
	struct vm_area_struct *vma;
	enum clear_refs_types type;
};

static inline void clear_soft_dirty(struct vm_area_struct *vma,
		unsigned long addr, pte_t *pte)
{
#ifdef CONFIG_MEM_SOFT_DIRTY
	/*
	 * The soft-dirty tracker uses #PF-s to catch writes
	 * to pages, so write-protect the pte as well. See the
	 * Documentation/vm/soft-dirty.txt for full description
	 * of how soft-dirty works.
	 */
	pte_t ptent = *pte;

	if (pte_present(ptent)) {
		ptent = pte_wrprotect(ptent);
		ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
	} else if (is_swap_pte(ptent)) {
		ptent = pte_swp_clear_soft_dirty(ptent);
	} else if (pte_file(ptent)) {
		ptent = pte_file_clear_soft_dirty(ptent);
	}

	if (vma->vm_flags & VM_SOFTDIRTY)
		vma->vm_flags &= ~VM_SOFTDIRTY;

	set_pte_at(vma->vm_mm, addr, pte, ptent);
#endif
}

static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	struct clear_refs_private *cp = walk->private;
	struct vm_area_struct *vma = cp->vma;
	pte_t *pte, ptent;
	spinlock_t *ptl;
	struct page *page;

	split_huge_page_pmd(vma, addr, pmd);
	if (pmd_trans_unstable(pmd))
		return 0;

	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = *pte;

		if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
			clear_soft_dirty(vma, addr, pte);
			continue;
		}

		if (!pte_present(ptent))
			continue;

		page = vm_normal_page(vma, addr, ptent);
		if (!page)
			continue;

		/* Clear accessed and referenced bits. */
		ptep_test_and_clear_young(vma, addr, pte);
		ClearPageReferenced(page);
	}
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();
	return 0;
}

static ssize_t clear_refs_write(struct file *file, const char __user *buf,
				size_t count, loff_t *ppos)
{
	struct task_struct *task;
	char buffer[PROC_NUMBUF];
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	enum clear_refs_types type;
	int itype;
	int rv;

	memset(buffer, 0, sizeof(buffer));
	if (count > sizeof(buffer) - 1)
		count = sizeof(buffer) - 1;
	if (copy_from_user(buffer, buf, count))
		return -EFAULT;
	rv = kstrtoint(strstrip(buffer), 10, &itype);
	if (rv < 0)
		return rv;
	type = (enum clear_refs_types)itype;
	if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
		return -EINVAL;

	if (type == CLEAR_REFS_SOFT_DIRTY) {
		soft_dirty_cleared = true;
		pr_warn_once("The pagemap bits 55-60 has changed their meaning! "
				"See the linux/Documentation/vm/pagemap.txt for details.\n");
	}

	task = get_proc_task(file_inode(file));
	if (!task)
		return -ESRCH;
	mm = get_task_mm(task);
	if (mm) {
		struct clear_refs_private cp = {
			.type = type,
		};
		struct mm_walk clear_refs_walk = {
			.pmd_entry = clear_refs_pte_range,
			.mm = mm,
			.private = &cp,
		};
		down_read(&mm->mmap_sem);
		if (type == CLEAR_REFS_SOFT_DIRTY)
			mmu_notifier_invalidate_range_start(mm, 0, -1);
		for (vma = mm->mmap; vma; vma = vma->vm_next) {
			cp.vma = vma;
			if (is_vm_hugetlb_page(vma))
				continue;
			/*
			 * Writing 1 to /proc/pid/clear_refs affects all pages.
			 *
			 * Writing 2 to /proc/pid/clear_refs only affects
			 * Anonymous pages.
			 *
			 * Writing 3 to /proc/pid/clear_refs only affects file
			 * mapped pages.
			 */
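			/*
			 * Writing 4 selects CLEAR_REFS_SOFT_DIRTY (value 4 in
			 * the clear_refs_types enum above); its per-pte work
			 * is done by clear_soft_dirty() in
			 * clear_refs_pte_range().
			 */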
			if (type == CLEAR_REFS_ANON && vma->vm_file)
				continue;
			if (type == CLEAR_REFS_MAPPED && !vma->vm_file)
				continue;
			walk_page_range(vma->vm_start, vma->vm_end,
					&clear_refs_walk);
		}
		if (type == CLEAR_REFS_SOFT_DIRTY)
			mmu_notifier_invalidate_range_end(mm, 0, -1);
		flush_tlb_mm(mm);
		up_read(&mm->mmap_sem);
		mmput(mm);
	}
	put_task_struct(task);

	return count;
}

const struct file_operations proc_clear_refs_operations = {
	.write		= clear_refs_write,
	.llseek		= noop_llseek,
};

typedef struct {
	u64 pme;
} pagemap_entry_t;

struct pagemapread {
	int pos, len;		/* units: PM_ENTRY_BYTES, not bytes */
	pagemap_entry_t *buffer;
	bool v2;
};

#define PAGEMAP_WALK_SIZE	(PMD_SIZE)
#define PAGEMAP_WALK_MASK	(PMD_MASK)

#define PM_ENTRY_BYTES      sizeof(pagemap_entry_t)
#define PM_STATUS_BITS      3
#define PM_STATUS_OFFSET    (64 - PM_STATUS_BITS)
#define PM_STATUS_MASK      (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
#define PM_STATUS(nr)       (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK)
#define PM_PSHIFT_BITS      6
#define PM_PSHIFT_OFFSET    (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
#define PM_PSHIFT_MASK      (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
#define __PM_PSHIFT(x)      (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
#define PM_PFRAME_MASK      ((1LL << PM_PSHIFT_OFFSET) - 1)
#define PM_PFRAME(x)        ((x) & PM_PFRAME_MASK)
/* in "new" pagemap pshift bits are occupied with more status bits */
#define PM_STATUS2(v2, x)   (__PM_PSHIFT(v2 ? x : PAGE_SHIFT))

#define __PM_SOFT_DIRTY      (1LL)
#define PM_PRESENT          PM_STATUS(4LL)
#define PM_SWAP             PM_STATUS(2LL)
#define PM_FILE             PM_STATUS(1LL)
#define PM_NOT_PRESENT(v2)  PM_STATUS2(v2, 0)
#define PM_END_OF_BUFFER    1
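
/*
 * Illustrative composition (mirroring pte_to_pagemap_entry() below): a
 * present, soft-dirty page with frame number 'pfn' would be encoded as
 *	make_pme(PM_PFRAME(pfn) | PM_STATUS2(v2, __PM_SOFT_DIRTY) | PM_PRESENT)
 */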

static inline pagemap_entry_t make_pme(u64 val)
{
	return (pagemap_entry_t) { .pme = val };
}

static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
			  struct pagemapread *pm)
{
	pm->buffer[pm->pos++] = *pme;
	if (pm->pos >= pm->len)
		return PM_END_OF_BUFFER;
	return 0;
}

static int pagemap_pte_hole(unsigned long start, unsigned long end,
				struct mm_walk *walk)
{
	struct pagemapread *pm = walk->private;
	unsigned long addr;
	int err = 0;
	pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));

	for (addr = start; addr < end; addr += PAGE_SIZE) {
		err = add_to_pagemap(addr, &pme, pm);
		if (err)
			break;
	}
	return err;
}

static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
		struct vm_area_struct *vma, unsigned long addr, pte_t pte)
{
	u64 frame, flags;
	struct page *page = NULL;
	int flags2 = 0;

	if (pte_present(pte)) {
		frame = pte_pfn(pte);
		flags = PM_PRESENT;
		page = vm_normal_page(vma, addr, pte);
	} else if (is_swap_pte(pte)) {
		swp_entry_t entry;
		if (pte_swp_soft_dirty(pte))
			flags2 |= __PM_SOFT_DIRTY;
		entry = pte_to_swp_entry(pte);
		frame = swp_type(entry) |
			(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
		flags = PM_SWAP;
		if (is_migration_entry(entry))
			page = migration_entry_to_page(entry);
	} else {
		if (vma->vm_flags & VM_SOFTDIRTY)
			flags2 |= __PM_SOFT_DIRTY;
		*pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
		return;
	}

	if (page && !PageAnon(page))
		flags |= PM_FILE;
	if ((vma->vm_flags & VM_SOFTDIRTY) || pte_soft_dirty(pte))
		flags2 |= __PM_SOFT_DIRTY;

	*pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
		pmd_t pmd, int offset, int pmd_flags2)
{
	/*
	 * Currently pmd for thp is always present because thp can not be
	 * swapped-out, migrated, or HWPOISONed (split in such cases instead.)
	 * This if-check is just to prepare for future implementation.
	 */
	if (pmd_present(pmd))
		*pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
				| PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);
	else
		*pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2));
}
#else
static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
		pmd_t pmd, int offset, int pmd_flags2)
{
}
#endif

static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			     struct mm_walk *walk)
{
	struct vm_area_struct *vma;
	struct pagemapread *pm = walk->private;
	pte_t *pte;
	int err = 0;
	pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));

	/* find the first VMA at or above 'addr' */
	vma = find_vma(walk->mm, addr);
	if (vma && pmd_trans_huge_lock(pmd, vma) == 1) {
		int pmd_flags2;

		if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
			pmd_flags2 = __PM_SOFT_DIRTY;
		else
			pmd_flags2 = 0;

		for (; addr != end; addr += PAGE_SIZE) {
			unsigned long offset;

			offset = (addr & ~PAGEMAP_WALK_MASK) >>
					PAGE_SHIFT;
			thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2);
			err = add_to_pagemap(addr, &pme, pm);
			if (err)
				break;
		}
		spin_unlock(&walk->mm->page_table_lock);
		return err;
	}

	if (pmd_trans_unstable(pmd))
		return 0;
	for (; addr != end; addr += PAGE_SIZE) {
		int flags2;

		/* check to see if we've left 'vma' behind
		 * and need a new, higher one */
		if (vma && (addr >= vma->vm_end)) {
			vma = find_vma(walk->mm, addr);
			if (vma && (vma->vm_flags & VM_SOFTDIRTY))
				flags2 = __PM_SOFT_DIRTY;
			else
				flags2 = 0;
			pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
		}

		/* check that 'vma' actually covers this address,
		 * and that it isn't a huge page vma */
		if (vma && (vma->vm_start <= addr) &&
		    !is_vm_hugetlb_page(vma)) {
			pte = pte_offset_map(pmd, addr);
			pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
			/* unmap before userspace copy */
			pte_unmap(pte);
		}
		err = add_to_pagemap(addr, &pme, pm);
		if (err)
			return err;
	}

	cond_resched();

	return err;
}

#ifdef CONFIG_HUGETLB_PAGE
static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
					pte_t pte, int offset, int flags2)
{
	if (pte_present(pte))
		*pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset)	|
				PM_STATUS2(pm->v2, flags2)		|
				PM_PRESENT);
	else
		*pme = make_pme(PM_NOT_PRESENT(pm->v2)			|
				PM_STATUS2(pm->v2, flags2));
}

/* This function walks within one hugetlb entry in the single call */
static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
				 unsigned long addr, unsigned long end,
				 struct mm_walk *walk)
{
	struct pagemapread *pm = walk->private;
	struct vm_area_struct *vma;
	int err = 0;
	int flags2;
	pagemap_entry_t pme;

	vma = find_vma(walk->mm, addr);
	WARN_ON_ONCE(!vma);

	if (vma && (vma->vm_flags & VM_SOFTDIRTY))
		flags2 = __PM_SOFT_DIRTY;
	else
		flags2 = 0;

	for (; addr != end; addr += PAGE_SIZE) {
		int offset = (addr & ~hmask) >> PAGE_SHIFT;
		huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2);
		err = add_to_pagemap(addr, &pme, pm);
		if (err)
			return err;
	}

	cond_resched();

	return err;
}
#endif /* HUGETLB_PAGE */

/*
 * /proc/pid/pagemap - an array mapping virtual pages to pfns
 *
 * For each page in the address space, this file contains one 64-bit entry
 * consisting of the following:
 *
 * Bits 0-54  page frame number (PFN) if present
 * Bits 0-4   swap type if swapped
 * Bits 5-54  swap offset if swapped
 * Bits 55-60 page shift (page size = 1<<page shift)
 * Bit  61    page is file-page or shared-anon
 * Bit  62    page swapped
 * Bit  63    page present
 *
 * If the page is not present but in swap, then the PFN contains an
 * encoding of the swap file number and the page's offset into the
 * swap. Unmapped pages return a null PFN. This allows determining
 * precisely which pages are mapped (or in swap) and comparing mapped
 * pages between processes.
 *
 * Efficient users of this interface will use /proc/pid/maps to
 * determine which areas of memory are actually mapped and llseek to
 * skip over unmapped regions.
 */
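/*
 * Illustrative userspace access (not part of the original file): the entry
 * for virtual address 'vaddr' lives at file offset
 * (vaddr / PAGE_SIZE) * PM_ENTRY_BYTES; a reader would pread() 8 bytes from
 * there and test bit 63 for "present" before trusting the PFN in bits 0-54.
 */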
static ssize_t pagemap_read(struct file *file, char __user *buf,
			    size_t count, loff_t *ppos)
{
	struct task_struct *task = get_proc_task(file_inode(file));
	struct mm_struct *mm;
	struct pagemapread pm;
	int ret = -ESRCH;
	struct mm_walk pagemap_walk = {};
	unsigned long src;
	unsigned long svpfn;
	unsigned long start_vaddr;
	unsigned long end_vaddr;
	int copied = 0;

	if (!task)
		goto out;

	ret = -EINVAL;
	/* file position must be aligned */
	if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))
		goto out_task;

	ret = 0;
	if (!count)
		goto out_task;

	pm.v2 = soft_dirty_cleared;