Commit 714f83d5 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge branch 'tracing-for-linus' of git://

* 'tracing-for-linus' of git:// (413 commits)
  tracing, net: fix net tree and tracing tree merge interaction
  tracing, powerpc: fix powerpc tree and tracing tree interaction
  ring-buffer: do not remove reader page from list on ring buffer free
  function-graph: allow unregistering twice
  trace: make argument 'mem' of trace_seq_putmem() const
  tracing: add missing 'extern' keywords to trace_output.h
  tracing: provide trace_seq_reserve()
  blktrace: print out BLK_TN_MESSAGE properly
  blktrace: extract duplidate code
  blktrace: fix memory leak when freeing struct blk_io_trace
  blktrace: fix blk_probes_ref chaos
  blktrace: make classic output more classic
  blktrace: fix off-by-one bug
  blktrace: fix the original blktrace
  blktrace: fix a race when creating blk_tree_root in debugfs
  blktrace: fix timestamp in binary output
  tracing, Text Edit Lock: cleanup
  tracing: filter fix for TRACE_EVENT_FORMAT events
  ftrace: Using FTRACE_WARN_ON() to check "freed record" in ftrace_release()
  x86: kretprobe-booster interrupt emulation code fix

Fix up trivial conflicts in
parents 8901e7ff 645dae96
What: /sys/kernel/debug/kmemtrace/
Date: July 2008
Contact: Eduard - Gabriel Munteanu <>
In kmemtrace-enabled kernels, the following files are created:
cpu<n> (0400) Per-CPU tracing data, see below. (binary)
total_overruns (0400) Total number of bytes which were dropped from
cpu<n> files because of full buffer condition,
non-binary. (text)
abi_version (0400) Kernel's kmemtrace ABI version. (text)
Each per-CPU file should be read according to the relay interface. That is,
the reader should set affinity to that specific CPU and, as currently done by
the userspace application (though there are other methods), use poll() with
an infinite timeout before every read(). Otherwise, erroneous data may be
read. The binary data has the following _core_ format:
Event ID (1 byte) Unsigned integer, one of:
0 - represents an allocation (KMEMTRACE_EVENT_ALLOC)
1 - represents a freeing of previously allocated memory (KMEMTRACE_EVENT_FREE)
Type ID (1 byte) Unsigned integer, one of:
0 - this is a kmalloc() / kfree()
1 - this is a kmem_cache_alloc() / kmem_cache_free()
2 - this is a __get_free_pages() et al.
Event size (2 bytes) Unsigned integer representing the
size of this event. Used to extend
kmemtrace. Discard the bytes you
don't know about.
Sequence number (4 bytes) Signed integer used to reorder data
logged on SMP machines. Wraparound
must be taken into account, although
it is unlikely.
Caller address (8 bytes) Return address to the caller.
Pointer to mem (8 bytes) Pointer to target memory area. Can be
NULL, but not all such calls might be recorded.
In case of KMEMTRACE_EVENT_ALLOC events, the next fields follow:
Requested bytes (8 bytes) Total number of requested bytes,
unsigned, must not be zero.
Allocated bytes (8 bytes) Total number of actually allocated
bytes, unsigned, must not be lower
than requested bytes.
Requested flags (4 bytes) GFP flags supplied by the caller.
Target CPU (4 bytes) Signed integer, valid for event id 1.
If equal to -1, target CPU is the same
as origin CPU, but the reverse might
not be true.
The data is made available in the same endianness the machine has.
Other event ids and type ids may be defined and added. Other fields may be
added by increasing event size, but see below for details.
Every modification to the ABI, including new id definitions, is followed
by bumping the ABI version by one.
Adding new data to the packet (features) is done at the end of the mandatory
data:
Feature size (2 bytes)
Feature ID (1 byte)
Feature data (Feature size - 3 bytes)
kmemtrace-user - git://
This diff is collapsed.
......@@ -50,6 +50,7 @@ parameter is applicable:
ISAPNP ISA PnP code is enabled.
ISDN Appropriate ISDN support is enabled.
JOY Appropriate joystick support is enabled.
KMEMTRACE kmemtrace is enabled.
LIBATA Libata driver is enabled
LP Printer support is enabled.
LOOP Loopback device support is enabled.
......@@ -1081,6 +1082,15 @@ and is between 256 and 4096 characters. It is defined in the file
use the HighMem zone if it exists, and the Normal
zone if it does not.
kmemtrace.enable= [KNL,KMEMTRACE] Format: { yes | no }
Controls whether kmemtrace is enabled
at boot-time.
kmemtrace.subbufs=n [KNL,KMEMTRACE] Overrides the number of
subbufs kmemtrace's relay channel has. Set this
higher than default (KMEMTRACE_N_SUBBUFS in code) if
you experience buffer overruns.
movablecore=nn[KMG] [KNL,X86-32,IA-64,PPC,X86-64] This parameter
is similar to kernelcore except it specifies the
amount of memory used for migratable allocations.
......@@ -2367,6 +2377,8 @@ and is between 256 and 4096 characters. It is defined in the file
tp720= [HW,PS2]
trace_buf_size=nn[KMG] [ftrace] will set tracing buffer size.
trix= [HW,OSS] MediaTrix AudioTrix Pro
......@@ -115,6 +115,8 @@ On all - write a character to /proc/sysrq-trigger. e.g.:
'x' - Used by xmon interface on ppc/powerpc platforms.
'z' - Dump the ftrace buffer
'0'-'9' - Sets the console log level, controlling which kernel messages
will be printed to your console. ('0', for example, would make
it so that only emergency messages like PANICs or OOPSes would
......@@ -45,8 +45,8 @@ In include/trace/subsys.h :
#include <linux/tracepoint.h>
TPPROTO(int firstarg, struct task_struct *p),
TPARGS(firstarg, p));
TP_PROTO(int firstarg, struct task_struct *p),
TP_ARGS(firstarg, p));
In subsys/file.c (where the tracing statement must be added) :
......@@ -66,10 +66,10 @@ Where :
- subsys is the name of your subsystem.
- eventname is the name of the event to trace.
- TPPROTO(int firstarg, struct task_struct *p) is the prototype of the
- TP_PROTO(int firstarg, struct task_struct *p) is the prototype of the
function called by this tracepoint.
- TPARGS(firstarg, p) are the parameters names, same as found in the
- TP_ARGS(firstarg, p) are the parameter names, same as found in the prototype.
Connecting a function (probe) to a tracepoint is done by providing a
......@@ -103,13 +103,14 @@ used to export the defined tracepoints.
* Probe / tracepoint example
See the example provided in samples/tracepoints/src
See the example provided in samples/tracepoints
Compile them with your kernel.
Compile them with your kernel. They are built during 'make' (not
'make modules') when CONFIG_SAMPLE_TRACEPOINTS=m.
Run, as root :
modprobe tracepoint-example (insmod order is not important)
modprobe tracepoint-probe-example
cat /proc/tracepoint-example (returns an expected error)
rmmod tracepoint-example tracepoint-probe-example
modprobe tracepoint-sample (insmod order is not important)
modprobe tracepoint-probe-sample
cat /proc/tracepoint-sample (returns an expected error)
rmmod tracepoint-sample tracepoint-probe-sample
kmemtrace - Kernel Memory Tracer
by Eduard - Gabriel Munteanu
I. Introduction
kmemtrace helps kernel developers figure out two things:
1) how different allocators (SLAB, SLUB etc.) perform
2) how kernel code allocates memory and how much
To do this, we trace every allocation and export information to the userspace
through the relay interface. We export things such as the number of requested
bytes, the number of bytes actually allocated (i.e. including internal
fragmentation), whether this is a slab allocation or a plain kmalloc() and so on.
The actual analysis is performed by a userspace tool (see section III for
details on where to get it from). It logs the data exported by the kernel,
processes it and (as of writing this) can provide the following information:
- the total amount of memory allocated and fragmentation per call-site
- the amount of memory allocated and fragmentation per allocation
- total memory allocated and fragmentation in the collected dataset
- number of cross-CPU allocations and frees (makes sense in NUMA environments)
Moreover, it can potentially find inconsistent and erroneous behavior in
kernel code, such as using slab free functions on kmalloc'ed memory or
allocating less memory than requested (but not truly failed allocations).
kmemtrace also makes provisions for tracing on some arch and analysing the
data on another.
II. Design and goals
kmemtrace was designed to handle rather large amounts of data. Thus, it uses
the relay interface to export whatever is logged to userspace, which then
stores it. Analysis and reporting is done asynchronously, that is, after the
data is collected and stored. By design, it allows one to log and analyse
on different machines and different arches.
As of writing this, the ABI is not considered stable, though it might not
change much. However, no guarantees are made about compatibility yet. When
deemed stable, the ABI should still allow easy extension while maintaining
backward compatibility. This is described further in Documentation/ABI.
Summary of design goals:
- allow logging and analysis to be done across different machines
- be fast and anticipate usage in high-load environments (*)
- be reasonably extensible
- make it possible for GNU/Linux distributions to have kmemtrace
included in their repositories
(*) - one of the reasons Pekka Enberg's original userspace data analysis
tool's code was rewritten from Perl to C (although this is more than a
simple conversion)
III. Quick usage guide
1) Get a kernel that supports kmemtrace and build it accordingly (i.e. enable
CONFIG_KMEMTRACE).
2) Get the userspace tool and build it:
$ git-clone git:// # current repository
$ cd kmemtrace-user/
$ ./
$ ./configure
$ make
3) Boot the kmemtrace-enabled kernel if you haven't, preferably in the
'single' runlevel (so that relay buffers don't fill up easily), and run
kmemtraced:
# '$' does not mean user, but root here.
$ mount -t debugfs none /sys/kernel/debug
$ mount -t proc none /proc
$ cd path/to/kmemtrace-user/
$ ./kmemtraced
Wait a bit, then stop it with CTRL+C.
$ cat /sys/kernel/debug/kmemtrace/total_overruns # Check if we didn't
# overrun, should
# be zero.
$ (Optionally) [Run kmemtrace_check separately on each cpu[0-9]*.out file to
check its correctness]
$ ./kmemtrace-report
Now you should have a nice and short summary of how the allocator performs.
IV. FAQ and known issues
Q: 'cat /sys/kernel/debug/kmemtrace/total_overruns' is non-zero, how do I fix
this? Should I worry?
A: If it's non-zero, this affects kmemtrace's accuracy, depending on how
large the number is. You can fix it by supplying a higher
'kmemtrace.subbufs=N' kernel parameter.
Q: kmemtrace_check reports errors, how do I fix this? Should I worry?
A: This is a bug and should be reported. It can occur for a variety of reasons:
- possible bugs in relay code
- possible misuse of relay by kmemtrace
- timestamps being collected out of order
Or you may fix it yourself and send us a patch.
Q: kmemtrace_report shows many errors, how do I fix this? Should I worry?
A: This is a known issue and I'm working on it. These might be true errors
in kernel code, which may have inconsistent behavior (e.g. allocating memory
with kmem_cache_alloc() and freeing it with kfree()). Pekka Enberg pointed
out this behavior may work with SLAB, but may fail with other allocators.
It may also be due to lack of tracing in some unusual allocator functions.
We don't want bug reports regarding this issue yet.
V. See also
......@@ -2654,6 +2654,12 @@ M:
S: Maintained
P: Eduard - Gabriel Munteanu
S: Maintained
P: Ananth N Mavinakayanahalli
......@@ -6,6 +6,7 @@ config OPROFILE
tristate "OProfile system profiling (EXPERIMENTAL)"
depends on PROFILING
depends on HAVE_OPROFILE
select TRACING
......@@ -14,17 +14,4 @@ typedef struct {
void ack_bad_irq(unsigned int irq);
#define HARDIRQ_BITS 12
* The hardirq mask has to be large enough to have
* space for potentially nestable IRQ sources in the system
* to nest on a single CPU. On Alpha, interrupts are masked at the CPU
* by IPL as well as at the system level. We only have 8 IPLs (UNIX PALcode)
* so we really only have 8 nestable IRQs, but allow some overhead
#if (1 << HARDIRQ_BITS) < 16
#error HARDIRQ_BITS is too low!
#endif /* _ALPHA_HARDIRQ_H */
......@@ -20,15 +20,4 @@ void ack_bad_irq(unsigned int irq);
#endif /* __ASSEMBLY__ */
#define HARDIRQ_BITS 12
* The hardirq mask has to be large enough to have
* space for potentially all IRQ sources in the system
* nesting on a single CPU:
# error HARDIRQ_BITS is too low!
#endif /* __ASM_AVR32_HARDIRQ_H */
......@@ -22,6 +22,9 @@ config IA64
select HAVE_KVM
#ifndef _ASM_IA64_FTRACE_H
#define _ASM_IA64_FTRACE_H
#define MCOUNT_INSN_SIZE 32 /* sizeof mcount call */
#ifndef __ASSEMBLY__
extern void _mcount(unsigned long pfs, unsigned long r1, unsigned long b0, unsigned long r0);
#define mcount _mcount
#include <asm/kprobes.h>
/* In IA64, MCOUNT_ADDR is set in link time, so it's not a constant at compile time */
#define MCOUNT_ADDR (((struct fnptr *)mcount)->ip)
#define FTRACE_ADDR (((struct fnptr *)ftrace_caller)->ip)
static inline unsigned long ftrace_call_adjust(unsigned long addr)
/* second bundle, insn 2 */
return addr - 0x12;
struct dyn_arch_ftrace {
#endif /* _ASM_IA64_FTRACE_H */
......@@ -20,16 +20,6 @@
#define local_softirq_pending() (local_cpu_data->softirq_pending)
#define HARDIRQ_BITS 14
* The hardirq mask has to be large enough to have space for potentially all IRQ sources
* in the system nesting on a single CPU:
# error HARDIRQ_BITS is too low!
extern void __iomem *ipi_base_addr;
void ack_bad_irq(unsigned int irq);
......@@ -2,6 +2,10 @@
# Makefile for the linux kernel.
CFLAGS_REMOVE_ftrace.o = -pg
extra-y := head.o init_task.o
obj-y := acpi.o entry.o efi.o efi_stub.o gate-data.o fsys.o ia64_ksyms.o irq.o irq_ia64.o \
......@@ -28,6 +32,7 @@ obj-$(CONFIG_IA64_CYCLONE) += cyclone.o
obj-$(CONFIG_CPU_FREQ) += cpufreq/
obj-$(CONFIG_IA64_MCA_RECOVERY) += mca_recovery.o
obj-$(CONFIG_KPROBES) += kprobes.o jprobes.o
obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_IA64_UNCACHED_ALLOCATOR) += uncached.o
......@@ -47,6 +47,7 @@
#include <asm/processor.h>
#include <asm/thread_info.h>
#include <asm/unistd.h>
#include <asm/ftrace.h>
#include "minstate.h"
......@@ -1404,6 +1405,105 @@ GLOBAL_ENTRY(unw_init_running)
br.ret.sptk.many rp
br ftrace_stub
br.ret.sptk.many b0
alloc out0 = ar.pfs, 8, 0, 4, 0
mov out3 = r0
mov out2 = b0
add r3 = 0x20, r3
mov out1 = r1; b0 = ftrace_patch_gp
//this might be called from module, so we must patch gp
movl gp=__gp
mov b0 = r3
.global ftrace_call;
nop.m 0x0
movl r3 = .here;;
alloc loc0 = ar.pfs, 4, 4, 2, 0
mov loc1 = b0
mov out0 = b0
mov loc2 = r8
mov loc3 = r15
adds out0 = -MCOUNT_INSN_SIZE, out0
mov out1 = in2
mov b6 = r3 b0 = b6
mov ar.pfs = loc0
mov b0 = loc1
mov r8 = loc2
mov r15 = loc3
br ftrace_stub
movl r2 = ftrace_stub
movl r3 = ftrace_trace_function;;
ld8 r3 = [r3];;
ld8 r3 = [r3];;
cmp.eq p7,p0 = r2, r3
(p7) br.sptk.many ftrace_stub
alloc loc0 = ar.pfs, 4, 4, 2, 0
mov loc1 = b0
mov out0 = b0
mov loc2 = r8
mov loc3 = r15
adds out0 = -MCOUNT_INSN_SIZE, out0
mov out1 = in2
mov b6 = r3 b0 = b6
mov ar.pfs = loc0
mov b0 = loc1
mov r8 = loc2
mov r15 = loc3
br ftrace_stub
mov r3 = b0
movl r2 = _mcount_ret_helper
mov b6 = r2
mov b7 = r3
br.ret.sptk.many b6
mov b0 = r42
mov r1 = r41
mov ar.pfs = r40
br b7
.align 8
.globl sys_call_table
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment