Commit d00aa669 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge branch 'perfcounters-fixes-for-linus' of...

Merge branch 'perfcounters-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'perfcounters-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (27 commits)
  perf_counter: Zero dead bytes from ftrace raw samples size alignment
  perf_counter: Subtract the buffer size field from the event record size
  perf_counter: Require CAP_SYS_ADMIN for raw tracepoint data
  perf_counter: Correct PERF_SAMPLE_RAW output
  perf tools: callchain: Fix bad rounding of minimum rate
  perf_counter tools: Fix libbfd detection for systems with libz dependency
  perf: "Longum est iter per praecepta, breve et efficax per exempla"
  perf_counter: Fix a race on perf_counter_ctx
  perf_counter: Fix tracepoint sampling to be part of generic sampling
  perf_counter: Work around gcc warning by initializing tracepoint record unconditionally
  perf tools: callchain: Fix sum of percentages to be 100% by displaying amount of ignored chains in fractal mode
  perf tools: callchain: Fix 'perf report' display to be callchain by default
  perf tools: callchain: Fix spurious 'perf report' warnings: ignore empty callchains
  perf record: Fix the -A UI for empty or non-existent perf.data
  perf util: Fix do_read() to fail on EOF instead of busy-looping
  perf list: Fix the output to not include tracepoints without an id
  perf_counter/powerpc: Fix oops on cpus without perf_counter hardware support
  perf stat: Fix tool option consistency: rename -S/--scale to -c/--scale
  perf report: Add debug help for the finding of symbol bugs - show the symtab origin (DSO, build-id, kernel, etc)
  perf report: Fix per task mult-counter stat reporting
  ...
parents cec36911 1853db0e
......@@ -518,6 +518,8 @@ void hw_perf_disable(void)
struct cpu_hw_counters *cpuhw;
unsigned long flags;
if (!ppmu)
return;
local_irq_save(flags);
cpuhw = &__get_cpu_var(cpu_hw_counters);
......@@ -572,6 +574,8 @@ void hw_perf_enable(void)
int n_lim;
int idx;
if (!ppmu)
return;
local_irq_save(flags);
cpuhw = &__get_cpu_var(cpu_hw_counters);
if (!cpuhw->disabled) {
......@@ -737,6 +741,8 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader,
long i, n, n0;
struct perf_counter *sub;
if (!ppmu)
return 0;
cpuhw = &__get_cpu_var(cpu_hw_counters);
n0 = cpuhw->n_counters;
n = collect_events(group_leader, ppmu->n_counter - n0,
......@@ -1281,6 +1287,8 @@ void hw_perf_counter_setup(int cpu)
{
struct cpu_hw_counters *cpuhw = &per_cpu(cpu_hw_counters, cpu);
if (!ppmu)
return;
memset(cpuhw, 0, sizeof(*cpuhw));
cpuhw->mmcr[0] = MMCR0_FC;
}
......
......@@ -121,7 +121,7 @@ enum perf_counter_sample_format {
PERF_SAMPLE_CPU = 1U << 7,
PERF_SAMPLE_PERIOD = 1U << 8,
PERF_SAMPLE_STREAM_ID = 1U << 9,
PERF_SAMPLE_TP_RECORD = 1U << 10,
PERF_SAMPLE_RAW = 1U << 10,
PERF_SAMPLE_MAX = 1U << 11, /* non-ABI */
};
......@@ -369,6 +369,8 @@ enum perf_event_type {
*
* { u64 nr,
* u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN
* { u32 size;
* char data[size];}&& PERF_SAMPLE_RAW
* };
*/
PERF_EVENT_SAMPLE = 9,
......@@ -414,9 +416,9 @@ struct perf_callchain_entry {
__u64 ip[PERF_MAX_STACK_DEPTH];
};
struct perf_tracepoint_record {
int size;
char *record;
struct perf_raw_record {
u32 size;
void *data;
};
struct task_struct;
......@@ -687,7 +689,7 @@ struct perf_sample_data {
struct pt_regs *regs;
u64 addr;
u64 period;
void *private;
struct perf_raw_record *raw;
};
extern int perf_counter_overflow(struct perf_counter *counter, int nmi,
......
......@@ -637,12 +637,20 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
* pc = preempt_count();
*
* __data_size = ftrace_get_offsets_<call>(&__data_offsets, args);
* __entry_size = __data_size + sizeof(*entry);
*
* // Below we want to get the aligned size by taking into account
* // the u32 field that will later store the buffer size
* __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),
* sizeof(u64));
* __entry_size -= sizeof(u32);
*
* do {
* char raw_data[__entry_size]; <- allocate our sample in the stack
* struct trace_entry *ent;
*
* zero dead bytes from alignment to avoid stack leak to userspace:
*
* *(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL;
* entry = (struct ftrace_raw_<call> *)raw_data;
* ent = &entry->ent;
* tracing_generic_entry_update(ent, irq_flags, pc);
......@@ -685,12 +693,15 @@ static void ftrace_profile_##call(proto) \
pc = preempt_count(); \
\
__data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
__entry_size = ALIGN(__data_size + sizeof(*entry), sizeof(u64));\
__entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\
sizeof(u64)); \
__entry_size -= sizeof(u32); \
\
do { \
char raw_data[__entry_size]; \
struct trace_entry *ent; \
\
*(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL; \
entry = (struct ftrace_raw_##call *)raw_data; \
ent = &entry->ent; \
tracing_generic_entry_update(ent, irq_flags, pc); \
......
......@@ -2646,7 +2646,6 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
u64 counter;
} group_entry;
struct perf_callchain_entry *callchain = NULL;
struct perf_tracepoint_record *tp;
int callchain_size = 0;
u64 time;
struct {
......@@ -2715,9 +2714,16 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
header.size += sizeof(u64);
}
if (sample_type & PERF_SAMPLE_TP_RECORD) {
tp = data->private;
header.size += tp->size;
if (sample_type & PERF_SAMPLE_RAW) {
int size = sizeof(u32);
if (data->raw)
size += data->raw->size;
else
size += sizeof(u32);
WARN_ON_ONCE(size & (sizeof(u64)-1));
header.size += size;
}
ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
......@@ -2783,8 +2789,21 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
}
}
if (sample_type & PERF_SAMPLE_TP_RECORD)
perf_output_copy(&handle, tp->record, tp->size);
if (sample_type & PERF_SAMPLE_RAW) {
if (data->raw) {
perf_output_put(&handle, data->raw->size);
perf_output_copy(&handle, data->raw->data, data->raw->size);
} else {
struct {
u32 size;
u32 data;
} raw = {
.size = sizeof(u32),
.data = 0,
};
perf_output_put(&handle, raw);
}
}
perf_output_end(&handle);
}
......@@ -2849,7 +2868,8 @@ perf_counter_read_event(struct perf_counter *counter,
*/
struct perf_task_event {
struct task_struct *task;
struct task_struct *task;
struct perf_counter_context *task_ctx;
struct {
struct perf_event_header header;
......@@ -2909,24 +2929,23 @@ static void perf_counter_task_ctx(struct perf_counter_context *ctx,
static void perf_counter_task_event(struct perf_task_event *task_event)
{
struct perf_cpu_context *cpuctx;
struct perf_counter_context *ctx;
struct perf_counter_context *ctx = task_event->task_ctx;
cpuctx = &get_cpu_var(perf_cpu_context);
perf_counter_task_ctx(&cpuctx->ctx, task_event);
put_cpu_var(perf_cpu_context);
rcu_read_lock();
/*
* doesn't really matter which of the child contexts the
* events ends up in.
*/
ctx = rcu_dereference(current->perf_counter_ctxp);
if (!ctx)
ctx = rcu_dereference(task_event->task->perf_counter_ctxp);
if (ctx)
perf_counter_task_ctx(ctx, task_event);
rcu_read_unlock();
}
static void perf_counter_task(struct task_struct *task, int new)
static void perf_counter_task(struct task_struct *task,
struct perf_counter_context *task_ctx,
int new)
{
struct perf_task_event task_event;
......@@ -2936,8 +2955,9 @@ static void perf_counter_task(struct task_struct *task, int new)
return;
task_event = (struct perf_task_event){
.task = task,
.event = {
.task = task,
.task_ctx = task_ctx,
.event = {
.header = {
.type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT,
.misc = 0,
......@@ -2955,7 +2975,7 @@ static void perf_counter_task(struct task_struct *task, int new)
void perf_counter_fork(struct task_struct *task)
{
perf_counter_task(task, 1);
perf_counter_task(task, NULL, 1);
}
/*
......@@ -3344,87 +3364,81 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi,
* Generic software counter infrastructure
*/
static void perf_swcounter_update(struct perf_counter *counter)
/*
* We directly increment counter->count and keep a second value in
* counter->hw.period_left to count intervals. This period counter
* is kept in the range [-sample_period, 0] so that we can use the
* sign as trigger.
*/
static u64 perf_swcounter_set_period(struct perf_counter *counter)
{
struct hw_perf_counter *hwc = &counter->hw;
u64 prev, now;
s64 delta;
u64 period = hwc->last_period;
u64 nr, offset;
s64 old, val;
hwc->last_period = hwc->sample_period;
again:
prev = atomic64_read(&hwc->prev_count);
now = atomic64_read(&hwc->count);
if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
goto again;
old = val = atomic64_read(&hwc->period_left);
if (val < 0)
return 0;
delta = now - prev;
nr = div64_u64(period + val, period);
offset = nr * period;
val -= offset;
if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
goto again;
atomic64_add(delta, &counter->count);
atomic64_sub(delta, &hwc->period_left);
return nr;
}
static void perf_swcounter_set_period(struct perf_counter *counter)
static void perf_swcounter_overflow(struct perf_counter *counter,
int nmi, struct perf_sample_data *data)
{
struct hw_perf_counter *hwc = &counter->hw;
s64 left = atomic64_read(&hwc->period_left);
s64 period = hwc->sample_period;
u64 overflow;
if (unlikely(left <= -period)) {
left = period;
atomic64_set(&hwc->period_left, left);
hwc->last_period = period;
}
data->period = counter->hw.last_period;
overflow = perf_swcounter_set_period(counter);
if (unlikely(left <= 0)) {
left += period;
atomic64_add(period, &hwc->period_left);
hwc->last_period = period;
}
if (hwc->interrupts == MAX_INTERRUPTS)
return;
atomic64_set(&hwc->prev_count, -left);
atomic64_set(&hwc->count, -left);
for (; overflow; overflow--) {
if (perf_counter_overflow(counter, nmi, data)) {
/*
* We inhibit the overflow from happening when
* hwc->interrupts == MAX_INTERRUPTS.
*/
break;
}
}
}
static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
static void perf_swcounter_unthrottle(struct perf_counter *counter)
{
enum hrtimer_restart ret = HRTIMER_RESTART;
struct perf_sample_data data;
struct perf_counter *counter;
u64 period;
counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
counter->pmu->read(counter);
data.addr = 0;
data.regs = get_irq_regs();
/*
* In case we exclude kernel IPs or are somehow not in interrupt
* context, provide the next best thing, the user IP.
* Nothing to do, we already reset hwc->interrupts.
*/
if ((counter->attr.exclude_kernel || !data.regs) &&
!counter->attr.exclude_user)
data.regs = task_pt_regs(current);
}
if (data.regs) {
if (perf_counter_overflow(counter, 0, &data))
ret = HRTIMER_NORESTART;
}
static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
int nmi, struct perf_sample_data *data)
{
struct hw_perf_counter *hwc = &counter->hw;
period = max_t(u64, 10000, counter->hw.sample_period);
hrtimer_forward_now(hrtimer, ns_to_ktime(period));
atomic64_add(nr, &counter->count);
return ret;
}
if (!hwc->sample_period)
return;
static void perf_swcounter_overflow(struct perf_counter *counter,
int nmi, struct perf_sample_data *data)
{
data->period = counter->hw.last_period;
if (!data->regs)
return;
perf_swcounter_update(counter);
perf_swcounter_set_period(counter);
if (perf_counter_overflow(counter, nmi, data))
/* soft-disable the counter */
;
if (!atomic64_add_negative(nr, &hwc->period_left))
perf_swcounter_overflow(counter, nmi, data);
}
static int perf_swcounter_is_counting(struct perf_counter *counter)
......@@ -3488,15 +3502,6 @@ static int perf_swcounter_match(struct perf_counter *counter,
return 1;
}
static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
int nmi, struct perf_sample_data *data)
{
int neg = atomic64_add_negative(nr, &counter->hw.count);
if (counter->hw.sample_period && !neg && data->regs)
perf_swcounter_overflow(counter, nmi, data);
}
static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
enum perf_type_id type,
u32 event, u64 nr, int nmi,
......@@ -3575,26 +3580,65 @@ void __perf_swcounter_event(u32 event, u64 nr, int nmi,
static void perf_swcounter_read(struct perf_counter *counter)
{
perf_swcounter_update(counter);
}
static int perf_swcounter_enable(struct perf_counter *counter)
{
perf_swcounter_set_period(counter);
struct hw_perf_counter *hwc = &counter->hw;
if (hwc->sample_period) {
hwc->last_period = hwc->sample_period;
perf_swcounter_set_period(counter);
}
return 0;
}
static void perf_swcounter_disable(struct perf_counter *counter)
{
perf_swcounter_update(counter);
}
/* PMU callback table used by the generic software counters. */
static const struct pmu perf_ops_generic = {
	.read		= perf_swcounter_read,
	.enable		= perf_swcounter_enable,
	.disable	= perf_swcounter_disable,
	.unthrottle	= perf_swcounter_unthrottle,
};
/*
* hrtimer based swcounter callback
*/
static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
{
enum hrtimer_restart ret = HRTIMER_RESTART;
struct perf_sample_data data;
struct perf_counter *counter;
u64 period;
counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
counter->pmu->read(counter);
data.addr = 0;
data.regs = get_irq_regs();
/*
* In case we exclude kernel IPs or are somehow not in interrupt
* context, provide the next best thing, the user IP.
*/
if ((counter->attr.exclude_kernel || !data.regs) &&
!counter->attr.exclude_user)
data.regs = task_pt_regs(current);
if (data.regs) {
if (perf_counter_overflow(counter, 0, &data))
ret = HRTIMER_NORESTART;
}
period = max_t(u64, 10000, counter->hw.sample_period);
hrtimer_forward_now(hrtimer, ns_to_ktime(period));
return ret;
}
/*
* Software counter: cpu wall time clock
*/
......@@ -3715,15 +3759,15 @@ static const struct pmu perf_ops_task_clock = {
void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
int entry_size)
{
struct perf_tracepoint_record tp = {
struct perf_raw_record raw = {
.size = entry_size,
.record = record,
.data = record,
};
struct perf_sample_data data = {
.regs = get_irq_regs(),
.addr = addr,
.private = &tp,
.raw = &raw,
};
if (!data.regs)
......@@ -3743,6 +3787,14 @@ static void tp_perf_counter_destroy(struct perf_counter *counter)
static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
{
/*
* Raw tracepoint data is a severe data leak, only allow root to
* have these.
*/
if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
!capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
if (ftrace_profile_enable(counter->attr.config))
return NULL;
......@@ -4285,7 +4337,7 @@ void perf_counter_exit_task(struct task_struct *child)
unsigned long flags;
if (likely(!child->perf_counter_ctxp)) {
perf_counter_task(child, 0);
perf_counter_task(child, NULL, 0);
return;
}
......@@ -4305,6 +4357,7 @@ void perf_counter_exit_task(struct task_struct *child)
* incremented the context's refcount before we do put_ctx below.
*/
spin_lock(&child_ctx->lock);
child->perf_counter_ctxp = NULL;
/*
* If this context is a clone; unclone it so it can't get
* swapped to another process while we're removing all
......@@ -4318,9 +4371,7 @@ void perf_counter_exit_task(struct task_struct *child)
* won't get any samples after PERF_EVENT_EXIT. We can however still
* get a few PERF_EVENT_READ events.
*/
perf_counter_task(child, 0);
child->perf_counter_ctxp = NULL;
perf_counter_task(child, child_ctx, 0);
/*
* We can recurse on the same lock type through:
......
------------------------------
****** perf by examples ******
------------------------------
[ From an e-mail by Ingo Molnar, http://lkml.org/lkml/2009/8/4/346 ]
First, discovery/enumeration of available counters can be done via
'perf list':
titan:~> perf list
[...]
kmem:kmalloc [Tracepoint event]
kmem:kmem_cache_alloc [Tracepoint event]
kmem:kmalloc_node [Tracepoint event]
kmem:kmem_cache_alloc_node [Tracepoint event]
kmem:kfree [Tracepoint event]
kmem:kmem_cache_free [Tracepoint event]
kmem:mm_page_free_direct [Tracepoint event]
kmem:mm_pagevec_free [Tracepoint event]
kmem:mm_page_alloc [Tracepoint event]
kmem:mm_page_alloc_zone_locked [Tracepoint event]
kmem:mm_page_pcpu_drain [Tracepoint event]
kmem:mm_page_alloc_extfrag [Tracepoint event]
Then any (or all) of the above event sources can be activated and
measured. For example, the page alloc/free properties of a 'hackbench'
run are:
titan:~> perf stat -e kmem:mm_page_pcpu_drain -e kmem:mm_page_alloc
-e kmem:mm_pagevec_free -e kmem:mm_page_free_direct ./hackbench 10
Time: 0.575
Performance counter stats for './hackbench 10':
13857 kmem:mm_page_pcpu_drain
27576 kmem:mm_page_alloc
6025 kmem:mm_pagevec_free
20934 kmem:mm_page_free_direct
0.613972165 seconds time elapsed
You can observe the statistical properties as well, by using the
'repeat the workload N times' feature of perf stat:
titan:~> perf stat --repeat 5 -e kmem:mm_page_pcpu_drain -e
kmem:mm_page_alloc -e kmem:mm_pagevec_free -e
kmem:mm_page_free_direct ./hackbench 10
Time: 0.627
Time: 0.644
Time: 0.564
Time: 0.559
Time: 0.626
Performance counter stats for './hackbench 10' (5 runs):
12920 kmem:mm_page_pcpu_drain ( +- 3.359% )
25035 kmem:mm_page_alloc ( +- 3.783% )
6104 kmem:mm_pagevec_free ( +- 0.934% )
18376 kmem:mm_page_free_direct ( +- 4.941% )
0.643954516 seconds time elapsed ( +- 2.363% )
Furthermore, these tracepoints can be used to sample the workload as
well. For example the page allocations done by a 'git gc' can be
captured the following way:
titan:~/git> perf record -f -e kmem:mm_page_alloc -c 1 ./git gc
Counting objects: 1148, done.
Delta compression using up to 2 threads.
Compressing objects: 100% (450/450), done.
Writing objects: 100% (1148/1148), done.
Total 1148 (delta 690), reused 1148 (delta 690)
[ perf record: Captured and wrote 0.267 MB perf.data (~11679 samples) ]
To check which functions generated page allocations:
titan:~/git> perf report
# Samples: 10646
#
# Overhead Command Shared Object
# ........ ............... ..........................
#
23.57% git-repack /lib64/libc-2.5.so
21.81% git /lib64/libc-2.5.so
14.59% git ./git
11.79% git-repack ./git
7.12% git /lib64/ld-2.5.so
3.16% git-repack /lib64/libpthread-2.5.so
2.09% git-repack /bin/bash
1.97% rm /lib64/libc-2.5.so
1.39% mv /lib64/ld-2.5.so
1.37% mv /lib64/libc-2.5.so
1.12% git-repack /lib64/ld-2.5.so
0.95% rm /lib64/ld-2.5.so
0.90% git-update-serv /lib64/libc-2.5.so
0.73% git-update-serv /lib64/ld-2.5.so
0.68% perf /lib64/libpthread-2.5.so
0.64% git-repack /usr/lib64/libz.so.1.2.3
Or to see it on a more fine-grained level:
titan:~/git> perf report --sort comm,dso,symbol
# Samples: 10646
#
# Overhead Command Shared Object Symbol
# ........ ............... .......................... ......
#
9.35% git-repack ./git [.] insert_obj_hash
9.12% git ./git [.] insert_obj_hash