Commit 82f67cd9 authored by Ingo Molnar's avatar Ingo Molnar Committed by Linus Torvalds
Browse files

[PATCH] Add debugging feature /proc/timer_stat



Add /proc/timer_stats support: debugging feature to profile timer expiration.
Both the starting site, process/PID and the expiration function is captured.
This allows the quick identification of timer event sources in a system.

Sample output:

# echo 1 > /proc/timer_stats
# cat /proc/timer_stats
Timer Stats Version: v0.1
Sample period: 4.010 s
  24,     0 swapper          hrtimer_stop_sched_tick (hrtimer_sched_tick)
  11,     0 swapper          sk_reset_timer (tcp_delack_timer)
   6,     0 swapper          hrtimer_stop_sched_tick (hrtimer_sched_tick)
   2,     1 swapper          queue_delayed_work_on (delayed_work_timer_fn)
  17,     0 swapper          hrtimer_restart_sched_tick (hrtimer_sched_tick)
   2,     1 swapper          queue_delayed_work_on (delayed_work_timer_fn)
   4,  2050 pcscd            do_nanosleep (hrtimer_wakeup)
   5,  4179 sshd             sk_reset_timer (tcp_write_timer)
   4,  2248 yum-updatesd     schedule_timeout (process_timeout)
  18,     0 swapper          hrtimer_restart_sched_tick (hrtimer_sched_tick)
   3,     0 swapper          sk_reset_timer (tcp_delack_timer)
   1,     1 swapper          neigh_table_init_no_netlink (neigh_periodic_timer)
   2,     1 swapper          e1000_up (e1000_watchdog)
   1,     1 init             schedule_timeout (process_timeout)
100 total events, 25.24 events/sec

[ cleanups and hrtimers support from Thomas Gleixner <tglx@linutronix.de> ]
[bunk@stusta.de: nr_entries can become static]
Signed-off-by: default avatarIngo Molnar <mingo@elte.hu>
Signed-off-by: default avatarThomas Gleixner <tglx@linutronix.de>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Roman Zippel <zippel@linux-m68k.org>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: default avatarAdrian Bunk <bunk@stusta.de>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent 8bfd9a7a
timer_stats - timer usage statistics
------------------------------------
timer_stats is a debugging facility to make the timer (ab)usage in a Linux
system visible to kernel and userspace developers. It is not intended for
production usage as it adds significant overhead to the (hr)timer code and the
(hr)timer data structures.
timer_stats should be used by kernel and userspace developers to verify that
their code does not make unduly use of timers. This helps to avoid unnecessary
wakeups, which should be avoided to optimize power consumption.
It can be enabled by CONFIG_TIMER_STATS in the "Kernel hacking" configuration
section.
timer_stats collects information about the timer events which are fired in a
Linux system over a sample period:
- the pid of the task(process) which initialized the timer
- the name of the process which initialized the timer
- the function where the timer was intialized
- the callback function which is associated to the timer
- the number of events (callbacks)
timer_stats adds an entry to /proc: /proc/timer_stats
This entry is used to control the statistics functionality and to read out the
sampled information.
The timer_stats functionality is inactive on bootup.
To activate a sample period issue:
# echo 1 >/proc/timer_stats
To stop a sample period issue:
# echo 0 >/proc/timer_stats
The statistics can be retrieved by:
# cat /proc/timer_stats
The readout of /proc/timer_stats automatically disables sampling. The sampled
information is kept until a new sample period is started. This allows multiple
readouts.
Sample output of /proc/timer_stats:
Timerstats sample period: 3.888770 s
12, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick)
15, 1 swapper hcd_submit_urb (rh_timer_func)
4, 959 kedac schedule_timeout (process_timeout)
1, 0 swapper page_writeback_init (wb_timer_fn)
28, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick)
22, 2948 IRQ 4 tty_flip_buffer_push (delayed_work_timer_fn)
3, 3100 bash schedule_timeout (process_timeout)
1, 1 swapper queue_delayed_work_on (delayed_work_timer_fn)
1, 1 swapper queue_delayed_work_on (delayed_work_timer_fn)
1, 1 swapper neigh_table_init_no_netlink (neigh_periodic_timer)
1, 2292 ip __netdev_watchdog_up (dev_watchdog)
1, 23 events/1 do_cache_clean (delayed_work_timer_fn)
90 total events, 30.0 events/sec
The first column is the number of events, the second column the pid, the third
column is the name of the process. The forth column shows the function which
initialized the timer and in parantheses the callback function which was
executed on expiry.
Thomas, Ingo
......@@ -119,6 +119,11 @@ struct hrtimer {
enum hrtimer_cb_mode cb_mode;
struct list_head cb_entry;
#endif
#ifdef CONFIG_TIMER_STATS
void *start_site;
char start_comm[16];
int start_pid;
#endif
};
/**
......@@ -311,4 +316,44 @@ extern unsigned long ktime_divns(const ktime_t kt, s64 div);
# define ktime_divns(kt, div) (unsigned long)((kt).tv64 / (div))
#endif
/*
* Timer-statistics info:
*/
#ifdef CONFIG_TIMER_STATS
extern void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
void *timerf, char * comm);
static inline void timer_stats_account_hrtimer(struct hrtimer *timer)
{
timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
timer->function, timer->start_comm);
}
extern void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer,
void *addr);
static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
{
__timer_stats_hrtimer_set_start_info(timer, __builtin_return_address(0));
}
static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer)
{
timer->start_site = NULL;
}
#else
static inline void timer_stats_account_hrtimer(struct hrtimer *timer)
{
}
static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
{
}
static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer)
{
}
#endif
#endif
......@@ -2,6 +2,7 @@
#define _LINUX_TIMER_H
#include <linux/list.h>
#include <linux/ktime.h>
#include <linux/spinlock.h>
#include <linux/stddef.h>
......@@ -15,6 +16,11 @@ struct timer_list {
unsigned long data;
struct tvec_t_base_s *base;
#ifdef CONFIG_TIMER_STATS
void *start_site;
char start_comm[16];
int start_pid;
#endif
};
extern struct tvec_t_base_s boot_tvec_bases;
......@@ -73,6 +79,54 @@ extern unsigned long next_timer_interrupt(void);
*/
extern unsigned long get_next_timer_interrupt(unsigned long now);
/*
* Timer-statistics info:
*/
#ifdef CONFIG_TIMER_STATS
extern void init_timer_stats(void);
extern void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
void *timerf, char * comm);
static inline void timer_stats_account_timer(struct timer_list *timer)
{
timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
timer->function, timer->start_comm);
}
extern void __timer_stats_timer_set_start_info(struct timer_list *timer,
void *addr);
static inline void timer_stats_timer_set_start_info(struct timer_list *timer)
{
__timer_stats_timer_set_start_info(timer, __builtin_return_address(0));
}
static inline void timer_stats_timer_clear_start_info(struct timer_list *timer)
{
timer->start_site = NULL;
}
#else
static inline void init_timer_stats(void)
{
}
static inline void timer_stats_account_timer(struct timer_list *timer)
{
}
static inline void timer_stats_timer_set_start_info(struct timer_list *timer)
{
}
static inline void timer_stats_timer_clear_start_info(struct timer_list *timer)
{
}
#endif
extern void delayed_work_timer_fn(unsigned long __data);
/**
* add_timer - start a timer
* @timer: the timer to be added
......
......@@ -585,6 +585,18 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
#endif /* CONFIG_HIGH_RES_TIMERS */
#ifdef CONFIG_TIMER_STATS
void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr)
{
if (timer->start_site)
return;
timer->start_site = addr;
memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
timer->start_pid = current->pid;
}
#endif
/*
* Counterpart to lock_timer_base above:
*/
......@@ -743,6 +755,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
* reprogramming happens in the interrupt handler. This is a
* rare case and less expensive than a smp call.
*/
timer_stats_hrtimer_clear_start_info(timer);
reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE,
reprogram);
......@@ -791,6 +804,8 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
}
timer->expires = tim;
timer_stats_hrtimer_set_start_info(timer);
enqueue_hrtimer(timer, new_base, base == new_base);
unlock_hrtimer_base(timer, &flags);
......@@ -925,6 +940,12 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
timer->base = &cpu_base->clock_base[clock_id];
hrtimer_init_timer_hres(timer);
#ifdef CONFIG_TIMER_STATS
timer->start_site = NULL;
timer->start_pid = -1;
memset(timer->start_comm, 0, TASK_COMM_LEN);
#endif
}
EXPORT_SYMBOL_GPL(hrtimer_init);
......@@ -1006,6 +1027,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
__remove_hrtimer(timer, base,
HRTIMER_STATE_CALLBACK, 0);
timer_stats_account_hrtimer(timer);
/*
* Note: We clear the CALLBACK bit after
......@@ -1050,6 +1072,8 @@ static void run_hrtimer_softirq(struct softirq_action *h)
timer = list_entry(cpu_base->cb_pending.next,
struct hrtimer, cb_entry);
timer_stats_account_hrtimer(timer);
fn = timer->function;
__remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
spin_unlock_irq(&cpu_base->lock);
......@@ -1106,6 +1130,8 @@ static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base,
if (base->softirq_time.tv64 <= timer->expires.tv64)
break;
timer_stats_account_hrtimer(timer);
fn = timer->function;
__remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
spin_unlock_irq(&cpu_base->lock);
......
......@@ -5,3 +5,4 @@ obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o
obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o
obj-$(CONFIG_TIMER_STATS) += timer_stats.o
/*
* kernel/time/timer_stats.c
*
* Collect timer usage statistics.
*
* Copyright(C) 2006, Red Hat, Inc., Ingo Molnar
* Copyright(C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
*
* timer_stats is based on timer_top, a similar functionality which was part of
* Con Kolivas dyntick patch set. It was developed by Daniel Petrini at the
* Instituto Nokia de Tecnologia - INdT - Manaus. timer_top's design was based
* on dynamic allocation of the statistics entries and linear search based
* lookup combined with a global lock, rather than the static array, hash
* and per-CPU locking which is used by timer_stats. It was written for the
* pre hrtimer kernel code and therefore did not take hrtimers into account.
* Nevertheless it provided the base for the timer_stats implementation and
* was a helpful source of inspiration. Kudos to Daniel and the Nokia folks
* for this effort.
*
* timer_top.c is
* Copyright (C) 2005 Instituto Nokia de Tecnologia - INdT - Manaus
* Written by Daniel Petrini <d.pensator@gmail.com>
* timer_top.c was released under the GNU General Public License version 2
*
* We export the addresses and counting of timer functions being called,
* the pid and cmdline from the owner process if applicable.
*
* Start/stop data collection:
* # echo 1[0] >/proc/timer_stats
*
* Display the information collected so far:
* # cat /proc/timer_stats
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/proc_fs.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/kallsyms.h>
#include <asm/uaccess.h>
/*
* This is our basic unit of interest: a timer expiry event identified
* by the timer, its start/expire functions and the PID of the task that
* started the timer. We count the number of times an event happens:
*/
struct entry {
/*
* Hash list:
*/
struct entry *next;
/*
* Hash keys:
*/
void *timer;
void *start_func;
void *expire_func;
pid_t pid;
/*
* Number of timeout events:
*/
unsigned long count;
/*
* We save the command-line string to preserve
* this information past task exit:
*/
char comm[TASK_COMM_LEN + 1];
} ____cacheline_aligned_in_smp;
/*
* Spinlock protecting the tables - not taken during lookup:
*/
static DEFINE_SPINLOCK(table_lock);
/*
* Per-CPU lookup locks for fast hash lookup:
*/
static DEFINE_PER_CPU(spinlock_t, lookup_lock);
/*
* Mutex to serialize state changes with show-stats activities:
*/
static DEFINE_MUTEX(show_mutex);
/*
* Collection status, active/inactive:
*/
static int __read_mostly active;
/*
* Beginning/end timestamps of measurement:
*/
static ktime_t time_start, time_stop;
/*
* tstat entry structs only get allocated while collection is
* active and never freed during that time - this simplifies
* things quite a bit.
*
* They get freed when a new collection period is started.
*/
#define MAX_ENTRIES_BITS 10
#define MAX_ENTRIES (1UL << MAX_ENTRIES_BITS)
static unsigned long nr_entries;
static struct entry entries[MAX_ENTRIES];
static atomic_t overflow_count;
static void reset_entries(void)
{
nr_entries = 0;
memset(entries, 0, sizeof(entries));
atomic_set(&overflow_count, 0);
}
static struct entry *alloc_entry(void)
{
if (nr_entries >= MAX_ENTRIES)
return NULL;
return entries + nr_entries++;
}
/*
* The entries are in a hash-table, for fast lookup:
*/
#define TSTAT_HASH_BITS (MAX_ENTRIES_BITS - 1)
#define TSTAT_HASH_SIZE (1UL << TSTAT_HASH_BITS)
#define TSTAT_HASH_MASK (TSTAT_HASH_SIZE - 1)
#define __tstat_hashfn(entry) \
(((unsigned long)(entry)->timer ^ \
(unsigned long)(entry)->start_func ^ \
(unsigned long)(entry)->expire_func ^ \
(unsigned long)(entry)->pid ) & TSTAT_HASH_MASK)
#define tstat_hashentry(entry) (tstat_hash_table + __tstat_hashfn(entry))
static struct entry *tstat_hash_table[TSTAT_HASH_SIZE] __read_mostly;
static int match_entries(struct entry *entry1, struct entry *entry2)
{
return entry1->timer == entry2->timer &&
entry1->start_func == entry2->start_func &&
entry1->expire_func == entry2->expire_func &&
entry1->pid == entry2->pid;
}
/*
* Look up whether an entry matching this item is present
* in the hash already. Must be called with irqs off and the
* lookup lock held:
*/
static struct entry *tstat_lookup(struct entry *entry, char *comm)
{
struct entry **head, *curr, *prev;
head = tstat_hashentry(entry);
curr = *head;
/*
* The fastpath is when the entry is already hashed,
* we do this with the lookup lock held, but with the
* table lock not held:
*/
while (curr) {
if (match_entries(curr, entry))
return curr;
curr = curr->next;
}
/*
* Slowpath: allocate, set up and link a new hash entry:
*/
prev = NULL;
curr = *head;
spin_lock(&table_lock);
/*
* Make sure we have not raced with another CPU:
*/
while (curr) {
if (match_entries(curr, entry))
goto out_unlock;
prev = curr;
curr = curr->next;
}
curr = alloc_entry();
if (curr) {
*curr = *entry;
curr->count = 0;
memcpy(curr->comm, comm, TASK_COMM_LEN);
if (prev)
prev->next = curr;
else
*head = curr;
curr->next = NULL;
}
out_unlock:
spin_unlock(&table_lock);
return curr;
}
/**
* timer_stats_update_stats - Update the statistics for a timer.
* @timer: pointer to either a timer_list or a hrtimer
* @pid: the pid of the task which set up the timer
* @startf: pointer to the function which did the timer setup
* @timerf: pointer to the timer callback function of the timer
* @comm: name of the process which set up the timer
*
* When the timer is already registered, then the event counter is
* incremented. Otherwise the timer is registered in a free slot.
*/
void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
void *timerf, char * comm)
{
/*
* It doesnt matter which lock we take:
*/
spinlock_t *lock = &per_cpu(lookup_lock, raw_smp_processor_id());
struct entry *entry, input;
unsigned long flags;
input.timer = timer;
input.start_func = startf;
input.expire_func = timerf;
input.pid = pid;
spin_lock_irqsave(lock, flags);
if (!active)
goto out_unlock;
entry = tstat_lookup(&input, comm);
if (likely(entry))
entry->count++;
else
atomic_inc(&overflow_count);
out_unlock:
spin_unlock_irqrestore(lock, flags);
}
static void print_name_offset(struct seq_file *m, unsigned long addr)
{
char namebuf[KSYM_NAME_LEN+1];
unsigned long size, offset;
const char *sym_name;
char *modname;
sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf);
if (sym_name)
seq_printf(m, "%s", sym_name);
else
seq_printf(m, "<%p>", (void *)addr);
}
static int tstats_show(struct seq_file *m, void *v)
{
struct timespec period;
struct entry *entry;
unsigned long ms;
long events = 0;
ktime_t time;
int i;
mutex_lock(&show_mutex);
/*
* If still active then calculate up to now:
*/
if (active)
time_stop = ktime_get();
time = ktime_sub(time_stop, time_start);
period = ktime_to_timespec(time);
ms = period.tv_nsec / 1000000;
seq_puts(m, "Timer Stats Version: v0.1\n");
seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms);
if (atomic_read(&overflow_count))
seq_printf(m, "Overflow: %d entries\n",
atomic_read(&overflow_count));
for (i = 0; i < nr_entries; i++) {
entry = entries + i;
seq_printf(m, "%4lu, %5d %-16s ",
entry->count, entry->pid, entry->comm);
print_name_offset(m, (unsigned long)entry->start_func);
seq_puts(m, " (");
print_name_offset(m, (unsigned long)entry->expire_func);
seq_puts(m, ")\n");
events += entry->count;
}
ms += period.tv_sec * 1000;
if (!ms)
ms = 1;
if (events && period.tv_sec)
seq_printf(m, "%ld total events, %ld.%ld events/sec\n", events,
events / period.tv_sec, events * 1000 / ms);
else
seq_printf(m, "%ld total events\n", events);
mutex_unlock(&show_mutex);
return 0;
}
/*
* After a state change, make sure all concurrent lookup/update
* activities have stopped:
*/
static void sync_access(void)
{
unsigned long flags;
int cpu;
for_each_online_cpu(cpu) {
spin_lock_irqsave(&per_cpu(lookup_lock, cpu), flags);
/* nothing */
spin_unlock_irqrestore(&per_cpu(lookup_lock, cpu), flags);
}
}
static ssize_t tstats_write(struct file *file, const char __user *buf,
size_t count, loff_t *offs)
{
char ctl[2];
if (count != 2 || *offs)
return -EINVAL;
if (copy_from_user(ctl, buf, count))