Newer
Older
/*
* linux/kernel/exit.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*/
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/completion.h>
#include <linux/personality.h>
#include <linux/tty.h>
#include <linux/mnt_namespace.h>
#include <linux/iocontext.h>
#include <linux/key.h>
#include <linux/security.h>
#include <linux/cpu.h>
#include <linux/acct.h>
#include <linux/tsacct_kern.h>
#include <linux/pid_namespace.h>
#include <linux/ptrace.h>
#include <linux/profile.h>
#include <linux/mount.h>
#include <linux/proc_fs.h>
#include <linux/kthread.h>
#include <linux/taskstats_kern.h>
#include <linux/freezer.h>
#include <linux/signal.h>
#include <linux/posix-timers.h>
#include <linux/pipe_fs_i.h>
#include <linux/audit.h> /* for audit_free() */
#include <linux/blkdev.h>
#include <linux/task_io_accounting_ops.h>
#include <trace/sched.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
#include <asm/pgtable.h>
#include <asm/mmu_context.h>
DEFINE_TRACE(sched_process_free);
DEFINE_TRACE(sched_process_exit);
DEFINE_TRACE(sched_process_wait);
static void exit_mm(struct task_struct * tsk);
static void __unhash_process(struct task_struct *p)
{
nr_threads--;
detach_pid(p, PIDTYPE_PID);
if (thread_group_leader(p)) {
detach_pid(p, PIDTYPE_PGID);
detach_pid(p, PIDTYPE_SID);
list_del_rcu(&p->tasks);
__get_cpu_var(process_counts)--;
/*
* This function expects the tasklist_lock write-locked.
*/
static void __exit_signal(struct task_struct *tsk)
{
struct signal_struct *sig = tsk->signal;
struct sighand_struct *sighand;
BUG_ON(!sig);
BUG_ON(!atomic_read(&sig->count));
sighand = rcu_dereference(tsk->sighand);
spin_lock(&sighand->siglock);
posix_cpu_timers_exit(tsk);
if (atomic_dec_and_test(&sig->count))
posix_cpu_timers_exit_group(tsk);
else {
/*
* If there is any task waiting for the group exit
* then notify it:
*/
if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count)
wake_up_process(sig->group_exit_task);
if (tsk == sig->curr_target)
sig->curr_target = next_thread(tsk);
/*
* Accumulate here the counters for all threads but the
* group leader as they die, so they can be added into
* the process-wide totals when those are taken.
* The group leader stays around as a zombie as long
* as there are other threads. When it gets reaped,
* the exit.c code will add its counts into these totals.
* We won't ever get here for the group leader, since it
* will have been the last reference on the signal_struct.
*/
sig->utime = cputime_add(sig->utime, task_utime(tsk));
sig->stime = cputime_add(sig->stime, task_stime(tsk));
sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
sig->min_flt += tsk->min_flt;
sig->maj_flt += tsk->maj_flt;
sig->nvcsw += tsk->nvcsw;
sig->nivcsw += tsk->nivcsw;
sig->inblock += task_io_get_inblock(tsk);
sig->oublock += task_io_get_oublock(tsk);
task_io_accounting_add(&sig->ioac, &tsk->ioac);
sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
sig = NULL; /* Marker for below. */
}
__unhash_process(tsk);
/*
* Do this under ->siglock, we can race with another thread
* doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
*/
flush_sigqueue(&tsk->pending);
tsk->sighand = NULL;
spin_unlock(&sighand->siglock);
__cleanup_sighand(sighand);
clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
if (sig) {
flush_sigqueue(&sig->shared_pending);
Oleg Nesterov
committed
/*
* Make sure ->signal can't go away under rq->lock,
* see account_group_exec_runtime().
*/
task_rq_unlock_wait(tsk);
__cleanup_signal(sig);
}
}
static void delayed_put_task_struct(struct rcu_head *rhp)
{
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
trace_sched_process_free(tsk);
put_task_struct(tsk);
struct task_struct *leader;
David Howells
committed
/* don't need to get the RCU readlock here - the process is dead and
* can't be modifying its own credentials */
atomic_dec(&__task_cred(p)->user->processes);
Pavel Emelyanov
committed
proc_flush_task(p);
/*
* If we are the last non-leader member of the thread
* group, and the leader is zombie, then notify the
* group leader's parent process. (if it wants notification.)
*/
zap_leader = 0;
leader = p->group_leader;
if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
do_notify_parent(leader, leader->exit_signal);
/*
* If we were the last child thread and the leader has
* exited already, and the leader's parent ignores SIGCHLD,
* then we are the one who should release the leader.
*
* do_notify_parent() will have marked it self-reaping in
* that case.
*/
zap_leader = task_detached(leader);
/*
* This maintains the invariant that release_task()
* only runs on a task in EXIT_DEAD, just for sanity.
*/
if (zap_leader)
leader->exit_state = EXIT_DEAD;
}
write_unlock_irq(&tasklist_lock);
release_thread(p);
call_rcu(&p->rcu, delayed_put_task_struct);
p = leader;
if (unlikely(zap_leader))
goto repeat;
}
/*
* This checks not only the pgrp, but falls back on the pid if no
* satisfactory pgrp is found. I dunno - gdb doesn't work correctly
* without this...
*
* The caller must hold rcu lock or the tasklist lock.
struct pid *session_of_pgrp(struct pid *pgrp)
struct pid *sid = NULL;
p = pid_task(pgrp, PIDTYPE_PGID);
if (p == NULL)
p = pid_task(pgrp, PIDTYPE_PID);
if (p != NULL)
sid = task_session(p);
return sid;
}
/*
* Determine if a process group is "orphaned", according to the POSIX
* definition in 2.2.2.52. Orphaned process groups are not to be affected
* by terminal-generated stop signals. Newly orphaned process groups are
* to receive a SIGHUP and a SIGCONT.
*
* "I ask you, have you ever known what it is to be an orphan?"
*/
static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task)
do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
if ((p == ignored_task) ||
(p->exit_state && thread_group_empty(p)) ||
is_global_init(p->real_parent))
if (task_pgrp(p->real_parent) != pgrp &&
task_session(p->real_parent) == task_session(p))
return 0;
} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
return 1;
int is_current_pgrp_orphaned(void)
retval = will_become_orphaned_pgrp(task_pgrp(current), NULL);
read_unlock(&tasklist_lock);
return retval;
}
static int has_stopped_jobs(struct pid *pgrp)
do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
/*
* Check to see if any process groups have become orphaned as
* a result of our exiting, and if they have any stopped jobs,
* send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
*/
static void
kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
{
struct pid *pgrp = task_pgrp(tsk);
struct task_struct *ignored_task = tsk;
if (!parent)
/* exit: our father is in a different pgrp than
* we are and we were the only connection outside.
*/
parent = tsk->real_parent;
else
/* reparent: our child is in a different pgrp than
* we are, and it was the only connection outside.
*/
ignored_task = NULL;
if (task_pgrp(parent) != pgrp &&
task_session(parent) == task_session(tsk) &&
will_become_orphaned_pgrp(pgrp, ignored_task) &&
has_stopped_jobs(pgrp)) {
__kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
__kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
}
}
* reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd
*
* If a kernel thread is launched as a result of a system call, or if
* it ever exits, it should generally reparent itself to kthreadd so it
* isn't in the way of other processes and is correctly cleaned up on exit.
*
* The various task state such as scheduling policy and priority may have
* been inherited from a user process, so we reset them to sane values here.
*
* NOTE that reparent_to_kthreadd() gives the caller full capabilities.
static void reparent_to_kthreadd(void)
{
write_lock_irq(&tasklist_lock);
ptrace_unlink(current);
/* Reparent to init */
current->real_parent = current->parent = kthreadd_task;
list_move_tail(¤t->sibling, ¤t->real_parent->children);
/* Set the exit signal to SIGCHLD so we signal init on exit */
current->exit_signal = SIGCHLD;
set_user_nice(current, 0);
/* cpus_allowed? */
/* rt_priority? */
/* signals? */
memcpy(current->signal->rlim, init_task.signal->rlim,
sizeof(current->signal->rlim));
atomic_inc(&init_cred.usage);
commit_creds(&init_cred);
void __set_special_pids(struct pid *pid)
struct task_struct *curr = current->group_leader;
if (task_session(curr) != pid) {
change_pid(curr, PIDTYPE_SID, pid);
if (task_pgrp(curr) != pid) {
change_pid(curr, PIDTYPE_PGID, pid);
static void set_special_pids(struct pid *pid)
write_unlock_irq(&tasklist_lock);
}
/*
* Let kernel threads use this to say that they
* allow a certain signal (since daemonize() will
* have disabled all of them by default).
*/
int allow_signal(int sig)
{
if (!valid_signal(sig) || sig < 1)
return -EINVAL;
spin_lock_irq(¤t->sighand->siglock);
sigdelset(¤t->blocked, sig);
if (!current->mm) {
/* Kernel threads handle their own signals.
Let the signal code know it'll be handled, so
that they don't get converted to SIGKILL or
just silently dropped */
current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
}
recalc_sigpending();
spin_unlock_irq(¤t->sighand->siglock);
return 0;
}
EXPORT_SYMBOL(allow_signal);
int disallow_signal(int sig)
{
if (!valid_signal(sig) || sig < 1)
return -EINVAL;
spin_lock_irq(¤t->sighand->siglock);
current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN;
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
recalc_sigpending();
spin_unlock_irq(¤t->sighand->siglock);
return 0;
}
EXPORT_SYMBOL(disallow_signal);
/*
* Put all the gunge required to become a kernel thread without
* attached user resources in one place where it belongs.
*/
void daemonize(const char *name, ...)
{
va_list args;
struct fs_struct *fs;
sigset_t blocked;
va_start(args, name);
vsnprintf(current->comm, sizeof(current->comm), name, args);
va_end(args);
/*
* If we were started as result of loading a module, close all of the
* user space pages. We don't need them, and if we didn't close them
* they would be locked into memory.
*/
exit_mm(current);
/*
* We don't want to have TIF_FREEZE set if the system-wide hibernation
* or suspend transition begins right now.
*/
if (current->nsproxy != &init_nsproxy) {
get_nsproxy(&init_nsproxy);
switch_task_namespaces(current, &init_nsproxy);
}
set_special_pids(&init_struct_pid);
/* Block and flush all signals */
sigfillset(&blocked);
sigprocmask(SIG_BLOCK, &blocked, NULL);
flush_signals(current);
/* Become as one with the init task */
exit_fs(current); /* current->fs->count--; */
fs = init_task.fs;
current->fs = fs;
atomic_inc(&fs->count);
current->files = init_task.files;
atomic_inc(¤t->files->count);
reparent_to_kthreadd();
static void close_files(struct files_struct * files)
/*
* It is safe to dereference the fd table without RCU or
* ->file_lock because this is the last reference to the
* files structure.
*/
for (;;) {
unsigned long set;
i = j * __NFDBITS;
if (i >= fdt->max_fds)
set = fdt->open_fds->fds_bits[j++];
struct file * file = xchg(&fdt->fd[i], NULL);
}
i++;
set >>= 1;
}
}
}
struct files_struct *get_files_struct(struct task_struct *task)
{
struct files_struct *files;
task_lock(task);
files = task->files;
if (files)
atomic_inc(&files->count);
task_unlock(task);
return files;
}
void put_files_struct(struct files_struct *files)
if (atomic_dec_and_test(&files->count)) {
close_files(files);
/*
* Free the fd and fdset arrays if we expanded them.
* If the fdtable was embedded, pass files for freeing
* at the end of the RCU grace period. Otherwise,
* you can free files immediately.
kmem_cache_free(files_cachep, files);
void reset_files_struct(struct files_struct *files)
struct task_struct *tsk = current;
struct files_struct *old;
old = tsk->files;
task_lock(tsk);
tsk->files = files;
task_unlock(tsk);
put_files_struct(old);
}
void exit_files(struct task_struct *tsk)
{
struct files_struct * files = tsk->files;
if (files) {
task_lock(tsk);
tsk->files = NULL;
task_unlock(tsk);
put_files_struct(files);
}
}
void put_fs_struct(struct fs_struct *fs)
{
/* No need to hold fs->lock if we are killing it */
if (atomic_dec_and_test(&fs->count)) {
void exit_fs(struct task_struct *tsk)
{
struct fs_struct * fs = tsk->fs;
if (fs) {
task_lock(tsk);
tsk->fs = NULL;
task_unlock(tsk);
put_fs_struct(fs);
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
#ifdef CONFIG_MM_OWNER
/*
* Task p is exiting and it owned mm, lets find a new owner for it
*/
static inline int
mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
{
/*
* If there are other users of the mm and the owner (us) is exiting
* we need to find a new owner to take on the responsibility.
*/
if (atomic_read(&mm->mm_users) <= 1)
return 0;
if (mm->owner != p)
return 0;
return 1;
}
void mm_update_next_owner(struct mm_struct *mm)
{
struct task_struct *c, *g, *p = current;
retry:
if (!mm_need_new_owner(mm, p))
return;
read_lock(&tasklist_lock);
/*
* Search in the children
*/
list_for_each_entry(c, &p->children, sibling) {
if (c->mm == mm)
goto assign_new_owner;
}
/*
* Search in the siblings
*/
list_for_each_entry(c, &p->parent->children, sibling) {
if (c->mm == mm)
goto assign_new_owner;
}
/*
* Search through everything else. We should not get
* here often
*/
do_each_thread(g, c) {
if (c->mm == mm)
goto assign_new_owner;
} while_each_thread(g, c);
read_unlock(&tasklist_lock);
/*
* We found no owner yet mm_users > 1: this implies that we are
* most likely racing with swapoff (try_to_unuse()) or /proc or
* ptrace or page migration (get_task_mm()). Mark owner as NULL.
return;
assign_new_owner:
BUG_ON(c == p);
get_task_struct(c);
/*
* The task_lock protects c->mm from changing.
* We always want mm->owner->mm == mm
*/
task_lock(c);
/*
* Delay read_unlock() till we have the task_lock()
* to ensure that c does not slip away underneath us
*/
read_unlock(&tasklist_lock);
if (c->mm != mm) {
task_unlock(c);
put_task_struct(c);
goto retry;
}
mm->owner = c;
task_unlock(c);
put_task_struct(c);
}
#endif /* CONFIG_MM_OWNER */
/*
* Turn us into a lazy TLB process if we
* aren't already..
*/
static void exit_mm(struct task_struct * tsk)
struct core_state *core_state;
mm_release(tsk, mm);
if (!mm)
return;
/*
* Serialize with any possible pending coredump.
* We must hold mmap_sem around checking core_state
* will increment ->nr_threads for each thread in the
* group with ->mm != NULL.
*/
down_read(&mm->mmap_sem);
core_state = mm->core_state;
if (core_state) {
struct core_thread self;
self.task = tsk;
self.next = xchg(&core_state->dumper.next, &self);
/*
* Implies mb(), the result of xchg() must be visible
* to core_state->dumper.
*/
if (atomic_dec_and_test(&core_state->nr_threads))
complete(&core_state->startup);
for (;;) {
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
if (!self.task) /* see coredump_finish() */
break;
schedule();
}
__set_task_state(tsk, TASK_RUNNING);
down_read(&mm->mmap_sem);
}
atomic_inc(&mm->mm_count);
/* more a memory barrier than a real lock */
task_lock(tsk);
tsk->mm = NULL;
up_read(&mm->mmap_sem);
enter_lazy_tlb(mm, current);
/* We don't want this task to be frozen prematurely */
clear_freeze_flag(tsk);
/* Returns nonzero if the child should be released. */
static int reparent_thread(struct task_struct *p, struct task_struct *father)
int dead;
if (p->pdeath_signal)
/* We already hold the tasklist_lock here. */
group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
list_move_tail(&p->sibling, &p->real_parent->children);
if (task_detached(p))
return 0;
/* If this is a threaded reparent there is no need to
* notify anyone anything has happened.
*/
if (same_thread_group(p->real_parent, father))
return 0;
/* We don't want people slaying init. */
p->exit_signal = SIGCHLD;
/* If we'd notified the old parent about this child's death,
* also notify the new parent.
*/
dead = 0;
p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
do_notify_parent(p, p->exit_signal);
if (task_detached(p)) {
p->exit_state = EXIT_DEAD;
dead = 1;
}
}
return dead;
}
/*
* When we die, we re-parent all our children.
* Try to give them to another thread in our thread
* group, and if no such member exists, give it to
* the child reaper process (ie "init") in our pid
* space.
static struct task_struct *find_new_reaper(struct task_struct *father)
struct pid_namespace *pid_ns = task_active_pid_ns(father);
struct task_struct *thread;
thread = father;
while_each_thread(father, thread) {
if (thread->flags & PF_EXITING)
continue;
if (unlikely(pid_ns->child_reaper == father))
pid_ns->child_reaper = thread;
return thread;
}
if (unlikely(pid_ns->child_reaper == father)) {
write_unlock_irq(&tasklist_lock);
if (unlikely(pid_ns == &init_pid_ns))
panic("Attempted to kill init!");
zap_pid_ns_processes(pid_ns);
write_lock_irq(&tasklist_lock);
* We can not clear ->child_reaper or leave it alone.
* There may by stealth EXIT_DEAD tasks on ->children,
* forget_original_parent() must move them somewhere.
pid_ns->child_reaper = init_pid_ns.child_reaper;
return pid_ns->child_reaper;
}
static void forget_original_parent(struct task_struct *father)
struct task_struct *p, *n, *reaper;
exit_ptrace(father);
write_lock_irq(&tasklist_lock);
reaper = find_new_reaper(father);
Matthias Kaehlcke
committed
list_for_each_entry_safe(p, n, &father->children, sibling) {
p->real_parent = reaper;
if (p->parent == father) {
BUG_ON(p->ptrace);
p->parent = p->real_parent;
}
if (reparent_thread(p, father))
list_add(&p->ptrace_entry, &ptrace_dead);;
write_unlock_irq(&tasklist_lock);
BUG_ON(!list_empty(&father->children));
list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) {
list_del_init(&p->ptrace_entry);
release_task(p);
}
}
/*
* Send signals to all our closest relatives so that they know
* to properly mourn us..
*/
static void exit_notify(struct task_struct *tsk, int group_dead)
/*
* This does two things:
*
* A. Make init inherit all the child processes
* B. Check to see if any process groups have become orphaned
* as a result of our exiting, and if they have any stopped
* jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
*/
forget_original_parent(tsk);
exit_task_namespaces(tsk);
write_lock_irq(&tasklist_lock);
if (group_dead)
kill_orphaned_pgrp(tsk->group_leader, NULL);
*
* Thread signals are configurable, but you aren't going to use
* that to send signals to arbitary processes.
* That stops right now.
*
* If the parent exec id doesn't match the exec id we saved
* when we started then we know the parent has changed security
* domain.
*
* If our self_exec id doesn't match our parent_exec_id then
* we have changed execution domain as these two values started
* the same after a fork.
*/
if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) &&
(tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
tsk->self_exec_id != tsk->parent_exec_id) &&
!capable(CAP_KILL))
signal = tracehook_notify_death(tsk, &cookie, group_dead);
tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
/* mt-exec, de_thread() is waiting for us */
if (thread_group_leader(tsk) &&
tsk->signal->group_exit_task &&
tsk->signal->notify_count < 0)
wake_up_process(tsk->signal->group_exit_task);
tracehook_report_death(tsk, signal, cookie, group_dead);
/* If the process is dead, release it - nobody will wait for it */
#ifdef CONFIG_DEBUG_STACK_USAGE
static void check_stack_usage(void)
{
static DEFINE_SPINLOCK(low_water_lock);
static int lowest_to_date = THREAD_SIZE;
unsigned long free;
free = stack_not_used(current);
if (free >= lowest_to_date)
return;
spin_lock(&low_water_lock);
if (free < lowest_to_date) {
printk(KERN_WARNING "%s used greatest stack depth: %lu bytes "
"left\n",
current->comm, free);
lowest_to_date = free;
}
spin_unlock(&low_water_lock);
}
#else
static inline void check_stack_usage(void) {}
#endif
NORET_TYPE void do_exit(long code)
{
struct task_struct *tsk = current;
int group_dead;
profile_task_exit(tsk);
WARN_ON(atomic_read(&tsk->fs_excl));
if (unlikely(in_interrupt()))
panic("Aiee, killing interrupt handler!");
if (unlikely(!tsk->pid))
panic("Attempted to kill the idle task!");
/*
* We're taking recursive faults here in do_exit. Safest is to just
* leave this task alone and wait for reboot.
*/
if (unlikely(tsk->flags & PF_EXITING)) {
printk(KERN_ALERT
"Fixing recursive fault but reboot is needed!\n");
/*
* We can do this unlocked here. The futex code uses
* this flag just to verify whether the pi state
* cleanup has been done or not. In the worst case it
* loops once more. We pretend that the cleanup was
* done as there is no way to return. Either the
* OWNER_DIED bit is set by now or we push the blocked
* task into the wait for ever nirwana as well.
*/
tsk->flags |= PF_EXITPIDONE;
set_current_state(TASK_UNINTERRUPTIBLE);
schedule();
}
/*
* tsk->flags are checked in the futex code to protect against
* an exiting task cleaning up the robust pi futexes.
*/
smp_mb();
spin_unlock_wait(&tsk->pi_lock);
if (unlikely(in_atomic()))
printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
current->comm, task_pid_nr(current),
hrtimer_cancel(&tsk->signal->real_timer);
exit_itimers(tsk->signal);
KaiGai Kohei
committed
acct_collect(code, group_dead);
if (unlikely(tsk->audit_context))
audit_free(tsk);
tsk->exit_code = code;
taskstats_exit(tsk, group_dead);
KaiGai Kohei
committed
acct_process();
trace_sched_process_exit(tsk);
exit_files(tsk);