
Commit f5fa3630 authored by Linus Torvalds

Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Ingo Molnar:
 "Various scheduler fixes all over the place: three SCHED_DL fixes,
  three sched/numa fixes, two generic race fixes and a comment fix"

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/dl: Fix preemption checks
  sched: Update comments for CLONE_NEWNS
  sched: stop the unbound recursion in preempt_schedule_context()
  sched/fair: Fix division by zero sysctl_numa_balancing_scan_size
  sched/fair: Care divide error in update_task_scan_period()
  sched/numa: Fix unsafe get_task_struct() in task_numa_assign()
  sched/deadline: Fix races between rt_mutex_setprio() and dl_task_timer()
  sched/deadline: Don't replenish from a !SCHED_DEADLINE entity
  sched: Fix race between task_group and sched_task_group
parents 5656b408 f3a7e1a9
@@ -105,6 +105,7 @@ static __always_inline bool should_resched(void)
# ifdef CONFIG_CONTEXT_TRACKING
extern asmlinkage void ___preempt_schedule_context(void);
# define __preempt_schedule_context() asm ("call ___preempt_schedule_context")
extern asmlinkage void preempt_schedule_context(void);
# endif
#endif
......
@@ -13,7 +13,7 @@
#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */
#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */
#define CLONE_THREAD 0x00010000 /* Same thread group? */
#define CLONE_NEWNS 0x00020000 /* New namespace group? */
#define CLONE_NEWNS 0x00020000 /* New mount namespace group */
#define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */
#define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */
#define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */
......
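The one-line change above only corrects the comment: CLONE_NEWNS gives the new task a private mount namespace, and has nothing to do with a "namespace group". For reference, a minimal userspace sketch that exercises the same flag through unshare(2); this is illustrative only, not part of the commit, and it needs CAP_SYS_ADMIN:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	/* CLONE_NEWNS requests a private copy of the caller's mount
	 * namespace; mounts made afterwards are not visible outside. */
	if (unshare(CLONE_NEWNS) == -1) {
		perror("unshare(CLONE_NEWNS)");
		return 1;
	}
	puts("now in a private mount namespace");
	return 0;
}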
@@ -107,46 +107,6 @@ void context_tracking_user_enter(void)
}
NOKPROBE_SYMBOL(context_tracking_user_enter);
#ifdef CONFIG_PREEMPT
/**
* preempt_schedule_context - preempt_schedule called by tracing
*
* The tracing infrastructure uses preempt_enable_notrace to prevent
* recursion and tracing preempt enabling caused by the tracing
* infrastructure itself. But as tracing can happen in areas coming
* from userspace or just about to enter userspace, a preempt enable
* can occur before user_exit() is called. This will cause the scheduler
* to be called when the system is still in usermode.
*
* To prevent this, the preempt_enable_notrace will use this function
* instead of preempt_schedule() to exit user context if needed before
* calling the scheduler.
*/
asmlinkage __visible void __sched notrace preempt_schedule_context(void)
{
enum ctx_state prev_ctx;
if (likely(!preemptible()))
return;
/*
* Need to disable preemption in case user_exit() is traced
* and the tracer calls preempt_enable_notrace() causing
* an infinite recursion.
*/
preempt_disable_notrace();
prev_ctx = exception_enter();
preempt_enable_no_resched_notrace();
preempt_schedule();
preempt_disable_notrace();
exception_exit(prev_ctx);
preempt_enable_notrace();
}
EXPORT_SYMBOL_GPL(preempt_schedule_context);
#endif /* CONFIG_PREEMPT */
/**
* context_tracking_user_exit - Inform the context tracking that the CPU is
* exiting userspace mode and entering the kernel.
......
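Note on the removal above: this copy of preempt_schedule_context() ended with preempt_enable_notrace(), and under CONFIG_PREEMPT that macro can call straight back into preempt_schedule_context() through the __preempt_schedule_context() thunk whenever the task still needs rescheduling, so the function could recurse without bound. The replacement added to kernel/sched/core.c in the next hunk drains need_resched() with a do/while loop instead. A rough userspace model of the difference (hypothetical names, not kernel code):

#include <stdio.h>

/* Hypothetical stand-in for the need-resched flag: pretend it stays set
 * for a few consecutive preemption points. */
static int pending = 3;
static int need_resched(void) { return pending > 0; }
static void schedule_once(void) { pending--; }

/* Old shape: the epilogue re-enters the same function, so stack depth
 * grows with every pending reschedule. */
static void preempt_point_recursive(int depth)
{
	if (!need_resched())
		return;
	printf("recursive call, depth %d\n", depth);
	schedule_once();
	preempt_point_recursive(depth + 1);	/* models preempt_enable_notrace() firing again */
}

/* New shape, like the copy added to kernel/sched/core.c: iterate until
 * need_resched() clears, keeping stack depth constant. */
static void preempt_point_loop(void)
{
	do {
		printf("loop iteration, constant depth\n");
		schedule_once();
	} while (need_resched());
}

int main(void)
{
	preempt_point_recursive(0);
	pending = 3;
	preempt_point_loop();
	return 0;
}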
@@ -2951,6 +2951,47 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
}
NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);
#ifdef CONFIG_CONTEXT_TRACKING
/**
* preempt_schedule_context - preempt_schedule called by tracing
*
* The tracing infrastructure uses preempt_enable_notrace to prevent
* recursion and tracing preempt enabling caused by the tracing
* infrastructure itself. But as tracing can happen in areas coming
* from userspace or just about to enter userspace, a preempt enable
* can occur before user_exit() is called. This will cause the scheduler
* to be called when the system is still in usermode.
*
* To prevent this, the preempt_enable_notrace will use this function
* instead of preempt_schedule() to exit user context if needed before
* calling the scheduler.
*/
asmlinkage __visible void __sched notrace preempt_schedule_context(void)
{
enum ctx_state prev_ctx;
if (likely(!preemptible()))
return;
do {
__preempt_count_add(PREEMPT_ACTIVE);
/*
* Needs preempt disabled in case user_exit() is traced
* and the tracer calls preempt_enable_notrace() causing
* an infinite recursion.
*/
prev_ctx = exception_enter();
__schedule();
exception_exit(prev_ctx);
__preempt_count_sub(PREEMPT_ACTIVE);
barrier();
} while (need_resched());
}
EXPORT_SYMBOL_GPL(preempt_schedule_context);
#endif /* CONFIG_CONTEXT_TRACKING */
#endif /* CONFIG_PREEMPT */
/*
@@ -7833,6 +7874,11 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
sched_offline_group(tg);
}
static void cpu_cgroup_fork(struct task_struct *task)
{
sched_move_task(task);
}
static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
{
@@ -8205,6 +8251,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
.css_free = cpu_cgroup_css_free,
.css_online = cpu_cgroup_css_online,
.css_offline = cpu_cgroup_css_offline,
.fork = cpu_cgroup_fork,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
.exit = cpu_cgroup_exit,
......
@@ -518,12 +518,20 @@ again:
}
/*
* We need to take care of a possible races here. In fact, the
* task might have changed its scheduling policy to something
* different from SCHED_DEADLINE or changed its reservation
* parameters (through sched_setattr()).
* We need to take care of several possible races here:
*
* - the task might have changed its scheduling policy
* to something different than SCHED_DEADLINE
* - the task might have changed its reservation parameters
* (through sched_setattr())
* - the task might have been boosted by someone else and
* might be in the boosting/deboosting path
*
* In all this cases we bail out, as the task is already
* in the runqueue or is going to be enqueued back anyway.
*/
if (!dl_task(p) || dl_se->dl_new)
if (!dl_task(p) || dl_se->dl_new ||
dl_se->dl_boosted || !dl_se->dl_throttled)
goto unlock;
sched_clock_tick();
@@ -532,7 +540,7 @@ again:
dl_se->dl_yielded = 0;
if (task_on_rq_queued(p)) {
enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
if (task_has_dl_policy(rq->curr))
if (dl_task(rq->curr))
check_preempt_curr_dl(rq, p, 0);
else
resched_curr(rq);
@@ -847,8 +855,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
* smaller than our one... OTW we keep our runtime and
* deadline.
*/
if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio))
if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) {
pi_se = &pi_task->dl;
} else if (!dl_prio(p->normal_prio)) {
/*
* Special case in which we have a !SCHED_DEADLINE task
* that is going to be deboosted, but exceedes its
* runtime while doing so. No point in replenishing
* it, as it's going to return back to its original
* scheduling class after this.
*/
BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH);
return;
}
/*
* If p is throttled, we do nothing. In fact, if it exhausted
@@ -1607,8 +1626,12 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
/* Only reschedule if pushing failed */
check_resched = 0;
#endif /* CONFIG_SMP */
if (check_resched && task_has_dl_policy(rq->curr))
check_preempt_curr_dl(rq, p, 0);
if (check_resched) {
if (dl_task(rq->curr))
check_preempt_curr_dl(rq, p, 0);
else
resched_curr(rq);
}
}
}
......
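The deadline-class hunks above all deal with state changes that can hit a throttled task while its replenishment timer is armed: a policy or parameter change through sched_setattr(), or a boost/deboost through rt_mutex_setprio(). For context, a hedged sketch of the kind of sched_setattr() call involved; glibc has no wrapper for it, so the example goes through syscall(2) and declares struct sched_attr locally, following the sched_setattr(2) man page (syscall number and privilege requirements are assumptions noted in the comments):

#define _GNU_SOURCE
#include <unistd.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>

#ifndef SCHED_DEADLINE
# define SCHED_DEADLINE	6
#endif
#ifndef SYS_sched_setattr
# define SYS_sched_setattr 314	/* x86_64; other architectures differ */
#endif

/* Older libc headers do not export struct sched_attr, so declare it here
 * with the layout documented in sched_setattr(2). */
struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;		/* ns */
	uint64_t sched_deadline;	/* ns */
	uint64_t sched_period;		/* ns */
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_policy   = SCHED_DEADLINE;
	attr.sched_runtime  = 10 * 1000 * 1000;	/* 10 ms of budget ... */
	attr.sched_deadline = 30 * 1000 * 1000;	/* ... every 30 ms */
	attr.sched_period   = 30 * 1000 * 1000;

	/* Needs sufficient privilege (root / CAP_SYS_NICE). */
	if (syscall(SYS_sched_setattr, 0, &attr, 0) == -1) {
		perror("sched_setattr");
		return 1;
	}
	/* A later sched_setattr() back to SCHED_OTHER, while throttled, is
	 * exactly the policy change dl_task_timer() has to cope with. */
	puts("running as SCHED_DEADLINE");
	return 0;
}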
@@ -828,11 +828,12 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
static unsigned int task_scan_min(struct task_struct *p)
{
unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);
unsigned int scan, floor;
unsigned int windows = 1;
if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
if (scan_size < MAX_SCAN_WINDOW)
windows = MAX_SCAN_WINDOW / scan_size;
floor = 1000 / windows;
scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
@@ -1164,9 +1165,19 @@ static void task_numa_compare(struct task_numa_env *env,
long moveimp = imp;
rcu_read_lock();
cur = ACCESS_ONCE(dst_rq->curr);
if (cur->pid == 0) /* idle */
raw_spin_lock_irq(&dst_rq->lock);
cur = dst_rq->curr;
/*
* No need to move the exiting task, and this ensures that ->curr
* wasn't reaped and thus get_task_struct() in task_numa_assign()
* is safe under RCU read lock.
* Note that rcu_read_lock() itself can't protect from the final
* put_task_struct() after the last schedule().
*/
if ((cur->flags & PF_EXITING) || is_idle_task(cur))
cur = NULL;
raw_spin_unlock_irq(&dst_rq->lock);
/*
* "imp" is the fault differential for the source task between the
@@ -1520,7 +1531,7 @@ static void update_task_scan_period(struct task_struct *p,
* scanning faster if shared accesses dominate as it may
* simply bounce migrations uselessly
*/
ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
}
......
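The two fair.c hunks above close two separate divide-by-zero windows: task_scan_min() now snapshots the sysctl with ACCESS_ONCE so the range check and the division use the same value (the sysctl itself is also clamped to at least 1 in the kernel/sysctl.c hunk below), and update_task_scan_period() adds 1 to its denominator because a task's private and shared NUMA fault counts can both be zero. A quick worked check of the latter case, reusing the kernel's DIV_ROUND_UP definition:

#include <stdio.h>

/* Same definition as the kernel's DIV_ROUND_UP. */
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define NUMA_PERIOD_SLOTS	10	/* value used in kernel/sched/fair.c */

int main(void)
{
	unsigned long private = 0, shared = 0;	/* a task with no recorded NUMA faults */

	/* The old expression, DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS,
	 * private + shared), divides by zero here; the patched one stays defined. */
	unsigned long ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS,
					   private + shared + 1);
	printf("ratio = %lu\n", ratio);	/* 0, instead of a divide error */
	return 0;
}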
@@ -387,7 +387,8 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_numa_balancing_scan_size,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
.proc_handler = proc_dointvec_minmax,
.extra1 = &one,
},
{
.procname = "numa_balancing",
......
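With proc_dointvec_minmax and extra1 = &one, the scan-size sysctl now rejects values below 1, which is the other half of the divide-by-zero fix in task_scan_min(). A small hedged check; the proc path is an assumption, since the hunk does not show the .procname (numa_balancing_scan_size_mb is the conventional name):

#include <stdio.h>
#include <string.h>
#include <errno.h>

int main(void)
{
	/* Path assumed, not shown in this hunk. */
	const char *path = "/proc/sys/kernel/numa_balancing_scan_size_mb";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* proc_dointvec_minmax with extra1 = &one should refuse this write;
	 * the error surfaces when the buffered data is flushed at fclose(). */
	fprintf(f, "0\n");
	if (fclose(f) != 0)
		printf("write of 0 rejected as expected: %s\n", strerror(errno));
	else
		printf("write of 0 accepted (kernel without this fix?)\n");
	return 0;
}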