/*
 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
 * policies)
 */

#include "sched.h"

#include <linux/slab.h>

static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);

struct rt_bandwidth def_rt_bandwidth;

static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
	struct rt_bandwidth *rt_b =
		container_of(timer, struct rt_bandwidth, rt_period_timer);
	ktime_t now;
	int overrun;
	int idle = 0;

	for (;;) {
		now = hrtimer_cb_get_time(timer);
		overrun = hrtimer_forward(timer, now, rt_b->rt_period);

		if (!overrun)
			break;

		idle = do_sched_rt_period_timer(rt_b, overrun);
	}

	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}

void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
{
	rt_b->rt_period = ns_to_ktime(period);
	rt_b->rt_runtime = runtime;

	raw_spin_lock_init(&rt_b->rt_runtime_lock);

	hrtimer_init(&rt_b->rt_period_timer,
			CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	rt_b->rt_period_timer.function = sched_rt_period_timer;
}

static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
		return;

	if (hrtimer_active(&rt_b->rt_period_timer))
		return;

	raw_spin_lock(&rt_b->rt_runtime_lock);
	start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
	raw_spin_unlock(&rt_b->rt_runtime_lock);
}

void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
{
	struct rt_prio_array *array;
	int i;

	array = &rt_rq->active;
	for (i = 0; i < MAX_RT_PRIO; i++) {
		INIT_LIST_HEAD(array->queue + i);
		__clear_bit(i, array->bitmap);
	}
	/* delimiter for bitsearch: */
	__set_bit(MAX_RT_PRIO, array->bitmap);

#if defined CONFIG_SMP
	rt_rq->highest_prio.curr = MAX_RT_PRIO;
	rt_rq->highest_prio.next = MAX_RT_PRIO;
	rt_rq->rt_nr_migratory = 0;
	rt_rq->overloaded = 0;
	plist_head_init(&rt_rq->pushable_tasks);
#endif

	rt_rq->rt_time = 0;
	rt_rq->rt_throttled = 0;
	rt_rq->rt_runtime = 0;
	raw_spin_lock_init(&rt_rq->rt_runtime_lock);
}

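/*
 * With CONFIG_RT_GROUP_SCHED each task group carries its own per-CPU
 * rt_rq and sched_rt_entity, allocated in alloc_rt_sched_group() below.
 */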
#ifdef CONFIG_RT_GROUP_SCHED
static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	hrtimer_cancel(&rt_b->rt_period_timer);
}

#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)

static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
#ifdef CONFIG_SCHED_DEBUG
	WARN_ON_ONCE(!rt_entity_is_task(rt_se));
#endif
	return container_of(rt_se, struct task_struct, rt);
}

static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
	return rt_rq->rq;
}

static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
	return rt_se->rt_rq;
}

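/*
 * Release the per-CPU rt_rq and rt_se arrays set up by
 * alloc_rt_sched_group(), cancelling the group's period timer first.
 */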
void free_rt_sched_group(struct task_group *tg)
{
	int i;

	if (tg->rt_se)
		destroy_rt_bandwidth(&tg->rt_bandwidth);

	for_each_possible_cpu(i) {
		if (tg->rt_rq)
			kfree(tg->rt_rq[i]);
		if (tg->rt_se)
			kfree(tg->rt_se[i]);
	}

	kfree(tg->rt_rq);
	kfree(tg->rt_se);
}

void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
		struct sched_rt_entity *rt_se, int cpu,
		struct sched_rt_entity *parent)
{
	struct rq *rq = cpu_rq(cpu);

	rt_rq->highest_prio.curr = MAX_RT_PRIO;
	rt_rq->rt_nr_boosted = 0;
	rt_rq->rq = rq;
	rt_rq->tg = tg;

	tg->rt_rq[cpu] = rt_rq;
	tg->rt_se[cpu] = rt_se;

	if (!rt_se)
		return;

	if (!parent)
		rt_se->rt_rq = &rq->rt;
	else
		rt_se->rt_rq = parent->my_q;

	rt_se->my_q = rt_rq;
	rt_se->parent = parent;
	INIT_LIST_HEAD(&rt_se->run_list);
}

int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
	struct rt_rq *rt_rq;
	struct sched_rt_entity *rt_se;
	int i;

	tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
	if (!tg->rt_rq)
		goto err;
	tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
	if (!tg->rt_se)
		goto err;

	init_rt_bandwidth(&tg->rt_bandwidth,
			ktime_to_ns(def_rt_bandwidth.rt_period), 0);

	for_each_possible_cpu(i) {
		rt_rq = kzalloc_node(sizeof(struct rt_rq),
				     GFP_KERNEL, cpu_to_node(i));
		if (!rt_rq)
			goto err;

		rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
				     GFP_KERNEL, cpu_to_node(i));
		if (!rt_se)
			goto err_free_rq;

		init_rt_rq(rt_rq, cpu_rq(i));
		rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
		init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
	}

	return 1;

err_free_rq:
	kfree(rt_rq);
err:
	return 0;
}

#else /* CONFIG_RT_GROUP_SCHED */

#define rt_entity_is_task(rt_se) (1)

static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
	return container_of(rt_se, struct task_struct, rt);
}

static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
	return container_of(rt_rq, struct rq, rt);
}

static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
	struct task_struct *p = rt_task_of(rt_se);
	struct rq *rq = task_rq(p);

	return &rq->rt;
}

void free_rt_sched_group(struct task_group *tg) { }

int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
	return 1;
}
#endif /* CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_SMP

static inline int rt_overloaded(struct rq *rq)
{
	return atomic_read(&rq->rd->rto_count);
}

static inline void rt_set_overload(struct rq *rq)
{
	if (!rq->online)
		return;

	cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
	/*
	 * Make sure the mask is visible before we set
	 * the overload count. That is checked to determine
	 * if we should look at the mask. It would be a shame
	 * if we looked at the mask, but the mask was not
	 * updated yet.
	 */
	wmb();
	atomic_inc(&rq->rd->rto_count);
}

static inline void rt_clear_overload(struct rq *rq)
{
	if (!rq->online)
		return;

	/* the order here really doesn't matter */
	atomic_dec(&rq->rd->rto_count);
	cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
}

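/*
 * Mark this rq's root domain as RT-overloaded when it has more than one
 * runnable RT task and at least one of them can migrate; clear the mark
 * again once that is no longer the case.
 */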
static void update_rt_migration(struct rt_rq *rt_rq)
{
	if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
		if (!rt_rq->overloaded) {
			rt_set_overload(rq_of_rt_rq(rt_rq));
			rt_rq->overloaded = 1;
		}
	} else if (rt_rq->overloaded) {
		rt_clear_overload(rq_of_rt_rq(rt_rq));
		rt_rq->overloaded = 0;
	}
}

static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	if (!rt_entity_is_task(rt_se))
		return;

	rt_rq = &rq_of_rt_rq(rt_rq)->rt;

	rt_rq->rt_nr_total++;
	if (rt_se->nr_cpus_allowed > 1)
		rt_rq->rt_nr_migratory++;

	update_rt_migration(rt_rq);
}

static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	if (!rt_entity_is_task(rt_se))
		return;

	rt_rq = &rq_of_rt_rq(rt_rq)->rt;

	rt_rq->rt_nr_total--;
	if (rt_se->nr_cpus_allowed > 1)
		rt_rq->rt_nr_migratory--;

	update_rt_migration(rt_rq);
}

static inline int has_pushable_tasks(struct rq *rq)
{
	return !plist_head_empty(&rq->rt.pushable_tasks);
}

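/*
 * The per-rq plist of pushable tasks is kept sorted by priority;
 * rq->rt.highest_prio.next caches the priority of the best task on it.
 */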
static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
{
	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
	plist_node_init(&p->pushable_tasks, p->prio);
	plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);

	/* Update the highest prio pushable task */
	if (p->prio < rq->rt.highest_prio.next)
		rq->rt.highest_prio.next = p->prio;
}

static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{
	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);

	/* Update the new highest prio pushable task */
	if (has_pushable_tasks(rq)) {
		p = plist_first_entry(&rq->rt.pushable_tasks,
				      struct task_struct, pushable_tasks);
		rq->rt.highest_prio.next = p->prio;
	} else
		rq->rt.highest_prio.next = MAX_RT_PRIO;
}

#else

static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
{
}

static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{
}

static inline
void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
}

static inline
void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
}

#endif /* CONFIG_SMP */

static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
	return !list_empty(&rt_se->run_list);
}

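/*
 * With CONFIG_RT_GROUP_SCHED the runtime and period of an rt_rq come from
 * its task group's rt_bandwidth; without it every rt_rq falls back to the
 * global def_rt_bandwidth.
 */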
#ifdef CONFIG_RT_GROUP_SCHED
static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
	if (!rt_rq->tg)
		return RUNTIME_INF;

	return rt_rq->rt_runtime;
}

static inline u64 sched_rt_period(struct rt_rq *rt_rq)
{
	return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
}

typedef struct task_group *rt_rq_iter_t;

static inline struct task_group *next_task_group(struct task_group *tg)
{
	do {
		tg = list_entry_rcu(tg->list.next,
			typeof(struct task_group), list);
	} while (&tg->list != &task_groups && task_group_is_autogroup(tg));

	if (&tg->list == &task_groups)
		tg = NULL;

	return tg;
}

#define for_each_rt_rq(rt_rq, iter, rq)					\
	for (iter = container_of(&task_groups, typeof(*iter), list);	\
		(iter = next_task_group(iter)) &&			\
		(rt_rq = iter->rt_rq[cpu_of(rq)]);)

static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
{
	list_add_rcu(&rt_rq->leaf_rt_rq_list,
			&rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
}

static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
{
	list_del_rcu(&rt_rq->leaf_rt_rq_list);
}

#define for_each_leaf_rt_rq(rt_rq, rq) \
	list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)

#define for_each_sched_rt_entity(rt_se) \
	for (; rt_se; rt_se = rt_se->parent)

static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
{
	return rt_se->my_q;
}

static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
static void dequeue_rt_entity(struct sched_rt_entity *rt_se);

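/*
 * Enqueue/dequeue the group's own sched_rt_entity, e.g. when the group is
 * unthrottled or throttled; preempt the running task if the group now
 * holds a higher-priority task.
 */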
static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
	struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
	struct sched_rt_entity *rt_se;

	int cpu = cpu_of(rq_of_rt_rq(rt_rq));

	rt_se = rt_rq->tg->rt_se[cpu];

	if (rt_rq->rt_nr_running) {
		if (rt_se && !on_rt_rq(rt_se))
			enqueue_rt_entity(rt_se, false);
		if (rt_rq->highest_prio.curr < curr->prio)
			resched_task(curr);
	}
}

static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
	struct sched_rt_entity *rt_se;
	int cpu = cpu_of(rq_of_rt_rq(rt_rq));

	rt_se = rt_rq->tg->rt_se[cpu];

	if (rt_se && on_rt_rq(rt_se))
		dequeue_rt_entity(rt_se);
}

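/*
 * A group rt_rq only counts as throttled while none of its entities are
 * priority boosted; for a task, rt_se_boosted() detects PI boosting by
 * comparing prio against normal_prio.
 */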
static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
	return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
}

static int rt_se_boosted(struct sched_rt_entity *rt_se)
{
	struct rt_rq *rt_rq = group_rt_rq(rt_se);
	struct task_struct *p;

	if (rt_rq)
		return !!rt_rq->rt_nr_boosted;

	p = rt_task_of(rt_se);
	return p->prio != p->normal_prio;
}

#ifdef CONFIG_SMP
static inline const struct cpumask *sched_rt_period_mask(void)
{
	return cpu_rq(smp_processor_id())->rd->span;
}
#else
static inline const struct cpumask *sched_rt_period_mask(void)
{
	return cpu_online_mask;
}
#endif

static inline
struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
{
	return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
}

static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
{
	return &rt_rq->tg->rt_bandwidth;
}

#else /* !CONFIG_RT_GROUP_SCHED */

static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
	return rt_rq->rt_runtime;
}

static inline u64 sched_rt_period(struct rt_rq *rt_rq)
{
	return ktime_to_ns(def_rt_bandwidth.rt_period);
}

typedef struct rt_rq *rt_rq_iter_t;

#define for_each_rt_rq(rt_rq, iter, rq) \
	for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)

static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
{
}

static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
{
}

#define for_each_leaf_rt_rq(rt_rq, rq) \
	for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)

#define for_each_sched_rt_entity(rt_se) \
	for (; rt_se; rt_se = NULL)

static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
{
	return NULL;
}

static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
	if (rt_rq->rt_nr_running)
		resched_task(rq_of_rt_rq(rt_rq)->curr);
}

static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
}

static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
	return rt_rq->rt_throttled;
}

static inline const struct cpumask *sched_rt_period_mask(void)
{
	return cpu_online_mask;
}

static inline
struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
{
	return &cpu_rq(cpu)->rt;
}

static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
{
	return &def_rt_bandwidth;
}

#endif /* CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_SMP
/*
 * We ran out of runtime, see if we can borrow some from our neighbours.
 */
static int do_balance_runtime(struct rt_rq *rt_rq)
{
	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
	struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
	int i, weight, more = 0;
	u64 rt_period;

	weight = cpumask_weight(rd->span);

	raw_spin_lock(&rt_b->rt_runtime_lock);
	rt_period = ktime_to_ns(rt_b->rt_period);
	for_each_cpu(i, rd->span) {
		struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
		s64 diff;

		if (iter == rt_rq)
			continue;

		raw_spin_lock(&iter->rt_runtime_lock);
		/*
		 * Either all rqs have inf runtime and there's nothing to steal
		 * or __disable_runtime() below sets a specific rq to inf to
		 * indicate it's been disabled and disallow stealing.
		 */
		if (iter->rt_runtime == RUNTIME_INF)
			goto next;

		/*
		 * From runqueues with spare time, take 1/n part of their
		 * spare time, but no more than our period.
		 */
		diff = iter->rt_runtime - iter->rt_time;
		if (diff > 0) {
			diff = div_u64((u64)diff, weight);
			if (rt_rq->rt_runtime + diff > rt_period)
				diff = rt_period - rt_rq->rt_runtime;
			iter->rt_runtime -= diff;
			rt_rq->rt_runtime += diff;
			more = 1;
			if (rt_rq->rt_runtime == rt_period) {
				raw_spin_unlock(&iter->rt_runtime_lock);
				break;
			}
		}
next:
		raw_spin_unlock(&iter->rt_runtime_lock);
	}
	raw_spin_unlock(&rt_b->rt_runtime_lock);

	return more;
}

/*
 * Ensure this RQ takes back all the runtime it lent to its neighbours.
 */
static void __disable_runtime(struct rq *rq)
{
	struct root_domain *rd = rq->rd;
	rt_rq_iter_t iter;
	struct rt_rq *rt_rq;

	if (unlikely(!scheduler_running))
		return;

	for_each_rt_rq(rt_rq, iter, rq) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
		s64 want;
		int i;

		raw_spin_lock(&rt_b->rt_runtime_lock);
		raw_spin_lock(&rt_rq->rt_runtime_lock);
		/*
		 * Either we're all inf and nobody needs to borrow, or we're
		 * already disabled and thus have nothing to do, or we have
		 * exactly the right amount of runtime to take out.
		 */
		if (rt_rq->rt_runtime == RUNTIME_INF ||
				rt_rq->rt_runtime == rt_b->rt_runtime)
			goto balanced;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);

		/*
		 * Calculate the difference between what we started out with
		 * and what we currently have, that's the amount of runtime
		 * we lent and now have to reclaim.
		 */
		want = rt_b->rt_runtime - rt_rq->rt_runtime;

		/*
		 * Greedy reclaim, take back as much as we can.
		 */
		for_each_cpu(i, rd->span) {
			struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
			s64 diff;

			/*
			 * Can't reclaim from ourselves or disabled runqueues.
			 */
			if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
				continue;

			raw_spin_lock(&iter->rt_runtime_lock);
			if (want > 0) {
				diff = min_t(s64, iter->rt_runtime, want);
				iter->rt_runtime -= diff;
				want -= diff;
			} else {
				iter->rt_runtime -= want;
				want -= want;
			}
			raw_spin_unlock(&iter->rt_runtime_lock);

			if (!want)
				break;
		}

		raw_spin_lock(&rt_rq->rt_runtime_lock);
		/*
		 * We cannot be left wanting - that would mean some runtime
		 * leaked out of the system.
		 */
		BUG_ON(want);
balanced:
		/*
		 * Disable all the borrow logic by pretending we have inf
		 * runtime - in which case borrowing doesn't make sense.
		 */
		rt_rq->rt_runtime = RUNTIME_INF;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		raw_spin_unlock(&rt_b->rt_runtime_lock);
	}
}

static void disable_runtime(struct rq *rq)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&rq->lock, flags);
	__disable_runtime(rq);
	raw_spin_unlock_irqrestore(&rq->lock, flags);
}

static void __enable_runtime(struct rq *rq)
{
	rt_rq_iter_t iter;
	struct rt_rq *rt_rq;

	if (unlikely(!scheduler_running))
		return;

	/*
	 * Reset each runqueue's bandwidth settings
	 */
	for_each_rt_rq(rt_rq, iter, rq) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);

		raw_spin_lock(&rt_b->rt_runtime_lock);
		raw_spin_lock(&rt_rq->rt_runtime_lock);
		rt_rq->rt_runtime = rt_b->rt_runtime;
		rt_rq->rt_time = 0;
		rt_rq->rt_throttled = 0;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		raw_spin_unlock(&rt_b->rt_runtime_lock);
	}
}

static void enable_runtime(struct rq *rq)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&rq->lock, flags);
	__enable_runtime(rq);
	raw_spin_unlock_irqrestore(&rq->lock, flags);
}

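/*
 * CPU hotplug notifier: reclaim borrowed runtime before a CPU goes down,
 * and restore the per-rq bandwidth settings when it comes back online or
 * the offline attempt fails.
 */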
int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	int cpu = (int)(long)hcpu;

	switch (action) {
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		disable_runtime(cpu_rq(cpu));
		return NOTIFY_OK;

	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		enable_runtime(cpu_rq(cpu));
		return NOTIFY_OK;

	default:
		return NOTIFY_DONE;
	}
}

static int balance_runtime(struct rt_rq *rt_rq)
{
	int more = 0;

	if (!sched_feat(RT_RUNTIME_SHARE))
		return more;

	if (rt_rq->rt_time > rt_rq->rt_runtime) {
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		more = do_balance_runtime(rt_rq);
		raw_spin_lock(&rt_rq->rt_runtime_lock);
	}

	return more;
}
#else /* !CONFIG_SMP */
static inline int balance_runtime(struct rt_rq *rt_rq)
{
	return 0;
}
#endif /* CONFIG_SMP */

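/*
 * Called from the rt period timer: refresh rt_time on every rt_rq covered
 * by the bandwidth's period mask, unthrottle runqueues that are back under
 * their runtime, and report whether the timer can be stopped (everything
 * idle or bandwidth unconstrained).
 */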
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
{
	int i, idle = 1, throttled = 0;
	const struct cpumask *span;

	span = sched_rt_period_mask();
	for_each_cpu(i, span) {
		int enqueue = 0;
		struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
		struct rq *rq = rq_of_rt_rq(rt_rq);

		raw_spin_lock(&rq->lock);
		if (rt_rq->rt_time) {
			u64 runtime;

			raw_spin_lock(&rt_rq->rt_runtime_lock);
			if (rt_rq->rt_throttled)
				balance_runtime(rt_rq);
			runtime = rt_rq->rt_runtime;
			rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
			if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
				rt_rq->rt_throttled = 0;
				enqueue = 1;

				/*
				 * Force a clock update if the CPU was idle,
				 * lest wakeup -> unthrottle time accumulate.
				 */
				if (rt_rq->rt_nr_running && rq->curr == rq->idle)
					rq->skip_clock_update = -1;
			}
			if (rt_rq->rt_time || rt_rq->rt_nr_running)
				idle = 0;
			raw_spin_unlock(&rt_rq->rt_runtime_lock);
		} else if (rt_rq->rt_nr_running) {
			idle = 0;
			if (!rt_rq_throttled(rt_rq))
				enqueue = 1;
		}
		if (rt_rq->rt_throttled)
			throttled = 1;

		if (enqueue)
			sched_rt_rq_enqueue(rt_rq);
		raw_spin_unlock(&rq->lock);
	}

	if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
		return 1;

	return idle;
}

static inline int rt_se_prio(struct sched_rt_entity *rt_se)
{
#ifdef CONFIG_RT_GROUP_SCHED
	struct rt_rq *rt_rq = group_rt_rq(rt_se);

	if (rt_rq)
		return rt_rq->highest_prio.curr;
#endif

	return rt_task_of(rt_se)->prio;
}

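/*
 * Check whether this rt_rq has used up its runtime for the current period
 * (after trying to borrow more); if so, throttle it, dequeue it and return
 * 1 so the caller reschedules.
 */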
static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
{
	u64 runtime = sched_rt_runtime(rt_rq);

	if (rt_rq->rt_throttled)
		return rt_rq_throttled(rt_rq);

	if (runtime >= sched_rt_period(rt_rq))
		return 0;

	balance_runtime(rt_rq);
	runtime = sched_rt_runtime(rt_rq);
	if (runtime == RUNTIME_INF)
		return 0;

	if (rt_rq->rt_time > runtime) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);

		/*
		 * Don't actually throttle groups that have no runtime assigned
		 * but accrue some time due to boosting.
		 */
		if (likely(rt_b->rt_runtime)) {
			static bool once = false;

			rt_rq->rt_throttled = 1;

			if (!once) {
				once = true;
				printk_sched("sched: RT throttling activated\n");
			}
		} else {
			/*
			 * In case we did anyway, make it go away,
			 * replenishment is a joke, since it will replenish us
			 * with exactly 0 ns.
			 */
			rt_rq->rt_time = 0;
		}

		if (rt_rq_throttled(rt_rq)) {
			sched_rt_rq_dequeue(rt_rq);
			return 1;
		}
	}

	return 0;
}

/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
static void update_curr_rt(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	struct sched_rt_entity *rt_se = &curr->rt;
	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
	u64 delta_exec;

	if (curr->sched_class != &rt_sched_class)
		return;

	delta_exec = rq->clock_task - curr->se.exec_start;
	if (unlikely((s64)delta_exec < 0))
		delta_exec = 0;

	schedstat_set(curr->se.statistics.exec_max,
		      max(curr->se.statistics.exec_max, delta_exec));

	curr->se.sum_exec_runtime += delta_exec;
	account_group_exec_runtime(curr, delta_exec);

	curr->se.exec_start = rq->clock_task;
	cpuacct_charge(curr, delta_exec);

	sched_rt_avg_update(rq, delta_exec);

	if (!rt_bandwidth_enabled())
		return;

	for_each_sched_rt_entity(rt_se) {
		rt_rq = rt_rq_of_se(rt_se);

		if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
			raw_spin_lock(&rt_rq->rt_runtime_lock);
			rt_rq->rt_time += delta_exec;
			if (sched_rt_runtime_exceeded(rt_rq))
				resched_task(curr);
			raw_spin_unlock(&rt_rq->rt_runtime_lock);
		}
	}
}

#if defined CONFIG_SMP

static void
inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

	if (rq->online && prio < prev_prio)
		cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
}

static void
dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

	if (rq->online && rt_rq->highest_prio.curr != prev_prio)
		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
}

#else /* CONFIG_SMP */

static inline
void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
static inline
void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}

#endif /* CONFIG_SMP */

#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED

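/*
 * Track the highest priority queued on this rt_rq; on SMP, propagate
 * changes to cpupri via the helpers above.
 */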
static void
inc_rt_prio(struct rt_rq *rt_rq, int prio)
{
	int prev_prio = rt_rq->highest_prio.curr;

	if (prio < prev_prio)
		rt_rq->highest_prio.curr = prio;

	inc_rt_prio_smp(rt_rq, prio, prev_prio);
}

static void
dec_rt_prio(struct rt_rq *rt_rq, int prio)
{
	int prev_prio = rt_rq->highest_prio.curr;

	if (rt_rq->rt_nr_running) {

		WARN_ON(prio < prev_prio);

		/*
		 * This may have been our highest task, and therefore
		 * we may have some recomputation to do
		 */
		if (prio == prev_prio) {
			struct rt_prio_array *array = &rt_rq->active;

			rt_rq->highest_prio.curr =
				sched_find_first_bit(array->bitmap);
		}

	} else
		rt_rq->highest_prio.curr = MAX_RT_PRIO;

	dec_rt_prio_smp(rt_rq, prio, prev_prio);
}

#else

static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}

#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_RT_GROUP_SCHED
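/*
 * Per-group accounting: rt_nr_boosted counts priority-boosted entities in
 * the group, and the group's bandwidth timer is started whenever entities
 * are enqueued.
 */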

static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	if (rt_se_boosted(rt_se))
		rt_rq->rt_nr_boosted++;

	if (rt_rq->tg)
		start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
}

static void
dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	if (rt_se_boosted(rt_se))
		rt_rq->rt_nr_boosted--;

	WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
}

#else /* CONFIG_RT_GROUP_SCHED */

static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	start_rt_bandwidth(&def_rt_bandwidth);
}

static inline
void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}

#endif /* CONFIG_RT_GROUP_SCHED */

static inline
void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	int prio = rt_se_prio(rt_se);

	WARN_ON(!rt_prio(prio));
	rt_rq->rt_nr_running++;

	inc_rt_prio(rt_rq, prio);
	inc_rt_migration(rt_se, rt_rq);
	inc_rt_group(rt_se, rt_rq);
}

static inline
void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
	WARN_ON(!rt_rq->rt_nr_running);
	rt_rq->rt_nr_running--;

	dec_rt_prio(rt_rq, rt_se_prio(rt_se));
	dec_rt_migration(rt_se, rt_rq);
	dec_rt_group(rt_se, rt_rq);
}

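/*
 * Put rt_se on its rt_rq's priority array, at the head or tail of the list
 * for its priority, and update the run counts; throttled or empty child
 * groups are not queued.
 */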
static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
{
	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
	struct rt_prio_array *array = &rt_rq->active;
	struct rt_rq *group_rq = group_rt_rq(rt_se);
	struct list_head *queue = array->queue + rt_se_prio(rt_se);

	/*
	 * Don't enqueue the group if it's throttled, or when empty.
	 * The latter is a consequence of the former when a child group
	 * gets throttled and the current group doesn't have any other
	 * active members.
	 */
	if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
		return;

	if (!rt_rq->rt_nr_running)
		list_add_leaf_rt_rq(rt_rq);

	if (head)
		list_add(&rt_se->run_list, queue);
	else
		list_add_tail(&rt_se->run_list, queue);
	__set_bit(rt_se_prio(rt_se), array->bitmap);

	inc_rt_tasks(rt_se, rt_rq);
}

static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
{
	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
	struct rt_prio_array *array = &rt_rq->active;

	list_del_init(&rt_se->run_list);
	if (list_empty(array->queue + rt_se_prio(rt_se)))
		__clear_bit(rt_se_prio(rt_se), array->bitmap);

	dec_rt_tasks(rt_se, rt_rq);
	if (!rt_rq->rt_nr_running)
		list_del_leaf_rt_rq(rt_rq);
}

/*
 * Because the prio of an upper entry depends on the lower
 * entries, we must remove entries top-down.
 */
static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
{
	struct sched_rt_entity *back = NULL;

	for_each_sched_rt_entity(rt_se) {
		rt_se->back = back;
		back = rt_se;
	}

	for (rt_se = back; rt_se; rt_se = rt_se->back) {
		if (on_rt_rq(rt_se))
			__dequeue_rt_entity(rt_se);
	}
}

static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
{
	dequeue_rt_stack(rt_se);
	for_each_sched_rt_entity(rt_se)
		__enqueue_rt_entity(rt_se, head);
}

static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
{
	dequeue_rt_stack(rt_se);

	for_each_sched_rt_entity(rt_se) {
		struct rt_rq *rt_rq = group_rt_rq(rt_se);

		if (rt_rq && rt_rq->rt_nr_running)
			__enqueue_rt_entity(rt_se, false);
	}
}

/*
 * Adding/removing a task to/from a priority array:
 */
static void
enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
	struct sched_rt_entity *rt_se = &p->rt;

	if (flags & ENQUEUE_WAKEUP)