/*
 * kernel/workqueue.c - generic async execution with shared worker pool
 *
 * Copyright (C) 2002		Ingo Molnar
 *
 *   Derived from the taskqueue/keventd code by:
 *     David Woodhouse <dwmw2@infradead.org>
 *     Andrew Morton
 *     Kai Petzke <wpp@marie.physik.tu-berlin.de>
 *     Theodore Ts'o <tytso@mit.edu>
 *
 * Made to use alloc_percpu by Christoph Lameter.
 *
 * Copyright (C) 2010		SUSE Linux Products GmbH
 * Copyright (C) 2010		Tejun Heo <tj@kernel.org>
 *
 * This is the generic async execution mechanism.  Work items are
 * executed in process context.  The worker pool is shared and
 * automatically managed.  There are two worker pools for each CPU (one for
 * normal work items and the other for high priority ones) and some extra
 * pools for workqueues which are not bound to any specific CPU - the
 * number of these backing pools is dynamic.
 *
 * Please read Documentation/workqueue.txt for details.
 */
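
/*
 * A minimal usage sketch of the API implemented here (identifiers are
 * illustrative; see include/linux/workqueue.h and the document above for
 * the authoritative interface):
 *
 *	static void example_fn(struct work_struct *work)
 *	{
 *		... runs in process context on a shared worker ...
 *	}
 *	static DECLARE_WORK(example_work, example_fn);
 *
 *	schedule_work(&example_work);		queue on system_wq
 *	flush_work(&example_work);		wait for completion
 */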

#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/signal.h>
#include <linux/completion.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/kthread.h>
#include <linux/hardirq.h>
#include <linux/mempolicy.h>
#include <linux/freezer.h>
#include <linux/kallsyms.h>
#include <linux/debug_locks.h>
#include <linux/lockdep.h>
#include <linux/idr.h>
#include <linux/jhash.h>
#include <linux/hashtable.h>
#include <linux/rculist.h>
#include <linux/nodemask.h>
#include <linux/moduleparam.h>
#include <linux/uaccess.h>

#include "workqueue_internal.h"

enum {
	/*
	 * worker_pool flags
	 *
	 * A bound pool is either associated or disassociated with its CPU.
	 * While associated (!DISASSOCIATED), all workers are bound to the
	 * CPU and none has %WORKER_UNBOUND set and concurrency management
	 * is in effect.
	 *
	 * While DISASSOCIATED, the cpu may be offline and all workers have
	 * %WORKER_UNBOUND set and concurrency management disabled, and may
	 * be executing on any CPU.  The pool behaves as an unbound one.
	 *
	 * Note that DISASSOCIATED should be flipped only while holding
	 * attach_mutex to avoid changing binding state while
	 * worker_attach_to_pool() is in progress.
	 */
	POOL_DISASSOCIATED	= 1 << 2,	/* cpu can't serve workers */

	/* worker flags */
	WORKER_DIE		= 1 << 1,	/* die die die */
	WORKER_IDLE		= 1 << 2,	/* is idle */
	WORKER_PREP		= 1 << 3,	/* preparing to run works */
	WORKER_CPU_INTENSIVE	= 1 << 6,	/* cpu intensive */
	WORKER_UNBOUND		= 1 << 7,	/* worker is unbound */
	WORKER_REBOUND		= 1 << 8,	/* worker was rebound */

	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_CPU_INTENSIVE |
				  WORKER_UNBOUND | WORKER_REBOUND,

	NR_STD_WORKER_POOLS	= 2,		/* # standard pools per cpu */

	UNBOUND_POOL_HASH_ORDER	= 6,		/* hashed by pool->attrs */
	BUSY_WORKER_HASH_ORDER	= 6,		/* 64 pointers */

	MAX_IDLE_WORKERS_RATIO	= 4,		/* 1/4 of busy can be idle */
	IDLE_WORKER_TIMEOUT	= 300 * HZ,	/* keep idle ones for 5 mins */

	MAYDAY_INITIAL_TIMEOUT  = HZ / 100 >= 2 ? HZ / 100 : 2,
						/* call for help after 10ms
						   (min two ticks) */
	MAYDAY_INTERVAL		= HZ / 10,	/* and then every 100ms */
	CREATE_COOLDOWN		= HZ,		/* time to breathe after fail */

	/*
	 * Rescue workers are used only in emergencies and shared by
	 * all cpus.  Give MIN_NICE.
	 */
	RESCUER_NICE_LEVEL	= MIN_NICE,
	HIGHPRI_NICE_LEVEL	= MIN_NICE,

	WQ_NAME_LEN		= 24,
};

/*
 * Structure fields follow one of the following exclusion rules.
 *
 * I: Modifiable by initialization/destruction paths and read-only for
 *    everyone else.
 *
 * P: Preemption protected.  Disabling preemption is enough and should
 *    only be modified and accessed from the local cpu.
 *
 * L: pool->lock protected.  Access with pool->lock held.
 *
 * X: During normal operation, modification requires pool->lock and should
 *    be done only from local cpu.  Either disabling preemption on local
 *    cpu or grabbing pool->lock is enough for read access.  If
 *    POOL_DISASSOCIATED is set, it's identical to L.
 *
 * A: pool->attach_mutex protected.
 *
 * PL: wq_pool_mutex protected.
 *
 * PR: wq_pool_mutex protected for writes.  Sched-RCU protected for reads.
 *
 * PW: wq_pool_mutex and wq->mutex protected for writes.  Either for reads.
 *
 * PWR: wq_pool_mutex and wq->mutex protected for writes.  Either or
 *      sched-RCU for reads.
 *
 * WQ: wq->mutex protected.
 *
 * WR: wq->mutex protected for writes.  Sched-RCU protected for reads.
 *
 * MD: wq_mayday_lock protected.
 */
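
/*
 * As a rough example of the "PL" rule above, workqueue_freezing (declared
 * below) may only be written with wq_pool_mutex held, e.g.:
 *
 *	mutex_lock(&wq_pool_mutex);
 *	workqueue_freezing = true;
 *	mutex_unlock(&wq_pool_mutex);
 *
 * which is the pattern the freeze path follows.
 */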

/* struct worker is defined in workqueue_internal.h */

struct worker_pool {
	spinlock_t		lock;		/* the pool lock */
	int			cpu;		/* I: the associated cpu */
	int			node;		/* I: the associated node ID */
	int			id;		/* I: pool ID */
	unsigned int		flags;		/* X: flags */

	unsigned long		watchdog_ts;	/* L: watchdog timestamp */

	struct list_head	worklist;	/* L: list of pending works */
	int			nr_workers;	/* L: total number of workers */

	/* nr_idle includes the ones off idle_list for rebinding */
	int			nr_idle;	/* L: currently idle ones */

	struct list_head	idle_list;	/* X: list of idle workers */
	struct timer_list	idle_timer;	/* L: worker idle timeout */
	struct timer_list	mayday_timer;	/* L: SOS timer for workers */

	/* a worker is either on busy_hash or idle_list, or the manager */
	DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
						/* L: hash of busy workers */

	/* see manage_workers() for details on the two manager mutexes */
	struct mutex		manager_arb;	/* manager arbitration */
	struct worker		*manager;	/* L: purely informational */
	struct mutex		attach_mutex;	/* attach/detach exclusion */
	struct list_head	workers;	/* A: attached workers */
	struct completion	*detach_completion; /* all workers detached */

	struct ida		worker_ida;	/* worker IDs for task name */

	struct workqueue_attrs	*attrs;		/* I: worker attributes */
	struct hlist_node	hash_node;	/* PL: unbound_pool_hash node */
	int			refcnt;		/* PL: refcnt for unbound pools */

	/*
	 * The current concurrency level.  As it's likely to be accessed
	 * from other CPUs during try_to_wake_up(), put it in a separate
	 * cacheline.
	 */
	atomic_t		nr_running ____cacheline_aligned_in_smp;

	/*
	 * Destruction of pool is sched-RCU protected to allow dereferences
	 * from get_work_pool().
	 */
	struct rcu_head		rcu;
} ____cacheline_aligned_in_smp;

/*
 * The per-pool workqueue.  While queued, the lower WORK_STRUCT_FLAG_BITS
 * of work_struct->data are used for flags and the remaining high bits
 * point to the pwq; thus, pwqs need to be aligned at two's power of the
 * number of flag bits.
 */
struct pool_workqueue {
	struct worker_pool	*pool;		/* I: the associated pool */
	struct workqueue_struct *wq;		/* I: the owning workqueue */
	int			work_color;	/* L: current color */
	int			flush_color;	/* L: flushing color */
	int			refcnt;		/* L: reference count */
	int			nr_in_flight[WORK_NR_COLORS];
						/* L: nr of in_flight works */
	int			nr_active;	/* L: nr of active works */
	int			max_active;	/* L: max active works */
	struct list_head	delayed_works;	/* L: delayed works */
	struct list_head	pwqs_node;	/* WR: node on wq->pwqs */
	struct list_head	mayday_node;	/* MD: node on wq->maydays */

	/*
	 * Release of unbound pwq is punted to system_wq.  See put_pwq()
	 * and pwq_unbound_release_workfn() for details.  pool_workqueue
	 * itself is also sched-RCU protected so that the first pwq can be
	 * determined without grabbing wq->mutex.
	 */
	struct work_struct	unbound_release_work;
	struct rcu_head		rcu;
} __aligned(1 << WORK_STRUCT_FLAG_BITS);

/*
 * Structure used to wait for workqueue flush.
 */
struct wq_flusher {
	struct list_head	list;		/* WQ: list of flushers */
	int			flush_color;	/* WQ: flush color waiting for */
	struct completion	done;		/* flush completion */
};

struct wq_device;

/*
 * The externally visible workqueue.  It relays the issued work items to
 * the appropriate worker_pool through its pool_workqueues.
 */
struct workqueue_struct {
	struct list_head	pwqs;		/* WR: all pwqs of this wq */
	struct list_head	list;		/* PR: list of all workqueues */

	struct mutex		mutex;		/* protects this wq */
	int			work_color;	/* WQ: current work color */
	int			flush_color;	/* WQ: current flush color */
	atomic_t		nr_pwqs_to_flush; /* flush in progress */
	struct wq_flusher	*first_flusher;	/* WQ: first flusher */
	struct list_head	flusher_queue;	/* WQ: flush waiters */
	struct list_head	flusher_overflow; /* WQ: flush overflow list */

	struct list_head	maydays;	/* MD: pwqs requesting rescue */
	struct worker		*rescuer;	/* I: rescue worker */

	int			nr_drainers;	/* WQ: drain in progress */
	int			saved_max_active; /* WQ: saved pwq max_active */

	struct workqueue_attrs	*unbound_attrs;	/* PW: only for unbound wqs */
	struct pool_workqueue	*dfl_pwq;	/* PW: only for unbound wqs */

#ifdef CONFIG_SYSFS
	struct wq_device	*wq_dev;	/* I: for sysfs interface */
#endif
#ifdef CONFIG_LOCKDEP
	struct lockdep_map	lockdep_map;
#endif
	char			name[WQ_NAME_LEN]; /* I: workqueue name */

	/*
	 * Destruction of workqueue_struct is sched-RCU protected to allow
	 * walking the workqueues list without grabbing wq_pool_mutex.
	 * This is used to dump all workqueues from sysrq.
	 */
	struct rcu_head		rcu;

	/* hot fields used during command issue, aligned to cacheline */
	unsigned int		flags ____cacheline_aligned; /* WQ: WQ_* flags */
	struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
	struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */
};

static struct kmem_cache *pwq_cache;

static cpumask_var_t *wq_numa_possible_cpumask;
					/* possible CPUs of each node */

static bool wq_disable_numa;
module_param_named(disable_numa, wq_disable_numa, bool, 0444);

/* see the comment above the definition of WQ_POWER_EFFICIENT */
static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT);
module_param_named(power_efficient, wq_power_efficient, bool, 0444);

static bool wq_numa_enabled;		/* unbound NUMA affinity enabled */

/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;

static DEFINE_MUTEX(wq_pool_mutex);	/* protects pools and workqueues list */
static DEFINE_SPINLOCK(wq_mayday_lock);	/* protects wq->maydays list */

static LIST_HEAD(workqueues);		/* PR: list of all workqueues */
static bool workqueue_freezing;		/* PL: have wqs started freezing? */

/* PL: allowable cpus for unbound wqs and work items */
static cpumask_var_t wq_unbound_cpumask;

/* CPU where unbound work was last round robin scheduled from this CPU */
static DEFINE_PER_CPU(int, wq_rr_cpu_last);

/*
 * Local execution of unbound work items is no longer guaranteed.  The
 * following always forces round-robin CPU selection on unbound work items
 * to uncover usages which depend on it.
 */
#ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU
static bool wq_debug_force_rr_cpu = true;
#else
static bool wq_debug_force_rr_cpu = false;
#endif
module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644);
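
/*
 * The knob above is exposed as a module parameter; for example (sketch):
 *
 *	workqueue.debug_force_rr_cpu=1		on the kernel command line, or
 *	echo Y > /sys/module/workqueue/parameters/debug_force_rr_cpu
 *
 * forces round-robin CPU selection for unbound work items at runtime.
 */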

/* the per-cpu worker pools */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools);

static DEFINE_IDR(worker_pool_idr);	/* PR: idr of all pools */

/* PL: hash of all unbound pools keyed by pool->attrs */
static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);

/* I: attributes used when instantiating standard unbound pools on demand */
static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];

/* I: attributes used when instantiating ordered pools on demand */
static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];

struct workqueue_struct *system_wq __read_mostly;
EXPORT_SYMBOL(system_wq);
struct workqueue_struct *system_highpri_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_highpri_wq);
struct workqueue_struct *system_long_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_long_wq);
struct workqueue_struct *system_unbound_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_unbound_wq);
struct workqueue_struct *system_freezable_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_freezable_wq);
struct workqueue_struct *system_power_efficient_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_power_efficient_wq);
struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);

static int worker_thread(void *__worker);
static void workqueue_sysfs_unregister(struct workqueue_struct *wq);

#define CREATE_TRACE_POINTS
#include <trace/events/workqueue.h>

#define assert_rcu_or_pool_mutex()					\
	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&			\
			 !lockdep_is_held(&wq_pool_mutex),		\
			 "sched RCU or wq_pool_mutex should be held")

#define assert_rcu_or_wq_mutex(wq)					\
	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&			\
			 !lockdep_is_held(&wq->mutex),			\
			 "sched RCU or wq->mutex should be held")

#define assert_rcu_or_wq_mutex_or_pool_mutex(wq)			\
	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&			\
			 !lockdep_is_held(&wq->mutex) &&		\
			 !lockdep_is_held(&wq_pool_mutex),		\
			 "sched RCU, wq->mutex or wq_pool_mutex should be held")

#define for_each_cpu_worker_pool(pool, cpu)				\
	for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];		\
	     (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
	     (pool)++)

/**
 * for_each_pool - iterate through all worker_pools in the system
 * @pool: iteration cursor
 * @pi: integer used for iteration
 *
 * This must be called either with wq_pool_mutex held or sched RCU read
 * locked.  If the pool needs to be used beyond the locking in effect, the
 * caller is responsible for guaranteeing that the pool stays online.
 *
 * The if/else clause exists only for the lockdep assertion and can be
 * ignored.
 */
#define for_each_pool(pool, pi)						\
	idr_for_each_entry(&worker_pool_idr, pool, pi)			\
		if (({ assert_rcu_or_pool_mutex(); false; })) { }	\
		else
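
/*
 * Usage sketch (hypothetical caller): walk every pool with wq_pool_mutex
 * held, as the CPU hotplug callbacks in this file do:
 *
 *	struct worker_pool *pool;
 *	int pi;
 *
 *	mutex_lock(&wq_pool_mutex);
 *	for_each_pool(pool, pi)
 *		inspect_pool(pool);		(illustrative helper)
 *	mutex_unlock(&wq_pool_mutex);
 */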

/**
 * for_each_pool_worker - iterate through all workers of a worker_pool
 * @worker: iteration cursor
 * @pool: worker_pool to iterate workers of
 *
 * This must be called with @pool->attach_mutex held.
 *
 * The if/else clause exists only for the lockdep assertion and can be
 * ignored.
 */
#define for_each_pool_worker(worker, pool)				\
	list_for_each_entry((worker), &(pool)->workers, node)		\
		if (({ lockdep_assert_held(&pool->attach_mutex); false; })) { } \
		else

/**
 * for_each_pwq - iterate through all pool_workqueues of the specified workqueue
 * @pwq: iteration cursor
 * @wq: the target workqueue
 *
 * This must be called either with wq->mutex held or sched RCU read locked.
 * If the pwq needs to be used beyond the locking in effect, the caller is
 * responsible for guaranteeing that the pwq stays online.
 *
 * The if/else clause exists only for the lockdep assertion and can be
 * ignored.
 */
#define for_each_pwq(pwq, wq)						\
	list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node)		\
		if (({ assert_rcu_or_wq_mutex(wq); false; })) { }	\
		else
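
/*
 * Usage sketch (hypothetical caller): walk the pwqs of a workqueue under
 * the sched-RCU read lock without pinning them beyond the read section:
 *
 *	struct pool_workqueue *pwq;
 *
 *	rcu_read_lock_sched();
 *	for_each_pwq(pwq, wq)
 *		inspect_pwq(pwq);		(illustrative helper)
 *	rcu_read_unlock_sched();
 */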

#ifdef CONFIG_DEBUG_OBJECTS_WORK

static struct debug_obj_descr work_debug_descr;

static void *work_debug_hint(void *addr)
{
	return ((struct work_struct *) addr)->func;
}

/*
 * fixup_init is called when:
 * - an active object is initialized
 */
static int work_fixup_init(void *addr, enum debug_obj_state state)
{
	struct work_struct *work = addr;

	switch (state) {
	case ODEBUG_STATE_ACTIVE:
		cancel_work_sync(work);
		debug_object_init(work, &work_debug_descr);
		return 1;
	default:
		return 0;
	}
}

/*
 * fixup_activate is called when:
 * - an active object is activated
 * - an unknown object is activated (might be a statically initialized object)
 */
static int work_fixup_activate(void *addr, enum debug_obj_state state)
{
	struct work_struct *work = addr;

	switch (state) {

	case ODEBUG_STATE_NOTAVAILABLE:
		/*
		 * This is not really a fixup. The work struct was
		 * statically initialized. We just make sure that it
		 * is tracked in the object tracker.
		 */
		if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) {
			debug_object_init(work, &work_debug_descr);
			debug_object_activate(work, &work_debug_descr);
			return 0;
		}
		WARN_ON_ONCE(1);
		return 0;

	case ODEBUG_STATE_ACTIVE:
		WARN_ON(1);

	default:
		return 0;
	}
}

/*
 * fixup_free is called when:
 * - an active object is freed
 */
static int work_fixup_free(void *addr, enum debug_obj_state state)
{
	struct work_struct *work = addr;

	switch (state) {
	case ODEBUG_STATE_ACTIVE:
		cancel_work_sync(work);
		debug_object_free(work, &work_debug_descr);
		return 1;
	default:
		return 0;
	}
}

static struct debug_obj_descr work_debug_descr = {
	.name		= "work_struct",
	.debug_hint	= work_debug_hint,
	.fixup_init	= work_fixup_init,
	.fixup_activate	= work_fixup_activate,
	.fixup_free	= work_fixup_free,
};

static inline void debug_work_activate(struct work_struct *work)
{
	debug_object_activate(work, &work_debug_descr);
}

static inline void debug_work_deactivate(struct work_struct *work)
{
	debug_object_deactivate(work, &work_debug_descr);
}

void __init_work(struct work_struct *work, int onstack)
{
	if (onstack)
		debug_object_init_on_stack(work, &work_debug_descr);
	else
		debug_object_init(work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(__init_work);

void destroy_work_on_stack(struct work_struct *work)
{
	debug_object_free(work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_work_on_stack);

void destroy_delayed_work_on_stack(struct delayed_work *work)
{
	destroy_timer_on_stack(&work->timer);
	debug_object_free(&work->work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack);

#else
static inline void debug_work_activate(struct work_struct *work) { }
static inline void debug_work_deactivate(struct work_struct *work) { }
#endif
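
/*
 * Sketch of the on-stack pattern that __init_work() / destroy_work_on_stack()
 * support (identifiers are illustrative):
 *
 *	struct work_struct work;
 *
 *	INIT_WORK_ONSTACK(&work, example_fn);
 *	schedule_work(&work);
 *	flush_work(&work);
 *	destroy_work_on_stack(&work);
 */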

/**
 * worker_pool_assign_id - allocate ID and assign it to @pool
 * @pool: the pool pointer of interest
 *
 * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned
 * successfully, -errno on failure.
 */
static int worker_pool_assign_id(struct worker_pool *pool)
{
	int ret;

	lockdep_assert_held(&wq_pool_mutex);

	ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE,
			GFP_KERNEL);
	if (ret >= 0) {
		pool->id = ret;
		return 0;
	}
	return ret;
}

/**
 * unbound_pwq_by_node - return the unbound pool_workqueue for the given node
 * @wq: the target workqueue
 * @node: the node ID
 *
 * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU
 * read locked.
 * If the pwq needs to be used beyond the locking in effect, the caller is
 * responsible for guaranteeing that the pwq stays online.
 *
 * Return: The unbound pool_workqueue for @node.
 */
static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq,
						  int node)
{
	assert_rcu_or_wq_mutex_or_pool_mutex(wq);

	/*
	 * XXX: @node can be NUMA_NO_NODE if CPU goes offline while a
	 * delayed item is pending.  The plan is to keep CPU -> NODE
	 * mapping valid and stable across CPU on/offlines.  Once that
	 * happens, this workaround can be removed.
	 */
	if (unlikely(node == NUMA_NO_NODE))
		return wq->dfl_pwq;

	return rcu_dereference_raw(wq->numa_pwq_tbl[node]);
}

static unsigned int work_color_to_flags(int color)
{
	return color << WORK_STRUCT_COLOR_SHIFT;
}

static int get_work_color(struct work_struct *work)
{
	return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) &
		((1 << WORK_STRUCT_COLOR_BITS) - 1);
}

static int work_next_color(int color)
{
	return (color + 1) % WORK_NR_COLORS;
}

/*
 * While queued, %WORK_STRUCT_PWQ is set and non-flag bits of a work's data
 * contain the pointer to the queued pwq.  Once execution starts, the flag
 * is cleared and the high bits contain OFFQ flags and pool ID.
 *
 * set_work_pwq(), set_work_pool_and_clear_pending(), mark_work_canceling()
 * and clear_work_data() can be used to set the pwq, pool or clear
 * work->data.  These functions should only be called while the work is
 * owned - i.e. while the PENDING bit is set.
 *
 * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq
 * corresponding to a work.  Pool is available once the work has been
 * queued anywhere after initialization until it is sync canceled.  pwq is
 * available only while the work item is queued.
 *
 * %WORK_OFFQ_CANCELING is used to mark a work item which is being
 * canceled.  While being canceled, a work item may have its PENDING set
 * but stay off timer and worklist for arbitrarily long and nobody should
 * try to steal the PENDING bit.
 */
static inline void set_work_data(struct work_struct *work, unsigned long data,
				 unsigned long flags)
{
	WARN_ON_ONCE(!work_pending(work));
	atomic_long_set(&work->data, data | flags | work_static(work));
}

static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq,
			 unsigned long extra_flags)
{
	set_work_data(work, (unsigned long)pwq,
		      WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | extra_flags);
}

static void set_work_pool_and_keep_pending(struct work_struct *work,
					   int pool_id)
{
	set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT,
		      WORK_STRUCT_PENDING);
}

static void set_work_pool_and_clear_pending(struct work_struct *work,
					    int pool_id)
{
	/*
	 * The following wmb is paired with the implied mb in
	 * test_and_set_bit(PENDING) and ensures all updates to @work made
	 * here are visible to and precede any updates by the next PENDING
	 * owner.
	 */
	smp_wmb();
	set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
}

static void clear_work_data(struct work_struct *work)
{
	smp_wmb();	/* see set_work_pool_and_clear_pending() */
	set_work_data(work, WORK_STRUCT_NO_POOL, 0);
}

static struct pool_workqueue *get_work_pwq(struct work_struct *work)
{
	unsigned long data = atomic_long_read(&work->data);

	if (data & WORK_STRUCT_PWQ)
		return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
	else
		return NULL;
}

/**
 * get_work_pool - return the worker_pool a given work was associated with
 * @work: the work item of interest
 *
 * Pools are created and destroyed under wq_pool_mutex, and read access
 * is allowed under the sched-RCU read lock.  As such, this function should
 * be called under wq_pool_mutex or with preemption disabled.
 *
 * All fields of the returned pool are accessible as long as the above
 * mentioned locking is in effect.  If the returned pool needs to be used
 * beyond the critical section, the caller is responsible for ensuring the
 * returned pool is and stays online.
 *
 * Return: The worker_pool @work was last associated with.  %NULL if none.
 */
static struct worker_pool *get_work_pool(struct work_struct *work)
{
	unsigned long data = atomic_long_read(&work->data);
	int pool_id;

	assert_rcu_or_pool_mutex();

	if (data & WORK_STRUCT_PWQ)
		return ((struct pool_workqueue *)
			(data & WORK_STRUCT_WQ_DATA_MASK))->pool;

	pool_id = data >> WORK_OFFQ_POOL_SHIFT;
	if (pool_id == WORK_OFFQ_POOL_NONE)
		return NULL;

	return idr_find(&worker_pool_idr, pool_id);
}
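
/*
 * Caller-side sketch (hypothetical): resolve and lock the pool of a work
 * item with interrupts (and thus preemption) disabled so the pool cannot
 * be destroyed underneath us:
 *
 *	local_irq_save(flags);
 *	pool = get_work_pool(work);
 *	if (pool) {
 *		spin_lock(&pool->lock);
 *		...
 *		spin_unlock(&pool->lock);
 *	}
 *	local_irq_restore(flags);
 */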

/**
 * get_work_pool_id - return the worker pool ID a given work is associated with
 * @work: the work item of interest
 *
 * Return: The worker_pool ID @work was last associated with.
 * %WORK_OFFQ_POOL_NONE if none.
 */
static int get_work_pool_id(struct work_struct *work)
{
	unsigned long data = atomic_long_read(&work->data);

	if (data & WORK_STRUCT_PWQ)
		return ((struct pool_workqueue *)
			(data & WORK_STRUCT_WQ_DATA_MASK))->pool->id;

	return data >> WORK_OFFQ_POOL_SHIFT;
}

static void mark_work_canceling(struct work_struct *work)
{
	unsigned long pool_id = get_work_pool_id(work);

	pool_id <<= WORK_OFFQ_POOL_SHIFT;
	set_work_data(work, pool_id | WORK_OFFQ_CANCELING, WORK_STRUCT_PENDING);
}

static bool work_is_canceling(struct work_struct *work)
{
	unsigned long data = atomic_long_read(&work->data);

	return !(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_CANCELING);
}

/*
 * Policy functions.  These define the policies on how the global worker
 * pools are managed.  Unless noted otherwise, these functions assume that
 * they're being called with pool->lock held.
 */

static bool __need_more_worker(struct worker_pool *pool)
{
	return !atomic_read(&pool->nr_running);
}

/*
 * Need to wake up a worker?  Called from anything but currently
 * running workers.
 *
 * Note that, because unbound workers never contribute to nr_running, this
 * function will always return %true for unbound pools as long as the
 * worklist isn't empty.
 */
static bool need_more_worker(struct worker_pool *pool)
{
	return !list_empty(&pool->worklist) && __need_more_worker(pool);
}

/* Can I start working?  Called from busy but !running workers. */
static bool may_start_working(struct worker_pool *pool)
{
	return pool->nr_idle;
}

/* Do I need to keep working?  Called from currently running workers. */
static bool keep_working(struct worker_pool *pool)
{
	return !list_empty(&pool->worklist) &&
		atomic_read(&pool->nr_running) <= 1;
}

/* Do we need a new worker?  Called from manager. */
static bool need_to_create_worker(struct worker_pool *pool)
{
	return need_more_worker(pool) && !may_start_working(pool);
}

/* Do we have too many workers and should some go away? */
static bool too_many_workers(struct worker_pool *pool)
{
	bool managing = mutex_is_locked(&pool->manager_arb);
	int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
	int nr_busy = pool->nr_workers - nr_idle;

	return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
}
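
/*
 * Worked example for the check above (sketch): with MAX_IDLE_WORKERS_RATIO
 * of 4, a pool with 12 busy and 5 idle workers gives
 * (5 - 2) * 4 = 12 >= 12, so the idle timer may start trimming idle
 * workers; with 4 idle workers, (4 - 2) * 4 = 8 < 12 and all are kept.
 */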

/*
 * Wake up functions.
 */

/* Return the first idle worker.  Safe with preemption disabled */
static struct worker *first_idle_worker(struct worker_pool *pool)
{
	if (unlikely(list_empty(&pool->idle_list)))
		return NULL;

	return list_first_entry(&pool->idle_list, struct worker, entry);
}

/**
 * wake_up_worker - wake up an idle worker
 * @pool: worker pool to wake worker from
 *
 * Wake up the first idle worker of @pool.
 *
 * CONTEXT:
 * spin_lock_irq(pool->lock).
 */
static void wake_up_worker(struct worker_pool *pool)
{
	struct worker *worker = first_idle_worker(pool);

	if (likely(worker))
		wake_up_process(worker->task);
}

/**
 * wq_worker_waking_up - a worker is waking up
 * @task: task waking up
 * @cpu: CPU @task is waking up to
 *
 * This function is called during try_to_wake_up() when a worker is
 * being awoken.
 *
 * CONTEXT:
 * spin_lock_irq(rq->lock)
 */
void wq_worker_waking_up(struct task_struct *task, int cpu)
{
	struct worker *worker = kthread_data(task);

	if (!(worker->flags & WORKER_NOT_RUNNING)) {
		WARN_ON_ONCE(worker->pool->cpu != cpu);
		atomic_inc(&worker->pool->nr_running);
	}
}

/**
 * wq_worker_sleeping - a worker is going to sleep
 * @task: task going to sleep
 *
 * This function is called during schedule() when a busy worker is
 * going to sleep.  A worker on the same cpu can be woken up by
 * returning a pointer to its task.
 *
 * CONTEXT:
 * spin_lock_irq(rq->lock)
 *
 * Return:
 * Worker task on the same cpu to wake up, %NULL if none.
 */
struct task_struct *wq_worker_sleeping(struct task_struct *task)
{
	struct worker *worker = kthread_data(task), *to_wakeup = NULL;
	struct worker_pool *pool;

	/*
	 * Rescuers, which may not have all the fields set up like normal
	 * workers, also reach here; let's not access anything before
	 * checking NOT_RUNNING.
	 */
	if (worker->flags & WORKER_NOT_RUNNING)
		return NULL;

	pool = worker->pool;

	/* this can only happen on the local cpu */
	if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id()))
		return NULL;

	/*
	 * The counterpart of the following dec_and_test, implied mb,
	 * worklist not empty test sequence is in insert_work().
	 * Please read comment there.
	 *
	 * NOT_RUNNING is clear.  This means that we're bound to and
	 * running on the local cpu w/ rq lock held and preemption
	 * disabled, which in turn means that no one else could be
	 * manipulating idle_list, so dereferencing idle_list without pool
	 * lock is safe.
	 */
	if (atomic_dec_and_test(&pool->nr_running) &&
	    !list_empty(&pool->worklist))
		to_wakeup = first_idle_worker(pool);

	return to_wakeup ? to_wakeup->task : NULL;
}

/**
 * worker_set_flags - set worker flags and adjust nr_running accordingly
 * @worker: self
 * @flags: flags to set
 *
 * Set @flags in @worker->flags and adjust nr_running accordingly.
 *
 * CONTEXT:
 * spin_lock_irq(pool->lock)
 */
static inline void worker_set_flags(struct worker *worker, unsigned int flags)
{
	struct worker_pool *pool = worker->pool;

	WARN_ON_ONCE(worker->task != current);

	/* If transitioning into NOT_RUNNING, adjust nr_running. */
	if ((flags & WORKER_NOT_RUNNING) &&
	    !(worker->flags & WORKER_NOT_RUNNING)) {
		atomic_dec(&pool->nr_running);
	}

	worker->flags |= flags;
}

/**
 * worker_clr_flags - clear worker flags and adjust nr_running accordingly
 * @worker: self
 * @flags: flags to clear
 *
 * Clear @flags in @worker->flags and adjust nr_running accordingly.
 *
 * CONTEXT:
 * spin_lock_irq(pool->lock)
 */
static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
{
	struct worker_pool *pool = worker->pool;
	unsigned int oflags = worker->flags;

	WARN_ON_ONCE(worker->task != current);

	worker->flags &= ~flags;

	/*
	 * If transitioning out of NOT_RUNNING, increment nr_running.  Note
	 * that the nested NOT_RUNNING is not a noop.  NOT_RUNNING is a mask
	 * of multiple flags, not a single flag.
	 */
	if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
		if (!(worker->flags & WORKER_NOT_RUNNING))
			atomic_inc(&pool->nr_running);
}

/**
 * find_worker_executing_work - find worker which is executing a work
 * @pool: pool of interest
 * @work: work to find worker for
 *
 * Find a worker which is executing @work on @pool by searching
 * @pool->busy_hash which is keyed by the address of @work.  For a worker
 * to match, its current execution should match the address of @work and
 * its work function.  This is to avoid unwanted dependency between
 * unrelated work executions through a work item being recycled while still
 * being executed.
 *
 * This is a bit tricky.  A work item may be freed once its execution
 * starts and nothing prevents the freed area from being recycled for
 * another work item.  If the same work item address ends up being reused
 * before the original execution finishes, workqueue will identify the
 * recycled work item as currently executing and make it wait until the
 * current execution finishes, introducing an unwanted dependency.
 *
 * This function checks the work item address and work function to avoid
 * false positives.  Note that this isn't complete as one may construct a
 * work function which can introduce dependency onto itself through a
 * recycled work item.  Well, if somebody wants to shoot themselves in the
 * foot that badly, there's only so much we can do, and if such deadlock
 * actually occurs, it should be easy to locate the culprit work function.