/*
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 1994,      Karl Keyte: Added support for disk statistics
 * Elevator latency, (C) 2000  Andrea Arcangeli <andrea@suse.de> SuSE
 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au>
 *	-  July 2000
 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001
 */

/*
 * This handles all read/write requests to block devices
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/kernel_stat.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/completion.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/fault-inject.h>
#include <linux/list_sort.h>

#define CREATE_TRACE_POINTS
#include <trace/events/block.h>

#include "blk.h"

EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);

static int __make_request(struct request_queue *q, struct bio *bio);

/*
 * For the allocated request tables
 */
static struct kmem_cache *request_cachep;

/*
 * For queue allocation
 */
struct kmem_cache *blk_requestq_cachep;

/*
 * Controlling structure to kblockd
 */
static struct workqueue_struct *kblockd_workqueue;

static void drive_stat_acct(struct request *rq, int new_io)
{
	struct hd_struct *part;
	int rw = rq_data_dir(rq);
	int cpu;

	if (!blk_do_io_stat(rq))
		return;

	cpu = part_stat_lock();

	if (!new_io) {
		part = rq->part;
		part_stat_inc(cpu, part, merges[rw]);
	} else {
		part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
		if (!hd_struct_try_get(part)) {
			/*
			 * The partition is already being removed,
			 * the request will be accounted on the disk only
			 *
			 * We take a reference on disk->part0 although that
			 * partition will never be deleted, so we can treat
			 * it as any other partition.
			 */
			part = &rq->rq_disk->part0;
			hd_struct_get(part);
		}
		part_round_stats(cpu, part);
		part_inc_in_flight(part, rw);
		rq->part = part;
	}

	part_stat_unlock();
}

void blk_queue_congestion_threshold(struct request_queue *q)
{
	int nr;

	nr = q->nr_requests - (q->nr_requests / 8) + 1;
	if (nr > q->nr_requests)
		nr = q->nr_requests;
	q->nr_congestion_on = nr;

	nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1;
	if (nr < 1)
		nr = 1;
	q->nr_congestion_off = nr;
}

/**
 * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
 * @bdev:	device
 *
 * Locates the passed device's request queue and returns the address of its
 * backing_dev_info
 *
 * Will return NULL if the request queue cannot be located.
 */
struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
{
	struct backing_dev_info *ret = NULL;
	struct request_queue *q = bdev_get_queue(bdev);

	if (q)
		ret = &q->backing_dev_info;
	return ret;
}
EXPORT_SYMBOL(blk_get_backing_dev_info);

void blk_rq_init(struct request_queue *q, struct request *rq)
{
	memset(rq, 0, sizeof(*rq));

	INIT_LIST_HEAD(&rq->queuelist);
	INIT_LIST_HEAD(&rq->timeout_list);
	rq->cpu = -1;
	rq->q = q;
	rq->__sector = (sector_t) -1;
	INIT_HLIST_NODE(&rq->hash);
	RB_CLEAR_NODE(&rq->rb_node);
	rq->cmd = rq->__cmd;
	rq->cmd_len = BLK_MAX_CDB;
	rq->tag = -1;
	rq->ref_count = 1;
	rq->start_time = jiffies;
	set_start_time_ns(rq);
	rq->part = NULL;
}
EXPORT_SYMBOL(blk_rq_init);

static void req_bio_endio(struct request *rq, struct bio *bio,
			  unsigned int nbytes, int error)
{
	if (error)
		clear_bit(BIO_UPTODATE, &bio->bi_flags);
	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
		error = -EIO;

	if (unlikely(nbytes > bio->bi_size)) {
		printk(KERN_ERR "%s: want %u bytes done, %u left\n",
		       __func__, nbytes, bio->bi_size);
		nbytes = bio->bi_size;
	}

	if (unlikely(rq->cmd_flags & REQ_QUIET))
		set_bit(BIO_QUIET, &bio->bi_flags);

	bio->bi_size -= nbytes;
	bio->bi_sector += (nbytes >> 9);

	if (bio_integrity(bio))
		bio_integrity_advance(bio, nbytes);

	/* don't actually finish bio if it's part of flush sequence */
	if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
		bio_endio(bio, error);
}

void blk_dump_rq_flags(struct request *rq, char *msg)
{
	int bit;

	printk(KERN_INFO "%s: dev %s: type=%x, flags=%x\n", msg,
		rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type,
		rq->cmd_flags);

	printk(KERN_INFO "  sector %llu, nr/cnr %u/%u\n",
	       (unsigned long long)blk_rq_pos(rq),
	       blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
	printk(KERN_INFO "  bio %p, biotail %p, buffer %p, len %u\n",
	       rq->bio, rq->biotail, rq->buffer, blk_rq_bytes(rq));

	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
		printk(KERN_INFO "  cdb: ");
		for (bit = 0; bit < BLK_MAX_CDB; bit++)
			printk("%02x ", rq->cmd[bit]);
		printk("\n");
	}
}
EXPORT_SYMBOL(blk_dump_rq_flags);

static void blk_delay_work(struct work_struct *work)
{
	struct request_queue *q;

	q = container_of(work, struct request_queue, delay_work.work);
	spin_lock_irq(q->queue_lock);
	__blk_run_queue(q);
	spin_unlock_irq(q->queue_lock);
}

/**
 * blk_delay_queue - restart queueing after defined interval
 * @q:		The &struct request_queue in question
 * @msecs:	Delay in msecs
 *
 * Description:
 *   Sometimes queueing needs to be postponed for a little while, to allow
 *   resources to come back. This function will make sure that queueing is
 *   restarted around the specified time.
 */
void blk_delay_queue(struct request_queue *q, unsigned long msecs)
{
	queue_delayed_work(kblockd_workqueue, &q->delay_work,
				msecs_to_jiffies(msecs));
}
EXPORT_SYMBOL(blk_delay_queue);

/**
 * blk_start_queue - restart a previously stopped queue
 * @q:    The &struct request_queue in question
 *
 * Description:
 *   blk_start_queue() will clear the stop flag on the queue, and call
 *   the request_fn for the queue if it was in a stopped state when
 *   entered. Also see blk_stop_queue(). Queue lock must be held.
 **/
void blk_start_queue(struct request_queue *q)
{
	WARN_ON(!irqs_disabled());

	queue_flag_clear(QUEUE_FLAG_STOPPED, q);
	__blk_run_queue(q);
}
EXPORT_SYMBOL(blk_start_queue);

/**
 * blk_stop_queue - stop a queue
 * @q:    The &struct request_queue in question
 *
 * Description:
 *   The Linux block layer assumes that a block driver will consume all
 *   entries on the request queue when the request_fn strategy is called.
 *   Often this will not happen, because of hardware limitations (queue
 *   depth settings). If a device driver gets a 'queue full' response,
 *   or if it simply chooses not to queue more I/O at one point, it can
 *   call this function to prevent the request_fn from being called until
 *   the driver has signalled it's ready to go again. This happens by calling
 *   blk_start_queue() to restart queue operations. Queue lock must be held.
 **/
void blk_stop_queue(struct request_queue *q)
{
	__cancel_delayed_work(&q->delay_work);
	queue_flag_set(QUEUE_FLAG_STOPPED, q);
}
EXPORT_SYMBOL(blk_stop_queue);
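
/*
 * A minimal illustration of the stop/start protocol described above, for a
 * hypothetical driver; example_hw_full(), example_hw_submit() and
 * example_hw_done() are placeholders, not real kernel interfaces.  The
 * request_fn is invoked with q->queue_lock held, and the completion
 * interrupt takes it explicitly, as blk_start_queue()/blk_stop_queue()
 * require:
 *
 *	static void example_request_fn(struct request_queue *q)
 *	{
 *		struct request *rq;
 *
 *		while ((rq = blk_peek_request(q)) != NULL) {
 *			if (example_hw_full()) {
 *				blk_stop_queue(q);
 *				return;
 *			}
 *			blk_start_request(rq);
 *			example_hw_submit(rq);
 *		}
 *	}
 *
 *	static irqreturn_t example_hw_done(int irq, void *data)
 *	{
 *		struct request_queue *q = data;
 *		unsigned long flags;
 *
 *		spin_lock_irqsave(q->queue_lock, flags);
 *		blk_start_queue(q);
 *		spin_unlock_irqrestore(q->queue_lock, flags);
 *		return IRQ_HANDLED;
 *	}
 */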

/**
 * blk_sync_queue - cancel any pending callbacks on a queue
 * @q: the queue
 *
 * Description:
 *     The block layer may perform asynchronous callback activity
 *     on a queue, such as calling the unplug function after a timeout.
 *     A block device may call blk_sync_queue to ensure that any
 *     such activity is cancelled, thus allowing it to release resources
 *     that the callbacks might use. The caller must already have made sure
 *     that its ->make_request_fn will not re-add plugging prior to calling
 *     this function.
 *
 *     This function does not cancel any asynchronous activity arising
 *     out of elevator or throttling code. That would require elevator_exit()
 *     and blk_throtl_exit() to be called with queue lock initialized.
 *
 */
void blk_sync_queue(struct request_queue *q)
{
	del_timer_sync(&q->timeout);
	cancel_delayed_work_sync(&q->delay_work);
}
EXPORT_SYMBOL(blk_sync_queue);

/**
 * __blk_run_queue - run a single device queue
 * @q:	The queue to run
 *
 * Description:
 *    See @blk_run_queue. This variant must be called with the queue lock
 *    held and interrupts disabled.
 */
void __blk_run_queue(struct request_queue *q)
{
	if (unlikely(blk_queue_stopped(q)))
		return;

	q->request_fn(q);
}
EXPORT_SYMBOL(__blk_run_queue);

/**
 * blk_run_queue_async - run a single device queue in workqueue context
 * @q:	The queue to run
 *
 * Description:
 *    Tells kblockd to perform the equivalent of @blk_run_queue on behalf
 *    of us.
 */
void blk_run_queue_async(struct request_queue *q)
{
	if (likely(!blk_queue_stopped(q))) {
		__cancel_delayed_work(&q->delay_work);
		queue_delayed_work(kblockd_workqueue, &q->delay_work, 0);
	}
}
EXPORT_SYMBOL(blk_run_queue_async);

/**
 * blk_run_queue - run a single device queue
 * @q: The queue to run
 *
 * Description:
 *    Invoke request handling on this queue, if it has pending work to do.
 *    May be used to restart queueing when a request has completed.
 */
void blk_run_queue(struct request_queue *q)
{
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	__blk_run_queue(q);
	spin_unlock_irqrestore(q->queue_lock, flags);
}
EXPORT_SYMBOL(blk_run_queue);

void blk_put_queue(struct request_queue *q)
{
	kobject_put(&q->kobj);
}
EXPORT_SYMBOL(blk_put_queue);

/*
 * Note: If a driver supplied the queue lock, it should not zap that lock
 * unexpectedly as some queue cleanup components like elevator_exit() and
 * blk_throtl_exit() need queue lock.
 */
void blk_cleanup_queue(struct request_queue *q)
{
	/*
	 * We know we have process context here, so we can be a little
	 * cautious and ensure that pending block actions on this device
	 * are done before moving on. Going into this function, we should
	 * not have processes doing IO to this device.
	 */
	blk_sync_queue(q);

	del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
	mutex_lock(&q->sysfs_lock);
	queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
	mutex_unlock(&q->sysfs_lock);

	if (q->elevator)
		elevator_exit(q->elevator);

	blk_throtl_exit(q);

	blk_put_queue(q);
}
EXPORT_SYMBOL(blk_cleanup_queue);

static int blk_init_free_list(struct request_queue *q)
{
	struct request_list *rl = &q->rq;

	if (unlikely(rl->rq_pool))
		return 0;

	rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0;
	rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0;
	rl->elvpriv = 0;
	init_waitqueue_head(&rl->wait[BLK_RW_SYNC]);
	init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]);

	rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
				mempool_free_slab, request_cachep, q->node);

	if (!rl->rq_pool)
		return -ENOMEM;

	return 0;
}

struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
{
	return blk_alloc_queue_node(gfp_mask, -1);
}
EXPORT_SYMBOL(blk_alloc_queue);

struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
{
	struct request_queue *q;
	int err;

	q = kmem_cache_alloc_node(blk_requestq_cachep,
				gfp_mask | __GFP_ZERO, node_id);
	if (!q)
		return NULL;

	q->backing_dev_info.ra_pages =
			(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
	q->backing_dev_info.state = 0;
	q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
	q->backing_dev_info.name = "block";

	err = bdi_init(&q->backing_dev_info);
	if (err) {
		kmem_cache_free(blk_requestq_cachep, q);
		return NULL;
	}

	if (blk_throtl_init(q)) {
		kmem_cache_free(blk_requestq_cachep, q);
		return NULL;
	}

	setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
		    laptop_mode_timer_fn, (unsigned long) q);
	setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
	INIT_LIST_HEAD(&q->timeout_list);
	INIT_LIST_HEAD(&q->flush_queue[0]);
	INIT_LIST_HEAD(&q->flush_queue[1]);
	INIT_LIST_HEAD(&q->flush_data_in_flight);
	INIT_DELAYED_WORK(&q->delay_work, blk_delay_work);

	kobject_init(&q->kobj, &blk_queue_ktype);

	mutex_init(&q->sysfs_lock);
	spin_lock_init(&q->__queue_lock);

	/*
	 * By default initialize queue_lock to internal lock and driver can
	 * override it later if need be.
	 */
	q->queue_lock = &q->__queue_lock;

	return q;
}
EXPORT_SYMBOL(blk_alloc_queue_node);

/**
 * blk_init_queue  - prepare a request queue for use with a block device
 * @rfn:  The function to be called to process requests that have been
 *        placed on the queue.
 * @lock: Request queue spin lock
 *
 * Description:
 *    If a block device wishes to use the standard request handling procedures,
 *    which sorts requests and coalesces adjacent requests, then it must
 *    call blk_init_queue().  The function @rfn will be called when there
 *    are requests on the queue that need to be processed.  If the device
 *    supports plugging, then @rfn may not be called immediately when requests
 *    are available on the queue, but may be called at some time later instead.
 *    Plugged queues are generally unplugged when a buffer belonging to one
 *    of the requests on the queue is needed, or due to memory pressure.
 *
 *    @rfn is not required, or even expected, to remove all requests off the
 *    queue, but only as many as it can handle at a time.  If it does leave
 *    requests on the queue, it is responsible for arranging that the requests
 *    get dealt with eventually.
 *
 *    The queue spin lock must be held while manipulating the requests on the
 *    request queue; this lock will be taken also from interrupt context, so irq
 *    disabling is needed for it.
 *
 *    Function returns a pointer to the initialized request queue, or %NULL if
 *    it didn't succeed.
 *
 * Note:
 *    blk_init_queue() must be paired with a blk_cleanup_queue() call
 *    when the block device is deactivated (such as at module unload).
 **/
struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
{
	return blk_init_queue_node(rfn, lock, -1);
}
EXPORT_SYMBOL(blk_init_queue);
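
/*
 * Illustrative sketch of the pairing described above, for a hypothetical
 * driver; example_transfer() is a placeholder that would move the data and
 * return 0 or a negative errno.  The request_fn is invoked with the queue
 * lock held, so blk_fetch_request() and __blk_end_request_all() may be
 * called directly:
 *
 *	static DEFINE_SPINLOCK(example_lock);
 *
 *	static void example_request_fn(struct request_queue *q)
 *	{
 *		struct request *rq;
 *
 *		while ((rq = blk_fetch_request(q)) != NULL) {
 *			if (rq->cmd_type != REQ_TYPE_FS) {
 *				__blk_end_request_all(rq, -EIO);
 *				continue;
 *			}
 *			__blk_end_request_all(rq, example_transfer(rq));
 *		}
 *	}
 *
 *	static int example_setup(struct gendisk *disk)
 *	{
 *		struct request_queue *q;
 *
 *		q = blk_init_queue(example_request_fn, &example_lock);
 *		if (!q)
 *			return -ENOMEM;
 *		disk->queue = q;
 *		return 0;
 *	}
 *
 *	static void example_teardown(struct gendisk *disk)
 *	{
 *		blk_cleanup_queue(disk->queue);
 *	}
 */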

struct request_queue *
blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
{
	struct request_queue *uninit_q, *q;

	uninit_q = blk_alloc_queue_node(GFP_KERNEL, node_id);
	if (!uninit_q)
		return NULL;

	q = blk_init_allocated_queue_node(uninit_q, rfn, lock, node_id);
	if (!q)
		blk_cleanup_queue(uninit_q);

	return q;
}
EXPORT_SYMBOL(blk_init_queue_node);

struct request_queue *
blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
			 spinlock_t *lock)
{
	return blk_init_allocated_queue_node(q, rfn, lock, -1);
}
EXPORT_SYMBOL(blk_init_allocated_queue);

struct request_queue *
blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
			      spinlock_t *lock, int node_id)
{
	if (!q)
		return NULL;

	q->node = node_id;
	if (blk_init_free_list(q))
		return NULL;

	q->request_fn		= rfn;
	q->prep_rq_fn		= NULL;
	q->unprep_rq_fn		= NULL;
	q->queue_flags		= QUEUE_FLAG_DEFAULT;

	/* Override internal queue lock with supplied lock pointer */
	if (lock)
		q->queue_lock		= lock;

	/*
	 * This also sets hw/phys segments, boundary and size
	 */
	blk_queue_make_request(q, __make_request);

	q->sg_reserved_size = INT_MAX;

	/*
	 * all done
	 */
	if (!elevator_init(q, NULL)) {
		blk_queue_congestion_threshold(q);
		return q;
	}

	return NULL;
}
EXPORT_SYMBOL(blk_init_allocated_queue_node);

int blk_get_queue(struct request_queue *q)
{
	if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
		kobject_get(&q->kobj);
		return 0;
	}

	return 1;
}
EXPORT_SYMBOL(blk_get_queue);

static inline void blk_free_request(struct request_queue *q, struct request *rq)
{
	if (rq->cmd_flags & REQ_ELVPRIV)
		elv_put_request(q, rq);

	mempool_free(rq, q->rq.rq_pool);
}

static struct request *
blk_alloc_request(struct request_queue *q, int flags, int priv, gfp_t gfp_mask)
{
	struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);

	if (!rq)
		return NULL;

	blk_rq_init(q, rq);

	rq->cmd_flags = flags | REQ_ALLOCED;

	if (priv) {
		if (unlikely(elv_set_request(q, rq, gfp_mask))) {
			mempool_free(rq, q->rq.rq_pool);
			return NULL;
		}
		rq->cmd_flags |= REQ_ELVPRIV;
	}

	return rq;
}

/*
 * ioc_batching returns true if the ioc is a valid batching request and
 * should be given priority access to a request.
 */
static inline int ioc_batching(struct request_queue *q, struct io_context *ioc)
{
	if (!ioc)
		return 0;

	/*
	 * Make sure the process is able to allocate at least 1 request
	 * even if the batch times out, otherwise we could theoretically
	 * lose wakeups.
	 */
	return ioc->nr_batch_requests == q->nr_batching ||
		(ioc->nr_batch_requests > 0
		&& time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME));
}

/*
 * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This
 * will cause the process to be a "batcher" on all queues in the system. This
 * is the behaviour we want though - once it gets a wakeup it should be given
 * a nice run.
 */
static void ioc_set_batching(struct request_queue *q, struct io_context *ioc)
{
	if (!ioc || ioc_batching(q, ioc))
		return;

	ioc->nr_batch_requests = q->nr_batching;
	ioc->last_waited = jiffies;
}

static void __freed_request(struct request_queue *q, int sync)
{
	struct request_list *rl = &q->rq;

	if (rl->count[sync] < queue_congestion_off_threshold(q))
		blk_clear_queue_congested(q, sync);

	if (rl->count[sync] + 1 <= q->nr_requests) {
		if (waitqueue_active(&rl->wait[sync]))
			wake_up(&rl->wait[sync]);

		blk_clear_queue_full(q, sync);
	}
}

/*
 * A request has just been released.  Account for it, update the full and
 * congestion status, wake up any waiters.   Called under q->queue_lock.
 */
static void freed_request(struct request_queue *q, int sync, int priv)
{
	struct request_list *rl = &q->rq;

	rl->count[sync]--;
	if (priv)
		rl->elvpriv--;

	__freed_request(q, sync);

	if (unlikely(rl->starved[sync ^ 1]))
		__freed_request(q, sync ^ 1);
}

/*
 * Determine if elevator data should be initialized when allocating the
 * request associated with @bio.
 */
static bool blk_rq_should_init_elevator(struct bio *bio)
{
	if (!bio)
		return true;

	/*
	 * Flush requests do not use the elevator so skip initialization.
	 * This allows a request to share the flush and elevator data.
	 */
	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA))
		return false;

	return true;
}

/*
 * Get a free request, queue_lock must be held.
 * Returns NULL on failure, with queue_lock held.
 * Returns !NULL on success, with queue_lock *not held*.
 */
static struct request *get_request(struct request_queue *q, int rw_flags,
				   struct bio *bio, gfp_t gfp_mask)
{
	struct request *rq = NULL;
	struct request_list *rl = &q->rq;
	struct io_context *ioc = NULL;
	const bool is_sync = rw_is_sync(rw_flags) != 0;
	int may_queue, priv = 0;

	may_queue = elv_may_queue(q, rw_flags);
	if (may_queue == ELV_MQUEUE_NO)
		goto rq_starved;

	if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) {
		if (rl->count[is_sync]+1 >= q->nr_requests) {
			ioc = current_io_context(GFP_ATOMIC, q->node);
			/*
			 * The queue will fill after this allocation, so set
			 * it as full, and mark this process as "batching".
			 * This process will be allowed to complete a batch of
			 * requests, others will be blocked.
			 */
			if (!blk_queue_full(q, is_sync)) {
				ioc_set_batching(q, ioc);
				blk_set_queue_full(q, is_sync);
			} else {
				if (may_queue != ELV_MQUEUE_MUST
						&& !ioc_batching(q, ioc)) {
					/*
					 * The queue is full and the allocating
					 * process is not a "batcher", and not
					 * exempted by the IO scheduler
					 */
					goto out;
				}
			}
		}
		blk_set_queue_congested(q, is_sync);
	}

	/*
	 * Only allow batching queuers to allocate up to 50% over the defined
	 * limit of requests, otherwise we could have thousands of requests
	 * allocated with any setting of ->nr_requests
	 */
	if (rl->count[is_sync] >= (3 * q->nr_requests / 2))
		goto out;

	rl->count[is_sync]++;
	rl->starved[is_sync] = 0;

	if (blk_rq_should_init_elevator(bio)) {
		priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
		if (priv)
			rl->elvpriv++;
	}

	if (blk_queue_io_stat(q))
		rw_flags |= REQ_IO_STAT;
	spin_unlock_irq(q->queue_lock);

	rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);
	if (unlikely(!rq)) {
		/*
		 * Allocation failed presumably due to memory. Undo anything
		 * we might have messed up.
		 *
		 * Allocating task should really be put onto the front of the
		 * wait queue, but this is pretty rare.
		 */
		spin_lock_irq(q->queue_lock);
		freed_request(q, is_sync, priv);

		/*
		 * in the very unlikely event that allocation failed and no
		 * requests for this direction was pending, mark us starved
		 * so that freeing of a request in the other direction will
		 * notice us. another possible fix would be to split the
		 * rq mempool into READ and WRITE
		 */
rq_starved:
		if (unlikely(rl->count[is_sync] == 0))
			rl->starved[is_sync] = 1;

		goto out;
	}

	/*
	 * ioc may be NULL here, and ioc_batching will be false. That's
	 * OK, if the queue is under the request limit then requests need
	 * not count toward the nr_batch_requests limit. There will always
	 * be some limit enforced by BLK_BATCH_TIME.
	 */
	if (ioc_batching(q, ioc))
		ioc->nr_batch_requests--;

	trace_block_getrq(q, bio, rw_flags & 1);
out:
	return rq;
}

/*
 * No available requests for this queue, wait for some requests to become
 * available.
 *
 * Called with q->queue_lock held, and returns with it unlocked.
 */
static struct request *get_request_wait(struct request_queue *q, int rw_flags,
					struct bio *bio)
{
	const bool is_sync = rw_is_sync(rw_flags) != 0;
	struct request *rq;

	rq = get_request(q, rw_flags, bio, GFP_NOIO);
	while (!rq) {
		DEFINE_WAIT(wait);
		struct io_context *ioc;
		struct request_list *rl = &q->rq;

		prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
				TASK_UNINTERRUPTIBLE);

		trace_block_sleeprq(q, bio, rw_flags & 1);

		spin_unlock_irq(q->queue_lock);
		io_schedule();

		/*
		 * After sleeping, we become a "batching" process and
		 * will be able to allocate at least one request, and
		 * up to a big batch of them for a small period time.
		 * See ioc_batching, ioc_set_batching
		 */
		ioc = current_io_context(GFP_NOIO, q->node);
		ioc_set_batching(q, ioc);

		spin_lock_irq(q->queue_lock);
		finish_wait(&rl->wait[is_sync], &wait);

		rq = get_request(q, rw_flags, bio, GFP_NOIO);
	};

	return rq;
}

struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
{
	struct request *rq;

	if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
		return NULL;

	BUG_ON(rw != READ && rw != WRITE);

	spin_lock_irq(q->queue_lock);
	if (gfp_mask & __GFP_WAIT) {
		rq = get_request_wait(q, rw, NULL);
	} else {
		rq = get_request(q, rw, NULL, gfp_mask);
		if (!rq)
			spin_unlock_irq(q->queue_lock);
	}
	/* q->queue_lock is unlocked at this point */

	return rq;
}
EXPORT_SYMBOL(blk_get_request);
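
/*
 * Illustrative sketch: allocating a request with blk_get_request() to issue
 * a driver-private SCSI-style command (a TEST UNIT READY opcode, 0x00, as an
 * example) and waiting for it with blk_execute_rq() from block/blk-exec.c.
 * "disk" and the command layout are placeholders:
 *
 *	static int example_test_unit_ready(struct request_queue *q,
 *					   struct gendisk *disk)
 *	{
 *		struct request *rq;
 *		int err;
 *
 *		rq = blk_get_request(q, READ, GFP_KERNEL);
 *		if (!rq)
 *			return -ENOMEM;
 *
 *		rq->cmd_type = REQ_TYPE_BLOCK_PC;
 *		rq->cmd_len = 6;
 *		rq->cmd[0] = 0x00;
 *		rq->timeout = 30 * HZ;
 *
 *		blk_execute_rq(q, disk, rq, 0);
 *		err = rq->errors ? -EIO : 0;
 *		blk_put_request(rq);
 *		return err;
 *	}
 */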

/**
 * blk_make_request - given a bio, allocate a corresponding struct request.
 * @q: target request queue
 * @bio:  The bio describing the memory mappings that will be submitted for IO.
 *        It may be a chained-bio properly constructed by block/bio layer.
 * @gfp_mask: gfp flags to be used for memory allocation
 *
 * blk_make_request is the parallel of generic_make_request for BLOCK_PC
 * type commands, where the struct request needs to be further initialized
 * by the caller. It is passed a &struct bio, which describes the memory
 * info of the I/O transfer.
 *
 * The caller of blk_make_request must make sure that bi_io_vec
 * are set to describe the memory buffers, and that bio_data_dir() will
 * return the needed direction of the request (and that all bios in the
 * passed bio-chain are properly set accordingly).
 *
 * If called under non-sleepable conditions, mapped bio buffers must not
 * need bouncing, by calling the appropriate masked or flagged allocator,
 * suitable for the target device. Otherwise the call to blk_queue_bounce will
 * BUG.
 *
 * WARNING: When allocating/cloning a bio-chain, careful consideration should be
 * given to how you allocate bios. In particular, you cannot use __GFP_WAIT for
 * anything but the first bio in the chain. Otherwise you risk waiting for IO
 * completion of a bio that hasn't been submitted yet, thus resulting in a
 * deadlock. Alternatively bios should be allocated using bio_kmalloc() instead
 * of bio_alloc(), as that avoids the mempool deadlock.
 * If possible a big IO should be split into smaller parts when allocation
 * fails. Partial allocation should not be an error, or you risk a live-lock.
 */
struct request *blk_make_request(struct request_queue *q, struct bio *bio,
				 gfp_t gfp_mask)
{
	struct request *rq = blk_get_request(q, bio_data_dir(bio), gfp_mask);

	if (unlikely(!rq))
		return ERR_PTR(-ENOMEM);

	for_each_bio(bio) {
		struct bio *bounce_bio = bio;
		int ret;

		blk_queue_bounce(q, &bounce_bio);
		ret = blk_rq_append_bio(q, rq, bounce_bio);
		if (unlikely(ret)) {
			blk_put_request(rq);
			return ERR_PTR(ret);
		}
	}

	return rq;
}
EXPORT_SYMBOL(blk_make_request);
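
/*
 * Illustrative sketch of the usage the kernel-doc above describes: map a
 * kernel buffer into a bio with bio_map_kern(), turn it into a request with
 * blk_make_request(), and execute it synchronously.  "disk", "buf" and
 * "len" are placeholders:
 *
 *	static int example_send_kbuf(struct request_queue *q,
 *				     struct gendisk *disk, void *buf,
 *				     unsigned int len)
 *	{
 *		struct bio *bio;
 *		struct request *rq;
 *		int err;
 *
 *		bio = bio_map_kern(q, buf, len, GFP_KERNEL);
 *		if (IS_ERR(bio))
 *			return PTR_ERR(bio);
 *
 *		rq = blk_make_request(q, bio, GFP_KERNEL);
 *		if (IS_ERR(rq)) {
 *			bio_put(bio);
 *			return PTR_ERR(rq);
 *		}
 *
 *		rq->cmd_type = REQ_TYPE_BLOCK_PC;
 *		blk_execute_rq(q, disk, rq, 0);
 *		err = rq->errors ? -EIO : 0;
 *		blk_put_request(rq);
 *		return err;
 *	}
 */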

/**
 * blk_requeue_request - put a request back on queue
 * @q:		request queue where request should be inserted
 * @rq:		request to be inserted
 *
 * Description:
 *    Drivers often keep queueing requests until the hardware cannot accept
 *    more, when that condition happens we need to put the request back
 *    on the queue. Must be called with queue lock held.
 */
void blk_requeue_request(struct request_queue *q, struct request *rq)
{
	blk_delete_timer(rq);
	blk_clear_rq_complete(rq);
	trace_block_rq_requeue(q, rq);

	if (blk_rq_tagged(rq))
		blk_queue_end_tag(q, rq);

	BUG_ON(blk_queued_rq(rq));

	elv_requeue_request(q, rq);
}
EXPORT_SYMBOL(blk_requeue_request);
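
/*
 * Illustrative sketch: a hypothetical request_fn that puts a request back
 * with blk_requeue_request() when the device rejects it, then backs off
 * briefly with blk_delay_queue().  example_hw_submit() is a placeholder:
 *
 *	static void example_request_fn(struct request_queue *q)
 *	{
 *		struct request *rq;
 *
 *		while ((rq = blk_fetch_request(q)) != NULL) {
 *			if (example_hw_submit(rq) == -EBUSY) {
 *				blk_requeue_request(q, rq);
 *				blk_delay_queue(q, 10);
 *				break;
 *			}
 *		}
 *	}
 */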

static void add_acct_request(struct request_queue *q, struct request *rq,
			     int where)
{
	drive_stat_acct(rq, 1);
	__elv_add_request(q, rq, where);
}

/**
 * blk_insert_request - insert a special request into a request queue
 * @q:		request queue where request should be inserted
 * @rq:		request to be inserted
 * @at_head:	insert request at head or tail of queue
 * @data:	private data
 *
 * Description:
 *    Many block devices need to execute commands asynchronously, so they don't
 *    block the whole kernel from preemption during request execution.  This is
 *    accomplished normally by inserting artificial requests tagged as
 *    REQ_TYPE_SPECIAL into the corresponding request queue, and letting them
 *    be scheduled for actual execution by the request queue.
 *
 *    We have the option of inserting the head or the tail of the queue.
 *    Typically we use the tail for new ioctls and so forth.  We use the head
 *    of the queue for things like a QUEUE_FULL message from a device, or a
 *    host that is unable to accept a particular command.
 */
void blk_insert_request(struct request_queue *q, struct request *rq,
			int at_head, void *data)
{
	int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
	unsigned long flags;

	/*
	 * tell I/O scheduler that this isn't a regular read/write (ie it
	 * must not attempt merges on this) and that it acts as a soft
	 * barrier
	 */
	rq->cmd_type = REQ_TYPE_SPECIAL;

	rq->special = data;

	spin_lock_irqsave(q->queue_lock, flags);

	/*
	 * If command is tagged, release the tag
	 */
	if (blk_rq_tagged(rq))
		blk_queue_end_tag(q, rq);

	add_acct_request(q, rq, where);
	__blk_run_queue(q);
	spin_unlock_irqrestore(q->queue_lock, flags);
}
EXPORT_SYMBOL(blk_insert_request);
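
/*
 * Illustrative sketch: queueing a driver-private command as a
 * REQ_TYPE_SPECIAL request via blk_insert_request(); the driver's
 * request_fn later finds the payload in rq->special and completes the
 * request itself.  "cmd" is a placeholder for whatever the driver passes:
 *
 *	static int example_queue_special(struct request_queue *q, void *cmd)
 *	{
 *		struct request *rq;
 *
 *		rq = blk_get_request(q, READ, GFP_KERNEL);
 *		if (!rq)
 *			return -ENOMEM;
 *
 *		blk_insert_request(q, rq, 1, cmd);
 *		return 0;
 *	}
 */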

static void part_round_stats_single(int cpu, struct hd_struct *part,
				    unsigned long now)
{
	if (now == part->stamp)
		return;