/*
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 1994,      Karl Keyte: Added support for disk statistics
 * Elevator latency, (C) 2000  Andrea Arcangeli <andrea@suse.de> SuSE
 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> - July 2000
 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - May 2001
 */

/*
 * This handles all read/write requests to block devices
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/kernel_stat.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/bootmem.h>	/* for max_pfn/max_low_pfn */
#include <linux/completion.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/task_io_accounting_ops.h>

#include <linux/interrupt.h>
#include <linux/cpu.h>
#include <linux/blktrace_api.h>
#include <linux/fault-inject.h>

/*
 * for max sense size
 */
#include <scsi/scsi_cmnd.h>

static void blk_unplug_work(struct work_struct *work);
static void blk_unplug_timeout(unsigned long data);
static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io);
static void init_request_from_bio(struct request *req, struct bio *bio);
static int __make_request(struct request_queue *q, struct bio *bio);
static struct io_context *current_io_context(gfp_t gfp_flags, int node);
static void blk_recalc_rq_segments(struct request *rq);

/*
 * For the allocated request tables
 */
static struct kmem_cache *request_cachep;

/*
 * For queue allocation
 */
static struct kmem_cache *requestq_cachep;

/*
 * For io context allocations
 */
static struct kmem_cache *iocontext_cachep;

/*
 * Controlling structure to kblockd
 */
static struct workqueue_struct *kblockd_workqueue;

unsigned long blk_max_low_pfn, blk_max_pfn;

EXPORT_SYMBOL(blk_max_low_pfn);
EXPORT_SYMBOL(blk_max_pfn);

static DEFINE_PER_CPU(struct list_head, blk_cpu_done);

/* Amount of time in which a process may batch requests */
#define BLK_BATCH_TIME	(HZ/50UL)

/* Number of requests a "batching" process may submit */
#define BLK_BATCH_REQ	32

/*
 * Return the threshold (number of used requests) at which the queue is
 * considered to be congested.  It includes a little hysteresis to keep the
 * context switch rate down.
 */
static inline int queue_congestion_on_threshold(struct request_queue *q)
{
	return q->nr_congestion_on;
}

/*
 * The threshold at which a queue is considered to be uncongested
 */
static inline int queue_congestion_off_threshold(struct request_queue *q)
{
	return q->nr_congestion_off;
}

static void blk_queue_congestion_threshold(struct request_queue *q)
{
	int nr;

	nr = q->nr_requests - (q->nr_requests / 8) + 1;
	if (nr > q->nr_requests)
		nr = q->nr_requests;
	q->nr_congestion_on = nr;

	nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1;
	if (nr < 1)
		nr = 1;
	q->nr_congestion_off = nr;
}
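
/*
 * Worked example (editor's note, not from the original source): with a
 * queue depth of, say, 128 requests, the on-threshold is
 * 128 - 128/8 + 1 = 113 used requests and the off-threshold is
 * 128 - 128/8 - 128/16 - 1 = 103, so the queue must drain by ten
 * requests after congesting before it is reported uncongested again.
 */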

/**
 * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
 * @bdev:	device
 *
 * Locates the passed device's request queue and returns the address of its
 * backing_dev_info
 *
 * Will return NULL if the request queue cannot be located.
 */
struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
{
	struct backing_dev_info *ret = NULL;
	struct request_queue *q = bdev_get_queue(bdev);

	if (q)
		ret = &q->backing_dev_info;
	return ret;
}
EXPORT_SYMBOL(blk_get_backing_dev_info);

/**
 * blk_queue_prep_rq - set a prepare_request function for queue
 * @q:		queue
 * @pfn:	prepare_request function
 *
 * It's possible for a queue to register a prepare_request callback which
 * is invoked before the request is handed to the request_fn. The goal of
 * the function is to prepare a request for I/O; it can be used to build a
 * cdb from the request data, for instance.
 *
 */
void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn)
{
	q->prep_rq_fn = pfn;
}

EXPORT_SYMBOL(blk_queue_prep_rq);
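
/*
 * Illustrative sketch (editor's note, not part of the original file): a
 * hypothetical driver could register a prepare_request callback like the
 * one below to build its command block before ->request_fn sees the
 * request.  The helper name example_build_cdb() is made up.
 *
 *	static int example_prep_rq_fn(struct request_queue *q,
 *				      struct request *rq)
 *	{
 *		if (blk_fs_request(rq))
 *			example_build_cdb(rq);	// driver-specific setup
 *		return BLKPREP_OK;
 *	}
 *
 *	blk_queue_prep_rq(q, example_prep_rq_fn);
 */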

/**
 * blk_queue_merge_bvec - set a merge_bvec function for queue
 * @q:		queue
 * @mbfn:	merge_bvec_fn
 *
 * Usually queues have static limitations on the max sectors or segments that
 * we can put in a request. Stacking drivers may have some settings that
 * are dynamic, and thus we have to query the queue whether it is ok to
 * add a new bio_vec to a bio at a given offset or not. If the block device
 * has such limitations, it needs to register a merge_bvec_fn to control
 * the size of bio's sent to it. Note that a block device *must* allow a
 * single page to be added to an empty bio. The block device driver may want
 * to use the bio_split() function to deal with these bio's. By default
 * no merge_bvec_fn is defined for a queue, and only the fixed limits are
 * honored.
 */
void blk_queue_merge_bvec(struct request_queue *q, merge_bvec_fn *mbfn)
{
	q->merge_bvec_fn = mbfn;
}

EXPORT_SYMBOL(blk_queue_merge_bvec);

void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn)
{
	q->softirq_done_fn = fn;
}

EXPORT_SYMBOL(blk_queue_softirq_done);

/**
 * blk_queue_make_request - define an alternate make_request function for a device
 * @q:  the request queue for the device to be affected
 * @mfn: the alternate make_request function
 *
 * Description:
 *    The normal way for &struct bios to be passed to a device
 *    driver is for them to be collected into requests on a request
 *    queue, and then to allow the device driver to select requests
 *    off that queue when it is ready.  This works well for many block
 *    devices. However some block devices (typically virtual devices
 *    such as md or lvm) do not benefit from the processing on the
 *    request queue, and are served best by having the requests passed
 *    directly to them.  This can be achieved by providing a function
 *    to blk_queue_make_request().
 *
 * Caveat:
 *    The driver that does this *must* be able to deal appropriately
 *    with buffers in "highmemory". This can be accomplished by either calling
 *    __bio_kmap_atomic() to get a temporary kernel mapping, or by calling
 *    blk_queue_bounce() to create a buffer in normal memory.
 **/
void blk_queue_make_request(struct request_queue * q, make_request_fn * mfn)
{
	/*
	 * set defaults
	 */
	q->nr_requests = BLKDEV_MAX_RQ;
	blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
	blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
	q->make_request_fn = mfn;
	q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
	q->backing_dev_info.state = 0;
	q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
	blk_queue_max_sectors(q, SAFE_MAX_SECTORS);
	blk_queue_hardsect_size(q, 512);
	blk_queue_dma_alignment(q, 511);
	blk_queue_congestion_threshold(q);
	q->nr_batching = BLK_BATCH_REQ;

	q->unplug_thresh = 4;		/* hmm */
	q->unplug_delay = (3 * HZ) / 1000;	/* 3 milliseconds */
	if (q->unplug_delay == 0)
		q->unplug_delay = 1;

	INIT_WORK(&q->unplug_work, blk_unplug_work);

	q->unplug_timer.function = blk_unplug_timeout;
	q->unplug_timer.data = (unsigned long)q;

	/*
	 * by default assume old behaviour and bounce for any highmem page
	 */
	blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
}

EXPORT_SYMBOL(blk_queue_make_request);
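
/*
 * Illustrative sketch (editor's note, not from the original file): a
 * virtual driver could bypass the request queue by registering its own
 * bio handler.  The "loopdev" names below are placeholders.
 *
 *	static int loopdev_make_request(struct request_queue *q,
 *					struct bio *bio)
 *	{
 *		loopdev_handle_bio(q->queuedata, bio);	// driver specific
 *		return 0;
 *	}
 *
 *	q = blk_alloc_queue(GFP_KERNEL);
 *	blk_queue_make_request(q, loopdev_make_request);
 */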

static void rq_init(struct request_queue *q, struct request *rq)
{
	INIT_LIST_HEAD(&rq->queuelist);
	INIT_LIST_HEAD(&rq->donelist);

	rq->errors = 0;
	rq->bio = rq->biotail = NULL;
	INIT_HLIST_NODE(&rq->hash);
	RB_CLEAR_NODE(&rq->rb_node);
	rq->ioprio = 0;
	rq->buffer = NULL;
	rq->ref_count = 1;
	rq->q = q;
	rq->special = NULL;
	rq->data_len = 0;
	rq->data = NULL;
	rq->nr_phys_segments = 0;
	rq->sense = NULL;
	rq->end_io = NULL;
	rq->end_io_data = NULL;
	rq->completion_data = NULL;
	rq->next_rq = NULL;
}

/**
 * blk_queue_ordered - does this queue support ordered writes
 * @q:        the request queue
 * @ordered:  one of QUEUE_ORDERED_*
 * @prepare_flush_fn: rq setup helper for cache flush ordered writes
 *
 * Description:
 *   For journalled file systems, doing ordered writes on a commit
 *   block instead of explicitly doing wait_on_buffer (which is bad
 *   for performance) can be a big win. Block drivers supporting this
 *   feature should call this function and indicate so.
 *
 **/
int blk_queue_ordered(struct request_queue *q, unsigned ordered,
		      prepare_flush_fn *prepare_flush_fn)
{
	if (ordered & (QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH) &&
	    prepare_flush_fn == NULL) {
		printk(KERN_ERR "blk_queue_ordered: prepare_flush_fn required\n");
		return -EINVAL;
	}

	if (ordered != QUEUE_ORDERED_NONE &&
	    ordered != QUEUE_ORDERED_DRAIN &&
	    ordered != QUEUE_ORDERED_DRAIN_FLUSH &&
	    ordered != QUEUE_ORDERED_DRAIN_FUA &&
	    ordered != QUEUE_ORDERED_TAG &&
	    ordered != QUEUE_ORDERED_TAG_FLUSH &&
	    ordered != QUEUE_ORDERED_TAG_FUA) {
		printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered);
		return -EINVAL;
	}

	q->ordered = ordered;
	q->next_ordered = ordered;
	q->prepare_flush_fn = prepare_flush_fn;

	return 0;
}

EXPORT_SYMBOL(blk_queue_ordered);
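
/*
 * Illustrative sketch (editor's note, not from the original file): a
 * driver for a hypothetical disk with a volatile write-back cache would
 * typically pick a DRAIN_FLUSH variant and supply a prepare_flush_fn
 * that turns the proxy request into a cache-flush command.  The helper
 * example_setup_flush_cmd() is made up.
 *
 *	static void example_prepare_flush(struct request_queue *q,
 *					  struct request *rq)
 *	{
 *		example_setup_flush_cmd(rq);	// driver-specific command
 *	}
 *
 *	blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH,
 *			  example_prepare_flush);
 */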

/**
 * blk_queue_issue_flush_fn - set function for issuing a flush
 * @q:     the request queue
 * @iff:   the function to be called issuing the flush
 *
 * Description:
 *   If a driver supports issuing a flush command, it notifies the block
 *   layer of that support by registering the function through this call.
 *
 **/
void blk_queue_issue_flush_fn(struct request_queue *q, issue_flush_fn *iff)
{
	q->issue_flush_fn = iff;
}

EXPORT_SYMBOL(blk_queue_issue_flush_fn);

/*
 * Cache flushing for ordered writes handling
 */
inline unsigned blk_ordered_cur_seq(struct request_queue *q)
{
	if (!q->ordseq)
		return 0;
	return 1 << ffz(q->ordseq);
}

unsigned blk_ordered_req_seq(struct request *rq)
{
	struct request_queue *q = rq->q;

	BUG_ON(q->ordseq == 0);

	if (rq == &q->pre_flush_rq)
		return QUEUE_ORDSEQ_PREFLUSH;
	if (rq == &q->bar_rq)
		return QUEUE_ORDSEQ_BAR;
	if (rq == &q->post_flush_rq)
		return QUEUE_ORDSEQ_POSTFLUSH;

	/*
	 * !fs requests don't need to follow barrier ordering.  Always
	 * put them at the front.  This fixes the following deadlock.
	 *
	 * http://thread.gmane.org/gmane.linux.kernel/537473
	 */
	if (!blk_fs_request(rq))
		return QUEUE_ORDSEQ_DRAIN;

	if ((rq->cmd_flags & REQ_ORDERED_COLOR) ==
	    (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR))
		return QUEUE_ORDSEQ_DRAIN;
	else
		return QUEUE_ORDSEQ_DONE;
}

void blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error)
{
	struct request *rq;
	int uptodate;

	if (error && !q->orderr)
		q->orderr = error;

	BUG_ON(q->ordseq & seq);
	q->ordseq |= seq;

	if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE)
		return;

	/*
	 * Okay, sequence complete.
	 */
	rq = q->orig_bar_rq;
	uptodate = q->orderr ? q->orderr : 1;

	q->ordseq = 0;

	end_that_request_first(rq, uptodate, rq->hard_nr_sectors);
	end_that_request_last(rq, uptodate);
}

static void pre_flush_end_io(struct request *rq, int error)
{
	elv_completed_request(rq->q, rq);
	blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error);
}

static void bar_end_io(struct request *rq, int error)
{
	elv_completed_request(rq->q, rq);
	blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error);
}

static void post_flush_end_io(struct request *rq, int error)
{
	elv_completed_request(rq->q, rq);
	blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error);
}

static void queue_flush(struct request_queue *q, unsigned which)
{
	struct request *rq;
	rq_end_io_fn *end_io;

	if (which == QUEUE_ORDERED_PREFLUSH) {
		rq = &q->pre_flush_rq;
		end_io = pre_flush_end_io;
	} else {
		rq = &q->post_flush_rq;
		end_io = post_flush_end_io;
	}

	rq->cmd_flags = REQ_HARDBARRIER;
	rq_init(q, rq);
	rq->elevator_private = NULL;
	rq->elevator_private2 = NULL;
	rq->rq_disk = q->bar_rq.rq_disk;
	rq->end_io = end_io;
	q->prepare_flush_fn(q, rq);

	elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
}

static inline struct request *start_ordered(struct request_queue *q,
					    struct request *rq)
{
	q->bi_size = 0;
	q->orderr = 0;
	q->ordered = q->next_ordered;
	q->ordseq |= QUEUE_ORDSEQ_STARTED;

	/*
	 * Prep proxy barrier request.
	 */
	blkdev_dequeue_request(rq);
	q->orig_bar_rq = rq;
	rq = &q->bar_rq;
	rq->cmd_flags = 0;
	rq_init(q, rq);
	if (bio_data_dir(q->orig_bar_rq->bio) == WRITE)
		rq->cmd_flags |= REQ_RW;
	rq->cmd_flags |= q->ordered & QUEUE_ORDERED_FUA ? REQ_FUA : 0;
	rq->elevator_private = NULL;
	rq->elevator_private2 = NULL;
	init_request_from_bio(rq, q->orig_bar_rq->bio);
	rq->end_io = bar_end_io;

	/*
	 * Queue ordered sequence.  As we stack them at the head, we
	 * need to queue in reverse order.  Note that we rely on that
	 * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs
	 * request gets in between ordered sequence.
	 */
	if (q->ordered & QUEUE_ORDERED_POSTFLUSH)
		queue_flush(q, QUEUE_ORDERED_POSTFLUSH);
	else
		q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH;

	elv_insert(q, rq, ELEVATOR_INSERT_FRONT);

	if (q->ordered & QUEUE_ORDERED_PREFLUSH) {
		queue_flush(q, QUEUE_ORDERED_PREFLUSH);
		rq = &q->pre_flush_rq;
	} else
		q->ordseq |= QUEUE_ORDSEQ_PREFLUSH;

	if ((q->ordered & QUEUE_ORDERED_TAG) || q->in_flight == 0)
		q->ordseq |= QUEUE_ORDSEQ_DRAIN;
	else
		rq = NULL;

	return rq;
}

int blk_do_ordered(struct request_queue *q, struct request **rqp)
{
	struct request *rq = *rqp;
	int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq);

	if (!q->ordseq) {
		if (!is_barrier)
			return 1;

		if (q->next_ordered != QUEUE_ORDERED_NONE) {
			*rqp = start_ordered(q, rq);
			return 1;
		} else {
			/*
			 * This can happen when the queue switches to
			 * ORDERED_NONE while this request is on it.
			 */
			blkdev_dequeue_request(rq);
			end_that_request_first(rq, -EOPNOTSUPP,
					       rq->hard_nr_sectors);
			end_that_request_last(rq, -EOPNOTSUPP);
			*rqp = NULL;
			return 0;
		}
	}

	/*
	 * Ordered sequence in progress
	 */

	/* Special requests are not subject to ordering rules. */
	if (!blk_fs_request(rq) &&
	    rq != &q->pre_flush_rq && rq != &q->post_flush_rq)
		return 1;

	if (q->ordered & QUEUE_ORDERED_TAG) {
		/* Ordered by tag.  Blocking the next barrier is enough. */
		if (is_barrier && rq != &q->bar_rq)
			*rqp = NULL;
	} else {
		/* Ordered by draining.  Wait for turn. */
		WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q));
		if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q))
			*rqp = NULL;
	}

	return 1;
}

static int flush_dry_bio_endio(struct bio *bio, unsigned int bytes, int error)
{
	struct request_queue *q = bio->bi_private;

	/*
	 * This is dry run, restore bio_sector and size.  We'll finish
	 * this request again with the original bi_end_io after an
	 * error occurs or post flush is complete.
	 */
	q->bi_size += bytes;

	if (bio->bi_size)
		return 1;

	/* Reset bio */
	set_bit(BIO_UPTODATE, &bio->bi_flags);
	bio->bi_size = q->bi_size;
	bio->bi_sector -= (q->bi_size >> 9);
	q->bi_size = 0;

	return 0;
}

static int ordered_bio_endio(struct request *rq, struct bio *bio,
			     unsigned int nbytes, int error)
{
	struct request_queue *q = rq->q;
	bio_end_io_t *endio;
	void *private;

	if (&q->bar_rq != rq)
		return 0;

	/*
	 * Okay, this is the barrier request in progress, dry finish it.
	 */
	if (error && !q->orderr)
		q->orderr = error;

	endio = bio->bi_end_io;
	private = bio->bi_private;
	bio->bi_end_io = flush_dry_bio_endio;
	bio->bi_private = q;

	bio_endio(bio, nbytes, error);

	bio->bi_end_io = endio;
	bio->bi_private = private;

	return 1;
}

/**
 * blk_queue_bounce_limit - set bounce buffer limit for queue
 * @q:  the request queue for the device
 * @dma_addr:   bus address limit
 *
 * Description:
 *    Different hardware can have different requirements as to what pages
 *    it can do I/O directly to. A low level driver can call
 *    blk_queue_bounce_limit to have lower memory pages allocated as bounce
 *    buffers for doing I/O to pages residing above @dma_addr.
 **/
void blk_queue_bounce_limit(struct request_queue *q, u64 dma_addr)
{
	unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT;
	int dma = 0;

	q->bounce_gfp = GFP_NOIO;
#if BITS_PER_LONG == 64
	/* Assume anything <= 4GB can be handled by IOMMU.
	   Actually some IOMMUs can handle everything, but I don't
	   know of a way to test this here. */
	if (bounce_pfn < (min_t(u64,0xffffffff,BLK_BOUNCE_HIGH) >> PAGE_SHIFT))
		dma = 1;
	q->bounce_pfn = max_low_pfn;
#else
	if (bounce_pfn < blk_max_low_pfn)
		dma = 1;
	q->bounce_pfn = bounce_pfn;
#endif
	if (dma) {
		init_emergency_isa_pool();
		q->bounce_gfp = GFP_NOIO | GFP_DMA;
		q->bounce_pfn = bounce_pfn;
	}
}

EXPORT_SYMBOL(blk_queue_bounce_limit);
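
/*
 * Illustrative sketch (editor's note, not from the original file): a
 * hypothetical driver whose controller can only address 32-bit bus
 * addresses would bounce anything above 4GB, while a fully 64-bit
 * capable one can effectively disable bouncing:
 *
 *	blk_queue_bounce_limit(q, 0xffffffffULL);	// 32-bit DMA only
 *	blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);	// no bouncing needed
 *
 * BLK_BOUNCE_ANY is assumed here to be the "no limit" constant from the
 * block layer headers.
 */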

/**
 * blk_queue_max_sectors - set max sectors for a request for this queue
 * @q:  the request queue for the device
 * @max_sectors:  max sectors in the usual 512b unit
 *
 * Description:
 *    Enables a low level driver to set an upper limit on the size of
 *    received requests.
 **/
void blk_queue_max_sectors(struct request_queue *q, unsigned int max_sectors)
{
	if ((max_sectors << 9) < PAGE_CACHE_SIZE) {
		max_sectors = 1 << (PAGE_CACHE_SHIFT - 9);
		printk("%s: set to minimum %d\n", __FUNCTION__, max_sectors);
	}

	if (BLK_DEF_MAX_SECTORS > max_sectors)
		q->max_hw_sectors = q->max_sectors = max_sectors;
 	else {
		q->max_sectors = BLK_DEF_MAX_SECTORS;
		q->max_hw_sectors = max_sectors;
	}
}

EXPORT_SYMBOL(blk_queue_max_sectors);

/**
 * blk_queue_max_phys_segments - set max phys segments for a request for this queue
 * @q:  the request queue for the device
 * @max_segments:  max number of segments
 *
 * Description:
 *    Enables a low level driver to set an upper limit on the number of
 *    physical data segments in a request.  This would be the largest sized
 *    scatter list the driver could handle.
 **/
void blk_queue_max_phys_segments(struct request_queue *q,
				 unsigned short max_segments)
{
	if (!max_segments) {
		max_segments = 1;
		printk("%s: set to minimum %d\n", __FUNCTION__, max_segments);
	}

	q->max_phys_segments = max_segments;
}

EXPORT_SYMBOL(blk_queue_max_phys_segments);

/**
 * blk_queue_max_hw_segments - set max hw segments for a request for this queue
 * @q:  the request queue for the device
 * @max_segments:  max number of segments
 *
 * Description:
 *    Enables a low level driver to set an upper limit on the number of
 *    hw data segments in a request.  This would be the largest number of
 *    address/length pairs the host adapter can actually give at once
 *    to the device.
 **/
void blk_queue_max_hw_segments(struct request_queue *q,
			       unsigned short max_segments)
{
	if (!max_segments) {
		max_segments = 1;
		printk("%s: set to minimum %d\n", __FUNCTION__, max_segments);
	}

	q->max_hw_segments = max_segments;
}

EXPORT_SYMBOL(blk_queue_max_hw_segments);

/**
 * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg
 * @q:  the request queue for the device
 * @max_size:  max size of segment in bytes
 *
 * Description:
 *    Enables a low level driver to set an upper limit on the size of a
 *    coalesced segment
 **/
void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size)
{
	if (max_size < PAGE_CACHE_SIZE) {
		max_size = PAGE_CACHE_SIZE;
		printk("%s: set to minimum %d\n", __FUNCTION__, max_size);
	}

	q->max_segment_size = max_size;
}

EXPORT_SYMBOL(blk_queue_max_segment_size);

/**
 * blk_queue_hardsect_size - set hardware sector size for the queue
 * @q:  the request queue for the device
 * @size:  the hardware sector size, in bytes
 *
 * Description:
 *   This should typically be set to the lowest possible sector size
 *   that the hardware can operate on (possible without reverting to
 *   even internal read-modify-write operations). Usually the default
 *   of 512 covers most hardware.
 **/
void blk_queue_hardsect_size(struct request_queue *q, unsigned short size)
{
	q->hardsect_size = size;
}

EXPORT_SYMBOL(blk_queue_hardsect_size);
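
/*
 * Illustrative sketch (editor's note, not from the original file): taken
 * together, the setters above let a low level driver describe its
 * hardware limits at probe time.  The numbers below are arbitrary
 * example values:
 *
 *	blk_queue_max_sectors(q, 256);		// 128KB per request
 *	blk_queue_max_phys_segments(q, 32);
 *	blk_queue_max_hw_segments(q, 32);
 *	blk_queue_max_segment_size(q, 65536);
 *	blk_queue_hardsect_size(q, 512);
 */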

/*
 * Returns the minimum that is _not_ zero, unless both are zero.
 */
#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))

/**
 * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers
 * @t:	the stacking driver (top)
 * @b:  the underlying device (bottom)
 **/
void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b)
{
	/* zero is "infinity" */
	t->max_sectors = min_not_zero(t->max_sectors,b->max_sectors);
	t->max_hw_sectors = min_not_zero(t->max_hw_sectors,b->max_hw_sectors);

	t->max_phys_segments = min(t->max_phys_segments,b->max_phys_segments);
	t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments);
	t->max_segment_size = min(t->max_segment_size,b->max_segment_size);
	t->hardsect_size = max(t->hardsect_size,b->hardsect_size);
	if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags))
		clear_bit(QUEUE_FLAG_CLUSTER, &t->queue_flags);
}

EXPORT_SYMBOL(blk_queue_stack_limits);
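
/*
 * Illustrative sketch (editor's note, not from the original file): a
 * stacking driver (md/dm style) would typically call this once per
 * underlying device while assembling the array, e.g. for each component
 * block device:
 *
 *	blk_queue_stack_limits(top_queue, bdev_get_queue(component_bdev));
 *
 * so the top queue never issues requests the bottom devices cannot take.
 */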

/**
 * blk_queue_segment_boundary - set boundary rules for segment merging
 * @q:  the request queue for the device
 * @mask:  the memory boundary mask
 **/
void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask)
{
	if (mask < PAGE_CACHE_SIZE - 1) {
		mask = PAGE_CACHE_SIZE - 1;
		printk("%s: set to minimum %lx\n", __FUNCTION__, mask);
	}

	q->seg_boundary_mask = mask;
}

EXPORT_SYMBOL(blk_queue_segment_boundary);

/**
 * blk_queue_dma_alignment - set dma length and memory alignment
 * @q:     the request queue for the device
 * @mask:  alignment mask
 *
 * description:
 *    set required memory and length alignment for direct dma transactions.
 *    this is used when building direct io requests for the queue.
 *
 **/
void blk_queue_dma_alignment(struct request_queue *q, int mask)
{
	q->dma_alignment = mask;
}

EXPORT_SYMBOL(blk_queue_dma_alignment);

/**
 * blk_queue_find_tag - find a request by its tag and queue
 * @q:	 The request queue for the device
 * @tag: The tag of the request
 *
 * Notes:
 *    Should be used when a device returns a tag and you want to match
 *    it with a request.
 *
 *    no locks need be held.
 **/
struct request *blk_queue_find_tag(struct request_queue *q, int tag)
{
	return blk_map_queue_find_tag(q->queue_tags, tag);
}

EXPORT_SYMBOL(blk_queue_find_tag);

/**
 * __blk_free_tags - release a given set of tag maintenance info
 * @bqt:	the tag map to free
 *
 * Tries to free the specified @bqt@.  Returns true if it was
 * actually freed and false if there are still references using it
 */
static int __blk_free_tags(struct blk_queue_tag *bqt)
{
	int retval;

	retval = atomic_dec_and_test(&bqt->refcnt);
	if (retval) {
		BUG_ON(bqt->busy);
		BUG_ON(!list_empty(&bqt->busy_list));

		kfree(bqt->tag_index);
		bqt->tag_index = NULL;

		kfree(bqt->tag_map);
		bqt->tag_map = NULL;

		kfree(bqt);
	}

	return retval;
}

/**
 * __blk_queue_free_tags - release tag maintenance info
 * @q:  the request queue for the device
 *
 *  Notes:
 *    blk_cleanup_queue() will take care of calling this function, if tagging
 *    has been used. So there's no need to call this directly.
 **/
static void __blk_queue_free_tags(struct request_queue *q)
{
	struct blk_queue_tag *bqt = q->queue_tags;

	if (!bqt)
		return;

	__blk_free_tags(bqt);

	q->queue_tags = NULL;
	q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED);
}

/**
 * blk_free_tags - release a given set of tag maintenance info
 * @bqt:	the tag map to free
 *
 * For externally managed @bqt@ frees the map.  Callers of this
 * function must guarantee to have released all the queues that
 * might have been using this tag map.
 */
void blk_free_tags(struct blk_queue_tag *bqt)
{
	if (unlikely(!__blk_free_tags(bqt)))
		BUG();
}
EXPORT_SYMBOL(blk_free_tags);

/**
 * blk_queue_free_tags - release tag maintenance info
 * @q:  the request queue for the device
 *
 *  Notes:
 *	This is used to disable tagged queuing to a device, yet leave the
 *	queue in function.
 **/
void blk_queue_free_tags(struct request_queue *q)
{
	clear_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
}

EXPORT_SYMBOL(blk_queue_free_tags);

static int
init_tag_map(struct request_queue *q, struct blk_queue_tag *tags, int depth)
{
	struct request **tag_index;
	unsigned long *tag_map;
	int nr_ulongs;

	if (q && depth > q->nr_requests * 2) {
		depth = q->nr_requests * 2;
		printk(KERN_ERR "%s: adjusted depth to %d\n",
				__FUNCTION__, depth);
	}

	tag_index = kzalloc(depth * sizeof(struct request *), GFP_ATOMIC);
	if (!tag_index)
		goto fail;

	nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG;
	tag_map = kzalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC);
	if (!tag_map)
		goto fail;

	tags->real_max_depth = depth;
	tags->max_depth = depth;
	tags->tag_index = tag_index;
	tags->tag_map = tag_map;

	return 0;
fail:
	kfree(tag_index);
	return -ENOMEM;
}

static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q,
						   int depth)
{
	struct blk_queue_tag *tags;

	tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC);
	if (!tags)
		goto fail;

	if (init_tag_map(q, tags, depth))
		goto fail;

	INIT_LIST_HEAD(&tags->busy_list);
	tags->busy = 0;
	atomic_set(&tags->refcnt, 1);
	return tags;
fail:
	kfree(tags);
	return NULL;
}

/**
 * blk_init_tags - initialize the tag info for an external tag map
 * @depth:	the maximum queue depth supported
 * @tags: the tag to use
 **/
struct blk_queue_tag *blk_init_tags(int depth)
{
	return __blk_queue_init_tags(NULL, depth);
}
EXPORT_SYMBOL(blk_init_tags);

/**
 * blk_queue_init_tags - initialize the queue tag info
 * @q:  the request queue for the device
 * @depth:  the maximum queue depth supported
 * @tags: the tag to use
 **/
int blk_queue_init_tags(struct request_queue *q, int depth,
			struct blk_queue_tag *tags)
{
	int rc;

	BUG_ON(tags && q->queue_tags && tags != q->queue_tags);

	if (!tags && !q->queue_tags) {
		tags = __blk_queue_init_tags(q, depth);

		if (!tags)
			goto fail;
	} else if (q->queue_tags) {
		if ((rc = blk_queue_resize_tags(q, depth)))
			return rc;
		set_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
		return 0;
	} else
		atomic_inc(&tags->refcnt);

	/*
	 * assign it, all done
	 */
	q->queue_tags = tags;
	q->queue_flags |= (1 << QUEUE_FLAG_QUEUED);
	return 0;
fail:
	kfree(tags);
	return -ENOMEM;
}

EXPORT_SYMBOL(blk_queue_init_tags);
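
/*
 * Illustrative sketch (editor's note, not from the original file): a
 * driver for a hypothetical TCQ-capable controller could enable a
 * 64-deep private tag map and then tag requests as it takes them off
 * the queue.  example_issue_to_hw() is a made-up helper.
 *
 *	if (blk_queue_init_tags(q, 64, NULL))
 *		return -ENOMEM;
 *
 *	// in the request_fn, with the queue lock held:
 *	if (blk_queue_start_tag(q, rq))
 *		return;			// out of tags, try again later
 *	example_issue_to_hw(rq);	// rq->tag identifies the command
 *
 *	// on completion, again under the queue lock:
 *	blk_queue_end_tag(q, rq);
 */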

/**
 * blk_queue_resize_tags - change the queueing depth
 * @q:  the request queue for the device
 * @new_depth: the new max command queueing depth
 *
 *  Notes:
 *    Must be called with the queue lock held.
 **/
int blk_queue_resize_tags(struct request_queue *q, int new_depth)
{
	struct blk_queue_tag *bqt = q->queue_tags;
	struct request **tag_index;
	unsigned long *tag_map;
	int max_depth, nr_ulongs;

	if (!bqt)
		return -ENXIO;

	/*
	 * if we already have large enough real_max_depth.  just
	 * adjust max_depth.  *NOTE* as requests with tag value
	 * between new_depth and real_max_depth can be in-flight, tag
	 * map can not be shrunk blindly here.
	 */
	if (new_depth <= bqt->real_max_depth) {
		bqt->max_depth = new_depth;
		return 0;
	}

	/*
	 * Currently cannot replace a shared tag map with a new
	 * one, so error out if this is the case
	 */
	if (atomic_read(&bqt->refcnt) != 1)
		return -EBUSY;

	/*
	 * save the old state info, so we can copy it back
	 */
	tag_index = bqt->tag_index;
	tag_map = bqt->tag_map;
	max_depth = bqt->real_max_depth;

	if (init_tag_map(q, bqt, new_depth))
		return -ENOMEM;

	memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *));
	nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG;
	memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long));

	kfree(tag_index);
	kfree(tag_map);
	return 0;
}

EXPORT_SYMBOL(blk_queue_resize_tags);

/**
 * blk_queue_end_tag - end tag operations for a request
 * @q:  the request queue for the device
 * @rq: the request that has completed
 *
 *  Description:
 *    Typically called when end_that_request_first() returns 0, meaning
 *    all transfers have been done for a request. It's important to call
 *    this function before end_that_request_last(), as that will put the
 *    request back on the free list thus corrupting the internal tag list.
 *
 *  Notes:
 *   queue lock must be held.
 **/
void blk_queue_end_tag(struct request_queue *q, struct request *rq)
{
	struct blk_queue_tag *bqt = q->queue_tags;
	int tag = rq->tag;

	BUG_ON(tag == -1);

	if (unlikely(tag >= bqt->real_max_depth))
		/*
		 * This can happen after tag depth has been reduced.
		 * FIXME: how about a warning or info message here?
		 */
		return;

	list_del_init(&rq->queuelist);
	rq->cmd_flags &= ~REQ_QUEUED;
	rq->tag = -1;

	if (unlikely(bqt->tag_index[tag] == NULL))
		printk(KERN_ERR "%s: tag %d is missing\n",
		       __FUNCTION__, tag);

	bqt->tag_index[tag] = NULL;

	/*
	 * We use test_and_clear_bit's memory ordering properties here.
	 * The tag_map bit acts as a lock for tag_index[bit], so we need
	 * a barrier before clearing the bit (precisely: release semantics).
	 * Could use clear_bit_unlock when it is merged.
	 */
	if (unlikely(!test_and_clear_bit(tag, bqt->tag_map))) {
		printk(KERN_ERR "%s: attempt to clear non-busy tag (%d)\n",
		       __FUNCTION__, tag);
		return;
	}

	bqt->busy--;
}

EXPORT_SYMBOL(blk_queue_end_tag);

/**
 * blk_queue_start_tag - find a free tag and assign it
 * @q:  the request queue for the device
 * @rq:  the block request that needs tagging
 *
 *  Description:
 *    This can either be used as a stand-alone helper, or possibly be
 *    assigned as the queue &prep_rq_fn (in which case &struct request
 *    automagically gets a tag assigned). Note that this function
 *    assumes that any type of request can be queued! If this is not
 *    true for your device, you must check the request type before
 *    calling this function.  The request will also be removed from
 *    the request queue, so it's the driver's responsibility to re-add
 *    it if it should need to be restarted for some reason.
 *
 *  Notes:
 *   queue lock must be held.
 **/
int blk_queue_start_tag(struct request_queue *q, struct request *rq)
{
	struct blk_queue_tag *bqt = q->queue_tags;
	int tag;

	if (unlikely((rq->cmd_flags & REQ_QUEUED))) {
		printk(KERN_ERR
		       "%s: request %p for device [%s] already tagged %d",
		       __FUNCTION__, rq,
		       rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag);
		BUG();
	}

	/*
	 * Protect against shared tag maps, as we may not have exclusive
	 * access to the tag map.
	 */
	do {
		tag = find_first_zero_bit(bqt->tag_map, bqt->max_depth);
		if (tag >= bqt->max_depth)
			return 1;

	} while (test_and_set_bit(tag, bqt->tag_map));
	/*
	 * We rely on test_and_set_bit providing lock memory ordering semantics
	 * (could use test_and_set_bit_lock when it is merged).
	 */

	rq->cmd_flags |= REQ_QUEUED;
	rq->tag = tag;
	bqt->tag_index[tag] = rq;
	blkdev_dequeue_request(rq);
	list_add(&rq->queuelist, &bqt->busy_list);
	bqt->busy++;
	return 0;
}

EXPORT_SYMBOL(blk_queue_start_tag);

/**
 * blk_queue_invalidate_tags - invalidate all pending tags
 * @q:  the request queue for the device
 *
 *  Description:
 *   Hardware conditions may dictate a need to stop all pending requests.
 *   In this case, we will safely clear the block side of the tag queue and
 *   re-add all requests to the request queue in the right order.
 *
 *  Notes:
 *   queue lock must be held.
 **/
void blk_queue_invalidate_tags(struct request_queue *q)
{
	struct blk_queue_tag *bqt = q->queue_tags;
	struct list_head *tmp, *n;
	struct request *rq;

	list_for_each_safe(tmp, n, &bqt->busy_list) {
		rq = list_entry_rq(tmp);

		if (rq->tag == -1) {
			printk(KERN_ERR
			       "%s: bad tag found on list\n", __FUNCTION__);
			list_del_init(&rq->queuelist);
			rq->cmd_flags &= ~REQ_QUEUED;
		} else
			blk_queue_end_tag(q, rq);

		rq->cmd_flags &= ~REQ_STARTED;
		__elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0);
	}
}

EXPORT_SYMBOL(blk_queue_invalidate_tags);

void blk_dump_rq_flags(struct request *rq, char *msg)
{
	int bit;

	printk("%s: dev %s: type=%x, flags=%x\n", msg,
		rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type,
		rq->cmd_flags);

	printk("\nsector %llu, nr/cnr %lu/%u\n", (unsigned long long)rq->sector,
						       rq->nr_sectors,
						       rq->current_nr_sectors);
	printk("bio %p, biotail %p, buffer %p, data %p, len %u\n", rq->bio, rq->biotail, rq->buffer, rq->data, rq->data_len);

	if (blk_pc_request(rq)) {
		printk("cdb: ");
		for (bit = 0; bit < sizeof(rq->cmd); bit++)
			printk("%02x ", rq->cmd[bit]);
		printk("\n");
	}
}

EXPORT_SYMBOL(blk_dump_rq_flags);

void blk_recount_segments(struct request_queue *q, struct bio *bio)
{
	struct request rq;
	struct bio *nxt = bio->bi_next;
	rq.q = q;
	rq.bio = rq.biotail = bio;
	bio->bi_next = NULL;
	blk_recalc_rq_segments(&rq);
	bio->bi_next = nxt;
	bio->bi_phys_segments = rq.nr_phys_segments;
	bio->bi_hw_segments = rq.nr_hw_segments;
	bio->bi_flags |= (1 << BIO_SEG_VALID);
}
EXPORT_SYMBOL(blk_recount_segments);

static void blk_recalc_rq_segments(struct request *rq)
{
	int nr_phys_segs;
	int nr_hw_segs;
	unsigned int phys_size;
	unsigned int hw_size;
	struct bio_vec *bv, *bvprv = NULL;
	int seg_size;
	int hw_seg_size;
	int cluster;
	struct req_iterator iter;
	int high, highprv = 1;
	struct request_queue *q = rq->q;

	if (!rq->bio)
		return;

	cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER);
	hw_seg_size = seg_size = 0;
	phys_size = hw_size = nr_phys_segs = nr_hw_segs = 0;
	rq_for_each_segment(bv, rq, iter) {
		/*
		 * the trick here is making sure that a high page is never
		 * considered part of another segment, since that might
		 * change with the bounce page.
		 */
		high = page_to_pfn(bv->bv_page) > q->bounce_pfn;
		if (high || highprv)
			goto new_hw_segment;
		if (cluster) {
			if (seg_size + bv->bv_len > q->max_segment_size)
				goto new_segment;
			if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv))
				goto new_segment;
			if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv))
				goto new_segment;
			if (BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len))
				goto new_hw_segment;

			seg_size += bv->bv_len;
			hw_seg_size += bv->bv_len;
			bvprv = bv;
			continue;
		}
new_segment:
		if (BIOVEC_VIRT_MERGEABLE(bvprv, bv) &&
		    !BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len))
			hw_seg_size += bv->bv_len;
		else {
new_hw_segment:
			if (nr_hw_segs == 1 &&
			    hw_seg_size > rq->bio->bi_hw_front_size)
				rq->bio->bi_hw_front_size = hw_seg_size;
			hw_seg_size = BIOVEC_VIRT_START_SIZE(bv) + bv->bv_len;
			nr_hw_segs++;
		}

		nr_phys_segs++;
		bvprv = bv;
		seg_size = bv->bv_len;
		highprv = high;
	}

	if (nr_hw_segs == 1 &&
	    hw_seg_size > rq->bio->bi_hw_front_size)
		rq->bio->bi_hw_front_size = hw_seg_size;
	if (hw_seg_size > rq->biotail->bi_hw_back_size)
		rq->biotail->bi_hw_back_size = hw_seg_size;
	rq->nr_phys_segments = nr_phys_segs;
	rq->nr_hw_segments = nr_hw_segs;
}

static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
				   struct bio *nxt)
{
	if (!(q->queue_flags & (1 << QUEUE_FLAG_CLUSTER)))
		return 0;

	if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)))
		return 0;
	if (bio->bi_size + nxt->bi_size > q->max_segment_size)
		return 0;

	/*
	 * bio and nxt are contiguous in memory, check if the queue allows
	 * these two to be merged into one
	 */
	if (BIO_SEG_BOUNDARY(q, bio, nxt))
		return 1;

	return 0;
}

static int blk_hw_contig_segment(struct request_queue *q, struct bio *bio,
				 struct bio *nxt)
{
	if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
		blk_recount_segments(q, bio);
	if (unlikely(!bio_flagged(nxt, BIO_SEG_VALID)))
		blk_recount_segments(q, nxt);
	if (!BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)) ||
	    BIOVEC_VIRT_OVERSIZE(bio->bi_hw_back_size + nxt->bi_hw_front_size))
		return 0;
	if (bio->bi_hw_back_size + nxt->bi_hw_front_size > q->max_segment_size)
		return 0;

	return 1;
}

/*
 * map a request to scatterlist, return number of sg entries setup. Caller
 * must make sure sg can hold rq->nr_phys_segments entries
 */
int blk_rq_map_sg(struct request_queue *q, struct request *rq,
		  struct scatterlist *sg)
{
	struct bio_vec *bvec, *bvprv;
	struct req_iterator iter;
	int nsegs, cluster;

	nsegs = 0;
	cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER);

	/*
	 * for each bio in rq
	 */
	bvprv = NULL;
	rq_for_each_segment(bvec, rq, iter) {
			int nbytes = bvec->bv_len;

			if (bvprv && cluster) {
				if (sg[nsegs - 1].length + nbytes > q->max_segment_size)
					goto new_segment;

				if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
					goto new_segment;
				if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec))
					goto new_segment;

				sg[nsegs - 1].length += nbytes;
			} else {
new_segment:
				memset(&sg[nsegs],0,sizeof(struct scatterlist));
				sg[nsegs].page = bvec->bv_page;
				sg[nsegs].length = nbytes;
				sg[nsegs].offset = bvec->bv_offset;

				nsegs++;
			}
			bvprv = bvec;
	} /* segments in rq */

	return nsegs;
}

EXPORT_SYMBOL(blk_rq_map_sg);
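
/*
 * Illustrative sketch (editor's note, not from the original file): a
 * driver would typically map a request into a preallocated scatterlist
 * sized for q->max_phys_segments and hand the result to its DMA engine.
 * EXAMPLE_MAX_SEGMENTS and example_program_dma() are placeholders.
 *
 *	struct scatterlist sgl[EXAMPLE_MAX_SEGMENTS];
 *	int count = blk_rq_map_sg(q, rq, sgl);
 *	example_program_dma(sgl, count);	// driver-specific
 */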

/*
 * the standard queue merge functions, can be overridden with device
 * specific ones if so desired
 */

static inline int ll_new_mergeable(struct request_queue *q,
				   struct request *req,
				   struct bio *bio)
{
	int nr_phys_segs = bio_phys_segments(q, bio);

	if (req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
		req->cmd_flags |= REQ_NOMERGE;
		if (req == q->last_merge)
			q->last_merge = NULL;
		return 0;
	}

	/*
	 * A hw segment is just getting larger, bump just the phys
	 * counter.
	 */
	req->nr_phys_segments += nr_phys_segs;
	return 1;
}

static inline int ll_new_hw_segment(struct request_queue *q,
				    struct request *req,
				    struct bio *bio)
{
	int nr_hw_segs = bio_hw_segments(q, bio);
	int nr_phys_segs = bio_phys_segments(q, bio);

	if (req->nr_hw_segments + nr_hw_segs > q->max_hw_segments
	    || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
		req->cmd_flags |= REQ_NOMERGE;
		if (req == q->last_merge)
			q->last_merge = NULL;
		return 0;
	}

	/*
	 * This will form the start of a new hw segment.  Bump both
	 * counters.
	 */
	req->nr_hw_segments += nr_hw_segs;
	req->nr_phys_segments += nr_phys_segs;
	return 1;
}

static int ll_back_merge_fn(struct request_queue *q, struct request *req,
			    struct bio *bio)
{
	unsigned short max_sectors;
	int len;

	if (unlikely(blk_pc_request(req)))
		max_sectors = q->max_hw_sectors;
	else
		max_sectors = q->max_sectors;

	if (req->nr_sectors + bio_sectors(bio) > max_sectors) {
		req->cmd_flags |= REQ_NOMERGE;
		if (req == q->last_merge)
			q->last_merge = NULL;
		return 0;
	}
	if (unlikely(!bio_flagged(req->biotail, BIO_SEG_VALID)))
		blk_recount_segments(q, req->biotail);
	if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
		blk_recount_segments(q, bio);
	len = req->biotail->bi_hw_back_size + bio->bi_hw_front_size;
	if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail), __BVEC_START(bio)) &&
	    !BIOVEC_VIRT_OVERSIZE(len)) {
		int mergeable =  ll_new_mergeable(q, req, bio);

		if (mergeable) {
			if (req->nr_hw_segments == 1)
				req->bio->bi_hw_front_size = len;
			if (bio->bi_hw_segments == 1)
				bio->bi_hw_back_size = len;
		}
		return mergeable;
	}

	return ll_new_hw_segment(q, req, bio);
}

static int ll_front_merge_fn(struct request_queue *q, struct request *req,
			     struct bio *bio)
{
	unsigned short max_sectors;
	int len;

	if (unlikely(blk_pc_request(req)))
		max_sectors = q->max_hw_sectors;
	else
		max_sectors = q->max_sectors;

	if (req->nr_sectors + bio_sectors(bio) > max_sectors) {
		req->cmd_flags |= REQ_NOMERGE;
		if (req == q->last_merge)
			q->last_merge = NULL;
		return 0;
	}
	len = bio->bi_hw_back_size + req->bio->bi_hw_front_size;
	if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
		blk_recount_segments(q, bio);
	if (unlikely(!bio_flagged(req->bio, BIO_SEG_VALID)))
		blk_recount_segments(q, req->bio);
	if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(req->bio)) &&
	    !BIOVEC_VIRT_OVERSIZE(len)) {
		int mergeable =  ll_new_mergeable(q, req, bio);

		if (mergeable) {
			if (bio->bi_hw_segments == 1)
				bio->bi_hw_front_size = len;
			if (req->nr_hw_segments == 1)
				req->biotail->bi_hw_back_size = len;
		}
		return mergeable;
	}

	return ll_new_hw_segment(q, req, bio);
}

static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
				struct request *next)
{
	int total_phys_segments;
	int total_hw_segments;

	/*
	 * First check whether either of the requests is a re-queued
	 * request.  Can't merge them if so.
	 */
	if (req->special || next->special)
		return 0;

	/*
	 * Will it become too large?
	 */
	if ((req->nr_sectors + next->nr_sectors) > q->max_sectors)
		return 0;

	total_phys_segments = req->nr_phys_segments + next->nr_phys_segments;
	if (blk_phys_contig_segment(q, req->biotail, next->bio))
		total_phys_segments--;

	if (total_phys_segments > q->max_phys_segments)
		return 0;

	total_hw_segments = req->nr_hw_segments + next->nr_hw_segments;
	if (blk_hw_contig_segment(q, req->biotail, next->bio)) {
		int len = req->biotail->bi_hw_back_size + next->bio->bi_hw_front_size;
		/*
		 * propagate the combined length to the end of the requests
		 */
		if (req->nr_hw_segments == 1)
			req->bio->bi_hw_front_size = len;
		if (next->nr_hw_segments == 1)
			next->biotail->bi_hw_back_size = len;
		total_hw_segments--;
	}

	if (total_hw_segments > q->max_hw_segments)
		return 0;

	/* Merge is OK... */
	req->nr_phys_segments = total_phys_segments;
	req->nr_hw_segments = total_hw_segments;
	return 1;
}

/*
 * "plug" the device if there are no outstanding requests: this will
 * force the transfer to start only after we have put all the requests
 * on the list.
 *
 * This is called with interrupts off and no requests on the queue and
 * with the queue lock held.
 */
void blk_plug_device(struct request_queue *q)
{
	WARN_ON(!irqs_disabled());

	/*
	 * don't plug a stopped queue, it must be paired with blk_start_queue()
	 * which will restart the queueing
	 */
	if (blk_queue_stopped(q))
		return;

	if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) {
		mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
		blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG);
	}