Commit 320ae51f authored by Jens Axboe's avatar Jens Axboe

blk-mq: new multi-queue block IO queueing mechanism

Linux currently has two models for block devices:

- The classic request_fn based approach, where drivers use struct
  request units for IO. The block layer provides various helper
  functionalities to let drivers share code, things like tag
  management, timeout handling, queueing, etc.

- The "stacked" approach, where a driver squeezes in between the
  block layer and IO submitter. Since this bypasses the IO stack,
  driver generally have to manage everything themselves.

With drivers being written for new high IOPS devices, the classic
request_fn based driver doesn't work well enough. The design dates
back to when both SMP and high IOPS was rare. It has problems with
scaling to bigger machines, and runs into scaling issues even on
smaller machines when you have IOPS in the hundreds of thousands
per device.

The stacked approach is then most often selected as the model
for the driver. But this means that everybody has to re-invent
everything, and along with that we get all the problems again
that the shared approach solved.

This commit introduces blk-mq, block multi queue support. The
design is centered around per-cpu queues for queueing IO, which
then funnel down into x number of hardware submission queues.
We might have a 1:1 mapping between the two, or it might be
an N:M mapping. That all depends on what the hardware supports.

blk-mq provides various helper functions, which include:

- Scalable support for request tagging. Most devices need to
  be able to uniquely identify a request both in the driver and
  to the hardware. The tagging uses per-cpu caches for freed
  tags, to enable cache hot reuse.

- Timeout handling without tracking request on a per-device
  basis. Basically the driver should be able to get a notification,
  if a request happens to fail.

- Optional support for non 1:1 mappings between issue and
  submission queues. blk-mq can redirect IO completions to the
  desired location.

- Support for per-request payloads. Drivers almost always need
  to associate a request structure with some driver private
  command structure. Drivers can tell blk-mq this at init time,
  and then any request handed to the driver will have the
  required size of memory associated with it.

- Support for merging of IO, and plugging. The stacked model
  gets neither of these. Even for high IOPS devices, merging
  sequential IO reduces per-command overhead and thus
  increases bandwidth.

For now, this is provided as a potential 3rd queueing model, with
the hope being that, as it matures, it can replace both the classic
and stacked model. That would get us back to having just 1 real
model for block devices, leaving the stacked approach to dm/md
devices (as it was originally intended).

Contributions in this patch from the following people:

Shaohua Li <shli@fusionio.com>
Alexander Gordeev <agordeev@redhat.com>
Christoph Hellwig <hch@infradead.org>
Mike Christie <michaelc@cs.wisc.edu>
Matias Bjorling <m@bjorling.me>
Jeff Moyer <jmoyer@redhat.com>
Acked-by: default avatarChristoph Hellwig <hch@lst.de>
Signed-off-by: default avatarJens Axboe <axboe@kernel.dk>
parent 1dddc01a
......@@ -5,8 +5,9 @@
obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o \
partition-generic.o partitions/
blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \
blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
genhd.o scsi_ioctl.o partition-generic.o partitions/
obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o
......
......@@ -16,6 +16,7 @@
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/kernel_stat.h>
......@@ -48,7 +49,7 @@ DEFINE_IDA(blk_queue_ida);
/*
* For the allocated request tables
*/
static struct kmem_cache *request_cachep;
struct kmem_cache *request_cachep = NULL;
/*
* For queue allocation
......@@ -60,42 +61,6 @@ struct kmem_cache *blk_requestq_cachep;
*/
static struct workqueue_struct *kblockd_workqueue;
static void drive_stat_acct(struct request *rq, int new_io)
{
struct hd_struct *part;
int rw = rq_data_dir(rq);
int cpu;
if (!blk_do_io_stat(rq))
return;
cpu = part_stat_lock();
if (!new_io) {
part = rq->part;
part_stat_inc(cpu, part, merges[rw]);
} else {
part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
if (!hd_struct_try_get(part)) {
/*
* The partition is already being removed,
* the request will be accounted on the disk only
*
* We take a reference on disk->part0 although that
* partition will never be deleted, so we can treat
* it as any other partition.
*/
part = &rq->rq_disk->part0;
hd_struct_get(part);
}
part_round_stats(cpu, part);
part_inc_in_flight(part, rw);
rq->part = part;
}
part_stat_unlock();
}
void blk_queue_congestion_threshold(struct request_queue *q)
{
int nr;
......@@ -594,9 +559,12 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
if (!q)
return NULL;
if (percpu_counter_init(&q->mq_usage_counter, 0))
goto fail_q;
q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
if (q->id < 0)
goto fail_q;
goto fail_c;
q->backing_dev_info.ra_pages =
(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
......@@ -643,6 +611,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
q->bypass_depth = 1;
__set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
init_waitqueue_head(&q->mq_freeze_wq);
if (blkcg_init_queue(q))
goto fail_id;
......@@ -650,6 +620,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
fail_id:
ida_simple_remove(&blk_queue_ida, q->id);
fail_c:
percpu_counter_destroy(&q->mq_usage_counter);
fail_q:
kmem_cache_free(blk_requestq_cachep, q);
return NULL;
......@@ -1108,7 +1080,8 @@ retry:
goto retry;
}
struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
static struct request *blk_old_get_request(struct request_queue *q, int rw,
gfp_t gfp_mask)
{
struct request *rq;
......@@ -1125,6 +1098,14 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
return rq;
}
struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
{
if (q->mq_ops)
return blk_mq_alloc_request(q, rw, gfp_mask);
else
return blk_old_get_request(q, rw, gfp_mask);
}
EXPORT_SYMBOL(blk_get_request);
/**
......@@ -1210,7 +1191,7 @@ EXPORT_SYMBOL(blk_requeue_request);
static void add_acct_request(struct request_queue *q, struct request *rq,
int where)
{
drive_stat_acct(rq, 1);
blk_account_io_start(rq, true);
__elv_add_request(q, rq, where);
}
......@@ -1299,12 +1280,17 @@ EXPORT_SYMBOL_GPL(__blk_put_request);
void blk_put_request(struct request *req)
{
unsigned long flags;
struct request_queue *q = req->q;
spin_lock_irqsave(q->queue_lock, flags);
__blk_put_request(q, req);
spin_unlock_irqrestore(q->queue_lock, flags);
if (q->mq_ops)
blk_mq_free_request(req);
else {
unsigned long flags;
spin_lock_irqsave(q->queue_lock, flags);
__blk_put_request(q, req);
spin_unlock_irqrestore(q->queue_lock, flags);
}
}
EXPORT_SYMBOL(blk_put_request);
......@@ -1340,8 +1326,8 @@ void blk_add_request_payload(struct request *rq, struct page *page,
}
EXPORT_SYMBOL_GPL(blk_add_request_payload);
static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
struct bio *bio)
bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
struct bio *bio)
{
const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
......@@ -1358,12 +1344,12 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
req->__data_len += bio->bi_size;
req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
drive_stat_acct(req, 0);
blk_account_io_start(req, false);
return true;
}
static bool bio_attempt_front_merge(struct request_queue *q,
struct request *req, struct bio *bio)
bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
struct bio *bio)
{
const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
......@@ -1388,12 +1374,12 @@ static bool bio_attempt_front_merge(struct request_queue *q,
req->__data_len += bio->bi_size;
req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
drive_stat_acct(req, 0);
blk_account_io_start(req, false);
return true;
}
/**
* attempt_plug_merge - try to merge with %current's plugged list
* blk_attempt_plug_merge - try to merge with %current's plugged list
* @q: request_queue new bio is being queued at
* @bio: new bio being queued
* @request_count: out parameter for number of traversed plugged requests
......@@ -1409,8 +1395,8 @@ static bool bio_attempt_front_merge(struct request_queue *q,
* reliable access to the elevator outside queue lock. Only check basic
* merging parameters without querying the elevator.
*/
static bool attempt_plug_merge(struct request_queue *q, struct bio *bio,
unsigned int *request_count)
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
unsigned int *request_count)
{
struct blk_plug *plug;
struct request *rq;
......@@ -1489,7 +1475,7 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio)
* Check if we can merge with the plugged list before grabbing
* any locks.
*/
if (attempt_plug_merge(q, bio, &request_count))
if (blk_attempt_plug_merge(q, bio, &request_count))
return;
spin_lock_irq(q->queue_lock);
......@@ -1557,7 +1543,7 @@ get_rq:
}
}
list_add_tail(&req->queuelist, &plug->list);
drive_stat_acct(req, 1);
blk_account_io_start(req, true);
} else {
spin_lock_irq(q->queue_lock);
add_acct_request(q, req, where);
......@@ -2011,7 +1997,7 @@ unsigned int blk_rq_err_bytes(const struct request *rq)
}
EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
static void blk_account_io_completion(struct request *req, unsigned int bytes)
void blk_account_io_completion(struct request *req, unsigned int bytes)
{
if (blk_do_io_stat(req)) {
const int rw = rq_data_dir(req);
......@@ -2025,7 +2011,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
}
}
static void blk_account_io_done(struct request *req)
void blk_account_io_done(struct request *req)
{
/*
* Account IO completion. flush_rq isn't accounted as a
......@@ -2073,6 +2059,42 @@ static inline struct request *blk_pm_peek_request(struct request_queue *q,
}
#endif
void blk_account_io_start(struct request *rq, bool new_io)
{
struct hd_struct *part;
int rw = rq_data_dir(rq);
int cpu;
if (!blk_do_io_stat(rq))
return;
cpu = part_stat_lock();
if (!new_io) {
part = rq->part;
part_stat_inc(cpu, part, merges[rw]);
} else {
part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
if (!hd_struct_try_get(part)) {
/*
* The partition is already being removed,
* the request will be accounted on the disk only
*
* We take a reference on disk->part0 although that
* partition will never be deleted, so we can treat
* it as any other partition.
*/
part = &rq->rq_disk->part0;
hd_struct_get(part);
}
part_round_stats(cpu, part);
part_inc_in_flight(part, rw);
rq->part = part;
}
part_stat_unlock();
}
/**
* blk_peek_request - peek at the top of a request queue
* @q: request queue to peek at
......@@ -2448,7 +2470,6 @@ static void blk_finish_request(struct request *req, int error)
if (req->cmd_flags & REQ_DONTPREP)
blk_unprep_request(req);
blk_account_io_done(req);
if (req->end_io)
......@@ -2870,6 +2891,7 @@ void blk_start_plug(struct blk_plug *plug)
plug->magic = PLUG_MAGIC;
INIT_LIST_HEAD(&plug->list);
INIT_LIST_HEAD(&plug->mq_list);
INIT_LIST_HEAD(&plug->cb_list);
/*
......@@ -2967,6 +2989,10 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
BUG_ON(plug->magic != PLUG_MAGIC);
flush_plug_callbacks(plug, from_schedule);
if (!list_empty(&plug->mq_list))
blk_mq_flush_plug_list(plug, from_schedule);
if (list_empty(&plug->list))
return;
......
......@@ -5,6 +5,7 @@
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/sched/sysctl.h>
#include "blk.h"
......@@ -58,6 +59,12 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
rq->rq_disk = bd_disk;
rq->end_io = done;
if (q->mq_ops) {
blk_mq_insert_request(q, rq, true);
return;
}
/*
* need to check this before __blk_run_queue(), because rq can
* be freed before that returns.
......
......@@ -69,8 +69,10 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/gfp.h>
#include <linux/blk-mq.h>
#include "blk.h"
#include "blk-mq.h"
/* FLUSH/FUA sequences */
enum {
......@@ -124,6 +126,24 @@ static void blk_flush_restore_request(struct request *rq)
/* make @rq a normal request */
rq->cmd_flags &= ~REQ_FLUSH_SEQ;
rq->end_io = rq->flush.saved_end_io;
blk_clear_rq_complete(rq);
}
static void mq_flush_data_run(struct work_struct *work)
{
struct request *rq;
rq = container_of(work, struct request, mq_flush_data);
memset(&rq->csd, 0, sizeof(rq->csd));
blk_mq_run_request(rq, true, false);
}
static void blk_mq_flush_data_insert(struct request *rq)
{
INIT_WORK(&rq->mq_flush_data, mq_flush_data_run);
kblockd_schedule_work(rq->q, &rq->mq_flush_data);
}
/**
......@@ -136,7 +156,7 @@ static void blk_flush_restore_request(struct request *rq)
* completion and trigger the next step.
*
* CONTEXT:
* spin_lock_irq(q->queue_lock)
* spin_lock_irq(q->queue_lock or q->mq_flush_lock)
*
* RETURNS:
* %true if requests were added to the dispatch queue, %false otherwise.
......@@ -146,7 +166,7 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
{
struct request_queue *q = rq->q;
struct list_head *pending = &q->flush_queue[q->flush_pending_idx];
bool queued = false;
bool queued = false, kicked;
BUG_ON(rq->flush.seq & seq);
rq->flush.seq |= seq;
......@@ -167,8 +187,12 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
case REQ_FSEQ_DATA:
list_move_tail(&rq->flush.list, &q->flush_data_in_flight);
list_add(&rq->queuelist, &q->queue_head);
queued = true;
if (q->mq_ops)
blk_mq_flush_data_insert(rq);
else {
list_add(&rq->queuelist, &q->queue_head);
queued = true;
}
break;
case REQ_FSEQ_DONE:
......@@ -181,28 +205,43 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
BUG_ON(!list_empty(&rq->queuelist));
list_del_init(&rq->flush.list);
blk_flush_restore_request(rq);
__blk_end_request_all(rq, error);
if (q->mq_ops)
blk_mq_end_io(rq, error);
else
__blk_end_request_all(rq, error);
break;
default:
BUG();
}
return blk_kick_flush(q) | queued;
kicked = blk_kick_flush(q);
/* blk_mq_run_flush will run queue */
if (q->mq_ops)
return queued;
return kicked | queued;
}
static void flush_end_io(struct request *flush_rq, int error)
{
struct request_queue *q = flush_rq->q;
struct list_head *running = &q->flush_queue[q->flush_running_idx];
struct list_head *running;
bool queued = false;
struct request *rq, *n;
unsigned long flags = 0;
if (q->mq_ops) {
blk_mq_free_request(flush_rq);
spin_lock_irqsave(&q->mq_flush_lock, flags);
}
running = &q->flush_queue[q->flush_running_idx];
BUG_ON(q->flush_pending_idx == q->flush_running_idx);
/* account completion of the flush request */
q->flush_running_idx ^= 1;
elv_completed_request(q, flush_rq);
if (!q->mq_ops)
elv_completed_request(q, flush_rq);
/* and push the waiting requests to the next stage */
list_for_each_entry_safe(rq, n, running, flush.list) {
......@@ -223,9 +262,48 @@ static void flush_end_io(struct request *flush_rq, int error)
* directly into request_fn may confuse the driver. Always use
* kblockd.
*/
if (queued || q->flush_queue_delayed)
blk_run_queue_async(q);
if (queued || q->flush_queue_delayed) {
if (!q->mq_ops)
blk_run_queue_async(q);
else
/*
* This can be optimized to only run queues with requests
* queued if necessary.
*/
blk_mq_run_queues(q, true);
}
q->flush_queue_delayed = 0;
if (q->mq_ops)
spin_unlock_irqrestore(&q->mq_flush_lock, flags);
}
static void mq_flush_work(struct work_struct *work)
{
struct request_queue *q;
struct request *rq;
q = container_of(work, struct request_queue, mq_flush_work);
/* We don't need set REQ_FLUSH_SEQ, it's for consistency */
rq = blk_mq_alloc_request(q, WRITE_FLUSH|REQ_FLUSH_SEQ,
__GFP_WAIT|GFP_ATOMIC);
rq->cmd_type = REQ_TYPE_FS;
rq->end_io = flush_end_io;
blk_mq_run_request(rq, true, false);
}
/*
* We can't directly use q->flush_rq, because it doesn't have tag and is not in
* hctx->rqs[]. so we must allocate a new request, since we can't sleep here,
* so offload the work to workqueue.
*
* Note: we assume a flush request finished in any hardware queue will flush
* the whole disk cache.
*/
static void mq_run_flush(struct request_queue *q)
{
kblockd_schedule_work(q, &q->mq_flush_work);
}
/**
......@@ -236,7 +314,7 @@ static void flush_end_io(struct request *flush_rq, int error)
* Please read the comment at the top of this file for more info.
*
* CONTEXT:
* spin_lock_irq(q->queue_lock)
* spin_lock_irq(q->queue_lock or q->mq_flush_lock)
*
* RETURNS:
* %true if flush was issued, %false otherwise.
......@@ -261,13 +339,18 @@ static bool blk_kick_flush(struct request_queue *q)
* Issue flush and toggle pending_idx. This makes pending_idx
* different from running_idx, which means flush is in flight.
*/
q->flush_pending_idx ^= 1;
if (q->mq_ops) {
mq_run_flush(q);
return true;
}
blk_rq_init(q, &q->flush_rq);
q->flush_rq.cmd_type = REQ_TYPE_FS;
q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
q->flush_rq.rq_disk = first_rq->rq_disk;
q->flush_rq.end_io = flush_end_io;
q->flush_pending_idx ^= 1;
list_add_tail(&q->flush_rq.queuelist, &q->queue_head);
return true;
}
......@@ -284,16 +367,37 @@ static void flush_data_end_io(struct request *rq, int error)
blk_run_queue_async(q);
}
static void mq_flush_data_end_io(struct request *rq, int error)
{
struct request_queue *q = rq->q;
struct blk_mq_hw_ctx *hctx;
struct blk_mq_ctx *ctx;
unsigned long flags;
ctx = rq->mq_ctx;
hctx = q->mq_ops->map_queue(q, ctx->cpu);
/*
* After populating an empty queue, kick it to avoid stall. Read
* the comment in flush_end_io().
*/
spin_lock_irqsave(&q->mq_flush_lock, flags);
if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error))
blk_mq_run_hw_queue(hctx, true);
spin_unlock_irqrestore(&q->mq_flush_lock, flags);
}
/**
* blk_insert_flush - insert a new FLUSH/FUA request
* @rq: request to insert
*
* To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions.
* or __blk_mq_run_hw_queue() to dispatch request.
* @rq is being submitted. Analyze what needs to be done and put it on the
* right queue.
*
* CONTEXT:
* spin_lock_irq(q->queue_lock)
* spin_lock_irq(q->queue_lock) in !mq case
*/
void blk_insert_flush(struct request *rq)
{
......@@ -316,7 +420,10 @@ void blk_insert_flush(struct request *rq)
* complete the request.
*/
if (!policy) {
__blk_end_bidi_request(rq, 0, 0, 0);
if (q->mq_ops)
blk_mq_end_io(rq, 0);
else
__blk_end_bidi_request(rq, 0, 0, 0);
return;
}
......@@ -329,7 +436,10 @@ void blk_insert_flush(struct request *rq)
*/
if ((policy & REQ_FSEQ_DATA) &&
!(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
list_add_tail(&rq->queuelist, &q->queue_head);
if (q->mq_ops) {
blk_mq_run_request(rq, false, true);
} else
list_add_tail(&rq->queuelist, &q->queue_head);
return;
}
......@@ -341,6 +451,14 @@ void blk_insert_flush(struct request *rq)
INIT_LIST_HEAD(&rq->flush.list);
rq->cmd_flags |= REQ_FLUSH_SEQ;
rq->flush.saved_end_io = rq->end_io; /* Usually NULL */
if (q->mq_ops) {
rq->end_io = mq_flush_data_end_io;
spin_lock_irq(&q->mq_flush_lock);
blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);
spin_unlock_irq(&q->mq_flush_lock);
return;
}
rq->end_io = flush_data_end_io;
blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);
......@@ -453,3 +571,9 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
return ret;
}
EXPORT_SYMBOL(blkdev_issue_flush);
void blk_mq_init_flush(struct request_queue *q)
{
spin_lock_init(&q->mq_flush_lock);
INIT_WORK(&q->mq_flush_work, mq_flush_work);
}
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/list.h>
#include <linux/llist.h>
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/blk-mq.h>
#include "blk-mq.h"
static LIST_HEAD(blk_mq_cpu_notify_list);
static DEFINE_SPINLOCK(blk_mq_cpu_notify_lock);
static int __cpuinit blk_mq_main_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long) hcpu;
struct blk_mq_cpu_notifier *notify;
spin_lock(&blk_mq_cpu_notify_lock);
list_for_each_entry(notify, &blk_mq_cpu_notify_list, list)
notify->notify(notify->data, action, cpu);
spin_unlock(&blk_mq_cpu_notify_lock);
return NOTIFY_OK;
}
static void __cpuinit blk_mq_cpu_notify(void *data, unsigned long action,
unsigned int cpu)
{
if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
/*
* If the CPU goes away, ensure that we run any pending
* completions.
*/
struct llist_node *node;
struct request *rq;
local_irq_disable();
node = llist_del_all(&per_cpu(ipi_lists, cpu));
while (node) {
struct llist_node *next = node->next;
rq = llist_entry(node, struct request, ll_list);
__blk_mq_end_io(rq, rq->errors);
node = next;
}
local_irq_enable();
}
}
static struct notifier_block __cpuinitdata blk_mq_main_cpu_notifier = {
.notifier_call = blk_mq_main_cpu_notify,
};
void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
{
BUG_ON(!notifier->notify);
spin_lock(&blk_mq_cpu_notify_lock);
list_add_tail(&notifier->list, &blk_mq_cpu_notify_list);
spin_unlock(&blk_mq_cpu_notify_lock);
}
void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
{
spin_lock(&blk_mq_cpu_notify_lock);
list_del(&notifier->list);
spin_unlock(&blk_mq_cpu_notify_lock);
}
void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
void (*fn)(void *, unsigned long, unsigned int),
void *data)
{
notifier->notify = fn;
notifier->data = data;
}
static struct blk_mq_cpu_notifier __cpuinitdata cpu_notifier = {
.notify = blk_mq_cpu_notify,
};
void __init blk_mq_cpu_init(void)
{
register_hotcpu_notifier(&blk_mq_main_cpu_notifier);
blk_mq_register_cpu_notifier(&cpu_notifier);
}
#include <linux/kernel.h>
#include <linux/threads.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/blk-mq.h>
#include "blk.h"
#include "blk-mq.h"
static void show_map(unsigned int *map, unsigned int nr)
{
int i;
pr_info("blk-mq: CPU -> queue map\n");
for_each_online_cpu(i)
pr_info(" CPU%2u -> Queue %u\n", i, map[i]);
}
static int cpu_to_queue_index(unsigned int nr_cpus, unsigned int nr_queues,
const int cpu)
{
return cpu / ((nr_cpus + nr_queues - 1) / nr_queues);
}
static int get_first_sibling(unsigned int cpu)
{
unsigned int ret;
ret = cpumask_first(topology_thread_cpumask(cpu));
if (ret < nr_cpu_ids)
return ret;
return cpu;
}
int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
{
unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling;
cpumask_var_t cpus;
if (!alloc_cpumask_var(&cpus, GFP_ATOMIC))
return 1;
cpumask_clear(cpus);