1、首先看一下blk-mq的处理流程图（不同内核版本之间会有一些差异，但整体结构基本一样）

从整个流程图可以看到，主要是分为三个部分：（1）初始化硬件设备的相关参数（2）初始化请求队列request_queue（3）bio请求的处理过程。前面两个过程主要是完成底层存储设备向文件系统的注册，同时完成软硬队列映射关系等初始化，后一个部分是bio在MQ机制最后生成对应子请求并挂载在硬件队列上的过程。

2、接下来我们来分析具体的函数

（1）int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)——为一个或者多个request queue分配tag

/** Alloc a tag set to be associated with one or more request queues.* May fail with EINVAL for various error conditions. May adjust the* requested depth down, if it's too large. In that case, the set* value will be stored in set->queue_depth.*/
//为一个或者多个请求队列分配tag（tag set可以是多个request queue共享的）int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
{int i, ret;BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);if (!set->nr_hw_queues)             //硬件队列身量return -EINVAL; if (!set->queue_depth)              //软件队列深度return -EINVAL;if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)       //块设备保留的tag数return -EINVAL;if (!set->ops->queue_rq)                  //ops表示块设备驱动的抽象集合return -EINVAL;if (!set->ops->get_budget ^ !set->ops->put_budget)return -EINVAL;if (set->queue_depth > BLK_MQ_MAX_DEPTH) {pr_info("blk-mq: reduced tag depth to %u\n",BLK_MQ_MAX_DEPTH);set->queue_depth = BLK_MQ_MAX_DEPTH;}if (!set->nr_maps)                         //映射表数量set->nr_maps = 1;else if (set->nr_maps > HCTX_MAX_TYPES)return -EINVAL;/** If a crashdump is active, then we are potentially in a very* memory constrained environment. Limit us to 1 queue and* 64 tags to prevent using too much memory.*/if (is_kdump_kernel()) {set->nr_hw_queues = 1;set->nr_maps = 1;set->queue_depth = min(64U, set->queue_depth);}/** There is no use for more h/w queues than cpus if we just have* a single map*/if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)set->nr_hw_queues = nr_cpu_ids;set->tags = kcalloc_node(nr_hw_queues(set), sizeof(struct blk_mq_tags *),GFP_KERNEL, set->numa_node);if (!set->tags)return -ENOMEM;ret = -ENOMEM;for (i = 0; i < set->nr_maps; i++) {set->map[i].mq_map = kcalloc_node(nr_cpu_ids,sizeof(set->map[i].mq_map[0]),GFP_KERNEL, set->numa_node);if (!set->map[i].mq_map)goto out_free_mq_map;set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues;}ret = blk_mq_update_queue_map(set);               //更新映射表if (ret)goto out_free_mq_map;ret = blk_mq_alloc_rq_maps(set);                  //分配request和tagif (ret)goto out_free_mq_map;mutex_init(&set->tag_list_lock);INIT_LIST_HEAD(&set->tag_list);return 0;out_free_mq_map:for (i = 0; i < set->nr_maps; i++) {kfree(set->map[i].mq_map);set->map[i].mq_map = NULL;}kfree(set->tags);set->tags = NULL;return ret;
}
EXPORT_SYMBOL(blk_mq_alloc_tag_set);

（2）static struct request *blk_mq_get_request(struct request_queue *q,
struct bio *bio,
struct blk_mq_alloc_data *data)——为bio分配request

//为bio分配request
static struct request *blk_mq_get_request(struct request_queue *q,struct bio *bio,struct blk_mq_alloc_data *data)
{struct elevator_queue *e = q->elevator;struct request *rq;unsigned int tag;bool clear_ctx_on_error = false;u64 alloc_time_ns = 0;blk_queue_enter_live(q);/* alloc_time includes depth and tag waits */if (blk_queue_rq_alloc_time(q))alloc_time_ns = ktime_get_ns();data->q = q;if (likely(!data->ctx)) {data->ctx = blk_mq_get_ctx(q);                     //获取当前cpu的软件队列(ctx)clear_ctx_on_error = true;}if (likely(!data->hctx))data->hctx = blk_mq_map_queue(q, data->cmd_flags,      //找到软件队列(ctx)对应的硬件队列(hctx)data->ctx);if (data->cmd_flags & REQ_NOWAIT)data->flags |= BLK_MQ_REQ_NOWAIT;if (e) {data->flags |= BLK_MQ_REQ_INTERNAL;/** Flush requests are special and go directly to the* dispatch list. Don't include reserved tags in the* limiting, as it isn't useful.*/if (!op_is_flush(data->cmd_flags) &&e->type->ops.limit_depth &&!(data->flags & BLK_MQ_REQ_RESERVED))e->type->ops.limit_depth(data->cmd_flags, data);} else {blk_mq_tag_busy(data->hctx);}tag = blk_mq_get_tag(data);                      //获取tag，可能因为当前无可用tag进入iowait状态   if (tag == BLK_MQ_TAG_FAIL) {if (clear_ctx_on_error)data->ctx = NULL;blk_queue_exit(q);return NULL;}rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags, alloc_time_ns);              //初始化tag对应的requestif (!op_is_flush(data->cmd_flags)) {rq->elv.icq = NULL;if (e && e->type->ops.prepare_request) {if (e->type->icq_cache)blk_mq_sched_assign_ioc(rq);e->type->ops.prepare_request(rq, bio);rq->rq_flags |= RQF_ELVPRIV;}}data->hctx->queued++;return rq;
}

（3）struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)——基于blk_mq的块设备驱动初始化时，调用blk_mq_init_queue初始化IO请求队列（request_queue）

struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
{struct request_queue *uninit_q, *q;uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);         //分配请求队列的内存，所分配内存结点与设备NUMA节点一致，避免远端内存访问问题if (!uninit_q)return ERR_PTR(-ENOMEM);/** Initialize the queue without an elevator. device_add_disk() will do* the initialization.*/q = blk_mq_init_allocated_queue(set, uninit_q, false);              //初始化分配的请求队列，即软件队列和硬件队列if (IS_ERR(q))blk_cleanup_queue(uninit_q);return q;
}

（4）、static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)——将上层提交的bio封装成request并提交到块设备层

//将上层提交的bio封装成request并提交到块设备层
static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
{const int is_sync = op_is_sync(bio->bi_opf);const int is_flush_fua = op_is_flush(bio->bi_opf);struct blk_mq_alloc_data data = { .flags = 0};struct request *rq;struct blk_plug *plug;struct request *same_queue_rq = NULL;unsigned int nr_segs;blk_qc_t cookie;blk_queue_bounce(q, &bio);__blk_queue_split(q, &bio, &nr_segs);if (!bio_integrity_prep(bio))return BLK_QC_T_NONE;if (!is_flush_fua && !blk_queue_nomerges(q) &&blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq))return BLK_QC_T_NONE;if (blk_mq_sched_bio_merge(q, bio, nr_segs))return BLK_QC_T_NONE;rq_qos_throttle(q, bio);data.cmd_flags = bio->bi_opf;rq = blk_mq_get_request(q, bio, &data);if (unlikely(!rq)) {rq_qos_cleanup(q, bio);if (bio->bi_opf & REQ_NOWAIT)bio_wouldblock_error(bio);return BLK_QC_T_NONE;}trace_block_getrq(q, bio, bio->bi_opf);rq_qos_track(q, rq, bio);cookie = request_to_qc_t(data.hctx, rq);blk_mq_bio_to_request(rq, bio, nr_segs);        //生产request后，后续准备将request插入到请求队列中plug = blk_mq_plug(q, bio);if (unlikely(is_flush_fua)) {                   //若是flush或fua请求，则将request插入到flush队列，并调用blk_mq_run_hw_queue启动请求派发/* bypass scheduler for flush rq */blk_insert_flush(rq);blk_mq_run_hw_queue(data.hctx, true);} else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs ||!blk_queue_nonrot(q))) {//若当前线程正在做IO plug且块设备是硬件单队列，则将request插入到当前线程的plug list中/** Use plugging if we have a ->commit_rqs() hook as well, as* we know the driver uses bd->last in a smart fashion.** Use normal plugging if this disk is slow HDD, as sequential* IO may benefit a lot from plug merging.*/unsigned int request_count = plug->rq_count;struct request *last = NULL;if (!request_count)trace_block_plug(q);elselast = list_entry_rq(plug->mq_list.prev);if (request_count >= BLK_MAX_REQUEST_COUNT || (last &&blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {blk_flush_plug_list(plug, false);trace_block_plug(q);}blk_add_rq_to_plug(plug, rq);} else if (q->elevator) {//若配置了调度器，则调用blk_mq_sched_insert_request将请求插入到调度器队列中，否则插入到当前CPU的软件队列中blk_mq_sched_insert_request(rq, false, true, true);} else if (plug && !blk_queue_nomerges(q)) {/** We do limited plugging. If the bio can be merged, do that.* Otherwise the existing request in the plug list will be* issued. So the plug list will have one request at most* The plug list might get flushed before this. If that happens,* the plug list is empty, and same_queue_rq is invalid.*/if (list_empty(&plug->mq_list))same_queue_rq = NULL;if (same_queue_rq) {list_del_init(&same_queue_rq->queuelist);plug->rq_count--;}blk_add_rq_to_plug(plug, rq);trace_block_plug(q);//若硬件多队列是IO同步请求，则调用blk_mq_try_issue_directly尝试将request直接派发到块设备驱动                   if (same_queue_rq) {data.hctx = same_queue_rq->mq_hctx;trace_block_unplug(q, 1, true);blk_mq_try_issue_directly(data.hctx, same_queue_rq,&cookie);}} else if ((q->nr_hw_queues > 1 && is_sync) ||!data.hctx->dispatch_busy) {blk_mq_try_issue_directly(data.hctx, rq, &cookie);} else {blk_mq_sched_insert_request(rq, false, true, true);}return cookie;
}

参考链接：

文件系统学习8——文件系统MQ队列机制详解_Zuoerfeng-CSDN博客_blk_mq_make_request上一篇已经讲述了MQ多队列的机制，利用cpu的多核，配上多队列机制，并发的处理IO请求，提高效率。本篇详细讲述下从bio下发到IO调度器中，MQ队列机制是如何一步步完成的。1、MQ处理结构流图从整个流程图可以看到，主要是分为三个部分：初始化硬件设备的target参数、初始化请求队列request_queue以及bio请求的处理过程。前面两个过程主要是完成底层存储设备向文件系统的注册，同时完...https://blog.csdn.net/u012414189/article/details/88358648

存储之Block-MultiQueue机制详解（二）相关推荐

Android 进阶——Framework 核心之Android Storage Access Framework（SAF）存储访问框架机制详解（二）
文章大纲引言一.DirectFragment 1.当选中DirectoryFragment中RecyclerView的Item时 2.选中DirectoryFragment中RecyclerVie ...
【Android签名机制详解】二：Android V1、V2、V3、V4签名方案
前言书接上回[Android签名机制详解]一:密码学入门,在了解了消息摘要.非对称加密.数字签名.数字证书的基本概念后,我们趁热打铁.直奔主题,讲解签名在Android中的实际应用. 基础知识 An ...
Android 进阶——Framework 核心之Android Storage Access Framework（SAF）存储访问框架机制详解（一）
文章大纲引言一.Android Storage Access Framework 二.Storage Access Framework 的主要角色成员 1.Document Provider 文件 ...
Android Loader 异步加载详解二：探寻Loader内部机制
Android Loader 异步加载详解二:探寻Loader内部机制转载请标明出处:http://blog.csdn.net/zhaoyanjun6/article/details/7025991 ...
Android学习笔记——Android 签名机制详解
Android 签名机制详解近期由于工作需要在学习 Android 的签名机制,因为没有现成资料,只能通过开发者文档和阅读博客的方式对 Android 签名机制进行大致了解.过程中查阅到的资料相对零 ...
MapTask运行机制详解以及Map任务的并行度,ReduceTask 工作机制以及reduceTask的并行度,MapReduce总体工作机制
MapTask运行机制详解整个Map阶段流程大体如图所示简单概述 inputFile通过split被逻辑切分为多个split文件, 通过Record按行读取内容给map(用户自己实现的)进行处理, ...
JAVA之JVM垃圾回收（GC）机制详解
一.为什么需要垃圾回收如果不进行垃圾回收,内存迟早都会被消耗空,因为我们在不断的分配内存空间而不进行回收.除非内存无限大,我们可以任性的分配而不回收,但是事实并非如此.所以,垃圾回收是必须的. 二. ...
java 深拷贝_java 深拷贝与浅拷贝机制详解
java 深拷贝与浅拷贝机制详解概要: 在Java中,拷贝分为深拷贝和浅拷贝两种.java在公共超类Object中实现了一种叫做clone的方法,这种方法clone出来的新对象为浅拷贝,而通过自己定 ...
Android Gradle 自定义Task详解二：进阶
转载请标明出处:http://blog.csdn.net/zhaoyanjun6/article/details/78523958 本文出自[赵彦军的博客] 系列目录 Android Gradle使用 ...

存储之Block-MultiQueue机制详解（二）

存储之Block-MultiQueue机制详解（二）相关推荐

最新文章

热门文章