1. 内核层文件读写的函数调用关系

sys_read

vfs_read

do_sync_read

f_op->aio_read

generic_file_aio_read

do_generic_file_read

mpage_readpage

do_mpage_readpage

mpage_bio_submit

submit_bio

generic_make_request

__generic_make_request

q->make_request_fn

__make_request

q->request_fn

end_request

mpage_end_io_read

光读一个文件就需要这么长的函数调用关系,而且后面还有设备驱动,如果是U盘,还涉及scsi和usb框架,这复杂度真是有点大。这也充分说明了内核中分层的思想。今天就从submit_bio说起。

void submit_bio(int rw, struct bio *bio)
{int count = bio_sectors(bio);bio->bi_rw |= rw;/** If it's a regular read/write or a barrier with data attached,* go through the normal accounting stuff before submission.*/if (bio_has_data(bio) && !(rw & REQ_DISCARD)) {if (rw & WRITE) {count_vm_events(PGPGOUT, count);} else {task_io_account_read(bio->bi_size);count_vm_events(PGPGIN, count);}if (unlikely(block_dump)) {char b[BDEVNAME_SIZE];printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",current->comm, task_pid_nr(current),(rw & WRITE) ? "WRITE" : "READ",(unsigned long long)bio->bi_sector,bdevname(bio->bi_bdev, b),count);}}generic_make_request(bio);
}
void generic_make_request(struct bio *bio)
{struct bio_list bio_list_on_stack;if (current->bio_list) {/* make_request is active */  //如果当前进程已经处于make_request状态,添加到待处理链表bio_list_add(current->bio_list, bio);return;}/* following loop may be a bit non-obvious, and so deserves some* explanation.* Before entering the loop, bio->bi_next is NULL (as all callers* ensure that) so we have a list with a single bio.* We pretend that we have just taken it off a longer list, so* we assign bio_list to a pointer to the bio_list_on_stack,* thus initialising the bio_list of new bios to be* added.  __generic_make_request may indeed add some more bios* through a recursive call to generic_make_request.  If it* did, we find a non-NULL value in bio_list and re-enter the loop* from the top.  In this case we really did just take the bio* of the top of the list (no pretending) and so remove it from* bio_list, and call into __generic_make_request again.** The loop was structured like this to make only one call to* __generic_make_request (which is important as it is large and* inlined) and to keep the structure simple.*/BUG_ON(bio->bi_next);   //必须保证bio->bi_next是NULLbio_list_init(&bio_list_on_stack);current->bio_list = &bio_list_on_stack;do {__generic_make_request(bio);bio = bio_list_pop(current->bio_list);} while (bio);current->bio_list = NULL; /* deactivate */
}
static inline void __generic_make_request(struct bio *bio)
{struct request_queue *q;sector_t old_sector;int ret, nr_sectors = bio_sectors(bio);dev_t old_dev;int err = -EIO;might_sleep();if (bio_check_eod(bio, nr_sectors))goto end_io;/** Resolve the mapping until finished. (drivers are* still free to implement/resolve their own stacking* by explicitly returning 0)** NOTE: we don't repeat the blk_size check for each new device.* Stacking drivers are expected to know what they are doing.*/old_sector = -1;old_dev = 0;do {char b[BDEVNAME_SIZE];struct hd_struct *part;q = bdev_get_queue(bio->bi_bdev);if (unlikely(!q)) {printk(KERN_ERR"generic_make_request: Trying to access ""nonexistent block-device %s (%Lu)\n",bdevname(bio->bi_bdev, b),(long long) bio->bi_sector);goto end_io;}if (unlikely(!(bio->bi_rw & REQ_DISCARD) &&nr_sectors > queue_max_hw_sectors(q))) {printk(KERN_ERR "bio too big device %s (%u > %u)\n",bdevname(bio->bi_bdev, b),bio_sectors(bio),queue_max_hw_sectors(q));goto end_io;}if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))goto end_io;part = bio->bi_bdev->bd_part;if (should_fail_request(part, bio->bi_size) ||should_fail_request(&part_to_disk(part)->part0,bio->bi_size))goto end_io;/** If this device has partitions, remap block n* of partition p to block n+start(p) of the disk.*/blk_partition_remap(bio);    //如果是分区映射到磁盘if (bio_integrity_enabled(bio) && bio_integrity_prep(bio))goto end_io;if (old_sector != -1)trace_block_bio_remap(q, bio, old_dev, old_sector);old_sector = bio->bi_sector;old_dev = bio->bi_bdev->bd_dev;if (bio_check_eod(bio, nr_sectors))goto end_io;/** Filter flush bio's early so that make_request based* drivers without flush support don't have to worry* about them.*/if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);if (!nr_sectors) {err = 0;goto end_io;}}if ((bio->bi_rw & REQ_DISCARD) &&(!blk_queue_discard(q) ||((bio->bi_rw & REQ_SECURE) &&!blk_queue_secdiscard(q)))) {err = -EOPNOTSUPP;goto end_io;}if (blk_throtl_bio(q, &bio))goto end_io;/** If 
bio = NULL, bio has been throttled and will be submitted* later.*/if (!bio)break;trace_block_bio_queue(q, bio);ret = q->make_request_fn(q, bio); //调用请求队列的make_request_fn回调函数} while (ret);return;end_io:bio_endio(bio, err);
}

对于scsi设备的request_queue的钩子函数在哪初始化的呢:

scsi_alloc_sdev->scsi_alloc_queue->blk_init_queue

把make_request_fn回调函数初始化为__make_request,request_fn初始化为scsi_request_fn。

对于mtdblock设备的request_queue的钩子函数在哪初始化的呢:

add_mtd_blktrans_dev->blk_init_queue

把make_request_fn回调函数初始化为__make_request,request_fn初始化为mtd_blktrans_request。

对于mmcblock设备的request_queue的钩子函数在哪初始化的呢:

mmc_blk_alloc_req->mmc_init_queue->blk_init_queue

把make_request_fn回调函数初始化为__make_request,request_fn初始化为mmc_request。

接下来分析__make_request如何把bio排序、合并后加入到request结构中。

static int __make_request(struct request_queue *q, struct bio *bio)
{const bool sync = !!(bio->bi_rw & REQ_SYNC);struct blk_plug *plug;int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;struct request *req;unsigned int request_count = 0;/** low level driver can indicate that it wants pages above a* certain limit bounced to low memory (ie for highmem, or even* ISA dma in theory)*/blk_queue_bounce(q, &bio);  //创建反弹缓冲区if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {spin_lock_irq(q->queue_lock);where = ELEVATOR_INSERT_FLUSH;goto get_rq;}/** Check if we can merge with the plugged list before grabbing* any locks.*/if (attempt_plug_merge(current, q, bio, &request_count))goto out;spin_lock_irq(q->queue_lock);el_ret = elv_merge(q, &req, bio);   //交给IO调度器排序合并if (el_ret == ELEVATOR_BACK_MERGE) { //可以合并到某个request后面if (bio_attempt_back_merge(q, req, bio)) {if (!attempt_back_merge(q, req))elv_merged_request(q, req, el_ret);goto out_unlock;}} else if (el_ret == ELEVATOR_FRONT_MERGE) {  //可以合并到某个request前面if (bio_attempt_front_merge(q, req, bio)) {if (!attempt_front_merge(q, req))elv_merged_request(q, req, el_ret);goto out_unlock;}}//在请求队列中没有找到可以合并的request
get_rq:/** This sync check and mask will be re-done in init_request_from_bio(),* but we need to set it earlier to expose the sync flag to the* rq allocator and io schedulers.*/rw_flags = bio_data_dir(bio);if (sync)rw_flags |= REQ_SYNC;/** Grab a free request. This is might sleep but can not fail.* Returns with the queue unlocked.*/req = get_request_wait(q, rw_flags, bio);   //分配一个 request/** After dropping the lock and possibly sleeping here, our request* may now be mergeable after it had proven unmergeable (above).* We don't worry about that case for efficiency. It won't happen* often, and the elevators are able to handle it.*/init_request_from_bio(req, bio);   //根据bio初始化requestif (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||bio_flagged(bio, BIO_CPU_AFFINE))req->cpu = raw_smp_processor_id();plug = current->plug;if (plug) {/** If this is the first request added after a plug, fire* of a plug trace. If others have been added before, check* if we have multiple devices in this plug. If so, make a* note to sort the list before dispatch.*/if (list_empty(&plug->list))trace_block_plug(q);else if (!plug->should_sort) {struct request *__rq;__rq = list_entry_rq(plug->list.prev);if (__rq->q != q)plug->should_sort = 1;}if (request_count >= BLK_MAX_REQUEST_COUNT)blk_flush_plug_list(plug, false);  //请求数量达到上限,进行unplug操作list_add_tail(&req->queuelist, &plug->list);drive_stat_acct(req, 1);} else {spin_lock_irq(q->queue_lock);add_acct_request(q, req, where);  //添加到io调度器中__blk_run_queue(q);  //执行unplug操作
out_unlock:spin_unlock_irq(q->queue_lock);}
out:return 0;
}
/*
 * __blk_run_queue - invoke the queue's request_fn unless it is stopped.
 * @q: the request queue to run
 *
 * Does nothing when blk_queue_stopped(q) is true; otherwise calls
 * q->request_fn(q).
 */
void __blk_run_queue(struct request_queue *q)
{
	const int stopped = blk_queue_stopped(q);

	if (unlikely(stopped))
		return;

	q->request_fn(q);
}

接下来以scsi设备为例来分析q->request_fn函数

static void scsi_request_fn(struct request_queue *q)
{struct scsi_device *sdev = q->queuedata;struct Scsi_Host *shost;struct scsi_cmnd *cmd;struct request *req;if (!sdev) {printk("scsi: killing requests for dead queue\n");while ((req = blk_peek_request(q)) != NULL)scsi_kill_request(req, q);return;}if(!get_device(&sdev->sdev_gendev))/* We must be tearing the block queue down already */return;/** To start with, we keep looping until the queue is empty, or until* the host is no longer able to accept any more requests.*/shost = sdev->host;for (;;) {int rtn;/** get next queueable request.  We do this early to make sure* that the request is fully prepared even if we cannot * accept it.*/req = blk_peek_request(q); //获得下一个请求if (!req || !scsi_dev_queue_ready(q, sdev))break;if (unlikely(!scsi_device_online(sdev))) {    //设备离线sdev_printk(KERN_ERR, sdev,"rejecting I/O to offline device\n");scsi_kill_request(req, q);continue;}/** Remove the request from the request list.*/if (!(blk_queue_tagged(q) && !blk_queue_start_tag(q, req)))blk_start_request(req);sdev->device_busy++;spin_unlock(q->queue_lock);cmd = req->special;if (unlikely(cmd == NULL)) {printk(KERN_CRIT "impossible request in %s.\n""please mail a stack trace to ""linux-scsi@vger.kernel.org\n",__func__);blk_dump_rq_flags(req, "foo");BUG();}spin_lock(shost->host_lock);/** We hit this when the driver is using a host wide* tag map. For device level tag maps the queue_depth check* in the device ready fn would prevent us from trying* to allocate a tag. 
Since the map is a shared host resource* we add the dev to the starved list so it eventually gets* a run when a tag is freed.*/if (blk_queue_tagged(q) && !blk_rq_tagged(req)) {if (list_empty(&sdev->starved_entry))list_add_tail(&sdev->starved_entry,&shost->starved_list);goto not_ready;}if (!scsi_target_queue_ready(shost, sdev))   //是否可以发送命令到目标节点goto not_ready;if (!scsi_host_queue_ready(q, shost, sdev))       //是否可以发送到主机适配器goto not_ready;scsi_target(sdev)->target_busy++;shost->host_busy++;/** XXX(hch): This is rather suboptimal, scsi_dispatch_cmd will*     take the lock again.*/spin_unlock_irq(shost->host_lock);/** Finally, initialize any error handling parameters, and set up* the timers for timeouts.*/scsi_init_cmd_errh(cmd);/** Dispatch the command to the low-level driver.*/rtn = scsi_dispatch_cmd(cmd);   //派发命令到底层驱动spin_lock_irq(q->queue_lock);if (rtn)goto out_delay;}goto out;not_ready:spin_unlock_irq(shost->host_lock);/** lock q, handle tag, requeue req, and decrement device_busy. We* must return with queue_lock held.** Decrementing device_busy without checking it is OK, as all such* cases (host limits or settings) should run the queue at some* later time.*/spin_lock_irq(q->queue_lock);blk_requeue_request(q, req);sdev->device_busy--;
out_delay:if (sdev->device_busy == 0)blk_delay_queue(q, SCSI_QUEUE_DELAY);
out:/* must be careful here...if we trigger the ->remove() function* we cannot be holding the q lock */spin_unlock_irq(q->queue_lock);put_device(&sdev->sdev_gendev);spin_lock_irq(q->queue_lock);
}

blk_peek_request主要调用了__elv_next_request和q->prep_rq_fn(q, rq),而prep_rq_fn被赋值为sd_prep_fn,主要作用是为请求构造scsi命令。

int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
{struct Scsi_Host *host = cmd->device->host;unsigned long timeout;int rtn = 0;atomic_inc(&cmd->device->iorequest_cnt);/* check if the device is still usable */if (unlikely(cmd->device->sdev_state == SDEV_DEL)) {/* in SDEV_DEL we error all commands. DID_NO_CONNECT* returns an immediate error upwards, and signals* that the device is no longer present */cmd->result = DID_NO_CONNECT << 16;scsi_done(cmd);/* return 0 (because the command has been processed) */goto out;}/* Check to see if the scsi lld made this device blocked. */if (unlikely(scsi_device_blocked(cmd->device))) {/* * in blocked state, the command is just put back on* the device queue.  The suspend state has already* blocked the queue so future requests should not* occur until the device transitions out of the* suspend state.*/scsi_queue_insert(cmd, SCSI_MLQUEUE_DEVICE_BUSY);SCSI_LOG_MLQUEUE(3, printk("queuecommand : device blocked \n"));/** NOTE: rtn is still zero here because we don't need the* queue to be plugged on return (it's already stopped)*/goto out;}/* * If SCSI-2 or lower, store the LUN value in cmnd.*/if (cmd->device->scsi_level <= SCSI_2 &&cmd->device->scsi_level != SCSI_UNKNOWN) {cmd->cmnd[1] = (cmd->cmnd[1] & 0x1f) |(cmd->device->lun << 5 & 0xe0);}/** We will wait MIN_RESET_DELAY clock ticks after the last reset so* we can avoid the drive not being ready.*/timeout = host->last_reset + MIN_RESET_DELAY;if (host->resetting && time_before(jiffies, timeout)) {int ticks_remaining = timeout - jiffies;/** NOTE: This may be executed from within an interrupt* handler!  This is bad, but for now, it'll do.  
The irq* level of the interrupt handler has been masked out by the* platform dependent interrupt handling code already, so the* sti() here will not cause another call to the SCSI host's* interrupt handler (assuming there is one irq-level per* host).*/while (--ticks_remaining >= 0)mdelay(1 + 999 / HZ);host->resetting = 0;}scsi_log_send(cmd);/** Before we queue this command, check if the command* length exceeds what the host adapter can handle.*/if (cmd->cmd_len > cmd->device->host->max_cmd_len) {SCSI_LOG_MLQUEUE(3,printk("queuecommand : command too long. ""cdb_size=%d host->max_cmd_len=%d\n",cmd->cmd_len, cmd->device->host->max_cmd_len));cmd->result = (DID_ABORT << 16);scsi_done(cmd);goto out;}if (unlikely(host->shost_state == SHOST_DEL)) {cmd->result = (DID_NO_CONNECT << 16);scsi_done(cmd);} else {trace_scsi_dispatch_cmd_start(cmd);cmd->scsi_done = scsi_done;rtn = host->hostt->queuecommand(host, cmd); //调用主机适配器的queuecommand回调函数}if (rtn) {trace_scsi_dispatch_cmd_error(cmd, rtn);if (rtn != SCSI_MLQUEUE_DEVICE_BUSY &&rtn != SCSI_MLQUEUE_TARGET_BUSY)rtn = SCSI_MLQUEUE_HOST_BUSY;scsi_queue_insert(cmd, rtn);SCSI_LOG_MLQUEUE(3,printk("queuecommand : request rejected\n"));}out:SCSI_LOG_MLQUEUE(3, printk("leaving scsi_dispatch_cmnd()\n"));return rtn;
}

block IO层框架分析2相关推荐

  1. SensorKernel层框架分析

    接上文 安卓9.0Sensor框架 前言 前面我们已经讲解了sensor框架中的framework到vendor层,这篇文章我们将会讲解kernel层的内容.不过不同的芯片平台,kernel层中的se ...

  2. [由零开始] 手写Mybatis-自定义持久层框架

    手写Mybatis-自定义持久层框架思路分析 JDBC存 的问题 数据库配置硬编码 频繁创建与销毁数据库连接 ,资源严重浪费 SQL语句.参数写在代码里和代码在一起,存在硬编码,不好维护, 对参数和返 ...

  3. BLOCK层代码分析(9)IO下发之IO下发

    看着题目是不是很奇怪,想不出好的名字,就这样将就吧. 前面bio bounce过程,bio的切分和合并,request的获取是为IO请求下发做准备工作.当这些准备工作完成后,才进入到真正的IO下发过程 ...

  4. BLOCK层代码分析(10)IO下发之IO下发函数总结

    BLOCK层IO下发涉及直接下发,调度器,没有设置调度类型以及plug/unplug等,因此下发函数纷繁复杂,这里做介绍几个主要的函数. 前面介绍了函数blk_mq_try_issue_directl ...

  5. mybatis和spring jdbc持久层框架事务支持分析

    mybatis和spring jdbc持久层框架事务支持分析 ​ 持久层框架中的事务支持指的是持久层框架如何支持数据库事务,我们先梳理出原生数据库事务操作的主线脉络,它是通过java.sql 包下的C ...

  6. block io生命历程

    作为存储业务的一个重要组成部分,block IO是非易失存储的唯一路径,它的生命历程每个阶段都直接关乎我们手机的性能.功耗.甚至寿命.本文试图通过block IO的产生.调度.下发.返回的4个阶段,阐 ...

  7. Linux USB驱动框架分析 【转】

    转自:http://blog.chinaunix.net/uid-11848011-id-96188.html 初次接触与OS相关的设备驱动编写,感觉还挺有意思的,为了不至于忘掉看过的东西,笔记跟总结 ...

  8. usb serial port 驱动_tty初探 — uart驱动框架分析

    写在前面: 我们没有讲UART驱动,不过我们认为,只要系统学习了第2期,应该具备分析UART驱动的能力,小编做答疑几年以来,陆陆续续有不少人问到UART驱动怎么写,所以今天就分享一篇深度长文(1700 ...

  9. LCD 设备驱动框架分析及核心结构

    Linux 下很多东西都是和结构体相关,举个例子,时钟大家都知道吧,Linux 下对应时钟的东西就有好几个结构体,所以你要是想明白Linux 下那些东西,对结构体要有所了解,LCD 是基础的驱动设备, ...

最新文章

  1. Ajax 中XmlHttp 乱码 的解决方法 (UTF8,GB2312 编码 解码)
  2. 数据迁移(数据清洗)分享
  3. CodeForces - 1450C2 Errich-Tac-Toe (Hard Version)(思维+构造)
  4. 这才是老公的正确用法,不吃就往死里打......
  5. Word——Word中粘贴Visio图只显示下面一部分
  6. CentOS7.4搭建FTP服务器(vsftp)
  7. python从指定文件夹导入模块_Python实现的在特定目录下导入模块功能分析
  8. Protocol Buffers数据编码
  9. Django rest framework(7)----分页
  10. 嵌入式开发|STM32工程中添加Bootloader实现串口程序下载
  11. 主梁弹性模量计算_混凝土松弛系数的实用计算
  12. c语言与程序设计教材,C语言与程序设计(高等学校计算机专业规划教材)
  13. 华为交换机常见STP/RSTP操作
  14. 微信防撤回是怎么实现的?
  15. flex属性-flex:1到底是什么
  16. 写一个函数,输出四次“hello world“,每次间隔3秒
  17. jffs2的目录项查找过程
  18. html标签中加入颜色,html怎么给span添加颜色
  19. Docker 核心技术(2)- helloworld 镜像
  20. MISC 拼图 工具 linux下使用

热门文章

  1. Postgresql杂谈 20—详解Postgresql中的Checkpoint、WAL日志和热备份恢复
  2. hbase集群 数据写入_一种构建HBase集群全文索引方法,数据读取方法以及数据写入方法与流程...
  3. python项目中的.idea文件夹是干什么的
  4. linux和window下mkdir函数问题(转-锦曦月)
  5. 教程:FFmpeg命令行参数命令合集01.
  6. 使用ThreeJs从零开始构建3D智能仓库——第三章(选中物体与特效)
  7. Exception in thread “main“ java.io.FileNotFoundException: test\mybatis-config.xml (系统找不到指定的路径。)
  8. 用css制作好看的登录注册页面
  9. 基于Docker的Hadoop完全分布式安装
  10. SpringCloud组件:Ribbon的负载均衡策略及原理