block IO层框架分析2

1. 内核层文件读写的函数调用关系

sys_read

vfs_read

do_sync_read

f_op->aio_read

generic_file_aio_read

do_generic_file_read

mpage_readpage

do_mpage_readpage

mpage_bio_submit

submit_bio

generic_make_request

__generic_make_request

q->make_request_fn

__make_request

q->request_fn

end_request

mpage_end_io_read

光读一个文件就需要这么长的函数调用关系，而且后面还有设备驱动，如果是U盘，还涉及scsi和usb框架，这复杂度真是有点大。这也充分说明了内核中分层的思想。今天就从submit_bio说起。

void submit_bio(int rw, struct bio *bio)
{int count = bio_sectors(bio);bio->bi_rw |= rw;/** If it's a regular read/write or a barrier with data attached,* go through the normal accounting stuff before submission.*/if (bio_has_data(bio) && !(rw & REQ_DISCARD)) {if (rw & WRITE) {count_vm_events(PGPGOUT, count);} else {task_io_account_read(bio->bi_size);count_vm_events(PGPGIN, count);}if (unlikely(block_dump)) {char b[BDEVNAME_SIZE];printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",current->comm, task_pid_nr(current),(rw & WRITE) ? "WRITE" : "READ",(unsigned long long)bio->bi_sector,bdevname(bio->bi_bdev, b),count);}}generic_make_request(bio);
}

void generic_make_request(struct bio *bio)
{struct bio_list bio_list_on_stack;if (current->bio_list) {/* make_request is active */  //如果当前进程已经处于make_request状态，添加到待处理链表bio_list_add(current->bio_list, bio);return;}/* following loop may be a bit non-obvious, and so deserves some* explanation.* Before entering the loop, bio->bi_next is NULL (as all callers* ensure that) so we have a list with a single bio.* We pretend that we have just taken it off a longer list, so* we assign bio_list to a pointer to the bio_list_on_stack,* thus initialising the bio_list of new bios to be* added.  __generic_make_request may indeed add some more bios* through a recursive call to generic_make_request.  If it* did, we find a non-NULL value in bio_list and re-enter the loop* from the top.  In this case we really did just take the bio* of the top of the list (no pretending) and so remove it from* bio_list, and call into __generic_make_request again.** The loop was structured like this to make only one call to* __generic_make_request (which is important as it is large and* inlined) and to keep the structure simple.*/BUG_ON(bio->bi_next);   //必须保证bio->bi_next是NULLbio_list_init(&bio_list_on_stack);current->bio_list = &bio_list_on_stack;do {__generic_make_request(bio);bio = bio_list_pop(current->bio_list);} while (bio);current->bio_list = NULL; /* deactivate */
}

static inline void __generic_make_request(struct bio *bio)
{struct request_queue *q;sector_t old_sector;int ret, nr_sectors = bio_sectors(bio);dev_t old_dev;int err = -EIO;might_sleep();if (bio_check_eod(bio, nr_sectors))goto end_io;/** Resolve the mapping until finished. (drivers are* still free to implement/resolve their own stacking* by explicitly returning 0)** NOTE: we don't repeat the blk_size check for each new device.* Stacking drivers are expected to know what they are doing.*/old_sector = -1;old_dev = 0;do {char b[BDEVNAME_SIZE];struct hd_struct *part;q = bdev_get_queue(bio->bi_bdev);if (unlikely(!q)) {printk(KERN_ERR"generic_make_request: Trying to access ""nonexistent block-device %s (%Lu)\n",bdevname(bio->bi_bdev, b),(long long) bio->bi_sector);goto end_io;}if (unlikely(!(bio->bi_rw & REQ_DISCARD) &&nr_sectors > queue_max_hw_sectors(q))) {printk(KERN_ERR "bio too big device %s (%u > %u)\n",bdevname(bio->bi_bdev, b),bio_sectors(bio),queue_max_hw_sectors(q));goto end_io;}if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))goto end_io;part = bio->bi_bdev->bd_part;if (should_fail_request(part, bio->bi_size) ||should_fail_request(&part_to_disk(part)->part0,bio->bi_size))goto end_io;/** If this device has partitions, remap block n* of partition p to block n+start(p) of the disk.*/blk_partition_remap(bio);    //如果是分区映射到磁盘if (bio_integrity_enabled(bio) && bio_integrity_prep(bio))goto end_io;if (old_sector != -1)trace_block_bio_remap(q, bio, old_dev, old_sector);old_sector = bio->bi_sector;old_dev = bio->bi_bdev->bd_dev;if (bio_check_eod(bio, nr_sectors))goto end_io;/** Filter flush bio's early so that make_request based* drivers without flush support don't have to worry* about them.*/if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);if (!nr_sectors) {err = 0;goto end_io;}}if ((bio->bi_rw & REQ_DISCARD) &&(!blk_queue_discard(q) ||((bio->bi_rw & REQ_SECURE) &&!blk_queue_secdiscard(q)))) {err = -EOPNOTSUPP;goto end_io;}if (blk_throtl_bio(q, &bio))goto end_io;/** If 
bio = NULL, bio has been throttled and will be submitted* later.*/if (!bio)break;trace_block_bio_queue(q, bio);ret = q->make_request_fn(q, bio); //调用请求队列的make_request_fn回调函数} while (ret);return;end_io:bio_endio(bio, err);
}

对于scsi设备的request_queue的钩子函数在哪初始化的呢：

scsi_alloc_sdev->scsi_alloc_queue->blk_init_queue

把make_request_fn回调函数初始化为__make_request，request_fn初始化为scsi_request_fn。

对于mtdblock设备的request_queue的钩子函数在哪初始化的呢：

add_mtd_blktrans_dev->blk_init_queue

把make_request_fn回调函数初始化为__make_request，request_fn初始化为mtd_blktrans_request。

对于mmcblock设备的request_queue的钩子函数在哪初始化的呢：

mmc_blk_alloc_req->mmc_init_queue->blk_init_queue

把make_request_fn回调函数初始化为__make_request，request_fn初始化为mmc_request。

接下来分析__make_request如何把bio排序、合并后加入到request结构中。

static int __make_request(struct request_queue *q, struct bio *bio)
{const bool sync = !!(bio->bi_rw & REQ_SYNC);struct blk_plug *plug;int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;struct request *req;unsigned int request_count = 0;/** low level driver can indicate that it wants pages above a* certain limit bounced to low memory (ie for highmem, or even* ISA dma in theory)*/blk_queue_bounce(q, &bio);  //创建反弹缓冲区if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {spin_lock_irq(q->queue_lock);where = ELEVATOR_INSERT_FLUSH;goto get_rq;}/** Check if we can merge with the plugged list before grabbing* any locks.*/if (attempt_plug_merge(current, q, bio, &request_count))goto out;spin_lock_irq(q->queue_lock);el_ret = elv_merge(q, &req, bio);   //交给IO调度器排序合并if (el_ret == ELEVATOR_BACK_MERGE) { //可以合并到某个request后面if (bio_attempt_back_merge(q, req, bio)) {if (!attempt_back_merge(q, req))elv_merged_request(q, req, el_ret);goto out_unlock;}} else if (el_ret == ELEVATOR_FRONT_MERGE) {  //可以合并到某个request前面if (bio_attempt_front_merge(q, req, bio)) {if (!attempt_front_merge(q, req))elv_merged_request(q, req, el_ret);goto out_unlock;}}//在请求队列中没有找到可以合并的request
get_rq:/** This sync check and mask will be re-done in init_request_from_bio(),* but we need to set it earlier to expose the sync flag to the* rq allocator and io schedulers.*/rw_flags = bio_data_dir(bio);if (sync)rw_flags |= REQ_SYNC;/** Grab a free request. This is might sleep but can not fail.* Returns with the queue unlocked.*/req = get_request_wait(q, rw_flags, bio);   //分配一个 request/** After dropping the lock and possibly sleeping here, our request* may now be mergeable after it had proven unmergeable (above).* We don't worry about that case for efficiency. It won't happen* often, and the elevators are able to handle it.*/init_request_from_bio(req, bio);   //根据bio初始化requestif (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||bio_flagged(bio, BIO_CPU_AFFINE))req->cpu = raw_smp_processor_id();plug = current->plug;if (plug) {/** If this is the first request added after a plug, fire* of a plug trace. If others have been added before, check* if we have multiple devices in this plug. If so, make a* note to sort the list before dispatch.*/if (list_empty(&plug->list))trace_block_plug(q);else if (!plug->should_sort) {struct request *__rq;__rq = list_entry_rq(plug->list.prev);if (__rq->q != q)plug->should_sort = 1;}if (request_count >= BLK_MAX_REQUEST_COUNT)blk_flush_plug_list(plug, false);  //请求数量达到上限，进行unplug操作list_add_tail(&req->queuelist, &plug->list);drive_stat_acct(req, 1);} else {spin_lock_irq(q->queue_lock);add_acct_request(q, req, where);  //添加到io调度器中__blk_run_queue(q);  //执行unplug操作
out_unlock:spin_unlock_irq(q->queue_lock);}
out:return 0;
}

/*
 * __blk_run_queue - invoke the queue's request_fn unless the queue is stopped.
 * @q: the request queue to run
 */
void __blk_run_queue(struct request_queue *q)
{
	/* A stopped queue is left alone; otherwise call the driver hook. */
	if (likely(!blk_queue_stopped(q)))
		q->request_fn(q);
}

接下来以scsi设备为例来分析q->request_fn函数

static void scsi_request_fn(struct request_queue *q)
{struct scsi_device *sdev = q->queuedata;struct Scsi_Host *shost;struct scsi_cmnd *cmd;struct request *req;if (!sdev) {printk("scsi: killing requests for dead queue\n");while ((req = blk_peek_request(q)) != NULL)scsi_kill_request(req, q);return;}if(!get_device(&sdev->sdev_gendev))/* We must be tearing the block queue down already */return;/** To start with, we keep looping until the queue is empty, or until* the host is no longer able to accept any more requests.*/shost = sdev->host;for (;;) {int rtn;/** get next queueable request.  We do this early to make sure* that the request is fully prepared even if we cannot * accept it.*/req = blk_peek_request(q); //获得下一个请求if (!req || !scsi_dev_queue_ready(q, sdev))break;if (unlikely(!scsi_device_online(sdev))) {    //设备离线sdev_printk(KERN_ERR, sdev,"rejecting I/O to offline device\n");scsi_kill_request(req, q);continue;}/** Remove the request from the request list.*/if (!(blk_queue_tagged(q) && !blk_queue_start_tag(q, req)))blk_start_request(req);sdev->device_busy++;spin_unlock(q->queue_lock);cmd = req->special;if (unlikely(cmd == NULL)) {printk(KERN_CRIT "impossible request in %s.\n""please mail a stack trace to ""linux-scsi@vger.kernel.org\n",__func__);blk_dump_rq_flags(req, "foo");BUG();}spin_lock(shost->host_lock);/** We hit this when the driver is using a host wide* tag map. For device level tag maps the queue_depth check* in the device ready fn would prevent us from trying* to allocate a tag. 
Since the map is a shared host resource* we add the dev to the starved list so it eventually gets* a run when a tag is freed.*/if (blk_queue_tagged(q) && !blk_rq_tagged(req)) {if (list_empty(&sdev->starved_entry))list_add_tail(&sdev->starved_entry,&shost->starved_list);goto not_ready;}if (!scsi_target_queue_ready(shost, sdev))   //是否可以发送命令到目标节点goto not_ready;if (!scsi_host_queue_ready(q, shost, sdev))       //是否可以发送到主机适配器goto not_ready;scsi_target(sdev)->target_busy++;shost->host_busy++;/** XXX(hch): This is rather suboptimal, scsi_dispatch_cmd will*     take the lock again.*/spin_unlock_irq(shost->host_lock);/** Finally, initialize any error handling parameters, and set up* the timers for timeouts.*/scsi_init_cmd_errh(cmd);/** Dispatch the command to the low-level driver.*/rtn = scsi_dispatch_cmd(cmd);   //派发命令到底层驱动spin_lock_irq(q->queue_lock);if (rtn)goto out_delay;}goto out;not_ready:spin_unlock_irq(shost->host_lock);/** lock q, handle tag, requeue req, and decrement device_busy. We* must return with queue_lock held.** Decrementing device_busy without checking it is OK, as all such* cases (host limits or settings) should run the queue at some* later time.*/spin_lock_irq(q->queue_lock);blk_requeue_request(q, req);sdev->device_busy--;
out_delay:if (sdev->device_busy == 0)blk_delay_queue(q, SCSI_QUEUE_DELAY);
out:/* must be careful here...if we trigger the ->remove() function* we cannot be holding the q lock */spin_unlock_irq(q->queue_lock);put_device(&sdev->sdev_gendev);spin_lock_irq(q->queue_lock);
}

blk_peek_request主要调用了__elv_next_request和q->prep_rq_fn(q, rq)，而prep_rq_fn被赋值为sd_prep_fn，主要作用是为请求构造scsi命令。

int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
{struct Scsi_Host *host = cmd->device->host;unsigned long timeout;int rtn = 0;atomic_inc(&cmd->device->iorequest_cnt);/* check if the device is still usable */if (unlikely(cmd->device->sdev_state == SDEV_DEL)) {/* in SDEV_DEL we error all commands. DID_NO_CONNECT* returns an immediate error upwards, and signals* that the device is no longer present */cmd->result = DID_NO_CONNECT << 16;scsi_done(cmd);/* return 0 (because the command has been processed) */goto out;}/* Check to see if the scsi lld made this device blocked. */if (unlikely(scsi_device_blocked(cmd->device))) {/* * in blocked state, the command is just put back on* the device queue.  The suspend state has already* blocked the queue so future requests should not* occur until the device transitions out of the* suspend state.*/scsi_queue_insert(cmd, SCSI_MLQUEUE_DEVICE_BUSY);SCSI_LOG_MLQUEUE(3, printk("queuecommand : device blocked \n"));/** NOTE: rtn is still zero here because we don't need the* queue to be plugged on return (it's already stopped)*/goto out;}/* * If SCSI-2 or lower, store the LUN value in cmnd.*/if (cmd->device->scsi_level <= SCSI_2 &&cmd->device->scsi_level != SCSI_UNKNOWN) {cmd->cmnd[1] = (cmd->cmnd[1] & 0x1f) |(cmd->device->lun << 5 & 0xe0);}/** We will wait MIN_RESET_DELAY clock ticks after the last reset so* we can avoid the drive not being ready.*/timeout = host->last_reset + MIN_RESET_DELAY;if (host->resetting && time_before(jiffies, timeout)) {int ticks_remaining = timeout - jiffies;/** NOTE: This may be executed from within an interrupt* handler!  This is bad, but for now, it'll do.  
The irq* level of the interrupt handler has been masked out by the* platform dependent interrupt handling code already, so the* sti() here will not cause another call to the SCSI host's* interrupt handler (assuming there is one irq-level per* host).*/while (--ticks_remaining >= 0)mdelay(1 + 999 / HZ);host->resetting = 0;}scsi_log_send(cmd);/** Before we queue this command, check if the command* length exceeds what the host adapter can handle.*/if (cmd->cmd_len > cmd->device->host->max_cmd_len) {SCSI_LOG_MLQUEUE(3,printk("queuecommand : command too long. ""cdb_size=%d host->max_cmd_len=%d\n",cmd->cmd_len, cmd->device->host->max_cmd_len));cmd->result = (DID_ABORT << 16);scsi_done(cmd);goto out;}if (unlikely(host->shost_state == SHOST_DEL)) {cmd->result = (DID_NO_CONNECT << 16);scsi_done(cmd);} else {trace_scsi_dispatch_cmd_start(cmd);cmd->scsi_done = scsi_done;rtn = host->hostt->queuecommand(host, cmd); //调用主机适配器的queuecommand回调函数}if (rtn) {trace_scsi_dispatch_cmd_error(cmd, rtn);if (rtn != SCSI_MLQUEUE_DEVICE_BUSY &&rtn != SCSI_MLQUEUE_TARGET_BUSY)rtn = SCSI_MLQUEUE_HOST_BUSY;scsi_queue_insert(cmd, rtn);SCSI_LOG_MLQUEUE(3,printk("queuecommand : request rejected\n"));}out:SCSI_LOG_MLQUEUE(3, printk("leaving scsi_dispatch_cmnd()\n"));return rtn;
}

block IO层框架分析2相关推荐

SensorKernel层框架分析
接上文安卓9.0Sensor框架前言前面我们已经讲解了sensor框架中的framework到vendor层,这篇文章我们将会讲解kernel层的内容.不过不同的芯片平台,kernel层中的se ...
[由零开始] 手写Mybatis-自定义持久层框架
手写Mybatis-自定义持久层框架思路分析 JDBC存的问题数据库配置硬编码频繁创建与销毁数据库连接 ,资源严重浪费 SQL语句.参数写在代码里和代码在一起,存在硬编码,不好维护, 对参数和返 ...
BLOCK层代码分析（9）IO下发之IO下发
看着题目是不是很奇怪,想不出好的名字,就这样将就吧. 前面bio bounce过程,bio的切分和合并,request的获取是为IO请求下发做准备工作.当这些准备工作完成后,才进入到真正的IO下发过程 ...
BLOCK层代码分析（10）IO下发之IO下发函数总结
BLOCK层IO下发涉及直接下发,调度器,没有设置调度类型以及plug/unplug等,因此下发函数纷繁复杂,这里做介绍几个主要的函数. 前面介绍了函数blk_mq_try_issue_directl ...
mybatis和spring jdbc持久层框架事务支持分析
mybatis和spring jdbc持久层框架事务支持分析持久层框架中的事务支持指的是持久层框架如何支持数据库事务,我们先梳理出原生数据库事务操作的主线脉络,它是通过java.sql 包下的C ...
block io生命历程
作为存储业务的一个重要组成部分,block IO是非易失存储的唯一路径,它的生命历程每个阶段都直接关乎我们手机的性能.功耗.甚至寿命.本文试图通过block IO的产生.调度.下发.返回的4个阶段,阐 ...
Linux USB驱动框架分析【转】
转自:http://blog.chinaunix.net/uid-11848011-id-96188.html 初次接触与OS相关的设备驱动编写,感觉还挺有意思的,为了不至于忘掉看过的东西,笔记跟总结 ...
usb serial port 驱动_tty初探 — uart驱动框架分析
写在前面: 我们没有讲UART驱动,不过我们认为,只要系统学习了第2期,应该具备分析UART驱动的能力,小编做答疑几年以来,陆陆续续有不少人问到UART驱动怎么写,所以今天就分享一篇深度长文(1700 ...
LCD 设备驱动框架分析及核心结构
Linux 下很多东西都是和结构体相关,举个例子,时钟大家都知道吧,Linux 下对应时钟的东西就有好几个结构体,所以你要是想明白Linux 下那些东西,对结构体要有所了解,LCD 是基础的驱动设备, ...

block IO层框架分析2

block IO层框架分析2相关推荐

最新文章

热门文章