赞
踩
上文中,我们分析了调用submit_bio之后的流程。经过梳理可以看到,一个bio在经过层层转化、最终进入硬件派发队列hctx时,会调用该队列所绑定的驱动提供的queue_rq函数,这个函数是每一个硬件派发队列对应的驱动都必须提供的。
我们之前所研究的virtio部分是挂在pci总线上面的,然而在多数情况下,硬盘是通过scsi子系统被发现并管理的,磁盘驱动为scsi子系统中的一种高级驱动sd。
因此,根据我们现有的知识,在mq体系下,scsi一定向上层提供了对应的硬件派发队列以及对应的queue_rq函数。
我们通过简单地搜索,就能找到scsi所绑定的blk_mq_ops:
- //common/drivers/scsi/scsi_lib.c
- static const struct blk_mq_ops scsi_mq_ops = {
- .get_budget = scsi_mq_get_budget,
- .put_budget = scsi_mq_put_budget,
- .queue_rq = scsi_queue_rq,
- .commit_rqs = scsi_commit_rqs,
- .complete = scsi_complete,
- .timeout = scsi_timeout,
- #ifdef CONFIG_BLK_DEBUG_FS
- .show_rq = scsi_show_rq,
- #endif
- .init_request = scsi_mq_init_request,
- .exit_request = scsi_mq_exit_request,
- .cleanup_rq = scsi_cleanup_rq,
- .busy = scsi_mq_lld_busy,
- .map_queues = scsi_map_queues,
- .init_hctx = scsi_init_hctx,
- .poll = scsi_mq_poll,
- .set_rq_budget_token = scsi_mq_set_rq_budget_token,
- .get_rq_budget_token = scsi_mq_get_rq_budget_token,
- };

找到scsi_queue_rq函数:
- //common/drivers/scsi/scsi_lib.c
- static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
- const struct blk_mq_queue_data *bd)
- {
- struct request *req = bd->rq;
- struct request_queue *q = req->q;
- struct scsi_device *sdev = q->queuedata;
- struct Scsi_Host *shost = sdev->host;
- struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
- blk_status_t ret;
- int reason;
-
-
- WARN_ON_ONCE(cmd->budget_token < 0);
-
-
- /*
- * If the device is not in running state we will reject some or all
- * commands.
- */
- if (unlikely(sdev->sdev_state != SDEV_RUNNING)) {
- ret = scsi_device_state_check(sdev, req);
- if (ret != BLK_STS_OK)
- goto out_put_budget;
- }
-
-
- ret = BLK_STS_RESOURCE;
- if (!scsi_target_queue_ready(shost, sdev))
- goto out_put_budget;
- if (unlikely(scsi_host_in_recovery(shost))) {
- if (cmd->flags & SCMD_FAIL_IF_RECOVERING)
- ret = BLK_STS_OFFLINE;
- goto out_dec_target_busy;
- }
- if (!scsi_host_queue_ready(q, shost, sdev, cmd))
- goto out_dec_target_busy;
-
-
- if (!(req->rq_flags & RQF_DONTPREP)) {
- ret = scsi_prepare_cmd(req);
- if (ret != BLK_STS_OK)
- goto out_dec_host_busy;
- req->rq_flags |= RQF_DONTPREP;
- } else {
- clear_bit(SCMD_STATE_COMPLETE, &cmd->state);
- }
-
-
- cmd->flags &= SCMD_PRESERVED_FLAGS;
- if (sdev->simple_tags)
- cmd->flags |= SCMD_TAGGED;
- if (bd->last)
- cmd->flags |= SCMD_LAST;
-
-
- scsi_set_resid(cmd, 0);
- memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE);
- cmd->submitter = SUBMITTED_BY_BLOCK_LAYER;
-
-
- blk_mq_start_request(req);
- reason = scsi_dispatch_cmd(cmd);
- if (reason) {
- scsi_set_blocked(cmd, reason);
- ret = BLK_STS_RESOURCE;
- goto out_dec_host_busy;
- }
-
-
- atomic_inc(&cmd->device->iorequest_cnt);
- return BLK_STS_OK;
-
-
- out_dec_host_busy:
- scsi_dec_host_busy(shost, cmd);
- out_dec_target_busy:
- if (scsi_target(sdev)->can_queue > 0)
- atomic_dec(&scsi_target(sdev)->target_busy);
- out_put_budget:
- scsi_mq_put_budget(q, cmd->budget_token);
- cmd->budget_token = -1;
- switch (ret) {
- case BLK_STS_OK:
- break;
- case BLK_STS_RESOURCE:
- case BLK_STS_ZONE_RESOURCE:
- if (scsi_device_blocked(sdev))
- ret = BLK_STS_DEV_RESOURCE;
- break;
- case BLK_STS_AGAIN:
- cmd->result = DID_BUS_BUSY << 16;
- if (req->rq_flags & RQF_DONTPREP)
- scsi_mq_uninit_cmd(cmd);
- break;
- default:
- if (unlikely(!scsi_device_online(sdev)))
- cmd->result = DID_NO_CONNECT << 16;
- else
- cmd->result = DID_ERROR << 16;
- /*
- * Make sure to release all allocated resources when
- * we hit an error, as we will never see this command
- * again.
- */
- if (req->rq_flags & RQF_DONTPREP)
- scsi_mq_uninit_cmd(cmd);
- scsi_run_queue_async(sdev);
- break;
- }
- return ret;
- }

这里面虽然几乎没有和zone直接相关的内容(只出现了BLK_STS_ZONE_RESOURCE这一状态码),但是不要慌张。浅浅地回忆一下,进入到这个函数之后大致要完成的工作是:把队列中的request转化为对硬件的command,接着下发command到硬件,完成io。
也就是说,对于request的解析,一定是在command生成之前的。
在上面的代码中,调用scsi_prepare_cmd之前的部分是在做一些必要的检查(设备状态、target与host队列是否就绪等),确保队列、硬件处于正常工作的状态;接着,在对RQF_DONTPREP标志的判断分支里,出现了一个关键的函数scsi_prepare_cmd,顾名思义,command可能会在这个函数中进行初始化。
所以我们进入到这个函数:
- //common/drivers/scsi/scsi_lib.c
- static blk_status_t scsi_prepare_cmd(struct request *req)
- {
- struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
- struct scsi_device *sdev = req->q->queuedata;
- struct Scsi_Host *shost = sdev->host;
- bool in_flight = test_bit(SCMD_STATE_INFLIGHT, &cmd->state);
- struct scatterlist *sg;
-
-
- scsi_init_command(sdev, cmd);
-
-
- cmd->eh_eflags = 0;
- cmd->prot_type = 0;
- cmd->prot_flags = 0;
- cmd->submitter = 0;
- memset(&cmd->sdb, 0, sizeof(cmd->sdb));
- cmd->underflow = 0;
- cmd->transfersize = 0;
- cmd->host_scribble = NULL;
- cmd->result = 0;
- cmd->extra_len = 0;
- cmd->state = 0;
- if (in_flight)
- __set_bit(SCMD_STATE_INFLIGHT, &cmd->state);
-
-
- /*
- * Only clear the driver-private command data if the LLD does not supply
- * a function to initialize that data.
- */
- if (!shost->hostt->init_cmd_priv)
- memset(cmd + 1, 0, shost->hostt->cmd_size);
-
-
- cmd->prot_op = SCSI_PROT_NORMAL;
- if (blk_rq_bytes(req))
- cmd->sc_data_direction = rq_dma_dir(req);
- else
- cmd->sc_data_direction = DMA_NONE;
-
-
- sg = (void *)cmd + sizeof(struct scsi_cmnd) + shost->hostt->cmd_size;
- cmd->sdb.table.sgl = sg;
-
-
- if (scsi_host_get_prot(shost)) {
- memset(cmd->prot_sdb, 0, sizeof(struct scsi_data_buffer));
-
-
- cmd->prot_sdb->table.sgl =
- (struct scatterlist *)(cmd->prot_sdb + 1);
- }
-
-
- /*
- * Special handling for passthrough commands, which don't go to the ULP
- * at all:
- */
- if (blk_rq_is_passthrough(req))
- return scsi_setup_scsi_cmnd(sdev, req);
-
-
- if (sdev->handler && sdev->handler->prep_fn) {
- blk_status_t ret = sdev->handler->prep_fn(sdev, req);
-
-
- if (ret != BLK_STS_OK)
- return ret;
- }
-
-
- /* Usually overridden by the ULP */
- cmd->allowed = 0;
- memset(cmd->cmnd, 0, sizeof(cmd->cmnd));
- return scsi_cmd_to_driver(cmd)->init_command(cmd);
- }

要知道scsi层里面,高级驱动可不止sd一个,因此,我们可以猜测这个函数只是在做一些通用性的命令初始化,对于特异性的初始化,一定会转交sd驱动处理。所以直接看函数最后的return语句,它通过scsi_cmd_to_driver(cmd)调用了对应cmd绑定驱动的init_command函数。
让我们看看sd是如何处理的:
- //common/drivers/scsi/sd.c
- static struct scsi_driver sd_template = {
- .gendrv = {
- .name = "sd",
- .owner = THIS_MODULE,
- .probe = sd_probe,
- .probe_type = PROBE_PREFER_ASYNCHRONOUS,
- .remove = sd_remove,
- .shutdown = sd_shutdown,
- .pm = &sd_pm_ops,
- },
- .rescan = sd_rescan,
- .init_command = sd_init_command,
- .uninit_command = sd_uninit_command,
- .done = sd_done,
- .eh_action = sd_eh_action,
- .eh_reset = sd_eh_reset,
- };

上面的结构体描述了一个scsi_driver的一系列回调函数,关于scsi我们之后再系统的学习,这里我们先去了解它的init_command函数:
- //common/drivers/scsi/sd.c
- static blk_status_t sd_init_command(struct scsi_cmnd *cmd)
- {
- struct request *rq = scsi_cmd_to_rq(cmd);
-
-
- switch (req_op(rq)) {
- case REQ_OP_DISCARD:
- switch (scsi_disk(rq->q->disk)->provisioning_mode) {
- case SD_LBP_UNMAP:
- return sd_setup_unmap_cmnd(cmd);
- case SD_LBP_WS16:
- return sd_setup_write_same16_cmnd(cmd, true);
- case SD_LBP_WS10:
- return sd_setup_write_same10_cmnd(cmd, true);
- case SD_LBP_ZERO:
- return sd_setup_write_same10_cmnd(cmd, false);
- default:
- return BLK_STS_TARGET;
- }
- case REQ_OP_WRITE_ZEROES:
- return sd_setup_write_zeroes_cmnd(cmd);
- case REQ_OP_FLUSH:
- return sd_setup_flush_cmnd(cmd);
- case REQ_OP_READ:
- case REQ_OP_WRITE:
- case REQ_OP_ZONE_APPEND:
- return sd_setup_read_write_cmnd(cmd);
- case REQ_OP_ZONE_RESET:
- return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_RESET_WRITE_POINTER,
- false);
- case REQ_OP_ZONE_RESET_ALL:
- return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_RESET_WRITE_POINTER,
- true);
- case REQ_OP_ZONE_OPEN:
- return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_OPEN_ZONE, false);
- case REQ_OP_ZONE_CLOSE:
- return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_CLOSE_ZONE, false);
- case REQ_OP_ZONE_FINISH:
- return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_FINISH_ZONE, false);
- default:
- WARN_ON_ONCE(1);
- return BLK_STS_NOTSUPP;
- }
- }

到这里之后,就完成了将request转化为command的操作。在此处,REQ_OP用于区分不同的操作类型,从而为每种操作生成对应的SCSI命令。
之后,系统会在scsi_dispatch_cmd函数中,将准备好的command下发到低层驱动。
- static int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
- {
- struct Scsi_Host *host = cmd->device->host;
- int rtn = 0;
-
-
- atomic_inc(&cmd->device->iorequest_cnt);
-
-
- /* check if the device is still usable */
- if (unlikely(cmd->device->sdev_state == SDEV_DEL)) {
- /* in SDEV_DEL we error all commands. DID_NO_CONNECT
- * returns an immediate error upwards, and signals
- * that the device is no longer present */
- cmd->result = DID_NO_CONNECT << 16;
- goto done;
- }
-
-
- /* Check to see if the scsi lld made this device blocked. */
- if (unlikely(scsi_device_blocked(cmd->device))) {
- /*
- * in blocked state, the command is just put back on
- * the device queue. The suspend state has already
- * blocked the queue so future requests should not
- * occur until the device transitions out of the
- * suspend state.
- */
- SCSI_LOG_MLQUEUE(3, scmd_printk(KERN_INFO, cmd,
- "queuecommand : device blocked\n"));
- return SCSI_MLQUEUE_DEVICE_BUSY;
- }
-
-
- /* Store the LUN value in cmnd, if needed. */
- if (cmd->device->lun_in_cdb)
- cmd->cmnd[1] = (cmd->cmnd[1] & 0x1f) |
- (cmd->device->lun << 5 & 0xe0);
-
-
- scsi_log_send(cmd);
-
-
- /*
- * Before we queue this command, check if the command
- * length exceeds what the host adapter can handle.
- */
- if (cmd->cmd_len > cmd->device->host->max_cmd_len) {
- SCSI_LOG_MLQUEUE(3, scmd_printk(KERN_INFO, cmd,
- "queuecommand : command too long. "
- "cdb_size=%d host->max_cmd_len=%d\n",
- cmd->cmd_len, cmd->device->host->max_cmd_len));
- cmd->result = (DID_ABORT << 16);
- goto done;
- }
-
-
- if (unlikely(host->shost_state == SHOST_DEL)) {
- cmd->result = (DID_NO_CONNECT << 16);
- goto done;
-
-
- }
-
-
- trace_scsi_dispatch_cmd_start(cmd);
- rtn = host->hostt->queuecommand(host, cmd);
- if (rtn) {
- trace_scsi_dispatch_cmd_error(cmd, rtn);
- if (rtn != SCSI_MLQUEUE_DEVICE_BUSY &&
- rtn != SCSI_MLQUEUE_TARGET_BUSY)
- rtn = SCSI_MLQUEUE_HOST_BUSY;
-
-
- SCSI_LOG_MLQUEUE(3, scmd_printk(KERN_INFO, cmd,
- "queuecommand : request rejected\n"));
- }
-
-
- return rtn;
- done:
- cmd->scsi_done(cmd);
- return 0;
- }

queuecommand是低层驱动中一个最关键的函数,将在之后学习scsi的时候再细说。命令处理完成后,cmd->scsi_done函数会被调用,向上层报告结果;在上面scsi_dispatch_cmd的done标签处,命令未能下发时也会直接调用它。在较早的内核版本中,这个函数指针是在scsi_queue_rq函数中被设置为scsi_mq_done的(上文引用的scsi_queue_rq来自较新的版本,其中已看不到这条赋值语句):
cmd->scsi_done = scsi_mq_done;
- //common/drivers/scsi/scsi_lib.c
- static void scsi_mq_done(struct scsi_cmnd *cmd)
- {
- if (unlikely(blk_should_fake_timeout(scsi_cmd_to_rq(cmd)->q)))
- return;
- if (unlikely(test_and_set_bit(SCMD_STATE_COMPLETE, &cmd->state)))
- return;
- trace_scsi_dispatch_cmd_done(cmd);
- blk_mq_complete_request(scsi_cmd_to_rq(cmd));
- }
- //common/block/blk-mq.c
- void blk_mq_complete_request(struct request *rq)
- {
- if (!blk_mq_complete_request_remote(rq))
- rq->q->mq_ops->complete(rq);
- }
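最终调用了mq_ops里的complete函数,对scsi而言即前面scsi_mq_ops中注册的scsi_complete。
scsi_complete在命令正常完成后,会经由scsi_finish_command调用高级驱动的.done回调(对sd来说是sd_done),而sd_done在磁盘为zoned设备时会进一步调用sd_zbc.c中提供的sd_zbc_complete: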
- //common/drivers/scsi/sd_zbc.c
- unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes,
- struct scsi_sense_hdr *sshdr)
- {
- int result = cmd->result;
- struct request *rq = scsi_cmd_to_rq(cmd);
-
-
- if (op_is_zone_mgmt(req_op(rq)) &&
- result &&
- sshdr->sense_key == ILLEGAL_REQUEST &&
- sshdr->asc == 0x24) {
- /*
- * INVALID FIELD IN CDB error: a zone management command was
- * attempted on a conventional zone. Nothing to worry about,
- * so be quiet about the error. 对传统zone执行zone管理命令会失败,这属于正常情况,因此只需静默该错误
- */
- rq->rq_flags |= RQF_QUIET;
- } else if (sd_zbc_need_zone_wp_update(rq))
- good_bytes = sd_zbc_zone_wp_update(cmd, good_bytes);
- // 更新缓存的zone写指针偏移
- if (req_op(rq) == REQ_OP_ZONE_APPEND)
- blk_req_zone_write_unlock(rq);
- // 对于写追加操作的特殊处理
- return good_bytes;
- }

完成函数中,根据下层返回的结果,移动写指针到正确的位置上。
- static unsigned int sd_zbc_zone_wp_update(struct scsi_cmnd *cmd,
- unsigned int good_bytes)
- {
- int result = cmd->result;
- struct request *rq = scsi_cmd_to_rq(cmd);
- struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
- unsigned int zno = blk_rq_zone_no(rq);
- enum req_opf op = req_op(rq);
- unsigned long flags;
-
-
- /*
- * If we got an error for a command that needs updating the write
- * pointer offset cache, we must mark the zone wp offset entry as
- * invalid to force an update from disk the next time a zone append
- * command is issued.
- */
- spin_lock_irqsave(&sdkp->zones_wp_offset_lock, flags);
-
-
- if (result && op != REQ_OP_ZONE_RESET_ALL) {
- if (op == REQ_OP_ZONE_APPEND) {
- /* Force complete completion (no retry) */
- good_bytes = 0;
- scsi_set_resid(cmd, blk_rq_bytes(rq));
- }
-
-
- /*
- * Force an update of the zone write pointer offset on
- * the next zone append access.
- */
- if (sdkp->zones_wp_offset[zno] != SD_ZBC_UPDATING_WP_OFST)
- sdkp->zones_wp_offset[zno] = SD_ZBC_INVALID_WP_OFST;
- goto unlock_wp_offset;
- }
-
-
- switch (op) {
- case REQ_OP_ZONE_APPEND:
- rq->__sector += sdkp->zones_wp_offset[zno];
- fallthrough;
- case REQ_OP_WRITE_ZEROES:
- case REQ_OP_WRITE_SAME:
- case REQ_OP_WRITE:
- if (sdkp->zones_wp_offset[zno] < sd_zbc_zone_sectors(sdkp))
- sdkp->zones_wp_offset[zno] +=
- good_bytes >> SECTOR_SHIFT;
- break;
- case REQ_OP_ZONE_RESET:
- sdkp->zones_wp_offset[zno] = 0;
- break;
- case REQ_OP_ZONE_FINISH:
- sdkp->zones_wp_offset[zno] = sd_zbc_zone_sectors(sdkp);
- break;
- case REQ_OP_ZONE_RESET_ALL:
- memset(sdkp->zones_wp_offset, 0,
- sdkp->nr_zones * sizeof(unsigned int));
- break;
- default:
- break;
- }
-
-
- unlock_wp_offset:
- spin_unlock_irqrestore(&sdkp->zones_wp_offset_lock, flags);
-
-
- return good_bytes;
- }

移动指针的函数并不难理解,根据执行的指令进行计算,修改写指针偏移即可。
写到这里,我们可以得出一个zone相关命令的完整流程。
所以,根据以上流程,如果一个设备支持zone且已经被scsi正确适配,要使其处理bio层下发的request,以下回调函数不可或缺:
低层驱动提供给scsi层的.queuecommand函数,它要能接受zone相关命令,并实际完成读写后向上层返回结果;
对应驱动的.init_command函数,它要能正确地生成低层设备能够处理的命令;
mq_ops中的.complete函数以及它最终调用的高级驱动.done函数(sd中为sd_done → sd_zbc_complete),它们要能在低层设备处理完成后,成功移动写指针到对应位置。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。