当前位置:   article > 正文

Linux kernel | scsi层是如何处理zone相关request的

scsi_mq_init_request

上文中,我们将提交了submit_bio后的流程进行了分析,经过梳理之后,一个submit_bio在经过层层转化,最后进入到硬件派发队列hctx时,会调用硬件绑定的驱动所提供的queue_rq函数,这个函数是每一个硬件派发队列对应驱动都必须提供的。

outside_default.png

我们之前所研究的virtio部分是挂在pci总线上面的,然而在多数情况下,硬盘是通过scsi子系统被发现并管理的,磁盘驱动为scsi子系统中的一种高级驱动sd。

因此,根据我们现有的知识,在mq体系下,scsi一定向上层提供了对应的硬件派发队列以及对应的queue_rq函数。

我们通过简单的搜索,就能找到scsi所绑定的blk_mq_ops:

  1. //common/drivers/scsi/scsi_lib.c
  2. static const struct blk_mq_ops scsi_mq_ops = { // blk-mq callback table used for SCSI request queues
  3. .get_budget = scsi_mq_get_budget,
  4. .put_budget = scsi_mq_put_budget,
  5. .queue_rq = scsi_queue_rq, // dispatch entry point: turns a request into a SCSI command and issues it
  6. .commit_rqs = scsi_commit_rqs,
  7. .complete = scsi_complete, // reached through rq->q->mq_ops->complete() in blk_mq_complete_request()
  8. .timeout = scsi_timeout,
  9. #ifdef CONFIG_BLK_DEBUG_FS
  10. .show_rq = scsi_show_rq, // only present when CONFIG_BLK_DEBUG_FS is enabled
  11. #endif
  12. .init_request = scsi_mq_init_request,
  13. .exit_request = scsi_mq_exit_request,
  14. .cleanup_rq = scsi_cleanup_rq,
  15. .busy = scsi_mq_lld_busy,
  16. .map_queues = scsi_map_queues,
  17. .init_hctx = scsi_init_hctx,
  18. .poll = scsi_mq_poll,
  19. .set_rq_budget_token = scsi_mq_set_rq_budget_token,
  20. .get_rq_budget_token = scsi_mq_get_rq_budget_token,
  21. };

找到scsi_queue_rq函数:

  1. //common/drivers/scsi/scsi_lib.c
  2. static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, // .queue_rq callback: check readiness, prepare the command, dispatch it
  3. const struct blk_mq_queue_data *bd)
  4. {
  5. struct request *req = bd->rq;
  6. struct request_queue *q = req->q;
  7. struct scsi_device *sdev = q->queuedata;
  8. struct Scsi_Host *shost = sdev->host;
  9. struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
  10. blk_status_t ret;
  11. int reason;
  12. WARN_ON_ONCE(cmd->budget_token < 0); // a budget token is expected to be held on entry
  13. /*
  14. * If the device is not in running state we will reject some or all
  15. * commands.
  16. */
  17. if (unlikely(sdev->sdev_state != SDEV_RUNNING)) {
  18. ret = scsi_device_state_check(sdev, req);
  19. if (ret != BLK_STS_OK)
  20. goto out_put_budget;
  21. }
  22. ret = BLK_STS_RESOURCE; // default status for the "not ready" bail-outs below
  23. if (!scsi_target_queue_ready(shost, sdev))
  24. goto out_put_budget;
  25. if (unlikely(scsi_host_in_recovery(shost))) {
  26. if (cmd->flags & SCMD_FAIL_IF_RECOVERING)
  27. ret = BLK_STS_OFFLINE; // this command asked to fail rather than wait for recovery
  28. goto out_dec_target_busy;
  29. }
  30. if (!scsi_host_queue_ready(q, shost, sdev, cmd))
  31. goto out_dec_target_busy;
  32. if (!(req->rq_flags & RQF_DONTPREP)) { // not prepared yet: build the command once
  33. ret = scsi_prepare_cmd(req);
  34. if (ret != BLK_STS_OK)
  35. goto out_dec_host_busy;
  36. req->rq_flags |= RQF_DONTPREP; // remember that preparation is done
  37. } else {
  38. clear_bit(SCMD_STATE_COMPLETE, &cmd->state); // already prepared: only clear the COMPLETE state bit
  39. }
  40. cmd->flags &= SCMD_PRESERVED_FLAGS;
  41. if (sdev->simple_tags)
  42. cmd->flags |= SCMD_TAGGED;
  43. if (bd->last)
  44. cmd->flags |= SCMD_LAST;
  45. scsi_set_resid(cmd, 0);
  46. memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE);
  47. cmd->submitter = SUBMITTED_BY_BLOCK_LAYER;
  48. blk_mq_start_request(req);
  49. reason = scsi_dispatch_cmd(cmd); // non-zero means the command was not accepted
  50. if (reason) {
  51. scsi_set_blocked(cmd, reason);
  52. ret = BLK_STS_RESOURCE;
  53. goto out_dec_host_busy;
  54. }
  55. atomic_inc(&cmd->device->iorequest_cnt);
  56. return BLK_STS_OK;
  57. out_dec_host_busy: // unwinding: each label undoes one step and falls through to the next
  58. scsi_dec_host_busy(shost, cmd);
  59. out_dec_target_busy:
  60. if (scsi_target(sdev)->can_queue > 0)
  61. atomic_dec(&scsi_target(sdev)->target_busy);
  62. out_put_budget:
  63. scsi_mq_put_budget(q, cmd->budget_token);
  64. cmd->budget_token = -1; // mark the token as no longer held
  65. switch (ret) {
  66. case BLK_STS_OK:
  67. break;
  68. case BLK_STS_RESOURCE:
  69. case BLK_STS_ZONE_RESOURCE:
  70. if (scsi_device_blocked(sdev))
  71. ret = BLK_STS_DEV_RESOURCE; // device is blocked: report a device-level resource shortage instead
  72. break;
  73. case BLK_STS_AGAIN:
  74. cmd->result = DID_BUS_BUSY << 16;
  75. if (req->rq_flags & RQF_DONTPREP)
  76. scsi_mq_uninit_cmd(cmd);
  77. break;
  78. default:
  79. if (unlikely(!scsi_device_online(sdev)))
  80. cmd->result = DID_NO_CONNECT << 16;
  81. else
  82. cmd->result = DID_ERROR << 16;
  83. /*
  84. * Make sure to release all allocated resources when
  85. * we hit an error, as we will never see this command
  86. * again.
  87. */
  88. if (req->rq_flags & RQF_DONTPREP)
  89. scsi_mq_uninit_cmd(cmd);
  90. scsi_run_queue_async(sdev);
  91. break;
  92. }
  93. return ret;
  94. }

这里面虽然几乎没有和zone直接相关的内容(只有对BLK_STS_ZONE_RESOURCE状态的处理),但是不要慌张,浅浅地回忆一下,进入到这个函数之后大致要完成的工作是,把队列中的request再转化为对硬件的command,接着下发command到硬件,完成io。

也就是说,对于request的解析,一定是在command生成之前的。

在上面代码的32行之前,是在做一些必要的检查,确保队列、硬件处于正常工作的状态,接着在33行,出现一个关键的函数scsi_prepare_cmd,顾名思义,command可能会在这个函数中进行初始化。

所以我们进入到这个函数:

  1. //common/drivers/scsi/scsi_lib.c
  2. static blk_status_t scsi_prepare_cmd(struct request *req) // reset the request's scsi_cmnd, then let the owning driver fill it in
  3. {
  4. struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
  5. struct scsi_device *sdev = req->q->queuedata;
  6. struct Scsi_Host *shost = sdev->host;
  7. bool in_flight = test_bit(SCMD_STATE_INFLIGHT, &cmd->state); // saved so it survives the cmd->state reset below
  8. struct scatterlist *sg;
  9. scsi_init_command(sdev, cmd);
  10. cmd->eh_eflags = 0;
  11. cmd->prot_type = 0;
  12. cmd->prot_flags = 0;
  13. cmd->submitter = 0;
  14. memset(&cmd->sdb, 0, sizeof(cmd->sdb));
  15. cmd->underflow = 0;
  16. cmd->transfersize = 0;
  17. cmd->host_scribble = NULL;
  18. cmd->result = 0;
  19. cmd->extra_len = 0;
  20. cmd->state = 0;
  21. if (in_flight)
  22. __set_bit(SCMD_STATE_INFLIGHT, &cmd->state); // restore the bit saved above
  23. /*
  24. * Only clear the driver-private command data if the LLD does not supply
  25. * a function to initialize that data.
  26. */
  27. if (!shost->hostt->init_cmd_priv)
  28. memset(cmd + 1, 0, shost->hostt->cmd_size); // private data starts right after the scsi_cmnd
  29. cmd->prot_op = SCSI_PROT_NORMAL;
  30. if (blk_rq_bytes(req))
  31. cmd->sc_data_direction = rq_dma_dir(req);
  32. else
  33. cmd->sc_data_direction = DMA_NONE; // zero-length request: no data transfer
  34. sg = (void *)cmd + sizeof(struct scsi_cmnd) + shost->hostt->cmd_size; // scatterlist follows the scsi_cmnd and the cmd_size private bytes
  35. cmd->sdb.table.sgl = sg;
  36. if (scsi_host_get_prot(shost)) {
  37. memset(cmd->prot_sdb, 0, sizeof(struct scsi_data_buffer));
  38. cmd->prot_sdb->table.sgl =
  39. (struct scatterlist *)(cmd->prot_sdb + 1);
  40. }
  41. /*
  42. * Special handling for passthrough commands, which don't go to the ULP
  43. * at all:
  44. */
  45. if (blk_rq_is_passthrough(req))
  46. return scsi_setup_scsi_cmnd(sdev, req);
  47. if (sdev->handler && sdev->handler->prep_fn) { // optional handler hook; a failure aborts the preparation
  48. blk_status_t ret = sdev->handler->prep_fn(sdev, req);
  49. if (ret != BLK_STS_OK)
  50. return ret;
  51. }
  52. /* Usually overridden by the ULP */
  53. cmd->allowed = 0;
  54. memset(cmd->cmnd, 0, sizeof(cmd->cmnd));
  55. return scsi_cmd_to_driver(cmd)->init_command(cmd); // driver-specific setup, e.g. sd_init_command for sd
  56. }

要知道scsi层里面,高级驱动可不止sd一个,因此,我们可以猜测这个函数只是在做一些通用性的命令初始化,对于特异性的初始化,一定会转交sd驱动处理,所以直接看代码的55行,调用了对应cmd绑定驱动的init_command函数。

让我们看看sd是如何处理的:

  1. //common/drivers/scsi/sd.c
  2. static struct scsi_driver sd_template = { // callback set of the sd (SCSI disk) driver
  3. .gendrv = {
  4. .name = "sd",
  5. .owner = THIS_MODULE,
  6. .probe = sd_probe,
  7. .probe_type = PROBE_PREFER_ASYNCHRONOUS,
  8. .remove = sd_remove,
  9. .shutdown = sd_shutdown,
  10. .pm = &sd_pm_ops,
  11. },
  12. .rescan = sd_rescan,
  13. .init_command = sd_init_command, // called from scsi_prepare_cmd() via scsi_cmd_to_driver(cmd)->init_command
  14. .uninit_command = sd_uninit_command,
  15. .done = sd_done,
  16. .eh_action = sd_eh_action,
  17. .eh_reset = sd_eh_reset,
  18. };

上面的结构体描述了一个scsi_driver的一系列回调函数,关于scsi我们之后再系统地学习,这里我们先去了解它的init_command函数:

  1. //common/drivers/scsi/sd.c
  2. static blk_status_t sd_init_command(struct scsi_cmnd *cmd) // pick the command setup routine matching the request's operation
  3. {
  4. struct request *rq = scsi_cmd_to_rq(cmd);
  5. switch (req_op(rq)) {
  6. case REQ_OP_DISCARD:
  7. switch (scsi_disk(rq->q->disk)->provisioning_mode) { // discard setup depends on the disk's provisioning mode
  8. case SD_LBP_UNMAP:
  9. return sd_setup_unmap_cmnd(cmd);
  10. case SD_LBP_WS16:
  11. return sd_setup_write_same16_cmnd(cmd, true);
  12. case SD_LBP_WS10:
  13. return sd_setup_write_same10_cmnd(cmd, true);
  14. case SD_LBP_ZERO:
  15. return sd_setup_write_same10_cmnd(cmd, false);
  16. default:
  17. return BLK_STS_TARGET; // any other provisioning mode: discard is rejected
  18. }
  19. case REQ_OP_WRITE_ZEROES:
  20. return sd_setup_write_zeroes_cmnd(cmd);
  21. case REQ_OP_FLUSH:
  22. return sd_setup_flush_cmnd(cmd);
  23. case REQ_OP_READ:
  24. case REQ_OP_WRITE:
  25. case REQ_OP_ZONE_APPEND: // zone append shares the read/write setup path
  26. return sd_setup_read_write_cmnd(cmd);
  27. case REQ_OP_ZONE_RESET: // zone management ops share one helper; the last argument is true only for "reset all"
  28. return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_RESET_WRITE_POINTER,
  29. false);
  30. case REQ_OP_ZONE_RESET_ALL:
  31. return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_RESET_WRITE_POINTER,
  32. true);
  33. case REQ_OP_ZONE_OPEN:
  34. return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_OPEN_ZONE, false);
  35. case REQ_OP_ZONE_CLOSE:
  36. return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_CLOSE_ZONE, false);
  37. case REQ_OP_ZONE_FINISH:
  38. return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_FINISH_ZONE, false);
  39. default: // unhandled operation: warn once and report "not supported"
  40. WARN_ON_ONCE(1);
  41. return BLK_STS_NOTSUPP;
  42. }
  43. }

到这里之后,完成了将request转化为command的操作。在此处,REQ_OP的作用是区分不同的操作类型,使每种操作都能生成正确的command。

outside_default.png

之后,系统会在scsi_dispatch_cmd函数中,将准备好的command下发到低层驱动。

  1. static int scsi_dispatch_cmd(struct scsi_cmnd *cmd) // returns 0 if queued to the LLD or completed with an error, else a SCSI_MLQUEUE_* busy code
  2. {
  3. struct Scsi_Host *host = cmd->device->host;
  4. int rtn = 0;
  5. atomic_inc(&cmd->device->iorequest_cnt);
  6. /* check if the device is still usable */
  7. if (unlikely(cmd->device->sdev_state == SDEV_DEL)) {
  8. /* in SDEV_DEL we error all commands. DID_NO_CONNECT
  9. * returns an immediate error upwards, and signals
  10. * that the device is no longer present */
  11. cmd->result = DID_NO_CONNECT << 16;
  12. goto done;
  13. }
  14. /* Check to see if the scsi lld made this device blocked. */
  15. if (unlikely(scsi_device_blocked(cmd->device))) {
  16. /*
  17. * in blocked state, the command is just put back on
  18. * the device queue. The suspend state has already
  19. * blocked the queue so future requests should not
  20. * occur until the device transitions out of the
  21. * suspend state.
  22. */
  23. SCSI_LOG_MLQUEUE(3, scmd_printk(KERN_INFO, cmd,
  24. "queuecommand : device blocked\n"));
  25. return SCSI_MLQUEUE_DEVICE_BUSY;
  26. }
  27. /* Store the LUN value in cmnd, if needed. */
  28. if (cmd->device->lun_in_cdb)
  29. cmd->cmnd[1] = (cmd->cmnd[1] & 0x1f) | // keep the low 5 bits, put the LUN into the top 3 bits
  30. (cmd->device->lun << 5 & 0xe0);
  31. scsi_log_send(cmd);
  32. /*
  33. * Before we queue this command, check if the command
  34. * length exceeds what the host adapter can handle.
  35. */
  36. if (cmd->cmd_len > cmd->device->host->max_cmd_len) {
  37. SCSI_LOG_MLQUEUE(3, scmd_printk(KERN_INFO, cmd,
  38. "queuecommand : command too long. "
  39. "cdb_size=%d host->max_cmd_len=%d\n",
  40. cmd->cmd_len, cmd->device->host->max_cmd_len));
  41. cmd->result = (DID_ABORT << 16);
  42. goto done;
  43. }
  44. if (unlikely(host->shost_state == SHOST_DEL)) {
  45. cmd->result = (DID_NO_CONNECT << 16);
  46. goto done;
  47. }
  48. trace_scsi_dispatch_cmd_start(cmd);
  49. rtn = host->hostt->queuecommand(host, cmd); // hand the command to the low-level driver
  50. if (rtn) {
  51. trace_scsi_dispatch_cmd_error(cmd, rtn);
  52. if (rtn != SCSI_MLQUEUE_DEVICE_BUSY && // any other non-zero code is normalised to HOST_BUSY
  53. rtn != SCSI_MLQUEUE_TARGET_BUSY)
  54. rtn = SCSI_MLQUEUE_HOST_BUSY;
  55. SCSI_LOG_MLQUEUE(3, scmd_printk(KERN_INFO, cmd,
  56. "queuecommand : request rejected\n"));
  57. }
  58. return rtn;
  59. done: // error paths: complete the command right away with the result set above
  60. cmd->scsi_done(cmd);
  61. return 0;
  62. }

queuecommand是低层驱动中最关键的一个函数,将在之后学习scsi的时候再细说。低层驱动处理完命令后,会调用cmd->scsi_done向上层报告结果;scsi_dispatch_cmd在出错路径(done标签处)也会直接调用它。在较早版本的内核中,这个回调是在scsi_queue_rq函数里被设置为scsi_mq_done的(上文所贴的scsi_queue_rq版本中已经看不到这条赋值):

cmd->scsi_done = scsi_mq_done;
  1. //common/drivers/scsi/scsi_lib.c
  2. static void scsi_mq_done(struct scsi_cmnd *cmd) // completion callback: passes the finished command back to blk-mq
  3. {
  4. if (unlikely(blk_should_fake_timeout(scsi_cmd_to_rq(cmd)->q)))
  5. return; // completion deliberately dropped when a fake timeout is requested for this queue
  6. if (unlikely(test_and_set_bit(SCMD_STATE_COMPLETE, &cmd->state)))
  7. return; // COMPLETE bit was already set: do not complete the same command twice
  8. trace_scsi_dispatch_cmd_done(cmd);
  9. blk_mq_complete_request(scsi_cmd_to_rq(cmd));
  10. }
  1. //common/block/blk-mq.c
  2. void blk_mq_complete_request(struct request *rq) // complete rq through the queue's mq_ops->complete callback
  3. {
  4. if (!blk_mq_complete_request_remote(rq)) // false: run the callback here; presumably true means it is handled elsewhere -- confirm
  5. rq->q->mq_ops->complete(rq);
  6. }

最终调用了mq_ops里的complete函数。

scsi的complete回调(即上文scsi_mq_ops中的scsi_complete)在处理zoned设备的完成时,最终会用到sd_zbc.c中提供的sd_zbc_complete:

  1. //common/drivers/scsi/sd_zbc.c
  2. unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes, // zone-specific completion handling; returns the (possibly adjusted) good_bytes
  3. struct scsi_sense_hdr *sshdr)
  4. {
  5. int result = cmd->result;
  6. struct request *rq = scsi_cmd_to_rq(cmd);
  7. if (op_is_zone_mgmt(req_op(rq)) &&
  8. result &&
  9. sshdr->sense_key == ILLEGAL_REQUEST &&
  10. sshdr->asc == 0x24) {
  11. /*
  12. * INVALID FIELD IN CDB error: a zone management command was
  13. * attempted on a conventional zone. Nothing to worry about,
  14. * so be quiet about the error. The failure is only silenced (RQF_QUIET), not turned into success.
  15. */
  16. rq->rq_flags |= RQF_QUIET;
  17. } else if (sd_zbc_need_zone_wp_update(rq))
  18. good_bytes = sd_zbc_zone_wp_update(cmd, good_bytes);
  19. // the line above updates the cached zone write pointer offset
  20. if (req_op(rq) == REQ_OP_ZONE_APPEND)
  21. blk_req_zone_write_unlock(rq);
  22.     // special handling for zone append: the zone write lock is dropped here
  23. return good_bytes;
  24. }

完成函数中,根据下层返回的结果,移动写指针到正确的位置上。

  1. static unsigned int sd_zbc_zone_wp_update(struct scsi_cmnd *cmd, // update the cached per-zone write pointer offset after a command; returns good_bytes
  2. unsigned int good_bytes)
  3. {
  4. int result = cmd->result;
  5. struct request *rq = scsi_cmd_to_rq(cmd);
  6. struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
  7. unsigned int zno = blk_rq_zone_no(rq);
  8. enum req_opf op = req_op(rq);
  9. unsigned long flags;
  10. /*
  11. * If we got an error for a command that needs updating the write
  12. * pointer offset cache, we must mark the zone wp offset entry as
  13. * invalid to force an update from disk the next time a zone append
  14. * command is issued.
  15. */
  16. spin_lock_irqsave(&sdkp->zones_wp_offset_lock, flags); // all zones_wp_offset[] accesses below happen under this lock
  17. if (result && op != REQ_OP_ZONE_RESET_ALL) {
  18. if (op == REQ_OP_ZONE_APPEND) {
  19. /* Force complete completion (no retry) */
  20. good_bytes = 0;
  21. scsi_set_resid(cmd, blk_rq_bytes(rq)); // report the whole request as not transferred
  22. }
  23. /*
  24. * Force an update of the zone write pointer offset on
  25. * the next zone append access.
  26. */
  27. if (sdkp->zones_wp_offset[zno] != SD_ZBC_UPDATING_WP_OFST) // leave an entry marked UPDATING untouched
  28. sdkp->zones_wp_offset[zno] = SD_ZBC_INVALID_WP_OFST;
  29. goto unlock_wp_offset;
  30. }
  31. switch (op) {
  32. case REQ_OP_ZONE_APPEND:
  33. rq->__sector += sdkp->zones_wp_offset[zno]; // add the cached offset to the request's sector; presumably yields where the data was written -- confirm
  34. fallthrough;
  35. case REQ_OP_WRITE_ZEROES:
  36. case REQ_OP_WRITE_SAME:
  37. case REQ_OP_WRITE:
  38. if (sdkp->zones_wp_offset[zno] < sd_zbc_zone_sectors(sdkp)) // advance only while the offset is below the zone size
  39. sdkp->zones_wp_offset[zno] +=
  40. good_bytes >> SECTOR_SHIFT; // bytes converted to sectors
  41. break;
  42. case REQ_OP_ZONE_RESET:
  43. sdkp->zones_wp_offset[zno] = 0; // reset: offset back to the zone start
  44. break;
  45. case REQ_OP_ZONE_FINISH:
  46. sdkp->zones_wp_offset[zno] = sd_zbc_zone_sectors(sdkp); // finish: offset set to the zone size
  47. break;
  48. case REQ_OP_ZONE_RESET_ALL:
  49. memset(sdkp->zones_wp_offset, 0, // reset all: clear every zone's offset
  50. sdkp->nr_zones * sizeof(unsigned int));
  51. break;
  52. default:
  53. break;
  54. }
  55. unlock_wp_offset:
  56. spin_unlock_irqrestore(&sdkp->zones_wp_offset_lock, flags);
  57. return good_bytes;
  58. }

移动指针的函数并不难理解,根据执行的指令进行计算,修改写指针偏移即可。

写到这里,我们可以得出一个zone相关命令的完整流程。

outside_default.png

所以,根据以上流程,如果一个设备支持zone且已经被scsi正确适配,要使其处理bio层下发的request,以下回调函数不可或缺:

  1. 低层驱动(host template)提供的.queuecommand函数,它要能接受zone相关命令,并实际完成读写后向上层返回结果;

  2. 对应驱动的.init_command函数,它要能正确地生成低层设备能够处理的命令;

  3. mq_ops中的.complete函数,它要能在低层设备处理完成后,成功移动写指针到对应位置。

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/从前慢现在也慢/article/detail/217374
推荐阅读
相关标签
  

闽ICP备14008679号