赞
踩
对SSD进行trim,是通过ioctl发送BLKDISCARD命令完成的。
下面是block层ioctl的调用栈。
sys_ioctl->do_vfs_ioctl->vfs_ioctl->block_ioctl->blkdev_ioctl->blk_ioctl_discard->blkdev_issue_discard->submit_bio->...->__blk_run_queue->scsi_request_fn->....
具体通过代码讲解BLKDISCARD执行流程。
/*
 * vfs_ioctl - hand an ioctl to the file's unlocked_ioctl handler.
 * @filp: open file the ioctl was issued on
 * @cmd:  ioctl command number
 * @arg:  command argument, passed through untouched
 *
 * Returns -ENOTTY when the file has no unlocked_ioctl handler or when the
 * handler answers -ENOIOCTLCMD; otherwise returns the handler's result.
 *
 * For a block device file f_op is def_blk_fops, so the handler invoked
 * here is block_ioctl.
 */
static long vfs_ioctl(struct file *filp, unsigned int cmd,
		      unsigned long arg)
{
	int rc;

	/* No handler registered: report "not a valid ioctl for this file". */
	if (!filp->f_op->unlocked_ioctl)
		return -ENOTTY;

	rc = filp->f_op->unlocked_ioctl(filp, cmd, arg);

	/* -ENOIOCTLCMD is internal; callers see -ENOTTY instead. */
	return (rc == -ENOIOCTLCMD) ? -ENOTTY : rc;
}
/*
 * block_ioctl - unlocked_ioctl entry point for block device files.
 * @file: open block device file
 * @cmd:  ioctl command number
 * @arg:  command argument
 *
 * Looks up the block_device behind the file, derives the access mode and
 * forwards everything to blkdev_ioctl(), whose result is returned.
 */
static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
	struct block_device *bdev = I_BDEV(file->f_mapping->host);
	/*
	 * O_NDELAY can be changed with fcntl(.., F_SETFL, ..) at any time, so
	 * FMODE_NDELAY is recomputed from f_flags on every ioctl: start with
	 * the bit cleared and set it only if O_NDELAY is currently on.
	 */
	fmode_t mode = file->f_mode & ~FMODE_NDELAY;

	if (file->f_flags & O_NDELAY)
		mode |= FMODE_NDELAY;

	return blkdev_ioctl(bdev, mode, cmd, arg);
}
/*
 * blkdev_ioctl - block device ioctl dispatcher (excerpt).
 *
 * Only the BLKDISCARD / BLKSECDISCARD cases are reproduced; the rest of
 * the function, including the switch header, is elided ("....").
 *
 * For these two commands:
 *   - the device must have been opened for writing, else -EBADF;
 *   - arg is a user pointer to two uint64_t values, copied into range[];
 *     a failed copy returns -EFAULT;
 *   - range[0] and range[1] are passed to blk_ioctl_discard() as its
 *     start and len arguments, with the last argument non-zero only for
 *     BLKSECDISCARD.
 */
int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
unsigned long arg)
{
....
case BLKDISCARD: /* a BLKDISCARD command takes this branch (shared with BLKSECDISCARD) */
case BLKSECDISCARD: {
uint64_t range[2];
/* Discarding modifies the device, so write access is required. */
if (!(mode & FMODE_WRITE))
return -EBADF;
if (copy_from_user(range, (void __user *)arg, sizeof(range)))
return -EFAULT;
return blk_ioctl_discard(bdev, range[0], range[1],
cmd == BLKSECDISCARD);
}
.....
}
/*
 * blk_ioctl_discard - validate a discard range and issue it.
 * @bdev:   target block device
 * @start:  offset of the range in BYTES; must be a multiple of 512
 * @len:    length of the range in BYTES; must be a multiple of 512
 * @secure: non-zero to request a secure discard
 *
 * The byte values are converted to 512-byte sectors (>> 9) before the
 * range is checked against the device size and handed to
 * blkdev_issue_discard().
 *
 * Returns -EINVAL if either value is not 512-byte aligned or if the range
 * runs past the end of the device; otherwise the result of
 * blkdev_issue_discard().
 */
static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
			     uint64_t len, int secure)
{
	unsigned long flags = secure ? BLKDEV_DISCARD_SECURE : 0;

	/* Any low bit set in either value means it is not sector aligned. */
	if ((start | len) & 511)
		return -EINVAL;

	/* From here on both values are in 512-byte sectors. */
	start >>= 9;
	len >>= 9;

	if (start + len > (i_size_read(bdev->bd_inode) >> 9))
		return -EINVAL;

	return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags);
}
/*
 * blkdev_issue_discard - split a sector range into discard bios, submit
 * them and wait for all of them to finish.
 * @bdev:     block device the bios are addressed to
 * @sector:   first sector of the range
 * @nr_sects: number of sectors in the range
 * @gfp_mask: allocation flags passed to bio_alloc()
 * @flags:    BLKDEV_DISCARD_SECURE requests REQ_SECURE bios
 *
 * Returns 0 on success, -ENXIO if the device has no request queue,
 * -EOPNOTSUPP if the queue does not support (secure) discard or its
 * usable maximum discard size is zero, -ENOMEM if a bio could not be
 * allocated, and -EIO if BIO_UPTODATE is no longer set in bb.flags once
 * the submitted bios have completed. Note that the final -EIO assignment
 * overrides an earlier -ENOMEM.
 */
int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
{
DECLARE_COMPLETION_ONSTACK(wait);
struct request_queue *q = bdev_get_queue(bdev);
int type = REQ_WRITE | REQ_DISCARD; /* every bio is submitted as WRITE | DISCARD */
unsigned int max_discard_sectors, granularity;
int alignment;
struct bio_batch bb;
struct bio *bio;
int ret = 0;
struct blk_plug plug;
if (!q)
return -ENXIO;
if (!blk_queue_discard(q))
return -EOPNOTSUPP;
/* Zero-sector (unknown) and one-sector granularities are the same. */
granularity = max(q->limits.discard_granularity >> 9, 1U);
alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;
/*
* Ensure that max_discard_sectors is of the proper
* granularity, so that requests stay aligned after a split.
*/
max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9);
max_discard_sectors -= max_discard_sectors % granularity;
if (unlikely(!max_discard_sectors)) {
/* Avoid infinite loop below. Being cautious never hurts. */
return -EOPNOTSUPP;
}
if (flags & BLKDEV_DISCARD_SECURE) {
if (!blk_queue_secdiscard(q))
return -EOPNOTSUPP;
type |= REQ_SECURE;
}
/*
 * bb.done starts at 1 and is incremented once per submitted bio; the
 * extra count is dropped after the loop. Presumably bio_batch_end_io
 * (not shown here) decrements it per bio and completes 'wait' when it
 * reaches zero - confirm against its definition.
 */
atomic_set(&bb.done, 1);
bb.flags = 1 << BIO_UPTODATE;
bb.wait = &wait;
blk_start_plug(&plug);
/* Keep building and submitting bios until the whole range is covered. */
while (nr_sects) {
unsigned int req_sects;
sector_t end_sect, tmp;
bio = bio_alloc(gfp_mask, 1); /* allocate one bio */
if (!bio) {
ret = -ENOMEM;
break;
}
req_sects = min_t(sector_t, nr_sects, max_discard_sectors);
/*
* If splitting a request, and the next starting sector would be
* misaligned, stop the discard at the previous aligned sector.
*/
end_sect = sector + req_sects;
tmp = end_sect;
/*
 * The code relies on sector_div() dividing its first argument in place
 * and returning the remainder: 'tmp' is a scratch copy for the remainder
 * test, and end_sect is then rounded down to granularity + alignment.
 */
if (req_sects < nr_sects &&
sector_div(tmp, granularity) != alignment) {
end_sect = end_sect - alignment;
sector_div(end_sect, granularity);
end_sect = end_sect * granularity + alignment;
req_sects = end_sect - sector;
}
/* Fill in the bio: start sector, completion hook, device, size in bytes. */
bio->bi_iter.bi_sector = sector;
bio->bi_end_io = bio_batch_end_io;
bio->bi_bdev = bdev;
bio->bi_private = &bb;
bio->bi_iter.bi_size = req_sects << 9;
nr_sects -= req_sects;
sector = end_sect;
atomic_inc(&bb.done);
submit_bio(type, bio); /* send the bio down to the block layer */
/*
* We can loop for a long time in here, if someone does
* full device discards (like mkfs). Be nice and allow
* us to schedule out to avoid softlocking if preempt
* is disabled.
*/
cond_resched();
}
blk_finish_plug(&plug);
/* Wait for bios in-flight */
if (!atomic_dec_and_test(&bb.done))
wait_for_completion_io(&wait);
if (!test_bit(BIO_UPTODATE, &bb.flags))
ret = -EIO;
return ret;
}
发送bio的调用栈:
submit_bio->generic_make_request->blk_queue_bio{这里将bio转换成request,并插入到request_queue中}->__blk_run_queue->scsi_request_fn.
在scsi_request_fn中,首先通过blk_peek_request从request_queue中取出一个req,并通过scsi_prep_fn,对req做一些准备工作。
/*
 * scsi_prep_fn - prepare a block-layer request for the SCSI layer.
 * @q:   request queue the request came from
 * @req: request to prepare
 *
 * Checks the device state, builds the scsi_cmnd for the request (this is
 * the block-to-SCSI conversion step) and runs the setup routine matching
 * the request type. Whatever BLKPREP_* code results is passed through
 * scsi_prep_return(), whose value is returned.
 */
static int scsi_prep_fn(struct request_queue *q, struct request *req)
{
	struct scsi_device *sdev = q->queuedata;
	struct scsi_cmnd *cmd;
	int status;

	status = scsi_prep_state_check(sdev, req);
	if (status != BLKPREP_OK)
		return scsi_prep_return(q, req, status);

	cmd = scsi_get_cmd_from_req(sdev, req);
	if (unlikely(!cmd))
		return scsi_prep_return(q, req, BLKPREP_DEFER);

	switch (req->cmd_type) {
	case REQ_TYPE_FS:
		/*
		 * Filesystem-type requests go to the upper-level driver's
		 * init_command hook; for the sd driver that is
		 * sd_init_command.
		 */
		status = scsi_cmd_to_driver(cmd)->init_command(cmd);
		break;
	case REQ_TYPE_BLOCK_PC:
		status = scsi_setup_blk_pc_cmnd(sdev, req);
		break;
	default:
		/* Any other request type is rejected. */
		status = BLKPREP_KILL;
		break;
	}

	return scsi_prep_return(q, req, status);
}
对于BLKDISCARD的情形,发送的为FS TYPE的cmd,因此,这里走sd_init_command;
/*
 * sd_init_command - build the SCSI command for a request on an sd disk
 * (excerpt).
 *
 * Only the special-request dispatch at the top of the function is
 * reproduced; the regular read/write setup and the 'out' label are elided
 * ("..."). Requests flagged REQ_DISCARD, REQ_WRITE_SAME or REQ_FLUSH are
 * handed to their dedicated setup helpers, checked in that order, and the
 * helper's result is stored in 'ret' before jumping to 'out'.
 */
static int sd_init_command(struct scsi_cmnd *SCpnt)
{
struct request *rq = SCpnt->request;
struct scsi_device *sdp = SCpnt->device;
struct gendisk *disk = rq->rq_disk;
struct scsi_disk *sdkp;
sector_t block = blk_rq_pos(rq);
sector_t threshold;
unsigned int this_count = blk_rq_sectors(rq);
int ret, host_dif;
unsigned char protect;
/*
* Discard request come in as REQ_TYPE_FS but we turn them into
* block PC requests to make life easier.
*/
if (rq->cmd_flags & REQ_DISCARD) { /* a discard carries REQ_DISCARD in cmd_flags, so it takes this branch */
ret = sd_setup_discard_cmnd(sdp, rq);
goto out;
} else if (rq->cmd_flags & REQ_WRITE_SAME) {
ret = sd_setup_write_same_cmnd(sdp, rq);
goto out;
} else if (rq->cmd_flags & REQ_FLUSH) {
ret = scsi_setup_flush_cmnd(sdp, rq);
goto out;
}
...
正常的IO请求初始化走下面分支,这里不再介绍。
...
}
/*
 * sd_setup_discard_cmnd - turn a discard request into a SCSI command.
 * @sdp: SCSI device the request is for
 * @rq:  discard request
 *
 * Converts the request's position and size from 512-byte units to device
 * logical blocks, allocates one zeroed page as the data payload and fills
 * rq->cmd with UNMAP, WRITE SAME(16) or WRITE SAME(10), chosen by
 * sdkp->provisioning_mode. The page is stored in rq->completion_data so
 * that sd_uninit_command() can free it later.
 *
 * Returns BLKPREP_DEFER if the page cannot be allocated, BLKPREP_KILL for
 * an unhandled provisioning mode, otherwise the result of
 * scsi_setup_blk_pc_cmnd(). The page is freed here on any non-OK result.
 */
static int sd_setup_discard_cmnd(struct scsi_device *sdp, struct request *rq)
{
struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
sector_t sector = blk_rq_pos(rq);
unsigned int nr_sectors = blk_rq_sectors(rq);
unsigned int nr_bytes = blk_rq_bytes(rq);
unsigned int len;
int ret;
char *buf;
struct page *page;
/* Rescale from 512-byte sectors to the device's logical block size. */
sector >>= ilog2(sdp->sector_size) - 9;
nr_sectors >>= ilog2(sdp->sector_size) - 9;
rq->timeout = SD_TIMEOUT;
memset(rq->cmd, 0, rq->cmd_len);
page = alloc_page(GFP_ATOMIC | __GFP_ZERO); /* allocate one zero-filled page */
if (!page)
return BLKPREP_DEFER;
/*
 * Pick the discard command according to the disk's provisioning mode.
 * In the scenario this article walks through the mode is SD_LBP_WS16.
 */
switch (sdkp->provisioning_mode) {
case SD_LBP_UNMAP:
/*
 * UNMAP: 10-byte CDB plus a 24-byte payload built in the page, holding
 * two big-endian 16-bit length fields, the 64-bit start block at offset
 * 8 and the 32-bit block count at offset 16.
 */
buf = page_address(page);
rq->cmd_len = 10;
rq->cmd[0] = UNMAP;
rq->cmd[8] = 24;
put_unaligned_be16(6 + 16, &buf[0]);
put_unaligned_be16(16, &buf[2]);
put_unaligned_be64(sector, &buf[8]);
put_unaligned_be32(nr_sectors, &buf[16]);
len = 24;
break;
case SD_LBP_WS16:
/* WRITE SAME(16) with the UNMAP bit; payload is one zeroed logical block. */
rq->cmd_len = 16;
rq->cmd[0] = WRITE_SAME_16;
rq->cmd[1] = 0x8; /* UNMAP */
put_unaligned_be64(sector, &rq->cmd[2]);
put_unaligned_be32(nr_sectors, &rq->cmd[10]);
len = sdkp->device->sector_size;
break;
case SD_LBP_WS10:
case SD_LBP_ZERO:
/*
 * WRITE SAME(10); the UNMAP bit is set only for SD_LBP_WS10, so
 * SD_LBP_ZERO sends a plain WRITE SAME of the zeroed block.
 */
rq->cmd_len = 10;
rq->cmd[0] = WRITE_SAME;
if (sdkp->provisioning_mode == SD_LBP_WS10)
rq->cmd[1] = 0x8; /* UNMAP */
put_unaligned_be32(sector, &rq->cmd[2]);
put_unaligned_be16(nr_sectors, &rq->cmd[7]);
len = sdkp->device->sector_size;
break;
default:
ret = BLKPREP_KILL;
goto out;
}
/*
 * Remember the page in rq->completion_data: sd_uninit_command() (reached
 * via scsi_unprep_fn) uses this field to free the page.
 */
rq->completion_data = page;
blk_add_request_payload(rq, page, len);
ret = scsi_setup_blk_pc_cmnd(sdp, rq);
/* Restore the original request size in bytes after the payload was attached. */
rq->__data_len = nr_bytes;
out:
/*
 * NOTE(review): on failure after the assignment above,
 * rq->completion_data still points at the page freed here - confirm
 * sd_uninit_command() cannot run for this request afterwards.
 */
if (ret != BLKPREP_OK)
__free_page(page);
return ret;
}
/*
 * sd_uninit_command - release per-command resources set up by the sd driver.
 * @SCpnt: SCSI command being torn down
 *
 * For a discard request, frees the page stored in the request's
 * completion_data. If the command's CDB buffer is not the one embedded in
 * the request, it is returned to sd_cdb_pool and the command's CDB
 * pointer and length are reset.
 */
static void sd_uninit_command(struct scsi_cmnd *SCpnt)
{
	struct request *req = SCpnt->request;

	if (req->cmd_flags & REQ_DISCARD)
		__free_page(req->completion_data);

	/* CDB lives inside the request itself: nothing more to release. */
	if (SCpnt->cmnd == req->cmd)
		return;

	mempool_free(SCpnt->cmnd, sd_cdb_pool);
	SCpnt->cmnd = NULL;
	SCpnt->cmd_len = 0;
}
接下来,将初始化的scsi_cmd dispatch出去。
scsi_request_fn->scsi_dispatch_cmd->ata_scsi_queuecmd->__ata_scsi_queuecmd.
/*
 * __ata_scsi_queuecmd - validate a SCSI command's CDB length, choose its
 * translation routine and issue it.
 * @scmd: SCSI command to issue
 * @dev:  ATA device the command is addressed to
 *
 * A command with a translation routine goes through ata_scsi_translate()
 * and that call's result is returned. A command without one is handled by
 * ata_scsi_simulate() and 0 is returned. A command with an unacceptable
 * CDB length is completed immediately with DID_ERROR and 0 is returned.
 */
static inline int __ata_scsi_queuecmd(struct scsi_cmnd *scmd,
				      struct ata_device *dev)
{
	u8 scsi_op = scmd->cmnd[0];
	ata_xlat_func_t xlat_func;

	/* An empty CDB is rejected whatever the device class. */
	if (unlikely(!scmd->cmd_len))
		goto bad_cdb_len;

	if (dev->class == ATA_DEV_ATA) {
		/* ATA devices take this branch. */
		if (unlikely(scmd->cmd_len > dev->cdb_len))
			goto bad_cdb_len;

		xlat_func = ata_get_xlat_func(dev, scsi_op);
	} else if (unlikely(scsi_op == ATA_16 && atapi_passthru16)) {
		/* ATA_16 passthru, treat as an ATA command */
		if (unlikely(scmd->cmd_len > 16))
			goto bad_cdb_len;

		xlat_func = ata_get_xlat_func(dev, scsi_op);
	} else {
		/* relay SCSI command to ATAPI device */
		int len = COMMAND_SIZE(scsi_op);

		if (unlikely(len > scmd->cmd_len || len > dev->cdb_len))
			goto bad_cdb_len;

		xlat_func = atapi_xlat;
	}

	/* No translation routine: let libata simulate the command. */
	if (!xlat_func) {
		ata_scsi_simulate(dev, scmd);
		return 0;
	}

	return ata_scsi_translate(dev, scmd, xlat_func);

bad_cdb_len:
	DPRINTK("bad CDB len=%u, scsi_op=0x%02x, max=%u\n",
		scmd->cmd_len, scsi_op, dev->cdb_len);
	scmd->result = DID_ERROR << 16;
	scmd->scsi_done(scmd);
	return 0;
}
/*
 * ata_get_xlat_func - look up the translation routine for a SCSI opcode.
 * @dev: ATA device the command is for (consulted for SYNCHRONIZE_CACHE)
 * @cmd: SCSI opcode
 *
 * Returns the routine that translates @cmd, or NULL when the opcode has
 * no translation (including SYNCHRONIZE_CACHE when
 * ata_try_flush_cache() reports false for @dev).
 */
static inline ata_xlat_func_t ata_get_xlat_func(struct ata_device *dev, u8 cmd)
{
	ata_xlat_func_t xlat = NULL;

	switch (cmd) {
	case READ_6:
	case READ_10:
	case READ_16:
	case WRITE_6:
	case WRITE_10:
	case WRITE_16:
		xlat = ata_scsi_rw_xlat;
		break;

	case WRITE_SAME_16:
		/*
		 * Discard arrives as WRITE SAME(16), so this is the routine
		 * selected on the trim path.
		 */
		xlat = ata_scsi_write_same_xlat;
		break;

	case SYNCHRONIZE_CACHE:
		if (ata_try_flush_cache(dev))
			xlat = ata_scsi_flush_xlat;
		break;

	case VERIFY:
	case VERIFY_16:
		xlat = ata_scsi_verify_xlat;
		break;

	case ATA_12:
	case ATA_16:
		xlat = ata_scsi_pass_thru;
		break;

	case MODE_SELECT:
	case MODE_SELECT_10:
		xlat = ata_scsi_mode_select_xlat;
		break;

	case START_STOP:
		xlat = ata_scsi_start_stop_xlat;
		break;

	default:
		break;
	}

	return xlat;
}
下面通过ata_scsi_translate把scsi_cmd转换成具体的ata_command发送出去。
/*
 * ata_scsi_translate - translate a SCSI command into an ATA command and
 * issue it (excerpt).
 * @dev:       ATA device the command is for
 * @cmd:       SCSI command to translate
 * @xlat_func: routine that fills in the ATA command from the SCSI one
 *
 * Allocates a queued command for @cmd, attaches the scatter/gather list
 * when the command transfers data, runs @xlat_func, honours the port's
 * optional qc_defer hook and finally issues the command. Returns 0 on the
 * path shown here; the error labels (err_mem, err_did, early_finish,
 * defer) are not part of the excerpt.
 */
static int ata_scsi_translate(struct ata_device *dev, struct scsi_cmnd *cmd,
ata_xlat_func_t xlat_func)
{
struct ata_port *ap = dev->link->ap;
struct ata_queued_cmd *qc;
int rc;
VPRINTK("ENTER\n");
qc = ata_scsi_qc_new(dev, cmd);
if (!qc)
goto err_mem;
/* data is present; dma-map it */
if (cmd->sc_data_direction == DMA_FROM_DEVICE ||
cmd->sc_data_direction == DMA_TO_DEVICE) {
/* A data-transfer command with an empty buffer is rejected. */
if (unlikely(scsi_bufflen(cmd) < 1)) {
ata_dev_warn(dev, "WARNING: zero len r/w req\n");
goto err_did;
}
ata_sg_init(qc, scsi_sglist(cmd), scsi_sg_count(cmd));
qc->dma_dir = cmd->sc_data_direction;
}
qc->complete_fn = ata_scsi_qc_complete;
if (xlat_func(qc)) /* the actual SCSI-to-ATA translation; non-zero ends the command early */
goto early_finish;
/* Let the port driver postpone the command if it provides qc_defer. */
if (ap->ops->qc_defer) {
if ((rc = ap->ops->qc_defer(qc)))
goto defer;
}
/* select device, send command to hardware */
ata_qc_issue(qc); /* calls into the SATA driver to send the command to the controller */
VPRINTK("EXIT\n");
return 0;
}
命令完成过程,跟正常command 命令完成流程一样,这里不再列出。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。