赞
踩
本文代码基于Linux 5.10。
Linux mount 主要通过mount 命令或者mount api来实现, 本文主要介绍mount 调用在内核中的实现。
fs_context 是 mount 流程中的重要数据结构, 其定义如下:
- include/linux/fs_context.h
- struct fs_context {
- const struct fs_context_operations *ops;
- struct mutex uapi_mutex; /* Userspace access mutex */
- struct file_system_type *fs_type;
- void *fs_private; /* The filesystem's context */
- void *sget_key;
- struct dentry *root; /* The root and superblock */
- struct user_namespace *user_ns; /* The user namespace for this mount */
- struct net *net_ns; /* The network namespace for this mount */
- const struct cred *cred; /* The mounter's credentials */
- struct p_log log; /* Logging buffer */
- const char *source; /* The source name (eg. dev path) */
- void *security; /* Linux S&M options */
- void *s_fs_info; /* Proposed s_fs_info */
- unsigned int sb_flags; /* Proposed superblock flags (SB_*) */
- unsigned int sb_flags_mask; /* Superblock flags that were changed */
- unsigned int s_iflags; /* OR'd with sb->s_iflags */
- unsigned int lsm_flags; /* Information flags from the fs to the LSM */
- enum fs_context_purpose purpose:8;
- enum fs_context_phase phase:8; /* The phase the context is in */
- bool need_free:1; /* Need to call ops->free() */
- bool global:1; /* Goes into &init_user_ns */
- bool oldapi:1; /* Coming from mount(2) */
- };

Linux 对于这个结构体的注释是:
/*
* Filesystem context for holding the parameters used in the creation or
* reconfiguration of a superblock.
*
* Superblock creation fills in ->root whereas reconfiguration begins with this
* already set.
*
* See Documentation/filesystems/mount_api.rst
*/
我的理解是: 这个结构是 file_system_type 与 super_block 之间的桥梁, 保存了创建(或重新配置) super_block 所需的全部参数, 并贯穿整个 mount 流程。
fs_type: 指向对应文件系统的 file_system_type 结构体
ops: 这个比较重要, 指向了fs_context_operations, 一般会在文件系统的init_fs_context回调中对其进行赋值
- struct fs_context_operations {
- void (*free)(struct fs_context *fc);
- int (*dup)(struct fs_context *fc, struct fs_context *src_fc);
- int (*parse_param)(struct fs_context *fc, struct fs_parameter *param);
- int (*parse_monolithic)(struct fs_context *fc, void *data);
- int (*get_tree)(struct fs_context *fc);
- int (*reconfigure)(struct fs_context *fc);
- };
mount 的整体调用栈如下(以把 exfat 格式的 /dev/loop0 挂载到 /mnt 为例, 栈帧编号越大越靠近系统调用入口), 下面我们从系统调用入口开始逐个分析:
- #0 exfat_fill_super (sb=0xffff888004865000, fc=0xffff888003053d80) at fs/exfat/super.c:599
- #1 0xffffffff8120a2e9 in get_tree_bdev (fc=0xffff888003053d80, fill_super=0xffffffff813232e5 <exfat_fill_super>) at fs/super.c:1344
- #2 0xffffffff813236eb in exfat_get_tree (fc=0xffff888003053d80) at fs/exfat/super.c:696
- #3 0xffffffff8120915c in vfs_get_tree (fc=fc@entry=0xffff888003053d80) at fs/super.c:1549
- #4 0xffffffff8122a997 in do_new_mount (data=0x0 <fixed_percpu_data>, name=0xffff8880032794a0 "/dev/loop0", mnt_flags=32, sb_flags=<optimized out>, fstype=0x20 <fixed_percpu_data+32> <error: Cannot access memory at address 0x20>, path=0xffffc90000183ec8) at fs/namespace.c:2875
- #5 path_mount (dev_name=dev_name@entry=0xffff8880032794a0 "/dev/loop0", path=path@entry=0xffffc90000183ec8, type_page=type_page@entry=0xffff8880032d1c78 "exfat", flags=<optimized out>, flags@entry=32768, data_page=data_page@entry=0x0 <fixed_percpu_data>) at fs/namespace.c:3205
- #6 0xffffffff8122ae10 in do_mount (dev_name=dev_name@entry=0xffff8880032794a0 "/dev/loop0", dir_name=dir_name@entry=0x7ffd4ff80f31 "/mnt", type_page=type_page@entry=0xffff8880032d1c78 "exfat", flags=flags@entry=32768, data_page=data_page@entry=0x0 <fixed_percpu_data>) at fs/namespace.c:3218
- #7 0xffffffff8122b246 in __do_sys_mount (data=<optimized out>, flags=32768, type=<optimized out>, dir_name=0x7ffd4ff80f31 "/mnt", dev_name=<optimized out>) at fs/namespace.c:3426
- #8 __se_sys_mount (data=<optimized out>, flags=32768, type=<optimized out>, dir_name=140725945110321, dev_name=<optimized out>) at fs/namespace.c:3403
- #9 __x64_sys_mount (regs=<optimized out>) at fs/namespace.c:3403
- #10 0xffffffff819bf903 in do_syscall_64 (nr=<optimized out>, regs=0xffffc90000183f58) at arch/x86/entry/common.c:46
- #11 0xffffffff81a0007c in entry_SYSCALL_64 () at arch/x86/entry/entry_64.S:120
- #12 0x0000000000000000 in ?? ()
如下是 Linux mount 系统调用的定义, 用户态的 mount(2) 都会走到这里: 先把 type、dev_name、data 三个参数从用户空间拷贝到内核, 再调用 do_mount 完成后续的工作, 最后释放这些内核副本。
- fs/namespace.c
- SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
- char __user *, type, unsigned long, flags, void __user *, data)
- {
- int ret;
- char *kernel_type;
- char *kernel_dev;
- void *options;
-
- kernel_type = copy_mount_string(type);
- ret = PTR_ERR(kernel_type);
- if (IS_ERR(kernel_type))
- goto out_type;
-
- kernel_dev = copy_mount_string(dev_name);
- ret = PTR_ERR(kernel_dev);
- if (IS_ERR(kernel_dev))
- goto out_dev;
-
- options = copy_mount_options(data);
- ret = PTR_ERR(options);
- if (IS_ERR(options))
- goto out_data;
-
- ret = do_mount(kernel_dev, dir_name, kernel_type, flags, options);
-
- kfree(options);
- out_data:
- kfree(kernel_dev);
- out_dev:
- kfree(kernel_type);
- out_type:
- return ret;
- }

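do_mount 主要调用了 path_mount。path_mount 先做基本的合法性和权限检查, 再根据 flags 拆分出 mnt_flags(挂载点级别的标志)和 sb_flags(super_block 级别的标志), 最后按 flags 分发: remount、bind、修改传播类型、move 分别交给各自的处理函数, 只有全新的挂载才会走到 do_new_mount。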
- fs/namespace.c
- int path_mount(const char *dev_name, struct path *path,
- const char *type_page, unsigned long flags, void *data_page)
- {
- unsigned int mnt_flags = 0, sb_flags;
- int ret;
-
- /* Discard magic */
- if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
- flags &= ~MS_MGC_MSK;
-
- /* Basic sanity checks */
- if (data_page)
- ((char *)data_page)[PAGE_SIZE - 1] = 0;
-
- if (flags & MS_NOUSER)
- return -EINVAL;
-
- ret = security_sb_mount(dev_name, path, type_page, flags, data_page);
- if (ret)
- return ret;
- if (!may_mount())
- return -EPERM;
- if ((flags & SB_MANDLOCK) && !may_mandlock())
- return -EPERM;
-
- /* Default to relatime unless overriden */
- if (!(flags & MS_NOATIME))
- mnt_flags |= MNT_RELATIME;
-
- /* Separate the per-mountpoint flags */
- if (flags & MS_NOSUID)
- mnt_flags |= MNT_NOSUID;
- if (flags & MS_NODEV)
- mnt_flags |= MNT_NODEV;
- if (flags & MS_NOEXEC)
- mnt_flags |= MNT_NOEXEC;
- if (flags & MS_NOATIME)
- mnt_flags |= MNT_NOATIME;
- if (flags & MS_NODIRATIME)
- mnt_flags |= MNT_NODIRATIME;
- if (flags & MS_STRICTATIME)
- mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
- if (flags & MS_RDONLY)
- mnt_flags |= MNT_READONLY;
- if (flags & MS_NOSYMFOLLOW)
- mnt_flags |= MNT_NOSYMFOLLOW;
-
- /* The default atime for remount is preservation */
- if ((flags & MS_REMOUNT) &&
- ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME |
- MS_STRICTATIME)) == 0)) {
- mnt_flags &= ~MNT_ATIME_MASK;
- mnt_flags |= path->mnt->mnt_flags & MNT_ATIME_MASK;
- }
-
- sb_flags = flags & (SB_RDONLY |
- SB_SYNCHRONOUS |
- SB_MANDLOCK |
- SB_DIRSYNC |
- SB_SILENT |
- SB_POSIXACL |
- SB_LAZYTIME |
- SB_I_VERSION);
-
- if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND))
- return do_reconfigure_mnt(path, mnt_flags);
- if (flags & MS_REMOUNT)
- return do_remount(path, flags, sb_flags, mnt_flags, data_page);
- if (flags & MS_BIND)
- return do_loopback(path, dev_name, flags & MS_REC);
- if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
- return do_change_type(path, flags);
- if (flags & MS_MOVE)
- return do_move_mount_old(path, dev_name);
-
- return do_new_mount(path, type_page, sb_flags, mnt_flags, dev_name,
- data_page);
- }

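do_new_mount 是比较重要的函数, 这里面分配了 fs_context 结构体。代码中标注的三个关键步骤是:
(1) get_fs_type 根据文件系统名(例如 "exfat")找到对应的 file_system_type;
(2) fs_context_for_mount 为本次挂载分配并初始化 fs_context, 前面提到的 init_fs_context 回调就是在这一步被调用、并给 fc->ops 赋值的;
(3) vfs_get_tree 通过 fc->ops->get_tree 进入具体文件系统的实现, 创建 super_block 并填充 fc->root。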
- fs/namespace.c
- /*
- * create a new mount for userspace and request it to be added into the
- * namespace's tree
- */
- static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
- int mnt_flags, const char *name, void *data)
- {
- struct file_system_type *type;
- struct fs_context *fc;
- const char *subtype = NULL;
- int err = 0;
-
- if (!fstype)
- return -EINVAL;
-
- type = get_fs_type(fstype); /* 1 */
- if (!type)
- return -ENODEV;
-
- if (type->fs_flags & FS_HAS_SUBTYPE) {
- subtype = strchr(fstype, '.');
- if (subtype) {
- subtype++;
- if (!*subtype) {
- put_filesystem(type);
- return -EINVAL;
- }
- }
- }
-
- fc = fs_context_for_mount(type, sb_flags); /* 2 */
- put_filesystem(type);
- if (IS_ERR(fc))
- return PTR_ERR(fc);
-
- if (subtype)
- err = vfs_parse_fs_string(fc, "subtype",
- subtype, strlen(subtype));
- if (!err && name)
- err = vfs_parse_fs_string(fc, "source", name, strlen(name));
- if (!err)
- err = parse_monolithic_mount_data(fc, data);
- if (!err && !mount_capable(fc))
- err = -EPERM;
- if (!err)
- err = vfs_get_tree(fc); /* 3 */
- if (!err)
- err = do_new_mount_fc(fc, path, mnt_flags);
-
- put_fs_context(fc);
- return err;
- }

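从前面的调用栈可以看到, vfs_get_tree 会调用到 exfat_get_tree, 后者再调用 get_tree_bdev 并传入 exfat_fill_super 作为 fill_super 回调。get_tree_bdev 函数中会申请 super_block 结构体, 主要流程如下(注意: 下面的代码是节选, 省略了通过 blkdev_get_by_path 打开 fc->source 对应的块设备并赋值给 bdev、以及持有 bdev->bd_fsfreeze_mutex 检查设备是否被冻结的部分, 因此节选中 bdev 看起来未经赋值就被使用, mutex_unlock 也没有对应的 mutex_lock):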
- /**
- * get_tree_bdev - Get a superblock based on a single block device
- * @fc: The filesystem context holding the parameters
- * @fill_super: Helper to initialise a new superblock
- */
- int get_tree_bdev(struct fs_context *fc,
- int (*fill_super)(struct super_block *,
- struct fs_context *))
- {
- struct block_device *bdev;
- struct super_block *s;
- fmode_t mode = FMODE_READ | FMODE_EXCL;
- int error = 0;
-
- if (!(fc->sb_flags & SB_RDONLY))
- mode |= FMODE_WRITE;
-
- if (!fc->source)
- return invalf(fc, "No source specified");
-
- fc->sb_flags |= SB_NOSEC;
- fc->sget_key = bdev;
- s = sget_fc(fc, test_bdev_super_fc, set_bdev_super_fc); /* 1 */
- mutex_unlock(&bdev->bd_fsfreeze_mutex);
- if (IS_ERR(s)) {
- blkdev_put(bdev, mode);
- return PTR_ERR(s);
- }
-
- if (s->s_root) {
- /* Don't summarily change the RO/RW state. */
- if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) {
- warnf(fc, "%pg: Can't mount, would change RO state", bdev);
- deactivate_locked_super(s);
- blkdev_put(bdev, mode);
- return -EBUSY;
- }
- /*
- * s_umount nests inside bd_mutex during
- * __invalidate_device(). blkdev_put() acquires
- * bd_mutex and can't be called under s_umount. Drop
- * s_umount temporarily. This is safe as we're
- * holding an active reference.
- */
- up_write(&s->s_umount);
- blkdev_put(bdev, mode);
- down_write(&s->s_umount);
- } else {
- s->s_mode = mode;
- snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
- sb_set_blocksize(s, block_size(bdev));
- error = fill_super(s, fc); /* 2 */
- if (error) {
- deactivate_locked_super(s);
- return error;
- }
- s->s_flags |= SB_ACTIVE;
- bdev->bd_super = s;
- }
- BUG_ON(fc->root);
- fc->root = dget(s->s_root);
- return 0;
- }

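(1) 通过 sget_fc 查找已存在的或者分配新的 super_block 结构体。如果返回的 super_block 的 s_root 非空, 说明该块设备已经挂载过, 直接复用这个 super_block, 不会再调用 fill_super。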
(2) 调用传入的fill_super函数, 执行文件系统自定义的操作。 这里一般做的是去解析文件系统的元数据, 并填充到文件系统的私有结构体中。
exfat fill_super函数的实现如下:
- fs/exfat/super.c
- static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
- {
- struct exfat_sb_info *sbi = sb->s_fs_info;
- struct exfat_mount_options *opts = &sbi->options;
- struct inode *root_inode;
- int err;
-
- if (opts->allow_utime == (unsigned short)-1)
- opts->allow_utime = ~opts->fs_dmask & 0022;
-
- if (opts->discard) {
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
-
- if (!blk_queue_discard(q)) {
- exfat_warn(sb, "mounting with \"discard\" option, but the device does not support discard");
- opts->discard = 0;
- }
- }
-
- sb->s_flags |= SB_NODIRATIME;
- sb->s_magic = EXFAT_SUPER_MAGIC;
- sb->s_op = &exfat_sops;
-
- sb->s_time_gran = 10 * NSEC_PER_MSEC;
- sb->s_time_min = EXFAT_MIN_TIMESTAMP_SECS;
- sb->s_time_max = EXFAT_MAX_TIMESTAMP_SECS;
-
- err = __exfat_fill_super(sb); /* 1 */
- if (err) {
- exfat_err(sb, "failed to recognize exfat type");
- goto check_nls_io;
- }
-
- /* set up enough so that it can read an inode */
- exfat_hash_init(sb);
-
- if (!strcmp(sbi->options.iocharset, "utf8"))
- opts->utf8 = 1;
- else {
- sbi->nls_io = load_nls(sbi->options.iocharset);
- if (!sbi->nls_io) {
- exfat_err(sb, "IO charset %s not found",
- sbi->options.iocharset);
- err = -EINVAL;
- goto free_table;
- }
- }
-
- if (sbi->options.utf8)
- sb->s_d_op = &exfat_utf8_dentry_ops;
- else
- sb->s_d_op = &exfat_dentry_ops;
-
- root_inode = new_inode(sb);
- if (!root_inode) {
- exfat_err(sb, "failed to allocate root inode");
- err = -ENOMEM;
- goto free_table;
- }
-
- root_inode->i_ino = EXFAT_ROOT_INO;
- inode_set_iversion(root_inode, 1);
- err = exfat_read_root(root_inode);
- if (err) {
- exfat_err(sb, "failed to initialize root inode");
- goto put_inode;
- }
-
- exfat_hash_inode(root_inode, EXFAT_I(root_inode)->i_pos);
- insert_inode_hash(root_inode);
-
- sb->s_root = d_make_root(root_inode);
- if (!sb->s_root) {
- exfat_err(sb, "failed to get the root dentry");
- err = -ENOMEM;
- goto put_inode;
- }
-
- return 0;
-
- put_inode:
- iput(root_inode);
- sb->s_root = NULL;
-
- free_table:
- exfat_free_upcase_table(sbi);
- exfat_free_bitmap(sbi);
- brelse(sbi->boot_bh);
-
- check_nls_io:
- unload_nls(sbi->nls_io);
- exfat_free_iocharset(sbi);
- sb->s_fs_info = NULL;
- kfree(sbi);
- return err;
- }

主要分为两部分:
(1) 调用 __exfat_fill_super (代码中标注 /* 1 */ 处) 读取 exfat 的文件系统信息, 解析后保存在 exfat_sb_info 这个结构体中
(2) 设置super_block 的一些重要field, 例如s_op, s_root, s_d_op
vfs_get_tree 成功返回、super_block 准备就绪之后, do_new_mount 会调用 do_new_mount_fc 将新的挂载实例添加到系统中。下面再次列出 do_new_mount 的代码, 对应其中标注 /* 4 */ 的位置。
- fs/namespace.c
- /*
- * create a new mount for userspace and request it to be added into the
- * namespace's tree
- */
- static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
- int mnt_flags, const char *name, void *data)
- {
- struct file_system_type *type;
- struct fs_context *fc;
- const char *subtype = NULL;
- int err = 0;
-
- if (!fstype)
- return -EINVAL;
-
- type = get_fs_type(fstype); /* 1 */
- if (!type)
- return -ENODEV;
-
- if (type->fs_flags & FS_HAS_SUBTYPE) {
- subtype = strchr(fstype, '.');
- if (subtype) {
- subtype++;
- if (!*subtype) {
- put_filesystem(type);
- return -EINVAL;
- }
- }
- }
-
- fc = fs_context_for_mount(type, sb_flags); /* 2 */
- put_filesystem(type);
- if (IS_ERR(fc))
- return PTR_ERR(fc);
-
- if (subtype)
- err = vfs_parse_fs_string(fc, "subtype",
- subtype, strlen(subtype));
- if (!err && name)
- err = vfs_parse_fs_string(fc, "source", name, strlen(name));
- if (!err)
- err = parse_monolithic_mount_data(fc, data);
- if (!err && !mount_capable(fc))
- err = -EPERM;
- if (!err)
- err = vfs_get_tree(fc); /* 3 */
- if (!err)
- err = do_new_mount_fc(fc, path, mnt_flags); /* 4 */
-
- put_fs_context(fc);
- return err;
- }

这里主要调用了 do_new_mount_fc, 它根据 fc 创建新的挂载实例, 并将其关联到挂载点 path 上。这部分涉及的数据结构很多, 相互关系也比较复杂, 本文暂时没有梳理清楚, 留待后续分析。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。