CephFS 与 Linux VFS

Sep 25, 2022 19:00 · 1865 words · 4 minute read FileSystem Linux

CephFS 常用于为 Kubernetes Pod 提供共享文件系统（即多个 Pod 挂载同一个 CephFS 卷）。

内核加载 CephFS

要使用文件系统，必须先挂载它；而挂载文件系统的前提是该文件系统已注册入内核。

查看已在内核注册过的文件系统（内核版本 4.18；未挂载 CephFS 卷）：

$ cat /proc/filesystems
nodev sysfs
nodev tmpfs
nodev bdev
nodev proc
nodev cgroup
#...
$ grep -i ceph /proc/filesystems

再找一个已经挂载 CephFS 卷的环境（内核版本 4.18）查看注册过的 CephFS 文件系统：

$ grep -i ceph /proc/filesystems
nodev ceph

多了一行 nodev ceph。

接下来我们就来剖析 CephFS 是如何注册入内核的。

CephFS 是在首次挂载时通过 ceph.ko 内核模块动态注册的。

查看 Linux 文件系统内核模块：

$ ls /lib/modules/$(uname -r)/kernel/fs | grep -i ceph
ceph
$ ls /lib/modules/$(uname -r)/kernel/fs/ceph
ceph.ko.xz

CephFS 内核模块 https://github.com/torvalds/linux/blob/v4.18/fs/ceph/super.c：

static struct file_system_type ceph_fs_type = {
    .owner      = THIS_MODULE,
    .name       = "ceph",
    .mount      = ceph_mount,
    .kill_sb    = ceph_kill_sb,
    .fs_flags   = FS_RENAME_DOES_D_MOVE,
};
MODULE_ALIAS_FS("ceph");

static int __init init_ceph(void)
{
    int ret = init_caches();
    if (ret)
        goto out;

    ceph_flock_init();
    ceph_xattr_init();
    ret = register_filesystem(&ceph_fs_type);
    if (ret)
        goto out_xattr;

    pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);

    return 0;

out_xattr:
    ceph_xattr_exit();
    destroy_caches();
out:
    return ret;
}

module_init(init_ceph);

register_filesystem 是专门用于注册文件系统的函数。init_ceph 函数通过 register_filesystem 函数将表示文件系统类型的结构体 ceph_fs_type 注册入内核。

挂载文件系统时会使用 mount 系统调用：

SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
    char __user *, type, unsigned long, flags, void __user *, data)
{
    return ksys_mount(dev_name, dir_name, type, flags, data);
}

追一下调用链 ksys_mount -> do_mount -> do_new_mount -> get_fs_type：

ksys_mount

int ksys_mount(char __user *dev_name, char __user *dir_name, char __user *type,
    unsigned long flags, void __user *data)
{
    // a lot of code here
    ret = do_mount(kernel_dev, dir_name, kernel_type, flags, options);
    // a lot of code here
}

do_mount

long do_mount(const char *dev_name, const char __user *dir_name,
    const char *type_page, unsigned long flags, void *data_page)
{
    // a lot of code here
    else
        retval = do_new_mount(&path, type_page, sb_flags, mnt_flags,
                    dev_name, data_page);
}

do_new_mount

static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
    int mnt_flags, const char *name, void *data)
{
    // a lot of code here
    type = get_fs_type(fstype);
    // a lot of code here
}

get_fs_type

static struct file_system_type *__get_fs_type(const char *name, int len)
{
    struct file_system_type *fs;

    read_lock(&file_systems_lock);
    fs = *(find_filesystem(name, len));
    if (fs && !try_module_get(fs->owner))
        fs = NULL;
    read_unlock(&file_systems_lock);
    return fs;
}

struct file_system_type *get_fs_type(const char *name)
{
    struct file_system_type *fs;
    const char *dot = strchr(name, '.');
    int len = dot ? dot - name : strlen(name);

    fs = __get_fs_type(name, len);
    if (!fs && (request_module("fs-%.*s", len, name) == 0)) {
        fs = __get_fs_type(name, len);
        WARN_ONCE(!fs, "request_module fs-%.*s succeeded, but still no fs?\n", len, name);
    }

    if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) {
        put_filesystem(fs);
        fs = NULL;
    }
    return fs;
}

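__get_fs_type 函数只能查找已经注册的文件系统；首次挂载时 CephFS 尚未注册，查找返回 NULL，于是 get_fs_type 调用 request_module 函数，按模块别名 fs-ceph（由 MODULE_ALIAS_FS("ceph") 声明）加载 CephFS 的内核模块 ceph.ko.xz，模块加载时执行 init_ceph 初始化钩子函数，将 CephFS 注册入内核，随后再次调用 __get_fs_type 即可找到它。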

Linux VFS（虚拟文件系统）

Linux 支持很多种不同的文件系统，每个文件系统的实现都各不相同，因此 Linux 内核向用户空间提供了统一的接口 VFS，提供常用的文件系统对象模型（inode、dentry、mount）和相应的操作方法，抽象掉每种文件系统的底层细节。

VFS 相关文档：https://www.kernel.org/doc/html/latest/filesystems/vfs.html

CephFS 当然也实现了 VFS 定义的接口，例如定义了一个 file_operations 类型的常量 ceph_file_fops：

const struct file_operations ceph_file_fops = {
    .open = ceph_open,
    .release = ceph_release,
    .llseek = ceph_llseek,
    .read_iter = ceph_read_iter,
    .write_iter = ceph_write_iter,
    .mmap = ceph_mmap,
    .fsync = ceph_fsync,
    .lock = ceph_lock,
    .flock = ceph_flock,
    .splice_read = generic_file_splice_read,
    .splice_write = iter_file_splice_write,
    .unlocked_ioctl = ceph_ioctl,
    .compat_ioctl = ceph_ioctl,
    .fallocate = ceph_fallocate,
};

每个字段都指向了 CephFS 中相应文件操作的具体实现，例如 open 成员指向 ceph_open 函数，write_iter 成员指向 ceph_write_iter 函数。

打开文件

用户态的应用程序通过 open 系统调用打开文件：

SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
    if (force_o_largefile())
        flags |= O_LARGEFILE;

    return do_sys_open(AT_FDCWD, filename, flags, mode);
}

调用链 do_sys_open -> do_filp_open：

struct file *do_filp_open(int dfd, struct filename *pathname,
        const struct open_flags *op)
{
    struct nameidata nd;
    int flags = op->lookup_flags;
    struct file *filp;

    set_nameidata(&nd, dfd, pathname);
    filp = path_openat(&nd, op, flags | LOOKUP_RCU);
    if (unlikely(filp == ERR_PTR(-ECHILD)))
        filp = path_openat(&nd, op, flags);
    if (unlikely(filp == ERR_PTR(-ESTALE)))
        filp = path_openat(&nd, op, flags | LOOKUP_REVAL);
    restore_nameidata();
    return filp;
}

do_filp_open 函数中初始化了 nameidata 这个结构：

struct nameidata {
    struct path path;
    struct qstr last;
    struct path root;
    struct inode *inode; /* path.dentry.d_inode */
    // a lot of code here
} __randomize_layout;

该结构在解析和查找路径时提供辅助，其中 path 是一个关键成员结构：

struct path {
    struct vfsmount *mnt;
    struct dentry *dentry;
} __randomize_layout;

vfsmount 结构和文件系统的挂载有关
dentry 结构表示目录项（directory entry），用于建立文件名和其 inode 之间的关联

path_openat -> do_last -> vfs_open -> do_dentry_open：

static int do_dentry_open(struct file *f,
            struct inode *inode,
            int (*open)(struct inode *, struct file *),
            const struct cred *cred)
{
    // a lot of code here
    f->f_op = fops_get(inode->i_fop);
    // a lot of code here
    if (!open)
        open = f->f_op->open;
    // a lot of code here
}

f->f_op 是一个指向 file_operations 结构体的指针，隐约能够感觉到这里和 CephFS 中的 ceph_file_fops 常量挂接起来了。

我们全局搜索 ceph_file_fops 关键字：

3 个结果 - 3 文件

fs/ceph/file.c:
  1822
  1823: const struct file_operations ceph_file_fops = {
  1824   .open = ceph_open,

fs/ceph/inode.c:
  909    inode->i_op = &ceph_file_iops;
  910:   inode->i_fop = &ceph_file_fops;
  911    break;

fs/ceph/super.h:
  1022  /* file.c */
  1023: extern const struct file_operations ceph_file_fops;
  1024

CephFS 中普通文件的 inode 的 i_fop 成员指向的正是 ceph_file_fops 常量，而 f->f_op = fops_get(inode->i_fop); 语句又将这个指针赋值给了 f->f_op。

我们再来看一眼搜索出的第二个结果，即 fs/ceph/inode.c 文件中的 fill_inode 函数：

static int fill_inode(struct inode *inode, struct page *locked_page,
              struct ceph_mds_reply_info_in *iinfo,
              struct ceph_mds_reply_dirfrag *dirinfo,
              struct ceph_mds_session *session,
              unsigned long ttl_from, int cap_fmode,
              struct ceph_cap_reservation *caps_reservation)
{
    // a lot of code here
    case S_IFREG:
        inode->i_fop = &ceph_file_fops;
        break;
    // a lot of code here
}

inode 变量是在 fill_inode 函数调用时传进来的，也就是说，struct inode（定义在 include/linux/fs.h 文件中）是 Linux 定义的通用结构体，但其成员（如 i_op、i_fop）的值却是由各家文件系统自己来填充的，指向该文件系统中的具体实现。

最终调用到 ceph_open，即 CephFS 中对文件打开操作的实现。

写文件

文件写操作最终调用到 ceph_write_iter 函数：

static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
    struct ceph_inode_info *ci = ceph_inode(inode);
    // a lot of code here
retry_snap:
    inode_lock(inode);
    // a lot of code here
    err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, pos + count,
                &got, NULL);
    if (err < 0)
        goto out;
    // a lot of code here
out:
    inode_unlock(inode);
}

写文件时先给 inode 上锁：同一台宿主机上可能有另一个进程正在并发写同一个文件，加锁可以避免出现数据竞争。

再通过 ceph_get_caps 获取写该文件所需的 Ceph caps（capabilities，这里是 CEPH_CAP_FILE_WR）：其他节点上的客户端也可能正在并发写同一个文件，只有拿到相应的 caps 才能继续写入，从而避免跨客户端的数据竞争。

这也就是 CephFS 支持多方挂载、读写同一个卷的原因。

内核加载 CephFS

Linux VFS（虚拟文件系统）

打开文件

写文件

查看更多