How Container rootfs Quotas Work

Jun 1, 2022 23:00 · 2450 words · 5 minute read Container Linux

When a container runtime uses overlayfs to union-mount the layered image, the container's rootfs (root filesystem) is essentially just a handful of paths on the host. A running container may write data into its root filesystem, and that data really lands on the host; if such writes are left unrestricted, they can end up affecting the whole host.

Docker

The familiar Docker container runtime provides the overlay2.size option to set a default disk size for containers:

$ sudo dockerd -s overlay2 --storage-opt overlay2.size=1G

The official documentation notes that container rootfs quotas are only supported when the backing filesystem is xfs and it is mounted with the pquota option.
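
The same storage option can also be set per container at docker run time, overriding the daemon-wide default (a quick example, assuming the overlay2 driver on an xfs backing filesystem mounted with pquota):

$ docker run -it --storage-opt size=2G fedora /bin/bash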

Let's start with the implementation in moby, the project behind the Docker container runtime: https://github.com/moby/moby/blob/v20.10.14/daemon/graphdriver/overlay2/overlay.go#L379-L391

func (d *Driver) create(id, parent string, opts *graphdriver.CreateOpts) (retErr error) {
    // a lot of code here
    if opts != nil && len(opts.StorageOpt) > 0 {
        driver := &Driver{}
        if err := d.parseStorageOpt(opts.StorageOpt, driver); err != nil {
            return err
        }

        if driver.options.quota.Size > 0 {
            // Set container disk quota limit
            if err := d.quotaCtl.SetQuota(dir, driver.options.quota); err != nil {
                return err
            }
        }
    }
}

The overlay2 graph driver calls the SetQuota method to apply the requested size limit to a path; this method is provided by the Control struct in the quota package:

func (q *Control) SetQuota(targetPath string, quota Quota) error {
    q.RLock()
    projectID, ok := q.quotas[targetPath]
    q.RUnlock()
    if !ok {
        state := getPquotaState()
        state.Lock()
        projectID = state.nextProjectID

        // assign project id to new container directory
        err := setProjectID(targetPath, projectID)
        if err != nil {
            state.Unlock()
            return err
        }

        state.nextProjectID++
        state.Unlock()

        q.Lock()
        q.quotas[targetPath] = projectID
        q.Unlock()
    }

    // set the quota limit for the container's project id
    logrus.Debugf("SetQuota(%s, %d): projectID=%d", targetPath, quota.Size, projectID)
    return setProjectQuota(q.backingFsBlockDev, projectID, quota)
}
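
Once SetQuota has tagged a container's directory, the limit is also visible from inside the container, because XFS reflects project quota limits through statfs for directories that carry a project id with the inherit flag. A quick sanity check (a sketch, assuming the 1G default configured above; exact numbers depend on your host and image):

# the container's rootfs should now report a total size of 1G
$ docker run --rm alpine df -h /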

CRI-O

The CRI-O container runtime sets a default disk size for containers through the /etc/containers/storage.conf configuration file: https://github.com/containers/storage/blob/v1.37.0/docs/containers-storage.conf.5.md#storage-options-for-overlay-table

[storage.options.overlay]
  size = "5g"

Now let's look at the corresponding implementation in the CRI-O container runtime: https://github.com/cri-o/cri-o/blob/main/vendor/github.com/containers/storage/drivers/overlay/overlay.go#L910-L929

func (d *Driver) create(id, parent string, opts *graphdriver.CreateOpts, disableQuota bool) (retErr error) {
    // a lot of code here
    if d.quotaCtl != nil && !disableQuota {
        quota := quota.Quota{}
        if opts != nil && len(opts.StorageOpt) > 0 {
            driver := &Driver{}
            if err := d.parseStorageOpt(opts.StorageOpt, driver); err != nil {
                return err
            }
            if driver.options.quota.Size > 0 {
                quota.Size = driver.options.quota.Size
            }
            if driver.options.quota.Inodes > 0 {
                quota.Inodes = driver.options.quota.Inodes
            }
        }
        // Set container disk quota limit
        // If it is set to 0, we will track the disk usage, but not enforce a limit
        if err := d.quotaCtl.SetQuota(dir, quota); err != nil {
            return err
        }
    }
}

The overlay graph driver used here is actually implemented in the containers/storage project; it likewise calls the SetQuota method to apply the size limit to a path.

func (q *Control) SetQuota(targetPath string, quota Quota) error {

    projectID, ok := q.quotas[targetPath]
    if !ok {
        projectID = q.nextProjectID

        //
        // assign project id to new container directory
        //
        err := setProjectID(targetPath, projectID)
        if err != nil {
            return err
        }

        q.quotas[targetPath] = projectID
        q.nextProjectID++
    }

    //
    // set the quota limit for the container's project id
    //
    logrus.Debugf("SetQuota path=%s, size=%d, inodes=%d, projectID=%d", targetPath, quota.Size, quota.Inodes, projectID)
    return setProjectQuota(q.backingFsBlockDev, projectID, quota)
}

The SetQuota implementations in Docker and CRI-O are almost identical, so it is enough to look at the setProjectQuota function in the containers/storage project:

// setProjectQuota - set the quota for project id on xfs block device
func setProjectQuota(backingFsBlockDev string, projectID uint32, quota Quota) error {
    var d C.fs_disk_quota_t
    d.d_version = C.FS_DQUOT_VERSION
    d.d_id = C.__u32(projectID)
    d.d_flags = C.FS_PROJ_QUOTA

    if quota.Size > 0 {
        d.d_fieldmask = d.d_fieldmask | C.FS_DQ_BHARD | C.FS_DQ_BSOFT
        d.d_blk_hardlimit = C.__u64(quota.Size / 512)
        d.d_blk_softlimit = d.d_blk_hardlimit
    }
    if quota.Inodes > 0 {
        d.d_fieldmask = d.d_fieldmask | C.FS_DQ_IHARD | C.FS_DQ_ISOFT
        d.d_ino_hardlimit = C.__u64(quota.Inodes)
        d.d_ino_softlimit = d.d_ino_hardlimit
    }

    var cs = C.CString(backingFsBlockDev)
    defer C.free(unsafe.Pointer(cs))

    _, _, errno := unix.Syscall6(unix.SYS_QUOTACTL, C.Q_XSETPQLIM,
        uintptr(unsafe.Pointer(cs)), uintptr(d.d_id),
        uintptr(unsafe.Pointer(&d)), 0, 0)
    if errno != 0 {
        return fmt.Errorf("Failed to set quota limit for projid %d on %s: %v",
            projectID, backingFsBlockDev, errno.Error())
    }

    return nil
}
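
One detail worth noting: fs_disk_quota expresses block limits in 512-byte basic blocks, which is why quota.Size is divided by 512. A 1 GiB limit, for example, becomes 1073741824 / 512 = 2097152 blocks.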

The // +build linux build constraint at the top of projectquota.go marks this code as Linux-only. In essence, it uses the Linux quotactl system call to set the disk quota for a project.

int quotactl(int cmd, const char *special, int id, caddr_t addr);
  • The cmd argument is the subcommand applied to id; the code uses Q_XSETPQLIM, the project-quota variant of Q_XSETQLIM, which sets disk quota limits:

    Set disk quota limits for user id.

  • The special pointer is the path of a special block device, which is created/opened when the Control object is initialized (see the note after this list)

  • The id argument is the project id associated with the target path; this id is maintained by the Control object

  • The addr pointer points to an fs_disk_quota structure
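
In the moby and containers/storage implementations, this special device is a block device node named backingFsBlockDev that the quota Control creates in the graph driver's home directory. On a stock Docker host you could look for it like this (a sketch; the path assumes the default /var/lib/docker data root):

# a device node mirroring the block device that backs /var/lib/docker
$ ls -l /var/lib/docker/overlay2/backingFsBlockDev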

setProjectQuota takes only a project id; the target path passed to the SetQuota() method has already been translated into a project id: https://github.com/containers/storage/blob/v1.37.0/drivers/quota/projectquota.go#L310-L333

func setProjectID(targetPath string, projectID uint32) error {
    dir, err := openDir(targetPath)
    if err != nil {
        return err
    }
    defer closeDir(dir)

    var fsx C.struct_fsxattr
    _, _, errno := unix.Syscall(unix.SYS_IOCTL, getDirFd(dir), C.FS_IOC_FSGETXATTR,
        uintptr(unsafe.Pointer(&fsx)))
    if errno != 0 {
        return fmt.Errorf("Failed to get projid for %s: %v", targetPath, errno.Error())
    }
    fsx.fsx_projid = C.__u32(projectID)
    fsx.fsx_xflags |= C.FS_XFLAG_PROJINHERIT
    _, _, errno = unix.Syscall(unix.SYS_IOCTL, getDirFd(dir), C.FS_IOC_FSSETXATTR,
        uintptr(unsafe.Pointer(&fsx)))
    if errno != 0 {
        return fmt.Errorf("Failed to set projid for %s: %v", targetPath, errno.Error())
    }

    return nil
}

Here we run into the ioctl system call:

int ioctl(int fd, unsigned long request, ...);
  • The fd argument is a file descriptor for the target path whose disk quota is being set

  • The request argument is a device-dependent request code

    Since the filesystem backing overlay here is xfs:

    • FS_IOC_FSGETXATTR reads the xfs inode flags, receiving them in an fsxattr struct
    • FS_IOC_FSSETXATTR writes the xfs inode flags; the project id is written into the fsxattr struct, which effectively binds the project id to the target path

To sum up, setting a disk quota on a target path backed by xfs boils down to two steps:

  1. Bind the target path to a project id
  2. Set a quota on that project id
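
The two steps can be reproduced by hand with the xfs_quota tool, which makes it easy to see exactly what the graph driver is doing. A sketch (project id 1000 is arbitrary, and the paths assume /var/lib/containers/storage lives on the pquota-mounted xfs filesystem):

# step 1: bind the directory tree to project id 1000
# (sets the projid and the project-inherit flag, like setProjectID does via ioctl)
$ sudo xfs_quota -x -c 'project -s -p /var/lib/containers/storage/overlay/f26c2e0c 1000' /var/lib/containers/storage

# step 2: set a 1G hard block limit on that project id (the quotactl step)
$ sudo xfs_quota -x -c 'limit -p bhard=1g 1000' /var/lib/containers/storage

# inspect per-project usage and limits
$ sudo xfs_quota -x -c 'report -p' /var/lib/containers/storage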

The CRI-O container runtime union-mounts the container's rootfs with overlayfs:

$ mount -l | tail
overlay on /var/lib/containers/storage/overlay/4a7c2c7f0a5cf3987ebc9568271b2c227d0c2242117b1dacf8ae6f8d44ab6159/merged type overlay (rw,nodev,relatime,lowerdir=/var/lib/containers/storage/overlay/l/RABJ4WRKVTUCKSM7RGS3GQPCVS,upperdir=/var/lib/containers/storage/overlay/4a7c2c7f0a5cf3987ebc9568271b2c227d0c2242117b1dacf8ae6f8d44ab6159/diff,workdir=/var/lib/containers/storage/overlay/4a7c2c7f0a5cf3987ebc9568271b2c227d0c2242117b1dacf8ae6f8d44ab6159/work,metacopy=on,volatile)
overlay on /var/lib/containers/storage/overlay/f26c2e0c97cde54376027b93f3b0f10ac10fe8366bb55d43964173b14aec6e7f/merged type overlay (rw,nodev,relatime,lowerdir=/var/lib/containers/storage/overlay/l/3PBIFLBVM72AL2FOPF4VBBYQOW:/var/lib/containers/storage/overlay/l/VQC3JSMP4JGVY46ZWFDQSUPJKT:/var/lib/containers/storage/overlay/l/FSYSFVZEYNFKPGR4FFG32PLIJ2,upperdir=/var/lib/containers/storage/overlay/f26c2e0c97cde54376027b93f3b0f10ac10fe8366bb55d43964173b14aec6e7f/diff,workdir=/var/lib/containers/storage/overlay/f26c2e0c97cde54376027b93f3b0f10ac10fe8366bb55d43964173b14aec6e7f/work,metacopy=on,volatile)

Pick any container rootfs that has been union-mounted; it is made up of the following:

  • merged
  • upperdir
  • lowerdir
  • workdir

Apart from the read-only lowerdir layers, which never change, the writable layer and the overlayfs work area both live under the /var/lib/containers/storage/overlay/f26c2e0c path:

$ ls /var/lib/containers/storage/overlay/f26c2e0c/
diff  link  lower  merged  work

Back to the implementation in the graph driver: https://github.com/containers/storage/blob/v1.37.0/drivers/overlay/overlay.go#L869-L980

func (d *Driver) create(id, parent string, opts *graphdriver.CreateOpts, disableQuota bool) (retErr error) {
    // a lot of code here
    if err := idtools.MkdirAs(dir, 0700, rootUID, rootGID); err != nil {
        return err
    }

    defer func() {
        // Clean up on failure
        if retErr != nil {
            os.RemoveAll(dir)
        }
    }()

    if d.quotaCtl != nil && !disableQuota {
        quota := quota.Quota{}
        if opts != nil && len(opts.StorageOpt) > 0 {
            driver := &Driver{}
            if err := d.parseStorageOpt(opts.StorageOpt, driver); err != nil {
                return err
            }
            if driver.options.quota.Size > 0 {
                quota.Size = driver.options.quota.Size
            }
            if driver.options.quota.Inodes > 0 {
                quota.Inodes = driver.options.quota.Inodes
            }
        }
        // Set container disk quota limit
        // If it is set to 0, we will track the disk usage, but not enforce a limit
        if err := d.quotaCtl.SetQuota(dir, quota); err != nil {
            return err
        }
    }
}

Before the disk quota is set, the path is created first; this is the /var/lib/containers/storage/overlay/f26c2e0c directory.

Note that because the container's rootfs is union-mounted by overlayfs, the paths that need to be limited are upperdir and workdir. The graph driver places these two directories under a common parent directory and passes that parent as the target path to SetQuota. Limiting only upperdir would break things: creating a file in the rootfs that does not exist in any lowerdir layer would fail with an Invalid cross-device link error. The problem lies in xfs_rename: when a file is created, overlayfs first creates it in workdir and then renames it into upperdir, and during the rename XFS compares the project ids of the source and the target; if they differ it returns EXDEV https://github.com/torvalds/linux/blob/v5.4/fs/xfs/xfs_inode.c#L3182-L3461

int
xfs_rename(
    struct xfs_inode *src_dp,
    struct xfs_name  *src_name,
    struct xfs_inode *src_ip,
    struct xfs_inode *target_dp,
    struct xfs_name  *target_name,
    struct xfs_inode *target_ip,
    unsigned int  flags)
{
    // a lot of code here
    if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
             (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
        error = -EXDEV;
        goto out_trans_cancel;
    }
    // a lot of code here
}
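
You can verify that a container's writable layer and its work directory really do share one project id, for example with xfs_io (a sketch; the project id shown is purely illustrative):

# both directories inherit the project id of their common parent directory
$ sudo xfs_io -c lsproj /var/lib/containers/storage/overlay/f26c2e0c/diff
projid = 3
$ sudo xfs_io -c lsproj /var/lib/containers/storage/overlay/f26c2e0c/work
projid = 3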

All of this presumes that prjquota has been enabled on the underlying xfs filesystem.

Edit /etc/default/grub and append rootflags=uquota,pquota:

GRUB_CMDLINE_LINUX="rd.lvm.lv=centos/swap vconsole.font=latarcyrheb-sun16 rd.lvm.lv=centos/root crashkernel=auto  vconsole.keymap=us rhgb quiet rootflags=uquota,pquota"

Run grub2-mkconfig -o /boot/grub2/grub.cfg and reboot the server.
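
After the reboot, it is worth confirming that project quota is actually active (a sketch; for a dedicated non-root xfs volume, such as one mounted at /var/lib/docker or /var/lib/containers, the same effect is achieved by adding prjquota to its options in /etc/fstab instead of using rootflags):

# the mount options of the root filesystem should now include prjquota
$ findmnt -no OPTIONS /

# xfs_quota reports whether project quota accounting and enforcement are on
$ sudo xfs_quota -x -c state /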