容器 rootfs 限额原理
Jun 1, 2022 23:00 · 2450 words · 5 minute read
在容器运行时利用 overlayfs 联合挂载分层镜像时,容器的 rootfs(根文件系统)本质上就是宿主机上的几个文件路径。容器在运行时可能会向其根文件系统中写入数据,这些数据实打实地存储在宿主机上,如果不加以限制任由其写入,可能影响到整个宿主机。
Docker
我们熟悉的 Docker 容器运行时,提供 overlay2.size 选项来为容器设置默认的磁盘容量:
$ sudo dockerd -s overlay2 --storage-opt overlay2.size=1G
官方文档提到了只有当底层文件系统为 xfs 并以 pquota
选项挂载时,才支持容器 rootfs 限额。
我们先来看一下 Docker 容器运行项目 moby 中的实现 https://github.com/moby/moby/blob/v20.10.14/daemon/graphdriver/overlay2/overlay.go#L379-L391:
// create (excerpted from moby's overlay2 graph driver, v20.10.14) —
// only the quota-related portion is shown; the rest is elided. When the
// caller supplies storage options (e.g. "size=1G"), they are parsed and,
// if a positive size results, applied to the new layer directory.
func (d *Driver) create(id, parent string, opts *graphdriver.CreateOpts) (retErr error) {
// a lot of code here
if opts != nil && len(opts.StorageOpt) > 0 {
// Parse the per-container storage options into a throwaway Driver
// so the resulting quota settings can be read back from it.
driver := &Driver{}
if err := d.parseStorageOpt(opts.StorageOpt, driver); err != nil {
return err
}
if driver.options.quota.Size > 0 {
// Set container disk quota limit
// dir is the layer directory created in the elided code above;
// SetQuota binds it to an XFS project and applies the limit.
if err := d.quotaCtl.SetQuota(dir, driver.options.quota); err != nil {
return err
}
}
}
}
overlay2 的 graph driver 调用 SetQuota
方法为某个路径设置指定的容量,而这个方法由 quota 包的 Control
结构提供:
// SetQuota (moby variant) applies an XFS project quota to targetPath.
// A path not seen before is assigned the next free project ID — the
// allocation is guarded by the package-level pquota state lock — bound
// to the directory via setProjectID, and cached in q.quotas; finally
// the limit is applied with setProjectQuota.
func (q *Control) SetQuota(targetPath string, quota Quota) error {
// Fast path: look up a previously assigned project ID under a read lock.
q.RLock()
projectID, ok := q.quotas[targetPath]
q.RUnlock()
if !ok {
state := getPquotaState()
state.Lock()
projectID = state.nextProjectID
// assign project id to new container directory
err := setProjectID(targetPath, projectID)
if err != nil {
state.Unlock()
return err
}
// Only advance the counter once the directory was tagged successfully.
state.nextProjectID++
state.Unlock()
q.Lock()
q.quotas[targetPath] = projectID
q.Unlock()
}
// set the quota limit for the container's project id
logrus.Debugf("SetQuota(%s, %d): projectID=%d", targetPath, quota.Size, projectID)
return setProjectQuota(q.backingFsBlockDev, projectID, quota)
}
CRI-O
crio 容器运行时通过配置文件 /etc/containers/storage.conf(即文档中的 containers-storage.conf)为容器设置默认磁盘容量 https://github.com/containers/storage/blob/v1.37.0/docs/containers-storage.conf.5.md#storage-options-for-overlay-table:
[storage.options.overlay]
size=5g
我们再来看一眼 CRI-O 容器运行时中的实现 https://github.com/cri-o/cri-o/blob/main/vendor/github.com/containers/storage/drivers/overlay/overlay.go#L910-L929:
// create (excerpted from containers/storage's overlay driver as vendored
// by CRI-O) — only the quota-related portion is shown. Unlike moby's
// version, SetQuota is called whenever a quota controller exists and
// quota is not disabled, even with Size/Inodes left at 0, in which case
// disk usage is tracked but no limit is enforced.
func (d *Driver) create(id, parent string, opts *graphdriver.CreateOpts, disableQuota bool) (retErr error) {
// a lot of code here
if d.quotaCtl != nil && !disableQuota {
quota := quota.Quota{}
if opts != nil && len(opts.StorageOpt) > 0 {
// Parse per-container storage options into a throwaway Driver to
// read back the requested size/inode limits.
driver := &Driver{}
if err := d.parseStorageOpt(opts.StorageOpt, driver); err != nil {
return err
}
if driver.options.quota.Size > 0 {
quota.Size = driver.options.quota.Size
}
if driver.options.quota.Inodes > 0 {
quota.Inodes = driver.options.quota.Inodes
}
}
// Set container disk quota limit
// If it is set to 0, we will track the disk usage, but not enforce a limit
if err := d.quotaCtl.SetQuota(dir, quota); err != nil {
return err
}
}
}
实际上 overlay 的 graph driver 的实现在 containers/storage 项目中,同样调用 SetQuota
方法为某个路径设置指定的容量。
// SetQuota (containers/storage variant) applies an XFS project quota to
// targetPath. Project-ID bookkeeping lives directly on the Control
// struct and — unlike the moby version shown above — no locking is done
// around the map access or the counter increment here.
func (q *Control) SetQuota(targetPath string, quota Quota) error {
projectID, ok := q.quotas[targetPath]
if !ok {
// First time this path is seen: hand out the next free project ID.
projectID = q.nextProjectID
//
// assign project id to new container directory
//
err := setProjectID(targetPath, projectID)
if err != nil {
return err
}
q.quotas[targetPath] = projectID
q.nextProjectID++
}
//
// set the quota limit for the container's project id
//
logrus.Debugf("SetQuota path=%s, size=%d, inodes=%d, projectID=%d", targetPath, quota.Size, quota.Inodes, projectID)
return setProjectQuota(q.backingFsBlockDev, projectID, quota)
}
Docker 和 CRI-O 的 SetQuota
方法实现几乎相同,我们看 containers/storage 项目的实现 setProjectQuota 函数就行:
// setProjectQuota - set the quota for project id on xfs block device.
// It fills an fs_disk_quota structure and issues the quotactl(2)
// Q_XSETPQLIM subcommand against the filesystem's backing block device.
// Size is given in bytes and converted to 512-byte basic blocks; the
// hard and soft limits are set to the same value. Fields left at 0 are
// omitted from d_fieldmask, so usage is tracked but not limited.
func setProjectQuota(backingFsBlockDev string, projectID uint32, quota Quota) error {
var d C.fs_disk_quota_t
d.d_version = C.FS_DQUOT_VERSION
d.d_id = C.__u32(projectID)
d.d_flags = C.FS_PROJ_QUOTA
if quota.Size > 0 {
d.d_fieldmask = d.d_fieldmask | C.FS_DQ_BHARD | C.FS_DQ_BSOFT
// quotactl expects block limits in 512-byte basic blocks.
d.d_blk_hardlimit = C.__u64(quota.Size / 512)
d.d_blk_softlimit = d.d_blk_hardlimit
}
if quota.Inodes > 0 {
d.d_fieldmask = d.d_fieldmask | C.FS_DQ_IHARD | C.FS_DQ_ISOFT
d.d_ino_hardlimit = C.__u64(quota.Inodes)
d.d_ino_softlimit = d.d_ino_hardlimit
}
// The device path must cross the cgo boundary as a C string; free it
// once the syscall returns.
var cs = C.CString(backingFsBlockDev)
defer C.free(unsafe.Pointer(cs))
_, _, errno := unix.Syscall6(unix.SYS_QUOTACTL, C.Q_XSETPQLIM,
uintptr(unsafe.Pointer(cs)), uintptr(d.d_id),
uintptr(unsafe.Pointer(&d)), 0, 0)
if errno != 0 {
return fmt.Errorf("Failed to set quota limit for projid %d on %s: %v",
projectID, backingFsBlockDev, errno.Error())
}
return nil
}
projectquota.go 文件顶端的 // +build linux
注解表明了这部分代码是针对 Linux 操作系统的。本质上通过 Linux quotactl 系统调用 来为 project 设置磁盘容量。
int quotactl(int cmd, const char *special, int id, caddr_t addr);
- cmd 参数是将要应用至 id 的子命令,这里使用 Q_XSETQLIM(代码中的 C.Q_XSETPQLIM 即 QCMD(Q_XSETQLIM, PRJQUOTA),表示作用于 project quota),表示设置磁盘额度限制:Set disk quota limits for user id.
- special 指针表示一个特殊块设备的路径,这个块设备会在初始化 Control 对象时被创建/打开
- id 参数是路径对应的 project id,这个 id 由 Control 对象来维护
- addr 指针指向 fs_disk_quota 数据结构
而 setProjectQuota
的参数中只有 project id,SetQuota()
方法的目标路径参数被转换成了 project id https://github.com/containers/storage/blob/v1.37.0/drivers/quota/projectquota.go#L310-L333:
// setProjectID binds targetPath to the given XFS project ID: it reads
// the directory's extended attributes via the FS_IOC_FSGETXATTR ioctl,
// sets fsx_projid plus the PROJINHERIT flag, and writes the attributes
// back with FS_IOC_FSSETXATTR.
func setProjectID(targetPath string, projectID uint32) error {
dir, err := openDir(targetPath)
if err != nil {
return err
}
defer closeDir(dir)
var fsx C.struct_fsxattr
_, _, errno := unix.Syscall(unix.SYS_IOCTL, getDirFd(dir), C.FS_IOC_FSGETXATTR,
uintptr(unsafe.Pointer(&fsx)))
if errno != 0 {
return fmt.Errorf("Failed to get projid for %s: %v", targetPath, errno.Error())
}
fsx.fsx_projid = C.__u32(projectID)
// PROJINHERIT makes files created under the directory inherit fsx_projid.
fsx.fsx_xflags |= C.FS_XFLAG_PROJINHERIT
_, _, errno = unix.Syscall(unix.SYS_IOCTL, getDirFd(dir), C.FS_IOC_FSSETXATTR,
uintptr(unsafe.Pointer(&fsx)))
if errno != 0 {
return fmt.Errorf("Failed to set projid for %s: %v", targetPath, errno.Error())
}
return nil
}
看到 ioctl 系统调用:
int ioctl(int fd, unsigned long request, ...);
- fd 参数传入了被设置磁盘容量的目标路径的文件描述符
- request 参数是一个设备相关的请求码。overlay 底层文件系统是 xfs:
  - FS_IOC_FSGETXATTR 获取 xfs 的 inode flags,使用 fsxattr 结构来接收值
  - FS_IOC_FSSETXATTR 设置 xfs 的 inode flags,project id 被赋值给 fsxattr 结构,相当于将 project id 与目标路径绑定
综上所述,对底层文件系统为 xfs 的目标路径设置磁盘容量分为两步:
- 将目标路径与一个 project id 绑定
- 对 project id 设置 quota
而 CRI-O 容器运行时利用 overlayfs 联合挂载出容器的 rootfs:
$ mount -l | tail
overlay on /var/lib/containers/storage/overlay/4a7c2c7f0a5cf3987ebc9568271b2c227d0c2242117b1dacf8ae6f8d44ab6159/merged type overlay (rw,nodev,relatime,lowerdir=/var/lib/containers/storage/overlay/l/RABJ4WRKVTUCKSM7RGS3GQPCVS,upperdir=/var/lib/containers/storage/overlay/4a7c2c7f0a5cf3987ebc9568271b2c227d0c2242117b1dacf8ae6f8d44ab6159/diff,workdir=/var/lib/containers/storage/overlay/4a7c2c7f0a5cf3987ebc9568271b2c227d0c2242117b1dacf8ae6f8d44ab6159/work,metacopy=on,volatile)
overlay on /var/lib/containers/storage/overlay/f26c2e0c97cde54376027b93f3b0f10ac10fe8366bb55d43964173b14aec6e7f/merged type overlay (rw,nodev,relatime,lowerdir=/var/lib/containers/storage/overlay/l/3PBIFLBVM72AL2FOPF4VBBYQOW:/var/lib/containers/storage/overlay/l/VQC3JSMP4JGVY46ZWFDQSUPJKT:/var/lib/containers/storage/overlay/l/FSYSFVZEYNFKPGR4FFG32PLIJ2,upperdir=/var/lib/containers/storage/overlay/f26c2e0c97cde54376027b93f3b0f10ac10fe8366bb55d43964173b14aec6e7f/diff,workdir=/var/lib/containers/storage/overlay/f26c2e0c97cde54376027b93f3b0f10ac10fe8366bb55d43964173b14aec6e7f/work,metacopy=on,volatile)
我们随便找一个联合挂载好的容器 rootfs,其 overlay 挂载参数中包含以下几类路径:
- merged
- upperdir
- lowerdir
- workdir
除了无法改变的 lowerdir 只读层,其他可以变更的读写层和 overlayfs 工作区都在 /var/lib/containers/storage/overlay/f26c2e0c 路径下:
$ ls /var/lib/containers/storage/overlay/f26c2e0c/
diff link lower merged work
再回来看 graph driver 中的实现 https://github.com/containers/storage/blob/v1.37.0/drivers/overlay/overlay.go#L869-L980:
// create (a fuller excerpt of the containers/storage overlay driver):
// the layer directory is created first and removed again if any later
// step fails; the quota is then set on that parent directory — which
// holds both the upperdir (diff) and the workdir — so both writable
// paths fall under the same XFS project.
func (d *Driver) create(id, parent string, opts *graphdriver.CreateOpts, disableQuota bool) (retErr error) {
// a lot of code here
if err := idtools.MkdirAs(dir, 0700, rootUID, rootGID); err != nil {
return err
}
defer func() {
// Clean up on failure
if retErr != nil {
os.RemoveAll(dir)
}
}()
if d.quotaCtl != nil && !disableQuota {
quota := quota.Quota{}
if opts != nil && len(opts.StorageOpt) > 0 {
// Parse per-container storage options into a throwaway Driver to
// read back the requested size/inode limits.
driver := &Driver{}
if err := d.parseStorageOpt(opts.StorageOpt, driver); err != nil {
return err
}
if driver.options.quota.Size > 0 {
quota.Size = driver.options.quota.Size
}
if driver.options.quota.Inodes > 0 {
quota.Inodes = driver.options.quota.Inodes
}
}
// Set container disk quota limit
// If it is set to 0, we will track the disk usage, but not enforce a limit
if err := d.quotaCtl.SetQuota(dir, quota); err != nil {
return err
}
}
}
在设置磁盘容量前,会先去创建该路径,就是 /var/lib/containers/storage/overlay/f26c2e0c 目录。
这里要注意,因为容器的 rootfs 是 overlayfs 联合挂载出来的,所以要限制的是 upperdir 和 workdir 这两个路径,graph driver 将这两个目录置于同父目录下,父目录作为 SetQuota
方法的目标路径参数。只限制 upperdir 会有问题:在 rootfs 中创建 lowerdir 层中不存在的文件会返回 Invalid cross-device link 错误。问题出在 xfs_rename
,创建文件时,overlayfs 会在 workdir 创建一个文件,然后 rename 到 upperdir。在 rename 时,XFS 会对比源路径与目标路径的 project id,如果不同就会返回 EXDEV 错误 https://github.com/torvalds/linux/blob/v5.4/fs/xfs/xfs_inode.c#L3182-L3461。
/*
 * xfs_rename (excerpted from Linux v5.4, fs/xfs/xfs_inode.c): only the
 * project-quota check is shown. If the target directory carries the
 * PROJINHERIT flag and its project ID differs from that of the inode
 * being renamed into it, the rename is refused with -EXDEV — which
 * userspace reports as "Invalid cross-device link".
 */
int
xfs_rename(
struct xfs_inode *src_dp,
struct xfs_name *src_name,
struct xfs_inode *src_ip,
struct xfs_inode *target_dp,
struct xfs_name *target_name,
struct xfs_inode *target_ip,
unsigned int flags)
{
// a lot of code here
if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
(xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
/* project IDs differ across a PROJINHERIT boundary -> EXDEV */
error = -EXDEV;
goto out_trans_cancel;
}
// a lot of code here
}
这一切的前提是底层的 xfs 已开启 prjquota。
编辑 /etc/default/grub,追加 rootflags=uquota,pquota
:
GRUB_CMDLINE_LINUX="rd.lvm.lv=centos/swap vconsole.font=latarcyrheb-sun16 rd.lvm.lv=centos/root crashkernel=auto vconsole.keymap=us rhgb quiet rootflags=uquota,pquota"
执行 grub2-mkconfig -o /boot/grub2/grub.cfg
并重启服务器。