Kubernetes Pod 网络初始化原理

Jul 11, 2024 00:00 · 3965 words · 8 minute read Kubernetes Golang Linux Network

本文探索如何初始化 Pod 网络栈（Kubernetes 环境的容器运行时为 containerd，无 docker）。虽然篇幅较长，但请耐心看完。

大家必须要能够不假思索地说出 Kubernetes 在创建 Pod 时的调用链为 kubelet -> containerd -> runc

Sandbox（pause）

每个 Pod 都有一个 pause 容器来初始化整个 Pod 网络栈，pause 容器和业务容器在同一网络命名空间中。

kubelet 通过 CRI RunPodSandbox API 调用 containerd 拉起 pause 容器：

https://github.com/containerd/containerd/blob/0166783c79caf155ebfbcae3962441c3029ab9f2/pkg/cri/server/sandbox_run.go#L56-L513

func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandboxRequest) (_ *runtime.RunPodSandboxResponse, retErr error) {
    // a lot of code here
    if !hostNetwork(config) && !userNsEnabled {
        var netnsMountDir = "/var/run/netns"

        sandbox.NetNS, err = netns.NewNetNS(netnsMountDir)
        if err != nil {
            return nil, fmt.Errorf("failed to create network namespace for sandbox %q: %w", id, err)
        }

        // Update network namespace in the container's spec
        c.updateNetNamespacePath(spec, sandbox.NetNSPath)
    }
}

在 /var/run/netns 路径下能够看到很多 cni- 前缀的文件：

 $ ll /var/run/netns
total 0
-r--r--r-- 1 root root 0 Jan 21 17:09 cni-0e658861-8c2e-aad0-9ed3-7d6603530739
-r--r--r-- 1 root root 0 Jan 21 17:10 cni-1e58c52f-3d19-161d-2a67-74bf4ec3d25d
-r--r--r-- 1 root root 0 Jan 21 17:09 cni-1f172a43-4dfa-3a55-5117-42219807b2f7
-r--r--r-- 1 root root 0 Jan 21 19:23 cni-232a665d-25eb-5541-86c6-687d9be31ffc
-r--r--r-- 1 root root 0 Mar 21 17:39 cni-23cf1be4-65af-1e5f-c1a6-beb1bca2e541
-r--r--r-- 1 root root 0 Jan 21 19:23 cni-24df7e4d-1b92-84a1-f20b-4d3e5f2aed37
-r--r--r-- 1 root root 0 Apr 29 21:02 cni-256356df-0a90-6dbf-bd49-c3fb7084ef33
-r--r--r-- 1 root root 0 Mar  8 10:28 cni-2b72ac82-443b-cff6-04d0-3f55f97964d4
-r--r--r-- 1 root root 0 Jan 21 17:09 cni-2b79ba54-29f8-d0d0-3a58-3cfb779c16e9
-r--r--r-- 1 root root 0 May  1 18:06 cni-2e61699f-834d-4bb3-07fc-284d758235ea
-r--r--r-- 1 root root 0 May  9 11:22 cni-2e874a72-cd36-8bfc-e326-3cee15b23c0c
-r--r--r-- 1 root root 0 Jan 21 17:09 cni-32b2060d-2df9-b643-93b8-652518af4f6b

接下来看 NewNetNS 函数：

https://github.com/containerd/containerd/blob/36f520dc04259debc7b8f19f5574db2a6054abf6/pkg/netns/netns_linux.go#L179-L191

// NewNetNS creates a network namespace.
// It delegates to NewNetNSFromPID, passing baseDir through unchanged and a
// pid of 0.
func NewNetNS(baseDir string) (*NetNS, error) {
    return NewNetNSFromPID(baseDir, 0) // the PID is fixed to 0
}

// NewNetNSFromPID returns the netns from pid or a new netns if pid is 0.
// It calls newNS(baseDir, pid) and wraps the returned path in a NetNS. If
// newNS fails, the error is returned wrapped with "failed to setup netns".
func NewNetNSFromPID(baseDir string, pid uint32) (*NetNS, error) {
    path, err := newNS(baseDir, pid)
    if err != nil {
        return nil, fmt.Errorf("failed to setup netns: %w", err)
    }
    return &NetNS{path: path}, nil
}

然后来看 newNS 函数：

https://github.com/containerd/containerd/blob/36f520dc04259debc7b8f19f5574db2a6054abf6/pkg/netns/netns_linux.go#L51-L139

func newNS(baseDir string, pid uint32) (nsPath string, err error) {
    b := make([]byte, 16)

    _, err = rand.Read(b)
    if err != nil {
        return "", fmt.Errorf("failed to generate random netns name: %w", err)
    }

    // Create the directory for mounting network namespaces
    // This needs to be a shared mountpoint in case it is mounted in to
    // other namespaces (containers)
    if err := os.MkdirAll(baseDir, 0755); err != nil {
        return "", err
    }

    // create an empty file at the mount point and fail if it already exists
    nsName := fmt.Sprintf("cni-%x-%x-%x-%x-%x", b[0:4], b[4:6], b[6:8], b[8:10], b[10:])
    nsPath = path.Join(baseDir, nsName)
    mountPointFd, err := os.OpenFile(nsPath, os.O_RDWR|os.O_CREATE|os.O_EXCL, 0666)
    if err != nil {
        return "", err
    }
    mountPointFd.Close()

    // do namespace work in a dedicated goroutine, so that we can safely
    // Lock/Unlock OSThread without upsetting the lock/unlock state of
    // the caller of this function
    go (func() {
        defer wg.Done()
        runtime.LockOSThread()
        // Don't unlock. By not unlocking, golang will kill the OS thread when the
        // goroutine is done (for go1.10+)

        var origNS cnins.NetNS
        origNS, err = cnins.GetNS(getCurrentThreadNetNSPath())
        if err != nil {
            return
        }
        defer origNS.Close()

        // create a new netns on the current thread
        err = unix.Unshare(unix.CLONE_NEWNET)
        if err != nil {
            return
        }

        // Put this thread back to the orig ns, since it might get reused (pre go1.10)
        defer origNS.Set()

        // bind mount the netns from the current thread (from /proc) onto the
        // mount point. This causes the namespace to persist, even when there
        // are no threads in the ns.
        err = unix.Mount(getCurrentThreadNetNSPath(), nsPath, "none", unix.MS_BIND, "")
        if err != nil {
            err = fmt.Errorf("failed to bind mount ns at %s: %w", nsPath, err)
        }
    })()
    wg.Wait()
}

// getCurrentThreadNetNSPath copied from pkg/ns
// It returns the path "/proc/<pid>/task/<tid>/ns/net", built from os.Getpid()
// and unix.Gettid().
func getCurrentThreadNetNSPath() string {
    // /proc/self/ns/net returns the namespace of the main thread, not
    // of whatever thread this goroutine is running on.  Make sure we
    // use the thread's net namespace since the thread is switching around
    return fmt.Sprintf("/proc/%d/task/%d/ns/net", os.Getpid(), unix.Gettid())
}

通过 Unshare(unix.CLONE_NEWNET) 系统调用为 containerd 当前线程创建一个网络命名空间

通过 Mount 系统调用将当前线程的网络命名空间 bind mount 至 /var/run/netns 路径下新建的挂载点

所以 /var/run/netns 路径下都是共享的网络命名空间挂载点：

$ mount | grep cni-9ca95d5b-837c-84c5-f2ee-b60a96ef36c9
nsfs on /run/netns/cni-9ca95d5b-837c-84c5-f2ee-b60a96ef36c9 type nsfs (rw)
nsfs on /run/netns/cni-9ca95d5b-837c-84c5-f2ee-b60a96ef36c9 type nsfs (rw)

然后 containerd 将已经创建好的网络命名空间挂载点作为参数（在 OCI 容器配置文件中）传给 runc 来拉起 pause 进程，随便找一个 Pod 的 pause 容器验证一下：

$ kubectl get po pod-nginx -o yaml | grep sandbox
    io.kubernetes.pod.sandbox.uid: f760e07f0771b5b1b0404f3758dec1a31b534494ddd7bc9287d084e03240b20e

$ ctr -n k8s.io container ls | grep f760e07f0771b5b1b0404f3758dec1a31b534494ddd7bc9287d084e03240b20e
f760e07f0771b5b1b0404f3758dec1a31b534494ddd7bc9287d084e03240b20e    registry-1.ict-mec.net:18443/kubesphere/pause:3.8                                              io.containerd.runc.v2

$ ctr -n k8s.io container info f760e07f0771b5b1b0404f3758dec1a31b534494ddd7bc9287d084e03240b20e | jq ".Spec.linux.namespaces"
[
  {
    "type": "pid"
  },
  {
    "type": "ipc"
  },
  {
    "type": "uts"
  },
  {
    "type": "mount"
  },
  {
    "type": "network",
    "path": "/var/run/netns/cni-9ca95d5b-837c-84c5-f2ee-b60a96ef36c9"
  }
]

OCI 容器配置文件中网络命名空间路径指定为 /var/run/netns/cni-9ca95d5b-837c-84c5-f2ee-b60a96ef36c9，即 containerd 刚才新建的 netns。

runc

就从 OCI 容器配置文件的定义入手，搜索 json:"namespaces 关键字：

https://github.com/opencontainers/runc/blob/e49d5da2197dbe06abafdd2aecfa5be5098dbb30/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go#L166-L167

type Linux struct {
    // Namespaces contains the namespaces that are created and/or joined by the container
    Namespaces []LinuxNamespace `json:"namespaces,omitempty"`
}

再搜索 "network" 关键字：

https://github.com/opencontainers/runc/blob/e49d5da2197dbe06abafdd2aecfa5be5098dbb30/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go#L202-L203

const (
    // PIDNamespace for isolating process IDs
    PIDNamespace LinuxNamespaceType = "pid"
    // NetworkNamespace for isolating network devices, stacks, ports, etc
    NetworkNamespace LinuxNamespaceType = "network"
)

注意文件路径 vendor/github.com/opencontainers/runtime-spec/specs-go/config.go 表示这是 OCI 规范，containerd 以遵循 OCI 规范的配置文件（config.json）将容器的描述传递给 runc。

而 runc 依靠 libcontainer 来创建容器，所以在内部需要将 OCI 容器配置转换成 libcontainer 容器配置。

从这层转换中也能看出来容器圈有很多故事。

https://github.com/opencontainers/runc/blob/382eba4354d764aaffec82720c23ef429496acf9/libcontainer/specconv/spec_linux.go#L330-L498

// CreateLibcontainerConfig creates a new libcontainer configuration from a
// given specification and a cgroup name
func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
    // a lot of code here
    if spec.Linux != nil {
        for _, ns := range spec.Linux.Namespaces {
            t, exists := namespaceMapping[ns.Type]
            if !exists {
                return nil, fmt.Errorf("namespace %q does not exist", ns)
            }
            if config.Namespaces.Contains(t) {
                return nil, fmt.Errorf("malformed spec file: duplicated ns %q", ns)
            }
            config.Namespaces.Add(t, ns.Path)
        }
        // a lot of code here
    }
}

这里 t 的值为 configs.NEWNET 即字符串 "NEWNET"；ns.Path 的值为 "/var/run/netns/cni-9ca95d5b-837c-84c5-f2ee-b60a96ef36c9"（举个例子）。

runc 通过两步来拉起容器进程：

runc run
runc init

父进程 `runc run`

来到 libcontainer 启动容器的 Start 方法顺藤摸瓜：

Start -> start -> newParentProcess -> newInitProcess -> bootstrapData

func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*initProcess, error) {
    cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
    nsMaps := make(map[configs.NamespaceType]string)
    for _, ns := range c.config.Namespaces {
        if ns.Path != "" {
            nsMaps[ns.Type] = ns.Path
        }
    }
    _, sharePidns := nsMaps[configs.NEWPID]
    data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, initStandard)
    if err != nil {
        return nil, err
    }
    // a lot of code here
}

从 libcontainer Config 中取出各种命名空间（其中包括了 NETNS）。

https://github.com/opencontainers/runc/blob/35784a3e6af802577a1145ea1302fb0476ae71e4/libcontainer/container_linux.go#L2118-L2239

// bootstrapData encodes the necessary data in netlink binary format
// as a io.Reader.
// Consumer can write the data to a bootstrap program
// such as one that uses nsenter package to bootstrap the container's
// init process correctly, i.e. with correct namespaces, uid/gid
// mapping etc.
func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, it initType) (_ io.Reader, Err error) {
    // create the netlink message
    r := nl.NewNetlinkRequest(int(InitMsg), 0)

    // a lot of code here
    // write cloneFlags
    r.AddData(&Int32msg{
        Type:  CloneFlagsAttr,
        Value: uint32(cloneFlags),
    })

    // write custom namespace paths
    if len(nsMaps) > 0 {
        nsPaths, err := c.orderNamespacePaths(nsMaps)
        if err != nil {
            return nil, err
        }
        r.AddData(&Bytemsg{
            Type:  NsPathsAttr,
            Value: []byte(strings.Join(nsPaths, ",")),
        })
    }
    // a lot of code here
    return bytes.NewReader(r.Serialize()), nil
}

从 nsMaps 中取出（包括 "/var/run/netns/cni-9ca95d5b-837c-84c5-f2ee-b60a96ef36c9"）所有 namespace path 并通过 , 拼接。最后“序列化”至 bootstrap 数据。命名空间相关的，由 Bytemsg 类型组装。

https://github.com/opencontainers/runc/blob/d72d057ba794164c3cce9451a00b72a78b25e1ae/libcontainer/message_linux.go#L49-L71

// Bytemsg is a netlink attribute (nlattr) carrying a raw byte payload.
// Bytemsg has the following representation
// | nlattr len | nlattr type |
// | value              | pad |
type Bytemsg struct {
    // Type is the nlattr type field of the header.
    Type  uint16
    // Value is the payload that follows the header.
    Value []byte
}

// Serialize encodes msg into a newly allocated byte slice: a 4-byte header
// holding the length and then the type (each 16 bits, in the byte order
// returned by nl.NativeEndian()), followed by Value. The slice size is l
// rounded up with unix.NLA_ALIGNTO.
//
// It panics with a netlinkError when msg.Len() is larger than math.MaxUint16.
func (msg *Bytemsg) Serialize() []byte {
    l := msg.Len()
    if l > math.MaxUint16 {
        // We cannot return nil nor an error here, so we panic with
        // a specific type instead, which is handled via recover in
        // bootstrapData.
        panic(netlinkError{fmt.Errorf("netlink: cannot serialize bytemsg of length %d (larger than UINT16_MAX)", l)})
    }
    // Round l up to a multiple of NLA_ALIGNTO (presumably a power of two, as
    // the mask arithmetic requires); bytes past the value are left as the
    // zeroes make() produced.
    buf := make([]byte, (l+unix.NLA_ALIGNTO-1) & ^(unix.NLA_ALIGNTO-1))
    native := nl.NativeEndian()
    // Header: the unpadded length l, then the attribute type.
    native.PutUint16(buf[0:2], uint16(l))
    native.PutUint16(buf[2:4], msg.Type)
    // Payload starts right after the 4-byte header.
    copy(buf[4:], msg.Value)
    return buf
}

在它的序列化方法 Serialize 中，Bytemsg 类型按 netlink 协议格式（见注释）拼接为字节数据。

再回到 libcontainer Start -> start -> parent.start() -> start

func (p *initProcess) start() (retErr error) {
    // a lot of code here
    if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil {
        return fmt.Errorf("can't copy bootstrap data to pipe: %w", err)
    }
    // a lot of code here
}

将 bootstrap 数据发往一个 Unix SocketPair p.messageSockPair 的一端，它在 newParentProcess 方法中创建：

https://github.com/opencontainers/runc/blob/35784a3e6af802577a1145ea1302fb0476ae71e4/libcontainer/container_linux.go#L456-L482

func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
    parentInitPipe, childInitPipe, err := utils.NewSockPair("init")
    if err != nil {
        return nil, fmt.Errorf("unable to create init pipe: %w", err)
    }
    messageSockPair := filePair{parentInitPipe, childInitPipe}

    parentLogPipe, childLogPipe, err := os.Pipe()
    if err != nil {
        return nil, fmt.Errorf("unable to create log pipe: %w", err)
    }
    logFilePair := filePair{parentLogPipe, childLogPipe}

    cmd := c.commandTemplate(p, childInitPipe, childLogPipe)
    if !p.Init {
        return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair)
    }
    // a lot of code here
}

socketpair 用于进程间（父子进程）双向通信，在 runc 中叫做 init pipe。这里所谓的父进程（Parent Process），就是 runc run 自身，它稍后会拉起第二个 runc 进程，第一个参数为 init：

// New returns a linux based container factory based in the root directory and
// configures the factory with the provided option funcs.
func New(root string, options ...func(*LinuxFactory) error) (Factory, error) {
    l := &LinuxFactory{
        Root:      root,
        InitPath:  "/proc/self/exe",
        InitArgs:  []string{os.Args[0], "init"},
        Validator: validate.New(),
        CriuPath:  "criu",
    }
}

p.messageSockPair 的另一端则被追加至容器进程 cmd 的 ExtraFiles 列表：

https://github.com/opencontainers/runc/blob/35784a3e6af802577a1145ea1302fb0476ae71e4/libcontainer/container_linux.go#L579

func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*initProcess, error) {
    // a lot of code here
    data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, initStandard)
    if err != nil {
        return nil, err
    }

    if c.shouldSendMountSources() {
        for i, m := range c.config.Mounts {
            if !m.IsBind() {
                // Non bind-mounts do not use an fd.
                mountFds[i] = -1
                continue
            }

            // The fd passed here will not be used: nsexec.c will overwrite it with dup3(). We just need
            // to allocate a fd so that we know the number to pass in the environment variable. The fd
            // must not be closed before cmd.Start(), so we reuse messageSockPair.child because the
            // lifecycle of that fd is already taken care of.
            cmd.ExtraFiles = append(cmd.ExtraFiles, messageSockPair.child)
            mountFds[i] = stdioFdCount + len(cmd.ExtraFiles) - 1
        }
    }
}

然后为 runc 子进程设置 _LIBCONTAINER_INITPIPE 环境变量传递 init pipe 的文件描述符（fd）：

newParentProcess -> commandTemplate -> newInitProcess

https://github.com/opencontainers/runc/blob/35784a3e6af802577a1145ea1302fb0476ae71e4/libcontainer/container_linux.go#L484-L521

func (c *linuxContainer) commandTemplate(p *Process, childInitPipe *os.File, childLogPipe *os.File) *exec.Cmd {
    // a lot of code here
    cmd.Env = append(cmd.Env,
        "_LIBCONTAINER_INITPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
        "_LIBCONTAINER_STATEDIR="+c.root,
    )
    // a lot of code here
}

子进程 `runc init`

我们再来看 Child Process，即由 runc run 递归执行的 runc init：

init -> StartInitialization -> newContainerInit -> Init

import (
    _ "github.com/opencontainers/runc/libcontainer/nsenter"
)

// init handles the `runc init` invocation. When the first command-line
// argument is "init", it configures logrus from the _LIBCONTAINER_LOGLEVEL
// and _LIBCONTAINER_LOGPIPE environment variables and then calls
// factory.StartInitialization. For any other invocation it does nothing.
//
// It panics if either environment variable is not a valid integer, and
// exits with status 1 if StartInitialization returns an error.
func init() {
    if len(os.Args) > 1 && os.Args[1] == "init" {
        // This is the golang entry point for runc init, executed
        // before main() but after libcontainer/nsenter's nsexec().
        runtime.GOMAXPROCS(1)
        runtime.LockOSThread()

        level, err := strconv.Atoi(os.Getenv("_LIBCONTAINER_LOGLEVEL"))
        if err != nil {
            panic(err)
        }

        // The log pipe is passed as a file descriptor number.
        logPipeFd, err := strconv.Atoi(os.Getenv("_LIBCONTAINER_LOGPIPE"))
        if err != nil {
            panic(err)
        }

        // Logs are written as JSON to the log pipe.
        logrus.SetLevel(logrus.Level(level))
        logrus.SetOutput(os.NewFile(uintptr(logPipeFd), "logpipe"))
        logrus.SetFormatter(new(logrus.JSONFormatter))
        logrus.Debug("child process in init()")

        // NOTE(review): the error returned by libcontainer.New is discarded here.
        factory, _ := libcontainer.New("")
        if err := factory.StartInitialization(); err != nil {
            // as the error is sent back to the parent there is no need to log
            // or write it to stderr because the parent process will handle this
            os.Exit(1)
        }
        // Reached only when StartInitialization returned without an error.
        panic("libcontainer: container init failed to exec")
    }
}

注意以上注释，在 init 执行前，会先执行 libcontainer/nsenter 的 nsexec 函数。

libcontainer/nsenter 包注册了一个特殊的初始化构造器，在 Go 运行时启动之前就被调用。这使得 runc 能够在现有的命名空间上使用 setns 系统调用，并且避免 Go 运行时在处理多线程时碰到的问题。

libcontainer/nsenter 包使用 cgo。只要二进制中导入了该包，C 函数 nsexec 就会在进程启动时（Go 运行时启动之前）被调用。而 libcontainer/nsenter 只在 init.go 中被导入，所以每次执行 runc 的 init 命令时，这段 C 代码都会运行；如果没有设置 _LIBCONTAINER_INITPIPE 环境变量（即当前进程并非 runc init），nsexec 会直接返回（见下面的代码）。

void nsexec(void)
{
    // a lot of code here
    /*
     * Get the init pipe fd from the environment. The init pipe is used to
     * read the bootstrap data and tell the parent what the new pids are
     * after the setup is done.
     */
    pipenum = getenv_int("_LIBCONTAINER_INITPIPE");
    if (pipenum < 0) {
        /* We are not a runc init. Just return to go runtime. */
        return;
    }

    /* Parse all of the netlink configuration. */
    nl_parse(pipenum, &config);

            /*
             * We need to setns first. We cannot do this earlier (in stage 0)
             * because of the fact that we forked to get here (the PID of
             * [stage 2: STAGE_INIT]) would be meaningless). We could send it
             * using cmsg(3) but that's just annoying.
             */
            if (config.namespaces)
                join_namespaces(config.namespaces);
}

从环境变量 _LIBCONTAINER_INITPIPE 获取 init 管道编号
从管道中读取 netlink 数据至 config 结构，即序列化后的字节串
如果存在已有的 namespace，就调用 join_namespaces 函数加入其中

https://github.com/opencontainers/runc/blob/9c444070ec7bb83995dbc0185da68284da71c554/libcontainer/nsenter/nsexec.c#L570-L635

void join_namespaces(char *nslist)
{

    // a lot of code here

    /*
     * We have to open the file descriptors first, since after
     * we join the mnt namespace we might no longer be able to
     * access the paths.
     */
    do {
        int fd;
        char *path;
        struct namespace_t *ns;

        /* Resize the namespace array. */
        namespaces = realloc(namespaces, ++num * sizeof(struct namespace_t));
        if (!namespaces)
            bail("failed to reallocate namespace array");
        ns = &namespaces[num - 1];

        /* Split 'ns:path'. */
        path = strstr(namespace, ":");
        if (!path)
            bail("failed to parse %s", namespace);
        *path++ = '\0';

        fd = open(path, O_RDONLY);
        if (fd < 0)
            bail("failed to open %s", path);

        ns->fd = fd;
        strncpy(ns->type, namespace, PATH_MAX - 1);
        strncpy(ns->path, path, PATH_MAX - 1);
        ns->path[PATH_MAX - 1] = '\0';
    } while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);

    /*
     * The ordering in which we join namespaces is important. We should
     * always join the user namespace *first*. This is all guaranteed
     * from the container_linux.go side of this, so we're just going to
     * follow the order given to us.
     */

    for (i = 0; i < num; i++) {
        struct namespace_t *ns = &namespaces[i];
        int flag = nsflag(ns->type);

        write_log(DEBUG, "setns(%#x) into %s namespace (with path %s)", flag, ns->type, ns->path);
        if (setns(ns->fd, flag) < 0)
            bail("failed to setns into %s namespace", ns->type);

        close(ns->fd);
    }

    free(namespaces);
}

首先打开命名空间文件，例如 /var/run/netns/cni-9ca95d5b-837c-84c5-f2ee-b60a96ef36c9 代表网络命名空间
setns 系统调用将当前进程加入到网络命名空间中

至此一切明了，containerd 首先创建好网络命名空间，通过 OCI 规范作为参数传递给 runc run 进程，runc 内部转换后创建 init pipe 将命名空间相关的数据发送给即将递归拉起的 runc init，在 Go 运行时启动前就通过 setns 将进程加入到该网络命名空间。

CNI 插件

在 containerd 创建好网络命名空间后，就会调用 CNI 插件来“配置网络栈”：

https://github.com/containerd/containerd/blob/36f520dc04259debc7b8f19f5574db2a6054abf6/pkg/cri/server/sandbox_run.go#L322-L332

func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandboxRequest) (_ *runtime.RunPodSandboxResponse, retErr error) {
    // a lot of code here
    if !hostNetwork(config) && !userNsEnabled {
        var netnsMountDir = "/var/run/netns"

        sandbox.NetNS, err = netns.NewNetNS(netnsMountDir)
        if err != nil {
            return nil, fmt.Errorf("failed to create network namespace for sandbox %q: %w", id, err)
        }

        // Update network namespace in the container's spec
        c.updateNetNamespacePath(spec, sandbox.NetNSPath)

        // Setup network for sandbox.
        // Certain VM based solutions like clear containers (Issue containerd/cri-containerd#524)
        // rely on the assumption that CRI shim will not be querying the network namespace to check the
        // network states such as IP.
        // In future runtime implementation should avoid relying on CRI shim implementation details.
        // In this case however caching the IP will add a subtle performance enhancement by avoiding
        // calls to network namespace of the pod to query the IP of the veth interface on every
        // SandboxStatus request.
        if err := c.setupPodNetwork(ctx, &sandbox); err != nil {
            return nil, fmt.Errorf("failed to setup network for sandbox %q: %w", id, err)
        }
    }
}

RunPodSandbox -> setupPodNetwork

调用 CNI 插件在宿主机和容器网络命名空间中创建虚拟网卡（veth pair）、路由表等，使得容器连通宿主机网络。网络方案五花八门，但这不是本文的重点。

业务容器

kubelet 通过 CRI CreateContainer API 调用 containerd 在指定的 Sandbox 中拉起 Pod 定义中的业务容器：

https://github.com/containerd/containerd/blob/a338abc902d9f204dcb9df7212d39fd7d07ac06d/pkg/cri/server/container_create.go#L50-L311

// CreateContainer creates a new container in the given PodSandbox.
func (c *criService) CreateContainer(ctx context.Context, r *runtime.CreateContainerRequest) (_ *runtime.CreateContainerResponse, retErr error) {
    config := r.GetConfig()
    log.G(ctx).Debugf("Container config %+v", config)
    sandboxConfig := r.GetSandboxConfig()
    sandbox, err := c.sandboxStore.Get(r.GetPodSandboxId())
    if err != nil {
        return nil, fmt.Errorf("failed to find sandbox id %q: %w", r.GetPodSandboxId(), err)
    }
    // a lot of code here
    spec, err := c.containerSpec(id, sandboxID, sandboxPid, sandbox.NetNSPath, containerName, containerdImage.Name(), config, sandboxConfig,
        &image.ImageSpec.Config, append(mounts, volumeMounts...), ociRuntime)
    if err != nil {
        return nil, fmt.Errorf("failed to generate container %q spec: %w", id, err)
    }
    // a lot of code here
}

因为网络栈在 Sandbox 阶段已经全部准备好，所以只需在创建业务容器时提供已有的网络命名空间（OCI 配置）即可：

# sandbox
$ kubectl get po pod-nginx -o yaml | grep sand
    io.kubernetes.pod.sandbox.uid: f760e07f0771b5b1b0404f3758dec1a31b534494ddd7bc9287d084e03240b20e

$ ctr -n k8s.io task list | grep f760e07f0771b5b1b0404f3758dec1a31b534494ddd7bc9287d084e03240b20e
f760e07f0771b5b1b0404f3758dec1a31b534494ddd7bc9287d084e03240b20e    204039    RUNNING

# container
$ kubectl get po pod-nginx -o yaml | grep containerd
  - containerID: containerd://f2e17ff60a7c0e48cd44621fd4a236a8c1b5a2663cdb810aff5376c8dc6d3b9b

$ ctr -n k8s.io container info f2e17ff60a7c0e48cd44621fd4a236a8c1b5a2663cdb810aff5376c8dc6d3b9b | jq ".Spec.linux.namespaces"
[
  {
    "type": "pid"
  },
  {
    "type": "ipc",
    "path": "/proc/204039/ns/ipc"
  },
  {
    "type": "uts",
    "path": "/proc/204039/ns/uts"
  },
  {
    "type": "mount"
  },
  {
    "type": "network",
    "path": "/proc/204039/ns/net"
  }
]

$ ll /proc/204039/ns/net
lrwxrwxrwx 1 65535 65535 0 Jul 10 23:40 /proc/204039/ns/net -> 'net:[4026535107]'

$ lsns -o NSFS 4026535107
NSFS
/run/netns/cni-9ca95d5b-837c-84c5-f2ee-b60a96ef36c9
/run/netns/cni-9ca95d5b-837c-84c5-f2ee-b60a96ef36c9
/run/netns/cni-9ca95d5b-837c-84c5-f2ee-b60a96ef36c9
/run/netns/cni-9ca95d5b-837c-84c5-f2ee-b60a96ef36c9

pause 容器 PID 为 204039
业务容器网络命名空间指定为 /proc/204039/ns/net
/proc/204039/ns/net 的 NSFS 就是 /run/netns/cni-9ca95d5b-837c-84c5-f2ee-b60a96ef36c9

Sandbox（pause）

runc

父进程 runc run

子进程 runc init

CNI 插件

业务容器

查看更多

父进程 `runc run`

子进程 `runc init`