Linux 进程结构 task_struct

Feb 26, 2022 17:00 · 2048 words · 5 minute read Linux

进程/线程在内核中由统一的结构 task_struct 管理。

任务 ID

任务状态

  • volatile long state;

    https://github.com/torvalds/linux/blob/v5.4/include/linux/sched.h#L75-L92

    /* Used in tsk->state: */
    #define TASK_RUNNING            0x0000
    #define TASK_INTERRUPTIBLE      0x0001
    #define TASK_UNINTERRUPTIBLE        0x0002
    #define __TASK_STOPPED          0x0004
    #define __TASK_TRACED           0x0008
    
    /* Used in tsk->state again: */
    #define TASK_PARKED         0x0040
    #define TASK_DEAD           0x0080
    #define TASK_WAKEKILL           0x0100
    #define TASK_WAKING         0x0200
    #define TASK_NOLOAD         0x0400
    #define TASK_NEW            0x0800
    #define TASK_STATE_MAX          0x1000
    

    https://github.com/torvalds/linux/blob/v5.4/include/linux/sched.h#L94-L97

    /* Convenience macros for the sake of set_current_state: */
    #define TASK_KILLABLE           (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
    #define TASK_STOPPED            (TASK_WAKEKILL | __TASK_STOPPED)
    #define TASK_TRACED         (TASK_WAKEKILL | __TASK_TRACED)
    
    • TASK_KILLABLE 可终止的睡眠状态

      可以响应致命信号

    • __TASK_STOPPED 接收到 SIGSTOP、SIGTTIN、SIGTSTP 或者 SIGTTOU 信号之后进入的状态

    • __TASK_TRACED 进程被 debugger 等进程监视

  • int exit_state;

    https://github.com/torvalds/linux/blob/v5.4/include/linux/sched.h#L81-L84

    /* Used in tsk->exit_state: */
    #define EXIT_DEAD           0x0010
    #define EXIT_ZOMBIE         0x0020
    #define EXIT_TRACE          (EXIT_ZOMBIE | EXIT_DEAD)
    
    • EXIT_DEAD 进程的最终状态

    • EXIT_ZOMBIE 僵尸状态

      进程结束后先进入 EXIT_ZOMBIE 状态;如果它的父进程尚未使用 wait() 等系统调用回收它,进程就会一直停留在该状态,即僵尸进程

  • unsigned int flags;

    https://github.com/torvalds/linux/blob/v5.4/include/linux/sched.h#L1440-L1472

    /*
     * Per process flags
     */
    #define PF_IDLE         0x00000002  /* I am an IDLE thread */
    #define PF_EXITING      0x00000004  /* Getting shut down */
    #define PF_EXITPIDONE       0x00000008  /* PI exit done on shut down */
    #define PF_VCPU         0x00000010  /* I'm a virtual CPU */
    #define PF_WQ_WORKER        0x00000020  /* I'm a workqueue worker */
    #define PF_FORKNOEXEC       0x00000040  /* Forked but didn't exec */
    #define PF_MCE_PROCESS      0x00000080      /* Process policy on mce errors */
    #define PF_SUPERPRIV        0x00000100  /* Used super-user privileges */
    #define PF_DUMPCORE     0x00000200  /* Dumped core */
    #define PF_SIGNALED     0x00000400  /* Killed by a signal */
    #define PF_MEMALLOC     0x00000800  /* Allocating memory */
    #define PF_NPROC_EXCEEDED   0x00001000  /* set_user() noticed that RLIMIT_NPROC was exceeded */
    #define PF_USED_MATH        0x00002000  /* If unset the fpu must be initialized before use */
    #define PF_USED_ASYNC       0x00004000  /* Used async_schedule*(), used by module init */
    #define PF_NOFREEZE     0x00008000  /* This thread should not be frozen */
    #define PF_FROZEN       0x00010000  /* Frozen for system suspend */
    #define PF_KSWAPD       0x00020000  /* I am kswapd */
    #define PF_MEMALLOC_NOFS    0x00040000  /* All allocation requests will inherit GFP_NOFS */
    #define PF_MEMALLOC_NOIO    0x00080000  /* All allocation requests will inherit GFP_NOIO */
    #define PF_LESS_THROTTLE    0x00100000  /* Throttle me less: I clean memory */
    #define PF_KTHREAD      0x00200000  /* I am a kernel thread */
    #define PF_RANDOMIZE        0x00400000  /* Randomize virtual address space */
    #define PF_SWAPWRITE        0x00800000  /* Allowed to write to swap */
    #define PF_MEMSTALL     0x01000000  /* Stalled due to lack of memory */
    #define PF_UMH          0x02000000  /* I'm an Usermodehelper process */
    #define PF_NO_SETAFFINITY   0x04000000  /* Userland is not allowed to meddle with cpus_mask */
    #define PF_MCE_EARLY        0x08000000      /* Early kill for mce process policy */
    #define PF_MEMALLOC_NOCMA   0x10000000  /* All allocation request will have _GFP_MOVABLE cleared */
    #define PF_FREEZER_SKIP     0x40000000  /* Freezer should not count it as freezable */
    #define PF_SUSPEND_TASK     0x80000000      /* This thread called freeze_processes() and should not be frozen */
    

运行统计信息

进程亲缘关系

通常情况下 real_parent 和 parent 一样; 例外情况:bash 创建进程后,使用 GDB 来 debug 该进程,此时 parent 是 GDB,real_parent 是 bash。

进程权限

  • const struct cred __rcu *real_cred; Objective and real subjective task credentials

    谁能操作我

  • const struct cred __rcu *cred; Effective (overridable) subjective task credentials

    我能操作谁

  • cred

    struct cred {
        atomic_t    usage;
    #ifdef CONFIG_DEBUG_CREDENTIALS
        atomic_t    subscribers;    /* number of processes subscribed */
        void        *put_addr;
        unsigned    magic;
    #define CRED_MAGIC  0x43736564
    #define CRED_MAGIC_DEAD 0x44656144
    #endif
        kuid_t      uid;        /* real UID of the task */
        kgid_t      gid;        /* real GID of the task */
        kuid_t      suid;       /* saved UID of the task */
        kgid_t      sgid;       /* saved GID of the task */
        kuid_t      euid;       /* effective UID of the task */
        kgid_t      egid;       /* effective GID of the task */
        kuid_t      fsuid;      /* UID for VFS ops */
        kgid_t      fsgid;      /* GID for VFS ops */
        unsigned    securebits; /* SUID-less security management */
        kernel_cap_t    cap_inheritable; /* caps our children can inherit */
        kernel_cap_t    cap_permitted;  /* caps we're permitted */
        kernel_cap_t    cap_effective;  /* caps we can actually use */
        kernel_cap_t    cap_bset;   /* capability bounding set */
        kernel_cap_t    cap_ambient;    /* Ambient capability set */
    #ifdef CONFIG_KEYS
        unsigned char   jit_keyring;    /* default keyring to attach requested
                         * keys to */
        struct key  *session_keyring; /* keyring inherited over fork */
        struct key  *process_keyring; /* keyring private to this process */
        struct key  *thread_keyring; /* keyring private to this thread */
        struct key  *request_key_auth; /* assumed request_key authority */
    #endif
    #ifdef CONFIG_SECURITY
        void        *security;  /* subjective LSM security */
    #endif
        struct user_struct *user;   /* real user ID subscription */
        struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */
        struct group_info *group_info;  /* supplementary groups for euid/fsgid */
        /* RCU deletion */
        union {
            int non_rcu;            /* Can we skip RCU deletion? */
            struct rcu_head rcu;        /* RCU deletion hook */
        };
    } __randomize_layout;
    

    Linux 中进程可以通过 setuid 设置用户 ID

  • capabilities 机制

    #define CAP_CHOWN            0
    #define CAP_DAC_OVERRIDE     1
    #define CAP_DAC_READ_SEARCH  2
    #define CAP_FOWNER           3
    #define CAP_FSETID           4
    #define CAP_KILL             5
    #define CAP_SETGID           6
    #define CAP_SETUID           7
    // ...
    

    类似 RBAC:把 root 的特权拆分成一组细粒度的能力(capability),按需授予进程,而不是要么全有要么全无

用户态函数栈

https://github.com/torvalds/linux/blob/v5.4/include/linux/sched.h#L625-L641

struct thread_info      thread_info;
void                *stack;

进程内存空间中的栈,从高地址到低地址往下增长。

32 位

  • ESP(Extended Stack Pointer)栈顶指针寄存器,入栈/出栈操作就是调整 ESP 的值
  • EBP(Extended Base Pointer)栈基地址指针寄存器,指向当前栈帧最底部
  • EAX 保存返回值

A 调用 B,A 的栈帧包含:

  • A 函数的局部变量
  • 调用 B 的时候要传给 B 的参数
  • 返回 A 的地址

B 的栈帧包含:

  • A 栈帧的栈底位置 EBP
  • B 函数的局部变量

64 位

  • rsp 栈顶指针寄存器
  • rbp 栈基地址指针寄存器
  • rax 保存返回值
  • rdi、rsi、rdx、rcx、r8、r9 依次传递函数调用时的前 6 个整型/指针参数,更多的参数通过栈传递

内核态函数栈

Linux 给每个任务都分配了内核栈 void *stack;

  • pt_regs

    系统调用从用户态到内核态,将用户态运行过程中的 CPU 上下文保存至 pt_regs 结构的寄存器变量中:

    • 32 位
    struct pt_regs {
        /*
         * NB: 32-bit x86 CPUs are inconsistent as what happens in the
         * following cases (where %seg represents a segment register):
         *
         * - pushl %seg: some do a 16-bit write and leave the high
         *   bits alone
         * - movl %seg, [mem]: some do a 16-bit write despite the movl
         * - IDT entry: some (e.g. 486) will leave the high bits of CS
         *   and (if applicable) SS undefined.
         *
         * Fortunately, x86-32 doesn't read the high bits on POP or IRET,
         * so we can just treat all of the segment registers as 16-bit
         * values.
         */
        unsigned long bx;
        unsigned long cx;
        unsigned long dx;
        unsigned long si;
        unsigned long di;
        unsigned long bp;
        unsigned long ax;
        unsigned short ds;
        unsigned short __dsh;
        unsigned short es;
        unsigned short __esh;
        unsigned short fs;
        unsigned short __fsh;
        /* On interrupt, gs and __gsh store the vector number. */
        unsigned short gs;
        unsigned short __gsh;
        /* On interrupt, this is the error code. */
        unsigned long orig_ax;
        unsigned long ip;
        unsigned short cs;
        unsigned short __csh;
        unsigned long flags;
        unsigned long sp;
        unsigned short ss;
        unsigned short __ssh;
    };
    
    • 64 位
    struct pt_regs {
    /*
     * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
     * unless syscall needs a complete, fully filled "struct pt_regs".
     */
        unsigned long r15;
        unsigned long r14;
        unsigned long r13;
        unsigned long r12;
        unsigned long bp;
        unsigned long bx;
    /* These regs are callee-clobbered. Always saved on kernel entry. */
        unsigned long r11;
        unsigned long r10;
        unsigned long r9;
        unsigned long r8;
        unsigned long ax;
        unsigned long cx;
        unsigned long dx;
        unsigned long si;
        unsigned long di;
    /*
     * On syscall entry, this is syscall#. On CPU exception, this is error code.
     * On hw interrupt, it's IRQ number:
     */
        unsigned long orig_ax;
    /* Return frame for iretq */
        unsigned long ip;
        unsigned long cs;
        unsigned long flags;
        unsigned long sp;
        unsigned long ss;
    /* top of stack page */
    };
    

通过 task_struct 找内核栈

  1. task_stack_page

    static inline void *task_stack_page(const struct task_struct *task)
    {
        return task->stack;
    }
    

    从 task_struct 找到内核栈起始位置

  2. task_pt_regs

    #define task_pt_regs(task) \
    ({                                  \
        unsigned long __ptr = (unsigned long)task_stack_page(task); \
        __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;     \
        ((struct pt_regs *)__ptr) - 1;                  \
    })
    

    起始位置 + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING 得到内核栈的最高地址端,再往低地址退一个 struct pt_regs 的大小,就是 pt_regs 的位置

通过内核栈找 task_struct

  1. current_thread_info

    #define current_thread_info() ((struct thread_info *)current)
    
  2. current

    struct task_struct;
    
    DECLARE_PER_CPU(struct task_struct *, current_task);
    
    static __always_inline struct task_struct *get_current(void)
    {
        return this_cpu_read_stable(current_task);
    }
    
    #define current get_current()
    

    每个 CPU 当前运行任务的 task_struct 指针保存在 PER_CPU 变量 current_task 中,在系统初始化时指向 init_task https://github.com/torvalds/linux/blob/v5.4/arch/x86/kernel/cpu/common.c#L1719

    DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;