Linux 进程结构 task_struct
Feb 26, 2022 17:00 · 2048 words · 5 minute read
进程/线程在内核中由统一的结构 task_struct 管理。
任务 ID
pid_t pid;
process id
pid_t tgid;
thread group id
struct task_struct *group_leader;
进程的主线程
任务状态
-
https://github.com/torvalds/linux/blob/v5.4/include/linux/sched.h#L75-L92
/* Used in tsk->state: */ #define TASK_RUNNING 0x0000 #define TASK_INTERRUPTIBLE 0x0001 #define TASK_UNINTERRUPTIBLE 0x0002 #define __TASK_STOPPED 0x0004 #define __TASK_TRACED 0x0008 /* Used in tsk->state again: */ #define TASK_PARKED 0x0040 #define TASK_DEAD 0x0080 #define TASK_WAKEKILL 0x0100 #define TASK_WAKING 0x0200 #define TASK_NOLOAD 0x0400 #define TASK_NEW 0x0800 #define TASK_STATE_MAX 0x1000
-
TASK_RUNNING
就绪/运行:进程正在运行,或者时刻准备运行
-
TASK_INTERRUPTIBLE
浅度睡眠:等待 I/O 完成的过程中,可被信号唤醒
-
TASK_UNINTERRUPTIBLE
深度睡眠:不可被信号唤醒
https://github.com/torvalds/linux/blob/v5.4/include/linux/sched.h#L94-L97
/* Convenience macros for the sake of set_current_state: */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) #define TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED)
-
TASK_KILLABLE
可终止的睡眠状态:可以响应致命信号
-
__TASK_STOPPED
接收到 SIGSTOP、SIGTTIN、SIGTSTP 或者 SIGTTOU 信号之后进入的状态 -
__TASK_TRACED
进程被 debugger 等进程监视
-
-
https://github.com/torvalds/linux/blob/v5.4/include/linux/sched.h#L81-L84
/* Used in tsk->exit_state: */ #define EXIT_DEAD 0x0010 #define EXIT_ZOMBIE 0x0020 #define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD)
-
EXIT_DEAD
进程的最终状态 -
EXIT_ZOMBIE
僵尸状态:一个进程一旦结束,先进入 EXIT_ZOMBIE 状态;在它的父进程使用 wait() 系统调用回收它之前,进程一直停留在该状态
-
-
https://github.com/torvalds/linux/blob/v5.4/include/linux/sched.h#L1440-L1472
/* * Per process flags */ #define PF_IDLE 0x00000002 /* I am an IDLE thread */ #define PF_EXITING 0x00000004 /* Getting shut down */ #define PF_EXITPIDONE 0x00000008 /* PI exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ #define PF_FORKNOEXEC 0x00000040 /* Forked but didn't exec */ #define PF_MCE_PROCESS 0x00000080 /* Process policy on mce errors */ #define PF_SUPERPRIV 0x00000100 /* Used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* Dumped core */ #define PF_SIGNALED 0x00000400 /* Killed by a signal */ #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ #define PF_USED_ASYNC 0x00004000 /* Used async_schedule*(), used by module init */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ #define PF_FROZEN 0x00010000 /* Frozen for system suspend */ #define PF_KSWAPD 0x00020000 /* I am kswapd */ #define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */ #define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */ #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ #define PF_MEMSTALL 0x01000000 /* Stalled due to lack of memory */ #define PF_UMH 0x02000000 /* I'm an Usermodehelper process */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count 
it as freezable */ #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */
PF_EXITING
正在退出
[PF_VCPU](https://github.com/torvalds/linux/blob/v5.4/include/linux/sched.h#L1446)
进程运行在虚拟 CPU 上
[PF_FORKNOEXEC](https://github.com/torvalds/linux/blob/v5.4/include/linux/sched.h#L1448)
fork 完了,还没有 exec
运行统计信息
u64 utime;
用户态 CPU 时间
u64 stime;
内核态 CPU 时间
unsigned long nvcsw;
自愿上下文切换计数
unsigned long nivcsw;
非自愿上下文切换计数
u64 start_time;
进程启动时间(不包含睡眠时间)
u64 real_start_time;
进程启动时间(包含睡眠时间)
进程亲缘关系
struct task_struct __rcu *real_parent;
struct task_struct __rcu *parent;
父进程
struct list_head children;
子进程链表
struct list_head sibling;
把当前进程插入兄弟链表中
通常情况下 real_parent
和 parent
一样;
例外情况:bash 创建进程后,使用 GDB 来 debug 该进程,此时 parent
是 GDB,real_parent
是 bash。
进程权限
-
const struct cred __rcu *real_cred;
Objective and real subjective task credentials:谁能操作我
-
const struct cred __rcu *cred;
Effective (overridable) subjective task credentials:我能操作谁
-
struct cred { atomic_t usage; #ifdef CONFIG_DEBUG_CREDENTIALS atomic_t subscribers; /* number of processes subscribed */ void *put_addr; unsigned magic; #define CRED_MAGIC 0x43736564 #define CRED_MAGIC_DEAD 0x44656144 #endif kuid_t uid; /* real UID of the task */ kgid_t gid; /* real GID of the task */ kuid_t suid; /* saved UID of the task */ kgid_t sgid; /* saved GID of the task */ kuid_t euid; /* effective UID of the task */ kgid_t egid; /* effective GID of the task */ kuid_t fsuid; /* UID for VFS ops */ kgid_t fsgid; /* GID for VFS ops */ unsigned securebits; /* SUID-less security management */ kernel_cap_t cap_inheritable; /* caps our children can inherit */ kernel_cap_t cap_permitted; /* caps we're permitted */ kernel_cap_t cap_effective; /* caps we can actually use */ kernel_cap_t cap_bset; /* capability bounding set */ kernel_cap_t cap_ambient; /* Ambient capability set */ #ifdef CONFIG_KEYS unsigned char jit_keyring; /* default keyring to attach requested * keys to */ struct key *session_keyring; /* keyring inherited over fork */ struct key *process_keyring; /* keyring private to this process */ struct key *thread_keyring; /* keyring private to this thread */ struct key *request_key_auth; /* assumed request_key authority */ #endif #ifdef CONFIG_SECURITY void *security; /* subjective LSM security */ #endif struct user_struct *user; /* real user ID subscription */ struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */ struct group_info *group_info; /* supplementary groups for euid/fsgid */ /* RCU deletion */ union { int non_rcu; /* Can we skip RCU deletion? */ struct rcu_head rcu; /* RCU deletion hook */ }; } __randomize_layout;
uid/gid
谁启动进程就是谁的 ID
[euid](https://github.com/torvalds/linux/blob/v5.4/include/linux/cred.h#L124)/egid
进程操作消息队列、共享内存、信号量时审核权限
[fsuid](https://github.com/torvalds/linux/blob/v5.4/include/linux/cred.h#L126)/fsgid
进程操作文件时审核权限
Linux 中进程可以通过
setuid
设置用户 ID -
#define CAP_CHOWN 0 #define CAP_DAC_OVERRIDE 1 #define CAP_DAC_READ_SEARCH 2 #define CAP_FOWNER 3 #define CAP_FSETID 4 #define CAP_KILL 5 #define CAP_SETGID 6 #define CAP_SETUID 7 // ...
类似 RBAC
用户态函数栈
https://github.com/torvalds/linux/blob/v5.4/include/linux/sched.h#L625-L641
struct thread_info thread_info;
void *stack;
进程内存空间中的栈,从高地址到低地址往下增长。
32 位
- ESP(Extended Stack Pointer)栈顶指针寄存器,入栈/出栈操作就是调整 ESP 的值
- EBP(Extended Base Pointer)栈基地址指针寄存器,指向当前栈帧最底部
- EAX 保存返回值
A 调用 B,A 的栈帧包含:
- A 函数的局部变量
- 调用 B 的时候要传给 B 的参数
- 返回 A 的地址
B 的栈帧包含:
- A 栈帧的栈底位置 EBP
- B 函数的局部变量
64 位
- rsp 栈顶指针寄存器
- rbp 栈基地址指针寄存器
- rax 保存返回值
- rdi、rsi、rdx、rcx、r8、r9 依次传递函数调用时的前 6 个整型/指针参数,超出的参数通过栈传递
内核态函数栈
Linux 给每个任务都分配了内核栈 void *stack;
-
pt_regs
系统调用从用户态到内核态,将用户态运行过程中的 CPU 上下文保存至
pt_regs
结构的寄存器变量中:
- 32 位
struct pt_regs { /* * NB: 32-bit x86 CPUs are inconsistent as what happens in the * following cases (where %seg represents a segment register): * * - pushl %seg: some do a 16-bit write and leave the high * bits alone * - movl %seg, [mem]: some do a 16-bit write despite the movl * - IDT entry: some (e.g. 486) will leave the high bits of CS * and (if applicable) SS undefined. * * Fortunately, x86-32 doesn't read the high bits on POP or IRET, * so we can just treat all of the segment registers as 16-bit * values. */ unsigned long bx; unsigned long cx; unsigned long dx; unsigned long si; unsigned long di; unsigned long bp; unsigned long ax; unsigned short ds; unsigned short __dsh; unsigned short es; unsigned short __esh; unsigned short fs; unsigned short __fsh; /* On interrupt, gs and __gsh store the vector number. */ unsigned short gs; unsigned short __gsh; /* On interrupt, this is the error code. */ unsigned long orig_ax; unsigned long ip; unsigned short cs; unsigned short __csh; unsigned long flags; unsigned long sp; unsigned short ss; unsigned short __ssh; };
- 64 位
struct pt_regs { /* * C ABI says these regs are callee-preserved. They aren't saved on kernel entry * unless syscall needs a complete, fully filled "struct pt_regs". */ unsigned long r15; unsigned long r14; unsigned long r13; unsigned long r12; unsigned long bp; unsigned long bx; /* These regs are callee-clobbered. Always saved on kernel entry. */ unsigned long r11; unsigned long r10; unsigned long r9; unsigned long r8; unsigned long ax; unsigned long cx; unsigned long dx; unsigned long si; unsigned long di; /* * On syscall entry, this is syscall#. On CPU exception, this is error code. * On hw interrupt, it's IRQ number: */ unsigned long orig_ax; /* Return frame for iretq */ unsigned long ip; unsigned long cs; unsigned long flags; unsigned long sp; unsigned long ss; /* top of stack page */ };
通过 task_struct 找内核栈
-
static inline void *task_stack_page(const struct task_struct *task) { return task->stack; }
从 task_struct 找到内核栈起始位置
-
#define task_pt_regs(task) \ ({ \ unsigned long __ptr = (unsigned long)task_stack_page(task); \ __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \ ((struct pt_regs *)__ptr) - 1; \ })
起始位置 + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING 得到内核栈的最高可用地址,再往下减去一个 struct pt_regs 的大小,得到 pt_regs 位置
通过内核栈找 task_struct
-
#define current_thread_info() ((struct thread_info *)current)
-
struct task_struct; DECLARE_PER_CPU(struct task_struct *, current_task); static __always_inline struct task_struct *get_current(void) { return this_cpu_read_stable(current_task); } #define current get_current()
每个 CPU 运行的 task_struct 在 PER_CPU 变量中,在系统初始化时指向 init_task https://github.com/torvalds/linux/blob/v5.4/arch/x86/kernel/cpu/common.c#L1719:
DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;