for (i = ns->level; i >= 0; i--) { int tid = 0; ... // 按照namespace所在的层级:依次为pid在各个namespace生成一个pid号 if (tid) { nr = idr_alloc(&tmp->idr, NULL, tid, tid + 1, GFP_ATOMIC); /* * If ENOSPC is returned it means that the PID is * alreay in use. Return EEXIST in that case. */ if (nr == -ENOSPC) nr = -EEXIST; } else { int pid_min = 1; /* * init really needs pid 1, but after reaching the * maximum wrap back to RESERVED_PIDS */ if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS) pid_min = RESERVED_PIDS;
/* * Store a null pointer so find_pid_ns does not find * a partially initialized PID (see below). */ nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, pid_max, GFP_ATOMIC); } ... // 将各个层级pid和namespace信息顺序存储在struct pid的number结构中 pid->numbers[i].nr = nr; pid->numbers[i].ns = tmp; tmp = tmp->parent; }
... upid = pid->numbers + ns->level; ... for ( ; upid >= pid->numbers; --upid) { /* Make the PID visible to find_pid_ns. */ idr_replace(&upid->ns->idr, pid, upid->nr); upid->ns->pid_allocated++; }
// https://github.com/torvalds/linux/blob/v6.0/include/linux/pid.h
/*
 * the helpers to get the pid's id seen from different namespaces
 *
 * pid_nr()    : global id, i.e. the id seen from the init namespace;
 * pid_vnr()   : virtual id, i.e. the id seen from the pid namespace of
 *               current.
 * pid_nr_ns() : id seen from the ns specified.
 *
 * see also task_xid_nr() etc in include/linux/sched.h
 */
static inline pid_t pid_nr(struct pid *pid)
{
	pid_t nr = 0;
	if (pid)
		nr = pid->numbers[0].nr; // 初始Namespace中的PID就是全局PID值,全局唯一
	return nr;
}
sleep(1); /* Give child time to do something */ system("id"); ... if (waitpid(pid, NULL, 0) == -1) /* Wait for child */ errExit("waitpid"); ... }
用普通用户walkerdu执行测试,返回结果如下:
1 2 3 4 5 6
~> ./a.out NEWUSER in parent: clone() returned child pid=7165 in child: uid=65534(nobody) gid=65534(nogroup) groups=65534(nogroup) in parent: uid=1001(walkerdu) gid=1001(walkerdu) groups=1001(walkerdu)
sleep(1); /* Give child time to do something */ system("id"); // 设置子进程的User ID映射, std::string cmd = "echo '0 1001 1' > /proc/" + std::to_string(pid) + "/uid_map"; system(cmd.c_str()); ... if (waitpid(pid, NULL, 0) == -1) /* Wait for child */ errExit("waitpid"); ... }
如下测试结果:
1 2 3 4 5 6 7 8
$ ./a.out NEWUSER in parent: clone() returned child pid=10187 in child: uid=65534(nobody) gid=65534(nogroup) groups=65534(nogroup) in parent: uid=1001(walkerdu) gid=1001(walkerdu) groups=1001(walkerdu) in child: after parent set uid_map uid=0(root) gid=65534(nogroup) groups=65534(nogroup)
User Namespace是比较特殊的Namespace,所有其他类型的Namespace(NonUser Namespace)在创建的时候都会绑定它所在的User Namespace;在这些NonUser Namespace中对资源执行需要特权的操作时,需要在其绑定的User Namespace中拥有对应的特权权限;
sleep(1); /* Give child time to do something */ ... if (waitpid(pid, NULL, 0) == -1) /* Wait for child */ errExit("waitpid"); ... }
执行上述测试代码,输出如下:
1 2 3 4 5 6 7 8 9 10
# ./a.out NEWIPC in parent: clone() returned child pid=10892 in child: shmid=0, key=0x111111 in child: shmid=32769, key=0x2222222 in child: ipcs -m info:
struct vfsmount {
	struct dentry *mnt_root;	/* root of the mounted tree */
	struct super_block *mnt_sb;	/* pointer to superblock */
	int mnt_flags;
	struct user_namespace *mnt_userns;
} __randomize_layout;
Linux Namespace提供了clone该挂载数据的功能,可以在独立的Mount Namespace中进行文件系统的挂载,而不影响到其他的Namespace,Mount Namespace是Linux引入的第一个Namespace类型,发布于2002年的Linux「2.4.19」;
在引入Mount Namespace后,需要知道共享子树(Shared subtrees)的概念,共享子树是为了解决这样一个问题:在挂载一个新的资源的时候,需要在不同的Namespace下分别手动执行挂载操作;Linux 「2.6.15」引入了shared subtrees feature,可以支持在一个Namespace下进行资源挂载后,能够自动、可控地在不同的Namespace间进行传递;
According to these rules, the root mount would be MS_PRIVATE, and all descendant mounts would by default also be MS_PRIVATE. However, MS_SHARED would arguably have been a better default, since it is the more commonly employed propagation type. For that reason, systemd sets the propagation type of all mount points to MS_SHARED. Thus, on most modern Linux distributions, the default propagation type is effectively MS_SHARED.
# ./a.out NEWNS in parent: clone() returned child pid=18875 in child: /dev/vda1 on / type ext4 (rw,relatime,errors=remount-ro,data=ordered) ... /dev/vda1 on /rootfs/data type ext4 (rw,relatime,errors=remount-ro,data=ordered)
in parent: ... /dev/vda1 on / type ext4 (rw,relatime,errors=remount-ro,data=ordered) ...
sleep(1); /* Give child time to do something */ printf("in parent:\n"); system("ip address"); ... if (waitpid(pid, NULL, 0) == -1) /* Wait for child */ errExit("waitpid"); ... }
执行结果如下:父进程输出的Network信息如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
in parent: clone() returned child pid=22338
in parent: 1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 inet 127.0.0.1/8 scope host lo valid_lft forever preferred_lft forever 2: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UP qlen 1000 link/ether 52:54:00:7f:45:73 brd ff:ff:ff:ff:ff:ff inet 9.134.131.137/21 brd 9.134.135.255 scope global eth1 valid_lft forever preferred_lft forever 3: docker0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP link/ether 02:42:dd:ed:84:53 brd ff:ff:ff:ff:ff:ff inet 192.168.10.1/24 brd 192.168.10.255 scope global docker0 valid_lft forever preferred_lft forever 7: veth9e2fdfe: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue master docker0 state UP link/ether 8a:c4:d5:22:a3:32 brd ff:ff:ff:ff:ff:ff
子进程中输出的Network信息如下:
1 2 3
in child: 1: lo: <LOOPBACK> mtu 65536 qdisc noop state DOWN link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
sleep(1); /* Give child time to do something */ struct utsname uts; if (uname(&uts) == -1) errExit("uname"); printf("uts.nodename in parent: %s\n", uts.nodename); ... if (waitpid(pid, NULL, 0) == -1) /* Wait for child */ errExit("waitpid"); ... }
测试结果如下:
1 2 3 4
# ./a.out NEWUTS walkerdu-host in parent: clone() returned child pid=19292 uts.nodename in child: walkerdu-host uts.nodename in parent: qcloud