// SPDX-License-Identifier: GPL-2.0
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/cgroup.h>
#include <linux/magic.h>
#include <linux/mount.h>
#include <linux/pid.h>
#include <linux/pidfs.h>
#include <linux/pid_namespace.h>
#include <linux/poll.h>
#include <linux/proc_fs.h>
#include <linux/proc_ns.h>
#include <linux/pseudo_fs.h>
#include <linux/ptrace.h>
#include <linux/seq_file.h>
#include <uapi/linux/pidfd.h>
#include <linux/ipc_namespace.h>
#include <linux/time_namespace.h>
#include <linux/utsname.h>
#include <net/net_namespace.h>
/* Only referenced from the file_operations table when procfs is built in. */
#ifdef CONFIG_PROC_FS
/**
 * pidfd_show_fdinfo - print information about a pidfd
 * @m: proc fdinfo file
 * @f: file referencing a pidfd
 *
 * Pid:
 * This function will print the pid that a given pidfd refers to in the
 * pid namespace of the procfs instance.
 * If the pid namespace of the process is not a descendant of the pid
 * namespace of the procfs instance 0 will be shown as its pid. This is
 * similar to calling getppid() on a process whose parent is outside of
 * its pid namespace.
 * If the pid no longer has a task attached, -1 is shown.
 *
 * NSpid:
 * If pid namespaces are supported then this function will also print
 * the pid that a given pidfd refers to for all descendant pid namespaces
 * starting from the current pid namespace of the instance, i.e. the
 * Pid field and the first entry in the NSpid field will be identical.
 * If the pid namespace of the process is not a descendant of the pid
 * namespace of the procfs instance 0 will be shown as its first NSpid
 * entry and no others will be shown.
 * Note that this differs from the Pid and NSpid fields in
 * /proc/<pid>/status where Pid and NSpid are always shown relative to
 * the pid namespace of the procfs instance. The difference becomes
 * obvious when sending around a pidfd between pid namespaces from a
 * different branch of the tree, i.e. where no ancestral relation is
 * present between the pid namespaces:
 * - create two new pid namespaces ns1 and ns2 in the initial pid
 *   namespace (also take care to create new mount namespaces in the
 *   new pid namespace and mount procfs)
 * - create a process with a pidfd in ns1
 * - send pidfd from ns1 to ns2
 * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
 *   have exactly one entry, which is 0
 */
static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct pid *pid = pidfd_pid(f);
	struct pid_namespace *ns;
	pid_t nr = -1;

	if (likely(pid_has_task(pid, PIDTYPE_PID))) {
		ns = proc_pid_ns(file_inode(m->file)->i_sb);
		nr = pid_nr_ns(pid, ns);
	}

	seq_put_decimal_ll(m, "Pid:\t", nr);

#ifdef CONFIG_PID_NS
	seq_put_decimal_ll(m, "\nNSpid:\t", nr);
	if (nr > 0) {
		int i;

		/*
		 * If nr is non-zero it means that 'pid' is valid and that
		 * ns, i.e. the pid namespace associated with the procfs
		 * instance, is in the pid namespace hierarchy of pid.
		 * Start at one below the already printed level.
		 */
		for (i = ns->level + 1; i <= pid->level; i++)
			seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
	}
#endif
	seq_putc(m, '\n');
}
#endif /* CONFIG_PROC_FS */
94 * Poll support for process exit notification.
96 static __poll_t
pidfd_poll(struct file
*file
, struct poll_table_struct
*pts
)
98 struct pid
*pid
= pidfd_pid(file
);
99 bool thread
= file
->f_flags
& PIDFD_THREAD
;
100 struct task_struct
*task
;
101 __poll_t poll_flags
= 0;
103 poll_wait(file
, &pid
->wait_pidfd
, pts
);
105 * Depending on PIDFD_THREAD, inform pollers when the thread
106 * or the whole thread-group exits.
109 task
= pid_task(pid
, PIDTYPE_PID
);
111 poll_flags
= EPOLLIN
| EPOLLRDNORM
| EPOLLHUP
;
112 else if (task
->exit_state
&& (thread
|| thread_group_empty(task
)))
113 poll_flags
= EPOLLIN
| EPOLLRDNORM
;
118 static long pidfd_info(struct task_struct
*task
, unsigned int cmd
, unsigned long arg
)
120 struct pidfd_info __user
*uinfo
= (struct pidfd_info __user
*)arg
;
121 size_t usize
= _IOC_SIZE(cmd
);
122 struct pidfd_info kinfo
= {};
123 struct user_namespace
*user_ns
;
124 const struct cred
*c
;
126 #ifdef CONFIG_CGROUPS
132 if (usize
< PIDFD_INFO_SIZE_VER0
)
133 return -EINVAL
; /* First version, no smaller struct possible */
135 if (copy_from_user(&mask
, &uinfo
->mask
, sizeof(mask
)))
138 c
= get_task_cred(task
);
142 /* Unconditionally return identifiers and credentials, the rest only on request */
144 user_ns
= current_user_ns();
145 kinfo
.ruid
= from_kuid_munged(user_ns
, c
->uid
);
146 kinfo
.rgid
= from_kgid_munged(user_ns
, c
->gid
);
147 kinfo
.euid
= from_kuid_munged(user_ns
, c
->euid
);
148 kinfo
.egid
= from_kgid_munged(user_ns
, c
->egid
);
149 kinfo
.suid
= from_kuid_munged(user_ns
, c
->suid
);
150 kinfo
.sgid
= from_kgid_munged(user_ns
, c
->sgid
);
151 kinfo
.fsuid
= from_kuid_munged(user_ns
, c
->fsuid
);
152 kinfo
.fsgid
= from_kgid_munged(user_ns
, c
->fsgid
);
153 kinfo
.mask
|= PIDFD_INFO_CREDS
;
156 #ifdef CONFIG_CGROUPS
158 cgrp
= task_dfl_cgroup(task
);
159 kinfo
.cgroupid
= cgroup_id(cgrp
);
160 kinfo
.mask
|= PIDFD_INFO_CGROUPID
;
165 * Copy pid/tgid last, to reduce the chances the information might be
166 * stale. Note that it is not possible to ensure it will be valid as the
167 * task might return as soon as the copy_to_user finishes, but that's ok
168 * and userspace expects that might happen and can act accordingly, so
169 * this is just best-effort. What we can do however is checking that all
170 * the fields are set correctly, or return ESRCH to avoid providing
171 * incomplete information. */
173 kinfo
.ppid
= task_ppid_nr_ns(task
, NULL
);
174 kinfo
.tgid
= task_tgid_vnr(task
);
175 kinfo
.pid
= task_pid_vnr(task
);
176 kinfo
.mask
|= PIDFD_INFO_PID
;
178 if (kinfo
.pid
== 0 || kinfo
.tgid
== 0 || (kinfo
.ppid
== 0 && kinfo
.pid
!= 1))
182 * If userspace and the kernel have the same struct size it can just
183 * be copied. If userspace provides an older struct, only the bits that
184 * userspace knows about will be copied. If userspace provides a new
185 * struct, only the bits that the kernel knows about will be copied.
187 if (copy_to_user(uinfo
, &kinfo
, min(usize
, sizeof(kinfo
))))
193 static long pidfd_ioctl(struct file
*file
, unsigned int cmd
, unsigned long arg
)
195 struct task_struct
*task
__free(put_task
) = NULL
;
196 struct nsproxy
*nsp
__free(put_nsproxy
) = NULL
;
197 struct pid
*pid
= pidfd_pid(file
);
198 struct ns_common
*ns_common
= NULL
;
199 struct pid_namespace
*pid_ns
;
201 task
= get_pid_task(pid
, PIDTYPE_PID
);
205 /* Extensible IOCTL that does not open namespace FDs, take a shortcut */
206 if (_IOC_NR(cmd
) == _IOC_NR(PIDFD_GET_INFO
))
207 return pidfd_info(task
, cmd
, arg
);
212 scoped_guard(task_lock
, task
) {
218 return -ESRCH
; /* just pretend it didn't exist */
221 * We're trying to open a file descriptor to the namespace so perform a
222 * filesystem cred ptrace check. Also, we mirror nsfs behavior.
224 if (!ptrace_may_access(task
, PTRACE_MODE_READ_FSCREDS
))
228 /* Namespaces that hang of nsproxy. */
229 case PIDFD_GET_CGROUP_NAMESPACE
:
230 if (IS_ENABLED(CONFIG_CGROUPS
)) {
231 get_cgroup_ns(nsp
->cgroup_ns
);
232 ns_common
= to_ns_common(nsp
->cgroup_ns
);
235 case PIDFD_GET_IPC_NAMESPACE
:
236 if (IS_ENABLED(CONFIG_IPC_NS
)) {
237 get_ipc_ns(nsp
->ipc_ns
);
238 ns_common
= to_ns_common(nsp
->ipc_ns
);
241 case PIDFD_GET_MNT_NAMESPACE
:
242 get_mnt_ns(nsp
->mnt_ns
);
243 ns_common
= to_ns_common(nsp
->mnt_ns
);
245 case PIDFD_GET_NET_NAMESPACE
:
246 if (IS_ENABLED(CONFIG_NET_NS
)) {
247 ns_common
= to_ns_common(nsp
->net_ns
);
248 get_net_ns(ns_common
);
251 case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE
:
252 if (IS_ENABLED(CONFIG_PID_NS
)) {
253 get_pid_ns(nsp
->pid_ns_for_children
);
254 ns_common
= to_ns_common(nsp
->pid_ns_for_children
);
257 case PIDFD_GET_TIME_NAMESPACE
:
258 if (IS_ENABLED(CONFIG_TIME_NS
)) {
259 get_time_ns(nsp
->time_ns
);
260 ns_common
= to_ns_common(nsp
->time_ns
);
263 case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE
:
264 if (IS_ENABLED(CONFIG_TIME_NS
)) {
265 get_time_ns(nsp
->time_ns_for_children
);
266 ns_common
= to_ns_common(nsp
->time_ns_for_children
);
269 case PIDFD_GET_UTS_NAMESPACE
:
270 if (IS_ENABLED(CONFIG_UTS_NS
)) {
271 get_uts_ns(nsp
->uts_ns
);
272 ns_common
= to_ns_common(nsp
->uts_ns
);
275 /* Namespaces that don't hang of nsproxy. */
276 case PIDFD_GET_USER_NAMESPACE
:
277 if (IS_ENABLED(CONFIG_USER_NS
)) {
279 ns_common
= to_ns_common(get_user_ns(task_cred_xxx(task
, user_ns
)));
283 case PIDFD_GET_PID_NAMESPACE
:
284 if (IS_ENABLED(CONFIG_PID_NS
)) {
286 pid_ns
= task_active_pid_ns(task
);
288 ns_common
= to_ns_common(get_pid_ns(pid_ns
));
299 /* open_namespace() unconditionally consumes the reference */
300 return open_namespace(ns_common
);
303 static const struct file_operations pidfs_file_operations
= {
305 #ifdef CONFIG_PROC_FS
306 .show_fdinfo
= pidfd_show_fdinfo
,
308 .unlocked_ioctl
= pidfd_ioctl
,
309 .compat_ioctl
= compat_ptr_ioctl
,
312 struct pid
*pidfd_pid(const struct file
*file
)
314 if (file
->f_op
!= &pidfs_file_operations
)
315 return ERR_PTR(-EBADF
);
316 return file_inode(file
)->i_private
;
319 static struct vfsmount
*pidfs_mnt __ro_after_init
;
321 #if BITS_PER_LONG == 32
323 * Provide a fallback mechanism for 32-bit systems so processes remain
324 * reliably comparable by inode number even on those systems.
326 static DEFINE_IDA(pidfd_inum_ida
);
328 static int pidfs_inum(struct pid
*pid
, unsigned long *ino
)
332 ret
= ida_alloc_range(&pidfd_inum_ida
, RESERVED_PIDS
+ 1,
333 UINT_MAX
, GFP_ATOMIC
);
341 static inline void pidfs_free_inum(unsigned long ino
)
344 ida_free(&pidfd_inum_ida
, ino
);
347 static inline int pidfs_inum(struct pid
*pid
, unsigned long *ino
)
352 #define pidfs_free_inum(ino) ((void)(ino))
356 * The vfs falls back to simple_setattr() if i_op->setattr() isn't
357 * implemented. Let's reject it completely until we have a clean
358 * permission concept for pidfds.
360 static int pidfs_setattr(struct mnt_idmap
*idmap
, struct dentry
*dentry
,
368 * User space expects pidfs inodes to have no file type in st_mode.
370 * In particular, 'lsof' has this legacy logic:
372 * type = s->st_mode & S_IFMT;
376 * if (!strcmp(p, "anon_inode"))
377 * Lf->ntype = Ntype = N_ANON_INODE;
379 * to detect our old anon_inode logic.
381 * Rather than mess with our internal sane inode data, just fix it
382 * up here in getattr() by masking off the format bits.
384 static int pidfs_getattr(struct mnt_idmap
*idmap
, const struct path
*path
,
385 struct kstat
*stat
, u32 request_mask
,
386 unsigned int query_flags
)
388 struct inode
*inode
= d_inode(path
->dentry
);
390 generic_fillattr(&nop_mnt_idmap
, request_mask
, inode
, stat
);
391 stat
->mode
&= ~S_IFMT
;
395 static const struct inode_operations pidfs_inode_operations
= {
396 .getattr
= pidfs_getattr
,
397 .setattr
= pidfs_setattr
,
400 static void pidfs_evict_inode(struct inode
*inode
)
402 struct pid
*pid
= inode
->i_private
;
406 pidfs_free_inum(inode
->i_ino
);
409 static const struct super_operations pidfs_sops
= {
410 .drop_inode
= generic_delete_inode
,
411 .evict_inode
= pidfs_evict_inode
,
412 .statfs
= simple_statfs
,
/*
 * 'lsof' has knowledge of our historical anon_inode use, and expects
 * the pidfs dentry name to start with 'anon_inode'.
 *
 * Every pidfs dentry therefore reports the same fixed name.
 */
static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	char *name;

	name = dynamic_dname(buffer, buflen, "anon_inode:[pidfd]");
	return name;
}
424 static const struct dentry_operations pidfs_dentry_operations
= {
425 .d_delete
= always_delete_dentry
,
426 .d_dname
= pidfs_dname
,
427 .d_prune
= stashed_dentry_prune
,
430 static int pidfs_init_inode(struct inode
*inode
, void *data
)
432 inode
->i_private
= data
;
433 inode
->i_flags
|= S_PRIVATE
;
434 inode
->i_mode
|= S_IRWXU
;
435 inode
->i_op
= &pidfs_inode_operations
;
436 inode
->i_fop
= &pidfs_file_operations
;
438 * Inode numbering for pidfs start at RESERVED_PIDS + 1. This
439 * avoids collisions with the root inode which is 1 for pseudo
442 return pidfs_inum(data
, &inode
->i_ino
);
/*
 * pidfs_put_data - stashed_operations hook that releases the stashed data
 * @data: the struct pid that was handed to path_from_stashed()
 *
 * Drops the reference pidfs_alloc_file() took with get_pid().
 */
static void pidfs_put_data(void *data)
{
	struct pid *pid = data;

	put_pid(pid);
}
451 static const struct stashed_operations pidfs_stashed_ops
= {
452 .init_inode
= pidfs_init_inode
,
453 .put_data
= pidfs_put_data
,
456 static int pidfs_init_fs_context(struct fs_context
*fc
)
458 struct pseudo_fs_context
*ctx
;
460 ctx
= init_pseudo(fc
, PID_FS_MAGIC
);
464 ctx
->ops
= &pidfs_sops
;
465 ctx
->dops
= &pidfs_dentry_operations
;
466 fc
->s_fs_info
= (void *)&pidfs_stashed_ops
;
470 static struct file_system_type pidfs_type
= {
472 .init_fs_context
= pidfs_init_fs_context
,
473 .kill_sb
= kill_anon_super
,
476 struct file
*pidfs_alloc_file(struct pid
*pid
, unsigned int flags
)
479 struct file
*pidfd_file
;
483 ret
= path_from_stashed(&pid
->stashed
, pidfs_mnt
, get_pid(pid
), &path
);
487 pidfd_file
= dentry_open(&path
, flags
, current_cred());
492 void __init
pidfs_init(void)
494 pidfs_mnt
= kern_mount(&pidfs_type
);
495 if (IS_ERR(pidfs_mnt
))
496 panic("Failed to mount pidfs pseudo filesystem");