1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/anon_inodes.h>
3 #include <linux/file.h>
5 #include <linux/magic.h>
6 #include <linux/mount.h>
8 #include <linux/pidfs.h>
9 #include <linux/pid_namespace.h>
10 #include <linux/poll.h>
11 #include <linux/proc_fs.h>
12 #include <linux/proc_ns.h>
13 #include <linux/pseudo_fs.h>
14 #include <linux/ptrace.h>
15 #include <linux/seq_file.h>
16 #include <uapi/linux/pidfd.h>
17 #include <linux/ipc_namespace.h>
18 #include <linux/time_namespace.h>
19 #include <linux/utsname.h>
20 #include <net/net_namespace.h>
27 * pidfd_show_fdinfo - print information about a pidfd
28 * @m: proc fdinfo file
29 * @f: file referencing a pidfd
32 * This function will print the pid that a given pidfd refers to in the
33 * pid namespace of the procfs instance.
34 * If the pid namespace of the process is not a descendant of the pid
35 * namespace of the procfs instance 0 will be shown as its pid. This is
36 * similar to calling getppid() on a process whose parent is outside of
 * its pid namespace.
40 * If pid namespaces are supported then this function will also print
41 * the pid a given pidfd refers to for all descendant pid namespaces
42 * starting from the current pid namespace of the instance, i.e. the
43 * Pid field and the first entry in the NSpid field will be identical.
44 * If the pid namespace of the process is not a descendant of the pid
45 * namespace of the procfs instance 0 will be shown as its first NSpid
46 * entry and no others will be shown.
47 * Note that this differs from the Pid and NSpid fields in
48 * /proc/<pid>/status where Pid and NSpid are always shown relative to
49 * the pid namespace of the procfs instance. The difference becomes
50 * obvious when sending around a pidfd between pid namespaces from a
51 * different branch of the tree, i.e. where no ancestral relation is
52 * present between the pid namespaces:
53 * - create two new pid namespaces ns1 and ns2 in the initial pid
54 * namespace (also take care to create new mount namespaces in the
55 * new pid namespace and mount procfs)
56 * - create a process with a pidfd in ns1
57 * - send pidfd from ns1 to ns2
58 * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
59 * have exactly one entry, which is 0
/*
 * fdinfo printer for pidfd files (installed as .show_fdinfo in
 * pidfs_file_operations).
 *
 * Resolves the struct pid behind @f with pidfd_pid(). When pid_has_task()
 * reports a PIDTYPE_PID task, the pid is translated with pid_nr_ns() into
 * the pid namespace of the procfs superblock that backs @m. The result is
 * written twice through seq_put_decimal_ll(): once as "Pid:" and once as
 * the first "NSpid:" entry. The loop then appends pid->numbers[i].nr, tab
 * separated, for every level from ns->level + 1 up to pid->level.
 *
 * NOTE(review): the embedded source line numbers skip (62, 65-66, 70-71,
 * 73-74, ...) and 'nr' and 'i' are used without a visible declaration, so
 * this copy appears to be missing lines, including whatever guards the
 * loop. Confirm against the complete file before relying on this text.
 */
61 static void pidfd_show_fdinfo(struct seq_file
*m
, struct file
*f
)
63 struct pid
*pid
= pidfd_pid(f
);
64 struct pid_namespace
*ns
;
67 if (likely(pid_has_task(pid
, PIDTYPE_PID
))) {
68 ns
= proc_pid_ns(file_inode(m
->file
)->i_sb
);
69 nr
= pid_nr_ns(pid
, ns
);
72 seq_put_decimal_ll(m
, "Pid:\t", nr
);
75 seq_put_decimal_ll(m
, "\nNSpid:\t", nr
);
79 /* If nr is non-zero it means that 'pid' is valid and that
80 * ns, i.e. the pid namespace associated with the procfs
81 * instance, is in the pid namespace hierarchy of pid.
82 * Start at one below the already printed level.
84 for (i
= ns
->level
+ 1; i
<= pid
->level
; i
++)
85 seq_put_decimal_ll(m
, "\t", pid
->numbers
[i
].nr
);
93 * Poll support for process exit notification.
/*
 * Poll routine for pidfd files.
 *
 * Registers the caller on pid->wait_pidfd via poll_wait() and looks the
 * task up with pid_task(pid, PIDTYPE_PID). poll_flags starts out as 0 and
 * two assignments are visible:
 *   - EPOLLIN | EPOLLRDNORM | EPOLLHUP
 *   - EPOLLIN | EPOLLRDNORM, when task->exit_state is set and either the
 *     file carries PIDFD_THREAD or thread_group_empty(task) holds.
 *
 * NOTE(review): the condition guarding the EPOLLHUP assignment (source
 * line 109) and the return statement are absent from this copy.
 * Presumably the first branch is the "no task found" case and poll_flags
 * is returned; confirm against the complete file. Source line 107, which
 * sits right before the pid_task() lookup, is also missing.
 */
95 static __poll_t
pidfd_poll(struct file
*file
, struct poll_table_struct
*pts
)
97 struct pid
*pid
= pidfd_pid(file
);
98 bool thread
= file
->f_flags
& PIDFD_THREAD
;
99 struct task_struct
*task
;
100 __poll_t poll_flags
= 0;
102 poll_wait(file
, &pid
->wait_pidfd
, pts
);
104 * Depending on PIDFD_THREAD, inform pollers when the thread
105 * or the whole thread-group exits.
108 task
= pid_task(pid
, PIDTYPE_PID
);
110 poll_flags
= EPOLLIN
| EPOLLRDNORM
| EPOLLHUP
;
111 else if (task
->exit_state
&& (thread
|| thread_group_empty(task
)))
112 poll_flags
= EPOLLIN
| EPOLLRDNORM
;
/*
 * ioctl handler for pidfd files (installed as .unlocked_ioctl in
 * pidfs_file_operations).
 *
 * Maps each PIDFD_GET_*_NAMESPACE command to the matching namespace of
 * the task behind the pidfd. The task is obtained with get_pid_task().
 * 'task' and 'nsp' are declared with __free(put_task) and
 * __free(put_nsproxy), so presumably their references are dropped
 * automatically on scope exit. Access is gated by
 * ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS).
 *
 * Every case is wrapped in IS_ENABLED() for its config option, except
 * PIDFD_GET_MNT_NAMESPACE which is unconditional. A case takes a
 * reference on the selected namespace and stores it in ns_common, which
 * is finally handed to open_namespace().
 *
 * @file: pidfd file the ioctl was issued on
 * @cmd:  one of the PIDFD_GET_*_NAMESPACE commands
 * @arg:  not referenced in the visible lines
 *
 * NOTE(review): this copy is missing lines (the embedded source line
 * numbers skip). Not visible: the switch statement itself, the break
 * statements, the body of the scoped_guard(task_lock) section that
 * initialises 'nsp', the return after the failed ptrace check, and any
 * handling of an unknown command or a NULL ns_common. Confirm against
 * the complete file.
 */
117 static long pidfd_ioctl(struct file
*file
, unsigned int cmd
, unsigned long arg
)
119 struct task_struct
*task
__free(put_task
) = NULL
;
120 struct nsproxy
*nsp
__free(put_nsproxy
) = NULL
;
121 struct pid
*pid
= pidfd_pid(file
);
122 struct ns_common
*ns_common
= NULL
;
127 task
= get_pid_task(pid
, PIDTYPE_PID
);
131 scoped_guard(task_lock
, task
) {
137 return -ESRCH
; /* just pretend it didn't exist */
140 * We're trying to open a file descriptor to the namespace so perform a
141 * filesystem cred ptrace check. Also, we mirror nsfs behavior.
143 if (!ptrace_may_access(task
, PTRACE_MODE_READ_FSCREDS
))
147 /* Namespaces that hang off nsproxy. */
148 case PIDFD_GET_CGROUP_NAMESPACE
:
149 if (IS_ENABLED(CONFIG_CGROUPS
)) {
150 get_cgroup_ns(nsp
->cgroup_ns
);
151 ns_common
= to_ns_common(nsp
->cgroup_ns
);
154 case PIDFD_GET_IPC_NAMESPACE
:
155 if (IS_ENABLED(CONFIG_IPC_NS
)) {
156 get_ipc_ns(nsp
->ipc_ns
);
157 ns_common
= to_ns_common(nsp
->ipc_ns
);
160 case PIDFD_GET_MNT_NAMESPACE
:
161 get_mnt_ns(nsp
->mnt_ns
);
162 ns_common
= to_ns_common(nsp
->mnt_ns
);
164 case PIDFD_GET_NET_NAMESPACE
:
165 if (IS_ENABLED(CONFIG_NET_NS
)) {
166 ns_common
= to_ns_common(nsp
->net_ns
);
167 get_net_ns(ns_common
);
170 case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE
:
171 if (IS_ENABLED(CONFIG_PID_NS
)) {
172 get_pid_ns(nsp
->pid_ns_for_children
);
173 ns_common
= to_ns_common(nsp
->pid_ns_for_children
);
176 case PIDFD_GET_TIME_NAMESPACE
:
177 if (IS_ENABLED(CONFIG_TIME_NS
)) {
178 get_time_ns(nsp
->time_ns
);
179 ns_common
= to_ns_common(nsp
->time_ns
);
182 case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE
:
183 if (IS_ENABLED(CONFIG_TIME_NS
)) {
184 get_time_ns(nsp
->time_ns_for_children
);
185 ns_common
= to_ns_common(nsp
->time_ns_for_children
);
188 case PIDFD_GET_UTS_NAMESPACE
:
189 if (IS_ENABLED(CONFIG_UTS_NS
)) {
190 get_uts_ns(nsp
->uts_ns
);
191 ns_common
= to_ns_common(nsp
->uts_ns
);
194 /* Namespaces that don't hang off nsproxy. */
195 case PIDFD_GET_USER_NAMESPACE
:
196 if (IS_ENABLED(CONFIG_USER_NS
)) {
198 ns_common
= to_ns_common(get_user_ns(task_cred_xxx(task
, user_ns
)));
202 case PIDFD_GET_PID_NAMESPACE
:
203 if (IS_ENABLED(CONFIG_PID_NS
)) {
205 ns_common
= to_ns_common( get_pid_ns(task_active_pid_ns(task
)));
216 /* open_namespace() unconditionally consumes the reference */
217 return open_namespace(ns_common
);
/*
 * File operations of pidfd files. pidfd_pid() compares a file's f_op
 * against the address of this table to decide whether it is a pidfd, and
 * pidfs_init_inode() installs it as i_fop.
 *
 * NOTE(review): source lines 221 and 224 and the closing of the
 * initializer are not present in this copy. Presumably 224 is the #endif
 * matching the CONFIG_PROC_FS guard; confirm against the complete file.
 */
220 static const struct file_operations pidfs_file_operations
= {
222 #ifdef CONFIG_PROC_FS
223 .show_fdinfo
= pidfd_show_fdinfo
,
225 .unlocked_ioctl
= pidfd_ioctl
,
226 .compat_ioctl
= compat_ptr_ioctl
,
229 struct pid
*pidfd_pid(const struct file
*file
)
231 if (file
->f_op
!= &pidfs_file_operations
)
232 return ERR_PTR(-EBADF
);
233 return file_inode(file
)->i_private
;
/*
 * Mount of the pidfs pseudo filesystem. Assigned from kern_mount() in
 * pidfs_init() and passed to path_from_stashed() in pidfs_alloc_file().
 */
236 static struct vfsmount
*pidfs_mnt __ro_after_init
;
238 #if BITS_PER_LONG == 32
240 * Provide a fallback mechanism for 32-bit systems so processes remain
241 * reliably comparable by inode number even on those systems.
/*
 * ID allocator that pidfs_inum() draws inode numbers from and that
 * pidfs_free_inum() returns them to. It follows the BITS_PER_LONG == 32
 * guard above.
 */
243 static DEFINE_IDA(pidfd_inum_ida
);
/*
 * Allocate an inode number from pidfd_inum_ida, in the range
 * RESERVED_PIDS + 1 .. UINT_MAX, using GFP_ATOMIC.
 *
 * @pid: not referenced in the visible lines
 * @ino: output location, presumably receives the allocated number
 *
 * NOTE(review): the declaration of 'ret', the error check, the store to
 * *ino and the return are absent from this copy (source lines 246-248
 * and 251-256 are skipped). Confirm against the complete file.
 */
245 static int pidfs_inum(struct pid
*pid
, unsigned long *ino
)
249 ret
= ida_alloc_range(&pidfd_inum_ida
, RESERVED_PIDS
+ 1,
250 UINT_MAX
, GFP_ATOMIC
);
/*
 * Return inode number @ino to pidfd_inum_ida with ida_free().
 *
 * NOTE(review): source line 260 is skipped in this copy, so there may be
 * a guard in front of the ida_free() call. Confirm against the complete
 * file.
 */
258 static inline void pidfs_free_inum(unsigned long ino
)
261 ida_free(&pidfd_inum_ida
, ino
);
/*
 * Second definition of pidfs_inum(), with the same parameters as the
 * IDA-based one above.
 *
 * NOTE(review): only the signature survives in this copy. Presumably
 * this is the variant for builds where BITS_PER_LONG != 32 (the #else is
 * not visible); its body must be checked in the complete file.
 */
264 static inline int pidfs_inum(struct pid
*pid
, unsigned long *ino
)
/* Macro form of pidfs_free_inum(): evaluates @ino and discards it. */
269 #define pidfs_free_inum(ino) ((void)(ino))
273 * The vfs falls back to simple_setattr() if i_op->setattr() isn't
274 * implemented. Let's reject it completely until we have a clean
275 * permission concept for pidfds.
/*
 * setattr hook installed in pidfs_inode_operations.
 *
 * NOTE(review): the parameter list is cut off after @dentry and the body
 * is absent from this copy. Going by the comment above, the function is
 * expected to reject the request; confirm against the complete file.
 */
277 static int pidfs_setattr(struct mnt_idmap
*idmap
, struct dentry
*dentry
,
285 * User space expects pidfs inodes to have no file type in st_mode.
287 * In particular, 'lsof' has this legacy logic:
289 * type = s->st_mode & S_IFMT;
293 * if (!strcmp(p, "anon_inode"))
294 * Lf->ntype = Ntype = N_ANON_INODE;
296 * to detect our old anon_inode logic.
298 * Rather than mess with our internal sane inode data, just fix it
299 * up here in getattr() by masking off the format bits.
/*
 * getattr hook installed in pidfs_inode_operations.
 *
 * Fills @stat from the inode behind @path with generic_fillattr(), then
 * clears the S_IFMT bits of stat->mode so the reported mode carries no
 * file type. generic_fillattr() is called with nop_mnt_idmap; @idmap and
 * @query_flags are not referenced in the visible lines.
 *
 * NOTE(review): the return statement is absent from this copy.
 */
301 static int pidfs_getattr(struct mnt_idmap
*idmap
, const struct path
*path
,
302 struct kstat
*stat
, u32 request_mask
,
303 unsigned int query_flags
)
305 struct inode
*inode
= d_inode(path
->dentry
);
307 generic_fillattr(&nop_mnt_idmap
, request_mask
, inode
, stat
);
308 stat
->mode
&= ~S_IFMT
;
/*
 * Inode operations installed on pidfs inodes by pidfs_init_inode():
 * the pidfs-specific getattr and setattr hooks.
 */
312 static const struct inode_operations pidfs_inode_operations
= {
313 .getattr
= pidfs_getattr
,
314 .setattr
= pidfs_setattr
,
/*
 * evict_inode hook installed in pidfs_sops.
 *
 * Reads the struct pid kept in inode->i_private and releases the inode
 * number with pidfs_free_inum().
 *
 * NOTE(review): source lines 320-322 are skipped in this copy and 'pid'
 * is not used in the visible lines, so the statements acting on the
 * inode and on 'pid' are missing here. Confirm against the complete file.
 */
317 static void pidfs_evict_inode(struct inode
*inode
)
319 struct pid
*pid
= inode
->i_private
;
323 pidfs_free_inum(inode
->i_ino
);
/*
 * Superblock operations of pidfs. Handed to the pseudo filesystem
 * context as ctx->ops in pidfs_init_fs_context().
 */
326 static const struct super_operations pidfs_sops
= {
327 .drop_inode
= generic_delete_inode
,
328 .evict_inode
= pidfs_evict_inode
,
329 .statfs
= simple_statfs
,
333 * 'lsof' has knowledge of our historical anon_inode use, and expects
334 * the pidfs dentry name to start with 'anon_inode'.
/*
 * Produce the name reported for a pidfs dentry.
 *
 * Every pidfs dentry gets the fixed string "anon_inode:[pidfd]",
 * formatted by dynamic_dname() using @buffer and @buflen. @dentry itself
 * is not consulted.
 *
 * Return: whatever dynamic_dname() returns.
 */
static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	char *name;

	name = dynamic_dname(buffer, buflen, "anon_inode:[pidfd]");
	return name;
}
/*
 * Dentry operations of pidfs. Handed to the pseudo filesystem context as
 * ctx->dops in pidfs_init_fs_context(). d_dname supplies the fixed
 * "anon_inode:[pidfd]" name via pidfs_dname().
 */
341 static const struct dentry_operations pidfs_dentry_operations
= {
342 .d_delete
= always_delete_dentry
,
343 .d_dname
= pidfs_dname
,
344 .d_prune
= stashed_dentry_prune
,
347 static int pidfs_init_inode(struct inode
*inode
, void *data
)
349 inode
->i_private
= data
;
350 inode
->i_flags
|= S_PRIVATE
;
351 inode
->i_mode
|= S_IRWXU
;
352 inode
->i_op
= &pidfs_inode_operations
;
353 inode
->i_fop
= &pidfs_file_operations
;
355 * Inode numbering for pidfs start at RESERVED_PIDS + 1. This
356 * avoids collisions with the root inode which is 1 for pseudo
359 return pidfs_inum(data
, &inode
->i_ino
);
/*
 * put_data hook of pidfs_stashed_ops; @data is treated as a struct pid.
 *
 * NOTE(review): the statement that acts on 'pid' is absent from this
 * copy (source lines 365-367 are skipped). Presumably it drops a pid
 * reference; confirm against the complete file.
 */
362 static void pidfs_put_data(void *data
)
364 struct pid
*pid
= data
;
/*
 * Callbacks for pidfs stashed dentries. pidfs_init_fs_context() stores a
 * pointer to this table in fc->s_fs_info.
 */
368 static const struct stashed_operations pidfs_stashed_ops
= {
369 .init_inode
= pidfs_init_inode
,
370 .put_data
= pidfs_put_data
,
/*
 * init_fs_context hook of pidfs_type.
 *
 * Creates the pseudo filesystem context with init_pseudo() using
 * PID_FS_MAGIC, installs pidfs_sops and pidfs_dentry_operations on it,
 * and stores pidfs_stashed_ops in fc->s_fs_info.
 *
 * NOTE(review): the check of 'ctx' after init_pseudo() and the return
 * statements are absent from this copy (source lines 378-380 and 384-385
 * are skipped). Confirm against the complete file.
 */
373 static int pidfs_init_fs_context(struct fs_context
*fc
)
375 struct pseudo_fs_context
*ctx
;
377 ctx
= init_pseudo(fc
, PID_FS_MAGIC
);
381 ctx
->ops
= &pidfs_sops
;
382 ctx
->dops
= &pidfs_dentry_operations
;
383 fc
->s_fs_info
= (void *)&pidfs_stashed_ops
;
/*
 * Filesystem type of pidfs; mounted with kern_mount() in pidfs_init().
 *
 * NOTE(review): source line 388 and the closing of the initializer are
 * not present in this copy.
 */
387 static struct file_system_type pidfs_type
= {
389 .init_fs_context
= pidfs_init_fs_context
,
390 .kill_sb
= kill_anon_super
,
/*
 * Open a pidfd file for @pid.
 *
 * Obtains a path on the pidfs mount with path_from_stashed(), passing
 * get_pid(pid) as the payload. That path is then opened with
 * dentry_open() using @flags and the current credentials.
 *
 * NOTE(review): the declarations of 'path' and 'ret', the handling of a
 * path_from_stashed() failure, any release of 'path' and the return
 * statement are absent from this copy. Confirm against the complete file.
 */
393 struct file
*pidfs_alloc_file(struct pid
*pid
, unsigned int flags
)
396 struct file
*pidfd_file
;
400 ret
= path_from_stashed(&pid
->stashed
, pidfs_mnt
, get_pid(pid
), &path
);
404 pidfd_file
= dentry_open(&path
, flags
, current_cred());
/*
 * Init-time setup of pidfs: mounts pidfs_type with kern_mount() and
 * keeps the result in pidfs_mnt. Panics if kern_mount() returned an
 * error pointer.
 */
409 void __init
pidfs_init(void)
411 pidfs_mnt
= kern_mount(&pidfs_type
);
412 if (IS_ERR(pidfs_mnt
))
413 panic("Failed to mount pidfs pseudo filesystem");