// SPDX-License-Identifier: GPL-2.0
#include <linux/fanotify.h>
#include <linux/fdtable.h>
#include <linux/fsnotify_backend.h>
#include <linux/init.h>
#include <linux/jiffies.h>
#include <linux/kernel.h> /* UINT_MAX */
#include <linux/mount.h>
#include <linux/sched.h>
#include <linux/sched/user.h>
#include <linux/sched/signal.h>
#include <linux/types.h>
#include <linux/wait.h>
#include <linux/audit.h>
#include <linux/sched/mm.h>
#include <linux/statfs.h>

#include "fanotify.h"

static bool should_merge(struct fsnotify_event *old_fsn,
			 struct fsnotify_event *new_fsn)
{
	struct fanotify_event *old, *new;

	pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn);
	old = FANOTIFY_E(old_fsn);
	new = FANOTIFY_E(new_fsn);

	if (old_fsn->objectid != new_fsn->objectid || old->pid != new->pid ||
	    old->fh_type != new->fh_type || old->fh_len != new->fh_len)
		return false;
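
	/* Same object and task from here on; match on path or on fid below */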
	if (fanotify_event_has_path(old)) {
		return old->path.mnt == new->path.mnt &&
			old->path.dentry == new->path.dentry;
	} else if (fanotify_event_has_fid(old)) {
		/*
		 * We want to merge many dirent events in the same dir (i.e.
		 * creates/unlinks/renames), but we do not want to merge dirent
		 * events referring to subdirs with dirent events referring to
		 * non subdirs, otherwise, user won't be able to tell from a
		 * mask FAN_CREATE|FAN_DELETE|FAN_ONDIR if it describes mkdir+
		 * unlink pair or rmdir+create pair of events.
		 */
		return (old->mask & FS_ISDIR) == (new->mask & FS_ISDIR) &&
			fanotify_fid_equal(&old->fid, &new->fid, old->fh_len);
	}

	/* Do not merge events if we failed to encode fid */
	return false;
}
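
/*
 * Try to fold a new event into one already on the notification list. The
 * list is scanned newest-first; returns 1 when the event was merged into
 * an existing entry and 0 when it should be queued in its own right.
 */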
/* and the list better be locked by something too! */
static int fanotify_merge(struct list_head *list, struct fsnotify_event *event)
{
	struct fsnotify_event *test_event;
	struct fanotify_event *new;

	pr_debug("%s: list=%p event=%p\n", __func__, list, event);
	new = FANOTIFY_E(event);

	/*
	 * Don't merge a permission event with any other event so that we know
	 * the event structure we have created in fanotify_handle_event() is the
	 * one we should check for permission response.
	 */
	if (fanotify_is_perm_event(new->mask))
		return 0;

	list_for_each_entry_reverse(test_event, list, list) {
		if (should_merge(test_event, event)) {
			/* Fold the new event's mask into the existing entry */
			FANOTIFY_E(test_event)->mask |= new->mask;
			return 1;
		}
	}

	return 0;
}

/*
 * Wait for response to permission event. The function also takes care of
 * freeing the permission event (or offloads that in case the wait is canceled
 * by a signal). The function returns 0 in case access got allowed by userspace,
 * -EPERM in case userspace disallowed the access, and -ERESTARTSYS in case
 * the wait got interrupted by a signal.
 */
static int fanotify_get_response(struct fsnotify_group *group,
				 struct fanotify_perm_event *event,
				 struct fsnotify_iter_info *iter_info)
{
	int ret;

	pr_debug("%s: group=%p event=%p\n", __func__, group, event);

	ret = wait_event_killable(group->fanotify_data.access_waitq,
				  event->state == FAN_EVENT_ANSWERED);
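
	/*
	 * event->state moves FAN_EVENT_INIT -> FAN_EVENT_REPORTED (read by
	 * userspace) -> FAN_EVENT_ANSWERED (response written back); the
	 * branches below sort out each stage when the wait is interrupted.
	 */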
	/* Signal pending? */
	if (ret < 0) {
		spin_lock(&group->notification_lock);
		/* Event reported to userspace and no answer yet? */
		if (event->state == FAN_EVENT_REPORTED) {
			/* Event will get freed once userspace answers to it */
			event->state = FAN_EVENT_CANCELED;
			spin_unlock(&group->notification_lock);
			return ret;
		}
		/* Event not yet reported? Just remove it. */
		if (event->state == FAN_EVENT_INIT)
			fsnotify_remove_queued_event(group, &event->fae.fse);
		/*
		 * Event may be also answered in case signal delivery raced
		 * with wakeup. In that case we have nothing to do besides
		 * freeing the event and reporting error.
		 */
		spin_unlock(&group->notification_lock);
		goto out;
	}

	/* userspace responded, convert to something usable */
	switch (event->response & ~FAN_AUDIT) {
	case FAN_ALLOW:
		ret = 0;
		break;
	case FAN_DENY:
	default:
		ret = -EPERM;
	}

	/* Check if the response should be audited */
	if (event->response & FAN_AUDIT)
		audit_fanotify(event->response & ~FAN_AUDIT);

	pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__,
		 group, event, ret);
out:
	fsnotify_destroy_event(group, &event->fae.fse);

	return ret;
}
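
/*
 * For context: userspace answers a permission event by writing a
 * struct fanotify_response { __s32 fd; __u32 response; } to the fanotify
 * file descriptor, with response set to FAN_ALLOW or FAN_DENY, optionally
 * OR-ed with FAN_AUDIT; that write is what fills in event->response above.
 */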
/*
 * This function returns a mask for an event that only contains the flags
 * that have been specifically requested by the user. Flags that may have
 * been included within the event mask, but have not been explicitly
 * requested by the user, will not be present in the returned mask.
 */
static u32 fanotify_group_event_mask(struct fsnotify_group *group,
				     struct fsnotify_iter_info *iter_info,
				     u32 event_mask, const void *data,
				     int data_type)
{
	__u32 marks_mask = 0, marks_ignored_mask = 0;
	__u32 test_mask, user_mask = FANOTIFY_OUTGOING_EVENTS;
	const struct path *path = data;
	struct fsnotify_mark *mark;
	int type;

	pr_debug("%s: report_mask=%x mask=%x data=%p data_type=%d\n",
		 __func__, iter_info->report_mask, event_mask, data, data_type);

	if (!FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
		/* Do we have path to open a file descriptor? */
		if (data_type != FSNOTIFY_EVENT_PATH)
			return 0;
		/* Path type events are only relevant for files and dirs */
		if (!d_is_reg(path->dentry) && !d_can_lookup(path->dentry))
			return 0;
	}

	fsnotify_foreach_obj_type(type) {
		if (!fsnotify_iter_should_report_type(iter_info, type))
			continue;
		mark = iter_info->marks[type];
		/*
		 * If the event is for a child and this mark doesn't care about
		 * events on a child, don't send it!
		 */
		if (event_mask & FS_EVENT_ON_CHILD &&
		    (type != FSNOTIFY_OBJ_TYPE_INODE ||
		     !(mark->mask & FS_EVENT_ON_CHILD)))
			continue;

		marks_mask |= mark->mask;
		marks_ignored_mask |= mark->ignored_mask;
	}

	test_mask = event_mask & marks_mask & ~marks_ignored_mask;
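	/* test_mask now has only bits some mark requested and none ignored */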

	/*
	 * dirent modification events (create/delete/move) do not carry the
	 * child entry name/inode information. Instead, we report FAN_ONDIR
	 * for mkdir/rmdir so user can differentiate them from creat/unlink.
	 *
	 * For backward compatibility and consistency, do not report FAN_ONDIR
	 * to user in legacy fanotify mode (reporting fd) and report FAN_ONDIR
	 * to user in FAN_REPORT_FID mode for all event types.
	 */
	if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
		/* Do not report FAN_ONDIR without any event */
		if (!(test_mask & ~FAN_ONDIR))
			return 0;
	} else {
		user_mask &= ~FAN_ONDIR;
	}

	if (event_mask & FS_ISDIR &&
	    !(marks_mask & FS_ISDIR & ~marks_ignored_mask))
		return 0;

	return test_mask & user_mask;
}
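
/*
 * Encode an exportfs file handle for the inode: inline in the event when it
 * fits in FANOTIFY_INLINE_FH_LEN bytes, in a kmalloc'ed buffer otherwise.
 * Returns the file handle type, or FILEID_INVALID on failure.
 */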
static int fanotify_encode_fid(struct fanotify_event *event,
			       struct inode *inode, gfp_t gfp,
			       __kernel_fsid_t *fsid)
{
	struct fanotify_fid *fid = &event->fid;
	int dwords, bytes = 0;
	int err, type;

	fid->ext_fh = NULL;
	dwords = 0;
	err = -ENOENT;
	/* First pass: query the required size in 32-bit words */
	type = exportfs_encode_inode_fh(inode, NULL, &dwords, NULL);
	if (!dwords)
		goto out_err;

	bytes = dwords << 2;
	if (bytes > FANOTIFY_INLINE_FH_LEN) {
		/* Treat failure to allocate fh as failure to allocate event */
		err = -ENOMEM;
		fid->ext_fh = kmalloc(bytes, gfp);
		if (!fid->ext_fh)
			goto out_err;
	}

	type = exportfs_encode_inode_fh(inode, fanotify_fid_fh(fid, bytes),
					&dwords, NULL);
	err = -EINVAL;
	if (!type || type == FILEID_INVALID || bytes != dwords << 2)
		goto out_err;

	fid->fsid = *fsid;
	event->fh_len = bytes;

	return type;

out_err:
	pr_warn_ratelimited("fanotify: failed to encode fid (fsid=%x.%x, "
			    "type=%d, bytes=%d, err=%i)\n",
			    fsid->val[0], fsid->val[1], type, bytes, err);
	kfree(fid->ext_fh);
	fid->ext_fh = NULL;
	event->fh_len = 0;

	return FILEID_INVALID;
}

/*
 * The inode to use as identifier when reporting fid depends on the event.
 * Report the modified directory inode on dirent modification events.
 * Report the "victim" inode otherwise.
 * For example:
 * FS_ATTRIB reports the child inode even if reported on a watched parent.
 * FS_CREATE reports the modified dir inode and not the created inode.
 */
static struct inode *fanotify_fid_inode(struct inode *to_tell, u32 event_mask,
					const void *data, int data_type)
{
	if (event_mask & ALL_FSNOTIFY_DIRENT_EVENTS)
		return to_tell;
	else if (data_type == FSNOTIFY_EVENT_INODE)
		return (struct inode *)data;
	else if (data_type == FSNOTIFY_EVENT_PATH)
		return d_inode(((struct path *)data)->dentry);
	return NULL;
}
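
/*
 * Allocate and initialize an event. Permission events come from their own
 * cache and carry response state; all other events share
 * fanotify_event_cachep.
 */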
struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
					    struct inode *inode, u32 mask,
					    const void *data, int data_type,
					    __kernel_fsid_t *fsid)
{
	struct fanotify_event *event = NULL;
	gfp_t gfp = GFP_KERNEL_ACCOUNT;
	struct inode *id = fanotify_fid_inode(inode, mask, data, data_type);

	/*
	 * For queues with unlimited length lost events are not expected and
	 * can possibly have security implications. Avoid losing events when
	 * memory is short. For the limited size queues, avoid OOM killer in the
	 * target monitoring memcg as it may have security repercussion.
	 */
	if (group->max_events == UINT_MAX)
		gfp |= __GFP_NOFAIL;
	else
		gfp |= __GFP_RETRY_MAYFAIL;

	/* Whoever is interested in the event, pays for the allocation. */
	memalloc_use_memcg(group->memcg);

	if (fanotify_is_perm_event(mask)) {
		struct fanotify_perm_event *pevent;

		pevent = kmem_cache_alloc(fanotify_perm_event_cachep, gfp);
		if (!pevent)
			goto out;
		event = &pevent->fae;
		pevent->response = 0;
		pevent->state = FAN_EVENT_INIT;
		goto init;
	}
	event = kmem_cache_alloc(fanotify_event_cachep, gfp);
	if (!event)
		goto out;
init: __maybe_unused
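	/*
	 * Both allocation paths join here; the __maybe_unused annotation
	 * (presumably) keeps the compiler from warning about the label in
	 * configurations where the permission-event goto is compiled away.
	 */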
	/*
	 * Use the victim inode instead of the watching inode as the id for
	 * event queue, so event reported on parent is merged with event
	 * reported on child when both directory and child watches exist.
	 */
	fsnotify_init_event(&event->fse, (unsigned long)id);
	event->mask = mask;
	if (FAN_GROUP_FLAG(group, FAN_REPORT_TID))
		event->pid = get_pid(task_pid(current));
	else
		event->pid = get_pid(task_tgid(current));
	event->fh_len = 0;
	if (id && FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
		/* Report the event without a file identifier on encode error */
		event->fh_type = fanotify_encode_fid(event, id, gfp, fsid);
	} else if (data_type == FSNOTIFY_EVENT_PATH) {
		event->fh_type = FILEID_ROOT;
		event->path = *((struct path *)data);
		path_get(&event->path);
	} else {
		event->fh_type = FILEID_INVALID;
		event->path.mnt = NULL;
		event->path.dentry = NULL;
	}
out:
	memalloc_unuse_memcg();
	return event;
}

/*
 * Get cached fsid of the filesystem containing the object from any connector.
 * All connectors are supposed to have the same fsid, but we do not verify that
 * here.
 */
static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info)
{
	int type;
	__kernel_fsid_t fsid = {};

	fsnotify_foreach_obj_type(type) {
		struct fsnotify_mark_connector *conn;

		if (!fsnotify_iter_should_report_type(iter_info, type))
			continue;

		conn = READ_ONCE(iter_info->marks[type]->connector);
		/* Mark is just getting destroyed or created? */
		if (!conn)
			continue;
		if (!(conn->flags & FSNOTIFY_CONN_FLAG_HAS_FSID))
			continue;
		/* Pairs with smp_wmb() in fsnotify_add_mark_list() */
		smp_rmb();
		fsid = conn->fsid;
		if (WARN_ON_ONCE(!fsid.val[0] && !fsid.val[1]))
			continue;
		return fsid;
	}

	return fsid;
}

static int fanotify_handle_event(struct fsnotify_group *group,
				 struct inode *inode,
				 u32 mask, const void *data, int data_type,
				 const struct qstr *file_name, u32 cookie,
				 struct fsnotify_iter_info *iter_info)
{
	int ret = 0;
	struct fanotify_event *event;
	struct fsnotify_event *fsn_event;
	__kernel_fsid_t fsid = {};

	BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
	BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
	BUILD_BUG_ON(FAN_ATTRIB != FS_ATTRIB);
	BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
	BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE);
	BUILD_BUG_ON(FAN_OPEN != FS_OPEN);
	BUILD_BUG_ON(FAN_MOVED_TO != FS_MOVED_TO);
	BUILD_BUG_ON(FAN_MOVED_FROM != FS_MOVED_FROM);
	BUILD_BUG_ON(FAN_CREATE != FS_CREATE);
	BUILD_BUG_ON(FAN_DELETE != FS_DELETE);
	BUILD_BUG_ON(FAN_DELETE_SELF != FS_DELETE_SELF);
	BUILD_BUG_ON(FAN_MOVE_SELF != FS_MOVE_SELF);
	BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD);
	BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW);
	BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM);
	BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM);
	BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR);
	BUILD_BUG_ON(FAN_OPEN_EXEC != FS_OPEN_EXEC);
	BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM);

	BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 19);
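
	/*
	 * The checks above pin the FAN_* uapi bits to the internal FS_* bits,
	 * which is what lets the code pass masks through without translation.
	 */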
	mask = fanotify_group_event_mask(group, iter_info, mask, data,
					 data_type);
	if (!mask)
		return 0;

	pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode,
		 mask);

	if (fanotify_is_perm_event(mask)) {
		/*
		 * fsnotify_prepare_user_wait() fails if we race with mark
		 * deletion. Just let the operation pass in that case.
		 */
		if (!fsnotify_prepare_user_wait(iter_info))
			return 0;
	}

	if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
		fsid = fanotify_get_fsid(iter_info);
		/* Racing with mark destruction or creation? */
		if (!fsid.val[0] && !fsid.val[1])
			return 0;
	}

	event = fanotify_alloc_event(group, inode, mask, data, data_type,
				     &fsid);
	ret = -ENOMEM;
	if (unlikely(!event)) {
		/*
		 * We don't queue overflow events for permission events as
		 * there the access is denied and so no event is in fact lost.
		 */
		if (!fanotify_is_perm_event(mask))
			fsnotify_queue_overflow(group);
		goto finish;
	}

	fsn_event = &event->fse;
	ret = fsnotify_add_event(group, fsn_event, fanotify_merge);
	if (ret) {
		/* Permission events shouldn't be merged */
		BUG_ON(ret == 1 && mask & FANOTIFY_PERM_EVENTS);
		/* Our event wasn't used in the end. Free it. */
		fsnotify_destroy_event(group, fsn_event);

		ret = 0;
	} else if (fanotify_is_perm_event(mask)) {
		ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event),
					    iter_info);
	}
finish:
	if (fanotify_is_perm_event(mask))
		fsnotify_finish_user_wait(iter_info);

	return ret;
}

static void fanotify_free_group_priv(struct fsnotify_group *group)
{
	struct user_struct *user;

	user = group->fanotify_data.user;
	atomic_dec(&user->fanotify_listeners);
	free_uid(user);
}

static void fanotify_free_event(struct fsnotify_event *fsn_event)
{
	struct fanotify_event *event;

	event = FANOTIFY_E(fsn_event);
	if (fanotify_event_has_path(event))
		path_put(&event->path);
	else if (fanotify_event_has_ext_fh(event))
		kfree(event->fid.ext_fh);
	put_pid(event->pid);
	if (fanotify_is_perm_event(event->mask)) {
		kmem_cache_free(fanotify_perm_event_cachep,
				FANOTIFY_PE(fsn_event));
		return;
	}
	kmem_cache_free(fanotify_event_cachep, event);
}

static void fanotify_free_mark(struct fsnotify_mark *fsn_mark)
{
	kmem_cache_free(fanotify_mark_cache, fsn_mark);
}

const struct fsnotify_ops fanotify_fsnotify_ops = {
	.handle_event = fanotify_handle_event,
	.free_group_priv = fanotify_free_group_priv,
	.free_event = fanotify_free_event,
	.free_mark = fanotify_free_mark,
};
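
/*
 * Rough userspace counterpart to these ops (a sketch, not from this file):
 *
 *	int fd = fanotify_init(FAN_CLASS_CONTENT, O_RDONLY);
 *	fanotify_mark(fd, FAN_MARK_ADD, FAN_OPEN_PERM, AT_FDCWD, "/some/dir");
 *	// read(fd, ...) fanotify_event_metadata records, then write a
 *	// struct fanotify_response with FAN_ALLOW or FAN_DENY per event.
 */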