// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	size = struct_size(p, ops, nr_args);
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	/* the caller must pass a fully zeroed probe structure */
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (io_uring_op_supported(i))
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

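/*
 * Illustrative userspace sketch (not part of this file): how the probe filled
 * in above is typically consumed. It documents the contract enforced by
 * io_probe(): the buffer must be fully zeroed on input, and supported opcodes
 * are reported via IO_URING_OP_SUPPORTED. Assumes an io_uring_register()
 * syscall wrapper; error handling is elided.
 *
 *	struct io_uring_probe *p;
 *	size_t len = sizeof(*p) + 256 * sizeof(struct io_uring_probe_op);
 *	int send_supported = 0;
 *
 *	p = calloc(1, len);	// must be zeroed, or the kernel returns -EINVAL
 *	if (io_uring_register(ring_fd, IORING_REGISTER_PROBE, p, 256) == 0 &&
 *	    IORING_OP_SEND <= p->last_op)
 *		send_supported = p->ops[IORING_OP_SEND].flags & IO_URING_OP_SUPPORTED;
 *	free(p);
 */
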
int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}

static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	/* on success the allocated id is returned to userspace */
	return id;
}

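/*
 * Illustrative userspace sketch (not part of this file): the id returned by
 * IORING_REGISTER_PERSONALITY is later stored in sqe->personality so the
 * request runs with the credentials captured above; the same id is passed as
 * 'nr_args' to unregister it. Assumes an io_uring_register() syscall wrapper
 * and an SQE ('sqe') being prepared; error handling is elided.
 *
 *	int id = io_uring_register(ring_fd, IORING_REGISTER_PERSONALITY, NULL, 0);
 *
 *	sqe->personality = id;	// issue this SQE with the registered creds
 *
 *	io_uring_register(ring_fd, IORING_UNREGISTER_PERSONALITY, NULL, id);
 */
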
static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = 0;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].register_op,
				  ctx->restrictions.register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			ret = -EINVAL;
			goto out;
		}
	}

out:
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;

	kfree(res);
	return ret;
}

static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if it was polled before
		 * submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}

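/*
 * Illustrative userspace sketch (not part of this file) of the intended
 * sequence for the two helpers above: create the ring with
 * IORING_SETUP_R_DISABLED, register the allowed operations, then enable the
 * rings. Assumes an io_uring_register() syscall wrapper; the restriction set
 * is arbitrary and error handling is elided.
 *
 *	struct io_uring_restriction res[2] = {
 *		{ .opcode = IORING_RESTRICTION_SQE_OP,
 *		  .sqe_op = IORING_OP_READV },
 *		{ .opcode = IORING_RESTRICTION_REGISTER_OP,
 *		  .register_op = IORING_REGISTER_BUFFERS },
 *	};
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_RESTRICTIONS, res, 2);
 *	io_uring_register(ring_fd, IORING_REGISTER_ENABLE_RINGS, NULL, 0);
 */
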
static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

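/*
 * Illustrative userspace sketch (not part of this file):
 * IORING_REGISTER_IOWQ_AFF takes a raw CPU bitmap and its length in bytes as
 * 'nr_args', which is what io_register_iowq_aff() parses above. Assumes an
 * io_uring_register() syscall wrapper and that CPU 0 is online; error
 * handling is elided.
 *
 *	cpu_set_t mask;
 *
 *	CPU_ZERO(&mask);
 *	CPU_SET(0, &mask);	// pin io-wq workers to CPU 0
 *	io_uring_register(ring_fd, IORING_REGISTER_IOWQ_AFF, &mask, sizeof(mask));
 */
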
static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}

static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			if (sqd->thread)
				tctx = sqd->thread->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		tctx = node->task->io_uring;
		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	return ret;
}

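/*
 * Illustrative userspace sketch (not part of this file):
 * IORING_REGISTER_IOWQ_MAX_WORKERS takes an array of two counts, indexed by
 * IO_WQ_BOUND and IO_WQ_UNBOUND. A zero entry leaves that limit unchanged,
 * and the previous limits are copied back to the same array, matching the
 * copy_to_user() above. Assumes an io_uring_register() syscall wrapper.
 *
 *	unsigned int counts[2] = { 8, 4 };	// bounded, unbounded workers
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_IOWQ_MAX_WORKERS, counts, 2);
 *	// counts[] now holds the limits that were in effect before the call
 */
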
static int io_register_clock(struct io_ring_ctx *ctx,
			     struct io_uring_clock_register __user *arg)
{
	struct io_uring_clock_register reg;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;

	switch (reg.clockid) {
	case CLOCK_MONOTONIC:
		ctx->clock_offset = 0;
		break;
	case CLOCK_BOOTTIME:
		ctx->clock_offset = TK_OFFS_BOOT;
		break;
	default:
		return -EINVAL;
	}

	ctx->clockid = reg.clockid;
	return 0;
}

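/*
 * Illustrative userspace sketch (not part of this file): selecting the clock
 * source the ring uses for its timekeeping (e.g. wait timeouts) via
 * IORING_REGISTER_CLOCK. The reserved fields must stay zero, as checked
 * above. Assumes an io_uring_register() syscall wrapper; error handling is
 * elided.
 *
 *	struct io_uring_clock_register reg = {
 *		.clockid = CLOCK_BOOTTIME,
 *	};
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_CLOCK, &reg, 0);
 */
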
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	case IORING_REGISTER_CLOCK:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_clock(ctx, arg);
		break;
	case IORING_REGISTER_CLONE_BUFFERS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_clone_buffers(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

/*
 * Given an 'fd' value, return the ctx associated with it. If 'registered' is
 * true, then the registered index is used. Otherwise, the normal fd table.
 * Caller must call fput() on the returned file, unless it's an ERR_PTR.
 */
struct file *io_uring_register_get_file(unsigned int fd, bool registered)
{
	struct file *file;

	if (registered) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return ERR_PTR(-EINVAL);
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
	} else {
		file = fget(fd);
	}

	if (unlikely(!file))
		return ERR_PTR(-EBADF);
	if (io_is_uring_fops(file))
		return file;
	fput(file);
	return ERR_PTR(-EOPNOTSUPP);
}

SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	file = io_uring_register_get_file(fd, use_registered_ring);
	if (IS_ERR(file))
		return PTR_ERR(file);
	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);
	mutex_unlock(&ctx->uring_lock);
	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
	if (!use_registered_ring)
		fput(file);
	return ret;
}
