// SPDX-License-Identifier: GPL-2.0-only
/*
 * Helpers for the host side of a virtio ring.
 *
 * Since these may be in userspace, we use (inline) accessors.
 */
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/vringh.h>
#include <linux/virtio_ring.h>
#include <linux/kernel.h>
#include <linux/ratelimit.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/export.h>
#if IS_REACHABLE(CONFIG_VHOST_IOTLB)
#include <linux/bvec.h>
#include <linux/highmem.h>
#include <linux/vhost_iotlb.h>
#endif
#include <uapi/linux/virtio_config.h>
static __printf(1,2) __cold void vringh_bad(const char *fmt, ...)
{
	static DEFINE_RATELIMIT_STATE(vringh_rs,
				      DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);
	if (__ratelimit(&vringh_rs)) {
		va_list ap;
		va_start(ap, fmt);
		printk(KERN_NOTICE "vringh:");
		vprintk(fmt, ap);
		va_end(ap);
	}
}
/* Returns vring->num if empty, -ve on error. */
static inline int __vringh_get_head(const struct vringh *vrh,
				    int (*getu16)(const struct vringh *vrh,
						  u16 *val, const __virtio16 *p),
				    u16 *last_avail_idx)
{
	u16 avail_idx, i, head;
	int err;

	err = getu16(vrh, &avail_idx, &vrh->vring.avail->idx);
	if (err) {
		vringh_bad("Failed to access avail idx at %p",
			   &vrh->vring.avail->idx);
		return err;
	}

	if (*last_avail_idx == avail_idx)
		return vrh->vring.num;

	/* Only get avail ring entries after they have been exposed by guest. */
	virtio_rmb(vrh->weak_barriers);

	i = *last_avail_idx & (vrh->vring.num - 1);

	err = getu16(vrh, &head, &vrh->vring.avail->ring[i]);
	if (err) {
		vringh_bad("Failed to read head: idx %d address %p",
			   *last_avail_idx, &vrh->vring.avail->ring[i]);
		return err;
	}

	if (head >= vrh->vring.num) {
		vringh_bad("Guest says index %u > %u is available",
			   head, vrh->vring.num);
		return -EINVAL;
	}

	(*last_avail_idx)++;

	return head;
}
/**
 * vringh_kiov_advance - skip bytes from vring_kiov
 * @iov: an iov passed to vringh_getdesc_*() (updated as we consume)
 * @len: the maximum length to advance
 */
void vringh_kiov_advance(struct vringh_kiov *iov, size_t len)
{
	while (len && iov->i < iov->used) {
		size_t partlen = min(iov->iov[iov->i].iov_len, len);

		iov->consumed += partlen;
		iov->iov[iov->i].iov_len -= partlen;
		iov->iov[iov->i].iov_base += partlen;

		if (!iov->iov[iov->i].iov_len) {
			/* Fix up old iov element then increment. */
			iov->iov[iov->i].iov_len = iov->consumed;
			iov->iov[iov->i].iov_base -= iov->consumed;

			iov->consumed = 0;
			iov->i++;
		}

		len -= partlen;
	}
}
EXPORT_SYMBOL(vringh_kiov_advance);
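
/*
 * Minimal usage sketch for vringh_kiov_advance(): a consumer that has already
 * parsed a fixed-size header out of a readable chain can skip those bytes
 * before copying the payload.  The example_* name and the hdr_len/buf
 * parameters are hypothetical, not part of the vringh API.
 */
static inline ssize_t example_pull_payload(struct vringh_kiov *riov,
					   size_t hdr_len,
					   void *buf, size_t len)
{
	/* Skip the header bytes that were consumed elsewhere... */
	vringh_kiov_advance(riov, hdr_len);

	/* ...then copy the remaining payload into 'buf'. */
	return vringh_iov_pull_kern(riov, buf, len);
}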
/* Copy some bytes to/from the iovec.  Returns num copied. */
static inline ssize_t vringh_iov_xfer(struct vringh *vrh,
				      struct vringh_kiov *iov,
				      void *ptr, size_t len,
				      int (*xfer)(const struct vringh *vrh,
						  void *addr, void *ptr,
						  size_t len))
{
	int err, done = 0;

	while (len && iov->i < iov->used) {
		size_t partlen;

		partlen = min(iov->iov[iov->i].iov_len, len);
		err = xfer(vrh, iov->iov[iov->i].iov_base, ptr, partlen);
		if (err)
			return err;
		done += partlen;
		len -= partlen;
		ptr += partlen;
		iov->consumed += partlen;
		iov->iov[iov->i].iov_len -= partlen;
		iov->iov[iov->i].iov_base += partlen;

		if (!iov->iov[iov->i].iov_len) {
			/* Fix up old iov element then increment. */
			iov->iov[iov->i].iov_len = iov->consumed;
			iov->iov[iov->i].iov_base -= iov->consumed;

			iov->consumed = 0;
			iov->i++;
		}
	}
	return done;
}
/* May reduce *len if range is shorter. */
static inline bool range_check(struct vringh *vrh, u64 addr, size_t *len,
			       struct vringh_range *range,
			       bool (*getrange)(struct vringh *,
						u64, struct vringh_range *))
{
	if (addr < range->start || addr > range->end_incl) {
		if (!getrange(vrh, addr, range))
			return false;
	}
	BUG_ON(addr < range->start || addr > range->end_incl);

	/* To end of memory? */
	if (unlikely(addr + *len == 0)) {
		if (range->end_incl == -1ULL)
			return true;
		goto truncate;
	}

	/* Otherwise, don't wrap. */
	if (addr + *len < addr) {
		vringh_bad("Wrapping descriptor %zu@0x%llx",
			   *len, (unsigned long long)addr);
		return false;
	}

	if (unlikely(addr + *len - 1 > range->end_incl))
		goto truncate;

	return true;

truncate:
	*len = range->end_incl + 1 - addr;
	return true;
}
static inline bool no_range_check(struct vringh *vrh, u64 addr, size_t *len,
				  struct vringh_range *range,
				  bool (*getrange)(struct vringh *,
						   u64, struct vringh_range *))
{
	return true;
}
/* No reason for this code to be inline. */
static int move_to_indirect(const struct vringh *vrh,
			    int *up_next, u16 *i, void *addr,
			    const struct vring_desc *desc,
			    struct vring_desc **descs, int *desc_max)
{
	u32 len;

	/* Indirect tables can't have indirect. */
	if (*up_next != -1) {
		vringh_bad("Multilevel indirect %u->%u", *up_next, *i);
		return -EINVAL;
	}

	len = vringh32_to_cpu(vrh, desc->len);
	if (unlikely(len % sizeof(struct vring_desc))) {
		vringh_bad("Strange indirect len %u", desc->len);
		return -EINVAL;
	}

	/* We will check this when we follow it! */
	if (desc->flags & cpu_to_vringh16(vrh, VRING_DESC_F_NEXT))
		*up_next = vringh16_to_cpu(vrh, desc->next);
	else
		*up_next = -2;
	*descs = addr;
	*desc_max = len / sizeof(struct vring_desc);

	/* Now, start at the first indirect. */
	*i = 0;
	return 0;
}
static int resize_iovec(struct vringh_kiov *iov, gfp_t gfp)
{
	struct kvec *new;
	unsigned int flag, new_num = (iov->max_num & ~VRINGH_IOV_ALLOCATED) * 2;

	if (new_num < 8)
		new_num = 8;

	flag = (iov->max_num & VRINGH_IOV_ALLOCATED);
	if (flag)
		new = krealloc_array(iov->iov, new_num,
				     sizeof(struct iovec), gfp);
	else {
		new = kmalloc_array(new_num, sizeof(struct iovec), gfp);
		if (new) {
			memcpy(new, iov->iov,
			       iov->max_num * sizeof(struct iovec));
			flag = VRINGH_IOV_ALLOCATED;
		}
	}
	if (!new)
		return -ENOMEM;
	iov->iov = new;
	iov->max_num = (new_num | flag);
	return 0;
}
static u16 __cold return_from_indirect(const struct vringh *vrh, int *up_next,
				       struct vring_desc **descs, int *desc_max)
{
	u16 i = *up_next;

	*up_next = -1;
	*descs = vrh->vring.desc;
	*desc_max = vrh->vring.num;
	return i;
}
static int slow_copy(struct vringh *vrh, void *dst, const void *src,
		     bool (*rcheck)(struct vringh *vrh, u64 addr, size_t *len,
				    struct vringh_range *range,
				    bool (*getrange)(struct vringh *vrh,
						     u64,
						     struct vringh_range *)),
		     bool (*getrange)(struct vringh *vrh,
				      u64 addr,
				      struct vringh_range *r),
		     struct vringh_range *range,
		     int (*copy)(const struct vringh *vrh,
				 void *dst, const void *src, size_t len))
{
	size_t part, len = sizeof(struct vring_desc);

	do {
		u64 addr;
		int err;

		part = len;
		addr = (u64)(unsigned long)src - range->offset;

		if (!rcheck(vrh, addr, &part, range, getrange))
			return -EINVAL;

		err = copy(vrh, dst, src, part);
		if (err)
			return err;

		dst += part;
		src += part;
		len -= part;
	} while (len);
	return 0;
}
static inline int
__vringh_iov(struct vringh *vrh, u16 i,
	     struct vringh_kiov *riov,
	     struct vringh_kiov *wiov,
	     bool (*rcheck)(struct vringh *vrh, u64 addr, size_t *len,
			    struct vringh_range *range,
			    bool (*getrange)(struct vringh *, u64,
					     struct vringh_range *)),
	     bool (*getrange)(struct vringh *, u64, struct vringh_range *),
	     gfp_t gfp,
	     int (*copy)(const struct vringh *vrh,
			 void *dst, const void *src, size_t len))
{
	int err, count = 0, indirect_count = 0, up_next, desc_max;
	struct vring_desc desc, *descs;
	struct vringh_range range = { -1ULL, 0 }, slowrange;
	bool slow = false;

	/* We start traversing vring's descriptor table. */
	descs = vrh->vring.desc;
	desc_max = vrh->vring.num;
	up_next = -1;

	/* You must want something! */
	if (WARN_ON(!riov && !wiov))
		return -EINVAL;

	if (riov)
		riov->i = riov->used = riov->consumed = 0;
	if (wiov)
		wiov->i = wiov->used = wiov->consumed = 0;

	for (;;) {
		void *addr;
		struct vringh_kiov *iov;
		size_t len;

		if (unlikely(slow))
			err = slow_copy(vrh, &desc, &descs[i], rcheck, getrange,
					&slowrange, copy);
		else
			err = copy(vrh, &desc, &descs[i], sizeof(desc));
		if (unlikely(err))
			goto fail;

		if (unlikely(desc.flags &
			     cpu_to_vringh16(vrh, VRING_DESC_F_INDIRECT))) {
			u64 a = vringh64_to_cpu(vrh, desc.addr);

			/* Make sure it's OK, and get offset. */
			len = vringh32_to_cpu(vrh, desc.len);
			if (!rcheck(vrh, a, &len, &range, getrange)) {
				err = -EINVAL;
				goto fail;
			}

			if (unlikely(len != vringh32_to_cpu(vrh, desc.len))) {
				slow = true;
				/* We need to save this range to use offset */
				slowrange = range;
			}

			addr = (void *)(long)(a + range.offset);
			err = move_to_indirect(vrh, &up_next, &i, addr, &desc,
					       &descs, &desc_max);
			if (err)
				goto fail;
			continue;
		}

		if (up_next == -1)
			count++;
		else
			indirect_count++;

		if (count > vrh->vring.num || indirect_count > desc_max) {
			vringh_bad("Descriptor loop in %p", descs);
			err = -ELOOP;
			goto fail;
		}

		if (desc.flags & cpu_to_vringh16(vrh, VRING_DESC_F_WRITE))
			iov = wiov;
		else {
			iov = riov;
			if (unlikely(wiov && wiov->used)) {
				vringh_bad("Readable desc %p after writable",
					   &descs[i]);
				err = -EINVAL;
				goto fail;
			}
		}

		if (!iov) {
			vringh_bad("Unexpected %s desc",
				   !wiov ? "writable" : "readable");
			err = -EPROTO;
			goto fail;
		}

	again:
		/* Make sure it's OK, and get offset. */
		len = vringh32_to_cpu(vrh, desc.len);
		if (!rcheck(vrh, vringh64_to_cpu(vrh, desc.addr), &len, &range,
			    getrange)) {
			err = -EINVAL;
			goto fail;
		}
		addr = (void *)(unsigned long)(vringh64_to_cpu(vrh, desc.addr) +
					       range.offset);

		if (unlikely(iov->used == (iov->max_num & ~VRINGH_IOV_ALLOCATED))) {
			err = resize_iovec(iov, gfp);
			if (err)
				goto fail;
		}

		iov->iov[iov->used].iov_base = addr;
		iov->iov[iov->used].iov_len = len;
		iov->used++;

		if (unlikely(len != vringh32_to_cpu(vrh, desc.len))) {
			desc.len = cpu_to_vringh32(vrh,
				   vringh32_to_cpu(vrh, desc.len) - len);
			desc.addr = cpu_to_vringh64(vrh,
				    vringh64_to_cpu(vrh, desc.addr) + len);
			goto again;
		}

		if (desc.flags & cpu_to_vringh16(vrh, VRING_DESC_F_NEXT)) {
			i = vringh16_to_cpu(vrh, desc.next);
		} else {
			/* Just in case we need to finish traversing above. */
			if (unlikely(up_next > 0)) {
				i = return_from_indirect(vrh, &up_next,
							 &descs, &desc_max);
				slow = false;
			} else
				break;
		}

		if (i >= desc_max) {
			vringh_bad("Chained index %u > %u", i, desc_max);
			err = -EINVAL;
			goto fail;
		}
	}

	return 0;

fail:
	return err;
}
static inline int __vringh_complete(struct vringh *vrh,
				    const struct vring_used_elem *used,
				    unsigned int num_used,
				    int (*putu16)(const struct vringh *vrh,
						  __virtio16 *p, u16 val),
				    int (*putused)(const struct vringh *vrh,
						   struct vring_used_elem *dst,
						   const struct vring_used_elem
						   *src, unsigned num))
{
	struct vring_used *used_ring;
	int err;
	u16 used_idx, off;

	used_ring = vrh->vring.used;
	used_idx = vrh->last_used_idx + vrh->completed;

	off = used_idx % vrh->vring.num;

	/* Compiler knows num_used == 1 sometimes, hence extra check */
	if (num_used > 1 && unlikely(off + num_used >= vrh->vring.num)) {
		u16 part = vrh->vring.num - off;
		err = putused(vrh, &used_ring->ring[off], used, part);
		if (!err)
			err = putused(vrh, &used_ring->ring[0], used + part,
				      num_used - part);
	} else
		err = putused(vrh, &used_ring->ring[off], used, num_used);

	if (err) {
		vringh_bad("Failed to write %u used entries %u at %p",
			   num_used, off, &used_ring->ring[off]);
		return err;
	}

	/* Make sure buffer is written before we update index. */
	virtio_wmb(vrh->weak_barriers);

	err = putu16(vrh, &vrh->vring.used->idx, used_idx + num_used);
	if (err) {
		vringh_bad("Failed to update used index at %p",
			   &vrh->vring.used->idx);
		return err;
	}

	vrh->completed += num_used;
	return 0;
}
static inline int __vringh_need_notify(struct vringh *vrh,
				       int (*getu16)(const struct vringh *vrh,
						     u16 *val,
						     const __virtio16 *p))
{
	bool notify;
	u16 used_event;
	int err;

	/* Flush out used index update. This is paired with the
	 * barrier that the Guest executes when enabling
	 * interrupts. */
	virtio_mb(vrh->weak_barriers);

	/* Old-style, without event indices. */
	if (!vrh->event_indices) {
		u16 flags;
		err = getu16(vrh, &flags, &vrh->vring.avail->flags);
		if (err) {
			vringh_bad("Failed to get flags at %p",
				   &vrh->vring.avail->flags);
			return err;
		}
		return (!(flags & VRING_AVAIL_F_NO_INTERRUPT));
	}

	/* Modern: we know when other side wants to know. */
	err = getu16(vrh, &used_event, &vring_used_event(&vrh->vring));
	if (err) {
		vringh_bad("Failed to get used event idx at %p",
			   &vring_used_event(&vrh->vring));
		return err;
	}

	/* Just in case we added so many that we wrap. */
	if (unlikely(vrh->completed > 0xffff))
		notify = true;
	else
		notify = vring_need_event(used_event,
					  vrh->last_used_idx + vrh->completed,
					  vrh->last_used_idx);

	vrh->last_used_idx += vrh->completed;
	vrh->completed = 0;
	return notify;
}
static inline bool __vringh_notify_enable(struct vringh *vrh,
					  int (*getu16)(const struct vringh *vrh,
							u16 *val, const __virtio16 *p),
					  int (*putu16)(const struct vringh *vrh,
							__virtio16 *p, u16 val))
{
	u16 avail;

	if (!vrh->event_indices) {
		/* Old-school; update flags. */
		if (putu16(vrh, &vrh->vring.used->flags, 0) != 0) {
			vringh_bad("Clearing used flags %p",
				   &vrh->vring.used->flags);
			return true;
		}
	} else {
		if (putu16(vrh, &vring_avail_event(&vrh->vring),
			   vrh->last_avail_idx) != 0) {
			vringh_bad("Updating avail event index %p",
				   &vring_avail_event(&vrh->vring));
			return true;
		}
	}

	/* They could have slipped one in as we were doing that: make
	 * sure it's written, then check again. */
	virtio_mb(vrh->weak_barriers);

	if (getu16(vrh, &avail, &vrh->vring.avail->idx) != 0) {
		vringh_bad("Failed to check avail idx at %p",
			   &vrh->vring.avail->idx);
		return true;
	}

	/* This is unlikely, so we just leave notifications enabled
	 * (if we're using event_indices, we'll only get one
	 * notification anyway). */
	return avail == vrh->last_avail_idx;
}
static inline void __vringh_notify_disable(struct vringh *vrh,
					   int (*putu16)(const struct vringh *vrh,
							 __virtio16 *p, u16 val))
{
	if (!vrh->event_indices) {
		/* Old-school; update flags. */
		if (putu16(vrh, &vrh->vring.used->flags,
			   VRING_USED_F_NO_NOTIFY)) {
			vringh_bad("Setting used flags %p",
				   &vrh->vring.used->flags);
		}
	}
}
/* Userspace access helpers: in this case, addresses are really userspace. */
static inline int getu16_user(const struct vringh *vrh, u16 *val, const __virtio16 *p)
{
	__virtio16 v = 0;
	int rc = get_user(v, (__force __virtio16 __user *)p);
	*val = vringh16_to_cpu(vrh, v);
	return rc;
}

static inline int putu16_user(const struct vringh *vrh, __virtio16 *p, u16 val)
{
	__virtio16 v = cpu_to_vringh16(vrh, val);
	return put_user(v, (__force __virtio16 __user *)p);
}
static inline int copydesc_user(const struct vringh *vrh,
				void *dst, const void *src, size_t len)
{
	return copy_from_user(dst, (__force void __user *)src, len) ?
		-EFAULT : 0;
}

static inline int putused_user(const struct vringh *vrh,
			       struct vring_used_elem *dst,
			       const struct vring_used_elem *src,
			       unsigned int num)
{
	return copy_to_user((__force void __user *)dst, src,
			    sizeof(*dst) * num) ? -EFAULT : 0;
}

static inline int xfer_from_user(const struct vringh *vrh, void *src,
				 void *dst, size_t len)
{
	return copy_from_user(dst, (__force void __user *)src, len) ?
		-EFAULT : 0;
}

static inline int xfer_to_user(const struct vringh *vrh,
			       void *dst, void *src, size_t len)
{
	return copy_to_user((__force void __user *)dst, src, len) ?
		-EFAULT : 0;
}
/**
 * vringh_init_user - initialize a vringh for a userspace vring.
 * @vrh: the vringh to initialize.
 * @features: the feature bits for this ring.
 * @num: the number of elements.
 * @weak_barriers: true if we only need memory barriers, not I/O.
 * @desc: the userspace descriptor pointer.
 * @avail: the userspace avail pointer.
 * @used: the userspace used pointer.
 *
 * Returns an error if num is invalid: you should check the pointers
 * yourself!
 */
int vringh_init_user(struct vringh *vrh, u64 features,
		     unsigned int num, bool weak_barriers,
		     vring_desc_t __user *desc,
		     vring_avail_t __user *avail,
		     vring_used_t __user *used)
{
	/* Sane power of 2 please! */
	if (!num || num > 0xffff || (num & (num - 1))) {
		vringh_bad("Bad ring size %u", num);
		return -EINVAL;
	}

	vrh->little_endian = (features & (1ULL << VIRTIO_F_VERSION_1));
	vrh->event_indices = (features & (1 << VIRTIO_RING_F_EVENT_IDX));
	vrh->weak_barriers = weak_barriers;
	vrh->completed = 0;
	vrh->last_avail_idx = 0;
	vrh->last_used_idx = 0;
	vrh->vring.num = num;
	/* vring expects kernel addresses, but only used via accessors. */
	vrh->vring.desc = (__force struct vring_desc *)desc;
	vrh->vring.avail = (__force struct vring_avail *)avail;
	vrh->vring.used = (__force struct vring_used *)used;
	return 0;
}
EXPORT_SYMBOL(vringh_init_user);
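
/*
 * Minimal setup sketch for the userspace API: the example_udev structure and
 * its fields are hypothetical per-device state, not part of this API; the
 * caller is still responsible for validating the ring pointers and features.
 */
struct example_udev {
	struct vringh vrh;
	u64 features;
	unsigned int num;
	vring_desc_t __user *desc;
	vring_avail_t __user *avail;
	vring_used_t __user *used;
};

static inline int example_udev_setup_vring(struct example_udev *d)
{
	/* num must be a power of 2 no larger than 0xffff. */
	return vringh_init_user(&d->vrh, d->features, d->num, true,
				d->desc, d->avail, d->used);
}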
/**
 * vringh_getdesc_user - get next available descriptor from userspace ring.
 * @vrh: the userspace vring.
 * @riov: where to put the readable descriptors (or NULL)
 * @wiov: where to put the writable descriptors (or NULL)
 * @getrange: function to call to check ranges.
 * @head: head index we received, for passing to vringh_complete_user().
 *
 * Returns 0 if there was no descriptor, 1 if there was, or -errno.
 *
 * Note that on error return, you can tell the difference between an
 * invalid ring and a single invalid descriptor: in the former case,
 * *head will be vrh->vring.num.  You may be able to ignore an invalid
 * descriptor, but there's not much you can do with an invalid ring.
 *
 * Note that you can reuse riov and wiov with subsequent calls. Content is
 * overwritten and memory reallocated if more space is needed.
 * When you no longer need riov and wiov, clean them up by calling
 * vringh_iov_cleanup() to release the memory, even on error!
 */
int vringh_getdesc_user(struct vringh *vrh,
			struct vringh_iov *riov,
			struct vringh_iov *wiov,
			bool (*getrange)(struct vringh *vrh,
					 u64 addr, struct vringh_range *r),
			u16 *head)
{
	int err;

	*head = vrh->vring.num;
	err = __vringh_get_head(vrh, getu16_user, &vrh->last_avail_idx);
	if (err < 0)
		return err;

	/* Empty... */
	if (err == vrh->vring.num)
		return 0;

	/* We need the layouts to be the identical for this to work */
	BUILD_BUG_ON(sizeof(struct vringh_kiov) != sizeof(struct vringh_iov));
	BUILD_BUG_ON(offsetof(struct vringh_kiov, iov) !=
		     offsetof(struct vringh_iov, iov));
	BUILD_BUG_ON(offsetof(struct vringh_kiov, i) !=
		     offsetof(struct vringh_iov, i));
	BUILD_BUG_ON(offsetof(struct vringh_kiov, used) !=
		     offsetof(struct vringh_iov, used));
	BUILD_BUG_ON(offsetof(struct vringh_kiov, max_num) !=
		     offsetof(struct vringh_iov, max_num));
	BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
	BUILD_BUG_ON(offsetof(struct iovec, iov_base) !=
		     offsetof(struct kvec, iov_base));
	BUILD_BUG_ON(offsetof(struct iovec, iov_len) !=
		     offsetof(struct kvec, iov_len));
	BUILD_BUG_ON(sizeof(((struct iovec *)NULL)->iov_base)
		     != sizeof(((struct kvec *)NULL)->iov_base));
	BUILD_BUG_ON(sizeof(((struct iovec *)NULL)->iov_len)
		     != sizeof(((struct kvec *)NULL)->iov_len));

	*head = err;
	err = __vringh_iov(vrh, *head, (struct vringh_kiov *)riov,
			   (struct vringh_kiov *)wiov,
			   range_check, getrange, GFP_KERNEL, copydesc_user);
	if (err)
		return err;

	return 1;
}
EXPORT_SYMBOL(vringh_getdesc_user);
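
/*
 * Minimal service-loop sketch for the userspace API: pull each request, push
 * a reply, publish the used entry, then check whether the other side needs a
 * notification.  The getrange callback and the fixed-size req/reply buffers
 * are hypothetical; a real backend would also call vringh_iov_cleanup() on
 * riov/wiov when it is finished with them.
 */
static inline int example_service_user_ring(struct vringh *vrh,
					    struct vringh_iov *riov,
					    struct vringh_iov *wiov,
					    bool (*getrange)(struct vringh *,
							     u64,
							     struct vringh_range *))
{
	char req[64], reply[64] = { 0 };
	ssize_t in, out;
	u16 head;
	int err;

	while ((err = vringh_getdesc_user(vrh, riov, wiov, getrange,
					  &head)) == 1) {
		in = vringh_iov_pull_user(riov, req, sizeof(req));
		if (in < 0)
			return in;

		out = vringh_iov_push_user(wiov, reply, sizeof(reply));
		if (out < 0)
			return out;

		err = vringh_complete_user(vrh, head, out);
		if (err)
			return err;
	}
	if (err)
		return err;

	/* Positive return: caller should kick the guest (mechanism is up to it). */
	return vringh_need_notify_user(vrh);
}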
/**
 * vringh_iov_pull_user - copy bytes from vring_iov.
 * @riov: the riov as passed to vringh_getdesc_user() (updated as we consume)
 * @dst: the place to copy.
 * @len: the maximum length to copy.
 *
 * Returns the bytes copied <= len or a negative errno.
 */
ssize_t vringh_iov_pull_user(struct vringh_iov *riov, void *dst, size_t len)
{
	return vringh_iov_xfer(NULL, (struct vringh_kiov *)riov,
			       dst, len, xfer_from_user);
}
EXPORT_SYMBOL(vringh_iov_pull_user);
/**
 * vringh_iov_push_user - copy bytes into vring_iov.
 * @wiov: the wiov as passed to vringh_getdesc_user() (updated as we consume)
 * @src: the place to copy from.
 * @len: the maximum length to copy.
 *
 * Returns the bytes copied <= len or a negative errno.
 */
ssize_t vringh_iov_push_user(struct vringh_iov *wiov,
			     const void *src, size_t len)
{
	return vringh_iov_xfer(NULL, (struct vringh_kiov *)wiov,
			       (void *)src, len, xfer_to_user);
}
EXPORT_SYMBOL(vringh_iov_push_user);
/**
 * vringh_abandon_user - we've decided not to handle the descriptor(s).
 * @vrh: the vring.
 * @num: the number of descriptors to put back (ie. num
 *	 vringh_get_user() to undo).
 *
 * The next vringh_get_user() will return the old descriptor(s) again.
 */
void vringh_abandon_user(struct vringh *vrh, unsigned int num)
{
	/* We only update vring_avail_event(vr) when we want to be notified,
	 * so we haven't changed that yet. */
	vrh->last_avail_idx -= num;
}
EXPORT_SYMBOL(vringh_abandon_user);
/**
 * vringh_complete_user - we've finished with descriptor, publish it.
 * @vrh: the vring.
 * @head: the head as filled in by vringh_getdesc_user.
 * @len: the length of data we have written.
 *
 * You should check vringh_need_notify_user() after one or more calls
 * to this function.
 */
int vringh_complete_user(struct vringh *vrh, u16 head, u32 len)
{
	struct vring_used_elem used;

	used.id = cpu_to_vringh32(vrh, head);
	used.len = cpu_to_vringh32(vrh, len);
	return __vringh_complete(vrh, &used, 1, putu16_user, putused_user);
}
EXPORT_SYMBOL(vringh_complete_user);
/**
 * vringh_complete_multi_user - we've finished with many descriptors.
 * @vrh: the vring.
 * @used: the head, length pairs.
 * @num_used: the number of used elements.
 *
 * You should check vringh_need_notify_user() after one or more calls
 * to this function.
 */
int vringh_complete_multi_user(struct vringh *vrh,
			       const struct vring_used_elem used[],
			       unsigned num_used)
{
	return __vringh_complete(vrh, used, num_used,
				 putu16_user, putused_user);
}
EXPORT_SYMBOL(vringh_complete_multi_user);
/**
 * vringh_notify_enable_user - we want to know if something changes.
 * @vrh: the vring.
 *
 * This always enables notifications, but returns false if there are
 * now more buffers available in the vring.
 */
bool vringh_notify_enable_user(struct vringh *vrh)
{
	return __vringh_notify_enable(vrh, getu16_user, putu16_user);
}
EXPORT_SYMBOL(vringh_notify_enable_user);
/**
 * vringh_notify_disable_user - don't tell us if something changes.
 * @vrh: the vring.
 *
 * This is our normal running state: we disable and then only enable when
 * we're going to sleep.
 */
void vringh_notify_disable_user(struct vringh *vrh)
{
	__vringh_notify_disable(vrh, putu16_user);
}
EXPORT_SYMBOL(vringh_notify_disable_user);
/**
 * vringh_need_notify_user - must we tell the other side about used buffers?
 * @vrh: the vring we've called vringh_complete_user() on.
 *
 * Returns -errno or 0 if we don't need to tell the other side, 1 if we do.
 */
int vringh_need_notify_user(struct vringh *vrh)
{
	return __vringh_need_notify(vrh, getu16_user);
}
EXPORT_SYMBOL(vringh_need_notify_user);
/* Kernelspace access helpers. */
static inline int getu16_kern(const struct vringh *vrh,
			      u16 *val, const __virtio16 *p)
{
	*val = vringh16_to_cpu(vrh, READ_ONCE(*p));
	return 0;
}

static inline int putu16_kern(const struct vringh *vrh, __virtio16 *p, u16 val)
{
	WRITE_ONCE(*p, cpu_to_vringh16(vrh, val));
	return 0;
}

static inline int copydesc_kern(const struct vringh *vrh,
				void *dst, const void *src, size_t len)
{
	memcpy(dst, src, len);
	return 0;
}

static inline int putused_kern(const struct vringh *vrh,
			       struct vring_used_elem *dst,
			       const struct vring_used_elem *src,
			       unsigned int num)
{
	memcpy(dst, src, num * sizeof(*dst));
	return 0;
}

static inline int xfer_kern(const struct vringh *vrh, void *src,
			    void *dst, size_t len)
{
	memcpy(dst, src, len);
	return 0;
}

static inline int kern_xfer(const struct vringh *vrh, void *dst,
			    void *src, size_t len)
{
	memcpy(dst, src, len);
	return 0;
}
/**
 * vringh_init_kern - initialize a vringh for a kernelspace vring.
 * @vrh: the vringh to initialize.
 * @features: the feature bits for this ring.
 * @num: the number of elements.
 * @weak_barriers: true if we only need memory barriers, not I/O.
 * @desc: the kernelspace descriptor pointer.
 * @avail: the kernelspace avail pointer.
 * @used: the kernelspace used pointer.
 *
 * Returns an error if num is invalid.
 */
int vringh_init_kern(struct vringh *vrh, u64 features,
		     unsigned int num, bool weak_barriers,
		     struct vring_desc *desc,
		     struct vring_avail *avail,
		     struct vring_used *used)
{
	/* Sane power of 2 please! */
	if (!num || num > 0xffff || (num & (num - 1))) {
		vringh_bad("Bad ring size %u", num);
		return -EINVAL;
	}

	vrh->little_endian = (features & (1ULL << VIRTIO_F_VERSION_1));
	vrh->event_indices = (features & (1 << VIRTIO_RING_F_EVENT_IDX));
	vrh->weak_barriers = weak_barriers;
	vrh->completed = 0;
	vrh->last_avail_idx = 0;
	vrh->last_used_idx = 0;
	vrh->vring.num = num;
	vrh->vring.desc = desc;
	vrh->vring.avail = avail;
	vrh->vring.used = used;
	return 0;
}
EXPORT_SYMBOL(vringh_init_kern);
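
/*
 * Minimal setup sketch for a kernelspace ring: lay out the split-ring parts
 * inside one contiguous buffer with the legacy vring_init() helper, then hand
 * the resulting pointers to vringh_init_kern().  The queue buffer, size and
 * alignment here are assumptions made for the example.
 */
static inline int example_setup_kern_vring(struct vringh *vrh, u64 features,
					   unsigned int num, void *queue,
					   unsigned long vring_align)
{
	struct vring vr;

	/* Compute desc/avail/used addresses inside 'queue'. */
	vring_init(&vr, num, queue, vring_align);

	return vringh_init_kern(vrh, features, num, true,
				vr.desc, vr.avail, vr.used);
}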
/**
 * vringh_getdesc_kern - get next available descriptor from kernelspace ring.
 * @vrh: the kernelspace vring.
 * @riov: where to put the readable descriptors (or NULL)
 * @wiov: where to put the writable descriptors (or NULL)
 * @head: head index we received, for passing to vringh_complete_kern().
 * @gfp: flags for allocating larger riov/wiov.
 *
 * Returns 0 if there was no descriptor, 1 if there was, or -errno.
 *
 * Note that on error return, you can tell the difference between an
 * invalid ring and a single invalid descriptor: in the former case,
 * *head will be vrh->vring.num.  You may be able to ignore an invalid
 * descriptor, but there's not much you can do with an invalid ring.
 *
 * Note that you can reuse riov and wiov with subsequent calls. Content is
 * overwritten and memory reallocated if more space is needed.
 * When you no longer need riov and wiov, clean them up by calling
 * vringh_kiov_cleanup() to release the memory, even on error!
 */
int vringh_getdesc_kern(struct vringh *vrh,
			struct vringh_kiov *riov,
			struct vringh_kiov *wiov,
			u16 *head,
			gfp_t gfp)
{
	int err;

	err = __vringh_get_head(vrh, getu16_kern, &vrh->last_avail_idx);
	if (err < 0)
		return err;

	/* Empty... */
	if (err == vrh->vring.num)
		return 0;

	*head = err;
	err = __vringh_iov(vrh, *head, riov, wiov, no_range_check, NULL,
			   gfp, copydesc_kern);
	if (err)
		return err;

	return 1;
}
EXPORT_SYMBOL(vringh_getdesc_kern);
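
/*
 * Minimal producer sketch for the kernelspace API: fill each writable chain
 * with some data and publish how much was written.  The wiov kiov and the
 * data buffer are hypothetical per-virtqueue state.
 */
static inline int example_fill_kern_ring(struct vringh *vrh,
					 struct vringh_kiov *wiov,
					 const void *data, size_t data_len)
{
	u16 head;
	int err;

	while ((err = vringh_getdesc_kern(vrh, NULL, wiov, &head,
					  GFP_KERNEL)) == 1) {
		ssize_t written = vringh_iov_push_kern(wiov, data, data_len);

		if (written < 0)
			return written;

		/* Publish how many bytes we wrote into the buffer. */
		err = vringh_complete_kern(vrh, head, written);
		if (err)
			return err;
	}

	return err;	/* 0: ring empty, <0: error */
}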
/**
 * vringh_iov_pull_kern - copy bytes from vring_iov.
 * @riov: the riov as passed to vringh_getdesc_kern() (updated as we consume)
 * @dst: the place to copy.
 * @len: the maximum length to copy.
 *
 * Returns the bytes copied <= len or a negative errno.
 */
ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len)
{
	return vringh_iov_xfer(NULL, riov, dst, len, xfer_kern);
}
EXPORT_SYMBOL(vringh_iov_pull_kern);
/**
 * vringh_iov_push_kern - copy bytes into vring_iov.
 * @wiov: the wiov as passed to vringh_getdesc_kern() (updated as we consume)
 * @src: the place to copy from.
 * @len: the maximum length to copy.
 *
 * Returns the bytes copied <= len or a negative errno.
 */
ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov,
			     const void *src, size_t len)
{
	return vringh_iov_xfer(NULL, wiov, (void *)src, len, kern_xfer);
}
EXPORT_SYMBOL(vringh_iov_push_kern);
/**
 * vringh_abandon_kern - we've decided not to handle the descriptor(s).
 * @vrh: the vring.
 * @num: the number of descriptors to put back (ie. num
 *	 vringh_get_kern() to undo).
 *
 * The next vringh_get_kern() will return the old descriptor(s) again.
 */
void vringh_abandon_kern(struct vringh *vrh, unsigned int num)
{
	/* We only update vring_avail_event(vr) when we want to be notified,
	 * so we haven't changed that yet. */
	vrh->last_avail_idx -= num;
}
EXPORT_SYMBOL(vringh_abandon_kern);
/**
 * vringh_complete_kern - we've finished with descriptor, publish it.
 * @vrh: the vring.
 * @head: the head as filled in by vringh_getdesc_kern.
 * @len: the length of data we have written.
 *
 * You should check vringh_need_notify_kern() after one or more calls
 * to this function.
 */
int vringh_complete_kern(struct vringh *vrh, u16 head, u32 len)
{
	struct vring_used_elem used;

	used.id = cpu_to_vringh32(vrh, head);
	used.len = cpu_to_vringh32(vrh, len);

	return __vringh_complete(vrh, &used, 1, putu16_kern, putused_kern);
}
EXPORT_SYMBOL(vringh_complete_kern);
/**
 * vringh_notify_enable_kern - we want to know if something changes.
 * @vrh: the vring.
 *
 * This always enables notifications, but returns false if there are
 * now more buffers available in the vring.
 */
bool vringh_notify_enable_kern(struct vringh *vrh)
{
	return __vringh_notify_enable(vrh, getu16_kern, putu16_kern);
}
EXPORT_SYMBOL(vringh_notify_enable_kern);
/**
 * vringh_notify_disable_kern - don't tell us if something changes.
 * @vrh: the vring.
 *
 * This is our normal running state: we disable and then only enable when
 * we're going to sleep.
 */
void vringh_notify_disable_kern(struct vringh *vrh)
{
	__vringh_notify_disable(vrh, putu16_kern);
}
EXPORT_SYMBOL(vringh_notify_disable_kern);
/**
 * vringh_need_notify_kern - must we tell the other side about used buffers?
 * @vrh: the vring we've called vringh_complete_kern() on.
 *
 * Returns -errno or 0 if we don't need to tell the other side, 1 if we do.
 */
int vringh_need_notify_kern(struct vringh *vrh)
{
	return __vringh_need_notify(vrh, getu16_kern);
}
EXPORT_SYMBOL(vringh_need_notify_kern);
#if IS_REACHABLE(CONFIG_VHOST_IOTLB)

struct iotlb_vec {
	union {
		struct iovec *iovec;
		struct bio_vec *bvec;
	} iov;
	unsigned int count;
};
static int iotlb_translate(const struct vringh *vrh,
			   u64 addr, u64 len, u64 *translated,
			   struct iotlb_vec *ivec, u32 perm)
{
	struct vhost_iotlb_map *map;
	struct vhost_iotlb *iotlb = vrh->iotlb;
	int ret = 0;
	u64 s = 0, last = addr + len - 1;

	spin_lock(vrh->iotlb_lock);

	while (len > s) {
		uintptr_t io_addr;
		size_t io_len;
		u64 size;

		if (unlikely(ret >= ivec->count)) {
			ret = -ENOBUFS;
			break;
		}

		map = vhost_iotlb_itree_first(iotlb, addr, last);
		if (!map || map->start > addr) {
			ret = -EINVAL;
			break;
		} else if (!(map->perm & perm)) {
			ret = -EPERM;
			break;
		}

		size = map->size - addr + map->start;
		io_len = min(len - s, size);
		io_addr = map->addr - map->start + addr;

		if (vrh->use_va) {
			struct iovec *iovec = ivec->iov.iovec;

			iovec[ret].iov_len = io_len;
			iovec[ret].iov_base = (void __user *)io_addr;
		} else {
			u64 pfn = io_addr >> PAGE_SHIFT;
			struct bio_vec *bvec = ivec->iov.bvec;

			bvec_set_page(&bvec[ret], pfn_to_page(pfn), io_len,
				      io_addr & (PAGE_SIZE - 1));
		}

		s += io_len;
		addr += io_len;
		++ret;
	}

	spin_unlock(vrh->iotlb_lock);

	if (translated)
		*translated = min(len, s);

	return ret;
}
#define IOTLB_IOV_STRIDE 16

static inline int copy_from_iotlb(const struct vringh *vrh, void *dst,
				  void *src, size_t len)
{
	struct iotlb_vec ivec;
	union {
		struct iovec iovec[IOTLB_IOV_STRIDE];
		struct bio_vec bvec[IOTLB_IOV_STRIDE];
	} iov;
	u64 total_translated = 0;

	ivec.iov.iovec = iov.iovec;
	ivec.count = IOTLB_IOV_STRIDE;

	while (total_translated < len) {
		struct iov_iter iter;
		u64 translated;
		int ret;

		ret = iotlb_translate(vrh, (u64)(uintptr_t)src,
				      len - total_translated, &translated,
				      &ivec, VHOST_MAP_RO);
		if (ret == -ENOBUFS)
			ret = IOTLB_IOV_STRIDE;
		else if (ret < 0)
			return ret;

		if (vrh->use_va) {
			iov_iter_init(&iter, ITER_SOURCE, ivec.iov.iovec, ret,
				      translated);
		} else {
			iov_iter_bvec(&iter, ITER_SOURCE, ivec.iov.bvec, ret,
				      translated);
		}

		ret = copy_from_iter(dst, translated, &iter);
		if (ret < 0)
			return ret;

		src += translated;
		dst += translated;
		total_translated += translated;
	}

	return total_translated;
}
static inline int copy_to_iotlb(const struct vringh *vrh, void *dst,
				void *src, size_t len)
{
	struct iotlb_vec ivec;
	union {
		struct iovec iovec[IOTLB_IOV_STRIDE];
		struct bio_vec bvec[IOTLB_IOV_STRIDE];
	} iov;
	u64 total_translated = 0;

	ivec.iov.iovec = iov.iovec;
	ivec.count = IOTLB_IOV_STRIDE;

	while (total_translated < len) {
		struct iov_iter iter;
		u64 translated;
		int ret;

		ret = iotlb_translate(vrh, (u64)(uintptr_t)dst,
				      len - total_translated, &translated,
				      &ivec, VHOST_MAP_WO);
		if (ret == -ENOBUFS)
			ret = IOTLB_IOV_STRIDE;
		else if (ret < 0)
			return ret;

		if (vrh->use_va) {
			iov_iter_init(&iter, ITER_DEST, ivec.iov.iovec, ret,
				      translated);
		} else {
			iov_iter_bvec(&iter, ITER_DEST, ivec.iov.bvec, ret,
				      translated);
		}

		ret = copy_to_iter(src, translated, &iter);
		if (ret < 0)
			return ret;

		src += translated;
		dst += translated;
		total_translated += translated;
	}

	return total_translated;
}
static inline int getu16_iotlb(const struct vringh *vrh,
			       u16 *val, const __virtio16 *p)
{
	struct iotlb_vec ivec;
	union {
		struct iovec iovec[1];
		struct bio_vec bvec[1];
	} iov;
	__virtio16 tmp;
	int ret;

	ivec.iov.iovec = iov.iovec;
	ivec.count = 1;

	/* Atomic read is needed for getu16 */
	ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p),
			      NULL, &ivec, VHOST_MAP_RO);
	if (ret < 0)
		return ret;

	if (vrh->use_va) {
		ret = __get_user(tmp, (__virtio16 __user *)ivec.iov.iovec[0].iov_base);
		if (ret)
			return ret;
	} else {
		void *kaddr = kmap_local_page(ivec.iov.bvec[0].bv_page);
		void *from = kaddr + ivec.iov.bvec[0].bv_offset;

		tmp = READ_ONCE(*(__virtio16 *)from);
		kunmap_local(kaddr);
	}

	*val = vringh16_to_cpu(vrh, tmp);

	return 0;
}
static inline int putu16_iotlb(const struct vringh *vrh,
			       __virtio16 *p, u16 val)
{
	struct iotlb_vec ivec;
	union {
		struct iovec iovec;
		struct bio_vec bvec;
	} iov;
	__virtio16 tmp;
	int ret;

	ivec.iov.iovec = &iov.iovec;
	ivec.count = 1;

	/* Atomic write is needed for putu16 */
	ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p),
			      NULL, &ivec, VHOST_MAP_WO);
	if (ret < 0)
		return ret;

	tmp = cpu_to_vringh16(vrh, val);

	if (vrh->use_va) {
		ret = __put_user(tmp, (__virtio16 __user *)ivec.iov.iovec[0].iov_base);
		if (ret)
			return ret;
	} else {
		void *kaddr = kmap_local_page(ivec.iov.bvec[0].bv_page);
		void *to = kaddr + ivec.iov.bvec[0].bv_offset;

		WRITE_ONCE(*(__virtio16 *)to, tmp);
		kunmap_local(kaddr);
	}

	return 0;
}
static inline int copydesc_iotlb(const struct vringh *vrh,
				 void *dst, const void *src, size_t len)
{
	int ret;

	ret = copy_from_iotlb(vrh, dst, (void *)src, len);
	if (ret != len)
		return -EFAULT;

	return 0;
}
static inline int xfer_from_iotlb(const struct vringh *vrh, void *src,
				  void *dst, size_t len)
{
	int ret;

	ret = copy_from_iotlb(vrh, dst, src, len);
	if (ret != len)
		return -EFAULT;

	return 0;
}
static inline int xfer_to_iotlb(const struct vringh *vrh,
				void *dst, void *src, size_t len)
{
	int ret;

	ret = copy_to_iotlb(vrh, dst, src, len);
	if (ret != len)
		return -EFAULT;

	return 0;
}
static inline int putused_iotlb(const struct vringh *vrh,
				struct vring_used_elem *dst,
				const struct vring_used_elem *src,
				unsigned int num)
{
	int size = num * sizeof(*dst);
	int ret;

	ret = copy_to_iotlb(vrh, dst, (void *)src, num * sizeof(*dst));
	if (ret != size)
		return -EFAULT;

	return 0;
}
/**
 * vringh_init_iotlb - initialize a vringh for a ring with IOTLB.
 * @vrh: the vringh to initialize.
 * @features: the feature bits for this ring.
 * @num: the number of elements.
 * @weak_barriers: true if we only need memory barriers, not I/O.
 * @desc: the userspace descriptor pointer.
 * @avail: the userspace avail pointer.
 * @used: the userspace used pointer.
 *
 * Returns an error if num is invalid.
 */
int vringh_init_iotlb(struct vringh *vrh, u64 features,
		      unsigned int num, bool weak_barriers,
		      struct vring_desc *desc,
		      struct vring_avail *avail,
		      struct vring_used *used)
{
	vrh->use_va = false;

	return vringh_init_kern(vrh, features, num, weak_barriers,
				desc, avail, used);
}
EXPORT_SYMBOL(vringh_init_iotlb);
/**
 * vringh_init_iotlb_va - initialize a vringh for a ring with IOTLB containing
 *			  user VA.
 * @vrh: the vringh to initialize.
 * @features: the feature bits for this ring.
 * @num: the number of elements.
 * @weak_barriers: true if we only need memory barriers, not I/O.
 * @desc: the userspace descriptor pointer.
 * @avail: the userspace avail pointer.
 * @used: the userspace used pointer.
 *
 * Returns an error if num is invalid.
 */
int vringh_init_iotlb_va(struct vringh *vrh, u64 features,
			 unsigned int num, bool weak_barriers,
			 struct vring_desc *desc,
			 struct vring_avail *avail,
			 struct vring_used *used)
{
	vrh->use_va = true;

	return vringh_init_kern(vrh, features, num, weak_barriers,
				desc, avail, used);
}
EXPORT_SYMBOL(vringh_init_iotlb_va);
/**
 * vringh_set_iotlb - initialize a vringh for a ring with IOTLB.
 * @vrh: the vring
 * @iotlb: iotlb associated with this vring
 * @iotlb_lock: spinlock to synchronize the iotlb accesses
 */
void vringh_set_iotlb(struct vringh *vrh, struct vhost_iotlb *iotlb,
		      spinlock_t *iotlb_lock)
{
	vrh->iotlb = iotlb;
	vrh->iotlb_lock = iotlb_lock;
}
EXPORT_SYMBOL(vringh_set_iotlb);
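
/*
 * Minimal sketch of wiring a vringh to an IOTLB, as a vDPA-style backend
 * might do: the iotlb, lock and ring addresses here stand in for fields of
 * some hypothetical device structure.
 */
static inline int example_setup_iotlb_vring(struct vringh *vrh, u64 features,
					    unsigned int num,
					    struct vhost_iotlb *iotlb,
					    spinlock_t *iotlb_lock,
					    struct vring_desc *desc,
					    struct vring_avail *avail,
					    struct vring_used *used)
{
	int err;

	/* Ring addresses are IOVAs, translated through the IOTLB on access. */
	err = vringh_init_iotlb(vrh, features, num, true, desc, avail, used);
	if (err)
		return err;

	vringh_set_iotlb(vrh, iotlb, iotlb_lock);
	return 0;
}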
/**
 * vringh_getdesc_iotlb - get next available descriptor from ring with
 *			  IOTLB.
 * @vrh: the kernelspace vring.
 * @riov: where to put the readable descriptors (or NULL)
 * @wiov: where to put the writable descriptors (or NULL)
 * @head: head index we received, for passing to vringh_complete_iotlb().
 * @gfp: flags for allocating larger riov/wiov.
 *
 * Returns 0 if there was no descriptor, 1 if there was, or -errno.
 *
 * Note that on error return, you can tell the difference between an
 * invalid ring and a single invalid descriptor: in the former case,
 * *head will be vrh->vring.num.  You may be able to ignore an invalid
 * descriptor, but there's not much you can do with an invalid ring.
 *
 * Note that you can reuse riov and wiov with subsequent calls. Content is
 * overwritten and memory reallocated if more space is needed.
 * When you no longer need riov and wiov, clean them up by calling
 * vringh_kiov_cleanup() to release the memory, even on error!
 */
int vringh_getdesc_iotlb(struct vringh *vrh,
			 struct vringh_kiov *riov,
			 struct vringh_kiov *wiov,
			 u16 *head,
			 gfp_t gfp)
{
	int err;

	err = __vringh_get_head(vrh, getu16_iotlb, &vrh->last_avail_idx);
	if (err < 0)
		return err;

	/* Empty... */
	if (err == vrh->vring.num)
		return 0;

	*head = err;
	err = __vringh_iov(vrh, *head, riov, wiov, no_range_check, NULL,
			   gfp, copydesc_iotlb);
	if (err)
		return err;

	return 1;
}
EXPORT_SYMBOL(vringh_getdesc_iotlb);
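
/*
 * Minimal request/response sketch for the IOTLB API, analogous to the
 * userspace and kernelspace loops above; the kiovs and fixed-size buffers
 * are hypothetical per-virtqueue state.
 */
static inline int example_service_iotlb_ring(struct vringh *vrh,
					     struct vringh_kiov *riov,
					     struct vringh_kiov *wiov)
{
	char req[64], reply[64] = { 0 };
	u16 head;
	int err;

	while ((err = vringh_getdesc_iotlb(vrh, riov, wiov, &head,
					   GFP_ATOMIC)) == 1) {
		ssize_t in, out;

		in = vringh_iov_pull_iotlb(vrh, riov, req, sizeof(req));
		if (in < 0)
			return in;

		out = vringh_iov_push_iotlb(vrh, wiov, reply, sizeof(reply));
		if (out < 0)
			return out;

		err = vringh_complete_iotlb(vrh, head, out);
		if (err)
			return err;
	}
	if (err)
		return err;

	/* Positive return: the caller should send a used-buffer notification. */
	return vringh_need_notify_iotlb(vrh);
}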
/**
 * vringh_iov_pull_iotlb - copy bytes from vring_iov.
 * @vrh: the vring.
 * @riov: the riov as passed to vringh_getdesc_iotlb() (updated as we consume)
 * @dst: the place to copy.
 * @len: the maximum length to copy.
 *
 * Returns the bytes copied <= len or a negative errno.
 */
ssize_t vringh_iov_pull_iotlb(struct vringh *vrh,
			      struct vringh_kiov *riov,
			      void *dst, size_t len)
{
	return vringh_iov_xfer(vrh, riov, dst, len, xfer_from_iotlb);
}
EXPORT_SYMBOL(vringh_iov_pull_iotlb);
/**
 * vringh_iov_push_iotlb - copy bytes into vring_iov.
 * @vrh: the vring.
 * @wiov: the wiov as passed to vringh_getdesc_iotlb() (updated as we consume)
 * @src: the place to copy from.
 * @len: the maximum length to copy.
 *
 * Returns the bytes copied <= len or a negative errno.
 */
ssize_t vringh_iov_push_iotlb(struct vringh *vrh,
			      struct vringh_kiov *wiov,
			      const void *src, size_t len)
{
	return vringh_iov_xfer(vrh, wiov, (void *)src, len, xfer_to_iotlb);
}
EXPORT_SYMBOL(vringh_iov_push_iotlb);
/**
 * vringh_abandon_iotlb - we've decided not to handle the descriptor(s).
 * @vrh: the vring.
 * @num: the number of descriptors to put back (ie. num
 *	 vringh_get_iotlb() to undo).
 *
 * The next vringh_get_iotlb() will return the old descriptor(s) again.
 */
void vringh_abandon_iotlb(struct vringh *vrh, unsigned int num)
{
	/* We only update vring_avail_event(vr) when we want to be notified,
	 * so we haven't changed that yet.
	 */
	vrh->last_avail_idx -= num;
}
EXPORT_SYMBOL(vringh_abandon_iotlb);
/**
 * vringh_complete_iotlb - we've finished with descriptor, publish it.
 * @vrh: the vring.
 * @head: the head as filled in by vringh_getdesc_iotlb.
 * @len: the length of data we have written.
 *
 * You should check vringh_need_notify_iotlb() after one or more calls
 * to this function.
 */
int vringh_complete_iotlb(struct vringh *vrh, u16 head, u32 len)
{
	struct vring_used_elem used;

	used.id = cpu_to_vringh32(vrh, head);
	used.len = cpu_to_vringh32(vrh, len);

	return __vringh_complete(vrh, &used, 1, putu16_iotlb, putused_iotlb);
}
EXPORT_SYMBOL(vringh_complete_iotlb);
/**
 * vringh_notify_enable_iotlb - we want to know if something changes.
 * @vrh: the vring.
 *
 * This always enables notifications, but returns false if there are
 * now more buffers available in the vring.
 */
bool vringh_notify_enable_iotlb(struct vringh *vrh)
{
	return __vringh_notify_enable(vrh, getu16_iotlb, putu16_iotlb);
}
EXPORT_SYMBOL(vringh_notify_enable_iotlb);
/**
 * vringh_notify_disable_iotlb - don't tell us if something changes.
 * @vrh: the vring.
 *
 * This is our normal running state: we disable and then only enable when
 * we're going to sleep.
 */
void vringh_notify_disable_iotlb(struct vringh *vrh)
{
	__vringh_notify_disable(vrh, putu16_iotlb);
}
EXPORT_SYMBOL(vringh_notify_disable_iotlb);
/**
 * vringh_need_notify_iotlb - must we tell the other side about used buffers?
 * @vrh: the vring we've called vringh_complete_iotlb() on.
 *
 * Returns -errno or 0 if we don't need to tell the other side, 1 if we do.
 */
int vringh_need_notify_iotlb(struct vringh *vrh)
{
	return __vringh_need_notify(vrh, getu16_iotlb);
}
EXPORT_SYMBOL(vringh_need_notify_iotlb);

#endif

MODULE_DESCRIPTION("host side of a virtio ring");
MODULE_LICENSE("GPL");