/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */
/*
 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
 */
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/uio_impl.h>
#include <sys/string.h>
#include <sys/zfs_refcount.h>
#include <sys/zfs_debug.h>
#include <linux/kmap_compat.h>
#include <linux/uaccess.h>
#include <linux/pagemap.h>
#include <linux/mman.h>
/*
 * Move "n" bytes at byte address "p"; "rw" indicates the direction
 * of the move, and the I/O parameters are provided in "uio", which is
 * updated to reflect the data which was moved. Returns 0 on success or
 * a non-zero errno on failure.
 */
static int
zfs_uiomove_iov(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	const struct iovec *iov = uio->uio_iov;
	size_t skip = uio->uio_skip;
	ulong_t cnt;

	while (n && uio->uio_resid) {
		cnt = MIN(iov->iov_len - skip, n);
		switch (uio->uio_segflg) {
		case UIO_USERSPACE:
			/*
			 * p = kernel data pointer
			 * iov->iov_base = user data pointer
			 */
			if (rw == UIO_READ) {
				if (copy_to_user(iov->iov_base+skip, p, cnt))
					return (EFAULT);
			} else {
				unsigned long b_left = 0;
				if (uio->uio_fault_disable) {
					if (!zfs_access_ok(VERIFY_READ,
					    (iov->iov_base + skip), cnt)) {
						return (EFAULT);
					}
					pagefault_disable();
					b_left =
					    __copy_from_user_inatomic(p,
					    (iov->iov_base + skip), cnt);
					pagefault_enable();
				} else {
					b_left =
					    copy_from_user(p,
					    (iov->iov_base + skip), cnt);
				}
				if (b_left > 0) {
					unsigned long c_bytes =
					    cnt - b_left;
					uio->uio_skip += c_bytes;
					ASSERT3U(uio->uio_skip, <,
					    iov->iov_len);
					uio->uio_resid -= c_bytes;
					uio->uio_loffset += c_bytes;
					return (EFAULT);
				}
			}
			break;
		case UIO_SYSSPACE:
			if (rw == UIO_READ)
				memcpy(iov->iov_base + skip, p, cnt);
			else
				memcpy(p, iov->iov_base + skip, cnt);
			break;
		default:
			ASSERT(0);
		}
		skip += cnt;
		if (skip == iov->iov_len) {
			skip = 0;
			uio->uio_iov = (++iov);
			uio->uio_iovcnt--;
		}
		uio->uio_skip = skip;
		uio->uio_resid -= cnt;
		uio->uio_loffset += cnt;
		p = (caddr_t)p + cnt;
		n -= cnt;
	}
	return (0);
}
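
/*
 * Move "n" bytes between the kernel buffer "p" and the bio_vec pages of
 * "uio", one bio_vec entry at a time, mapping each page with
 * zfs_kmap_local() for the duration of the copy.
 */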
static int
zfs_uiomove_bvec_impl(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	const struct bio_vec *bv = uio->uio_bvec;
	size_t skip = uio->uio_skip;
	ulong_t cnt;

	while (n && uio->uio_resid) {
		void *paddr;
		cnt = MIN(bv->bv_len - skip, n);

		paddr = zfs_kmap_local(bv->bv_page);
		if (rw == UIO_READ) {
			/* Copy from buffer 'p' to the bvec data */
			memcpy(paddr + bv->bv_offset + skip, p, cnt);
		} else {
			/* Copy from bvec data to buffer 'p' */
			memcpy(p, paddr + bv->bv_offset + skip, cnt);
		}
		zfs_kunmap_local(paddr);

		skip += cnt;
		if (skip == bv->bv_len) {
			skip = 0;
			uio->uio_bvec = (++bv);
			uio->uio_iovcnt--;
		}
		uio->uio_skip = skip;
		uio->uio_resid -= cnt;
		uio->uio_loffset += cnt;
		p = (caddr_t)p + cnt;
		n -= cnt;
	}
	return (0);
}
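
/*
 * Copy "cnt" bytes between the kernel buffer "p" and a single bio_vec,
 * starting "skip" bytes into the bio_vec data.
 */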
static void
zfs_copy_bvec(void *p, size_t skip, size_t cnt, zfs_uio_rw_t rw,
    struct bio_vec *bv)
{
	void *paddr;

	paddr = zfs_kmap_local(bv->bv_page);
	if (rw == UIO_READ) {
		/* Copy from buffer 'p' to the bvec data */
		memcpy(paddr + bv->bv_offset + skip, p, cnt);
	} else {
		/* Copy from bvec data to buffer 'p' */
		memcpy(p, paddr + bv->bv_offset + skip, cnt);
	}
	zfs_kunmap_local(paddr);
}
/*
 * Copy 'n' bytes of data between the buffer p[] and the data represented
 * by the request in the uio.
 */
static int
zfs_uiomove_bvec_rq(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	struct request *rq = uio->rq;
	struct bio_vec bv;
	struct req_iterator iter;
	size_t this_seg_start;	/* logical offset */
	size_t this_seg_end;	/* logical offset */
	size_t skip_in_seg;
	size_t copy_from_seg;
	size_t orig_loffset;
	int copied = 0;

	/*
	 * Get the original logical offset of this entire request (because
	 * uio->uio_loffset will be modified over time).
	 */
	orig_loffset = io_offset(NULL, rq);
	this_seg_start = orig_loffset;

	rq_for_each_segment(bv, rq, iter) {
		/*
		 * Lookup what the logical offset of the last byte of this
		 * segment is.
		 */
		this_seg_end = this_seg_start + bv.bv_len - 1;

		/*
		 * We only need to operate on segments that have data we're
		 * copying.
		 */
		if (uio->uio_loffset >= this_seg_start &&
		    uio->uio_loffset <= this_seg_end) {
			/*
			 * Some, or all, of the data in this segment needs to
			 * be copied.
			 */

			/*
			 * We may not be copying from the first byte in the
			 * segment. Figure out how many bytes to skip copying
			 * from the beginning of this segment.
			 */
			skip_in_seg = uio->uio_loffset - this_seg_start;

			/*
			 * Calculate the total number of bytes from this
			 * segment that we will be copying.
			 */
			copy_from_seg = MIN(bv.bv_len - skip_in_seg, n);

			/* Copy the bytes */
			zfs_copy_bvec(p, skip_in_seg, copy_from_seg, rw, &bv);
			p = ((char *)p) + copy_from_seg;

			n -= copy_from_seg;
			uio->uio_resid -= copy_from_seg;
			uio->uio_loffset += copy_from_seg;
			copied = 1;	/* We copied some data */
		}

		this_seg_start = this_seg_end + 1;
	}

	if (!copied) {
		/* Didn't copy anything */
		uio->uio_resid = 0;
	}
	return (0);
}
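
/*
 * Dispatch a bvec-backed uio: if the uio wraps a struct request, copy
 * through the request's segments; otherwise walk uio->uio_bvec directly.
 */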
static int
zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	if (uio->rq != NULL)
		return (zfs_uiomove_bvec_rq(p, n, rw, uio));
	return (zfs_uiomove_bvec_impl(p, n, rw, uio));
}
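
/*
 * Move data using the kernel's iov_iter interface (only built when the
 * kernel provides it). When "revert" is set the iov_iter is rolled back
 * after the copy so the caller's iterator is left unconsumed.
 */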
#if defined(HAVE_VFS_IOV_ITER)
static int
zfs_uiomove_iter(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio,
    boolean_t revert)
{
	size_t cnt = MIN(n, uio->uio_resid);

	if (uio->uio_skip)
		iov_iter_advance(uio->uio_iter, uio->uio_skip);

	if (rw == UIO_READ)
		cnt = copy_to_iter(p, cnt, uio->uio_iter);
	else
		cnt = copy_from_iter(p, cnt, uio->uio_iter);

	/*
	 * When operating on a full pipe no bytes are processed.
	 * In which case return EFAULT which is converted to EAGAIN
	 * by the kernel's generic_file_splice_read() function.
	 */
	if (cnt == 0)
		return (EFAULT);

	/*
	 * Revert advancing the uio_iter. This is set by zfs_uiocopy()
	 * to avoid consuming the uio and its iov_iter structure.
	 */
	if (revert)
		iov_iter_revert(uio->uio_iter, cnt);

	uio->uio_resid -= cnt;
	uio->uio_loffset += cnt;

	return (0);
}
#endif
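
/*
 * Dispatch zfs_uiomove() based on the segment type of the uio.
 */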
int
zfs_uiomove(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	if (uio->uio_segflg == UIO_BVEC)
		return (zfs_uiomove_bvec(p, n, rw, uio));
#if defined(HAVE_VFS_IOV_ITER)
	else if (uio->uio_segflg == UIO_ITER)
		return (zfs_uiomove_iter(p, n, rw, uio, B_FALSE));
#endif
	else
		return (zfs_uiomove_iov(p, n, rw, uio));
}
EXPORT_SYMBOL(zfs_uiomove);
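
/*
 * Illustrative sketch (not part of the original file): a caller moving
 * kernel data out to a user-supplied uio will typically prefault the
 * user pages with zfs_uio_prefaultpages() (below) before copying, e.g.
 * ("buf" and "n" are hypothetical, error handling elided):
 *
 *	if (zfs_uio_prefaultpages(MIN(n, PAGE_SIZE), uio) != 0)
 *		return (SET_ERROR(EFAULT));
 *	error = zfs_uiomove(buf, n, UIO_READ, uio);
 */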
/*
 * Fault in the pages of the first n bytes specified by the uio structure.
 * 1 byte in each page is touched and the uio struct is unmodified. Any
 * error will terminate the prefaulting, as this is only a best-effort
 * attempt to get the pages resident.
 */
int
zfs_uio_prefaultpages(ssize_t n, zfs_uio_t *uio)
{
	if (uio->uio_segflg == UIO_SYSSPACE || uio->uio_segflg == UIO_BVEC ||
	    (uio->uio_extflg & UIO_DIRECT)) {
		/*
		 * There's never a need to fault in kernel pages or Direct I/O
		 * write pages. Direct I/O write pages have been pinned, so a
		 * fault can never occur for these pages.
		 */
		return (0);
#if defined(HAVE_VFS_IOV_ITER)
	} else if (uio->uio_segflg == UIO_ITER) {
		/*
		 * Since at least the Linux 4.9 kernel,
		 * iov_iter_fault_in_readable() can be relied on to fault in
		 * user pages when referenced.
		 */
		if (iov_iter_fault_in_readable(uio->uio_iter, n))
			return (EFAULT);
#endif
	} else {
		/* Fault in all user pages */
		ASSERT3S(uio->uio_segflg, ==, UIO_USERSPACE);
		const struct iovec *iov = uio->uio_iov;
		int iovcnt = uio->uio_iovcnt;
		size_t skip = uio->uio_skip;
		uint8_t tmp;
		caddr_t p;

		for (; n > 0 && iovcnt > 0; iov++, iovcnt--, skip = 0) {
			ulong_t cnt = MIN(iov->iov_len - skip, n);
			/* empty iov */
			if (cnt == 0)
				continue;
			n -= cnt;
			/* touch each page in this segment. */
			p = iov->iov_base + skip;
			while (cnt) {
				if (copy_from_user(&tmp, p, 1))
					return (EFAULT);
				ulong_t incr = MIN(cnt, PAGESIZE);
				p += incr;
				cnt -= incr;
			}
			/* touch the last byte in case it straddles a page. */
			p--;
			if (copy_from_user(&tmp, p, 1))
				return (EFAULT);
		}
	}

	return (0);
}
EXPORT_SYMBOL(zfs_uio_prefaultpages);
/*
 * The same as zfs_uiomove() but doesn't modify uio structure.
 * Return in cbytes how many bytes were copied.
 */
int
zfs_uiocopy(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio, size_t *cbytes)
{
	zfs_uio_t uio_copy;
	int ret;

	memcpy(&uio_copy, uio, sizeof (zfs_uio_t));

	if (uio->uio_segflg == UIO_BVEC)
		ret = zfs_uiomove_bvec(p, n, rw, &uio_copy);
#if defined(HAVE_VFS_IOV_ITER)
	else if (uio->uio_segflg == UIO_ITER)
		ret = zfs_uiomove_iter(p, n, rw, &uio_copy, B_TRUE);
#endif
	else
		ret = zfs_uiomove_iov(p, n, rw, &uio_copy);

	*cbytes = uio->uio_resid - uio_copy.uio_resid;

	return (ret);
}
EXPORT_SYMBOL(zfs_uiocopy);
/*
 * Drop the next n chars out of *uio.
 */
void
zfs_uioskip(zfs_uio_t *uio, size_t n)
{
	if (n > uio->uio_resid)
		return;
	/*
	 * When using a uio with a struct request, we simply
	 * use uio_loffset as a pointer to the next logical byte to
	 * copy in the request. We don't have to do any fancy
	 * accounting with uio_bvec/uio_iovcnt since we don't use
	 * them.
	 */
	if (uio->uio_segflg == UIO_BVEC && uio->rq == NULL) {
		uio->uio_skip += n;
		while (uio->uio_iovcnt &&
		    uio->uio_skip >= uio->uio_bvec->bv_len) {
			uio->uio_skip -= uio->uio_bvec->bv_len;
			uio->uio_bvec++;
			uio->uio_iovcnt--;
		}
#if defined(HAVE_VFS_IOV_ITER)
	} else if (uio->uio_segflg == UIO_ITER) {
		iov_iter_advance(uio->uio_iter, n);
#endif
	} else {
		uio->uio_skip += n;
		while (uio->uio_iovcnt &&
		    uio->uio_skip >= uio->uio_iov->iov_len) {
			uio->uio_skip -= uio->uio_iov->iov_len;
			uio->uio_iov++;
			uio->uio_iovcnt--;
		}
	}

	uio->uio_loffset += n;
	uio->uio_resid -= n;
}
EXPORT_SYMBOL(zfs_uioskip);
/*
 * Check if the uio is page-aligned in memory.
 */
boolean_t
zfs_uio_page_aligned(zfs_uio_t *uio)
{
	boolean_t aligned = B_TRUE;

	if (uio->uio_segflg == UIO_USERSPACE ||
	    uio->uio_segflg == UIO_SYSSPACE) {
		const struct iovec *iov = uio->uio_iov;
		size_t skip = uio->uio_skip;

		for (int i = uio->uio_iovcnt; i > 0; iov++, i--) {
			uintptr_t addr = (uintptr_t)(iov->iov_base + skip);
			size_t size = iov->iov_len - skip;
			if ((addr & (PAGE_SIZE - 1)) ||
			    (size & (PAGE_SIZE - 1))) {
				aligned = B_FALSE;
				break;
			}
			skip = 0;
		}
#if defined(HAVE_VFS_IOV_ITER)
	} else if (uio->uio_segflg == UIO_ITER) {
		unsigned long alignment =
		    iov_iter_alignment(uio->uio_iter);
		aligned = IS_P2ALIGNED(alignment, PAGE_SIZE);
#endif
	} else {
		/* Currently not supported */
		aligned = B_FALSE;
	}

	return (aligned);
}
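
/*
 * On kernels where ZERO_PAGE() is exported GPL-only (or on 32-bit builds,
 * where the 64-bit marker below does not fit in page_private), the
 * zero-page substitution machinery is compiled out and these helpers
 * become no-ops.
 */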
#if defined(HAVE_ZERO_PAGE_GPL_ONLY) || !defined(_LP64)
#define	ZFS_MARKED_PAGE		0x0
#define	IS_ZFS_MARKED_PAGE(_p)	0
#define	zfs_mark_page(_p)
#define	zfs_unmark_page(_p)
#define	IS_ZERO_PAGE(_p)	0

#else
/*
 * Mark pages to know if they were allocated to replace ZERO_PAGE() for
 * Direct I/O writes.
 */
#define	ZFS_MARKED_PAGE		0x5a465350414745 /* ASCII: ZFSPAGE */
#define	IS_ZFS_MARKED_PAGE(_p) \
	(page_private(_p) == (unsigned long)ZFS_MARKED_PAGE)
#define	IS_ZERO_PAGE(_p) ((_p) == ZERO_PAGE(0))

static inline void
zfs_mark_page(struct page *page)
{
	ASSERT3P(page, !=, NULL);
	get_page(page);
	SetPagePrivate(page);
	set_page_private(page, ZFS_MARKED_PAGE);
}

static inline void
zfs_unmark_page(struct page *page)
{
	ASSERT3P(page, !=, NULL);
	set_page_private(page, 0UL);
	ClearPagePrivate(page);
	put_page(page);
}
#endif /* HAVE_ZERO_PAGE_GPL_ONLY || !_LP64 */
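
/*
 * Scan the pinned Direct I/O write pages and replace any reference to the
 * kernel's shared zero page with a freshly allocated, marked, zero-filled
 * page, so the data being written cannot change underneath the write.
 */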
static void
zfs_uio_dio_check_for_zero_page(zfs_uio_t *uio)
{
	ASSERT3P(uio->uio_dio.pages, !=, NULL);

	for (long i = 0; i < uio->uio_dio.npages; i++) {
		struct page *p = uio->uio_dio.pages[i];

		lock_page(p);
		if (IS_ZERO_PAGE(p)) {
			/*
			 * If the user page points to the kernel's ZERO_PAGE(),
			 * a new zero-filled page will just be allocated so the
			 * contents of the page cannot be changed by the user
			 * while a Direct I/O write is taking place.
			 */
			gfp_t gfp_zero_page = __GFP_NOWARN | GFP_NOIO |
			    __GFP_ZERO | GFP_KERNEL;

			ASSERT0(IS_ZFS_MARKED_PAGE(p));
			unlock_page(p);
			put_page(p);

			p = __page_cache_alloc(gfp_zero_page);
			zfs_mark_page(p);
			uio->uio_dio.pages[i] = p;
		} else {
			unlock_page(p);
		}
	}
}
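
/*
 * Release the pages that were pinned for a Direct I/O request: pages we
 * allocated ourselves (marked pages) are freed, all others are unpinned
 * with put_page(), and the page array itself is freed.
 */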
void
zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	ASSERT(uio->uio_extflg & UIO_DIRECT);
	ASSERT3P(uio->uio_dio.pages, !=, NULL);

	for (long i = 0; i < uio->uio_dio.npages; i++) {
		struct page *p = uio->uio_dio.pages[i];

		if (IS_ZFS_MARKED_PAGE(p)) {
			zfs_unmark_page(p);
			__free_page(p);
			continue;
		}

		put_page(p);
	}

	vmem_free(uio->uio_dio.pages,
	    uio->uio_dio.npages * sizeof (struct page *));
}
/*
 * zfs_uio_iov_step() is just a modified version of the STEP function of
 * Linux's iov_iter_get_pages().
 */
static int
zfs_uio_iov_step(struct iovec v, zfs_uio_rw_t rw, zfs_uio_t *uio,
    long *numpages)
{
	unsigned long addr = (unsigned long)(v.iov_base);
	size_t len = v.iov_len;
	unsigned long n = DIV_ROUND_UP(len, PAGE_SIZE);

	/*
	 * A read passing FOLL_WRITE is due to the fact that we are stating
	 * that the kernel will have write access to the user pages. So, when
	 * a Direct I/O read request is issued, the kernel must write to the
	 * user pages.
	 */
	long res = get_user_pages_unlocked(
	    P2ALIGN_TYPED(addr, PAGE_SIZE, unsigned long), n,
	    &uio->uio_dio.pages[uio->uio_dio.npages],
	    rw == UIO_READ ? FOLL_WRITE : 0);
	if (res < 0) {
		return (SET_ERROR(-res));
	} else if (len != (res * PAGE_SIZE)) {
		return (SET_ERROR(EFAULT));
	}

	ASSERT3S(len, ==, res * PAGE_SIZE);
	*numpages = res;
	return (0);
}
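
/*
 * Pin the user pages of an iovec-based uio for Direct I/O by stepping
 * through each iovec entry with zfs_uio_iov_step().
 */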
static int
zfs_uio_get_dio_pages_iov(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	const struct iovec *iovp = uio->uio_iov;
	size_t skip = uio->uio_skip;
	size_t len = uio->uio_resid - skip;

	ASSERT(uio->uio_segflg != UIO_SYSSPACE);

	for (int i = 0; i < uio->uio_iovcnt; i++) {
		struct iovec iov;
		long numpages = 0;

		if (iovp->iov_len == 0) {
			iovp++;
			skip = 0;
			continue;
		}
		iov.iov_len = MIN(len, iovp->iov_len - skip);
		iov.iov_base = iovp->iov_base + skip;
		int error = zfs_uio_iov_step(iov, rw, uio, &numpages);

		if (error)
			return (error);

		uio->uio_dio.npages += numpages;
		len -= iov.iov_len;
		skip = 0;
		iovp++;
	}

	ASSERT0(len);

	return (0);
}
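
/*
 * Pin the user pages of an iov_iter-based uio for Direct I/O (compiled
 * only when the kernel provides the iov_iter interface). The iov_iter is
 * reverted once all pages have been obtained so the caller's iterator is
 * left unmodified.
 */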
#if defined(HAVE_VFS_IOV_ITER)
static int
zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	size_t skip = uio->uio_skip;
	size_t wanted = uio->uio_resid - uio->uio_skip;
	ssize_t rollback = 0;
	ssize_t cnt;
	unsigned maxpages = DIV_ROUND_UP(wanted, PAGE_SIZE);

	while (wanted) {
#if defined(HAVE_IOV_ITER_GET_PAGES2)
		cnt = iov_iter_get_pages2(uio->uio_iter,
		    &uio->uio_dio.pages[uio->uio_dio.npages],
		    wanted, maxpages, &skip);
#else
		cnt = iov_iter_get_pages(uio->uio_iter,
		    &uio->uio_dio.pages[uio->uio_dio.npages],
		    wanted, maxpages, &skip);
#endif
		if (cnt < 0) {
			iov_iter_revert(uio->uio_iter, rollback);
			return (SET_ERROR(-cnt));
		}
		uio->uio_dio.npages += DIV_ROUND_UP(cnt, PAGE_SIZE);
		rollback += cnt;
		wanted -= cnt;
		skip = 0;
#if !defined(HAVE_IOV_ITER_GET_PAGES2)
		/*
		 * iov_iter_get_pages2() advances the iov_iter on success.
		 */
		iov_iter_advance(uio->uio_iter, cnt);
#endif
	}
	ASSERT3U(rollback, ==, uio->uio_resid - uio->uio_skip);
	iov_iter_revert(uio->uio_iter, rollback);

	return (0);
}
#endif /* HAVE_VFS_IOV_ITER */
/*
 * This function pins user pages. In the event that the user pages are not
 * successfully pinned an error value is returned.
 *
 * On success, 0 is returned.
 */
int
zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	int error = 0;
	long npages = DIV_ROUND_UP(uio->uio_resid, PAGE_SIZE);
	size_t size = npages * sizeof (struct page *);

	if (uio->uio_segflg == UIO_USERSPACE) {
		uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP);
		error = zfs_uio_get_dio_pages_iov(uio, rw);
#if defined(HAVE_VFS_IOV_ITER)
	} else if (uio->uio_segflg == UIO_ITER) {
		uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP);
		error = zfs_uio_get_dio_pages_iov_iter(uio, rw);
#endif
	} else {
		return (SET_ERROR(EOPNOTSUPP));
	}

	ASSERT3S(uio->uio_dio.npages, >=, 0);

	if (error) {
		for (long i = 0; i < uio->uio_dio.npages; i++)
			put_page(uio->uio_dio.pages[i]);
		vmem_free(uio->uio_dio.pages, size);
		return (error);
	} else {
		ASSERT3S(uio->uio_dio.npages, ==, npages);
	}

	if (rw == UIO_WRITE) {
		zfs_uio_dio_check_for_zero_page(uio);
	}

	uio->uio_extflg |= UIO_DIRECT;