/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */
/*
 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
 */

#ifdef _KERNEL

#include <sys/errno.h>
#include <sys/vmem.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/uio_impl.h>
#include <sys/string.h>
#include <sys/zfs_refcount.h>
#include <sys/zfs_debug.h>
#include <linux/kmap_compat.h>
#include <linux/uaccess.h>
#include <linux/pagemap.h>
#include <linux/mman.h>

/*
 * Move "n" bytes at byte address "p"; "rw" indicates the direction
 * of the move, and the I/O parameters are provided in "uio", which is
 * updated to reflect the data which was moved. Returns 0 on success or
 * a non-zero errno on failure.
 */
static int
zfs_uiomove_iov(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	const struct iovec *iov = uio->uio_iov;
	size_t skip = uio->uio_skip;
	ulong_t cnt;

	while (n && uio->uio_resid) {
		cnt = MIN(iov->iov_len - skip, n);
		switch (uio->uio_segflg) {
		case UIO_USERSPACE:
			/*
			 * p = kernel data pointer
			 * iov->iov_base = user data pointer
			 */
			if (rw == UIO_READ) {
				if (copy_to_user(iov->iov_base + skip, p, cnt))
					return (EFAULT);
			} else {
				unsigned long b_left = 0;
				if (uio->uio_fault_disable) {
					if (!zfs_access_ok(VERIFY_READ,
					    (iov->iov_base + skip), cnt)) {
						return (EFAULT);
					}
					pagefault_disable();
					b_left =
					    __copy_from_user_inatomic(p,
					    (iov->iov_base + skip), cnt);
					pagefault_enable();
				} else {
					b_left =
					    copy_from_user(p,
					    (iov->iov_base + skip), cnt);
				}
				if (b_left > 0) {
					unsigned long c_bytes =
					    cnt - b_left;
					uio->uio_skip += c_bytes;
					ASSERT3U(uio->uio_skip, <,
					    iov->iov_len);
					uio->uio_resid -= c_bytes;
					uio->uio_loffset += c_bytes;
					return (EFAULT);
				}
			}
			break;
		case UIO_SYSSPACE:
			if (rw == UIO_READ)
				memcpy(iov->iov_base + skip, p, cnt);
			else
				memcpy(p, iov->iov_base + skip, cnt);
			break;
		default:
			ASSERT(0);
		}
		skip += cnt;
		if (skip == iov->iov_len) {
			skip = 0;
			uio->uio_iov = (++iov);
			uio->uio_iovcnt--;
		}
		uio->uio_skip = skip;
		uio->uio_resid -= cnt;
		uio->uio_loffset += cnt;
		p = (caddr_t)p + cnt;
		n -= cnt;
	}
	return (0);
}
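
/*
 * Move "n" bytes between the buffer "p" and the pages of a bvec-backed
 * uio, advancing the uio to reflect the data moved. This handles UIO_BVEC
 * uios that are not associated with a struct request.
 */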
static int
zfs_uiomove_bvec_impl(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	const struct bio_vec *bv = uio->uio_bvec;
	size_t skip = uio->uio_skip;
	ulong_t cnt;

	while (n && uio->uio_resid) {
		void *paddr;
		cnt = MIN(bv->bv_len - skip, n);

		paddr = zfs_kmap_local(bv->bv_page);
		if (rw == UIO_READ) {
			/* Copy from buffer 'p' to the bvec data */
			memcpy(paddr + bv->bv_offset + skip, p, cnt);
		} else {
			/* Copy from bvec data to buffer 'p' */
			memcpy(p, paddr + bv->bv_offset + skip, cnt);
		}
		zfs_kunmap_local(paddr);

		skip += cnt;
		if (skip == bv->bv_len) {
			skip = 0;
			uio->uio_bvec = (++bv);
			uio->uio_iovcnt--;
		}
		uio->uio_skip = skip;
		uio->uio_resid -= cnt;
		uio->uio_loffset += cnt;
		p = (caddr_t)p + cnt;
		n -= cnt;
	}
	return (0);
}
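
/*
 * Copy "cnt" bytes between the buffer "p" and a single bvec, starting
 * "skip" bytes into the bvec's data; "rw" selects the direction.
 */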
static void
zfs_copy_bvec(void *p, size_t skip, size_t cnt, zfs_uio_rw_t rw,
    struct bio_vec *bv)
{
	void *paddr;

	paddr = zfs_kmap_local(bv->bv_page);
	if (rw == UIO_READ) {
		/* Copy from buffer 'p' to the bvec data */
		memcpy(paddr + bv->bv_offset + skip, p, cnt);
	} else {
		/* Copy from bvec data to buffer 'p' */
		memcpy(p, paddr + bv->bv_offset + skip, cnt);
	}
	zfs_kunmap_local(paddr);
}

/*
 * Copy 'n' bytes of data between the buffer p[] and the data represented
 * by the request in the uio.
 */
static int
zfs_uiomove_bvec_rq(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	struct request *rq = uio->rq;
	struct bio_vec bv;
	struct req_iterator iter;
	size_t this_seg_start;	/* logical offset */
	size_t this_seg_end;	/* logical offset */
	size_t skip_in_seg;
	size_t copy_from_seg;
	size_t orig_loffset;
	int copied = 0;

	/*
	 * Get the original logical offset of this entire request (because
	 * uio->uio_loffset will be modified over time).
	 */
	orig_loffset = io_offset(NULL, rq);
	this_seg_start = orig_loffset;

	rq_for_each_segment(bv, rq, iter) {
		/*
		 * Lookup what the logical offset of the last byte of this
		 * segment is.
		 */
		this_seg_end = this_seg_start + bv.bv_len - 1;

		/*
		 * We only need to operate on segments that have data we're
		 * copying.
		 */
		if (uio->uio_loffset >= this_seg_start &&
		    uio->uio_loffset <= this_seg_end) {
			/*
			 * Some, or all, of the data in this segment needs
			 * to be copied.
			 */

			/*
			 * We may not be copying from the first byte in the
			 * segment. Figure out how many bytes to skip copying
			 * from the beginning of this segment.
			 */
			skip_in_seg = uio->uio_loffset - this_seg_start;

			/*
			 * Calculate the total number of bytes from this
			 * segment that we will be copying.
			 */
			copy_from_seg = MIN(bv.bv_len - skip_in_seg, n);

			/* Copy the bytes */
			zfs_copy_bvec(p, skip_in_seg, copy_from_seg, rw, &bv);
			p = ((char *)p) + copy_from_seg;

			n -= copy_from_seg;
			uio->uio_resid -= copy_from_seg;
			uio->uio_loffset += copy_from_seg;
			copied = 1;	/* We copied some data */
		}

		this_seg_start = this_seg_end + 1;
	}

	if (!copied) {
		/* Didn't copy anything */
		uio->uio_resid = 0;
	}

	return (0);
}
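
/*
 * Move data between "p" and a UIO_BVEC uio, dispatching to the struct
 * request based implementation when the uio is backed by one.
 */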
static int
zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	if (uio->rq != NULL)
		return (zfs_uiomove_bvec_rq(p, n, rw, uio));
	return (zfs_uiomove_bvec_impl(p, n, rw, uio));
}

#if defined(HAVE_VFS_IOV_ITER)
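/*
 * Move data between "p" and an iov_iter-backed uio using the kernel's
 * copy_to_iter()/copy_from_iter() helpers. When "revert" is set the
 * iov_iter is rewound afterwards so the caller's iterator is unchanged.
 */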
static int
zfs_uiomove_iter(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio,
    boolean_t revert)
{
	size_t cnt = MIN(n, uio->uio_resid);

	if (uio->uio_skip)
		iov_iter_advance(uio->uio_iter, uio->uio_skip);

	if (rw == UIO_READ)
		cnt = copy_to_iter(p, cnt, uio->uio_iter);
	else
		cnt = copy_from_iter(p, cnt, uio->uio_iter);

	/*
	 * When operating on a full pipe no bytes are processed, in which
	 * case return EFAULT, which is converted to EAGAIN by the kernel's
	 * generic_file_splice_read() function.
	 */
	if (cnt == 0)
		return (EFAULT);

	/*
	 * Revert advancing the uio_iter. This is set by zfs_uiocopy()
	 * to avoid consuming the uio and its iov_iter structure.
	 */
	if (revert)
		iov_iter_revert(uio->uio_iter, cnt);

	uio->uio_resid -= cnt;
	uio->uio_loffset += cnt;

	return (0);
}
#endif
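
/*
 * Dispatch to the appropriate copy routine based on the uio's segment
 * type (bvec, iov_iter, or plain iovec).
 */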
int
zfs_uiomove(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	if (uio->uio_segflg == UIO_BVEC)
		return (zfs_uiomove_bvec(p, n, rw, uio));
#if defined(HAVE_VFS_IOV_ITER)
	else if (uio->uio_segflg == UIO_ITER)
		return (zfs_uiomove_iter(p, n, rw, uio, B_FALSE));
#endif
	else
		return (zfs_uiomove_iov(p, n, rw, uio));
}
EXPORT_SYMBOL(zfs_uiomove);
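
/*
 * Illustrative (hypothetical) caller pattern, not taken from this file:
 * a read path copies "len" bytes from a kernel buffer "buf" out to the
 * caller's uio and checks for a fault:
 *
 *	int err = zfs_uiomove(buf, len, UIO_READ, uio);
 *	if (err != 0)
 *		return (SET_ERROR(err));
 */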

/*
 * Fault in the pages of the first n bytes specified by the uio structure.
 * 1 byte in each page is touched and the uio struct is unmodified. Any
 * error terminates the prefaulting early, as this is only a best-effort
 * attempt to get the pages resident.
 */
int
zfs_uio_prefaultpages(ssize_t n, zfs_uio_t *uio)
{
	if (uio->uio_segflg == UIO_SYSSPACE || uio->uio_segflg == UIO_BVEC ||
	    (uio->uio_extflg & UIO_DIRECT)) {
		/*
		 * There's never a need to fault in kernel pages or Direct I/O
		 * write pages. Direct I/O write pages have already been
		 * pinned, so a fault can never occur for them.
		 */
		return (0);
#if defined(HAVE_VFS_IOV_ITER)
	} else if (uio->uio_segflg == UIO_ITER) {
		/*
		 * On at least Linux 4.9 kernels, iov_iter_fault_in_readable()
		 * can be relied on to fault in user pages when referenced.
		 */
		if (iov_iter_fault_in_readable(uio->uio_iter, n))
			return (EFAULT);
#endif
	} else {
		/* Fault in all user pages */
		ASSERT3S(uio->uio_segflg, ==, UIO_USERSPACE);
		const struct iovec *iov = uio->uio_iov;
		int iovcnt = uio->uio_iovcnt;
		size_t skip = uio->uio_skip;
		uint8_t tmp;
		caddr_t p;

		for (; n > 0 && iovcnt > 0; iov++, iovcnt--, skip = 0) {
			ulong_t cnt = MIN(iov->iov_len - skip, n);
			/* empty iov */
			if (cnt == 0)
				continue;
			n -= cnt;
			/* touch each page in this segment. */
			p = iov->iov_base + skip;
			while (cnt) {
				if (copy_from_user(&tmp, p, 1))
					return (EFAULT);
				ulong_t incr = MIN(cnt, PAGESIZE);
				p += incr;
				cnt -= incr;
			}
			/* touch the last byte in case it straddles a page. */
			p--;
			if (copy_from_user(&tmp, p, 1))
				return (EFAULT);
		}
	}

	return (0);
}
EXPORT_SYMBOL(zfs_uio_prefaultpages);

/*
 * The same as zfs_uiomove() but doesn't modify uio structure.
 * Returns in cbytes how many bytes were copied.
 */
int
zfs_uiocopy(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio, size_t *cbytes)
{
	zfs_uio_t uio_copy;
	int ret;

	memcpy(&uio_copy, uio, sizeof (zfs_uio_t));

	if (uio->uio_segflg == UIO_BVEC)
		ret = zfs_uiomove_bvec(p, n, rw, &uio_copy);
#if defined(HAVE_VFS_IOV_ITER)
	else if (uio->uio_segflg == UIO_ITER)
		ret = zfs_uiomove_iter(p, n, rw, &uio_copy, B_TRUE);
#endif
	else
		ret = zfs_uiomove_iov(p, n, rw, &uio_copy);

	*cbytes = uio->uio_resid - uio_copy.uio_resid;

	return (ret);
}
EXPORT_SYMBOL(zfs_uiocopy);

/*
 * Drop the next n chars out of *uio.
 */
void
zfs_uioskip(zfs_uio_t *uio, size_t n)
{
	if (n > uio->uio_resid)
		return;
	/*
	 * When using a uio with a struct request, we simply
	 * use uio_loffset as a pointer to the next logical byte to
	 * copy in the request. We don't have to do any fancy
	 * accounting with uio_bvec/uio_iovcnt since we don't use
	 * them.
	 */
	if (uio->uio_segflg == UIO_BVEC && uio->rq == NULL) {
		uio->uio_skip += n;
		while (uio->uio_iovcnt &&
		    uio->uio_skip >= uio->uio_bvec->bv_len) {
			uio->uio_skip -= uio->uio_bvec->bv_len;
			uio->uio_bvec++;
			uio->uio_iovcnt--;
		}
#if defined(HAVE_VFS_IOV_ITER)
	} else if (uio->uio_segflg == UIO_ITER) {
		iov_iter_advance(uio->uio_iter, n);
#endif
	} else {
		uio->uio_skip += n;
		while (uio->uio_iovcnt &&
		    uio->uio_skip >= uio->uio_iov->iov_len) {
			uio->uio_skip -= uio->uio_iov->iov_len;
			uio->uio_iov++;
			uio->uio_iovcnt--;
		}
	}

	uio->uio_loffset += n;
	uio->uio_resid -= n;
}
EXPORT_SYMBOL(zfs_uioskip);

/*
 * Check if the uio is page-aligned in memory.
 */
boolean_t
zfs_uio_page_aligned(zfs_uio_t *uio)
{
	boolean_t aligned = B_TRUE;

	if (uio->uio_segflg == UIO_USERSPACE ||
	    uio->uio_segflg == UIO_SYSSPACE) {
		const struct iovec *iov = uio->uio_iov;
		size_t skip = uio->uio_skip;

		for (int i = uio->uio_iovcnt; i > 0; iov++, i--) {
			uintptr_t addr = (uintptr_t)(iov->iov_base + skip);
			size_t size = iov->iov_len - skip;
			if ((addr & (PAGE_SIZE - 1)) ||
			    (size & (PAGE_SIZE - 1))) {
				aligned = B_FALSE;
				break;
			}
			skip = 0;
		}
#if defined(HAVE_VFS_IOV_ITER)
	} else if (uio->uio_segflg == UIO_ITER) {
		unsigned long alignment =
		    iov_iter_alignment(uio->uio_iter);
		aligned = IS_P2ALIGNED(alignment, PAGE_SIZE);
#endif
	} else {
		/* Currently not supported */
		aligned = B_FALSE;
	}

	return (aligned);
}

#if defined(HAVE_ZERO_PAGE_GPL_ONLY) || !defined(_LP64)
#define	ZFS_MARKED_PAGE		0x0
#define	IS_ZFS_MARKED_PAGE(_p)	0
#define	zfs_mark_page(_p)
#define	zfs_unmark_page(_p)
#define	IS_ZERO_PAGE(_p)	0

#else
/*
 * Mark pages to know if they were allocated to replace ZERO_PAGE() for
 * Direct I/O writes.
 */
#define	ZFS_MARKED_PAGE		0x5a465350414745 /* ASCII: ZFSPAGE */
#define	IS_ZFS_MARKED_PAGE(_p) \
	(page_private(_p) == (unsigned long)ZFS_MARKED_PAGE)
#define	IS_ZERO_PAGE(_p) ((_p) == ZERO_PAGE(0))

static inline void
zfs_mark_page(struct page *page)
{
	ASSERT3P(page, !=, NULL);
	get_page(page);
	SetPagePrivate(page);
	set_page_private(page, ZFS_MARKED_PAGE);
}

static inline void
zfs_unmark_page(struct page *page)
{
	ASSERT3P(page, !=, NULL);
	set_page_private(page, 0UL);
	ClearPagePrivate(page);
	put_page(page);
}
#endif /* HAVE_ZERO_PAGE_GPL_ONLY || !_LP64 */
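
/*
 * For a Direct I/O write, replace any pinned reference to the kernel's
 * shared ZERO_PAGE() with a freshly allocated, marked zero page so the
 * page's contents cannot change while the write is in flight.
 */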
static void
zfs_uio_dio_check_for_zero_page(zfs_uio_t *uio)
{
	ASSERT3P(uio->uio_dio.pages, !=, NULL);

	for (long i = 0; i < uio->uio_dio.npages; i++) {
		struct page *p = uio->uio_dio.pages[i];
		lock_page(p);

		if (IS_ZERO_PAGE(p)) {
			/*
			 * If the user page points to the kernel's
			 * ZERO_PAGE(), a new zero-filled page will just be
			 * allocated so the contents of the page cannot be
			 * changed by the user while a Direct I/O write is
			 * taking place.
			 */
			gfp_t gfp_zero_page = __GFP_NOWARN | GFP_NOIO |
			    __GFP_ZERO | GFP_KERNEL;

			ASSERT0(IS_ZFS_MARKED_PAGE(p));
			unlock_page(p);
			put_page(p);

			/* Store the replacement page so it is freed later. */
			p = __page_cache_alloc(gfp_zero_page);
			zfs_mark_page(p);
			uio->uio_dio.pages[i] = p;
		} else {
			unlock_page(p);
		}
	}
}
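
/*
 * Release the pages that were pinned for a Direct I/O request: marked
 * replacement zero pages are unmarked and freed, while all other pages
 * are unpinned with put_page(). The pages array itself is then freed.
 */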
void
zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	ASSERT(uio->uio_extflg & UIO_DIRECT);
	ASSERT3P(uio->uio_dio.pages, !=, NULL);

	for (long i = 0; i < uio->uio_dio.npages; i++) {
		struct page *p = uio->uio_dio.pages[i];

		if (IS_ZFS_MARKED_PAGE(p)) {
			zfs_unmark_page(p);
			__free_page(p);
			continue;
		}

		put_page(p);
	}

	vmem_free(uio->uio_dio.pages,
	    uio->uio_dio.npages * sizeof (struct page *));
}

/*
 * zfs_uio_iov_step() is just a modified version of the STEP function of
 * Linux's iov_iter_get_pages().
 */
static int
zfs_uio_iov_step(struct iovec v, zfs_uio_rw_t rw, zfs_uio_t *uio,
    long *numpages)
{
	unsigned long addr = (unsigned long)(v.iov_base);
	size_t len = v.iov_len;
	unsigned long n = DIV_ROUND_UP(len, PAGE_SIZE);

	/*
	 * A read passing FOLL_WRITE is due to the fact that we are stating
	 * that the kernel will have write access to the user pages. So,
	 * when a Direct I/O read request is issued, the kernel must write
	 * the data into the user pages.
	 */
	long res = get_user_pages_unlocked(
	    P2ALIGN_TYPED(addr, PAGE_SIZE, unsigned long), n,
	    &uio->uio_dio.pages[uio->uio_dio.npages],
	    rw == UIO_READ ? FOLL_WRITE : 0);
	if (res < 0) {
		return (SET_ERROR(-res));
	} else if (len != (res * PAGE_SIZE)) {
		return (SET_ERROR(EFAULT));
	}

	ASSERT3S(len, ==, res * PAGE_SIZE);
	*numpages = res;
	return (0);
}
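
/*
 * Walk the uio's iovec array and pin the user pages backing each
 * non-empty segment, accumulating the page count in uio_dio.npages.
 */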
static int
zfs_uio_get_dio_pages_iov(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	const struct iovec *iovp = uio->uio_iov;
	size_t skip = uio->uio_skip;
	size_t len = uio->uio_resid - skip;

	ASSERT(uio->uio_segflg != UIO_SYSSPACE);

	for (int i = 0; i < uio->uio_iovcnt; i++) {
		struct iovec iov;
		long numpages = 0;

		if (iovp->iov_len == 0) {
			iovp++;
			skip = 0;
			continue;
		}
		iov.iov_len = MIN(len, iovp->iov_len - skip);
		iov.iov_base = iovp->iov_base + skip;
		int error = zfs_uio_iov_step(iov, rw, uio, &numpages);

		if (error)
			return (error);

		uio->uio_dio.npages += numpages;
		len -= iov.iov_len;
		skip = 0;
		iovp++;
	}

	ASSERT0(len);

	return (0);
}

#if defined(HAVE_VFS_IOV_ITER)
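/*
 * Pin the user pages backing an iov_iter-based uio with
 * iov_iter_get_pages()/iov_iter_get_pages2(), then rewind the iterator
 * so the uio itself is left unconsumed.
 */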
static int
zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	size_t skip = uio->uio_skip;
	size_t wanted = uio->uio_resid - uio->uio_skip;
	ssize_t rollback = 0;
	ssize_t cnt;
	unsigned maxpages = DIV_ROUND_UP(wanted, PAGE_SIZE);

	while (wanted) {
#if defined(HAVE_IOV_ITER_GET_PAGES2)
		cnt = iov_iter_get_pages2(uio->uio_iter,
		    &uio->uio_dio.pages[uio->uio_dio.npages],
		    wanted, maxpages, &skip);
#else
		cnt = iov_iter_get_pages(uio->uio_iter,
		    &uio->uio_dio.pages[uio->uio_dio.npages],
		    wanted, maxpages, &skip);
#endif
		if (cnt < 0) {
			iov_iter_revert(uio->uio_iter, rollback);
			return (SET_ERROR(-cnt));
		}
		uio->uio_dio.npages += DIV_ROUND_UP(cnt, PAGE_SIZE);
		rollback += cnt;
		wanted -= cnt;
		skip = 0;
#if !defined(HAVE_IOV_ITER_GET_PAGES2)
		/*
		 * iov_iter_get_pages2() advances the iov_iter on success,
		 * so the older iov_iter_get_pages() interface requires a
		 * manual advance here.
		 */
		iov_iter_advance(uio->uio_iter, cnt);
#endif
	}

	ASSERT3U(rollback, ==, uio->uio_resid - uio->uio_skip);
	iov_iter_revert(uio->uio_iter, rollback);

	return (0);
}
#endif /* HAVE_VFS_IOV_ITER */

/*
 * This function pins user pages. In the event that the user pages are not
 * successfully pinned an error value is returned.
 *
 * On success, 0 is returned.
 */
int
zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	int error = 0;
	long npages = DIV_ROUND_UP(uio->uio_resid, PAGE_SIZE);
	size_t size = npages * sizeof (struct page *);

	if (uio->uio_segflg == UIO_USERSPACE) {
		uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP);
		error = zfs_uio_get_dio_pages_iov(uio, rw);
#if defined(HAVE_VFS_IOV_ITER)
	} else if (uio->uio_segflg == UIO_ITER) {
		uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP);
		error = zfs_uio_get_dio_pages_iov_iter(uio, rw);
#endif
	} else {
		return (SET_ERROR(EOPNOTSUPP));
	}

	ASSERT3S(uio->uio_dio.npages, >=, 0);

	if (error) {
		for (long i = 0; i < uio->uio_dio.npages; i++)
			put_page(uio->uio_dio.pages[i]);
		vmem_free(uio->uio_dio.pages, size);
		return (error);
	} else {
		ASSERT3S(uio->uio_dio.npages, ==, npages);
	}

	if (rw == UIO_WRITE) {
		zfs_uio_dio_check_for_zero_page(uio);
	}

	uio->uio_extflg |= UIO_DIRECT;

	return (0);
}
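
/*
 * A sketch of the expected Direct I/O page-pinning lifecycle, assuming a
 * hypothetical write path (the surrounding steps are illustrative only):
 *
 *	if ((error = zfs_uio_get_dio_pages_alloc(uio, UIO_WRITE)) != 0)
 *		return (error);
 *	... issue the I/O against uio->uio_dio.pages ...
 *	zfs_uio_free_dio_pages(uio, UIO_WRITE);
 */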

#endif /* _KERNEL */