4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
37 #include <sys/processor.h>
38 #include <sys/zfs_context.h>
39 #include <sys/rrwlock.h>
41 #include <sys/utsname.h>
42 #include <sys/systeminfo.h>
45 * Emulation of kernel services in userland.
50 vnode_t
*rootdir
= (vnode_t
*)0xabcd1234;
51 char hw_serial
[HW_HOSTID_LEN
];
53 vmem_t
*zio_arena
= NULL
;
55 /* If set, all blocks read will be copied to the specified directory. */
56 char *vn_dumpdir
= NULL
;
58 struct utsname utsname
= {
59 "userland", "libzpool", "1", "1", "na"
62 /* this only exists to have its address taken */
66 * =========================================================================
68 * =========================================================================
72 zk_thread_create(void (*func
)(), void *arg
, uint64_t len
)
77 VERIFY(thr_create(0, 0, (void *(*)(void *))func
, arg
, THR_DETACHED
,
80 return ((void *)(uintptr_t)tid
);
84 * =========================================================================
86 * =========================================================================
90 kstat_create(const char *module
, int instance
, const char *name
,
91 const char *class, uchar_t type
, ulong_t ndata
, uchar_t ks_flag
)
98 kstat_named_init(kstat_named_t
*knp
, const char *name
, uchar_t type
)
103 kstat_install(kstat_t
*ksp
)
108 kstat_delete(kstat_t
*ksp
)
113 kstat_waitq_enter(kstat_io_t
*kiop
)
118 kstat_waitq_exit(kstat_io_t
*kiop
)
123 kstat_runq_enter(kstat_io_t
*kiop
)
128 kstat_runq_exit(kstat_io_t
*kiop
)
133 kstat_waitq_to_runq(kstat_io_t
*kiop
)
138 kstat_runq_back_to_waitq(kstat_io_t
*kiop
)
142 * =========================================================================
144 * =========================================================================
147 zmutex_init(kmutex_t
*mp
)
150 mp
->initialized
= B_TRUE
;
151 (void) _mutex_init(&mp
->m_lock
, USYNC_THREAD
, NULL
);
155 zmutex_destroy(kmutex_t
*mp
)
157 ASSERT(mp
->initialized
== B_TRUE
);
158 ASSERT(mp
->m_owner
== NULL
);
159 (void) _mutex_destroy(&(mp
)->m_lock
);
160 mp
->m_owner
= (void *)-1UL;
161 mp
->initialized
= B_FALSE
;
165 zmutex_enter(kmutex_t
*mp
)
167 ASSERT(mp
->initialized
== B_TRUE
);
168 ASSERT(mp
->m_owner
!= (void *)-1UL);
169 ASSERT(mp
->m_owner
!= curthread
);
170 VERIFY(mutex_lock(&mp
->m_lock
) == 0);
171 ASSERT(mp
->m_owner
== NULL
);
172 mp
->m_owner
= curthread
;
176 mutex_tryenter(kmutex_t
*mp
)
178 ASSERT(mp
->initialized
== B_TRUE
);
179 ASSERT(mp
->m_owner
!= (void *)-1UL);
180 if (0 == mutex_trylock(&mp
->m_lock
)) {
181 ASSERT(mp
->m_owner
== NULL
);
182 mp
->m_owner
= curthread
;
190 zmutex_exit(kmutex_t
*mp
)
192 ASSERT(mp
->initialized
== B_TRUE
);
193 ASSERT(mutex_owner(mp
) == curthread
);
195 VERIFY(mutex_unlock(&mp
->m_lock
) == 0);
199 mutex_owner(kmutex_t
*mp
)
201 ASSERT(mp
->initialized
== B_TRUE
);
202 return (mp
->m_owner
);
206 * =========================================================================
208 * =========================================================================
212 rw_init(krwlock_t
*rwlp
, char *name
, int type
, void *arg
)
214 rwlock_init(&rwlp
->rw_lock
, USYNC_THREAD
, NULL
);
215 rwlp
->rw_owner
= NULL
;
216 rwlp
->initialized
= B_TRUE
;
220 rw_destroy(krwlock_t
*rwlp
)
222 rwlock_destroy(&rwlp
->rw_lock
);
223 rwlp
->rw_owner
= (void *)-1UL;
224 rwlp
->initialized
= B_FALSE
;
228 rw_enter(krwlock_t
*rwlp
, krw_t rw
)
230 ASSERT(!RW_LOCK_HELD(rwlp
));
231 ASSERT(rwlp
->initialized
== B_TRUE
);
232 ASSERT(rwlp
->rw_owner
!= (void *)-1UL);
233 ASSERT(rwlp
->rw_owner
!= curthread
);
236 VERIFY(rw_wrlock(&rwlp
->rw_lock
) == 0);
238 VERIFY(rw_rdlock(&rwlp
->rw_lock
) == 0);
240 rwlp
->rw_owner
= curthread
;
244 rw_exit(krwlock_t
*rwlp
)
246 ASSERT(rwlp
->initialized
== B_TRUE
);
247 ASSERT(rwlp
->rw_owner
!= (void *)-1UL);
249 rwlp
->rw_owner
= NULL
;
250 VERIFY(rw_unlock(&rwlp
->rw_lock
) == 0);
254 rw_tryenter(krwlock_t
*rwlp
, krw_t rw
)
258 ASSERT(rwlp
->initialized
== B_TRUE
);
259 ASSERT(rwlp
->rw_owner
!= (void *)-1UL);
262 rv
= rw_trywrlock(&rwlp
->rw_lock
);
264 rv
= rw_tryrdlock(&rwlp
->rw_lock
);
267 rwlp
->rw_owner
= curthread
;
276 rw_tryupgrade(krwlock_t
*rwlp
)
278 ASSERT(rwlp
->initialized
== B_TRUE
);
279 ASSERT(rwlp
->rw_owner
!= (void *)-1UL);
285 * =========================================================================
286 * condition variables
287 * =========================================================================
291 cv_init(kcondvar_t
*cv
, char *name
, int type
, void *arg
)
293 VERIFY(cond_init(cv
, type
, NULL
) == 0);
297 cv_destroy(kcondvar_t
*cv
)
299 VERIFY(cond_destroy(cv
) == 0);
303 cv_wait(kcondvar_t
*cv
, kmutex_t
*mp
)
305 ASSERT(mutex_owner(mp
) == curthread
);
307 int ret
= cond_wait(cv
, &mp
->m_lock
);
308 VERIFY(ret
== 0 || ret
== EINTR
);
309 mp
->m_owner
= curthread
;
313 cv_timedwait(kcondvar_t
*cv
, kmutex_t
*mp
, clock_t abstime
)
320 delta
= abstime
- ddi_get_lbolt();
324 ts
.tv_sec
= delta
/ hz
;
325 ts
.tv_nsec
= (delta
% hz
) * (NANOSEC
/ hz
);
327 ASSERT(mutex_owner(mp
) == curthread
);
329 error
= cond_reltimedwait(cv
, &mp
->m_lock
, &ts
);
330 mp
->m_owner
= curthread
;
345 cv_timedwait_hires(kcondvar_t
*cv
, kmutex_t
*mp
, hrtime_t tim
, hrtime_t res
,
352 ASSERT(flag
== 0 || flag
== CALLOUT_FLAG_ABSOLUTE
);
356 if (flag
& CALLOUT_FLAG_ABSOLUTE
)
357 delta
-= gethrtime();
362 ts
.tv_sec
= delta
/ NANOSEC
;
363 ts
.tv_nsec
= delta
% NANOSEC
;
365 ASSERT(mutex_owner(mp
) == curthread
);
367 error
= cond_reltimedwait(cv
, &mp
->m_lock
, &ts
);
368 mp
->m_owner
= curthread
;
382 cv_signal(kcondvar_t
*cv
)
384 VERIFY(cond_signal(cv
) == 0);
388 cv_broadcast(kcondvar_t
*cv
)
390 VERIFY(cond_broadcast(cv
) == 0);
394 * =========================================================================
396 * =========================================================================
399 * Note: for the xxxat() versions of these functions, we assume that the
400 * starting vp is always rootdir (which is true for spa_directory.c, the only
401 * ZFS consumer of these interfaces). We assert this is true, and then emulate
402 * them by adding '/' in front of the path.
407 vn_open(char *path
, int x1
, int flags
, int mode
, vnode_t
**vpp
, int x2
, int x3
)
413 char realpath
[MAXPATHLEN
];
417 * If we're accessing a real disk from userland, we need to use
418 * the character interface to avoid caching. This is particularly
419 * important if we're trying to look at a real in-kernel storage
420 * pool from userland, e.g. via zdb, because otherwise we won't
421 * see the changes occurring under the segmap cache.
422 * On the other hand, the stupid character device returns zero
423 * for its size. So -- gag -- we open the block device to get
424 * its size, and remember it for subsequent fop_getattr().
426 if (strncmp(path
, "/dev/", 5) == 0) {
428 fd
= open64(path
, O_RDONLY
);
431 if (fstat64(fd
, &st
) == -1) {
436 (void) sprintf(realpath
, "%s", path
);
437 dsk
= strstr(path
, "/dsk/");
439 (void) sprintf(realpath
+ (dsk
- path
) + 1, "r%s",
442 (void) sprintf(realpath
, "%s", path
);
443 if (!(flags
& FCREAT
) && stat64(realpath
, &st
) == -1)
448 old_umask
= umask(0);
451 * The construct 'flags - FREAD' conveniently maps combinations of
452 * FREAD and FWRITE to the corresponding O_RDONLY, O_WRONLY, and O_RDWR.
454 fd
= open64(realpath
, flags
- FREAD
, mode
);
457 (void) umask(old_umask
);
459 if (vn_dumpdir
!= NULL
) {
460 char dumppath
[MAXPATHLEN
];
461 (void) snprintf(dumppath
, sizeof (dumppath
),
462 "%s/%s", vn_dumpdir
, basename(realpath
));
463 dump_fd
= open64(dumppath
, O_CREAT
| O_WRONLY
, 0666);
473 if (fstat64(fd
, &st
) == -1) {
478 (void) fcntl(fd
, F_SETFD
, FD_CLOEXEC
);
480 *vpp
= vp
= umem_zalloc(sizeof (vnode_t
), UMEM_NOFAIL
);
483 vp
->v_size
= st
.st_size
;
484 vp
->v_path
= spa_strdup(path
);
485 vp
->v_dump_fd
= dump_fd
;
492 vn_openat(char *path
, int x1
, int flags
, int mode
, vnode_t
**vpp
, int x2
,
493 int x3
, vnode_t
*startvp
, int fd
)
495 char *realpath
= umem_alloc(strlen(path
) + 2, UMEM_NOFAIL
);
498 ASSERT(startvp
== rootdir
);
499 (void) sprintf(realpath
, "/%s", path
);
501 /* fd ignored for now, need if want to simulate nbmand support */
502 ret
= vn_open(realpath
, x1
, flags
, mode
, vpp
, x2
, x3
);
504 umem_free(realpath
, strlen(path
) + 2);
511 vn_rdwr(int uio
, vnode_t
*vp
, void *addr
, ssize_t len
, offset_t offset
,
512 int x1
, int x2
, rlim64_t x3
, void *x4
, ssize_t
*residp
)
514 ssize_t iolen
, split
;
516 if (uio
== UIO_READ
) {
517 iolen
= pread64(vp
->v_fd
, addr
, len
, offset
);
518 if (vp
->v_dump_fd
!= -1) {
520 pwrite64(vp
->v_dump_fd
, addr
, iolen
, offset
);
521 ASSERT(status
!= -1);
525 * To simulate partial disk writes, we split writes into two
526 * system calls so that the process can be killed in between.
528 int sectors
= len
>> SPA_MINBLOCKSHIFT
;
529 split
= (sectors
> 0 ? rand() % sectors
: 0) <<
531 iolen
= pwrite64(vp
->v_fd
, addr
, split
, offset
);
532 iolen
+= pwrite64(vp
->v_fd
, (char *)addr
+ split
,
533 len
- split
, offset
+ split
);
539 *residp
= len
- iolen
;
540 else if (iolen
!= len
)
546 vn_close(vnode_t
*vp
)
549 if (vp
->v_dump_fd
!= -1)
550 close(vp
->v_dump_fd
);
551 spa_strfree(vp
->v_path
);
552 umem_free(vp
, sizeof (vnode_t
));
556 * At a minimum we need to update the size since vdev_reopen()
557 * will no longer call vn_openat().
560 fop_getattr_real(vnode_t
*vp
, vattr_t
*vap
)
564 if (fstat64(vp
->v_fd
, &st
) == -1) {
569 vap
->va_size
= st
.st_size
;
576 * =========================================================================
577 * Figure out which debugging statements to print
578 * =========================================================================
static char *dprintf_string;
static int dprintf_print_all;

/*
 * Report whether 'string' is one of the elements of dprintf_string.
 *
 * dprintf_string is a comma-separated list, for example
 *	file1.c,function_name1,file2.c,file3.c
 * and an element matches only if it equals 'string' in full (it must be
 * followed by ',' or the end of the list).  Returns 1 on a match and 0
 * otherwise, including when no list has been set.
 */
int
dprintf_find_string(const char *string)
{
	int wanted = strlen(string);
	char *elem;

	for (elem = dprintf_string; elem != NULL; ) {
		char *comma;

		if (strncmp(elem, string, wanted) == 0 &&
		    (elem[wanted] == ',' || elem[wanted] == '\0'))
			return (1);

		/* Advance to the element after the next comma, if any. */
		comma = strchr(elem, ',');
		elem = (comma != NULL) ? comma + 1 : NULL;
	}

	return (0);
}
607 dprintf_setup(int *argc
, char **argv
)
612 * Debugging can be specified two ways: by setting the
613 * environment variable ZFS_DEBUG, or by including a
614 * "debug=..." argument on the command line. The command
615 * line setting overrides the environment variable.
618 for (i
= 1; i
< *argc
; i
++) {
619 int len
= strlen("debug=");
620 /* First look for a command line argument */
621 if (strncmp("debug=", argv
[i
], len
) == 0) {
622 dprintf_string
= argv
[i
] + len
;
623 /* Remove from args */
624 for (j
= i
; j
< *argc
; j
++)
631 if (dprintf_string
== NULL
) {
632 /* Look for ZFS_DEBUG environment variable */
633 dprintf_string
= getenv("ZFS_DEBUG");
637 * Are we just turning on all debugging?
639 if (dprintf_find_string("on"))
640 dprintf_print_all
= 1;
642 if (dprintf_string
!= NULL
)
643 zfs_flags
|= ZFS_DEBUG_DPRINTF
;
647 * =========================================================================
649 * =========================================================================
652 __dprintf(const char *file
, const char *func
, int line
, const char *fmt
, ...)
658 * Get rid of annoying "../common/" prefix to filename.
660 newfile
= strrchr(file
, '/');
661 if (newfile
!= NULL
) {
662 newfile
= newfile
+ 1; /* Get rid of leading / */
667 if (dprintf_print_all
||
668 dprintf_find_string(newfile
) ||
669 dprintf_find_string(func
)) {
670 /* Print out just the function name if requested */
672 if (dprintf_find_string("pid"))
673 (void) printf("%d ", getpid());
674 if (dprintf_find_string("tid"))
675 (void) printf("%u ", thr_self());
676 if (dprintf_find_string("cpu"))
677 (void) printf("%u ", getcpuid());
678 if (dprintf_find_string("time"))
679 (void) printf("%llu ", gethrtime());
680 if (dprintf_find_string("long"))
681 (void) printf("%s, line %d: ", newfile
, line
);
682 (void) printf("%s: ", func
);
684 (void) vprintf(fmt
, adx
);
690 #endif /* ZFS_DEBUG */
693 * =========================================================================
694 * cmn_err() and panic()
695 * =========================================================================
697 static char ce_prefix
[CE_IGNORE
][10] = { "", "NOTICE: ", "WARNING: ", "" };
698 static char ce_suffix
[CE_IGNORE
][2] = { "", "\n", "\n", "" };
/*
 * Format the panic message (truncated to 511 characters plus NUL) and
 * report it through assfail().  Does not return.
 */
void
vpanic(const char *fmt, va_list adx)
{
	char msg[512];

	(void) vsnprintf(msg, sizeof (msg), fmt, adx);
	assfail(msg, NULL, 0);

	/* necessary to make vpanic meet noreturn requirements */
	abort();
}
710 panic(const char *fmt
, ...)
720 vcmn_err(int ce
, const char *fmt
, va_list adx
)
724 if (ce
!= CE_NOTE
) { /* suppress noise in userland stress testing */
725 (void) fprintf(stderr
, "%s", ce_prefix
[ce
]);
726 (void) vfprintf(stderr
, fmt
, adx
);
727 (void) fprintf(stderr
, "%s", ce_suffix
[ce
]);
/*
 * Variadic front end for vcmn_err(): gathers the arguments following
 * 'fmt' and forwards them together with the severity 'ce'.
 */
void
cmn_err(int ce, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vcmn_err(ce, fmt, args);
	va_end(args);
}
743 * =========================================================================
745 * =========================================================================
748 kobj_open_file(char *name
)
753 /* set vp as the _fd field of the file */
754 if (vn_openat(name
, UIO_SYSSPACE
, FREAD
, 0, &vp
, 0, 0, rootdir
,
756 return ((void *)-1UL);
758 file
= umem_zalloc(sizeof (struct _buf
), UMEM_NOFAIL
);
759 file
->_fd
= (intptr_t)vp
;
764 kobj_read_file(struct _buf
*file
, char *buf
, unsigned size
, unsigned off
)
768 vn_rdwr(UIO_READ
, (vnode_t
*)file
->_fd
, buf
, size
, (offset_t
)off
,
769 UIO_SYSSPACE
, 0, 0, 0, &resid
);
771 return (size
- resid
);
775 kobj_close_file(struct _buf
*file
)
777 vn_close((vnode_t
*)file
->_fd
);
778 umem_free(file
, sizeof (struct _buf
));
782 kobj_get_filesize(struct _buf
*file
, uint64_t *size
)
785 vnode_t
*vp
= (vnode_t
*)file
->_fd
;
787 if (fstat64(vp
->v_fd
, &st
) == -1) {
796 * =========================================================================
798 * =========================================================================
804 poll(0, 0, ticks
* (1000 / hz
));
808 * Find highest one bit set.
809 * Returns bit number + 1 of highest bit that is set, otherwise returns 0.
812 highbit64(uint64_t i
)
818 if (i
& 0xffffffff00000000ULL
) {
821 if (i
& 0xffff0000) {
839 static int random_fd
= -1, urandom_fd
= -1;
842 random_get_bytes_common(uint8_t *ptr
, size_t len
, int fd
)
850 bytes
= read(fd
, ptr
, resid
);
851 ASSERT3S(bytes
, >=, 0);
860 random_get_bytes(uint8_t *ptr
, size_t len
)
862 return (random_get_bytes_common(ptr
, len
, random_fd
));
866 random_get_pseudo_bytes(uint8_t *ptr
, size_t len
)
868 return (random_get_bytes_common(ptr
, len
, urandom_fd
));
872 ddi_strtoul(const char *hw_serial
, char **nptr
, int base
, unsigned long *result
)
876 *result
= strtoul(hw_serial
, &end
, base
);
883 ddi_strtoull(const char *str
, char **nptr
, int base
, u_longlong_t
*result
)
887 *result
= strtoull(str
, &end
, base
);
895 cyclic_add(cyc_handler_t
*hdlr
, cyc_time_t
*when
)
902 cyclic_remove(cyclic_id_t id
)
908 cyclic_reprogram(cyclic_id_t id
, hrtime_t expiration
)
914 * =========================================================================
915 * kernel emulation setup & teardown
916 * =========================================================================
919 umem_out_of_memory(void)
921 char errmsg
[] = "out of memory -- generating core dump\n";
923 write(fileno(stderr
), errmsg
, sizeof (errmsg
));
929 kernel_init(int mode
)
931 extern uint_t rrw_tsd_key
;
933 umem_nofail_callback(umem_out_of_memory
);
935 physmem
= sysconf(_SC_PHYS_PAGES
);
937 dprintf("physmem = %llu pages (%.2f GB)\n", physmem
,
938 (double)physmem
* sysconf(_SC_PAGE_SIZE
) / (1ULL << 30));
940 (void) snprintf(hw_serial
, sizeof (hw_serial
), "%ld",
941 (mode
& FWRITE
) ? gethostid() : 0);
943 VERIFY((random_fd
= open("/dev/random", O_RDONLY
)) != -1);
944 VERIFY((urandom_fd
= open("/dev/urandom", O_RDONLY
)) != -1);
948 mutex_init(&cpu_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
952 tsd_create(&rrw_tsd_key
, rrw_tsd_destroy
);
970 z_uncompress(void *dst
, size_t *dstlen
, const void *src
, size_t srclen
)
973 uLongf len
= *dstlen
;
975 if ((ret
= uncompress(dst
, &len
, src
, srclen
)) == Z_OK
)
976 *dstlen
= (size_t)len
;
982 z_compress_level(void *dst
, size_t *dstlen
, const void *src
, size_t srclen
,
986 uLongf len
= *dstlen
;
988 if ((ret
= compress2(dst
, &len
, src
, srclen
, level
)) == Z_OK
)
989 *dstlen
= (size_t)len
;
1001 crgetruid(cred_t
*cr
)
1007 crgetgid(cred_t
*cr
)
1013 crgetngroups(cred_t
*cr
)
1019 crgetgroups(cred_t
*cr
)
1025 zfs_secpolicy_snapshot_perms(const char *name
, cred_t
*cr
)
1031 zfs_secpolicy_rename_perms(const char *from
, const char *to
, cred_t
*cr
)
1037 zfs_secpolicy_destroy_perms(const char *name
, cred_t
*cr
)
1043 ksid_lookupdomain(const char *dom
)
1047 kd
= umem_zalloc(sizeof (ksiddomain_t
), UMEM_NOFAIL
);
1048 kd
->kd_name
= spa_strdup(dom
);
1053 ksiddomain_rele(ksiddomain_t
*ksid
)
1055 spa_strfree(ksid
->kd_name
);
1056 umem_free(ksid
, sizeof (ksiddomain_t
));
1060 * Do not change the length of the returned string; it must be freed
1064 kmem_asprintf(const char *fmt
, ...)
1071 size
= vsnprintf(NULL
, 0, fmt
, adx
) + 1;
1074 buf
= kmem_alloc(size
, KM_SLEEP
);
1077 size
= vsnprintf(buf
, size
, fmt
, adx
);
1085 zfs_onexit_fd_hold(int fd
, minor_t
*minorp
)
1093 zfs_onexit_fd_rele(int fd
)
1099 zfs_onexit_add_cb(minor_t minor
, void (*func
)(void *), void *data
,
1100 uint64_t *action_handle
)
1107 zfs_onexit_del_cb(minor_t minor
, uint64_t action_handle
, boolean_t fire
)
1114 zfs_onexit_cb_data(minor_t minor
, uint64_t action_handle
, void **data
)
1122 bzero(bp
, sizeof (buf_t
));
1128 if (bp
->b_iodone
!= NULL
) {
1129 (*(bp
->b_iodone
))(bp
);
1132 ASSERT((bp
->b_flags
& B_DONE
) == 0);
1133 bp
->b_flags
|= B_DONE
;
1137 bioerror(buf_t
*bp
, int error
)
1143 bp
->b_flags
|= B_ERROR
;
1145 bp
->b_flags
&= ~B_ERROR
;
1147 bp
->b_error
= error
;
1152 geterror(struct buf
*bp
)
1156 if (bp
->b_flags
& B_ERROR
) {
1157 error
= bp
->b_error
;