4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
27 #include <sys/types.h>
28 #include <sys/resource.h>
29 #include <sys/priocntl.h>
30 #include <sys/rtpriocntl.h>
31 #include <sys/tspriocntl.h>
46 #include <sys/nsctl/cfg.h>
47 #include <sys/nsctl/nsctl.h>
48 #include <sys/nsctl/nsc_ioctl.h>
49 #include <sys/nskernd.h>
52 #include <sys/mkdev.h>
53 #include <sys/nsctl/sv_efi.h>
/*
 * Path of the nsctl control device.  Both the partition-size lookup and
 * main() open this path with O_RDONLY before issuing nsctl ioctls.
 */
55 static const char *rdev
= "/dev/nsctl";
58 * Define a minimal user stack size in bytes over and above the
59 * libthread THR_STACK_MIN minimum value.
61 * This stack size needs to be sufficient to run _newlwp() and then
62 * ioctl() down into the kernel.
64 #define NSK_STACK_SIZE 512
67 * LWP scheduling control switches.
69 * allow_pri - set to non-zero to enable priocntl() manipulations of
71 * allow_rt - set to non-zero to use the RT rather than the TS
72 * scheduling class when manipulating the scheduling
73 * parameters for an LWP. Only used if allow_pri is
/* Tested together with the request's data2 before any priocntl() call. */
76 static int allow_pri
= 1;
/* Chooses the class name passed to PC_GETCID: "RT" if set, else "TS". */
77 static int allow_rt
= 0; /* disallow - bad interactions with timeout() */
/*
 * Descriptor used for every NSCIOC_NSKERND ioctl.  Stays -1 until main()
 * opens rdev; closed again at the end of main().
 */
79 static int nsctl_fd
= -1;
/* Count reported in the "kernel threads in use" shutdown messages. */
82 static int nthreads
; /* number of threads in the kernel */
/* NOTE(review): presumably set once shutdown is permitted - confirm. */
83 static int exiting
; /* shutdown in progress flag */
/*
 * Held while nthreads is inspected in the shutdown wait loop.
 * NOTE(review): presumably also guards exiting - confirm.
 */
84 static mutex_t thr_mutex
= DEFAULTMUTEX
;
/*
 * Taken around the configuration open/lock sequences in log_iibmp_err()
 * and in the NSKERND_LOCK thread.
 */
85 static mutex_t cfg_mutex
= DEFAULTMUTEX
;
/*
 * Result of cfg_iscluster(); -1 until that call has been made.  Handed to
 * the kernel as data1 of each NSKERND_START request.
 */
87 static int cl_nodeid
= -1;
/*
 * NOTE(review): no use is visible in this portion of the file; presumably
 * controlled by the -g option - confirm.
 */
89 static int display_msg
= 0;
/*
 * Upper bound, in seconds, on how long shutdown waits for nthreads to
 * reach zero.  Replaced by atoi() of the -d option argument.
 */
90 static int delay_time
= 30;
95 (void) fprintf(stderr
, gettext("usage: nskernd\n"));
103 if (sig
== SIGTERM
) {
110 * Returns: 1 - can enter kernel; 0 - shutdown in progress, do not enter kernel
115 (void) mutex_lock(&thr_mutex
);
117 /* cannot enter kernel as nskernd is being shutdown - exit */
118 (void) mutex_unlock(&thr_mutex
);
122 (void) mutex_unlock(&thr_mutex
);
130 (void) mutex_lock(&thr_mutex
);
132 (void) mutex_unlock(&thr_mutex
);
137 * returns: 1 - can shutdown; 0 - unable to shutdown
145 (void) mutex_lock(&thr_mutex
);
148 (void) fprintf(stderr
,
149 gettext("nskernd: unable to shutdown: "
150 "%d kernel threads in use\n"), nthreads
);
152 start_delay
= time(0);
153 while (nthreads
> 0 && (time(0) - start_delay
) < delay_time
) {
154 (void) mutex_unlock(&thr_mutex
);
156 (void) mutex_lock(&thr_mutex
);
157 (void) fprintf(stderr
,
158 gettext("nskernd: delay shutdown: "
159 "%d kernel threads in use\n"), nthreads
);
167 /* flag shutdown in progress */
170 (void) mutex_unlock(&thr_mutex
);
177 * returns: 1 - shutdown successful; 0 - unable to shutdown
188 bzero(&data
, sizeof (data
));
189 data
.command
= NSKERND_STOP
;
191 if (!canshutdown()) {
195 rc
= ioctl(nsctl_fd
, NSCIOC_NSKERND
, &data
);
197 if (errno
!= EINTR
|| !sigterm
) {
198 (void) fprintf(stderr
,
199 gettext("nskernd: NSKERND_STOP failed\n"));
208 * First function run by a NSKERND_NEWLWP thread.
210 * Determines if it needs to change the scheduling priority of the LWP,
211 * and then calls back into the kernel.
220 /* copy arguments onto stack and free heap memory */
221 bcopy(arg
, &nsk
, sizeof (nsk
));
224 if (nsk
.data2
&& allow_pri
) {
225 /* increase the scheduling priority of this LWP */
227 bzero(&pcinfo
, sizeof (pcinfo
));
228 (void) strcpy(pcinfo
.pc_clname
, allow_rt
? "RT" : "TS");
230 if (priocntl(0, 0, PC_GETCID
, (char *)&pcinfo
) < 0) {
231 (void) fprintf(stderr
,
233 "nskernd: priocntl(PC_GETCID) failed: %s\n"),
238 bzero(&pcparms
, sizeof (pcparms
));
239 pcparms
.pc_cid
= pcinfo
.pc_cid
;
242 ((rtparms_t
*)pcparms
.pc_clparms
)->rt_pri
=
243 (pri_t
)0; /* minimum RT priority */
244 ((rtparms_t
*)pcparms
.pc_clparms
)->rt_tqsecs
=
246 ((rtparms_t
*)pcparms
.pc_clparms
)->rt_tqnsecs
=
249 ((tsparms_t
*)pcparms
.pc_clparms
)->ts_uprilim
=
250 ((tsinfo_t
*)&pcinfo
.pc_clinfo
)->ts_maxupri
;
251 ((tsparms_t
*)pcparms
.pc_clparms
)->ts_upri
=
252 ((tsinfo_t
*)&pcinfo
.pc_clinfo
)->ts_maxupri
;
255 if (priocntl(P_LWPID
, P_MYID
,
256 PC_SETPARMS
, (char *)&pcparms
) < 0) {
257 (void) fprintf(stderr
,
259 "nskernd: priocntl(PC_SETPARMS) failed: %s\n"),
266 (void) ioctl(nsctl_fd
, NSCIOC_NSKERND
, &nsk
);
274 * Start a new thread bound to an LWP.
276 * This is the user level side of nsc_create_process().
279 newlwp(struct nskernd
*req
)
281 struct nskernd
*nskp
;
285 nskp
= malloc(sizeof (*nskp
));
288 (void) fprintf(stderr
, gettext("nskernd: malloc(%d) failed\n"),
291 req
->data1
= (uint64_t)ENOMEM
;
295 /* copy args for child */
296 bcopy(req
, nskp
, sizeof (*nskp
));
298 rc
= thr_create(NULL
, (THR_MIN_STACK
+ NSK_STACK_SIZE
),
299 _newlwp
, nskp
, THR_BOUND
|THR_DETACHED
, &tid
);
302 /* thr_create failed */
304 (void) fprintf(stderr
,
305 gettext("nskernd: thr_create failed: %s\n"),
308 req
->data1
= (uint64_t)errno
;
311 /* success - _newlwp() will free nskp */
312 req
->data1
= (uint64_t)0;
317 log_iibmp_err(char *set
, int flags
)
320 char key
[CFG_MAX_KEY
];
321 char buf
[CFG_MAX_BUF
];
322 char newflags
[CFG_MAX_BUF
];
323 char outbuf
[CFG_MAX_BUF
];
324 char *mst
, *shd
, *bmp
, *mode
, *ovr
, *cnode
, *opt
, *grp
;
325 int setno
, found
= 0;
331 setlen
= strlen(set
);
336 (void) mutex_lock(&cfg_mutex
);
339 (void) mutex_unlock(&cfg_mutex
);
343 if (!cfg_lock(cfg
, CFG_WRLOCK
)) {
345 (void) mutex_unlock(&cfg_mutex
);
351 (void) fprintf(stderr
, gettext(
352 "nskernd: Error forking\n"));
354 } else if (pid
> 0) {
355 (void) fprintf(stdout
, gettext(
356 "nskernd: Attempting deferred bitmap error\n"));
360 (void) mutex_lock(&cfg_mutex
);
363 (void) mutex_unlock(&cfg_mutex
);
364 (void) fprintf(stderr
, gettext(
365 "nskernd: Failed cfg_open, deferred bitmap\n"));
369 /* Sooner or later, this lock will be free */
370 while (!cfg_lock(cfg
, CFG_WRLOCK
))
374 /* find the proper set number */
375 for (setno
= 1; !found
; setno
++) {
376 (void) snprintf(key
, CFG_MAX_KEY
, "ii.set%d", setno
);
377 if (cfg_get_cstring(cfg
, key
, buf
, CFG_MAX_BUF
) < 0) {
381 mst
= strtok(buf
, " ");
382 shd
= strtok(NULL
, " ");
383 if (strncmp(shd
, set
, setlen
) == 0) {
386 bmp
= strtok(NULL
, " ");
387 mode
= strtok(NULL
, " ");
388 ovr
= strtok(NULL
, " ");
389 cnode
= strtok(NULL
, " ");
390 opt
= strtok(NULL
, " ");
391 grp
= strtok(NULL
, " ");
397 /* were there flags in the options field already? */
398 (void) snprintf(newflags
, CFG_MAX_BUF
, "%s=0x%x",
399 NSKERN_II_BMP_OPTION
, flags
);
400 if (opt
&& strcmp(opt
, "-") != 0) {
401 bzero(newflags
, CFG_MAX_BUF
);
402 opt
= strtok(opt
, ";");
404 if (strncmp(opt
, NSKERN_II_BMP_OPTION
,
405 strlen(NSKERN_II_BMP_OPTION
)) != 0) {
406 (void) strcat(newflags
, ";");
407 (void) strcat(newflags
, opt
);
411 (void) snprintf(key
, CFG_MAX_KEY
, "ii.set%d", setno
);
412 (void) snprintf(outbuf
, CFG_MAX_BUF
, "%s %s %s %s %s %s %s %s",
413 mst
, shd
, bmp
, mode
, ovr
, cnode
, newflags
, grp
);
414 if (cfg_put_cstring(cfg
, key
, outbuf
, CFG_MAX_BUF
) < 0) {
415 (void) printf("Failed to put [%s]\n", outbuf
);
418 (void) cfg_commit(cfg
);
422 (void) fprintf(stderr
, gettext(
423 "nskernd: Failed deferred bitmap [%s]\n"), set
);
428 (void) mutex_unlock(&cfg_mutex
);
431 * if we are the fork'ed client, just exit, if parent just return
442 * First function run by a NSKERND_LOCK thread.
444 * Opens dscfg and locks it,
445 * and then calls back into the kernel.
448 * data1 is the kernel address of the sync structure.
449 * data2 is read(0)/write(1) lock mode.
464 /* copy arguments onto stack and free heap memory */
465 bcopy(arg
, &nsk
, sizeof (nsk
));
468 (void) mutex_lock(&cfg_mutex
);
472 (void) fprintf(stderr
,
473 gettext("nskernd: cfg_open failed: %s\n"),
479 if (nsk
.data2
== 0) {
487 if (cfg_lock(cfg
, mode
)) {
491 (void) fprintf(stderr
,
492 gettext("nskernd: cfg_lock failed: %s\n"),
499 /* return to kernel */
501 nsk
.data2
= (uint64_t)rc
;
503 (void) ioctl(nsctl_fd
, NSCIOC_NSKERND
, &nsk
);
518 (void) mutex_unlock(&cfg_mutex
);
525 * Inter-node lock thread.
527 * This is the user level side of nsc_rmlock().
530 dolock(struct nskernd
*req
)
532 struct nskernd
*nskp
;
536 /* create a new thread to do the lock and return to kernel */
538 nskp
= malloc(sizeof (*nskp
));
541 (void) fprintf(stderr
,
542 gettext("nskernd:dolock: malloc(%d) failed\n"),
545 req
->data1
= (uint64_t)ENOMEM
;
549 /* copy args for child */
550 bcopy(req
, nskp
, sizeof (*nskp
));
552 rc
= thr_create(NULL
, (THR_MIN_STACK
+ NSK_STACK_SIZE
),
553 _dolock
, nskp
, THR_BOUND
|THR_DETACHED
, &tid
);
556 /* thr_create failed */
558 (void) fprintf(stderr
,
559 gettext("nskernd: thr_create failed: %s\n"),
562 req
->data1
= (uint64_t)errno
;
565 /* success - _dolock() will free nskp */
566 req
->data1
= (uint64_t)0;
572 * Convenience code for engineering test of multi-terabyte volumes.
574 * zvol (part of zfs) does not support DKIOCPARTITION but does use EFI
575 * labels. This code allocates a simple efi label structure and ioctls
576 * to extract the size of a zvol. It only handles the minimal EFI ioctl
577 * implementation in zvol.
581 zvol_bsize(char *path
, uint64_t *size
, const int pnum
)
583 struct stat64 stb1
, stb2
;
588 if (cl_nodeid
|| pnum
!= 0)
591 if ((fd
= open(path
, O_RDONLY
)) < 0) {
595 if (stat64("/devices/pseudo/zfs@0:zfs", &stb1
) != 0 ||
596 fstat64(fd
, &stb2
) != 0 ||
597 !S_ISCHR(stb1
.st_mode
) ||
598 !S_ISCHR(stb2
.st_mode
) ||
599 major(stb1
.st_rdev
) != major(stb2
.st_rdev
)) {
604 rc
= ioctl(fd
, DKIOCGMEDIAINFO
, (void *)&dkm
);
606 *size
= LE_64(dkm
.dki_capacity
) *
607 (dkm
.dki_lbsize
) / 512;
615 get_bsize(uint64_t raw_fd
, uint64_t *size
, int *partitionp
, char *path
)
617 struct nscioc_bsize bsize
;
618 #ifdef DKIOCPARTITION
619 struct partition64 p64
;
621 struct dk_cinfo dki_info
;
628 dki_info
.dki_partition
= (ushort_t
)-1;
629 bsize
.dki_info
= (uint64_t)(unsigned long)&dki_info
;
630 bsize
.vtoc
= (uint64_t)(unsigned long)&vtoc
;
631 bsize
.raw_fd
= raw_fd
;
634 fd
= open(rdev
, O_RDONLY
);
638 if (ioctl(fd
, NSCIOC_BSIZE
, &bsize
) < 0) {
639 if (dki_info
.dki_partition
!= (ushort_t
)-1) {
640 /* assume part# is ok and just the size failed */
641 *partitionp
= (int)dki_info
.dki_partition
;
643 #ifdef DKIOCPARTITION
644 /* see if this is an EFI label */
645 bzero(&p64
, sizeof (p64
));
646 p64
.p_partno
= (uint_t
)*partitionp
;
647 if ((ioctl(fd
, DKIOCPARTITION
, &p64
)) > 0) {
648 *size
= (uint64_t)p64
.p_size
;
650 bsize
.p64
= (uint64_t)(unsigned long)&p64
;
653 if (ioctl(fd
, NSCIOC_BSIZE
, &bsize
) < 0) {
654 /* see if this is a zvol */
655 zvol_bsize(path
, size
, *partitionp
);
657 *size
= (uint64_t)p64
.p_size
;
660 #endif /* DKIOCPARTITION */
669 *partitionp
= (int)dki_info
.dki_partition
;
671 if (vtoc
.v_sanity
!= VTOC_SANE
)
674 if (vtoc
.v_version
!= V_VERSION
&& vtoc
.v_version
!= 0)
677 if (dki_info
.dki_partition
> V_NUMPAR
)
680 *size
= (uint64_t)vtoc
.v_part
[(int)dki_info
.dki_partition
].p_size
;
688 * Find out if we are running in a cluster
690 cl_nodeid
= cfg_iscluster();
693 } else if (cl_nodeid
== 0) {
697 (void) fprintf(stderr
, "%s\n",
698 gettext("nskernd: unable to ascertain environment"));
704 * Runtime Solaris release checking - build release == runtime release
705 * is always considered success, so only keep entries in the map for
708 static nsc_release_t nskernd_rel_map
[] = {
709 /* { "5.10", "5.10" }, */
716 #define main nskernd_main
720 main(int argc
, char *argv
[])
722 const char *dir
= "/";
731 (void) setlocale(LC_ALL
, "");
732 (void) textdomain("nskernd");
734 rc
= nsc_check_release(BUILD_REV_STR
, nskernd_rel_map
, &reqd
);
736 (void) fprintf(stderr
,
737 gettext("nskernd: unable to determine the current "
738 "Solaris release: %s\n"), strerror(errno
));
740 } else if (rc
== FALSE
) {
741 (void) fprintf(stderr
,
742 gettext("nskernd: incorrect Solaris release "
743 "(requires %s)\n"), reqd
);
753 * Usage: <progname> [-g] [-d <seconds to delay>]
755 while ((i
= getopt(argc
, argv
, "gd:")) != EOF
) {
761 delay_time
= atoi(optarg
);
762 if (delay_time
<= 0) {
768 "Usage: nskernd [-g] [-d <seconds to delay>]");
774 if (chroot(dir
) < 0) {
775 (void) fprintf(stderr
, gettext("nskernd: chroot failed: %s\n"),
780 if (chdir(dir
) < 0) {
781 (void) fprintf(stderr
, gettext("nskernd: chdir failed: %s\n"),
787 * Determine if we are in a Sun Cluster or not, before fork'ing
792 * create a pipe to synchronise the parent with the
793 * child just before it enters its service loop.
795 if (pipe(syncpipe
) < 0) {
796 (void) fprintf(stderr
,
797 gettext("nskernd: cannot create pipe: %s\n"),
802 * Fork off a child that becomes the daemon.
805 if ((rc
= fork()) > 0) {
808 (void) close(syncpipe
[1]);
810 * wait for the close of the pipe.
811 * If we get a char back, indicates good
812 * status from child, so exit 0.
813 * If we get a zero length read, then the
814 * child has failed, so we do too.
816 n
= read(syncpipe
[0], &c
, 1);
817 exit((n
<= 0) ? 1 : 0);
819 (void) fprintf(stderr
, gettext("nskernd: cannot fork: %s\n"),
825 * In child - become daemon.
828 /* use closefrom(3C) from PSARC/2000/193 when possible */
829 for (i
= 0; i
< syncpipe
[1]; i
++) {
832 closefrom(syncpipe
[1] + 1);
834 (void) open("/dev/console", O_WRONLY
|O_APPEND
);
842 * Ignore all signals apart from SIGTERM.
845 for (i
= 1; i
< _sys_nsig
; i
++)
846 (void) sigset(i
, SIG_IGN
);
848 (void) sigset(SIGTERM
, sighand
);
851 * Increase the number of fd's that can be open.
854 rl
.rlim_cur
= RLIM_INFINITY
;
855 rl
.rlim_max
= RLIM_INFINITY
;
856 if (setrlimit(RLIMIT_NOFILE
, &rl
) < 0) {
857 (void) fprintf(stderr
,
858 gettext("nskernd: could not increase RLIMIT_NOFILE: %s\n"),
860 (void) fprintf(stderr
,
861 gettext("nskernd: the maximum number of nsctl open "
862 "devices may be reduced\n"));
866 * Open /dev/nsctl and startup.
869 nsctl_fd
= open(rdev
, O_RDONLY
);
871 (void) fprintf(stderr
, gettext("nskernd: unable to open %s\n"),
876 bzero(&data
, sizeof (data
));
878 data
.command
= NSKERND_START
;
879 data
.data1
= (uint64_t)cl_nodeid
;
884 rc
= ioctl(nsctl_fd
, NSCIOC_NSKERND
, &data
);
886 /* try and do kernel cleanup and exit */
893 (void) fprintf(stderr
,
894 gettext("nskernd: NSCIOC_NSKERND failed: %s\n"),
897 } else if (sigterm
) {
898 /* SIGTERM received - terminate */
899 if (data
.command
!= NSKERND_START
&&
900 (data
.command
!= NSKERND_STOP
||
901 data
.data1
!= (uint64_t)1)) {
902 /* need to do kernel cleanup */
907 data
.command
= NSKERND_START
;
908 data
.data1
= (uint64_t)cl_nodeid
;
915 /* cannot shutdown - threads active */
917 data
.command
= NSKERND_START
;
918 data
.data1
= (uint64_t)cl_nodeid
;
925 (void) write(syncpipe
[1], &c
, 1);
926 (void) close(syncpipe
[1]);
929 switch (data
.command
) {
930 case NSKERND_START
: /* (re)start completion */
932 (void) fprintf(stderr
,
933 gettext("nskernd: already started\n"));
935 } else if (rc
== 2) {
936 (void) fprintf(stderr
,
937 gettext("nskernd: stopped by kernel\n"));
940 data
.command
= NSKERND_WAIT
;
943 case NSKERND_STOP
: /* kernel telling daemon to stop */
944 if (data
.data1
!= (uint64_t)1) {
952 * kernel requesting partsize
953 * data1 - size return
954 * data2 - raw_fd (entry)
955 * - partition number (return)
958 get_bsize(data
.data2
, &data
.data1
,
959 &partition
, data
.char1
);
960 data
.data2
= (uint64_t)partition
;
961 data
.command
= NSKERND_WAIT
;
964 case NSKERND_NEWLWP
: /* kernel requesting a new LWP */
966 data
.command
= NSKERND_WAIT
;
969 case NSKERND_LOCK
: /* kernel requesting lock */
971 data
.command
= NSKERND_WAIT
;
974 case NSKERND_WAIT
: /* kernel retrying wait */
976 * the kernel thread can be woken by the dr config
977 * utilities (ie cfgadm) therefore we just reissue
982 case NSKERND_IIBITMAP
:
983 rc
= log_iibmp_err(data
.char1
, (int)data
.data1
);
984 data
.data1
= (uint64_t)rc
;
985 data
.command
= NSKERND_WAIT
;
989 (void) fprintf(stderr
,
990 gettext("nskernd: unknown command %d"),
992 data
.command
= NSKERND_WAIT
;
997 (void) close(nsctl_fd
);