kernel/disp/thread.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2013, Joyent, Inc.  All rights reserved.
  25  */
  26
  27 #include <sys/types.h>
  28 #include <sys/param.h>
  29 #include <sys/sysmacros.h>
  30 #include <sys/signal.h>
  31 #include <sys/stack.h>
  32 #include <sys/pcb.h>
  33 #include <sys/user.h>
  34 #include <sys/systm.h>
  35 #include <sys/sysinfo.h>
  36 #include <sys/errno.h>
  37 #include <sys/cmn_err.h>
  38 #include <sys/cred.h>
  39 #include <sys/resource.h>
  40 #include <sys/task.h>
  41 #include <sys/project.h>
  42 #include <sys/proc.h>
  43 #include <sys/debug.h>
  44 #include <sys/disp.h>
  45 #include <sys/class.h>
  46 #include <vm/seg_kmem.h>
  47 #include <vm/seg_kp.h>
  48 #include <sys/machlock.h>
  49 #include <sys/kmem.h>
  50 #include <sys/varargs.h>
  51 #include <sys/turnstile.h>
  52 #include <sys/poll.h>
  53 #include <sys/vtrace.h>
  54 #include <sys/callb.h>
  55 #include <c2/audit.h>
  56 #include <sys/tnf.h>
  57 #include <sys/sobject.h>
  58 #include <sys/cpupart.h>
  59 #include <sys/pset.h>
  60 #include <sys/door.h>
  61 #include <sys/spl.h>
  62 #include <sys/copyops.h>
  63 #include <sys/rctl.h>
  64 #include <sys/brand.h>
  65 #include <sys/pool.h>
  66 #include <sys/zone.h>
  67 #include <sys/cpc_impl.h>
  68 #include <sys/sdt.h>
  69 #include <sys/reboot.h>
  70 #include <sys/kdi.h>
  71 #include <sys/schedctl.h>
  72 #include <sys/waitq.h>
  73 #include <sys/cpucaps.h>
  74 #include <sys/kiconv.h>
  75
  76 struct kmem_cache *thread_cache;        /* cache of free threads */
  77 struct kmem_cache *lwp_cache;           /* cache of free lwps */
  78 struct kmem_cache *turnstile_cache;     /* cache of free turnstiles */
  79
  80 /*
  81  * allthreads is only for use by kmem_readers.  All kernel loops can use
  82  * the current thread as a start/end point.
  83  */
  84 kthread_t *allthreads = &t0;    /* circular list of all threads */
  85
/*
 * Deathrow bookkeeping.  thread_reaper() sleeps on reaper_cv, under
 * reaplock, for as long as both deathrow lists are empty.
 */
  86 static kcondvar_t reaper_cv;            /* synchronization var */
  87 kthread_t       *thread_deathrow;       /* circular list of reapable threads */
  88 kthread_t       *lwp_deathrow;          /* circular list of reapable lwps */
  89 kmutex_t        reaplock;               /* protects lwp and thread deathrows */
  90 int     thread_reapcnt = 0;             /* number of threads on deathrow */
  91 int     lwp_reapcnt = 0;                /* number of lwps on deathrow */
  92 int     reaplimit = 16;                 /* delay reaping until reaplimit */
  93
/*
 * Array of THREAD_FREE_NUM locks allocated in thread_init(); the entry
 * for a given thread is selected with THREAD_FREE_HASH().
 */
  94 thread_free_lock_t      *thread_free_lock;
  95                                         /* protects tick thread from reaper */
  96
/*
 * Thread count: incremented in thread_create() and decremented in
 * thread_free(), in both cases with pidlock held.
 */
  97 extern int nthread;
  98
  99 /* System Scheduling classes. */
 100 id_t    syscid;                         /* system scheduling class ID */
 101 id_t    sysdccid = CLASS_UNUSED;        /* reset when SDC loads */
 102
 103 void    *segkp_thread;                  /* cookie for segkp pool */
 104
/*
 * lwp_cache_sz and t_cache_sz are handed to segkp_cache_init() in
 * thread_init() when the segkp_lwp and segkp_thread pools are set up.
 * next_t_id is the next t_did to assign; thread_create() advances it
 * while holding pidlock.
 */
 105 int lwp_cache_sz = 32;
 106 int t_cache_sz = 8;
 107 static kt_did_t next_t_id = 1;
 108
 109 /* Default mode for thread binding to CPUs and processor sets */
 110 int default_binding_mode = TB_ALLHARD;
 111
 112 /*
 113  * Min/Max stack sizes for stack size parameters
 *
 * thread_init() rejects a default_stksize or lwp_default_stksize that
 * falls outside this range or is not a multiple of PAGESIZE.
 114  */
 115 #define MAX_STKSIZE     (32 * DEFAULTSTKSZ)
 116 #define MIN_STKSIZE     DEFAULTSTKSZ
 117
 118 /*
 119  * default_stksize overrides lwp_default_stksize if it is set.
 120  */
 121 int     default_stksize;
 122 int     lwp_default_stksize;
 123
/* Zone key created by thread_reaper() with thread_zone_destroy() attached. */
 124 static zone_key_t zone_thread_key;
 125
/*
 * When kmem_stackinfo is non-zero, thread_create() calls stkinfo_begin()
 * and thread_exit() calls stkinfo_end() on the thread.
 */
 126 unsigned int kmem_stackinfo;            /* stackinfo feature on-off */
 127 kmem_stkinfo_t *kmem_stkinfo_log;       /* stackinfo circular log */
 128 static kmutex_t kmem_stkinfo_lock;      /* protects kmem_stkinfo_log */
 129
 130 /*
 131  * forward declarations for internal thread specific data (tsd)
 132  */
 133 static void *tsd_realloc(void *, size_t, size_t);
 134
/* Reaper daemon; thread_init() starts it as a kernel thread. */
 135 void thread_reaper(void);
 136
 137 /* forward declarations for stackinfo feature */
 138 static void stkinfo_begin(kthread_t *);
 139 static void stkinfo_end(kthread_t *);
 140 static size_t stkinfo_percent(caddr_t, caddr_t, caddr_t);
 141
 142 /*ARGSUSED*/
 143 static int
 144 turnstile_constructor(void *buf, void *cdrarg, int kmflags)
 145 {
 146         bzero(buf, sizeof (turnstile_t));
 147         return (0);
 148 }
 149
/*
 * Destructor installed on turnstile_cache by thread_init().  It releases
 * nothing; it only asserts that the turnstile being destroyed is idle:
 * no free-list link, no waiters, no inheritor and both sleep queues
 * empty.
 */
 150 /*ARGSUSED*/
 151 static void
 152 turnstile_destructor(void *buf, void *cdrarg)
 153 {
 154         turnstile_t *ts = buf;
 155
 156         ASSERT(ts->ts_free == NULL);
 157         ASSERT(ts->ts_waiters == 0);
 158         ASSERT(ts->ts_inheritor == NULL);
 159         ASSERT(ts->ts_sleepq[0].sq_first == NULL);
 160         ASSERT(ts->ts_sleepq[1].sq_first == NULL);
 161 }
 162
/*
 * One-time initialization of the thread subsystem.  In order, it:
 *   - initializes reaplock and allocates/initializes the thread_free_lock
 *     array;
 *   - creates the thread, lwp and turnstile kmem caches;
 *   - initializes credentials and the resource-management facilities
 *     (rctl, cpucaps, zone, project, brand, kiconv, task, pool);
 *   - gives the current thread a turnstile;
 *   - validates default_stksize and lwp_default_stksize and creates the
 *     segkp_lwp and segkp_thread stack caches;
 *   - looks up the system scheduling class and assigns it to the current
 *     thread;
 *   - creates this CPU's idle thread and the thread_reaper daemon;
 *   - calls kmem_thread_init() and, when booted with RB_DEBUG,
 *     kdi_dvec_thravail().
 */
 163 void
 164 thread_init(void)
 165 {
 166         kthread_t *tp;
 167         extern char sys_name[];
 168         extern void idle();
 169         struct cpu *cpu = CPU;
 170         int i;
 171         kmutex_t *lp;
 172
 173         mutex_init(&reaplock, NULL, MUTEX_SPIN, (void *)ipltospl(DISP_LEVEL));
 174         thread_free_lock =
 175             kmem_alloc(sizeof (thread_free_lock_t) * THREAD_FREE_NUM, KM_SLEEP);
 176         for (i = 0; i < THREAD_FREE_NUM; i++) {
 177                 lp = &thread_free_lock[i].tf_lock;
 178                 mutex_init(lp, NULL, MUTEX_DEFAULT, NULL);
 179         }
 180
 181 #if defined(__i386) || defined(__amd64)
 182         thread_cache = kmem_cache_create("thread_cache", sizeof (kthread_t),
 183             PTR24_ALIGN, NULL, NULL, NULL, NULL, NULL, 0);
 184
 185         /*
 186          * "struct _klwp" includes a "struct pcb", which includes a
 187          * "struct fpu", which needs to be 64-byte aligned on amd64
 188          * (and even on i386) for xsave/xrstor.
 189          */
 190         lwp_cache = kmem_cache_create("lwp_cache", sizeof (klwp_t),
 191             64, NULL, NULL, NULL, NULL, NULL, 0);
 192 #else
 193         /*
 194          * Allocate thread structures from static_arena.  This prevents
 195          * issues where a thread tries to relocate its own thread
 196          * structure and touches it after the mapping has been suspended.
 197          */
 198         thread_cache = kmem_cache_create("thread_cache", sizeof (kthread_t),
 199             PTR24_ALIGN, NULL, NULL, NULL, NULL, static_arena, 0);
 200
 201         lwp_stk_cache_init();
 202
 203         lwp_cache = kmem_cache_create("lwp_cache", sizeof (klwp_t),
 204             0, NULL, NULL, NULL, NULL, NULL, 0);
 205 #endif
 206
 207         turnstile_cache = kmem_cache_create("turnstile_cache",
 208             sizeof (turnstile_t), 0,
 209             turnstile_constructor, turnstile_destructor, NULL, NULL, NULL, 0);
 210
 211         cred_init();
 212
 213         /*
 214          * Initialize various resource management facilities.
 215          */
 216         rctl_init();
 217         cpucaps_init();
 218         /*
 219          * Zone_init() should be called before project_init() so that project ID
 220          * for the first project is initialized correctly.
 221          */
 222         zone_init();
 223         project_init();
 224         brand_init();
 225         kiconv_init();
 226         task_init();
 227         pool_init();
 228
        /*
         * Give the current thread (t0, per the note further down) a
         * turnstile, as thread_create() does for each thread it creates.
         */
 229         curthread->t_ts = kmem_cache_alloc(turnstile_cache, KM_SLEEP);
 230
 231         /*
 232          * Originally, we had two parameters to set default stack
 233          * size: one for lwp's (lwp_default_stksize), and one for
 234          * kernel-only threads (DEFAULTSTKSZ, a.k.a. _defaultstksz).
 235          * Now we have a third parameter that overrides both if it is
 236          * set to a legal stack size, called default_stksize.
 237          */
 238
        /*
         * A legal default_stksize is a multiple of PAGESIZE within
         * [MIN_STKSIZE, MAX_STKSIZE].  Unset (0) or illegal values fall
         * back to DEFAULTSTKSZ; a legal value also replaces
         * lwp_default_stksize.
         */
 239         if (default_stksize == 0) {
 240                 default_stksize = DEFAULTSTKSZ;
 241         } else if (default_stksize % PAGESIZE != 0 ||
 242             default_stksize > MAX_STKSIZE ||
 243             default_stksize < MIN_STKSIZE) {
 244                 cmn_err(CE_WARN, "Illegal stack size. Using %d",
 245                     (int)DEFAULTSTKSZ);
 246                 default_stksize = DEFAULTSTKSZ;
 247         } else {
 248                 lwp_default_stksize = default_stksize;
 249         }
 250
        /*
         * Same validation for lwp_default_stksize; unset or illegal
         * values inherit the (now settled) default_stksize.
         */
 251         if (lwp_default_stksize == 0) {
 252                 lwp_default_stksize = default_stksize;
 253         } else if (lwp_default_stksize % PAGESIZE != 0 ||
 254             lwp_default_stksize > MAX_STKSIZE ||
 255             lwp_default_stksize < MIN_STKSIZE) {
 256                 cmn_err(CE_WARN, "Illegal stack size. Using %d",
 257                     default_stksize);
 258                 lwp_default_stksize = default_stksize;
 259         }
 260
 261         segkp_lwp = segkp_cache_init(segkp, lwp_cache_sz,
 262             lwp_default_stksize,
 263             (KPD_NOWAIT | KPD_HASREDZONE | KPD_LOCKED));
 264
 265         segkp_thread = segkp_cache_init(segkp, t_cache_sz,
 266             default_stksize, KPD_HASREDZONE | KPD_LOCKED | KPD_NO_ANON);
 267
 268         (void) getcid(sys_name, &syscid);
 269         curthread->t_cid = syscid;      /* current thread is t0 */
 270
 271         /*
 272          * Set up the first CPU's idle thread.
 273          * It runs whenever the CPU has nothing worthwhile to do.
 274          */
 275         tp = thread_create(NULL, 0, idle, NULL, 0, &p0, TS_STOPPED, -1);
 276         cpu->cpu_idle_thread = tp;
 277         tp->t_preempt = 1;
 278         tp->t_disp_queue = cpu->cpu_disp;
 279         ASSERT(tp->t_disp_queue != NULL);
 280         tp->t_bound_cpu = cpu;
 281         tp->t_affinitycnt = 1;
 282
 283         /*
 284          * Registering a thread in the callback table is usually
 285          * done in the initialization code of the thread. In this
 286          * case, we do it right after thread creation to avoid
 287          * blocking idle thread while registering itself. It also
 288          * avoids the possibility of reregistration in case a CPU
 289          * restarts its idle thread.
 290          */
 291         CALLB_CPR_INIT_SAFE(tp, "idle");
 292
 293         /*
 294          * Create the thread_reaper daemon. From this point on, exited
 295          * threads will get reaped.
 296          */
 297         (void) thread_create(NULL, 0, (void (*)())thread_reaper,
 298             NULL, 0, &p0, TS_RUN, minclsyspri);
 299
 300         /*
 301          * Finish initializing the kernel memory allocator now that
 302          * thread_create() is available.
 303          */
 304         kmem_thread_init();
 305
 306         if (boothowto & RB_DEBUG)
 307                 kdi_dvec_thravail();
 308 }
 309
 310 /*
 311  * Create a thread.
 312  *
 313  * thread_create() blocks for memory if necessary.  It never fails.
 314  *
 315  * If stk is NULL, the thread is created at the base of the stack
 316  * and cannot be swapped.
 *
 * Parameters:
 *   stk      - caller-supplied stack, or NULL to take the stack from
 *              segkp and carve the kthread_t out of that same chunk
 *              (the thread is then flagged T_TALLOCSTK).
 *   stksize  - size of stk.  With a NULL stk, values below
 *              default_stksize are raised to default_stksize and other
 *              non-default sizes are rounded up to a PAGESIZE multiple.
 *   proc     - start routine, recorded in t_startpc.  When NULL the
 *              caller must initialize the stack itself.
 *   arg, len - passed to thread_load() together with proc.
 *   pp       - owning process: becomes t_procp, supplies p_lock for
 *              t_plockp and, when set, p_cred (a hold is taken).
 *   state    - initial state: TS_RUN, TS_ONPROC, TS_FREE or TS_STOPPED.
 *              Any other value panics.
 *   pri      - initial t_pri.
 *
 * Returns the new thread, already linked into the list of all threads
 * and holding a reference on project 0.
 317  */
 318 kthread_t *
 319 thread_create(
 320         caddr_t stk,
 321         size_t  stksize,
 322         void    (*proc)(),
 323         void    *arg,
 324         size_t  len,
 325         proc_t   *pp,
 326         int     state,
 327         pri_t   pri)
 328 {
 329         kthread_t *t;
 330         extern struct classfuncs sys_classfuncs;
 331         turnstile_t *ts;
 332
 333         /*
 334          * Every thread keeps a turnstile around in case it needs to block.
 335          * The only reason the turnstile is not simply part of the thread
 336          * structure is that we may have to break the association whenever
 337          * more than one thread blocks on a given synchronization object.
 338          * From a memory-management standpoint, turnstiles are like the
 339          * "attached mblks" that hang off dblks in the streams allocator.
 340          */
 341         ts = kmem_cache_alloc(turnstile_cache, KM_SLEEP);
 342
 343         if (stk == NULL) {
 344                 /*
 345                  * alloc both thread and stack in segkp chunk
 346                  */
 347
 348                 if (stksize < default_stksize)
 349                         stksize = default_stksize;
 350
                /*
                 * Default-sized stacks come from the segkp_thread cache
                 * set up in thread_init(); other sizes are allocated
                 * directly from segkp.
                 */
 351                 if (stksize == default_stksize) {
 352                         stk = (caddr_t)segkp_cache_get(segkp_thread);
 353                 } else {
 354                         stksize = roundup(stksize, PAGESIZE);
 355                         stk = (caddr_t)segkp_get(segkp, stksize,
 356                             (KPD_HASREDZONE | KPD_NO_ANON | KPD_LOCKED));
 357                 }
 358
 359                 ASSERT(stk != NULL);
 360
 361                 /*
 362                  * The machine-dependent mutex code may require that
 363                  * thread pointers (since they may be used for mutex owner
 364                  * fields) have certain alignment requirements.
 365                  * PTR24_ALIGN is the size of the alignment quanta.
 366                  * XXX - assumes stack grows toward low addresses.
 367                  */
 368                 if (stksize <= sizeof (kthread_t) + PTR24_ALIGN)
 369                         cmn_err(CE_PANIC, "thread_create: proposed stack size"
 370                             " too small to hold thread.");
 371 #ifdef STACK_GROWTH_DOWN
                /*
                 * Place the kthread_t at the high end of the chunk; the
                 * usable stack is what remains below it.
                 */
 372                 stksize -= SA(sizeof (kthread_t) + PTR24_ALIGN - 1);
 373                 stksize &= -PTR24_ALIGN;        /* make thread aligned */
 374                 t = (kthread_t *)(stk + stksize);
 375                 bzero(t, sizeof (kthread_t));
 376                 if (audit_active)
 377                         audit_thread_create(t);
 378                 t->t_stk = stk + stksize;
 379                 t->t_stkbase = stk;
 380 #else   /* stack grows to larger addresses */
                /*
                 * NOTE(review): unlike the other two allocation paths,
                 * this branch does not call audit_thread_create() --
                 * confirm whether that is intentional.
                 */
 381                 stksize -= SA(sizeof (kthread_t));
 382                 t = (kthread_t *)(stk);
 383                 bzero(t, sizeof (kthread_t));
 384                 t->t_stk = stk + sizeof (kthread_t);
 385                 t->t_stkbase = stk + stksize + sizeof (kthread_t);
 386 #endif  /* STACK_GROWTH_DOWN */
 387                 t->t_flag |= T_TALLOCSTK;
 388                 t->t_swap = stk;
 389         } else {
                /*
                 * Caller owns the stack; only the thread structure is
                 * allocated here.
                 */
 390                 t = kmem_cache_alloc(thread_cache, KM_SLEEP);
 391                 bzero(t, sizeof (kthread_t));
 392                 ASSERT(((uintptr_t)t & (PTR24_ALIGN - 1)) == 0);
 393                 if (audit_active)
 394                         audit_thread_create(t);
 395                 /*
 396                  * Initialize t_stk to the kernel stack pointer to use
 397                  * upon entry to the kernel
 398                  */
 399 #ifdef STACK_GROWTH_DOWN
 400                 t->t_stk = stk + stksize;
 401                 t->t_stkbase = stk;
 402 #else
 403                 t->t_stk = stk;                 /* 3b2-like */
 404                 t->t_stkbase = stk + stksize;
 405 #endif /* STACK_GROWTH_DOWN */
 406         }
 407
 408         if (kmem_stackinfo != 0) {
 409                 stkinfo_begin(t);
 410         }
 411
 412         t->t_ts = ts;
 413
 414         /*
 415          * p_cred could be NULL if thread_create is called before cred_init
 416          * is called in main.
 417          */
 418         mutex_enter(&pp->p_crlock);
 419         if (pp->p_cred)
 420                 crhold(t->t_cred = pp->p_cred);
 421         mutex_exit(&pp->p_crlock);
 422         t->t_start = gethrestime_sec();
 423         t->t_startpc = proc;
 424         t->t_procp = pp;
 425         t->t_clfuncs = &sys_classfuncs.thread;
 426         t->t_cid = syscid;
 427         t->t_pri = pri;
 428         t->t_schedflag = 0;
 429         t->t_bind_cpu = PBIND_NONE;
 430         t->t_bindflag = (uchar_t)default_binding_mode;
 431         t->t_bind_pset = PS_NONE;
 432         t->t_plockp = &pp->p_lock;
 433         t->t_copyops = NULL;
 434         t->t_taskq = NULL;
 435         t->t_anttime = 0;
 436         t->t_hatdepth = 0;
 437
 438         t->t_dtrace_vtime = 1;  /* assure vtimestamp is always non-zero */
 439
 440         CPU_STATS_ADDQ(CPU, sys, nthreads, 1);
 441 #ifndef NPROBE
 442         /* Kernel probe */
 443         tnf_thread_create(t);
 444 #endif /* NPROBE */
 445         LOCK_INIT_CLEAR(&t->t_lock);
 446
 447         /*
 448          * Callers who give us a NULL proc must do their own
 449          * stack initialization.  e.g. lwp_create()
 450          */
 451         if (proc != NULL) {
 452                 t->t_stk = thread_stk_init(t->t_stk);
 453                 thread_load(t, proc, arg, len);
 454         }
 455
 456         /*
 457          * Put a hold on project0. If this thread is actually in a
 458          * different project, then t_proj will be changed later in
 459          * lwp_create().  All kernel-only threads must be in project 0.
 460          */
 461         t->t_proj = project_hold(proj0p);
 462
 463         lgrp_affinity_init(&t->t_lgrp_affinity);
 464
        /*
         * pidlock is held from here until the thread's state has been
         * set at the bottom of the function.
         */
 465         mutex_enter(&pidlock);
 466         nthread++;
 467         t->t_did = next_t_id++;
 468         t->t_prev = curthread->t_prev;
 469         t->t_next = curthread;
 470
 471         /*
 472          * Add the thread to the list of all threads, and initialize
 473          * its t_cpu pointer.  We need to block preemption since
 474          * cpu_offline walks the thread list looking for threads
 475          * with t_cpu pointing to the CPU being offlined.  We want
 476          * to make sure that the list is consistent and that if t_cpu
 477          * is set, the thread is on the list.
 478          */
 479         kpreempt_disable();
 480         curthread->t_prev->t_next = t;
 481         curthread->t_prev = t;
 482
 483         /*
 484          * Threads should never have a NULL t_cpu pointer so assign it
 485          * here.  If the thread is being created with state TS_RUN a
 486          * better CPU may be chosen when it is placed on the run queue.
 487          *
 488          * We need to keep kernel preemption disabled when setting all
 489          * three fields to keep them in sync.  Also, always create in
 490          * the default partition since that's where kernel threads go
 491          * (if this isn't a kernel thread, t_cpupart will be changed
 492          * in lwp_create before setting the thread runnable).
 493          */
 494         t->t_cpupart = &cp_default;
 495
 496         /*
 497          * For now, affiliate this thread with the root lgroup.
 498          * Since the kernel does not (presently) allocate its memory
 499          * in a locality aware fashion, the root is an appropriate home.
 500          * If this thread is later associated with an lwp, it will have
 501          * its lgroup re-assigned at that time.
 502          */
 503         lgrp_move_thread(t, &cp_default.cp_lgrploads[LGRP_ROOTID], 1);
 504
 505         /*
 506          * Inherit the current cpu.  If this cpu isn't part of the chosen
 507          * lgroup, a new cpu will be chosen by cpu_choose when the thread
 508          * is ready to run.
 509          */
 510         if (CPU->cpu_part == &cp_default)
 511                 t->t_cpu = CPU;
 512         else
 513                 t->t_cpu = disp_lowpri_cpu(cp_default.cp_cpulist, t->t_lpl,
 514                     t->t_pri, NULL);
 515
 516         t->t_disp_queue = t->t_cpu->cpu_disp;
 517         kpreempt_enable();
 518
 519         /*
 520          * Initialize thread state and the dispatcher lock pointer.
 521          * Need to hold onto pidlock to block allthreads walkers until
 522          * the state is set.
 523          */
 524         switch (state) {
 525         case TS_RUN:
 526                 curthread->t_oldspl = splhigh();        /* get dispatcher spl */
 527                 THREAD_SET_STATE(t, TS_STOPPED, &transition_lock);
 528                 CL_SETRUN(t);
 529                 thread_unlock(t);
 530                 break;
 531
 532         case TS_ONPROC:
 533                 THREAD_ONPROC(t, t->t_cpu);
 534                 break;
 535
 536         case TS_FREE:
 537                 /*
 538                  * Free state will be used for intr threads.
 539                  * The interrupt routine must set the thread dispatcher
 540                  * lock pointer (t_lockp) if starting on a CPU
 541                  * other than the current one.
 542                  */
 543                 THREAD_FREEINTR(t, CPU);
 544                 break;
 545
 546         case TS_STOPPED:
 547                 THREAD_SET_STATE(t, TS_STOPPED, &stop_lock);
 548                 break;
 549
 550         default:                        /* TS_SLEEP, TS_ZOMB or TS_TRANS */
 551                 cmn_err(CE_PANIC, "thread_create: invalid state %d", state);
 552         }
 553         mutex_exit(&pidlock);
 554         return (t);
 555 }
 556
 557 /*
 558  * Move thread to project0 and take care of project reference counters.
 559  */
 560 void
 561 thread_rele(kthread_t *t)
 562 {
 563         kproject_t *kpj;
 564
 565         thread_lock(t);
 566
 567         ASSERT(t == curthread || t->t_state == TS_FREE || t->t_procp == &p0);
 568         kpj = ttoproj(t);
 569         t->t_proj = proj0p;
 570
 571         thread_unlock(t);
 572
 573         if (kpj != proj0p) {
 574                 project_rele(kpj);
 575                 (void) project_hold(proj0p);
 576         }
 577 }
 578
/*
 * Terminate the calling thread (curthread).
 *
 * Panics if the thread is flagged TP_ZTHREAD.  Otherwise tears down
 * per-thread state (TSD, performance counters, door, tnf probe data),
 * moves the thread to project 0 via thread_rele(), unlinks it from the
 * list of all threads under pidlock while waking any thread_join()
 * waiters, runs the ctx/pctx exit hooks, marks the thread TS_ZOMB and
 * gives up the CPU through swtch_from_zombie().  Does not return.
 */
 579 void
 580 thread_exit(void)
 581 {
 582         kthread_t *t = curthread;
 583
 584         if ((t->t_proc_flag & TP_ZTHREAD) != 0)
 585                 cmn_err(CE_PANIC, "thread_exit: zthread_exit() not called");
 586
 587         tsd_exit();             /* Clean up this thread's TSD */
 588
 589         kcpc_passivate();       /* clean up performance counter state */
 590
 591         /*
 592          * No kernel thread should have called poll() without arranging
 593          * calling pollcleanup() here.
 594          */
 595         ASSERT(t->t_pollstate == NULL);
 596         ASSERT(t->t_schedctl == NULL);
 597         if (t->t_door)
 598                 door_slam();    /* in case thread did an upcall */
 599
 600 #ifndef NPROBE
 601         /* Kernel probe */
 602         if (t->t_tnf_tpdp)
 603                 tnf_thread_exit();
 604 #endif /* NPROBE */
 605
 606         thread_rele(t);
        /*
         * NOTE(review): presumably this keeps the thread non-preemptible
         * for the remainder of its exit -- confirm against the
         * dispatcher's use of t_preempt.
         */
 607         t->t_preempt++;
 608
 609         /*
 610          * remove thread from the all threads list so that
 611          * death-row can use the same pointers.
 612          */
 613         mutex_enter(&pidlock);
 614         t->t_next->t_prev = t->t_prev;
 615         t->t_prev->t_next = t->t_next;
 616         ASSERT(allthreads != t);        /* t0 never exits */
 617         cv_broadcast(&t->t_joincv);     /* wake up anyone in thread_join */
 618         mutex_exit(&pidlock);
 619
 620         if (t->t_ctx != NULL)
 621                 exitctx(t);
 622         if (t->t_procp->p_pctx != NULL)
 623                 exitpctx(t->t_procp);
 624
 625         if (kmem_stackinfo != 0) {
 626                 stkinfo_end(t);
 627         }
 628
 629         t->t_state = TS_ZOMB;   /* set zombie thread */
 630
 631         swtch_from_zombie();    /* give up the CPU */
 632         /* NOTREACHED */
 633 }
 634
 635 /*
 636  * Check to see if the specified thread is active (defined as being on
 637  * the thread list).  This is certainly a slow way to do this; if there's
 638  * ever a reason to speed it up, we could maintain a hash table of active
 639  * threads indexed by their t_did.
 640  */
 641 static kthread_t *
 642 did_to_thread(kt_did_t tid)
 643 {
 644         kthread_t *t;
 645
 646         ASSERT(MUTEX_HELD(&pidlock));
 647         for (t = curthread->t_next; t != curthread; t = t->t_next) {
 648                 if (t->t_did == tid)
 649                         break;
 650         }
 651         if (t->t_did == tid)
 652                 return (t);
 653         else
 654                 return (NULL);
 655 }
 656
 657 /*
 658  * Wait for specified thread to exit.  Returns immediately if the thread
 659  * could not be found, meaning that it has either already exited or never
 660  * existed.
 661  */
 662 void
 663 thread_join(kt_did_t tid)
 664 {
 665         kthread_t *t;
 666
 667         ASSERT(tid != curthread->t_did);
 668         ASSERT(tid != t0.t_did);
 669
 670         mutex_enter(&pidlock);
 671         /*
 672          * Make sure we check that the thread is on the thread list
 673          * before blocking on it; otherwise we could end up blocking on
 674          * a cv that's already been freed.  In other words, don't cache
 675          * the thread pointer across calls to cv_wait.
 676          *
 677          * The choice of loop invariant means that whenever a thread
 678          * is taken off the allthreads list, a cv_broadcast must be
 679          * performed on that thread's t_joincv to wake up any waiters.
 680          * The broadcast doesn't have to happen right away, but it
 681          * shouldn't be postponed indefinitely (e.g., by doing it in
 682          * thread_free which may only be executed when the deathrow
 683          * queue is processed.
 684          */
 685         while (t = did_to_thread(tid))
 686                 cv_wait(&t->t_joincv, &pidlock);
 687         mutex_exit(&pidlock);
 688 }
 689
 690 void
 691 thread_free_prevent(kthread_t *t)
 692 {
 693         kmutex_t *lp;
 694
 695         lp = &thread_free_lock[THREAD_FREE_HASH(t)].tf_lock;
 696         mutex_enter(lp);
 697 }
 698
 699 void
 700 thread_free_allow(kthread_t *t)
 701 {
 702         kmutex_t *lp;
 703
 704         lp = &thread_free_lock[THREAD_FREE_HASH(t)].tf_lock;
 705         mutex_exit(lp);
 706 }
 707
 708 static void
 709 thread_free_barrier(kthread_t *t)
 710 {
 711         kmutex_t *lp;
 712
 713         lp = &thread_free_lock[THREAD_FREE_HASH(t)].tf_lock;
 714         mutex_enter(lp);
 715         mutex_exit(lp);
 716 }
 717
/*
 * Release everything still attached to a thread in TS_FREE state and then
 * the thread itself.  t must not be t0 and must have no door, schedctl or
 * poll state left (all asserted).
 *
 * Freed or dropped here: the cred hold, t_pdmsg, audit and tnf state,
 * scheduling-class data, t_rprof, lwp registers and lwp stack state, ctx
 * state, the turnstile, the active-fd list, the project hold (which must
 * be on project 0) and the lgroup affinity.  nthread is decremented
 * under pidlock.  Finally the stack (t_swap), the lwp and -- unless the
 * thread structure lives inside its own stack (T_TALLOCSTK) -- the
 * kthread_t are released.
 */
 718 void
 719 thread_free(kthread_t *t)
 720 {
        /*
         * Capture these up front: with T_TALLOCSTK the kthread_t is part
         * of the stack released by segkp_release() below, so t must not
         * be dereferenced after that call.
         */
 721         boolean_t allocstk = (t->t_flag & T_TALLOCSTK);
 722         klwp_t *lwp = t->t_lwp;
 723         caddr_t swap = t->t_swap;
 724
 725         ASSERT(t != &t0 && t->t_state == TS_FREE);
 726         ASSERT(t->t_door == NULL);
 727         ASSERT(t->t_schedctl == NULL);
 728         ASSERT(t->t_pollstate == NULL);
 729
 730         t->t_pri = 0;
 731         t->t_pc = 0;
 732         t->t_sp = 0;
 733         t->t_wchan0 = NULL;
 734         t->t_wchan = NULL;
 735         if (t->t_cred != NULL) {
 736                 crfree(t->t_cred);
 737                 t->t_cred = 0;
 738         }
 739         if (t->t_pdmsg) {
                /* t_pdmsg is freed as a NUL-terminated string. */
 740                 kmem_free(t->t_pdmsg, strlen(t->t_pdmsg) + 1);
 741                 t->t_pdmsg = NULL;
 742         }
 743         if (audit_active)
 744                 audit_thread_free(t);
 745 #ifndef NPROBE
 746         if (t->t_tnf_tpdp)
 747                 tnf_thread_free(t);
 748 #endif /* NPROBE */
 749         if (t->t_cldata) {
 750                 CL_EXITCLASS(t->t_cid, (caddr_t *)t->t_cldata);
 751         }
 752         if (t->t_rprof != NULL) {
 753                 kmem_free(t->t_rprof, sizeof (*t->t_rprof));
 754                 t->t_rprof = NULL;
 755         }
 756         t->t_lockp = NULL;      /* nothing should try to lock this thread now */
 757         if (lwp)
 758                 lwp_freeregs(lwp, 0);
 759         if (t->t_ctx)
 760                 freectx(t, 0);
 761         t->t_stk = NULL;
 762         if (lwp)
 763                 lwp_stk_fini(lwp);
 764         lock_clear(&t->t_lock);
 765
 766         if (t->t_ts->ts_waiters > 0)
 767                 panic("thread_free: turnstile still active");
 768
 769         kmem_cache_free(turnstile_cache, t->t_ts);
 770
 771         free_afd(&t->t_activefd);
 772
 773         /*
 774          * Barrier for the tick accounting code.  The tick accounting code
 775          * holds this lock to keep the thread from going away while it's
 776          * looking at it.
 777          */
 778         thread_free_barrier(t);
 779
        /* Drop the project 0 hold held on behalf of this thread. */
 780         ASSERT(ttoproj(t) == proj0p);
 781         project_rele(ttoproj(t));
 782
 783         lgrp_affinity_free(&t->t_lgrp_affinity);
 784
 785         mutex_enter(&pidlock);
 786         nthread--;
 787         mutex_exit(&pidlock);
 788
 789         /*
 790          * Free thread, lwp and stack.  This needs to be done carefully, since
 791          * if T_TALLOCSTK is set, the thread is part of the stack.
 792          */
 793         t->t_lwp = NULL;
 794         t->t_swap = NULL;
 795
 796         if (swap) {
 797                 segkp_release(segkp, swap);
 798         }
 799         if (lwp) {
 800                 kmem_cache_free(lwp_cache, lwp);
 801         }
 802         if (!allocstk) {
 803                 kmem_cache_free(thread_cache, t);
 804         }
 805 }
 806
 807 /*
 808  * Removes threads associated with the given zone from a deathrow queue.
 809  * tp is a pointer to the head of the deathrow queue, and countp is a
 810  * pointer to the current deathrow count.  Returns a linked list of
 811  * threads removed from the list.
 812  */
 813 static kthread_t *
 814 thread_zone_cleanup(kthread_t **tp, int *countp, zoneid_t zoneid)
 815 {
 816         kthread_t *tmp, *list = NULL;
 817         cred_t *cr;
 818
 819         ASSERT(MUTEX_HELD(&reaplock));
 820         while (*tp != NULL) {
 821                 if ((cr = (*tp)->t_cred) != NULL && crgetzoneid(cr) == zoneid) {
 822                         tmp = *tp;
 823                         *tp = tmp->t_forw;
 824                         tmp->t_forw = list;
 825                         list = tmp;
 826                         (*countp)--;
 827                 } else {
 828                         tp = &(*tp)->t_forw;
 829                 }
 830         }
 831         return (list);
 832 }
 833
 834 static void
 835 thread_reap_list(kthread_t *t)
 836 {
 837         kthread_t *next;
 838
 839         while (t != NULL) {
 840                 next = t->t_forw;
 841                 thread_free(t);
 842                 t = next;
 843         }
 844 }
 845
 846 /* ARGSUSED */
 847 static void
 848 thread_zone_destroy(zoneid_t zoneid, void *unused)
 849 {
 850         kthread_t *t, *l;
 851
 852         mutex_enter(&reaplock);
 853         /*
 854          * Pull threads and lwps associated with zone off deathrow lists.
 855          */
 856         t = thread_zone_cleanup(&thread_deathrow, &thread_reapcnt, zoneid);
 857         l = thread_zone_cleanup(&lwp_deathrow, &lwp_reapcnt, zoneid);
 858         mutex_exit(&reaplock);
 859
 860         /*
 861          * Guard against race condition in mutex_owner_running:
 862          *      thread=owner(mutex)
 863          *      <interrupt>
 864          *                              thread exits mutex
 865          *                              thread exits
 866          *                              thread reaped
 867          *                              thread struct freed
 868          * cpu = thread->t_cpu <- BAD POINTER DEREFERENCE.
 869          * A cross call to all cpus will cause the interrupt handler
 870          * to reset the PC if it is in mutex_owner_running, refreshing
 871          * stale thread pointers.
 872          */
 873         mutex_sync();   /* sync with mutex code */
 874
 875         /*
 876          * Reap threads
 877          */
 878         thread_reap_list(t);
 879
 880         /*
 881          * Reap lwps
 882          */
 883         thread_reap_list(l);
 884 }
 885
 886 /*
 887  * cleanup zombie threads that are on deathrow.
 888  */
 889 void
 890 thread_reaper()
 891 {
 892         kthread_t *t, *l;
 893         callb_cpr_t cprinfo;
 894
 895         /*
 896          * Register callback to clean up threads when zone is destroyed.
 897          */
 898         zone_key_create(&zone_thread_key, NULL, NULL, thread_zone_destroy);
 899
 900         CALLB_CPR_INIT(&cprinfo, &reaplock, callb_generic_cpr, "t_reaper");
 901         for (;;) {
 902                 mutex_enter(&reaplock);
 903                 while (thread_deathrow == NULL && lwp_deathrow == NULL) {
 904                         CALLB_CPR_SAFE_BEGIN(&cprinfo);
 905                         cv_wait(&reaper_cv, &reaplock);
 906                         CALLB_CPR_SAFE_END(&cprinfo, &reaplock);
 907                 }
 908                 /*
 909                  * mutex_sync() needs to be called when reaping, but
 910                  * not too often.  We limit reaping rate to once
 911                  * per second.  Reaplimit is max rate at which threads can
 912                  * be freed. Does not impact thread destruction/creation.
 913                  */
 914                 t = thread_deathrow;
 915                 l = lwp_deathrow;
 916                 thread_deathrow = NULL;
 917                 lwp_deathrow = NULL;
 918                 thread_reapcnt = 0;
 919                 lwp_reapcnt = 0;
 920                 mutex_exit(&reaplock);
 921
 922                 /*
 923                  * Guard against race condition in mutex_owner_running:
 924                  *      thread=owner(mutex)
 925                  *      <interrupt>
 926                  *                              thread exits mutex
 927                  *                              thread exits
 928                  *                              thread reaped
 929                  *                              thread struct freed
 930                  * cpu = thread->t_cpu <- BAD POINTER DEREFERENCE.
 931                  * A cross call to all cpus will cause the interrupt handler
 932                  * to reset the PC if it is in mutex_owner_running, refreshing
 933                  * stale thread pointers.
 934                  */
 935                 mutex_sync();   /* sync with mutex code */
 936                 /*
 937                  * Reap threads
 938                  */
 939                 thread_reap_list(t);
 940
 941                 /*
 942                  * Reap lwps
 943                  */
 944                 thread_reap_list(l);
 945                 ddi_sleep(1);
 946         }
 947 }
 948
 949 /*
 950  * This is called by lwpcreate, etc.() to put a lwp_deathrow thread onto
 951  * thread_deathrow. The thread's state is changed already TS_FREE to indicate
 952  * that is reapable. The thread already holds the reaplock, and was already
 953  * freed.
 954  */
 955 void
 956 reapq_move_lq_to_tq(kthread_t *t)
 957 {
 958         ASSERT(t->t_state == TS_FREE);
 959         ASSERT(MUTEX_HELD(&reaplock));
 960         t->t_forw = thread_deathrow;
 961         thread_deathrow = t;
 962         thread_reapcnt++;
 963         if (lwp_reapcnt + thread_reapcnt > reaplimit)
 964                 cv_signal(&reaper_cv);  /* wake the reaper */
 965 }
 966
 967 /*
 968  * This is called by resume() to put a zombie thread onto deathrow.
 969  * The thread's state is changed to TS_FREE to indicate that is reapable.
 970  * This is called from the idle thread so it must not block - just spin.
 971  */
 972 void
 973 reapq_add(kthread_t *t)
 974 {
 975         mutex_enter(&reaplock);
 976
 977         /*
 978          * lwp_deathrow contains threads with lwp linkage and
 979          * swappable thread stacks which have the default stacksize.
 980          * These threads' lwps and stacks may be reused by lwp_create().
 981          *
 982          * Anything else goes on thread_deathrow(), where it will eventually
 983          * be thread_free()d.
 984          */
 985         if (t->t_flag & T_LWPREUSE) {
 986                 ASSERT(ttolwp(t) != NULL);
 987                 t->t_forw = lwp_deathrow;
 988                 lwp_deathrow = t;
 989                 lwp_reapcnt++;
 990         } else {
 991                 t->t_forw = thread_deathrow;
 992                 thread_deathrow = t;
 993                 thread_reapcnt++;
 994         }
 995         if (lwp_reapcnt + thread_reapcnt > reaplimit)
 996                 cv_signal(&reaper_cv);  /* wake the reaper */
 997         t->t_state = TS_FREE;
 998         lock_clear(&t->t_lock);
 999
1000         /*
1001          * Before we return, we need to grab and drop the thread lock for
1002          * the dead thread.  At this point, the current thread is the idle
1003          * thread, and the dead thread's CPU lock points to the current
1004          * CPU -- and we must grab and drop the lock to synchronize with
1005          * a racing thread walking a blocking chain that the zombie thread
1006          * was recently in.  By this point, that blocking chain is (by
1007          * definition) stale:  the dead thread is not holding any locks, and
1008          * is therefore not in any blocking chains -- but if we do not regrab
1009          * our lock before freeing the dead thread's data structures, the
1010          * thread walking the (stale) blocking chain will die on memory
1011          * corruption when it attempts to drop the dead thread's lock.  We
1012          * only need do this once because there is no way for the dead thread
1013          * to ever again be on a blocking chain:  once we have grabbed and
1014          * dropped the thread lock, we are guaranteed that anyone that could
1015          * have seen this thread in a blocking chain can no longer see it.
1016          */
1017         thread_lock(t);
1018         thread_unlock(t);
1019
1020         mutex_exit(&reaplock);
1021 }
1022
1023 /*
1024  * Install thread context ops for the current thread.
1025  */
1026 void
1027 installctx(
1028         kthread_t *t,
1029         void    *arg,
1030         void    (*save)(void *),
1031         void    (*restore)(void *),
1032         void    (*fork)(void *, void *),
1033         void    (*lwp_create)(void *, void *),
1034         void    (*exit)(void *),
1035         void    (*free)(void *, int))
1036 {
1037         struct ctxop *ctx;
1038
1039         ctx = kmem_alloc(sizeof (struct ctxop), KM_SLEEP);
1040         ctx->save_op = save;
1041         ctx->restore_op = restore;
1042         ctx->fork_op = fork;
1043         ctx->lwp_create_op = lwp_create;
1044         ctx->exit_op = exit;
1045         ctx->free_op = free;
1046         ctx->arg = arg;
1047         ctx->next = t->t_ctx;
1048         t->t_ctx = ctx;
1049 }
1050
1051 /*
1052  * Remove the thread context ops from a thread.
1053  */
1054 int
1055 removectx(
1056         kthread_t *t,
1057         void    *arg,
1058         void    (*save)(void *),
1059         void    (*restore)(void *),
1060         void    (*fork)(void *, void *),
1061         void    (*lwp_create)(void *, void *),
1062         void    (*exit)(void *),
1063         void    (*free)(void *, int))
1064 {
1065         struct ctxop *ctx, *prev_ctx;
1066
1067         /*
1068          * The incoming kthread_t (which is the thread for which the
1069          * context ops will be removed) should be one of the following:
1070          *
1071          * a) the current thread,
1072          *
1073          * b) a thread of a process that's being forked (SIDL),
1074          *
1075          * c) a thread that belongs to the same process as the current
1076          *    thread and for which the current thread is the agent thread,
1077          *
1078          * d) a thread that is TS_STOPPED which is indicative of it
1079          *    being (if curthread is not an agent) a thread being created
1080          *    as part of an lwp creation.
1081          */
1082         ASSERT(t == curthread || ttoproc(t)->p_stat == SIDL ||
1083             ttoproc(t)->p_agenttp == curthread || t->t_state == TS_STOPPED);
1084
1085         /*
1086          * Serialize modifications to t->t_ctx to prevent the agent thread
1087          * and the target thread from racing with each other during lwp exit.
1088          */
1089         mutex_enter(&t->t_ctx_lock);
1090         prev_ctx = NULL;
1091         kpreempt_disable();
1092         for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next) {
1093                 if (ctx->save_op == save && ctx->restore_op == restore &&
1094                     ctx->fork_op == fork && ctx->lwp_create_op == lwp_create &&
1095                     ctx->exit_op == exit && ctx->free_op == free &&
1096                     ctx->arg == arg) {
1097                         if (prev_ctx)
1098                                 prev_ctx->next = ctx->next;
1099                         else
1100                                 t->t_ctx = ctx->next;
1101                         mutex_exit(&t->t_ctx_lock);
1102                         if (ctx->free_op != NULL)
1103                                 (ctx->free_op)(ctx->arg, 0);
1104                         kmem_free(ctx, sizeof (struct ctxop));
1105                         kpreempt_enable();
1106                         return (1);
1107                 }
1108                 prev_ctx = ctx;
1109         }
1110         mutex_exit(&t->t_ctx_lock);
1111         kpreempt_enable();
1112
1113         return (0);
1114 }
1115
1116 void
1117 savectx(kthread_t *t)
1118 {
1119         struct ctxop *ctx;
1120
1121         ASSERT(t == curthread);
1122         for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next)
1123                 if (ctx->save_op != NULL)
1124                         (ctx->save_op)(ctx->arg);
1125 }
1126
1127 void
1128 restorectx(kthread_t *t)
1129 {
1130         struct ctxop *ctx;
1131
1132         ASSERT(t == curthread);
1133         for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next)
1134                 if (ctx->restore_op != NULL)
1135                         (ctx->restore_op)(ctx->arg);
1136 }
1137
1138 void
1139 forkctx(kthread_t *t, kthread_t *ct)
1140 {
1141         struct ctxop *ctx;
1142
1143         for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next)
1144                 if (ctx->fork_op != NULL)
1145                         (ctx->fork_op)(t, ct);
1146 }
1147
1148 /*
1149  * Note that this operator is only invoked via the _lwp_create
1150  * system call.  The system may have other reasons to create lwps
1151  * e.g. the agent lwp or the doors unreferenced lwp.
1152  */
1153 void
1154 lwp_createctx(kthread_t *t, kthread_t *ct)
1155 {
1156         struct ctxop *ctx;
1157
1158         for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next)
1159                 if (ctx->lwp_create_op != NULL)
1160                         (ctx->lwp_create_op)(t, ct);
1161 }
1162
1163 /*
1164  * exitctx is called from thread_exit() and lwp_exit() to perform any actions
1165  * needed when the thread/LWP leaves the processor for the last time. This
1166  * routine is not intended to deal with freeing memory; freectx() is used for
1167  * that purpose during thread_free(). This routine is provided to allow for
1168  * clean-up that can't wait until thread_free().
1169  */
1170 void
1171 exitctx(kthread_t *t)
1172 {
1173         struct ctxop *ctx;
1174
1175         for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next)
1176                 if (ctx->exit_op != NULL)
1177                         (ctx->exit_op)(t);
1178 }
1179
1180 /*
1181  * freectx is called from thread_free() and exec() to get
1182  * rid of old thread context ops.
1183  */
1184 void
1185 freectx(kthread_t *t, int isexec)
1186 {
1187         struct ctxop *ctx;
1188
1189         kpreempt_disable();
1190         while ((ctx = t->t_ctx) != NULL) {
1191                 t->t_ctx = ctx->next;
1192                 if (ctx->free_op != NULL)
1193                         (ctx->free_op)(ctx->arg, isexec);
1194                 kmem_free(ctx, sizeof (struct ctxop));
1195         }
1196         kpreempt_enable();
1197 }
1198
1199 /*
1200  * freectx_ctx is called from lwp_create() when lwp is reused from
1201  * lwp_deathrow and its thread structure is added to thread_deathrow.
1202  * The thread structure to which this ctx was attached may be already
1203  * freed by the thread reaper so free_op implementations shouldn't rely
1204  * on thread structure to which this ctx was attached still being around.
1205  */
1206 void
1207 freectx_ctx(struct ctxop *ctx)
1208 {
1209         struct ctxop *nctx;
1210
1211         ASSERT(ctx != NULL);
1212
1213         kpreempt_disable();
1214         do {
1215                 nctx = ctx->next;
1216                 if (ctx->free_op != NULL)
1217                         (ctx->free_op)(ctx->arg, 0);
1218                 kmem_free(ctx, sizeof (struct ctxop));
1219         } while ((ctx = nctx) != NULL);
1220         kpreempt_enable();
1221 }
1222
1223 /*
1224  * Set the thread running; arrange for it to be swapped in if necessary.
1225  */
1226 void
1227 setrun_locked(kthread_t *t)
1228 {
1229         ASSERT(THREAD_LOCK_HELD(t));
1230         if (t->t_state == TS_SLEEP) {
1231                 /*
1232                  * Take off sleep queue.
1233                  */
1234                 SOBJ_UNSLEEP(t->t_sobj_ops, t);
1235         } else if (t->t_state & (TS_RUN | TS_ONPROC)) {
1236                 /*
1237                  * Already on dispatcher queue.
1238                  */
1239                 return;
1240         } else if (t->t_state == TS_WAIT) {
1241                 waitq_setrun(t);
1242         } else if (t->t_state == TS_STOPPED) {
1243                 /*
1244                  * All of the sending of SIGCONT (TC_XSTART) and /proc
1245                  * (TC_PSTART) and lwp_continue() (TC_CSTART) must have
1246                  * requested that the thread be run.
1247                  * Just calling setrun() is not sufficient to set a stopped
1248                  * thread running.  TP_TXSTART is always set if the thread
1249                  * is not stopped by a jobcontrol stop signal.
1250                  * TP_TPSTART is always set if /proc is not controlling it.
1251                  * TP_TCSTART is always set if lwp_suspend() didn't stop it.
1252                  * The thread won't be stopped unless one of these
1253                  * three mechanisms did it.
1254                  *
1255                  * These flags must be set before calling setrun_locked(t).
1256                  * They can't be passed as arguments because the streams
1257                  * code calls setrun() indirectly and the mechanism for
1258                  * doing so admits only one argument.  Note that the
1259                  * thread must be locked in order to change t_schedflags.
1260                  */
1261                 if ((t->t_schedflag & TS_ALLSTART) != TS_ALLSTART)
1262                         return;
1263                 /*
1264                  * Process is no longer stopped (a thread is running).
1265                  */
1266                 t->t_whystop = 0;
1267                 t->t_whatstop = 0;
1268                 /*
1269                  * Strictly speaking, we do not have to clear these
1270                  * flags here; they are cleared on entry to stop().
1271                  * However, they are confusing when doing kernel
1272                  * debugging or when they are revealed by ps(1).
1273                  */
1274                 t->t_schedflag &= ~TS_ALLSTART;
1275                 THREAD_TRANSITION(t);   /* drop stopped-thread lock */
1276                 ASSERT(t->t_lockp == &transition_lock);
1277                 ASSERT(t->t_wchan0 == NULL && t->t_wchan == NULL);
1278                 /*
1279                  * Let the class put the process on the dispatcher queue.
1280                  */
1281                 CL_SETRUN(t);
1282         }
1283 }
1284
/*
 * Locking wrapper around setrun_locked(): take t's thread lock, let
 * setrun_locked() act on the thread's current state, then drop the lock.
 */
void
setrun(kthread_t *t)
{
        thread_lock(t);
        setrun_locked(t);
        thread_unlock(t);
}
1292
1293 /*
1294  * Unpin an interrupted thread.
1295  *      When an interrupt occurs, the interrupt is handled on the stack
1296  *      of an interrupt thread, taken from a pool linked to the CPU structure.
1297  *
1298  *      When swtch() is switching away from an interrupt thread because it
1299  *      blocked or was preempted, this routine is called to complete the
1300  *      saving of the interrupted thread state, and returns the interrupted
1301  *      thread pointer so it may be resumed.
1302  *
1303  *      Called by swtch() only at high spl.
1304  */
1305 kthread_t *
1306 thread_unpin()
1307 {
1308         kthread_t       *t = curthread; /* current thread */
1309         kthread_t       *itp;           /* interrupted thread */
1310         int             i;              /* interrupt level */
1311         extern int      intr_passivate();
1312
1313         ASSERT(t->t_intr != NULL);
1314
1315         itp = t->t_intr;                /* interrupted thread */
1316         t->t_intr = NULL;               /* clear interrupt ptr */
1317
1318         /*
1319          * Get state from interrupt thread for the one
1320          * it interrupted.
1321          */
1322
1323         i = intr_passivate(t, itp);
1324
1325         TRACE_5(TR_FAC_INTR, TR_INTR_PASSIVATE,
1326             "intr_passivate:level %d curthread %p (%T) ithread %p (%T)",
1327             i, t, t, itp, itp);
1328
1329         /*
1330          * Dissociate the current thread from the interrupted thread's LWP.
1331          */
1332         t->t_lwp = NULL;
1333
1334         /*
1335          * Interrupt handlers above the level that spinlocks block must
1336          * not block.
1337          */
1338 #if DEBUG
1339         if (i < 0 || i > LOCK_LEVEL)
1340                 cmn_err(CE_PANIC, "thread_unpin: ipl out of range %x", i);
1341 #endif
1342
1343         /*
1344          * Compute the CPU's base interrupt level based on the active
1345          * interrupts.
1346          */
1347         ASSERT(CPU->cpu_intr_actv & (1 << i));
1348         set_base_spl();
1349
1350         return (itp);
1351 }
1352
1353 /*
1354  * Create and initialize an interrupt thread.
1355  *      Returns non-zero on error.
1356  *      Called at spl7() or better.
1357  */
1358 void
1359 thread_create_intr(struct cpu *cp)
1360 {
1361         kthread_t *tp;
1362
1363         tp = thread_create(NULL, 0,
1364             (void (*)())thread_create_intr, NULL, 0, &p0, TS_ONPROC, 0);
1365
1366         /*
1367          * Set the thread in the TS_FREE state.  The state will change
1368          * to TS_ONPROC only while the interrupt is active.  Think of these
1369          * as being on a private free list for the CPU.  Being TS_FREE keeps
1370          * inactive interrupt threads out of debugger thread lists.
1371          *
1372          * We cannot call thread_create with TS_FREE because of the current
1373          * checks there for ONPROC.  Fix this when thread_create takes flags.
1374          */
1375         THREAD_FREEINTR(tp, cp);
1376
1377         /*
1378          * Nobody should ever reference the credentials of an interrupt
1379          * thread so make it NULL to catch any such references.
1380          */
1381         tp->t_cred = NULL;
1382         tp->t_flag |= T_INTR_THREAD;
1383         tp->t_cpu = cp;
1384         tp->t_bound_cpu = cp;
1385         tp->t_disp_queue = cp->cpu_disp;
1386         tp->t_affinitycnt = 1;
1387         tp->t_preempt = 1;
1388
1389         /*
1390          * Don't make a user-requested binding on this thread so that
1391          * the processor can be offlined.
1392          */
1393         tp->t_bind_cpu = PBIND_NONE;    /* no USER-requested binding */
1394         tp->t_bind_pset = PS_NONE;
1395
1396 #if defined(__i386) || defined(__amd64)
1397         tp->t_stk -= STACK_ALIGN;
1398         *(tp->t_stk) = 0;               /* terminate intr thread stack */
1399 #endif
1400
1401         /*
1402          * Link onto CPU's interrupt pool.
1403          */
1404         tp->t_link = cp->cpu_intr_thread;
1405         cp->cpu_intr_thread = tp;
1406 }
1407
/*
 * TSD -- THREAD SPECIFIC DATA
 *
 * File-scope state shared by the tsd_*() routines.  Keys handed out to
 * callers are 1-based: key k uses slot k - 1 of tsd_destructor and of
 * each thread's ts_value array, and a key of 0 means "not allocated".
 */
/*
 * Taken by tsd_create(), tsd_destroy(), tsd_agent_set() and tsd_exit()
 * around their accesses to the destructor table and to tsd_list.
 */
static kmutex_t         tsd_mutex;
/* number of slots in tsd_destructor (doubled by tsd_create() when full) */
static uint_t           tsd_nkeys;
/* per-key destructor funcs; a NULL slot marks an unused key */
static void             (**tsd_destructor)(void *);
/* list of tsd_thread's, linked through ts_next/ts_prev */
static struct tsd_thread        *tsd_list;
1417
/*
 * Default destructor: does nothing.
 *      tsd_create() substitutes it when the caller passes no destructor,
 *      because a NULL entry in the destructor table means that the key
 *      is unused.
 */
/* ARGSUSED */
void
tsd_defaultdestructor(void *value)
{
}
1426
1427 /*
1428  * Create a key (index into per thread array)
1429  *      Locks out tsd_create, tsd_destroy, and tsd_exit
1430  *      May allocate memory with lock held
1431  */
1432 void
1433 tsd_create(uint_t *keyp, void (*destructor)(void *))
1434 {
1435         int     i;
1436         uint_t  nkeys;
1437
1438         /*
1439          * if key is allocated, do nothing
1440          */
1441         mutex_enter(&tsd_mutex);
1442         if (*keyp) {
1443                 mutex_exit(&tsd_mutex);
1444                 return;
1445         }
1446         /*
1447          * find an unused key
1448          */
1449         if (destructor == NULL)
1450                 destructor = tsd_defaultdestructor;
1451
1452         for (i = 0; i < tsd_nkeys; ++i)
1453                 if (tsd_destructor[i] == NULL)
1454                         break;
1455
1456         /*
1457          * if no unused keys, increase the size of the destructor array
1458          */
1459         if (i == tsd_nkeys) {
1460                 if ((nkeys = (tsd_nkeys << 1)) == 0)
1461                         nkeys = 1;
1462                 tsd_destructor =
1463                     (void (**)(void *))tsd_realloc((void *)tsd_destructor,
1464                     (size_t)(tsd_nkeys * sizeof (void (*)(void *))),
1465                     (size_t)(nkeys * sizeof (void (*)(void *))));
1466                 tsd_nkeys = nkeys;
1467         }
1468
1469         /*
1470          * allocate the next available unused key
1471          */
1472         tsd_destructor[i] = destructor;
1473         *keyp = i + 1;
1474         mutex_exit(&tsd_mutex);
1475 }
1476
1477 /*
1478  * Destroy a key -- this is for unloadable modules
1479  *
1480  * Assumes that the caller is preventing tsd_set and tsd_get
1481  * Locks out tsd_create, tsd_destroy, and tsd_exit
1482  * May free memory with lock held
1483  */
1484 void
1485 tsd_destroy(uint_t *keyp)
1486 {
1487         uint_t key;
1488         struct tsd_thread *tsd;
1489
1490         /*
1491          * protect the key namespace and our destructor lists
1492          */
1493         mutex_enter(&tsd_mutex);
1494         key = *keyp;
1495         *keyp = 0;
1496
1497         ASSERT(key <= tsd_nkeys);
1498
1499         /*
1500          * if the key is valid
1501          */
1502         if (key != 0) {
1503                 uint_t k = key - 1;
1504                 /*
1505                  * for every thread with TSD, call key's destructor
1506                  */
1507                 for (tsd = tsd_list; tsd; tsd = tsd->ts_next) {
1508                         /*
1509                          * no TSD for key in this thread
1510                          */
1511                         if (key > tsd->ts_nkeys)
1512                                 continue;
1513                         /*
1514                          * call destructor for key
1515                          */
1516                         if (tsd->ts_value[k] && tsd_destructor[k])
1517                                 (*tsd_destructor[k])(tsd->ts_value[k]);
1518                         /*
1519                          * reset value for key
1520                          */
1521                         tsd->ts_value[k] = NULL;
1522                 }
1523                 /*
1524                  * actually free the key (NULL destructor == unused)
1525                  */
1526                 tsd_destructor[k] = NULL;
1527         }
1528
1529         mutex_exit(&tsd_mutex);
1530 }
1531
1532 /*
1533  * Quickly return the per thread value that was stored with the specified key
1534  * Assumes the caller is protecting key from tsd_create and tsd_destroy
1535  */
1536 void *
1537 tsd_get(uint_t key)
1538 {
1539         return (tsd_agent_get(curthread, key));
1540 }
1541
1542 /*
1543  * Set a per thread value indexed with the specified key
1544  */
1545 int
1546 tsd_set(uint_t key, void *value)
1547 {
1548         return (tsd_agent_set(curthread, key, value));
1549 }
1550
1551 /*
1552  * Like tsd_get(), except that the agent lwp can get the tsd of
1553  * another thread in the same process (the agent thread only runs when the
1554  * process is completely stopped by /proc), or syslwp is creating a new lwp.
1555  */
1556 void *
1557 tsd_agent_get(kthread_t *t, uint_t key)
1558 {
1559         struct tsd_thread *tsd = t->t_tsd;
1560
1561         ASSERT(t == curthread ||
1562             ttoproc(t)->p_agenttp == curthread || t->t_state == TS_STOPPED);
1563
1564         if (key && tsd != NULL && key <= tsd->ts_nkeys)
1565                 return (tsd->ts_value[key - 1]);
1566         return (NULL);
1567 }
1568
1569 /*
1570  * Like tsd_set(), except that the agent lwp can set the tsd of
1571  * another thread in the same process, or syslwp can set the tsd
1572  * of a thread it's in the middle of creating.
1573  *
1574  * Assumes the caller is protecting key from tsd_create and tsd_destroy
1575  * May lock out tsd_destroy (and tsd_create), may allocate memory with
1576  * lock held
1577  */
1578 int
1579 tsd_agent_set(kthread_t *t, uint_t key, void *value)
1580 {
1581         struct tsd_thread *tsd = t->t_tsd;
1582
1583         ASSERT(t == curthread ||
1584             ttoproc(t)->p_agenttp == curthread || t->t_state == TS_STOPPED);
1585
1586         if (key == 0)
1587                 return (EINVAL);
1588         if (tsd == NULL)
1589                 tsd = t->t_tsd = kmem_zalloc(sizeof (*tsd), KM_SLEEP);
1590         if (key <= tsd->ts_nkeys) {
1591                 tsd->ts_value[key - 1] = value;
1592                 return (0);
1593         }
1594
1595         ASSERT(key <= tsd_nkeys);
1596
1597         /*
1598          * lock out tsd_destroy()
1599          */
1600         mutex_enter(&tsd_mutex);
1601         if (tsd->ts_nkeys == 0) {
1602                 /*
1603                  * Link onto list of threads with TSD
1604                  */
1605                 if ((tsd->ts_next = tsd_list) != NULL)
1606                         tsd_list->ts_prev = tsd;
1607                 tsd_list = tsd;
1608         }
1609
1610         /*
1611          * Allocate thread local storage and set the value for key
1612          */
1613         tsd->ts_value = tsd_realloc(tsd->ts_value,
1614             tsd->ts_nkeys * sizeof (void *),
1615             key * sizeof (void *));
1616         tsd->ts_nkeys = key;
1617         tsd->ts_value[key - 1] = value;
1618         mutex_exit(&tsd_mutex);
1619
1620         return (0);
1621 }
1622
1623
1624 /*
1625  * Return the per thread value that was stored with the specified key
1626  *      If necessary, create the key and the value
1627  *      Assumes the caller is protecting *keyp from tsd_destroy
1628  */
1629 void *
1630 tsd_getcreate(uint_t *keyp, void (*destroy)(void *), void *(*allocate)(void))
1631 {
1632         void *value;
1633         uint_t key = *keyp;
1634         struct tsd_thread *tsd = curthread->t_tsd;
1635
1636         if (tsd == NULL)
1637                 tsd = curthread->t_tsd = kmem_zalloc(sizeof (*tsd), KM_SLEEP);
1638         if (key && key <= tsd->ts_nkeys && (value = tsd->ts_value[key - 1]))
1639                 return (value);
1640         if (key == 0)
1641                 tsd_create(keyp, destroy);
1642         (void) tsd_set(*keyp, value = (*allocate)());
1643
1644         return (value);
1645 }
1646
1647 /*
1648  * Called from thread_exit() to run the destructor function for each tsd
1649  *      Locks out tsd_create and tsd_destroy
1650  *      Assumes that the destructor *DOES NOT* use tsd
1651  */
1652 void
1653 tsd_exit(void)
1654 {
1655         int i;
1656         struct tsd_thread *tsd = curthread->t_tsd;
1657
1658         if (tsd == NULL)
1659                 return;
1660
1661         if (tsd->ts_nkeys == 0) {
1662                 kmem_free(tsd, sizeof (*tsd));
1663                 curthread->t_tsd = NULL;
1664                 return;
1665         }
1666
1667         /*
1668          * lock out tsd_create and tsd_destroy, call
1669          * the destructor, and mark the value as destroyed.
1670          */
1671         mutex_enter(&tsd_mutex);
1672
1673         for (i = 0; i < tsd->ts_nkeys; i++) {
1674                 if (tsd->ts_value[i] && tsd_destructor[i])
1675                         (*tsd_destructor[i])(tsd->ts_value[i]);
1676                 tsd->ts_value[i] = NULL;
1677         }
1678
1679         /*
1680          * remove from linked list of threads with TSD
1681          */
1682         if (tsd->ts_next)
1683                 tsd->ts_next->ts_prev = tsd->ts_prev;
1684         if (tsd->ts_prev)
1685                 tsd->ts_prev->ts_next = tsd->ts_next;
1686         if (tsd_list == tsd)
1687                 tsd_list = tsd->ts_next;
1688
1689         mutex_exit(&tsd_mutex);
1690
1691         /*
1692          * free up the TSD
1693          */
1694         kmem_free(tsd->ts_value, tsd->ts_nkeys * sizeof (void *));
1695         kmem_free(tsd, sizeof (struct tsd_thread));
1696         curthread->t_tsd = NULL;
1697 }
1698
1699 /*
1700  * realloc
1701  */
1702 static void *
1703 tsd_realloc(void *old, size_t osize, size_t nsize)
1704 {
1705         void *new;
1706
1707         new = kmem_zalloc(nsize, KM_SLEEP);
1708         if (old) {
1709                 bcopy(old, new, osize);
1710                 kmem_free(old, osize);
1711         }
1712         return (new);
1713 }
1714
1715 /*
1716  * Return non-zero if an interrupt is being serviced.
1717  */
1718 int
1719 servicing_interrupt()
1720 {
1721         int onintr = 0;
1722
1723         /* Are we an interrupt thread */
1724         if (curthread->t_flag & T_INTR_THREAD)
1725                 return (1);
1726         /* Are we servicing a high level interrupt? */
1727         if (CPU_ON_INTR(CPU)) {
1728                 kpreempt_disable();
1729                 onintr = CPU_ON_INTR(CPU);
1730                 kpreempt_enable();
1731         }
1732         return (onintr);
1733 }
1734
1735
1736 /*
1737  * Change the dispatch priority of a thread in the system.
1738  * Used when raising or lowering a thread's priority.
1739  * (E.g., priority inheritance)
1740  *
1741  * Since threads are queued according to their priority,
1742  * we must check the thread's state to determine whether it
1743  * is on a queue somewhere. If it is, we've got to:
1744  *
1745  *      o Dequeue the thread.
1746  *      o Change its effective priority.
1747  *      o Enqueue the thread.
1748  *
1749  * Assumptions: The thread whose priority we wish to change
1750  * must be locked before we call thread_change_(e)pri().
1751  * The thread_change(e)pri() function doesn't drop the thread
1752  * lock--that must be done by its caller.
1753  */
1754 void
1755 thread_change_epri(kthread_t *t, pri_t disp_pri)
1756 {
1757         uint_t  state;
1758
1759         ASSERT(THREAD_LOCK_HELD(t));
1760
1761         /*
1762          * If the inherited priority hasn't actually changed,
1763          * just return.
1764          */
1765         if (t->t_epri == disp_pri)
1766                 return;
1767
1768         state = t->t_state;
1769
1770         /*
1771          * If it's not on a queue, change the priority with impunity.
1772          */
1773         if ((state & (TS_SLEEP | TS_RUN | TS_WAIT)) == 0) {
1774                 t->t_epri = disp_pri;
1775                 if (state == TS_ONPROC) {
1776                         cpu_t *cp = t->t_disp_queue->disp_cpu;
1777
1778                         if (t == cp->cpu_dispthread)
1779                                 cp->cpu_dispatch_pri = DISP_PRIO(t);
1780                 }
1781         } else if (state == TS_SLEEP) {
1782                 /*
1783                  * Take the thread out of its sleep queue.
1784                  * Change the inherited priority.
1785                  * Re-enqueue the thread.
1786                  * Each synchronization object exports a function
1787                  * to do this in an appropriate manner.
1788                  */
1789                 SOBJ_CHANGE_EPRI(t->t_sobj_ops, t, disp_pri);
1790         } else if (state == TS_WAIT) {
1791                 /*
1792                  * Re-enqueue a thread on the wait queue if its
1793                  * effective priority needs to change.
1794                  */
1795                 if (disp_pri != t->t_epri)
1796                         waitq_change_pri(t, disp_pri);
1797         } else {
1798                 /*
1799                  * The thread is on a run queue.
1800                  * Note: setbackdq() may not put the thread
1801                  * back on the same run queue where it originally
1802                  * resided.
1803                  */
1804                 (void) dispdeq(t);
1805                 t->t_epri = disp_pri;
1806                 setbackdq(t);
1807         }
1808         schedctl_set_cidpri(t);
1809 }
1810
1811 /*
1812  * Function: Change the t_pri field of a thread.
1813  * Side Effects: Adjust the thread ordering on a run queue
1814  *               or sleep queue, if necessary.
1815  * Returns: 1 if the thread was on a run queue, else 0.
1816  */
1817 int
1818 thread_change_pri(kthread_t *t, pri_t disp_pri, int front)
1819 {
1820         uint_t  state;
1821         int     on_rq = 0;
1822
1823         ASSERT(THREAD_LOCK_HELD(t));
1824
1825         state = t->t_state;
1826         THREAD_WILLCHANGE_PRI(t, disp_pri);
1827
1828         /*
1829          * If it's not on a queue, change the priority with impunity.
1830          */
1831         if ((state & (TS_SLEEP | TS_RUN | TS_WAIT)) == 0) {
1832                 t->t_pri = disp_pri;
1833
1834                 if (state == TS_ONPROC) {
1835                         cpu_t *cp = t->t_disp_queue->disp_cpu;
1836
1837                         if (t == cp->cpu_dispthread)
1838                                 cp->cpu_dispatch_pri = DISP_PRIO(t);
1839                 }
1840         } else if (state == TS_SLEEP) {
1841                 /*
1842                  * If the priority has changed, take the thread out of
1843                  * its sleep queue and change the priority.
1844                  * Re-enqueue the thread.
1845                  * Each synchronization object exports a function
1846                  * to do this in an appropriate manner.
1847                  */
1848                 if (disp_pri != t->t_pri)
1849                         SOBJ_CHANGE_PRI(t->t_sobj_ops, t, disp_pri);
1850         } else if (state == TS_WAIT) {
1851                 /*
1852                  * Re-enqueue a thread on the wait queue if its
1853                  * priority needs to change.
1854                  */
1855                 if (disp_pri != t->t_pri)
1856                         waitq_change_pri(t, disp_pri);
1857         } else {
1858                 /*
1859                  * The thread is on a run queue.
1860                  * Note: setbackdq() may not put the thread
1861                  * back on the same run queue where it originally
1862                  * resided.
1863                  *
1864                  * We still requeue the thread even if the priority
1865                  * is unchanged to preserve round-robin (and other)
1866                  * effects between threads of the same priority.
1867                  */
1868                 on_rq = dispdeq(t);
1869                 ASSERT(on_rq);
1870                 t->t_pri = disp_pri;
1871                 if (front) {
1872                         setfrontdq(t);
1873                 } else {
1874                         setbackdq(t);
1875                 }
1876         }
1877         schedctl_set_cidpri(t);
1878         return (on_rq);
1879 }
1880
1881 /*
1882  * Tunable kmem_stackinfo is set, fill the kernel thread stack with a
1883  * specific pattern.
1884  */
1885 static void
1886 stkinfo_begin(kthread_t *t)
1887 {
1888         caddr_t start;  /* stack start */
1889         caddr_t end;    /* stack end  */
1890         uint64_t *ptr;  /* pattern pointer */
1891
1892         /*
1893          * Stack grows up or down, see thread_create(),
1894          * compute stack memory area start and end (start < end).
1895          */
1896         if (t->t_stk > t->t_stkbase) {
1897                 /* stack grows down */
1898                 start = t->t_stkbase;
1899                 end = t->t_stk;
1900         } else {
1901                 /* stack grows up */
1902                 start = t->t_stk;
1903                 end = t->t_stkbase;
1904         }
1905
1906         /*
1907          * Stackinfo pattern size is 8 bytes. Ensure proper 8 bytes
1908          * alignement for start and end in stack area boundaries
1909          * (protection against corrupt t_stkbase/t_stk data).
1910          */
1911         if ((((uintptr_t)start) & 0x7) != 0) {
1912                 start = (caddr_t)((((uintptr_t)start) & (~0x7)) + 8);
1913         }
1914         end = (caddr_t)(((uintptr_t)end) & (~0x7));
1915
1916         if ((end <= start) || (end - start) > (1024 * 1024)) {
1917                 /* negative or stack size > 1 meg, assume bogus */
1918                 return;
1919         }
1920
1921         /* fill stack area with a pattern (instead of zeros) */
1922         ptr = (uint64_t *)((void *)start);
1923         while (ptr < (uint64_t *)((void *)end)) {
1924                 *ptr++ = KMEM_STKINFO_PATTERN;
1925         }
1926 }
1927
1928
1929 /*
1930  * Tunable kmem_stackinfo is set, create stackinfo log if doesn't already exist,
1931  * compute the percentage of kernel stack really used, and set in the log
1932  * if it's the latest highest percentage.
1933  */
1934 static void
1935 stkinfo_end(kthread_t *t)
1936 {
1937         caddr_t start;  /* stack start */
1938         caddr_t end;    /* stack end  */
1939         uint64_t *ptr;  /* pattern pointer */
1940         size_t stksz;   /* stack size */
1941         size_t smallest = 0;
1942         size_t percent = 0;
1943         uint_t index = 0;
1944         uint_t i;
1945         static size_t smallest_percent = (size_t)-1;
1946         static uint_t full = 0;
1947
1948         /* create the stackinfo log, if doesn't already exist */
1949         mutex_enter(&kmem_stkinfo_lock);
1950         if (kmem_stkinfo_log == NULL) {
1951                 kmem_stkinfo_log = (kmem_stkinfo_t *)
1952                     kmem_zalloc(KMEM_STKINFO_LOG_SIZE *
1953                     (sizeof (kmem_stkinfo_t)), KM_NOSLEEP);
1954                 if (kmem_stkinfo_log == NULL) {
1955                         mutex_exit(&kmem_stkinfo_lock);
1956                         return;
1957                 }
1958         }
1959         mutex_exit(&kmem_stkinfo_lock);
1960
1961         /*
1962          * Stack grows up or down, see thread_create(),
1963          * compute stack memory area start and end (start < end).
1964          */
1965         if (t->t_stk > t->t_stkbase) {
1966                 /* stack grows down */
1967                 start = t->t_stkbase;
1968                 end = t->t_stk;
1969         } else {
1970                 /* stack grows up */
1971                 start = t->t_stk;
1972                 end = t->t_stkbase;
1973         }
1974
1975         /* stack size as found in kthread_t */
1976         stksz = end - start;
1977
1978         /*
1979          * Stackinfo pattern size is 8 bytes. Ensure proper 8 bytes
1980          * alignement for start and end in stack area boundaries
1981          * (protection against corrupt t_stkbase/t_stk data).
1982          */
1983         if ((((uintptr_t)start) & 0x7) != 0) {
1984                 start = (caddr_t)((((uintptr_t)start) & (~0x7)) + 8);
1985         }
1986         end = (caddr_t)(((uintptr_t)end) & (~0x7));
1987
1988         if ((end <= start) || (end - start) > (1024 * 1024)) {
1989                 /* negative or stack size > 1 meg, assume bogus */
1990                 return;
1991         }
1992
1993         /* search until no pattern in the stack */
1994         if (t->t_stk > t->t_stkbase) {
1995                 /* stack grows down */
1996 #if defined(__i386) || defined(__amd64)
1997                 /*
1998                  * 6 longs are pushed on stack, see thread_load(). Skip
1999                  * them, so if kthread has never run, percent is zero.
2000                  * 8 bytes alignement is preserved for a 32 bit kernel,
2001                  * 6 x 4 = 24, 24 is a multiple of 8.
2002                  *
2003                  */
2004                 end -= (6 * sizeof (long));
2005 #endif
2006                 ptr = (uint64_t *)((void *)start);
2007                 while (ptr < (uint64_t *)((void *)end)) {
2008                         if (*ptr != KMEM_STKINFO_PATTERN) {
2009                                 percent = stkinfo_percent(end,
2010                                     start, (caddr_t)ptr);
2011                                 break;
2012                         }
2013                         ptr++;
2014                 }
2015         } else {
2016                 /* stack grows up */
2017                 ptr = (uint64_t *)((void *)end);
2018                 ptr--;
2019                 while (ptr >= (uint64_t *)((void *)start)) {
2020                         if (*ptr != KMEM_STKINFO_PATTERN) {
2021                                 percent = stkinfo_percent(start,
2022                                     end, (caddr_t)ptr);
2023                                 break;
2024                         }
2025                         ptr--;
2026                 }
2027         }
2028
2029         DTRACE_PROBE3(stack__usage, kthread_t *, t,
2030             size_t, stksz, size_t, percent);
2031
2032         if (percent == 0) {
2033                 return;
2034         }
2035
2036         mutex_enter(&kmem_stkinfo_lock);
2037         if (full == KMEM_STKINFO_LOG_SIZE && percent < smallest_percent) {
2038                 /*
2039                  * The log is full and already contains the highest values
2040                  */
2041                 mutex_exit(&kmem_stkinfo_lock);
2042                 return;
2043         }
2044
2045         /* keep a log of the highest used stack */
2046         for (i = 0; i < KMEM_STKINFO_LOG_SIZE; i++) {
2047                 if (kmem_stkinfo_log[i].percent == 0) {
2048                         index = i;
2049                         full++;
2050                         break;
2051                 }
2052                 if (smallest == 0) {
2053                         smallest = kmem_stkinfo_log[i].percent;
2054                         index = i;
2055                         continue;
2056                 }
2057                 if (kmem_stkinfo_log[i].percent < smallest) {
2058                         smallest = kmem_stkinfo_log[i].percent;
2059                         index = i;
2060                 }
2061         }
2062
2063         if (percent >= kmem_stkinfo_log[index].percent) {
2064                 kmem_stkinfo_log[index].kthread = (caddr_t)t;
2065                 kmem_stkinfo_log[index].t_startpc = (caddr_t)t->t_startpc;
2066                 kmem_stkinfo_log[index].start = start;
2067                 kmem_stkinfo_log[index].stksz = stksz;
2068                 kmem_stkinfo_log[index].percent = percent;
2069                 kmem_stkinfo_log[index].t_tid = t->t_tid;
2070                 kmem_stkinfo_log[index].cmd[0] = '\0';
2071                 if (t->t_tid != 0) {
2072                         stksz = strlen((t->t_procp)->p_user.u_comm);
2073                         if (stksz >= KMEM_STKINFO_STR_SIZE) {
2074                                 stksz = KMEM_STKINFO_STR_SIZE - 1;
2075                                 kmem_stkinfo_log[index].cmd[stksz] = '\0';
2076                         } else {
2077                                 stksz += 1;
2078                         }
2079                         (void) memcpy(kmem_stkinfo_log[index].cmd,
2080                             (t->t_procp)->p_user.u_comm, stksz);
2081                 }
2082                 if (percent < smallest_percent) {
2083                         smallest_percent = percent;
2084                 }
2085         }
2086         mutex_exit(&kmem_stkinfo_lock);
2087 }
2088
/*
 * Tunable kmem_stackinfo is set: compute the stack utilization percentage.
 *
 * t_stk is the end the stack starts from and t_stkbase the end it grows
 * towards; sp is the deepest address found in use.  The stack is treated
 * as growing down when t_stk > t_stkbase and as growing up otherwise.
 *
 * Returns 0 when sp lies beyond t_stk (outside the stack on the starting
 * side), 100 when sp lies beyond t_stkbase, and otherwise a value between
 * 1 and 100.
 */
static size_t
stkinfo_percent(caddr_t t_stk, caddr_t t_stkbase, caddr_t sp)
{
	int	grows_down = (t_stk > t_stkbase);
	caddr_t	lo = grows_down ? t_stkbase : t_stk;
	caddr_t	hi = grows_down ? t_stk : t_stkbase;
	size_t	used;
	size_t	total;
	size_t	pct;

	/* sp outside [lo, hi]: which side counts as "full" depends on growth */
	if (sp > hi)
		return (grows_down ? 0 : 100);
	if (sp < lo)
		return (grows_down ? 100 : 0);

	/* Both byte counts are inclusive of their end points. */
	used = (grows_down ? hi - sp : sp - lo) + 1;
	total = hi - lo + 1;

	/* Integer division truncates; the +1 keeps the result non-zero. */
	pct = ((100 * used) / total) + 1;
	return (pct > 100 ? 100 : pct);
}