 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

#include <sys/callo.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/cpuvar.h>
#include <sys/thread.h>
#include <sys/kmem_impl.h>
#include <sys/cmn_err.h>
#include <sys/callb.h>
#include <sys/debug.h>
#include <sys/vtrace.h>
#include <sys/sysmacros.h>

int callout_init_done;				/* useful during boot */

/*
 * Callout tables. See timeout(9F) for details.
 */
static int callout_threads;			/* callout normal threads */
static hrtime_t callout_debug_hrtime;		/* debugger entry time */
static int callout_chunk;			/* callout heap chunk size */
static int callout_min_reap;			/* callout minimum reap count */
static int callout_tolerance;			/* callout hires tolerance */
static callout_table_t *callout_boot_ct;	/* Boot CPU's callout tables */
static clock_t callout_max_ticks;		/* max interval */
static hrtime_t callout_longterm;		/* longterm nanoseconds */
static ulong_t callout_counter_low;		/* callout ID increment */
static ulong_t callout_table_bits;		/* number of table bits in ID */
static ulong_t callout_table_mask;		/* mask for the table bits */
static callout_cache_t *callout_caches;		/* linked list of caches */
#pragma align 64(callout_table)
static callout_table_t *callout_table;		/* global callout table array */

/*
 * We run 'realtime' callouts at PIL 1 (CY_LOW_LEVEL). For 'normal'
 * callouts, from PIL 10 (CY_LOCK_LEVEL) we dispatch the callout,
 * via taskq, to a thread that executes at PIL 0 - so we end up running
 * 'normal' callouts at PIL 0.
 */
static volatile int callout_realtime_level = CY_LOW_LEVEL;
static volatile int callout_normal_level = CY_LOCK_LEVEL;

static char *callout_kstat_names[] = {
	"callout_timeouts_pending",
	"callout_untimeouts_unexpired",
	"callout_untimeouts_executing",
	"callout_untimeouts_expired",
	"callout_expirations",
	"callout_allocations",
};

static hrtime_t	callout_heap_process(callout_table_t *, hrtime_t, int);

#define	CALLOUT_HASH_INSERT(hash, cp, cnext, cprev)	\
{							\
	callout_hash_t *hashp = &(hash);		\
							\
	cp->cprev = NULL;				\
	cp->cnext = hashp->ch_head;			\
	if (hashp->ch_head == NULL)			\
		hashp->ch_tail = cp;			\
	else						\
		cp->cnext->cprev = cp;			\
	hashp->ch_head = cp;				\
}

#define	CALLOUT_HASH_APPEND(hash, cp, cnext, cprev)	\
{							\
	callout_hash_t *hashp = &(hash);		\
							\
	cp->cnext = NULL;				\
	cp->cprev = hashp->ch_tail;			\
	if (hashp->ch_tail == NULL)			\
		hashp->ch_head = cp;			\
	else						\
		cp->cprev->cnext = cp;			\
	hashp->ch_tail = cp;				\
}

#define	CALLOUT_HASH_DELETE(hash, cp, cnext, cprev)	\
{							\
	callout_hash_t *hashp = &(hash);		\
							\
	if (cp->cnext == NULL)				\
		hashp->ch_tail = cp->cprev;		\
	else						\
		cp->cnext->cprev = cp->cprev;		\
	if (cp->cprev == NULL)				\
		hashp->ch_head = cp->cnext;		\
	else						\
		cp->cprev->cnext = cp->cnext;		\
}
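
/*
 * Illustrative sketch (not part of the implementation): the three macros
 * above implement head-insert, tail-append and unlink on a doubly linked
 * list whose head and tail live in a callout_hash_t. Assuming a bucket
 * 'bkt' and two callouts linked through c_idnext/c_idprev, a hypothetical
 * caller would see:
 *
 *	callout_hash_t bkt;
 *
 *	bkt.ch_head = bkt.ch_tail = NULL;
 *	CALLOUT_HASH_INSERT(bkt, cp1, c_idnext, c_idprev);
 *	CALLOUT_HASH_APPEND(bkt, cp2, c_idnext, c_idprev);
 *	// bkt.ch_head == cp1, bkt.ch_tail == cp2
 *	CALLOUT_HASH_DELETE(bkt, cp1, c_idnext, c_idprev);
 *	// bkt.ch_head == bkt.ch_tail == cp2
 *
 * The 'cnext'/'cprev' arguments name the link fields, which is what lets
 * the same three macros serve both the callout ID hash and the callout
 * lists below.
 */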

/*
 * These definitions help us queue callouts and callout lists. Here is
 * the queueing rationale:
 *
 *	- callouts are queued in a FIFO manner in the ID hash table.
 *	  TCP timers are typically cancelled in the same order that they
 *	  were issued. The FIFO queueing shortens the search for a callout
 *	  during untimeout().
 *
 *	- callouts are queued in a FIFO manner in their callout lists.
 *	  This ensures that the callouts are executed in the same order that
 *	  they were queued. This is fair. Plus, it helps to make each
 *	  callout expiration timely. It also favors cancellations.
 *
 *	- callout lists are queued in the following manner in the callout
 *	  hash table buckets:
 *
 *	  - appended, if the callout list is a 1-nanosecond resolution
 *	    callout list. When a callout is created, we first look for
 *	    a callout list that has the same expiration so we can avoid
 *	    allocating a callout list and inserting the expiration into
 *	    the heap. However, we do not want to look at 1-nanosecond
 *	    resolution callout lists as we will seldom find a match in
 *	    them. Keeping these callout lists in the rear of the hash
 *	    buckets allows us to skip these during the lookup.
 *
 *	  - inserted at the beginning, if the callout list is not a
 *	    1-nanosecond resolution callout list. This also has the
 *	    side-effect of keeping the long term timers away from the
 *	    front of the buckets.
 *
 *	- callout lists are queued in a FIFO manner in the expired callouts
 *	  list. This ensures that callout lists are executed in the order
 *	  of expiration.
 */

#define	CALLOUT_APPEND(ct, cp)						\
	CALLOUT_HASH_APPEND(ct->ct_idhash[CALLOUT_IDHASH(cp->c_xid)],	\
	    cp, c_idnext, c_idprev);					\
	CALLOUT_HASH_APPEND(cp->c_list->cl_callouts, cp, c_clnext, c_clprev)

#define	CALLOUT_DELETE(ct, cp)						\
	CALLOUT_HASH_DELETE(ct->ct_idhash[CALLOUT_IDHASH(cp->c_xid)],	\
	    cp, c_idnext, c_idprev);					\
	CALLOUT_HASH_DELETE(cp->c_list->cl_callouts, cp, c_clnext, c_clprev)

#define	CALLOUT_LIST_INSERT(hash, cl)			\
	CALLOUT_HASH_INSERT(hash, cl, cl_next, cl_prev)

#define	CALLOUT_LIST_APPEND(hash, cl)			\
	CALLOUT_HASH_APPEND(hash, cl, cl_next, cl_prev)

#define	CALLOUT_LIST_DELETE(hash, cl)			\
	CALLOUT_HASH_DELETE(hash, cl, cl_next, cl_prev)

#define	CALLOUT_LIST_BEFORE(cl, nextcl)			\
{							\
	(cl)->cl_prev = (nextcl)->cl_prev;		\
	(cl)->cl_next = (nextcl);			\
	(nextcl)->cl_prev = (cl);			\
	if (cl->cl_prev != NULL)			\
		cl->cl_prev->cl_next = cl;		\
}

/*
 * For normal callouts, there is a deadlock scenario if two callouts that
 * have an inter-dependency end up on the same callout list. To break the
 * deadlock, you need two taskq threads running in parallel. We compute
 * the number of taskq threads here using a bunch of conditions to make
 * it optimal for the common case. This is an ugly hack, but one that is
 * needed.
 */
#define	CALLOUT_THRESHOLD	100000000
#define	CALLOUT_EXEC_COMPUTE(ct, nextexp, exec)				\
{									\
	callout_list_t *cl;						\
									\
	cl = ct->ct_expired.ch_head;					\
	if (cl == NULL) {						\
		/*							\
		 * If the expired list is NULL, there is nothing to	\
		 * process.						\
		 */							\
		exec = 0;						\
	} else if ((cl->cl_next == NULL) &&				\
	    (cl->cl_callouts.ch_head == cl->cl_callouts.ch_tail)) {	\
		/*							\
		 * If there is only one callout list and it contains	\
		 * only one callout, there is no need for two threads.	\
		 */							\
		exec = 1;						\
	} else if ((nextexp) > (gethrtime() + CALLOUT_THRESHOLD)) {	\
		/*							\
		 * If the next expiration of the cyclic is way out into	\
		 * the future, we need two threads.			\
		 */							\
		exec = 2;						\
	} else {							\
		/*							\
		 * We have multiple callouts to process. But the cyclic	\
		 * will fire in the near future. So, we only need one	\
		 * thread.						\
		 */							\
		exec = 1;						\
	}								\
}
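
/*
 * The statement macro above reads more easily as an equivalent plain
 * function; this is an illustrative sketch only (the macro is what the
 * cyclic handlers actually use) and assumes the same fields referenced
 * above:
 *
 *	static int
 *	callout_exec_compute(callout_table_t *ct, hrtime_t nextexp)
 *	{
 *		callout_list_t *cl = ct->ct_expired.ch_head;
 *
 *		if (cl == NULL)
 *			return (0);	// nothing expired, no thread needed
 *		if ((cl->cl_next == NULL) &&
 *		    (cl->cl_callouts.ch_head == cl->cl_callouts.ch_tail))
 *			return (1);	// one list with one callout
 *		if (nextexp > (gethrtime() + CALLOUT_THRESHOLD))
 *			return (2);	// cyclic idle for a while: 2 threads
 *		return (1);		// cyclic fires again soon anyway
 *	}
 */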

/*
 * Macro to swap two heap items.
 */
#define	CALLOUT_SWAP(h1, h2)		\
{					\
	callout_heap_t tmp;		\
					\
	tmp = *h1;			\
	*h1 = *h2;			\
	*h2 = tmp;			\
}

/*
 * Macro to free a callout list.
 */
#define	CALLOUT_LIST_FREE(ct, cl)			\
{							\
	cl->cl_next = ct->ct_lfree;			\
	ct->ct_lfree = cl;				\
	cl->cl_flags |= CALLOUT_LIST_FLAG_FREE;		\
}

/*
 * Macro to free a callout.
 */
#define	CALLOUT_FREE(ct, cl)			\
{						\
	cp->c_idnext = ct->ct_free;		\
	ct->ct_free = cp;			\
	cp->c_xid |= CALLOUT_ID_FREE;		\
}

/*
 * Allocate a callout structure. We try quite hard because we
 * can't sleep, and if we can't do the allocation, we're toast.
 * Failing all, we try a KM_PANIC allocation. Note that we never
 * deallocate a callout. See untimeout() for the reasoning.
 */
267 callout_alloc(callout_table_t
*ct
)
272 ASSERT(MUTEX_HELD(&ct
->ct_mutex
));
273 mutex_exit(&ct
->ct_mutex
);
275 cp
= kmem_cache_alloc(ct
->ct_cache
, KM_NOSLEEP
);
277 size
= sizeof (callout_t
);
278 cp
= kmem_alloc_tryhard(size
, &size
, KM_NOSLEEP
| KM_PANIC
);
281 cp
->c_executor
= NULL
;
282 cv_init(&cp
->c_done
, NULL
, CV_DEFAULT
, NULL
);
285 mutex_enter(&ct
->ct_mutex
);
286 ct
->ct_allocations
++;
291 * Allocate a callout list structure. We try quite hard because we
292 * can't sleep, and if we can't do the allocation, we're toast.
293 * Failing all, we try a KM_PANIC allocation. Note that we never
294 * deallocate a callout list.
297 callout_list_alloc(callout_table_t
*ct
)
302 ASSERT(MUTEX_HELD(&ct
->ct_mutex
));
303 mutex_exit(&ct
->ct_mutex
);
305 cl
= kmem_cache_alloc(ct
->ct_lcache
, KM_NOSLEEP
);
307 size
= sizeof (callout_list_t
);
308 cl
= kmem_alloc_tryhard(size
, &size
, KM_NOSLEEP
| KM_PANIC
);
310 bzero(cl
, sizeof (callout_list_t
));
312 mutex_enter(&ct
->ct_mutex
);
313 CALLOUT_LIST_FREE(ct
, cl
);
317 * Find a callout list that corresponds to an expiration and matching flags.
319 static callout_list_t
*
320 callout_list_get(callout_table_t
*ct
, hrtime_t expiration
, int flags
, int hash
)
325 ASSERT(MUTEX_HELD(&ct
->ct_mutex
));
327 if (flags
& CALLOUT_LIST_FLAG_NANO
) {
329 * This is a 1-nanosecond resolution callout. We will rarely
330 * find a match for this. So, bail out.
335 clflags
= (CALLOUT_LIST_FLAG_ABSOLUTE
| CALLOUT_LIST_FLAG_HRESTIME
);
336 for (cl
= ct
->ct_clhash
[hash
].ch_head
; (cl
!= NULL
); cl
= cl
->cl_next
) {
338 * If we have reached a 1-nanosecond resolution callout list,
339 * we don't have much hope of finding a match in this hash
340 * bucket. So, just bail out.
342 if (cl
->cl_flags
& CALLOUT_LIST_FLAG_NANO
)
345 if ((cl
->cl_expiration
== expiration
) &&
346 ((cl
->cl_flags
& clflags
) == (flags
& clflags
)))
354 * Add a new callout list into a callout table's queue in sorted order by
358 callout_queue_add(callout_table_t
*ct
, callout_list_t
*cl
)
360 callout_list_t
*nextcl
;
363 expiration
= cl
->cl_expiration
;
364 nextcl
= ct
->ct_queue
.ch_head
;
365 if ((nextcl
== NULL
) || (expiration
< nextcl
->cl_expiration
)) {
366 CALLOUT_LIST_INSERT(ct
->ct_queue
, cl
);
370 while (nextcl
!= NULL
) {
371 if (expiration
< nextcl
->cl_expiration
) {
372 CALLOUT_LIST_BEFORE(cl
, nextcl
);
375 nextcl
= nextcl
->cl_next
;
377 CALLOUT_LIST_APPEND(ct
->ct_queue
, cl
);
383 * Insert a callout list into a callout table's queue and reprogram the queue
387 callout_queue_insert(callout_table_t
*ct
, callout_list_t
*cl
)
389 cl
->cl_flags
|= CALLOUT_LIST_FLAG_QUEUED
;
392 * Add the callout to the callout queue. If it ends up at the head,
393 * the cyclic needs to be reprogrammed as we have an earlier
396 * Also, during the CPR suspend phase, do not reprogram the cyclic.
397 * We don't want any callout activity. When the CPR resume phase is
398 * entered, the cyclic will be programmed for the earliest expiration
401 if (callout_queue_add(ct
, cl
) && (ct
->ct_suspend
== 0))
402 (void) cyclic_reprogram(ct
->ct_qcyclic
, cl
->cl_expiration
);
406 * Delete and handle all past expirations in a callout table's queue.
409 callout_queue_delete(callout_table_t
*ct
)
414 ASSERT(MUTEX_HELD(&ct
->ct_mutex
));
417 while ((cl
= ct
->ct_queue
.ch_head
) != NULL
) {
418 if (cl
->cl_expiration
> now
)
420 cl
->cl_flags
&= ~CALLOUT_LIST_FLAG_QUEUED
;
421 CALLOUT_LIST_DELETE(ct
->ct_queue
, cl
);
422 CALLOUT_LIST_APPEND(ct
->ct_expired
, cl
);
426 * If this callout queue is empty or callouts have been suspended,
429 if ((cl
== NULL
) || (ct
->ct_suspend
> 0))
430 return (CY_INFINITY
);
432 (void) cyclic_reprogram(ct
->ct_qcyclic
, cl
->cl_expiration
);
434 return (cl
->cl_expiration
);
438 callout_queue_process(callout_table_t
*ct
, hrtime_t delta
, int timechange
)
440 callout_list_t
*firstcl
, *cl
;
441 hrtime_t expiration
, now
;
445 ASSERT(MUTEX_HELD(&ct
->ct_mutex
));
447 firstcl
= ct
->ct_queue
.ch_head
;
449 return (CY_INFINITY
);
452 * We walk the callout queue. If we encounter a hrestime entry that
453 * must be removed, we clean it out. Otherwise, we apply any
454 * adjustments needed to it. Because of the latter, we need to
455 * recreate the list as we go along.
458 ct
->ct_queue
.ch_head
= NULL
;
459 ct
->ct_queue
.ch_tail
= NULL
;
461 clflags
= (CALLOUT_LIST_FLAG_HRESTIME
| CALLOUT_LIST_FLAG_ABSOLUTE
);
463 while ((cl
= temp
.ch_head
) != NULL
) {
464 CALLOUT_LIST_DELETE(temp
, cl
);
467 * Delete the callout and expire it, if one of the following
469 * - the callout has expired
470 * - the callout is an absolute hrestime one and
471 * there has been a system time change
473 if ((cl
->cl_expiration
<= now
) ||
474 (timechange
&& ((cl
->cl_flags
& clflags
) == clflags
))) {
475 cl
->cl_flags
&= ~CALLOUT_LIST_FLAG_QUEUED
;
476 CALLOUT_LIST_APPEND(ct
->ct_expired
, cl
);
481 * Apply adjustments, if any. Adjustments are applied after
482 * the system returns from KMDB or OBP. They are only applied
483 * to relative callout lists.
485 if (delta
&& !(cl
->cl_flags
& CALLOUT_LIST_FLAG_ABSOLUTE
)) {
486 expiration
= cl
->cl_expiration
+ delta
;
488 expiration
= CY_INFINITY
;
489 cl
->cl_expiration
= expiration
;
492 (void) callout_queue_add(ct
, cl
);
496 * We need to return the expiration to help program the cyclic.
497 * If there are expired callouts, the cyclic needs to go off
498 * immediately. If the queue has become empty, then we return infinity.
499 * Else, we return the expiration of the earliest callout in the queue.
501 if (ct
->ct_expired
.ch_head
!= NULL
)
502 return (gethrtime());
504 cl
= ct
->ct_queue
.ch_head
;
506 return (CY_INFINITY
);
508 return (cl
->cl_expiration
);
512 * Initialize a callout table's heap, if necessary. Preallocate some free
513 * entries so we don't have to check for NULL elsewhere.
516 callout_heap_init(callout_table_t
*ct
)
520 ASSERT(MUTEX_HELD(&ct
->ct_mutex
));
521 ASSERT(ct
->ct_heap
== NULL
);
524 ct
->ct_heap_max
= callout_chunk
;
525 size
= sizeof (callout_heap_t
) * callout_chunk
;
526 ct
->ct_heap
= kmem_alloc(size
, KM_SLEEP
);
530 * Reallocate the heap. Return 0 if the heap is still full at the end of it.
531 * Return 1 otherwise. Note that the heap only expands, it never contracts.
534 callout_heap_expand(callout_table_t
*ct
)
536 size_t max
, size
, osize
;
537 callout_heap_t
*heap
;
539 ASSERT(MUTEX_HELD(&ct
->ct_mutex
));
540 ASSERT(ct
->ct_heap_num
<= ct
->ct_heap_max
);
542 while (ct
->ct_heap_num
== ct
->ct_heap_max
) {
543 max
= ct
->ct_heap_max
;
544 mutex_exit(&ct
->ct_mutex
);
546 osize
= sizeof (callout_heap_t
) * max
;
547 size
= sizeof (callout_heap_t
) * (max
+ callout_chunk
);
548 heap
= kmem_alloc(size
, KM_NOSLEEP
);
550 mutex_enter(&ct
->ct_mutex
);
553 * We could not allocate memory. If we can free up
554 * some entries, that would be great.
556 if (ct
->ct_nreap
> 0)
557 (void) callout_heap_process(ct
, 0, 0);
559 * If we still have no space in the heap, inform the
562 if (ct
->ct_heap_num
== ct
->ct_heap_max
)
566 if (max
< ct
->ct_heap_max
) {
568 * Someone beat us to the allocation. Free what we
569 * just allocated and proceed.
571 kmem_free(heap
, size
);
575 bcopy(ct
->ct_heap
, heap
, osize
);
576 kmem_free(ct
->ct_heap
, osize
);
578 ct
->ct_heap_max
= size
/ sizeof (callout_heap_t
);
585 * Move an expiration from the bottom of the heap to its correct place
586 * in the heap. If we reached the root doing this, return 1. Else,
590 callout_upheap(callout_table_t
*ct
)
593 callout_heap_t
*heap
, *hcurrent
, *hparent
;
595 ASSERT(MUTEX_HELD(&ct
->ct_mutex
));
596 ASSERT(ct
->ct_heap_num
>= 1);
598 if (ct
->ct_heap_num
== 1) {
603 current
= ct
->ct_heap_num
- 1;
606 parent
= CALLOUT_HEAP_PARENT(current
);
607 hparent
= &heap
[parent
];
608 hcurrent
= &heap
[current
];
611 * We have an expiration later than our parent; we're done.
613 if (hcurrent
->ch_expiration
>= hparent
->ch_expiration
) {
618 * We need to swap with our parent, and continue up the heap.
620 CALLOUT_SWAP(hparent
, hcurrent
);
623 * If we just reached the root, we're done.
635 * Insert a new heap item into a callout table's heap.
638 callout_heap_insert(callout_table_t
*ct
, callout_list_t
*cl
)
640 ASSERT(MUTEX_HELD(&ct
->ct_mutex
));
641 ASSERT(ct
->ct_heap_num
< ct
->ct_heap_max
);
643 cl
->cl_flags
|= CALLOUT_LIST_FLAG_HEAPED
;
645 * First, copy the expiration and callout list pointer to the bottom
648 ct
->ct_heap
[ct
->ct_heap_num
].ch_expiration
= cl
->cl_expiration
;
649 ct
->ct_heap
[ct
->ct_heap_num
].ch_list
= cl
;
653 * Now, perform an upheap operation. If we reached the root, then
654 * the cyclic needs to be reprogrammed as we have an earlier
657 * Also, during the CPR suspend phase, do not reprogram the cyclic.
658 * We don't want any callout activity. When the CPR resume phase is
659 * entered, the cyclic will be programmed for the earliest expiration
662 if (callout_upheap(ct
) && (ct
->ct_suspend
== 0))
663 (void) cyclic_reprogram(ct
->ct_cyclic
, cl
->cl_expiration
);
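
/*
 * For reference, the heap is kept in the flat ct_heap[] array. The
 * CALLOUT_HEAP_PARENT/LEFT/RIGHT macros (defined in the callout header)
 * encode the parent/child index relationships; assuming the conventional
 * 0-indexed binary-heap layout, those are (i - 1) / 2, 2i + 1 and 2i + 2,
 * as in this minimal sketch, which is independent of the callout code:
 *
 *	// hypothetical helpers, shown only to illustrate the indexing
 *	static int heap_parent(int i) { return ((i - 1) / 2); }
 *	static int heap_left(int i)   { return (2 * i + 1); }
 *	static int heap_right(int i)  { return (2 * i + 2); }
 *
 * callout_upheap() bubbles a newly appended entry toward index 0 while it
 * expires earlier than its parent; callout_downheap() below pushes the
 * root toward the leaves after a delete.
 */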
667 * Move an expiration from the top of the heap to its correct place
671 callout_downheap(callout_table_t
*ct
)
673 int current
, left
, right
, nelems
;
674 callout_heap_t
*heap
, *hleft
, *hright
, *hcurrent
;
676 ASSERT(MUTEX_HELD(&ct
->ct_mutex
));
677 ASSERT(ct
->ct_heap_num
>= 1);
681 nelems
= ct
->ct_heap_num
;
685 * If we don't have a left child (i.e., we're a leaf), we're
688 if ((left
= CALLOUT_HEAP_LEFT(current
)) >= nelems
)
692 hcurrent
= &heap
[current
];
694 right
= CALLOUT_HEAP_RIGHT(current
);
697 * Even if we don't have a right child, we still need to compare
698 * our expiration against that of our left child.
703 hright
= &heap
[right
];
706 * We have both a left and a right child. We need to compare
707 * the expiration of the children to determine which
710 if (hright
->ch_expiration
< hleft
->ch_expiration
) {
712 * Our right child is the earlier of our children.
713 * We'll now compare our expiration to its expiration.
714 * If ours is the earlier one, we're done.
716 if (hcurrent
->ch_expiration
<= hright
->ch_expiration
)
720 * Our right child expires earlier than we do; swap
721 * with our right child, and descend right.
723 CALLOUT_SWAP(hright
, hcurrent
);
730 * Our left child is the earlier of our children (or we have
731 * no right child). We'll now compare our expiration
732 * to its expiration. If ours is the earlier one, we're done.
734 if (hcurrent
->ch_expiration
<= hleft
->ch_expiration
)
738 * Our left child expires earlier than we do; swap with our
739 * left child, and descend left.
741 CALLOUT_SWAP(hleft
, hcurrent
);
747 * Delete and handle all past expirations in a callout table's heap.
750 callout_heap_delete(callout_table_t
*ct
)
752 hrtime_t now
, expiration
, next
;
754 callout_heap_t
*heap
;
757 ASSERT(MUTEX_HELD(&ct
->ct_mutex
));
759 if (CALLOUT_CLEANUP(ct
)) {
761 * There are too many heap elements pointing to empty callout
762 * lists. Clean them out.
764 (void) callout_heap_process(ct
, 0, 0);
770 while (ct
->ct_heap_num
> 0) {
771 expiration
= heap
->ch_expiration
;
772 hash
= CALLOUT_CLHASH(expiration
);
774 ASSERT(expiration
== cl
->cl_expiration
);
776 if (cl
->cl_callouts
.ch_head
== NULL
) {
778 * If the callout list is empty, reap it.
779 * Decrement the reap count.
781 CALLOUT_LIST_DELETE(ct
->ct_clhash
[hash
], cl
);
782 CALLOUT_LIST_FREE(ct
, cl
);
786 * If the root of the heap expires in the future,
789 if (expiration
> now
)
793 * Move the callout list for this expiration to the
794 * list of expired callout lists. It will be processed
795 * by the callout executor.
797 cl
->cl_flags
&= ~CALLOUT_LIST_FLAG_HEAPED
;
798 CALLOUT_LIST_DELETE(ct
->ct_clhash
[hash
], cl
);
799 CALLOUT_LIST_APPEND(ct
->ct_expired
, cl
);
803 * Now delete the root. This is done by swapping the root with
804 * the last item in the heap and downheaping the item.
807 if (ct
->ct_heap_num
> 0) {
808 heap
[0] = heap
[ct
->ct_heap_num
];
809 callout_downheap(ct
);
814 * If this callout table is empty or callouts have been suspended,
815 * just return. The cyclic has already been programmed to
816 * infinity by the cyclic subsystem.
818 if ((ct
->ct_heap_num
== 0) || (ct
->ct_suspend
> 0))
819 return (CY_INFINITY
);
822 * If the top expirations are within callout_tolerance of each other,
823 * delay the cyclic expire so that they can be processed together.
824 * This is to prevent high resolution timers from swamping the system
825 * with cyclic activity.
827 if (ct
->ct_heap_num
> 2) {
828 next
= expiration
+ callout_tolerance
;
829 if ((heap
[1].ch_expiration
< next
) ||
830 (heap
[2].ch_expiration
< next
))
834 (void) cyclic_reprogram(ct
->ct_cyclic
, expiration
);
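
/*
 * A worked example of the tolerance batching above, assuming (for
 * illustration) callout_tolerance = 5000: if the heap root expires at T
 * and heap[1]/heap[2] expire at T + 2000 and T + 4000, the cyclic is
 * programmed for T + 5000 rather than T, so a single cyclic firing picks
 * up all three callout lists instead of firing three times back to back.
 * Entries outside the tolerance window are unaffected.
 */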
840 * There are some situations when the entire heap is walked and processed.
841 * This function is called to do the processing. These are the situations:
843 * 1. When the reap count reaches its threshold, the heap has to be cleared
844 * of all empty callout lists.
846 * 2. When the system enters and exits KMDB/OBP, all entries in the heap
847 * need to be adjusted by the interval spent in KMDB/OBP.
849 * 3. When system time is changed, the heap has to be scanned for
850 * absolute hrestime timers. These need to be removed from the heap
851 * and expired immediately.
853 * In cases 2 and 3, it is a good idea to do 1 as well since we are
854 * scanning the heap anyway.
856 * If the root gets changed and/or callout lists are expired, return the
857 * new expiration to the caller so it can reprogram the cyclic accordingly.
860 callout_heap_process(callout_table_t
*ct
, hrtime_t delta
, int timechange
)
862 callout_heap_t
*heap
;
864 hrtime_t expiration
, now
;
865 int i
, hash
, clflags
;
868 ASSERT(MUTEX_HELD(&ct
->ct_mutex
));
870 if (ct
->ct_heap_num
== 0)
871 return (CY_INFINITY
);
873 if (ct
->ct_nreap
> 0)
879 * We walk the heap from the top to the bottom. If we encounter
880 * a heap item that points to an empty callout list, we clean
881 * it out. If we encounter a hrestime entry that must be removed,
882 * again we clean it out. Otherwise, we apply any adjustments needed
885 * During the walk, we also compact the heap from the bottom and
886 * reconstruct the heap using upheap operations. This is very
887 * efficient if the number of elements to be cleaned is greater than
888 * or equal to half the heap. This is the common case.
890 * Even in the non-common case, the upheap operations should be short
891 * as the entries below generally tend to be bigger than the entries
894 num
= ct
->ct_heap_num
;
896 clflags
= (CALLOUT_LIST_FLAG_HRESTIME
| CALLOUT_LIST_FLAG_ABSOLUTE
);
898 for (i
= 0; i
< num
; i
++) {
899 cl
= heap
[i
].ch_list
;
901 * If the callout list is empty, delete the heap element and
902 * free the callout list.
904 if (cl
->cl_callouts
.ch_head
== NULL
) {
905 hash
= CALLOUT_CLHASH(cl
->cl_expiration
);
906 CALLOUT_LIST_DELETE(ct
->ct_clhash
[hash
], cl
);
907 CALLOUT_LIST_FREE(ct
, cl
);
912 * Delete the heap element and expire the callout list, if
913 * one of the following is true:
914 * - the callout list has expired
915 * - the callout list is an absolute hrestime one and
916 * there has been a system time change
918 if ((cl
->cl_expiration
<= now
) ||
919 (timechange
&& ((cl
->cl_flags
& clflags
) == clflags
))) {
920 hash
= CALLOUT_CLHASH(cl
->cl_expiration
);
921 cl
->cl_flags
&= ~CALLOUT_LIST_FLAG_HEAPED
;
922 CALLOUT_LIST_DELETE(ct
->ct_clhash
[hash
], cl
);
923 CALLOUT_LIST_APPEND(ct
->ct_expired
, cl
);
928 * Apply adjustments, if any. Adjustments are applied after
929 * the system returns from KMDB or OBP. They are only applied
930 * to relative callout lists.
932 if (delta
&& !(cl
->cl_flags
& CALLOUT_LIST_FLAG_ABSOLUTE
)) {
933 hash
= CALLOUT_CLHASH(cl
->cl_expiration
);
934 CALLOUT_LIST_DELETE(ct
->ct_clhash
[hash
], cl
);
935 expiration
= cl
->cl_expiration
+ delta
;
937 expiration
= CY_INFINITY
;
938 heap
[i
].ch_expiration
= expiration
;
939 cl
->cl_expiration
= expiration
;
940 hash
= CALLOUT_CLHASH(cl
->cl_expiration
);
941 if (cl
->cl_flags
& CALLOUT_LIST_FLAG_NANO
) {
942 CALLOUT_LIST_APPEND(ct
->ct_clhash
[hash
], cl
);
944 CALLOUT_LIST_INSERT(ct
->ct_clhash
[hash
], cl
);
948 heap
[ct
->ct_heap_num
] = heap
[i
];
950 (void) callout_upheap(ct
);
956 * We need to return the expiration to help program the cyclic.
957 * If there are expired callouts, the cyclic needs to go off
958 * immediately. If the heap has become empty, then we return infinity.
959 * Else, return the expiration of the earliest callout in the heap.
961 if (ct
->ct_expired
.ch_head
!= NULL
)
962 return (gethrtime());
964 if (ct
->ct_heap_num
== 0)
965 return (CY_INFINITY
);
967 return (heap
->ch_expiration
);
971 * Common function used to create normal and realtime callouts.
973 * Realtime callouts are handled at CY_LOW_PIL by a cyclic handler. So,
974 * there is one restriction on a realtime callout handler - it should not
975 * directly or indirectly acquire cpu_lock. CPU offline waits for pending
976 * cyclic handlers to complete while holding cpu_lock. So, if a realtime
977 * callout handler were to try to get cpu_lock, there would be a deadlock
978 * during CPU offline.
981 timeout_generic(int type
, void (*func
)(void *), void *arg
,
982 hrtime_t expiration
, hrtime_t resolution
, int flags
)
988 hrtime_t now
, interval
;
991 ASSERT(resolution
> 0);
992 ASSERT(func
!= NULL
);
995 * We get the current hrtime right upfront so that latencies in
996 * this function do not affect the accuracy of the callout.
1001 * We disable kernel preemption so that we remain on the same CPU
1002 * throughout. If we needed to reprogram the callout table's cyclic,
1003 * we can avoid X-calls if we are on the same CPU.
1005 * Note that callout_alloc() releases and reacquires the callout
1006 * table mutex. While reacquiring the mutex, it is possible for us
1007 * to go to sleep and later migrate to another CPU. This should be
1008 * pretty rare, though.
1012 ct
= &callout_table
[CALLOUT_TABLE(type
, CPU
->cpu_seqid
)];
1013 mutex_enter(&ct
->ct_mutex
);
1015 if (ct
->ct_cyclic
== CYCLIC_NONE
) {
1016 mutex_exit(&ct
->ct_mutex
);
1018 * The callout table has not yet been initialized fully.
1019 * So, put this one on the boot callout table which is
1020 * always initialized.
1022 ct
= &callout_boot_ct
[type
];
1023 mutex_enter(&ct
->ct_mutex
);
1026 if (CALLOUT_CLEANUP(ct
)) {
1028 * There are too many heap elements pointing to empty callout
1029 * lists. Clean them out. Since cleanup is only done once
1030 * in a while, no need to reprogram the cyclic if the root
1031 * of the heap gets cleaned out.
1033 (void) callout_heap_process(ct
, 0, 0);
1036 if ((cp
= ct
->ct_free
) == NULL
)
1037 cp
= callout_alloc(ct
);
1039 ct
->ct_free
= cp
->c_idnext
;
1045 * Compute the expiration hrtime.
1047 if (flags
& CALLOUT_FLAG_ABSOLUTE
) {
1048 interval
= expiration
- now
;
1050 interval
= expiration
;
1054 if (resolution
> 1) {
1056 * Align expiration to the specified resolution.
1058 if (flags
& CALLOUT_FLAG_ROUNDUP
)
1059 expiration
+= resolution
- 1;
1060 expiration
= (expiration
/ resolution
) * resolution
;
1063 if (expiration
<= 0) {
1065 * expiration hrtime overflow has occurred. Just set the
1066 * expiration to infinity.
1068 expiration
= CY_INFINITY
;
1072 * Assign an ID to this callout
1074 if (flags
& CALLOUT_FLAG_32BIT
) {
1075 if (interval
> callout_longterm
) {
1076 id
= (ct
->ct_long_id
- callout_counter_low
);
1077 id
|= CALLOUT_COUNTER_HIGH
;
1078 ct
->ct_long_id
= id
;
1080 id
= (ct
->ct_short_id
- callout_counter_low
);
1081 id
|= CALLOUT_COUNTER_HIGH
;
1082 ct
->ct_short_id
= id
;
1085 id
= (ct
->ct_gen_id
- callout_counter_low
);
1086 if ((id
& CALLOUT_COUNTER_HIGH
) == 0) {
1087 id
|= CALLOUT_COUNTER_HIGH
;
1088 id
+= CALLOUT_GENERATION_LOW
;
1096 if (flags
& CALLOUT_FLAG_ABSOLUTE
)
1097 clflags
|= CALLOUT_LIST_FLAG_ABSOLUTE
;
1098 if (flags
& CALLOUT_FLAG_HRESTIME
)
1099 clflags
|= CALLOUT_LIST_FLAG_HRESTIME
;
1100 if (resolution
== 1)
1101 clflags
|= CALLOUT_LIST_FLAG_NANO
;
1102 hash
= CALLOUT_CLHASH(expiration
);
1106 * Try to see if a callout list already exists for this expiration.
1108 cl
= callout_list_get(ct
, expiration
, clflags
, hash
);
1111 * Check the free list. If we don't find one, we have to
1112 * take the slow path and allocate from kmem.
1114 if ((cl
= ct
->ct_lfree
) == NULL
) {
1115 callout_list_alloc(ct
);
1117 * In the above call, we drop the lock, allocate and
1118 * reacquire the lock. So, we could have been away
1119 * for a while. In the meantime, someone could have
1120 * inserted a callout list with the same expiration.
1121 * Plus, the heap could have become full. So, the best
1122 * course is to repeat the steps. This should be an
1127 ct
->ct_lfree
= cl
->cl_next
;
1128 cl
->cl_expiration
= expiration
;
1129 cl
->cl_flags
= clflags
;
1132 * Check if we have enough space in the heap to insert one
1133 * expiration. If not, expand the heap.
1135 if (ct
->ct_heap_num
== ct
->ct_heap_max
) {
1136 if (callout_heap_expand(ct
) == 0) {
1138 * Could not expand the heap. Just queue it.
1140 callout_queue_insert(ct
, cl
);
1145 * In the above call, we drop the lock, allocate and
1146 * reacquire the lock. So, we could have been away
1147 * for a while. In the meantime, someone could have
1148 * inserted a callout list with the same expiration.
1149 * But we will not go back and check for it as this
1150 * should be a really infrequent event. There is no
1155 if (clflags
& CALLOUT_LIST_FLAG_NANO
) {
1156 CALLOUT_LIST_APPEND(ct
->ct_clhash
[hash
], cl
);
1158 CALLOUT_LIST_INSERT(ct
->ct_clhash
[hash
], cl
);
1162 * This is a new expiration. So, insert it into the heap.
1163 * This will also reprogram the cyclic, if the expiration
1164 * propagated to the root of the heap.
1166 callout_heap_insert(ct
, cl
);
1169 * If the callout list was empty, untimeout_generic() would
1170 * have incremented a reap count. Decrement the reap count
1171 * as we are going to insert a callout into this list.
1173 if (cl
->cl_callouts
.ch_head
== NULL
)
1178 CALLOUT_APPEND(ct
, cp
);
1181 ct
->ct_timeouts_pending
++;
1183 mutex_exit(&ct
->ct_mutex
);
1187 TRACE_4(TR_FAC_CALLOUT
, TR_TIMEOUT
,
1188 "timeout:%K(%p) in %llx expiration, cp %p", func
, arg
, expiration
,
1195 timeout(void (*func
)(void *), void *arg
, clock_t delta
)
1200 * Make sure the callout runs at least 1 tick in the future.
1204 else if (delta
> callout_max_ticks
)
1205 delta
= callout_max_ticks
;
1207 id
= (ulong_t
)timeout_generic(CALLOUT_NORMAL
, func
, arg
,
1208 TICK_TO_NSEC(delta
), nsec_per_tick
, CALLOUT_LEGACY
);
1210 return ((timeout_id_t
)id
);
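
/*
 * Usage sketch for the legacy interface (illustrative only; the names
 * below are hypothetical driver code, see timeout(9F) and untimeout(9F)
 * for the contract):
 *
 *	static timeout_id_t my_tid;
 *
 *	static void
 *	my_expire(void *arg)
 *	{
 *		// runs once, in callout context, at base PIL for
 *		// normal callouts
 *	}
 *
 *	my_tid = timeout(my_expire, my_state, drv_usectohz(500000));
 *	...
 *	(void) untimeout(my_tid);
 *
 * untimeout() returns the ticks that were remaining, or -1 if the callout
 * had already fired or the ID was no longer valid; it waits for a running
 * handler to finish unless it is called from that handler itself.
 */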
1214 * Convenience function that creates a normal callout with default parameters
1215 * and returns a full ID.
1218 timeout_default(void (*func
)(void *), void *arg
, clock_t delta
)
1223 * Make sure the callout runs at least 1 tick in the future.
1227 else if (delta
> callout_max_ticks
)
1228 delta
= callout_max_ticks
;
1230 id
= timeout_generic(CALLOUT_NORMAL
, func
, arg
, TICK_TO_NSEC(delta
),
1237 realtime_timeout(void (*func
)(void *), void *arg
, clock_t delta
)
1242 * Make sure the callout runs at least 1 tick in the future.
1246 else if (delta
> callout_max_ticks
)
1247 delta
= callout_max_ticks
;
1249 id
= (ulong_t
)timeout_generic(CALLOUT_REALTIME
, func
, arg
,
1250 TICK_TO_NSEC(delta
), nsec_per_tick
, CALLOUT_LEGACY
);
1252 return ((timeout_id_t
)id
);
1256 * Convenience function that creates a realtime callout with default parameters
1257 * and returns a full ID.
1260 realtime_timeout_default(void (*func
)(void *), void *arg
, clock_t delta
)
1265 * Make sure the callout runs at least 1 tick in the future.
1269 else if (delta
> callout_max_ticks
)
1270 delta
= callout_max_ticks
;
1272 id
= timeout_generic(CALLOUT_REALTIME
, func
, arg
, TICK_TO_NSEC(delta
),
1279 untimeout_generic(callout_id_t id
, int nowait
)
1281 callout_table_t
*ct
;
1288 ct
= &callout_table
[CALLOUT_ID_TO_TABLE(id
)];
1289 hash
= CALLOUT_IDHASH(id
);
1291 mutex_enter(&ct
->ct_mutex
);
1294 * Search the ID hash table for the callout.
1296 for (cp
= ct
->ct_idhash
[hash
].ch_head
; cp
; cp
= cp
->c_idnext
) {
1301 * Match the ID and generation number.
1303 if ((xid
& CALLOUT_ID_MASK
) != id
)
1306 if ((xid
& CALLOUT_EXECUTING
) == 0) {
1307 hrtime_t expiration
;
1310 * Delete the callout. If the callout list becomes
1311 * NULL, we don't remove it from the table. This is
1312 * so it can be reused. If the empty callout list
* corresponds to the top of the callout heap, we
1314 * don't reprogram the table cyclic here. This is in
1315 * order to avoid lots of X-calls to the CPU associated
1316 * with the callout table.
1319 expiration
= cl
->cl_expiration
;
1320 CALLOUT_DELETE(ct
, cp
);
1321 CALLOUT_FREE(ct
, cp
);
1322 ct
->ct_untimeouts_unexpired
++;
1323 ct
->ct_timeouts_pending
--;
1326 * If the callout list has become empty, there are 3
1327 * possibilities. If it is present:
1328 * - in the heap, it needs to be cleaned along
1329 * with its heap entry. Increment a reap count.
1330 * - in the callout queue, free it.
1331 * - in the expired list, free it.
1333 if (cl
->cl_callouts
.ch_head
== NULL
) {
1334 flags
= cl
->cl_flags
;
1335 if (flags
& CALLOUT_LIST_FLAG_HEAPED
) {
1337 } else if (flags
& CALLOUT_LIST_FLAG_QUEUED
) {
1338 CALLOUT_LIST_DELETE(ct
->ct_queue
, cl
);
1339 CALLOUT_LIST_FREE(ct
, cl
);
1341 CALLOUT_LIST_DELETE(ct
->ct_expired
, cl
);
1342 CALLOUT_LIST_FREE(ct
, cl
);
1345 mutex_exit(&ct
->ct_mutex
);
1347 expiration
-= gethrtime();
1348 TRACE_2(TR_FAC_CALLOUT
, TR_UNTIMEOUT
,
1349 "untimeout:ID %lx hrtime left %llx", id
,
1351 return (expiration
< 0 ? 0 : expiration
);
1354 ct
->ct_untimeouts_executing
++;
1356 * The callout we want to delete is currently executing.
1357 * The DDI states that we must wait until the callout
1358 * completes before returning, so we block on c_done until the
1359 * callout ID changes (to the old ID if it's on the freelist,
1360 * or to a new callout ID if it's in use). This implicitly
1361 * assumes that callout structures are persistent (they are).
1363 if (cp
->c_executor
== curthread
) {
1365 * The timeout handler called untimeout() on itself.
1366 * Stupid, but legal. We can't wait for the timeout
1367 * to complete without deadlocking, so we just return.
1369 mutex_exit(&ct
->ct_mutex
);
1370 TRACE_1(TR_FAC_CALLOUT
, TR_UNTIMEOUT_SELF
,
1371 "untimeout_self:ID %x", id
);
1376 * We need to wait. Indicate that we are waiting by
1377 * incrementing c_waiting. This prevents the executor
1378 * from doing a wakeup on c_done if there are no
1381 while (cp
->c_xid
== xid
) {
1383 cv_wait(&cp
->c_done
, &ct
->ct_mutex
);
1386 mutex_exit(&ct
->ct_mutex
);
1387 TRACE_1(TR_FAC_CALLOUT
, TR_UNTIMEOUT_EXECUTING
,
1388 "untimeout_executing:ID %lx", id
);
1391 ct
->ct_untimeouts_expired
++;
1393 mutex_exit(&ct
->ct_mutex
);
1394 TRACE_1(TR_FAC_CALLOUT
, TR_UNTIMEOUT_BOGUS_ID
,
1395 "untimeout_bogus_id:ID %lx", id
);
1398 * We didn't find the specified callout ID. This means either
1399 * (1) the callout already fired, or (2) the caller passed us
1400 * a bogus value. Perform a sanity check to detect case (2).
1402 bogus
= (CALLOUT_ID_FLAGS
| CALLOUT_COUNTER_HIGH
);
1403 if (((id
& bogus
) != CALLOUT_COUNTER_HIGH
) && (id
!= 0))
1404 panic("untimeout: impossible timeout id %llx",
1405 (unsigned long long)id
);
1411 untimeout(timeout_id_t id_arg
)
1417 id
= (ulong_t
)id_arg
;
1418 hleft
= untimeout_generic(id
, 0);
1421 else if (hleft
== 0)
1424 tleft
= NSEC_TO_TICK(hleft
);
1430 * Convenience function to untimeout a timeout with a full ID with default
1434 untimeout_default(callout_id_t id
, int nowait
)
1439 hleft
= untimeout_generic(id
, nowait
);
1442 else if (hleft
== 0)
1445 tleft
= NSEC_TO_TICK(hleft
);
1451 * Expire all the callouts queued in the specified callout list.
1454 callout_list_expire(callout_table_t
*ct
, callout_list_t
*cl
)
1456 callout_t
*cp
, *cnext
;
1458 ASSERT(MUTEX_HELD(&ct
->ct_mutex
));
1461 for (cp
= cl
->cl_callouts
.ch_head
; cp
!= NULL
; cp
= cnext
) {
1463 * Multiple executor threads could be running at the same
1464 * time. If this callout is already being executed,
1465 * go on to the next one.
1467 if (cp
->c_xid
& CALLOUT_EXECUTING
) {
1468 cnext
= cp
->c_clnext
;
1473 * Indicate to untimeout() that a callout is
1474 * being expired by the executor.
1476 cp
->c_xid
|= CALLOUT_EXECUTING
;
1477 cp
->c_executor
= curthread
;
1478 mutex_exit(&ct
->ct_mutex
);
1480 DTRACE_PROBE1(callout__start
, callout_t
*, cp
);
1481 (*cp
->c_func
)(cp
->c_arg
);
1482 DTRACE_PROBE1(callout__end
, callout_t
*, cp
);
1484 mutex_enter(&ct
->ct_mutex
);
1486 ct
->ct_expirations
++;
1487 ct
->ct_timeouts_pending
--;
1489 * Indicate completion for c_done.
1491 cp
->c_xid
&= ~CALLOUT_EXECUTING
;
1492 cp
->c_executor
= NULL
;
1493 cnext
= cp
->c_clnext
;
1496 * Delete callout from ID hash table and the callout
1497 * list, return to freelist, and tell any untimeout() that
1498 * cares that we're done.
1500 CALLOUT_DELETE(ct
, cp
);
1501 CALLOUT_FREE(ct
, cp
);
1503 if (cp
->c_waiting
) {
1505 cv_broadcast(&cp
->c_done
);
1511 * Execute all expired callout lists for a callout table.
1514 callout_expire(callout_table_t
*ct
)
1516 callout_list_t
*cl
, *clnext
;
1518 ASSERT(MUTEX_HELD(&ct
->ct_mutex
));
1520 for (cl
= ct
->ct_expired
.ch_head
; (cl
!= NULL
); cl
= clnext
) {
1522 * Expire all the callouts in this callout list.
1524 callout_list_expire(ct
, cl
);
1526 clnext
= cl
->cl_next
;
1527 if (cl
->cl_callouts
.ch_head
== NULL
) {
1529 * Free the callout list.
1531 CALLOUT_LIST_DELETE(ct
->ct_expired
, cl
);
1532 CALLOUT_LIST_FREE(ct
, cl
);
1538 * The cyclic handlers below process callouts in two steps:
1540 * 1. Find all expired callout lists and queue them in a separate
1541 * list of expired callouts.
1542 * 2. Execute the expired callout lists.
1544 * This is done for two reasons:
1546 * 1. We want to quickly find the next earliest expiration to program
1547 * the cyclic to and reprogram it. We can do this right at the end
1549 * 2. The realtime cyclic handler expires callouts in place. However,
1550 * for normal callouts, callouts are expired by a taskq thread.
1551 * So, it is simpler and more robust to have the taskq thread just
1556 * Realtime callout cyclic handlers.
1559 callout_realtime(callout_table_t
*ct
)
1561 mutex_enter(&ct
->ct_mutex
);
1562 (void) callout_heap_delete(ct
);
1564 mutex_exit(&ct
->ct_mutex
);
1568 callout_queue_realtime(callout_table_t
*ct
)
1570 mutex_enter(&ct
->ct_mutex
);
1571 (void) callout_queue_delete(ct
);
1573 mutex_exit(&ct
->ct_mutex
);
1577 callout_execute(callout_table_t
*ct
)
1579 mutex_enter(&ct
->ct_mutex
);
1581 mutex_exit(&ct
->ct_mutex
);
1585 * Normal callout cyclic handlers.
1588 callout_normal(callout_table_t
*ct
)
1593 mutex_enter(&ct
->ct_mutex
);
1594 exp
= callout_heap_delete(ct
);
1595 CALLOUT_EXEC_COMPUTE(ct
, exp
, exec
);
1596 mutex_exit(&ct
->ct_mutex
);
1598 for (i
= 0; i
< exec
; i
++) {
1599 ASSERT(ct
->ct_taskq
!= NULL
);
1600 (void) taskq_dispatch(ct
->ct_taskq
,
1601 (task_func_t
*)callout_execute
, ct
, TQ_NOSLEEP
);
1606 callout_queue_normal(callout_table_t
*ct
)
1611 mutex_enter(&ct
->ct_mutex
);
1612 exp
= callout_queue_delete(ct
);
1613 CALLOUT_EXEC_COMPUTE(ct
, exp
, exec
);
1614 mutex_exit(&ct
->ct_mutex
);
1616 for (i
= 0; i
< exec
; i
++) {
1617 ASSERT(ct
->ct_taskq
!= NULL
);
1618 (void) taskq_dispatch(ct
->ct_taskq
,
1619 (task_func_t
*)callout_execute
, ct
, TQ_NOSLEEP
);
1624 * Suspend callout processing.
1627 callout_suspend(void)
1630 callout_table_t
*ct
;
1633 * Traverse every callout table in the system and suspend callout
1636 * We need to suspend all the tables (including the inactive ones)
1637 * so that if a table is made active while the suspend is still on,
1638 * the table remains suspended.
1640 for (f
= 0; f
< max_ncpus
; f
++) {
1641 for (t
= 0; t
< CALLOUT_NTYPES
; t
++) {
1642 ct
= &callout_table
[CALLOUT_TABLE(t
, f
)];
1644 mutex_enter(&ct
->ct_mutex
);
1646 if (ct
->ct_cyclic
== CYCLIC_NONE
) {
1647 mutex_exit(&ct
->ct_mutex
);
1650 if (ct
->ct_suspend
== 1) {
1651 (void) cyclic_reprogram(ct
->ct_cyclic
,
1653 (void) cyclic_reprogram(ct
->ct_qcyclic
,
1656 mutex_exit(&ct
->ct_mutex
);
1662 * Resume callout processing.
1665 callout_resume(hrtime_t delta
, int timechange
)
1667 hrtime_t hexp
, qexp
;
1669 callout_table_t
*ct
;
1672 * Traverse every callout table in the system and resume callout
1673 * processing. For active tables, perform any hrtime adjustments
1676 for (f
= 0; f
< max_ncpus
; f
++) {
1677 for (t
= 0; t
< CALLOUT_NTYPES
; t
++) {
1678 ct
= &callout_table
[CALLOUT_TABLE(t
, f
)];
1680 mutex_enter(&ct
->ct_mutex
);
1681 if (ct
->ct_cyclic
== CYCLIC_NONE
) {
1683 mutex_exit(&ct
->ct_mutex
);
1688 * If a delta is specified, adjust the expirations in
1689 * the heap by delta. Also, if the caller indicates
1690 * a timechange, process that. This step also cleans
1691 * out any empty callout lists that might happen to
1694 hexp
= callout_heap_process(ct
, delta
, timechange
);
1695 qexp
= callout_queue_process(ct
, delta
, timechange
);
1698 if (ct
->ct_suspend
== 0) {
1699 (void) cyclic_reprogram(ct
->ct_cyclic
, hexp
);
1700 (void) cyclic_reprogram(ct
->ct_qcyclic
, qexp
);
1703 mutex_exit(&ct
->ct_mutex
);
1709 * Callback handler used by CPR to stop and resume callouts.
1710 * The cyclic subsystem saves and restores hrtime during CPR.
1711 * That is why callout_resume() is called with a 0 delta.
1712 * Although hrtime is the same, hrestime (system time) has
1713 * progressed during CPR. So, we have to indicate a time change
1714 * to expire the absolute hrestime timers.
1718 callout_cpr_callb(void *arg
, int code
)
1720 if (code
== CB_CODE_CPR_CHKPT
)
1723 callout_resume(0, 1);
1729 * Callback handler invoked when the debugger is entered or exited.
1733 callout_debug_callb(void *arg
, int code
)
* When the system enters the debugger, make a note of the hrtime.
1739 * When it is resumed, compute how long the system was in the
1740 * debugger. This interval should not be counted for callouts.
1744 callout_debug_hrtime
= gethrtime();
1746 delta
= gethrtime() - callout_debug_hrtime
;
1747 callout_resume(delta
, 0);
1754 * Move the absolute hrestime callouts to the expired list. Then program the
1755 * table's cyclic to expire immediately so that the callouts can be executed
1759 callout_hrestime_one(callout_table_t
*ct
)
1761 hrtime_t hexp
, qexp
;
1763 mutex_enter(&ct
->ct_mutex
);
1764 if (ct
->ct_cyclic
== CYCLIC_NONE
) {
1765 mutex_exit(&ct
->ct_mutex
);
1770 * Walk the heap and process all the absolute hrestime entries.
1772 hexp
= callout_heap_process(ct
, 0, 1);
1773 qexp
= callout_queue_process(ct
, 0, 1);
1775 if (ct
->ct_suspend
== 0) {
1776 (void) cyclic_reprogram(ct
->ct_cyclic
, hexp
);
1777 (void) cyclic_reprogram(ct
->ct_qcyclic
, qexp
);
1780 mutex_exit(&ct
->ct_mutex
);
1784 * This function is called whenever system time (hrestime) is changed
1785 * explicitly. All the HRESTIME callouts must be expired at once.
1789 callout_hrestime(void)
1792 callout_table_t
*ct
;
1795 * Traverse every callout table in the system and process the hrestime
1798 * We look at all the tables because we don't know which ones were
1799 * onlined and offlined in the past. The offlined tables may still
1800 * have active cyclics processing timers somewhere.
1802 for (f
= 0; f
< max_ncpus
; f
++) {
1803 for (t
= 0; t
< CALLOUT_NTYPES
; t
++) {
1804 ct
= &callout_table
[CALLOUT_TABLE(t
, f
)];
1805 callout_hrestime_one(ct
);
1811 * Create the hash tables for this callout table.
1814 callout_hash_init(callout_table_t
*ct
)
1818 ASSERT(MUTEX_HELD(&ct
->ct_mutex
));
1819 ASSERT((ct
->ct_idhash
== NULL
) && (ct
->ct_clhash
== NULL
));
1821 size
= sizeof (callout_hash_t
) * CALLOUT_BUCKETS
;
1822 ct
->ct_idhash
= kmem_zalloc(size
, KM_SLEEP
);
1823 ct
->ct_clhash
= kmem_zalloc(size
, KM_SLEEP
);
1827 * Create per-callout table kstats.
1830 callout_kstat_init(callout_table_t
*ct
)
1832 callout_stat_type_t stat
;
1836 ASSERT(MUTEX_HELD(&ct
->ct_mutex
));
1837 ASSERT(ct
->ct_kstats
== NULL
);
1839 ndx
= ct
- callout_table
;
1840 ct_kstats
= kstat_create("unix", ndx
, "callout",
1841 "misc", KSTAT_TYPE_NAMED
, CALLOUT_NUM_STATS
, KSTAT_FLAG_VIRTUAL
);
1843 if (ct_kstats
== NULL
) {
1844 cmn_err(CE_WARN
, "kstat_create for callout table %p failed",
1847 ct_kstats
->ks_data
= ct
->ct_kstat_data
;
1848 for (stat
= 0; stat
< CALLOUT_NUM_STATS
; stat
++)
1849 kstat_named_init(&ct
->ct_kstat_data
[stat
],
1850 callout_kstat_names
[stat
], KSTAT_DATA_INT64
);
1851 ct
->ct_kstats
= ct_kstats
;
1852 kstat_install(ct_kstats
);
1857 callout_cyclic_init(callout_table_t
*ct
)
1861 processorid_t seqid
;
1863 cyclic_id_t cyclic
, qcyclic
;
1865 ASSERT(MUTEX_HELD(&ct
->ct_mutex
));
1868 seqid
= CALLOUT_TABLE_SEQID(ct
);
1871 * Create the taskq thread if the table type is normal.
1872 * Realtime tables are handled at PIL1 by a softint
1875 if (t
== CALLOUT_NORMAL
) {
1876 ASSERT(ct
->ct_taskq
== NULL
);
1878 * Each callout thread consumes exactly one
1879 * task structure while active. Therefore,
1880 * prepopulating with 2 * callout_threads tasks
1881 * ensures that there's at least one task per
1882 * thread that's either scheduled or on the
1883 * freelist. In turn, this guarantees that
1884 * taskq_dispatch() will always either succeed
1885 * (because there's a free task structure) or
* (because "callout_execute(ct)"
1887 * has already scheduled).
1890 taskq_create_instance("callout_taskq", seqid
,
1891 callout_threads
, maxclsyspri
,
1892 2 * callout_threads
, 2 * callout_threads
,
1893 TASKQ_PREPOPULATE
| TASKQ_CPR_SAFE
);
1897 * callouts can only be created in a table whose
1898 * cyclic has been initialized.
1900 ASSERT(ct
->ct_heap_num
== 0);
1903 * Drop the mutex before creating the callout cyclics. cyclic_add()
1904 * could potentially expand the cyclic heap. We don't want to be
1905 * holding the callout table mutex in that case. Note that this
1906 * function is called during CPU online. cpu_lock is held at this
1907 * point. So, only one thread can be executing the cyclic add logic
1908 * below at any time.
1910 mutex_exit(&ct
->ct_mutex
);
1913 * Create the callout table cyclics.
1915 * The realtime cyclic handler executes at low PIL. The normal cyclic
1916 * handler executes at lock PIL. This is because there are cases
1917 * where code can block at PIL > 1 waiting for a normal callout handler
1918 * to unblock it directly or indirectly. If the normal cyclic were to
1919 * be executed at low PIL, it could get blocked out by the waiter
1920 * and cause a deadlock.
1922 ASSERT(ct
->ct_cyclic
== CYCLIC_NONE
);
1924 if (t
== CALLOUT_REALTIME
) {
1925 hdlr
.cyh_level
= callout_realtime_level
;
1926 hdlr
.cyh_func
= (cyc_func_t
)callout_realtime
;
1928 hdlr
.cyh_level
= callout_normal_level
;
1929 hdlr
.cyh_func
= (cyc_func_t
)callout_normal
;
1932 when
.cyt_when
= CY_INFINITY
;
1933 when
.cyt_interval
= CY_INFINITY
;
1935 cyclic
= cyclic_add(&hdlr
, &when
);
1937 if (t
== CALLOUT_REALTIME
)
1938 hdlr
.cyh_func
= (cyc_func_t
)callout_queue_realtime
;
1940 hdlr
.cyh_func
= (cyc_func_t
)callout_queue_normal
;
1942 qcyclic
= cyclic_add(&hdlr
, &when
);
1944 mutex_enter(&ct
->ct_mutex
);
1945 ct
->ct_cyclic
= cyclic
;
1946 ct
->ct_qcyclic
= qcyclic
;
1950 callout_cpu_online(cpu_t
*cp
)
1953 callout_cache_t
*cache
;
1954 char s
[KMEM_CACHE_NAMELEN
];
1955 callout_table_t
*ct
;
1956 processorid_t seqid
;
1959 ASSERT(MUTEX_HELD(&cpu_lock
));
1962 * Locate the cache corresponding to the onlined CPU's lgroup.
1963 * Note that access to callout_caches is protected by cpu_lock.
1965 hand
= lgrp_plat_cpu_to_hand(cp
->cpu_id
);
1966 for (cache
= callout_caches
; cache
!= NULL
; cache
= cache
->cc_next
) {
1967 if (cache
->cc_hand
== hand
)
1972 * If not found, create one. The caches are never destroyed.
1974 if (cache
== NULL
) {
1975 cache
= kmem_alloc(sizeof (callout_cache_t
), KM_SLEEP
);
1976 cache
->cc_hand
= hand
;
1977 (void) snprintf(s
, KMEM_CACHE_NAMELEN
, "callout_cache%lx",
1979 cache
->cc_cache
= kmem_cache_create(s
, sizeof (callout_t
),
1980 CALLOUT_ALIGN
, NULL
, NULL
, NULL
, NULL
, NULL
, 0);
1981 (void) snprintf(s
, KMEM_CACHE_NAMELEN
, "callout_lcache%lx",
1983 cache
->cc_lcache
= kmem_cache_create(s
, sizeof (callout_list_t
),
1984 CALLOUT_ALIGN
, NULL
, NULL
, NULL
, NULL
, NULL
, 0);
1985 cache
->cc_next
= callout_caches
;
1986 callout_caches
= cache
;
1989 seqid
= cp
->cpu_seqid
;
1991 for (t
= 0; t
< CALLOUT_NTYPES
; t
++) {
1992 ct
= &callout_table
[CALLOUT_TABLE(t
, seqid
)];
1994 mutex_enter(&ct
->ct_mutex
);
* Store convenience pointers to the kmem caches
1997 * in the callout table. These assignments should always be
1998 * done as callout tables can map to different physical
2001 ct
->ct_cache
= cache
->cc_cache
;
2002 ct
->ct_lcache
= cache
->cc_lcache
;
2005 * We use the heap pointer to check if stuff has been
2006 * initialized for this callout table.
2008 if (ct
->ct_heap
== NULL
) {
2009 callout_heap_init(ct
);
2010 callout_hash_init(ct
);
2011 callout_kstat_init(ct
);
2012 callout_cyclic_init(ct
);
2015 mutex_exit(&ct
->ct_mutex
);
2018 * Move the cyclics to this CPU by doing a bind.
2020 cyclic_bind(ct
->ct_cyclic
, cp
, NULL
);
2021 cyclic_bind(ct
->ct_qcyclic
, cp
, NULL
);
2026 callout_cpu_offline(cpu_t
*cp
)
2028 callout_table_t
*ct
;
2029 processorid_t seqid
;
2032 ASSERT(MUTEX_HELD(&cpu_lock
));
2034 seqid
= cp
->cpu_seqid
;
2036 for (t
= 0; t
< CALLOUT_NTYPES
; t
++) {
2037 ct
= &callout_table
[CALLOUT_TABLE(t
, seqid
)];
2040 * Unbind the cyclics. This will allow the cyclic subsystem
2041 * to juggle the cyclics during CPU offline.
2043 cyclic_bind(ct
->ct_cyclic
, NULL
, NULL
);
2044 cyclic_bind(ct
->ct_qcyclic
, NULL
, NULL
);
2049 * This is called to perform per-CPU initialization for slave CPUs at
2053 callout_mp_init(void)
2058 if (callout_chunk
== CALLOUT_CHUNK
) {
2060 * No one has specified a chunk in /etc/system. We need to
2061 * compute it here based on the number of online CPUs and
2062 * available physical memory.
2064 min
= CALLOUT_MIN_HEAP_SIZE
;
2065 max
= ptob(physmem
/ CALLOUT_MEM_FRACTION
);
2068 callout_chunk
= min
/ sizeof (callout_heap_t
);
2069 callout_chunk
/= ncpus_online
;
2070 callout_chunk
= P2ROUNDUP(callout_chunk
, CALLOUT_CHUNK
);
2073 mutex_enter(&cpu_lock
);
2077 callout_cpu_online(cp
);
2078 } while ((cp
= cp
->cpu_next_onln
) != cpu_active
);
2080 mutex_exit(&cpu_lock
);
2084 * Initialize all callout tables. Called at boot time just before clkstart().
2092 callout_table_t
*ct
;
2097 * Initialize callout globals.
2100 for (fanout
= 1; (fanout
< max_ncpus
); fanout
<<= 1)
2102 callout_table_bits
= CALLOUT_TYPE_BITS
+ bits
;
2103 callout_table_mask
= (1 << callout_table_bits
) - 1;
2104 callout_counter_low
= 1 << CALLOUT_COUNTER_SHIFT
;
2105 callout_longterm
= TICK_TO_NSEC(CALLOUT_LONGTERM_TICKS
);
2106 callout_max_ticks
= CALLOUT_MAX_TICKS
;
2107 if (callout_min_reap
== 0)
2108 callout_min_reap
= CALLOUT_MIN_REAP
;
2110 if (callout_tolerance
<= 0)
2111 callout_tolerance
= CALLOUT_TOLERANCE
;
2112 if (callout_threads
<= 0)
2113 callout_threads
= CALLOUT_THREADS
;
2114 if (callout_chunk
<= 0)
2115 callout_chunk
= CALLOUT_CHUNK
;
2117 callout_chunk
= P2ROUNDUP(callout_chunk
, CALLOUT_CHUNK
);
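
	/*
	 * P2ROUNDUP() rounds its first argument up to the next multiple of
	 * its (power-of-two) second argument. For example, if CALLOUT_CHUNK
	 * were 256, a hand-tuned callout_chunk of 300 would be rounded up
	 * to 512 here, keeping heap growth in whole CALLOUT_CHUNK-sized
	 * steps.
	 */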
2120 * Allocate all the callout tables based on max_ncpus. We have chosen
2121 * to do boot-time allocation instead of dynamic allocation because:
2123 * - the size of the callout tables is not too large.
2124 * - there are race conditions involved in making this dynamic.
2125 * - the hash tables that go with the callout tables consume
2126 * most of the memory and they are only allocated in
2127 * callout_cpu_online().
2129 * Each CPU has two tables that are consecutive in the array. The first
2130 * one is for realtime callouts and the second one is for normal ones.
2132 * We do this alignment dance to make sure that callout table
2133 * structures will always be on a cache line boundary.
2135 size
= sizeof (callout_table_t
) * CALLOUT_NTYPES
* max_ncpus
;
2136 size
+= CALLOUT_ALIGN
;
2137 buf
= (uintptr_t)kmem_zalloc(size
, KM_SLEEP
);
2138 callout_table
= (callout_table_t
*)P2ROUNDUP(buf
, CALLOUT_ALIGN
);
2140 size
= sizeof (kstat_named_t
) * CALLOUT_NUM_STATS
;
2142 * Now, initialize the tables for all the CPUs.
2144 for (f
= 0; f
< max_ncpus
; f
++) {
2145 for (t
= 0; t
< CALLOUT_NTYPES
; t
++) {
2146 table_id
= CALLOUT_TABLE(t
, f
);
2147 ct
= &callout_table
[table_id
];
2149 mutex_init(&ct
->ct_mutex
, NULL
, MUTEX_DEFAULT
, NULL
);
2151 * Precompute the base IDs for long and short-term
2152 * legacy IDs. This makes ID generation during
2155 ct
->ct_short_id
= CALLOUT_SHORT_ID(table_id
);
2156 ct
->ct_long_id
= CALLOUT_LONG_ID(table_id
);
2158 * Precompute the base ID for generation-based IDs.
2159 * Note that when the first ID gets allocated, the
2160 * ID will wrap. This will cause the generation
2161 * number to be incremented to 1.
2163 ct
->ct_gen_id
= CALLOUT_SHORT_ID(table_id
);
2165 * Initialize the cyclics as NONE. This will get set
2166 * during CPU online. This is so that partially
2167 * populated systems will only have the required
2168 * number of cyclics, not more.
2170 ct
->ct_cyclic
= CYCLIC_NONE
;
2171 ct
->ct_qcyclic
= CYCLIC_NONE
;
2172 ct
->ct_kstat_data
= kmem_zalloc(size
, KM_SLEEP
);
2177 * Add the callback for CPR. This is called during checkpoint
2178 * resume to suspend and resume callouts.
2180 (void) callb_add(callout_cpr_callb
, 0, CB_CL_CPR_CALLOUT
,
2182 (void) callb_add(callout_debug_callb
, 0, CB_CL_ENTER_DEBUGGER
,
2186 * Call the per-CPU initialization function for the boot CPU. This
2187 * is done here because the function is not called automatically for
2188 * the boot CPU from the CPU online/offline hooks. Note that the
2189 * CPU lock is taken here because of convention.
2191 mutex_enter(&cpu_lock
);
2192 callout_boot_ct
= &callout_table
[CALLOUT_TABLE(0, CPU
->cpu_seqid
)];
2193 callout_cpu_online(CPU
);
2194 mutex_exit(&cpu_lock
);
	/* heads-up to boot-time clients that timeouts now available */
	callout_init_done = 1;