5 * Functions for collecting statistics.
8 //===----------------------------------------------------------------------===//
10 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
11 // See https://llvm.org/LICENSE.txt for license information.
12 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
14 //===----------------------------------------------------------------------===//
16 #include "kmp_config.h"
17 #include "kmp_debug.h"
20 /* Statistics accumulator.
21 Accumulates number of samples and computes min, max, mean, standard deviation
24 Online variance calculation algorithm from
25 http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm
28 #include "kmp_stats_timing.h"
31 #include <new> // placement new
36 /* Enable developer statistics here if you want them. They are more detailed
37 than is useful for application characterisation and are intended for the
38 runtime library developer. */
39 #define KMP_DEVELOPER_STATS 0
41 /* Enable/Disable histogram output */
42 #define KMP_STATS_HIST 0
45 * @ingroup STATS_GATHERING
46 * \brief flags to describe the statistic (timer or counter)
50 noTotal
= 1 << 0, //!< do not show a TOTAL_aggregation for this statistic
51 onlyInMaster
= 1 << 1, //!< statistic is valid only for primary thread
52 noUnits
= 1 << 2, //!< statistic doesn't need units printed next to it
53 notInMaster
= 1 << 3, //!< statistic is valid only for non-primary threads
54 logEvent
= 1 << 4 //!< statistic can be logged on the event timeline when
55 //! KMP_STATS_EVENTS is on (valid only for timers)
59 * @ingroup STATS_GATHERING
60 * \brief the states which a thread can be in
77 * \brief Add new counters under KMP_FOREACH_COUNTER() macro in kmp_stats.h
79 * @param macro a user defined macro that takes three arguments -
80 * macro(COUNTER_NAME, flags, arg)
81 * @param arg a user defined argument to send to the user defined macro
83 * \details A counter counts the occurrence of some event. Each thread
84 * accumulates its own count, at the end of execution the counts are aggregated
85 * treating each thread as a separate measurement. (Unless onlyInMaster is set,
86 * in which case there's only a single measurement). The min,mean,max are
87 * therefore the values for the threads. Adding the counter here and then
88 * putting a KMP_COUNT_BLOCK(name) at the point you want to count is all you
89 * need to do. All of the tables and printing is generated from this macro.
90 * Format is "macro(name, flags, arg)"
92 * @ingroup STATS_GATHERING
// X-macro table of all counters: expands macro(NAME, flags, arg) once per
// entry, in the order listed below.
#define KMP_FOREACH_COUNTER(macro, arg)                                        \
  macro(OMP_PARALLEL,                                                          \
        stats_flags_e::onlyInMaster | stats_flags_e::noTotal, arg)             \
  macro(OMP_NESTED_PARALLEL, 0, arg)                                           \
  macro(OMP_LOOP_STATIC, 0, arg)                                               \
  macro(OMP_LOOP_STATIC_STEAL, 0, arg)                                         \
  macro(OMP_LOOP_DYNAMIC, 0, arg)                                              \
  macro(OMP_DISTRIBUTE, 0, arg)                                                \
  macro(OMP_BARRIER, 0, arg)                                                   \
  macro(OMP_CRITICAL, 0, arg)                                                  \
  macro(OMP_SINGLE, 0, arg)                                                    \
  macro(OMP_SECTIONS, 0, arg)                                                  \
  macro(OMP_MASTER, 0, arg)                                                    \
  macro(OMP_MASKED, 0, arg)                                                    \
  macro(OMP_TEAMS, 0, arg)                                                     \
  macro(OMP_set_lock, 0, arg)                                                  \
  macro(OMP_test_lock, 0, arg)                                                 \
  macro(REDUCE_wait, 0, arg)                                                   \
  macro(REDUCE_nowait, 0, arg)                                                 \
  macro(OMP_TASKYIELD, 0, arg)                                                 \
  macro(OMP_TASKLOOP, 0, arg)                                                  \
  macro(TASK_executed, 0, arg)                                                 \
  macro(TASK_cancelled, 0, arg)                                                \
  macro(TASK_stolen, 0, arg)
121 * \brief Add new timers under KMP_FOREACH_TIMER() macro in kmp_stats.h
123 * @param macro a user defined macro that takes three arguments -
124 * macro(TIMER_NAME, flags, arg)
125 * @param arg a user defined argument to send to the user defined macro
127 * \details A timer collects multiple samples of some count in each thread and
128 * then finally aggregates all of the samples from all of the threads. For most
129 * timers the printing code also provides an aggregation over the thread totals.
130 * These are printed as TOTAL_foo. The count is normally a time (in ticks),
131 * hence the name "timer". (But can be any value, so we use this for "number of
132 * arguments passed to fork" as well). For timers the threads are not
133 * significant, it's the individual observations that count, so the statistics
134 * are at that level. Format is "macro(name, flags, arg)"
136 * @ingroup STATS_GATHERING
139 #define KMP_FOREACH_TIMER(macro, arg) \
140 macro (OMP_worker_thread_life, stats_flags_e::logEvent, arg) \
141 macro (OMP_parallel, stats_flags_e::logEvent, arg) \
142 macro (OMP_parallel_overhead, stats_flags_e::logEvent, arg) \
143 macro (OMP_teams, stats_flags_e::logEvent, arg) \
144 macro (OMP_teams_overhead, stats_flags_e::logEvent, arg) \
145 macro (OMP_loop_static, 0, arg) \
146 macro (OMP_loop_static_scheduling, 0, arg) \
147 macro (OMP_loop_dynamic, 0, arg) \
148 macro (OMP_loop_dynamic_scheduling, 0, arg) \
149 macro (OMP_distribute, 0, arg) \
150 macro (OMP_distribute_scheduling, 0, arg) \
151 macro (OMP_critical, 0, arg) \
152 macro (OMP_critical_wait, 0, arg) \
153 macro (OMP_single, 0, arg) \
154 macro (OMP_sections, 0, arg) \
155 macro (OMP_sections_overhead, 0, arg) \
156 macro (OMP_master, 0, arg) \
157 macro (OMP_masked, 0, arg) \
158 macro (OMP_task_immediate, 0, arg) \
159 macro (OMP_task_taskwait, 0, arg) \
160 macro (OMP_task_taskyield, 0, arg) \
161 macro (OMP_task_taskgroup, 0, arg) \
162 macro (OMP_task_join_bar, 0, arg) \
163 macro (OMP_task_plain_bar, 0, arg) \
164 macro (OMP_taskloop_scheduling, 0, arg) \
165 macro (OMP_plain_barrier, stats_flags_e::logEvent, arg) \
166 macro (OMP_idle, stats_flags_e::logEvent, arg) \
167 macro (OMP_fork_barrier, stats_flags_e::logEvent, arg) \
168 macro (OMP_join_barrier, stats_flags_e::logEvent, arg) \
169 macro (OMP_serial, stats_flags_e::logEvent, arg) \
170 macro (OMP_set_numthreads, stats_flags_e::noUnits | stats_flags_e::noTotal, \
172 macro (OMP_PARALLEL_args, stats_flags_e::noUnits | stats_flags_e::noTotal, \
174 macro (OMP_loop_static_iterations, \
175 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
176 macro (OMP_loop_static_total_iterations, \
177 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
178 macro (OMP_loop_dynamic_iterations, \
179 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
180 macro (OMP_loop_dynamic_total_iterations, \
181 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
182 macro (OMP_distribute_iterations, \
183 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
184 KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
187 // OMP_worker_thread_life -- Time from thread becoming an OpenMP thread (either
188 // initializing OpenMP or being created by a primary
189 // thread) until the thread is destroyed
190 // OMP_parallel -- Time thread spends executing work directly
191 // within a #pragma omp parallel
192 // OMP_parallel_overhead -- Time thread spends setting up a parallel region
193 // OMP_loop_static -- Time thread spends executing loop iterations from
194 // a statically scheduled loop
195 // OMP_loop_static_scheduling -- Time thread spends scheduling loop iterations
196 // from a statically scheduled loop
197 // OMP_loop_dynamic -- Time thread spends executing loop iterations from
198 // a dynamically scheduled loop
199 // OMP_loop_dynamic_scheduling -- Time thread spends scheduling loop iterations
200 // from a dynamically scheduled loop
201 // OMP_critical -- Time thread spends executing critical section
202 // OMP_critical_wait -- Time thread spends waiting to enter
203 // a critical section
204 // OMP_single -- Time spent executing a "single" region
205 // OMP_master -- Time spent executing a "master" region
206 // OMP_masked -- Time spent executing a "masked" region
207 // OMP_task_immediate -- Time spent executing non-deferred tasks
208 // OMP_task_taskwait -- Time spent executing tasks inside a taskwait
210 // OMP_task_taskyield -- Time spent executing tasks inside a taskyield
212 // OMP_task_taskgroup -- Time spent executing tasks inside a taskgroup
214 // OMP_task_join_bar -- Time spent executing tasks inside a join barrier
215 // OMP_task_plain_bar -- Time spent executing tasks inside a barrier
217 // OMP_taskloop_scheduling -- Time spent scheduling tasks inside a taskloop
219 // OMP_plain_barrier -- Time spent in a #pragma omp barrier construct or
220 // inside implicit barrier at end of worksharing
222 // OMP_idle -- Time worker threads spend waiting for next
224 // OMP_fork_barrier -- Time spent in the fork barrier surrounding a
226 // OMP_join_barrier -- Time spent in the join barrier surrounding a
228 // OMP_serial -- Time thread zero spends executing serial code
229 // OMP_set_numthreads -- Values passed to omp_set_num_threads
230 // OMP_PARALLEL_args -- Number of arguments passed to a parallel region
231 // OMP_loop_static_iterations -- Number of iterations thread is assigned for
232 // statically scheduled loops
233 // OMP_loop_dynamic_iterations -- Number of iterations thread is assigned for
234 // dynamically scheduled loops
236 #if (KMP_DEVELOPER_STATS)
237 // Timers which are of interest to runtime library developers, not end users.
238 // These have to be explicitly enabled in addition to the other stats.
240 // KMP_fork_barrier -- time in __kmp_fork_barrier
241 // KMP_join_barrier -- time in __kmp_join_barrier
242 // KMP_barrier -- time in __kmp_barrier
243 // KMP_end_split_barrier -- time in __kmp_end_split_barrier
244 // KMP_setup_icv_copy -- time in __kmp_setup_icv_copy
245 // KMP_icv_copy -- start/stop timer for any ICV copying
246 // KMP_linear_gather -- time in __kmp_linear_barrier_gather
247 // KMP_linear_release -- time in __kmp_linear_barrier_release
248 // KMP_tree_gather -- time in __kmp_tree_barrier_gather
249 // KMP_tree_release -- time in __kmp_tree_barrier_release
250 // KMP_hyper_gather -- time in __kmp_hyper_barrier_gather
251 // KMP_hyper_release -- time in __kmp_hyper_barrier_release
252 // KMP_dist_gather -- time in __kmp_dist_barrier_gather
253 // KMP_dist_release -- time in __kmp_dist_barrier_release
// X-macro table of the developer-only timers: expands macro(NAME, flags, arg)
// once per entry, in the order listed below.
#define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)                                \
  macro(KMP_fork_call, 0, arg)                                                 \
  macro(KMP_join_call, 0, arg)                                                 \
  macro(KMP_end_split_barrier, 0, arg)                                         \
  macro(KMP_hier_gather, 0, arg)                                               \
  macro(KMP_hier_release, 0, arg)                                              \
  macro(KMP_hyper_gather, 0, arg)                                              \
  macro(KMP_hyper_release, 0, arg)                                             \
  macro(KMP_dist_gather, 0, arg)                                               \
  macro(KMP_dist_release, 0, arg)                                              \
  macro(KMP_linear_gather, 0, arg)                                             \
  macro(KMP_linear_release, 0, arg)                                            \
  macro(KMP_tree_gather, 0, arg)                                               \
  macro(KMP_tree_release, 0, arg)                                              \
  macro(USER_resume, 0, arg)                                                   \
  macro(USER_suspend, 0, arg)                                                  \
  macro(USER_mwait, 0, arg)                                                    \
  macro(KMP_allocate_team, 0, arg)                                             \
  macro(KMP_setup_icv_copy, 0, arg)                                            \
  macro(USER_icv_copy, 0, arg)                                                 \
  macro(FOR_static_steal_stolen,                                               \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                  \
  macro(FOR_static_steal_chunks,                                               \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)
280 #define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
285 * \brief Add new explicit timers under KMP_FOREACH_EXPLICIT_TIMER() macro.
287 * @param macro a user defined macro that takes three arguments -
288 * macro(TIMER_NAME, flags, arg)
289 * @param arg a user defined argument to send to the user defined macro
291 * \warning YOU MUST HAVE THE SAME NAMED TIMER UNDER KMP_FOREACH_TIMER() OR ELSE
292 * BAD THINGS WILL HAPPEN!
294 * \details Explicit timers are ones where we need to allocate a timer itself
295 * (as well as the accumulated timing statistics). We allocate these on a
296 * per-thread basis, and explicitly start and stop them. Block timers just
297 * allocate the timer itself on the stack, and use the destructor to notice
298 * block exit; they don't need to be defined here. The name here should be the
299 * same as that of a timer above.
301 * @ingroup STATS_GATHERING
303 #define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) KMP_FOREACH_TIMER(macro, arg)
305 #define ENUMERATE(name, ignore, prefix) prefix##name,
306 enum timer_e
{ KMP_FOREACH_TIMER(ENUMERATE
, TIMER_
) TIMER_LAST
};
308 enum explicit_timer_e
{
309 KMP_FOREACH_EXPLICIT_TIMER(ENUMERATE
, EXPLICIT_TIMER_
) EXPLICIT_TIMER_LAST
312 enum counter_e
{ KMP_FOREACH_COUNTER(ENUMERATE
, COUNTER_
) COUNTER_LAST
};
316 * A logarithmic histogram. It accumulates the number of values in each power of
317 * ten bin. So 1<=x<10, 10<=x<100, ...
318 * Mostly useful where we have some big outliers and want to see information
323 numBins
= 31, /* Number of powers of 10. If this changes you need to change
324 * the initializer for binMax */
327 * If you want to use this to analyse values that may be less than 1, (for
328 * instance times in s), then the logOffset gives you negative powers.
329 * In our case here, we're just looking at times in ticks, or counts, so we
330 * can never see values with magnitude < 1 (other than zero), so we can set
331 * it to 0. As above change the initializer if you change this.
335 uint32_t KMP_ALIGN_CACHE zeroCount
;
341 static double binMax
[numBins
];
347 uint64_t t
= zeroCount
;
348 for (int i
= 0; i
< numBins
; i
++)
350 KMP_DEBUG_ASSERT(t
== _total
);
353 void check() const {}
357 logHistogram() { reset(); }
359 logHistogram(logHistogram
const &o
) {
360 for (int i
= 0; i
< numBins
; i
++)
369 for (int i
= 0; i
< numBins
; i
++) {
378 uint32_t count(int b
) const { return bins
[b
+ logOffset
].count
; }
379 double total(int b
) const { return bins
[b
+ logOffset
].total
; }
380 static uint32_t findBin(double sample
);
382 logHistogram
&operator+=(logHistogram
const &o
) {
383 zeroCount
+= o
.zeroCount
;
384 for (int i
= 0; i
< numBins
; i
++) {
385 bins
[i
].count
+= o
.bins
[i
].count
;
386 bins
[i
].total
+= o
.bins
[i
].total
;
396 void addSample(double sample
);
400 std::string
format(char) const;
404 double KMP_ALIGN_CACHE minVal
;
408 uint64_t sampleCount
;
414 statistic(bool doHist
= bool(KMP_STATS_HIST
)) {
416 collectingHist
= doHist
;
418 statistic(statistic
const &o
)
419 : minVal(o
.minVal
), maxVal(o
.maxVal
), meanVal(o
.meanVal
), m2(o
.m2
),
420 sampleCount(o
.sampleCount
), offset(o
.offset
),
421 collectingHist(o
.collectingHist
), hist(o
.hist
) {}
422 statistic(double minv
, double maxv
, double meanv
, uint64_t sc
, double sd
)
423 : minVal(minv
), maxVal(maxv
), meanVal(meanv
), m2(sd
* sd
* sc
),
424 sampleCount(sc
), offset(0.0), collectingHist(false) {}
425 bool haveHist() const { return collectingHist
; }
426 double getMin() const { return minVal
; }
427 double getMean() const { return meanVal
; }
428 double getMax() const { return maxVal
; }
429 uint64_t getCount() const { return sampleCount
; }
430 double getSD() const { return sqrt(m2
/ sampleCount
); }
431 double getTotal() const { return sampleCount
* meanVal
; }
432 logHistogram
const *getHist() const { return &hist
; }
433 void setOffset(double d
) { offset
= d
; }
436 minVal
= (std::numeric_limits
<double>::max
)();
444 void addSample(double sample
);
445 void scale(double factor
);
446 void scaleDown(double f
) { scale(1. / f
); }
447 void forceCount(uint64_t count
) { sampleCount
= count
; }
448 statistic
&operator+=(statistic
const &other
);
450 std::string
format(char unit
, bool total
= false) const;
451 std::string
formatHist(char unit
) const { return hist
.format(unit
); }
459 class timeStat
: public statistic
{
460 static statInfo timerInfo
[];
463 timeStat() : statistic() {}
464 static const char *name(timer_e e
) { return timerInfo
[e
].name
; }
465 static bool noTotal(timer_e e
) {
466 return timerInfo
[e
].flags
& stats_flags_e::noTotal
;
468 static bool masterOnly(timer_e e
) {
469 return timerInfo
[e
].flags
& stats_flags_e::onlyInMaster
;
471 static bool workerOnly(timer_e e
) {
472 return timerInfo
[e
].flags
& stats_flags_e::notInMaster
;
474 static bool noUnits(timer_e e
) {
475 return timerInfo
[e
].flags
& stats_flags_e::noUnits
;
477 static bool logEvent(timer_e e
) {
478 return timerInfo
[e
].flags
& stats_flags_e::logEvent
;
480 static void clearEventFlags() {
481 for (int i
= 0; i
< TIMER_LAST
; i
++) {
482 timerInfo
[i
].flags
&= (~(stats_flags_e::logEvent
));
487 // Where we need explicitly to start and end the timer, this version can be used.
488 // Because these timers normally aren't nicely scoped, and so don't have a good
489 // place to live on the stack of the thread, they're more work to use.
490 class explicitTimer
{
492 timer_e timerEnumValue
;
493 tsc_tick_count startTime
;
494 tsc_tick_count pauseStartTime
;
495 tsc_tick_count::tsc_interval_t totalPauseTime
;
498 explicitTimer(timeStat
*s
, timer_e te
)
499 : stat(s
), timerEnumValue(te
), startTime(), pauseStartTime(0),
502 // void setStat(timeStat *s) { stat = s; }
503 void start(tsc_tick_count tick
);
504 void pause(tsc_tick_count tick
) { pauseStartTime
= tick
; }
505 void resume(tsc_tick_count tick
) {
506 totalPauseTime
+= (tick
- pauseStartTime
);
508 void stop(tsc_tick_count tick
, kmp_stats_list
*stats_ptr
= nullptr);
514 timer_e
get_type() const { return timerEnumValue
; }
517 // Where you need to partition a thread's clock ticks into separate states
518 // e.g., a partitionedTimers class with two timers of EXECUTING_TASK, and
519 // DOING_NOTHING would render these conditions:
520 // time(EXECUTING_TASK) + time(DOING_NOTHING) = total time thread is alive
521 // No clock tick in the EXECUTING_TASK is a member of DOING_NOTHING and vice
523 class partitionedTimers
{
525 std::vector
<explicitTimer
> timer_stack
;
529 void init(explicitTimer timer
);
530 void exchange(explicitTimer timer
);
531 void push(explicitTimer timer
);
536 // Special wrapper around the partitioned timers to aid timing code blocks
537 // It avoids the need to have an explicit end, leaving the scope suffices.
538 class blockPartitionedTimer
{
539 partitionedTimers
*part_timers
;
542 blockPartitionedTimer(partitionedTimers
*pt
, explicitTimer timer
)
544 part_timers
->push(timer
);
546 ~blockPartitionedTimer() { part_timers
->pop(); }
549 // Special wrapper around the thread state to aid in keeping state in code
550 // blocks. It avoids the need to have an explicit end, leaving the scope
552 class blockThreadState
{
553 stats_state_e
*state_pointer
;
554 stats_state_e old_state
;
557 blockThreadState(stats_state_e
*thread_state_pointer
, stats_state_e new_state
)
558 : state_pointer(thread_state_pointer
), old_state(*thread_state_pointer
) {
559 *state_pointer
= new_state
;
561 ~blockThreadState() { *state_pointer
= old_state
; }
564 // If all you want is a count, then you can use this...
565 // The individual per-thread counts will be aggregated into a statistic at
569 static const statInfo counterInfo
[];
572 counter() : value(0) {}
573 void increment() { value
++; }
574 uint64_t getValue() const { return value
; }
575 void reset() { value
= 0; }
576 static const char *name(counter_e e
) { return counterInfo
[e
].name
; }
577 static bool masterOnly(counter_e e
) {
578 return counterInfo
[e
].flags
& stats_flags_e::onlyInMaster
;
582 /* ****************************************************************
583 Class to implement an event
585 There are four components to an event: start time, stop time,
586 nest_level, and timer_name.
587 The start and stop time should be obvious (recorded in clock ticks).
588 The nest_level relates to the bar width in the timeline graph.
589 The timer_name is used to determine which timer event triggered this event.
591 the interface to this class is through four read-only operations:
592 1) getStart() -- returns the start time as 64 bit integer
593 2) getStop() -- returns the stop time as 64 bit integer
594 3) getNestLevel() -- returns the nest level of the event
595 4) getTimerName() -- returns the timer name that triggered event
598 The nest level is used in the bar graph that represents the timeline.
599 Its main purpose is for showing how events are nested inside each other.
600 For example, say events, A, B, and C are recorded. If the timeline
603 Begin -------------------------------------------------------------> Time
606 start start start end end end
608 Then A, B, C will have a nest level of 1, 2, 3 respectively.
609 These values are then used to calculate the barwidth so you can
610 see that inside A, B has occurred, and inside B, C has occurred.
611 Currently, this is shown with A's bar width being larger than B's
612 bar width, and B's bar width being larger than C's bar width.
614 **************************************************************** */
615 class kmp_stats_event
{
623 : start(0), stop(0), nest_level(0), timer_name(TIMER_LAST
) {}
624 kmp_stats_event(uint64_t strt
, uint64_t stp
, int nst
, timer_e nme
)
625 : start(strt
), stop(stp
), nest_level(nst
), timer_name(nme
) {}
626 inline uint64_t getStart() const { return start
; }
627 inline uint64_t getStop() const { return stop
; }
628 inline int getNestLevel() const { return nest_level
; }
629 inline timer_e
getTimerName() const { return timer_name
; }
632 /* ****************************************************************
633 Class to implement a dynamically expandable array of events
635 ---------------------------------------------------------
636 | event 1 | event 2 | event 3 | event 4 | ... | event N |
637 ---------------------------------------------------------
639 An event is pushed onto the back of this array at every
640 explicitTimer->stop() call. The event records the start time,
641 stop time, nest level (related to the bar width), and timer name.
643 The event vector starts at size INIT_SIZE and grows (doubles in size)
644 if needed. An implication of this behavior is that log(N)
645 reallocations are needed (where N is number of events). If you want
646 to avoid reallocations, then set INIT_SIZE to a large value.
648 the interface to this class is through six operations:
649 1) reset() -- sets the internal_size back to 0 but does not deallocate any
651 2) size() -- returns the number of valid elements in the vector
652 3) push_back(start, stop, nest, timer_name) -- pushes an event onto
653 the back of the array
654 4) deallocate() -- frees all memory associated with the vector
655 5) sort() -- sorts the vector by start time
656 6) operator[index] or at(index) -- returns event reference at that index
657 **************************************************************** */
658 class kmp_stats_event_vector
{
659 kmp_stats_event
*events
;
662 static const int INIT_SIZE
= 1024;
665 kmp_stats_event_vector() {
667 (kmp_stats_event
*)__kmp_allocate(sizeof(kmp_stats_event
) * INIT_SIZE
);
669 allocated_size
= INIT_SIZE
;
671 ~kmp_stats_event_vector() {}
672 inline void reset() { internal_size
= 0; }
673 inline int size() const { return internal_size
; }
674 void push_back(uint64_t start_time
, uint64_t stop_time
, int nest_level
,
677 if (internal_size
== allocated_size
) {
678 kmp_stats_event
*tmp
= (kmp_stats_event
*)__kmp_allocate(
679 sizeof(kmp_stats_event
) * allocated_size
* 2);
680 for (i
= 0; i
< internal_size
; i
++)
686 events
[internal_size
] =
687 kmp_stats_event(start_time
, stop_time
, nest_level
, name
);
693 const kmp_stats_event
&operator[](int index
) const { return events
[index
]; }
694 kmp_stats_event
&operator[](int index
) { return events
[index
]; }
695 const kmp_stats_event
&at(int index
) const { return events
[index
]; }
696 kmp_stats_event
&at(int index
) { return events
[index
]; }
699 /* ****************************************************************
700 Class to implement a doubly-linked, circular, statistics list
702 |---| ---> |---| ---> |---| ---> |---| ---> ... next
704 |---| <--- |---| <--- |---| <--- |---| <--- ... prev
705 Sentinel first second third
708 The Sentinel Node is the user handle on the list.
709 The first node corresponds to thread 0's statistics.
710 The second node corresponds to thread 1's statistics and so on...
712 Each node has a _timers, _counters, and _explicitTimers array to hold that
713 thread's statistics. The _explicitTimers point to the correct _timer and
714 update its statistics at every stop() call. The explicitTimers' pointers are
715 set up in the constructor. Each node also has an event vector to hold that
716 thread's timing events. The event vector expands as necessary and records
717 the start-stop times for each timer.
719 The nestLevel variable is for plotting events and is related
720 to the bar width in the timeline graph.
722 Every thread will have a thread local pointer to its node in
723 the list. The sentinel node is used by the primary thread to
724 store "dummy" statistics before __kmp_create_worker() is called.
725 **************************************************************** */
726 class kmp_stats_list
{
728 timeStat _timers
[TIMER_LAST
+ 1];
729 counter _counters
[COUNTER_LAST
+ 1];
730 explicitTimer thread_life_timer
;
731 partitionedTimers _partitionedTimers
;
732 int _nestLevel
; // one per thread
733 kmp_stats_event_vector _event_vector
;
734 kmp_stats_list
*next
;
735 kmp_stats_list
*prev
;
737 int thread_is_idle_flag
;
741 : thread_life_timer(&_timers
[TIMER_OMP_worker_thread_life
],
742 TIMER_OMP_worker_thread_life
),
743 _nestLevel(0), _event_vector(), next(this), prev(this), state(IDLE
),
744 thread_is_idle_flag(0) {}
746 inline timeStat
*getTimer(timer_e idx
) { return &_timers
[idx
]; }
747 inline counter
*getCounter(counter_e idx
) { return &_counters
[idx
]; }
748 inline partitionedTimers
*getPartitionedTimers() {
749 return &_partitionedTimers
;
751 inline timeStat
*getTimers() { return _timers
; }
752 inline counter
*getCounters() { return _counters
; }
753 inline kmp_stats_event_vector
&getEventVector() { return _event_vector
; }
754 inline void startLife() { thread_life_timer
.start(tsc_tick_count::now()); }
755 inline void endLife() { thread_life_timer
.stop(tsc_tick_count::now(), this); }
756 inline void resetEventVector() { _event_vector
.reset(); }
757 inline void incrementNestValue() { _nestLevel
++; }
758 inline int getNestValue() { return _nestLevel
; }
759 inline void decrementNestValue() { _nestLevel
--; }
760 inline int getGtid() const { return gtid
; }
761 inline void setGtid(int newgtid
) { gtid
= newgtid
; }
762 inline void setState(stats_state_e newstate
) { state
= newstate
; }
763 inline stats_state_e
getState() const { return state
; }
764 inline stats_state_e
*getStatePointer() { return &state
; }
765 inline bool isIdle() { return thread_is_idle_flag
== 1; }
766 inline void setIdleFlag() { thread_is_idle_flag
= 1; }
767 inline void resetIdleFlag() { thread_is_idle_flag
= 0; }
768 kmp_stats_list
*push_back(int gtid
); // returns newly created list node
769 inline void push_event(uint64_t start_time
, uint64_t stop_time
,
770 int nest_level
, timer_e name
) {
771 _event_vector
.push_back(start_time
, stop_time
, nest_level
, name
);
775 kmp_stats_list::iterator
begin();
776 kmp_stats_list::iterator
end();
780 friend kmp_stats_list::iterator
kmp_stats_list::begin();
781 friend kmp_stats_list::iterator
kmp_stats_list::end();
786 iterator
operator++();
787 iterator
operator++(int dummy
);
788 iterator
operator--();
789 iterator
operator--(int dummy
);
790 bool operator!=(const iterator
&rhs
);
791 bool operator==(const iterator
&rhs
);
792 kmp_stats_list
*operator*() const; // dereference operator
796 /* ****************************************************************
797 Class to encapsulate all output functions and the environment variables
799 This module holds filenames for various outputs (normal stats, events, plot
800 file), as well as coloring information for the plot file.
802 The filenames and flags variables are read from environment variables.
803 These are read once by the constructor of the global variable
804 __kmp_stats_output which calls init().
806 During this init() call, event flags for the timeStat::timerInfo[] global
807 array are cleared if KMP_STATS_EVENTS is not true (on, 1, yes).
809 The only interface function that is public is outputStats(heading). This
810 function should print out everything it needs to, either to files or stderr,
811 depending on the environment variables described below
813 ENVIRONMENT VARIABLES:
814 KMP_STATS_FILE -- if set, all statistics (not events) will be printed to this
815 file, otherwise, print to stderr
816 KMP_STATS_THREADS -- if set to "on", then will print per thread statistics to
817 either KMP_STATS_FILE or stderr
818 KMP_STATS_PLOT_FILE -- if set, print the ploticus plot file to this filename,
819 otherwise, the plot file is sent to "events.plt"
820 KMP_STATS_EVENTS -- if set to "on", then log events, otherwise, don't log
822 KMP_STATS_EVENTS_FILE -- if set, all events are outputted to this file,
823 otherwise, output is sent to "events.dat"
824 **************************************************************** */
825 class kmp_stats_output_module
{
835 std::string outputFileName
;
836 static const char *eventsFileName
;
837 static const char *plotFileName
;
838 static int printPerThreadFlag
;
839 static int printPerThreadEventsFlag
;
840 static const rgb_color globalColorArray
[];
841 static rgb_color timerColorInfo
[];
844 static void setupEventColors();
845 static void printPloticusFile();
846 static void printHeaderInfo(FILE *statsOut
);
847 static void printTimerStats(FILE *statsOut
, statistic
const *theStats
,
848 statistic
const *totalStats
);
849 static void printCounterStats(FILE *statsOut
, statistic
const *theStats
);
850 static void printCounters(FILE *statsOut
, counter
const *theCounters
);
851 static void printEvents(FILE *eventsOut
, kmp_stats_event_vector
*theEvents
,
853 static rgb_color
getEventColor(timer_e e
) { return timerColorInfo
[e
]; }
854 static void windupExplicitTimers();
855 bool eventPrintingEnabled() const { return printPerThreadEventsFlag
; }
858 kmp_stats_output_module() { init(); }
859 void outputStats(const char *heading
);
865 void __kmp_stats_init();
866 void __kmp_stats_fini();
867 void __kmp_reset_stats();
868 void __kmp_output_stats(const char *);
869 void __kmp_accumulate_stats_at_exit(void);
870 // thread local pointer to stats node within list
871 extern KMP_THREAD_LOCAL kmp_stats_list
*__kmp_stats_thread_ptr
;
872 // head to stats list.
873 extern kmp_stats_list
*__kmp_stats_list
;
874 // lock for __kmp_stats_list
875 extern kmp_tas_lock_t __kmp_stats_lock
;
876 // reference start time
877 extern tsc_tick_count __kmp_stats_start_time
;
878 // interface to output
879 extern kmp_stats_output_module __kmp_stats_output
;
885 // Simple, standard interfaces that drop out completely if stats aren't enabled
888 * \brief Adds value to specified timer (name).
890 * @param name timer name as specified under the KMP_FOREACH_TIMER() macro
891 * @param value double precision sample value to add to statistics for the timer
893 * \details Use KMP_COUNT_VALUE(name, value) macro to add a particular value to
894 * a timer statistics.
896 * @ingroup STATS_GATHERING
// Adds `value` (converted to double) as a sample to the calling thread's
// timer TIMER_<name>. `value` is parenthesized so that the cast applies to the
// whole argument expression rather than only to its first operand.
#define KMP_COUNT_VALUE(name, value)                                           \
  __kmp_stats_thread_ptr->getTimer(TIMER_##name)->addSample((double)(value))
902 * \brief Increments specified counter (name).
904 * @param name counter name as specified under the KMP_FOREACH_COUNTER() macro
906 * \details Use KMP_COUNT_BLOCK(name) macro to increment a statistics
907 * counter for the executing thread.
909 * @ingroup STATS_GATHERING
911 #define KMP_COUNT_BLOCK(name) \
912 __kmp_stats_thread_ptr->getCounter(COUNTER_##name)->increment()
915 * \brief Outputs the current thread statistics and resets them.
917 * @param heading_string heading put above the final stats output
919 * \details Explicitly stops all timers and outputs all stats. Environment
920 * variable, `KMP_STATS_FILE=filename`, can be used to output the stats to a
921 * filename instead of stderr. Environment variable,
922 * `KMP_STATS_THREADS=true|undefined`, can be used to output thread specific
923 * stats. For now the `KMP_STATS_THREADS` environment variable can either be
924 * defined with any value, which will print out thread specific stats, or it can
925 * be undefined (not specified in the environment) and thread specific stats
926 * won't be printed. It should be noted that all statistics are reset when this
929 * @ingroup STATS_GATHERING
931 #define KMP_OUTPUT_STATS(heading_string) __kmp_output_stats(heading_string)
934 * \brief Initializes the partitioned timers to begin with name.
936 * @param name timer which you want this thread to begin with
938 * @ingroup STATS_GATHERING
940 #define KMP_INIT_PARTITIONED_TIMERS(name) \
941 __kmp_stats_thread_ptr->getPartitionedTimers()->init(explicitTimer( \
942 __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
944 #define KMP_TIME_PARTITIONED_BLOCK(name) \
945 blockPartitionedTimer __PBLOCKTIME__( \
946 __kmp_stats_thread_ptr->getPartitionedTimers(), \
947 explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##name), \
950 #define KMP_PUSH_PARTITIONED_TIMER(name) \
951 __kmp_stats_thread_ptr->getPartitionedTimers()->push(explicitTimer( \
952 __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
954 #define KMP_POP_PARTITIONED_TIMER() \
955 __kmp_stats_thread_ptr->getPartitionedTimers()->pop()
957 #define KMP_EXCHANGE_PARTITIONED_TIMER(name) \
958 __kmp_stats_thread_ptr->getPartitionedTimers()->exchange(explicitTimer( \
959 __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
961 #define KMP_SET_THREAD_STATE(state_name) \
962 __kmp_stats_thread_ptr->setState(state_name)
964 #define KMP_GET_THREAD_STATE() __kmp_stats_thread_ptr->getState()
966 #define KMP_SET_THREAD_STATE_BLOCK(state_name) \
967 blockThreadState __BTHREADSTATE__(__kmp_stats_thread_ptr->getStatePointer(), \
971 * \brief resets all stats (counters to 0, timers to 0 elapsed ticks)
973 * \details Reset all stats for all threads.
975 * @ingroup STATS_GATHERING
977 #define KMP_RESET_STATS() __kmp_reset_stats()
979 #if (KMP_DEVELOPER_STATS)
// Developer statistics enabled: each developer macro forwards to the
// corresponding general-purpose stats macro.
#define KMP_COUNT_DEVELOPER_VALUE(n, v) KMP_COUNT_VALUE(n, v)
#define KMP_COUNT_DEVELOPER_BLOCK(n) KMP_COUNT_BLOCK(n)
#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) KMP_TIME_PARTITIONED_BLOCK(n)
#define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) KMP_PUSH_PARTITIONED_TIMER(n)
// KMP_POP_PARTITIONED_TIMER() takes no arguments, so `n` is accepted here for
// symmetry with the other developer macros but is not forwarded; forwarding
// a non-empty `n` would be a preprocessing error.
#define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) KMP_POP_PARTITIONED_TIMER()
#define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n)                            \
  KMP_EXCHANGE_PARTITIONED_TIMER(n)
989 #define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
990 #define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
991 #define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
992 #define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
993 #define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
994 #define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
997 #else // KMP_STATS_ENABLED
// Null definitions: with statistics disabled, every stats macro expands to a
// no-op expression so that call sites compile unchanged.
#define KMP_COUNT_VALUE(n, v) ((void)0)
#define KMP_COUNT_BLOCK(n) ((void)0)
#define KMP_OUTPUT_STATS(heading_string) ((void)0)
#define KMP_RESET_STATS() ((void)0)
#define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
#define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
#define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
#define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
#define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
#define KMP_INIT_PARTITIONED_TIMERS(name) ((void)0)
#define KMP_TIME_PARTITIONED_BLOCK(name) ((void)0)
#define KMP_PUSH_PARTITIONED_TIMER(name) ((void)0)
#define KMP_POP_PARTITIONED_TIMER() ((void)0)
// The stats-enabled branch defines KMP_EXCHANGE_PARTITIONED_TIMER, so the
// disabled branch needs a matching stub.
#define KMP_EXCHANGE_PARTITIONED_TIMER(name) ((void)0)
#define KMP_SET_THREAD_STATE(state_name) ((void)0)
#define KMP_GET_THREAD_STATE() ((void)0)
#define KMP_SET_THREAD_STATE_BLOCK(state_name) ((void)0)
1019 #endif // KMP_STATS_ENABLED
1021 #endif // KMP_STATS_H