/*
 * kmp_affinity.h -- header for affinity management
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_AFFINITY_H
#define KMP_AFFINITY_H

#include "kmp.h"
#include "kmp_os.h"

#if KMP_AFFINITY_SUPPORTED
#if KMP_USE_HWLOC
class KMPHwlocAffinity : public KMPAffinity {
public:
  class Mask : public KMPAffinity::Mask {
    hwloc_cpuset_t mask;

  public:
    Mask() {
      mask = hwloc_bitmap_alloc();
      this->zero();
    }
    ~Mask() { hwloc_bitmap_free(mask); }
    void set(int i) override { hwloc_bitmap_set(mask, i); }
    bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
    void clear(int i) override { hwloc_bitmap_clr(mask, i); }
    void zero() override { hwloc_bitmap_zero(mask); }
    bool empty() const override { return hwloc_bitmap_iszero(mask); }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      hwloc_bitmap_copy(mask, convert->mask);
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_and(mask, mask, convert->mask);
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_or(mask, mask, convert->mask);
    }
    void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      return hwloc_bitmap_isequal(mask, convert->mask);
    }
    int begin() const override { return hwloc_bitmap_first(mask); }
    int end() const override { return -1; }
    int next(int previous) const override {
      return hwloc_bitmap_next(mask, previous);
    }
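    // Iteration convention (illustrative): end() returns -1 because
    // hwloc_bitmap_first()/hwloc_bitmap_next() return -1 once no set bit
    // remains, so a typical traversal is:
    //   for (int i = m->begin(); i != m->end(); i = m->next(i)) { ... }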
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
      long retval =
          hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "hwloc_get_cpubind()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
      long retval =
          hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_process_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set process affinity operation when not capable");
      int error = 0;
      const hwloc_topology_support *support =
          hwloc_topology_get_support(__kmp_hwloc_topology);
      if (support->cpubind->set_proc_cpubind) {
        int retval;
        retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
                                   HWLOC_CPUBIND_PROCESS);
        if (retval >= 0)
          return 0;
        error = errno;
        if (abort_on_error)
          __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
                      KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int get_proc_group() const override {
      int group = -1;
#if KMP_OS_WINDOWS
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        // On windows, the long type is always 32 bits
        unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
        unsigned long second_32_bits =
            hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
        if (first_32_bits == 0 && second_32_bits == 0) {
          continue;
        }
        if (group >= 0) {
          return -1;
        }
        group = i;
      }
#endif /* KMP_OS_WINDOWS */
      return group;
    }
  };
  void determine_capable(const char *var) override {
    const hwloc_topology_support *topology_support;
    if (__kmp_hwloc_topology == NULL) {
      if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
        }
      }
      if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
        }
      }
    }
    topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
    // Is the system capable of setting/getting this thread's affinity?
    // Also, is topology discovery possible? (pu indicates ability to discover
    // processing units). And finally, were there no errors when calling any
    // hwloc_* API functions?
    if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
        topology_support->cpubind->get_thisthread_cpubind &&
        topology_support->discovery->pu && !__kmp_hwloc_error) {
      // enables affinity according to KMP_AFFINITY_CAPABLE() macro
      KMP_AFFINITY_ENABLE(TRUE);
    } else {
      // indicate that hwloc didn't work and disable affinity
      __kmp_hwloc_error = TRUE;
      KMP_AFFINITY_DISABLE();
    }
  }
  void bind_thread(int which) override {
    KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                "Illegal set affinity operation when not capable");
    KMPAffinity::Mask *mask;
    KMP_CPU_ALLOC_ON_STACK(mask);
    KMP_CPU_ZERO(mask);
    KMP_CPU_SET(which, mask);
    __kmp_set_system_affinity(mask, TRUE);
    KMP_CPU_FREE_FROM_STACK(mask);
  }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    delete[] hwloc_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    return &(hwloc_array[index]);
  }
  api_type get_api_type() const override { return HWLOC; }
};
#endif /* KMP_USE_HWLOC */
#if KMP_OS_LINUX || KMP_OS_FREEBSD
/* On some of the older OS's that we build on, these constants aren't present
   in <asm/unistd.h> #included from <sys/syscall.h>. They must be the same on
   all systems of the same arch where they are defined, and they cannot
   change. */
#include <sys/syscall.h>
#if KMP_ARCH_X86 || KMP_ARCH_ARM
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 241
#elif __NR_sched_setaffinity != 241
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 242
#elif __NR_sched_getaffinity != 242
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_AARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_X86_64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_PPC64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 222
#elif __NR_sched_setaffinity != 222
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 223
#elif __NR_sched_getaffinity != 223
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 4239
#elif __NR_sched_setaffinity != 4239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 4240
#elif __NR_sched_getaffinity != 4240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 5195
#elif __NR_sched_setaffinity != 5195
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 5196
#elif __NR_sched_getaffinity != 5196
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_LOONGARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_RISCV64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_VE
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#else
#error Unknown or unsupported architecture
#endif /* KMP_ARCH_* */
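// These constants are passed straight to syscall(2) by the Linux Mask methods
// below, e.g. (illustrative):
//   syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);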
#if KMP_OS_FREEBSD
#include <pthread.h>
#include <pthread_np.h>
#endif
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef unsigned long mask_t;
    typedef decltype(__kmp_affin_mask_size) mask_size_type;
    static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    static const mask_t ONE = 1;
    mask_size_type get_num_mask_types() const {
      return __kmp_affin_mask_size / sizeof(mask_t);
    }

  public:
    mask_t *mask;
    Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
    }
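    // Bit indexing example (illustrative, assuming 64-bit mask_t): CPU 70
    // lives in mask[70 / 64] == mask[1], at bit 70 % 64 == 6, so set(70)
    // ORs in (ONE << 6) on the second word.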
    void zero() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = (mask_t)0;
    }
    bool empty() const override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        if (mask[i] != (mask_t)0)
          return false;
      return true;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = ~(mask[i]);
    }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        if (mask[i] != convert->mask[i])
          return false;
      return true;
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override {
      int e;
      __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
      return e;
    }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD
      int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "pthread_getaffinity_np()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD
      int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "pthread_setaffinity_np()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override {
    KMPNativeAffinity::Mask *retval = new Mask();
    return retval;
  }
  void deallocate_mask(KMPAffinity::Mask *m) override {
    KMPNativeAffinity::Mask *native_mask =
        static_cast<KMPNativeAffinity::Mask *>(m);
    delete native_mask;
  }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *linux_array = static_cast<Mask *>(array);
    delete[] linux_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *linux_array = static_cast<Mask *>(array);
    return &(linux_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD */
#if KMP_OS_WINDOWS
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef ULONG_PTR mask_t;
    static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    mask_t *mask;

  public:
    Mask() {
      mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
    }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    void zero() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = 0;
    }
    bool empty() const override {
      for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
        if (mask[i])
          return false;
      return true;
    }
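    // Mapping sketch (illustrative): Windows exposes processors in groups of
    // up to BITS_PER_MASK_T (64 on 64-bit Windows, since mask_t is ULONG_PTR),
    // so global processor i lands in group i / BITS_PER_MASK_T as bit
    // i % BITS_PER_MASK_T of mask[group].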
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = ~(mask[i]);
    }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
        if (mask[i] != convert->mask[i])
          return false;
      return true;
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int set_process_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups <= 1) {
        if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int set_system_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups > 1) {
        // Check for a valid mask.
        GROUP_AFFINITY ga;
        int group = get_proc_group();
        if (group < 0) {
          if (abort_on_error) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
          }
          return -1;
        }
        // Transform the bit vector into a GROUP_AFFINITY struct
        // and make the system call to set affinity.
        ga.Group = group;
        ga.Mask = mask[group];
        ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;

        KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
        if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      } else {
        if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int get_system_affinity(bool abort_on_error) override {
      if (__kmp_num_proc_groups > 1) {
        this->zero();
        GROUP_AFFINITY ga;
        KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
        if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
            (ga.Mask == 0)) {
          return -1;
        }
        mask[ga.Group] = ga.Mask;
      } else {
        mask_t newMask, sysMask, retval;
        if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
        if (!retval) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
        if (!newMask) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
        }
        *mask = retval;
      }
      return 0;
    }
    int get_proc_group() const override {
      int group = -1;
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        if (mask[i] == 0)
          continue;
        if (group >= 0)
          return -1;
        group = i;
      }
      return group;
    }
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *windows_array = static_cast<Mask *>(array);
    delete[] windows_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *windows_array = static_cast<Mask *>(array);
    return &(windows_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_WINDOWS */
#endif /* KMP_AFFINITY_SUPPORTED */
// Describe an attribute for a level in the machine topology
struct kmp_hw_attr_t {
  int core_type : 8;
  int core_eff : 8;
  unsigned valid : 1;
  unsigned reserved : 15;

  static const int UNKNOWN_CORE_EFF = -1;

  kmp_hw_attr_t()
      : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF),
        valid(0), reserved(0) {}
  void set_core_type(kmp_hw_core_type_t type) {
    valid = 1;
    core_type = type;
  }
  void set_core_eff(int eff) {
    valid = 1;
    core_eff = eff;
  }
  kmp_hw_core_type_t get_core_type() const {
    return (kmp_hw_core_type_t)core_type;
  }
  int get_core_eff() const { return core_eff; }
  bool is_core_type_valid() const {
    return core_type != KMP_HW_CORE_TYPE_UNKNOWN;
  }
  bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; }
  operator bool() const { return valid; }
  void clear() {
    core_type = KMP_HW_CORE_TYPE_UNKNOWN;
    core_eff = UNKNOWN_CORE_EFF;
    valid = 0;
  }
  bool contains(const kmp_hw_attr_t &other) const {
    if (!valid && !other.valid)
      return true;
    if (valid && other.valid) {
      if (other.is_core_type_valid()) {
        if (!is_core_type_valid() || (get_core_type() != other.get_core_type()))
          return false;
      }
      if (other.is_core_eff_valid()) {
        if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff()))
          return false;
      }
      return true;
    }
    return false;
  }
#if KMP_AFFINITY_SUPPORTED
  bool contains(const kmp_affinity_attrs_t &attr) const {
    if (!valid && !attr.valid)
      return true;
    if (valid && attr.valid) {
      if (attr.core_type != KMP_HW_CORE_TYPE_UNKNOWN)
        return (is_core_type_valid() &&
                (get_core_type() == (kmp_hw_core_type_t)attr.core_type));
      if (attr.core_eff != UNKNOWN_CORE_EFF)
        return (is_core_eff_valid() && (get_core_eff() == attr.core_eff));
      return false;
    }
    return false;
  }
#endif // KMP_AFFINITY_SUPPORTED
  bool operator==(const kmp_hw_attr_t &rhs) const {
    return (rhs.valid == valid && rhs.core_eff == core_eff &&
            rhs.core_type == core_type);
  }
  bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); }
};
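// contains() example (illustrative): an attr whose only requirement is
// core_type == KMP_HW_CORE_TYPE_CORE is contained by an attr that has that
// core_type (with any core_eff); if the requirement also includes
// core_eff == 1, the containing attr must carry core_eff == 1 as well.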
#if KMP_AFFINITY_SUPPORTED
KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t));
#endif
class kmp_hw_thread_t {
public:
  static const int UNKNOWN_ID = -1;
  static const int MULTIPLE_ID = -2;
  static int compare_ids(const void *a, const void *b);
  static int compare_compact(const void *a, const void *b);
  int ids[KMP_HW_LAST];
  int sub_ids[KMP_HW_LAST];
  bool leader;
  int os_id;
  kmp_hw_attr_t attrs;

  void print() const;
  void clear() {
    for (int i = 0; i < (int)KMP_HW_LAST; ++i)
      ids[i] = UNKNOWN_ID;
    leader = false;
    attrs.clear();
  }
};
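// Example (illustrative): with a depth-3 topology ordered
// [socket, core, thread], one hardware thread may have ids = {1, 3, 0},
// meaning socket 1, core 3 of that socket, thread 0 of that core.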
class kmp_topology_t {

  struct flags_t {
    int uniform : 1;
    int reserved : 31;
  };

  int depth;

  // The following arrays are all 'depth' long and have been
  // allocated to hold up to KMP_HW_LAST number of objects if
  // needed so layers can be added without reallocation of any array

  // Ordered array of the types in the topology
  kmp_hw_t *types;

  // Keep quick topology ratios, for non-uniform topologies,
  // this ratio holds the max number of itemAs per itemB
  // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
  int *ratio;

  // Storage containing the absolute number of each topology layer
  int *count;

  // The number of core efficiencies. This is only useful for hybrid
  // topologies. Core efficiencies will range from 0 to num efficiencies - 1
  int num_core_efficiencies;
  int num_core_types;
  kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];

  // The hardware threads array
  // hw_threads is num_hw_threads long
  // Each hw_thread's ids and sub_ids are depth deep
  int num_hw_threads;
  kmp_hw_thread_t *hw_threads;

  // Equivalence hash where the key is the hardware topology item
  // and the value is the equivalent hardware topology type in the
  // types[] array, if the value is KMP_HW_UNKNOWN, then there is no
  // known equivalence for the topology type
  kmp_hw_t equivalent[KMP_HW_LAST];

  // Flags describing the topology
  flags_t flags;

  // Compact value used during sort_compact()
  int compact;

  // Insert a new topology layer after allocation
  void _insert_layer(kmp_hw_t type, const int *ids);

#if KMP_GROUP_AFFINITY
  // Insert topology information about Windows Processor groups
  void _insert_windows_proc_groups();
#endif

  // Count each item & get the num x's per y
  // e.g., get the number of cores and the number of threads per core
  // for each (x, y) in (KMP_HW_* , KMP_HW_*)
  void _gather_enumeration_information();

  // Remove layers that don't add information to the topology.
  // This is done by having the layer take on the id = UNKNOWN_ID (-1)
  void _remove_radix1_layers();

  // Find out if the topology is uniform
  void _discover_uniformity();

  // Set all the sub_ids for each hardware thread
  void _set_sub_ids();

  // Set global affinity variables describing the number of threads per
  // core, the number of packages, the number of cores per package, and
  // the number of cores.
  void _set_globals();

  // Set the last level cache equivalent type
  void _set_last_level_cache();

  // Return the number of cores with a particular attribute, 'attr'.
  // If 'find_all' is true, then find all cores on the machine, otherwise find
  // all cores per the layer 'above'
  int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above,
                            bool find_all = false) const;
public:
  // Force use of allocate()/deallocate()
  kmp_topology_t() = delete;
  kmp_topology_t(const kmp_topology_t &t) = delete;
  kmp_topology_t(kmp_topology_t &&t) = delete;
  kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
  kmp_topology_t &operator=(kmp_topology_t &&t) = delete;

  static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
  static void deallocate(kmp_topology_t *);

  // Functions used in create_map() routines
  kmp_hw_thread_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  const kmp_hw_thread_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  int get_num_hw_threads() const { return num_hw_threads; }
  void sort_ids() {
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_ids);
  }
  // Check if the hardware ids are unique, if they are
  // return true, otherwise return false
  bool check_ids() const;
  // Function to call after the create_map() routine
  void canonicalize();
  void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);

  // Functions used after canonicalize() called
#if KMP_AFFINITY_SUPPORTED
  // Set the granularity for affinity settings
  void set_granularity(kmp_affinity_t &stgs) const;
  bool is_close(int hwt1, int hwt2, const kmp_affinity_t &stgs) const;
  bool restrict_to_mask(const kmp_affin_mask_t *mask);
  bool filter_hw_subset();
#endif
  bool is_uniform() const { return flags.uniform; }
  // Tell whether a type is a valid type in the topology
  // returns KMP_HW_UNKNOWN when there is no equivalent type
  kmp_hw_t get_equivalent_type(kmp_hw_t type) const {
    if (type == KMP_HW_UNKNOWN)
      return KMP_HW_UNKNOWN;
    return equivalent[type];
  }
  // Set type1 = type2
  void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
    kmp_hw_t real_type2 = equivalent[type2];
    if (real_type2 == KMP_HW_UNKNOWN)
      real_type2 = type2;
    equivalent[type1] = real_type2;
    // This loop is required since any of the types may have been set to
    // be equivalent to type1. They all must be checked and reset to type2.
    KMP_FOREACH_HW_TYPE(type) {
      if (equivalent[type] == type1) {
        equivalent[type] = real_type2;
      }
    }
  }
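  // Example (illustrative): _set_last_level_cache() may record
  // set_equivalent_type(KMP_HW_LLC, KMP_HW_L3); afterwards queries for
  // KMP_HW_LLC (get_level(), get_count(), ...) resolve to the L3 layer.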
  // Calculate number of types corresponding to level1
  // per types corresponding to level2 (e.g., number of threads per core)
  int calculate_ratio(int level1, int level2) const {
    KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
    KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
    int r = 1;
    for (int level = level1; level > level2; --level)
      r *= ratio[level];
    return r;
  }
  int get_ratio(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return ratio[level];
  }
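  // Worked example (illustrative): for [ 4 packages | 6 cores/package |
  // 2 threads/core ], ratio = {4, 6, 2}; calculate_ratio(thread level,
  // package level) multiplies 2 * 6 = 12 hardware threads per package.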
  int get_depth() const { return depth; }
  kmp_hw_t get_type(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return types[level];
  }
  int get_level(kmp_hw_t type) const {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
    int eq_type = equivalent[type];
    if (eq_type == KMP_HW_UNKNOWN)
      return -1;
    for (int i = 0; i < depth; ++i)
      if (types[i] == eq_type)
        return i;
    return -1;
  }
  int get_count(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return count[level];
  }
  // Return the total number of cores with attribute 'attr'
  int get_ncores_with_attr(const kmp_hw_attr_t &attr) const {
    return _get_ncores_with_attr(attr, -1, true);
  }
  // Return the number of cores with attribute
  // 'attr' per topology level 'above'
  int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const {
    return _get_ncores_with_attr(attr, above, false);
  }
#if KMP_AFFINITY_SUPPORTED
  friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b);
  void sort_compact(kmp_affinity_t &affinity) {
    compact = affinity.compact;
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_compact);
  }
#endif
  void print(const char *env_var = "KMP_AFFINITY") const;
  void dump() const;
};
extern kmp_topology_t *__kmp_topology;
class kmp_hw_subset_t {
  const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS;

public:
  // Describe a machine topology item in KMP_HW_SUBSET
  struct item_t {
    kmp_hw_t type;
    int num_attrs;
    int num[MAX_ATTRS];
    int offset[MAX_ATTRS];
    kmp_hw_attr_t attr[MAX_ATTRS];
  };
  // Put parenthesis around max to avoid accidental use of Windows max macro.
  const static int USE_ALL = (std::numeric_limits<int>::max)();

private:
  int depth;
  int capacity;
  item_t *items;
  kmp_uint64 set;
  bool absolute;
  // The set must be able to handle up to KMP_HW_LAST number of layers
  KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
  // Sorting the KMP_HW_SUBSET items to follow topology order
  // All unknown topology types will be at the beginning of the subset
  static int hw_subset_compare(const void *i1, const void *i2) {
    kmp_hw_t type1 = ((const item_t *)i1)->type;
    kmp_hw_t type2 = ((const item_t *)i2)->type;
    int level1 = __kmp_topology->get_level(type1);
    int level2 = __kmp_topology->get_level(type2);
    return level1 - level2;
  }
public:
  // Force use of allocate()/deallocate()
  kmp_hw_subset_t() = delete;
  kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
  kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;

  static kmp_hw_subset_t *allocate() {
    int initial_capacity = 5;
    kmp_hw_subset_t *retval =
        (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
    retval->depth = 0;
    retval->capacity = initial_capacity;
    retval->set = 0ull;
    retval->absolute = false;
    retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
    return retval;
  }
  static void deallocate(kmp_hw_subset_t *subset) {
    __kmp_free(subset->items);
    __kmp_free(subset);
  }
  void set_absolute() { absolute = true; }
  bool is_absolute() const { return absolute; }
  void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) {
    for (int i = 0; i < depth; ++i) {
      // Found an existing item for this layer type
      // Add the num, offset, and attr to this item
      if (items[i].type == type) {
        int idx = items[i].num_attrs++;
        if ((size_t)idx >= MAX_ATTRS)
          return;
        items[i].num[idx] = num;
        items[i].offset[idx] = offset;
        items[i].attr[idx] = attr;
        return;
      }
    }
    if (depth == capacity - 1) {
      capacity *= 2;
      item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
      for (int i = 0; i < depth; ++i)
        new_items[i] = items[i];
      __kmp_free(items);
      items = new_items;
    }
    items[depth].num_attrs = 1;
    items[depth].type = type;
    items[depth].num[0] = num;
    items[depth].offset[0] = offset;
    items[depth].attr[0] = attr;
    depth++;
    set |= (1ull << type);
  }
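  // Usage sketch (illustrative): the KMP_HW_SUBSET parser builds the subset
  // with calls such as push_back(2, KMP_HW_SOCKET, 0, attr) followed by
  // push_back(4, KMP_HW_CORE, 0, attr) for a value like "2s,4c".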
  int get_depth() const { return depth; }
  const item_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  item_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  void remove(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    set &= ~(1ull << items[index].type);
    for (int j = index + 1; j < depth; ++j) {
      items[j - 1] = items[j];
    }
    depth--;
  }
  void sort() {
    KMP_DEBUG_ASSERT(__kmp_topology);
    qsort(items, depth, sizeof(item_t), hw_subset_compare);
  }
  bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
  void dump() const {
    printf("**********************\n");
    printf("*** kmp_hw_subset: ***\n");
    printf("* depth: %d\n", depth);
    printf("* items:\n");
    for (int i = 0; i < depth; ++i) {
      printf("  type: %s\n", __kmp_hw_get_keyword(items[i].type));
      for (int j = 0; j < items[i].num_attrs; ++j) {
        printf("    num: %d, offset: %d, attr: ", items[i].num[j],
               items[i].offset[j]);
        if (!items[i].attr[j]) {
          printf(" (none)\n");
        } else {
          printf(
              " core_type = %s, core_eff = %d\n",
              __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()),
              items[i].attr[j].get_core_eff());
        }
      }
    }
    printf("* set: 0x%llx\n", set);
    printf("* absolute: %d\n", absolute);
    printf("**********************\n");
  }
};
extern kmp_hw_subset_t *__kmp_hw_subset;
/* A structure for holding machine-specific hierarchy info to be computed once
   at init. This structure represents a mapping of threads to the actual
   machine hierarchy, or to our best guess at what the hierarchy might be, for
   the purpose of performing an efficient barrier. In the worst case, when
   there is no machine hierarchy information, it produces a tree suitable for
   a barrier, similar to the tree used in the hyper barrier. */
class hierarchy_info {
public:
  /* Good default values for number of leaves and branching factor, given no
     affinity information. Behaves a bit like hyper barrier. */
  static const kmp_uint32 maxLeaves = 4;
  static const kmp_uint32 minBranch = 4;
  /** Number of levels in the hierarchy. Typical levels are threads/core,
      cores/package or socket, packages/node, nodes/machine, etc. We don't want
      to get specific with nomenclature. When the machine is oversubscribed we
      add levels to duplicate the hierarchy, doubling the thread capacity of
      the hierarchy each time we add a level. */
  kmp_uint32 maxLevels;

  /** This is specifically the depth of the machine configuration hierarchy, in
      terms of the number of levels along the longest path from root to any
      leaf. It corresponds to the number of entries in numPerLevel if we
      exclude all but one trailing 1. */
  kmp_uint32 depth;
  kmp_uint32 base_num_threads;
  enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
  volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
  // 2=initialization in progress
  volatile kmp_int8 resizing; // 0=not resizing, 1=resizing

  /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children
      the parent of a node at level i has. For example, if we have a machine
      with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel =
      {2, 4, 4, 1, 1}. All empty levels are set to 1. */
  kmp_uint32 *numPerLevel;
  kmp_uint32 *skipPerLevel;
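  // Worked example (illustrative): for the 4 packages x 4 cores x 2 HT
  // machine above, init() computes skipPerLevel[i] as the running product of
  // numPerLevel, so skipPerLevel starts {1, 2, 8, 32, ...}: one node at
  // level i covers skipPerLevel[i] leaves (threads).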
  void deriveLevels() {
    int hier_depth = __kmp_topology->get_depth();
    for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
      numPerLevel[level] = __kmp_topology->get_ratio(i);
    }
  }

  hierarchy_info()
      : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}

  void fini() {
    if (!uninitialized && numPerLevel) {
      __kmp_free(numPerLevel);
      numPerLevel = NULL;
      uninitialized = not_initialized;
    }
  }
  void init(int num_addrs) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
        &uninitialized, not_initialized, initializing);
    if (bool_result == 0) { // Wait for initialization
      while (TCR_1(uninitialized) != initialized)
        KMP_CPU_PAUSE();
      return;
    }
    KMP_DEBUG_ASSERT(bool_result == 1);

    /* Added explicit initialization of the data fields here to prevent usage of
       dirty value observed when static library is re-initialized multiple times
       (e.g. when non-OpenMP thread repeatedly launches/joins thread that uses
       OpenMP). */
    depth = 1;
    resizing = 0;
    maxLevels = 7;
    numPerLevel =
        (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
    skipPerLevel = &(numPerLevel[maxLevels]);
    for (kmp_uint32 i = 0; i < maxLevels;
         ++i) { // init numPerLevel[*] to 1 item per level
      numPerLevel[i] = 1;
      skipPerLevel[i] = 1;
    }

    // Sort table by physical ID
    if (__kmp_topology && __kmp_topology->get_depth() > 0) {
      deriveLevels();
    } else {
      numPerLevel[0] = maxLeaves;
      numPerLevel[1] = num_addrs / maxLeaves;
      if (num_addrs % maxLeaves)
        numPerLevel[1]++;
    }

    base_num_threads = num_addrs;
    for (int i = maxLevels - 1; i >= 0;
         --i) // count non-empty levels to get depth
      if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
        depth++;

    kmp_uint32 branch = minBranch;
    if (numPerLevel[0] == 1)
      branch = num_addrs / maxLeaves;
    if (branch < minBranch)
      branch = minBranch;
    for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
      while (numPerLevel[d] > branch ||
             (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
        if (numPerLevel[d] & 1)
          numPerLevel[d]++;
        numPerLevel[d] = numPerLevel[d] >> 1;
        if (numPerLevel[d + 1] == 1)
          depth++;
        numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
      }
      if (numPerLevel[0] == 1) {
        branch = branch >> 1;
        if (branch < 4)
          branch = minBranch;
      }
    }

    for (kmp_uint32 i = 1; i < depth; ++i)
      skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
    // Fill in hierarchy in the case of oversubscription
    for (kmp_uint32 i = depth; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    uninitialized = initialized; // One writer
  }
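  // Oversubscription sketch (illustrative): if the machine levels cover 32
  // threads, the padding loop above doubles the span of each extra level
  // (64, 128, ...), so a heavily oversubscribed team still maps onto the tree.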
  // Resize the hierarchy if nproc changes to something larger than before
  void resize(kmp_uint32 nproc) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    while (bool_result == 0) { // someone else is trying to resize
      KMP_CPU_PAUSE();
      if (nproc <= base_num_threads) // happy with other thread's resize
        return;
      else // try to resize
        bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    }
    KMP_DEBUG_ASSERT(bool_result != 0);
    if (nproc <= base_num_threads)
      return; // happy with other thread's resize

    // Calculate new maxLevels
    kmp_uint32 old_sz = skipPerLevel[depth - 1];
    kmp_uint32 incs = 0, old_maxLevels = maxLevels;
    // First see if old maxLevels is enough to contain new size
    for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];
      numPerLevel[i - 1] *= 2;
      old_sz *= 2;
      depth++;
    }
    if (nproc > old_sz) { // Not enough space, need to expand hierarchy
      while (nproc > old_sz) {
        old_sz *= 2;
        incs++;
        depth++;
      }
      maxLevels += incs;

      // Resize arrays
      kmp_uint32 *old_numPerLevel = numPerLevel;
      kmp_uint32 *old_skipPerLevel = skipPerLevel;
      numPerLevel = skipPerLevel = NULL;
      numPerLevel =
          (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
      skipPerLevel = &(numPerLevel[maxLevels]);

      // Copy old elements from old arrays
      for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
        // init numPerLevel[*] to 1 item per level
        numPerLevel[i] = old_numPerLevel[i];
        skipPerLevel[i] = old_skipPerLevel[i];
      }

      // Init new elements in arrays to 1
      for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
        // init numPerLevel[*] to 1 item per level
        numPerLevel[i] = 1;
        skipPerLevel[i] = 1;
      }

      // Free old arrays
      __kmp_free(old_numPerLevel);
    }

    // Fill in oversubscription levels of hierarchy
    for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    base_num_threads = nproc;
    resizing = 0; // One writer
  }
};
#endif // KMP_AFFINITY_H