src/gromacs/hardware/cpuinfo.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012-2018, The GROMACS development team.
   5  * Copyright (c) 2019,2020, by the GROMACS development team, led by
   6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   7  * and including many others, as listed in the AUTHORS file in the
   8  * top-level source directory and at http://www.gromacs.org.
   9  *
  10  * GROMACS is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2.1
  13  * of the License, or (at your option) any later version.
  14  *
  15  * GROMACS is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with GROMACS; if not, see
  22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  24  *
  25  * If you want to redistribute modifications to GROMACS, please
  26  * consider that scientific software is very special. Version
  27  * control is crucial - bugs must be traceable. We will be happy to
  28  * consider code for inclusion in the official distribution, but
  29  * derived work must not be called official GROMACS. Details are found
  30  * in the README & COPYING files - if they are missing, get the
  31  * official version at http://www.gromacs.org.
  32  *
  33  * To help us fund GROMACS development, we humbly ask that you cite
  34  * the research papers on the package. Check out http://www.gromacs.org.
  35  */
  36
  37 /*! \internal \file
  38  * \brief
  39  * Implements gmx::CpuInfo.
  40  *
  41  * We need to be able to compile this file in stand-alone mode to use basic
  42  * CPU feature detection to set the SIMD acceleration and similar things in
  43  * CMake, while we still want to use more features that enable topology
  44  * detection when config.h is present.
  45  *
  46  * We solve this by skipping the advanced stuff when the preprocessor
  47  * macro GMX_CPUINFO_STANDALONE is defined. In this case you likely also need to
  48  * define GMX_X86_GCC_INLINE_ASM if you are on x86; without inline assembly
  49  * support it is not possible to perform the actual detection on Linux/Mac.
  50  * Although these macros are specific to this file, they still use the GMX prefix.
  51  *
  52  * The remaining defines (GMX_NATIVE_WINDOWS,HAVE_UNISTD_H,HAVE_SCHED_H,
  53  * HAVE_SYSCONF, HAVE_SCHED_AFFINITY) are only used to determine the topology on
  54  * x86, and for this we rely on including config.h.
  55  *
  56  * \author Erik Lindahl <erik.lindahl@gmail.com>
  57  * \ingroup module_hardware
  58  */
  59
  60 #ifndef GMX_CPUINFO_STANDALONE
  61 #    include "gmxpre.h"
  62 #endif
  63
  64 #include "cpuinfo.h"
  65
  66 #ifndef GMX_CPUINFO_STANDALONE
  67 #    include "config.h"
  68 #else
  69 #    define GMX_NATIVE_WINDOWS 0
  70 #endif
  71
  72 #if defined _MSC_VER
  73 #    include <intrin.h> // __cpuid()
  74 #endif
  75
  76 #if GMX_NATIVE_WINDOWS
  77 #    include <windows.h> // sysinfo(), necessary for topology stuff
  78 #endif
  79
  80 #ifdef HAVE_SCHED_H
  81 #    include <sched.h> // sched_getaffinity(), sched_setaffinity()
  82 #endif
  83 #ifdef HAVE_UNISTD_H
  84 #    include <unistd.h> // sysconf()
  85 #endif
  86
  87 #include <cctype>
  88 #include <cstdint> // uint32_t in X86 processor name code
  89 #include <cstdlib>
  90
  91 #include <algorithm>
  92 #include <fstream>
  93 #include <map>
  94 #include <set>
  95 #include <sstream>
  96 #include <string>
  97
  98 #ifdef GMX_CPUINFO_STANDALONE
  99 #    define gmx_unused
 100 #else
 101 #    include "gromacs/utility/basedefinitions.h"
 102 #endif
 103
 104 #include "architecture.h"
 105
 106 namespace gmx
 107 {
 108
 109 namespace
 110 {
 111
 112 /*! \cond internal */
 113
 114 /******************************************************************************
 115  *                                                                            *
 116  *   Utility functions to make this file independent of the GROMACS library   *
 117  *                                                                            *
 118  ******************************************************************************/
 119
 120 /*! \brief Remove initial and trailing whitespace from string
 121  *
 122  *  \param s  Pointer to string where whitespace will be removed
 123  */
 124 void trimString(std::string* s)
 125 {
 126     // heading
 127     s->erase(s->begin(),
 128              std::find_if(s->begin(), s->end(), [](char& c) -> bool { return std::isspace(c) == 0; }));
 129     // trailing
 130     s->erase(
 131             std::find_if(s->rbegin(), s->rend(), [](char& c) -> bool { return std::isspace(c) == 0; })
 132                     .base(),
 133             s->end());
 134 }
 135
 136
 137 /******************************************************************************
 138  *                                                                            *
 139  *                         x86 detection functions                            *
 140  *                                                                            *
 141  ******************************************************************************/
 142
 143 /*! \brief execute x86 cpuid instructions with custom level and extended level
 144  *
 145  *  \param level   The main cpuid level (input argument for eax register)
 146  *  \param ecxval  Extended level (input argument for ecx register)
 147  *  \param eax     Output in eax register
 148  *  \param ebx     Output in ebx register
 149  *  \param ecx     Output in ecx register
 150  *  \param edx     Output in edx register
 151  *
 152  *  \return 0 on success, or non-zero if the instruction could not execute.
 153  */
 154 int executeX86CpuID(unsigned int gmx_unused level,
 155                     unsigned int gmx_unused ecxval,
 156                     unsigned int*           eax,
 157                     unsigned int*           ebx,
 158                     unsigned int*           ecx,
 159                     unsigned int*           edx)
 160 {
 161     if (c_architecture == Architecture::X86)
 162     {
 163 #if defined __GNUC__ || GMX_X86_GCC_INLINE_ASM
 164
 165         // any compiler that understands gcc inline assembly
 166         *eax = level;
 167         *ecx = ecxval;
 168         *ebx = 0;
 169         *edx = 0;
 170
 171 #    if GMX_IS_X86_32 && defined(__PIC__)
 172         // Avoid clobbering the global offset table in 32-bit pic code (ebx register)
 173         __asm__ __volatile__(
 174                 "xchgl %%ebx, %1  \n\t"
 175                 "cpuid            \n\t"
 176                 "xchgl %%ebx, %1  \n\t"
 177                 : "+a"(*eax), "+r"(*ebx), "+c"(*ecx), "+d"(*edx));
 178 #    elif GMX_IS_X86_64
 179         // i386 without PIC, or x86-64. Things are easy and we can clobber any reg we want
 180         __asm__ __volatile__("cpuid            \n\t"
 181                              : "+a"(*eax), "+b"(*ebx), "+c"(*ecx), "+d"(*edx));
 182 #    else
 183         // Not a normal x86, which could happen when a compiler
 184         // targetting non-x86 pretends to be GCC.
 185 #    endif
 186         return 0;
 187
 188 #elif defined _MSC_VER
 189
 190         // MSVC (and icc on windows) on ia32 or x86-64
 191         int cpuInfo[4];
 192         __cpuidex(cpuInfo, level, ecxval);
 193         *eax = static_cast<unsigned int>(cpuInfo[0]);
 194         *ebx = static_cast<unsigned int>(cpuInfo[1]);
 195         *ecx = static_cast<unsigned int>(cpuInfo[2]);
 196         *edx = static_cast<unsigned int>(cpuInfo[3]);
 197         return 0;
 198
 199 #else
 200
 201         // We are on x86, but without compiler support for cpuid if we get here
 202         *eax = 0;
 203         *ebx = 0;
 204         *ecx = 0;
 205         *edx = 0;
 206         return 1;
 207
 208 #endif // check for inline asm on x86
 209     }
 210     else
 211     {
 212         // We are not on x86
 213         *eax = 0;
 214         *ebx = 0;
 215         *ecx = 0;
 216         *edx = 0;
 217         return 1;
 218     }
 219 }
 220
 221
 222 /*! \brief Detect x86 vendors by using the cpuid assembly instructions
 223  *
 224  *  If support for the cpuid instruction is present, we check for Intel,
 225  *  AMD or Hygon vendors
 226  *
 227  *  \return gmx::CpuInfo::Vendor::Intel, gmx::CpuInfo::Vendor::Amd,
 228  *          gmx::CpuInfl::Vendor::Hygon, . If neither Intel, Amd  nor
 229  *          Hygon can be identified, or if the code fails to execute,
 230  *          gmx::CpuInfo::Vendor::Unknown is returned.
 231  */
 232 CpuInfo::Vendor detectX86Vendor()
 233 {
 234     unsigned int    eax, ebx, ecx, edx;
 235     CpuInfo::Vendor v = CpuInfo::Vendor::Unknown;
 236
 237     if (executeX86CpuID(0x0, 0, &eax, &ebx, &ecx, &edx) == 0)
 238     {
 239         if (ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69)
 240         {
 241             v = CpuInfo::Vendor::Intel; // ebx=='uneG', ecx=='letn', edx=='Ieni'
 242         }
 243         else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
 244         {
 245             v = CpuInfo::Vendor::Amd; // ebx=='htuA', ecx=='DMAc', edx=='itne'
 246         }
 247         else if (ebx == 0x6f677948 && ecx == 0x656e6975 && edx == 0x6e65476e)
 248         {
 249             v = CpuInfo::Vendor::Hygon; // ebx=='ogyH', ecx=='eniu', edx=='neGn'
 250         }
 251     }
 252     return v;
 253 }
 254
 255 /*! \brief Detect second AVX-512 FMA from the processor name
 256  *
 257  * Should only be called for processors already determined to support AVX-512.
 258  *
 259  *  \param [in] brand     x86 processor name
 260  *  \param [in] model     x86 model
 261  *  \return               True if second FMA present
 262  */
 263 bool detectProcCpuInfoSecondAvx512FMA(const std::string& brand, int model)
 264 {
 265     // Skylake server
 266     if (model == 0x55)
 267     {
 268         // detect Xeon
 269         if (brand.find("Xeon") == 9)
 270         {
 271             // detect Silver or Bronze or specific models
 272             if (brand.find("Silver") == 17 || brand.find("Bronze") == 17
 273                 || (brand.find('W') == 17 && brand.find('0') == 21)   // detect Xeon W 210x
 274                 || (brand.find('D') == 17 && brand.find("21") == 19)) // detect Xeon D 2xxx
 275             {
 276                 return false;
 277             }
 278             // detect Gold 5xxx - can be corrected once Cooper Lake is added
 279             else if (brand.find("Gold") == 17 && brand.find('5') == 22)
 280             {
 281                 return (brand.find("53") == 22 || // detect Cooper Lake
 282                         brand.find("22") == 24);  // detect 5[12]22
 283             }
 284         }
 285         return true;
 286     }
 287     // Cannon Lake client
 288     if (model == 0x66)
 289     {
 290         return false;
 291     }
 292     // Ice Lake client
 293     if (model == 0x7d || model == 0x7e)
 294     {
 295         return false;
 296     }
 297     // This is the right default...
 298     return true;
 299 }
 300
 301 /*! \brief Simple utility function to set/clear feature in a set
 302  *
 303  *  \param featureSet    Pointer to the feature set to update
 304  *  \param feature       The specific feature to set/clear
 305  *  \param registerValue Register value (returned from cpuid)
 306  *  \param bit           Bit to check in registerValue. The feature will be
 307  *                       added to the featureSet if this bit is set.
 308  *
 309  *  \note Nothing is done if the bit is not set. In particular, this will not
 310  *        erase anything if the feature already exists in the set.
 311  */
 312 void setFeatureFromBit(std::set<CpuInfo::Feature>* featureSet,
 313                        CpuInfo::Feature            feature,
 314                        unsigned int                registerValue,
 315                        unsigned char               bit)
 316 {
 317     if (registerValue & (1 << bit))
 318     {
 319         featureSet->insert(feature);
 320     }
 321 }
 322
 323 /*! \brief Process x86 cpuinfo features that are common to Intel and AMD CPUs
 324  *
 325  *  \param[out] brand      String where to write the x86 brand string
 326  *  \param[out] family     Major version of processor
 327  *  \param[out] model      Middle version of processor
 328  *  \param[out] stepping   Minor version of processor
 329  *  \param[out] features   Feature set where supported features are inserted
 330  */
 331 void detectX86Features(std::string* brand, int* family, int* model, int* stepping, std::set<CpuInfo::Feature>* features)
 332 {
 333     unsigned int eax, ebx, ecx, edx;
 334
 335     // Return if we cannot execute any levels
 336     if (executeX86CpuID(0x0, 0, &eax, &ebx, &ecx, &edx) != 0)
 337     {
 338         return;
 339     }
 340     unsigned int maxStdLevel = eax;
 341
 342     if (maxStdLevel >= 0x1)
 343     {
 344         executeX86CpuID(0x1, 0, &eax, &ebx, &ecx, &edx);
 345
 346         *family   = ((eax & 0x0ff00000) >> 20) + ((eax & 0x00000f00) >> 8);
 347         *model    = ((eax & 0x000f0000) >> 12) + ((eax & 0x000000f0) >> 4);
 348         *stepping = (eax & 0x0000000f);
 349
 350         setFeatureFromBit(features, CpuInfo::Feature::X86_Sse3, ecx, 0);
 351         setFeatureFromBit(features, CpuInfo::Feature::X86_Pclmuldq, ecx, 1);
 352         setFeatureFromBit(features, CpuInfo::Feature::X86_Ssse3, ecx, 9);
 353         setFeatureFromBit(features, CpuInfo::Feature::X86_Fma, ecx, 12);
 354         setFeatureFromBit(features, CpuInfo::Feature::X86_Cx16, ecx, 13);
 355         setFeatureFromBit(features, CpuInfo::Feature::X86_Pdcm, ecx, 15);
 356         setFeatureFromBit(features, CpuInfo::Feature::X86_Pcid, ecx, 17);
 357         setFeatureFromBit(features, CpuInfo::Feature::X86_Sse4_1, ecx, 19);
 358         setFeatureFromBit(features, CpuInfo::Feature::X86_Sse4_2, ecx, 20);
 359         setFeatureFromBit(features, CpuInfo::Feature::X86_X2Apic, ecx, 21);
 360         setFeatureFromBit(features, CpuInfo::Feature::X86_Popcnt, ecx, 23);
 361         setFeatureFromBit(features, CpuInfo::Feature::X86_Tdt, ecx, 24);
 362         setFeatureFromBit(features, CpuInfo::Feature::X86_Aes, ecx, 25);
 363         setFeatureFromBit(features, CpuInfo::Feature::X86_Avx, ecx, 28);
 364         setFeatureFromBit(features, CpuInfo::Feature::X86_F16C, ecx, 29);
 365         setFeatureFromBit(features, CpuInfo::Feature::X86_Rdrnd, ecx, 30);
 366
 367         setFeatureFromBit(features, CpuInfo::Feature::X86_Pse, edx, 3);
 368         setFeatureFromBit(features, CpuInfo::Feature::X86_Msr, edx, 5);
 369         setFeatureFromBit(features, CpuInfo::Feature::X86_Cx8, edx, 8);
 370         setFeatureFromBit(features, CpuInfo::Feature::X86_Apic, edx, 9);
 371         setFeatureFromBit(features, CpuInfo::Feature::X86_Cmov, edx, 15);
 372         setFeatureFromBit(features, CpuInfo::Feature::X86_Clfsh, edx, 19);
 373         setFeatureFromBit(features, CpuInfo::Feature::X86_Mmx, edx, 23);
 374         setFeatureFromBit(features, CpuInfo::Feature::X86_Sse2, edx, 26);
 375         setFeatureFromBit(features, CpuInfo::Feature::X86_Htt, edx, 28);
 376     }
 377
 378     // Check whether Hyper-threading is really possible to enable in the hardware,
 379     // not just technically supported by this generation of processors
 380     if ((features->count(CpuInfo::Feature::X86_Htt) != 0U) && maxStdLevel >= 0x4)
 381     {
 382         executeX86CpuID(0x1, 0, &eax, &ebx, &ecx, &edx);
 383         unsigned int maxLogicalCores = (ebx >> 16) & 0x0ff;
 384         executeX86CpuID(0x4, 0, &eax, &ebx, &ecx, &edx);
 385         unsigned int maxPhysicalCores = ((eax >> 26) & 0x3f) + 1;
 386         if (maxLogicalCores / maxPhysicalCores < 2)
 387         {
 388             features->erase(CpuInfo::Feature::X86_Htt);
 389         }
 390     }
 391
 392     if (executeX86CpuID(0x80000000, 0, &eax, &ebx, &ecx, &edx) != 0)
 393     {
 394         // No point in continuing if we don't support any extended levels
 395         return;
 396     }
 397     unsigned int maxExtLevel = eax;
 398
 399     if (maxExtLevel >= 0x80000001)
 400     {
 401         executeX86CpuID(0x80000001, 0, &eax, &ebx, &ecx, &edx);
 402
 403         setFeatureFromBit(features, CpuInfo::Feature::X86_Lahf, ecx, 0);
 404         setFeatureFromBit(features, CpuInfo::Feature::X86_Sse4A, ecx, 6);
 405         setFeatureFromBit(features, CpuInfo::Feature::X86_MisalignSse, ecx, 7);
 406         setFeatureFromBit(features, CpuInfo::Feature::X86_Xop, ecx, 11);
 407         setFeatureFromBit(features, CpuInfo::Feature::X86_Fma4, ecx, 16);
 408         setFeatureFromBit(features, CpuInfo::Feature::X86_PDPE1GB, edx, 26);
 409         setFeatureFromBit(features, CpuInfo::Feature::X86_Rdtscp, edx, 27);
 410     }
 411
 412     if (maxExtLevel >= 0x80000005)
 413     {
 414         // Get the x86 CPU brand string (3 levels, 16 bytes in each)
 415         brand->clear();
 416         for (unsigned int level = 0x80000002; level < 0x80000005; level++)
 417         {
 418             executeX86CpuID(level, 0, &eax, &ebx, &ecx, &edx);
 419             // Add eax, ebx, ecx, edx contents as 4 chars each to the brand string
 420             brand->append(reinterpret_cast<const char*>(&eax), sizeof(eax));
 421             brand->append(reinterpret_cast<const char*>(&ebx), sizeof(ebx));
 422             brand->append(reinterpret_cast<const char*>(&ecx), sizeof(ecx));
 423             brand->append(reinterpret_cast<const char*>(&edx), sizeof(edx));
 424         }
 425         trimString(brand);
 426     }
 427
 428     if (maxStdLevel >= 0x7)
 429     {
 430         executeX86CpuID(0x7, 0, &eax, &ebx, &ecx, &edx);
 431
 432         setFeatureFromBit(features, CpuInfo::Feature::X86_Hle, ebx, 4);
 433         setFeatureFromBit(features, CpuInfo::Feature::X86_Avx2, ebx, 5);
 434         setFeatureFromBit(features, CpuInfo::Feature::X86_Rtm, ebx, 11);
 435         setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512F, ebx, 16);
 436         setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512PF, ebx, 26);
 437         setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512ER, ebx, 27);
 438         setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512CD, ebx, 28);
 439         setFeatureFromBit(features, CpuInfo::Feature::X86_Sha, ebx, 29);
 440         setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512BW, ebx, 30);
 441         setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512VL, ebx, 31);
 442
 443         executeX86CpuID(0x7, 0x1, &eax, &ebx, &ecx, &edx);
 444         setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512BF16, eax, 5);
 445
 446         if (features->count(CpuInfo::Feature::X86_Avx512F) != 0)
 447         {
 448             // Only checking if the CPU supports AVX-512. There is no CPUID bit for this.
 449             if (detectProcCpuInfoSecondAvx512FMA(*brand, *model))
 450             {
 451                 features->insert(CpuInfo::Feature::X86_Avx512secondFMA);
 452             }
 453         }
 454     }
 455
 456
 457     if (maxExtLevel >= 0x80000007)
 458     {
 459         executeX86CpuID(0x80000007, 0, &eax, &ebx, &ecx, &edx);
 460
 461         setFeatureFromBit(features, CpuInfo::Feature::X86_NonstopTsc, edx, 8);
 462     }
 463 }
 464
 465
 466 /*! \brief Return a vector with x86 APIC IDs for all threads
 467  *
 468  *  \param haveX2Apic  True if the processors supports x2APIC, otherwise vanilla APIC.
 469  *
 470  *  \returns A new std::vector of unsigned integer APIC IDs, one for each
 471  *           logical processor in the system.
 472  */
 473 std::vector<unsigned int> detectX86ApicIDs(bool gmx_unused haveX2Apic)
 474 {
 475     std::vector<unsigned int> apicID;
 476
 477     // We cannot just ask for all APIC IDs, but must force execution on each
 478     // hardware thread and extract the APIC id there.
 479 #if HAVE_SCHED_AFFINITY && defined HAVE_SYSCONF
 480     unsigned int eax, ebx, ecx, edx;
 481     unsigned int nApic = sysconf(_SC_NPROCESSORS_ONLN);
 482     cpu_set_t    saveCpuSet;
 483     cpu_set_t    cpuSet;
 484     sched_getaffinity(0, sizeof(cpu_set_t), &saveCpuSet);
 485     CPU_ZERO(&cpuSet);
 486     for (unsigned int i = 0; i < nApic; i++)
 487     {
 488         CPU_SET(i, &cpuSet);
 489         sched_setaffinity(0, sizeof(cpu_set_t), &cpuSet);
 490         if (haveX2Apic)
 491         {
 492             executeX86CpuID(0xb, 0, &eax, &ebx, &ecx, &edx);
 493             apicID.push_back(edx);
 494         }
 495         else
 496         {
 497             executeX86CpuID(0x1, 0, &eax, &ebx, &ecx, &edx);
 498             apicID.push_back(ebx >> 24);
 499         }
 500         CPU_CLR(i, &cpuSet);
 501     }
 502     sched_setaffinity(0, sizeof(cpu_set_t), &saveCpuSet);
 503 #elif GMX_NATIVE_WINDOWS
 504     unsigned int eax, ebx, ecx, edx;
 505     SYSTEM_INFO sysinfo;
 506     GetSystemInfo(&sysinfo);
 507     unsigned int nApic = sysinfo.dwNumberOfProcessors;
 508     unsigned int saveAffinity = SetThreadAffinityMask(GetCurrentThread(), 1);
 509     for (DWORD_PTR i = 0; i < nApic; i++)
 510     {
 511         SetThreadAffinityMask(GetCurrentThread(), (((DWORD_PTR)1) << i));
 512         Sleep(0);
 513         if (haveX2Apic)
 514         {
 515             executeX86CpuID(0xb, 0, &eax, &ebx, &ecx, &edx);
 516             apicID.push_back(edx);
 517         }
 518         else
 519         {
 520             executeX86CpuID(0x1, 0, &eax, &ebx, &ecx, &edx);
 521             apicID.push_back(ebx >> 24);
 522         }
 523     }
 524     SetThreadAffinityMask(GetCurrentThread(), saveAffinity);
 525 #endif
 526     return apicID;
 527 }
 528
 529
 530 /*! \brief Utility to renumber indices extracted from APIC IDs
 531  *
 532  * \param v  Vector with unsigned integer indices
 533  *
 534  * This routine returns the number of unique different elements found in the vector,
 535  * and renumbers these starting from 0. For example, the vector {0,1,2,8,9,10,8,9,10,0,1,2}
 536  * will be rewritten to {0,1,2,3,4,5,3,4,5,0,1,2}, and it returns 6 for the
 537  * number of unique elements.
 538  */
 539 void renumberIndex(std::vector<unsigned int>* v)
 540 {
 541     std::vector<unsigned int> sortedV(*v);
 542     std::sort(sortedV.begin(), sortedV.end());
 543
 544     std::vector<unsigned int> uniqueSortedV(sortedV);
 545     auto                      it = std::unique(uniqueSortedV.begin(), uniqueSortedV.end());
 546     uniqueSortedV.resize(std::distance(uniqueSortedV.begin(), it));
 547
 548     for (std::size_t i = 0; i < uniqueSortedV.size(); i++)
 549     {
 550         unsigned int val = uniqueSortedV[i];
 551         std::replace_if(v->begin(), v->end(), [val](unsigned int& c) -> bool { return c == val; },
 552                         static_cast<unsigned int>(i));
 553     }
 554 }
 555
 556 /*! \brief The layout of the bits in the APIC ID */
 557 struct ApicIdLayout
 558 {
 559     unsigned int hwThreadBits; //!< The number of least significant bits for hw-threads
 560     unsigned int coreBits;     //!< The number of core bits following the  hw-thread bits
 561 };
 562
 563 /*! \brief Detect the APIC ID layout for x2APIC
 564  */
 565 ApicIdLayout detectX2ApicIdLayout()
 566 {
 567     ApicIdLayout layout;
 568
 569     unsigned int eax;
 570     unsigned int ebx;
 571     unsigned int ecx;
 572     unsigned int edx;
 573     executeX86CpuID(0xb, 0, &eax, &ebx, &ecx, &edx);
 574     layout.hwThreadBits = eax & 0x1f;
 575     executeX86CpuID(0xb, 1, &eax, &ebx, &ecx, &edx);
 576     layout.coreBits = (eax & 0x1f) - layout.hwThreadBits;
 577
 578     return layout;
 579 }
 580
 581 /*! \brief Detect the APIC ID layout for standard APIC or xAPIC on AMD
 582  *
 583  * \param[in] maxExtLevel  The largest CPUID extended function input value supported by the processor implementation
 584  */
 585 ApicIdLayout detectAmdApicIdLayout(unsigned int maxExtLevel)
 586 {
 587     ApicIdLayout layout;
 588
 589     unsigned int eax;
 590     unsigned int ebx;
 591     unsigned int ecx;
 592     unsigned int edx;
 593     executeX86CpuID(0x1, 0, &eax, &ebx, &ecx, &edx);
 594     int family = ((eax & 0x0ff00000) >> 20) + ((eax & 0x00000f00) >> 8);
 595     executeX86CpuID(0x80000001, 0, &eax, &ebx, &ecx, &edx);
 596     bool haveExtendedTopology = (ecx & (1 << 22)) != 0U;
 597
 598     // NOTE: Here we assume 1 thread per core, unless we have family >= 17h
 599     layout.hwThreadBits = 0;
 600     if (family >= 0x17 && haveExtendedTopology && maxExtLevel >= 0x8000001e)
 601     {
 602         executeX86CpuID(0x8000001e, 1, &eax, &ebx, &ecx, &edx);
 603         int numThreadsPerCore = ((ebx >> 8) & 0xff) + 1;
 604         // NOTE: The AMD documentation only specifies the layout of apicid
 605         //       when we have 1 or 2 threads per core.
 606         while (numThreadsPerCore > (1 << layout.hwThreadBits))
 607         {
 608             layout.hwThreadBits++;
 609         }
 610     }
 611
 612     // Get number of core bits in apic ID - try modern extended method first
 613     executeX86CpuID(0x80000008, 0, &eax, &ebx, &ecx, &edx);
 614     layout.coreBits = (ecx >> 12) & 0xf;
 615     if (layout.coreBits == 0)
 616     {
 617         // Legacy method for old single/dual core AMD CPUs
 618         int i = ecx & 0xf;
 619         while (i >> layout.coreBits)
 620         {
 621             layout.coreBits++;
 622         }
 623     }
 624
 625     return layout;
 626 }
 627
 628 /*! \brief Try to detect basic CPU topology information using x86 cpuid
 629  *
 630  *  If x2APIC support is present, this is our first choice, otherwise we
 631  *  attempt to use old vanilla APIC.
 632  *
 633  *  \return A new vector of entries with socket, core, hwthread information
 634  *          for each logical processor.
 635  */
 636 std::vector<CpuInfo::LogicalProcessor> detectX86LogicalProcessors()
 637 {
 638     unsigned int eax;
 639     unsigned int ebx;
 640     unsigned int ecx;
 641     unsigned int edx;
 642     unsigned int maxStdLevel;
 643     unsigned int maxExtLevel;
 644     bool         haveApic;
 645     bool         haveX2Apic;
 646
 647     std::vector<CpuInfo::LogicalProcessor> logicalProcessors;
 648
 649     // Find largest standard & extended level input values allowed
 650     executeX86CpuID(0x0, 0, &eax, &ebx, &ecx, &edx);
 651     maxStdLevel = eax;
 652     executeX86CpuID(0x80000000, 0, &eax, &ebx, &ecx, &edx);
 653     maxExtLevel = eax;
 654
 655     if (maxStdLevel >= 0x1)
 656     {
 657         executeX86CpuID(0x1, 0, &eax, &ebx, &ecx, &edx);
 658         haveX2Apic = ((ecx & (1 << 21)) != 0U) && maxStdLevel >= 0xb;
 659         haveApic   = ((edx & (1 << 9)) != 0U) && maxExtLevel >= 0x80000008;
 660     }
 661     else
 662     {
 663         haveX2Apic = false;
 664         haveApic   = false;
 665     }
 666
 667     if (haveX2Apic || haveApic)
 668     {
 669         ApicIdLayout layout;
 670         // Get bits for cores and hardware threads
 671         if (haveX2Apic)
 672         {
 673             layout = detectX2ApicIdLayout();
 674         }
 675         else // haveApic
 676         {
 677             if (detectX86Vendor() == CpuInfo::Vendor::Amd || detectX86Vendor() == CpuInfo::Vendor::Hygon)
 678             {
 679                 layout = detectAmdApicIdLayout(maxExtLevel);
 680
 681                 if (layout.hwThreadBits > 1)
 682                 {
 683                     // At the time of writing this code we do not know what
 684                     // to do with more than 2 threads, so return empty.
 685                     return logicalProcessors;
 686                 }
 687             }
 688             else
 689             {
 690                 // We do not know the APIC ID layout, return empty.
 691                 return logicalProcessors;
 692             }
 693         }
 694
 695         std::vector<unsigned int> apicID = detectX86ApicIDs(haveX2Apic);
 696
 697         if (!apicID.empty())
 698         {
 699             // APIC IDs can be buggy, and it is always a mess. Typically more bits are
 700             // reserved than needed, and the numbers might not increment by 1 even in
 701             // a single socket or core. Extract, renumber, and check that things make sense.
 702             unsigned int              hwThreadMask = (1 << layout.hwThreadBits) - 1;
 703             unsigned int              coreMask     = (1 << layout.coreBits) - 1;
 704             std::vector<unsigned int> hwThreadRanks;
 705             std::vector<unsigned int> coreRanks;
 706             std::vector<unsigned int> socketRanks;
 707
 708             for (auto a : apicID)
 709             {
 710                 hwThreadRanks.push_back(static_cast<int>(a & hwThreadMask));
 711                 coreRanks.push_back(static_cast<int>((a >> layout.hwThreadBits) & coreMask));
 712                 socketRanks.push_back(static_cast<int>(a >> (layout.coreBits + layout.hwThreadBits)));
 713             }
 714
 715             renumberIndex(&hwThreadRanks);
 716             renumberIndex(&coreRanks);
 717             renumberIndex(&socketRanks);
 718
 719             unsigned int hwThreadRankSize =
 720                     1 + *std::max_element(hwThreadRanks.begin(), hwThreadRanks.end());
 721             unsigned int coreRankSize = 1 + *std::max_element(coreRanks.begin(), coreRanks.end());
 722             unsigned int socketRankSize = 1 + *std::max_element(socketRanks.begin(), socketRanks.end());
 723
 724             if (socketRankSize * coreRankSize * hwThreadRankSize == apicID.size())
 725             {
 726                 // Alright, everything looks consistent, so put it in the result
 727                 for (std::size_t i = 0; i < apicID.size(); i++)
 728                 {
 729                     // While the internal APIC IDs are always unsigned integers, we also cast to
 730                     // plain integers for the externally exposed vectors, since that will make
 731                     // it possible to use '-1' for invalid entries in the future.
 732                     logicalProcessors.push_back(
 733                             { int(socketRanks[i]), int(coreRanks[i]), int(hwThreadRanks[i]) });
 734                 }
 735             }
 736         }
 737     }
 738     return logicalProcessors; // Will only have contents if everything worked
 739 }
 740
 741
 742 /******************************************************************************
 743  *                                                                            *
 744  *              Generic Linux detection by parsing /proc/cpuinfo              *
 745  *                                                                            *
 746  ******************************************************************************/
 747
 748 /*! \brief Parse /proc/cpuinfo into a simple string map
 749  *
 750  * This routine will read the contents of /proc/cpuinfo, and for each
 751  * line that is not empty we will assign the (trimmed) string to the right of
 752  * the colon as a key, and the left-hand side as the value in the map.
 753  * For multi-processor systems where lines are repeated the latter lines will
 754  * overwrite the first occurrence.
 755  *
 756  * \return New map with the contents. If the file is not available, the returned
 757  *         map will be empty.
 758  */
 759 std::map<std::string, std::string> parseProcCpuInfo()
 760 {
 761     std::ifstream                      procCpuInfo("/proc/cpuinfo");
 762     std::string                        line;
 763     std::map<std::string, std::string> cpuInfo;
 764
 765     while (std::getline(procCpuInfo, line))
 766     {
 767         if (!line.empty())
 768         {
 769             std::stringstream iss(line);
 770             std::string       key;
 771             std::string       val;
 772             std::getline(iss, key, ':'); // part before colon
 773             std::getline(iss, val);      // part after colon
 774             trimString(&key);
 775             trimString(&val);
 776             // put it in the map. This will overwrite previous processors, but we don't care.
 777             cpuInfo[key] = val;
 778         }
 779     }
 780     return cpuInfo;
 781 }
 782
 783
 784 /*! \brief Try to detect vendor from /proc/cpuinfo
 785  *
 786  *  \param cpuInfo  Map returned from parseProcCpuinfo()
 787  *
 788  *  This routine tries to match a few common labels in /proc/cpuinfo to see if
 789  *  they begin with the name of a standard vendor. If the file cannot be read
 790  *  or if no match is found, we return gmx::CpuInfo::Vendor::Unknown.
 791  */
 792 CpuInfo::Vendor detectProcCpuInfoVendor(const std::map<std::string, std::string>& cpuInfo)
 793 {
 794     const std::map<std::string, CpuInfo::Vendor> testVendors = {
 795         { "GenuineIntel", CpuInfo::Vendor::Intel },
 796         { "Intel", CpuInfo::Vendor::Intel },
 797         { "AuthenticAmd", CpuInfo::Vendor::Amd },
 798         { "AMD", CpuInfo::Vendor::Amd },
 799         { "ARM", CpuInfo::Vendor::Arm },
 800         { "AArch64", CpuInfo::Vendor::Arm },
 801         { "Fujitsu", CpuInfo::Vendor::Fujitsu },
 802         { "IBM", CpuInfo::Vendor::Ibm },
 803         { "POWER", CpuInfo::Vendor::Ibm },
 804         { "Oracle", CpuInfo::Vendor::Oracle },
 805         { "HygonGenuine", CpuInfo::Vendor::Hygon },
 806         { "Hygon", CpuInfo::Vendor::Hygon },
 807     };
 808
 809     // For each label in /proc/cpuinfo, compare the value to the name in the
 810     // testNames map above, and if it's a match return the vendor.
 811     for (auto& l : { "vendor_id", "vendor", "manufacture", "model", "processor", "cpu" })
 812     {
 813         if (cpuInfo.count(l) != 0U)
 814         {
 815             // there was a line with this left-hand side in /proc/cpuinfo
 816             const std::string& s1 = cpuInfo.at(l);
 817
 818             for (auto& t : testVendors)
 819             {
 820                 const std::string& s2 = t.first;
 821
 822                 // If the entire name we are testing (s2) matches the first part of
 823                 // the string after the colon in /proc/cpuinfo (s1) we found our vendor
 824                 if (std::equal(s2.begin(), s2.end(), s1.begin(), [](const char& x, const char& y) -> bool {
 825                         return tolower(x) == tolower(y);
 826                     }))
 827                 {
 828                     return t.second;
 829                 }
 830             }
 831         }
 832     }
 833     return CpuInfo::Vendor::Unknown;
 834 }
 835
 836
 837 /*! \brief Detect IBM processor name and features from /proc/cpuinfo
 838  *
 839  *  \param      cpuInfo    Map returned from parseProcCpuinfo()
 840  *  \param[out] brand      String where to write the brand string
 841  *  \param[out] features   Feature set where supported features are inserted
 842  *
 843  *  This routine tries to match a few common labels in /proc/cpuinfo to see if
 844  *  we can find the processor name and features. It is likely fragile.
 845  */
 846 void detectProcCpuInfoIbm(const std::map<std::string, std::string>& cpuInfo,
 847                           std::string*                              brand,
 848                           std::set<CpuInfo::Feature>*               features)
 849 {
 850     // Get brand string from 'cpu' label if present, otherwise 'Processor'
 851     if (cpuInfo.count("cpu") != 0U)
 852     {
 853         *brand = cpuInfo.at("cpu");
 854     }
 855     else if (cpuInfo.count("Processor") != 0U)
 856     {
 857         *brand = cpuInfo.at("Processor");
 858     }
 859
 860     if (brand->find("A2") != std::string::npos)
 861     {
 862         // If the processor identification contains "A2", this is BlueGene/Q with QPX
 863         features->insert(CpuInfo::Feature::Ibm_Qpx);
 864     }
 865
 866     for (auto& l : { "model name", "model", "Processor", "cpu" })
 867     {
 868         if (cpuInfo.count(l) != 0U)
 869         {
 870             std::string s1 = cpuInfo.at(l);
 871             std::transform(s1.begin(), s1.end(), s1.begin(), ::tolower);
 872
 873             if (s1.find("altivec") != std::string::npos)
 874             {
 875                 features->insert(CpuInfo::Feature::Ibm_Vmx);
 876                 // If this is a power6, we only have VMX. All later processors have VSX.
 877                 if (s1.find("power6") == std::string::npos)
 878                 {
 879                     features->insert(CpuInfo::Feature::Ibm_Vsx);
 880                 }
 881             }
 882         }
 883     }
 884 }
 885
 886
 887 /*! \brief Detect ARM processor name and features from /proc/cpuinfo
 888  *
 889  *  \param      cpuInfo    Map returned from parseProcCpuinfo()
 890  *  \param[out] brand      String where to write the brand string
 891  *  \param[out] family     Major version of processor
 892  *  \param[out] model      Middle version of processor
 893  *  \param[out] stepping   Minor version of processor
 894  *  \param[out] features   Feature set where supported features are inserted
 895  *
 896  *  This routine tries to match a few common labels in /proc/cpuinfo to see if
 897  *  we can find the processor name and features. It is likely fragile.
 898  */
 899 void detectProcCpuInfoArm(const std::map<std::string, std::string>& cpuInfo,
 900                           std::string*                              brand,
 901                           int*                                      family,
 902                           int*                                      model,
 903                           int*                                      stepping,
 904                           std::set<CpuInfo::Feature>*               features)
 905 {
 906     if (cpuInfo.count("Processor") != 0U)
 907     {
 908         *brand = cpuInfo.at("Processor");
 909     }
 910     else if (cpuInfo.count("model name") != 0U)
 911     {
 912         *brand = cpuInfo.at("model name");
 913     }
 914
 915     if (cpuInfo.count("CPU architecture") != 0U)
 916     {
 917         *family = std::strtol(cpuInfo.at("CPU architecture").c_str(), nullptr, 10);
 918         // For some 64-bit CPUs it appears to say 'AArch64' instead
 919         if (*family == 0 && cpuInfo.at("CPU architecture").find("AArch64") != std::string::npos)
 920         {
 921             *family = 8; // fragile - no idea how a future ARMv9 will be represented in this case
 922         }
 923     }
 924     if (cpuInfo.count("CPU variant") != 0U)
 925     {
 926         *model = std::strtol(cpuInfo.at("CPU variant").c_str(), nullptr, 16);
 927     }
 928     if (cpuInfo.count("CPU revision") != 0U)
 929     {
 930         *stepping = std::strtol(cpuInfo.at("CPU revision").c_str(), nullptr, 10);
 931     }
 932
 933     if (cpuInfo.count("Features") != 0U)
 934     {
 935         const std::string& s = cpuInfo.at("Features");
 936         if (s.find("neon") != std::string::npos)
 937         {
 938             features->insert(CpuInfo::Feature::Arm_Neon);
 939         }
 940         if (s.find("asimd") != std::string::npos)
 941         {
 942             // At least Jetson TX1 runs a 32-bit environment by default, although
 943             // the kernel is 64-bits, and reports asimd feature flags. We cannot
 944             // use Neon-asimd in this case, so make sure we are on a 64-bit platform.
 945             if (sizeof(void*) == 8)
 946             {
 947                 features->insert(CpuInfo::Feature::Arm_NeonAsimd);
 948             }
 949         }
 950         if (s.find("sve") != std::string::npos)
 951         {
 952             features->insert(CpuInfo::Feature::Arm_Sve);
 953         }
 954     }
 955 }
 956
 957
 958 /*! \brief Try to detect vendor, cpu and features from /proc/cpuinfo
 959  *
 960  *  \param[out] vendor     Detected hardware vendor
 961  *  \param[out] brand      String where to write the brand string
 962  *  \param[out] family     Major version of processor
 963  *  \param[out] model      Middle version of processor
 964  *  \param[out] stepping   Minor version of processor
 965  *  \param[out] features   Feature set where supported features are inserted
 966  *
 967  *  This routine reads the /proc/cpuinfo file into a map and calls subroutines
 968  *  that attempt to parse by matching keys and values to known strings. It is
 969  *  much more fragile than our x86 detection, but it does not depend on
 970  *  specific system calls, intrinsics or assembly instructions.
 971  */
 972 void detectProcCpuInfo(CpuInfo::Vendor*            vendor,
 973                        std::string*                brand,
 974                        int*                        family,
 975                        int*                        model,
 976                        int*                        stepping,
 977                        std::set<CpuInfo::Feature>* features)
 978 {
 979     std::map<std::string, std::string> cpuInfo = parseProcCpuInfo();
 980
 981     if (*vendor == CpuInfo::Vendor::Unknown)
 982     {
 983         *vendor = detectProcCpuInfoVendor(cpuInfo);
 984     }
 985
 986     // Unfortunately there is no standard for contents in /proc/cpuinfo. We cannot
 987     // indiscriminately look for e.g. 'cpu' since it could be either name or an index.
 988     // To handle this slightly better we use one subroutine per vendor.
 989     switch (*vendor)
 990     {
 991         case CpuInfo::Vendor::Ibm: detectProcCpuInfoIbm(cpuInfo, brand, features); break;
 992
 993         case CpuInfo::Vendor::Arm:
 994             detectProcCpuInfoArm(cpuInfo, brand, family, model, stepping, features);
 995             break;
 996
 997         default:
 998             // We only have a single check for fujitsu for now
 999 #ifdef __HPC_ACE__
1000             features->insert(CpuInfo::Feature::Fujitsu_HpcAce);
1001 #endif
1002             break;
1003     }
1004 }
1005 /*! \endcond */
1006 } // namespace
1007
1008
1009 // static
1010 CpuInfo CpuInfo::detect()
1011 {
1012     CpuInfo result;
1013
1014     if (c_architecture == Architecture::X86)
1015     {
1016         result.vendor_ = detectX86Vendor();
1017
1018         if (result.vendor_ == CpuInfo::Vendor::Intel)
1019         {
1020             result.features_.insert(CpuInfo::Feature::X86_Intel);
1021         }
1022         else if (result.vendor_ == CpuInfo::Vendor::Amd)
1023         {
1024             result.features_.insert(CpuInfo::Feature::X86_Amd);
1025         }
1026         else if (result.vendor_ == CpuInfo::Vendor::Hygon)
1027         {
1028             result.features_.insert(CpuInfo::Feature::X86_Hygon);
1029         }
1030         detectX86Features(&result.brandString_, &result.family_, &result.model_, &result.stepping_,
1031                           &result.features_);
1032         result.logicalProcessors_ = detectX86LogicalProcessors();
1033     }
1034     else
1035     {
1036         // Not x86
1037         if (c_architecture == Architecture::Arm)
1038         {
1039             result.vendor_ = CpuInfo::Vendor::Arm;
1040         }
1041         else if (c_architecture == Architecture::PowerPC)
1042         {
1043             result.vendor_ = CpuInfo::Vendor::Ibm;
1044         }
1045
1046 #if defined __aarch64__ || (defined _M_ARM && _M_ARM >= 8)
1047         result.features_.insert(Feature::Arm_Neon);      // ARMv8 always has Neon
1048         result.features_.insert(Feature::Arm_NeonAsimd); // ARMv8 always has Neon-asimd
1049 #endif
1050 #if defined __arch64__ && defined __ARM_FEATURE_SVE
1051         result.features_.insert(Feature::Arm_Sve);
1052 #endif
1053
1054 #if defined sun
1055         result.vendor_ = CpuInfo::Vendor::Oracle;
1056 #endif
1057
1058         // On Linux we might be able to find information in /proc/cpuinfo. If vendor or brand
1059         // is set to a known value this routine will not overwrite it.
1060         detectProcCpuInfo(&result.vendor_, &result.brandString_, &result.family_, &result.model_,
1061                           &result.stepping_, &result.features_);
1062     }
1063
1064     if (!result.logicalProcessors_.empty())
1065     {
1066         result.supportLevel_ = CpuInfo::SupportLevel::LogicalProcessorInfo;
1067     }
1068     else if (!result.features_.empty())
1069     {
1070         result.supportLevel_ = CpuInfo::SupportLevel::Features;
1071     }
1072     else if (result.vendor_ != CpuInfo::Vendor::Unknown
1073              || result.brandString_ != "Unknown CPU brand")
1074     {
1075         result.supportLevel_ = CpuInfo::SupportLevel::Name;
1076     }
1077     else
1078     {
1079         result.supportLevel_ = CpuInfo::SupportLevel::None;
1080     }
1081
1082     return result;
1083 }
1084
1085 CpuInfo::CpuInfo() :
1086     vendor_(CpuInfo::Vendor::Unknown),
1087     brandString_("Unknown CPU brand"),
1088     family_(0),
1089     model_(0),
1090     stepping_(0)
1091 {
1092 }
1093
1094 const std::string& CpuInfo::vendorString() const
1095 {
1096     static const std::map<Vendor, std::string> vendorStrings = {
1097         { Vendor::Unknown, "Unknown vendor" }, { Vendor::Intel, "Intel" }, { Vendor::Amd, "AMD" },
1098         { Vendor::Fujitsu, "Fujitsu" },        { Vendor::Ibm, "IBM" },     { Vendor::Arm, "ARM" },
1099         { Vendor::Oracle, "Oracle" },          { Vendor::Hygon, "Hygon" },
1100     };
1101
1102     return vendorStrings.at(vendor_);
1103 }
1104
1105
1106 const std::string& CpuInfo::featureString(Feature f)
1107 {
1108     static const std::map<Feature, std::string> featureStrings = {
1109         { Feature::X86_Aes, "aes" },
1110         { Feature::X86_Amd, "amd" },
1111         { Feature::X86_Apic, "apic" },
1112         { Feature::X86_Avx, "avx" },
1113         { Feature::X86_Avx2, "avx2" },
1114         { Feature::X86_Avx512F, "avx512f" },
1115         { Feature::X86_Avx512PF, "avx512pf" },
1116         { Feature::X86_Avx512ER, "avx512er" },
1117         { Feature::X86_Avx512CD, "avx512cd" },
1118         { Feature::X86_Avx512BW, "avx512bw" },
1119         { Feature::X86_Avx512VL, "avx512vl" },
1120         { Feature::X86_Avx512BF16, "avx512bf16" },
1121         { Feature::X86_Avx512secondFMA, "avx512secondFMA" },
1122         { Feature::X86_Clfsh, "clfsh" },
1123         { Feature::X86_Cmov, "cmov" },
1124         { Feature::X86_Cx8, "cx8" },
1125         { Feature::X86_Cx16, "cx16" },
1126         { Feature::X86_F16C, "f16c" },
1127         { Feature::X86_Fma, "fma" },
1128         { Feature::X86_Fma4, "fma4" },
1129         { Feature::X86_Hle, "hle" },
1130         { Feature::X86_Htt, "htt" },
1131         { Feature::X86_Intel, "intel" },
1132         { Feature::X86_Lahf, "lahf" },
1133         { Feature::X86_MisalignSse, "misalignsse" },
1134         { Feature::X86_Mmx, "mmx" },
1135         { Feature::X86_Msr, "msr" },
1136         { Feature::X86_NonstopTsc, "nonstop_tsc" },
1137         { Feature::X86_Pcid, "pcid" },
1138         { Feature::X86_Pclmuldq, "pclmuldq" },
1139         { Feature::X86_Pdcm, "pdcm" },
1140         { Feature::X86_PDPE1GB, "pdpe1gb" },
1141         { Feature::X86_Popcnt, "popcnt" },
1142         { Feature::X86_Pse, "pse" },
1143         { Feature::X86_Rdrnd, "rdrnd" },
1144         { Feature::X86_Rdtscp, "rdtscp" },
1145         { Feature::X86_Rtm, "rtm" },
1146         { Feature::X86_Sha, "sha" },
1147         { Feature::X86_Sse2, "sse2" },
1148         { Feature::X86_Sse3, "sse3" },
1149         { Feature::X86_Sse4A, "sse4a" },
1150         { Feature::X86_Sse4_1, "sse4.1" },
1151         { Feature::X86_Sse4_2, "sse4.2" },
1152         { Feature::X86_Ssse3, "ssse3" },
1153         { Feature::X86_Tdt, "tdt" },
1154         { Feature::X86_X2Apic, "x2apic" },
1155         { Feature::X86_Xop, "xop" },
1156         { Feature::Arm_Neon, "neon" },
1157         { Feature::Arm_NeonAsimd, "neon_asimd" },
1158         { Feature::Arm_Sve, "sve" },
1159         { Feature::Ibm_Qpx, "qpx" },
1160         { Feature::Ibm_Vmx, "vmx" },
1161         { Feature::Ibm_Vsx, "vsx" },
1162         { Feature::Fujitsu_HpcAce, "hpc-ace" },
1163         { Feature::X86_Hygon, "hygon" }
1164     };
1165     return featureStrings.at(f);
1166 }
1167
1168
1169 bool cpuIsX86Nehalem(const CpuInfo& cpuInfo)
1170 {
1171     return (cpuInfo.vendor() == CpuInfo::Vendor::Intel && cpuInfo.family() == 6
1172             && (cpuInfo.model() == 0x2E || cpuInfo.model() == 0x1A || cpuInfo.model() == 0x1E
1173                 || cpuInfo.model() == 0x2F || cpuInfo.model() == 0x2C || cpuInfo.model() == 0x25));
1174 }
1175
1176 bool cpuIsAmdZen1(const CpuInfo& cpuInfo)
1177 {
1178     /* Both Zen/Zen+/Zen2 have family==23
1179      * Model numbers for Zen:
1180      * 1)  Naples, Whitehaven, Summit Ridge, and Snowy Owl;
1181      * 17) Raven Ridge.
1182      * Model numbers for Zen+:
1183      * 8)  Pinnacle Ridge;
1184      * 24) Picasso.
1185      * Hygon got license for Zen1, but not Zen2 (https://www.tomshardware.com/news/amd-zen-china-x86-ip-license,39573.html)
1186      */
1187     return (cpuInfo.vendor() == CpuInfo::Vendor::Amd && cpuInfo.family() == 23
1188             && (cpuInfo.model() == 1 || cpuInfo.model() == 17 || cpuInfo.model() == 8
1189                 || cpuInfo.model() == 24))
1190            || (cpuInfo.vendor() == CpuInfo::Vendor::Hygon);
1191 }
1192
1193 } // namespace gmx
1194
1195 #ifdef GMX_CPUINFO_STANDALONE
1196 int main(int argc, char** argv)
1197 {
1198     if (argc < 2)
1199     {
1200         fprintf(stdout,
1201                 "Usage:\n\n%s [flags]\n\n"
1202                 "Available flags:\n"
1203                 "-vendor        Print CPU vendor.\n"
1204                 "-brand         Print CPU brand string.\n"
1205                 "-family        Print CPU family version.\n"
1206                 "-model         Print CPU model version.\n"
1207                 "-stepping      Print CPU stepping version.\n"
1208                 "-features      Print CPU feature flags.\n",
1209                 argv[0]);
1210         exit(1);
1211     }
1212
1213     std::string  arg(argv[1]);
1214     gmx::CpuInfo cpuInfo(gmx::CpuInfo::detect());
1215
1216     if (arg == "-vendor")
1217     {
1218         printf("%s\n", cpuInfo.vendorString().c_str());
1219     }
1220     else if (arg == "-brand")
1221     {
1222         printf("%s\n", cpuInfo.brandString().c_str());
1223     }
1224     else if (arg == "-family")
1225     {
1226         printf("%d\n", cpuInfo.family());
1227     }
1228     else if (arg == "-model")
1229     {
1230         printf("%d\n", cpuInfo.model());
1231     }
1232     else if (arg == "-stepping")
1233     {
1234         printf("%d\n", cpuInfo.stepping());
1235     }
1236     else if (arg == "-features")
1237     {
1238         // Separate the feature strings with spaces. Note that in the
1239         // GROMACS cmake code, surrounding whitespace is first
1240         // stripped by the CPU detection routine, and then added back
1241         // in the code for making the SIMD suggestion.
1242         for (auto& f : cpuInfo.featureSet())
1243         {
1244             printf("%s ", cpuInfo.featureString(f).c_str());
1245         }
1246         printf("\n");
1247     }
1248     else if (arg == "-topology")
1249     {
1250         // Undocumented debug option, usually not present in standalone version
1251         for (auto& t : cpuInfo.logicalProcessors())
1252         {
1253             printf("%3u %3u %3u\n", t.socketRankInMachine, t.coreRankInSocket, t.hwThreadRankInCore);
1254         }
1255     }
1256     return 0;
1257 }
1258 #endif