2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2017,2018,2019, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief Implements a routine to check the number of AVX512 fma units
39 * Just as the CpuInfo code, we need to be able to compile this file in stand-alone mode
40 * to set the SIMD acceleration and similar things during CMake configuration.
43 #ifndef GMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE
47 #include "identifyavx512fmaunits.h"
49 #ifndef GMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE
59 #ifndef GMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE
60 # include "gromacs/hardware/cpuinfo.h"
69 #if GMX_X86_GCC_INLINE_ASM && SIMD_AVX_512_CXX_SUPPORTED
70 /*! \brief Loop over mixed FMA and shuffle AVX512 instructions
72 * This function executes a meaningless loop that includes both
73 * FMA and shuffle instructions from the AVX512 instruction set.
74 * We need a bit of complex logic to make sure it cannot be
75 * optimized away by the compiler.
77 * \param loopCount Number of iterations. Each iteration will
78 * execute 12 FMA and 12 shuffle instructions.
79 * \return Number of cycles used for the loop.
81 uint64_t timeFmaAndShuffleLoop(uint64_t loopCount
)
84 // Unfortunately we need to resort to inline ASM since we are
85 // making a choice based on timing, and without efficient optimization
86 // (e.g. when doing debugging) the usual intrinsics are often implemented
87 // as independent load/store operations, which completely screws up timing.
// Setup, part 1: clear zmm0 by XOR-ing it with itself, then copy it into
// zmm1-zmm11. These twelve registers are the operands of the FMAs below.
89 "\tvpxord %%zmm0, %%zmm0, %%zmm0\n"
90 "\tvmovaps %%zmm0, %%zmm1\n"
91 "\tvmovaps %%zmm0, %%zmm2\n"
92 "\tvmovaps %%zmm0, %%zmm3\n"
93 "\tvmovaps %%zmm0, %%zmm4\n"
94 "\tvmovaps %%zmm0, %%zmm5\n"
95 "\tvmovaps %%zmm0, %%zmm6\n"
96 "\tvmovaps %%zmm0, %%zmm7\n"
97 "\tvmovaps %%zmm0, %%zmm8\n"
98 "\tvmovaps %%zmm0, %%zmm9\n"
99 "\tvmovaps %%zmm0, %%zmm10\n"
100 "\tvmovaps %%zmm0, %%zmm11\n"
// Setup, part 2: clear zmm12 and copy it into zmm13-zmm23 and zmm30.
// zmm30 is the source of every vpermd below; zmm12-zmm23 are their destinations.
101 "\tvpxord %%zmm12, %%zmm12, %%zmm12\n"
102 "\tvmovaps %%zmm12, %%zmm13\n"
103 "\tvmovaps %%zmm12, %%zmm14\n"
104 "\tvmovaps %%zmm12, %%zmm15\n"
105 "\tvmovaps %%zmm12, %%zmm16\n"
106 "\tvmovaps %%zmm12, %%zmm17\n"
107 "\tvmovaps %%zmm12, %%zmm18\n"
108 "\tvmovaps %%zmm12, %%zmm19\n"
109 "\tvmovaps %%zmm12, %%zmm20\n"
110 "\tvmovaps %%zmm12, %%zmm21\n"
111 "\tvmovaps %%zmm12, %%zmm22\n"
112 "\tvmovaps %%zmm12, %%zmm23\n"
113 "\tvmovaps %%zmm12, %%zmm30\n"
// Start stamp: shift rdx up by 32 bits, zero-extend eax into rax, and OR the
// two halves together into rbx.
// NOTE(review): the instruction that loads edx:eax (presumably a
// timestamp-counter read) is not visible in this excerpt - confirm.
115 "\tsalq $32, %%rdx\n"
116 "\tmovl %%eax, %%eax\n"
117 "\tmovq %%rdx, %%rbx\n"
118 "\torq %%rax, %%rbx\n"
// Timed work, FMA half: twelve FMAs, each using one register as both sources
// and destination, so no FMA reads a register written by another FMA.
// NOTE(review): the loop label and the branch on the iteration count are not
// visible in this excerpt.
121 "\tvfmadd231pd %%zmm0, %%zmm0, %%zmm0\n"
122 "\tvfmadd231pd %%zmm1, %%zmm1, %%zmm1\n"
123 "\tvfmadd231pd %%zmm2, %%zmm2, %%zmm2\n"
124 "\tvfmadd231pd %%zmm3, %%zmm3, %%zmm3\n"
125 "\tvfmadd231pd %%zmm4, %%zmm4, %%zmm4\n"
126 "\tvfmadd231pd %%zmm5, %%zmm5, %%zmm5\n"
127 "\tvfmadd231pd %%zmm6, %%zmm6, %%zmm6\n"
128 "\tvfmadd231pd %%zmm7, %%zmm7, %%zmm7\n"
129 "\tvfmadd231pd %%zmm8, %%zmm8, %%zmm8\n"
130 "\tvfmadd231pd %%zmm9, %%zmm9, %%zmm9\n"
131 "\tvfmadd231pd %%zmm10, %%zmm10, %%zmm10\n"
132 "\tvfmadd231pd %%zmm11, %%zmm11, %%zmm11\n"
// Timed work, shuffle half: twelve permutes that all read zmm30 and each write
// a different register (zmm12-zmm23); none touches an FMA register.
133 "\tvpermd %%zmm30, %%zmm30, %%zmm12\n"
134 "\tvpermd %%zmm30, %%zmm30, %%zmm13\n"
135 "\tvpermd %%zmm30, %%zmm30, %%zmm14\n"
136 "\tvpermd %%zmm30, %%zmm30, %%zmm15\n"
137 "\tvpermd %%zmm30, %%zmm30, %%zmm16\n"
138 "\tvpermd %%zmm30, %%zmm30, %%zmm17\n"
139 "\tvpermd %%zmm30, %%zmm30, %%zmm18\n"
140 "\tvpermd %%zmm30, %%zmm30, %%zmm19\n"
141 "\tvpermd %%zmm30, %%zmm30, %%zmm20\n"
142 "\tvpermd %%zmm30, %%zmm30, %%zmm21\n"
143 "\tvpermd %%zmm30, %%zmm30, %%zmm22\n"
144 "\tvpermd %%zmm30, %%zmm30, %%zmm23\n"
// End stamp: rebuild a 64-bit value in rdx from rdx (high half) and eax (low
// half), then subtract the start stamp saved in rbx; rdx holds the difference.
148 "\tsalq $32, %%rdx\n"
149 "\tmovl %%eax, %%eax\n"
150 "\torq %%rax, %%rdx\n"
151 "\tsubq %%rbx, %%rdx\n"
// Clobber list: the general-purpose and zmm registers written by the asm text
// above. rcx is listed although no visible instruction names it.
// NOTE(review): presumably written implicitly by an instruction not shown
// here - confirm.
155 : "rax", "rbx", "rcx", "rdx", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6",
156 "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16",
157 "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm30");
162 /*! \brief Loop over FMA AVX512 instructions
164 * This function executes a meaningless loop that includes only
165 * FMA instructions from the AVX512 instruction set.
166 * We need a bit of complex logic to make sure it cannot be
167 * optimized away by the compiler.
169 * \param loopCount Number of iterations. Each iteration will
170 * execute 12 FMA instructions.
171 * \return Number of cycles used for the loop.
173 uint64_t timeFmaOnlyLoop(uint64_t loopCount
)
176 // Unfortunately we need to resort to inline ASM since we are
177 // making a choice based on timing, and without efficient optimization
178 // (e.g. when doing debugging) the usual intrinsics are often implemented
179 // as independent load/store operations, which completely screws up timing.
180 __asm__
__volatile__(
// Setup: clear zmm0 by XOR-ing it with itself, then copy it into zmm1-zmm11.
// These twelve registers are the operands of the FMAs below.
181 "\tvpxord %%zmm0, %%zmm0, %%zmm0\n"
182 "\tvmovaps %%zmm0, %%zmm1\n"
183 "\tvmovaps %%zmm0, %%zmm2\n"
184 "\tvmovaps %%zmm0, %%zmm3\n"
185 "\tvmovaps %%zmm0, %%zmm4\n"
186 "\tvmovaps %%zmm0, %%zmm5\n"
187 "\tvmovaps %%zmm0, %%zmm6\n"
188 "\tvmovaps %%zmm0, %%zmm7\n"
189 "\tvmovaps %%zmm0, %%zmm8\n"
190 "\tvmovaps %%zmm0, %%zmm9\n"
191 "\tvmovaps %%zmm0, %%zmm10\n"
192 "\tvmovaps %%zmm0, %%zmm11\n"
// Start stamp: shift rdx up by 32 bits, zero-extend eax into rax, and OR the
// two halves together into rbx.
// NOTE(review): the instruction that loads edx:eax (presumably a
// timestamp-counter read) is not visible in this excerpt - confirm.
194 "\tsalq $32, %%rdx\n"
195 "\tmovl %%eax, %%eax\n"
196 "\tmovq %%rdx, %%rbx\n"
197 "\torq %%rax, %%rbx\n"
// Timed work: twelve FMAs, each using one register as both sources and
// destination, so no FMA reads a register written by another FMA. Unlike
// timeFmaAndShuffleLoop there are no vpermd instructions here.
// NOTE(review): the loop label and the branch on the iteration count are not
// visible in this excerpt.
200 "\tvfmadd231pd %%zmm0, %%zmm0, %%zmm0\n"
201 "\tvfmadd231pd %%zmm1, %%zmm1, %%zmm1\n"
202 "\tvfmadd231pd %%zmm2, %%zmm2, %%zmm2\n"
203 "\tvfmadd231pd %%zmm3, %%zmm3, %%zmm3\n"
204 "\tvfmadd231pd %%zmm4, %%zmm4, %%zmm4\n"
205 "\tvfmadd231pd %%zmm5, %%zmm5, %%zmm5\n"
206 "\tvfmadd231pd %%zmm6, %%zmm6, %%zmm6\n"
207 "\tvfmadd231pd %%zmm7, %%zmm7, %%zmm7\n"
208 "\tvfmadd231pd %%zmm8, %%zmm8, %%zmm8\n"
209 "\tvfmadd231pd %%zmm9, %%zmm9, %%zmm9\n"
210 "\tvfmadd231pd %%zmm10, %%zmm10, %%zmm10\n"
211 "\tvfmadd231pd %%zmm11, %%zmm11, %%zmm11\n"
// End stamp: rebuild a 64-bit value in rdx from rdx (high half) and eax (low
// half), then subtract the start stamp saved in rbx; rdx holds the difference.
215 "\tsalq $32, %%rdx\n"
216 "\tmovl %%eax, %%eax\n"
217 "\torq %%rax, %%rdx\n"
218 "\tsubq %%rbx, %%rdx\n"
// Clobber list: the general-purpose and zmm registers written by the asm text
// above. rcx is listed although no visible instruction names it.
// NOTE(review): presumably written implicitly by an instruction not shown
// here - confirm.
222 : "rax", "rbx", "rcx", "rdx", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6",
223 "zmm7", "zmm8", "zmm9", "zmm10", "zmm11");
// Compares the best observed time of the FMA+shuffle loop against the best
// observed time of the FMA-only loop.
// Returns true when the FMA+shuffle time is more than 1.5 times the FMA-only
// time; identifyAvx512FmaUnits() maps true to 2 and false to 1.
// NOTE(review): the reason a slowdown indicates two FMA units (presumably the
// shuffles compete with one of them for an execution port) is not stated in
// this file - confirm against the CPU documentation.
228 bool checkDualAvx512FmaUnits()
// Running minimum for the FMA+shuffle loop; seeded with 1e9 so that smaller
// measured values replace it in the std::min() calls below.
230 uint64_t timeFmaAndShuf
= static_cast<uint64_t>(1e9
); // Large value
232 // Make sure the CPU is in AVX512 mode by executing a fairly long loop.
233 // Use the return value to make sure it is not optimized away. Later invocations
234 // use fewer iterations, so they should always be faster.
235 uint64_t timeFmaOnly
= timeFmaOnlyLoop(100000);
237 // Execute the loops three times
238 for (int i
= 0; i
< 3; i
++)
// Each pass runs both loops with 1000 iterations and keeps the smaller of the
// stored and the newly measured time for each loop.
240 timeFmaAndShuf
= std::min(timeFmaAndShuf
, timeFmaAndShuffleLoop(1000));
241 timeFmaOnly
= std::min(timeFmaOnly
, timeFmaOnlyLoop(1000));
// The 1.5 literal is a double, so both uint64_t values are converted to
// double for this comparison.
244 return timeFmaAndShuf
> 1.5 * timeFmaOnly
;
247 #endif // GMX_X86_GCC_INLINE_ASM && SIMD_AVX_512_CXX_SUPPORTED
250 /*! \brief Mutex to guard the execution of the timing test
252 * We only execute the test once, and return the saved result
253 * on subsequent calls.
// Locked through a std::lock_guard inside identifyAvx512FmaUnits().
255 std::mutex initMutex
;
// Computes an int code describing the AVX-512 FMA timing test outcome.
// Values assigned to the result in this function:
//    2  checkDualAvx512FmaUnits() returned true
//    1  checkDualAvx512FmaUnits() returned false
//    0  the CPU was not detected as AVX-512 hardware
//   -1  AVX-512 hardware, but the timing test cannot be run (the branch
//       taken when GMX_X86_GCC_INLINE_ASM && SIMD_AVX_512_CXX_SUPPORTED
//       does not hold)
// NOTE(review): the return statement is not visible in this excerpt;
// presumably `result` is returned - confirm.
259 int identifyAvx512FmaUnits()
// Function-local statics: both keep their values between calls.
// NOTE(review): no read or write of `initialized` is visible in this excerpt.
// If it is tested outside the lock as a plain bool, concurrent callers would
// race on it - confirm against the full file.
261 static bool initialized
= false;
262 static int result
= 0;
// Holds initMutex for as long as `lock` stays in scope.
266 std::lock_guard
<std::mutex
> lock(initMutex
);
270 // For the standalone test binary we assume it will
271 // only be executed on AVX512 hardware, but for the
272 // library version we check the hardware support.
273 #ifdef GMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE
274 bool haveAvx512Hardware
= true;
// Library build: query the detected CPU for the X86_Avx512F feature flag.
276 bool haveAvx512Hardware
= CpuInfo::detect().feature(CpuInfo::Feature::X86_Avx512F
);
279 if (haveAvx512Hardware
)
281 #if GMX_X86_GCC_INLINE_ASM && SIMD_AVX_512_CXX_SUPPORTED
// The timing kernels are compiled in: run them and map true/false to 2/1.
282 result
= checkDualAvx512FmaUnits() ? 2 : 1;
284 result
= -1; // Cannot run the tests
289 result
= 0; // Not AVX-512 hardware
299 #ifdef GMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE
302 printf("%d\n", gmx::identifyAvx512FmaUnits());