/* Pointer to a test routine that takes no arguments and returns nothing. */
6 typedef void (*testfn
)(void);
/* Four 64-bit fields q0..q3; the attribute on the closing line gives the
 * v4di type 32-byte alignment. */
9 uint64_t q0
, q1
, q2
, q3
;
10 } __attribute__((aligned(32))) v4di
;
/*
 * Print one v4di as "<name><n> = " followed by its four 64-bit fields in
 * hex, q3 first and q0 last.
 *
 * Depending on ff the same bytes are then printed a second time with %g:
 * one branch prints four values (v[3]..v[0]); the branch taken when
 * ff == 32 prints eight values (v[7]..v[0]).
 * NOTE(review): the condition of the first branch and the declarations of
 * the local arrays v are not visible in this excerpt.
 */
35 static void dump_ymm(const char *name
, int n
, const v4di
*r
, int ff
)
37 printf("%s%d = %016lx %016lx %016lx %016lx\n",
38 name
, n
, r
->q3
, r
->q2
, r
->q1
, r
->q0
);
/* Copy the raw bytes of *r into the local array v so they can be printed
 * with a different element type. */
41 memcpy(v
, r
, sizeof(v
));
42 printf(" %16g %16g %16g %16g\n",
43 v
[3], v
[2], v
[1], v
[0]);
44 } else if (ff
== 32) {
/* Same byte copy, into the eight-element local array of this branch. */
46 memcpy(v
, r
, sizeof(v
));
47 printf(" %8g %8g %8g %8g %8g %8g %8g %8g\n",
48 v
[7], v
[6], v
[5], v
[4], v
[3], v
[2], v
[1], v
[0]);
/*
 * Dump a register state through dump_ymm: the 16 entries of s->ymm labelled
 * "ymm", then the 4 entries of s->mem0 labelled "mem". Both loops pass 0
 * as the ff argument.
 */
52 static void dump_regs(reg_state
*s
)
56 for (i
= 0; i
< 16; i
++) {
57 dump_ymm("ymm", i
, &s
->ymm
[i
], 0);
/* Note: the values shown under the "mem" label come from mem0. */
59 for (i
= 0; i
< 4; i
++) {
60 dump_ymm("mem", i
, &s
->mem0
[i
], 0);
/*
 * Print every difference between two register states.
 *
 * For mm[0..7], r[0..15], ymm[0..15] and flags, a is compared with b and
 * the value from b is printed on mismatch (ymm via dump_ymm with a->ff).
 *
 * The memory check is different: it compares a->mem0[i] with a->mem[i],
 * both taken from a, and dumps a->mem[i].
 * NOTE(review): presumably intentional -- init_all stores the addresses of
 * s->mem[0] and s->mem[2] in r[3] and r[5], and run_test passes init as a,
 * so memory written by a test would land in a->mem. Confirm.
 */
64 static void compare_state(const reg_state
*a
, const reg_state
*b
)
/* mm entries */
67 for (i
= 0; i
< 8; i
++) {
68 if (a
->mm
[i
] != b
->mm
[i
]) {
69 printf("MM%d = %016lx\n", i
, b
->mm
[i
]);
/* r entries */
72 for (i
= 0; i
< 16; i
++) {
73 if (a
->r
[i
] != b
->r
[i
]) {
74 printf("r%d = %016lx\n", i
, b
->r
[i
]);
/* ymm entries: 32 bytes each, compared bytewise */
77 for (i
= 0; i
< 16; i
++) {
78 if (memcmp(&a
->ymm
[i
], &b
->ymm
[i
], 32)) {
79 dump_ymm("ymm", i
, &b
->ymm
[i
], a
->ff
);
/* memory slots: a->mem0 (reference copy) against a->mem, see header */
82 for (i
= 0; i
< 4; i
++) {
83 if (memcmp(&a
->mem0
[i
], &a
->mem
[i
], 32)) {
84 dump_ymm("mem", i
, &a
->mem
[i
], a
->ff
);
/* flags */
87 if (a
->flags
!= b
->flags
) {
88 printf("FLAGS = %016lx\n", b
->flags
);
/*
 * String builders for the inline asm in run_test. r is a register name and
 * o a displacement; both are stringized.
 *   LOAD*  expands to "<op> r, o[%0]"  (operand 0 is the base)
 *   STORE* expands to "<op> o[%1], r"  (operand 1 is the base)
 * The MM variants use movq, the YMM variants vmovdqa. run_test passes init
 * as operand 0 and &result as operand 1.
 * NOTE(review): the destination-first operand order suggests Intel syntax
 * -- confirm against the build flags.
 */
92 #define LOADMM(r, o) "movq " #r ", " #o "[%0]\n\t"
93 #define LOADYMM(r, o) "vmovdqa " #r ", " #o "[%0]\n\t"
94 #define STOREMM(r, o) "movq " #o "[%1], " #r "\n\t"
95 #define STOREYMM(r, o) "vmovdqa " #o "[%1], " #r "\n\t"
/*
 * Same string-building pattern for "mov", but the memory operand is
 * addressed relative to rax instead of an asm operand:
 *   LOADREG  expands to "mov r, o[rax]"
 *   STOREREG expands to "mov o[rax], r"
 */
122 #define LOADREG(r, o) "mov " #r ", " #o "[rax]\n\t"
123 #define STOREREG(r, o) "mov " #o "[rax], " #r "\n\t"
/*
 * Run a single test described by *t.
 *
 * Visible steps: take t->init as the initial state, reset init->mem from
 * init->mem0, print the test number and name (t->n, t->s), execute an
 * inline-asm sequence, then call compare_state(init, &result) to report
 * what changed.
 *
 * Only fragments of the asm sequence appear in this excerpt; the meaning of
 * the individual displacements (0x240, 0x270, 0x278, 0x2c0) cannot be
 * derived from the lines shown here.
 */
139 static void run_test(const TestDef *t)
142 reg_state
*init
= t
->init
;
/* Start every run from the reference memory image. */
143 memcpy(init
->mem
, init
->mem0
, sizeof(init
->mem
));
144 printf("%5d %s\n", t
->n
, t
->s
);
160 "mov rcx, 0x2c0[rax]\n\t"
166 "mov rax, 0x240[rax]\n\t"
169 "mov rax, 8[rsp]\n\t"
172 "mov 0x240[rax], rbx\n\t"
174 "mov 0x270[rax], rbx\n\t"
175 "mov 0x278[rax], rbx\n\t"
179 "mov 0x2c0[rax], rbx\n\t"
/* No output operands; inputs are %0 = init, %1 = &result, %2 = t->fn. */
188 : : "r"(init
), "r"(&result
), "r"(t
->fn
)
/* Clobber list (only partly visible here): r8-r15, mm0-mm7, ymm0-ymm15. */
191 "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
192 "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7",
193 "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
194 "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11",
195 "ymm12", "ymm13", "ymm14", "ymm15"
/* Report every field of result that differs from the initial state. */
197 compare_state(init
, &result
);
/*
 * First expansion of test-avx.h: each TEST(n, cmd, type) entry defines a
 * naked function test_<n>; the visible part of its body is a "ret"
 * instruction. Some continuation lines of the macro are not visible in
 * this excerpt.
 */
200 #define TEST(n, cmd, type) \
201 static void __attribute__((naked)) test_##n(void) \
204 asm volatile("ret"); \
206 #include "test-avx.h"
/*
 * Second expansion of test-avx.h: each TEST(n, cmd, type) entry becomes one
 * TestDef initializer {n, test_<n>, cmd, &init<type>}.
 */
209 static const TestDef test_table
[] = {
210 #define TEST(n, cmd, type) {n, test_##n, cmd, &init##type},
211 #include "test-avx.h"
/*
 * Iterate over test_table from its first entry until an entry whose fn
 * member is null is reached. The loop body is not visible in this excerpt.
 */
215 static void run_all(void)
218 for (t
= test_table
; t
->fn
; t
++) {
/*
 * Number of elements in the array x. x must be a true array, not a pointer
 * (a pointer would silently yield sizeof(pointer) / sizeof(element)).
 * The argument is parenthesized before subscripting so that any
 * array-valued expression can be passed without precedence surprises.
 */
#define ARRAY_LEN(x) (sizeof(x) / sizeof((x)[0]))
/*
 * Seed values for the floating-point register images.
 *   val_f16: eight raw 16-bit patterns (presumably half-precision
 *            encodings -- confirm); copied bytewise by init_f16reg.
 *   val_f32: nine float values.
 *   val_f64: eight double values (the same list as val_f32 without the
 *            trailing 8.3).
 */
225 uint16_t val_f16
[] = { 0x4000, 0xbc00, 0x44cd, 0x3a66, 0x4200, 0x7a1a, 0x4780, 0x4826 };
226 float val_f32
[] = {2.0, -1.0, 4.8, 0.8, 3, -42.0, 5e6
, 7.5, 8.3};
227 double val_f64
[] = {2.0, -1.0, 4.8, 0.8, 3, -42.0, 5e6
, 7.5};
/*
 * Three rows of four 64-bit constants each.
 * NOTE(review): presumably the initializer rows of val_i64, the table that
 * init_intreg indexes by field q0..q3 -- the declaration line is not
 * visible in this excerpt.
 */
229 {0x3d6b3b6a9e4118f2lu
, 0x355ae76d2774d78clu
,
230 0xac3ff76c4daa4b28lu
, 0xe7fabd204cb54083lu
},
231 {0xd851c54a56bf1f29lu
, 0x4a84d1d50bf4c4fflu
,
232 0x56621e553d52b56clu
, 0xd0069553da8f584alu
},
233 {0x5826475e2c5fd799lu
, 0xfd32edc01243f5e9lu
,
234 0x738ba2c66d3fe126lu
, 0x5707219c6e6c26b4lu
},
/*
 * Filler pattern: all four 64-bit fields hold 0xa5a5a5a5deadbeef.
 * init_all copies it into every ymm[] and mem0[] slot.
 */
237 v4di deadbeef
= {0xa5a5a5a5deadbeefull
, 0xa5a5a5a5deadbeefull
,
238 0xa5a5a5a5deadbeefull
, 0xa5a5a5a5deadbeefull
};
239 /* &gather_mem[0x10] is 512 bytes from the base; indices must be >=-64, <64
240 * to account for scaling by 8 */
/*
 * Index vectors.
 * indexq: read as signed 64-bit values the fields are 31, 61, -1 and -33.
 * indexd: NOTE(review): presumably two 32-bit indices packed per 64-bit
 * field (e.g. 0x00000002 / 0xffffffcd in the first one) -- confirm.
 */
241 v4di indexq
= {0x000000000000001full
, 0x000000000000003dull
,
242 0xffffffffffffffffull
, 0xffffffffffffffdfull
};
243 v4di indexd
= {0x00000002ffffffcdull
, 0xfffffff500000010ull
,
244 0x0000003afffffff0ull
, 0x000000000000000eull
};
246 v4di gather_mem
[0x20];
247 _Static_assert(sizeof(gather_mem
) == 1024);
249 void init_f16reg(v4di
*r
)
251 memset(r
, 0, sizeof(*r
));
252 memcpy(r
, val_f16
, sizeof(val_f16
));
/*
 * Initialise *r from a local buffer v that is built in an 8-iteration loop;
 * the whole of *r is then overwritten with memcpy. Inside the loop a
 * counter n is compared against the number of elements of val_f32.
 * NOTE(review): the loop body, the declarations of v and n, and the action
 * taken when n reaches the table length are not visible in this excerpt.
 */
255 void init_f32reg(v4di
*r
)
260 for (i
= 0; i
< 8; i
++) {
262 if (n
== ARRAY_LEN(val_f32
)) {
/* Copy sizeof(*r) bytes of the assembled values into the destination. */
266 memcpy(r
, v
, sizeof(*r
));
/*
 * Initialise *r from a local buffer v that is built in a 4-iteration loop;
 * the whole of *r is then overwritten with memcpy. Inside the loop a
 * counter n is compared against the number of elements of val_f64.
 * NOTE(review): the loop body, the declarations of v and n, and the action
 * taken when n reaches the table length are not visible in this excerpt.
 */
269 void init_f64reg(v4di
*r
)
274 for (i
= 0; i
< 4; i
++) {
276 if (n
== ARRAY_LEN(val_f64
)) {
/* Copy sizeof(*r) bytes of the assembled values into the destination. */
280 memcpy(r
, v
, sizeof(*r
));
/*
 * Initialise *r from row n of val_i64: each field q0..q3 is the matching
 * table field XORed with mask.
 *
 * mask is a function-local static, so it starts at zero and keeps its value
 * between calls. After the fields are written, n is compared against the
 * number of rows in val_i64.
 * NOTE(review): the declaration and update of n, and whatever happens to n
 * and mask when the end of the table is reached, are not visible in this
 * excerpt.
 */
283 void init_intreg(v4di
*r
)
285 static uint64_t mask
;
288 r
->q0
= val_i64
[n
].q0
^ mask
;
289 r
->q1
= val_i64
[n
].q1
^ mask
;
290 r
->q2
= val_i64
[n
].q2
^ mask
;
291 r
->q3
= val_i64
[n
].q3
^ mask
;
293 if (n
== ARRAY_LEN(val_i64
)) {
/*
 * Set up the common part of a register state.
 *
 * Visible effects:
 *   - r[3] holds the address of s->mem[0], r[4] the address of the middle
 *     element of gather_mem, and r[5] the address of s->mem[2]; the
 *     trailing comments name the registers these slots stand for;
 *   - all 16 ymm slots and all 4 mem0 slots are set to deadbeef.
 * Other statements of the function are not visible in this excerpt.
 */
299 static void init_all(reg_state
*s
)
303 s
->r
[3] = (uint64_t)&s
->mem
[0]; /* rdx */
304 s
->r
[4] = (uint64_t)&gather_mem
[ARRAY_LEN(gather_mem
) / 2]; /* rsi */
305 s
->r
[5] = (uint64_t)&s
->mem
[2]; /* rdi */
307 for (i
= 0; i
< 16; i
++) {
308 s
->ymm
[i
] = deadbeef
;
312 for (i
= 0; i
< 4; i
++) {
313 s
->mem0
[i
] = deadbeef
;
/*
 * Program entry point.
 *
 * Visible steps: for each of the four initial states (initI, initF16,
 * initF32, initF64) the slots ymm[0], ymm[9..12] and mem0[1] are filled by
 * the matching init_* helper; every element of gather_mem is filled with
 * init_intreg; finally the test at table index atoi(argv[1]) is run.
 *
 * NOTE(review): atoi reports no errors, and no range check on n is visible
 * before it indexes test_table -- the guarding lines, if any, are outside
 * this excerpt. Confirm that argc and n are validated.
 */
317 int main(int argc
, char *argv
[])
/* Integer state */
322 init_intreg(&initI
.ymm
[0]);
323 init_intreg(&initI
.ymm
[9]);
324 init_intreg(&initI
.ymm
[10]);
325 init_intreg(&initI
.ymm
[11]);
326 init_intreg(&initI
.ymm
[12]);
327 init_intreg(&initI
.mem0
[1]);
/* 16-bit float state */
332 init_f16reg(&initF16
.ymm
[0]);
333 init_f16reg(&initF16
.ymm
[9]);
334 init_f16reg(&initF16
.ymm
[10]);
335 init_f16reg(&initF16
.ymm
[11]);
336 init_f16reg(&initF16
.ymm
[12]);
337 init_f16reg(&initF16
.mem0
[1]);
/* 32-bit float state */
343 init_f32reg(&initF32
.ymm
[0]);
344 init_f32reg(&initF32
.ymm
[9]);
345 init_f32reg(&initF32
.ymm
[10]);
346 init_f32reg(&initF32
.ymm
[11]);
347 init_f32reg(&initF32
.ymm
[12]);
348 init_f32reg(&initF32
.mem0
[1]);
/* 64-bit float state */
354 init_f64reg(&initF64
.ymm
[0]);
355 init_f64reg(&initF64
.ymm
[9]);
356 init_f64reg(&initF64
.ymm
[10]);
357 init_f64reg(&initF64
.ymm
[11]);
358 init_f64reg(&initF64
.ymm
[12]);
359 init_f64reg(&initF64
.mem0
[1]);
/* Fill the whole gather_mem block with integer patterns. */
364 for (i
= 0; i
< ARRAY_LEN(gather_mem
); i
++) {
365 init_intreg(&gather_mem
[i
]);
/* Run the single test selected on the command line. */
369 int n
= atoi(argv
[1]);
370 run_test(&test_table
[n
]);