/*
 * Pointer to a function that takes no arguments and returns nothing.
 * The test_<n> routines generated by the TEST() macro have this shape.
 */
typedef void (*testfn)(void);
9 uint64_t q0
, q1
, q2
, q3
;
10 } __attribute__((aligned(32))) v4di
;
35 static void dump_ymm(const char *name
, int n
, const v4di
*r
, int ff
)
37 printf("%s%d = %016lx %016lx %016lx %016lx\n",
38 name
, n
, r
->q3
, r
->q2
, r
->q1
, r
->q0
);
41 memcpy(v
, r
, sizeof(v
));
42 printf(" %16g %16g %16g %16g\n",
43 v
[3], v
[2], v
[1], v
[0]);
44 } else if (ff
== 32) {
46 memcpy(v
, r
, sizeof(v
));
47 printf(" %8g %8g %8g %8g %8g %8g %8g %8g\n",
48 v
[7], v
[6], v
[5], v
[4], v
[3], v
[2], v
[1], v
[0]);
52 static void dump_regs(reg_state
*s
)
56 for (i
= 0; i
< 16; i
++) {
57 dump_ymm("ymm", i
, &s
->ymm
[i
], 0);
59 for (i
= 0; i
< 4; i
++) {
60 dump_ymm("mem", i
, &s
->mem0
[i
], 0);
64 static void compare_state(const reg_state
*a
, const reg_state
*b
)
67 for (i
= 0; i
< 8; i
++) {
68 if (a
->mm
[i
] != b
->mm
[i
]) {
69 printf("MM%d = %016lx\n", i
, b
->mm
[i
]);
72 for (i
= 0; i
< 16; i
++) {
73 if (a
->r
[i
] != b
->r
[i
]) {
74 printf("r%d = %016lx\n", i
, b
->r
[i
]);
77 for (i
= 0; i
< 16; i
++) {
78 if (memcmp(&a
->ymm
[i
], &b
->ymm
[i
], 32)) {
79 dump_ymm("ymm", i
, &b
->ymm
[i
], a
->ff
);
82 for (i
= 0; i
< 4; i
++) {
83 if (memcmp(&a
->mem0
[i
], &a
->mem
[i
], 32)) {
84 dump_ymm("mem", i
, &a
->mem
[i
], a
->ff
);
87 if (a
->flags
!= b
->flags
) {
88 printf("FLAGS = %016lx\n", b
->flags
);
/*
 * Building blocks for an inline-asm template. Each macro expands to a single
 * instruction string terminated by "\n\t", written destination-first with
 * offset[base] memory operands.
 *
 *   r - register name, pasted verbatim into the instruction text
 *   o - displacement, pasted verbatim in front of the base operand
 *
 * LOAD* read the register from memory addressed through asm operand %0;
 * STORE* write the register to memory addressed through asm operand %1.
 * NOTE(review): vmovdqa faults on unaligned addresses, so the YMM slots
 * reached this way must stay 32-byte aligned -- confirm against the layout
 * of the structure the operands point to.
 */
#define LOADMM(r, o) "movq " #r ", " #o "[%0]\n\t"
#define LOADYMM(r, o) "vmovdqa " #r ", " #o "[%0]\n\t"
#define STOREMM(r, o) "movq " #o "[%1], " #r "\n\t"
#define STOREYMM(r, o) "vmovdqa " #o "[%1], " #r "\n\t"
/*
 * Instruction strings that move a general-purpose register to or from a
 * fixed displacement off rax (destination-first operand order).
 *
 *   r - register name, pasted verbatim
 *   o - displacement from rax, pasted verbatim
 *
 * LOADREG reads r from o[rax]; STOREREG writes r to o[rax]. Both assume the
 * surrounding asm has already placed the base address in rax.
 */
#define LOADREG(r, o) "mov " #r ", " #o "[rax]\n\t"
#define STOREREG(r, o) "mov " #o "[rax], " #r "\n\t"
139 static void run_test(const TestDef *t)
142 reg_state
*init
= t
->init
;
143 memcpy(init
->mem
, init
->mem0
, sizeof(init
->mem
));
144 printf("%5d %s\n", t
->n
, t
->s
);
160 "mov rcx, 0x2c0[rax]\n\t"
166 "mov rax, 0x240[rax]\n\t"
169 "mov rax, 8[rsp]\n\t"
172 "mov 0x240[rax], rbx\n\t"
174 "mov 0x270[rax], rbx\n\t"
175 "mov 0x278[rax], rbx\n\t"
179 "mov 0x2c0[rax], rbx\n\t"
188 : : "r"(init
), "r"(&result
), "r"(t
->fn
)
191 "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
192 "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7",
193 "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
194 "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11",
195 "ymm12", "ymm13", "ymm14", "ymm15"
197 compare_state(init
, &result
);
200 #define TEST(n, cmd, type) \
201 static void __attribute__((naked)) test_##n(void) \
204 asm volatile("ret"); \
206 #include "test-avx.h"
209 static const TestDef test_table
[] = {
210 #define TEST(n, cmd, type) {n, test_##n, cmd, &init##type},
211 #include "test-avx.h"
215 static void run_all(void)
218 for (t
= test_table
; t
->fn
; t
++) {
/*
 * Element count of a true array. Only valid on array objects: a pointer
 * argument would silently yield sizeof(pointer) / sizeof(element).
 * The argument is parenthesized before indexing so that an expression
 * argument is subscripted as a whole.
 */
#define ARRAY_LEN(x) (sizeof(x) / sizeof((x)[0]))
/*
 * Sample operand tables, one per element width.
 *
 * val_f16 holds raw 16-bit patterns rather than C floating-point values
 * (e.g. 0x4000 and 0xbc00 are 2.0 and -1.0 in IEEE binary16). Its eight
 * entries occupy 16 bytes.
 * val_f32 has nine entries and val_f64 has eight; the lengths differ on
 * purpose in the original data and are kept as-is.
 */
uint16_t val_f16[] = { 0x4000, 0xbc00, 0x44cd, 0x3a66, 0x4200, 0x7a1a, 0x4780, 0x4826 };
float val_f32[] = {2.0, -1.0, 4.8, 0.8, 3, -42.0, 5e6, 7.5, 8.3};
double val_f64[] = {2.0, -1.0, 4.8, 0.8, 3, -42.0, 5e6, 7.5};
229 {0x3d6b3b6a9e4118f2lu
, 0x355ae76d2774d78clu
,
230 0xac3ff76c4daa4b28lu
, 0xe7fabd204cb54083lu
},
231 {0xd851c54a56bf1f29lu
, 0x4a84d1d50bf4c4fflu
,
232 0x56621e553d52b56clu
, 0xd0069553da8f584alu
},
233 {0x5826475e2c5fd799lu
, 0xfd32edc01243f5e9lu
,
234 0x738ba2c66d3fe126lu
, 0x5707219c6e6c26b4lu
},
237 v4di deadbeef
= {0xa5a5a5a5deadbeefull
, 0xa5a5a5a5deadbeefull
,
238 0xa5a5a5a5deadbeefull
, 0xa5a5a5a5deadbeefull
};
239 v4di indexq
= {0x000000000000001full
, 0x000000000000008full
,
240 0xffffffffffffffffull
, 0xffffffffffffff5full
};
241 v4di indexd
= {0x00000002000000efull
, 0xfffffff500000010ull
,
242 0x0000000afffffff0ull
, 0x000000000000000eull
};
244 v4di gather_mem
[0x20];
246 void init_f16reg(v4di
*r
)
248 memset(r
, 0, sizeof(*r
));
249 memcpy(r
, val_f16
, sizeof(val_f16
));
252 void init_f32reg(v4di
*r
)
257 for (i
= 0; i
< 8; i
++) {
259 if (n
== ARRAY_LEN(val_f32
)) {
263 memcpy(r
, v
, sizeof(*r
));
266 void init_f64reg(v4di
*r
)
271 for (i
= 0; i
< 4; i
++) {
273 if (n
== ARRAY_LEN(val_f64
)) {
277 memcpy(r
, v
, sizeof(*r
));
280 void init_intreg(v4di
*r
)
282 static uint64_t mask
;
285 r
->q0
= val_i64
[n
].q0
^ mask
;
286 r
->q1
= val_i64
[n
].q1
^ mask
;
287 r
->q2
= val_i64
[n
].q2
^ mask
;
288 r
->q3
= val_i64
[n
].q3
^ mask
;
290 if (n
== ARRAY_LEN(val_i64
)) {
296 static void init_all(reg_state
*s
)
300 s
->r
[3] = (uint64_t)&s
->mem
[0]; /* rdx */
301 s
->r
[4] = (uint64_t)&gather_mem
[ARRAY_LEN(gather_mem
) / 2]; /* rsi */
302 s
->r
[5] = (uint64_t)&s
->mem
[2]; /* rdi */
304 for (i
= 0; i
< 16; i
++) {
305 s
->ymm
[i
] = deadbeef
;
309 for (i
= 0; i
< 4; i
++) {
310 s
->mem0
[i
] = deadbeef
;
314 int main(int argc
, char *argv
[])
319 init_intreg(&initI
.ymm
[10]);
320 init_intreg(&initI
.ymm
[11]);
321 init_intreg(&initI
.ymm
[12]);
322 init_intreg(&initI
.mem0
[1]);
327 init_f16reg(&initF16
.ymm
[10]);
328 init_f16reg(&initF16
.ymm
[11]);
329 init_f16reg(&initF16
.ymm
[12]);
330 init_f16reg(&initF16
.mem0
[1]);
336 init_f32reg(&initF32
.ymm
[10]);
337 init_f32reg(&initF32
.ymm
[11]);
338 init_f32reg(&initF32
.ymm
[12]);
339 init_f32reg(&initF32
.mem0
[1]);
345 init_f64reg(&initF64
.ymm
[10]);
346 init_f64reg(&initF64
.ymm
[11]);
347 init_f64reg(&initF64
.ymm
[12]);
348 init_f64reg(&initF64
.mem0
[1]);
353 for (i
= 0; i
< ARRAY_LEN(gather_mem
); i
++) {
354 init_intreg(&gather_mem
[i
]);
358 int n
= atoi(argv
[1]);
359 run_test(&test_table
[n
]);