// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx908 -DMFMA_GFX908_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX908
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90a -DMFMA_GFX90A_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX90A
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -DMFMA_GFX940_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX940
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx950 -DMFMA_GFX950_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX950
// Enable the cl_khr_fp64 extension; the f64 tests in this file use double.
#pragma OPENCL EXTENSION cl_khr_fp64:enable

// Vector types used as operand, accumulator and result types by the tests.
typedef float v2f __attribute__((ext_vector_type(2)));
typedef float v4f __attribute__((ext_vector_type(4)));
typedef float v16f __attribute__((ext_vector_type(16)));
typedef float v32f __attribute__((ext_vector_type(32)));
typedef half v4h __attribute__((ext_vector_type(4)));
typedef half v8h __attribute__((ext_vector_type(8)));
typedef half v16h __attribute__((ext_vector_type(16)));
typedef half v32h __attribute__((ext_vector_type(32)));
typedef int v2i __attribute__((ext_vector_type(2)));
typedef int v4i __attribute__((ext_vector_type(4)));
typedef int v16i __attribute__((ext_vector_type(16)));
typedef int v32i __attribute__((ext_vector_type(32)));
typedef short v2s __attribute__((ext_vector_type(2)));
typedef short v4s __attribute__((ext_vector_type(4)));
typedef short v8s __attribute__((ext_vector_type(8)));
typedef short v16s __attribute__((ext_vector_type(16)));
typedef short v32s __attribute__((ext_vector_type(32)));
typedef double v4d __attribute__((ext_vector_type(4)));
typedef __bf16 v8bf16 __attribute__((ext_vector_type(8)));
// Tests below are compiled only by the RUN line that defines MFMA_GFX908_TESTS.
#ifdef MFMA_GFX908_TESTS
// The builtin must lower to llvm.amdgcn.mfma.f32.32x32x1f32, forwarding a, b,
// c and the three literal 0 arguments; the result is stored through out.
// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x1f32
// CHECK-GFX908: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %a, float %b, <32 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x1f32(global v32f* out, float a, float b, v32f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x1f32(a, b, c, 0, 0, 0);
}
// The builtin must lower to llvm.amdgcn.mfma.f32.16x16x1f32, forwarding a, b,
// c and the three literal 0 arguments; the result is stored through out.
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x1f32
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float %a, float %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x1f32(global v16f* out, float a, float b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x1f32(a, b, c, 0, 0, 0);
}
// The builtin must lower to llvm.amdgcn.mfma.f32.4x4x1f32, forwarding a, b,
// c and the three literal 0 arguments; the result is stored through out.
// CHECK-GFX908-LABEL: @test_mfma_f32_4x4x1f32
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float %a, float %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_4x4x1f32(global v4f* out, float a, float b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_4x4x1f32(a, b, c, 0, 0, 0);
}
// The builtin must lower to llvm.amdgcn.mfma.f32.32x32x2f32, forwarding a, b,
// c and the three literal 0 arguments; the result is stored through out.
// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x2f32
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float %a, float %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x2f32(global v16f* out, float a, float b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x2f32(a, b, c, 0, 0, 0);
}
// The builtin must lower to llvm.amdgcn.mfma.f32.16x16x4f32, forwarding a, b,
// c and the three literal 0 arguments; the result is stored through out.
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x4f32
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float %a, float %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x4f32(global v4f* out, float a, float b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x4f32(a, b, c, 0, 0, 0);
}
// The builtin must lower to llvm.amdgcn.mfma.f32.32x32x4f16 with the half
// vectors a, b, the accumulator c and the three literal 0 arguments forwarded;
// the result is stored through out.
// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x4f16
// CHECK-GFX908: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %a, <4 x half> %b, <32 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x4f16(global v32f* out, v4h a, v4h b, v32f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x4f16(a, b, c, 0, 0, 0);
}
// The builtin must lower to llvm.amdgcn.mfma.f32.16x16x4f16 with the half
// vectors a, b, the accumulator c and the three literal 0 arguments forwarded;
// the result is stored through out.
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x4f16
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half> %a, <4 x half> %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x4f16(global v16f* out, v4h a, v4h b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x4f16(a, b, c, 0, 0, 0);
}
// The builtin must lower to llvm.amdgcn.mfma.f32.4x4x4f16 with the half
// vectors a, b, the accumulator c and the three literal 0 arguments forwarded;
// the result is stored through out.
// CHECK-GFX908-LABEL: @test_mfma_f32_4x4x4f16
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half> %a, <4 x half> %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_4x4x4f16(global v4f* out, v4h a, v4h b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_4x4x4f16(a, b, c, 0, 0, 0);
}
// The builtin must lower to llvm.amdgcn.mfma.f32.32x32x8f16 with the half
// vectors a, b, the accumulator c and the three literal 0 arguments forwarded;
// the result is stored through out.
// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x8f16
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> %a, <4 x half> %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x8f16(global v16f* out, v4h a, v4h b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x8f16(a, b, c, 0, 0, 0);
}
// The builtin must lower to llvm.amdgcn.mfma.f32.16x16x16f16 with the half
// vectors a, b, the accumulator c and the three literal 0 arguments forwarded;
// the result is stored through out.
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x16f16
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %a, <4 x half> %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x16f16(global v4f* out, v4h a, v4h b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x16f16(a, b, c, 0, 0, 0);
}
// The builtin must lower to llvm.amdgcn.mfma.i32.32x32x4i8 with the int
// operands a, b, the accumulator c and the three literal 0 arguments
// forwarded; the result is stored through out.
// CHECK-GFX908-LABEL: @test_mfma_i32_32x32x4i8
// CHECK-GFX908: call <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32 %a, i32 %b, <32 x i32> %c, i32 0, i32 0, i32 0)
void test_mfma_i32_32x32x4i8(global v32i* out, int a, int b, v32i c)
{
  *out = __builtin_amdgcn_mfma_i32_32x32x4i8(a, b, c, 0, 0, 0);
}
// The builtin must lower to llvm.amdgcn.mfma.i32.16x16x4i8 with the int
// operands a, b, the accumulator c and the three literal 0 arguments
// forwarded; the result is stored through out.
// CHECK-GFX908-LABEL: @test_mfma_i32_16x16x4i8
// CHECK-GFX908: call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 %a, i32 %b, <16 x i32> %c, i32 0, i32 0, i32 0)
void test_mfma_i32_16x16x4i8(global v16i* out, int a, int b, v16i c)
{
  *out = __builtin_amdgcn_mfma_i32_16x16x4i8(a, b, c, 0, 0, 0);
}
// The builtin must lower to llvm.amdgcn.mfma.i32.4x4x4i8 with the int
// operands a, b, the accumulator c and the three literal 0 arguments
// forwarded; the result is stored through out.
// CHECK-GFX908-LABEL: @test_mfma_i32_4x4x4i8
// CHECK-GFX908: call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 %a, i32 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
void test_mfma_i32_4x4x4i8(global v4i* out, int a, int b, v4i c)
{
  *out = __builtin_amdgcn_mfma_i32_4x4x4i8(a, b, c, 0, 0, 0);
}
// The builtin must lower to llvm.amdgcn.mfma.i32.32x32x8i8 with the int
// operands a, b, the accumulator c and the three literal 0 arguments
// forwarded; the result is stored through out.
// CHECK-GFX908-LABEL: @test_mfma_i32_32x32x8i8
// CHECK-GFX908: call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 %a, i32 %b, <16 x i32> %c, i32 0, i32 0, i32 0)
void test_mfma_i32_32x32x8i8(global v16i* out, int a, int b, v16i c)
{
  *out = __builtin_amdgcn_mfma_i32_32x32x8i8(a, b, c, 0, 0, 0);
}
// The builtin must lower to llvm.amdgcn.mfma.i32.16x16x16i8 with the int
// operands a, b, the accumulator c and the three literal 0 arguments
// forwarded; the result is stored through out.
// CHECK-GFX908-LABEL: @test_mfma_i32_16x16x16i8
// CHECK-GFX908: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 %a, i32 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
void test_mfma_i32_16x16x16i8(global v4i* out, int a, int b, v4i c)
{
  *out = __builtin_amdgcn_mfma_i32_16x16x16i8(a, b, c, 0, 0, 0);
}
// The builtin must lower to llvm.amdgcn.mfma.f32.32x32x2bf16; the short
// vectors a and b are expected as <2 x i16> operands, followed by c and the
// three literal 0 arguments. The result is stored through out.
// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x2bf16
// CHECK-GFX908: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x2bf16(global v32f* out, v2s a, v2s b, v32f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x2bf16(a, b, c, 0, 0, 0);
}
// The builtin must lower to llvm.amdgcn.mfma.f32.16x16x2bf16; the short
// vectors a and b are expected as <2 x i16> operands, followed by c and the
// three literal 0 arguments. The result is stored through out.
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x2bf16
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x2bf16(global v16f* out, v2s a, v2s b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x2bf16(a, b, c, 0, 0, 0);
}
// The builtin must lower to llvm.amdgcn.mfma.f32.4x4x2bf16; the short
// vectors a and b are expected as <2 x i16> operands, followed by c and the
// three literal 0 arguments. The result is stored through out.
// CHECK-GFX908-LABEL: @test_mfma_f32_4x4x2bf16
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_4x4x2bf16(global v4f* out, v2s a, v2s b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_4x4x2bf16(a, b, c, 0, 0, 0);
}
// The builtin must lower to llvm.amdgcn.mfma.f32.32x32x4bf16; the short
// vectors a and b are expected as <2 x i16> operands, followed by c and the
// three literal 0 arguments. The result is stored through out.
// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x4bf16
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x4bf16(global v16f* out, v2s a, v2s b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x4bf16(a, b, c, 0, 0, 0);
}
// The builtin must lower to llvm.amdgcn.mfma.f32.16x16x8bf16; the short
// vectors a and b are expected as <2 x i16> operands, followed by c and the
// three literal 0 arguments. The result is stored through out.
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x8bf16
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x8bf16(global v4f* out, v2s a, v2s b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x8bf16(a, b, c, 0, 0, 0);
}
#endif // MFMA_GFX908_TESTS

// Tests below are compiled only by the RUN line that defines MFMA_GFX90A_TESTS.
#ifdef MFMA_GFX90A_TESTS
// The builtin must lower to llvm.amdgcn.mfma.f32.32x32x4bf16.1k; the short
// vectors a and b are expected as <4 x i16> operands, followed by c and the
// three literal 0 arguments. The result is stored through out.
// CHECK-GFX90A-LABEL: @test_mfma_f32_32x32x4bf16_1k
// CHECK-GFX90A: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <32 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x4bf16_1k(global v32f* out, v4s a, v4s b, v32f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x4bf16_1k(a, b, c, 0, 0, 0);
}
// The builtin must lower to llvm.amdgcn.mfma.f32.16x16x4bf16.1k; the short
// vectors a and b are expected as <4 x i16> operands, followed by c and the
// three literal 0 arguments. The result is stored through out.
// CHECK-GFX90A-LABEL: @test_mfma_f32_16x16x4bf16_1k
// CHECK-GFX90A: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x4bf16_1k(global v16f* out, v4s a, v4s b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x4bf16_1k(a, b, c, 0, 0, 0);
}
// The builtin must lower to llvm.amdgcn.mfma.f32.4x4x4bf16.1k; the short
// vectors a and b are expected as <4 x i16> operands, followed by c and the
// three literal 0 arguments. The result is stored through out.
// CHECK-GFX90A-LABEL: @test_mfma_f32_4x4x4bf16_1k
// CHECK-GFX90A: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_4x4x4bf16_1k(global v4f* out, v4s a, v4s b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_4x4x4bf16_1k(a, b, c, 0, 0, 0);
}
// The builtin must lower to llvm.amdgcn.mfma.f32.32x32x8bf16.1k; the short
// vectors a and b are expected as <4 x i16> operands, followed by c and the
// three literal 0 arguments. The result is stored through out.
// CHECK-GFX90A-LABEL: @test_mfma_f32_32x32x8bf16_1k
// CHECK-GFX90A: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x8bf16_1k(global v16f* out, v4s a, v4s b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a, b, c, 0, 0, 0);
}
// The builtin must lower to llvm.amdgcn.mfma.f32.16x16x16bf16.1k; the short
// vectors a and b are expected as <4 x i16> operands, followed by c and the
// three literal 0 arguments. The result is stored through out.
// CHECK-GFX90A-LABEL: @test_mfma_f32_16x16x16bf16_1k
// CHECK-GFX90A: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x16bf16_1k(global v4f* out, v4s a, v4s b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(a, b, c, 0, 0, 0);
}
// The builtin must lower to llvm.amdgcn.mfma.f64.16x16x4f64 with the double
// operands a, b, the <4 x double> accumulator c and the three literal 0
// arguments forwarded; the result is stored through out.
// CHECK-GFX90A-LABEL: @test_mfma_f64_16x16x4f64
// CHECK-GFX90A: call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %c, i32 0, i32 0, i32 0)
void test_mfma_f64_16x16x4f64(global v4d* out, double a, double b, v4d c)
{
  *out = __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, c, 0, 0, 0);
}
// The builtin must lower to llvm.amdgcn.mfma.f64.4x4x4f64. Unlike the other
// tests here, the accumulator c and the result are scalar doubles; a, b, c and
// the three literal 0 arguments are forwarded and the result stored via out.
// CHECK-GFX90A-LABEL: @test_mfma_f64_4x4x4f64
// CHECK-GFX90A: call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double %c, i32 0, i32 0, i32 0)
void test_mfma_f64_4x4x4f64(global double* out, double a, double b, double c)
{
  *out = __builtin_amdgcn_mfma_f64_4x4x4f64(a, b, c, 0, 0, 0);
}
#endif // MFMA_GFX90A_TESTS

// Tests below are compiled when either macro is defined; their FileCheck
// directives all use the gfx940 check prefix.
#if defined(MFMA_GFX940_TESTS) || defined(MFMA_GFX950_TESTS)
// The builtin must lower to llvm.amdgcn.mfma.i32.16x16x32.i8; the long
// operands a and b are expected as i64, followed by c and the three literal 0
// arguments. The result is stored through out.
// CHECK-GFX940-LABEL: @test_mfma_i32_16x16x32_i8
// CHECK-GFX940: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 %a, i64 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
void test_mfma_i32_16x16x32_i8(global v4i* out, long a, long b, v4i c)
{
  *out = __builtin_amdgcn_mfma_i32_16x16x32_i8(a, b, c, 0, 0, 0);
}
// The builtin must lower to llvm.amdgcn.mfma.i32.32x32x16.i8; the long
// operands a and b are expected as i64, followed by c and the three literal 0
// arguments. The result is stored through out.
// CHECK-GFX940-LABEL: @test_mfma_i32_32x32x16_i8
// CHECK-GFX940: call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64 %a, i64 %b, <16 x i32> %c, i32 0, i32 0, i32 0)
void test_mfma_i32_32x32x16_i8(global v16i* out, long a, long b, v16i c)
{
  *out = __builtin_amdgcn_mfma_i32_32x32x16_i8(a, b, c, 0, 0, 0);
}
// The builtin must lower to llvm.amdgcn.mfma.f32.16x16x8.xf32 with the
// <2 x float> operands a, b, the accumulator c and the three literal 0
// arguments forwarded; the result is stored through out.
// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x8_xf32
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> %a, <2 x float> %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x8_xf32(global v4f* out, v2f a, v2f b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x8_xf32(a, b, c, 0, 0, 0);
}
// The builtin must lower to llvm.amdgcn.mfma.f32.32x32x4.xf32 with the
// <2 x float> operands a, b, the accumulator c and the three literal 0
// arguments forwarded; the result is stored through out.
// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x4_xf32
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> %a, <2 x float> %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x4_xf32(global v16f* out, v2f a, v2f b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x4_xf32(a, b, c, 0, 0, 0);
}
// The builtin must lower to llvm.amdgcn.mfma.f32.16x16x32.bf8.bf8; the long
// operands a and b are expected as i64, followed by c and the three literal 0
// arguments. The result is stored through out.
// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_bf8_bf8
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.bf8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x32_bf8_bf8(global v4f* out, long a, long b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x32_bf8_bf8(a, b, c, 0, 0, 0);
}
// The builtin must lower to llvm.amdgcn.mfma.f32.16x16x32.bf8.fp8; the long
// operands a and b are expected as i64, followed by c and the three literal 0
// arguments. The result is stored through out.
// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_bf8_fp8
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.fp8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x32_bf8_fp8(global v4f* out, long a, long b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x32_bf8_fp8(a, b, c, 0, 0, 0);
}
// The builtin must lower to llvm.amdgcn.mfma.f32.16x16x32.fp8.bf8; the long
// operands a and b are expected as i64, followed by c and the three literal 0
// arguments. The result is stored through out.
// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_fp8_bf8
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.bf8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x32_fp8_bf8(global v4f* out, long a, long b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x32_fp8_bf8(a, b, c, 0, 0, 0);
}
// The builtin must lower to llvm.amdgcn.mfma.f32.16x16x32.fp8.fp8; the long
// operands a and b are expected as i64, followed by c and the three literal 0
// arguments. The result is stored through out.
// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_fp8_fp8
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.fp8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x32_fp8_fp8(global v4f* out, long a, long b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8(a, b, c, 0, 0, 0);
}
// The builtin must lower to llvm.amdgcn.mfma.f32.32x32x16.bf8.bf8; the long
// operands a and b are expected as i64, followed by c and the three literal 0
// arguments. The result is stored through out.
// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x16_bf8_bf8
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.bf8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x16_bf8_bf8(global v16f* out, long a, long b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8(a, b, c, 0, 0, 0);
}
// The builtin must lower to llvm.amdgcn.mfma.f32.32x32x16.bf8.fp8; the long
// operands a and b are expected as i64, followed by c and the three literal 0
// arguments. The result is stored through out.
// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x16_bf8_fp8
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.fp8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x16_bf8_fp8(global v16f* out, long a, long b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8(a, b, c, 0, 0, 0);
}
// The builtin must lower to llvm.amdgcn.mfma.f32.32x32x16.fp8.bf8; the long
// operands a and b are expected as i64, followed by c and the three literal 0
// arguments. The result is stored through out.
// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x16_fp8_bf8
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.bf8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x16_fp8_bf8(global v16f* out, long a, long b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8(a, b, c, 0, 0, 0);
}
// The builtin must lower to llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8; the long
// operands a and b are expected as i64, followed by c and the three literal 0
// arguments. The result is stored through out.
// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x16_fp8_fp8
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x16_fp8_fp8(global v16f* out, long a, long b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8(a, b, c, 0, 0, 0);
}
// The builtin must lower to llvm.amdgcn.smfmac.f32.16x16x32.f16, forwarding
// a, b, c, the runtime idx and the two literal 0 arguments; the result is
// stored through out.
// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x32_f16
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.f16(<4 x half> %a, <8 x half> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_16x16x32_f16(global v4f* out, v4h a, v8h b, v4f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_16x16x32_f16(a, b, c, idx, 0, 0);
}
// The builtin must lower to llvm.amdgcn.smfmac.f32.32x32x16.f16, forwarding
// a, b, c, the runtime idx and the two literal 0 arguments; the result is
// stored through out.
// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x16_f16
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.f16(<4 x half> %a, <8 x half> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_32x32x16_f16(global v16f* out, v4h a, v8h b, v16f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_32x32x16_f16(a, b, c, idx, 0, 0);
}
// The builtin must lower to llvm.amdgcn.smfmac.f32.16x16x32.bf16; the short
// vectors a and b are expected as <4 x i16> and <8 x i16>, followed by c, the
// runtime idx and the two literal 0 arguments. The result is stored via out.
// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x32_bf16
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.bf16(<4 x i16> %a, <8 x i16> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_16x16x32_bf16(global v4f* out, v4s a, v8s b, v4f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_16x16x32_bf16(a, b, c, idx, 0, 0);
}
// The builtin must lower to llvm.amdgcn.smfmac.f32.32x32x16.bf16; the short
// vectors a and b are expected as <4 x i16> and <8 x i16>, followed by c, the
// runtime idx and the two literal 0 arguments. The result is stored via out.
// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x16_bf16
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.bf16(<4 x i16> %a, <8 x i16> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_32x32x16_bf16(global v16f* out, v4s a, v8s b, v16f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_32x32x16_bf16(a, b, c, idx, 0, 0);
}
// The builtin must lower to llvm.amdgcn.smfmac.i32.16x16x64.i8, forwarding
// the int vectors a, b, c, the runtime idx and the two literal 0 arguments;
// the result is stored through out.
// CHECK-GFX940-LABEL: @test_smfmac_i32_16x16x64_i8
// CHECK-GFX940: call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x64.i8(<2 x i32> %a, <4 x i32> %b, <4 x i32> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_i32_16x16x64_i8(global v4i* out, v2i a, v4i b, v4i c, int idx)
{
  *out = __builtin_amdgcn_smfmac_i32_16x16x64_i8(a, b, c, idx, 0, 0);
}
// The builtin must lower to llvm.amdgcn.smfmac.i32.32x32x32.i8, forwarding
// the int vectors a, b, c, the runtime idx and the two literal 0 arguments;
// the result is stored through out.
// CHECK-GFX940-LABEL: @test_smfmac_i32_32x32x32_i8
// CHECK-GFX940: call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x32.i8(<2 x i32> %a, <4 x i32> %b, <16 x i32> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_i32_32x32x32_i8(global v16i* out, v2i a, v4i b, v16i c, int idx)
{
  *out = __builtin_amdgcn_smfmac_i32_32x32x32_i8(a, b, c, idx, 0, 0);
}
// The builtin must lower to llvm.amdgcn.smfmac.f32.16x16x64.bf8.bf8 with the
// int vectors a, b, the float accumulator c, the runtime idx and the two
// literal 0 arguments forwarded; the result is stored through out.
// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_bf8_bf8
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_16x16x64_bf8_bf8(global v4f* out, v2i a, v4i b, v4f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_16x16x64_bf8_bf8(a, b, c, idx, 0, 0);
}
// The builtin must lower to llvm.amdgcn.smfmac.f32.16x16x64.bf8.fp8 with the
// int vectors a, b, the float accumulator c, the runtime idx and the two
// literal 0 arguments forwarded; the result is stored through out.
// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_bf8_fp8
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_16x16x64_bf8_fp8(global v4f* out, v2i a, v4i b, v4f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_16x16x64_bf8_fp8(a, b, c, idx, 0, 0);
}
// The builtin must lower to llvm.amdgcn.smfmac.f32.16x16x64.fp8.bf8 with the
// int vectors a, b, the float accumulator c, the runtime idx and the two
// literal 0 arguments forwarded; the result is stored through out.
// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_fp8_bf8
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_16x16x64_fp8_bf8(global v4f* out, v2i a, v4i b, v4f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_16x16x64_fp8_bf8(a, b, c, idx, 0, 0);
}
// The builtin must lower to llvm.amdgcn.smfmac.f32.16x16x64.fp8.fp8 with the
// int vectors a, b, the float accumulator c, the runtime idx and the two
// literal 0 arguments forwarded; the result is stored through out.
// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_fp8_fp8
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_16x16x64_fp8_fp8(global v4f* out, v2i a, v4i b, v4f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_16x16x64_fp8_fp8(a, b, c, idx, 0, 0);
}
// The builtin must lower to llvm.amdgcn.smfmac.f32.32x32x32.bf8.bf8 with the
// int vectors a, b, the float accumulator c, the runtime idx and the two
// literal 0 arguments forwarded; the result is stored through out.
// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_bf8_bf8
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_32x32x32_bf8_bf8(global v16f* out, v2i a, v4i b, v16f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_32x32x32_bf8_bf8(a, b, c, idx, 0, 0);
}
// The builtin must lower to llvm.amdgcn.smfmac.f32.32x32x32.bf8.fp8 with the
// int vectors a, b, the float accumulator c, the runtime idx and the two
// literal 0 arguments forwarded; the result is stored through out.
// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_bf8_fp8
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_32x32x32_bf8_fp8(global v16f* out, v2i a, v4i b, v16f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_32x32x32_bf8_fp8(a, b, c, idx, 0, 0);
}
// The builtin must lower to llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8 with the
// int vectors a, b, the float accumulator c, the runtime idx and the two
// literal 0 arguments forwarded; the result is stored through out.
// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_fp8_bf8
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_32x32x32_fp8_bf8(global v16f* out, v2i a, v4i b, v16f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_32x32x32_fp8_bf8(a, b, c, idx, 0, 0);
}
// The builtin must lower to llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8 with the
// int vectors a, b, the float accumulator c, the runtime idx and the two
// literal 0 arguments forwarded; the result is stored through out.
// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_fp8_fp8
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_32x32x32_fp8_fp8(global v16f* out, v2i a, v4i b, v16f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_32x32x32_fp8_fp8(a, b, c, idx, 0, 0);
}
#endif // defined(MFMA_GFX940_TESTS) || defined(MFMA_GFX950_TESTS)

// Tests below are compiled only by the RUN line that defines MFMA_GFX950_TESTS.
#ifdef MFMA_GFX950_TESTS
// The builtin must lower to llvm.amdgcn.mfma.f32.16x16x32.f16, expected as a
// tail call, with a, b, c and the literal arguments 1, 2, 3 forwarded in
// order. Unlike the earlier tests, the result is returned rather than stored.
// CHECK-GFX950-LABEL: @test_mfma_f32_16x16x32_f16(
// CHECK-GFX950: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %a, <8 x half> %b, <4 x float> %c, i32 1, i32 2, i32 3)
v4f test_mfma_f32_16x16x32_f16(v8h a, v8h b, v4f c)
{
  return __builtin_amdgcn_mfma_f32_16x16x32_f16(a, b, c, 1, 2, 3);
}
// The builtin must lower to llvm.amdgcn.mfma.f32.32x32x16.f16, expected as a
// tail call, with a, b, c and the literal arguments 1, 2, 3 forwarded in
// order; the result is returned. The label ends in "(" like the neighbouring
// gfx950 labels so it anchors on the full function name.
// CHECK-GFX950-LABEL: @test_mfma_f32_32x32x16_f16(
// CHECK-GFX950: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %a, <8 x half> %b, <16 x float> %c, i32 1, i32 2, i32 3)
v16f test_mfma_f32_32x32x16_f16(v8h a, v8h b, v16f c)
{
  return __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 1, 2, 3);
}
428 // CHECK-GFX950-LABEL
: @test_mfma_f32_32x32x16_bf16
(
429 // CHECK-GFX950
: tail call
<16 x float
> @llvm.amdgcn.mfma.f32.32x32x16.bf16
(<8 x bfloat
> %a
, <8 x bfloat
> %b
, <16 x float
> %c
, i32
1, i32
2, i32
3)
430 v16f test_mfma_f32_32x32x16_bf16
(v8bf16 a
, v8bf16 b
, v16f c
) {
431 return __builtin_amdgcn_mfma_f32_32x32x16_bf16
(a, b
, c
, 1, 2, 3);