Run DCE after a LoopFlatten test to reduce spurious output [nfc]
[llvm-project.git] / clang / test / CodeGenOpenCL / builtins-amdgcn-mfma.cl
blob1819ff0a6177c4225c6e8f7af1e1dcbfd53748f9
1 // REQUIRES: amdgpu-registered-target
2 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx908 -DMFMA_GFX908_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX908
3 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90a -DMFMA_GFX90A_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX90A
4 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -DMFMA_GFX940_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX940
// Enable the fp64 extension: double is used by the v4d typedef below and by
// the f64 tests in the gfx90a section.
6 #pragma OPENCL EXTENSION cl_khr_fp64:enable
// Short vector aliases used in the test signatures. The name v<N><t> denotes an
// N-element ext_vector_type of t, where f = float, h = half, i = int,
// s = short and d = double.
8 typedef float v2f __attribute__((ext_vector_type(2)));
9 typedef float v4f __attribute__((ext_vector_type(4)));
10 typedef float v16f __attribute__((ext_vector_type(16)));
11 typedef float v32f __attribute__((ext_vector_type(32)));
12 typedef half v4h __attribute__((ext_vector_type(4)));
13 typedef half v8h __attribute__((ext_vector_type(8)));
14 typedef half v16h __attribute__((ext_vector_type(16)));
15 typedef half v32h __attribute__((ext_vector_type(32)));
16 typedef int v2i __attribute__((ext_vector_type(2)));
17 typedef int v4i __attribute__((ext_vector_type(4)));
18 typedef int v16i __attribute__((ext_vector_type(16)));
19 typedef int v32i __attribute__((ext_vector_type(32)));
20 typedef short v2s __attribute__((ext_vector_type(2)));
21 typedef short v4s __attribute__((ext_vector_type(4)));
22 typedef short v8s __attribute__((ext_vector_type(8)));
23 typedef short v16s __attribute__((ext_vector_type(16)));
24 typedef short v32s __attribute__((ext_vector_type(32)));
25 typedef double v4d __attribute__((ext_vector_type(4)));
// gfx908 section: compiled only when MFMA_GFX908_TESTS is defined, which the
// first run line does together with -target-cpu gfx908. Each test stores the
// result of one __builtin_amdgcn_mfma_* call through `out`; its check lines
// expect a call to the llvm.amdgcn.mfma.* intrinsic of the same name
// (underscores become dots) with a, b and c forwarded unchanged and the three
// trailing arguments emitted as constant i32 0.
28 #ifdef MFMA_GFX908_TESTS
// Tests whose a/b operands are scalar float.
30 // CHECK-GFX908-LABEL: @test_mfma_f32_32x32x1f32
31 // CHECK-GFX908: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %a, float %b, <32 x float> %c, i32 0, i32 0, i32 0)
32 void test_mfma_f32_32x32x1f32(global v32f* out, float a, float b, v32f c)
34 *out = __builtin_amdgcn_mfma_f32_32x32x1f32(a, b, c, 0, 0, 0);
37 // CHECK-GFX908-LABEL: @test_mfma_f32_16x16x1f32
38 // CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float %a, float %b, <16 x float> %c, i32 0, i32 0, i32 0)
39 void test_mfma_f32_16x16x1f32(global v16f* out, float a, float b, v16f c)
41 *out = __builtin_amdgcn_mfma_f32_16x16x1f32(a, b, c, 0, 0, 0);
44 // CHECK-GFX908-LABEL: @test_mfma_f32_4x4x1f32
45 // CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float %a, float %b, <4 x float> %c, i32 0, i32 0, i32 0)
46 void test_mfma_f32_4x4x1f32(global v4f* out, float a, float b, v4f c)
48 *out = __builtin_amdgcn_mfma_f32_4x4x1f32(a, b, c, 0, 0, 0);
51 // CHECK-GFX908-LABEL: @test_mfma_f32_32x32x2f32
52 // CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float %a, float %b, <16 x float> %c, i32 0, i32 0, i32 0)
53 void test_mfma_f32_32x32x2f32(global v16f* out, float a, float b, v16f c)
55 *out = __builtin_amdgcn_mfma_f32_32x32x2f32(a, b, c, 0, 0, 0);
58 // CHECK-GFX908-LABEL: @test_mfma_f32_16x16x4f32
59 // CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float %a, float %b, <4 x float> %c, i32 0, i32 0, i32 0)
60 void test_mfma_f32_16x16x4f32(global v4f* out, float a, float b, v4f c)
62 *out = __builtin_amdgcn_mfma_f32_16x16x4f32(a, b, c, 0, 0, 0);
// Tests whose a/b operands are v4h, expected in IR as <4 x half>.
65 // CHECK-GFX908-LABEL: @test_mfma_f32_32x32x4f16
66 // CHECK-GFX908: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %a, <4 x half> %b, <32 x float> %c, i32 0, i32 0, i32 0)
67 void test_mfma_f32_32x32x4f16(global v32f* out, v4h a, v4h b, v32f c)
69 *out = __builtin_amdgcn_mfma_f32_32x32x4f16(a, b, c, 0, 0, 0);
72 // CHECK-GFX908-LABEL: @test_mfma_f32_16x16x4f16
73 // CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half> %a, <4 x half> %b, <16 x float> %c, i32 0, i32 0, i32 0)
74 void test_mfma_f32_16x16x4f16(global v16f* out, v4h a, v4h b, v16f c)
76 *out = __builtin_amdgcn_mfma_f32_16x16x4f16(a, b, c, 0, 0, 0);
79 // CHECK-GFX908-LABEL: @test_mfma_f32_4x4x4f16
80 // CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half> %a, <4 x half> %b, <4 x float> %c, i32 0, i32 0, i32 0)
81 void test_mfma_f32_4x4x4f16(global v4f* out, v4h a, v4h b, v4f c)
83 *out = __builtin_amdgcn_mfma_f32_4x4x4f16(a, b, c, 0, 0, 0);
86 // CHECK-GFX908-LABEL: @test_mfma_f32_32x32x8f16
87 // CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> %a, <4 x half> %b, <16 x float> %c, i32 0, i32 0, i32 0)
88 void test_mfma_f32_32x32x8f16(global v16f* out, v4h a, v4h b, v16f c)
90 *out = __builtin_amdgcn_mfma_f32_32x32x8f16(a, b, c, 0, 0, 0);
93 // CHECK-GFX908-LABEL: @test_mfma_f32_16x16x16f16
94 // CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %a, <4 x half> %b, <4 x float> %c, i32 0, i32 0, i32 0)
95 void test_mfma_f32_16x16x16f16(global v4f* out, v4h a, v4h b, v4f c)
97 *out = __builtin_amdgcn_mfma_f32_16x16x16f16(a, b, c, 0, 0, 0);
// The i8 builtins: a/b are plain int (i32 in IR) and c/result are int vectors.
100 // CHECK-GFX908-LABEL: @test_mfma_i32_32x32x4i8
101 // CHECK-GFX908: call <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32 %a, i32 %b, <32 x i32> %c, i32 0, i32 0, i32 0)
102 void test_mfma_i32_32x32x4i8(global v32i* out, int a, int b, v32i c)
104 *out = __builtin_amdgcn_mfma_i32_32x32x4i8(a, b, c, 0, 0, 0);
107 // CHECK-GFX908-LABEL: @test_mfma_i32_16x16x4i8
108 // CHECK-GFX908: call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 %a, i32 %b, <16 x i32> %c, i32 0, i32 0, i32 0)
109 void test_mfma_i32_16x16x4i8(global v16i* out, int a, int b, v16i c)
111 *out = __builtin_amdgcn_mfma_i32_16x16x4i8(a, b, c, 0, 0, 0);
114 // CHECK-GFX908-LABEL: @test_mfma_i32_4x4x4i8
115 // CHECK-GFX908: call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 %a, i32 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
116 void test_mfma_i32_4x4x4i8(global v4i* out, int a, int b, v4i c)
118 *out = __builtin_amdgcn_mfma_i32_4x4x4i8(a, b, c, 0, 0, 0);
121 // CHECK-GFX908-LABEL: @test_mfma_i32_32x32x8i8
122 // CHECK-GFX908: call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 %a, i32 %b, <16 x i32> %c, i32 0, i32 0, i32 0)
123 void test_mfma_i32_32x32x8i8(global v16i* out, int a, int b, v16i c)
125 *out = __builtin_amdgcn_mfma_i32_32x32x8i8(a, b, c, 0, 0, 0);
128 // CHECK-GFX908-LABEL: @test_mfma_i32_16x16x16i8
129 // CHECK-GFX908: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 %a, i32 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
130 void test_mfma_i32_16x16x16i8(global v4i* out, int a, int b, v4i c)
132 *out = __builtin_amdgcn_mfma_i32_16x16x16i8(a, b, c, 0, 0, 0);
// The bf16 builtins: a/b are v2s, expected in IR as <2 x i16>.
135 // CHECK-GFX908-LABEL: @test_mfma_f32_32x32x2bf16
136 // CHECK-GFX908: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x float> %c, i32 0, i32 0, i32 0)
137 void test_mfma_f32_32x32x2bf16(global v32f* out, v2s a, v2s b, v32f c)
139 *out = __builtin_amdgcn_mfma_f32_32x32x2bf16(a, b, c, 0, 0, 0);
142 // CHECK-GFX908-LABEL: @test_mfma_f32_16x16x2bf16
143 // CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
144 void test_mfma_f32_16x16x2bf16(global v16f* out, v2s a, v2s b, v16f c)
146 *out = __builtin_amdgcn_mfma_f32_16x16x2bf16(a, b, c, 0, 0, 0);
149 // CHECK-GFX908-LABEL: @test_mfma_f32_4x4x2bf16
150 // CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
151 void test_mfma_f32_4x4x2bf16(global v4f* out, v2s a, v2s b, v4f c)
153 *out = __builtin_amdgcn_mfma_f32_4x4x2bf16(a, b, c, 0, 0, 0);
156 // CHECK-GFX908-LABEL: @test_mfma_f32_32x32x4bf16
157 // CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
158 void test_mfma_f32_32x32x4bf16(global v16f* out, v2s a, v2s b, v16f c)
160 *out = __builtin_amdgcn_mfma_f32_32x32x4bf16(a, b, c, 0, 0, 0);
163 // CHECK-GFX908-LABEL: @test_mfma_f32_16x16x8bf16
164 // CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
165 void test_mfma_f32_16x16x8bf16(global v4f* out, v2s a, v2s b, v4f c)
167 *out = __builtin_amdgcn_mfma_f32_16x16x8bf16(a, b, c, 0, 0, 0);
170 #endif // MFMA_GFX908_TESTS
// gfx90a section: compiled only when MFMA_GFX90A_TESTS is defined, which the
// second run line does together with -target-cpu gfx90a. Each test stores the
// result of one __builtin_amdgcn_mfma_* call through `out`; its check lines
// expect a call to the llvm.amdgcn.mfma.* intrinsic of the same name
// (underscores become dots) with a, b and c forwarded unchanged and the three
// trailing arguments emitted as constant i32 0.
172 #ifdef MFMA_GFX90A_TESTS
// The bf16 "_1k" builtins: a/b are v4s, expected in IR as <4 x i16>.
174 // CHECK-GFX90A-LABEL: @test_mfma_f32_32x32x4bf16_1k
175 // CHECK-GFX90A: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <32 x float> %c, i32 0, i32 0, i32 0)
176 void test_mfma_f32_32x32x4bf16_1k(global v32f* out, v4s a, v4s b, v32f c)
178 *out = __builtin_amdgcn_mfma_f32_32x32x4bf16_1k(a, b, c, 0, 0, 0);
181 // CHECK-GFX90A-LABEL: @test_mfma_f32_16x16x4bf16_1k
182 // CHECK-GFX90A: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
183 void test_mfma_f32_16x16x4bf16_1k(global v16f* out, v4s a, v4s b, v16f c)
185 *out = __builtin_amdgcn_mfma_f32_16x16x4bf16_1k(a, b, c, 0, 0, 0);
188 // CHECK-GFX90A-LABEL: @test_mfma_f32_4x4x4bf16_1k
189 // CHECK-GFX90A: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
190 void test_mfma_f32_4x4x4bf16_1k(global v4f* out, v4s a, v4s b, v4f c)
192 *out = __builtin_amdgcn_mfma_f32_4x4x4bf16_1k(a, b, c, 0, 0, 0);
195 // CHECK-GFX90A-LABEL: @test_mfma_f32_32x32x8bf16_1k
196 // CHECK-GFX90A: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
197 void test_mfma_f32_32x32x8bf16_1k(global v16f* out, v4s a, v4s b, v16f c)
199 *out = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a, b, c, 0, 0, 0);
202 // CHECK-GFX90A-LABEL: @test_mfma_f32_16x16x16bf16_1k
203 // CHECK-GFX90A: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
204 void test_mfma_f32_16x16x16bf16_1k(global v4f* out, v4s a, v4s b, v4f c)
206 *out = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(a, b, c, 0, 0, 0);
// The f64 builtins: a/b are scalar double. The 16x16x4 form takes and returns
// v4d, while the 4x4x4 form takes and returns a scalar double.
209 // CHECK-GFX90A-LABEL: @test_mfma_f64_16x16x4f64
210 // CHECK-GFX90A: call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %c, i32 0, i32 0, i32 0)
211 void test_mfma_f64_16x16x4f64(global v4d* out, double a, double b, v4d c)
213 *out = __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, c, 0, 0, 0);
216 // CHECK-GFX90A-LABEL: @test_mfma_f64_4x4x4f64
217 // CHECK-GFX90A: call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double %c, i32 0, i32 0, i32 0)
218 void test_mfma_f64_4x4x4f64(global double* out, double a, double b, double c)
220 *out = __builtin_amdgcn_mfma_f64_4x4x4f64(a, b, c, 0, 0, 0);
223 #endif // MFMA_GFX90A_TESTS
// gfx940 section: compiled only when MFMA_GFX940_TESTS is defined, which the
// third run line does together with -target-cpu gfx940. Each test stores the
// result of one __builtin_amdgcn_mfma_* or __builtin_amdgcn_smfmac_* call
// through `out`; its check lines expect a call to the llvm.amdgcn.* intrinsic
// of the same name (underscores become dots) with a, b and c forwarded
// unchanged. The mfma tests pass three trailing constant zeros; the smfmac
// tests pass the runtime `idx` argument followed by two constant zeros.
225 #ifdef MFMA_GFX940_TESTS
// The i8 mfma builtins: a/b are long, expected in IR as i64.
226 // CHECK-GFX940-LABEL: @test_mfma_i32_16x16x32_i8
227 // CHECK-GFX940: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 %a, i64 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
228 void test_mfma_i32_16x16x32_i8(global v4i* out, long a, long b, v4i c)
230 *out = __builtin_amdgcn_mfma_i32_16x16x32_i8(a, b, c, 0, 0, 0);
233 // CHECK-GFX940-LABEL: @test_mfma_i32_32x32x16_i8
234 // CHECK-GFX940: call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64 %a, i64 %b, <16 x i32> %c, i32 0, i32 0, i32 0)
235 void test_mfma_i32_32x32x16_i8(global v16i* out, long a, long b, v16i c)
237 *out = __builtin_amdgcn_mfma_i32_32x32x16_i8(a, b, c, 0, 0, 0);
// The xf32 mfma builtins: a/b are v2f, expected in IR as <2 x float>.
240 // CHECK-GFX940-LABEL: @test_mfma_f32_16x16x8_xf32
241 // CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> %a, <2 x float> %b, <4 x float> %c, i32 0, i32 0, i32 0)
242 void test_mfma_f32_16x16x8_xf32(global v4f* out, v2f a, v2f b, v4f c)
244 *out = __builtin_amdgcn_mfma_f32_16x16x8_xf32(a, b, c, 0, 0, 0);
247 // CHECK-GFX940-LABEL: @test_mfma_f32_32x32x4_xf32
248 // CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> %a, <2 x float> %b, <16 x float> %c, i32 0, i32 0, i32 0)
249 void test_mfma_f32_32x32x4_xf32(global v16f* out, v2f a, v2f b, v16f c)
251 *out = __builtin_amdgcn_mfma_f32_32x32x4_xf32(a, b, c, 0, 0, 0);
// The bf8/fp8 mfma builtins: a/b are long (i64 in IR). All four bf8/fp8
// pairings are covered for both the 16x16x32 and the 32x32x16 shapes.
254 // CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_bf8_bf8
255 // CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.bf8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0)
256 void test_mfma_f32_16x16x32_bf8_bf8(global v4f* out, long a, long b, v4f c)
258 *out = __builtin_amdgcn_mfma_f32_16x16x32_bf8_bf8(a, b, c, 0, 0, 0);
261 // CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_bf8_fp8
262 // CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.fp8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0)
263 void test_mfma_f32_16x16x32_bf8_fp8(global v4f* out, long a, long b, v4f c)
265 *out = __builtin_amdgcn_mfma_f32_16x16x32_bf8_fp8(a, b, c, 0, 0, 0);
268 // CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_fp8_bf8
269 // CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.bf8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0)
270 void test_mfma_f32_16x16x32_fp8_bf8(global v4f* out, long a, long b, v4f c)
272 *out = __builtin_amdgcn_mfma_f32_16x16x32_fp8_bf8(a, b, c, 0, 0, 0);
275 // CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_fp8_fp8
276 // CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.fp8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0)
277 void test_mfma_f32_16x16x32_fp8_fp8(global v4f* out, long a, long b, v4f c)
279 *out = __builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8(a, b, c, 0, 0, 0);
282 // CHECK-GFX940-LABEL: @test_mfma_f32_32x32x16_bf8_bf8
283 // CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.bf8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0)
284 void test_mfma_f32_32x32x16_bf8_bf8(global v16f* out, long a, long b, v16f c)
286 *out = __builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8(a, b, c, 0, 0, 0);
289 // CHECK-GFX940-LABEL: @test_mfma_f32_32x32x16_bf8_fp8
290 // CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.fp8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0)
291 void test_mfma_f32_32x32x16_bf8_fp8(global v16f* out, long a, long b, v16f c)
293 *out = __builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8(a, b, c, 0, 0, 0);
296 // CHECK-GFX940-LABEL: @test_mfma_f32_32x32x16_fp8_bf8
297 // CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.bf8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0)
298 void test_mfma_f32_32x32x16_fp8_bf8(global v16f* out, long a, long b, v16f c)
300 *out = __builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8(a, b, c, 0, 0, 0);
303 // CHECK-GFX940-LABEL: @test_mfma_f32_32x32x16_fp8_fp8
304 // CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0)
305 void test_mfma_f32_32x32x16_fp8_fp8(global v16f* out, long a, long b, v16f c)
307 *out = __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8(a, b, c, 0, 0, 0);
// smfmac builtins from here on. Unlike the mfma tests above, each takes an
// extra `int idx` parameter that the check lines expect as `i32 %idx`, followed
// by two constant zeros. In every smfmac test b has twice as many elements as a.
// f16 variants: a is v4h and b is v8h.
310 // CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x32_f16
311 // CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.f16(<4 x half> %a, <8 x half> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
312 void test_smfmac_f32_16x16x32_f16(global v4f* out, v4h a, v8h b, v4f c, int idx)
314 *out = __builtin_amdgcn_smfmac_f32_16x16x32_f16(a, b, c, idx, 0, 0);
317 // CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x16_f16
318 // CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.f16(<4 x half> %a, <8 x half> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
319 void test_smfmac_f32_32x32x16_f16(global v16f* out, v4h a, v8h b, v16f c, int idx)
321 *out = __builtin_amdgcn_smfmac_f32_32x32x16_f16(a, b, c, idx, 0, 0);
// bf16 variants: a is v4s and b is v8s, expected in IR as <4 x i16> and <8 x i16>.
324 // CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x32_bf16
325 // CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.bf16(<4 x i16> %a, <8 x i16> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
326 void test_smfmac_f32_16x16x32_bf16(global v4f* out, v4s a, v8s b, v4f c, int idx)
328 *out = __builtin_amdgcn_smfmac_f32_16x16x32_bf16(a, b, c, idx, 0, 0);
331 // CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x16_bf16
332 // CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.bf16(<4 x i16> %a, <8 x i16> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
333 void test_smfmac_f32_32x32x16_bf16(global v16f* out, v4s a, v8s b, v16f c, int idx)
335 *out = __builtin_amdgcn_smfmac_f32_32x32x16_bf16(a, b, c, idx, 0, 0);
// i8 variants: a is v2i and b is v4i; c and the result are int vectors.
338 // CHECK-GFX940-LABEL: @test_smfmac_i32_16x16x64_i8
339 // CHECK-GFX940: call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x64.i8(<2 x i32> %a, <4 x i32> %b, <4 x i32> %c, i32 %idx, i32 0, i32 0)
340 void test_smfmac_i32_16x16x64_i8(global v4i* out, v2i a, v4i b, v4i c, int idx)
342 *out = __builtin_amdgcn_smfmac_i32_16x16x64_i8(a, b, c, idx, 0, 0);
345 // CHECK-GFX940-LABEL: @test_smfmac_i32_32x32x32_i8
346 // CHECK-GFX940: call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x32.i8(<2 x i32> %a, <4 x i32> %b, <16 x i32> %c, i32 %idx, i32 0, i32 0)
347 void test_smfmac_i32_32x32x32_i8(global v16i* out, v2i a, v4i b, v16i c, int idx)
349 *out = __builtin_amdgcn_smfmac_i32_32x32x32_i8(a, b, c, idx, 0, 0);
// bf8/fp8 variants: a is v2i and b is v4i; c and the result are float vectors.
// All four bf8/fp8 pairings are covered for both the 16x16x64 and the 32x32x32 shapes.
352 // CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_bf8_bf8
353 // CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
354 void test_smfmac_f32_16x16x64_bf8_bf8(global v4f* out, v2i a, v4i b, v4f c, int idx)
356 *out = __builtin_amdgcn_smfmac_f32_16x16x64_bf8_bf8(a, b, c, idx, 0, 0);
359 // CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_bf8_fp8
360 // CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
361 void test_smfmac_f32_16x16x64_bf8_fp8(global v4f* out, v2i a, v4i b, v4f c, int idx)
363 *out = __builtin_amdgcn_smfmac_f32_16x16x64_bf8_fp8(a, b, c, idx, 0, 0);
366 // CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_fp8_bf8
367 // CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
368 void test_smfmac_f32_16x16x64_fp8_bf8(global v4f* out, v2i a, v4i b, v4f c, int idx)
370 *out = __builtin_amdgcn_smfmac_f32_16x16x64_fp8_bf8(a, b, c, idx, 0, 0);
373 // CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_fp8_fp8
374 // CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
375 void test_smfmac_f32_16x16x64_fp8_fp8(global v4f* out, v2i a, v4i b, v4f c, int idx)
377 *out = __builtin_amdgcn_smfmac_f32_16x16x64_fp8_fp8(a, b, c, idx, 0, 0);
380 // CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_bf8_bf8
381 // CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
382 void test_smfmac_f32_32x32x32_bf8_bf8(global v16f* out, v2i a, v4i b, v16f c, int idx)
384 *out = __builtin_amdgcn_smfmac_f32_32x32x32_bf8_bf8(a, b, c, idx, 0, 0);
387 // CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_bf8_fp8
388 // CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
389 void test_smfmac_f32_32x32x32_bf8_fp8(global v16f* out, v2i a, v4i b, v16f c, int idx)
391 *out = __builtin_amdgcn_smfmac_f32_32x32x32_bf8_fp8(a, b, c, idx, 0, 0);
394 // CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_fp8_bf8
395 // CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
396 void test_smfmac_f32_32x32x32_fp8_bf8(global v16f* out, v2i a, v4i b, v16f c, int idx)
398 *out = __builtin_amdgcn_smfmac_f32_32x32x32_fp8_bf8(a, b, c, idx, 0, 0);
401 // CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_fp8_fp8
402 // CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
403 void test_smfmac_f32_32x32x32_fp8_fp8(global v16f* out, v2i a, v4i b, v16f c, int idx)
405 *out = __builtin_amdgcn_smfmac_f32_32x32x32_fp8_fp8(a, b, c, idx, 0, 0);
407 #endif // MFMA_GFX940_TESTS