[flang][cuda] Adapt ExternalNameConversion to work in gpu module (#117039)
[llvm-project.git] / clang / test / CodeGenOpenCL / builtins-amdgcn-mfma.cl
blob841d8fcad0fee0ce8f8ce46a9c798f3b00cb7d42
1 // REQUIRES: amdgpu-registered-target
2 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx908 -DMFMA_GFX908_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX908
3 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90a -DMFMA_GFX90A_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX90A
4 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -DMFMA_GFX940_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX940
5 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx950 -DMFMA_GFX950_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX950
7 #pragma OPENCL EXTENSION cl_khr_fp64:enable
// Vector typedefs used as parameter and result types by the tests in this
// file. The letter(s) after the 'v<N>' prefix name the element type
// (f = float, h = half, i = int, s = short, d = double, bf16 = __bf16) and
// <N> is the element count passed to ext_vector_type.
typedef float v2f __attribute__((ext_vector_type(2)));
typedef float v4f __attribute__((ext_vector_type(4)));
typedef float v16f __attribute__((ext_vector_type(16)));
typedef float v32f __attribute__((ext_vector_type(32)));
typedef half v4h __attribute__((ext_vector_type(4)));
typedef half v8h __attribute__((ext_vector_type(8)));
typedef half v16h __attribute__((ext_vector_type(16)));
typedef half v32h __attribute__((ext_vector_type(32)));
typedef int v2i __attribute__((ext_vector_type(2)));
typedef int v4i __attribute__((ext_vector_type(4)));
typedef int v16i __attribute__((ext_vector_type(16)));
typedef int v32i __attribute__((ext_vector_type(32)));
typedef short v2s __attribute__((ext_vector_type(2)));
typedef short v4s __attribute__((ext_vector_type(4)));
typedef short v8s __attribute__((ext_vector_type(8)));
typedef short v16s __attribute__((ext_vector_type(16)));
typedef short v32s __attribute__((ext_vector_type(32)));
typedef double v4d __attribute__((ext_vector_type(4)));
typedef __bf16 v8bf16 __attribute__((ext_vector_type(8)));
30 #ifdef MFMA_GFX908_TESTS
// Stores __builtin_amdgcn_mfma_f32_32x32x1f32(a, b, c, 0, 0, 0) through 'out'.
// The directives below expect a call to the same-named llvm.amdgcn intrinsic
// taking float a/b, a <32 x float> c, and three zero i32 immediates.
// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x1f32
// CHECK-GFX908: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %a, float %b, <32 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x1f32(global v32f* out, float a, float b, v32f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x1f32(a, b, c, 0, 0, 0);
}
// Stores __builtin_amdgcn_mfma_f32_16x16x1f32(a, b, c, 0, 0, 0) through 'out'.
// Expected IR: float a/b, <16 x float> c, three zero i32 immediates.
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x1f32
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float %a, float %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x1f32(global v16f* out, float a, float b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x1f32(a, b, c, 0, 0, 0);
}
// Stores __builtin_amdgcn_mfma_f32_4x4x1f32(a, b, c, 0, 0, 0) through 'out'.
// Expected IR: float a/b, <4 x float> c, three zero i32 immediates.
// CHECK-GFX908-LABEL: @test_mfma_f32_4x4x1f32
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float %a, float %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_4x4x1f32(global v4f* out, float a, float b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_4x4x1f32(a, b, c, 0, 0, 0);
}
// Stores __builtin_amdgcn_mfma_f32_32x32x2f32(a, b, c, 0, 0, 0) through 'out'.
// Expected IR: float a/b, <16 x float> c, three zero i32 immediates.
// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x2f32
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float %a, float %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x2f32(global v16f* out, float a, float b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x2f32(a, b, c, 0, 0, 0);
}
// Stores __builtin_amdgcn_mfma_f32_16x16x4f32(a, b, c, 0, 0, 0) through 'out'.
// Expected IR: float a/b, <4 x float> c, three zero i32 immediates.
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x4f32
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float %a, float %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x4f32(global v4f* out, float a, float b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x4f32(a, b, c, 0, 0, 0);
}
// Stores __builtin_amdgcn_mfma_f32_32x32x4f16(a, b, c, 0, 0, 0) through 'out'.
// Expected IR: <4 x half> a/b, <32 x float> c, three zero i32 immediates.
// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x4f16
// CHECK-GFX908: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %a, <4 x half> %b, <32 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x4f16(global v32f* out, v4h a, v4h b, v32f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x4f16(a, b, c, 0, 0, 0);
}
// Stores __builtin_amdgcn_mfma_f32_16x16x4f16(a, b, c, 0, 0, 0) through 'out'.
// Expected IR: <4 x half> a/b, <16 x float> c, three zero i32 immediates.
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x4f16
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half> %a, <4 x half> %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x4f16(global v16f* out, v4h a, v4h b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x4f16(a, b, c, 0, 0, 0);
}
// Stores __builtin_amdgcn_mfma_f32_4x4x4f16(a, b, c, 0, 0, 0) through 'out'.
// Expected IR: <4 x half> a/b, <4 x float> c, three zero i32 immediates.
// CHECK-GFX908-LABEL: @test_mfma_f32_4x4x4f16
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half> %a, <4 x half> %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_4x4x4f16(global v4f* out, v4h a, v4h b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_4x4x4f16(a, b, c, 0, 0, 0);
}
// Stores __builtin_amdgcn_mfma_f32_32x32x8f16(a, b, c, 0, 0, 0) through 'out'.
// Expected IR: <4 x half> a/b, <16 x float> c, three zero i32 immediates.
// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x8f16
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> %a, <4 x half> %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x8f16(global v16f* out, v4h a, v4h b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x8f16(a, b, c, 0, 0, 0);
}
// Stores __builtin_amdgcn_mfma_f32_16x16x16f16(a, b, c, 0, 0, 0) through 'out'.
// Expected IR: <4 x half> a/b, <4 x float> c, three zero i32 immediates.
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x16f16
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %a, <4 x half> %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x16f16(global v4f* out, v4h a, v4h b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x16f16(a, b, c, 0, 0, 0);
}
// Stores __builtin_amdgcn_mfma_i32_32x32x4i8(a, b, c, 0, 0, 0) through 'out'.
// Expected IR: i32 a/b, <32 x i32> c, three zero i32 immediates.
// CHECK-GFX908-LABEL: @test_mfma_i32_32x32x4i8
// CHECK-GFX908: call <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32 %a, i32 %b, <32 x i32> %c, i32 0, i32 0, i32 0)
void test_mfma_i32_32x32x4i8(global v32i* out, int a, int b, v32i c)
{
  *out = __builtin_amdgcn_mfma_i32_32x32x4i8(a, b, c, 0, 0, 0);
}
// Stores __builtin_amdgcn_mfma_i32_16x16x4i8(a, b, c, 0, 0, 0) through 'out'.
// Expected IR: i32 a/b, <16 x i32> c, three zero i32 immediates.
// CHECK-GFX908-LABEL: @test_mfma_i32_16x16x4i8
// CHECK-GFX908: call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 %a, i32 %b, <16 x i32> %c, i32 0, i32 0, i32 0)
void test_mfma_i32_16x16x4i8(global v16i* out, int a, int b, v16i c)
{
  *out = __builtin_amdgcn_mfma_i32_16x16x4i8(a, b, c, 0, 0, 0);
}
// Stores __builtin_amdgcn_mfma_i32_4x4x4i8(a, b, c, 0, 0, 0) through 'out'.
// Expected IR: i32 a/b, <4 x i32> c, three zero i32 immediates.
// CHECK-GFX908-LABEL: @test_mfma_i32_4x4x4i8
// CHECK-GFX908: call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 %a, i32 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
void test_mfma_i32_4x4x4i8(global v4i* out, int a, int b, v4i c)
{
  *out = __builtin_amdgcn_mfma_i32_4x4x4i8(a, b, c, 0, 0, 0);
}
// Stores __builtin_amdgcn_mfma_i32_32x32x8i8(a, b, c, 0, 0, 0) through 'out'.
// Expected IR: i32 a/b, <16 x i32> c, three zero i32 immediates.
// CHECK-GFX908-LABEL: @test_mfma_i32_32x32x8i8
// CHECK-GFX908: call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 %a, i32 %b, <16 x i32> %c, i32 0, i32 0, i32 0)
void test_mfma_i32_32x32x8i8(global v16i* out, int a, int b, v16i c)
{
  *out = __builtin_amdgcn_mfma_i32_32x32x8i8(a, b, c, 0, 0, 0);
}
// Stores __builtin_amdgcn_mfma_i32_16x16x16i8(a, b, c, 0, 0, 0) through 'out'.
// Expected IR: i32 a/b, <4 x i32> c, three zero i32 immediates.
// CHECK-GFX908-LABEL: @test_mfma_i32_16x16x16i8
// CHECK-GFX908: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 %a, i32 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
void test_mfma_i32_16x16x16i8(global v4i* out, int a, int b, v4i c)
{
  *out = __builtin_amdgcn_mfma_i32_16x16x16i8(a, b, c, 0, 0, 0);
}
// Stores __builtin_amdgcn_mfma_f32_32x32x2bf16(a, b, c, 0, 0, 0) through 'out'.
// The v2s (short) operands are expected to appear as <2 x i16> in the IR,
// alongside a <32 x float> c and three zero i32 immediates.
// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x2bf16
// CHECK-GFX908: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x2bf16(global v32f* out, v2s a, v2s b, v32f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x2bf16(a, b, c, 0, 0, 0);
}
// Stores __builtin_amdgcn_mfma_f32_16x16x2bf16(a, b, c, 0, 0, 0) through 'out'.
// Expected IR: <2 x i16> a/b, <16 x float> c, three zero i32 immediates.
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x2bf16
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x2bf16(global v16f* out, v2s a, v2s b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x2bf16(a, b, c, 0, 0, 0);
}
// Stores __builtin_amdgcn_mfma_f32_4x4x2bf16(a, b, c, 0, 0, 0) through 'out'.
// Expected IR: <2 x i16> a/b, <4 x float> c, three zero i32 immediates.
// CHECK-GFX908-LABEL: @test_mfma_f32_4x4x2bf16
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_4x4x2bf16(global v4f* out, v2s a, v2s b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_4x4x2bf16(a, b, c, 0, 0, 0);
}
// Stores __builtin_amdgcn_mfma_f32_32x32x4bf16(a, b, c, 0, 0, 0) through 'out'.
// Expected IR: <2 x i16> a/b, <16 x float> c, three zero i32 immediates.
// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x4bf16
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x4bf16(global v16f* out, v2s a, v2s b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x4bf16(a, b, c, 0, 0, 0);
}
// Stores __builtin_amdgcn_mfma_f32_16x16x8bf16(a, b, c, 0, 0, 0) through 'out'.
// Expected IR: <2 x i16> a/b, <4 x float> c, three zero i32 immediates.
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x8bf16
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x8bf16(global v4f* out, v2s a, v2s b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x8bf16(a, b, c, 0, 0, 0);
}
172 #endif // MFMA_GFX908_TESTS
174 #ifdef MFMA_GFX90A_TESTS
// Stores __builtin_amdgcn_mfma_f32_32x32x4bf16_1k(a, b, c, 0, 0, 0) through
// 'out'. The v4s (short) operands are expected as <4 x i16> in the IR, with a
// <32 x float> c and three zero i32 immediates.
// CHECK-GFX90A-LABEL: @test_mfma_f32_32x32x4bf16_1k
// CHECK-GFX90A: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <32 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x4bf16_1k(global v32f* out, v4s a, v4s b, v32f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x4bf16_1k(a, b, c, 0, 0, 0);
}
// Stores __builtin_amdgcn_mfma_f32_16x16x4bf16_1k(a, b, c, 0, 0, 0) through
// 'out'. Expected IR: <4 x i16> a/b, <16 x float> c, three zero immediates.
// CHECK-GFX90A-LABEL: @test_mfma_f32_16x16x4bf16_1k
// CHECK-GFX90A: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x4bf16_1k(global v16f* out, v4s a, v4s b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x4bf16_1k(a, b, c, 0, 0, 0);
}
// Stores __builtin_amdgcn_mfma_f32_4x4x4bf16_1k(a, b, c, 0, 0, 0) through
// 'out'. Expected IR: <4 x i16> a/b, <4 x float> c, three zero immediates.
// CHECK-GFX90A-LABEL: @test_mfma_f32_4x4x4bf16_1k
// CHECK-GFX90A: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_4x4x4bf16_1k(global v4f* out, v4s a, v4s b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_4x4x4bf16_1k(a, b, c, 0, 0, 0);
}
// Stores __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a, b, c, 0, 0, 0) through
// 'out'. Expected IR: <4 x i16> a/b, <16 x float> c, three zero immediates.
// CHECK-GFX90A-LABEL: @test_mfma_f32_32x32x8bf16_1k
// CHECK-GFX90A: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x8bf16_1k(global v16f* out, v4s a, v4s b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a, b, c, 0, 0, 0);
}
// Stores __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(a, b, c, 0, 0, 0) through
// 'out'. Expected IR: <4 x i16> a/b, <4 x float> c, three zero immediates.
// CHECK-GFX90A-LABEL: @test_mfma_f32_16x16x16bf16_1k
// CHECK-GFX90A: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x16bf16_1k(global v4f* out, v4s a, v4s b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(a, b, c, 0, 0, 0);
}
// Stores __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, c, 0, 0, 0) through 'out'.
// Expected IR: double a/b, <4 x double> c, three zero i32 immediates.
// CHECK-GFX90A-LABEL: @test_mfma_f64_16x16x4f64
// CHECK-GFX90A: call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %c, i32 0, i32 0, i32 0)
void test_mfma_f64_16x16x4f64(global v4d* out, double a, double b, v4d c)
{
  *out = __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, c, 0, 0, 0);
}
// Stores __builtin_amdgcn_mfma_f64_4x4x4f64(a, b, c, 0, 0, 0) through 'out'.
// Unlike the other tests here, c and the result are scalar doubles rather
// than vectors; the expected IR call returns plain 'double'.
// CHECK-GFX90A-LABEL: @test_mfma_f64_4x4x4f64
// CHECK-GFX90A: call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double %c, i32 0, i32 0, i32 0)
void test_mfma_f64_4x4x4f64(global double* out, double a, double b, double c)
{
  *out = __builtin_amdgcn_mfma_f64_4x4x4f64(a, b, c, 0, 0, 0);
}
225 #endif // MFMA_GFX90A_TESTS
227 #if defined(MFMA_GFX940_TESTS) || defined(MFMA_GFX950_TESTS)
// Stores __builtin_amdgcn_mfma_i32_16x16x32_i8(a, b, c, 0, 0, 0) through 'out'.
// Expected IR: i64 a/b (from 'long'), <4 x i32> c, three zero i32 immediates.
// NOTE(review): this section is also compiled when MFMA_GFX950_TESTS is
// defined, but the gfx950 RUN line only enables the GFX950 check prefix, so
// the GFX940-prefixed directives here are not verified in that run - confirm
// whether that is intended.
// CHECK-GFX940-LABEL: @test_mfma_i32_16x16x32_i8
// CHECK-GFX940: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 %a, i64 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
void test_mfma_i32_16x16x32_i8(global v4i* out, long a, long b, v4i c)
{
  *out = __builtin_amdgcn_mfma_i32_16x16x32_i8(a, b, c, 0, 0, 0);
}
// Stores __builtin_amdgcn_mfma_i32_32x32x16_i8(a, b, c, 0, 0, 0) through 'out'.
// Expected IR: i64 a/b, <16 x i32> c, three zero i32 immediates.
// CHECK-GFX940-LABEL: @test_mfma_i32_32x32x16_i8
// CHECK-GFX940: call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64 %a, i64 %b, <16 x i32> %c, i32 0, i32 0, i32 0)
void test_mfma_i32_32x32x16_i8(global v16i* out, long a, long b, v16i c)
{
  *out = __builtin_amdgcn_mfma_i32_32x32x16_i8(a, b, c, 0, 0, 0);
}
// Stores __builtin_amdgcn_mfma_f32_16x16x8_xf32(a, b, c, 0, 0, 0) through
// 'out'. Expected IR: <2 x float> a/b, <4 x float> c, three zero immediates.
// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x8_xf32
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> %a, <2 x float> %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x8_xf32(global v4f* out, v2f a, v2f b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x8_xf32(a, b, c, 0, 0, 0);
}
// Stores __builtin_amdgcn_mfma_f32_32x32x4_xf32(a, b, c, 0, 0, 0) through
// 'out'. Expected IR: <2 x float> a/b, <16 x float> c, three zero immediates.
// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x4_xf32
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> %a, <2 x float> %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x4_xf32(global v16f* out, v2f a, v2f b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x4_xf32(a, b, c, 0, 0, 0);
}
// Stores __builtin_amdgcn_mfma_f32_16x16x32_bf8_bf8(a, b, c, 0, 0, 0) through
// 'out'. Expected IR: i64 a/b, <4 x float> c, three zero i32 immediates.
// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_bf8_bf8
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.bf8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x32_bf8_bf8(global v4f* out, long a, long b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x32_bf8_bf8(a, b, c, 0, 0, 0);
}
// Stores __builtin_amdgcn_mfma_f32_16x16x32_bf8_fp8(a, b, c, 0, 0, 0) through
// 'out'. Expected IR: i64 a/b, <4 x float> c, three zero i32 immediates.
// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_bf8_fp8
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.fp8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x32_bf8_fp8(global v4f* out, long a, long b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x32_bf8_fp8(a, b, c, 0, 0, 0);
}
// Stores __builtin_amdgcn_mfma_f32_16x16x32_fp8_bf8(a, b, c, 0, 0, 0) through
// 'out'. Expected IR: i64 a/b, <4 x float> c, three zero i32 immediates.
// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_fp8_bf8
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.bf8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x32_fp8_bf8(global v4f* out, long a, long b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x32_fp8_bf8(a, b, c, 0, 0, 0);
}
// Stores __builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8(a, b, c, 0, 0, 0) through
// 'out'. Expected IR: i64 a/b, <4 x float> c, three zero i32 immediates.
// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_fp8_fp8
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.fp8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x32_fp8_fp8(global v4f* out, long a, long b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8(a, b, c, 0, 0, 0);
}
// Stores __builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8(a, b, c, 0, 0, 0) through
// 'out'. Expected IR: i64 a/b, <16 x float> c, three zero i32 immediates.
// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x16_bf8_bf8
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.bf8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x16_bf8_bf8(global v16f* out, long a, long b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8(a, b, c, 0, 0, 0);
}
// Stores __builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8(a, b, c, 0, 0, 0) through
// 'out'. Expected IR: i64 a/b, <16 x float> c, three zero i32 immediates.
// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x16_bf8_fp8
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.fp8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x16_bf8_fp8(global v16f* out, long a, long b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8(a, b, c, 0, 0, 0);
}
// Stores __builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8(a, b, c, 0, 0, 0) through
// 'out'. Expected IR: i64 a/b, <16 x float> c, three zero i32 immediates.
// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x16_fp8_bf8
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.bf8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x16_fp8_bf8(global v16f* out, long a, long b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8(a, b, c, 0, 0, 0);
}
// Stores __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8(a, b, c, 0, 0, 0) through
// 'out'. Expected IR: i64 a/b, <16 x float> c, three zero i32 immediates.
// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x16_fp8_fp8
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x16_fp8_fp8(global v16f* out, long a, long b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8(a, b, c, 0, 0, 0);
}
// Stores __builtin_amdgcn_smfmac_f32_16x16x32_f16(a, b, c, idx, 0, 0) through
// 'out'. Here a and b have different widths (<4 x half> vs <8 x half>), and
// the runtime 'idx' is forwarded as the fourth argument ahead of two zero
// immediates.
// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x32_f16
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.f16(<4 x half> %a, <8 x half> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_16x16x32_f16(global v4f* out, v4h a, v8h b, v4f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_16x16x32_f16(a, b, c, idx, 0, 0);
}
// Stores __builtin_amdgcn_smfmac_f32_32x32x16_f16(a, b, c, idx, 0, 0) through
// 'out'. Expected IR: <4 x half> a, <8 x half> b, <16 x float> c, i32 %idx,
// then two zero i32 immediates.
// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x16_f16
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.f16(<4 x half> %a, <8 x half> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_32x32x16_f16(global v16f* out, v4h a, v8h b, v16f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_32x32x16_f16(a, b, c, idx, 0, 0);
}
// Stores __builtin_amdgcn_smfmac_f32_16x16x32_bf16(a, b, c, idx, 0, 0) through
// 'out'. Expected IR: <4 x i16> a, <8 x i16> b, <4 x float> c, i32 %idx,
// then two zero i32 immediates.
// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x32_bf16
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.bf16(<4 x i16> %a, <8 x i16> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_16x16x32_bf16(global v4f* out, v4s a, v8s b, v4f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_16x16x32_bf16(a, b, c, idx, 0, 0);
}
// Stores __builtin_amdgcn_smfmac_f32_32x32x16_bf16(a, b, c, idx, 0, 0) through
// 'out'. Expected IR: <4 x i16> a, <8 x i16> b, <16 x float> c, i32 %idx,
// then two zero i32 immediates.
// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x16_bf16
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.bf16(<4 x i16> %a, <8 x i16> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_32x32x16_bf16(global v16f* out, v4s a, v8s b, v16f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_32x32x16_bf16(a, b, c, idx, 0, 0);
}
// Stores __builtin_amdgcn_smfmac_i32_16x16x64_i8(a, b, c, idx, 0, 0) through
// 'out'. Expected IR: <2 x i32> a, <4 x i32> b, <4 x i32> c, i32 %idx,
// then two zero i32 immediates.
// CHECK-GFX940-LABEL: @test_smfmac_i32_16x16x64_i8
// CHECK-GFX940: call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x64.i8(<2 x i32> %a, <4 x i32> %b, <4 x i32> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_i32_16x16x64_i8(global v4i* out, v2i a, v4i b, v4i c, int idx)
{
  *out = __builtin_amdgcn_smfmac_i32_16x16x64_i8(a, b, c, idx, 0, 0);
}
// Stores __builtin_amdgcn_smfmac_i32_32x32x32_i8(a, b, c, idx, 0, 0) through
// 'out'. Expected IR: <2 x i32> a, <4 x i32> b, <16 x i32> c, i32 %idx,
// then two zero i32 immediates.
// CHECK-GFX940-LABEL: @test_smfmac_i32_32x32x32_i8
// CHECK-GFX940: call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x32.i8(<2 x i32> %a, <4 x i32> %b, <16 x i32> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_i32_32x32x32_i8(global v16i* out, v2i a, v4i b, v16i c, int idx)
{
  *out = __builtin_amdgcn_smfmac_i32_32x32x32_i8(a, b, c, idx, 0, 0);
}
// Stores __builtin_amdgcn_smfmac_f32_16x16x64_bf8_bf8(a, b, c, idx, 0, 0)
// through 'out'. Expected IR: <2 x i32> a, <4 x i32> b, <4 x float> c,
// i32 %idx, then two zero i32 immediates.
// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_bf8_bf8
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_16x16x64_bf8_bf8(global v4f* out, v2i a, v4i b, v4f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_16x16x64_bf8_bf8(a, b, c, idx, 0, 0);
}
// Stores __builtin_amdgcn_smfmac_f32_16x16x64_bf8_fp8(a, b, c, idx, 0, 0)
// through 'out'. Expected IR: <2 x i32> a, <4 x i32> b, <4 x float> c,
// i32 %idx, then two zero i32 immediates.
// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_bf8_fp8
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_16x16x64_bf8_fp8(global v4f* out, v2i a, v4i b, v4f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_16x16x64_bf8_fp8(a, b, c, idx, 0, 0);
}
// Stores __builtin_amdgcn_smfmac_f32_16x16x64_fp8_bf8(a, b, c, idx, 0, 0)
// through 'out'. Expected IR: <2 x i32> a, <4 x i32> b, <4 x float> c,
// i32 %idx, then two zero i32 immediates.
// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_fp8_bf8
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_16x16x64_fp8_bf8(global v4f* out, v2i a, v4i b, v4f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_16x16x64_fp8_bf8(a, b, c, idx, 0, 0);
}
// Stores __builtin_amdgcn_smfmac_f32_16x16x64_fp8_fp8(a, b, c, idx, 0, 0)
// through 'out'. Expected IR: <2 x i32> a, <4 x i32> b, <4 x float> c,
// i32 %idx, then two zero i32 immediates.
// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_fp8_fp8
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_16x16x64_fp8_fp8(global v4f* out, v2i a, v4i b, v4f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_16x16x64_fp8_fp8(a, b, c, idx, 0, 0);
}
// Stores __builtin_amdgcn_smfmac_f32_32x32x32_bf8_bf8(a, b, c, idx, 0, 0)
// through 'out'. Expected IR: <2 x i32> a, <4 x i32> b, <16 x float> c,
// i32 %idx, then two zero i32 immediates.
// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_bf8_bf8
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_32x32x32_bf8_bf8(global v16f* out, v2i a, v4i b, v16f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_32x32x32_bf8_bf8(a, b, c, idx, 0, 0);
}
// Stores __builtin_amdgcn_smfmac_f32_32x32x32_bf8_fp8(a, b, c, idx, 0, 0)
// through 'out'. Expected IR: <2 x i32> a, <4 x i32> b, <16 x float> c,
// i32 %idx, then two zero i32 immediates.
// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_bf8_fp8
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_32x32x32_bf8_fp8(global v16f* out, v2i a, v4i b, v16f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_32x32x32_bf8_fp8(a, b, c, idx, 0, 0);
}
// Stores __builtin_amdgcn_smfmac_f32_32x32x32_fp8_bf8(a, b, c, idx, 0, 0)
// through 'out'. Expected IR: <2 x i32> a, <4 x i32> b, <16 x float> c,
// i32 %idx, then two zero i32 immediates.
// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_fp8_bf8
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_32x32x32_fp8_bf8(global v16f* out, v2i a, v4i b, v16f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_32x32x32_fp8_bf8(a, b, c, idx, 0, 0);
}
// Stores __builtin_amdgcn_smfmac_f32_32x32x32_fp8_fp8(a, b, c, idx, 0, 0)
// through 'out'. Expected IR: <2 x i32> a, <4 x i32> b, <16 x float> c,
// i32 %idx, then two zero i32 immediates.
// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_fp8_fp8
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_32x32x32_fp8_fp8(global v16f* out, v2i a, v4i b, v16f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_32x32x32_fp8_fp8(a, b, c, idx, 0, 0);
}
409 #endif // defined(MFMA_GFX940_TESTS) || defined(MFMA_GFX950_TESTS)
411 #ifdef MFMA_GFX950_TESTS
// Returns __builtin_amdgcn_mfma_f32_16x16x32_f16(a, b, c, 1, 2, 3) by value
// instead of storing through a pointer. The three trailing immediates are
// distinct non-zero values, and the directive expects them to appear
// unchanged, in order, on a 'tail call' to the intrinsic.
// CHECK-GFX950-LABEL: @test_mfma_f32_16x16x32_f16(
// CHECK-GFX950: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %a, <8 x half> %b, <4 x float> %c, i32 1, i32 2, i32 3)

v4f test_mfma_f32_16x16x32_f16(v8h a, v8h b, v4f c)
{
  return __builtin_amdgcn_mfma_f32_16x16x32_f16(a, b, c, 1, 2, 3);
}
// Returns __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 1, 2, 3) by value.
// Expected IR: a tail call with <8 x half> a/b, <16 x float> c, and the
// immediates 1, 2, 3 in order.
// CHECK-GFX950-LABEL: @test_mfma_f32_32x32x16_f16
// CHECK-GFX950: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %a, <8 x half> %b, <16 x float> %c, i32 1, i32 2, i32 3)
v16f test_mfma_f32_32x32x16_f16(v8h a, v8h b, v16f c)
{
  return __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 1, 2, 3);
}
// Returns __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 1, 2, 3) by value.
// The operands use the __bf16-based v8bf16 typedef, which the directive
// expects to appear as <8 x bfloat> in the IR (not as an i16 vector).
// CHECK-GFX950-LABEL: @test_mfma_f32_32x32x16_bf16(
// CHECK-GFX950: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %a, <8 x bfloat> %b, <16 x float> %c, i32 1, i32 2, i32 3)
v16f test_mfma_f32_32x32x16_bf16(v8bf16 a, v8bf16 b, v16f c) {
  return __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 1, 2, 3);
}
434 #endif