[AMDGPU] New gfx940 mfma instructions
[llvm-project.git] / clang / test / CodeGenOpenCL / builtins-amdgcn-mfma.cl
blob fc29faf9ad1c5bc9e0c6d9e90e0efe81ba4a902e
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx908 -DMFMA_GFX908_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX908
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90a -DMFMA_GFX90A_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX90A
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -DMFMA_GFX940_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX940

// Each invocation above picks one target CPU and, through its -D macro,
// enables only the matching group of tests further down in this file.

// The f64 MFMA tests in this file take double operands.
#pragma OPENCL EXTENSION cl_khr_fp64:enable
// Short names for the vector types passed to and returned from the MFMA
// builtins below: v<N><t> is an N-element vector of float (f), half (h),
// int (i), short (s) or double (d).
typedef float  v2f   __attribute__((ext_vector_type(2)));
typedef float  v4f   __attribute__((ext_vector_type(4)));
typedef float  v16f  __attribute__((ext_vector_type(16)));
typedef float  v32f  __attribute__((ext_vector_type(32)));
typedef half   v4h   __attribute__((ext_vector_type(4)));
typedef half   v16h  __attribute__((ext_vector_type(16)));
typedef half   v32h  __attribute__((ext_vector_type(32)));
typedef int    v4i   __attribute__((ext_vector_type(4)));
typedef int    v16i  __attribute__((ext_vector_type(16)));
typedef int    v32i  __attribute__((ext_vector_type(32)));
typedef short  v2s   __attribute__((ext_vector_type(2)));
typedef short  v4s   __attribute__((ext_vector_type(4)));
typedef short  v16s  __attribute__((ext_vector_type(16)));
typedef short  v32s  __attribute__((ext_vector_type(32)));
typedef double v4d   __attribute__((ext_vector_type(4)));
// Tests in this group are compiled only when MFMA_GFX908_TESTS is defined,
// which the gfx908 invocation at the top of the file does.
#ifdef MFMA_GFX908_TESTS
// Expects __builtin_amdgcn_mfma_f32_32x32x1f32 to lower to a call to
// @llvm.amdgcn.mfma.f32.32x32x1f32 with a, b, c forwarded and the three
// trailing immediates all zero.
// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x1f32
// CHECK-GFX908: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %a, float %b, <32 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x1f32(global v32f* out, float a, float b, v32f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x1f32(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_16x16x1f32 to lower to a call to
// @llvm.amdgcn.mfma.f32.16x16x1f32 with a, b, c forwarded and the three
// trailing immediates all zero.
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x1f32
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float %a, float %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x1f32(global v16f* out, float a, float b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x1f32(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_4x4x1f32 to lower to a call to
// @llvm.amdgcn.mfma.f32.4x4x1f32 with a, b, c forwarded and the three
// trailing immediates all zero.
// CHECK-GFX908-LABEL: @test_mfma_f32_4x4x1f32
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float %a, float %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_4x4x1f32(global v4f* out, float a, float b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_4x4x1f32(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_32x32x2f32 to lower to a call to
// @llvm.amdgcn.mfma.f32.32x32x2f32 with a, b, c forwarded and the three
// trailing immediates all zero.
// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x2f32
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float %a, float %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x2f32(global v16f* out, float a, float b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x2f32(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_16x16x4f32 to lower to a call to
// @llvm.amdgcn.mfma.f32.16x16x4f32 with a, b, c forwarded and the three
// trailing immediates all zero.
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x4f32
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float %a, float %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x4f32(global v4f* out, float a, float b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x4f32(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_32x32x4f16 to lower to a call to
// @llvm.amdgcn.mfma.f32.32x32x4f16 with a, b, c forwarded and the three
// trailing immediates all zero.
// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x4f16
// CHECK-GFX908: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %a, <4 x half> %b, <32 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x4f16(global v32f* out, v4h a, v4h b, v32f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x4f16(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_16x16x4f16 to lower to a call to
// @llvm.amdgcn.mfma.f32.16x16x4f16 with a, b, c forwarded and the three
// trailing immediates all zero.
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x4f16
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half> %a, <4 x half> %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x4f16(global v16f* out, v4h a, v4h b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x4f16(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_4x4x4f16 to lower to a call to
// @llvm.amdgcn.mfma.f32.4x4x4f16 with a, b, c forwarded and the three
// trailing immediates all zero.
// CHECK-GFX908-LABEL: @test_mfma_f32_4x4x4f16
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half> %a, <4 x half> %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_4x4x4f16(global v4f* out, v4h a, v4h b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_4x4x4f16(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_32x32x8f16 to lower to a call to
// @llvm.amdgcn.mfma.f32.32x32x8f16 with a, b, c forwarded and the three
// trailing immediates all zero.
// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x8f16
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> %a, <4 x half> %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x8f16(global v16f* out, v4h a, v4h b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x8f16(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_16x16x16f16 to lower to a call to
// @llvm.amdgcn.mfma.f32.16x16x16f16 with a, b, c forwarded and the three
// trailing immediates all zero.
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x16f16
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %a, <4 x half> %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x16f16(global v4f* out, v4h a, v4h b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x16f16(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_i32_32x32x4i8 to lower to a call to
// @llvm.amdgcn.mfma.i32.32x32x4i8 with a, b, c forwarded and the three
// trailing immediates all zero.
// CHECK-GFX908-LABEL: @test_mfma_i32_32x32x4i8
// CHECK-GFX908: call <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32 %a, i32 %b, <32 x i32> %c, i32 0, i32 0, i32 0)
void test_mfma_i32_32x32x4i8(global v32i* out, int a, int b, v32i c)
{
  *out = __builtin_amdgcn_mfma_i32_32x32x4i8(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_i32_16x16x4i8 to lower to a call to
// @llvm.amdgcn.mfma.i32.16x16x4i8 with a, b, c forwarded and the three
// trailing immediates all zero.
// CHECK-GFX908-LABEL: @test_mfma_i32_16x16x4i8
// CHECK-GFX908: call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 %a, i32 %b, <16 x i32> %c, i32 0, i32 0, i32 0)
void test_mfma_i32_16x16x4i8(global v16i* out, int a, int b, v16i c)
{
  *out = __builtin_amdgcn_mfma_i32_16x16x4i8(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_i32_4x4x4i8 to lower to a call to
// @llvm.amdgcn.mfma.i32.4x4x4i8 with a, b, c forwarded and the three
// trailing immediates all zero.
// CHECK-GFX908-LABEL: @test_mfma_i32_4x4x4i8
// CHECK-GFX908: call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 %a, i32 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
void test_mfma_i32_4x4x4i8(global v4i* out, int a, int b, v4i c)
{
  *out = __builtin_amdgcn_mfma_i32_4x4x4i8(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_i32_32x32x8i8 to lower to a call to
// @llvm.amdgcn.mfma.i32.32x32x8i8 with a, b, c forwarded and the three
// trailing immediates all zero.
// CHECK-GFX908-LABEL: @test_mfma_i32_32x32x8i8
// CHECK-GFX908: call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 %a, i32 %b, <16 x i32> %c, i32 0, i32 0, i32 0)
void test_mfma_i32_32x32x8i8(global v16i* out, int a, int b, v16i c)
{
  *out = __builtin_amdgcn_mfma_i32_32x32x8i8(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_i32_16x16x16i8 to lower to a call to
// @llvm.amdgcn.mfma.i32.16x16x16i8 with a, b, c forwarded and the three
// trailing immediates all zero.
// CHECK-GFX908-LABEL: @test_mfma_i32_16x16x16i8
// CHECK-GFX908: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 %a, i32 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
void test_mfma_i32_16x16x16i8(global v4i* out, int a, int b, v4i c)
{
  *out = __builtin_amdgcn_mfma_i32_16x16x16i8(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_32x32x2bf16 to lower to a call to
// @llvm.amdgcn.mfma.f32.32x32x2bf16 with a, b, c forwarded and the three
// trailing immediates all zero.
// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x2bf16
// CHECK-GFX908: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x2bf16(global v32f* out, v2s a, v2s b, v32f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x2bf16(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_16x16x2bf16 to lower to a call to
// @llvm.amdgcn.mfma.f32.16x16x2bf16 with a, b, c forwarded and the three
// trailing immediates all zero.
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x2bf16
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x2bf16(global v16f* out, v2s a, v2s b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x2bf16(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_4x4x2bf16 to lower to a call to
// @llvm.amdgcn.mfma.f32.4x4x2bf16 with a, b, c forwarded and the three
// trailing immediates all zero.
// CHECK-GFX908-LABEL: @test_mfma_f32_4x4x2bf16
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_4x4x2bf16(global v4f* out, v2s a, v2s b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_4x4x2bf16(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_32x32x4bf16 to lower to a call to
// @llvm.amdgcn.mfma.f32.32x32x4bf16 with a, b, c forwarded and the three
// trailing immediates all zero.
// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x4bf16
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x4bf16(global v16f* out, v2s a, v2s b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x4bf16(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_16x16x8bf16 to lower to a call to
// @llvm.amdgcn.mfma.f32.16x16x8bf16 with a, b, c forwarded and the three
// trailing immediates all zero.
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x8bf16
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x8bf16(global v4f* out, v2s a, v2s b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x8bf16(a, b, c, 0, 0, 0);
}
#endif // MFMA_GFX908_TESTS

// Tests in this group are compiled only when MFMA_GFX90A_TESTS is defined,
// which the gfx90a invocation at the top of the file does.
#ifdef MFMA_GFX90A_TESTS
// Expects __builtin_amdgcn_mfma_f32_32x32x4bf16_1k to lower to a call to
// @llvm.amdgcn.mfma.f32.32x32x4bf16.1k with a, b, c forwarded and the three
// trailing immediates all zero.
// CHECK-GFX90A-LABEL: @test_mfma_f32_32x32x4bf16_1k
// CHECK-GFX90A: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <32 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x4bf16_1k(global v32f* out, v4s a, v4s b, v32f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x4bf16_1k(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_16x16x4bf16_1k to lower to a call to
// @llvm.amdgcn.mfma.f32.16x16x4bf16.1k with a, b, c forwarded and the three
// trailing immediates all zero.
// CHECK-GFX90A-LABEL: @test_mfma_f32_16x16x4bf16_1k
// CHECK-GFX90A: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x4bf16_1k(global v16f* out, v4s a, v4s b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x4bf16_1k(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_4x4x4bf16_1k to lower to a call to
// @llvm.amdgcn.mfma.f32.4x4x4bf16.1k with a, b, c forwarded and the three
// trailing immediates all zero.
// CHECK-GFX90A-LABEL: @test_mfma_f32_4x4x4bf16_1k
// CHECK-GFX90A: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_4x4x4bf16_1k(global v4f* out, v4s a, v4s b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_4x4x4bf16_1k(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_32x32x8bf16_1k to lower to a call to
// @llvm.amdgcn.mfma.f32.32x32x8bf16.1k with a, b, c forwarded and the three
// trailing immediates all zero.
// CHECK-GFX90A-LABEL: @test_mfma_f32_32x32x8bf16_1k
// CHECK-GFX90A: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x8bf16_1k(global v16f* out, v4s a, v4s b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_16x16x16bf16_1k to lower to a call to
// @llvm.amdgcn.mfma.f32.16x16x16bf16.1k with a, b, c forwarded and the three
// trailing immediates all zero.
// CHECK-GFX90A-LABEL: @test_mfma_f32_16x16x16bf16_1k
// CHECK-GFX90A: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x16bf16_1k(global v4f* out, v4s a, v4s b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f64_16x16x4f64 to lower to a call to
// @llvm.amdgcn.mfma.f64.16x16x4f64 with a, b, c forwarded and the three
// trailing immediates all zero.
// CHECK-GFX90A-LABEL: @test_mfma_f64_16x16x4f64
// CHECK-GFX90A: call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %c, i32 0, i32 0, i32 0)
void test_mfma_f64_16x16x4f64(global v4d* out, double a, double b, v4d c)
{
  *out = __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f64_4x4x4f64 to lower to a call to
// @llvm.amdgcn.mfma.f64.4x4x4f64 with a, b, c forwarded and the three
// trailing immediates all zero. Unlike the other tests here, the accumulator
// and result are scalar doubles rather than vectors.
// CHECK-GFX90A-LABEL: @test_mfma_f64_4x4x4f64
// CHECK-GFX90A: call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double %c, i32 0, i32 0, i32 0)
void test_mfma_f64_4x4x4f64(global double* out, double a, double b, double c)
{
  *out = __builtin_amdgcn_mfma_f64_4x4x4f64(a, b, c, 0, 0, 0);
}
#endif // MFMA_GFX90A_TESTS

// Tests in this group are compiled only when MFMA_GFX940_TESTS is defined,
// which the gfx940 invocation at the top of the file does.
#ifdef MFMA_GFX940_TESTS
// Expects __builtin_amdgcn_mfma_i32_16x16x32_i8 to lower to a call to
// @llvm.amdgcn.mfma.i32.16x16x32.i8 with a, b (passed as i64), c forwarded
// and the three trailing immediates all zero.
// CHECK-GFX940-LABEL: @test_mfma_i32_16x16x32_i8
// CHECK-GFX940: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 %a, i64 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
void test_mfma_i32_16x16x32_i8(global v4i* out, long a, long b, v4i c)
{
  *out = __builtin_amdgcn_mfma_i32_16x16x32_i8(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_i32_32x32x16_i8 to lower to a call to
// @llvm.amdgcn.mfma.i32.32x32x16.i8 with a, b (passed as i64), c forwarded
// and the three trailing immediates all zero.
// CHECK-GFX940-LABEL: @test_mfma_i32_32x32x16_i8
// CHECK-GFX940: call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64 %a, i64 %b, <16 x i32> %c, i32 0, i32 0, i32 0)
void test_mfma_i32_32x32x16_i8(global v16i* out, long a, long b, v16i c)
{
  *out = __builtin_amdgcn_mfma_i32_32x32x16_i8(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_16x16x8_xf32 to lower to a call to
// @llvm.amdgcn.mfma.f32.16x16x8.xf32 with a, b, c forwarded and the three
// trailing immediates all zero.
// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x8_xf32
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> %a, <2 x float> %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x8_xf32(global v4f* out, v2f a, v2f b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x8_xf32(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_32x32x4_xf32 to lower to a call to
// @llvm.amdgcn.mfma.f32.32x32x4.xf32 with a, b, c forwarded and the three
// trailing immediates all zero.
// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x4_xf32
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> %a, <2 x float> %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x4_xf32(global v16f* out, v2f a, v2f b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x4_xf32(a, b, c, 0, 0, 0);
}
#endif // MFMA_GFX940_TESTS