// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx908 -DMFMA_GFX908_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX908
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90a -DMFMA_GFX90A_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX90A
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -DMFMA_GFX940_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX940
// Switch on the fp64 extension for the whole file; the f64 tests further down
// take double operands.
#pragma OPENCL EXTENSION cl_khr_fp64 : enable

// Short aliases for the clang extended-vector types used as builtin operands
// and results. Names follow v<lanes><element>, where the element letter is
// f = float, h = half, i = int, s = short, d = double.

// float vectors
typedef float v2f __attribute__((ext_vector_type(2)));
typedef float v4f __attribute__((ext_vector_type(4)));
typedef float v16f __attribute__((ext_vector_type(16)));
typedef float v32f __attribute__((ext_vector_type(32)));

// half vectors
typedef half v4h __attribute__((ext_vector_type(4)));
typedef half v16h __attribute__((ext_vector_type(16)));
typedef half v32h __attribute__((ext_vector_type(32)));

// int vectors
typedef int v4i __attribute__((ext_vector_type(4)));
typedef int v16i __attribute__((ext_vector_type(16)));
typedef int v32i __attribute__((ext_vector_type(32)));

// short vectors
typedef short v2s __attribute__((ext_vector_type(2)));
typedef short v4s __attribute__((ext_vector_type(4)));
typedef short v16s __attribute__((ext_vector_type(16)));
typedef short v32s __attribute__((ext_vector_type(32)));

// double vectors
typedef double v4d __attribute__((ext_vector_type(4)));
25 #ifdef MFMA_GFX908_TESTS
// Expects __builtin_amdgcn_mfma_f32_32x32x1f32 to become one call to the
// matching llvm.amdgcn.mfma intrinsic: scalar float a/b and the v32f c are
// forwarded unchanged, followed by three zero i32 immediates. The v32f result
// is written through out.
// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x1f32
// CHECK-GFX908: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %a, float %b, <32 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x1f32(global v32f *out, float a, float b, v32f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x1f32(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_16x16x1f32 to become one call to the
// matching llvm.amdgcn.mfma intrinsic: scalar float a/b and the v16f c are
// forwarded unchanged, followed by three zero i32 immediates. The v16f result
// is written through out.
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x1f32
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float %a, float %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x1f32(global v16f *out, float a, float b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x1f32(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_4x4x1f32 to become one call to the
// matching llvm.amdgcn.mfma intrinsic: scalar float a/b and the v4f c are
// forwarded unchanged, followed by three zero i32 immediates. The v4f result
// is written through out.
// CHECK-GFX908-LABEL: @test_mfma_f32_4x4x1f32
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float %a, float %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_4x4x1f32(global v4f *out, float a, float b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_4x4x1f32(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_32x32x2f32 to become one call to the
// matching llvm.amdgcn.mfma intrinsic: scalar float a/b and the v16f c are
// forwarded unchanged, followed by three zero i32 immediates. The v16f result
// is written through out.
// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x2f32
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float %a, float %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x2f32(global v16f *out, float a, float b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x2f32(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_16x16x4f32 to become one call to the
// matching llvm.amdgcn.mfma intrinsic: scalar float a/b and the v4f c are
// forwarded unchanged, followed by three zero i32 immediates. The v4f result
// is written through out.
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x4f32
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float %a, float %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x4f32(global v4f *out, float a, float b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x4f32(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_32x32x4f16 to become one call to the
// matching llvm.amdgcn.mfma intrinsic: the v4h a/b (seen as <4 x half>) and
// the v32f c are forwarded unchanged, followed by three zero i32 immediates.
// The v32f result is written through out.
// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x4f16
// CHECK-GFX908: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %a, <4 x half> %b, <32 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x4f16(global v32f *out, v4h a, v4h b, v32f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x4f16(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_16x16x4f16 to become one call to the
// matching llvm.amdgcn.mfma intrinsic: the v4h a/b (seen as <4 x half>) and
// the v16f c are forwarded unchanged, followed by three zero i32 immediates.
// The v16f result is written through out.
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x4f16
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half> %a, <4 x half> %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x4f16(global v16f *out, v4h a, v4h b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x4f16(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_4x4x4f16 to become one call to the
// matching llvm.amdgcn.mfma intrinsic: the v4h a/b (seen as <4 x half>) and
// the v4f c are forwarded unchanged, followed by three zero i32 immediates.
// The v4f result is written through out.
// CHECK-GFX908-LABEL: @test_mfma_f32_4x4x4f16
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half> %a, <4 x half> %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_4x4x4f16(global v4f *out, v4h a, v4h b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_4x4x4f16(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_32x32x8f16 to become one call to the
// matching llvm.amdgcn.mfma intrinsic: the v4h a/b (seen as <4 x half>) and
// the v16f c are forwarded unchanged, followed by three zero i32 immediates.
// The v16f result is written through out.
// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x8f16
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> %a, <4 x half> %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x8f16(global v16f *out, v4h a, v4h b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x8f16(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_16x16x16f16 to become one call to the
// matching llvm.amdgcn.mfma intrinsic: the v4h a/b (seen as <4 x half>) and
// the v4f c are forwarded unchanged, followed by three zero i32 immediates.
// The v4f result is written through out.
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x16f16
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %a, <4 x half> %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x16f16(global v4f *out, v4h a, v4h b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x16f16(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_i32_32x32x4i8 to become one call to the
// matching llvm.amdgcn.mfma intrinsic: int a/b (seen as i32) and the v32i c
// are forwarded unchanged, followed by three zero i32 immediates. The v32i
// result is written through out.
// CHECK-GFX908-LABEL: @test_mfma_i32_32x32x4i8
// CHECK-GFX908: call <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32 %a, i32 %b, <32 x i32> %c, i32 0, i32 0, i32 0)
void test_mfma_i32_32x32x4i8(global v32i *out, int a, int b, v32i c)
{
  *out = __builtin_amdgcn_mfma_i32_32x32x4i8(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_i32_16x16x4i8 to become one call to the
// matching llvm.amdgcn.mfma intrinsic: int a/b (seen as i32) and the v16i c
// are forwarded unchanged, followed by three zero i32 immediates. The v16i
// result is written through out.
// CHECK-GFX908-LABEL: @test_mfma_i32_16x16x4i8
// CHECK-GFX908: call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 %a, i32 %b, <16 x i32> %c, i32 0, i32 0, i32 0)
void test_mfma_i32_16x16x4i8(global v16i *out, int a, int b, v16i c)
{
  *out = __builtin_amdgcn_mfma_i32_16x16x4i8(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_i32_4x4x4i8 to become one call to the
// matching llvm.amdgcn.mfma intrinsic: int a/b (seen as i32) and the v4i c
// are forwarded unchanged, followed by three zero i32 immediates. The v4i
// result is written through out.
// CHECK-GFX908-LABEL: @test_mfma_i32_4x4x4i8
// CHECK-GFX908: call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 %a, i32 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
void test_mfma_i32_4x4x4i8(global v4i *out, int a, int b, v4i c)
{
  *out = __builtin_amdgcn_mfma_i32_4x4x4i8(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_i32_32x32x8i8 to become one call to the
// matching llvm.amdgcn.mfma intrinsic: int a/b (seen as i32) and the v16i c
// are forwarded unchanged, followed by three zero i32 immediates. The v16i
// result is written through out.
// CHECK-GFX908-LABEL: @test_mfma_i32_32x32x8i8
// CHECK-GFX908: call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 %a, i32 %b, <16 x i32> %c, i32 0, i32 0, i32 0)
void test_mfma_i32_32x32x8i8(global v16i *out, int a, int b, v16i c)
{
  *out = __builtin_amdgcn_mfma_i32_32x32x8i8(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_i32_16x16x16i8 to become one call to the
// matching llvm.amdgcn.mfma intrinsic: int a/b (seen as i32) and the v4i c
// are forwarded unchanged, followed by three zero i32 immediates. The v4i
// result is written through out.
// CHECK-GFX908-LABEL: @test_mfma_i32_16x16x16i8
// CHECK-GFX908: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 %a, i32 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
void test_mfma_i32_16x16x16i8(global v4i *out, int a, int b, v4i c)
{
  *out = __builtin_amdgcn_mfma_i32_16x16x16i8(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_32x32x2bf16 to become one call to the
// matching llvm.amdgcn.mfma intrinsic: the v2s a/b (seen as <2 x i16>) and
// the v32f c are forwarded unchanged, followed by three zero i32 immediates.
// The v32f result is written through out.
// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x2bf16
// CHECK-GFX908: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x2bf16(global v32f *out, v2s a, v2s b, v32f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x2bf16(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_16x16x2bf16 to become one call to the
// matching llvm.amdgcn.mfma intrinsic: the v2s a/b (seen as <2 x i16>) and
// the v16f c are forwarded unchanged, followed by three zero i32 immediates.
// The v16f result is written through out.
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x2bf16
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x2bf16(global v16f *out, v2s a, v2s b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x2bf16(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_4x4x2bf16 to become one call to the
// matching llvm.amdgcn.mfma intrinsic: the v2s a/b (seen as <2 x i16>) and
// the v4f c are forwarded unchanged, followed by three zero i32 immediates.
// The v4f result is written through out.
// CHECK-GFX908-LABEL: @test_mfma_f32_4x4x2bf16
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_4x4x2bf16(global v4f *out, v2s a, v2s b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_4x4x2bf16(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_32x32x4bf16 to become one call to the
// matching llvm.amdgcn.mfma intrinsic: the v2s a/b (seen as <2 x i16>) and
// the v16f c are forwarded unchanged, followed by three zero i32 immediates.
// The v16f result is written through out.
// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x4bf16
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x4bf16(global v16f *out, v2s a, v2s b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x4bf16(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_16x16x8bf16 to become one call to the
// matching llvm.amdgcn.mfma intrinsic: the v2s a/b (seen as <2 x i16>) and
// the v4f c are forwarded unchanged, followed by three zero i32 immediates.
// The v4f result is written through out.
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x8bf16
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x8bf16(global v4f *out, v2s a, v2s b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x8bf16(a, b, c, 0, 0, 0);
}
167 #endif
// MFMA_GFX908_TESTS
169 #ifdef MFMA_GFX90A_TESTS
// Expects __builtin_amdgcn_mfma_f32_32x32x4bf16_1k to become one call to the
// ".1k"-suffixed llvm.amdgcn.mfma intrinsic: the v4s a/b (seen as <4 x i16>)
// and the v32f c are forwarded unchanged, followed by three zero i32
// immediates. The v32f result is written through out.
// CHECK-GFX90A-LABEL: @test_mfma_f32_32x32x4bf16_1k
// CHECK-GFX90A: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <32 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x4bf16_1k(global v32f *out, v4s a, v4s b, v32f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x4bf16_1k(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_16x16x4bf16_1k to become one call to the
// ".1k"-suffixed llvm.amdgcn.mfma intrinsic: the v4s a/b (seen as <4 x i16>)
// and the v16f c are forwarded unchanged, followed by three zero i32
// immediates. The v16f result is written through out.
// CHECK-GFX90A-LABEL: @test_mfma_f32_16x16x4bf16_1k
// CHECK-GFX90A: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x4bf16_1k(global v16f *out, v4s a, v4s b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x4bf16_1k(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_4x4x4bf16_1k to become one call to the
// ".1k"-suffixed llvm.amdgcn.mfma intrinsic: the v4s a/b (seen as <4 x i16>)
// and the v4f c are forwarded unchanged, followed by three zero i32
// immediates. The v4f result is written through out.
// CHECK-GFX90A-LABEL: @test_mfma_f32_4x4x4bf16_1k
// CHECK-GFX90A: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_4x4x4bf16_1k(global v4f *out, v4s a, v4s b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_4x4x4bf16_1k(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_32x32x8bf16_1k to become one call to the
// ".1k"-suffixed llvm.amdgcn.mfma intrinsic: the v4s a/b (seen as <4 x i16>)
// and the v16f c are forwarded unchanged, followed by three zero i32
// immediates. The v16f result is written through out.
// CHECK-GFX90A-LABEL: @test_mfma_f32_32x32x8bf16_1k
// CHECK-GFX90A: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x8bf16_1k(global v16f *out, v4s a, v4s b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_16x16x16bf16_1k to become one call to the
// ".1k"-suffixed llvm.amdgcn.mfma intrinsic: the v4s a/b (seen as <4 x i16>)
// and the v4f c are forwarded unchanged, followed by three zero i32
// immediates. The v4f result is written through out.
// CHECK-GFX90A-LABEL: @test_mfma_f32_16x16x16bf16_1k
// CHECK-GFX90A: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x16bf16_1k(global v4f *out, v4s a, v4s b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f64_16x16x4f64 to become one call to the
// matching llvm.amdgcn.mfma intrinsic: scalar double a/b and the v4d c are
// forwarded unchanged, followed by three zero i32 immediates. The v4d result
// is written through out.
// CHECK-GFX90A-LABEL: @test_mfma_f64_16x16x4f64
// CHECK-GFX90A: call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %c, i32 0, i32 0, i32 0)
void test_mfma_f64_16x16x4f64(global v4d *out, double a, double b, v4d c)
{
  *out = __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f64_4x4x4f64 to become one call to the
// matching llvm.amdgcn.mfma intrinsic. Unlike the other cases in this file,
// all three operands and the result are scalar doubles; the call still ends
// with three zero i32 immediates. The result is written through out.
// CHECK-GFX90A-LABEL: @test_mfma_f64_4x4x4f64
// CHECK-GFX90A: call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double %c, i32 0, i32 0, i32 0)
void test_mfma_f64_4x4x4f64(global double *out, double a, double b, double c)
{
  *out = __builtin_amdgcn_mfma_f64_4x4x4f64(a, b, c, 0, 0, 0);
}
220 #endif
// MFMA_GFX90A_TESTS
222 #ifdef MFMA_GFX940_TESTS
// Expects __builtin_amdgcn_mfma_i32_16x16x32_i8 to become one call to
// llvm.amdgcn.mfma.i32.16x16x32.i8: long a/b (seen as i64) and the v4i c are
// forwarded unchanged, followed by three zero i32 immediates. The v4i result
// is written through out.
// CHECK-GFX940-LABEL: @test_mfma_i32_16x16x32_i8
// CHECK-GFX940: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 %a, i64 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
void test_mfma_i32_16x16x32_i8(global v4i *out, long a, long b, v4i c)
{
  *out = __builtin_amdgcn_mfma_i32_16x16x32_i8(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_i32_32x32x16_i8 to become one call to
// llvm.amdgcn.mfma.i32.32x32x16.i8: long a/b (seen as i64) and the v16i c are
// forwarded unchanged, followed by three zero i32 immediates. The v16i result
// is written through out.
// CHECK-GFX940-LABEL: @test_mfma_i32_32x32x16_i8
// CHECK-GFX940: call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64 %a, i64 %b, <16 x i32> %c, i32 0, i32 0, i32 0)
void test_mfma_i32_32x32x16_i8(global v16i *out, long a, long b, v16i c)
{
  *out = __builtin_amdgcn_mfma_i32_32x32x16_i8(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_16x16x8_xf32 to become one call to
// llvm.amdgcn.mfma.f32.16x16x8.xf32: the v2f a/b (seen as <2 x float>) and
// the v4f c are forwarded unchanged, followed by three zero i32 immediates.
// The v4f result is written through out.
// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x8_xf32
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> %a, <2 x float> %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x8_xf32(global v4f *out, v2f a, v2f b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x8_xf32(a, b, c, 0, 0, 0);
}
// Expects __builtin_amdgcn_mfma_f32_32x32x4_xf32 to become one call to
// llvm.amdgcn.mfma.f32.32x32x4.xf32: the v2f a/b (seen as <2 x float>) and
// the v16f c are forwarded unchanged, followed by three zero i32 immediates.
// The v16f result is written through out.
// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x4_xf32
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> %a, <2 x float> %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x4_xf32(global v16f *out, v2f a, v2f b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x4_xf32(a, b, c, 0, 0, 0);
}
250 #endif
// MFMA_GFX940_TESTS