Run DCE after a LoopFlatten test to reduce spurious output [nfc]
[llvm-project.git] / clang / test / CodeGen / aarch64-neon-vcmla.c
blob9cd046d63d1b520bcd2b5a282a5a31d88834d74a
1 // RUN: %clang_cc1 -triple arm64-apple-ios -target-feature +neon \
2 // RUN: -target-feature +v8.3a \
3 // RUN: -target-feature +fullfp16 \
4 // RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -O1 | FileCheck %s
6 // REQUIRES: aarch64-registered-target
8 #include <arm_neon.h>
10 // CHECK-LABEL: @test_vcmla_f16(
11 // CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %rhs)
12 // CHECK: ret <4 x half> [[RES]]
13 float16x4_t test_vcmla_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
14 return vcmla_f16(acc, lhs, rhs);
17 // CHECK-LABEL: @test_vcmla_f32(
18 // CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
19 // CHECK: ret <2 x float> [[RES]]
20 float32x2_t test_vcmla_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
21 return vcmla_f32(acc, lhs, rhs);
24 // CHECK-LABEL: @test_vcmlaq_f16(
25 // CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> %rhs)
26 // CHECK: ret <8 x half> [[RES]]
27 float16x8_t test_vcmlaq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
28 return vcmlaq_f16(acc, lhs, rhs);
31 // CHECK-LABEL: @test_vcmlaq_f32(
32 // CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> %rhs)
33 // CHECK: ret <4 x float> [[RES]]
34 float32x4_t test_vcmlaq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
35 return vcmlaq_f32(acc, lhs, rhs);
38 // CHECK-LABEL: @test_vcmlaq_f64(
39 // CHECK: [[RES:%.*]] = tail call <2 x double> @llvm.aarch64.neon.vcmla.rot0.v2f64(<2 x double> %acc, <2 x double> %lhs, <2 x double> %rhs)
40 // CHECK: ret <2 x double> [[RES]]
41 float64x2_t test_vcmlaq_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) {
42 return vcmlaq_f64(acc, lhs, rhs);
45 // CHECK-LABEL: @test_vcmla_rot90_f16(
46 // CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %rhs)
47 // CHECK: ret <4 x half> [[RES]]
48 float16x4_t test_vcmla_rot90_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
49 return vcmla_rot90_f16(acc, lhs, rhs);
52 // CHECK-LABEL: @test_vcmla_rot90_f32(
53 // CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
54 // CHECK: ret <2 x float> [[RES]]
55 float32x2_t test_vcmla_rot90_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
56 return vcmla_rot90_f32(acc, lhs, rhs);
59 // CHECK-LABEL: @test_vcmlaq_rot90_f16(
60 // CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> %rhs)
61 // CHECK: ret <8 x half> [[RES]]
62 float16x8_t test_vcmlaq_rot90_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
63 return vcmlaq_rot90_f16(acc, lhs, rhs);
66 // CHECK-LABEL: @test_vcmlaq_rot90_f32(
67 // CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> %rhs)
68 // CHECK: ret <4 x float> [[RES]]
69 float32x4_t test_vcmlaq_rot90_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
70 return vcmlaq_rot90_f32(acc, lhs, rhs);
73 // CHECK-LABEL: @test_vcmlaq_rot90_f64(
74 // CHECK: [[RES:%.*]] = tail call <2 x double> @llvm.aarch64.neon.vcmla.rot90.v2f64(<2 x double> %acc, <2 x double> %lhs, <2 x double> %rhs)
75 // CHECK: ret <2 x double> [[RES]]
76 float64x2_t test_vcmlaq_rot90_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) {
77 return vcmlaq_rot90_f64(acc, lhs, rhs);
80 // CHECK-LABEL: @test_vcmla_rot180_f16(
81 // CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %rhs)
82 // CHECK: ret <4 x half> [[RES]]
83 float16x4_t test_vcmla_rot180_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
84 return vcmla_rot180_f16(acc, lhs, rhs);
87 // CHECK-LABEL: @test_vcmla_rot180_f32(
88 // CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
89 // CHECK: ret <2 x float> [[RES]]
90 float32x2_t test_vcmla_rot180_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
91 return vcmla_rot180_f32(acc, lhs, rhs);
94 // CHECK-LABEL: @test_vcmlaq_rot180_f16(
95 // CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> %rhs)
96 // CHECK: ret <8 x half> [[RES]]
97 float16x8_t test_vcmlaq_rot180_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
98 return vcmlaq_rot180_f16(acc, lhs, rhs);
101 // CHECK-LABEL: @test_vcmlaq_rot180_f32(
102 // CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> %rhs)
103 // CHECK: ret <4 x float> [[RES]]
104 float32x4_t test_vcmlaq_rot180_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
105 return vcmlaq_rot180_f32(acc, lhs, rhs);
108 // CHECK-LABEL: @test_vcmlaq_rot180_f64(
109 // CHECK: [[RES:%.*]] = tail call <2 x double> @llvm.aarch64.neon.vcmla.rot180.v2f64(<2 x double> %acc, <2 x double> %lhs, <2 x double> %rhs)
110 // CHECK: ret <2 x double> [[RES]]
111 float64x2_t test_vcmlaq_rot180_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) {
112 return vcmlaq_rot180_f64(acc, lhs, rhs);
115 // CHECK-LABEL: @test_vcmla_rot270_f16(
116 // CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %rhs)
117 // CHECK: ret <4 x half> [[RES]]
118 float16x4_t test_vcmla_rot270_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
119 return vcmla_rot270_f16(acc, lhs, rhs);
122 // CHECK-LABEL: @test_vcmla_rot270_f32(
123 // CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
124 // CHECK: ret <2 x float> [[RES]]
125 float32x2_t test_vcmla_rot270_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
126 return vcmla_rot270_f32(acc, lhs, rhs);
129 // CHECK-LABEL: @test_vcmlaq_rot270_f16(
130 // CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> %rhs)
131 // CHECK: ret <8 x half> [[RES]]
132 float16x8_t test_vcmlaq_rot270_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
133 return vcmlaq_rot270_f16(acc, lhs, rhs);
136 // CHECK-LABEL: @test_vcmlaq_rot270_f32(
137 // CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> %rhs)
138 // CHECK: ret <4 x float> [[RES]]
139 float32x4_t test_vcmlaq_rot270_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
140 return vcmlaq_rot270_f32(acc, lhs, rhs);
143 // CHECK-LABEL: @test_vcmlaq_rot270_f64(
144 // CHECK: [[RES:%.*]] = tail call <2 x double> @llvm.aarch64.neon.vcmla.rot270.v2f64(<2 x double> %acc, <2 x double> %lhs, <2 x double> %rhs)
145 // CHECK: ret <2 x double> [[RES]]
146 float64x2_t test_vcmlaq_rot270_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) {
147 return vcmlaq_rot270_f64(acc, lhs, rhs);
150 // CHECK-LABEL: @test_vcmla_lane_f16(
151 // CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
152 // CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP]])
153 // CHECK: ret <4 x half> [[RES]]
154 float16x4_t test_vcmla_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
155 return vcmla_lane_f16(acc, lhs, rhs, 1);
158 // ACLE says this exists, but it won't map to a single instruction if lane > 1.
159 // CHECK-LABEL: @test_vcmla_laneq_f16(
160 // CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
161 // CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <2 x i32> <i32 3, i32 3>
162 // CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
163 // CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
164 // CHECK: ret <4 x half> [[RES]]
165 float16x4_t test_vcmla_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
166 return vcmla_laneq_f16(acc, lhs, rhs, 3);
169 // CHECK-LABEL: @test_vcmlaq_lane_f16(
170 // CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
171 // CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP]])
172 // CHECK: ret <8 x half> [[RES]]
173 float16x8_t test_vcmlaq_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t rhs) {
174 return vcmlaq_lane_f16(acc, lhs, rhs, 1);
177 // CHECK-LABEL: @test_vcmlaq_laneq_f16(
178 // CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
179 // CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
180 // CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
181 // CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
182 // CHECK: ret <8 x half> [[RES]]
183 float16x8_t test_vcmlaq_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
184 return vcmlaq_laneq_f16(acc, lhs, rhs, 3);
187 // CHECK-LABEL: @test_vcmla_lane_f32(
188 // CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
189 // CHECK: ret <2 x float> [[RES]]
190 float32x2_t test_vcmla_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
191 return vcmla_lane_f32(acc, lhs, rhs, 0);
194 // ACLE says this exists, but it won't map to a single instruction if lane > 1.
195 // CHECK-LABEL: @test_vcmla_laneq_f32(
196 // CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
197 // CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <1 x i32> <i32 1>
198 // CHECK: [[DUP_FLT:%.*]] = bitcast <1 x i64> [[DUP]] to <2 x float>
199 // CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> [[DUP_FLT]])
200 // CHECK: ret <2 x float> [[RES]]
201 float32x2_t test_vcmla_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) {
202 return vcmla_laneq_f32(acc, lhs, rhs, 1);
205 // CHECK-LABEL: @test_vcmlaq_lane_f32(
206 // CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
207 // CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> undef, i64 [[CPLX]], i64 0
208 // CHECK: [[CPLX2:%.*]] = bitcast <2 x i64> [[CPLX_VEC]] to <4 x float>
209 // CHECK: [[DUP:%.*]] = shufflevector <4 x float> [[CPLX2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
210 // CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
211 // CHECK: ret <4 x float> [[RES]]
212 float32x4_t test_vcmlaq_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
213 return vcmlaq_lane_f32(acc, lhs, rhs, 0);
216 // CHECK-LABEL: @test_vcmlaq_laneq_f32(
217 // CHECK: [[DUP:%.*]] = shufflevector <4 x float> %rhs, <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
218 // CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
219 // CHECK: ret <4 x float> [[RES]]
220 float32x4_t test_vcmlaq_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
221 return vcmlaq_laneq_f32(acc, lhs, rhs, 1);
224 // CHECK-LABEL: @test_vcmla_rot90_lane_f16(
225 // CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
226 // CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP]])
227 // CHECK: ret <4 x half> [[RES]]
228 float16x4_t test_vcmla_rot90_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
229 return vcmla_rot90_lane_f16(acc, lhs, rhs, 1);
232 // ACLE says this exists, but it won't map to a single instruction if lane > 1.
233 // CHECK-LABEL: @test_vcmla_rot90_laneq_f16(
234 // CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
235 // CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <2 x i32> <i32 3, i32 3>
236 // CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
237 // CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
238 // CHECK: ret <4 x half> [[RES]]
239 float16x4_t test_vcmla_rot90_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
240 return vcmla_rot90_laneq_f16(acc, lhs, rhs, 3);
243 // CHECK-LABEL: @test_vcmlaq_rot90_lane_f16(
244 // CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
245 // CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP]])
246 // CHECK: ret <8 x half> [[RES]]
247 float16x8_t test_vcmlaq_rot90_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t rhs) {
248 return vcmlaq_rot90_lane_f16(acc, lhs, rhs, 1);
251 // CHECK-LABEL: @test_vcmlaq_rot90_laneq_f16(
252 // CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
253 // CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
254 // CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
255 // CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
256 // CHECK: ret <8 x half> [[RES]]
257 float16x8_t test_vcmlaq_rot90_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
258 return vcmlaq_rot90_laneq_f16(acc, lhs, rhs, 3);
261 // CHECK-LABEL: @test_vcmla_rot90_lane_f32(
262 // CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
263 // CHECK: ret <2 x float> [[RES]]
264 float32x2_t test_vcmla_rot90_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
265 return vcmla_rot90_lane_f32(acc, lhs, rhs, 0);
268 // ACLE says this exists, but it won't map to a single instruction if lane > 1.
269 // CHECK-LABEL: @test_vcmla_rot90_laneq_f32(
270 // CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
271 // CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <1 x i32> <i32 1>
272 // CHECK: [[DUP_FLT:%.*]] = bitcast <1 x i64> [[DUP]] to <2 x float>
273 // CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> [[DUP_FLT]])
274 // CHECK: ret <2 x float> [[RES]]
275 float32x2_t test_vcmla_rot90_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) {
276 return vcmla_rot90_laneq_f32(acc, lhs, rhs, 1);
279 // CHECK-LABEL: @test_vcmlaq_rot90_lane_f32(
280 // CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
281 // CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> undef, i64 [[CPLX]], i64 0
282 // CHECK: [[CPLX2:%.*]] = bitcast <2 x i64> [[CPLX_VEC]] to <4 x float>
283 // CHECK: [[DUP:%.*]] = shufflevector <4 x float> [[CPLX2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
284 // CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
285 // CHECK: ret <4 x float> [[RES]]
286 float32x4_t test_vcmlaq_rot90_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
287 return vcmlaq_rot90_lane_f32(acc, lhs, rhs, 0);
290 // CHECK-LABEL: @test_vcmlaq_rot90_laneq_f32(
291 // CHECK: [[DUP:%.*]] = shufflevector <4 x float> %rhs, <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
292 // CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
293 // CHECK: ret <4 x float> [[RES]]
294 float32x4_t test_vcmlaq_rot90_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
295 return vcmlaq_rot90_laneq_f32(acc, lhs, rhs, 1);
298 // CHECK-LABEL: @test_vcmla_rot180_lane_f16(
299 // CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
300 // CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP]])
301 // CHECK: ret <4 x half> [[RES]]
302 float16x4_t test_vcmla_rot180_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
303 return vcmla_rot180_lane_f16(acc, lhs, rhs, 1);
306 // ACLE says this exists, but it won't map to a single instruction if lane > 1.
307 // CHECK-LABEL: @test_vcmla_rot180_laneq_f16(
308 // CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
309 // CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <2 x i32> <i32 3, i32 3>
310 // CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
311 // CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
312 // CHECK: ret <4 x half> [[RES]]
313 float16x4_t test_vcmla_rot180_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
314 return vcmla_rot180_laneq_f16(acc, lhs, rhs, 3);
317 // CHECK-LABEL: @test_vcmlaq_rot180_lane_f16(
318 // CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
319 // CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP]])
320 // CHECK: ret <8 x half> [[RES]]
321 float16x8_t test_vcmlaq_rot180_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t rhs) {
322 return vcmlaq_rot180_lane_f16(acc, lhs, rhs, 1);
325 // CHECK-LABEL: @test_vcmlaq_rot180_laneq_f16(
326 // CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
327 // CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
328 // CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
329 // CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
330 // CHECK: ret <8 x half> [[RES]]
331 float16x8_t test_vcmlaq_rot180_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
332 return vcmlaq_rot180_laneq_f16(acc, lhs, rhs, 3);
335 // CHECK-LABEL: @test_vcmla_rot180_lane_f32(
336 // CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
337 // CHECK: ret <2 x float> [[RES]]
338 float32x2_t test_vcmla_rot180_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
339 return vcmla_rot180_lane_f32(acc, lhs, rhs, 0);
342 // ACLE says this exists, but it won't map to a single instruction if lane > 1.
343 // CHECK-LABEL: @test_vcmla_rot180_laneq_f32(
344 // CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
345 // CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <1 x i32> <i32 1>
346 // CHECK: [[DUP_FLT:%.*]] = bitcast <1 x i64> [[DUP]] to <2 x float>
347 // CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> [[DUP_FLT]])
348 // CHECK: ret <2 x float> [[RES]]
349 float32x2_t test_vcmla_rot180_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) {
350 return vcmla_rot180_laneq_f32(acc, lhs, rhs, 1);
353 // CHECK-LABEL: @test_vcmlaq_rot180_lane_f32(
354 // CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
355 // CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> undef, i64 [[CPLX]], i64 0
356 // CHECK: [[CPLX2:%.*]] = bitcast <2 x i64> [[CPLX_VEC]] to <4 x float>
357 // CHECK: [[DUP:%.*]] = shufflevector <4 x float> [[CPLX2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
358 // CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
359 // CHECK: ret <4 x float> [[RES]]
360 float32x4_t test_vcmlaq_rot180_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
361 return vcmlaq_rot180_lane_f32(acc, lhs, rhs, 0);
364 // CHECK-LABEL: @test_vcmlaq_rot180_laneq_f32(
365 // CHECK: [[DUP:%.*]] = shufflevector <4 x float> %rhs, <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
366 // CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
367 // CHECK: ret <4 x float> [[RES]]
368 float32x4_t test_vcmlaq_rot180_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
369 return vcmlaq_rot180_laneq_f32(acc, lhs, rhs, 1);
372 // CHECK-LABEL: @test_vcmla_rot270_lane_f16(
373 // CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
374 // CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP]])
375 // CHECK: ret <4 x half> [[RES]]
376 float16x4_t test_vcmla_rot270_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
377 return vcmla_rot270_lane_f16(acc, lhs, rhs, 1);
380 // ACLE says this exists, but it won't map to a single instruction if lane > 1.
381 // CHECK-LABEL: @test_vcmla_rot270_laneq_f16(
382 // CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
383 // CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <2 x i32> <i32 3, i32 3>
384 // CHECK: [[DUP_FLT:%.*]] = bitcast <2 x i32> [[DUP]] to <4 x half>
385 // CHECK: [[RES:%.*]] = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> [[DUP_FLT]])
386 // CHECK: ret <4 x half> [[RES]]
387 float16x4_t test_vcmla_rot270_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
388 return vcmla_rot270_laneq_f16(acc, lhs, rhs, 3);
391 // CHECK-LABEL: @test_vcmlaq_rot270_lane_f16(
392 // CHECK: [[DUP:%.*]] = shufflevector <4 x half> %rhs, <4 x half> poison, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
393 // CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP]])
394 // CHECK: ret <8 x half> [[RES]]
395 float16x8_t test_vcmlaq_rot270_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t rhs) {
396 return vcmlaq_rot270_lane_f16(acc, lhs, rhs, 1);
399 // CHECK-LABEL: @test_vcmlaq_rot270_laneq_f16(
400 // CHECK: [[CPLX:%.*]] = bitcast <8 x half> %rhs to <4 x i32>
401 // CHECK: [[DUP:%.*]] = shufflevector <4 x i32> [[CPLX]], <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
402 // CHECK: [[DUP_FLT:%.*]] = bitcast <4 x i32> [[DUP]] to <8 x half>
403 // CHECK: [[RES:%.*]] = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> [[DUP_FLT]])
404 // CHECK: ret <8 x half> [[RES]]
405 float16x8_t test_vcmlaq_rot270_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
406 return vcmlaq_rot270_laneq_f16(acc, lhs, rhs, 3);
409 // CHECK-LABEL: @test_vcmla_rot270_lane_f32(
410 // CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
411 // CHECK: ret <2 x float> [[RES]]
412 float32x2_t test_vcmla_rot270_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
413 return vcmla_rot270_lane_f32(acc, lhs, rhs, 0);
416 // ACLE says this exists, but it won't map to a single instruction if lane > 1.
417 // CHECK-LABEL: @test_vcmla_rot270_laneq_f32(
418 // CHECK: [[CPLX:%.*]] = bitcast <4 x float> %rhs to <2 x i64>
419 // CHECK: [[DUP:%.*]] = shufflevector <2 x i64> [[CPLX]], <2 x i64> undef, <1 x i32> <i32 1>
420 // CHECK: [[DUP_FLT:%.*]] = bitcast <1 x i64> [[DUP]] to <2 x float>
421 // CHECK: [[RES:%.*]] = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> [[DUP_FLT]])
422 // CHECK: ret <2 x float> [[RES]]
423 float32x2_t test_vcmla_rot270_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) {
424 return vcmla_rot270_laneq_f32(acc, lhs, rhs, 1);
427 // CHECK-LABEL: @test_vcmlaq_rot270_lane_f32(
428 // CHECK: [[CPLX:%.*]] = bitcast <2 x float> %rhs to i64
429 // CHECK: [[CPLX_VEC:%.*]] = insertelement <2 x i64> undef, i64 [[CPLX]], i64 0
430 // CHECK: [[CPLX2:%.*]] = bitcast <2 x i64> [[CPLX_VEC]] to <4 x float>
431 // CHECK: [[DUP:%.*]] = shufflevector <4 x float> [[CPLX2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
432 // CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
433 // CHECK: ret <4 x float> [[RES]]
434 float32x4_t test_vcmlaq_rot270_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
435 return vcmlaq_rot270_lane_f32(acc, lhs, rhs, 0);
438 // CHECK-LABEL: @test_vcmlaq_rot270_laneq_f32(
439 // CHECK: [[DUP:%.*]] = shufflevector <4 x float> %rhs, <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
440 // CHECK: [[RES:%.*]] = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> [[DUP]])
441 // CHECK: ret <4 x float> [[RES]]
442 float32x4_t test_vcmlaq_rot270_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
443 return vcmlaq_rot270_laneq_f32(acc, lhs, rhs, 1);