; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc %s -mtriple=aarch64 -mattr=+v8.3a,+fullfp16 -o - | FileCheck %s
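
; These tests check that the llvm.aarch64.neon.vcmla.* intrinsics are lowered
; to FCMLA instructions, including the indexed (lane) forms, and that a zero
; accumulator is reassociated into the FCMLA destination when the result feeds
; a fast fadd.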
define <4 x half> @test_16x4(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
; CHECK-LABEL: test_16x4:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fcmla v0.4h, v1.4h, v2.4h, #0
; CHECK-NEXT:    ret
entry:
  %res = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c)
  ret <4 x half> %res
}

define <4 x half> @test_16x4_lane_1(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
; CHECK-LABEL: test_16x4_lane_1:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT:    fcmla v0.4h, v1.4h, v2.h[1], #0
; CHECK-NEXT:    ret
entry:
  %c.cast = bitcast <4 x half> %c to <2 x i32>
  %c.dup = shufflevector <2 x i32> %c.cast, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
  %c.res = bitcast <2 x i32> %c.dup to <4 x half>
  %res = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c.res)
  ret <4 x half> %res
}

define <4 x half> @test_rot90_16x4(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
; CHECK-LABEL: test_rot90_16x4:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fcmla v0.4h, v1.4h, v2.4h, #90
; CHECK-NEXT:    ret
entry:
  %res = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c)
  ret <4 x half> %res
}

define <4 x half> @test_rot90_16x4_lane_0(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
; CHECK-LABEL: test_rot90_16x4_lane_0:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT:    fcmla v0.4h, v1.4h, v2.h[0], #90
; CHECK-NEXT:    ret
entry:
  %c.cast = bitcast <4 x half> %c to <2 x i32>
  %c.dup = shufflevector <2 x i32> %c.cast, <2 x i32> undef, <2 x i32> <i32 0, i32 0>
  %c.res = bitcast <2 x i32> %c.dup to <4 x half>
  %res = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c.res)
  ret <4 x half> %res
}

define <4 x half> @test_rot180_16x4(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
; CHECK-LABEL: test_rot180_16x4:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fcmla v0.4h, v1.4h, v2.4h, #180
; CHECK-NEXT:    ret
entry:
  %res = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c)
  ret <4 x half> %res
}

define <4 x half> @test_rot180_16x4_lane_0(<4 x half> %a, <4 x half> %b, <8 x half> %c) {
; CHECK-LABEL: test_rot180_16x4_lane_0:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fcmla v0.4h, v1.4h, v2.h[0], #180
; CHECK-NEXT:    ret
entry:
  %c.cast = bitcast <8 x half> %c to <4 x i32>
  %c.dup = shufflevector <4 x i32> %c.cast, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
  %c.res = bitcast <2 x i32> %c.dup to <4 x half>
  %res = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c.res)
  ret <4 x half> %res
}

define <4 x half> @test_rot270_16x4(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
; CHECK-LABEL: test_rot270_16x4:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fcmla v0.4h, v1.4h, v2.4h, #270
; CHECK-NEXT:    ret
entry:
  %res = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c)
  ret <4 x half> %res
}

define <2 x float> @test_32x2(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
; CHECK-LABEL: test_32x2:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fcmla v0.2s, v1.2s, v2.2s, #0
; CHECK-NEXT:    ret
entry:
  %res = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c)
  ret <2 x float> %res
}

define <2 x float> @test_rot90_32x2(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
; CHECK-LABEL: test_rot90_32x2:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fcmla v0.2s, v1.2s, v2.2s, #90
; CHECK-NEXT:    ret
entry:
  %res = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c)
  ret <2 x float> %res
}

define <2 x float> @test_rot180_32x2(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
; CHECK-LABEL: test_rot180_32x2:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fcmla v0.2s, v1.2s, v2.2s, #180
; CHECK-NEXT:    ret
entry:
  %res = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c)
  ret <2 x float> %res
}

define <2 x float> @test_rot270_32x2(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
; CHECK-LABEL: test_rot270_32x2:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fcmla v0.2s, v1.2s, v2.2s, #270
; CHECK-NEXT:    ret
entry:
  %res = tail call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c)
  ret <2 x float> %res
}

define <8 x half> @test_16x8(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
; CHECK-LABEL: test_16x8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fcmla v0.8h, v1.8h, v2.8h, #0
; CHECK-NEXT:    ret
entry:
  %res = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> %a, <8 x half> %b, <8 x half> %c)
  ret <8 x half> %res
}

define <8 x half> @test_16x8_lane_0(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
; CHECK-LABEL: test_16x8_lane_0:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fcmla v0.8h, v1.8h, v2.h[0], #0
; CHECK-NEXT:    ret
entry:
  %c.cast = bitcast <8 x half> %c to <4 x i32>
  %c.dup = shufflevector <4 x i32> %c.cast, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
  %c.res = bitcast <4 x i32> %c.dup to <8 x half>
  %res = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> %a, <8 x half> %b, <8 x half> %c.res)
  ret <8 x half> %res
}

define <8 x half> @test_rot90_16x8(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
; CHECK-LABEL: test_rot90_16x8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fcmla v0.8h, v1.8h, v2.8h, #90
; CHECK-NEXT:    ret
entry:
  %res = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> %a, <8 x half> %b, <8 x half> %c)
  ret <8 x half> %res
}

define <8 x half> @test_rot90_16x8_lane_1(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
; CHECK-LABEL: test_rot90_16x8_lane_1:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fcmla v0.8h, v1.8h, v2.h[1], #90
; CHECK-NEXT:    ret
entry:
  %c.cast = bitcast <8 x half> %c to <4 x i32>
  %c.dup = shufflevector <4 x i32> %c.cast, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %c.res = bitcast <4 x i32> %c.dup to <8 x half>
  %res = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> %a, <8 x half> %b, <8 x half> %c.res)
  ret <8 x half> %res
}

define <8 x half> @test_rot180_16x8(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
; CHECK-LABEL: test_rot180_16x8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fcmla v0.8h, v1.8h, v2.8h, #180
; CHECK-NEXT:    ret
entry:
  %res = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> %a, <8 x half> %b, <8 x half> %c)
  ret <8 x half> %res
}

define <8 x half> @test_rot180_16x8_lane_1(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
; CHECK-LABEL: test_rot180_16x8_lane_1:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fcmla v0.8h, v1.8h, v2.h[1], #180
; CHECK-NEXT:    ret
entry:
  %c.cast = bitcast <8 x half> %c to <4 x i32>
  %c.dup = shufflevector <4 x i32> %c.cast, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %c.res = bitcast <4 x i32> %c.dup to <8 x half>
  %res = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> %a, <8 x half> %b, <8 x half> %c.res)
  ret <8 x half> %res
}

define <8 x half> @test_rot270_16x8(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
; CHECK-LABEL: test_rot270_16x8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fcmla v0.8h, v1.8h, v2.8h, #270
; CHECK-NEXT:    ret
entry:
  %res = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> %a, <8 x half> %b, <8 x half> %c)
  ret <8 x half> %res
}

define <8 x half> @test_rot270_16x8_lane_0(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
; CHECK-LABEL: test_rot270_16x8_lane_0:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fcmla v0.8h, v1.8h, v2.h[0], #270
; CHECK-NEXT:    ret
entry:
  %c.cast = bitcast <8 x half> %c to <4 x i32>
  %c.dup = shufflevector <4 x i32> %c.cast, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
  %c.res = bitcast <4 x i32> %c.dup to <8 x half>
  %res = tail call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> %a, <8 x half> %b, <8 x half> %c.res)
  ret <8 x half> %res
}

define <4 x float> @test_32x4(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: test_32x4:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fcmla v0.4s, v1.4s, v2.4s, #0
; CHECK-NEXT:    ret
entry:
  %res = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c)
  ret <4 x float> %res
}

define <4 x float> @test_32x4_lane_0(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: test_32x4_lane_0:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fcmla v0.4s, v1.4s, v2.s[0], #0
; CHECK-NEXT:    ret
entry:
  %c.cast = bitcast <4 x float> %c to <2 x i64>
  %c.dup = shufflevector <2 x i64> %c.cast, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
  %c.res = bitcast <2 x i64> %c.dup to <4 x float>
  %res = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c.res)
  ret <4 x float> %res
}

define <4 x float> @test_rot90_32x4(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: test_rot90_32x4:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fcmla v0.4s, v1.4s, v2.4s, #90
; CHECK-NEXT:    ret
entry:
  %res = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c)
  ret <4 x float> %res
}

define <4 x float> @test_rot180_32x4(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: test_rot180_32x4:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fcmla v0.4s, v1.4s, v2.4s, #180
; CHECK-NEXT:    ret
entry:
  %res = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c)
  ret <4 x float> %res
}

define <4 x float> @test_rot270_32x4(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: test_rot270_32x4:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fcmla v0.4s, v1.4s, v2.4s, #270
; CHECK-NEXT:    ret
entry:
  %res = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c)
  ret <4 x float> %res
}

define <2 x double> @test_64x2(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: test_64x2:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fcmla v0.2d, v1.2d, v2.2d, #0
; CHECK-NEXT:    ret
entry:
  %res = tail call <2 x double> @llvm.aarch64.neon.vcmla.rot0.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c)
  ret <2 x double> %res
}

define <2 x double> @test_rot90_64x2(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: test_rot90_64x2:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fcmla v0.2d, v1.2d, v2.2d, #90
; CHECK-NEXT:    ret
entry:
  %res = tail call <2 x double> @llvm.aarch64.neon.vcmla.rot90.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c)
  ret <2 x double> %res
}

define <2 x double> @test_rot180_64x2(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: test_rot180_64x2:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fcmla v0.2d, v1.2d, v2.2d, #180
; CHECK-NEXT:    ret
entry:
  %res = tail call <2 x double> @llvm.aarch64.neon.vcmla.rot180.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c)
  ret <2 x double> %res
}

define <2 x double> @test_rot270_64x2(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: test_rot270_64x2:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fcmla v0.2d, v1.2d, v2.2d, #270
; CHECK-NEXT:    ret
entry:
  %res = tail call <2 x double> @llvm.aarch64.neon.vcmla.rot270.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c)
  ret <2 x double> %res
}

define <4 x float> @reassoc_f32x4(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: reassoc_f32x4:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fcmla v0.4s, v1.4s, v2.4s, #0
; CHECK-NEXT:    ret
entry:
  %d = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> zeroinitializer, <4 x float> %b, <4 x float> %c)
  %res = fadd fast <4 x float> %d, %a
  ret <4 x float> %res
}

define <4 x float> @reassoc_c_f32x4(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: reassoc_c_f32x4:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fcmla v0.4s, v1.4s, v2.4s, #90
; CHECK-NEXT:    ret
entry:
  %d = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> zeroinitializer, <4 x float> %b, <4 x float> %c)
  %res = fadd fast <4 x float> %a, %d
  ret <4 x float> %res
}

define <4 x half> @reassoc_f16x4(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
; CHECK-LABEL: reassoc_f16x4:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fcmla v0.4h, v1.4h, v2.4h, #180
; CHECK-NEXT:    ret
entry:
  %d = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> zeroinitializer, <4 x half> %b, <4 x half> %c)
  %res = fadd fast <4 x half> %d, %a
  ret <4 x half> %res
}

define <4 x half> @reassoc_c_f16x4(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
; CHECK-LABEL: reassoc_c_f16x4:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fcmla v0.4h, v1.4h, v2.4h, #270
; CHECK-NEXT:    ret
entry:
  %d = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> zeroinitializer, <4 x half> %b, <4 x half> %c)
  %res = fadd fast <4 x half> %a, %d
  ret <4 x half> %res
}

define <2 x double> @reassoc_f64x2(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double> %g) {
; CHECK-LABEL: reassoc_f64x2:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fcmla v0.2d, v1.2d, v2.2d, #270
; CHECK-NEXT:    fcmla v0.2d, v2.2d, v3.2d, #270
; CHECK-NEXT:    ret
entry:
  %d = tail call <2 x double> @llvm.aarch64.neon.vcmla.rot270.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c)
  %e = tail call <2 x double> @llvm.aarch64.neon.vcmla.rot270.v2f64(<2 x double> zeroinitializer, <2 x double> %c, <2 x double> %g)
  %res = fadd fast <2 x double> %e, %d
  ret <2 x double> %res
}

define <2 x double> @reassoc_c_f64x2(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double> %g) {
; CHECK-LABEL: reassoc_c_f64x2:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fadd v0.2d, v0.2d, v0.2d
; CHECK-NEXT:    fcmla v0.2d, v1.2d, v2.2d, #270
; CHECK-NEXT:    fcmla v0.2d, v2.2d, v3.2d, #270
; CHECK-NEXT:    ret
entry:
  %d = tail call <2 x double> @llvm.aarch64.neon.vcmla.rot270.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c)
  %e = tail call <2 x double> @llvm.aarch64.neon.vcmla.rot270.v2f64(<2 x double> %a, <2 x double> %c, <2 x double> %g)
  %res = fadd fast <2 x double> %e, %d
  ret <2 x double> %res
}

define <4 x float> @reassoc_nonfast_f32x4(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: reassoc_nonfast_f32x4:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    movi v3.2d, #0000000000000000
; CHECK-NEXT:    fcmla v3.4s, v1.4s, v2.4s, #0
; CHECK-NEXT:    fadd v0.4s, v3.4s, v0.4s
; CHECK-NEXT:    ret
entry:
  %d = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> zeroinitializer, <4 x float> %b, <4 x float> %c)
  %res = fadd <4 x float> %d, %a
  ret <4 x float> %res
}

declare <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half>, <4 x half>, <4 x half>)
declare <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half>, <4 x half>, <4 x half>)
declare <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half>, <4 x half>, <4 x half>)
declare <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half>, <4 x half>, <4 x half>)
declare <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half>, <8 x half>, <8 x half>)
declare <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half>, <8 x half>, <8 x half>)
declare <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half>, <8 x half>, <8 x half>)
declare <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half>, <8 x half>, <8 x half>)
declare <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float>, <2 x float>, <2 x float>)
declare <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float>, <2 x float>, <2 x float>)
declare <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float>, <2 x float>, <2 x float>)
declare <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float>, <2 x float>, <2 x float>)
declare <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float>, <4 x float>, <4 x float>)
declare <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float>, <4 x float>, <4 x float>)
declare <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float>, <4 x float>, <4 x float>)
declare <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float>, <4 x float>, <4 x float>)
declare <2 x double> @llvm.aarch64.neon.vcmla.rot0.v2f64(<2 x double>, <2 x double>, <2 x double>)
declare <2 x double> @llvm.aarch64.neon.vcmla.rot90.v2f64(<2 x double>, <2 x double>, <2 x double>)
declare <2 x double> @llvm.aarch64.neon.vcmla.rot180.v2f64(<2 x double>, <2 x double>, <2 x double>)
declare <2 x double> @llvm.aarch64.neon.vcmla.rot270.v2f64(<2 x double>, <2 x double>, <2 x double>)