1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP
3 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16,+fp64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NOFP
; Fast-math product reduction of a <2 x float> vector with start value %y.
; Both RUN configurations share the same expected output: s0 and s1 are
; multiplied first, then s4 (presumably the register holding %y - confirm
; against the calling convention) is multiplied in last.
5 define arm_aapcs_vfpcc float @fmul_v2f32(<2 x float> %x, float %y) {
6 ; CHECK-LABEL: fmul_v2f32:
7 ; CHECK: @ %bb.0: @ %entry
8 ; CHECK-NEXT: vmul.f32 s0, s0, s1
9 ; CHECK-NEXT: vmul.f32 s0, s4, s0
12 %z = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float %y, <2 x float> %x)
; Fast-math product reduction of a <4 x float> vector with start value %y.
; The two RUN configurations expect different multiplication orders:
;   - with +mve.fp: pairwise (s2*s3 and s0*s1, then the two partial products);
;   - with +mve only: a linear chain s0*s1, then *s2, then *s3.
; In both cases the final multiply folds in s4 (presumably %y - confirm).
16 define arm_aapcs_vfpcc float @fmul_v4f32(<4 x float> %x, float %y) {
17 ; CHECK-FP-LABEL: fmul_v4f32:
18 ; CHECK-FP: @ %bb.0: @ %entry
19 ; CHECK-FP-NEXT: vmul.f32 s2, s2, s3
20 ; CHECK-FP-NEXT: vmul.f32 s0, s0, s1
21 ; CHECK-FP-NEXT: vmul.f32 s0, s0, s2
22 ; CHECK-FP-NEXT: vmul.f32 s0, s4, s0
23 ; CHECK-FP-NEXT: bx lr
25 ; CHECK-NOFP-LABEL: fmul_v4f32:
26 ; CHECK-NOFP: @ %bb.0: @ %entry
27 ; CHECK-NOFP-NEXT: vmul.f32 s0, s0, s1
28 ; CHECK-NOFP-NEXT: vmul.f32 s0, s0, s2
29 ; CHECK-NOFP-NEXT: vmul.f32 s0, s0, s3
30 ; CHECK-NOFP-NEXT: vmul.f32 s0, s4, s0
31 ; CHECK-NOFP-NEXT: bx lr
33 %z = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float %y, <4 x float> %x)
; Fast-math product reduction of an <8 x float> vector with start value %y.
;   - with +mve.fp: one vector multiply (q0 * q1) combines the two halves,
;     followed by the same pairwise scalar sequence as the 4-element case;
;   - with +mve only: four scalar multiplies combine corresponding registers
;     (s0*s4, s1*s5, s2*s6, s3*s7), then the results are chained together.
; In both cases the final multiply folds in s8 (presumably %y - confirm).
37 define arm_aapcs_vfpcc float @fmul_v8f32(<8 x float> %x, float %y) {
38 ; CHECK-FP-LABEL: fmul_v8f32:
39 ; CHECK-FP: @ %bb.0: @ %entry
40 ; CHECK-FP-NEXT: vmul.f32 q0, q0, q1
41 ; CHECK-FP-NEXT: vmul.f32 s2, s2, s3
42 ; CHECK-FP-NEXT: vmul.f32 s0, s0, s1
43 ; CHECK-FP-NEXT: vmul.f32 s0, s0, s2
44 ; CHECK-FP-NEXT: vmul.f32 s0, s8, s0
45 ; CHECK-FP-NEXT: bx lr
47 ; CHECK-NOFP-LABEL: fmul_v8f32:
48 ; CHECK-NOFP: @ %bb.0: @ %entry
49 ; CHECK-NOFP-NEXT: vmul.f32 s0, s0, s4
50 ; CHECK-NOFP-NEXT: vmul.f32 s10, s1, s5
51 ; CHECK-NOFP-NEXT: vmul.f32 s2, s2, s6
52 ; CHECK-NOFP-NEXT: vmul.f32 s4, s3, s7
53 ; CHECK-NOFP-NEXT: vmul.f32 s0, s0, s10
54 ; CHECK-NOFP-NEXT: vmul.f32 s0, s0, s2
55 ; CHECK-NOFP-NEXT: vmul.f32 s0, s0, s4
56 ; CHECK-NOFP-NEXT: vmul.f32 s0, s8, s0
57 ; CHECK-NOFP-NEXT: bx lr
59 %z = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float %y, <8 x float> %x)
; Fast-math product reduction of a <2 x half> vector with start value %y.
; Both RUN configurations share the same expected output: vmovx.f16 copies
; the upper half-precision element of s0 into s2, the two elements are
; multiplied, and s4 (presumably %y - confirm) is multiplied in last.
63 define arm_aapcs_vfpcc half @fmul_v2f16(<2 x half> %x, half %y) {
64 ; CHECK-LABEL: fmul_v2f16:
65 ; CHECK: @ %bb.0: @ %entry
66 ; CHECK-NEXT: vmovx.f16 s2, s0
67 ; CHECK-NEXT: vmul.f16 s0, s0, s2
68 ; CHECK-NEXT: vmul.f16 s0, s4, s0
71 %z = call fast half @llvm.vector.reduce.fmul.f16.v2f16(half %y, <2 x half> %x)
; Fast-math product reduction of a <4 x half> vector with start value %y.
;   - with +mve.fp: both upper elements are extracted up front with
;     vmovx.f16, each register is multiplied by its own upper element, and
;     the two partial products are then combined;
;   - with +mve only: a linear chain that interleaves vmovx.f16 extraction
;     with multiplication into the running product in s0.
; In both cases the final multiply folds in s4 (presumably %y - confirm).
75 define arm_aapcs_vfpcc half @fmul_v4f16(<4 x half> %x, half %y) {
76 ; CHECK-FP-LABEL: fmul_v4f16:
77 ; CHECK-FP: @ %bb.0: @ %entry
78 ; CHECK-FP-NEXT: vmovx.f16 s2, s1
79 ; CHECK-FP-NEXT: vmovx.f16 s6, s0
80 ; CHECK-FP-NEXT: vmul.f16 s2, s1, s2
81 ; CHECK-FP-NEXT: vmul.f16 s0, s0, s6
82 ; CHECK-FP-NEXT: vmul.f16 s0, s0, s2
83 ; CHECK-FP-NEXT: vmul.f16 s0, s4, s0
84 ; CHECK-FP-NEXT: bx lr
86 ; CHECK-NOFP-LABEL: fmul_v4f16:
87 ; CHECK-NOFP: @ %bb.0: @ %entry
88 ; CHECK-NOFP-NEXT: vmovx.f16 s2, s0
89 ; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s2
90 ; CHECK-NOFP-NEXT: vmovx.f16 s2, s1
91 ; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s1
92 ; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s2
93 ; CHECK-NOFP-NEXT: vmul.f16 s0, s4, s0
94 ; CHECK-NOFP-NEXT: bx lr
96 %z = call fast half @llvm.vector.reduce.fmul.f16.v4f16(half %y, <4 x half> %x)
; Fast-math product reduction of an <8 x half> vector with start value %y.
;   - with +mve.fp: vrev32.16 produces a copy of q0 with the 16-bit elements
;     swapped within each 32-bit word, so one vector vmul.f16 multiplies each
;     element by its neighbour; the remaining products are combined with
;     three scalar multiplies;
;   - with +mve only: a fully scalar chain of vmovx.f16 / vmul.f16 that
;     accumulates into s0.
; In both cases the final multiply folds in s4 (presumably %y - confirm).
100 define arm_aapcs_vfpcc half @fmul_v8f16(<8 x half> %x, half %y) {
101 ; CHECK-FP-LABEL: fmul_v8f16:
102 ; CHECK-FP: @ %bb.0: @ %entry
103 ; CHECK-FP-NEXT: vrev32.16 q2, q0
104 ; CHECK-FP-NEXT: vmul.f16 q0, q0, q2
105 ; CHECK-FP-NEXT: vmul.f16 s2, s2, s3
106 ; CHECK-FP-NEXT: vmul.f16 s0, s0, s1
107 ; CHECK-FP-NEXT: vmul.f16 s0, s0, s2
108 ; CHECK-FP-NEXT: vmul.f16 s0, s4, s0
109 ; CHECK-FP-NEXT: bx lr
111 ; CHECK-NOFP-LABEL: fmul_v8f16:
112 ; CHECK-NOFP: @ %bb.0: @ %entry
113 ; CHECK-NOFP-NEXT: vmovx.f16 s6, s0
114 ; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s6
115 ; CHECK-NOFP-NEXT: vmovx.f16 s6, s1
116 ; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s1
117 ; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s6
118 ; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s2
119 ; CHECK-NOFP-NEXT: vmovx.f16 s2, s2
120 ; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s2
121 ; CHECK-NOFP-NEXT: vmovx.f16 s2, s3
122 ; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s3
123 ; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s2
124 ; CHECK-NOFP-NEXT: vmul.f16 s0, s4, s0
125 ; CHECK-NOFP-NEXT: bx lr
127 %z = call fast half @llvm.vector.reduce.fmul.f16.v8f16(half %y, <8 x half> %x)
; Fast-math product reduction of a <16 x half> vector with start value %y.
;   - with +mve.fp: a vector vmul.f16 first combines q0 with q1, then the
;     vrev32.16 + vector multiply + three scalar multiplies sequence reduces
;     the single remaining q register;
;   - with +mve only: everything is done with scalar vmovx.f16 / vmul.f16,
;     multiplying matching elements of the two input halves and accumulating
;     each product into s0.
; In both cases the final multiply folds in s8 (presumably %y - confirm).
131 define arm_aapcs_vfpcc half @fmul_v16f16(<16 x half> %x, half %y) {
132 ; CHECK-FP-LABEL: fmul_v16f16:
133 ; CHECK-FP: @ %bb.0: @ %entry
134 ; CHECK-FP-NEXT: vmul.f16 q0, q0, q1
135 ; CHECK-FP-NEXT: vrev32.16 q1, q0
136 ; CHECK-FP-NEXT: vmul.f16 q0, q0, q1
137 ; CHECK-FP-NEXT: vmul.f16 s2, s2, s3
138 ; CHECK-FP-NEXT: vmul.f16 s0, s0, s1
139 ; CHECK-FP-NEXT: vmul.f16 s0, s0, s2
140 ; CHECK-FP-NEXT: vmul.f16 s0, s8, s0
141 ; CHECK-FP-NEXT: bx lr
143 ; CHECK-NOFP-LABEL: fmul_v16f16:
144 ; CHECK-NOFP: @ %bb.0: @ %entry
145 ; CHECK-NOFP-NEXT: vmovx.f16 s12, s0
146 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s4
147 ; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s4
148 ; CHECK-NOFP-NEXT: vmul.f16 s10, s12, s10
149 ; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s10
150 ; CHECK-NOFP-NEXT: vmul.f16 s4, s1, s5
151 ; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s4
152 ; CHECK-NOFP-NEXT: vmovx.f16 s4, s5
153 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s1
154 ; CHECK-NOFP-NEXT: vmul.f16 s4, s10, s4
155 ; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s4
156 ; CHECK-NOFP-NEXT: vmul.f16 s4, s2, s6
157 ; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s4
158 ; CHECK-NOFP-NEXT: vmovx.f16 s4, s6
159 ; CHECK-NOFP-NEXT: vmovx.f16 s2, s2
160 ; CHECK-NOFP-NEXT: vmul.f16 s2, s2, s4
161 ; CHECK-NOFP-NEXT: vmovx.f16 s4, s3
162 ; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s2
163 ; CHECK-NOFP-NEXT: vmul.f16 s2, s3, s7
164 ; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s2
165 ; CHECK-NOFP-NEXT: vmovx.f16 s2, s7
166 ; CHECK-NOFP-NEXT: vmul.f16 s2, s4, s2
167 ; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s2
168 ; CHECK-NOFP-NEXT: vmul.f16 s0, s8, s0
169 ; CHECK-NOFP-NEXT: bx lr
171 %z = call fast half @llvm.vector.reduce.fmul.f16.v16f16(half %y, <16 x half> %x)
; Fast-math product reduction of a single-element <1 x double> vector with
; start value %y. Both RUN configurations expect the whole reduction to
; become one vmul.f64 of d1 and d0.
175 define arm_aapcs_vfpcc double @fmul_v1f64(<1 x double> %x, double %y) {
176 ; CHECK-LABEL: fmul_v1f64:
177 ; CHECK: @ %bb.0: @ %entry
178 ; CHECK-NEXT: vmul.f64 d0, d1, d0
181 %z = call fast double @llvm.vector.reduce.fmul.f64.v1f64(double %y, <1 x double> %x)
; Fast-math product reduction of a <2 x double> vector with start value %y.
; Both RUN configurations expect d0*d1 first, then a multiply by d2
; (presumably %y - confirm against the calling convention).
185 define arm_aapcs_vfpcc double @fmul_v2f64(<2 x double> %x, double %y) {
186 ; CHECK-LABEL: fmul_v2f64:
187 ; CHECK: @ %bb.0: @ %entry
188 ; CHECK-NEXT: vmul.f64 d0, d0, d1
189 ; CHECK-NEXT: vmul.f64 d0, d2, d0
192 %z = call fast double @llvm.vector.reduce.fmul.f64.v2f64(double %y, <2 x double> %x)
; Fast-math product reduction of a <4 x double> vector with start value %y.
; Both RUN configurations expect a pairwise order: d1*d3 and d0*d2 are
; computed, the two partial products are combined, and d4 (presumably %y -
; confirm) is multiplied in last.
196 define arm_aapcs_vfpcc double @fmul_v4f64(<4 x double> %x, double %y) {
197 ; CHECK-LABEL: fmul_v4f64:
198 ; CHECK: @ %bb.0: @ %entry
199 ; CHECK-NEXT: vmul.f64 d1, d1, d3
200 ; CHECK-NEXT: vmul.f64 d0, d0, d2
201 ; CHECK-NEXT: vmul.f64 d0, d0, d1
202 ; CHECK-NEXT: vmul.f64 d0, d4, d0
205 %z = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double %y, <4 x double> %x)
; Product reduction of a <2 x float> vector with start value %y, called
; WITHOUT fast-math flags. Both RUN configurations expect a strictly ordered
; chain: s4 (presumably %y - confirm) times s0 first, then times s1.
209 define arm_aapcs_vfpcc float @fmul_v2f32_nofast(<2 x float> %x, float %y) {
210 ; CHECK-LABEL: fmul_v2f32_nofast:
211 ; CHECK: @ %bb.0: @ %entry
212 ; CHECK-NEXT: vmul.f32 s0, s4, s0
213 ; CHECK-NEXT: vmul.f32 s0, s0, s1
216 %z = call float @llvm.vector.reduce.fmul.f32.v2f32(float %y, <2 x float> %x)
; Product reduction of a <4 x float> vector with start value %y, called
; WITHOUT fast-math flags. Both RUN configurations expect a strictly ordered
; chain: s4 (presumably %y - confirm) times s0, then s1, s2 and s3 in
; register order.
220 define arm_aapcs_vfpcc float @fmul_v4f32_nofast(<4 x float> %x, float %y) {
221 ; CHECK-LABEL: fmul_v4f32_nofast:
222 ; CHECK: @ %bb.0: @ %entry
223 ; CHECK-NEXT: vmul.f32 s0, s4, s0
224 ; CHECK-NEXT: vmul.f32 s0, s0, s1
225 ; CHECK-NEXT: vmul.f32 s0, s0, s2
226 ; CHECK-NEXT: vmul.f32 s0, s0, s3
229 %z = call float @llvm.vector.reduce.fmul.f32.v4f32(float %y, <4 x float> %x)
; Product reduction of an <8 x float> vector with start value %y, called
; WITHOUT fast-math flags. Both RUN configurations expect eight scalar
; multiplies in a strictly ordered chain: s8 (presumably %y - confirm) times
; s0, then s1 through s7 in register order. No vector multiply is expected.
233 define arm_aapcs_vfpcc float @fmul_v8f32_nofast(<8 x float> %x, float %y) {
234 ; CHECK-LABEL: fmul_v8f32_nofast:
235 ; CHECK: @ %bb.0: @ %entry
236 ; CHECK-NEXT: vmul.f32 s0, s8, s0
237 ; CHECK-NEXT: vmul.f32 s0, s0, s1
238 ; CHECK-NEXT: vmul.f32 s0, s0, s2
239 ; CHECK-NEXT: vmul.f32 s0, s0, s3
240 ; CHECK-NEXT: vmul.f32 s0, s0, s4
241 ; CHECK-NEXT: vmul.f32 s0, s0, s5
242 ; CHECK-NEXT: vmul.f32 s0, s0, s6
243 ; CHECK-NEXT: vmul.f32 s0, s0, s7
246 %z = call float @llvm.vector.reduce.fmul.f32.v8f32(float %y, <8 x float> %x)
; Product reduction of a <2 x half> vector with start value %y, called
; WITHOUT fast-math flags. Both RUN configurations expect s4 (presumably %y -
; confirm) to be multiplied by the lower element of s0 first; the upper
; element is then extracted with vmovx.f16 and multiplied in second.
250 define arm_aapcs_vfpcc half @fmul_v2f16_nofast(<2 x half> %x, half %y) {
251 ; CHECK-LABEL: fmul_v2f16_nofast:
252 ; CHECK: @ %bb.0: @ %entry
253 ; CHECK-NEXT: vmul.f16 s2, s4, s0
254 ; CHECK-NEXT: vmovx.f16 s0, s0
255 ; CHECK-NEXT: vmul.f16 s0, s2, s0
258 %z = call half @llvm.vector.reduce.fmul.f16.v2f16(half %y, <2 x half> %x)
; Product reduction of a <4 x half> vector with start value %y, called
; WITHOUT fast-math flags. Both RUN configurations expect an ordered scalar
; chain starting from s4 (presumably %y - confirm): for each of s0 and s1 the
; lower element is multiplied in before the upper element obtained with
; vmovx.f16.
262 define arm_aapcs_vfpcc half @fmul_v4f16_nofast(<4 x half> %x, half %y) {
263 ; CHECK-LABEL: fmul_v4f16_nofast:
264 ; CHECK: @ %bb.0: @ %entry
265 ; CHECK-NEXT: vmul.f16 s2, s4, s0
266 ; CHECK-NEXT: vmovx.f16 s0, s0
267 ; CHECK-NEXT: vmul.f16 s0, s2, s0
268 ; CHECK-NEXT: vmovx.f16 s2, s1
269 ; CHECK-NEXT: vmul.f16 s0, s0, s1
270 ; CHECK-NEXT: vmul.f16 s0, s0, s2
273 %z = call half @llvm.vector.reduce.fmul.f16.v4f16(half %y, <4 x half> %x)
; Product reduction of an <8 x half> vector with start value %y, called
; WITHOUT fast-math flags. Both RUN configurations expect an ordered scalar
; chain starting from s4 (presumably %y - confirm) and walking s0..s3: for
; each register the lower element is multiplied in before the upper element
; obtained with vmovx.f16. No vector multiply is expected.
277 define arm_aapcs_vfpcc half @fmul_v8f16_nofast(<8 x half> %x, half %y) {
278 ; CHECK-LABEL: fmul_v8f16_nofast:
279 ; CHECK: @ %bb.0: @ %entry
280 ; CHECK-NEXT: vmul.f16 s4, s4, s0
281 ; CHECK-NEXT: vmovx.f16 s0, s0
282 ; CHECK-NEXT: vmul.f16 s0, s4, s0
283 ; CHECK-NEXT: vmovx.f16 s4, s1
284 ; CHECK-NEXT: vmul.f16 s0, s0, s1
285 ; CHECK-NEXT: vmul.f16 s0, s0, s4
286 ; CHECK-NEXT: vmul.f16 s0, s0, s2
287 ; CHECK-NEXT: vmovx.f16 s2, s2
288 ; CHECK-NEXT: vmul.f16 s0, s0, s2
289 ; CHECK-NEXT: vmovx.f16 s2, s3
290 ; CHECK-NEXT: vmul.f16 s0, s0, s3
291 ; CHECK-NEXT: vmul.f16 s0, s0, s2
294 %z = call half @llvm.vector.reduce.fmul.f16.v8f16(half %y, <8 x half> %x)
; Product reduction of a <16 x half> vector with start value %y, called
; WITHOUT fast-math flags. Both RUN configurations expect an ordered scalar
; chain starting from s8 (presumably %y - confirm) and walking s0..s7: for
; each register the lower element is multiplied in before the upper element
; obtained with vmovx.f16. No vector multiply is expected.
298 define arm_aapcs_vfpcc half @fmul_v16f16_nofast(<16 x half> %x, half %y) {
299 ; CHECK-LABEL: fmul_v16f16_nofast:
300 ; CHECK: @ %bb.0: @ %entry
301 ; CHECK-NEXT: vmul.f16 s8, s8, s0
302 ; CHECK-NEXT: vmovx.f16 s0, s0
303 ; CHECK-NEXT: vmul.f16 s0, s8, s0
304 ; CHECK-NEXT: vmovx.f16 s8, s1
305 ; CHECK-NEXT: vmul.f16 s0, s0, s1
306 ; CHECK-NEXT: vmul.f16 s0, s0, s8
307 ; CHECK-NEXT: vmul.f16 s0, s0, s2
308 ; CHECK-NEXT: vmovx.f16 s2, s2
309 ; CHECK-NEXT: vmul.f16 s0, s0, s2
310 ; CHECK-NEXT: vmovx.f16 s2, s3
311 ; CHECK-NEXT: vmul.f16 s0, s0, s3
312 ; CHECK-NEXT: vmul.f16 s0, s0, s2
313 ; CHECK-NEXT: vmovx.f16 s2, s4
314 ; CHECK-NEXT: vmul.f16 s0, s0, s4
315 ; CHECK-NEXT: vmul.f16 s0, s0, s2
316 ; CHECK-NEXT: vmovx.f16 s2, s5
317 ; CHECK-NEXT: vmul.f16 s0, s0, s5
318 ; CHECK-NEXT: vmul.f16 s0, s0, s2
319 ; CHECK-NEXT: vmovx.f16 s2, s6
320 ; CHECK-NEXT: vmul.f16 s0, s0, s6
321 ; CHECK-NEXT: vmul.f16 s0, s0, s2
322 ; CHECK-NEXT: vmovx.f16 s2, s7
323 ; CHECK-NEXT: vmul.f16 s0, s0, s7
324 ; CHECK-NEXT: vmul.f16 s0, s0, s2
327 %z = call half @llvm.vector.reduce.fmul.f16.v16f16(half %y, <16 x half> %x)
; Product reduction of a single-element <1 x double> vector with start value
; %y, called WITHOUT fast-math flags. Both RUN configurations expect one
; vmul.f64 of d1 and d0.
331 define arm_aapcs_vfpcc double @fmul_v1f64_nofast(<1 x double> %x, double %y) {
332 ; CHECK-LABEL: fmul_v1f64_nofast:
333 ; CHECK: @ %bb.0: @ %entry
334 ; CHECK-NEXT: vmul.f64 d0, d1, d0
337 %z = call double @llvm.vector.reduce.fmul.f64.v1f64(double %y, <1 x double> %x)
; Product reduction of a <2 x double> vector with start value %y, called
; WITHOUT fast-math flags. Both RUN configurations expect an ordered chain:
; d2 (presumably %y - confirm) times d0 first, then times d1.
341 define arm_aapcs_vfpcc double @fmul_v2f64_nofast(<2 x double> %x, double %y) {
342 ; CHECK-LABEL: fmul_v2f64_nofast:
343 ; CHECK: @ %bb.0: @ %entry
344 ; CHECK-NEXT: vmul.f64 d0, d2, d0
345 ; CHECK-NEXT: vmul.f64 d0, d0, d1
348 %z = call double @llvm.vector.reduce.fmul.f64.v2f64(double %y, <2 x double> %x)
; Product reduction of a <4 x double> vector with start value %y, called
; WITHOUT fast-math flags. Both RUN configurations expect an ordered chain:
; d4 (presumably %y - confirm) times d0, then d1, d2 and d3 in register
; order.
352 define arm_aapcs_vfpcc double @fmul_v4f64_nofast(<4 x double> %x, double %y) {
353 ; CHECK-LABEL: fmul_v4f64_nofast:
354 ; CHECK: @ %bb.0: @ %entry
355 ; CHECK-NEXT: vmul.f64 d0, d4, d0
356 ; CHECK-NEXT: vmul.f64 d0, d0, d1
357 ; CHECK-NEXT: vmul.f64 d0, d0, d2
358 ; CHECK-NEXT: vmul.f64 d0, d0, d3
361 %z = call double @llvm.vector.reduce.fmul.f64.v4f64(double %y, <4 x double> %x)
; Declarations of the llvm.vector.reduce.fmul intrinsic overloads called by
; the test functions in this file. Each takes a scalar start value followed
; by the vector to reduce and returns a scalar of the element type.
365 declare double @llvm.vector.reduce.fmul.f64.v1f64(double, <1 x double>)
366 declare double @llvm.vector.reduce.fmul.f64.v2f64(double, <2 x double>)
367 declare double @llvm.vector.reduce.fmul.f64.v4f64(double, <4 x double>)
368 declare float @llvm.vector.reduce.fmul.f32.v2f32(float, <2 x float>)
369 declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
370 declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>)
371 declare half @llvm.vector.reduce.fmul.f16.v16f16(half, <16 x half>)
372 declare half @llvm.vector.reduce.fmul.f16.v2f16(half, <2 x half>)
373 declare half @llvm.vector.reduce.fmul.f16.v4f16(half, <4 x half>)
374 declare half @llvm.vector.reduce.fmul.f16.v8f16(half, <8 x half>)