; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s --mattr=+mve.fp,+fp64 -o - | FileCheck %s

target triple = "thumbv8.1m.main-none-none-eabi"

; Expected to transform
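; mul_mul treats each <4 x float> as two interleaved (real, imag) complex floats
; and computes (%a * %b) * %c; both complex products become vcmul/vcmla pairs.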
define arm_aapcs_vfpcc <4 x float> @mul_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: mul_mul:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcmul.f32 q3, q0, q1, #0
; CHECK-NEXT:    vcmla.f32 q3, q0, q1, #90
; CHECK-NEXT:    vcmul.f32 q0, q3, q2, #0
; CHECK-NEXT:    vcmla.f32 q0, q3, q2, #90
; CHECK-NEXT:    bx lr
entry:
  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec151 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %strided.vec153 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec154 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %0 = fmul fast <2 x float> %strided.vec154, %strided.vec151
  %1 = fmul fast <2 x float> %strided.vec153, %strided.vec
  %2 = fmul fast <2 x float> %strided.vec154, %strided.vec
  %3 = fmul fast <2 x float> %strided.vec153, %strided.vec151
  %4 = fadd fast <2 x float> %3, %2
  %5 = fsub fast <2 x float> %1, %0
  %strided.vec156 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec157 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %6 = fmul fast <2 x float> %4, %strided.vec156
  %7 = fmul fast <2 x float> %5, %strided.vec157
  %8 = fadd fast <2 x float> %6, %7
  %9 = fmul fast <2 x float> %strided.vec156, %5
  %10 = fmul fast <2 x float> %4, %strided.vec157
  %11 = fsub fast <2 x float> %9, %10
  %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x float> %interleaved.vec
}

; Expected to not transform
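; add_mul multiplies %c by a value whose real lanes come from (%b - %c) and whose
; imaginary lanes come from (%b - %a); the mixed sources are not recognized as a
; single complex multiply, so the code stays as plain vsub/vmul/vfma.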
define arm_aapcs_vfpcc <4 x float> @add_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: add_mul:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vsub.f32 q3, q1, q2
; CHECK-NEXT:    vsub.f32 q0, q1, q0
; CHECK-NEXT:    vmov.f32 s16, s9
; CHECK-NEXT:    vmov.f32 s13, s14
; CHECK-NEXT:    vmov.f32 s17, s11
; CHECK-NEXT:    vmov.f32 s0, s1
; CHECK-NEXT:    vmul.f32 q1, q3, q4
; CHECK-NEXT:    vmov.f32 s1, s3
; CHECK-NEXT:    vmov.f32 s9, s10
; CHECK-NEXT:    vfma.f32 q1, q2, q0
; CHECK-NEXT:    vmul.f32 q0, q4, q0
; CHECK-NEXT:    vneg.f32 q4, q0
; CHECK-NEXT:    vmov.f32 s1, s4
; CHECK-NEXT:    vfma.f32 q4, q2, q3
; CHECK-NEXT:    vmov.f32 s3, s5
; CHECK-NEXT:    vmov.f32 s0, s16
; CHECK-NEXT:    vmov.f32 s2, s17
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %0 = fsub fast <4 x float> %b, %c
  %1 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec58 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec59 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %2 = fmul fast <2 x float> %1, %strided.vec59
  %3 = fsub fast <4 x float> %b, %a
  %4 = shufflevector <4 x float> %3, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %5 = fmul fast <2 x float> %strided.vec58, %4
  %6 = fadd fast <2 x float> %5, %2
  %7 = fmul fast <2 x float> %strided.vec58, %1
  %8 = fmul fast <2 x float> %strided.vec59, %4
  %9 = fsub fast <2 x float> %7, %8
  %interleaved.vec = shufflevector <2 x float> %9, <2 x float> %6, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x float> %interleaved.vec
}

; Expected to not transform
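; mul_mul270_mul computes rot270(%b * %c) * %a: the inner product pairs
; b.re*c.im + b.im*c.re as its real lanes with b.im*c.im - b.re*c.re as its
; imaginary lanes, a 270-degree rotated multiply that is not matched here, so
; no vcmul/vcmla is emitted.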
define arm_aapcs_vfpcc <4 x float> @mul_mul270_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: mul_mul270_mul:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d12}
; CHECK-NEXT:    vpush {d12}
; CHECK-NEXT:    .vsave {d10}
; CHECK-NEXT:    vpush {d10}
; CHECK-NEXT:    .vsave {d8}
; CHECK-NEXT:    vpush {d8}
; CHECK-NEXT:    vmov.f32 s20, s4
; CHECK-NEXT:    vmov.f32 s16, s8
; CHECK-NEXT:    vmov.f32 s17, s10
; CHECK-NEXT:    vmov.f32 s21, s6
; CHECK-NEXT:    vmul.f32 q3, q5, q4
; CHECK-NEXT:    vmov.f32 s4, s5
; CHECK-NEXT:    vneg.f32 q3, q3
; CHECK-NEXT:    vmov.f32 s24, s9
; CHECK-NEXT:    vmov.f32 s25, s11
; CHECK-NEXT:    vmov.f32 s5, s7
; CHECK-NEXT:    vmul.f32 q2, q1, q4
; CHECK-NEXT:    vmov.f32 s16, s0
; CHECK-NEXT:    vfma.f32 q3, q1, q6
; CHECK-NEXT:    vmov.f32 s17, s2
; CHECK-NEXT:    vmov.f32 s0, s1
; CHECK-NEXT:    vfma.f32 q2, q5, q6
; CHECK-NEXT:    vmul.f32 q1, q3, q4
; CHECK-NEXT:    vmov.f32 s1, s3
; CHECK-NEXT:    vfma.f32 q1, q2, q0
; CHECK-NEXT:    vmul.f32 q0, q3, q0
; CHECK-NEXT:    vneg.f32 q3, q0
; CHECK-NEXT:    vmov.f32 s1, s4
; CHECK-NEXT:    vfma.f32 q3, q2, q4
; CHECK-NEXT:    vmov.f32 s3, s5
; CHECK-NEXT:    vmov.f32 s0, s12
; CHECK-NEXT:    vmov.f32 s2, s13
; CHECK-NEXT:    vpop {d8}
; CHECK-NEXT:    vpop {d10}
; CHECK-NEXT:    vpop {d12}
; CHECK-NEXT:    bx lr
entry:
  %strided.vec = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec81 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %strided.vec83 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec84 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %0 = fmul fast <2 x float> %strided.vec84, %strided.vec
  %1 = fmul fast <2 x float> %strided.vec83, %strided.vec81
  %2 = fadd fast <2 x float> %1, %0
  %strided.vec86 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec87 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %3 = fmul fast <2 x float> %2, %strided.vec87
  %4 = fmul fast <2 x float> %strided.vec84, %strided.vec81
  %5 = fmul fast <2 x float> %strided.vec83, %strided.vec
  %6 = fsub fast <2 x float> %4, %5
  %7 = fmul fast <2 x float> %6, %strided.vec86
  %8 = fadd fast <2 x float> %3, %7
  %9 = fmul fast <2 x float> %2, %strided.vec86
  %10 = fmul fast <2 x float> %6, %strided.vec87
  %11 = fsub fast <2 x float> %9, %10
  %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x float> %interleaved.vec
}

; Expected to transform
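; mul_triangle computes (%a * %b) * %a, reusing %a for both complex products
; (a "triangle" of uses); both multiplies become vcmul/vcmla pairs.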
define arm_aapcs_vfpcc <4 x float> @mul_triangle(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: mul_triangle:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcmul.f32 q2, q1, q0, #0
; CHECK-NEXT:    vcmla.f32 q2, q1, q0, #90
; CHECK-NEXT:    vcmul.f32 q1, q0, q2, #0
; CHECK-NEXT:    vcmla.f32 q1, q0, q2, #90
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec35 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %strided.vec37 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec38 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %0 = fmul fast <2 x float> %strided.vec37, %strided.vec
  %1 = fmul fast <2 x float> %strided.vec38, %strided.vec35
  %2 = fsub fast <2 x float> %0, %1
  %3 = fmul fast <2 x float> %2, %strided.vec35
  %4 = fmul fast <2 x float> %strided.vec38, %strided.vec
  %5 = fmul fast <2 x float> %strided.vec35, %strided.vec37
  %6 = fadd fast <2 x float> %4, %5
  %7 = fmul fast <2 x float> %6, %strided.vec
  %8 = fadd fast <2 x float> %3, %7
  %9 = fmul fast <2 x float> %2, %strided.vec
  %10 = fmul fast <2 x float> %6, %strided.vec35
  %11 = fsub fast <2 x float> %9, %10
  %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x float> %interleaved.vec
}

; d * (b * a) * (c * a)
; Expected to transform
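; mul_diamond computes ((%a * %b) * %d) * (%c * %a): %a feeds two independent
; products (a "diamond" of uses), and all four complex multiplies become
; vcmul/vcmla pairs.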
define arm_aapcs_vfpcc <4 x float> @mul_diamond(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d) {
; CHECK-LABEL: mul_diamond:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vcmul.f32 q4, q1, q0, #0
; CHECK-NEXT:    vcmla.f32 q4, q1, q0, #90
; CHECK-NEXT:    vcmul.f32 q1, q4, q3, #0
; CHECK-NEXT:    vcmla.f32 q1, q4, q3, #90
; CHECK-NEXT:    vcmul.f32 q3, q2, q0, #0
; CHECK-NEXT:    vcmla.f32 q3, q2, q0, #90
; CHECK-NEXT:    vcmul.f32 q0, q3, q1, #0
; CHECK-NEXT:    vcmla.f32 q0, q3, q1, #90
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %a.real = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %c.real = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %c.imag = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %d.real = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %d.imag = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %0 = fmul fast <2 x float> %a.imag, %b.real
  %1 = fmul fast <2 x float> %a.real, %b.imag
  %2 = fadd fast <2 x float> %1, %0
  %3 = fmul fast <2 x float> %a.real, %b.real
  %4 = fmul fast <2 x float> %b.imag, %a.imag
  %5 = fsub fast <2 x float> %3, %4
  %6 = fmul fast <2 x float> %d.real, %5
  %7 = fmul fast <2 x float> %2, %d.imag
  %8 = fmul fast <2 x float> %d.real, %2
  %9 = fmul fast <2 x float> %5, %d.imag
  %10 = fsub fast <2 x float> %6, %7
  %11 = fadd fast <2 x float> %8, %9
  %12 = fmul fast <2 x float> %c.real, %a.imag
  %13 = fmul fast <2 x float> %c.imag, %a.real
  %14 = fadd fast <2 x float> %13, %12
  %15 = fmul fast <2 x float> %14, %10
  %16 = fmul fast <2 x float> %c.real, %a.real
  %17 = fmul fast <2 x float> %c.imag, %a.imag
  %18 = fsub fast <2 x float> %16, %17
  %19 = fmul fast <2 x float> %18, %11
  %20 = fadd fast <2 x float> %15, %19
  %21 = fmul fast <2 x float> %18, %10
  %22 = fmul fast <2 x float> %14, %11
  %23 = fsub fast <2 x float> %21, %22
  %interleaved.vec = shufflevector <2 x float> %23, <2 x float> %20, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x float> %interleaved.vec
}

; Expected to transform
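; mul_add90_mul computes x = %a * %b and y = %a * %c as complex products, then
; returns y + i*x (zr = yr - xi, zi = yi + xr), which maps to two vcmul/vcmla
; chains followed by a vcadd with rotation #90.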
define arm_aapcs_vfpcc <4 x float> @mul_add90_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: mul_add90_mul:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vcmul.f32 q3, q2, q0, #0
; CHECK-NEXT:    vcmul.f32 q4, q1, q0, #0
; CHECK-NEXT:    vcmla.f32 q4, q1, q0, #90
; CHECK-NEXT:    vcmla.f32 q3, q2, q0, #90
; CHECK-NEXT:    vcadd.f32 q0, q3, q4, #90
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %ar = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %ai = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %br = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %bi = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %cr = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %ci = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>

  %i6 = fmul fast <2 x float> %br, %ar
  %i7 = fmul fast <2 x float> %bi, %ai
  %xr = fsub fast <2 x float> %i6, %i7
  %i9 = fmul fast <2 x float> %bi, %ar
  %i10 = fmul fast <2 x float> %br, %ai
  %xi = fadd fast <2 x float> %i9, %i10

  %j6 = fmul fast <2 x float> %cr, %ar
  %j7 = fmul fast <2 x float> %ci, %ai
  %yr = fsub fast <2 x float> %j6, %j7
  %j9 = fmul fast <2 x float> %ci, %ar
  %j10 = fmul fast <2 x float> %cr, %ai
  %yi = fadd fast <2 x float> %j9, %j10

  %zr = fsub fast <2 x float> %yr, %xi
  %zi = fadd fast <2 x float> %yi, %xr
  %interleaved.vec = shufflevector <2 x float> %zr, <2 x float> %zi, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x float> %interleaved.vec
}

; Expected to not transform
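; mul_triangle_addmul is like mul_add90_mul, but the second product reuses %i6
; and %i9 from the first (the full %c products are left commented out below), so
; %yr/%yi are not a complete complex multiply and no vcmul/vcmla is formed.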
define arm_aapcs_vfpcc <4 x float> @mul_triangle_addmul(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: mul_triangle_addmul:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vmov.f32 s16, s0
; CHECK-NEXT:    vmov.f32 s20, s5
; CHECK-NEXT:    vmov.f32 s17, s2
; CHECK-NEXT:    vmov.f32 s21, s7
; CHECK-NEXT:    vmov.f32 s5, s6
; CHECK-NEXT:    vmul.f32 q3, q5, q4
; CHECK-NEXT:    vmul.f32 q4, q1, q4
; CHECK-NEXT:    vmov.f32 s0, s1
; CHECK-NEXT:    vmov.f32 s1, s3
; CHECK-NEXT:    vmov q6, q4
; CHECK-NEXT:    vfms.f32 q6, q5, q0
; CHECK-NEXT:    vmov q7, q3
; CHECK-NEXT:    vfma.f32 q3, q1, q0
; CHECK-NEXT:    vmov.f32 s20, s8
; CHECK-NEXT:    vmov.f32 s21, s10
; CHECK-NEXT:    vmov.f32 s4, s9
; CHECK-NEXT:    vfma.f32 q7, q5, q0
; CHECK-NEXT:    vmov.f32 s5, s11
; CHECK-NEXT:    vadd.f32 q5, q7, q6
; CHECK-NEXT:    vfms.f32 q4, q1, q0
; CHECK-NEXT:    vmov.f32 s1, s20
; CHECK-NEXT:    vsub.f32 q1, q4, q3
; CHECK-NEXT:    vmov.f32 s3, s21
; CHECK-NEXT:    vmov.f32 s0, s4
; CHECK-NEXT:    vmov.f32 s2, s5
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    bx lr
entry:
  %ar = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %ai = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %br = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %bi = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %cr = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %ci = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>

  %i6 = fmul fast <2 x float> %br, %ar
  %i7 = fmul fast <2 x float> %bi, %ai
  %xr = fsub fast <2 x float> %i6, %i7
  %i9 = fmul fast <2 x float> %bi, %ar
  %i10 = fmul fast <2 x float> %br, %ai
  %xi = fadd fast <2 x float> %i9, %i10

  ;%j6 = fmul fast <2 x float> %cr, %ar
  %j7 = fmul fast <2 x float> %ci, %ai
  %yr = fsub fast <2 x float> %i6, %j7
  ;%j9 = fmul fast <2 x float> %ci, %ar
  %j10 = fmul fast <2 x float> %cr, %ai
  %yi = fadd fast <2 x float> %i9, %j10

  %zr = fsub fast <2 x float> %yr, %xi
  %zi = fadd fast <2 x float> %yi, %xr
  %interleaved.vec = shufflevector <2 x float> %zr, <2 x float> %zi, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x float> %interleaved.vec
}

; Expected to not transform
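; mul_triangle_multiuses repeats the (%a * %b) * %a triangle, but the
; intermediate product is also stored to %p; that extra use keeps it in
; deinterleaved form and blocks the transform.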
define arm_aapcs_vfpcc <4 x float> @mul_triangle_multiuses(<4 x float> %a, <4 x float> %b, ptr %p) {
; CHECK-LABEL: mul_triangle_multiuses:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vmov.f32 s16, s4
; CHECK-NEXT:    vmov.f32 s8, s1
; CHECK-NEXT:    vmov.f32 s17, s6
; CHECK-NEXT:    vmov.f32 s9, s3
; CHECK-NEXT:    vmov.f32 s4, s5
; CHECK-NEXT:    vmul.f32 q3, q2, q4
; CHECK-NEXT:    vmov.f32 s1, s2
; CHECK-NEXT:    vmov.f32 s5, s7
; CHECK-NEXT:    vfma.f32 q3, q1, q0
; CHECK-NEXT:    vmul.f32 q1, q1, q2
; CHECK-NEXT:    vneg.f32 q1, q1
; CHECK-NEXT:    vfma.f32 q1, q4, q0
; CHECK-NEXT:    vmov.f32 s18, s12
; CHECK-NEXT:    vmov.f32 s16, s4
; CHECK-NEXT:    vmov.f32 s17, s5
; CHECK-NEXT:    vmov.f32 s19, s13
; CHECK-NEXT:    vstrw.32 q4, [r0]
; CHECK-NEXT:    vmul.f32 q4, q3, q0
; CHECK-NEXT:    vfma.f32 q4, q1, q2
; CHECK-NEXT:    vmul.f32 q2, q3, q2
; CHECK-NEXT:    vneg.f32 q2, q2
; CHECK-NEXT:    vfma.f32 q2, q1, q0
; CHECK-NEXT:    vmov.f32 s1, s16
; CHECK-NEXT:    vmov.f32 s0, s8
; CHECK-NEXT:    vmov.f32 s2, s9
; CHECK-NEXT:    vmov.f32 s3, s17
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec35 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %strided.vec37 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec38 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %0 = fmul fast <2 x float> %strided.vec37, %strided.vec
  %1 = fmul fast <2 x float> %strided.vec38, %strided.vec35
  %2 = fsub fast <2 x float> %0, %1
  %3 = fmul fast <2 x float> %2, %strided.vec35
  %4 = fmul fast <2 x float> %strided.vec38, %strided.vec
  %5 = fmul fast <2 x float> %strided.vec35, %strided.vec37
  %6 = fadd fast <2 x float> %4, %5
  %otheruse = shufflevector <2 x float> %2, <2 x float> %6, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  store <4 x float> %otheruse, ptr %p
  %7 = fmul fast <2 x float> %6, %strided.vec
  %8 = fadd fast <2 x float> %3, %7
  %9 = fmul fast <2 x float> %2, %strided.vec
  %10 = fmul fast <2 x float> %6, %strided.vec35
  %11 = fsub fast <2 x float> %9, %10
  %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x float> %interleaved.vec
}

; Expected to transform
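; mul_addequal adds %c lane-wise to the complex product %a * %b; the product and
; the add fold into vcmla instructions accumulating into the register holding %c.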
define <4 x float> @mul_addequal(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: mul_addequal:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    add.w r12, sp, #16
; CHECK-NEXT:    vmov d0, r0, r1
; CHECK-NEXT:    mov r0, sp
; CHECK-NEXT:    vldrw.u32 q2, [r12]
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vmov d1, r2, r3
; CHECK-NEXT:    vcmla.f32 q2, q0, q1, #0
; CHECK-NEXT:    vcmla.f32 q2, q0, q1, #90
; CHECK-NEXT:    vmov r0, r1, d4
; CHECK-NEXT:    vmov r2, r3, d5
; CHECK-NEXT:    bx lr
entry:
  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %0 = fmul fast <2 x float> %b.imag, %strided.vec
  %1 = fmul fast <2 x float> %b.real, %a.imag
  %2 = fadd fast <2 x float> %1, %0
  %3 = fmul fast <2 x float> %b.real, %strided.vec
  %4 = fmul fast <2 x float> %a.imag, %b.imag
  %5 = fsub fast <2 x float> %3, %4
  %c.real = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %c.imag = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %6 = fadd fast <2 x float> %5, %c.real
  %7 = fadd fast <2 x float> %2, %c.imag
  %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x float> %interleaved.vec
}

; Expected to transform
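; mul_subequal subtracts %c lane-wise from the complex product %a * %b; the
; product becomes a vcmul/vcmla pair followed by an ordinary vsub.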
define <4 x float> @mul_subequal(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: mul_subequal:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov d0, r0, r1
; CHECK-NEXT:    mov r1, sp
; CHECK-NEXT:    vldrw.u32 q2, [r1]
; CHECK-NEXT:    vmov d1, r2, r3
; CHECK-NEXT:    add r0, sp, #16
; CHECK-NEXT:    vcmul.f32 q3, q0, q2, #0
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vcmla.f32 q3, q0, q2, #90
; CHECK-NEXT:    vsub.f32 q0, q3, q1
; CHECK-NEXT:    vmov r0, r1, d0
; CHECK-NEXT:    vmov r2, r3, d1
; CHECK-NEXT:    bx lr
entry:
  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %0 = fmul fast <2 x float> %b.imag, %strided.vec
  %1 = fmul fast <2 x float> %b.real, %a.imag
  %2 = fadd fast <2 x float> %1, %0
  %3 = fmul fast <2 x float> %b.real, %strided.vec
  %4 = fmul fast <2 x float> %a.imag, %b.imag
  %5 = fsub fast <2 x float> %3, %4
  %c.real = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %c.imag = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %6 = fsub fast <2 x float> %5, %c.real
  %7 = fsub fast <2 x float> %2, %c.imag
  %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x float> %interleaved.vec
}

; Expected to transform
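; mul_mulequal multiplies the real and imaginary lanes of the complex product
; %a * %b element-wise by the corresponding lanes of %c; the complex product
; becomes vcmul/vcmla and the final multiply stays an ordinary vmul.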
define <4 x float> @mul_mulequal(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: mul_mulequal:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov d0, r0, r1
; CHECK-NEXT:    mov r1, sp
; CHECK-NEXT:    vldrw.u32 q2, [r1]
; CHECK-NEXT:    vmov d1, r2, r3
; CHECK-NEXT:    add r0, sp, #16
; CHECK-NEXT:    vcmul.f32 q3, q0, q2, #0
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vcmla.f32 q3, q0, q2, #90
; CHECK-NEXT:    vmul.f32 q0, q3, q1
; CHECK-NEXT:    vmov r0, r1, d0
; CHECK-NEXT:    vmov r2, r3, d1
; CHECK-NEXT:    bx lr
entry:
  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %0 = fmul fast <2 x float> %b.imag, %strided.vec
  %1 = fmul fast <2 x float> %b.real, %a.imag
  %2 = fadd fast <2 x float> %1, %0
  %3 = fmul fast <2 x float> %b.real, %strided.vec
  %4 = fmul fast <2 x float> %a.imag, %b.imag
  %5 = fsub fast <2 x float> %3, %4
  %c.real = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %c.imag = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %6 = fmul fast <2 x float> %5, %c.real
  %7 = fmul fast <2 x float> %2, %c.imag
  %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x float> %interleaved.vec
}

; Expected to not transform
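; mul_divequal is the same as mul_mulequal except the final element-wise op is an
; fdiv; here the chain is not converted and no vcmul/vcmla is emitted, the result
; being built with per-lane vdiv instructions instead.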
define <4 x float> @mul_divequal(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: mul_divequal:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d10, d11}
; CHECK-NEXT:    vpush {d10, d11}
; CHECK-NEXT:    .vsave {d8}
; CHECK-NEXT:    vpush {d8}
; CHECK-NEXT:    vmov d0, r0, r1
; CHECK-NEXT:    add r0, sp, #24
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vmov d1, r2, r3
; CHECK-NEXT:    vmov.f32 s16, s1
; CHECK-NEXT:    add.w r12, sp, #40
; CHECK-NEXT:    vmov.f32 s12, s5
; CHECK-NEXT:    vmov.f32 s13, s7
; CHECK-NEXT:    vmov.f32 s1, s2
; CHECK-NEXT:    vmov.f32 s8, s4
; CHECK-NEXT:    vmul.f32 q5, q3, q0
; CHECK-NEXT:    vmov.f32 s9, s6
; CHECK-NEXT:    vldrw.u32 q1, [r12]
; CHECK-NEXT:    vmov.f32 s17, s3
; CHECK-NEXT:    vfma.f32 q5, q2, q4
; CHECK-NEXT:    vmul.f32 q3, q4, q3
; CHECK-NEXT:    vdiv.f32 s3, s21, s7
; CHECK-NEXT:    vneg.f32 q3, q3
; CHECK-NEXT:    vfma.f32 q3, q2, q0
; CHECK-NEXT:    vdiv.f32 s1, s20, s5
; CHECK-NEXT:    vdiv.f32 s2, s13, s6
; CHECK-NEXT:    vdiv.f32 s0, s12, s4
; CHECK-NEXT:    vmov r0, r1, d0
; CHECK-NEXT:    vmov r2, r3, d1
; CHECK-NEXT:    vpop {d8}
; CHECK-NEXT:    vpop {d10, d11}
; CHECK-NEXT:    bx lr
entry:
  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %0 = fmul fast <2 x float> %b.imag, %strided.vec
  %1 = fmul fast <2 x float> %b.real, %a.imag
  %2 = fadd fast <2 x float> %1, %0
  %3 = fmul fast <2 x float> %b.real, %strided.vec
  %4 = fmul fast <2 x float> %a.imag, %b.imag
  %5 = fsub fast <2 x float> %3, %4
  %c.real = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %c.imag = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %6 = fdiv fast <2 x float> %5, %c.real
  %7 = fdiv fast <2 x float> %2, %c.imag
  %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x float> %interleaved.vec
}

; Expected to transform
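; mul_negequal negates both lanes of the complex product %a * %b; the negation
; folds into the rotations, giving vcmul #180 followed by vcmla #270.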
define <4 x float> @mul_negequal(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: mul_negequal:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov d0, r0, r1
; CHECK-NEXT:    mov r0, sp
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vmov d1, r2, r3
; CHECK-NEXT:    vcmul.f32 q2, q0, q1, #180
; CHECK-NEXT:    vcmla.f32 q2, q0, q1, #270
; CHECK-NEXT:    vmov r0, r1, d4
; CHECK-NEXT:    vmov r2, r3, d5
; CHECK-NEXT:    bx lr
entry:
  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %0 = fmul fast <2 x float> %b.imag, %strided.vec
  %1 = fmul fast <2 x float> %b.real, %a.imag
  %2 = fadd fast <2 x float> %1, %0
  %3 = fmul fast <2 x float> %b.real, %strided.vec
  %4 = fmul fast <2 x float> %a.imag, %b.imag
  %5 = fsub fast <2 x float> %3, %4
  %6 = fneg fast <2 x float> %5
  %7 = fneg fast <2 x float> %2
  %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x float> %interleaved.vec
}