1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LV
3 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LV,CHECKFP
4 ; RUN: llc -early-live-intervals -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LIS
5 ; RUN: llc -early-live-intervals -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LIS,CHECKFP
; Reverses all four i32 lanes of %src (mask <3, 2, 1, 0>). The expected
; assembly builds the result in q1 with four single-lane moves and then
; copies it back to q0.
7 define arm_aapcs_vfpcc <4 x i32> @shuffle1_i32(<4 x i32> %src) {
8 ; CHECK-LABEL: shuffle1_i32:
9 ; CHECK: @ %bb.0: @ %entry
10 ; CHECK-NEXT: vmov.f32 s4, s3
11 ; CHECK-NEXT: vmov.f32 s5, s2
12 ; CHECK-NEXT: vmov.f32 s6, s1
13 ; CHECK-NEXT: vmov.f32 s7, s0
14 ; CHECK-NEXT: vmov q0, q1
17 %out = shufflevector <4 x i32> %src, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; Identity shuffle of a <4 x i32> (mask <0, 1, 2, 3>); the visible expected
; assembly contains no lane-move instructions.
21 define arm_aapcs_vfpcc <4 x i32> @shuffle2_i32(<4 x i32> %src) {
22 ; CHECK-LABEL: shuffle2_i32:
23 ; CHECK: @ %bb.0: @ %entry
26 %out = shufflevector <4 x i32> %src, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; Swaps the outer two i32 lanes and keeps the inner two (mask <3, 1, 2, 0>).
; Expected as four single-lane moves into q1 followed by a copy to q0.
30 define arm_aapcs_vfpcc <4 x i32> @shuffle3_i32(<4 x i32> %src) {
31 ; CHECK-LABEL: shuffle3_i32:
32 ; CHECK: @ %bb.0: @ %entry
33 ; CHECK-NEXT: vmov.f32 s4, s3
34 ; CHECK-NEXT: vmov.f32 s5, s1
35 ; CHECK-NEXT: vmov.f32 s6, s2
36 ; CHECK-NEXT: vmov.f32 s7, s0
37 ; CHECK-NEXT: vmov q0, q1
40 %out = shufflevector <4 x i32> %src, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
; Swaps the two i32 lanes inside each 64-bit half (mask <1, 0, 3, 2>).
; Expected to be a single vrev64.32 into q1 plus a register copy.
44 define arm_aapcs_vfpcc <4 x i32> @shuffle5_i32(<4 x i32> %src) {
45 ; CHECK-LABEL: shuffle5_i32:
46 ; CHECK: @ %bb.0: @ %entry
47 ; CHECK-NEXT: vrev64.32 q1, q0
48 ; CHECK-NEXT: vmov q0, q1
51 %out = shufflevector <4 x i32> %src, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
; Only lane 3 is defined and it stays in place (mask <undef, undef, undef, 3>);
; the visible expected assembly contains no lane-move instructions.
55 define arm_aapcs_vfpcc <4 x i32> @shuffle6_i32(<4 x i32> %src) {
56 ; CHECK-LABEL: shuffle6_i32:
57 ; CHECK: @ %bb.0: @ %entry
60 %out = shufflevector <4 x i32> %src, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 3>
; "One-off" shuffle: %src1 with lane 2 replaced by %src1's own lane 1
; (mask <0, 1, 1, 3>). Expected to be a single lane move (s1 -> s2).
64 define arm_aapcs_vfpcc <4 x i32> @oneoff11_i32(<4 x i32> %src1, <4 x i32> %src2) {
65 ; CHECK-LABEL: oneoff11_i32:
66 ; CHECK: @ %bb.0: @ %entry
67 ; CHECK-NEXT: vmov.f32 s2, s1
70 %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> <i32 0, i32 1, i32 1, i32 3>
; "One-off" shuffle: %src1 with lane 0 replaced by lane 0 of %src2
; (mask <4, 1, 2, 3>). Expected to be a single lane move (s4 -> s0).
74 define arm_aapcs_vfpcc <4 x i32> @oneoff12_i32(<4 x i32> %src1, <4 x i32> %src2) {
75 ; CHECK-LABEL: oneoff12_i32:
76 ; CHECK: @ %bb.0: @ %entry
77 ; CHECK-NEXT: vmov.f32 s0, s4
80 %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
; "One-off" shuffle: %src2 with lane 3 replaced by lane 0 of %src1
; (mask <4, 5, 6, 0>). Expected as one lane move into q1, then a copy to q0.
84 define arm_aapcs_vfpcc <4 x i32> @oneoff21_i32(<4 x i32> %src1, <4 x i32> %src2) {
85 ; CHECK-LABEL: oneoff21_i32:
86 ; CHECK: @ %bb.0: @ %entry
87 ; CHECK-NEXT: vmov.f32 s7, s0
88 ; CHECK-NEXT: vmov q0, q1
91 %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> <i32 4, i32 5, i32 6, i32 0>
; "One-off" shuffle: %src2 with lane 2 replaced by %src2's own lane 0
; (mask <4, 5, 4, 7>). Expected as a copy of q1 to q0, then one lane move.
95 define arm_aapcs_vfpcc <4 x i32> @oneoff22_i32(<4 x i32> %src1, <4 x i32> %src2) {
96 ; CHECK-LABEL: oneoff22_i32:
97 ; CHECK: @ %bb.0: @ %entry
98 ; CHECK-NEXT: vmov q0, q1
99 ; CHECK-NEXT: vmov.f32 s2, s0
102 %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> <i32 4, i32 5, i32 4, i32 7>
; "One-off" shuffle with an undef lane: %src1 with lane 1 taken from lane 0 of
; %src2 and lane 2 left undefined (mask <0, 4, undef, 3>). Expected to be a
; single lane move (s4 -> s1).
106 define arm_aapcs_vfpcc <4 x i32> @oneoffundef_i32(<4 x i32> %src1, <4 x i32> %src2) {
107 ; CHECK-LABEL: oneoffundef_i32:
108 ; CHECK: @ %bb.0: @ %entry
109 ; CHECK-NEXT: vmov.f32 s1, s4
112 %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> <i32 0, i32 4, i32 undef, i32 3>
; Stride-2 de-interleave of an <8 x i32>: the even lanes (0, 2, 4, 6) and the
; odd lanes (1, 3, 5, 7) are extracted and added together. Expected as
; single-lane moves followed by one vadd.i32.
116 define arm_aapcs_vfpcc <4 x i32> @shuffle2step_i32(<8 x i32> %src) {
117 ; CHECK-LABEL: shuffle2step_i32:
118 ; CHECK: @ %bb.0: @ %entry
119 ; CHECK-NEXT: vmov.f32 s8, s1
120 ; CHECK-NEXT: vmov.f32 s9, s3
121 ; CHECK-NEXT: vmov.f32 s1, s2
122 ; CHECK-NEXT: vmov.f32 s10, s5
123 ; CHECK-NEXT: vmov.f32 s11, s7
124 ; CHECK-NEXT: vmov.f32 s2, s4
125 ; CHECK-NEXT: vmov.f32 s3, s6
126 ; CHECK-NEXT: vadd.i32 q0, q0, q2
129 %s1 = shufflevector <8 x i32> %src, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
130 %s2 = shufflevector <8 x i32> %src, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
131 %r = add <4 x i32> %s1, %s2
; Stride-3 de-interleave: three <4 x i32> vectors are taken from elements
; 0..11 of the <16 x i32> input (starting at offsets 0, 1 and 2) and summed.
; The expected assembly saves and restores d8-d9 around the lane moves.
135 define arm_aapcs_vfpcc <4 x i32> @shuffle3step_i32(<16 x i32> %src) {
136 ; CHECK-LABEL: shuffle3step_i32:
137 ; CHECK: @ %bb.0: @ %entry
138 ; CHECK-NEXT: .vsave {d8, d9}
139 ; CHECK-NEXT: vpush {d8, d9}
140 ; CHECK-NEXT: vmov.f32 s13, s4
141 ; CHECK-NEXT: vmov.f32 s14, s7
142 ; CHECK-NEXT: vmov.f32 s18, s6
143 ; CHECK-NEXT: vmov.f32 s12, s1
144 ; CHECK-NEXT: vmov.f32 s15, s10
145 ; CHECK-NEXT: vmov.f32 s16, s0
146 ; CHECK-NEXT: vmov.f32 s17, s3
147 ; CHECK-NEXT: vmov.f32 s19, s9
148 ; CHECK-NEXT: vadd.i32 q3, q4, q3
149 ; CHECK-NEXT: vmov.f32 s4, s2
150 ; CHECK-NEXT: vmov.f32 s6, s8
151 ; CHECK-NEXT: vmov.f32 s7, s11
152 ; CHECK-NEXT: vadd.i32 q0, q3, q1
153 ; CHECK-NEXT: vpop {d8, d9}
156 %s1 = shufflevector <16 x i32> %src, <16 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
157 %s2 = shufflevector <16 x i32> %src, <16 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
158 %s3 = shufflevector <16 x i32> %src, <16 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
159 %a = add <4 x i32> %s1, %s2
160 %r = add <4 x i32> %a, %s3
; Stride-4 de-interleave: four <4 x i32> vectors are taken from the
; <16 x i32> input (starting at offsets 0, 1, 2 and 3), added pairwise and
; then summed. The expected assembly saves and restores d8-d11.
164 define arm_aapcs_vfpcc <4 x i32> @shuffle4step_i32(<16 x i32> %src) {
165 ; CHECK-LABEL: shuffle4step_i32:
166 ; CHECK: @ %bb.0: @ %entry
167 ; CHECK-NEXT: .vsave {d8, d9, d10, d11}
168 ; CHECK-NEXT: vpush {d8, d9, d10, d11}
169 ; CHECK-NEXT: vmov.f32 s16, s3
170 ; CHECK-NEXT: vmov.f32 s20, s2
171 ; CHECK-NEXT: vmov.f32 s17, s7
172 ; CHECK-NEXT: vmov.f32 s18, s11
173 ; CHECK-NEXT: vmov.f32 s19, s15
174 ; CHECK-NEXT: vmov.f32 s21, s6
175 ; CHECK-NEXT: vmov.f32 s22, s10
176 ; CHECK-NEXT: vmov.f32 s23, s14
177 ; CHECK-NEXT: vadd.i32 q4, q5, q4
178 ; CHECK-NEXT: vmov.f32 s20, s1
179 ; CHECK-NEXT: vmov.f32 s21, s5
180 ; CHECK-NEXT: vmov.f32 s22, s9
181 ; CHECK-NEXT: vmov.f32 s23, s13
182 ; CHECK-NEXT: vmov.f32 s1, s4
183 ; CHECK-NEXT: vmov.f32 s2, s8
184 ; CHECK-NEXT: vmov.f32 s3, s12
185 ; CHECK-NEXT: vadd.i32 q0, q0, q5
186 ; CHECK-NEXT: vadd.i32 q0, q0, q4
187 ; CHECK-NEXT: vpop {d8, d9, d10, d11}
190 %s1 = shufflevector <16 x i32> %src, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
191 %s2 = shufflevector <16 x i32> %src, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
192 %s3 = shufflevector <16 x i32> %src, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
193 %s4 = shufflevector <16 x i32> %src, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
194 %a1 = add <4 x i32> %s1, %s2
195 %a2 = add <4 x i32> %s3, %s4
196 %r = add <4 x i32> %a1, %a2
; Reverses all eight i16 lanes of %src (mask <7, 6, ..., 0>). Expected as a
; vrev64.16 into q1 followed by four 32-bit lane moves that write the two
; 64-bit halves back to q0 in swapped order.
202 define arm_aapcs_vfpcc <8 x i16> @shuffle1_i16(<8 x i16> %src) {
203 ; CHECK-LABEL: shuffle1_i16:
204 ; CHECK: @ %bb.0: @ %entry
205 ; CHECK-NEXT: vrev64.16 q1, q0
206 ; CHECK-NEXT: vmov.f32 s0, s6
207 ; CHECK-NEXT: vmov.f32 s1, s7
208 ; CHECK-NEXT: vmov.f32 s2, s4
209 ; CHECK-NEXT: vmov.f32 s3, s5
212 %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; Identity shuffle of an <8 x i16> (mask <0, 1, ..., 7>); the visible expected
; assembly contains no lane-move instructions.
216 define arm_aapcs_vfpcc <8 x i16> @shuffle2_i16(<8 x i16> %src) {
217 ; CHECK-LABEL: shuffle2_i16:
218 ; CHECK: @ %bb.0: @ %entry
221 %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Irregular single-source permutation of an <8 x i16>
; (mask <4, 5, 7, 6, 3, 1, 2, 0>). Expected to be assembled from
; vmovx.f16 / vins.f16 half-lane operations plus 32-bit lane moves after
; copying the source to q1.
225 define arm_aapcs_vfpcc <8 x i16> @shuffle3_i16(<8 x i16> %src) {
226 ; CHECK-LABEL: shuffle3_i16:
227 ; CHECK: @ %bb.0: @ %entry
228 ; CHECK-NEXT: vmov q1, q0
229 ; CHECK-NEXT: vmovx.f16 s2, s5
230 ; CHECK-NEXT: vmovx.f16 s0, s4
231 ; CHECK-NEXT: vins.f16 s5, s4
232 ; CHECK-NEXT: vins.f16 s2, s0
233 ; CHECK-NEXT: vmov.f32 s3, s5
234 ; CHECK-NEXT: vmovx.f16 s1, s7
235 ; CHECK-NEXT: vmov.f32 s0, s6
236 ; CHECK-NEXT: vins.f16 s1, s7
239 %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 7, i32 6, i32 3, i32 1, i32 2, i32 0>
; Reverses the four i16 lanes inside each 64-bit half
; (mask <3, 2, 1, 0, 7, 6, 5, 4>). Expected to be a single vrev64.16 into q1
; plus a register copy.
243 define arm_aapcs_vfpcc <8 x i16> @shuffle5_i16(<8 x i16> %src) {
244 ; CHECK-LABEL: shuffle5_i16:
245 ; CHECK: @ %bb.0: @ %entry
246 ; CHECK-NEXT: vrev64.16 q1, q0
247 ; CHECK-NEXT: vmov q0, q1
250 %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
; Swaps the two i16 lanes inside each 32-bit word
; (mask <1, 0, 3, 2, 5, 4, 7, 6>). Expected to be a single in-place vrev32.16.
254 define arm_aapcs_vfpcc <8 x i16> @shuffle6_i16(<8 x i16> %src) {
255 ; CHECK-LABEL: shuffle6_i16:
256 ; CHECK: @ %bb.0: @ %entry
257 ; CHECK-NEXT: vrev32.16 q0, q0
260 %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
; "One-off" shuffle: %src1 with lane 2 replaced by %src1's own lane 1.
; Expected as one lane extract into r0 followed by one lane insert.
264 define arm_aapcs_vfpcc <8 x i16> @oneoff11_i16(<8 x i16> %src1, <8 x i16> %src2) {
265 ; CHECK-LABEL: oneoff11_i16:
266 ; CHECK: @ %bb.0: @ %entry
267 ; CHECK-NEXT: vmov.u16 r0, q0[1]
268 ; CHECK-NEXT: vmov.16 q0[2], r0
271 %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> <i32 0, i32 1, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7>
; "One-off" shuffle: %src1 with lane 0 replaced by lane 0 of %src2 (mask
; index 8). Expected as one lane extract from q1 and one lane insert into q0.
275 define arm_aapcs_vfpcc <8 x i16> @oneoff12_i16(<8 x i16> %src1, <8 x i16> %src2) {
276 ; CHECK-LABEL: oneoff12_i16:
277 ; CHECK: @ %bb.0: @ %entry
278 ; CHECK-NEXT: vmov.u16 r0, q1[0]
279 ; CHECK-NEXT: vmov.16 q0[0], r0
282 %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> <i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; "One-off" shuffle: %src2 with lane 3 replaced by lane 0 of %src1
; (mask <8, 9, 10, 0, 12, 13, 14, 15>). Expected as a single vins.f16 into q1
; followed by a copy to q0.
286 define arm_aapcs_vfpcc <8 x i16> @oneoff21_i16(<8 x i16> %src1, <8 x i16> %src2) {
287 ; CHECK-LABEL: oneoff21_i16:
288 ; CHECK: @ %bb.0: @ %entry
289 ; CHECK-NEXT: vins.f16 s5, s0
290 ; CHECK-NEXT: vmov q0, q1
293 %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> <i32 8, i32 9, i32 10, i32 0, i32 12, i32 13, i32 14, i32 15>
; "One-off" shuffle: %src2 with lane 0 replaced by %src2's own lane 6 (mask
; index 14). Expected as a copy of q1 to q0, then one lane extract and insert.
297 define arm_aapcs_vfpcc <8 x i16> @oneoff22_i16(<8 x i16> %src1, <8 x i16> %src2) {
298 ; CHECK-LABEL: oneoff22_i16:
299 ; CHECK: @ %bb.0: @ %entry
300 ; CHECK-NEXT: vmov q0, q1
301 ; CHECK-NEXT: vmov.u16 r0, q1[6]
302 ; CHECK-NEXT: vmov.16 q0[0], r0
305 %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> <i32 14, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; "One-off" shuffle with undef lanes: %src2 with lane 5 replaced by lane 3 of
; %src1, and lanes 2 and 3 left undefined. Expected as one lane extract from
; q0, one lane insert into q1, and a copy to q0.
309 define arm_aapcs_vfpcc <8 x i16> @oneoffundef_i16(<8 x i16> %src1, <8 x i16> %src2) {
310 ; CHECK-LABEL: oneoffundef_i16:
311 ; CHECK: @ %bb.0: @ %entry
312 ; CHECK-NEXT: vmov.u16 r0, q0[3]
313 ; CHECK-NEXT: vmov.16 q1[5], r0
314 ; CHECK-NEXT: vmov q0, q1
317 %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> <i32 8, i32 9, i32 undef, i32 undef, i32 12, i32 3, i32 14, i32 15>
; Stride-2 de-interleave of a <16 x i16>: even-indexed and odd-indexed lanes
; are extracted and added. The expected assembly goes through a 32-byte stack
; area: narrowing vstrh.32 stores write the inputs (one set after a
; vshr.u32 #16), and the two results are reloaded with vldrw.u32 and added.
321 define arm_aapcs_vfpcc <8 x i16> @shuffle2step_i16(<16 x i16> %src) {
322 ; CHECK-LABEL: shuffle2step_i16:
323 ; CHECK: @ %bb.0: @ %entry
324 ; CHECK-NEXT: .pad #32
325 ; CHECK-NEXT: sub sp, #32
326 ; CHECK-NEXT: mov r0, sp
327 ; CHECK-NEXT: vshr.u32 q2, q1, #16
328 ; CHECK-NEXT: vstrh.32 q2, [r0, #8]
329 ; CHECK-NEXT: vshr.u32 q2, q0, #16
330 ; CHECK-NEXT: add r1, sp, #16
331 ; CHECK-NEXT: vstrh.32 q2, [r0]
332 ; CHECK-NEXT: vstrh.32 q1, [r1, #8]
333 ; CHECK-NEXT: vstrh.32 q0, [r1]
334 ; CHECK-NEXT: vldrw.u32 q0, [r0]
335 ; CHECK-NEXT: vldrw.u32 q1, [r1]
336 ; CHECK-NEXT: vadd.i16 q0, q1, q0
337 ; CHECK-NEXT: add sp, #32
340 %s1 = shufflevector <16 x i16> %src, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
341 %s2 = shufflevector <16 x i16> %src, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
342 %r = add <8 x i16> %s1, %s2
; Stride-3 de-interleave: three <8 x i16> vectors are taken from elements
; 0..23 of the <32 x i16> input (starting at offsets 0, 1 and 2) and summed.
; Expected as vmovx.f16 / vins.f16 half-lane operations with d8-d9 saved and
; restored around them.
346 define arm_aapcs_vfpcc <8 x i16> @shuffle3step_i16(<32 x i16> %src) {
347 ; CHECK-LABEL: shuffle3step_i16:
348 ; CHECK: @ %bb.0: @ %entry
349 ; CHECK-NEXT: .vsave {d8, d9}
350 ; CHECK-NEXT: vpush {d8, d9}
351 ; CHECK-NEXT: vmovx.f16 s12, s0
352 ; CHECK-NEXT: vmov.f32 s16, s1
353 ; CHECK-NEXT: vins.f16 s12, s2
354 ; CHECK-NEXT: vmovx.f16 s2, s2
355 ; CHECK-NEXT: vins.f16 s16, s2
356 ; CHECK-NEXT: vmovx.f16 s2, s5
357 ; CHECK-NEXT: vmov.f32 s17, s4
358 ; CHECK-NEXT: vmovx.f16 s13, s3
359 ; CHECK-NEXT: vins.f16 s17, s2
360 ; CHECK-NEXT: vmov.f32 s18, s7
361 ; CHECK-NEXT: vmovx.f16 s2, s8
362 ; CHECK-NEXT: vmov.f32 s19, s10
363 ; CHECK-NEXT: vins.f16 s18, s2
364 ; CHECK-NEXT: vmovx.f16 s2, s11
365 ; CHECK-NEXT: vins.f16 s19, s2
366 ; CHECK-NEXT: vmovx.f16 s2, s1
367 ; CHECK-NEXT: vins.f16 s0, s2
368 ; CHECK-NEXT: vmovx.f16 s2, s4
369 ; CHECK-NEXT: vins.f16 s3, s2
370 ; CHECK-NEXT: vmovx.f16 s2, s7
371 ; CHECK-NEXT: vmovx.f16 s4, s10
372 ; CHECK-NEXT: vmovx.f16 s14, s6
373 ; CHECK-NEXT: vmovx.f16 s15, s9
374 ; CHECK-NEXT: vins.f16 s6, s2
375 ; CHECK-NEXT: vins.f16 s9, s4
376 ; CHECK-NEXT: vmov.f32 s1, s3
377 ; CHECK-NEXT: vins.f16 s14, s8
378 ; CHECK-NEXT: vins.f16 s15, s11
379 ; CHECK-NEXT: vins.f16 s13, s5
380 ; CHECK-NEXT: vmov.f32 s2, s6
381 ; CHECK-NEXT: vmov.f32 s3, s9
382 ; CHECK-NEXT: vadd.i16 q0, q0, q3
383 ; CHECK-NEXT: vadd.i16 q0, q0, q4
384 ; CHECK-NEXT: vpop {d8, d9}
387 %s1 = shufflevector <32 x i16> %src, <32 x i16> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
388 %s2 = shufflevector <32 x i16> %src, <32 x i16> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
389 %s3 = shufflevector <32 x i16> %src, <32 x i16> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
390 %a = add <8 x i16> %s1, %s2
391 %r = add <8 x i16> %a, %s3
; Stride-4 de-interleave: four <8 x i16> vectors are taken from the
; <32 x i16> input (starting at offsets 0, 1, 2 and 3), added pairwise and
; then summed. Expected as vmovx.f16 / vins.f16 half-lane operations with
; d8-d11 saved and restored around them.
395 define arm_aapcs_vfpcc <8 x i16> @shuffle4step_i16(<32 x i16> %src) {
396 ; CHECK-LABEL: shuffle4step_i16:
397 ; CHECK: @ %bb.0: @ %entry
398 ; CHECK-NEXT: .vsave {d8, d9, d10, d11}
399 ; CHECK-NEXT: vpush {d8, d9, d10, d11}
400 ; CHECK-NEXT: vmovx.f16 s18, s9
401 ; CHECK-NEXT: vmovx.f16 s16, s11
402 ; CHECK-NEXT: vins.f16 s18, s16
403 ; CHECK-NEXT: vmovx.f16 s19, s13
404 ; CHECK-NEXT: vmovx.f16 s16, s15
405 ; CHECK-NEXT: vmovx.f16 s20, s3
406 ; CHECK-NEXT: vins.f16 s19, s16
407 ; CHECK-NEXT: vmovx.f16 s16, s1
408 ; CHECK-NEXT: vins.f16 s16, s20
409 ; CHECK-NEXT: vmovx.f16 s17, s5
410 ; CHECK-NEXT: vmovx.f16 s20, s7
411 ; CHECK-NEXT: vins.f16 s9, s11
412 ; CHECK-NEXT: vins.f16 s13, s15
413 ; CHECK-NEXT: vins.f16 s5, s7
414 ; CHECK-NEXT: vins.f16 s1, s3
415 ; CHECK-NEXT: vins.f16 s17, s20
416 ; CHECK-NEXT: vmov.f32 s20, s1
417 ; CHECK-NEXT: vmovx.f16 s1, s10
418 ; CHECK-NEXT: vmov.f32 s22, s9
419 ; CHECK-NEXT: vmov.f32 s23, s13
420 ; CHECK-NEXT: vmov.f32 s21, s5
421 ; CHECK-NEXT: vadd.i16 q4, q5, q4
422 ; CHECK-NEXT: vmovx.f16 s22, s8
423 ; CHECK-NEXT: vins.f16 s22, s1
424 ; CHECK-NEXT: vmovx.f16 s23, s12
425 ; CHECK-NEXT: vmovx.f16 s1, s14
426 ; CHECK-NEXT: vmovx.f16 s20, s0
427 ; CHECK-NEXT: vins.f16 s23, s1
428 ; CHECK-NEXT: vmovx.f16 s1, s2
429 ; CHECK-NEXT: vins.f16 s20, s1
430 ; CHECK-NEXT: vmovx.f16 s21, s4
431 ; CHECK-NEXT: vmovx.f16 s1, s6
432 ; CHECK-NEXT: vins.f16 s12, s14
433 ; CHECK-NEXT: vins.f16 s8, s10
434 ; CHECK-NEXT: vins.f16 s4, s6
435 ; CHECK-NEXT: vins.f16 s21, s1
436 ; CHECK-NEXT: vins.f16 s0, s2
437 ; CHECK-NEXT: vmov.f32 s3, s12
438 ; CHECK-NEXT: vmov.f32 s1, s4
439 ; CHECK-NEXT: vmov.f32 s2, s8
440 ; CHECK-NEXT: vadd.i16 q0, q0, q5
441 ; CHECK-NEXT: vadd.i16 q0, q0, q4
442 ; CHECK-NEXT: vpop {d8, d9, d10, d11}
445 %s1 = shufflevector <32 x i16> %src, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
446 %s2 = shufflevector <32 x i16> %src, <32 x i16> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
447 %s3 = shufflevector <32 x i16> %src, <32 x i16> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
448 %s4 = shufflevector <32 x i16> %src, <32 x i16> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
449 %a1 = add <8 x i16> %s1, %s2
450 %a2 = add <8 x i16> %s3, %s4
451 %r = add <8 x i16> %a1, %a2
; Reverses all sixteen i8 lanes of %src (mask <15, 14, ..., 0>). Expected as a
; vrev64.8 into q1 followed by four 32-bit lane moves that write the two
; 64-bit halves back to q0 in swapped order.
457 define arm_aapcs_vfpcc <16 x i8> @shuffle1_i8(<16 x i8> %src) {
458 ; CHECK-LABEL: shuffle1_i8:
459 ; CHECK: @ %bb.0: @ %entry
460 ; CHECK-NEXT: vrev64.8 q1, q0
461 ; CHECK-NEXT: vmov.f32 s0, s6
462 ; CHECK-NEXT: vmov.f32 s1, s7
463 ; CHECK-NEXT: vmov.f32 s2, s4
464 ; CHECK-NEXT: vmov.f32 s3, s5
467 %out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; Identity shuffle of a <16 x i8> (mask <0, 1, ..., 15>); the visible expected
; assembly contains no lane-move instructions.
471 define arm_aapcs_vfpcc <16 x i8> @shuffle2_i8(<16 x i8> %src) {
472 ; CHECK-LABEL: shuffle2_i8:
473 ; CHECK: @ %bb.0: @ %entry
476 %out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; Irregular single-source permutation of a <16 x i8>. Expected as a copy of
; the source to q1 followed by one lane extract (via r0) and one lane insert
; per result byte, sixteen pairs in total, in mask order.
480 define arm_aapcs_vfpcc <16 x i8> @shuffle3_i8(<16 x i8> %src) {
481 ; CHECK-LABEL: shuffle3_i8:
482 ; CHECK: @ %bb.0: @ %entry
483 ; CHECK-NEXT: vmov q1, q0
484 ; CHECK-NEXT: vmov.u8 r0, q0[4]
485 ; CHECK-NEXT: vmov.8 q0[0], r0
486 ; CHECK-NEXT: vmov.u8 r0, q1[5]
487 ; CHECK-NEXT: vmov.8 q0[1], r0
488 ; CHECK-NEXT: vmov.u8 r0, q1[15]
489 ; CHECK-NEXT: vmov.8 q0[2], r0
490 ; CHECK-NEXT: vmov.u8 r0, q1[7]
491 ; CHECK-NEXT: vmov.8 q0[3], r0
492 ; CHECK-NEXT: vmov.u8 r0, q1[14]
493 ; CHECK-NEXT: vmov.8 q0[4], r0
494 ; CHECK-NEXT: vmov.u8 r0, q1[9]
495 ; CHECK-NEXT: vmov.8 q0[5], r0
496 ; CHECK-NEXT: vmov.u8 r0, q1[6]
497 ; CHECK-NEXT: vmov.8 q0[6], r0
498 ; CHECK-NEXT: vmov.u8 r0, q1[3]
499 ; CHECK-NEXT: vmov.8 q0[7], r0
500 ; CHECK-NEXT: vmov.u8 r0, q1[10]
501 ; CHECK-NEXT: vmov.8 q0[8], r0
502 ; CHECK-NEXT: vmov.u8 r0, q1[12]
503 ; CHECK-NEXT: vmov.8 q0[9], r0
504 ; CHECK-NEXT: vmov.u8 r0, q1[1]
505 ; CHECK-NEXT: vmov.8 q0[10], r0
506 ; CHECK-NEXT: vmov.u8 r0, q1[13]
507 ; CHECK-NEXT: vmov.8 q0[11], r0
508 ; CHECK-NEXT: vmov.u8 r0, q1[2]
509 ; CHECK-NEXT: vmov.8 q0[12], r0
510 ; CHECK-NEXT: vmov.u8 r0, q1[8]
511 ; CHECK-NEXT: vmov.8 q0[13], r0
512 ; CHECK-NEXT: vmov.u8 r0, q1[0]
513 ; CHECK-NEXT: vmov.8 q0[14], r0
514 ; CHECK-NEXT: vmov.u8 r0, q1[11]
515 ; CHECK-NEXT: vmov.8 q0[15], r0
518 %out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 15, i32 7, i32 14, i32 9, i32 6, i32 3, i32 10, i32 12, i32 1, i32 13, i32 2, i32 8, i32 0, i32 11>
; Reverses the eight i8 lanes inside each 64-bit half
; (mask <7..0, 15..8>). Expected to be a single vrev64.8 into q1 plus a
; register copy.
522 define arm_aapcs_vfpcc <16 x i8> @shuffle5_i8(<16 x i8> %src) {
523 ; CHECK-LABEL: shuffle5_i8:
524 ; CHECK: @ %bb.0: @ %entry
525 ; CHECK-NEXT: vrev64.8 q1, q0
526 ; CHECK-NEXT: vmov q0, q1
529 %out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
; Reverses the four i8 lanes inside each 32-bit word
; (mask <3, 2, 1, 0, 7, 6, 5, 4, ...>). Expected to be a single in-place
; vrev32.8.
533 define arm_aapcs_vfpcc <16 x i8> @shuffle6_i8(<16 x i8> %src) {
534 ; CHECK-LABEL: shuffle6_i8:
535 ; CHECK: @ %bb.0: @ %entry
536 ; CHECK-NEXT: vrev32.8 q0, q0
539 %out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
; Swaps the two i8 lanes inside each 16-bit halfword
; (mask <1, 0, 3, 2, 5, 4, ...>). Expected to be a single in-place vrev16.8.
543 define arm_aapcs_vfpcc <16 x i8> @shuffle7_i8(<16 x i8> %src) {
544 ; CHECK-LABEL: shuffle7_i8:
545 ; CHECK: @ %bb.0: @ %entry
546 ; CHECK-NEXT: vrev16.8 q0, q0
549 %out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
; "One-off" shuffle: %src1 with lane 2 replaced by %src1's own lane 1.
; Expected as one lane extract into r0 followed by one lane insert.
553 define arm_aapcs_vfpcc <16 x i8> @oneoff11_i8(<16 x i8> %src1, <16 x i8> %src2) {
554 ; CHECK-LABEL: oneoff11_i8:
555 ; CHECK: @ %bb.0: @ %entry
556 ; CHECK-NEXT: vmov.u8 r0, q0[1]
557 ; CHECK-NEXT: vmov.8 q0[2], r0
560 %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32> <i32 0, i32 1, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; "One-off" shuffle: %src1 with lane 0 replaced by lane 4 of %src2 (mask
; index 20). Expected as one lane extract from q1 and one lane insert into q0.
564 define arm_aapcs_vfpcc <16 x i8> @oneoff12_i8(<16 x i8> %src1, <16 x i8> %src2) {
565 ; CHECK-LABEL: oneoff12_i8:
566 ; CHECK: @ %bb.0: @ %entry
567 ; CHECK-NEXT: vmov.u8 r0, q1[4]
568 ; CHECK-NEXT: vmov.8 q0[0], r0
571 %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32> <i32 20, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; "One-off" shuffle: %src2 with lane 3 replaced by lane 0 of %src1.
; Expected as one lane extract from q0, one lane insert into q1, and a copy
; of q1 to q0.
575 define arm_aapcs_vfpcc <16 x i8> @oneoff21_i8(<16 x i8> %src1, <16 x i8> %src2) {
576 ; CHECK-LABEL: oneoff21_i8:
577 ; CHECK: @ %bb.0: @ %entry
578 ; CHECK-NEXT: vmov.u8 r0, q0[0]
579 ; CHECK-NEXT: vmov.8 q1[3], r0
580 ; CHECK-NEXT: vmov q0, q1
583 %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32> <i32 16, i32 17, i32 18, i32 0, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; "One-off" shuffle: %src2 with lane 9 replaced by %src2's own lane 15 (mask
; index 31). Expected as a copy of q1 to q0, then one lane extract and insert.
587 define arm_aapcs_vfpcc <16 x i8> @oneoff22_i8(<16 x i8> %src1, <16 x i8> %src2) {
588 ; CHECK-LABEL: oneoff22_i8:
589 ; CHECK: @ %bb.0: @ %entry
590 ; CHECK-NEXT: vmov q0, q1
591 ; CHECK-NEXT: vmov.u8 r0, q1[15]
592 ; CHECK-NEXT: vmov.8 q0[9], r0
595 %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 31, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; "One-off" shuffle with undef lanes: %src1 with lane 1 replaced by %src1's
; own lane 2, and lanes 0, 4, 8 and 12 left undefined. Expected as one lane
; extract into r0 followed by one lane insert.
599 define arm_aapcs_vfpcc <16 x i8> @oneoffundef_i8(<16 x i8> %src1, <16 x i8> %src2) {
600 ; CHECK-LABEL: oneoffundef_i8:
601 ; CHECK: @ %bb.0: @ %entry
602 ; CHECK-NEXT: vmov.u8 r0, q0[2]
603 ; CHECK-NEXT: vmov.8 q0[1], r0
606 %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32> <i32 undef, i32 2, i32 2, i32 3, i32 undef, i32 5, i32 6, i32 7, i32 undef, i32 9, i32 10, i32 11, i32 undef, i32 13, i32 14, i32 15>
; Stride-2 de-interleave of a <32 x i8>: even-indexed and odd-indexed lanes
; are extracted and added. The expected assembly goes through a 32-byte stack
; area: narrowing vstrb.16 stores write the inputs (one set after a
; vshr.u16 #8), and the two results are reloaded with vldrw.u32 and added.
610 define arm_aapcs_vfpcc <16 x i8> @shuffle2step_i8(<32 x i8> %src) {
611 ; CHECK-LABEL: shuffle2step_i8:
612 ; CHECK: @ %bb.0: @ %entry
613 ; CHECK-NEXT: .pad #32
614 ; CHECK-NEXT: sub sp, #32
615 ; CHECK-NEXT: mov r0, sp
616 ; CHECK-NEXT: vshr.u16 q2, q1, #8
617 ; CHECK-NEXT: vstrb.16 q2, [r0, #8]
618 ; CHECK-NEXT: vshr.u16 q2, q0, #8
619 ; CHECK-NEXT: add r1, sp, #16
620 ; CHECK-NEXT: vstrb.16 q2, [r0]
621 ; CHECK-NEXT: vstrb.16 q1, [r1, #8]
622 ; CHECK-NEXT: vstrb.16 q0, [r1]
623 ; CHECK-NEXT: vldrw.u32 q0, [r0]
624 ; CHECK-NEXT: vldrw.u32 q1, [r1]
625 ; CHECK-NEXT: vadd.i8 q0, q1, q0
626 ; CHECK-NEXT: add sp, #32
629 %s1 = shufflevector <32 x i8> %src, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
630 %s2 = shufflevector <32 x i8> %src, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
631 %r = add <16 x i8> %s1, %s2
; Stride-3 de-interleave: three <16 x i8> vectors are taken from elements
; 0..47 of the <64 x i8> input (starting at offsets 0, 1 and 2) and summed.
; Expected as per-byte lane extracts (via r0) and inserts, with the upper
; halves of each result merged in by 32-bit lane moves; d8-d11 are saved and
; restored around the sequence.
635 define arm_aapcs_vfpcc <16 x i8> @shuffle3step_i8(<64 x i8> %src) {
636 ; CHECK-LABEL: shuffle3step_i8:
637 ; CHECK: @ %bb.0: @ %entry
638 ; CHECK-NEXT: .vsave {d8, d9, d10, d11}
639 ; CHECK-NEXT: vpush {d8, d9, d10, d11}
640 ; CHECK-NEXT: vmov.u8 r0, q0[1]
641 ; CHECK-NEXT: vmov.8 q3[0], r0
642 ; CHECK-NEXT: vmov.u8 r0, q0[4]
643 ; CHECK-NEXT: vmov.8 q3[1], r0
644 ; CHECK-NEXT: vmov.u8 r0, q0[7]
645 ; CHECK-NEXT: vmov.8 q3[2], r0
646 ; CHECK-NEXT: vmov.u8 r0, q0[10]
647 ; CHECK-NEXT: vmov.8 q3[3], r0
648 ; CHECK-NEXT: vmov.u8 r0, q0[13]
649 ; CHECK-NEXT: vmov.8 q3[4], r0
650 ; CHECK-NEXT: vmov.u8 r0, q1[0]
651 ; CHECK-NEXT: vmov.8 q3[5], r0
652 ; CHECK-NEXT: vmov.u8 r0, q1[3]
653 ; CHECK-NEXT: vmov.8 q3[6], r0
654 ; CHECK-NEXT: vmov.u8 r0, q1[9]
655 ; CHECK-NEXT: vmov.8 q4[8], r0
656 ; CHECK-NEXT: vmov.u8 r0, q1[12]
657 ; CHECK-NEXT: vmov.8 q4[9], r0
658 ; CHECK-NEXT: vmov.u8 r0, q1[15]
659 ; CHECK-NEXT: vmov.8 q4[10], r0
660 ; CHECK-NEXT: vmov.u8 r0, q2[2]
661 ; CHECK-NEXT: vmov.8 q4[11], r0
662 ; CHECK-NEXT: vmov.u8 r0, q2[5]
663 ; CHECK-NEXT: vmov.8 q4[12], r0
664 ; CHECK-NEXT: vmov.u8 r0, q2[8]
665 ; CHECK-NEXT: vmov.8 q4[13], r0
666 ; CHECK-NEXT: vmov.u8 r0, q2[11]
667 ; CHECK-NEXT: vmov.8 q4[14], r0
668 ; CHECK-NEXT: vmov.u8 r0, q2[14]
669 ; CHECK-NEXT: vmov.8 q4[15], r0
670 ; CHECK-NEXT: vmov.u8 r0, q1[6]
671 ; CHECK-NEXT: vmov.8 q3[7], r0
672 ; CHECK-NEXT: vmov.u8 r0, q0[0]
673 ; CHECK-NEXT: vmov.f32 s14, s18
674 ; CHECK-NEXT: vmov.f32 s15, s19
675 ; CHECK-NEXT: vmov.8 q4[0], r0
676 ; CHECK-NEXT: vmov.u8 r0, q0[3]
677 ; CHECK-NEXT: vmov.8 q4[1], r0
678 ; CHECK-NEXT: vmov.u8 r0, q0[6]
679 ; CHECK-NEXT: vmov.8 q4[2], r0
680 ; CHECK-NEXT: vmov.u8 r0, q0[9]
681 ; CHECK-NEXT: vmov.8 q4[3], r0
682 ; CHECK-NEXT: vmov.u8 r0, q0[12]
683 ; CHECK-NEXT: vmov.8 q4[4], r0
684 ; CHECK-NEXT: vmov.u8 r0, q0[15]
685 ; CHECK-NEXT: vmov.8 q4[5], r0
686 ; CHECK-NEXT: vmov.u8 r0, q1[2]
687 ; CHECK-NEXT: vmov.8 q4[6], r0
688 ; CHECK-NEXT: vmov.u8 r0, q1[8]
689 ; CHECK-NEXT: vmov.8 q5[8], r0
690 ; CHECK-NEXT: vmov.u8 r0, q1[11]
691 ; CHECK-NEXT: vmov.8 q5[9], r0
692 ; CHECK-NEXT: vmov.u8 r0, q1[14]
693 ; CHECK-NEXT: vmov.8 q5[10], r0
694 ; CHECK-NEXT: vmov.u8 r0, q2[1]
695 ; CHECK-NEXT: vmov.8 q5[11], r0
696 ; CHECK-NEXT: vmov.u8 r0, q2[4]
697 ; CHECK-NEXT: vmov.8 q5[12], r0
698 ; CHECK-NEXT: vmov.u8 r0, q2[7]
699 ; CHECK-NEXT: vmov.8 q5[13], r0
700 ; CHECK-NEXT: vmov.u8 r0, q2[10]
701 ; CHECK-NEXT: vmov.8 q5[14], r0
702 ; CHECK-NEXT: vmov.u8 r0, q2[13]
703 ; CHECK-NEXT: vmov.8 q5[15], r0
704 ; CHECK-NEXT: vmov.u8 r0, q1[5]
705 ; CHECK-NEXT: vmov.8 q4[7], r0
706 ; CHECK-NEXT: vmov.u8 r0, q0[2]
707 ; CHECK-NEXT: vmov.f32 s18, s22
708 ; CHECK-NEXT: vmov.f32 s19, s23
709 ; CHECK-NEXT: vadd.i8 q3, q4, q3
710 ; CHECK-NEXT: vmov.8 q4[0], r0
711 ; CHECK-NEXT: vmov.u8 r0, q0[5]
712 ; CHECK-NEXT: vmov.8 q4[1], r0
713 ; CHECK-NEXT: vmov.u8 r0, q0[8]
714 ; CHECK-NEXT: vmov.8 q4[2], r0
715 ; CHECK-NEXT: vmov.u8 r0, q0[11]
716 ; CHECK-NEXT: vmov.8 q4[3], r0
717 ; CHECK-NEXT: vmov.u8 r0, q0[14]
718 ; CHECK-NEXT: vmov.8 q4[4], r0
719 ; CHECK-NEXT: vmov.u8 r0, q1[1]
720 ; CHECK-NEXT: vmov.8 q4[5], r0
721 ; CHECK-NEXT: vmov.u8 r0, q1[4]
722 ; CHECK-NEXT: vmov.8 q4[6], r0
723 ; CHECK-NEXT: vmov.u8 r0, q1[10]
724 ; CHECK-NEXT: vmov.8 q0[8], r0
725 ; CHECK-NEXT: vmov.u8 r0, q1[13]
726 ; CHECK-NEXT: vmov.8 q0[9], r0
727 ; CHECK-NEXT: vmov.u8 r0, q2[0]
728 ; CHECK-NEXT: vmov.8 q0[10], r0
729 ; CHECK-NEXT: vmov.u8 r0, q2[3]
730 ; CHECK-NEXT: vmov.8 q0[11], r0
731 ; CHECK-NEXT: vmov.u8 r0, q2[6]
732 ; CHECK-NEXT: vmov.8 q0[12], r0
733 ; CHECK-NEXT: vmov.u8 r0, q2[9]
734 ; CHECK-NEXT: vmov.8 q0[13], r0
735 ; CHECK-NEXT: vmov.u8 r0, q2[12]
736 ; CHECK-NEXT: vmov.8 q0[14], r0
737 ; CHECK-NEXT: vmov.u8 r0, q2[15]
738 ; CHECK-NEXT: vmov.8 q0[15], r0
739 ; CHECK-NEXT: vmov.u8 r0, q1[7]
740 ; CHECK-NEXT: vmov.8 q4[7], r0
741 ; CHECK-NEXT: vmov.f32 s18, s2
742 ; CHECK-NEXT: vmov.f32 s19, s3
743 ; CHECK-NEXT: vadd.i8 q0, q3, q4
744 ; CHECK-NEXT: vpop {d8, d9, d10, d11}
747 %s1 = shufflevector <64 x i8> %src, <64 x i8> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
748 %s2 = shufflevector <64 x i8> %src, <64 x i8> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
749 %s3 = shufflevector <64 x i8> %src, <64 x i8> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
750 %a = add <16 x i8> %s1, %s2
751 %r = add <16 x i8> %a, %s3
; Stride-4 de-interleave: four <16 x i8> vectors are taken from the
; <64 x i8> input (starting at offsets 0, 1, 2 and 3), added pairwise and
; then summed. Expected as per-byte lane extracts (via r0) and inserts, with
; the upper halves of each result merged in by 32-bit lane moves; d8-d13 are
; saved and restored around the sequence.
755 define arm_aapcs_vfpcc <16 x i8> @shuffle4step_i8(<64 x i8> %src) {
756 ; CHECK-LABEL: shuffle4step_i8:
757 ; CHECK: @ %bb.0: @ %entry
758 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
759 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
760 ; CHECK-NEXT: vmov.u8 r0, q0[3]
761 ; CHECK-NEXT: vmov.8 q4[0], r0
762 ; CHECK-NEXT: vmov.u8 r0, q0[7]
763 ; CHECK-NEXT: vmov.8 q4[1], r0
764 ; CHECK-NEXT: vmov.u8 r0, q0[11]
765 ; CHECK-NEXT: vmov.8 q4[2], r0
766 ; CHECK-NEXT: vmov.u8 r0, q0[15]
767 ; CHECK-NEXT: vmov.8 q4[3], r0
768 ; CHECK-NEXT: vmov.u8 r0, q1[3]
769 ; CHECK-NEXT: vmov.8 q4[4], r0
770 ; CHECK-NEXT: vmov.u8 r0, q1[7]
771 ; CHECK-NEXT: vmov.8 q4[5], r0
772 ; CHECK-NEXT: vmov.u8 r0, q1[11]
773 ; CHECK-NEXT: vmov.8 q4[6], r0
774 ; CHECK-NEXT: vmov.u8 r0, q2[3]
775 ; CHECK-NEXT: vmov.8 q5[8], r0
776 ; CHECK-NEXT: vmov.u8 r0, q2[7]
777 ; CHECK-NEXT: vmov.8 q5[9], r0
778 ; CHECK-NEXT: vmov.u8 r0, q2[11]
779 ; CHECK-NEXT: vmov.8 q5[10], r0
780 ; CHECK-NEXT: vmov.u8 r0, q2[15]
781 ; CHECK-NEXT: vmov.8 q5[11], r0
782 ; CHECK-NEXT: vmov.u8 r0, q3[3]
783 ; CHECK-NEXT: vmov.8 q5[12], r0
784 ; CHECK-NEXT: vmov.u8 r0, q3[7]
785 ; CHECK-NEXT: vmov.8 q5[13], r0
786 ; CHECK-NEXT: vmov.u8 r0, q3[11]
787 ; CHECK-NEXT: vmov.8 q5[14], r0
788 ; CHECK-NEXT: vmov.u8 r0, q3[15]
789 ; CHECK-NEXT: vmov.8 q5[15], r0
790 ; CHECK-NEXT: vmov.u8 r0, q1[15]
791 ; CHECK-NEXT: vmov.8 q4[7], r0
792 ; CHECK-NEXT: vmov.u8 r0, q0[2]
793 ; CHECK-NEXT: vmov.f32 s18, s22
794 ; CHECK-NEXT: vmov.f32 s19, s23
795 ; CHECK-NEXT: vmov.8 q5[0], r0
796 ; CHECK-NEXT: vmov.u8 r0, q0[6]
797 ; CHECK-NEXT: vmov.8 q5[1], r0
798 ; CHECK-NEXT: vmov.u8 r0, q0[10]
799 ; CHECK-NEXT: vmov.8 q5[2], r0
800 ; CHECK-NEXT: vmov.u8 r0, q0[14]
801 ; CHECK-NEXT: vmov.8 q5[3], r0
802 ; CHECK-NEXT: vmov.u8 r0, q1[2]
803 ; CHECK-NEXT: vmov.8 q5[4], r0
804 ; CHECK-NEXT: vmov.u8 r0, q1[6]
805 ; CHECK-NEXT: vmov.8 q5[5], r0
806 ; CHECK-NEXT: vmov.u8 r0, q1[10]
807 ; CHECK-NEXT: vmov.8 q5[6], r0
808 ; CHECK-NEXT: vmov.u8 r0, q2[2]
809 ; CHECK-NEXT: vmov.8 q6[8], r0
810 ; CHECK-NEXT: vmov.u8 r0, q2[6]
811 ; CHECK-NEXT: vmov.8 q6[9], r0
812 ; CHECK-NEXT: vmov.u8 r0, q2[10]
813 ; CHECK-NEXT: vmov.8 q6[10], r0
814 ; CHECK-NEXT: vmov.u8 r0, q2[14]
815 ; CHECK-NEXT: vmov.8 q6[11], r0
816 ; CHECK-NEXT: vmov.u8 r0, q3[2]
817 ; CHECK-NEXT: vmov.8 q6[12], r0
818 ; CHECK-NEXT: vmov.u8 r0, q3[6]
819 ; CHECK-NEXT: vmov.8 q6[13], r0
820 ; CHECK-NEXT: vmov.u8 r0, q3[10]
821 ; CHECK-NEXT: vmov.8 q6[14], r0
822 ; CHECK-NEXT: vmov.u8 r0, q3[14]
823 ; CHECK-NEXT: vmov.8 q6[15], r0
824 ; CHECK-NEXT: vmov.u8 r0, q1[14]
825 ; CHECK-NEXT: vmov.8 q5[7], r0
826 ; CHECK-NEXT: vmov.u8 r0, q0[1]
827 ; CHECK-NEXT: vmov.f32 s22, s26
828 ; CHECK-NEXT: vmov.f32 s23, s27
829 ; CHECK-NEXT: vadd.i8 q4, q5, q4
830 ; CHECK-NEXT: vmov.8 q5[0], r0
831 ; CHECK-NEXT: vmov.u8 r0, q0[5]
832 ; CHECK-NEXT: vmov.8 q5[1], r0
833 ; CHECK-NEXT: vmov.u8 r0, q0[9]
834 ; CHECK-NEXT: vmov.8 q5[2], r0
835 ; CHECK-NEXT: vmov.u8 r0, q0[13]
836 ; CHECK-NEXT: vmov.8 q5[3], r0
837 ; CHECK-NEXT: vmov.u8 r0, q1[1]
838 ; CHECK-NEXT: vmov.8 q5[4], r0
839 ; CHECK-NEXT: vmov.u8 r0, q1[5]
840 ; CHECK-NEXT: vmov.8 q5[5], r0
841 ; CHECK-NEXT: vmov.u8 r0, q1[9]
842 ; CHECK-NEXT: vmov.8 q5[6], r0
843 ; CHECK-NEXT: vmov.u8 r0, q2[1]
844 ; CHECK-NEXT: vmov.8 q6[8], r0
845 ; CHECK-NEXT: vmov.u8 r0, q2[5]
846 ; CHECK-NEXT: vmov.8 q6[9], r0
847 ; CHECK-NEXT: vmov.u8 r0, q2[9]
848 ; CHECK-NEXT: vmov.8 q6[10], r0
849 ; CHECK-NEXT: vmov.u8 r0, q2[13]
850 ; CHECK-NEXT: vmov.8 q6[11], r0
851 ; CHECK-NEXT: vmov.u8 r0, q3[1]
852 ; CHECK-NEXT: vmov.8 q6[12], r0
853 ; CHECK-NEXT: vmov.u8 r0, q3[5]
854 ; CHECK-NEXT: vmov.8 q6[13], r0
855 ; CHECK-NEXT: vmov.u8 r0, q3[9]
856 ; CHECK-NEXT: vmov.8 q6[14], r0
857 ; CHECK-NEXT: vmov.u8 r0, q3[13]
858 ; CHECK-NEXT: vmov.8 q6[15], r0
859 ; CHECK-NEXT: vmov.u8 r0, q1[13]
860 ; CHECK-NEXT: vmov.8 q5[7], r0
861 ; CHECK-NEXT: vmov.u8 r0, q0[0]
862 ; CHECK-NEXT: vmov.f32 s22, s26
863 ; CHECK-NEXT: vmov.f32 s23, s27
864 ; CHECK-NEXT: vmov.8 q6[0], r0
865 ; CHECK-NEXT: vmov.u8 r0, q0[4]
866 ; CHECK-NEXT: vmov.8 q6[1], r0
867 ; CHECK-NEXT: vmov.u8 r0, q0[8]
868 ; CHECK-NEXT: vmov.8 q6[2], r0
869 ; CHECK-NEXT: vmov.u8 r0, q0[12]
870 ; CHECK-NEXT: vmov.8 q6[3], r0
871 ; CHECK-NEXT: vmov.u8 r0, q1[0]
872 ; CHECK-NEXT: vmov.8 q6[4], r0
873 ; CHECK-NEXT: vmov.u8 r0, q1[4]
874 ; CHECK-NEXT: vmov.8 q6[5], r0
875 ; CHECK-NEXT: vmov.u8 r0, q1[8]
876 ; CHECK-NEXT: vmov.8 q6[6], r0
877 ; CHECK-NEXT: vmov.u8 r0, q2[0]
878 ; CHECK-NEXT: vmov.8 q0[8], r0
879 ; CHECK-NEXT: vmov.u8 r0, q2[4]
880 ; CHECK-NEXT: vmov.8 q0[9], r0
881 ; CHECK-NEXT: vmov.u8 r0, q2[8]
882 ; CHECK-NEXT: vmov.8 q0[10], r0
883 ; CHECK-NEXT: vmov.u8 r0, q2[12]
884 ; CHECK-NEXT: vmov.8 q0[11], r0
885 ; CHECK-NEXT: vmov.u8 r0, q3[0]
886 ; CHECK-NEXT: vmov.8 q0[12], r0
887 ; CHECK-NEXT: vmov.u8 r0, q3[4]
888 ; CHECK-NEXT: vmov.8 q0[13], r0
889 ; CHECK-NEXT: vmov.u8 r0, q3[8]
890 ; CHECK-NEXT: vmov.8 q0[14], r0
891 ; CHECK-NEXT: vmov.u8 r0, q3[12]
892 ; CHECK-NEXT: vmov.8 q0[15], r0
893 ; CHECK-NEXT: vmov.u8 r0, q1[12]
894 ; CHECK-NEXT: vmov.8 q6[7], r0
895 ; CHECK-NEXT: vmov.f32 s26, s2
896 ; CHECK-NEXT: vmov.f32 s27, s3
897 ; CHECK-NEXT: vadd.i8 q0, q6, q5
898 ; CHECK-NEXT: vadd.i8 q0, q0, q4
899 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
902 %s1 = shufflevector <64 x i8> %src, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
903 %s2 = shufflevector <64 x i8> %src, <64 x i8> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
904 %s3 = shufflevector <64 x i8> %src, <64 x i8> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
905 %s4 = shufflevector <64 x i8> %src, <64 x i8> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
906 %a1 = add <16 x i8> %s1, %s2
907 %a2 = add <16 x i8> %s3, %s4
908 %r = add <16 x i8> %a1, %a2
; Identity shuffle of a <2 x i64> (mask <0, 1>). The expectations shown here
; contain no data-movement instructions.
914 define arm_aapcs_vfpcc <2 x i64> @shuffle1_i64(<2 x i64> %src) {
915 ; CHECK-LABEL: shuffle1_i64:
916 ; CHECK: @ %bb.0: @ %entry
919 %out = shufflevector <2 x i64> %src, <2 x i64> undef, <2 x i32> <i32 0, i32 1>
; Swaps the two i64 lanes of %src (mask <1, 0>). The expected code builds the
; swapped value in q1 with four vmov.f32 moves and then copies q1 into q0.
923 define arm_aapcs_vfpcc <2 x i64> @shuffle2_i64(<2 x i64> %src) {
924 ; CHECK-LABEL: shuffle2_i64:
925 ; CHECK: @ %bb.0: @ %entry
926 ; CHECK-NEXT: vmov.f32 s4, s2
927 ; CHECK-NEXT: vmov.f32 s6, s0
928 ; CHECK-NEXT: vmov.f32 s5, s3
929 ; CHECK-NEXT: vmov.f32 s7, s1
930 ; CHECK-NEXT: vmov q0, q1
933 %out = shufflevector <2 x i64> %src, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
; Shuffle of a <2 x i64> with mask <undef, 1>: lane 0 is undefined and lane 1
; stays in place. The expectations shown here contain no data-movement
; instructions.
937 define arm_aapcs_vfpcc <2 x i64> @shuffle3_i64(<2 x i64> %src) {
938 ; CHECK-LABEL: shuffle3_i64:
939 ; CHECK: @ %bb.0: @ %entry
942 %out = shufflevector <2 x i64> %src, <2 x i64> undef, <2 x i32> <i32 undef, i32 1>
; Full lane reversal of a <4 x float> (mask <3, 2, 1, 0>). The expected code
; moves each lane individually into q1 and then copies q1 into q0.
948 define arm_aapcs_vfpcc <4 x float> @shuffle1_f32(<4 x float> %src) {
949 ; CHECK-LABEL: shuffle1_f32:
950 ; CHECK: @ %bb.0: @ %entry
951 ; CHECK-NEXT: vmov.f32 s4, s3
952 ; CHECK-NEXT: vmov.f32 s5, s2
953 ; CHECK-NEXT: vmov.f32 s6, s1
954 ; CHECK-NEXT: vmov.f32 s7, s0
955 ; CHECK-NEXT: vmov q0, q1
958 %out = shufflevector <4 x float> %src, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; Identity shuffle of a <4 x float> (mask <0, 1, 2, 3>). The expectations
; shown here contain no data-movement instructions.
962 define arm_aapcs_vfpcc <4 x float> @shuffle2_f32(<4 x float> %src) {
963 ; CHECK-LABEL: shuffle2_f32:
964 ; CHECK: @ %bb.0: @ %entry
967 %out = shufflevector <4 x float> %src, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; Swaps the outer lanes of a <4 x float> and keeps the inner two in place
; (mask <3, 1, 2, 0>). Expected as four per-lane moves into q1 plus a copy
; back to q0.
971 define arm_aapcs_vfpcc <4 x float> @shuffle3_f32(<4 x float> %src) {
972 ; CHECK-LABEL: shuffle3_f32:
973 ; CHECK: @ %bb.0: @ %entry
974 ; CHECK-NEXT: vmov.f32 s4, s3
975 ; CHECK-NEXT: vmov.f32 s5, s1
976 ; CHECK-NEXT: vmov.f32 s6, s2
977 ; CHECK-NEXT: vmov.f32 s7, s0
978 ; CHECK-NEXT: vmov q0, q1
981 %out = shufflevector <4 x float> %src, <4 x float> undef, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
; Swaps adjacent pairs of float lanes (mask <1, 0, 3, 2>). The expectations
; require a single vrev64.32 into q1 followed by a copy back to q0.
985 define arm_aapcs_vfpcc <4 x float> @shuffle5_f32(<4 x float> %src) {
986 ; CHECK-LABEL: shuffle5_f32:
987 ; CHECK: @ %bb.0: @ %entry
988 ; CHECK-NEXT: vrev64.32 q1, q0
989 ; CHECK-NEXT: vmov q0, q1
992 %out = shufflevector <4 x float> %src, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
; Mask <0, 1, 1, 3>: every lane comes from %src1 and only lane 2 differs from
; the identity (it duplicates lane 1). Expected as one scalar move, s2 <- s1.
996 define arm_aapcs_vfpcc <4 x float> @oneoff11_f32(<4 x float> %src1, <4 x float> %src2) {
997 ; CHECK-LABEL: oneoff11_f32:
998 ; CHECK: @ %bb.0: @ %entry
999 ; CHECK-NEXT: vmov.f32 s2, s1
1002 %out = shufflevector <4 x float> %src1, <4 x float> %src2, <4 x i32> <i32 0, i32 1, i32 1, i32 3>
1003 ret <4 x float> %out
; Mask <4, 1, 2, 3>: lanes 1-3 come from %src1 unchanged and lane 0 is taken
; from lane 0 of %src2. Expected as one scalar move, s0 <- s4.
1006 define arm_aapcs_vfpcc <4 x float> @oneoff12_f32(<4 x float> %src1, <4 x float> %src2) {
1007 ; CHECK-LABEL: oneoff12_f32:
1008 ; CHECK: @ %bb.0: @ %entry
1009 ; CHECK-NEXT: vmov.f32 s0, s4
1012 %out = shufflevector <4 x float> %src1, <4 x float> %src2, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
1013 ret <4 x float> %out
; Mask <4, 5, 6, 0>: lanes 0-2 come from %src2 unchanged and lane 3 is taken
; from lane 0 of %src1. Expected as one scalar move into q1 (s7 <- s0)
; followed by a copy of q1 into q0.
1016 define arm_aapcs_vfpcc <4 x float> @oneoff21_f32(<4 x float> %src1, <4 x float> %src2) {
1017 ; CHECK-LABEL: oneoff21_f32:
1018 ; CHECK: @ %bb.0: @ %entry
1019 ; CHECK-NEXT: vmov.f32 s7, s0
1020 ; CHECK-NEXT: vmov q0, q1
1023 %out = shufflevector <4 x float> %src1, <4 x float> %src2, <4 x i32> <i32 4, i32 5, i32 6, i32 0>
1024 ret <4 x float> %out
; Mask <4, 5, 4, 7>: every lane comes from %src2 and only lane 2 differs from
; the identity (it duplicates lane 0 of %src2). Expected as a copy of q1 into
; q0 followed by one scalar move, s2 <- s0.
1027 define arm_aapcs_vfpcc <4 x float> @oneoff22_f32(<4 x float> %src1, <4 x float> %src2) {
1028 ; CHECK-LABEL: oneoff22_f32:
1029 ; CHECK: @ %bb.0: @ %entry
1030 ; CHECK-NEXT: vmov q0, q1
1031 ; CHECK-NEXT: vmov.f32 s2, s0
1034 %out = shufflevector <4 x float> %src1, <4 x float> %src2, <4 x i32> <i32 4, i32 5, i32 4, i32 7>
1035 ret <4 x float> %out
; Stride-2 de-interleave of an <8 x float>: %s1 takes the even lanes, %s2 the
; odd lanes, and the two halves are added. Only the CHECKFP prefix is used
; here, i.e. only the run lines with -mattr=+mve.fp verify this function.
1038 define arm_aapcs_vfpcc <4 x float> @shuffle2step_f32(<8 x float> %src) {
1039 ; CHECKFP-LABEL: shuffle2step_f32:
1040 ; CHECKFP: @ %bb.0: @ %entry
1041 ; CHECKFP-NEXT: vmov.f32 s8, s1
1042 ; CHECKFP-NEXT: vmov.f32 s9, s3
1043 ; CHECKFP-NEXT: vmov.f32 s1, s2
1044 ; CHECKFP-NEXT: vmov.f32 s10, s5
1045 ; CHECKFP-NEXT: vmov.f32 s11, s7
1046 ; CHECKFP-NEXT: vmov.f32 s2, s4
1047 ; CHECKFP-NEXT: vmov.f32 s3, s6
1048 ; CHECKFP-NEXT: vadd.f32 q0, q0, q2
1049 ; CHECKFP-NEXT: bx lr
1051 %s1 = shufflevector <8 x float> %src, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1052 %s2 = shufflevector <8 x float> %src, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1053 %r = fadd <4 x float> %s1, %s2
; Stride-3 de-interleave: three shuffles pick lanes {0,3,6,9}, {1,4,7,10} and
; {2,5,8,11} of a <16 x float> (lanes 12-15 are never referenced) and the
; three results are summed. Only the CHECKFP prefix is used here, i.e. only
; the run lines with -mattr=+mve.fp verify this function.
1057 define arm_aapcs_vfpcc <4 x float> @shuffle3step_f32(<16 x float> %src) {
1058 ; CHECKFP-LABEL: shuffle3step_f32:
1059 ; CHECKFP: @ %bb.0: @ %entry
1060 ; CHECKFP-NEXT: .vsave {d8, d9}
1061 ; CHECKFP-NEXT: vpush {d8, d9}
1062 ; CHECKFP-NEXT: vmov.f32 s13, s4
1063 ; CHECKFP-NEXT: vmov.f32 s14, s7
1064 ; CHECKFP-NEXT: vmov.f32 s18, s6
1065 ; CHECKFP-NEXT: vmov.f32 s12, s1
1066 ; CHECKFP-NEXT: vmov.f32 s15, s10
1067 ; CHECKFP-NEXT: vmov.f32 s16, s0
1068 ; CHECKFP-NEXT: vmov.f32 s17, s3
1069 ; CHECKFP-NEXT: vmov.f32 s19, s9
1070 ; CHECKFP-NEXT: vadd.f32 q3, q4, q3
1071 ; CHECKFP-NEXT: vmov.f32 s4, s2
1072 ; CHECKFP-NEXT: vmov.f32 s6, s8
1073 ; CHECKFP-NEXT: vmov.f32 s7, s11
1074 ; CHECKFP-NEXT: vadd.f32 q0, q3, q1
1075 ; CHECKFP-NEXT: vpop {d8, d9}
1076 ; CHECKFP-NEXT: bx lr
1078 %s1 = shufflevector <16 x float> %src, <16 x float> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
1079 %s2 = shufflevector <16 x float> %src, <16 x float> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
1080 %s3 = shufflevector <16 x float> %src, <16 x float> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
1081 %a = fadd <4 x float> %s1, %s2
1082 %r = fadd <4 x float> %a, %s3
; Stride-4 de-interleave of a <16 x float>: %s1..%s4 take every fourth lane
; starting at offsets 0, 1, 2 and 3; the results are summed pairwise and then
; together. Only the CHECKFP prefix is used here, i.e. only the run lines
; with -mattr=+mve.fp verify this function.
1086 define arm_aapcs_vfpcc <4 x float> @shuffle4step_f32(<16 x float> %src) {
1087 ; CHECKFP-LABEL: shuffle4step_f32:
1088 ; CHECKFP: @ %bb.0: @ %entry
1089 ; CHECKFP-NEXT: .vsave {d8, d9, d10, d11}
1090 ; CHECKFP-NEXT: vpush {d8, d9, d10, d11}
1091 ; CHECKFP-NEXT: vmov.f32 s16, s3
1092 ; CHECKFP-NEXT: vmov.f32 s20, s2
1093 ; CHECKFP-NEXT: vmov.f32 s17, s7
1094 ; CHECKFP-NEXT: vmov.f32 s18, s11
1095 ; CHECKFP-NEXT: vmov.f32 s19, s15
1096 ; CHECKFP-NEXT: vmov.f32 s21, s6
1097 ; CHECKFP-NEXT: vmov.f32 s22, s10
1098 ; CHECKFP-NEXT: vmov.f32 s23, s14
1099 ; CHECKFP-NEXT: vadd.f32 q4, q5, q4
1100 ; CHECKFP-NEXT: vmov.f32 s20, s1
1101 ; CHECKFP-NEXT: vmov.f32 s21, s5
1102 ; CHECKFP-NEXT: vmov.f32 s22, s9
1103 ; CHECKFP-NEXT: vmov.f32 s23, s13
1104 ; CHECKFP-NEXT: vmov.f32 s1, s4
1105 ; CHECKFP-NEXT: vmov.f32 s2, s8
1106 ; CHECKFP-NEXT: vmov.f32 s3, s12
1107 ; CHECKFP-NEXT: vadd.f32 q0, q0, q5
1108 ; CHECKFP-NEXT: vadd.f32 q0, q0, q4
1109 ; CHECKFP-NEXT: vpop {d8, d9, d10, d11}
1110 ; CHECKFP-NEXT: bx lr
1112 %s1 = shufflevector <16 x float> %src, <16 x float> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
1113 %s2 = shufflevector <16 x float> %src, <16 x float> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
1114 %s3 = shufflevector <16 x float> %src, <16 x float> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
1115 %s4 = shufflevector <16 x float> %src, <16 x float> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
1116 %a1 = fadd <4 x float> %s1, %s2
1117 %a2 = fadd <4 x float> %s3, %s4
1118 %r = fadd <4 x float> %a1, %a2
; Full lane reversal of an <8 x half> (mask <7, 6, ..., 0>). The expectations
; require a vrev64.16 into q1 followed by four 32-bit moves that swap the two
; 64-bit halves while writing back to q0.
1124 define arm_aapcs_vfpcc <8 x half> @shuffle1_f16(<8 x half> %src) {
1125 ; CHECK-LABEL: shuffle1_f16:
1126 ; CHECK: @ %bb.0: @ %entry
1127 ; CHECK-NEXT: vrev64.16 q1, q0
1128 ; CHECK-NEXT: vmov.f32 s0, s6
1129 ; CHECK-NEXT: vmov.f32 s1, s7
1130 ; CHECK-NEXT: vmov.f32 s2, s4
1131 ; CHECK-NEXT: vmov.f32 s3, s5
1134 %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; Identity shuffle of an <8 x half> (mask <0, 1, ..., 7>). The expectations
; shown here contain no data-movement instructions.
1138 define arm_aapcs_vfpcc <8 x half> @shuffle2_f16(<8 x half> %src) {
1139 ; CHECK-LABEL: shuffle2_f16:
1140 ; CHECK: @ %bb.0: @ %entry
1143 %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Irregular permutation of an <8 x half> (mask <4, 5, 7, 6, 3, 1, 2, 0>). The
; expectations use a mix of vmovx.f16 / vins.f16 half-lane operations and
; 32-bit vmov.f32 moves rather than a single vector instruction.
1147 define arm_aapcs_vfpcc <8 x half> @shuffle3_f16(<8 x half> %src) {
1148 ; CHECK-LABEL: shuffle3_f16:
1149 ; CHECK: @ %bb.0: @ %entry
1150 ; CHECK-NEXT: vmov q1, q0
1151 ; CHECK-NEXT: vmovx.f16 s2, s5
1152 ; CHECK-NEXT: vmovx.f16 s0, s4
1153 ; CHECK-NEXT: vins.f16 s5, s4
1154 ; CHECK-NEXT: vins.f16 s2, s0
1155 ; CHECK-NEXT: vmov.f32 s3, s5
1156 ; CHECK-NEXT: vmovx.f16 s1, s7
1157 ; CHECK-NEXT: vmov.f32 s0, s6
1158 ; CHECK-NEXT: vins.f16 s1, s7
1161 %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> <i32 4, i32 5, i32 7, i32 6, i32 3, i32 1, i32 2, i32 0>
; Reverses the half lanes within each group of four (mask <3, 2, 1, 0, 7, 6,
; 5, 4>). The expectations require a single vrev64.16 into q1 followed by a
; copy back to q0.
1165 define arm_aapcs_vfpcc <8 x half> @shuffle5_f16(<8 x half> %src) {
1166 ; CHECK-LABEL: shuffle5_f16:
1167 ; CHECK: @ %bb.0: @ %entry
1168 ; CHECK-NEXT: vrev64.16 q1, q0
1169 ; CHECK-NEXT: vmov q0, q1
1172 %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
; Swaps adjacent pairs of half lanes (mask <1, 0, 3, 2, 5, 4, 7, 6>). The
; expectations require a single in-place vrev32.16 on q0.
1176 define arm_aapcs_vfpcc <8 x half> @shuffle6_f16(<8 x half> %src) {
1177 ; CHECK-LABEL: shuffle6_f16:
1178 ; CHECK: @ %bb.0: @ %entry
1179 ; CHECK-NEXT: vrev32.16 q0, q0
1182 %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
; Mask <0, 1, 1, 3, 4, 5, 6, 7>: every lane comes from %src1 and only lane 2
; differs from the identity (it duplicates lane 1). The expectations extract
; the source half with vmovx.f16, move it through r0, and insert it into
; q0[2] with vmov.16.
1186 define arm_aapcs_vfpcc <8 x half> @oneoff11_f16(<8 x half> %src1, <8 x half> %src2) {
1187 ; CHECK-LABEL: oneoff11_f16:
1188 ; CHECK: @ %bb.0: @ %entry
1189 ; CHECK-NEXT: vmovx.f16 s4, s0
1190 ; CHECK-NEXT: vmov r0, s4
1191 ; CHECK-NEXT: vmov.16 q0[2], r0
1194 %out = shufflevector <8 x half> %src1, <8 x half> %src2, <8 x i32> <i32 0, i32 1, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7>
; Mask <8, 1, 2, ..., 7>: lanes 1-7 come from %src1 unchanged and lane 0 is
; taken from lane 0 of %src2. The expectations move s4 through r0 and insert
; it into q0[0] with vmov.16.
1198 define arm_aapcs_vfpcc <8 x half> @oneoff12_f16(<8 x half> %src1, <8 x half> %src2) {
1199 ; CHECK-LABEL: oneoff12_f16:
1200 ; CHECK: @ %bb.0: @ %entry
1201 ; CHECK-NEXT: vmov r0, s4
1202 ; CHECK-NEXT: vmov.16 q0[0], r0
1205 %out = shufflevector <8 x half> %src1, <8 x half> %src2, <8 x i32> <i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Mask <8, 9, 10, 0, 12, 13, 14, 15>: all lanes come from %src2 except lane 3,
; which is taken from lane 0 of %src1. The expectations require a single
; vins.f16 into s5 followed by a copy of q1 into q0.
1209 define arm_aapcs_vfpcc <8 x half> @oneoff21_f16(<8 x half> %src1, <8 x half> %src2) {
1210 ; CHECK-LABEL: oneoff21_f16:
1211 ; CHECK: @ %bb.0: @ %entry
1212 ; CHECK-NEXT: vins.f16 s5, s0
1213 ; CHECK-NEXT: vmov q0, q1
1216 %out = shufflevector <8 x half> %src1, <8 x half> %src2, <8 x i32> <i32 8, i32 9, i32 10, i32 0, i32 12, i32 13, i32 14, i32 15>
; Mask <14, 9, 10, ..., 15>: every lane comes from %src2 and only lane 0
; differs from the identity (it duplicates lane 6 of %src2). The expectations
; copy q1 into q0, move s3 through r0, and insert it into q0[0] with vmov.16.
1220 define arm_aapcs_vfpcc <8 x half> @oneoff22_f16(<8 x half> %src1, <8 x half> %src2) {
1221 ; CHECK-LABEL: oneoff22_f16:
1222 ; CHECK: @ %bb.0: @ %entry
1223 ; CHECK-NEXT: vmov q0, q1
1224 ; CHECK-NEXT: vmov r0, s3
1225 ; CHECK-NEXT: vmov.16 q0[0], r0
1228 %out = shufflevector <8 x half> %src1, <8 x half> %src2, <8 x i32> <i32 14, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; Stride-2 de-interleave of a <16 x half>: %s1 takes the even lanes, %s2 the
; odd lanes, and the two halves are added. Only the CHECKFP prefix is used
; here, i.e. only the run lines with -mattr=+mve.fp verify this function.
1232 define arm_aapcs_vfpcc <8 x half> @shuffle2step_f16(<16 x half> %src) {
1233 ; CHECKFP-LABEL: shuffle2step_f16:
1234 ; CHECKFP: @ %bb.0: @ %entry
1235 ; CHECKFP-NEXT: vmovx.f16 s8, s0
1236 ; CHECKFP-NEXT: vmovx.f16 s10, s1
1237 ; CHECKFP-NEXT: vins.f16 s8, s10
1238 ; CHECKFP-NEXT: vmovx.f16 s9, s2
1239 ; CHECKFP-NEXT: vmovx.f16 s10, s3
1240 ; CHECKFP-NEXT: vmovx.f16 s12, s5
1241 ; CHECKFP-NEXT: vins.f16 s9, s10
1242 ; CHECKFP-NEXT: vmovx.f16 s10, s4
1243 ; CHECKFP-NEXT: vins.f16 s10, s12
1244 ; CHECKFP-NEXT: vmovx.f16 s11, s6
1245 ; CHECKFP-NEXT: vmovx.f16 s12, s7
1246 ; CHECKFP-NEXT: vins.f16 s2, s3
1247 ; CHECKFP-NEXT: vins.f16 s6, s7
1248 ; CHECKFP-NEXT: vins.f16 s4, s5
1249 ; CHECKFP-NEXT: vins.f16 s0, s1
1250 ; CHECKFP-NEXT: vmov.f32 s1, s2
1251 ; CHECKFP-NEXT: vins.f16 s11, s12
1252 ; CHECKFP-NEXT: vmov.f32 s2, s4
1253 ; CHECKFP-NEXT: vmov.f32 s3, s6
1254 ; CHECKFP-NEXT: vadd.f16 q0, q0, q2
1255 ; CHECKFP-NEXT: bx lr
1257 %s1 = shufflevector <16 x half> %src, <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
1258 %s2 = shufflevector <16 x half> %src, <16 x half> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
1259 %r = fadd <8 x half> %s1, %s2
; Stride-3 de-interleave: three shuffles pick every third lane of a
; <32 x half> starting at offsets 0, 1 and 2 (lanes 24-31 are never
; referenced) and the three results are summed. Only the CHECKFP prefix is
; used here, i.e. only the run lines with -mattr=+mve.fp verify this function.
1263 define arm_aapcs_vfpcc <8 x half> @shuffle3step_f16(<32 x half> %src) {
1264 ; CHECKFP-LABEL: shuffle3step_f16:
1265 ; CHECKFP: @ %bb.0: @ %entry
1266 ; CHECKFP-NEXT: .vsave {d8, d9}
1267 ; CHECKFP-NEXT: vpush {d8, d9}
1268 ; CHECKFP-NEXT: vmov.f32 s13, s4
1269 ; CHECKFP-NEXT: vmovx.f16 s4, s4
1270 ; CHECKFP-NEXT: vmovx.f16 s17, s3
1271 ; CHECKFP-NEXT: vins.f16 s3, s4
1272 ; CHECKFP-NEXT: vmovx.f16 s4, s7
1273 ; CHECKFP-NEXT: vmovx.f16 s18, s6
1274 ; CHECKFP-NEXT: vmovx.f16 s16, s0
1275 ; CHECKFP-NEXT: vins.f16 s6, s4
1276 ; CHECKFP-NEXT: vmovx.f16 s14, s2
1277 ; CHECKFP-NEXT: vmov.f32 s12, s1
1278 ; CHECKFP-NEXT: vmovx.f16 s4, s10
1279 ; CHECKFP-NEXT: vmovx.f16 s19, s9
1280 ; CHECKFP-NEXT: vins.f16 s12, s14
1281 ; CHECKFP-NEXT: vmovx.f16 s14, s5
1282 ; CHECKFP-NEXT: vins.f16 s16, s2
1283 ; CHECKFP-NEXT: vmovx.f16 s2, s11
1284 ; CHECKFP-NEXT: vmovx.f16 s15, s8
1285 ; CHECKFP-NEXT: vins.f16 s18, s8
1286 ; CHECKFP-NEXT: vmovx.f16 s8, s1
1287 ; CHECKFP-NEXT: vins.f16 s9, s4
1288 ; CHECKFP-NEXT: vins.f16 s13, s14
1289 ; CHECKFP-NEXT: vmov.f32 s14, s7
1290 ; CHECKFP-NEXT: vins.f16 s10, s2
1291 ; CHECKFP-NEXT: vmov.f32 s1, s3
1292 ; CHECKFP-NEXT: vins.f16 s19, s11
1293 ; CHECKFP-NEXT: vins.f16 s17, s5
1294 ; CHECKFP-NEXT: vins.f16 s0, s8
1295 ; CHECKFP-NEXT: vmov.f32 s2, s6
1296 ; CHECKFP-NEXT: vmov.f32 s3, s9
1297 ; CHECKFP-NEXT: vins.f16 s14, s15
1298 ; CHECKFP-NEXT: vmov.f32 s15, s10
1299 ; CHECKFP-NEXT: vadd.f16 q0, q0, q4
1300 ; CHECKFP-NEXT: vadd.f16 q0, q0, q3
1301 ; CHECKFP-NEXT: vpop {d8, d9}
1302 ; CHECKFP-NEXT: bx lr
1304 %s1 = shufflevector <32 x half> %src, <32 x half> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
1305 %s2 = shufflevector <32 x half> %src, <32 x half> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
1306 %s3 = shufflevector <32 x half> %src, <32 x half> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
1307 %a = fadd <8 x half> %s1, %s2
1308 %r = fadd <8 x half> %a, %s3
; Stride-4 de-interleave of a <32 x half>: %s1..%s4 take every fourth lane
; starting at offsets 0, 1, 2 and 3; the results are summed pairwise and then
; together. Only the CHECKFP prefix is used here, i.e. only the run lines
; with -mattr=+mve.fp verify this function.
1312 define arm_aapcs_vfpcc <8 x half> @shuffle4step_f16(<32 x half> %src) {
1313 ; CHECKFP-LABEL: shuffle4step_f16:
1314 ; CHECKFP: @ %bb.0: @ %entry
1315 ; CHECKFP-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
1316 ; CHECKFP-NEXT: vpush {d8, d9, d10, d11, d12, d13}
1317 ; CHECKFP-NEXT: vmovx.f16 s18, s9
1318 ; CHECKFP-NEXT: vmovx.f16 s16, s11
1319 ; CHECKFP-NEXT: vins.f16 s18, s16
1320 ; CHECKFP-NEXT: vmovx.f16 s19, s13
1321 ; CHECKFP-NEXT: vmovx.f16 s16, s15
1322 ; CHECKFP-NEXT: vmovx.f16 s22, s8
1323 ; CHECKFP-NEXT: vins.f16 s19, s16
1324 ; CHECKFP-NEXT: vmovx.f16 s16, s1
1325 ; CHECKFP-NEXT: vmovx.f16 s20, s3
1326 ; CHECKFP-NEXT: vins.f16 s1, s3
1327 ; CHECKFP-NEXT: vmovx.f16 s3, s10
1328 ; CHECKFP-NEXT: vins.f16 s16, s20
1329 ; CHECKFP-NEXT: vmovx.f16 s17, s5
1330 ; CHECKFP-NEXT: vmovx.f16 s20, s7
1331 ; CHECKFP-NEXT: vins.f16 s22, s3
1332 ; CHECKFP-NEXT: vmovx.f16 s23, s12
1333 ; CHECKFP-NEXT: vmovx.f16 s3, s14
1334 ; CHECKFP-NEXT: vins.f16 s17, s20
1335 ; CHECKFP-NEXT: vins.f16 s23, s3
1336 ; CHECKFP-NEXT: vmovx.f16 s20, s0
1337 ; CHECKFP-NEXT: vmovx.f16 s3, s2
1338 ; CHECKFP-NEXT: vins.f16 s9, s11
1339 ; CHECKFP-NEXT: vins.f16 s13, s15
1340 ; CHECKFP-NEXT: vins.f16 s5, s7
1341 ; CHECKFP-NEXT: vins.f16 s20, s3
1342 ; CHECKFP-NEXT: vmovx.f16 s21, s4
1343 ; CHECKFP-NEXT: vmovx.f16 s3, s6
1344 ; CHECKFP-NEXT: vins.f16 s8, s10
1345 ; CHECKFP-NEXT: vins.f16 s12, s14
1346 ; CHECKFP-NEXT: vins.f16 s4, s6
1347 ; CHECKFP-NEXT: vins.f16 s21, s3
1348 ; CHECKFP-NEXT: vins.f16 s0, s2
1349 ; CHECKFP-NEXT: vmov.f32 s24, s1
1350 ; CHECKFP-NEXT: vmov.f32 s26, s9
1351 ; CHECKFP-NEXT: vmov.f32 s27, s13
1352 ; CHECKFP-NEXT: vmov.f32 s25, s5
1353 ; CHECKFP-NEXT: vmov.f32 s2, s8
1354 ; CHECKFP-NEXT: vadd.f16 q4, q6, q4
1355 ; CHECKFP-NEXT: vmov.f32 s3, s12
1356 ; CHECKFP-NEXT: vmov.f32 s1, s4
1357 ; CHECKFP-NEXT: vadd.f16 q0, q0, q5
1358 ; CHECKFP-NEXT: vadd.f16 q0, q0, q4
1359 ; CHECKFP-NEXT: vpop {d8, d9, d10, d11, d12, d13}
1360 ; CHECKFP-NEXT: bx lr
1362 %s1 = shufflevector <32 x half> %src, <32 x half> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
1363 %s2 = shufflevector <32 x half> %src, <32 x half> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
1364 %s3 = shufflevector <32 x half> %src, <32 x half> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
1365 %s4 = shufflevector <32 x half> %src, <32 x half> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
1366 %a1 = fadd <8 x half> %s1, %s2
1367 %a2 = fadd <8 x half> %s3, %s4
1368 %r = fadd <8 x half> %a1, %a2
; Identity shuffle of a <2 x double> (mask <0, 1>). The expectations shown
; here contain no data-movement instructions.
1374 define arm_aapcs_vfpcc <2 x double> @shuffle1_f64(<2 x double> %src) {
1375 ; CHECK-LABEL: shuffle1_f64:
1376 ; CHECK: @ %bb.0: @ %entry
1379 %out = shufflevector <2 x double> %src, <2 x double> undef, <2 x i32> <i32 0, i32 1>
1380 ret <2 x double> %out
; Swaps the two double lanes of %src (mask <1, 0>). The expected code builds
; the swapped value in q1 with four vmov.f32 moves and then copies q1 into q0.
1383 define arm_aapcs_vfpcc <2 x double> @shuffle2_f64(<2 x double> %src) {
1384 ; CHECK-LABEL: shuffle2_f64:
1385 ; CHECK: @ %bb.0: @ %entry
1386 ; CHECK-NEXT: vmov.f32 s4, s2
1387 ; CHECK-NEXT: vmov.f32 s6, s0
1388 ; CHECK-NEXT: vmov.f32 s5, s3
1389 ; CHECK-NEXT: vmov.f32 s7, s1
1390 ; CHECK-NEXT: vmov q0, q1
1393 %out = shufflevector <2 x double> %src, <2 x double> undef, <2 x i32> <i32 1, i32 0>
1394 ret <2 x double> %out
; Shuffle of a <2 x double> with mask <undef, 1>: lane 0 is undefined and
; lane 1 stays in place. The expectations shown here contain no data-movement
; instructions.
1397 define arm_aapcs_vfpcc <2 x double> @shuffle3_f64(<2 x double> %src) {
1398 ; CHECK-LABEL: shuffle3_f64:
1399 ; CHECK: @ %bb.0: @ %entry
1402 %out = shufflevector <2 x double> %src, <2 x double> undef, <2 x i32> <i32 undef, i32 1>
1403 ret <2 x double> %out
; Widening shuffle: two <2 x double> inputs produce a <4 x double> with mask
; <3, 1, 2, 0>, i.e. [src2[1], src1[1], src2[0], src1[0]]. The expectations
; perform the permutation with 32-bit vmov.f32 moves plus one q-register copy.
1406 define arm_aapcs_vfpcc <4 x double> @shuffle4_f64(<2 x double> %src1, <2 x double> %src2) {
1407 ; CHECK-LABEL: shuffle4_f64:
1408 ; CHECK: @ %bb.0: @ %entry
1409 ; CHECK-NEXT: vmov.f32 s8, s6
1410 ; CHECK-NEXT: vmov.f32 s6, s0
1411 ; CHECK-NEXT: vmov.f32 s9, s7
1412 ; CHECK-NEXT: vmov.f32 s7, s1
1413 ; CHECK-NEXT: vmov.f32 s10, s2
1414 ; CHECK-NEXT: vmov.f32 s11, s3
1415 ; CHECK-NEXT: vmov q0, q2
1418 %out = shufflevector <2 x double> %src1, <2 x double> %src2, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
1419 ret <4 x double> %out
; Widening shuffle: two <2 x double> inputs produce a <4 x double> with mask
; <3, 2, 1, 0>, i.e. the concatenation of %src1 and %src2 in reversed lane
; order. The expectations perform the permutation with 32-bit vmov.f32 moves
; plus one q-register copy.
1421 define arm_aapcs_vfpcc <4 x double> @shuffle5_f64(<2 x double> %src1, <2 x double> %src2) {
1422 ; CHECK-LABEL: shuffle5_f64:
1423 ; CHECK: @ %bb.0: @ %entry
1424 ; CHECK-NEXT: vmov.f32 s8, s6
1425 ; CHECK-NEXT: vmov.f32 s10, s4
1426 ; CHECK-NEXT: vmov.f32 s4, s2
1427 ; CHECK-NEXT: vmov.f32 s6, s0
1428 ; CHECK-NEXT: vmov.f32 s9, s7
1429 ; CHECK-NEXT: vmov.f32 s11, s5
1430 ; CHECK-NEXT: vmov.f32 s5, s3
1431 ; CHECK-NEXT: vmov.f32 s7, s1
1432 ; CHECK-NEXT: vmov q0, q2
1435 %out = shufflevector <2 x double> %src1, <2 x double> %src2, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1436 ret <4 x double> %out
; Mask <0, 3>: lane 0 from %src1 and lane 1 from %src2. The expectations
; overwrite the upper 64 bits of q0 in place with two 32-bit moves.
1438 define arm_aapcs_vfpcc <2 x double> @shuffle6_f64(<2 x double> %src1, <2 x double> %src2) {
1439 ; CHECK-LABEL: shuffle6_f64:
1440 ; CHECK: @ %bb.0: @ %entry
1441 ; CHECK-NEXT: vmov.f32 s2, s6
1442 ; CHECK-NEXT: vmov.f32 s3, s7
1445 %out = shufflevector <2 x double> %src1, <2 x double> %src2, <2 x i32> <i32 0, i32 3>
1446 ret <2 x double> %out
; Mask <3, 1>: lane 0 is lane 1 of %src2 and lane 1 stays as lane 1 of %src1.
; The expectations overwrite the lower 64 bits of q0 in place with two 32-bit
; moves.
1448 define arm_aapcs_vfpcc <2 x double> @shuffle7_f64(<2 x double> %src1, <2 x double> %src2) {
1449 ; CHECK-LABEL: shuffle7_f64:
1450 ; CHECK: @ %bb.0: @ %entry
1451 ; CHECK-NEXT: vmov.f32 s0, s6
1452 ; CHECK-NEXT: vmov.f32 s1, s7
1455 %out = shufflevector <2 x double> %src1, <2 x double> %src2, <2 x i32> <i32 3, i32 1>
1456 ret <2 x double> %out
; Mask <2, 1>: lane 0 is lane 0 of %src2 and lane 1 is lane 1 of %src1. The
; expectations patch the upper 64 bits of q1 from q0 and then copy q1 into q0.
1458 define arm_aapcs_vfpcc <2 x double> @shuffle8_f64(<2 x double> %src1, <2 x double> %src2) {
1459 ; CHECK-LABEL: shuffle8_f64:
1460 ; CHECK: @ %bb.0: @ %entry
1461 ; CHECK-NEXT: vmov.f32 s6, s2
1462 ; CHECK-NEXT: vmov.f32 s7, s3
1463 ; CHECK-NEXT: vmov q0, q1
1466 %out = shufflevector <2 x double> %src1, <2 x double> %src2, <2 x i32> <i32 2, i32 1>
1467 ret <2 x double> %out
; Interleaves two <4 x double> inputs lane by lane into an <8 x double>
; (mask <0, 4, 1, 5, 2, 6, 3, 7>). The expected instruction sequence differs
; between run configurations, so there are two sets of expectations: the
; CHECK-LV prefix belongs to the run lines without -early-live-intervals and
; the CHECK-LIS prefix to the run lines with it.
1469 define arm_aapcs_vfpcc <8 x double> @shuffle9_f64(<4 x double> %src1, <4 x double> %src2) {
1470 ; CHECK-LV-LABEL: shuffle9_f64:
1471 ; CHECK-LV: @ %bb.0: @ %entry
1472 ; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11}
1473 ; CHECK-LV-NEXT: vpush {d8, d9, d10, d11}
1474 ; CHECK-LV-NEXT: vmov q5, q2
1475 ; CHECK-LV-NEXT: vmov.f32 s16, s0
1476 ; CHECK-LV-NEXT: vmov.f32 s18, s20
1477 ; CHECK-LV-NEXT: vmov.f32 s20, s2
1478 ; CHECK-LV-NEXT: vmov.f32 s10, s12
1479 ; CHECK-LV-NEXT: vmov.f32 s19, s21
1480 ; CHECK-LV-NEXT: vmov.f32 s8, s4
1481 ; CHECK-LV-NEXT: vmov.f32 s17, s1
1482 ; CHECK-LV-NEXT: vmov.f32 s21, s3
1483 ; CHECK-LV-NEXT: vmov q0, q4
1484 ; CHECK-LV-NEXT: vmov.f32 s12, s6
1485 ; CHECK-LV-NEXT: vmov.f32 s11, s13
1486 ; CHECK-LV-NEXT: vmov.f32 s9, s5
1487 ; CHECK-LV-NEXT: vmov.f32 s13, s7
1488 ; CHECK-LV-NEXT: vmov q1, q5
1489 ; CHECK-LV-NEXT: vpop {d8, d9, d10, d11}
1490 ; CHECK-LV-NEXT: bx lr
1492 ; CHECK-LIS-LABEL: shuffle9_f64:
1493 ; CHECK-LIS: @ %bb.0: @ %entry
1494 ; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11}
1495 ; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11}
1496 ; CHECK-LIS-NEXT: vmov q5, q2
1497 ; CHECK-LIS-NEXT: vmov q4, q0
1498 ; CHECK-LIS-NEXT: vmov.f32 s2, s20
1499 ; CHECK-LIS-NEXT: vmov.f32 s20, s18
1500 ; CHECK-LIS-NEXT: vmov.f32 s10, s12
1501 ; CHECK-LIS-NEXT: vmov.f32 s3, s21
1502 ; CHECK-LIS-NEXT: vmov.f32 s8, s4
1503 ; CHECK-LIS-NEXT: vmov.f32 s21, s19
1504 ; CHECK-LIS-NEXT: vmov.f32 s12, s6
1505 ; CHECK-LIS-NEXT: vmov.f32 s11, s13
1506 ; CHECK-LIS-NEXT: vmov.f32 s9, s5
1507 ; CHECK-LIS-NEXT: vmov.f32 s13, s7
1508 ; CHECK-LIS-NEXT: vmov q1, q5
1509 ; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11}
1510 ; CHECK-LIS-NEXT: bx lr
1512 %out = shufflevector <4 x double> %src1, <4 x double> %src2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
1513 ret <8 x double> %out
; Widening shuffle: two <2 x i64> inputs produce a <4 x i64> with mask
; <3, 1, 2, 0>, i.e. [src2[1], src1[1], src2[0], src1[0]]. The expectations
; perform the permutation with 32-bit vmov.f32 moves plus one q-register copy.
1519 define arm_aapcs_vfpcc <4 x i64> @shuffle4_i64(<2 x i64> %src1, <2 x i64> %src2) {
1520 ; CHECK-LABEL: shuffle4_i64:
1521 ; CHECK: @ %bb.0: @ %entry
1522 ; CHECK-NEXT: vmov.f32 s8, s6
1523 ; CHECK-NEXT: vmov.f32 s6, s0
1524 ; CHECK-NEXT: vmov.f32 s9, s7
1525 ; CHECK-NEXT: vmov.f32 s7, s1
1526 ; CHECK-NEXT: vmov.f32 s10, s2
1527 ; CHECK-NEXT: vmov.f32 s11, s3
1528 ; CHECK-NEXT: vmov q0, q2
1531 %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
; Widening shuffle: two <2 x i64> inputs produce a <4 x i64> with mask
; <3, 2, 1, 0>, i.e. the concatenation of %src1 and %src2 in reversed lane
; order. The expectations perform the permutation with 32-bit vmov.f32 moves
; plus one q-register copy.
1534 define arm_aapcs_vfpcc <4 x i64> @shuffle5_i64(<2 x i64> %src1, <2 x i64> %src2) {
1535 ; CHECK-LABEL: shuffle5_i64:
1536 ; CHECK: @ %bb.0: @ %entry
1537 ; CHECK-NEXT: vmov.f32 s8, s6
1538 ; CHECK-NEXT: vmov.f32 s10, s4
1539 ; CHECK-NEXT: vmov.f32 s4, s2
1540 ; CHECK-NEXT: vmov.f32 s6, s0
1541 ; CHECK-NEXT: vmov.f32 s9, s7
1542 ; CHECK-NEXT: vmov.f32 s11, s5
1543 ; CHECK-NEXT: vmov.f32 s5, s3
1544 ; CHECK-NEXT: vmov.f32 s7, s1
1545 ; CHECK-NEXT: vmov q0, q2
1548 %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; Mask <0, 3>: lane 0 from %src1 and lane 1 from %src2. The expectations
; overwrite the upper 64 bits of q0 in place with two 32-bit moves.
1551 define arm_aapcs_vfpcc <2 x i64> @shuffle6_i64(<2 x i64> %src1, <2 x i64> %src2) {
1552 ; CHECK-LABEL: shuffle6_i64:
1553 ; CHECK: @ %bb.0: @ %entry
1554 ; CHECK-NEXT: vmov.f32 s2, s6
1555 ; CHECK-NEXT: vmov.f32 s3, s7
1558 %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> <i32 0, i32 3>
; Mask <3, 1>: lane 0 is lane 1 of %src2 and lane 1 stays as lane 1 of %src1.
; The expectations overwrite the lower 64 bits of q0 in place with two 32-bit
; moves.
1561 define arm_aapcs_vfpcc <2 x i64> @shuffle7_i64(<2 x i64> %src1, <2 x i64> %src2) {
1562 ; CHECK-LABEL: shuffle7_i64:
1563 ; CHECK: @ %bb.0: @ %entry
1564 ; CHECK-NEXT: vmov.f32 s0, s6
1565 ; CHECK-NEXT: vmov.f32 s1, s7
1568 %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> <i32 3, i32 1>
; Mask <2, 1>: lane 0 is lane 0 of %src2 and lane 1 is lane 1 of %src1. The
; expectations patch the upper 64 bits of q1 from q0 and then copy q1 into q0.
1571 define arm_aapcs_vfpcc <2 x i64> @shuffle8_i64(<2 x i64> %src1, <2 x i64> %src2) {
1572 ; CHECK-LABEL: shuffle8_i64:
1573 ; CHECK: @ %bb.0: @ %entry
1574 ; CHECK-NEXT: vmov.f32 s6, s2
1575 ; CHECK-NEXT: vmov.f32 s7, s3
1576 ; CHECK-NEXT: vmov q0, q1
1579 %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> <i32 2, i32 1>
; Interleaves two <4 x i64> inputs lane by lane into an <8 x i64>
; (mask <0, 4, 1, 5, 2, 6, 3, 7>). The expected instruction sequence differs
; between run configurations, so there are two sets of expectations: the
; CHECK-LV prefix belongs to the run lines without -early-live-intervals and
; the CHECK-LIS prefix to the run lines with it.
1582 define arm_aapcs_vfpcc <8 x i64> @shuffle9_i64(<4 x i64> %src1, <4 x i64> %src2) {
1583 ; CHECK-LV-LABEL: shuffle9_i64:
1584 ; CHECK-LV: @ %bb.0: @ %entry
1585 ; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11}
1586 ; CHECK-LV-NEXT: vpush {d8, d9, d10, d11}
1587 ; CHECK-LV-NEXT: vmov q5, q2
1588 ; CHECK-LV-NEXT: vmov.f32 s16, s0
1589 ; CHECK-LV-NEXT: vmov.f32 s18, s20
1590 ; CHECK-LV-NEXT: vmov.f32 s20, s2
1591 ; CHECK-LV-NEXT: vmov.f32 s10, s12
1592 ; CHECK-LV-NEXT: vmov.f32 s19, s21
1593 ; CHECK-LV-NEXT: vmov.f32 s8, s4
1594 ; CHECK-LV-NEXT: vmov.f32 s17, s1
1595 ; CHECK-LV-NEXT: vmov.f32 s21, s3
1596 ; CHECK-LV-NEXT: vmov q0, q4
1597 ; CHECK-LV-NEXT: vmov.f32 s12, s6
1598 ; CHECK-LV-NEXT: vmov.f32 s11, s13
1599 ; CHECK-LV-NEXT: vmov.f32 s9, s5
1600 ; CHECK-LV-NEXT: vmov.f32 s13, s7
1601 ; CHECK-LV-NEXT: vmov q1, q5
1602 ; CHECK-LV-NEXT: vpop {d8, d9, d10, d11}
1603 ; CHECK-LV-NEXT: bx lr
1605 ; CHECK-LIS-LABEL: shuffle9_i64:
1606 ; CHECK-LIS: @ %bb.0: @ %entry
1607 ; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11}
1608 ; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11}
1609 ; CHECK-LIS-NEXT: vmov q5, q2
1610 ; CHECK-LIS-NEXT: vmov q4, q0
1611 ; CHECK-LIS-NEXT: vmov.f32 s2, s20
1612 ; CHECK-LIS-NEXT: vmov.f32 s20, s18
1613 ; CHECK-LIS-NEXT: vmov.f32 s10, s12
1614 ; CHECK-LIS-NEXT: vmov.f32 s3, s21
1615 ; CHECK-LIS-NEXT: vmov.f32 s8, s4
1616 ; CHECK-LIS-NEXT: vmov.f32 s21, s19
1617 ; CHECK-LIS-NEXT: vmov.f32 s12, s6
1618 ; CHECK-LIS-NEXT: vmov.f32 s11, s13
1619 ; CHECK-LIS-NEXT: vmov.f32 s9, s5
1620 ; CHECK-LIS-NEXT: vmov.f32 s13, s7
1621 ; CHECK-LIS-NEXT: vmov q1, q5
1622 ; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11}
1623 ; CHECK-LIS-NEXT: bx lr
1625 %out = shufflevector <4 x i64> %src1, <4 x i64> %src2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; Inserts a scalar i32 into lane 0 of an otherwise undefined <4 x i32>.
; Expected as a single vmov.32 from r0 into q0[0].
1630 define arm_aapcs_vfpcc <4 x i32> @insert_i32(i32 %a) {
1631 ; CHECK-LABEL: insert_i32:
1632 ; CHECK: @ %bb.0: @ %entry
1633 ; CHECK-NEXT: vmov.32 q0[0], r0
1636 %res = insertelement <4 x i32> undef, i32 %a, i32 0
; Inserts a scalar i16 into lane 0 of an otherwise undefined <8 x i16>.
; Expected as a single vmov.16 from r0 into q0[0].
1640 define arm_aapcs_vfpcc <8 x i16> @insert_i16(i16 %a) {
1641 ; CHECK-LABEL: insert_i16:
1642 ; CHECK: @ %bb.0: @ %entry
1643 ; CHECK-NEXT: vmov.16 q0[0], r0
1646 %res = insertelement <8 x i16> undef, i16 %a, i32 0
; Inserts a scalar i8 into lane 0 of an otherwise undefined <16 x i8>.
; Expected as a single vmov.8 from r0 into q0[0].
1650 define arm_aapcs_vfpcc <16 x i8> @insert_i8(i8 %a) {
1651 ; CHECK-LABEL: insert_i8:
1652 ; CHECK: @ %bb.0: @ %entry
1653 ; CHECK-NEXT: vmov.8 q0[0], r0
1656 %res = insertelement <16 x i8> undef, i8 %a, i32 0
; Inserts a scalar i64 into lane 0 of an otherwise undefined <2 x i64>. The
; expectations write the value as two 32-bit inserts, r0 into q0[0] and r1
; into q0[1].
1660 define arm_aapcs_vfpcc <2 x i64> @insert_i64(i64 %a) {
1661 ; CHECK-LABEL: insert_i64:
1662 ; CHECK: @ %bb.0: @ %entry
1663 ; CHECK-NEXT: vmov.32 q0[0], r0
1664 ; CHECK-NEXT: vmov.32 q0[1], r1
1667 %res = insertelement <2 x i64> undef, i64 %a, i32 0
; Inserts a scalar float into lane 0 of an otherwise undefined <4 x float>.
; The expectations shown here contain no data-movement instructions.
1671 define arm_aapcs_vfpcc <4 x float> @insert_f32(float %a) {
1672 ; CHECK-LABEL: insert_f32:
1673 ; CHECK: @ %bb.0: @ %entry
1676 %res = insertelement <4 x float> undef, float %a, i32 0
1677 ret <4 x float> %res
; Inserts a scalar half into lane 0 of an otherwise undefined <8 x half>.
; The expectations shown here contain no data-movement instructions.
1680 define arm_aapcs_vfpcc <8 x half> @insert_f16(half %a) {
1681 ; CHECK-LABEL: insert_f16:
1682 ; CHECK: @ %bb.0: @ %entry
1685 %res = insertelement <8 x half> undef, half %a, i32 0
; Inserts a scalar double into lane 0 of an otherwise undefined <2 x double>.
; The expectations shown here contain no data-movement instructions.
1689 define arm_aapcs_vfpcc <2 x double> @insert_f64(double %a) {
1690 ; CHECK-LABEL: insert_f64:
1691 ; CHECK: @ %bb.0: @ %entry
1694 %res = insertelement <2 x double> undef, double %a, i32 0
1695 ret <2 x double> %res
; Builds a <4 x i16> from lane 0 of %v plus lanes 1-3 of a constant vector
; (mask indices 9, 10, 11 select the constants 7, 1 and 9) and bitcasts the
; result to i64. The expectations load the constants from a literal pool
; (.LCPI88_0, whose first word is zero-filled), insert the extracted lane at
; q0[0], narrow through a stack slot with vstrh.32, and reload the 64-bit
; result into r0/r1.
1698 define arm_aapcs_vfpcc i64 @scalar_to_vector_i32(<8 x i16> %v) {
1699 ; CHECK-LABEL: scalar_to_vector_i32:
1700 ; CHECK: @ %bb.0: @ %entry
1701 ; CHECK-NEXT: .pad #8
1702 ; CHECK-NEXT: sub sp, #8
1703 ; CHECK-NEXT: adr r2, .LCPI88_0
1704 ; CHECK-NEXT: vmov.u16 r0, q0[0]
1705 ; CHECK-NEXT: vldrw.u32 q0, [r2]
1706 ; CHECK-NEXT: mov r1, sp
1707 ; CHECK-NEXT: vmov.32 q0[0], r0
1708 ; CHECK-NEXT: vstrh.32 q0, [r1]
1709 ; CHECK-NEXT: ldrd r0, r1, [sp], #8
1711 ; CHECK-NEXT: .p2align 4
1712 ; CHECK-NEXT: @ %bb.1:
1713 ; CHECK-NEXT: .LCPI88_0:
1714 ; CHECK-NEXT: .zero 4
1715 ; CHECK-NEXT: .long 7 @ 0x7
1716 ; CHECK-NEXT: .long 1 @ 0x1
1717 ; CHECK-NEXT: .long 9 @ 0x9
1719 %f = shufflevector <8 x i16> %v, <8 x i16> <i16 undef, i16 7, i16 1, i16 9, i16 undef, i16 undef, i16 undef, i16 undef>, <4 x i32> <i32 0, i32 9, i32 10, i32 11>
1720 %0 = bitcast <4 x i16> %f to i64
; Extracts lane 0 of a <4 x i32>. Expected as a single move from s0 to r0.
1725 define arm_aapcs_vfpcc i32 @extract_i32_0(<4 x i32> %a) {
1726 ; CHECK-LABEL: extract_i32_0:
1727 ; CHECK: @ %bb.0: @ %entry
1728 ; CHECK-NEXT: vmov r0, s0
1731 %res = extractelement <4 x i32> %a, i32 0
; Extracts lane 3 of a <4 x i32>. Expected as a single move from s3 to r0.
1735 define arm_aapcs_vfpcc i32 @extract_i32_3(<4 x i32> %a) {
1736 ; CHECK-LABEL: extract_i32_3:
1737 ; CHECK: @ %bb.0: @ %entry
1738 ; CHECK-NEXT: vmov r0, s3
1741 %res = extractelement <4 x i32> %a, i32 3
; Extracts lane 0 of an <8 x i16>. Expected as a single vmov.u16 from q0[0]
; to r0.
1745 define arm_aapcs_vfpcc i16 @extract_i16_0(<8 x i16> %a) {
1746 ; CHECK-LABEL: extract_i16_0:
1747 ; CHECK: @ %bb.0: @ %entry
1748 ; CHECK-NEXT: vmov.u16 r0, q0[0]
1751 %res = extractelement <8 x i16> %a, i32 0
; Extracts lane 3 of an <8 x i16>. Expected as a single vmov.u16 from q0[3]
; to r0.
1755 define arm_aapcs_vfpcc i16 @extract_i16_3(<8 x i16> %a) {
1756 ; CHECK-LABEL: extract_i16_3:
1757 ; CHECK: @ %bb.0: @ %entry
1758 ; CHECK-NEXT: vmov.u16 r0, q0[3]
1761 %res = extractelement <8 x i16> %a, i32 3
; Extracts lane 0 of a <16 x i8>. Expected as a single vmov.u8 from q0[0]
; to r0.
1765 define arm_aapcs_vfpcc i8 @extract_i8_0(<16 x i8> %a) {
1766 ; CHECK-LABEL: extract_i8_0:
1767 ; CHECK: @ %bb.0: @ %entry
1768 ; CHECK-NEXT: vmov.u8 r0, q0[0]
1771 %res = extractelement <16 x i8> %a, i32 0
; Extracts lane 3 of a <16 x i8>. Expected as a single vmov.u8 from q0[3]
; to r0.
1775 define arm_aapcs_vfpcc i8 @extract_i8_3(<16 x i8> %a) {
1776 ; CHECK-LABEL: extract_i8_3:
1777 ; CHECK: @ %bb.0: @ %entry
1778 ; CHECK-NEXT: vmov.u8 r0, q0[3]
1781 %res = extractelement <16 x i8> %a, i32 3
; Extracts lane 0 of a <2 x i64>. Expected as a single move of d0 into the
; r0/r1 pair.
1785 define arm_aapcs_vfpcc i64 @extract_i64_0(<2 x i64> %a) {
1786 ; CHECK-LABEL: extract_i64_0:
1787 ; CHECK: @ %bb.0: @ %entry
1788 ; CHECK-NEXT: vmov r0, r1, d0
1791 %res = extractelement <2 x i64> %a, i32 0
; Extracts lane 1 of a <2 x i64>. Expected as a single move of d1 into the
; r0/r1 pair.
1795 define arm_aapcs_vfpcc i64 @extract_i64_1(<2 x i64> %a) {
1796 ; CHECK-LABEL: extract_i64_1:
1797 ; CHECK: @ %bb.0: @ %entry
1798 ; CHECK-NEXT: vmov r0, r1, d1
1801 %res = extractelement <2 x i64> %a, i32 1
; Extracts lane 0 of a <4 x float>. The expectations shown here contain no
; data-movement instructions.
1805 define arm_aapcs_vfpcc float @extract_f32_0(<4 x float> %a) {
1806 ; CHECK-LABEL: extract_f32_0:
1807 ; CHECK: @ %bb.0: @ %entry
1810 %res = extractelement <4 x float> %a, i32 0
; Extracts lane 3 of a <4 x float>. Expected as a single scalar move,
; s0 <- s3.
1814 define arm_aapcs_vfpcc float @extract_f32_3(<4 x float> %a) {
1815 ; CHECK-LABEL: extract_f32_3:
1816 ; CHECK: @ %bb.0: @ %entry
1817 ; CHECK-NEXT: vmov.f32 s0, s3
1820 %res = extractelement <4 x float> %a, i32 3
; Extracts lane 0 of an <8 x half>. The expectations shown here contain no
; data-movement instructions.
1824 define arm_aapcs_vfpcc half @extract_f16_0(<8 x half> %a) {
1825 ; CHECK-LABEL: extract_f16_0:
1826 ; CHECK: @ %bb.0: @ %entry
1829 %res = extractelement <8 x half> %a, i32 0
; Extracts lane 3 of an <8 x half>. Expected as a single vmovx.f16 from s1
; into s0.
1833 define arm_aapcs_vfpcc half @extract_f16_3(<8 x half> %a) {
1834 ; CHECK-LABEL: extract_f16_3:
1835 ; CHECK: @ %bb.0: @ %entry
1836 ; CHECK-NEXT: vmovx.f16 s0, s1
1839 %res = extractelement <8 x half> %a, i32 3
; Extracts lane 0 of a <2 x double>. The expectations shown here contain no
; data-movement instructions.
1843 define arm_aapcs_vfpcc double @extract_f64_0(<2 x double> %a) {
1844 ; CHECK-LABEL: extract_f64_0:
1845 ; CHECK: @ %bb.0: @ %entry
1848 %res = extractelement <2 x double> %a, i32 0
; Extracts lane 1 of a <2 x double>. The expectations move the upper 64 bits
; of q0 into the lower 64 bits with two 32-bit moves (s0 <- s2, s1 <- s3).
1852 define arm_aapcs_vfpcc double @extract_f64_1(<2 x double> %a) {
1853 ; CHECK-LABEL: extract_f64_1:
1854 ; CHECK: @ %bb.0: @ %entry
1855 ; CHECK-NEXT: vmov.f32 s0, s2
1856 ; CHECK-NEXT: vmov.f32 s1, s3
1859 %res = extractelement <2 x double> %a, i32 1