1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s
; vst2_v2i32: loads two <2 x i32> vectors from %src (indices 0 and 1), interleaves
; them with mask <0,2,1,3> and stores the <4 x i32> result to %dst (align 4).
; The expected output builds q0 from four scalar loads (ldm.w + ldr) and two
; vmov lane-pair inserts, then stores it with a single vstrw.32.
6 define void @vst2_v2i32(<2 x i32> *%src, <4 x i32> *%dst) {
7 ; CHECK-LABEL: vst2_v2i32:
8 ; CHECK: @ %bb.0: @ %entry
9 ; CHECK-NEXT: ldm.w r0, {r2, r3, r12}
10 ; CHECK-NEXT: ldr r0, [r0, #12]
11 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
12 ; CHECK-NEXT: vmov q0[3], q0[1], r12, r0
13 ; CHECK-NEXT: vstrw.32 q0, [r1]
16 %s1 = getelementptr <2 x i32>, <2 x i32>* %src, i32 0
17 %l1 = load <2 x i32>, <2 x i32>* %s1, align 4
18 %s2 = getelementptr <2 x i32>, <2 x i32>* %src, i32 1
19 %l2 = load <2 x i32>, <2 x i32>* %s2, align 4
20 %s = shufflevector <2 x i32> %l1, <2 x i32> %l2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
21 store <4 x i32> %s, <4 x i32> *%dst, align 4
; vst2_v4i32: loads two <4 x i32> vectors from %src (indices 0 and 1), interleaves
; them lane by lane (mask 0,4,1,5,2,6,3,7) and stores the <8 x i32> result to
; %dst (align 4).
; The expected output is two vldrw.u32 loads followed by a vst20.32/vst21.32
; pair on {q0, q1}.
25 define void @vst2_v4i32(<4 x i32> *%src, <8 x i32> *%dst) {
26 ; CHECK-LABEL: vst2_v4i32:
27 ; CHECK: @ %bb.0: @ %entry
28 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
29 ; CHECK-NEXT: vldrw.u32 q0, [r0]
30 ; CHECK-NEXT: vst20.32 {q0, q1}, [r1]
31 ; CHECK-NEXT: vst21.32 {q0, q1}, [r1]
34 %s1 = getelementptr <4 x i32>, <4 x i32>* %src, i32 0
35 %l1 = load <4 x i32>, <4 x i32>* %s1, align 4
36 %s2 = getelementptr <4 x i32>, <4 x i32>* %src, i32 1
37 %l2 = load <4 x i32>, <4 x i32>* %s2, align 4
38 %s = shufflevector <4 x i32> %l1, <4 x i32> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
39 store <8 x i32> %s, <8 x i32> *%dst, align 4
; vst2_v8i32: loads two <8 x i32> vectors from %src (indices 0 and 1), interleaves
; them lane by lane (mask 0,8,1,9,...,7,15) and stores the <16 x i32> result to
; %dst (align 4).
; The expected output is four vldrw.u32 loads and two vst20.32/vst21.32 pairs;
; the first vst21.32 uses the writeback form ([r1]!) before the second pair.
43 define void @vst2_v8i32(<8 x i32> *%src, <16 x i32> *%dst) {
44 ; CHECK-LABEL: vst2_v8i32:
45 ; CHECK: @ %bb.0: @ %entry
46 ; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
47 ; CHECK-NEXT: vldrw.u32 q0, [r0]
48 ; CHECK-NEXT: vldrw.u32 q3, [r0, #48]
49 ; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
50 ; CHECK-NEXT: vst20.32 {q0, q1}, [r1]
51 ; CHECK-NEXT: vst21.32 {q0, q1}, [r1]!
52 ; CHECK-NEXT: vst20.32 {q2, q3}, [r1]
53 ; CHECK-NEXT: vst21.32 {q2, q3}, [r1]
56 %s1 = getelementptr <8 x i32>, <8 x i32>* %src, i32 0
57 %l1 = load <8 x i32>, <8 x i32>* %s1, align 4
58 %s2 = getelementptr <8 x i32>, <8 x i32>* %src, i32 1
59 %l2 = load <8 x i32>, <8 x i32>* %s2, align 4
60 %s = shufflevector <8 x i32> %l1, <8 x i32> %l2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
61 store <16 x i32> %s, <16 x i32> *%dst, align 4
; vst2_v16i32: loads two <16 x i32> vectors from %src (indices 0 and 1),
; interleaves them lane by lane (mask 0,16,1,17,...,15,31) and stores the
; <32 x i32> result to %dst (align 4).
; The expected output uses all of q0-q7, wrapped in a vpush/vpop of d8-d15, and
; four vst20.32/vst21.32 pairs. Two extra base addresses (r1+64 in r2, r1+96 in
; r0) are computed before the writeback form of the first vst21.32 updates r1.
65 define void @vst2_v16i32(<16 x i32> *%src, <32 x i32> *%dst) {
66 ; CHECK-LABEL: vst2_v16i32:
67 ; CHECK: @ %bb.0: @ %entry
68 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
69 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
70 ; CHECK-NEXT: vldrw.u32 q7, [r0, #64]
71 ; CHECK-NEXT: vldrw.u32 q6, [r0]
72 ; CHECK-NEXT: vldrw.u32 q1, [r0, #112]
73 ; CHECK-NEXT: vldrw.u32 q3, [r0, #96]
74 ; CHECK-NEXT: vldrw.u32 q5, [r0, #80]
75 ; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
76 ; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
77 ; CHECK-NEXT: vldrw.u32 q4, [r0, #16]
78 ; CHECK-NEXT: add.w r0, r1, #96
79 ; CHECK-NEXT: add.w r2, r1, #64
80 ; CHECK-NEXT: vst20.32 {q6, q7}, [r1]
81 ; CHECK-NEXT: vst21.32 {q6, q7}, [r1]!
82 ; CHECK-NEXT: vst20.32 {q4, q5}, [r1]
83 ; CHECK-NEXT: vst20.32 {q2, q3}, [r2]
84 ; CHECK-NEXT: vst20.32 {q0, q1}, [r0]
85 ; CHECK-NEXT: vst21.32 {q4, q5}, [r1]
86 ; CHECK-NEXT: vst21.32 {q2, q3}, [r2]
87 ; CHECK-NEXT: vst21.32 {q0, q1}, [r0]
88 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
91 %s1 = getelementptr <16 x i32>, <16 x i32>* %src, i32 0
92 %l1 = load <16 x i32>, <16 x i32>* %s1, align 4
93 %s2 = getelementptr <16 x i32>, <16 x i32>* %src, i32 1
94 %l2 = load <16 x i32>, <16 x i32>* %s2, align 4
95 %s = shufflevector <16 x i32> %l1, <16 x i32> %l2, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
96 store <32 x i32> %s, <32 x i32> *%dst, align 4
; vst2_v4i32_align1: same loads and interleave mask as vst2_v4i32, but the
; <8 x i32> store to %dst is only align 1 (the loads remain align 4).
; The expected output contains no vst20/vst21: the interleave is done with
; per-lane vmov.f32 copies into q2 and q3, which are written with vstrb.8
; (upper half at [r1, #16], lower half at [r1]).
100 define void @vst2_v4i32_align1(<4 x i32> *%src, <8 x i32> *%dst) {
101 ; CHECK-LABEL: vst2_v4i32_align1:
102 ; CHECK: @ %bb.0: @ %entry
103 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
104 ; CHECK-NEXT: vldrw.u32 q1, [r0]
105 ; CHECK-NEXT: vmov.f32 s8, s6
106 ; CHECK-NEXT: vmov.f32 s9, s2
107 ; CHECK-NEXT: vmov.f32 s10, s7
108 ; CHECK-NEXT: vmov.f32 s11, s3
109 ; CHECK-NEXT: vmov.f32 s12, s4
110 ; CHECK-NEXT: vstrb.8 q2, [r1, #16]
111 ; CHECK-NEXT: vmov.f32 s13, s0
112 ; CHECK-NEXT: vmov.f32 s14, s5
113 ; CHECK-NEXT: vmov.f32 s15, s1
114 ; CHECK-NEXT: vstrb.8 q3, [r1]
117 %s1 = getelementptr <4 x i32>, <4 x i32>* %src, i32 0
118 %l1 = load <4 x i32>, <4 x i32>* %s1, align 4
119 %s2 = getelementptr <4 x i32>, <4 x i32>* %src, i32 1
120 %l2 = load <4 x i32>, <4 x i32>* %s2, align 4
121 %s = shufflevector <4 x i32> %l1, <4 x i32> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
122 store <8 x i32> %s, <8 x i32> *%dst, align 1
; vst2_v2i16: loads two <2 x i16> vectors from %src (indices 0 and 1), interleaves
; them with mask <0,2,1,3> and stores the <4 x i16> result to %dst (align 2).
; The expected output reads the four halfwords with scalar ldrh loads, inserts
; them into the 32-bit lanes of q0 with two vmov lane-pair inserts, and stores
; with vstrh.32.
128 define void @vst2_v2i16(<2 x i16> *%src, <4 x i16> *%dst) {
129 ; CHECK-LABEL: vst2_v2i16:
130 ; CHECK: @ %bb.0: @ %entry
131 ; CHECK-NEXT: ldrh r2, [r0, #2]
132 ; CHECK-NEXT: ldrh r3, [r0]
133 ; CHECK-NEXT: ldrh.w r12, [r0, #6]
134 ; CHECK-NEXT: ldrh r0, [r0, #4]
135 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
136 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r12
137 ; CHECK-NEXT: vstrh.32 q0, [r1]
140 %s1 = getelementptr <2 x i16>, <2 x i16>* %src, i32 0
141 %l1 = load <2 x i16>, <2 x i16>* %s1, align 4
142 %s2 = getelementptr <2 x i16>, <2 x i16>* %src, i32 1
143 %l2 = load <2 x i16>, <2 x i16>* %s2, align 4
144 %s = shufflevector <2 x i16> %l1, <2 x i16> %l2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
145 store <4 x i16> %s, <4 x i16> *%dst, align 2
; vst2_v4i16: loads two <4 x i16> vectors from %src (indices 0 and 1), interleaves
; them lane by lane (mask 0,4,1,5,2,6,3,7) and stores the <8 x i16> result to
; %dst (align 2).
; The expected output loads each half with vldrh.u32, merges them with
; vmovnt.i32 and stores the combined register with vstrh.16.
149 define void @vst2_v4i16(<4 x i16> *%src, <8 x i16> *%dst) {
150 ; CHECK-LABEL: vst2_v4i16:
151 ; CHECK: @ %bb.0: @ %entry
152 ; CHECK-NEXT: vldrh.u32 q0, [r0, #8]
153 ; CHECK-NEXT: vldrh.u32 q1, [r0]
154 ; CHECK-NEXT: vmovnt.i32 q1, q0
155 ; CHECK-NEXT: vstrh.16 q1, [r1]
158 %s1 = getelementptr <4 x i16>, <4 x i16>* %src, i32 0
159 %l1 = load <4 x i16>, <4 x i16>* %s1, align 4
160 %s2 = getelementptr <4 x i16>, <4 x i16>* %src, i32 1
161 %l2 = load <4 x i16>, <4 x i16>* %s2, align 4
162 %s = shufflevector <4 x i16> %l1, <4 x i16> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
163 store <8 x i16> %s, <8 x i16> *%dst, align 2
; vst2_v8i16: loads two <8 x i16> vectors from %src (indices 0 and 1), interleaves
; them lane by lane (mask 0,8,1,9,...,7,15) and stores the <16 x i16> result to
; %dst (align 2).
; The expected output is two vldrw.u32 loads followed by a vst20.16/vst21.16
; pair on {q0, q1}.
167 define void @vst2_v8i16(<8 x i16> *%src, <16 x i16> *%dst) {
168 ; CHECK-LABEL: vst2_v8i16:
169 ; CHECK: @ %bb.0: @ %entry
170 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
171 ; CHECK-NEXT: vldrw.u32 q0, [r0]
172 ; CHECK-NEXT: vst20.16 {q0, q1}, [r1]
173 ; CHECK-NEXT: vst21.16 {q0, q1}, [r1]
176 %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0
177 %l1 = load <8 x i16>, <8 x i16>* %s1, align 4
178 %s2 = getelementptr <8 x i16>, <8 x i16>* %src, i32 1
179 %l2 = load <8 x i16>, <8 x i16>* %s2, align 4
180 %s = shufflevector <8 x i16> %l1, <8 x i16> %l2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
181 store <16 x i16> %s, <16 x i16> *%dst, align 2
; vst2_v16i16: loads two <16 x i16> vectors from %src (indices 0 and 1),
; interleaves them lane by lane (mask 0,16,1,17,...,15,31) and stores the
; <32 x i16> result to %dst (align 2).
; The expected output is four vldrw.u32 loads and two vst20.16/vst21.16 pairs;
; the first vst21.16 uses the writeback form ([r1]!) before the second pair.
185 define void @vst2_v16i16(<16 x i16> *%src, <32 x i16> *%dst) {
186 ; CHECK-LABEL: vst2_v16i16:
187 ; CHECK: @ %bb.0: @ %entry
188 ; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
189 ; CHECK-NEXT: vldrw.u32 q0, [r0]
190 ; CHECK-NEXT: vldrw.u32 q3, [r0, #48]
191 ; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
192 ; CHECK-NEXT: vst20.16 {q0, q1}, [r1]
193 ; CHECK-NEXT: vst21.16 {q0, q1}, [r1]!
194 ; CHECK-NEXT: vst20.16 {q2, q3}, [r1]
195 ; CHECK-NEXT: vst21.16 {q2, q3}, [r1]
198 %s1 = getelementptr <16 x i16>, <16 x i16>* %src, i32 0
199 %l1 = load <16 x i16>, <16 x i16>* %s1, align 4
200 %s2 = getelementptr <16 x i16>, <16 x i16>* %src, i32 1
201 %l2 = load <16 x i16>, <16 x i16>* %s2, align 4
202 %s = shufflevector <16 x i16> %l1, <16 x i16> %l2, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
203 store <32 x i16> %s, <32 x i16> *%dst, align 2
; vst2_v8i16_align1: same loads and interleave mask as vst2_v8i16, but the
; <16 x i16> store to %dst is only align 1 (the loads remain align 4).
; The expected output contains no vst20/vst21: the halfword interleave is built
; from vmovx.f16/vins.f16 plus vmov.f32 lane copies, and the two result
; registers are written with vstrb.8 (q0 at [r1, #16], q3 at [r1]).
207 define void @vst2_v8i16_align1(<8 x i16> *%src, <16 x i16> *%dst) {
208 ; CHECK-LABEL: vst2_v8i16_align1:
209 ; CHECK: @ %bb.0: @ %entry
210 ; CHECK-NEXT: vldrw.u32 q2, [r0]
211 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
212 ; CHECK-NEXT: vmovx.f16 s1, s10
213 ; CHECK-NEXT: vmovx.f16 s0, s6
214 ; CHECK-NEXT: vins.f16 s10, s6
215 ; CHECK-NEXT: vmovx.f16 s3, s11
216 ; CHECK-NEXT: vmovx.f16 s6, s7
217 ; CHECK-NEXT: vins.f16 s11, s7
218 ; CHECK-NEXT: vins.f16 s3, s6
219 ; CHECK-NEXT: vmovx.f16 s6, s8
220 ; CHECK-NEXT: vins.f16 s8, s4
221 ; CHECK-NEXT: vmovx.f16 s4, s4
222 ; CHECK-NEXT: vmov q3, q2
223 ; CHECK-NEXT: vins.f16 s6, s4
224 ; CHECK-NEXT: vmovx.f16 s15, s9
225 ; CHECK-NEXT: vins.f16 s9, s5
226 ; CHECK-NEXT: vmovx.f16 s4, s5
227 ; CHECK-NEXT: vins.f16 s1, s0
228 ; CHECK-NEXT: vmov.f32 s0, s10
229 ; CHECK-NEXT: vins.f16 s15, s4
230 ; CHECK-NEXT: vmov.f32 s2, s11
231 ; CHECK-NEXT: vmov.f32 s13, s6
232 ; CHECK-NEXT: vstrb.8 q0, [r1, #16]
233 ; CHECK-NEXT: vmov.f32 s14, s9
234 ; CHECK-NEXT: vstrb.8 q3, [r1]
237 %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0
238 %l1 = load <8 x i16>, <8 x i16>* %s1, align 4
239 %s2 = getelementptr <8 x i16>, <8 x i16>* %src, i32 1
240 %l2 = load <8 x i16>, <8 x i16>* %s2, align 4
241 %s = shufflevector <8 x i16> %l1, <8 x i16> %l2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
242 store <16 x i16> %s, <16 x i16> *%dst, align 1
; vst2_v2i8: loads two <2 x i8> vectors from %src (indices 0 and 1), interleaves
; them with mask <0,2,1,3> and stores the <4 x i8> result to %dst (align 1).
; The expected output reads the four bytes with scalar ldrb loads, inserts them
; into the 32-bit lanes of q0 with two vmov lane-pair inserts, and stores with
; vstrb.32.
248 define void @vst2_v2i8(<2 x i8> *%src, <4 x i8> *%dst) {
249 ; CHECK-LABEL: vst2_v2i8:
250 ; CHECK: @ %bb.0: @ %entry
251 ; CHECK-NEXT: ldrb r2, [r0]
252 ; CHECK-NEXT: ldrb r3, [r0, #1]
253 ; CHECK-NEXT: ldrb.w r12, [r0, #2]
254 ; CHECK-NEXT: ldrb r0, [r0, #3]
255 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
256 ; CHECK-NEXT: vmov q0[3], q0[1], r12, r0
257 ; CHECK-NEXT: vstrb.32 q0, [r1]
260 %s1 = getelementptr <2 x i8>, <2 x i8>* %src, i32 0
261 %l1 = load <2 x i8>, <2 x i8>* %s1, align 4
262 %s2 = getelementptr <2 x i8>, <2 x i8>* %src, i32 1
263 %l2 = load <2 x i8>, <2 x i8>* %s2, align 4
264 %s = shufflevector <2 x i8> %l1, <2 x i8> %l2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
265 store <4 x i8> %s, <4 x i8> *%dst, align 1
; vst2_v4i8: loads two <4 x i8> vectors from %src (indices 0 and 1), interleaves
; them lane by lane (mask 0,4,1,5,2,6,3,7) and stores the <8 x i8> result to
; %dst (align 1).
; The expected output loads each half with vldrb.u32, merges them with
; vmovnt.i32 and stores the combined register with vstrb.16.
269 define void @vst2_v4i8(<4 x i8> *%src, <8 x i8> *%dst) {
270 ; CHECK-LABEL: vst2_v4i8:
271 ; CHECK: @ %bb.0: @ %entry
272 ; CHECK-NEXT: vldrb.u32 q0, [r0, #4]
273 ; CHECK-NEXT: vldrb.u32 q1, [r0]
274 ; CHECK-NEXT: vmovnt.i32 q1, q0
275 ; CHECK-NEXT: vstrb.16 q1, [r1]
278 %s1 = getelementptr <4 x i8>, <4 x i8>* %src, i32 0
279 %l1 = load <4 x i8>, <4 x i8>* %s1, align 4
280 %s2 = getelementptr <4 x i8>, <4 x i8>* %src, i32 1
281 %l2 = load <4 x i8>, <4 x i8>* %s2, align 4
282 %s = shufflevector <4 x i8> %l1, <4 x i8> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
283 store <8 x i8> %s, <8 x i8> *%dst, align 1
; vst2_v8i8: loads two <8 x i8> vectors from %src (indices 0 and 1), interleaves
; them lane by lane (mask 0,8,1,9,...,7,15) and stores the <16 x i8> result to
; %dst (align 1).
; The expected output loads each half with vldrb.u16, merges them with
; vmovnt.i16 and stores the combined register with vstrb.8.
287 define void @vst2_v8i8(<8 x i8> *%src, <16 x i8> *%dst) {
288 ; CHECK-LABEL: vst2_v8i8:
289 ; CHECK: @ %bb.0: @ %entry
290 ; CHECK-NEXT: vldrb.u16 q0, [r0, #8]
291 ; CHECK-NEXT: vldrb.u16 q1, [r0]
292 ; CHECK-NEXT: vmovnt.i16 q1, q0
293 ; CHECK-NEXT: vstrb.8 q1, [r1]
296 %s1 = getelementptr <8 x i8>, <8 x i8>* %src, i32 0
297 %l1 = load <8 x i8>, <8 x i8>* %s1, align 4
298 %s2 = getelementptr <8 x i8>, <8 x i8>* %src, i32 1
299 %l2 = load <8 x i8>, <8 x i8>* %s2, align 4
300 %s = shufflevector <8 x i8> %l1, <8 x i8> %l2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
301 store <16 x i8> %s, <16 x i8> *%dst, align 1
; vst2_v16i8: loads two <16 x i8> vectors from %src (indices 0 and 1),
; interleaves them lane by lane (mask 0,16,1,17,...,15,31) and stores the
; <32 x i8> result to %dst (align 1).
; The expected output is two vldrw.u32 loads followed by a vst20.8/vst21.8
; pair on {q0, q1}.
305 define void @vst2_v16i8(<16 x i8> *%src, <32 x i8> *%dst) {
306 ; CHECK-LABEL: vst2_v16i8:
307 ; CHECK: @ %bb.0: @ %entry
308 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
309 ; CHECK-NEXT: vldrw.u32 q0, [r0]
310 ; CHECK-NEXT: vst20.8 {q0, q1}, [r1]
311 ; CHECK-NEXT: vst21.8 {q0, q1}, [r1]
314 %s1 = getelementptr <16 x i8>, <16 x i8>* %src, i32 0
315 %l1 = load <16 x i8>, <16 x i8>* %s1, align 4
316 %s2 = getelementptr <16 x i8>, <16 x i8>* %src, i32 1
317 %l2 = load <16 x i8>, <16 x i8>* %s2, align 4
318 %s = shufflevector <16 x i8> %l1, <16 x i8> %l2, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
319 store <32 x i8> %s, <32 x i8> *%dst, align 1
; vst2_v2i64: loads two <2 x i64> vectors from %src (indices 0 and 1), interleaves
; them with mask <0,2,1,3> and stores the <4 x i64> result to %dst (align 8).
; The expected output contains no vst20/vst21: each 64-bit element is moved as
; a pair of vmov.f32 lane copies, the first half is stored with a post-indexed
; vstrb.8 ([r1], #16) and the second half with vstrw.32 at the updated r1.
325 define void @vst2_v2i64(<2 x i64> *%src, <4 x i64> *%dst) {
326 ; CHECK-LABEL: vst2_v2i64:
327 ; CHECK: @ %bb.0: @ %entry
328 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
329 ; CHECK-NEXT: vldrw.u32 q1, [r0]
330 ; CHECK-NEXT: vmov.f32 s10, s0
331 ; CHECK-NEXT: vmov.f32 s11, s1
332 ; CHECK-NEXT: vmov.f32 s8, s4
333 ; CHECK-NEXT: vmov.f32 s9, s5
334 ; CHECK-NEXT: vmov.f32 s0, s6
335 ; CHECK-NEXT: vstrb.8 q2, [r1], #16
336 ; CHECK-NEXT: vmov.f32 s1, s7
337 ; CHECK-NEXT: vstrw.32 q0, [r1]
340 %s1 = getelementptr <2 x i64>, <2 x i64>* %src, i32 0
341 %l1 = load <2 x i64>, <2 x i64>* %s1, align 4
342 %s2 = getelementptr <2 x i64>, <2 x i64>* %src, i32 1
343 %l2 = load <2 x i64>, <2 x i64>* %s2, align 4
344 %s = shufflevector <2 x i64> %l1, <2 x i64> %l2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
345 store <4 x i64> %s, <4 x i64> *%dst, align 8
; vst2_v4i64: loads two <4 x i64> vectors from %src (indices 0 and 1), interleaves
; them lane by lane (mask 0,4,1,5,2,6,3,7) and stores the <8 x i64> result to
; %dst (align 8).
; The expected output contains no vst20/vst21: 64-bit elements are moved as
; pairs of vmov.f32 lane copies using q0-q5 (with a vpush/vpop of d8-d11).
; After the post-indexed vstrb.8 ([r1], #48), the remaining stores address
; relative to the updated r1 ([r1] and [r1, #-32]).
349 define void @vst2_v4i64(<4 x i64> *%src, <8 x i64> *%dst) {
350 ; CHECK-LABEL: vst2_v4i64:
351 ; CHECK: @ %bb.0: @ %entry
352 ; CHECK-NEXT: .vsave {d8, d9, d10, d11}
353 ; CHECK-NEXT: vpush {d8, d9, d10, d11}
354 ; CHECK-NEXT: vldrw.u32 q4, [r0, #48]
355 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
356 ; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
357 ; CHECK-NEXT: vldrw.u32 q1, [r0]
358 ; CHECK-NEXT: vmov.f32 s12, s2
359 ; CHECK-NEXT: vmov.f32 s13, s3
360 ; CHECK-NEXT: vmov.f32 s20, s6
361 ; CHECK-NEXT: vmov.f32 s21, s7
362 ; CHECK-NEXT: vmov.f32 s2, s16
363 ; CHECK-NEXT: vmov.f32 s3, s17
364 ; CHECK-NEXT: vmov.f32 s6, s8
365 ; CHECK-NEXT: vstrw.32 q0, [r1, #32]
366 ; CHECK-NEXT: vmov.f32 s7, s9
367 ; CHECK-NEXT: vmov.f32 s14, s18
368 ; CHECK-NEXT: vstrb.8 q1, [r1], #48
369 ; CHECK-NEXT: vmov.f32 s15, s19
370 ; CHECK-NEXT: vmov.f32 s22, s10
371 ; CHECK-NEXT: vstrw.32 q3, [r1]
372 ; CHECK-NEXT: vmov.f32 s23, s11
373 ; CHECK-NEXT: vstrw.32 q5, [r1, #-32]
374 ; CHECK-NEXT: vpop {d8, d9, d10, d11}
377 %s1 = getelementptr <4 x i64>, <4 x i64>* %src, i32 0
378 %l1 = load <4 x i64>, <4 x i64>* %s1, align 4
379 %s2 = getelementptr <4 x i64>, <4 x i64>* %src, i32 1
380 %l2 = load <4 x i64>, <4 x i64>* %s2, align 4
381 %s = shufflevector <4 x i64> %l1, <4 x i64> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
382 store <8 x i64> %s, <8 x i64> *%dst, align 8
; vst2_v2f32: loads two <2 x float> vectors from %src (indices 0 and 1),
; interleaves them with mask <0,2,1,3> and stores the <4 x float> result to
; %dst (align 4).
; The expected output performs the interleave during loading: four scalar vldr
; loads place the floats directly into s0, s2, s1, s3, followed by a single
; vstrw.32 of q0.
388 define void @vst2_v2f32(<2 x float> *%src, <4 x float> *%dst) {
389 ; CHECK-LABEL: vst2_v2f32:
390 ; CHECK: @ %bb.0: @ %entry
391 ; CHECK-NEXT: vldr s0, [r0]
392 ; CHECK-NEXT: vldr s2, [r0, #4]
393 ; CHECK-NEXT: vldr s1, [r0, #8]
394 ; CHECK-NEXT: vldr s3, [r0, #12]
395 ; CHECK-NEXT: vstrw.32 q0, [r1]
398 %s1 = getelementptr <2 x float>, <2 x float>* %src, i32 0
399 %l1 = load <2 x float>, <2 x float>* %s1, align 4
400 %s2 = getelementptr <2 x float>, <2 x float>* %src, i32 1
401 %l2 = load <2 x float>, <2 x float>* %s2, align 4
402 %s = shufflevector <2 x float> %l1, <2 x float> %l2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
403 store <4 x float> %s, <4 x float> *%dst, align 4
; vst2_v4f32: loads two <4 x float> vectors from %src (indices 0 and 1),
; interleaves them lane by lane (mask 0,4,1,5,2,6,3,7) and stores the
; <8 x float> result to %dst (align 4).
; The expected output is two vldrw.u32 loads followed by a vst20.32/vst21.32
; pair on {q0, q1}, the same sequence expected for vst2_v4i32.
407 define void @vst2_v4f32(<4 x float> *%src, <8 x float> *%dst) {
408 ; CHECK-LABEL: vst2_v4f32:
409 ; CHECK: @ %bb.0: @ %entry
410 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
411 ; CHECK-NEXT: vldrw.u32 q0, [r0]
412 ; CHECK-NEXT: vst20.32 {q0, q1}, [r1]
413 ; CHECK-NEXT: vst21.32 {q0, q1}, [r1]
416 %s1 = getelementptr <4 x float>, <4 x float>* %src, i32 0
417 %l1 = load <4 x float>, <4 x float>* %s1, align 4
418 %s2 = getelementptr <4 x float>, <4 x float>* %src, i32 1
419 %l2 = load <4 x float>, <4 x float>* %s2, align 4
420 %s = shufflevector <4 x float> %l1, <4 x float> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
421 store <8 x float> %s, <8 x float> *%dst, align 4
; vst2_v8f32: loads two <8 x float> vectors from %src (indices 0 and 1),
; interleaves them lane by lane (mask 0,8,1,9,...,7,15) and stores the
; <16 x float> result to %dst (align 4).
; The expected output is four vldrw.u32 loads and two vst20.32/vst21.32 pairs;
; the first vst21.32 uses the writeback form ([r1]!) before the second pair.
425 define void @vst2_v8f32(<8 x float> *%src, <16 x float> *%dst) {
426 ; CHECK-LABEL: vst2_v8f32:
427 ; CHECK: @ %bb.0: @ %entry
428 ; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
429 ; CHECK-NEXT: vldrw.u32 q0, [r0]
430 ; CHECK-NEXT: vldrw.u32 q3, [r0, #48]
431 ; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
432 ; CHECK-NEXT: vst20.32 {q0, q1}, [r1]
433 ; CHECK-NEXT: vst21.32 {q0, q1}, [r1]!
434 ; CHECK-NEXT: vst20.32 {q2, q3}, [r1]
435 ; CHECK-NEXT: vst21.32 {q2, q3}, [r1]
438 %s1 = getelementptr <8 x float>, <8 x float>* %src, i32 0
439 %l1 = load <8 x float>, <8 x float>* %s1, align 4
440 %s2 = getelementptr <8 x float>, <8 x float>* %src, i32 1
441 %l2 = load <8 x float>, <8 x float>* %s2, align 4
442 %s = shufflevector <8 x float> %l1, <8 x float> %l2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
443 store <16 x float> %s, <16 x float> *%dst, align 4
; vst2_v16f32: loads two <16 x float> vectors from %src (indices 0 and 1),
; interleaves them lane by lane (mask 0,16,1,17,...,15,31) and stores the
; <32 x float> result to %dst (align 4).
; The expected output uses all of q0-q7, wrapped in a vpush/vpop of d8-d15, and
; four vst20.32/vst21.32 pairs. Two extra base addresses (r1+64 in r2, r1+96 in
; r0) are computed before the writeback form of the first vst21.32 updates r1.
447 define void @vst2_v16f32(<16 x float> *%src, <32 x float> *%dst) {
448 ; CHECK-LABEL: vst2_v16f32:
449 ; CHECK: @ %bb.0: @ %entry
450 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
451 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
452 ; CHECK-NEXT: vldrw.u32 q7, [r0, #64]
453 ; CHECK-NEXT: vldrw.u32 q6, [r0]
454 ; CHECK-NEXT: vldrw.u32 q1, [r0, #112]
455 ; CHECK-NEXT: vldrw.u32 q3, [r0, #96]
456 ; CHECK-NEXT: vldrw.u32 q5, [r0, #80]
457 ; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
458 ; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
459 ; CHECK-NEXT: vldrw.u32 q4, [r0, #16]
460 ; CHECK-NEXT: add.w r0, r1, #96
461 ; CHECK-NEXT: add.w r2, r1, #64
462 ; CHECK-NEXT: vst20.32 {q6, q7}, [r1]
463 ; CHECK-NEXT: vst21.32 {q6, q7}, [r1]!
464 ; CHECK-NEXT: vst20.32 {q4, q5}, [r1]
465 ; CHECK-NEXT: vst20.32 {q2, q3}, [r2]
466 ; CHECK-NEXT: vst20.32 {q0, q1}, [r0]
467 ; CHECK-NEXT: vst21.32 {q4, q5}, [r1]
468 ; CHECK-NEXT: vst21.32 {q2, q3}, [r2]
469 ; CHECK-NEXT: vst21.32 {q0, q1}, [r0]
470 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
473 %s1 = getelementptr <16 x float>, <16 x float>* %src, i32 0
474 %l1 = load <16 x float>, <16 x float>* %s1, align 4
475 %s2 = getelementptr <16 x float>, <16 x float>* %src, i32 1
476 %l2 = load <16 x float>, <16 x float>* %s2, align 4
477 %s = shufflevector <16 x float> %l1, <16 x float> %l2, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
478 store <32 x float> %s, <32 x float> *%dst, align 4
; vst2_v4f32_align1: same loads and interleave mask as vst2_v4f32, but the
; <8 x float> store to %dst is only align 1 (the loads remain align 4).
; The expected output contains no vst20/vst21: the interleave is done with
; per-lane vmov.f32 copies into q2 and q3, which are written with vstrb.8
; (upper half at [r1, #16], lower half at [r1]).
482 define void @vst2_v4f32_align1(<4 x float> *%src, <8 x float> *%dst) {
483 ; CHECK-LABEL: vst2_v4f32_align1:
484 ; CHECK: @ %bb.0: @ %entry
485 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
486 ; CHECK-NEXT: vldrw.u32 q1, [r0]
487 ; CHECK-NEXT: vmov.f32 s8, s6
488 ; CHECK-NEXT: vmov.f32 s9, s2
489 ; CHECK-NEXT: vmov.f32 s10, s7
490 ; CHECK-NEXT: vmov.f32 s11, s3
491 ; CHECK-NEXT: vmov.f32 s12, s4
492 ; CHECK-NEXT: vstrb.8 q2, [r1, #16]
493 ; CHECK-NEXT: vmov.f32 s13, s0
494 ; CHECK-NEXT: vmov.f32 s14, s5
495 ; CHECK-NEXT: vmov.f32 s15, s1
496 ; CHECK-NEXT: vstrb.8 q3, [r1]
499 %s1 = getelementptr <4 x float>, <4 x float>* %src, i32 0
500 %l1 = load <4 x float>, <4 x float>* %s1, align 4
501 %s2 = getelementptr <4 x float>, <4 x float>* %src, i32 1
502 %l2 = load <4 x float>, <4 x float>* %s2, align 4
503 %s = shufflevector <4 x float> %l1, <4 x float> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
504 store <8 x float> %s, <8 x float> *%dst, align 1
; vst2_v2f16: loads two <2 x half> vectors from %src (indices 0 and 1),
; interleaves them with mask <0,2,1,3> and stores the <4 x half> result to
; %dst (align 2).
; The expected output loads both inputs with one ldrd, moves them into vector
; lanes, interleaves the halves with vmovx.f16/vins.f16, then moves d2 back to
; r0/r2 and writes the result with two scalar str instructions.
510 define void @vst2_v2f16(<2 x half> *%src, <4 x half> *%dst) {
511 ; CHECK-LABEL: vst2_v2f16:
512 ; CHECK: @ %bb.0: @ %entry
513 ; CHECK-NEXT: ldrd r2, r0, [r0]
514 ; CHECK-NEXT: vmov.32 q1[0], r2
515 ; CHECK-NEXT: vmov.32 q0[0], r0
516 ; CHECK-NEXT: vmovx.f16 s5, s4
517 ; CHECK-NEXT: vins.f16 s4, s0
518 ; CHECK-NEXT: vmovx.f16 s0, s0
519 ; CHECK-NEXT: vins.f16 s5, s0
520 ; CHECK-NEXT: vmov r0, r2, d2
521 ; CHECK-NEXT: str r2, [r1, #4]
522 ; CHECK-NEXT: str r0, [r1]
525 %s1 = getelementptr <2 x half>, <2 x half>* %src, i32 0
526 %l1 = load <2 x half>, <2 x half>* %s1, align 4
527 %s2 = getelementptr <2 x half>, <2 x half>* %src, i32 1
528 %l2 = load <2 x half>, <2 x half>* %s2, align 4
529 %s = shufflevector <2 x half> %l1, <2 x half> %l2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
530 store <4 x half> %s, <4 x half> *%dst, align 2
; vst2_v4f16: loads two <4 x half> vectors from %src (indices 0 and 1),
; interleaves them lane by lane (mask 0,4,1,5,2,6,3,7) and stores the
; <8 x half> result to %dst (align 2).
; The expected output loads the inputs with two ldrd instructions, moves them
; into q0/q1 lanes, interleaves the halves with vmovx.f16/vins.f16, gathers the
; result into q2 and stores it with a single vstrh.16.
534 define void @vst2_v4f16(<4 x half> *%src, <8 x half> *%dst) {
535 ; CHECK-LABEL: vst2_v4f16:
536 ; CHECK: @ %bb.0: @ %entry
537 ; CHECK-NEXT: ldrd r2, r12, [r0]
538 ; CHECK-NEXT: ldrd r3, r0, [r0, #8]
539 ; CHECK-NEXT: vmov.32 q0[0], r2
540 ; CHECK-NEXT: vmov.32 q1[0], r3
541 ; CHECK-NEXT: vmov.32 q0[1], r12
542 ; CHECK-NEXT: vmov.32 q1[1], r0
543 ; CHECK-NEXT: vmovx.f16 s2, s0
544 ; CHECK-NEXT: vins.f16 s0, s4
545 ; CHECK-NEXT: vmovx.f16 s4, s4
546 ; CHECK-NEXT: vins.f16 s2, s4
547 ; CHECK-NEXT: vmovx.f16 s4, s1
548 ; CHECK-NEXT: vins.f16 s1, s5
549 ; CHECK-NEXT: vmovx.f16 s6, s5
550 ; CHECK-NEXT: vmov q2, q0
551 ; CHECK-NEXT: vins.f16 s4, s6
552 ; CHECK-NEXT: vmov.f32 s9, s2
553 ; CHECK-NEXT: vmov.f32 s10, s1
554 ; CHECK-NEXT: vmov.f32 s11, s4
555 ; CHECK-NEXT: vstrh.16 q2, [r1]
558 %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0
559 %l1 = load <4 x half>, <4 x half>* %s1, align 4
560 %s2 = getelementptr <4 x half>, <4 x half>* %src, i32 1
561 %l2 = load <4 x half>, <4 x half>* %s2, align 4
562 %s = shufflevector <4 x half> %l1, <4 x half> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
563 store <8 x half> %s, <8 x half> *%dst, align 2
; vst2_v8f16: loads two <8 x half> vectors from %src (indices 0 and 1),
; interleaves them lane by lane (mask 0,8,1,9,...,7,15) and stores the
; <16 x half> result to %dst (align 2).
; The expected output is two vldrw.u32 loads followed by a vst20.16/vst21.16
; pair on {q0, q1}, the same sequence expected for vst2_v8i16.
567 define void @vst2_v8f16(<8 x half> *%src, <16 x half> *%dst) {
568 ; CHECK-LABEL: vst2_v8f16:
569 ; CHECK: @ %bb.0: @ %entry
570 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
571 ; CHECK-NEXT: vldrw.u32 q0, [r0]
572 ; CHECK-NEXT: vst20.16 {q0, q1}, [r1]
573 ; CHECK-NEXT: vst21.16 {q0, q1}, [r1]
576 %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0
577 %l1 = load <8 x half>, <8 x half>* %s1, align 4
578 %s2 = getelementptr <8 x half>, <8 x half>* %src, i32 1
579 %l2 = load <8 x half>, <8 x half>* %s2, align 4
580 %s = shufflevector <8 x half> %l1, <8 x half> %l2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
581 store <16 x half> %s, <16 x half> *%dst, align 2
; vst2_v16f16: loads two <16 x half> vectors from %src (indices 0 and 1),
; interleaves them lane by lane (mask 0,16,1,17,...,15,31) and stores the
; <32 x half> result to %dst (align 2).
; The expected output is four vldrw.u32 loads and two vst20.16/vst21.16 pairs
; ({q2, q3} first, then {q0, q1}); the first vst21.16 uses the writeback form
; ([r1]!) before the second pair.
585 define void @vst2_v16f16(<16 x half> *%src, <32 x half> *%dst) {
586 ; CHECK-LABEL: vst2_v16f16:
587 ; CHECK: @ %bb.0: @ %entry
588 ; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
589 ; CHECK-NEXT: vldrw.u32 q2, [r0]
590 ; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
591 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
592 ; CHECK-NEXT: vst20.16 {q2, q3}, [r1]
593 ; CHECK-NEXT: vst21.16 {q2, q3}, [r1]!
594 ; CHECK-NEXT: vst20.16 {q0, q1}, [r1]
595 ; CHECK-NEXT: vst21.16 {q0, q1}, [r1]
598 %s1 = getelementptr <16 x half>, <16 x half>* %src, i32 0
599 %l1 = load <16 x half>, <16 x half>* %s1, align 4
600 %s2 = getelementptr <16 x half>, <16 x half>* %src, i32 1
601 %l2 = load <16 x half>, <16 x half>* %s2, align 4
602 %s = shufflevector <16 x half> %l1, <16 x half> %l2, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
603 store <32 x half> %s, <32 x half> *%dst, align 2
; vst2_v8f16_align1: same loads and interleave mask as vst2_v8f16, but the
; <16 x half> store to %dst is only align 1 (the loads remain align 4).
; The expected output contains no vst20/vst21: the halfword interleave is built
; from vmovx.f16/vins.f16 plus vmov.f32 lane copies, and the two result
; registers are written with vstrb.8 (q0 at [r1, #16], q3 at [r1]).
607 define void @vst2_v8f16_align1(<8 x half> *%src, <16 x half> *%dst) {
608 ; CHECK-LABEL: vst2_v8f16_align1:
609 ; CHECK: @ %bb.0: @ %entry
610 ; CHECK-NEXT: vldrw.u32 q1, [r0]
611 ; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
612 ; CHECK-NEXT: vmovx.f16 s1, s6
613 ; CHECK-NEXT: vmovx.f16 s0, s10
614 ; CHECK-NEXT: vins.f16 s1, s0
615 ; CHECK-NEXT: vmovx.f16 s3, s7
616 ; CHECK-NEXT: vmovx.f16 s0, s11
617 ; CHECK-NEXT: vins.f16 s6, s10
618 ; CHECK-NEXT: vins.f16 s3, s0
619 ; CHECK-NEXT: vmovx.f16 s10, s4
620 ; CHECK-NEXT: vmovx.f16 s0, s8
621 ; CHECK-NEXT: vins.f16 s7, s11
622 ; CHECK-NEXT: vins.f16 s4, s8
623 ; CHECK-NEXT: vins.f16 s10, s0
624 ; CHECK-NEXT: vmovx.f16 s8, s5
625 ; CHECK-NEXT: vins.f16 s5, s9
626 ; CHECK-NEXT: vmovx.f16 s0, s9
627 ; CHECK-NEXT: vmov q3, q1
628 ; CHECK-NEXT: vins.f16 s8, s0
629 ; CHECK-NEXT: vmov.f32 s0, s6
630 ; CHECK-NEXT: vmov.f32 s2, s7
631 ; CHECK-NEXT: vmov.f32 s13, s10
632 ; CHECK-NEXT: vstrb.8 q0, [r1, #16]
633 ; CHECK-NEXT: vmov.f32 s14, s5
634 ; CHECK-NEXT: vmov.f32 s15, s8
635 ; CHECK-NEXT: vstrb.8 q3, [r1]
638 %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0
639 %l1 = load <8 x half>, <8 x half>* %s1, align 4
640 %s2 = getelementptr <8 x half>, <8 x half>* %src, i32 1
641 %l2 = load <8 x half>, <8 x half>* %s2, align 4
642 %s = shufflevector <8 x half> %l1, <8 x half> %l2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
643 store <16 x half> %s, <16 x half> *%dst, align 1
; vst2_v2f64: loads two <2 x double> vectors from %src (indices 0 and 1),
; interleaves them with mask <0,2,1,3> and stores the <4 x double> result to
; %dst (align 8).
; The expected output contains no vst20/vst21: the elements are rearranged with
; three vmov.f64 d-register copies and written with two vstrw.32 stores
; (q2 at [r1, #16], q1 at [r1]).
649 define void @vst2_v2f64(<2 x double> *%src, <4 x double> *%dst) {
650 ; CHECK-LABEL: vst2_v2f64:
651 ; CHECK: @ %bb.0: @ %entry
652 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
653 ; CHECK-NEXT: vldrw.u32 q1, [r0]
654 ; CHECK-NEXT: vmov.f64 d4, d3
655 ; CHECK-NEXT: vmov.f64 d5, d1
656 ; CHECK-NEXT: vmov.f64 d3, d0
657 ; CHECK-NEXT: vstrw.32 q2, [r1, #16]
658 ; CHECK-NEXT: vstrw.32 q1, [r1]
661 %s1 = getelementptr <2 x double>, <2 x double>* %src, i32 0
662 %l1 = load <2 x double>, <2 x double>* %s1, align 4
663 %s2 = getelementptr <2 x double>, <2 x double>* %src, i32 1
664 %l2 = load <2 x double>, <2 x double>* %s2, align 4
665 %s = shufflevector <2 x double> %l1, <2 x double> %l2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
666 store <4 x double> %s, <4 x double> *%dst, align 8
; vst2_v4f64: loads two <4 x double> vectors from %src (indices 0 and 1),
; interleaves them lane by lane (mask 0,4,1,5,2,6,3,7) and stores the
; <8 x double> result to %dst (align 8).
; The expected output contains no vst20/vst21: the elements are rearranged with
; vmov.f64 d-register copies using q0-q4 (with a vpush/vpop of d8-d9) and
; written with four vstrw.32 stores at offsets 0, 16, 32 and 48 from r1.
670 define void @vst2_v4f64(<4 x double> *%src, <8 x double> *%dst) {
671 ; CHECK-LABEL: vst2_v4f64:
672 ; CHECK: @ %bb.0: @ %entry
673 ; CHECK-NEXT: .vsave {d8, d9}
674 ; CHECK-NEXT: vpush {d8, d9}
675 ; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
676 ; CHECK-NEXT: vldrw.u32 q2, [r0]
677 ; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
678 ; CHECK-NEXT: vldrw.u32 q3, [r0, #16]
679 ; CHECK-NEXT: vmov.f64 d8, d4
680 ; CHECK-NEXT: vmov.f64 d9, d0
681 ; CHECK-NEXT: vmov.f64 d0, d5
682 ; CHECK-NEXT: vstrw.32 q4, [r1]
683 ; CHECK-NEXT: vmov.f64 d5, d2
684 ; CHECK-NEXT: vstrw.32 q0, [r1, #16]
685 ; CHECK-NEXT: vmov.f64 d4, d6
686 ; CHECK-NEXT: vmov.f64 d2, d7
687 ; CHECK-NEXT: vstrw.32 q2, [r1, #32]
688 ; CHECK-NEXT: vstrw.32 q1, [r1, #48]
689 ; CHECK-NEXT: vpop {d8, d9}
692 %s1 = getelementptr <4 x double>, <4 x double>* %src, i32 0
693 %l1 = load <4 x double>, <4 x double>* %s1, align 4
694 %s2 = getelementptr <4 x double>, <4 x double>* %src, i32 1
695 %l2 = load <4 x double>, <4 x double>* %s2, align 4
696 %s = shufflevector <4 x double> %l1, <4 x double> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
697 store <8 x double> %s, <8 x double> *%dst, align 8