1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -mve-max-interleave-factor=4 -verify-machineinstrs %s -o - | FileCheck %s
; vst4_v2i32: loads four consecutive <2 x i32> vectors from %src and stores them
; to %dst interleaved element-wise (a0 b0 c0 d0 a1 b1 c1 d1) as one <8 x i32>.
; The expected output below uses scalar ldrd/ldm/ldr loads, vmov lane moves and
; two vstrw.32 stores; it contains no vst4x instructions.
6 define void @vst4_v2i32(ptr %src, ptr %dst) {
7 ; CHECK-LABEL: vst4_v2i32:
8 ; CHECK: @ %bb.0: @ %entry
9 ; CHECK-NEXT: .save {r4, r5, r6, lr}
10 ; CHECK-NEXT: push {r4, r5, r6, lr}
11 ; CHECK-NEXT: add.w r6, r0, #16
12 ; CHECK-NEXT: ldrd lr, r12, [r0]
13 ; CHECK-NEXT: ldrd r3, r2, [r0, #8]
14 ; CHECK-NEXT: ldm r6, {r4, r5, r6}
15 ; CHECK-NEXT: vmov q1[2], q1[0], lr, r3
16 ; CHECK-NEXT: ldr r0, [r0, #28]
17 ; CHECK-NEXT: vmov q1[3], q1[1], r12, r2
18 ; CHECK-NEXT: vmov q0[2], q0[0], r4, r6
19 ; CHECK-NEXT: vmov.f32 s8, s4
20 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r0
21 ; CHECK-NEXT: vmov.f32 s9, s6
22 ; CHECK-NEXT: vmov.f32 s4, s5
23 ; CHECK-NEXT: vmov.f32 s5, s7
24 ; CHECK-NEXT: vmov.f32 s10, s0
25 ; CHECK-NEXT: vmov.f32 s11, s2
26 ; CHECK-NEXT: vmov.f32 s6, s1
27 ; CHECK-NEXT: vstrw.32 q2, [r1]
28 ; CHECK-NEXT: vmov.f32 s7, s3
29 ; CHECK-NEXT: vstrw.32 q1, [r1, #16]
30 ; CHECK-NEXT: pop {r4, r5, r6, pc}
32 %l1 = load <2 x i32>, ptr %src, align 4
33 %s2 = getelementptr <2 x i32>, ptr %src, i32 1
34 %l2 = load <2 x i32>, ptr %s2, align 4
35 %s3 = getelementptr <2 x i32>, ptr %src, i32 2
36 %l3 = load <2 x i32>, ptr %s3, align 4
37 %s4 = getelementptr <2 x i32>, ptr %src, i32 3
38 %l4 = load <2 x i32>, ptr %s4, align 4
39 %t1 = shufflevector <2 x i32> %l1, <2 x i32> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
40 %t2 = shufflevector <2 x i32> %l3, <2 x i32> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
41 %s = shufflevector <4 x i32> %t1, <4 x i32> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
42 store <8 x i32> %s, ptr %dst, align 4
; vst4_v4i32: loads four consecutive <4 x i32> vectors from %src and stores them
; to %dst interleaved element-wise (a0 b0 c0 d0 a1 b1 c1 d1 ...) as <16 x i32>.
; The expected output is four vldrw.u32 loads followed by one
; vst40/vst41/vst42/vst43.32 group.
46 define void @vst4_v4i32(ptr %src, ptr %dst) {
47 ; CHECK-LABEL: vst4_v4i32:
48 ; CHECK: @ %bb.0: @ %entry
49 ; CHECK-NEXT: vldrw.u32 q3, [r0, #48]
50 ; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
51 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
52 ; CHECK-NEXT: vldrw.u32 q0, [r0]
53 ; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r1]
54 ; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r1]
55 ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r1]
56 ; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r1]
59 %l1 = load <4 x i32>, ptr %src, align 4
60 %s2 = getelementptr <4 x i32>, ptr %src, i32 1
61 %l2 = load <4 x i32>, ptr %s2, align 4
62 %s3 = getelementptr <4 x i32>, ptr %src, i32 2
63 %l3 = load <4 x i32>, ptr %s3, align 4
64 %s4 = getelementptr <4 x i32>, ptr %src, i32 3
65 %l4 = load <4 x i32>, ptr %s4, align 4
66 %t1 = shufflevector <4 x i32> %l1, <4 x i32> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
67 %t2 = shufflevector <4 x i32> %l3, <4 x i32> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
68 %s = shufflevector <8 x i32> %t1, <8 x i32> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
69 store <16 x i32> %s, ptr %dst, align 4
; vst4_v8i32: loads four consecutive <8 x i32> vectors from %src and stores them
; to %dst interleaved element-wise as <32 x i32>.
; The expected output is two vst40..vst43.32 groups; the last store of the first
; group uses the writeback form "[r1]!" and the second group reuses r1.
; d8-d15 are saved and restored around the body.
73 define void @vst4_v8i32(ptr %src, ptr %dst) {
74 ; CHECK-LABEL: vst4_v8i32:
75 ; CHECK: @ %bb.0: @ %entry
76 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
77 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
78 ; CHECK-NEXT: vldrw.u32 q7, [r0, #96]
79 ; CHECK-NEXT: vldrw.u32 q6, [r0, #64]
80 ; CHECK-NEXT: vldrw.u32 q5, [r0, #32]
81 ; CHECK-NEXT: vldrw.u32 q4, [r0]
82 ; CHECK-NEXT: vldrw.u32 q3, [r0, #112]
83 ; CHECK-NEXT: vldrw.u32 q2, [r0, #80]
84 ; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
85 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
86 ; CHECK-NEXT: vst40.32 {q4, q5, q6, q7}, [r1]
87 ; CHECK-NEXT: vst41.32 {q4, q5, q6, q7}, [r1]
88 ; CHECK-NEXT: vst42.32 {q4, q5, q6, q7}, [r1]
89 ; CHECK-NEXT: vst43.32 {q4, q5, q6, q7}, [r1]!
90 ; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r1]
91 ; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r1]
92 ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r1]
93 ; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r1]
94 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
97 %l1 = load <8 x i32>, ptr %src, align 4
98 %s2 = getelementptr <8 x i32>, ptr %src, i32 1
99 %l2 = load <8 x i32>, ptr %s2, align 4
100 %s3 = getelementptr <8 x i32>, ptr %src, i32 2
101 %l3 = load <8 x i32>, ptr %s3, align 4
102 %s4 = getelementptr <8 x i32>, ptr %src, i32 3
103 %l4 = load <8 x i32>, ptr %s4, align 4
104 %t1 = shufflevector <8 x i32> %l1, <8 x i32> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
105 %t2 = shufflevector <8 x i32> %l3, <8 x i32> %l4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
106 %s = shufflevector <16 x i32> %t1, <16 x i32> %t2, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
107 store <32 x i32> %s, ptr %dst, align 4
; vst4_v16i32: loads four consecutive <16 x i32> vectors from %src and stores
; them to %dst interleaved element-wise as <64 x i32>.
; The expected output is four vst40..vst43.32 groups. Sixteen q-register values
; are needed, so the expected code reserves 192 bytes of stack (.pad #192) and
; moves groups of four q registers through it with 64-byte vstmia/vldmia
; spill/reload pairs.
111 define void @vst4_v16i32(ptr %src, ptr %dst) {
112 ; CHECK-LABEL: vst4_v16i32:
113 ; CHECK: @ %bb.0: @ %entry
114 ; CHECK-NEXT: .save {r4, r5}
115 ; CHECK-NEXT: push {r4, r5}
116 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
117 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
118 ; CHECK-NEXT: .pad #192
119 ; CHECK-NEXT: sub sp, #192
120 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
121 ; CHECK-NEXT: add r2, sp, #64
122 ; CHECK-NEXT: vldrw.u32 q4, [r0, #176]
123 ; CHECK-NEXT: vldrw.u32 q3, [r0, #208]
124 ; CHECK-NEXT: vldrw.u32 q2, [r0, #144]
125 ; CHECK-NEXT: vldrw.u32 q1, [r0, #80]
126 ; CHECK-NEXT: vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
127 ; CHECK-NEXT: vldrw.u32 q0, [r0]
128 ; CHECK-NEXT: vldrw.u32 q2, [r0, #128]
129 ; CHECK-NEXT: vldrw.u32 q5, [r0, #240]
130 ; CHECK-NEXT: vmov q6, q4
131 ; CHECK-NEXT: vldrw.u32 q3, [r0, #192]
132 ; CHECK-NEXT: vldrw.u32 q1, [r0, #64]
133 ; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
134 ; CHECK-NEXT: vldrw.u32 q2, [r0, #160]
135 ; CHECK-NEXT: vldrw.u32 q4, [r0, #48]
136 ; CHECK-NEXT: add r2, sp, #128
137 ; CHECK-NEXT: vmov q7, q5
138 ; CHECK-NEXT: vldrw.u32 q3, [r0, #224]
139 ; CHECK-NEXT: vldrw.u32 q1, [r0, #96]
140 ; CHECK-NEXT: vldrw.u32 q5, [r0, #112]
141 ; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
142 ; CHECK-NEXT: vmov q6, q2
143 ; CHECK-NEXT: vmov q5, q1
144 ; CHECK-NEXT: vmov q7, q3
145 ; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
146 ; CHECK-NEXT: add r2, sp, #64
147 ; CHECK-NEXT: vldrw.u32 q4, [r0, #32]
148 ; CHECK-NEXT: mov r0, r1
149 ; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r1]
150 ; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r1]
151 ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r1]
152 ; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r0]!
153 ; CHECK-NEXT: vldmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
154 ; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r0]
155 ; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r0]
156 ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r0]
157 ; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r0]
158 ; CHECK-NEXT: add.w r0, r1, #192
159 ; CHECK-NEXT: adds r1, #128
160 ; CHECK-NEXT: vst40.32 {q4, q5, q6, q7}, [r1]
161 ; CHECK-NEXT: vst41.32 {q4, q5, q6, q7}, [r1]
162 ; CHECK-NEXT: vst42.32 {q4, q5, q6, q7}, [r1]
163 ; CHECK-NEXT: vst43.32 {q4, q5, q6, q7}, [r1]
164 ; CHECK-NEXT: add r1, sp, #128
165 ; CHECK-NEXT: vldmia r1, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
166 ; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r0]
167 ; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r0]
168 ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r0]
169 ; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r0]
170 ; CHECK-NEXT: add sp, #192
171 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
172 ; CHECK-NEXT: pop {r4, r5}
175 %l1 = load <16 x i32>, ptr %src, align 4
176 %s2 = getelementptr <16 x i32>, ptr %src, i32 1
177 %l2 = load <16 x i32>, ptr %s2, align 4
178 %s3 = getelementptr <16 x i32>, ptr %src, i32 2
179 %l3 = load <16 x i32>, ptr %s3, align 4
180 %s4 = getelementptr <16 x i32>, ptr %src, i32 3
181 %l4 = load <16 x i32>, ptr %s4, align 4
182 %t1 = shufflevector <16 x i32> %l1, <16 x i32> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
183 %t2 = shufflevector <16 x i32> %l3, <16 x i32> %l4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
184 %s = shufflevector <32 x i32> %t1, <32 x i32> %t2, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
185 store <64 x i32> %s, ptr %dst, align 4
; vst4_v4i32_align1: same IR as the <4 x i32> interleaving store, except that the
; final <16 x i32> store to %dst is only "align 1" (the loads stay align 4).
; The expected output contains no vst4x instructions: the interleave is built
; with vmov.f32 lane moves and written with four vstrb.8 stores.
189 define void @vst4_v4i32_align1(ptr %src, ptr %dst) {
190 ; CHECK-LABEL: vst4_v4i32_align1:
191 ; CHECK: @ %bb.0: @ %entry
192 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
193 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
194 ; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
195 ; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
196 ; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
197 ; CHECK-NEXT: vldrw.u32 q4, [r0]
198 ; CHECK-NEXT: vmov.f32 s14, s1
199 ; CHECK-NEXT: vmov.f32 s22, s0
200 ; CHECK-NEXT: vmov.f32 s26, s3
201 ; CHECK-NEXT: vmov.f32 s12, s17
202 ; CHECK-NEXT: vmov.f32 s13, s9
203 ; CHECK-NEXT: vmov.f32 s15, s5
204 ; CHECK-NEXT: vmov.f32 s20, s16
205 ; CHECK-NEXT: vstrb.8 q3, [r1, #16]
206 ; CHECK-NEXT: vmov.f32 s21, s8
207 ; CHECK-NEXT: vmov.f32 s23, s4
208 ; CHECK-NEXT: vmov.f32 s24, s19
209 ; CHECK-NEXT: vstrb.8 q5, [r1]
210 ; CHECK-NEXT: vmov.f32 s25, s11
211 ; CHECK-NEXT: vmov.f32 s27, s7
212 ; CHECK-NEXT: vmov.f32 s0, s18
213 ; CHECK-NEXT: vstrb.8 q6, [r1, #48]
214 ; CHECK-NEXT: vmov.f32 s1, s10
215 ; CHECK-NEXT: vmov.f32 s3, s6
216 ; CHECK-NEXT: vstrb.8 q0, [r1, #32]
217 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
220 %l1 = load <4 x i32>, ptr %src, align 4
221 %s2 = getelementptr <4 x i32>, ptr %src, i32 1
222 %l2 = load <4 x i32>, ptr %s2, align 4
223 %s3 = getelementptr <4 x i32>, ptr %src, i32 2
224 %l3 = load <4 x i32>, ptr %s3, align 4
225 %s4 = getelementptr <4 x i32>, ptr %src, i32 3
226 %l4 = load <4 x i32>, ptr %s4, align 4
227 %t1 = shufflevector <4 x i32> %l1, <4 x i32> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
228 %t2 = shufflevector <4 x i32> %l3, <4 x i32> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
229 %s = shufflevector <8 x i32> %t1, <8 x i32> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
230 store <16 x i32> %s, ptr %dst, align 1
; vst4_v2i16: loads four consecutive <2 x i16> vectors from %src and stores them
; to %dst interleaved element-wise (a0 b0 c0 d0 a1 b1 c1 d1) as one <8 x i16>.
; The expected output uses scalar ldrh loads, vmov.16 lane inserts and a single
; vstrh.16 store; it contains no vst4x instructions.
236 define void @vst4_v2i16(ptr %src, ptr %dst) {
237 ; CHECK-LABEL: vst4_v2i16:
238 ; CHECK: @ %bb.0: @ %entry
239 ; CHECK-NEXT: .save {r4, r5, r6, lr}
240 ; CHECK-NEXT: push {r4, r5, r6, lr}
241 ; CHECK-NEXT: ldrh r3, [r0, #2]
242 ; CHECK-NEXT: ldrh r2, [r0]
243 ; CHECK-NEXT: ldrh.w r12, [r0, #10]
244 ; CHECK-NEXT: ldrh.w lr, [r0, #4]
245 ; CHECK-NEXT: vmov q1[2], q1[0], r2, r3
246 ; CHECK-NEXT: ldrh r4, [r0, #12]
247 ; CHECK-NEXT: ldrh r5, [r0, #6]
248 ; CHECK-NEXT: ldrh r6, [r0, #14]
249 ; CHECK-NEXT: ldrh r0, [r0, #8]
250 ; CHECK-NEXT: vmov q0[2], q0[0], r0, r12
251 ; CHECK-NEXT: vmov r0, s4
252 ; CHECK-NEXT: vmov.16 q1[0], r0
253 ; CHECK-NEXT: vmov r0, s0
254 ; CHECK-NEXT: vmov.16 q1[1], lr
255 ; CHECK-NEXT: vmov.16 q1[2], r0
256 ; CHECK-NEXT: vmov.16 q1[3], r4
257 ; CHECK-NEXT: vmov.16 q1[4], r3
258 ; CHECK-NEXT: vmov.16 q1[5], r5
259 ; CHECK-NEXT: vmov.16 q1[6], r12
260 ; CHECK-NEXT: vmov.16 q1[7], r6
261 ; CHECK-NEXT: vstrh.16 q1, [r1]
262 ; CHECK-NEXT: pop {r4, r5, r6, pc}
264 %l1 = load <2 x i16>, ptr %src, align 4
265 %s2 = getelementptr <2 x i16>, ptr %src, i32 1
266 %l2 = load <2 x i16>, ptr %s2, align 4
267 %s3 = getelementptr <2 x i16>, ptr %src, i32 2
268 %l3 = load <2 x i16>, ptr %s3, align 4
269 %s4 = getelementptr <2 x i16>, ptr %src, i32 3
270 %l4 = load <2 x i16>, ptr %s4, align 4
271 %t1 = shufflevector <2 x i16> %l1, <2 x i16> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
272 %t2 = shufflevector <2 x i16> %l3, <2 x i16> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
273 %s = shufflevector <4 x i16> %t1, <4 x i16> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
274 store <8 x i16> %s, ptr %dst, align 2
; vst4_v4i16: loads four consecutive <4 x i16> vectors from %src and stores them
; to %dst interleaved element-wise as <16 x i16>.
; The expected output loads each input with a widening vldrh.u32, rebuilds the
; interleaved result lane by lane with vmov.16, and writes it with two vstrh.16
; stores; it contains no vst4x instructions.
278 define void @vst4_v4i16(ptr %src, ptr %dst) {
279 ; CHECK-LABEL: vst4_v4i16:
280 ; CHECK: @ %bb.0: @ %entry
281 ; CHECK-NEXT: .save {r4, r5, r6, lr}
282 ; CHECK-NEXT: push {r4, r5, r6, lr}
283 ; CHECK-NEXT: .vsave {d8, d9}
284 ; CHECK-NEXT: vpush {d8, d9}
285 ; CHECK-NEXT: vldrh.u32 q0, [r0]
286 ; CHECK-NEXT: vldrh.u32 q3, [r0, #8]
287 ; CHECK-NEXT: vldrh.u32 q1, [r0, #16]
288 ; CHECK-NEXT: vldrh.u32 q4, [r0, #24]
289 ; CHECK-NEXT: vmov r3, r4, d1
290 ; CHECK-NEXT: vmov r5, r12, d0
291 ; CHECK-NEXT: vmov.16 q2[0], r3
292 ; CHECK-NEXT: vmov.16 q0[0], r5
293 ; CHECK-NEXT: vmov r0, r5, d7
294 ; CHECK-NEXT: vmov.16 q2[1], r0
295 ; CHECK-NEXT: vmov r2, lr, d3
296 ; CHECK-NEXT: vmov r0, r3, d9
297 ; CHECK-NEXT: vmov.16 q2[2], r2
298 ; CHECK-NEXT: vmov.16 q2[3], r0
299 ; CHECK-NEXT: vmov r0, r6, d8
300 ; CHECK-NEXT: vmov.16 q2[4], r4
301 ; CHECK-NEXT: vmov.16 q2[5], r5
302 ; CHECK-NEXT: vmov r4, r5, d6
303 ; CHECK-NEXT: vmov.16 q2[6], lr
304 ; CHECK-NEXT: vmov.16 q0[1], r4
305 ; CHECK-NEXT: vmov.16 q2[7], r3
306 ; CHECK-NEXT: vmov r3, r2, d2
307 ; CHECK-NEXT: vmov.16 q0[2], r3
308 ; CHECK-NEXT: vstrh.16 q2, [r1, #16]
309 ; CHECK-NEXT: vmov.16 q0[3], r0
310 ; CHECK-NEXT: vmov.16 q0[4], r12
311 ; CHECK-NEXT: vmov.16 q0[5], r5
312 ; CHECK-NEXT: vmov.16 q0[6], r2
313 ; CHECK-NEXT: vmov.16 q0[7], r6
314 ; CHECK-NEXT: vstrh.16 q0, [r1]
315 ; CHECK-NEXT: vpop {d8, d9}
316 ; CHECK-NEXT: pop {r4, r5, r6, pc}
318 %l1 = load <4 x i16>, ptr %src, align 4
319 %s2 = getelementptr <4 x i16>, ptr %src, i32 1
320 %l2 = load <4 x i16>, ptr %s2, align 4
321 %s3 = getelementptr <4 x i16>, ptr %src, i32 2
322 %l3 = load <4 x i16>, ptr %s3, align 4
323 %s4 = getelementptr <4 x i16>, ptr %src, i32 3
324 %l4 = load <4 x i16>, ptr %s4, align 4
325 %t1 = shufflevector <4 x i16> %l1, <4 x i16> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
326 %t2 = shufflevector <4 x i16> %l3, <4 x i16> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
327 %s = shufflevector <8 x i16> %t1, <8 x i16> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
328 store <16 x i16> %s, ptr %dst, align 2
; vst4_v8i16: loads four consecutive <8 x i16> vectors from %src and stores them
; to %dst interleaved element-wise as <32 x i16> (store is align 2).
; The expected output is four vldrw.u32 loads followed by one
; vst40/vst41/vst42/vst43.16 group.
332 define void @vst4_v8i16(ptr %src, ptr %dst) {
333 ; CHECK-LABEL: vst4_v8i16:
334 ; CHECK: @ %bb.0: @ %entry
335 ; CHECK-NEXT: vldrw.u32 q3, [r0, #48]
336 ; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
337 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
338 ; CHECK-NEXT: vldrw.u32 q0, [r0]
339 ; CHECK-NEXT: vst40.16 {q0, q1, q2, q3}, [r1]
340 ; CHECK-NEXT: vst41.16 {q0, q1, q2, q3}, [r1]
341 ; CHECK-NEXT: vst42.16 {q0, q1, q2, q3}, [r1]
342 ; CHECK-NEXT: vst43.16 {q0, q1, q2, q3}, [r1]
345 %l1 = load <8 x i16>, ptr %src, align 4
346 %s2 = getelementptr <8 x i16>, ptr %src, i32 1
347 %l2 = load <8 x i16>, ptr %s2, align 4
348 %s3 = getelementptr <8 x i16>, ptr %src, i32 2
349 %l3 = load <8 x i16>, ptr %s3, align 4
350 %s4 = getelementptr <8 x i16>, ptr %src, i32 3
351 %l4 = load <8 x i16>, ptr %s4, align 4
352 %t1 = shufflevector <8 x i16> %l1, <8 x i16> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
353 %t2 = shufflevector <8 x i16> %l3, <8 x i16> %l4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
354 %s = shufflevector <16 x i16> %t1, <16 x i16> %t2, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
355 store <32 x i16> %s, ptr %dst, align 2
; vst4_v16i16: loads four consecutive <16 x i16> vectors from %src and stores
; them to %dst interleaved element-wise as <64 x i16>.
; The expected output is two vst40..vst43.16 groups; the last store of the first
; group uses the writeback form "[r1]!" and the second group reuses r1.
; d8-d15 are saved and restored around the body.
359 define void @vst4_v16i16(ptr %src, ptr %dst) {
360 ; CHECK-LABEL: vst4_v16i16:
361 ; CHECK: @ %bb.0: @ %entry
362 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
363 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
364 ; CHECK-NEXT: vldrw.u32 q7, [r0, #96]
365 ; CHECK-NEXT: vldrw.u32 q6, [r0, #64]
366 ; CHECK-NEXT: vldrw.u32 q5, [r0, #32]
367 ; CHECK-NEXT: vldrw.u32 q4, [r0]
368 ; CHECK-NEXT: vldrw.u32 q3, [r0, #112]
369 ; CHECK-NEXT: vldrw.u32 q2, [r0, #80]
370 ; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
371 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
372 ; CHECK-NEXT: vst40.16 {q4, q5, q6, q7}, [r1]
373 ; CHECK-NEXT: vst41.16 {q4, q5, q6, q7}, [r1]
374 ; CHECK-NEXT: vst42.16 {q4, q5, q6, q7}, [r1]
375 ; CHECK-NEXT: vst43.16 {q4, q5, q6, q7}, [r1]!
376 ; CHECK-NEXT: vst40.16 {q0, q1, q2, q3}, [r1]
377 ; CHECK-NEXT: vst41.16 {q0, q1, q2, q3}, [r1]
378 ; CHECK-NEXT: vst42.16 {q0, q1, q2, q3}, [r1]
379 ; CHECK-NEXT: vst43.16 {q0, q1, q2, q3}, [r1]
380 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
383 %l1 = load <16 x i16>, ptr %src, align 4
384 %s2 = getelementptr <16 x i16>, ptr %src, i32 1
385 %l2 = load <16 x i16>, ptr %s2, align 4
386 %s3 = getelementptr <16 x i16>, ptr %src, i32 2
387 %l3 = load <16 x i16>, ptr %s3, align 4
388 %s4 = getelementptr <16 x i16>, ptr %src, i32 3
389 %l4 = load <16 x i16>, ptr %s4, align 4
390 %t1 = shufflevector <16 x i16> %l1, <16 x i16> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
391 %t2 = shufflevector <16 x i16> %l3, <16 x i16> %l4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
392 %s = shufflevector <32 x i16> %t1, <32 x i16> %t2, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
393 store <64 x i16> %s, ptr %dst, align 2
; vst4_v8i16_align1: same IR as the <8 x i16> interleaving store, except that the
; final <32 x i16> store to %dst is only "align 1" (the loads stay align 4).
; The expected output contains no vst4x instructions: half-word lanes are
; rearranged with vmovx.f16 / vins.f16 / vmov.f32 and the result is written with
; four vstrb.8 stores.
397 define void @vst4_v8i16_align1(ptr %src, ptr %dst) {
398 ; CHECK-LABEL: vst4_v8i16_align1:
399 ; CHECK: @ %bb.0: @ %entry
400 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
401 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
402 ; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
403 ; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
404 ; CHECK-NEXT: vldrw.u32 q4, [r0, #16]
405 ; CHECK-NEXT: vmovx.f16 s12, s5
406 ; CHECK-NEXT: vmovx.f16 s0, s9
407 ; CHECK-NEXT: vins.f16 s5, s9
408 ; CHECK-NEXT: vins.f16 s12, s0
409 ; CHECK-NEXT: vmov q0, q1
410 ; CHECK-NEXT: vmovx.f16 s27, s4
411 ; CHECK-NEXT: vins.f16 s4, s8
412 ; CHECK-NEXT: vmov.f32 s3, s12
413 ; CHECK-NEXT: vldrw.u32 q3, [r0]
414 ; CHECK-NEXT: vmov.f32 s5, s4
415 ; CHECK-NEXT: vmovx.f16 s8, s8
416 ; CHECK-NEXT: vmovx.f16 s0, s17
417 ; CHECK-NEXT: vmovx.f16 s2, s13
418 ; CHECK-NEXT: vins.f16 s27, s8
419 ; CHECK-NEXT: vmovx.f16 s4, s12
420 ; CHECK-NEXT: vmovx.f16 s8, s16
421 ; CHECK-NEXT: vins.f16 s13, s17
422 ; CHECK-NEXT: vins.f16 s12, s16
423 ; CHECK-NEXT: vmov q5, q3
424 ; CHECK-NEXT: vins.f16 s4, s8
425 ; CHECK-NEXT: vmov.f32 s22, s4
426 ; CHECK-NEXT: vmovx.f16 s4, s11
427 ; CHECK-NEXT: vmov.f32 s23, s27
428 ; CHECK-NEXT: vmovx.f16 s27, s7
429 ; CHECK-NEXT: vins.f16 s7, s11
430 ; CHECK-NEXT: vins.f16 s27, s4
431 ; CHECK-NEXT: vmovx.f16 s26, s15
432 ; CHECK-NEXT: vmovx.f16 s4, s19
433 ; CHECK-NEXT: vmov.f32 s25, s7
434 ; CHECK-NEXT: vins.f16 s26, s4
435 ; CHECK-NEXT: vmovx.f16 s7, s6
436 ; CHECK-NEXT: vmovx.f16 s4, s10
437 ; CHECK-NEXT: vins.f16 s6, s10
438 ; CHECK-NEXT: vmov.f32 s21, s5
439 ; CHECK-NEXT: vins.f16 s15, s19
440 ; CHECK-NEXT: vins.f16 s7, s4
441 ; CHECK-NEXT: vmov.f32 s5, s6
442 ; CHECK-NEXT: vmovx.f16 s6, s14
443 ; CHECK-NEXT: vmovx.f16 s4, s18
444 ; CHECK-NEXT: vins.f16 s14, s18
445 ; CHECK-NEXT: vins.f16 s2, s0
446 ; CHECK-NEXT: vmov.f32 s0, s13
447 ; CHECK-NEXT: vmov.f32 s24, s15
448 ; CHECK-NEXT: vins.f16 s6, s4
449 ; CHECK-NEXT: vmov.f32 s4, s14
450 ; CHECK-NEXT: vstrb.8 q6, [r1, #48]
451 ; CHECK-NEXT: vstrb.8 q1, [r1, #32]
452 ; CHECK-NEXT: vstrb.8 q0, [r1, #16]
453 ; CHECK-NEXT: vstrb.8 q5, [r1]
454 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
457 %l1 = load <8 x i16>, ptr %src, align 4
458 %s2 = getelementptr <8 x i16>, ptr %src, i32 1
459 %l2 = load <8 x i16>, ptr %s2, align 4
460 %s3 = getelementptr <8 x i16>, ptr %src, i32 2
461 %l3 = load <8 x i16>, ptr %s3, align 4
462 %s4 = getelementptr <8 x i16>, ptr %src, i32 3
463 %l4 = load <8 x i16>, ptr %s4, align 4
464 %t1 = shufflevector <8 x i16> %l1, <8 x i16> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
465 %t2 = shufflevector <8 x i16> %l3, <8 x i16> %l4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
466 %s = shufflevector <16 x i16> %t1, <16 x i16> %t2, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
467 store <32 x i16> %s, ptr %dst, align 1
; vst4_v2i8: loads four consecutive <2 x i8> vectors from %src and stores them
; to %dst interleaved element-wise (a0 b0 c0 d0 a1 b1 c1 d1) as one <8 x i8>.
; The expected output uses scalar ldrb loads, vmov.16 lane inserts and a single
; vstrb.16 store; it contains no vst4x instructions.
; NOTE(review): every load is marked "align 4" although consecutive <2 x i8>
; elements are only 2 bytes apart (the expected ldrb offsets are 0..7) - confirm
; the alignment annotations are intended.
473 define void @vst4_v2i8(ptr %src, ptr %dst) {
474 ; CHECK-LABEL: vst4_v2i8:
475 ; CHECK: @ %bb.0: @ %entry
476 ; CHECK-NEXT: .save {r4, r5, r6, lr}
477 ; CHECK-NEXT: push {r4, r5, r6, lr}
478 ; CHECK-NEXT: ldrb r4, [r0, #5]
479 ; CHECK-NEXT: ldrb r5, [r0, #4]
480 ; CHECK-NEXT: ldrb r2, [r0]
481 ; CHECK-NEXT: ldrb r3, [r0, #1]
482 ; CHECK-NEXT: vmov q0[2], q0[0], r5, r4
483 ; CHECK-NEXT: vmov r5, s0
484 ; CHECK-NEXT: ldrb.w r12, [r0, #2]
485 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
486 ; CHECK-NEXT: ldrb.w lr, [r0, #3]
487 ; CHECK-NEXT: vmov r2, s0
488 ; CHECK-NEXT: ldrb r6, [r0, #7]
489 ; CHECK-NEXT: vmov.16 q0[0], r2
490 ; CHECK-NEXT: ldrb r0, [r0, #6]
491 ; CHECK-NEXT: vmov.16 q0[1], r12
492 ; CHECK-NEXT: vmov.16 q0[2], r5
493 ; CHECK-NEXT: vmov.16 q0[3], r0
494 ; CHECK-NEXT: vmov.16 q0[4], r3
495 ; CHECK-NEXT: vmov.16 q0[5], lr
496 ; CHECK-NEXT: vmov.16 q0[6], r4
497 ; CHECK-NEXT: vmov.16 q0[7], r6
498 ; CHECK-NEXT: vstrb.16 q0, [r1]
499 ; CHECK-NEXT: pop {r4, r5, r6, pc}
501 %l1 = load <2 x i8>, ptr %src, align 4
502 %s2 = getelementptr <2 x i8>, ptr %src, i32 1
503 %l2 = load <2 x i8>, ptr %s2, align 4
504 %s3 = getelementptr <2 x i8>, ptr %src, i32 2
505 %l3 = load <2 x i8>, ptr %s3, align 4
506 %s4 = getelementptr <2 x i8>, ptr %src, i32 3
507 %l4 = load <2 x i8>, ptr %s4, align 4
508 %t1 = shufflevector <2 x i8> %l1, <2 x i8> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
509 %t2 = shufflevector <2 x i8> %l3, <2 x i8> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
510 %s = shufflevector <4 x i8> %t1, <4 x i8> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
511 store <8 x i8> %s, ptr %dst, align 1
; vst4_v4i8: loads four consecutive <4 x i8> vectors from %src and stores them
; to %dst interleaved element-wise as one <16 x i8>.
; The expected output loads each input with a widening vldrb.u32, rebuilds the
; interleaved result byte by byte with vmov.8, and writes it with a single
; vstrb.8 store; it contains no vst4x instructions.
515 define void @vst4_v4i8(ptr %src, ptr %dst) {
516 ; CHECK-LABEL: vst4_v4i8:
517 ; CHECK: @ %bb.0: @ %entry
518 ; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
519 ; CHECK-NEXT: push {r4, r5, r6, r7, lr}
520 ; CHECK-NEXT: .pad #4
521 ; CHECK-NEXT: sub sp, #4
522 ; CHECK-NEXT: .vsave {d8, d9}
523 ; CHECK-NEXT: vpush {d8, d9}
524 ; CHECK-NEXT: vldrb.u32 q2, [r0]
525 ; CHECK-NEXT: vldrb.u32 q3, [r0, #4]
526 ; CHECK-NEXT: vldrb.u32 q1, [r0, #8]
527 ; CHECK-NEXT: vldrb.u32 q4, [r0, #12]
528 ; CHECK-NEXT: vmov r4, r5, d4
529 ; CHECK-NEXT: vmov.8 q0[0], r4
530 ; CHECK-NEXT: vmov r2, lr, d6
531 ; CHECK-NEXT: vmov.8 q0[1], r2
532 ; CHECK-NEXT: vmov r0, r4, d2
533 ; CHECK-NEXT: vmov r3, r12, d8
534 ; CHECK-NEXT: vmov.8 q0[2], r0
535 ; CHECK-NEXT: vmov.8 q0[3], r3
536 ; CHECK-NEXT: vmov r2, r7, d9
537 ; CHECK-NEXT: vmov.8 q0[4], r5
538 ; CHECK-NEXT: vmov r3, r5, d7
539 ; CHECK-NEXT: vmov.8 q0[5], lr
540 ; CHECK-NEXT: vmov.8 q0[6], r4
541 ; CHECK-NEXT: vmov r4, r0, d5
542 ; CHECK-NEXT: vmov.8 q0[7], r12
543 ; CHECK-NEXT: vmov.8 q0[8], r4
544 ; CHECK-NEXT: vmov r4, r6, d3
545 ; CHECK-NEXT: vmov.8 q0[9], r3
546 ; CHECK-NEXT: vmov.8 q0[10], r4
547 ; CHECK-NEXT: vmov.8 q0[11], r2
548 ; CHECK-NEXT: vmov.8 q0[12], r0
549 ; CHECK-NEXT: vmov.8 q0[13], r5
550 ; CHECK-NEXT: vmov.8 q0[14], r6
551 ; CHECK-NEXT: vmov.8 q0[15], r7
552 ; CHECK-NEXT: vstrb.8 q0, [r1]
553 ; CHECK-NEXT: vpop {d8, d9}
554 ; CHECK-NEXT: add sp, #4
555 ; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
557 %l1 = load <4 x i8>, ptr %src, align 4
558 %s2 = getelementptr <4 x i8>, ptr %src, i32 1
559 %l2 = load <4 x i8>, ptr %s2, align 4
560 %s3 = getelementptr <4 x i8>, ptr %src, i32 2
561 %l3 = load <4 x i8>, ptr %s3, align 4
562 %s4 = getelementptr <4 x i8>, ptr %src, i32 3
563 %l4 = load <4 x i8>, ptr %s4, align 4
564 %t1 = shufflevector <4 x i8> %l1, <4 x i8> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
565 %t2 = shufflevector <4 x i8> %l3, <4 x i8> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
566 %s = shufflevector <8 x i8> %t1, <8 x i8> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
567 store <16 x i8> %s, ptr %dst, align 1
; vst4_v8i8: loads four consecutive <8 x i8> vectors from %src and stores them
; to %dst interleaved element-wise as <32 x i8>.
; The expected output loads each input with a widening vldrb.u16, then builds
; the two 16-byte halves of the result one lane at a time (vmov.u16 extract
; followed by vmov.8 insert) and writes them with two vstrb.8 stores; it
; contains no vst4x instructions.
571 define void @vst4_v8i8(ptr %src, ptr %dst) {
572 ; CHECK-LABEL: vst4_v8i8:
573 ; CHECK: @ %bb.0: @ %entry
574 ; CHECK-NEXT: .vsave {d8, d9, d10, d11}
575 ; CHECK-NEXT: vpush {d8, d9, d10, d11}
576 ; CHECK-NEXT: vldrb.u16 q1, [r0]
577 ; CHECK-NEXT: vldrb.u16 q2, [r0, #8]
578 ; CHECK-NEXT: vldrb.u16 q3, [r0, #16]
579 ; CHECK-NEXT: vldrb.u16 q4, [r0, #24]
580 ; CHECK-NEXT: vmov.u16 r2, q1[4]
581 ; CHECK-NEXT: vmov.8 q0[0], r2
582 ; CHECK-NEXT: vmov.u16 r2, q2[4]
583 ; CHECK-NEXT: vmov.8 q0[1], r2
584 ; CHECK-NEXT: vmov.u16 r2, q3[4]
585 ; CHECK-NEXT: vmov.8 q0[2], r2
586 ; CHECK-NEXT: vmov.u16 r0, q4[4]
587 ; CHECK-NEXT: vmov.8 q0[3], r0
588 ; CHECK-NEXT: vmov.u16 r0, q1[5]
589 ; CHECK-NEXT: vmov.8 q0[4], r0
590 ; CHECK-NEXT: vmov.u16 r0, q2[5]
591 ; CHECK-NEXT: vmov.8 q0[5], r0
592 ; CHECK-NEXT: vmov.u16 r0, q3[5]
593 ; CHECK-NEXT: vmov.8 q0[6], r0
594 ; CHECK-NEXT: vmov.u16 r0, q4[5]
595 ; CHECK-NEXT: vmov.8 q0[7], r0
596 ; CHECK-NEXT: vmov.u16 r0, q1[6]
597 ; CHECK-NEXT: vmov.8 q0[8], r0
598 ; CHECK-NEXT: vmov.u16 r0, q2[6]
599 ; CHECK-NEXT: vmov.8 q0[9], r0
600 ; CHECK-NEXT: vmov.u16 r0, q3[6]
601 ; CHECK-NEXT: vmov.8 q0[10], r0
602 ; CHECK-NEXT: vmov.u16 r0, q4[6]
603 ; CHECK-NEXT: vmov.8 q0[11], r0
604 ; CHECK-NEXT: vmov.u16 r0, q1[7]
605 ; CHECK-NEXT: vmov.8 q0[12], r0
606 ; CHECK-NEXT: vmov.u16 r0, q2[7]
607 ; CHECK-NEXT: vmov.8 q0[13], r0
608 ; CHECK-NEXT: vmov.u16 r0, q3[7]
609 ; CHECK-NEXT: vmov.8 q0[14], r0
610 ; CHECK-NEXT: vmov.u16 r0, q4[7]
611 ; CHECK-NEXT: vmov.8 q0[15], r0
612 ; CHECK-NEXT: vmov.u16 r0, q1[0]
613 ; CHECK-NEXT: vmov.8 q5[0], r0
614 ; CHECK-NEXT: vmov.u16 r0, q2[0]
615 ; CHECK-NEXT: vmov.8 q5[1], r0
616 ; CHECK-NEXT: vmov.u16 r0, q3[0]
617 ; CHECK-NEXT: vmov.8 q5[2], r0
618 ; CHECK-NEXT: vmov.u16 r0, q4[0]
619 ; CHECK-NEXT: vmov.8 q5[3], r0
620 ; CHECK-NEXT: vmov.u16 r0, q1[1]
621 ; CHECK-NEXT: vmov.8 q5[4], r0
622 ; CHECK-NEXT: vmov.u16 r0, q2[1]
623 ; CHECK-NEXT: vmov.8 q5[5], r0
624 ; CHECK-NEXT: vmov.u16 r0, q3[1]
625 ; CHECK-NEXT: vmov.8 q5[6], r0
626 ; CHECK-NEXT: vmov.u16 r0, q4[1]
627 ; CHECK-NEXT: vmov.8 q5[7], r0
628 ; CHECK-NEXT: vmov.u16 r0, q1[2]
629 ; CHECK-NEXT: vmov.8 q5[8], r0
630 ; CHECK-NEXT: vmov.u16 r0, q2[2]
631 ; CHECK-NEXT: vmov.8 q5[9], r0
632 ; CHECK-NEXT: vmov.u16 r0, q3[2]
633 ; CHECK-NEXT: vmov.8 q5[10], r0
634 ; CHECK-NEXT: vmov.u16 r0, q4[2]
635 ; CHECK-NEXT: vmov.8 q5[11], r0
636 ; CHECK-NEXT: vmov.u16 r0, q1[3]
637 ; CHECK-NEXT: vmov.8 q5[12], r0
638 ; CHECK-NEXT: vmov.u16 r0, q2[3]
639 ; CHECK-NEXT: vmov.8 q5[13], r0
640 ; CHECK-NEXT: vmov.u16 r0, q3[3]
641 ; CHECK-NEXT: vmov.8 q5[14], r0
642 ; CHECK-NEXT: vmov.u16 r0, q4[3]
643 ; CHECK-NEXT: vmov.8 q5[15], r0
644 ; CHECK-NEXT: vstrb.8 q0, [r1, #16]
645 ; CHECK-NEXT: vstrb.8 q5, [r1]
646 ; CHECK-NEXT: vpop {d8, d9, d10, d11}
649 %l1 = load <8 x i8>, ptr %src, align 4
650 %s2 = getelementptr <8 x i8>, ptr %src, i32 1
651 %l2 = load <8 x i8>, ptr %s2, align 4
652 %s3 = getelementptr <8 x i8>, ptr %src, i32 2
653 %l3 = load <8 x i8>, ptr %s3, align 4
654 %s4 = getelementptr <8 x i8>, ptr %src, i32 3
655 %l4 = load <8 x i8>, ptr %s4, align 4
656 %t1 = shufflevector <8 x i8> %l1, <8 x i8> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
657 %t2 = shufflevector <8 x i8> %l3, <8 x i8> %l4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
658 %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
659 store <32 x i8> %s, ptr %dst, align 1
; vst4_v16i8: loads four consecutive <16 x i8> vectors from %src and stores them
; to %dst interleaved element-wise as <64 x i8> (store is align 1).
; The expected output is four vldrw.u32 loads followed by one
; vst40/vst41/vst42/vst43.8 group.
663 define void @vst4_v16i8(ptr %src, ptr %dst) {
664 ; CHECK-LABEL: vst4_v16i8:
665 ; CHECK: @ %bb.0: @ %entry
666 ; CHECK-NEXT: vldrw.u32 q3, [r0, #48]
667 ; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
668 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
669 ; CHECK-NEXT: vldrw.u32 q0, [r0]
670 ; CHECK-NEXT: vst40.8 {q0, q1, q2, q3}, [r1]
671 ; CHECK-NEXT: vst41.8 {q0, q1, q2, q3}, [r1]
672 ; CHECK-NEXT: vst42.8 {q0, q1, q2, q3}, [r1]
673 ; CHECK-NEXT: vst43.8 {q0, q1, q2, q3}, [r1]
676 %l1 = load <16 x i8>, ptr %src, align 4
677 %s2 = getelementptr <16 x i8>, ptr %src, i32 1
678 %l2 = load <16 x i8>, ptr %s2, align 4
679 %s3 = getelementptr <16 x i8>, ptr %src, i32 2
680 %l3 = load <16 x i8>, ptr %s3, align 4
681 %s4 = getelementptr <16 x i8>, ptr %src, i32 3
682 %l4 = load <16 x i8>, ptr %s4, align 4
683 %t1 = shufflevector <16 x i8> %l1, <16 x i8> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
684 %t2 = shufflevector <16 x i8> %l3, <16 x i8> %l4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
685 %s = shufflevector <32 x i8> %t1, <32 x i8> %t2, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
686 store <64 x i8> %s, ptr %dst, align 1
; Interleaving store of four <2 x i64> vectors loaded from consecutive 16-byte
; slots of %src; the result <8 x i64> holds l1[0], l2[0], l3[0], l4[0], l1[1], ...
; Expected output: no vst4x instructions; each result q-register is assembled
; with vmov.f64 d-register moves and written with vstrw.32.
692 define void @vst4_v2i64(ptr %src, ptr %dst) {
693 ; CHECK-LABEL: vst4_v2i64:
694 ; CHECK: @ %bb.0: @ %entry
695 ; CHECK-NEXT: .vsave {d8, d9, d10, d11}
696 ; CHECK-NEXT: vpush {d8, d9, d10, d11}
697 ; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
698 ; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
699 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
700 ; CHECK-NEXT: vldrw.u32 q3, [r0]
701 ; CHECK-NEXT: vmov.f64 d9, d0
702 ; CHECK-NEXT: vmov.f64 d8, d4
703 ; CHECK-NEXT: vmov.f64 d11, d2
704 ; CHECK-NEXT: vstrw.32 q4, [r1, #16]
705 ; CHECK-NEXT: vmov.f64 d10, d6
706 ; CHECK-NEXT: vmov.f64 d0, d5
707 ; CHECK-NEXT: vstrw.32 q5, [r1]
708 ; CHECK-NEXT: vmov.f64 d2, d7
709 ; CHECK-NEXT: vstrw.32 q0, [r1, #48]
710 ; CHECK-NEXT: vstrw.32 q1, [r1, #32]
711 ; CHECK-NEXT: vpop {d8, d9, d10, d11}
714 %l1 = load <2 x i64>, ptr %src, align 4
715 %s2 = getelementptr <2 x i64>, ptr %src, i32 1
716 %l2 = load <2 x i64>, ptr %s2, align 4
717 %s3 = getelementptr <2 x i64>, ptr %src, i32 2
718 %l3 = load <2 x i64>, ptr %s3, align 4
719 %s4 = getelementptr <2 x i64>, ptr %src, i32 3
720 %l4 = load <2 x i64>, ptr %s4, align 4
; %t1 = l1 ++ l2 and %t2 = l3 ++ l4; the final mask takes lane 0 of every input,
; then lane 1 of every input.
721 %t1 = shufflevector <2 x i64> %l1, <2 x i64> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
722 %t2 = shufflevector <2 x i64> %l3, <2 x i64> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
723 %s = shufflevector <4 x i64> %t1, <4 x i64> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
724 store <8 x i64> %s, ptr %dst, align 8
; Interleaving store of four <4 x i64> vectors (32 bytes each) loaded from
; consecutive slots of %src; the result is one <16 x i64> written to %dst.
; Expected output: eight q-register loads, a 64-byte stack area (.pad #64) used
; for 16-byte spills/reloads, vmov.f64 moves to build each result register and
; eight vstrw.32 stores; no vst4x instructions.
728 define void @vst4_v4i64(ptr %src, ptr %dst) {
729 ; CHECK-LABEL: vst4_v4i64:
730 ; CHECK: @ %bb.0: @ %entry
731 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
732 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
733 ; CHECK-NEXT: .pad #64
734 ; CHECK-NEXT: sub sp, #64
735 ; CHECK-NEXT: vldrw.u32 q7, [r0, #80]
736 ; CHECK-NEXT: vldrw.u32 q5, [r0, #32]
737 ; CHECK-NEXT: vldrw.u32 q6, [r0]
738 ; CHECK-NEXT: vldrw.u32 q1, [r0, #96]
739 ; CHECK-NEXT: vstrw.32 q7, [sp, #32] @ 16-byte Spill
740 ; CHECK-NEXT: vmov.f64 d15, d10
741 ; CHECK-NEXT: vldrw.u32 q2, [r0, #64]
742 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
743 ; CHECK-NEXT: vldrw.u32 q3, [r0, #48]
744 ; CHECK-NEXT: vldrw.u32 q4, [r0, #112]
745 ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
746 ; CHECK-NEXT: vmov.f64 d14, d12
747 ; CHECK-NEXT: vstrw.32 q7, [sp, #48] @ 16-byte Spill
748 ; CHECK-NEXT: vmov.f64 d14, d4
749 ; CHECK-NEXT: vmov.f64 d15, d2
750 ; CHECK-NEXT: vstrw.32 q7, [sp] @ 16-byte Spill
751 ; CHECK-NEXT: vmov.f64 d4, d0
752 ; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload
753 ; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload
754 ; CHECK-NEXT: vmov.f64 d10, d13
755 ; CHECK-NEXT: vmov.f64 d2, d5
756 ; CHECK-NEXT: vstrw.32 q5, [r1, #32]
757 ; CHECK-NEXT: vmov.f64 d5, d6
758 ; CHECK-NEXT: vstrw.32 q1, [r1, #48]
759 ; CHECK-NEXT: vmov.f64 d13, d8
760 ; CHECK-NEXT: vstrw.32 q2, [r1, #64]
761 ; CHECK-NEXT: vmov.f64 d12, d0
762 ; CHECK-NEXT: vmov.f64 d8, d1
763 ; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload
764 ; CHECK-NEXT: vstrw.32 q6, [r1, #80]
765 ; CHECK-NEXT: vstrw.32 q0, [r1]
766 ; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
767 ; CHECK-NEXT: vmov.f64 d6, d15
768 ; CHECK-NEXT: vstrw.32 q4, [r1, #112]
769 ; CHECK-NEXT: vstrw.32 q0, [r1, #16]
770 ; CHECK-NEXT: vstrw.32 q3, [r1, #96]
771 ; CHECK-NEXT: add sp, #64
772 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
775 %l1 = load <4 x i64>, ptr %src, align 4
776 %s2 = getelementptr <4 x i64>, ptr %src, i32 1
777 %l2 = load <4 x i64>, ptr %s2, align 4
778 %s3 = getelementptr <4 x i64>, ptr %src, i32 2
779 %l3 = load <4 x i64>, ptr %s3, align 4
780 %s4 = getelementptr <4 x i64>, ptr %src, i32 3
781 %l4 = load <4 x i64>, ptr %s4, align 4
; %t1 = l1 ++ l2 and %t2 = l3 ++ l4; the final mask takes lane i of each of the
; four inputs in turn (0, 4, 8, 12, 1, 5, ...).
782 %t1 = shufflevector <4 x i64> %l1, <4 x i64> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
783 %t2 = shufflevector <4 x i64> %l3, <4 x i64> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
784 %s = shufflevector <8 x i64> %t1, <8 x i64> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
785 store <16 x i64> %s, ptr %dst, align 8
; Interleaving store of four <2 x float> vectors loaded from consecutive 8-byte
; slots of %src; the result <8 x float> holds l1[0], l2[0], l3[0], l4[0], l1[1], ...
; Expected output: eight scalar vldr loads into s0..s7, already in interleaved
; order, followed by vstrw.32 stores of q0 and q1; no shuffle instructions.
791 define void @vst4_v2f32(ptr %src, ptr %dst) {
792 ; CHECK-LABEL: vst4_v2f32:
793 ; CHECK: @ %bb.0: @ %entry
794 ; CHECK-NEXT: vldr s0, [r0]
795 ; CHECK-NEXT: vldr s4, [r0, #4]
796 ; CHECK-NEXT: vldr s1, [r0, #8]
797 ; CHECK-NEXT: vldr s5, [r0, #12]
798 ; CHECK-NEXT: vldr s2, [r0, #16]
799 ; CHECK-NEXT: vldr s6, [r0, #20]
800 ; CHECK-NEXT: vldr s3, [r0, #24]
801 ; CHECK-NEXT: vldr s7, [r0, #28]
802 ; CHECK-NEXT: vstrw.32 q0, [r1]
803 ; CHECK-NEXT: vstrw.32 q1, [r1, #16]
806 %l1 = load <2 x float>, ptr %src, align 4
807 %s2 = getelementptr <2 x float>, ptr %src, i32 1
808 %l2 = load <2 x float>, ptr %s2, align 4
809 %s3 = getelementptr <2 x float>, ptr %src, i32 2
810 %l3 = load <2 x float>, ptr %s3, align 4
811 %s4 = getelementptr <2 x float>, ptr %src, i32 3
812 %l4 = load <2 x float>, ptr %s4, align 4
; %t1 = l1 ++ l2 and %t2 = l3 ++ l4; the final mask takes lane 0 of every input,
; then lane 1 of every input.
813 %t1 = shufflevector <2 x float> %l1, <2 x float> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
814 %t2 = shufflevector <2 x float> %l3, <2 x float> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
815 %s = shufflevector <4 x float> %t1, <4 x float> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
816 store <8 x float> %s, ptr %dst, align 4
; Interleaving store of four <4 x float> vectors loaded from consecutive 16-byte
; slots of %src; the result <16 x float> is stored to %dst with align 4.
; Expected output: four q-register loads followed by one vst40.32..vst43.32 group.
820 define void @vst4_v4f32(ptr %src, ptr %dst) {
821 ; CHECK-LABEL: vst4_v4f32:
822 ; CHECK: @ %bb.0: @ %entry
823 ; CHECK-NEXT: vldrw.u32 q3, [r0, #48]
824 ; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
825 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
826 ; CHECK-NEXT: vldrw.u32 q0, [r0]
827 ; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r1]
828 ; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r1]
829 ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r1]
830 ; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r1]
833 %l1 = load <4 x float>, ptr %src, align 4
834 %s2 = getelementptr <4 x float>, ptr %src, i32 1
835 %l2 = load <4 x float>, ptr %s2, align 4
836 %s3 = getelementptr <4 x float>, ptr %src, i32 2
837 %l3 = load <4 x float>, ptr %s3, align 4
838 %s4 = getelementptr <4 x float>, ptr %src, i32 3
839 %l4 = load <4 x float>, ptr %s4, align 4
; %t1 = l1 ++ l2 and %t2 = l3 ++ l4; the final mask takes lane i of each of the
; four inputs in turn (0, 4, 8, 12, 1, 5, ...).
840 %t1 = shufflevector <4 x float> %l1, <4 x float> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
841 %t2 = shufflevector <4 x float> %l3, <4 x float> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
842 %s = shufflevector <8 x float> %t1, <8 x float> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
843 store <16 x float> %s, ptr %dst, align 4
; Interleaving store of four <8 x float> vectors (32 bytes each) loaded from
; consecutive slots of %src; the result is one <32 x float> written to %dst.
; Expected output: two vst40.32..vst43.32 groups. The last store of the first
; group uses the write-back form ([r1]!) so the second group reuses r1.
847 define void @vst4_v8f32(ptr %src, ptr %dst) {
848 ; CHECK-LABEL: vst4_v8f32:
849 ; CHECK: @ %bb.0: @ %entry
850 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
851 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
852 ; CHECK-NEXT: vldrw.u32 q7, [r0, #96]
853 ; CHECK-NEXT: vldrw.u32 q6, [r0, #64]
854 ; CHECK-NEXT: vldrw.u32 q5, [r0, #32]
855 ; CHECK-NEXT: vldrw.u32 q4, [r0]
856 ; CHECK-NEXT: vldrw.u32 q3, [r0, #112]
857 ; CHECK-NEXT: vldrw.u32 q2, [r0, #80]
858 ; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
859 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
860 ; CHECK-NEXT: vst40.32 {q4, q5, q6, q7}, [r1]
861 ; CHECK-NEXT: vst41.32 {q4, q5, q6, q7}, [r1]
862 ; CHECK-NEXT: vst42.32 {q4, q5, q6, q7}, [r1]
863 ; CHECK-NEXT: vst43.32 {q4, q5, q6, q7}, [r1]!
864 ; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r1]
865 ; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r1]
866 ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r1]
867 ; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r1]
868 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
871 %l1 = load <8 x float>, ptr %src, align 4
872 %s2 = getelementptr <8 x float>, ptr %src, i32 1
873 %l2 = load <8 x float>, ptr %s2, align 4
874 %s3 = getelementptr <8 x float>, ptr %src, i32 2
875 %l3 = load <8 x float>, ptr %s3, align 4
876 %s4 = getelementptr <8 x float>, ptr %src, i32 3
877 %l4 = load <8 x float>, ptr %s4, align 4
; %t1 = l1 ++ l2 and %t2 = l3 ++ l4; the final mask takes lane i of each of the
; four inputs in turn (0, 8, 16, 24, 1, 9, ...).
878 %t1 = shufflevector <8 x float> %l1, <8 x float> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
879 %t2 = shufflevector <8 x float> %l3, <8 x float> %l4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
880 %s = shufflevector <16 x float> %t1, <16 x float> %t2, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
881 store <32 x float> %s, ptr %dst, align 4
; Interleaving store of four <16 x float> vectors (64 bytes each) loaded from
; consecutive slots of %src; the result is one <64 x float> written to %dst.
; Expected output: four vst40.32..vst43.32 groups. The sixteen input q-registers
; do not fit at once, so groups of four are spilled and reloaded as 64-byte
; blocks (vstmia/vldmia) in a 192-byte stack area (.pad #192).
885 define void @vst4_v16f32(ptr %src, ptr %dst) {
886 ; CHECK-LABEL: vst4_v16f32:
887 ; CHECK: @ %bb.0: @ %entry
888 ; CHECK-NEXT: .save {r4, r5}
889 ; CHECK-NEXT: push {r4, r5}
890 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
891 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
892 ; CHECK-NEXT: .pad #192
893 ; CHECK-NEXT: sub sp, #192
894 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
895 ; CHECK-NEXT: add r2, sp, #64
896 ; CHECK-NEXT: vldrw.u32 q4, [r0, #176]
897 ; CHECK-NEXT: vldrw.u32 q3, [r0, #208]
898 ; CHECK-NEXT: vldrw.u32 q2, [r0, #144]
899 ; CHECK-NEXT: vldrw.u32 q1, [r0, #80]
900 ; CHECK-NEXT: vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
901 ; CHECK-NEXT: vldrw.u32 q0, [r0]
902 ; CHECK-NEXT: vldrw.u32 q2, [r0, #128]
903 ; CHECK-NEXT: vldrw.u32 q5, [r0, #240]
904 ; CHECK-NEXT: vmov q6, q4
905 ; CHECK-NEXT: vldrw.u32 q3, [r0, #192]
906 ; CHECK-NEXT: vldrw.u32 q1, [r0, #64]
907 ; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
908 ; CHECK-NEXT: vldrw.u32 q2, [r0, #160]
909 ; CHECK-NEXT: vldrw.u32 q4, [r0, #48]
910 ; CHECK-NEXT: add r2, sp, #128
911 ; CHECK-NEXT: vmov q7, q5
912 ; CHECK-NEXT: vldrw.u32 q3, [r0, #224]
913 ; CHECK-NEXT: vldrw.u32 q1, [r0, #96]
914 ; CHECK-NEXT: vldrw.u32 q5, [r0, #112]
915 ; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
916 ; CHECK-NEXT: vmov q6, q2
917 ; CHECK-NEXT: vmov q5, q1
918 ; CHECK-NEXT: vmov q7, q3
919 ; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
920 ; CHECK-NEXT: add r2, sp, #64
921 ; CHECK-NEXT: vldrw.u32 q4, [r0, #32]
922 ; CHECK-NEXT: mov r0, r1
923 ; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r1]
924 ; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r1]
925 ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r1]
926 ; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r0]!
927 ; CHECK-NEXT: vldmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
928 ; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r0]
929 ; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r0]
930 ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r0]
931 ; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r0]
932 ; CHECK-NEXT: add.w r0, r1, #192
933 ; CHECK-NEXT: adds r1, #128
934 ; CHECK-NEXT: vst40.32 {q4, q5, q6, q7}, [r1]
935 ; CHECK-NEXT: vst41.32 {q4, q5, q6, q7}, [r1]
936 ; CHECK-NEXT: vst42.32 {q4, q5, q6, q7}, [r1]
937 ; CHECK-NEXT: vst43.32 {q4, q5, q6, q7}, [r1]
938 ; CHECK-NEXT: add r1, sp, #128
939 ; CHECK-NEXT: vldmia r1, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
940 ; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r0]
941 ; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r0]
942 ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r0]
943 ; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r0]
944 ; CHECK-NEXT: add sp, #192
945 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
946 ; CHECK-NEXT: pop {r4, r5}
949 %l1 = load <16 x float>, ptr %src, align 4
950 %s2 = getelementptr <16 x float>, ptr %src, i32 1
951 %l2 = load <16 x float>, ptr %s2, align 4
952 %s3 = getelementptr <16 x float>, ptr %src, i32 2
953 %l3 = load <16 x float>, ptr %s3, align 4
954 %s4 = getelementptr <16 x float>, ptr %src, i32 3
955 %l4 = load <16 x float>, ptr %s4, align 4
; %t1 = l1 ++ l2 and %t2 = l3 ++ l4; the final mask takes lane i of each of the
; four inputs in turn (0, 16, 32, 48, 1, 17, ...).
956 %t1 = shufflevector <16 x float> %l1, <16 x float> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
957 %t2 = shufflevector <16 x float> %l3, <16 x float> %l4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
958 %s = shufflevector <32 x float> %t1, <32 x float> %t2, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
959 store <64 x float> %s, ptr %dst, align 4
; Interleaving store of four <4 x float> vectors where the final store is only
; byte-aligned (align 1); the loads keep align 4.
; Expected output: no vst4x instructions; the interleave is built lane by lane
; with vmov.f32 and each result q-register is written with vstrb.8.
963 define void @vst4_v4f32_align1(ptr %src, ptr %dst) {
964 ; CHECK-LABEL: vst4_v4f32_align1:
965 ; CHECK: @ %bb.0: @ %entry
966 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
967 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
968 ; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
969 ; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
970 ; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
971 ; CHECK-NEXT: vldrw.u32 q4, [r0]
972 ; CHECK-NEXT: vmov.f32 s14, s1
973 ; CHECK-NEXT: vmov.f32 s22, s0
974 ; CHECK-NEXT: vmov.f32 s26, s3
975 ; CHECK-NEXT: vmov.f32 s12, s17
976 ; CHECK-NEXT: vmov.f32 s13, s9
977 ; CHECK-NEXT: vmov.f32 s15, s5
978 ; CHECK-NEXT: vmov.f32 s20, s16
979 ; CHECK-NEXT: vstrb.8 q3, [r1, #16]
980 ; CHECK-NEXT: vmov.f32 s21, s8
981 ; CHECK-NEXT: vmov.f32 s23, s4
982 ; CHECK-NEXT: vmov.f32 s24, s19
983 ; CHECK-NEXT: vstrb.8 q5, [r1]
984 ; CHECK-NEXT: vmov.f32 s25, s11
985 ; CHECK-NEXT: vmov.f32 s27, s7
986 ; CHECK-NEXT: vmov.f32 s0, s18
987 ; CHECK-NEXT: vstrb.8 q6, [r1, #48]
988 ; CHECK-NEXT: vmov.f32 s1, s10
989 ; CHECK-NEXT: vmov.f32 s3, s6
990 ; CHECK-NEXT: vstrb.8 q0, [r1, #32]
991 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
994 %l1 = load <4 x float>, ptr %src, align 4
995 %s2 = getelementptr <4 x float>, ptr %src, i32 1
996 %l2 = load <4 x float>, ptr %s2, align 4
997 %s3 = getelementptr <4 x float>, ptr %src, i32 2
998 %l3 = load <4 x float>, ptr %s3, align 4
999 %s4 = getelementptr <4 x float>, ptr %src, i32 3
1000 %l4 = load <4 x float>, ptr %s4, align 4
; %t1 = l1 ++ l2 and %t2 = l3 ++ l4; the final mask takes lane i of each of the
; four inputs in turn (0, 4, 8, 12, 1, 5, ...).
1001 %t1 = shufflevector <4 x float> %l1, <4 x float> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1002 %t2 = shufflevector <4 x float> %l3, <4 x float> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1003 %s = shufflevector <8 x float> %t1, <8 x float> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
; Only the store alignment differs from the align-4 <4 x float> case.
1004 store <16 x float> %s, ptr %dst, align 1
; Interleaving store of four <2 x half> vectors loaded from consecutive 4-byte
; slots of %src; the result <8 x half> holds l1[0], l2[0], l3[0], l4[0], l1[1], ...
; Expected output: the inputs are fetched with integer loads (ldm/ldr), moved
; into vector lanes, the halves are rearranged with vmovx.f16/vins.f16 and the
; result is written with a single vstrh.16.
1010 define void @vst4_v2f16(ptr %src, ptr %dst) {
1011 ; CHECK-LABEL: vst4_v2f16:
1012 ; CHECK: @ %bb.0: @ %entry
1013 ; CHECK-NEXT: ldm.w r0, {r2, r3, r12}
1014 ; CHECK-NEXT: vmov.32 q1[0], r12
1015 ; CHECK-NEXT: ldr r0, [r0, #12]
1016 ; CHECK-NEXT: vmov.32 q0[0], r2
1017 ; CHECK-NEXT: vmov.32 q0[1], r3
1018 ; CHECK-NEXT: vmov.32 q1[1], r0
1019 ; CHECK-NEXT: vmovx.f16 s2, s0
1020 ; CHECK-NEXT: vmovx.f16 s6, s1
1021 ; CHECK-NEXT: vmovx.f16 s3, s4
1022 ; CHECK-NEXT: vins.f16 s2, s6
1023 ; CHECK-NEXT: vmovx.f16 s6, s5
1024 ; CHECK-NEXT: vins.f16 s4, s5
1025 ; CHECK-NEXT: vins.f16 s0, s1
1026 ; CHECK-NEXT: vins.f16 s3, s6
1027 ; CHECK-NEXT: vmov.f32 s1, s4
1028 ; CHECK-NEXT: vstrh.16 q0, [r1]
1031 %l1 = load <2 x half>, ptr %src, align 4
1032 %s2 = getelementptr <2 x half>, ptr %src, i32 1
1033 %l2 = load <2 x half>, ptr %s2, align 4
1034 %s3 = getelementptr <2 x half>, ptr %src, i32 2
1035 %l3 = load <2 x half>, ptr %s3, align 4
1036 %s4 = getelementptr <2 x half>, ptr %src, i32 3
1037 %l4 = load <2 x half>, ptr %s4, align 4
; %t1 = l1 ++ l2 and %t2 = l3 ++ l4; the final mask takes lane 0 of every input,
; then lane 1 of every input.
1038 %t1 = shufflevector <2 x half> %l1, <2 x half> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1039 %t2 = shufflevector <2 x half> %l3, <2 x half> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1040 %s = shufflevector <4 x half> %t1, <4 x half> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
1041 store <8 x half> %s, ptr %dst, align 2
; Interleaving store of four <4 x half> vectors loaded from consecutive 8-byte
; slots of %src; the result is one <16 x half> written to %dst with align 2.
; Expected output: the inputs are fetched with integer loads (ldrd/ldm/ldr) and
; moved into q0/q1, the halves are rearranged with vmovx.f16/vins.f16 and the
; two result q-registers are written with vstrh.16; no vst4x instructions.
1045 define void @vst4_v4f16(ptr %src, ptr %dst) {
1046 ; CHECK-LABEL: vst4_v4f16:
1047 ; CHECK: @ %bb.0: @ %entry
1048 ; CHECK-NEXT: .save {r4, r5, r6, lr}
1049 ; CHECK-NEXT: push {r4, r5, r6, lr}
1050 ; CHECK-NEXT: add.w r6, r0, #16
1051 ; CHECK-NEXT: ldrd lr, r12, [r0]
1052 ; CHECK-NEXT: ldrd r3, r2, [r0, #8]
1053 ; CHECK-NEXT: ldm r6, {r4, r5, r6}
1054 ; CHECK-NEXT: vmov q1[2], q1[0], lr, r3
1055 ; CHECK-NEXT: ldr r0, [r0, #28]
1056 ; CHECK-NEXT: vmov q1[3], q1[1], r12, r2
1057 ; CHECK-NEXT: vmov q0[2], q0[0], r4, r6
1058 ; CHECK-NEXT: vmovx.f16 s10, s5
1059 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r0
1060 ; CHECK-NEXT: vins.f16 s5, s7
1061 ; CHECK-NEXT: vmovx.f16 s12, s0
1062 ; CHECK-NEXT: vins.f16 s0, s2
1063 ; CHECK-NEXT: vmovx.f16 s2, s2
1064 ; CHECK-NEXT: vmovx.f16 s11, s1
1065 ; CHECK-NEXT: vins.f16 s12, s2
1066 ; CHECK-NEXT: vmovx.f16 s2, s3
1067 ; CHECK-NEXT: vins.f16 s11, s2
1068 ; CHECK-NEXT: vmovx.f16 s2, s4
1069 ; CHECK-NEXT: vins.f16 s4, s6
1070 ; CHECK-NEXT: vmovx.f16 s6, s6
1071 ; CHECK-NEXT: vins.f16 s1, s3
1072 ; CHECK-NEXT: vins.f16 s2, s6
1073 ; CHECK-NEXT: vmovx.f16 s6, s7
1074 ; CHECK-NEXT: vmov.f32 s8, s5
1075 ; CHECK-NEXT: vins.f16 s10, s6
1076 ; CHECK-NEXT: vmov.f32 s9, s1
1077 ; CHECK-NEXT: vmov.f32 s5, s0
1078 ; CHECK-NEXT: vstrh.16 q2, [r1, #16]
1079 ; CHECK-NEXT: vmov.f32 s6, s2
1080 ; CHECK-NEXT: vmov.f32 s7, s12
1081 ; CHECK-NEXT: vstrh.16 q1, [r1]
1082 ; CHECK-NEXT: pop {r4, r5, r6, pc}
1084 %l1 = load <4 x half>, ptr %src, align 4
1085 %s2 = getelementptr <4 x half>, ptr %src, i32 1
1086 %l2 = load <4 x half>, ptr %s2, align 4
1087 %s3 = getelementptr <4 x half>, ptr %src, i32 2
1088 %l3 = load <4 x half>, ptr %s3, align 4
1089 %s4 = getelementptr <4 x half>, ptr %src, i32 3
1090 %l4 = load <4 x half>, ptr %s4, align 4
; %t1 = l1 ++ l2 and %t2 = l3 ++ l4; the final mask takes lane i of each of the
; four inputs in turn (0, 4, 8, 12, 1, 5, ...).
1091 %t1 = shufflevector <4 x half> %l1, <4 x half> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1092 %t2 = shufflevector <4 x half> %l3, <4 x half> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1093 %s = shufflevector <8 x half> %t1, <8 x half> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
1094 store <16 x half> %s, ptr %dst, align 2
; Interleaving store of four <8 x half> vectors loaded from consecutive 16-byte
; slots of %src; the result <32 x half> is stored to %dst with align 2.
; Expected output: four q-register loads followed by one vst40.16..vst43.16 group.
1098 define void @vst4_v8f16(ptr %src, ptr %dst) {
1099 ; CHECK-LABEL: vst4_v8f16:
1100 ; CHECK: @ %bb.0: @ %entry
1101 ; CHECK-NEXT: vldrw.u32 q3, [r0, #48]
1102 ; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
1103 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
1104 ; CHECK-NEXT: vldrw.u32 q0, [r0]
1105 ; CHECK-NEXT: vst40.16 {q0, q1, q2, q3}, [r1]
1106 ; CHECK-NEXT: vst41.16 {q0, q1, q2, q3}, [r1]
1107 ; CHECK-NEXT: vst42.16 {q0, q1, q2, q3}, [r1]
1108 ; CHECK-NEXT: vst43.16 {q0, q1, q2, q3}, [r1]
1111 %l1 = load <8 x half>, ptr %src, align 4
1112 %s2 = getelementptr <8 x half>, ptr %src, i32 1
1113 %l2 = load <8 x half>, ptr %s2, align 4
1114 %s3 = getelementptr <8 x half>, ptr %src, i32 2
1115 %l3 = load <8 x half>, ptr %s3, align 4
1116 %s4 = getelementptr <8 x half>, ptr %src, i32 3
1117 %l4 = load <8 x half>, ptr %s4, align 4
; %t1 = l1 ++ l2 and %t2 = l3 ++ l4; the final mask takes lane i of each of the
; four inputs in turn (0, 8, 16, 24, 1, 9, ...).
1118 %t1 = shufflevector <8 x half> %l1, <8 x half> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1119 %t2 = shufflevector <8 x half> %l3, <8 x half> %l4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1120 %s = shufflevector <16 x half> %t1, <16 x half> %t2, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
1121 store <32 x half> %s, ptr %dst, align 2
; Interleaving store of four <16 x half> vectors (32 bytes each) loaded from
; consecutive slots of %src; the result is one <64 x half> written to %dst.
; Expected output: two vst40.16..vst43.16 groups. The last store of the first
; group uses the write-back form ([r1]!) so the second group reuses r1.
1125 define void @vst4_v16f16(ptr %src, ptr %dst) {
1126 ; CHECK-LABEL: vst4_v16f16:
1127 ; CHECK: @ %bb.0: @ %entry
1128 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
1129 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
1130 ; CHECK-NEXT: vldrw.u32 q7, [r0, #96]
1131 ; CHECK-NEXT: vldrw.u32 q6, [r0, #64]
1132 ; CHECK-NEXT: vldrw.u32 q5, [r0, #32]
1133 ; CHECK-NEXT: vldrw.u32 q4, [r0]
1134 ; CHECK-NEXT: vldrw.u32 q3, [r0, #112]
1135 ; CHECK-NEXT: vldrw.u32 q2, [r0, #80]
1136 ; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
1137 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
1138 ; CHECK-NEXT: vst40.16 {q4, q5, q6, q7}, [r1]
1139 ; CHECK-NEXT: vst41.16 {q4, q5, q6, q7}, [r1]
1140 ; CHECK-NEXT: vst42.16 {q4, q5, q6, q7}, [r1]
1141 ; CHECK-NEXT: vst43.16 {q4, q5, q6, q7}, [r1]!
1142 ; CHECK-NEXT: vst40.16 {q0, q1, q2, q3}, [r1]
1143 ; CHECK-NEXT: vst41.16 {q0, q1, q2, q3}, [r1]
1144 ; CHECK-NEXT: vst42.16 {q0, q1, q2, q3}, [r1]
1145 ; CHECK-NEXT: vst43.16 {q0, q1, q2, q3}, [r1]
1146 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1149 %l1 = load <16 x half>, ptr %src, align 4
1150 %s2 = getelementptr <16 x half>, ptr %src, i32 1
1151 %l2 = load <16 x half>, ptr %s2, align 4
1152 %s3 = getelementptr <16 x half>, ptr %src, i32 2
1153 %l3 = load <16 x half>, ptr %s3, align 4
1154 %s4 = getelementptr <16 x half>, ptr %src, i32 3
1155 %l4 = load <16 x half>, ptr %s4, align 4
; %t1 = l1 ++ l2 and %t2 = l3 ++ l4; the final mask takes lane i of each of the
; four inputs in turn (0, 16, 32, 48, 1, 17, ...).
1156 %t1 = shufflevector <16 x half> %l1, <16 x half> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1157 %t2 = shufflevector <16 x half> %l3, <16 x half> %l4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1158 %s = shufflevector <32 x half> %t1, <32 x half> %t2, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
1159 store <64 x half> %s, ptr %dst, align 2
; Interleaving store of four <8 x half> vectors where the final store is only
; byte-aligned (align 1); the loads keep align 4.
; Expected output: no vst4x instructions; the halves are rearranged with
; vmovx.f16/vins.f16/vmov.f32 and each result q-register is written with vstrb.8.
1163 define void @vst4_v8f16_align1(ptr %src, ptr %dst) {
1164 ; CHECK-LABEL: vst4_v8f16_align1:
1165 ; CHECK: @ %bb.0: @ %entry
1166 ; CHECK-NEXT: .vsave {d9, d10, d11, d12, d13}
1167 ; CHECK-NEXT: vpush {d9, d10, d11, d12, d13}
1168 ; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
1169 ; CHECK-NEXT: vldrw.u32 q5, [r0, #48]
1170 ; CHECK-NEXT: vldrw.u32 q6, [r0, #16]
1171 ; CHECK-NEXT: vldrw.u32 q2, [r0]
1172 ; CHECK-NEXT: vmovx.f16 s0, s5
1173 ; CHECK-NEXT: vmovx.f16 s2, s21
1174 ; CHECK-NEXT: vins.f16 s0, s2
1175 ; CHECK-NEXT: vmovx.f16 s2, s9
1176 ; CHECK-NEXT: vmovx.f16 s12, s25
1177 ; CHECK-NEXT: vmovx.f16 s19, s4
1178 ; CHECK-NEXT: vins.f16 s2, s12
1179 ; CHECK-NEXT: vmovx.f16 s12, s20
1180 ; CHECK-NEXT: vins.f16 s19, s12
1181 ; CHECK-NEXT: vmovx.f16 s12, s8
1182 ; CHECK-NEXT: vmovx.f16 s14, s24
1183 ; CHECK-NEXT: vmovx.f16 s15, s7
1184 ; CHECK-NEXT: vins.f16 s12, s14
1185 ; CHECK-NEXT: vmovx.f16 s14, s23
1186 ; CHECK-NEXT: vins.f16 s15, s14
1187 ; CHECK-NEXT: vmovx.f16 s14, s11
1188 ; CHECK-NEXT: vmovx.f16 s1, s27
1189 ; CHECK-NEXT: vins.f16 s7, s23
1190 ; CHECK-NEXT: vins.f16 s14, s1
1191 ; CHECK-NEXT: vmovx.f16 s23, s6
1192 ; CHECK-NEXT: vmovx.f16 s1, s22
1193 ; CHECK-NEXT: vins.f16 s6, s22
1194 ; CHECK-NEXT: vins.f16 s5, s21
1195 ; CHECK-NEXT: vins.f16 s4, s20
1196 ; CHECK-NEXT: vins.f16 s23, s1
1197 ; CHECK-NEXT: vmovx.f16 s22, s10
1198 ; CHECK-NEXT: vins.f16 s10, s26
1199 ; CHECK-NEXT: vmovx.f16 s1, s26
1200 ; CHECK-NEXT: vins.f16 s9, s25
1201 ; CHECK-NEXT: vins.f16 s8, s24
1202 ; CHECK-NEXT: vins.f16 s11, s27
1203 ; CHECK-NEXT: vmov q6, q1
1204 ; CHECK-NEXT: vins.f16 s22, s1
1205 ; CHECK-NEXT: vmov.f32 s1, s25
1206 ; CHECK-NEXT: vmov q6, q2
1207 ; CHECK-NEXT: vmov.f32 s3, s0
1208 ; CHECK-NEXT: vmov.f32 s0, s9
1209 ; CHECK-NEXT: vmov.f32 s26, s12
1210 ; CHECK-NEXT: vstrb.8 q0, [r1, #16]
1211 ; CHECK-NEXT: vmov.f32 s25, s4
1212 ; CHECK-NEXT: vmov.f32 s27, s19
1213 ; CHECK-NEXT: vmov.f32 s13, s7
1214 ; CHECK-NEXT: vstrb.8 q6, [r1]
1215 ; CHECK-NEXT: vmov.f32 s12, s11
1216 ; CHECK-NEXT: vmov.f32 s21, s6
1217 ; CHECK-NEXT: vstrb.8 q3, [r1, #48]
1218 ; CHECK-NEXT: vmov.f32 s20, s10
1219 ; CHECK-NEXT: vstrb.8 q5, [r1, #32]
1220 ; CHECK-NEXT: vpop {d9, d10, d11, d12, d13}
1223 %l1 = load <8 x half>, ptr %src, align 4
1224 %s2 = getelementptr <8 x half>, ptr %src, i32 1
1225 %l2 = load <8 x half>, ptr %s2, align 4
1226 %s3 = getelementptr <8 x half>, ptr %src, i32 2
1227 %l3 = load <8 x half>, ptr %s3, align 4
1228 %s4 = getelementptr <8 x half>, ptr %src, i32 3
1229 %l4 = load <8 x half>, ptr %s4, align 4
; %t1 = l1 ++ l2 and %t2 = l3 ++ l4; the final mask takes lane i of each of the
; four inputs in turn (0, 8, 16, 24, 1, 9, ...).
1230 %t1 = shufflevector <8 x half> %l1, <8 x half> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1231 %t2 = shufflevector <8 x half> %l3, <8 x half> %l4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1232 %s = shufflevector <16 x half> %t1, <16 x half> %t2, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
; Only the store alignment differs from the align-2 <8 x half> case.
1233 store <32 x half> %s, ptr %dst, align 1
; Interleaving store of four <2 x double> vectors loaded from consecutive
; 16-byte slots of %src; the result <8 x double> holds l1[0], l2[0], l3[0],
; l4[0], l1[1], ...
; Expected output: no vst4x instructions; each result q-register is assembled
; with vmov.f64 d-register moves and written with vstrw.32.
1239 define void @vst4_v2f64(ptr %src, ptr %dst) {
1240 ; CHECK-LABEL: vst4_v2f64:
1241 ; CHECK: @ %bb.0: @ %entry
1242 ; CHECK-NEXT: .vsave {d8, d9, d10, d11}
1243 ; CHECK-NEXT: vpush {d8, d9, d10, d11}
1244 ; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
1245 ; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
1246 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
1247 ; CHECK-NEXT: vldrw.u32 q3, [r0]
1248 ; CHECK-NEXT: vmov.f64 d9, d0
1249 ; CHECK-NEXT: vmov.f64 d8, d4
1250 ; CHECK-NEXT: vmov.f64 d11, d2
1251 ; CHECK-NEXT: vstrw.32 q4, [r1, #16]
1252 ; CHECK-NEXT: vmov.f64 d10, d6
1253 ; CHECK-NEXT: vmov.f64 d0, d5
1254 ; CHECK-NEXT: vstrw.32 q5, [r1]
1255 ; CHECK-NEXT: vmov.f64 d2, d7
1256 ; CHECK-NEXT: vstrw.32 q0, [r1, #48]
1257 ; CHECK-NEXT: vstrw.32 q1, [r1, #32]
1258 ; CHECK-NEXT: vpop {d8, d9, d10, d11}
1261 %l1 = load <2 x double>, ptr %src, align 4
1262 %s2 = getelementptr <2 x double>, ptr %src, i32 1
1263 %l2 = load <2 x double>, ptr %s2, align 4
1264 %s3 = getelementptr <2 x double>, ptr %src, i32 2
1265 %l3 = load <2 x double>, ptr %s3, align 4
1266 %s4 = getelementptr <2 x double>, ptr %src, i32 3
1267 %l4 = load <2 x double>, ptr %s4, align 4
; %t1 = l1 ++ l2 and %t2 = l3 ++ l4; the final mask takes lane 0 of every input,
; then lane 1 of every input.
1268 %t1 = shufflevector <2 x double> %l1, <2 x double> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1269 %t2 = shufflevector <2 x double> %l3, <2 x double> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1270 %s = shufflevector <4 x double> %t1, <4 x double> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
1271 store <8 x double> %s, ptr %dst, align 8
; Interleaving store of four <4 x double> vectors (32 bytes each) loaded from
; consecutive slots of %src; the result is one <16 x double> written to %dst.
; Expected output: eight q-register loads, a 64-byte stack area (.pad #64) used
; for 16-byte spills/reloads, vmov.f64 moves to build each result register and
; eight vstrw.32 stores; no vst4x instructions.
1275 define void @vst4_v4f64(ptr %src, ptr %dst) {
1276 ; CHECK-LABEL: vst4_v4f64:
1277 ; CHECK: @ %bb.0: @ %entry
1278 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
1279 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
1280 ; CHECK-NEXT: .pad #64
1281 ; CHECK-NEXT: sub sp, #64
1282 ; CHECK-NEXT: vldrw.u32 q7, [r0, #80]
1283 ; CHECK-NEXT: vldrw.u32 q5, [r0, #32]
1284 ; CHECK-NEXT: vldrw.u32 q6, [r0]
1285 ; CHECK-NEXT: vldrw.u32 q1, [r0, #96]
1286 ; CHECK-NEXT: vstrw.32 q7, [sp, #32] @ 16-byte Spill
1287 ; CHECK-NEXT: vmov.f64 d15, d10
1288 ; CHECK-NEXT: vldrw.u32 q2, [r0, #64]
1289 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
1290 ; CHECK-NEXT: vldrw.u32 q3, [r0, #48]
1291 ; CHECK-NEXT: vldrw.u32 q4, [r0, #112]
1292 ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
1293 ; CHECK-NEXT: vmov.f64 d14, d12
1294 ; CHECK-NEXT: vstrw.32 q7, [sp, #48] @ 16-byte Spill
1295 ; CHECK-NEXT: vmov.f64 d14, d4
1296 ; CHECK-NEXT: vmov.f64 d15, d2
1297 ; CHECK-NEXT: vstrw.32 q7, [sp] @ 16-byte Spill
1298 ; CHECK-NEXT: vmov.f64 d4, d0
1299 ; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload
1300 ; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload
1301 ; CHECK-NEXT: vmov.f64 d10, d13
1302 ; CHECK-NEXT: vmov.f64 d2, d5
1303 ; CHECK-NEXT: vstrw.32 q5, [r1, #32]
1304 ; CHECK-NEXT: vmov.f64 d5, d6
1305 ; CHECK-NEXT: vstrw.32 q1, [r1, #48]
1306 ; CHECK-NEXT: vmov.f64 d13, d8
1307 ; CHECK-NEXT: vstrw.32 q2, [r1, #64]
1308 ; CHECK-NEXT: vmov.f64 d12, d0
1309 ; CHECK-NEXT: vmov.f64 d8, d1
1310 ; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload
1311 ; CHECK-NEXT: vstrw.32 q6, [r1, #80]
1312 ; CHECK-NEXT: vstrw.32 q0, [r1]
1313 ; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
1314 ; CHECK-NEXT: vmov.f64 d6, d15
1315 ; CHECK-NEXT: vstrw.32 q4, [r1, #112]
1316 ; CHECK-NEXT: vstrw.32 q0, [r1, #16]
1317 ; CHECK-NEXT: vstrw.32 q3, [r1, #96]
1318 ; CHECK-NEXT: add sp, #64
1319 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1322 %l1 = load <4 x double>, ptr %src, align 4
1323 %s2 = getelementptr <4 x double>, ptr %src, i32 1
1324 %l2 = load <4 x double>, ptr %s2, align 4
1325 %s3 = getelementptr <4 x double>, ptr %src, i32 2
1326 %l3 = load <4 x double>, ptr %s3, align 4
1327 %s4 = getelementptr <4 x double>, ptr %src, i32 3
1328 %l4 = load <4 x double>, ptr %s4, align 4
; %t1 = l1 ++ l2 and %t2 = l3 ++ l4; the final mask takes lane i of each of the
; four inputs in turn (0, 4, 8, 12, 1, 5, ...).
1329 %t1 = shufflevector <4 x double> %l1, <4 x double> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1330 %t2 = shufflevector <4 x double> %l3, <4 x double> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1331 %s = shufflevector <8 x double> %t1, <8 x double> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
1332 store <16 x double> %s, ptr %dst, align 8