1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s
; Test: 3-way interleaving store of <2 x i32> vectors.
; Loads three consecutive <2 x i32> vectors (a, b, c) from %src and stores the
; <6 x i32> value a0,b0,c0,a1,b1,c1 to %dst.
; The assertions below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script rather than editing by hand.
6 define void @vst3_v2i32(ptr %src, ptr %dst) {
7 ; CHECK-LABEL: vst3_v2i32:
8 ; CHECK: @ %bb.0: @ %entry
9 ; CHECK-NEXT: .save {r4, lr}
10 ; CHECK-NEXT: push {r4, lr}
11 ; CHECK-NEXT: ldrd lr, r12, [r0]
12 ; CHECK-NEXT: ldrd r3, r2, [r0, #8]
13 ; CHECK-NEXT: ldrd r4, r0, [r0, #16]
14 ; CHECK-NEXT: vmov q1[2], q1[0], lr, r3
15 ; CHECK-NEXT: str r2, [r1, #16]
16 ; CHECK-NEXT: vmov.32 q0[0], r4
17 ; CHECK-NEXT: vmov q1[3], q1[1], r12, r2
18 ; CHECK-NEXT: vmov.32 q0[1], r0
19 ; CHECK-NEXT: vmov.f32 s8, s4
20 ; CHECK-NEXT: vmov.f32 s9, s6
21 ; CHECK-NEXT: str r0, [r1, #20]
22 ; CHECK-NEXT: vmov.f32 s10, s0
23 ; CHECK-NEXT: vmov.f32 s11, s5
24 ; CHECK-NEXT: vstrw.32 q2, [r1]
25 ; CHECK-NEXT: pop {r4, pc}
27 %l1 = load <2 x i32>, ptr %src, align 4
28 %s2 = getelementptr <2 x i32>, ptr %src, i32 1
29 %l2 = load <2 x i32>, ptr %s2, align 4
30 %s3 = getelementptr <2 x i32>, ptr %src, i32 2
31 %l3 = load <2 x i32>, ptr %s3, align 4
; %t1 = a ++ b; %t2 = c widened to 4 lanes (upper lanes undef).
32 %t1 = shufflevector <2 x i32> %l1, <2 x i32> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
33 %t2 = shufflevector <2 x i32> %l3, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; Interleave: lane order a0,b0,c0,a1,b1,c1.
34 %s = shufflevector <4 x i32> %t1, <4 x i32> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
35 store <6 x i32> %s, ptr %dst
; Test: 3-way interleaving store of <4 x i32> vectors.
; Loads three consecutive <4 x i32> vectors (a, b, c) from %src and stores the
; <12 x i32> value a0,b0,c0,...,a3,b3,c3 to %dst.
; The assertions below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script rather than editing by hand.
39 define void @vst3_v4i32(ptr %src, ptr %dst) {
40 ; CHECK-LABEL: vst3_v4i32:
41 ; CHECK: @ %bb.0: @ %entry
42 ; CHECK-NEXT: .vsave {d8, d9}
43 ; CHECK-NEXT: vpush {d8, d9}
44 ; CHECK-NEXT: vldrw.u32 q2, [r0]
45 ; CHECK-NEXT: vldrw.u32 q3, [r0, #16]
46 ; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
47 ; CHECK-NEXT: vmov.f32 s4, s8
48 ; CHECK-NEXT: vmov r0, r2, d0
49 ; CHECK-NEXT: vmov.f32 s5, s12
50 ; CHECK-NEXT: vmov.f32 s7, s9
51 ; CHECK-NEXT: vmov.f32 s16, s13
52 ; CHECK-NEXT: vmov.32 q1[2], r0
53 ; CHECK-NEXT: vmov.f32 s18, s10
54 ; CHECK-NEXT: vstrw.32 q1, [r1]
55 ; CHECK-NEXT: vmov.f32 s19, s14
56 ; CHECK-NEXT: vmov.f32 s0, s2
57 ; CHECK-NEXT: vmov.32 q4[1], r2
58 ; CHECK-NEXT: vmov.f32 s1, s11
59 ; CHECK-NEXT: vstrw.32 q4, [r1, #16]
60 ; CHECK-NEXT: vmov.f32 s2, s15
61 ; CHECK-NEXT: vstrw.32 q0, [r1, #32]
62 ; CHECK-NEXT: vpop {d8, d9}
65 %l1 = load <4 x i32>, ptr %src, align 4
66 %s2 = getelementptr <4 x i32>, ptr %src, i32 1
67 %l2 = load <4 x i32>, ptr %s2, align 4
68 %s3 = getelementptr <4 x i32>, ptr %src, i32 2
69 %l3 = load <4 x i32>, ptr %s3, align 4
; %t1 = a ++ b; %t2 = c widened to 8 lanes (upper lanes undef).
70 %t1 = shufflevector <4 x i32> %l1, <4 x i32> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
71 %t2 = shufflevector <4 x i32> %l3, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; Interleave: lane order a0,b0,c0,a1,b1,c1,...
72 %s = shufflevector <8 x i32> %t1, <8 x i32> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
73 store <12 x i32> %s, ptr %dst
; Test: 3-way interleaving store of <8 x i32> vectors.
; Loads three consecutive <8 x i32> vectors (a, b, c) from %src and stores the
; <24 x i32> value a0,b0,c0,...,a7,b7,c7 to %dst.
; The assertions below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script rather than editing by hand.
77 define void @vst3_v8i32(ptr %src, ptr %dst) {
78 ; CHECK-LABEL: vst3_v8i32:
79 ; CHECK: @ %bb.0: @ %entry
80 ; CHECK-NEXT: .save {r7, lr}
81 ; CHECK-NEXT: push {r7, lr}
82 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
83 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
84 ; CHECK-NEXT: vldrw.u32 q3, [r0]
85 ; CHECK-NEXT: vldrw.u32 q7, [r0, #48]
86 ; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
87 ; CHECK-NEXT: vldrw.u32 q2, [r0, #64]
88 ; CHECK-NEXT: vmov.f32 s4, s12
89 ; CHECK-NEXT: vldrw.u32 q4, [r0, #32]
90 ; CHECK-NEXT: vmov.f32 s12, s28
91 ; CHECK-NEXT: vldrw.u32 q6, [r0, #16]
92 ; CHECK-NEXT: vmov r2, lr, d0
93 ; CHECK-NEXT: vmov r12, r3, d4
94 ; CHECK-NEXT: vmov.f32 s0, s2
95 ; CHECK-NEXT: vmov.f32 s2, s31
96 ; CHECK-NEXT: vmov.f32 s20, s29
97 ; CHECK-NEXT: vmov.f32 s9, s15
98 ; CHECK-NEXT: vmov.f32 s29, s12
99 ; CHECK-NEXT: vmov.f32 s5, s16
100 ; CHECK-NEXT: vmov.f32 s7, s13
101 ; CHECK-NEXT: vmov.f32 s22, s26
102 ; CHECK-NEXT: vmov.32 q1[2], r12
103 ; CHECK-NEXT: vmov.f32 s23, s30
104 ; CHECK-NEXT: vstrw.32 q1, [r1]
105 ; CHECK-NEXT: vmov.f32 s28, s24
106 ; CHECK-NEXT: vmov.32 q5[1], lr
107 ; CHECK-NEXT: vmov.f32 s31, s25
108 ; CHECK-NEXT: vstrw.32 q5, [r1, #64]
109 ; CHECK-NEXT: vmov.f32 s12, s17
110 ; CHECK-NEXT: vmov.32 q7[2], r2
111 ; CHECK-NEXT: vmov.f32 s15, s18
112 ; CHECK-NEXT: vstrw.32 q7, [r1, #48]
113 ; CHECK-NEXT: vmov.f32 s1, s27
114 ; CHECK-NEXT: vmov.32 q3[1], r3
115 ; CHECK-NEXT: vmov.f32 s8, s10
116 ; CHECK-NEXT: vstrw.32 q3, [r1, #16]
117 ; CHECK-NEXT: vmov.f32 s10, s19
118 ; CHECK-NEXT: vstrw.32 q0, [r1, #80]
119 ; CHECK-NEXT: vstrw.32 q2, [r1, #32]
120 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
121 ; CHECK-NEXT: pop {r7, pc}
123 %l1 = load <8 x i32>, ptr %src, align 4
124 %s2 = getelementptr <8 x i32>, ptr %src, i32 1
125 %l2 = load <8 x i32>, ptr %s2, align 4
126 %s3 = getelementptr <8 x i32>, ptr %src, i32 2
127 %l3 = load <8 x i32>, ptr %s3, align 4
; %t1 = a ++ b; %t2 = c widened to 16 lanes (upper lanes undef).
128 %t1 = shufflevector <8 x i32> %l1, <8 x i32> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
129 %t2 = shufflevector <8 x i32> %l3, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; Interleave: lane order a0,b0,c0,a1,b1,c1,...
130 %s = shufflevector <16 x i32> %t1, <16 x i32> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
131 store <24 x i32> %s, ptr %dst
; Test: 3-way interleaving store of <16 x i32> vectors.
; Loads three consecutive <16 x i32> vectors (a, b, c) from %src and stores
; the <48 x i32> value a0,b0,c0,...,a15,b15,c15 to %dst.
; The expected output for this size includes stack spills/reloads (see the
; "16-byte Spill"/"16-byte Reload" annotations in the assertions).
; The assertions below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script rather than editing by hand.
135 define void @vst3_v16i32(ptr %src, ptr %dst) {
136 ; CHECK-LABEL: vst3_v16i32:
137 ; CHECK: @ %bb.0: @ %entry
138 ; CHECK-NEXT: .save {r4, lr}
139 ; CHECK-NEXT: push {r4, lr}
140 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
141 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
142 ; CHECK-NEXT: .pad #160
143 ; CHECK-NEXT: sub sp, #160
144 ; CHECK-NEXT: vldrw.u32 q3, [r0, #160]
145 ; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
146 ; CHECK-NEXT: vldrw.u32 q5, [r0, #128]
147 ; CHECK-NEXT: vldrw.u32 q1, [r0]
148 ; CHECK-NEXT: vstrw.32 q3, [sp, #144] @ 16-byte Spill
149 ; CHECK-NEXT: vldrw.u32 q3, [r0, #144]
150 ; CHECK-NEXT: vmov r12, r3, d10
151 ; CHECK-NEXT: vldrw.u32 q7, [r0, #176]
152 ; CHECK-NEXT: vstrw.32 q3, [sp, #80] @ 16-byte Spill
153 ; CHECK-NEXT: vldrw.u32 q3, [r0, #96]
154 ; CHECK-NEXT: vldrw.u32 q6, [r0, #32]
155 ; CHECK-NEXT: vmov.f32 s8, s1
156 ; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill
157 ; CHECK-NEXT: vldrw.u32 q3, [r0, #80]
158 ; CHECK-NEXT: vmov.f32 s10, s6
159 ; CHECK-NEXT: vldrw.u32 q4, [r0, #112]
160 ; CHECK-NEXT: vmov.f32 s11, s2
161 ; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill
162 ; CHECK-NEXT: vldrw.u32 q3, [r0, #48]
163 ; CHECK-NEXT: vmov.32 q2[1], r3
164 ; CHECK-NEXT: vstrw.32 q6, [sp] @ 16-byte Spill
165 ; CHECK-NEXT: vldrw.u32 q6, [r0, #16]
166 ; CHECK-NEXT: vstrw.32 q2, [r1, #16]
167 ; CHECK-NEXT: vmov.f32 s20, s22
168 ; CHECK-NEXT: vmov.f32 s22, s3
169 ; CHECK-NEXT: vstrw.32 q7, [sp, #48] @ 16-byte Spill
170 ; CHECK-NEXT: vmov.f32 s9, s0
171 ; CHECK-NEXT: vmov.f32 s0, s30
172 ; CHECK-NEXT: vmov.f32 s1, s15
173 ; CHECK-NEXT: vmov.f32 s2, s19
174 ; CHECK-NEXT: vmov.f32 s3, s31
175 ; CHECK-NEXT: vstrw.32 q0, [sp, #112] @ 16-byte Spill
176 ; CHECK-NEXT: vmov.f32 s8, s4
177 ; CHECK-NEXT: vmov.f32 s11, s5
178 ; CHECK-NEXT: vmov.f32 s0, s17
179 ; CHECK-NEXT: vstrw.32 q2, [sp, #128] @ 16-byte Spill
180 ; CHECK-NEXT: vmov.f32 s2, s14
181 ; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload
182 ; CHECK-NEXT: vmov.f32 s3, s18
183 ; CHECK-NEXT: vmov.f32 s21, s7
184 ; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload
185 ; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill
186 ; CHECK-NEXT: vldrw.u32 q0, [sp, #144] @ 16-byte Reload
187 ; CHECK-NEXT: vstrw.32 q5, [r1, #32]
188 ; CHECK-NEXT: vmov.f32 s21, s7
189 ; CHECK-NEXT: vmov.f32 s20, s2
190 ; CHECK-NEXT: vmov.f32 s23, s3
191 ; CHECK-NEXT: vldrw.u32 q0, [sp, #64] @ 16-byte Reload
192 ; CHECK-NEXT: vmov.f32 s22, s11
193 ; CHECK-NEXT: vstrw.32 q5, [sp, #32] @ 16-byte Spill
194 ; CHECK-NEXT: vmov.f32 s21, s16
195 ; CHECK-NEXT: vmov.f32 s23, s13
196 ; CHECK-NEXT: vmov.f32 s16, s9
197 ; CHECK-NEXT: vmov.f32 s19, s10
198 ; CHECK-NEXT: vmov.f32 s13, s8
199 ; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload
200 ; CHECK-NEXT: vmov.f32 s18, s6
201 ; CHECK-NEXT: vmov.f64 d14, d4
202 ; CHECK-NEXT: vmov.f32 s15, s5
203 ; CHECK-NEXT: vmov.f32 s5, s27
204 ; CHECK-NEXT: vmov.f32 s8, s24
205 ; CHECK-NEXT: vmov.f32 s6, s3
206 ; CHECK-NEXT: vmov.f32 s9, s0
207 ; CHECK-NEXT: vmov.f32 s24, s1
208 ; CHECK-NEXT: vmov.f32 s27, s2
209 ; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload
210 ; CHECK-NEXT: vmov r0, r3, d14
211 ; CHECK-NEXT: vldrw.u32 q7, [sp, #48] @ 16-byte Reload
212 ; CHECK-NEXT: vmov.f32 s7, s11
213 ; CHECK-NEXT: vstrw.32 q0, [r1, #128]
214 ; CHECK-NEXT: vmov.f32 s11, s25
215 ; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload
216 ; CHECK-NEXT: vmov.f32 s20, s12
217 ; CHECK-NEXT: vmov.32 q6[1], r3
218 ; CHECK-NEXT: vmov.f32 s12, s4
219 ; CHECK-NEXT: vstrw.32 q6, [r1, #64]
220 ; CHECK-NEXT: vmov.f32 s4, s10
221 ; CHECK-NEXT: vmov.32 q2[2], r0
222 ; CHECK-NEXT: vmov r0, lr, d14
223 ; CHECK-NEXT: vldrw.u32 q7, [sp, #144] @ 16-byte Reload
224 ; CHECK-NEXT: vmov.32 q0[1], lr
225 ; CHECK-NEXT: vmov.32 q5[2], r0
226 ; CHECK-NEXT: vstrw.32 q0, [r1, #160]
227 ; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload
228 ; CHECK-NEXT: vmov r2, r4, d14
229 ; CHECK-NEXT: vstrw.32 q2, [r1, #48]
230 ; CHECK-NEXT: vstrw.32 q0, [r1, #176]
231 ; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload
232 ; CHECK-NEXT: vmov.32 q3[2], r2
233 ; CHECK-NEXT: vmov.32 q4[1], r4
234 ; CHECK-NEXT: vmov.32 q0[2], r12
235 ; CHECK-NEXT: vstrw.32 q1, [r1, #80]
236 ; CHECK-NEXT: vstrw.32 q3, [r1, #96]
237 ; CHECK-NEXT: vstrw.32 q4, [r1, #112]
238 ; CHECK-NEXT: vstrw.32 q5, [r1, #144]
239 ; CHECK-NEXT: vstrw.32 q0, [r1]
240 ; CHECK-NEXT: add sp, #160
241 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
242 ; CHECK-NEXT: pop {r4, pc}
244 %l1 = load <16 x i32>, ptr %src, align 4
245 %s2 = getelementptr <16 x i32>, ptr %src, i32 1
246 %l2 = load <16 x i32>, ptr %s2, align 4
247 %s3 = getelementptr <16 x i32>, ptr %src, i32 2
248 %l3 = load <16 x i32>, ptr %s3, align 4
; %t1 = a ++ b; %t2 = c widened to 32 lanes (upper lanes undef).
249 %t1 = shufflevector <16 x i32> %l1, <16 x i32> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
250 %t2 = shufflevector <16 x i32> %l3, <16 x i32> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; Interleave: lane order a0,b0,c0,a1,b1,c1,...
251 %s = shufflevector <32 x i32> %t1, <32 x i32> %t2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
252 store <48 x i32> %s, ptr %dst
; Test: 3-way interleaving store of <2 x i16> vectors.
; Loads three consecutive <2 x i16> vectors (a, b, c) from %src and stores the
; <6 x i16> value a0,b0,c0,a1,b1,c1 to %dst.
; The assertions below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script rather than editing by hand.
258 define void @vst3_v2i16(ptr %src, ptr %dst) {
259 ; CHECK-LABEL: vst3_v2i16:
260 ; CHECK: @ %bb.0: @ %entry
261 ; CHECK-NEXT: .save {r4, lr}
262 ; CHECK-NEXT: push {r4, lr}
263 ; CHECK-NEXT: ldrh r2, [r0, #10]
264 ; CHECK-NEXT: ldrh r4, [r0, #8]
265 ; CHECK-NEXT: ldrh.w r12, [r0, #2]
266 ; CHECK-NEXT: ldrh.w lr, [r0]
267 ; CHECK-NEXT: vmov q0[2], q0[0], r4, r2
268 ; CHECK-NEXT: ldrh r3, [r0, #6]
269 ; CHECK-NEXT: ldrh r0, [r0, #4]
270 ; CHECK-NEXT: vmov q1[2], q1[0], r0, r3
271 ; CHECK-NEXT: vmov q2, q1
272 ; CHECK-NEXT: vmovnt.i32 q2, q0
273 ; CHECK-NEXT: vmov q0[2], q0[0], lr, r12
274 ; CHECK-NEXT: vmov r0, s10
275 ; CHECK-NEXT: vmov.f32 s1, s4
276 ; CHECK-NEXT: vmov.f32 s3, s2
277 ; CHECK-NEXT: vmov.32 q0[2], r4
278 ; CHECK-NEXT: vstrh.32 q0, [r1]
279 ; CHECK-NEXT: str r0, [r1, #8]
280 ; CHECK-NEXT: pop {r4, pc}
282 %l1 = load <2 x i16>, ptr %src, align 4
283 %s2 = getelementptr <2 x i16>, ptr %src, i32 1
284 %l2 = load <2 x i16>, ptr %s2, align 4
285 %s3 = getelementptr <2 x i16>, ptr %src, i32 2
286 %l3 = load <2 x i16>, ptr %s3, align 4
; %t1 = a ++ b; %t2 = c widened to 4 lanes (upper lanes undef).
287 %t1 = shufflevector <2 x i16> %l1, <2 x i16> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
288 %t2 = shufflevector <2 x i16> %l3, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; Interleave: lane order a0,b0,c0,a1,b1,c1.
289 %s = shufflevector <4 x i16> %t1, <4 x i16> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
290 store <6 x i16> %s, ptr %dst
; Test: 3-way interleaving store of <4 x i16> vectors.
; Loads three consecutive <4 x i16> vectors (a, b, c) from %src and stores the
; <12 x i16> value a0,b0,c0,...,a3,b3,c3 to %dst.
; The assertions below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script rather than editing by hand.
294 define void @vst3_v4i16(ptr %src, ptr %dst) {
295 ; CHECK-LABEL: vst3_v4i16:
296 ; CHECK: @ %bb.0: @ %entry
297 ; CHECK-NEXT: .save {r4, r5, r7, lr}
298 ; CHECK-NEXT: push {r4, r5, r7, lr}
299 ; CHECK-NEXT: vldrh.u32 q1, [r0]
300 ; CHECK-NEXT: vldrh.u32 q0, [r0, #8]
301 ; CHECK-NEXT: vldrh.u32 q2, [r0, #16]
302 ; CHECK-NEXT: vmov r0, r5, d2
303 ; CHECK-NEXT: vmov.f32 s5, s7
304 ; CHECK-NEXT: vmov r2, r3, d0
305 ; CHECK-NEXT: vmov lr, r4, d1
306 ; CHECK-NEXT: vmov.16 q0[0], r0
307 ; CHECK-NEXT: vmov.f32 s4, s10
308 ; CHECK-NEXT: vmov.16 q0[1], r2
309 ; CHECK-NEXT: vmov.f32 s7, s11
310 ; CHECK-NEXT: vmov r12, s6
311 ; CHECK-NEXT: vmov.32 q1[2], r4
312 ; CHECK-NEXT: vmov r0, r4, d4
313 ; CHECK-NEXT: vstrh.32 q1, [r1, #16]
314 ; CHECK-NEXT: vmov.16 q0[2], r0
315 ; CHECK-NEXT: vmov.16 q0[3], r5
316 ; CHECK-NEXT: vmov.16 q0[4], r3
317 ; CHECK-NEXT: vmov.16 q0[5], r4
318 ; CHECK-NEXT: vmov.16 q0[6], r12
319 ; CHECK-NEXT: vmov.16 q0[7], lr
320 ; CHECK-NEXT: vstrw.32 q0, [r1]
321 ; CHECK-NEXT: pop {r4, r5, r7, pc}
323 %l1 = load <4 x i16>, ptr %src, align 4
324 %s2 = getelementptr <4 x i16>, ptr %src, i32 1
325 %l2 = load <4 x i16>, ptr %s2, align 4
326 %s3 = getelementptr <4 x i16>, ptr %src, i32 2
327 %l3 = load <4 x i16>, ptr %s3, align 4
; %t1 = a ++ b; %t2 = c widened to 8 lanes (upper lanes undef).
328 %t1 = shufflevector <4 x i16> %l1, <4 x i16> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
329 %t2 = shufflevector <4 x i16> %l3, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; Interleave: lane order a0,b0,c0,a1,b1,c1,...
330 %s = shufflevector <8 x i16> %t1, <8 x i16> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
331 store <12 x i16> %s, ptr %dst
; Test: 3-way interleaving store of <8 x i16> vectors.
; Loads three consecutive <8 x i16> vectors (a, b, c) from %src and stores the
; <24 x i16> value a0,b0,c0,...,a7,b7,c7 to %dst.
; The assertions below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script rather than editing by hand.
335 define void @vst3_v8i16(ptr %src, ptr %dst) {
336 ; CHECK-LABEL: vst3_v8i16:
337 ; CHECK: @ %bb.0: @ %entry
338 ; CHECK-NEXT: .vsave {d8, d9, d10, d11}
339 ; CHECK-NEXT: vpush {d8, d9, d10, d11}
340 ; CHECK-NEXT: vldrw.u32 q1, [r0]
341 ; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
342 ; CHECK-NEXT: vmov.f32 s12, s7
343 ; CHECK-NEXT: vmov.u16 r2, q2[5]
344 ; CHECK-NEXT: vmov.16 q0[0], r2
345 ; CHECK-NEXT: vins.f16 s12, s11
346 ; CHECK-NEXT: vmov.f32 s1, s12
347 ; CHECK-NEXT: vmov.u16 r2, q2[7]
348 ; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
349 ; CHECK-NEXT: vmov.16 q0[6], r2
350 ; CHECK-NEXT: vmov.f32 s2, s7
351 ; CHECK-NEXT: vmov.u16 r0, q2[3]
352 ; CHECK-NEXT: vmovx.f16 s7, s14
353 ; CHECK-NEXT: vmov.16 q4[2], r0
354 ; CHECK-NEXT: vins.f16 s0, s7
355 ; CHECK-NEXT: vmovx.f16 s7, s15
356 ; CHECK-NEXT: vins.f16 s3, s7
357 ; CHECK-NEXT: vmov.f32 s7, s6
358 ; CHECK-NEXT: vmovx.f16 s2, s2
359 ; CHECK-NEXT: vins.f16 s7, s10
360 ; CHECK-NEXT: vmov.f32 s20, s4
361 ; CHECK-NEXT: vins.f16 s15, s2
362 ; CHECK-NEXT: vmov.f32 s18, s7
363 ; CHECK-NEXT: vins.f16 s20, s8
364 ; CHECK-NEXT: vmov.f32 s7, s6
365 ; CHECK-NEXT: vmovx.f16 s6, s5
366 ; CHECK-NEXT: vmov.f32 s2, s15
367 ; CHECK-NEXT: vmovx.f16 s15, s13
368 ; CHECK-NEXT: vins.f16 s13, s6
369 ; CHECK-NEXT: vmovx.f16 s6, s7
370 ; CHECK-NEXT: vmov.u16 r0, q2[1]
371 ; CHECK-NEXT: vmovx.f16 s4, s4
372 ; CHECK-NEXT: vins.f16 s14, s6
373 ; CHECK-NEXT: vmovx.f16 s6, s12
374 ; CHECK-NEXT: vmov.16 q5[4], r0
375 ; CHECK-NEXT: vins.f16 s5, s9
376 ; CHECK-NEXT: vins.f16 s12, s4
377 ; CHECK-NEXT: vins.f16 s17, s15
378 ; CHECK-NEXT: vmov.f32 s16, s13
379 ; CHECK-NEXT: vins.f16 s22, s6
380 ; CHECK-NEXT: vmov.f32 s19, s14
381 ; CHECK-NEXT: vstrw.32 q0, [r1, #32]
382 ; CHECK-NEXT: vmov.f32 s23, s5
383 ; CHECK-NEXT: vstrw.32 q4, [r1, #16]
384 ; CHECK-NEXT: vmov.f32 s21, s12
385 ; CHECK-NEXT: vstrw.32 q5, [r1]
386 ; CHECK-NEXT: vpop {d8, d9, d10, d11}
389 %l1 = load <8 x i16>, ptr %src, align 4
390 %s2 = getelementptr <8 x i16>, ptr %src, i32 1
391 %l2 = load <8 x i16>, ptr %s2, align 4
392 %s3 = getelementptr <8 x i16>, ptr %src, i32 2
393 %l3 = load <8 x i16>, ptr %s3, align 4
; %t1 = a ++ b; %t2 = c widened to 16 lanes (upper lanes undef).
394 %t1 = shufflevector <8 x i16> %l1, <8 x i16> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
395 %t2 = shufflevector <8 x i16> %l3, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; Interleave: lane order a0,b0,c0,a1,b1,c1,...
396 %s = shufflevector <16 x i16> %t1, <16 x i16> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
397 store <24 x i16> %s, ptr %dst
; Test: 3-way interleaving store of <16 x i16> vectors.
; Loads three consecutive <16 x i16> vectors (a, b, c) from %src and stores
; the <48 x i16> value a0,b0,c0,...,a15,b15,c15 to %dst.
; The expected output for this size includes stack spills/reloads (see the
; "16-byte Spill"/"16-byte Reload" annotations in the assertions).
; The assertions below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script rather than editing by hand.
401 define void @vst3_v16i16(ptr %src, ptr %dst) {
402 ; CHECK-LABEL: vst3_v16i16:
403 ; CHECK: @ %bb.0: @ %entry
404 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
405 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
406 ; CHECK-NEXT: .pad #48
407 ; CHECK-NEXT: sub sp, #48
408 ; CHECK-NEXT: vldrw.u32 q2, [r0]
409 ; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
410 ; CHECK-NEXT: vldrw.u32 q7, [r0, #80]
411 ; CHECK-NEXT: vmov.f32 s0, s11
412 ; CHECK-NEXT: vmov.u16 r2, q1[5]
413 ; CHECK-NEXT: vmov.16 q3[0], r2
414 ; CHECK-NEXT: vins.f16 s0, s7
415 ; CHECK-NEXT: vmov.f32 s2, s11
416 ; CHECK-NEXT: vmov.u16 r2, q1[7]
417 ; CHECK-NEXT: vmov.f64 d12, d4
418 ; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill
419 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
420 ; CHECK-NEXT: vmov.f32 s26, s10
421 ; CHECK-NEXT: vldrw.u32 q2, [r0, #64]
422 ; CHECK-NEXT: vmov.f32 s13, s0
423 ; CHECK-NEXT: vstrw.32 q6, [sp] @ 16-byte Spill
424 ; CHECK-NEXT: vmov.16 q3[6], r2
425 ; CHECK-NEXT: vmovx.f16 s0, s10
426 ; CHECK-NEXT: vins.f16 s12, s0
427 ; CHECK-NEXT: vmovx.f16 s0, s2
428 ; CHECK-NEXT: vmov.f32 s14, s11
429 ; CHECK-NEXT: vins.f16 s14, s0
430 ; CHECK-NEXT: vmov.f32 s20, s7
431 ; CHECK-NEXT: vmov q0, q3
432 ; CHECK-NEXT: vldrw.u32 q3, [r0, #48]
433 ; CHECK-NEXT: vmov.u16 r2, q3[5]
434 ; CHECK-NEXT: vins.f16 s20, s15
435 ; CHECK-NEXT: vmov.16 q4[0], r2
436 ; CHECK-NEXT: vmov.u16 r2, q3[7]
437 ; CHECK-NEXT: vmov.f32 s17, s20
438 ; CHECK-NEXT: vmovx.f16 s20, s31
439 ; CHECK-NEXT: vmov.16 q4[6], r2
440 ; CHECK-NEXT: vmov.f32 s18, s7
441 ; CHECK-NEXT: vmovx.f16 s7, s30
442 ; CHECK-NEXT: vins.f16 s16, s7
443 ; CHECK-NEXT: vmovx.f16 s7, s18
444 ; CHECK-NEXT: vins.f16 s31, s7
445 ; CHECK-NEXT: vmovx.f16 s7, s11
446 ; CHECK-NEXT: vins.f16 s3, s7
447 ; CHECK-NEXT: vins.f16 s19, s20
448 ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
449 ; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload
450 ; CHECK-NEXT: vmov.f32 s20, s24
451 ; CHECK-NEXT: vmovx.f16 s11, s8
452 ; CHECK-NEXT: vmov.f32 s7, s25
453 ; CHECK-NEXT: vins.f16 s20, s0
454 ; CHECK-NEXT: vmov.u16 r0, q0[1]
455 ; CHECK-NEXT: vins.f16 s7, s1
456 ; CHECK-NEXT: vmov.16 q5[4], r0
457 ; CHECK-NEXT: vmov.u16 r0, q3[1]
458 ; CHECK-NEXT: vmov.f32 s23, s7
459 ; CHECK-NEXT: vmovx.f16 s7, s24
460 ; CHECK-NEXT: vmov.f32 s24, s4
461 ; CHECK-NEXT: vins.f16 s8, s7
462 ; CHECK-NEXT: vins.f16 s24, s12
463 ; CHECK-NEXT: vmov.f32 s21, s8
464 ; CHECK-NEXT: vmov.f32 s8, s5
465 ; CHECK-NEXT: vmov.16 q6[4], r0
466 ; CHECK-NEXT: vins.f16 s8, s13
467 ; CHECK-NEXT: vmovx.f16 s4, s4
468 ; CHECK-NEXT: vmov.f32 s27, s8
469 ; CHECK-NEXT: vmovx.f16 s8, s28
470 ; CHECK-NEXT: vins.f16 s28, s4
471 ; CHECK-NEXT: vmov.f32 s4, s6
472 ; CHECK-NEXT: vmov.u16 r0, q3[3]
473 ; CHECK-NEXT: vins.f16 s4, s14
474 ; CHECK-NEXT: vmov.16 q0[2], r0
475 ; CHECK-NEXT: vins.f16 s26, s8
476 ; CHECK-NEXT: vmov.f32 s2, s4
477 ; CHECK-NEXT: vmovx.f16 s4, s29
478 ; CHECK-NEXT: vins.f16 s1, s4
479 ; CHECK-NEXT: vmovx.f16 s4, s6
480 ; CHECK-NEXT: vmovx.f16 s0, s5
481 ; CHECK-NEXT: vins.f16 s30, s4
482 ; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload
483 ; CHECK-NEXT: vins.f16 s29, s0
484 ; CHECK-NEXT: vmov.f32 s0, s29
485 ; CHECK-NEXT: vins.f16 s22, s11
486 ; CHECK-NEXT: vmov.f32 s3, s30
487 ; CHECK-NEXT: vstrw.32 q5, [r1]
488 ; CHECK-NEXT: vmov.f32 s29, s5
489 ; CHECK-NEXT: vstrw.32 q0, [r1, #64]
490 ; CHECK-NEXT: vmov.f32 s30, s6
491 ; CHECK-NEXT: vmov.f32 s8, s6
492 ; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload
493 ; CHECK-NEXT: vmov.f32 s18, s31
494 ; CHECK-NEXT: vmov.u16 r0, q1[3]
495 ; CHECK-NEXT: vins.f16 s8, s6
496 ; CHECK-NEXT: vmov.16 q1[2], r0
497 ; CHECK-NEXT: vmov.f32 s25, s28
498 ; CHECK-NEXT: vmov.f32 s6, s8
499 ; CHECK-NEXT: vmovx.f16 s8, s9
500 ; CHECK-NEXT: vmovx.f16 s4, s29
501 ; CHECK-NEXT: vins.f16 s5, s8
502 ; CHECK-NEXT: vmovx.f16 s8, s30
503 ; CHECK-NEXT: vins.f16 s9, s4
504 ; CHECK-NEXT: vins.f16 s10, s8
505 ; CHECK-NEXT: vmov.f32 s4, s9
506 ; CHECK-NEXT: vmov.f32 s7, s10
507 ; CHECK-NEXT: vstrw.32 q6, [r1, #48]
508 ; CHECK-NEXT: vstrw.32 q1, [r1, #16]
509 ; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload
510 ; CHECK-NEXT: vstrw.32 q4, [r1, #80]
511 ; CHECK-NEXT: vstrw.32 q1, [r1, #32]
512 ; CHECK-NEXT: add sp, #48
513 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
516 %l1 = load <16 x i16>, ptr %src, align 4
517 %s2 = getelementptr <16 x i16>, ptr %src, i32 1
518 %l2 = load <16 x i16>, ptr %s2, align 4
519 %s3 = getelementptr <16 x i16>, ptr %src, i32 2
520 %l3 = load <16 x i16>, ptr %s3, align 4
; %t1 = a ++ b; %t2 = c widened to 32 lanes (upper lanes undef).
521 %t1 = shufflevector <16 x i16> %l1, <16 x i16> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
522 %t2 = shufflevector <16 x i16> %l3, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; Interleave: lane order a0,b0,c0,a1,b1,c1,...
523 %s = shufflevector <32 x i16> %t1, <32 x i16> %t2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
524 store <48 x i16> %s, ptr %dst
; Test: 3-way interleaving store of <2 x i8> vectors.
; Loads three consecutive <2 x i8> vectors (a, b, c) from %src and stores the
; <6 x i8> value a0,b0,c0,a1,b1,c1 to %dst.
; The assertions below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script rather than editing by hand.
530 define void @vst3_v2i8(ptr %src, ptr %dst) {
531 ; CHECK-LABEL: vst3_v2i8:
532 ; CHECK: @ %bb.0: @ %entry
533 ; CHECK-NEXT: .save {r4, r5, r7, lr}
534 ; CHECK-NEXT: push {r4, r5, r7, lr}
535 ; CHECK-NEXT: .pad #16
536 ; CHECK-NEXT: sub sp, #16
537 ; CHECK-NEXT: ldrb r2, [r0]
538 ; CHECK-NEXT: mov r5, sp
539 ; CHECK-NEXT: ldrb r3, [r0, #2]
540 ; CHECK-NEXT: vmov.16 q0[0], r2
541 ; CHECK-NEXT: ldrb.w r12, [r0, #1]
542 ; CHECK-NEXT: ldrb.w lr, [r0, #3]
543 ; CHECK-NEXT: vmov.16 q0[1], r3
544 ; CHECK-NEXT: ldrb r4, [r0, #5]
545 ; CHECK-NEXT: ldrb r0, [r0, #4]
546 ; CHECK-NEXT: vmov.16 q0[2], r0
547 ; CHECK-NEXT: add r0, sp, #8
548 ; CHECK-NEXT: vmov.16 q0[3], r12
549 ; CHECK-NEXT: vmov.16 q0[4], lr
550 ; CHECK-NEXT: vmov.16 q0[5], r4
551 ; CHECK-NEXT: vstrb.16 q0, [r5]
552 ; CHECK-NEXT: vstrb.16 q0, [r0]
553 ; CHECK-NEXT: vldrh.u32 q0, [r0]
554 ; CHECK-NEXT: ldr r2, [sp]
555 ; CHECK-NEXT: str r2, [r1]
556 ; CHECK-NEXT: vmov r0, s2
557 ; CHECK-NEXT: strh r0, [r1, #4]
558 ; CHECK-NEXT: add sp, #16
559 ; CHECK-NEXT: pop {r4, r5, r7, pc}
561 %l1 = load <2 x i8>, ptr %src, align 4
562 %s2 = getelementptr <2 x i8>, ptr %src, i32 1
563 %l2 = load <2 x i8>, ptr %s2, align 4
564 %s3 = getelementptr <2 x i8>, ptr %src, i32 2
565 %l3 = load <2 x i8>, ptr %s3, align 4
; %t1 = a ++ b; %t2 = c widened to 4 lanes (upper lanes undef).
566 %t1 = shufflevector <2 x i8> %l1, <2 x i8> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
567 %t2 = shufflevector <2 x i8> %l3, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; Interleave: lane order a0,b0,c0,a1,b1,c1.
568 %s = shufflevector <4 x i8> %t1, <4 x i8> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
569 store <6 x i8> %s, ptr %dst
; Test: 3-way interleaving store of <4 x i8> vectors.
; Loads three consecutive <4 x i8> vectors (a, b, c) from %src and stores the
; <12 x i8> value a0,b0,c0,...,a3,b3,c3 to %dst.
; The assertions below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script rather than editing by hand.
573 define void @vst3_v4i8(ptr %src, ptr %dst) {
574 ; CHECK-LABEL: vst3_v4i8:
575 ; CHECK: @ %bb.0: @ %entry
576 ; CHECK-NEXT: .save {r4, r5, r6, lr}
577 ; CHECK-NEXT: push {r4, r5, r6, lr}
578 ; CHECK-NEXT: vldrb.u32 q0, [r0, #4]
579 ; CHECK-NEXT: vldrb.u32 q1, [r0]
580 ; CHECK-NEXT: vmov r2, lr, d0
581 ; CHECK-NEXT: vmov r12, r3, d1
582 ; CHECK-NEXT: vldrb.u32 q0, [r0, #8]
583 ; CHECK-NEXT: vmov r0, r6, d3
584 ; CHECK-NEXT: vmov r4, r5, d1
585 ; CHECK-NEXT: vmov.8 q2[8], r4
586 ; CHECK-NEXT: vmov.8 q2[9], r6
587 ; CHECK-NEXT: vmov.8 q2[10], r3
588 ; CHECK-NEXT: vmov.8 q2[11], r5
589 ; CHECK-NEXT: vmov r3, s10
590 ; CHECK-NEXT: str r3, [r1, #8]
591 ; CHECK-NEXT: vmov r3, r4, d2
592 ; CHECK-NEXT: vmov.16 q1[0], r3
593 ; CHECK-NEXT: vmov r3, r5, d0
594 ; CHECK-NEXT: vmov.16 q1[1], r2
595 ; CHECK-NEXT: vmov.16 q1[2], r3
596 ; CHECK-NEXT: vmov.16 q1[3], r4
597 ; CHECK-NEXT: vmov.16 q1[4], lr
598 ; CHECK-NEXT: vmov.16 q1[5], r5
599 ; CHECK-NEXT: vmov.16 q1[6], r0
600 ; CHECK-NEXT: vmov.16 q1[7], r12
601 ; CHECK-NEXT: vstrb.16 q1, [r1]
602 ; CHECK-NEXT: pop {r4, r5, r6, pc}
604 %l1 = load <4 x i8>, ptr %src, align 4
605 %s2 = getelementptr <4 x i8>, ptr %src, i32 1
606 %l2 = load <4 x i8>, ptr %s2, align 4
607 %s3 = getelementptr <4 x i8>, ptr %src, i32 2
608 %l3 = load <4 x i8>, ptr %s3, align 4
; %t1 = a ++ b; %t2 = c widened to 8 lanes (upper lanes undef).
609 %t1 = shufflevector <4 x i8> %l1, <4 x i8> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
610 %t2 = shufflevector <4 x i8> %l3, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; Interleave: lane order a0,b0,c0,a1,b1,c1,...
611 %s = shufflevector <8 x i8> %t1, <8 x i8> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
612 store <12 x i8> %s, ptr %dst
; Test: 3-way interleaving store of <8 x i8> vectors.
; Loads three consecutive <8 x i8> vectors (a, b, c) from %src and stores the
; <24 x i8> value a0,b0,c0,...,a7,b7,c7 to %dst.
; The assertions below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script rather than editing by hand.
616 define void @vst3_v8i8(ptr %src, ptr %dst) {
617 ; CHECK-LABEL: vst3_v8i8:
618 ; CHECK: @ %bb.0: @ %entry
619 ; CHECK-NEXT: .vsave {d8, d9}
620 ; CHECK-NEXT: vpush {d8, d9}
621 ; CHECK-NEXT: vldrb.u16 q2, [r0, #8]
622 ; CHECK-NEXT: vldrb.u16 q1, [r0, #16]
623 ; CHECK-NEXT: vldrb.u16 q3, [r0]
624 ; CHECK-NEXT: vmovx.f16 s2, s6
625 ; CHECK-NEXT: vmovx.f16 s0, s10
626 ; CHECK-NEXT: vins.f16 s0, s2
627 ; CHECK-NEXT: vins.f16 s1, s11
628 ; CHECK-NEXT: vmovx.f16 s2, s7
629 ; CHECK-NEXT: vmovx.f16 s3, s11
630 ; CHECK-NEXT: vins.f16 s3, s2
631 ; CHECK-NEXT: vmovx.f16 s16, s1
632 ; CHECK-NEXT: vmov.f32 s1, s15
633 ; CHECK-NEXT: vmovx.f16 s18, s15
634 ; CHECK-NEXT: vmov.f32 s2, s7
635 ; CHECK-NEXT: vmov.u16 r0, q3[0]
636 ; CHECK-NEXT: vins.f16 s1, s16
637 ; CHECK-NEXT: vins.f16 s2, s18
638 ; CHECK-NEXT: vmov.8 q4[0], r0
639 ; CHECK-NEXT: vmov.u16 r0, q2[0]
640 ; CHECK-NEXT: vmov.8 q4[1], r0
641 ; CHECK-NEXT: vmov.u16 r0, q1[0]
642 ; CHECK-NEXT: vmov.8 q4[2], r0
643 ; CHECK-NEXT: vmov.u16 r0, q3[1]
644 ; CHECK-NEXT: vmov.8 q4[3], r0
645 ; CHECK-NEXT: vmov.u16 r0, q2[1]
646 ; CHECK-NEXT: vmov.8 q4[4], r0
647 ; CHECK-NEXT: vmov.u16 r0, q1[1]
648 ; CHECK-NEXT: vmov.8 q4[5], r0
649 ; CHECK-NEXT: vmov.u16 r0, q3[2]
650 ; CHECK-NEXT: vmov.8 q4[6], r0
651 ; CHECK-NEXT: vmov.u16 r0, q2[2]
652 ; CHECK-NEXT: vmov.8 q4[7], r0
653 ; CHECK-NEXT: vmov.u16 r0, q1[2]
654 ; CHECK-NEXT: vmov.8 q4[8], r0
655 ; CHECK-NEXT: vmov.u16 r0, q3[3]
656 ; CHECK-NEXT: vmov.8 q4[9], r0
657 ; CHECK-NEXT: vmov.u16 r0, q2[3]
658 ; CHECK-NEXT: vmov.8 q4[10], r0
659 ; CHECK-NEXT: vmov.u16 r0, q1[3]
660 ; CHECK-NEXT: vmov.8 q4[11], r0
661 ; CHECK-NEXT: vmov.u16 r0, q3[4]
662 ; CHECK-NEXT: vmov.8 q4[12], r0
663 ; CHECK-NEXT: vmov.u16 r0, q2[4]
664 ; CHECK-NEXT: vmov.8 q4[13], r0
665 ; CHECK-NEXT: vmov.u16 r0, q1[4]
666 ; CHECK-NEXT: vmov.8 q4[14], r0
667 ; CHECK-NEXT: vmov.u16 r0, q3[5]
668 ; CHECK-NEXT: vmov.8 q4[15], r0
669 ; CHECK-NEXT: vstrb.16 q0, [r1, #16]
670 ; CHECK-NEXT: vstrw.32 q4, [r1]
671 ; CHECK-NEXT: vpop {d8, d9}
674 %l1 = load <8 x i8>, ptr %src, align 4
675 %s2 = getelementptr <8 x i8>, ptr %src, i32 1
676 %l2 = load <8 x i8>, ptr %s2, align 4
677 %s3 = getelementptr <8 x i8>, ptr %src, i32 2
678 %l3 = load <8 x i8>, ptr %s3, align 4
; %t1 = a ++ b; %t2 = c widened to 16 lanes (upper lanes undef).
679 %t1 = shufflevector <8 x i8> %l1, <8 x i8> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
680 %t2 = shufflevector <8 x i8> %l3, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; Interleave: lane order a0,b0,c0,a1,b1,c1,...
681 %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
682 store <24 x i8> %s, ptr %dst
686 define void @vst3_v16i8(ptr %src, ptr %dst) {
687 ; CHECK-LABEL: vst3_v16i8:
688 ; CHECK: @ %bb.0: @ %entry
689 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
690 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
691 ; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
692 ; CHECK-NEXT: vldrw.u32 q3, [r0]
693 ; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
694 ; CHECK-NEXT: vmov.u8 r2, q2[5]
695 ; CHECK-NEXT: vmov.8 q4[0], r2
696 ; CHECK-NEXT: vmov.u8 r2, q3[6]
697 ; CHECK-NEXT: vmov.8 q4[2], r2
698 ; CHECK-NEXT: vmov.u8 r2, q2[6]
699 ; CHECK-NEXT: vmov.8 q4[3], r2
700 ; CHECK-NEXT: vmov.u8 r2, q3[7]
701 ; CHECK-NEXT: vmov.8 q4[5], r2
702 ; CHECK-NEXT: vmov.u8 r2, q2[7]
703 ; CHECK-NEXT: vmov.8 q4[6], r2
704 ; CHECK-NEXT: vmov.u8 r2, q3[8]
705 ; CHECK-NEXT: vmov.8 q4[8], r2
706 ; CHECK-NEXT: vmov.u8 r2, q2[8]
707 ; CHECK-NEXT: vmov.8 q4[9], r2
708 ; CHECK-NEXT: vmov.u8 r2, q3[9]
709 ; CHECK-NEXT: vmov.8 q4[11], r2
710 ; CHECK-NEXT: vmov.u8 r2, q2[9]
711 ; CHECK-NEXT: vmov.8 q4[12], r2
712 ; CHECK-NEXT: vmov.u8 r2, q3[10]
713 ; CHECK-NEXT: vmov.8 q4[14], r2
714 ; CHECK-NEXT: vmov.u8 r2, q2[10]
715 ; CHECK-NEXT: vmov.8 q4[15], r2
716 ; CHECK-NEXT: vmov.u8 r0, q1[5]
717 ; CHECK-NEXT: vmov.u8 r2, q4[0]
718 ; CHECK-NEXT: vmov.8 q0[0], r2
719 ; CHECK-NEXT: vmov.8 q0[1], r0
720 ; CHECK-NEXT: vmov.u8 r0, q4[2]
721 ; CHECK-NEXT: vmov.8 q0[2], r0
722 ; CHECK-NEXT: vmov.u8 r0, q4[3]
723 ; CHECK-NEXT: vmov.8 q0[3], r0
724 ; CHECK-NEXT: vmov.u8 r0, q1[6]
725 ; CHECK-NEXT: vmov.8 q0[4], r0
726 ; CHECK-NEXT: vmov.u8 r0, q4[5]
727 ; CHECK-NEXT: vmov.8 q0[5], r0
728 ; CHECK-NEXT: vmov.u8 r0, q4[6]
729 ; CHECK-NEXT: vmov.8 q0[6], r0
730 ; CHECK-NEXT: vmov.u8 r0, q1[7]
731 ; CHECK-NEXT: vmov.8 q0[7], r0
732 ; CHECK-NEXT: vmov.u8 r0, q4[8]
733 ; CHECK-NEXT: vmov.8 q0[8], r0
734 ; CHECK-NEXT: vmov.u8 r0, q4[9]
735 ; CHECK-NEXT: vmov.8 q0[9], r0
736 ; CHECK-NEXT: vmov.u8 r0, q1[8]
737 ; CHECK-NEXT: vmov.8 q0[10], r0
738 ; CHECK-NEXT: vmov.u8 r0, q4[11]
739 ; CHECK-NEXT: vmov.8 q0[11], r0
740 ; CHECK-NEXT: vmov.u8 r0, q4[12]
741 ; CHECK-NEXT: vmov.8 q0[12], r0
742 ; CHECK-NEXT: vmov.u8 r0, q1[9]
743 ; CHECK-NEXT: vmov.8 q0[13], r0
744 ; CHECK-NEXT: vmov.u8 r0, q4[14]
745 ; CHECK-NEXT: vmov.8 q0[14], r0
746 ; CHECK-NEXT: vmov.u8 r0, q4[15]
747 ; CHECK-NEXT: vmov.8 q0[15], r0
748 ; CHECK-NEXT: vmov.u8 r0, q3[0]
749 ; CHECK-NEXT: vmov.8 q5[0], r0
750 ; CHECK-NEXT: vmov.u8 r0, q2[0]
751 ; CHECK-NEXT: vmov.8 q5[1], r0
752 ; CHECK-NEXT: vmov.u8 r0, q3[1]
753 ; CHECK-NEXT: vmov.8 q5[3], r0
754 ; CHECK-NEXT: vmov.u8 r0, q2[1]
755 ; CHECK-NEXT: vmov.8 q5[4], r0
756 ; CHECK-NEXT: vmov.u8 r0, q3[2]
757 ; CHECK-NEXT: vmov.8 q5[6], r0
758 ; CHECK-NEXT: vmov.u8 r0, q2[2]
759 ; CHECK-NEXT: vmov.8 q5[7], r0
760 ; CHECK-NEXT: vmov.u8 r0, q3[3]
761 ; CHECK-NEXT: vmov.8 q5[9], r0
762 ; CHECK-NEXT: vmov.u8 r0, q2[3]
763 ; CHECK-NEXT: vmov.8 q5[10], r0
764 ; CHECK-NEXT: vmov.u8 r0, q3[4]
765 ; CHECK-NEXT: vmov.8 q5[12], r0
766 ; CHECK-NEXT: vmov.u8 r0, q2[4]
767 ; CHECK-NEXT: vmov.8 q5[13], r0
768 ; CHECK-NEXT: vmov.u8 r0, q3[5]
769 ; CHECK-NEXT: vmov.8 q5[15], r0
770 ; CHECK-NEXT: vstrw.32 q0, [r1, #16]
771 ; CHECK-NEXT: vmov.u8 r0, q5[0]
772 ; CHECK-NEXT: vmov.8 q4[0], r0
773 ; CHECK-NEXT: vmov.u8 r0, q5[1]
774 ; CHECK-NEXT: vmov.8 q4[1], r0
775 ; CHECK-NEXT: vmov.u8 r0, q1[0]
776 ; CHECK-NEXT: vmov.8 q4[2], r0
777 ; CHECK-NEXT: vmov.u8 r0, q5[3]
778 ; CHECK-NEXT: vmov.8 q4[3], r0
779 ; CHECK-NEXT: vmov.u8 r0, q5[4]
780 ; CHECK-NEXT: vmov.8 q4[4], r0
781 ; CHECK-NEXT: vmov.u8 r0, q1[1]
782 ; CHECK-NEXT: vmov.8 q4[5], r0
783 ; CHECK-NEXT: vmov.u8 r0, q5[6]
784 ; CHECK-NEXT: vmov.8 q4[6], r0
785 ; CHECK-NEXT: vmov.u8 r0, q5[7]
786 ; CHECK-NEXT: vmov.8 q4[7], r0
787 ; CHECK-NEXT: vmov.u8 r0, q1[2]
788 ; CHECK-NEXT: vmov.8 q4[8], r0
789 ; CHECK-NEXT: vmov.u8 r0, q5[9]
790 ; CHECK-NEXT: vmov.8 q4[9], r0
791 ; CHECK-NEXT: vmov.u8 r0, q5[10]
792 ; CHECK-NEXT: vmov.8 q4[10], r0
793 ; CHECK-NEXT: vmov.u8 r0, q1[3]
794 ; CHECK-NEXT: vmov.8 q4[11], r0
795 ; CHECK-NEXT: vmov.u8 r0, q5[12]
796 ; CHECK-NEXT: vmov.8 q4[12], r0
797 ; CHECK-NEXT: vmov.u8 r0, q5[13]
798 ; CHECK-NEXT: vmov.8 q4[13], r0
799 ; CHECK-NEXT: vmov.u8 r0, q1[4]
800 ; CHECK-NEXT: vmov.8 q4[14], r0
801 ; CHECK-NEXT: vmov.u8 r0, q5[15]
802 ; CHECK-NEXT: vmov.8 q4[15], r0
803 ; CHECK-NEXT: vmov.u8 r0, q1[10]
804 ; CHECK-NEXT: vmov.8 q5[0], r0
805 ; CHECK-NEXT: vmov.u8 r0, q3[11]
806 ; CHECK-NEXT: vmov.8 q6[1], r0
807 ; CHECK-NEXT: vmov.u8 r0, q2[11]
808 ; CHECK-NEXT: vmov.8 q6[2], r0
809 ; CHECK-NEXT: vmov.u8 r0, q3[12]
810 ; CHECK-NEXT: vmov.8 q6[4], r0
811 ; CHECK-NEXT: vmov.u8 r0, q2[12]
812 ; CHECK-NEXT: vmov.8 q6[5], r0
813 ; CHECK-NEXT: vmov.u8 r0, q3[13]
814 ; CHECK-NEXT: vmov.8 q6[7], r0
815 ; CHECK-NEXT: vmov.u8 r0, q2[13]
816 ; CHECK-NEXT: vmov.8 q6[8], r0
817 ; CHECK-NEXT: vmov.u8 r0, q3[14]
818 ; CHECK-NEXT: vmov.8 q6[10], r0
819 ; CHECK-NEXT: vmov.u8 r0, q2[14]
820 ; CHECK-NEXT: vmov.8 q6[11], r0
821 ; CHECK-NEXT: vmov.u8 r0, q3[15]
822 ; CHECK-NEXT: vmov.8 q6[13], r0
823 ; CHECK-NEXT: vmov.u8 r0, q2[15]
824 ; CHECK-NEXT: vmov.8 q6[14], r0
825 ; CHECK-NEXT: vstrw.32 q4, [r1]
826 ; CHECK-NEXT: vmov.u8 r0, q6[1]
827 ; CHECK-NEXT: vmov.8 q5[1], r0
828 ; CHECK-NEXT: vmov.u8 r0, q6[2]
829 ; CHECK-NEXT: vmov.8 q5[2], r0
830 ; CHECK-NEXT: vmov.u8 r0, q1[11]
831 ; CHECK-NEXT: vmov.8 q5[3], r0
832 ; CHECK-NEXT: vmov.u8 r0, q6[4]
833 ; CHECK-NEXT: vmov.8 q5[4], r0
834 ; CHECK-NEXT: vmov.u8 r0, q6[5]
835 ; CHECK-NEXT: vmov.8 q5[5], r0
836 ; CHECK-NEXT: vmov.u8 r0, q1[12]
837 ; CHECK-NEXT: vmov.8 q5[6], r0
838 ; CHECK-NEXT: vmov.u8 r0, q6[7]
839 ; CHECK-NEXT: vmov.8 q5[7], r0
840 ; CHECK-NEXT: vmov.u8 r0, q6[8]
841 ; CHECK-NEXT: vmov.8 q5[8], r0
842 ; CHECK-NEXT: vmov.u8 r0, q1[13]
843 ; CHECK-NEXT: vmov.8 q5[9], r0
844 ; CHECK-NEXT: vmov.u8 r0, q6[10]
845 ; CHECK-NEXT: vmov.8 q5[10], r0
846 ; CHECK-NEXT: vmov.u8 r0, q6[11]
847 ; CHECK-NEXT: vmov.8 q5[11], r0
848 ; CHECK-NEXT: vmov.u8 r0, q1[14]
849 ; CHECK-NEXT: vmov.8 q5[12], r0
850 ; CHECK-NEXT: vmov.u8 r0, q6[13]
851 ; CHECK-NEXT: vmov.8 q5[13], r0
852 ; CHECK-NEXT: vmov.u8 r0, q6[14]
853 ; CHECK-NEXT: vmov.8 q5[14], r0
854 ; CHECK-NEXT: vmov.u8 r0, q1[15]
855 ; CHECK-NEXT: vmov.8 q5[15], r0
856 ; CHECK-NEXT: vstrw.32 q5, [r1, #32]
857 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
860 %l1 = load <16 x i8>, ptr %src, align 4
861 %s2 = getelementptr <16 x i8>, ptr %src, i32 1
862 %l2 = load <16 x i8>, ptr %s2, align 4
863 %s3 = getelementptr <16 x i8>, ptr %src, i32 2
864 %l3 = load <16 x i8>, ptr %s3, align 4
865 %t1 = shufflevector <16 x i8> %l1, <16 x i8> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
866 %t2 = shufflevector <16 x i8> %l3, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
867 %s = shufflevector <32 x i8> %t1, <32 x i8> %t2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
868 store <48 x i8> %s, ptr %dst
; Test vst3_v2i64 - loads three <2 x i64> vectors from consecutive <2 x i64>
; slots of %src, interleaves them element-wise and stores the <6 x i64> result
; to %dst. The expected-output lines below are autogenerated (see the note at
; the top of the file) and should be regenerated rather than edited by hand.
874 define void @vst3_v2i64(ptr %src, ptr %dst) {
875 ; CHECK-LABEL: vst3_v2i64:
876 ; CHECK: @ %bb.0: @ %entry
877 ; CHECK-NEXT: vldrw.u32 q0, [r0]
878 ; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
879 ; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
880 ; CHECK-NEXT: vmov.f64 d6, d2
881 ; CHECK-NEXT: vmov.f64 d7, d1
882 ; CHECK-NEXT: vmov.f64 d1, d4
883 ; CHECK-NEXT: vstrw.32 q3, [r1, #16]
884 ; CHECK-NEXT: vmov.f64 d2, d5
885 ; CHECK-NEXT: vstrw.32 q0, [r1]
886 ; CHECK-NEXT: vstrw.32 q1, [r1, #32]
889 %l1 = load <2 x i64>, ptr %src, align 4
890 %s2 = getelementptr <2 x i64>, ptr %src, i32 1
891 %l2 = load <2 x i64>, ptr %s2, align 4
892 %s3 = getelementptr <2 x i64>, ptr %src, i32 2
893 %l3 = load <2 x i64>, ptr %s3, align 4
; %t1 concatenates %l1 and %l2; %t2 widens %l3 to the same width (upper lanes undef).
894 %t1 = shufflevector <2 x i64> %l1, <2 x i64> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
895 %t2 = shufflevector <2 x i64> %l3, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; Stride-3 interleave, giving l1[0], l2[0], l3[0], l1[1], l2[1], l3[1].
896 %s = shufflevector <4 x i64> %t1, <4 x i64> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
897 store <6 x i64> %s, ptr %dst
; Test vst3_v4i64 - loads three <4 x i64> vectors from consecutive <4 x i64>
; slots of %src, interleaves them element-wise and stores the <12 x i64>
; result to %dst. The expected-output lines below are autogenerated (see the
; note at the top of the file) and should be regenerated rather than edited.
901 define void @vst3_v4i64(ptr %src, ptr %dst) {
902 ; CHECK-LABEL: vst3_v4i64:
903 ; CHECK: @ %bb.0: @ %entry
904 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
905 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
906 ; CHECK-NEXT: vldrw.u32 q7, [r0, #48]
907 ; CHECK-NEXT: vldrw.u32 q6, [r0, #32]
908 ; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
909 ; CHECK-NEXT: vldrw.u32 q1, [r0]
910 ; CHECK-NEXT: vmov.f64 d6, d15
911 ; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
912 ; CHECK-NEXT: vldrw.u32 q4, [r0, #64]
913 ; CHECK-NEXT: vmov.f64 d15, d13
914 ; CHECK-NEXT: vmov.f64 d7, d1
915 ; CHECK-NEXT: vmov.f64 d10, d2
916 ; CHECK-NEXT: vstrw.32 q3, [r1, #80]
917 ; CHECK-NEXT: vmov.f64 d11, d12
918 ; CHECK-NEXT: vmov.f64 d2, d8
919 ; CHECK-NEXT: vstrw.32 q5, [r1]
920 ; CHECK-NEXT: vmov.f64 d1, d5
921 ; CHECK-NEXT: vstrw.32 q1, [r1, #16]
922 ; CHECK-NEXT: vmov.f64 d8, d15
923 ; CHECK-NEXT: vstrw.32 q0, [r1, #64]
924 ; CHECK-NEXT: vmov.f64 d12, d4
925 ; CHECK-NEXT: vstrw.32 q4, [r1, #32]
926 ; CHECK-NEXT: vmov.f64 d13, d14
927 ; CHECK-NEXT: vstrw.32 q6, [r1, #48]
928 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
931 %l1 = load <4 x i64>, ptr %src, align 4
932 %s2 = getelementptr <4 x i64>, ptr %src, i32 1
933 %l2 = load <4 x i64>, ptr %s2, align 4
934 %s3 = getelementptr <4 x i64>, ptr %src, i32 2
935 %l3 = load <4 x i64>, ptr %s3, align 4
; %t1 concatenates %l1 and %l2; %t2 widens %l3 to the same width (upper lanes undef).
936 %t1 = shufflevector <4 x i64> %l1, <4 x i64> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
937 %t2 = shufflevector <4 x i64> %l3, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; Stride-3 interleave, giving l1[i], l2[i], l3[i] for i = 0..3.
938 %s = shufflevector <8 x i64> %t1, <8 x i64> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
939 store <12 x i64> %s, ptr %dst
; Test vst3_v2f32 - loads three <2 x float> vectors from consecutive
; <2 x float> slots of %src, interleaves them element-wise and stores the
; <6 x float> result to %dst. The expected output uses scalar loads, one
; 128-bit vector store for the first four lanes and a strd for the last two.
; The expected-output lines are autogenerated (see the note at the top of the
; file) and should be regenerated rather than edited by hand.
945 define void @vst3_v2f32(ptr %src, ptr %dst) {
946 ; CHECK-LABEL: vst3_v2f32:
947 ; CHECK: @ %bb.0: @ %entry
948 ; CHECK-NEXT: ldr r2, [r0, #20]
949 ; CHECK-NEXT: vldr s0, [r0]
950 ; CHECK-NEXT: vldr s3, [r0, #4]
951 ; CHECK-NEXT: vldr s1, [r0, #8]
952 ; CHECK-NEXT: vldr s2, [r0, #16]
953 ; CHECK-NEXT: ldr r0, [r0, #12]
954 ; CHECK-NEXT: strd r0, r2, [r1, #16]
955 ; CHECK-NEXT: vstrw.32 q0, [r1]
958 %l1 = load <2 x float>, ptr %src, align 4
959 %s2 = getelementptr <2 x float>, ptr %src, i32 1
960 %l2 = load <2 x float>, ptr %s2, align 4
961 %s3 = getelementptr <2 x float>, ptr %src, i32 2
962 %l3 = load <2 x float>, ptr %s3, align 4
; %t1 concatenates %l1 and %l2; %t2 widens %l3 to the same width (upper lanes undef).
963 %t1 = shufflevector <2 x float> %l1, <2 x float> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
964 %t2 = shufflevector <2 x float> %l3, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; Stride-3 interleave, giving l1[0], l2[0], l3[0], l1[1], l2[1], l3[1].
965 %s = shufflevector <4 x float> %t1, <4 x float> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
966 store <6 x float> %s, ptr %dst
; Test vst3_v4f32 - loads three <4 x float> vectors from consecutive
; <4 x float> slots of %src, interleaves them element-wise and stores the
; <12 x float> result to %dst. The expected-output lines below are
; autogenerated (see the note at the top of the file) and should be
; regenerated rather than edited by hand.
970 define void @vst3_v4f32(ptr %src, ptr %dst) {
971 ; CHECK-LABEL: vst3_v4f32:
972 ; CHECK: @ %bb.0: @ %entry
973 ; CHECK-NEXT: .vsave {d8, d9}
974 ; CHECK-NEXT: vpush {d8, d9}
975 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
976 ; CHECK-NEXT: vldrw.u32 q2, [r0]
977 ; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
978 ; CHECK-NEXT: vmov.f32 s12, s1
979 ; CHECK-NEXT: vmov.f32 s13, s5
980 ; CHECK-NEXT: vmov.f32 s18, s4
981 ; CHECK-NEXT: vmov.f32 s4, s6
982 ; CHECK-NEXT: vmov.f32 s14, s10
983 ; CHECK-NEXT: vmov.f32 s15, s2
984 ; CHECK-NEXT: vmov.f32 s16, s8
985 ; CHECK-NEXT: vstrw.32 q3, [r1, #16]
986 ; CHECK-NEXT: vmov.f32 s17, s0
987 ; CHECK-NEXT: vmov.f32 s19, s9
988 ; CHECK-NEXT: vmov.f32 s5, s11
989 ; CHECK-NEXT: vstrw.32 q4, [r1]
990 ; CHECK-NEXT: vmov.f32 s6, s3
991 ; CHECK-NEXT: vstrw.32 q1, [r1, #32]
992 ; CHECK-NEXT: vpop {d8, d9}
995 %l1 = load <4 x float>, ptr %src, align 4
996 %s2 = getelementptr <4 x float>, ptr %src, i32 1
997 %l2 = load <4 x float>, ptr %s2, align 4
998 %s3 = getelementptr <4 x float>, ptr %src, i32 2
999 %l3 = load <4 x float>, ptr %s3, align 4
; %t1 concatenates %l1 and %l2; %t2 widens %l3 to the same width (upper lanes undef).
1000 %t1 = shufflevector <4 x float> %l1, <4 x float> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1001 %t2 = shufflevector <4 x float> %l3, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; Stride-3 interleave, giving l1[i], l2[i], l3[i] for i = 0..3.
1002 %s = shufflevector <8 x float> %t1, <8 x float> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
1003 store <12 x float> %s, ptr %dst
; Test vst3_v8f32 - loads three <8 x float> vectors from consecutive
; <8 x float> slots of %src, interleaves them element-wise and stores the
; <24 x float> result to %dst. The expected output reserves 32 bytes of stack
; and contains 16-byte vector spills and reloads. The expected-output lines
; are autogenerated (see the note at the top of the file) and should be
; regenerated rather than edited by hand.
1007 define void @vst3_v8f32(ptr %src, ptr %dst) {
1008 ; CHECK-LABEL: vst3_v8f32:
1009 ; CHECK: @ %bb.0: @ %entry
1010 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
1011 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
1012 ; CHECK-NEXT: .pad #32
1013 ; CHECK-NEXT: sub sp, #32
1014 ; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
1015 ; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
1016 ; CHECK-NEXT: vldrw.u32 q3, [r0, #16]
1017 ; CHECK-NEXT: vldrw.u32 q1, [r0]
1018 ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
1019 ; CHECK-NEXT: vmov.f32 s0, s2
1020 ; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload
1021 ; CHECK-NEXT: vmov.f32 s1, s15
1022 ; CHECK-NEXT: vmov.f32 s2, s11
1023 ; CHECK-NEXT: vldrw.u32 q7, [r0, #64]
1024 ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
1025 ; CHECK-NEXT: vldrw.u32 q4, [r0, #32]
1026 ; CHECK-NEXT: vmov.f32 s0, s12
1027 ; CHECK-NEXT: vmov.f32 s1, s8
1028 ; CHECK-NEXT: vmov.f32 s3, s13
1029 ; CHECK-NEXT: vmov.f32 s2, s24
1030 ; CHECK-NEXT: vstrw.32 q0, [r1, #48]
1031 ; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
1032 ; CHECK-NEXT: vmov.f32 s20, s4
1033 ; CHECK-NEXT: vmov.f32 s23, s5
1034 ; CHECK-NEXT: vstrw.32 q0, [r1, #80]
1035 ; CHECK-NEXT: vmov.f32 s12, s9
1036 ; CHECK-NEXT: vmov.f32 s15, s10
1037 ; CHECK-NEXT: vmov.f32 s13, s25
1038 ; CHECK-NEXT: vmov.f32 s9, s7
1039 ; CHECK-NEXT: vstrw.32 q3, [r1, #64]
1040 ; CHECK-NEXT: vmov.f32 s21, s16
1041 ; CHECK-NEXT: vmov.f32 s22, s28
1042 ; CHECK-NEXT: vmov.f32 s8, s30
1043 ; CHECK-NEXT: vstrw.32 q5, [r1]
1044 ; CHECK-NEXT: vmov.f32 s10, s19
1045 ; CHECK-NEXT: vmov.f32 s11, s31
1046 ; CHECK-NEXT: vmov.f32 s5, s29
1047 ; CHECK-NEXT: vstrw.32 q2, [r1, #32]
1048 ; CHECK-NEXT: vmov.f32 s4, s17
1049 ; CHECK-NEXT: vmov.f32 s7, s18
1050 ; CHECK-NEXT: vstrw.32 q1, [r1, #16]
1051 ; CHECK-NEXT: add sp, #32
1052 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1055 %l1 = load <8 x float>, ptr %src, align 4
1056 %s2 = getelementptr <8 x float>, ptr %src, i32 1
1057 %l2 = load <8 x float>, ptr %s2, align 4
1058 %s3 = getelementptr <8 x float>, ptr %src, i32 2
1059 %l3 = load <8 x float>, ptr %s3, align 4
; %t1 concatenates %l1 and %l2; %t2 widens %l3 to the same width (upper lanes undef).
1060 %t1 = shufflevector <8 x float> %l1, <8 x float> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1061 %t2 = shufflevector <8 x float> %l3, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; Stride-3 interleave, giving l1[i], l2[i], l3[i] for i = 0..7.
1062 %s = shufflevector <16 x float> %t1, <16 x float> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
1063 store <24 x float> %s, ptr %dst
; Test vst3_v16f32 - loads three <16 x float> vectors from consecutive
; <16 x float> slots of %src, interleaves them element-wise and stores the
; <48 x float> result to %dst. The expected output reserves 128 bytes of
; stack and contains many 16-byte vector spills and reloads. The
; expected-output lines are autogenerated (see the note at the top of the
; file) and should be regenerated rather than edited by hand.
1067 define void @vst3_v16f32(ptr %src, ptr %dst) {
1068 ; CHECK-LABEL: vst3_v16f32:
1069 ; CHECK: @ %bb.0: @ %entry
1070 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
1071 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
1072 ; CHECK-NEXT: .pad #128
1073 ; CHECK-NEXT: sub sp, #128
1074 ; CHECK-NEXT: vldrw.u32 q3, [r0, #176]
1075 ; CHECK-NEXT: vldrw.u32 q2, [r0, #64]
1076 ; CHECK-NEXT: vldrw.u32 q1, [r0]
1077 ; CHECK-NEXT: vldrw.u32 q0, [r0, #128]
1078 ; CHECK-NEXT: vstrw.32 q3, [sp, #112] @ 16-byte Spill
1079 ; CHECK-NEXT: vldrw.u32 q3, [r0, #160]
1080 ; CHECK-NEXT: vmov.f32 s24, s9
1081 ; CHECK-NEXT: vldrw.u32 q5, [r0, #144]
1082 ; CHECK-NEXT: vstrw.32 q3, [sp, #96] @ 16-byte Spill
1083 ; CHECK-NEXT: vldrw.u32 q3, [r0, #96]
1084 ; CHECK-NEXT: vmov.f32 s26, s6
1085 ; CHECK-NEXT: vldrw.u32 q7, [r0, #112]
1086 ; CHECK-NEXT: vstrw.32 q3, [sp, #32] @ 16-byte Spill
1087 ; CHECK-NEXT: vldrw.u32 q3, [r0, #80]
1088 ; CHECK-NEXT: vmov.f32 s27, s10
1089 ; CHECK-NEXT: vldrw.u32 q4, [r0, #48]
1090 ; CHECK-NEXT: vstrw.32 q3, [sp, #48] @ 16-byte Spill
1091 ; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
1092 ; CHECK-NEXT: vmov.f32 s25, s1
1093 ; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill
1094 ; CHECK-NEXT: vldrw.u32 q3, [r0, #16]
1095 ; CHECK-NEXT: vstrw.32 q6, [r1, #16]
1096 ; CHECK-NEXT: vmov.f32 s24, s2
1097 ; CHECK-NEXT: vstrw.32 q3, [sp, #80] @ 16-byte Spill
1098 ; CHECK-NEXT: vmov.f32 s27, s3
1099 ; CHECK-NEXT: vmov.f32 s14, s0
1100 ; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload
1101 ; CHECK-NEXT: vmov.f32 s12, s4
1102 ; CHECK-NEXT: vmov.f32 s15, s5
1103 ; CHECK-NEXT: vmov.f32 s13, s8
1104 ; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill
1105 ; CHECK-NEXT: vmov.f32 s25, s7
1106 ; CHECK-NEXT: vmov.f32 s6, s0
1107 ; CHECK-NEXT: vmov.f32 s13, s1
1108 ; CHECK-NEXT: vmov.f32 s0, s2
1109 ; CHECK-NEXT: vmov.f32 s4, s16
1110 ; CHECK-NEXT: vmov.f32 s5, s28
1111 ; CHECK-NEXT: vmov.f32 s7, s17
1112 ; CHECK-NEXT: vmov.f32 s1, s19
1113 ; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill
1114 ; CHECK-NEXT: vmov.f32 s2, s31
1115 ; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload
1116 ; CHECK-NEXT: vmov.f32 s26, s11
1117 ; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload
1118 ; CHECK-NEXT: vstrw.32 q0, [sp, #112] @ 16-byte Spill
1119 ; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload
1120 ; CHECK-NEXT: vmov.f32 s15, s30
1121 ; CHECK-NEXT: vstrw.32 q6, [r1, #32]
1122 ; CHECK-NEXT: vmov.f32 s17, s1
1123 ; CHECK-NEXT: vldrw.u32 q6, [sp, #80] @ 16-byte Reload
1124 ; CHECK-NEXT: vmov.f32 s30, s0
1125 ; CHECK-NEXT: vmov.f32 s0, s2
1126 ; CHECK-NEXT: vmov.f32 s1, s11
1127 ; CHECK-NEXT: vmov.f32 s2, s7
1128 ; CHECK-NEXT: vmov.f32 s14, s18
1129 ; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill
1130 ; CHECK-NEXT: vmov.f32 s18, s10
1131 ; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload
1132 ; CHECK-NEXT: vmov.f32 s28, s8
1133 ; CHECK-NEXT: vmov.f32 s31, s9
1134 ; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload
1135 ; CHECK-NEXT: vmov.f32 s12, s29
1136 ; CHECK-NEXT: vmov.f32 s29, s4
1137 ; CHECK-NEXT: vstrw.32 q3, [r1, #160]
1138 ; CHECK-NEXT: vmov.f32 s16, s5
1139 ; CHECK-NEXT: vstrw.32 q7, [r1, #96]
1140 ; CHECK-NEXT: vmov.f32 s19, s6
1141 ; CHECK-NEXT: vmov.f32 s4, s8
1142 ; CHECK-NEXT: vstrw.32 q4, [r1, #112]
1143 ; CHECK-NEXT: vmov.f32 s6, s20
1144 ; CHECK-NEXT: vmov.f32 s20, s22
1145 ; CHECK-NEXT: vmov.f32 s5, s0
1146 ; CHECK-NEXT: vmov.f32 s8, s1
1147 ; CHECK-NEXT: vmov.f32 s11, s2
1148 ; CHECK-NEXT: vmov.f32 s22, s3
1149 ; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload
1150 ; CHECK-NEXT: vmov.f32 s7, s9
1151 ; CHECK-NEXT: vstrw.32 q0, [r1, #128]
1152 ; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
1153 ; CHECK-NEXT: vmov.f32 s9, s21
1154 ; CHECK-NEXT: vstrw.32 q1, [r1, #48]
1155 ; CHECK-NEXT: vstrw.32 q0, [r1, #144]
1156 ; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload
1157 ; CHECK-NEXT: vmov.f32 s21, s27
1158 ; CHECK-NEXT: vstrw.32 q2, [r1, #64]
1159 ; CHECK-NEXT: vstrw.32 q0, [r1, #176]
1160 ; CHECK-NEXT: vldrw.u32 q0, [sp, #64] @ 16-byte Reload
1161 ; CHECK-NEXT: vstrw.32 q5, [r1, #80]
1162 ; CHECK-NEXT: vstrw.32 q0, [r1]
1163 ; CHECK-NEXT: add sp, #128
1164 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1167 %l1 = load <16 x float>, ptr %src, align 4
1168 %s2 = getelementptr <16 x float>, ptr %src, i32 1
1169 %l2 = load <16 x float>, ptr %s2, align 4
1170 %s3 = getelementptr <16 x float>, ptr %src, i32 2
1171 %l3 = load <16 x float>, ptr %s3, align 4
; %t1 concatenates %l1 and %l2; %t2 widens %l3 to the same width (upper lanes undef).
1172 %t1 = shufflevector <16 x float> %l1, <16 x float> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1173 %t2 = shufflevector <16 x float> %l3, <16 x float> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; Stride-3 interleave, giving l1[i], l2[i], l3[i] for i = 0..15.
1174 %s = shufflevector <32 x float> %t1, <32 x float> %t2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
1175 store <48 x float> %s, ptr %dst
; Test vst3_v2f16 - loads three <2 x half> vectors from consecutive
; <2 x half> slots of %src, interleaves them element-wise and stores the
; <6 x half> result to %dst. The expected output loads through core
; registers and writes the result back with a three-register stm. The
; expected-output lines are autogenerated (see the note at the top of the
; file) and should be regenerated rather than edited by hand.
1181 define void @vst3_v2f16(ptr %src, ptr %dst) {
1182 ; CHECK-LABEL: vst3_v2f16:
1183 ; CHECK: @ %bb.0: @ %entry
1184 ; CHECK-NEXT: ldrd r2, r3, [r0]
1185 ; CHECK-NEXT: ldr r0, [r0, #8]
1186 ; CHECK-NEXT: vmov.32 q0[0], r2
1187 ; CHECK-NEXT: vmov.32 q0[1], r3
1188 ; CHECK-NEXT: vmov.32 q1[0], r0
1189 ; CHECK-NEXT: vmovx.f16 s2, s0
1190 ; CHECK-NEXT: vmovx.f16 s6, s4
1191 ; CHECK-NEXT: vins.f16 s4, s2
1192 ; CHECK-NEXT: vmovx.f16 s2, s1
1193 ; CHECK-NEXT: vins.f16 s0, s1
1194 ; CHECK-NEXT: vmov.f32 s1, s4
1195 ; CHECK-NEXT: vins.f16 s2, s6
1196 ; CHECK-NEXT: vmov r3, s2
1197 ; CHECK-NEXT: vmov r0, r2, d0
1198 ; CHECK-NEXT: stm r1!, {r0, r2, r3}
1201 %l1 = load <2 x half>, ptr %src, align 4
1202 %s2 = getelementptr <2 x half>, ptr %src, i32 1
1203 %l2 = load <2 x half>, ptr %s2, align 4
1204 %s3 = getelementptr <2 x half>, ptr %src, i32 2
1205 %l3 = load <2 x half>, ptr %s3, align 4
; %t1 concatenates %l1 and %l2; %t2 widens %l3 to the same width (upper lanes undef).
1206 %t1 = shufflevector <2 x half> %l1, <2 x half> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1207 %t2 = shufflevector <2 x half> %l3, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; Stride-3 interleave, giving l1[0], l2[0], l3[0], l1[1], l2[1], l3[1].
1208 %s = shufflevector <4 x half> %t1, <4 x half> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
1209 store <6 x half> %s, ptr %dst
; Test vst3_v4f16 - loads three <4 x half> vectors from consecutive
; <4 x half> slots of %src, interleaves them element-wise and stores the
; <12 x half> result to %dst. The expected output stores the first 16 bytes
; with one vector store and the remaining 8 bytes with a strd. The
; expected-output lines are autogenerated (see the note at the top of the
; file) and should be regenerated rather than edited by hand.
1213 define void @vst3_v4f16(ptr %src, ptr %dst) {
1214 ; CHECK-LABEL: vst3_v4f16:
1215 ; CHECK: @ %bb.0: @ %entry
1216 ; CHECK-NEXT: .save {r4, lr}
1217 ; CHECK-NEXT: push {r4, lr}
1218 ; CHECK-NEXT: ldrd lr, r12, [r0]
1219 ; CHECK-NEXT: ldrd r3, r2, [r0, #8]
1220 ; CHECK-NEXT: ldrd r4, r0, [r0, #16]
1221 ; CHECK-NEXT: vmov q0[2], q0[0], lr, r3
1222 ; CHECK-NEXT: vmov.32 q1[0], r4
1223 ; CHECK-NEXT: vmov q0[3], q0[1], r12, r2
1224 ; CHECK-NEXT: vmov.32 q1[1], r0
1225 ; CHECK-NEXT: vmovx.f16 s9, s3
1226 ; CHECK-NEXT: vmovx.f16 s6, s0
1227 ; CHECK-NEXT: vins.f16 s0, s2
1228 ; CHECK-NEXT: vmovx.f16 s8, s4
1229 ; CHECK-NEXT: vmovx.f16 s2, s2
1230 ; CHECK-NEXT: vins.f16 s4, s6
1231 ; CHECK-NEXT: vmovx.f16 s6, s1
1232 ; CHECK-NEXT: vins.f16 s2, s8
1233 ; CHECK-NEXT: vmovx.f16 s8, s5
1234 ; CHECK-NEXT: vins.f16 s5, s6
1235 ; CHECK-NEXT: vins.f16 s9, s8
1236 ; CHECK-NEXT: vmov.f32 s8, s5
1237 ; CHECK-NEXT: vins.f16 s1, s3
1238 ; CHECK-NEXT: vmov r0, r2, d4
1239 ; CHECK-NEXT: vmov q2, q0
1240 ; CHECK-NEXT: vmov.f32 s9, s4
1241 ; CHECK-NEXT: vmov.f32 s10, s2
1242 ; CHECK-NEXT: vmov.f32 s11, s1
1243 ; CHECK-NEXT: vstrw.32 q2, [r1]
1244 ; CHECK-NEXT: strd r0, r2, [r1, #16]
1245 ; CHECK-NEXT: pop {r4, pc}
1247 %l1 = load <4 x half>, ptr %src, align 4
1248 %s2 = getelementptr <4 x half>, ptr %src, i32 1
1249 %l2 = load <4 x half>, ptr %s2, align 4
1250 %s3 = getelementptr <4 x half>, ptr %src, i32 2
1251 %l3 = load <4 x half>, ptr %s3, align 4
; %t1 concatenates %l1 and %l2; %t2 widens %l3 to the same width (upper lanes undef).
1252 %t1 = shufflevector <4 x half> %l1, <4 x half> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1253 %t2 = shufflevector <4 x half> %l3, <4 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; Stride-3 interleave, giving l1[i], l2[i], l3[i] for i = 0..3.
1254 %s = shufflevector <8 x half> %t1, <8 x half> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
1255 store <12 x half> %s, ptr %dst
; Test vst3_v8f16 - loads three <8 x half> vectors from consecutive
; <8 x half> slots of %src, interleaves them element-wise and stores the
; <24 x half> result to %dst. The expected-output lines below are
; autogenerated (see the note at the top of the file) and should be
; regenerated rather than edited by hand.
1259 define void @vst3_v8f16(ptr %src, ptr %dst) {
1260 ; CHECK-LABEL: vst3_v8f16:
1261 ; CHECK: @ %bb.0: @ %entry
1262 ; CHECK-NEXT: .vsave {d8, d9}
1263 ; CHECK-NEXT: vpush {d8, d9}
1264 ; CHECK-NEXT: vldrw.u32 q4, [r0, #16]
1265 ; CHECK-NEXT: vldrw.u32 q3, [r0]
1266 ; CHECK-NEXT: vmovx.f16 s0, s18
1267 ; CHECK-NEXT: vmov.f32 s4, s15
1268 ; CHECK-NEXT: vmov r2, s0
1269 ; CHECK-NEXT: vins.f16 s4, s19
1270 ; CHECK-NEXT: vmov.16 q0[0], r2
1271 ; CHECK-NEXT: vmovx.f16 s10, s16
1272 ; CHECK-NEXT: vmov.f32 s1, s4
1273 ; CHECK-NEXT: vmovx.f16 s4, s19
1274 ; CHECK-NEXT: vmov r2, s4
1275 ; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
1276 ; CHECK-NEXT: vmov.16 q0[6], r2
1277 ; CHECK-NEXT: vmov r0, s10
1278 ; CHECK-NEXT: vmovx.f16 s8, s6
1279 ; CHECK-NEXT: vmovx.f16 s2, s15
1280 ; CHECK-NEXT: vins.f16 s0, s8
1281 ; CHECK-NEXT: vmovx.f16 s8, s7
1282 ; CHECK-NEXT: vins.f16 s3, s8
1283 ; CHECK-NEXT: vmov.f32 s8, s12
1284 ; CHECK-NEXT: vins.f16 s8, s16
1285 ; CHECK-NEXT: vins.f16 s7, s2
1286 ; CHECK-NEXT: vmov.f32 s2, s13
1287 ; CHECK-NEXT: vmov.16 q2[4], r0
1288 ; CHECK-NEXT: vins.f16 s2, s17
1289 ; CHECK-NEXT: vmov.f32 s11, s2
1290 ; CHECK-NEXT: vmovx.f16 s2, s12
1291 ; CHECK-NEXT: vmovx.f16 s12, s4
1292 ; CHECK-NEXT: vins.f16 s4, s2
1293 ; CHECK-NEXT: vins.f16 s10, s12
1294 ; CHECK-NEXT: vmovx.f16 s12, s17
1295 ; CHECK-NEXT: vmov.f32 s2, s14
1296 ; CHECK-NEXT: vmov r0, s12
1297 ; CHECK-NEXT: vins.f16 s2, s18
1298 ; CHECK-NEXT: vmov.16 q4[2], r0
1299 ; CHECK-NEXT: vmovx.f16 s12, s5
1300 ; CHECK-NEXT: vmov.f32 s18, s2
1301 ; CHECK-NEXT: vmovx.f16 s2, s13
1302 ; CHECK-NEXT: vins.f16 s5, s2
1303 ; CHECK-NEXT: vmovx.f16 s2, s14
1304 ; CHECK-NEXT: vins.f16 s6, s2
1305 ; CHECK-NEXT: vmov.f32 s2, s7
1306 ; CHECK-NEXT: vmov.f32 s9, s4
1307 ; CHECK-NEXT: vins.f16 s17, s12
1308 ; CHECK-NEXT: vmov.f32 s16, s5
1309 ; CHECK-NEXT: vstrw.32 q0, [r1, #32]
1310 ; CHECK-NEXT: vmov.f32 s19, s6
1311 ; CHECK-NEXT: vstrw.32 q2, [r1]
1312 ; CHECK-NEXT: vstrw.32 q4, [r1, #16]
1313 ; CHECK-NEXT: vpop {d8, d9}
1316 %l1 = load <8 x half>, ptr %src, align 4
1317 %s2 = getelementptr <8 x half>, ptr %src, i32 1
1318 %l2 = load <8 x half>, ptr %s2, align 4
1319 %s3 = getelementptr <8 x half>, ptr %src, i32 2
1320 %l3 = load <8 x half>, ptr %s3, align 4
; %t1 concatenates %l1 and %l2; %t2 widens %l3 to the same width (upper lanes undef).
1321 %t1 = shufflevector <8 x half> %l1, <8 x half> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1322 %t2 = shufflevector <8 x half> %l3, <8 x half> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; Stride-3 interleave, giving l1[i], l2[i], l3[i] for i = 0..7.
1323 %s = shufflevector <16 x half> %t1, <16 x half> %t2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
1324 store <24 x half> %s, ptr %dst
; Test vst3_v16f16 - loads three <16 x half> vectors from consecutive
; <16 x half> slots of %src, interleaves them element-wise and stores the
; <48 x half> result to %dst. The expected output reserves 48 bytes of stack
; and contains 16-byte vector spills and reloads. The expected-output lines
; are autogenerated (see the note at the top of the file) and should be
; regenerated rather than edited by hand.
1328 define void @vst3_v16f16(ptr %src, ptr %dst) {
1329 ; CHECK-LABEL: vst3_v16f16:
1330 ; CHECK: @ %bb.0: @ %entry
1331 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
1332 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
1333 ; CHECK-NEXT: .pad #48
1334 ; CHECK-NEXT: sub sp, #48
1335 ; CHECK-NEXT: vldrw.u32 q3, [r0, #16]
1336 ; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
1337 ; CHECK-NEXT: vldrw.u32 q6, [r0, #32]
1338 ; CHECK-NEXT: vmov.f32 s8, s12
1339 ; CHECK-NEXT: vmovx.f16 s2, s4
1340 ; CHECK-NEXT: vmov.f32 s0, s13
1341 ; CHECK-NEXT: vins.f16 s8, s4
1342 ; CHECK-NEXT: vmov r2, s2
1343 ; CHECK-NEXT: vins.f16 s0, s5
1344 ; CHECK-NEXT: vmov.16 q2[4], r2
1345 ; CHECK-NEXT: vmov q4, q3
1346 ; CHECK-NEXT: vmov.f32 s11, s0
1347 ; CHECK-NEXT: vmovx.f16 s0, s16
1348 ; CHECK-NEXT: vmov.f32 s12, s8
1349 ; CHECK-NEXT: vmov.f64 d11, d9
1350 ; CHECK-NEXT: vmov.f32 s21, s17
1351 ; CHECK-NEXT: vmov.f64 d7, d5
1352 ; CHECK-NEXT: vldrw.u32 q2, [r0, #80]
1353 ; CHECK-NEXT: vmovx.f16 s2, s8
1354 ; CHECK-NEXT: vins.f16 s8, s0
1355 ; CHECK-NEXT: vins.f16 s14, s2
1356 ; CHECK-NEXT: vmovx.f16 s2, s24
1357 ; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill
1358 ; CHECK-NEXT: vldrw.u32 q3, [r0]
1359 ; CHECK-NEXT: vmov r2, s2
1360 ; CHECK-NEXT: vmov.f32 s16, s12
1361 ; CHECK-NEXT: vmov.f32 s0, s13
1362 ; CHECK-NEXT: vins.f16 s16, s24
1363 ; CHECK-NEXT: vmov.16 q4[4], r2
1364 ; CHECK-NEXT: vins.f16 s0, s25
1365 ; CHECK-NEXT: vmov.f32 s19, s0
1366 ; CHECK-NEXT: vmovx.f16 s0, s12
1367 ; CHECK-NEXT: vmov.f64 d15, d13
1368 ; CHECK-NEXT: vmov.f32 s17, s13
1369 ; CHECK-NEXT: vmov.f32 s24, s16
1370 ; CHECK-NEXT: vmov.f64 d13, d9
1371 ; CHECK-NEXT: vmov.f64 d9, d7
1372 ; CHECK-NEXT: vldrw.u32 q3, [r0, #64]
1373 ; CHECK-NEXT: vmovx.f16 s2, s12
1374 ; CHECK-NEXT: vins.f16 s12, s0
1375 ; CHECK-NEXT: vins.f16 s26, s2
1376 ; CHECK-NEXT: vmovx.f16 s2, s30
1377 ; CHECK-NEXT: vmov.f32 s0, s19
1378 ; CHECK-NEXT: vstrw.32 q6, [sp, #32] @ 16-byte Spill
1379 ; CHECK-NEXT: vmov r0, s2
1380 ; CHECK-NEXT: vins.f16 s0, s31
1381 ; CHECK-NEXT: vmov.f32 s29, s25
1382 ; CHECK-NEXT: vmov.16 q6[0], r0
1383 ; CHECK-NEXT: vmov.f32 s25, s0
1384 ; CHECK-NEXT: vmovx.f16 s0, s31
1385 ; CHECK-NEXT: vmov r0, s0
1386 ; CHECK-NEXT: vmovx.f16 s0, s14
1387 ; CHECK-NEXT: vmov.16 q6[6], r0
1388 ; CHECK-NEXT: vmovx.f16 s2, s15
1389 ; CHECK-NEXT: vins.f16 s24, s0
1390 ; CHECK-NEXT: vmovx.f16 s0, s19
1391 ; CHECK-NEXT: vins.f16 s15, s0
1392 ; CHECK-NEXT: vmovx.f16 s0, s6
1393 ; CHECK-NEXT: vmov.f32 s4, s23
1394 ; CHECK-NEXT: vins.f16 s27, s2
1395 ; CHECK-NEXT: vmov r0, s0
1396 ; CHECK-NEXT: vins.f16 s4, s7
1397 ; CHECK-NEXT: vmov.16 q0[0], r0
1398 ; CHECK-NEXT: vstrw.32 q7, [sp] @ 16-byte Spill
1399 ; CHECK-NEXT: vmov.f32 s1, s4
1400 ; CHECK-NEXT: vmovx.f16 s4, s7
1401 ; CHECK-NEXT: vmov r0, s4
1402 ; CHECK-NEXT: vmovx.f16 s4, s10
1403 ; CHECK-NEXT: vmov.16 q0[6], r0
1404 ; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload
1405 ; CHECK-NEXT: vins.f16 s0, s4
1406 ; CHECK-NEXT: vmovx.f16 s4, s11
1407 ; CHECK-NEXT: vmovx.f16 s2, s23
1408 ; CHECK-NEXT: vins.f16 s3, s4
1409 ; CHECK-NEXT: vmovx.f16 s4, s5
1410 ; CHECK-NEXT: vins.f16 s11, s2
1411 ; CHECK-NEXT: vmov.f32 s2, s22
1412 ; CHECK-NEXT: vmov r0, s4
1413 ; CHECK-NEXT: vins.f16 s2, s6
1414 ; CHECK-NEXT: vmov.16 q1[2], r0
1415 ; CHECK-NEXT: vmov.f32 s29, s12
1416 ; CHECK-NEXT: vmovx.f16 s4, s21
1417 ; CHECK-NEXT: vmovx.f16 s12, s9
1418 ; CHECK-NEXT: vins.f16 s9, s4
1419 ; CHECK-NEXT: vmovx.f16 s4, s22
1420 ; CHECK-NEXT: vins.f16 s10, s4
1421 ; CHECK-NEXT: vmov.f32 s21, s17
1422 ; CHECK-NEXT: vmov.f32 s22, s18
1423 ; CHECK-NEXT: vins.f16 s5, s12
1424 ; CHECK-NEXT: vmov.f32 s4, s18
1425 ; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload
1426 ; CHECK-NEXT: vstrw.32 q7, [sp, #32] @ 16-byte Spill
1427 ; CHECK-NEXT: vmov.f32 s6, s2
1428 ; CHECK-NEXT: vmovx.f16 s12, s17
1429 ; CHECK-NEXT: vins.f16 s4, s18
1430 ; CHECK-NEXT: vmov r0, s12
1431 ; CHECK-NEXT: vmovx.f16 s12, s13
1432 ; CHECK-NEXT: vmov.16 q7[2], r0
1433 ; CHECK-NEXT: vmov.f32 s2, s11
1434 ; CHECK-NEXT: vmov.f32 s30, s4
1435 ; CHECK-NEXT: vmovx.f16 s4, s21
1436 ; CHECK-NEXT: vins.f16 s13, s4
1437 ; CHECK-NEXT: vmovx.f16 s4, s22
1438 ; CHECK-NEXT: vins.f16 s14, s4
1439 ; CHECK-NEXT: vldrw.u32 q5, [sp, #16] @ 16-byte Reload
1440 ; CHECK-NEXT: vstrw.32 q0, [r1, #80]
1441 ; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload
1442 ; CHECK-NEXT: vmov.f32 s26, s15
1443 ; CHECK-NEXT: vins.f16 s29, s12
1444 ; CHECK-NEXT: vmov.f32 s21, s8
1445 ; CHECK-NEXT: vstrw.32 q6, [r1, #32]
1446 ; CHECK-NEXT: vmov.f32 s4, s9
1447 ; CHECK-NEXT: vstrw.32 q5, [r1, #48]
1448 ; CHECK-NEXT: vmov.f32 s7, s10
1449 ; CHECK-NEXT: vstrw.32 q0, [r1]
1450 ; CHECK-NEXT: vmov.f32 s28, s13
1451 ; CHECK-NEXT: vstrw.32 q1, [r1, #64]
1452 ; CHECK-NEXT: vmov.f32 s31, s14
1453 ; CHECK-NEXT: vstrw.32 q7, [r1, #16]
1454 ; CHECK-NEXT: add sp, #48
1455 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1458 %l1 = load <16 x half>, ptr %src, align 4
1459 %s2 = getelementptr <16 x half>, ptr %src, i32 1
1460 %l2 = load <16 x half>, ptr %s2, align 4
1461 %s3 = getelementptr <16 x half>, ptr %src, i32 2
1462 %l3 = load <16 x half>, ptr %s3, align 4
; %t1 concatenates %l1 and %l2; %t2 widens %l3 to the same width (upper lanes undef).
1463 %t1 = shufflevector <16 x half> %l1, <16 x half> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1464 %t2 = shufflevector <16 x half> %l3, <16 x half> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; Stride-3 interleave, giving l1[i], l2[i], l3[i] for i = 0..15.
1465 %s = shufflevector <32 x half> %t1, <32 x half> %t2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
1466 store <48 x half> %s, ptr %dst
; Test vst3_v2f64 - loads three <2 x double> vectors from consecutive
; <2 x double> slots of %src, interleaves them element-wise and stores the
; <6 x double> result to %dst. The expected instruction sequence is the same
; as the one listed for vst3_v2i64. The expected-output lines are
; autogenerated (see the note at the top of the file) and should be
; regenerated rather than edited by hand.
1472 define void @vst3_v2f64(ptr %src, ptr %dst) {
1473 ; CHECK-LABEL: vst3_v2f64:
1474 ; CHECK: @ %bb.0: @ %entry
1475 ; CHECK-NEXT: vldrw.u32 q0, [r0]
1476 ; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
1477 ; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
1478 ; CHECK-NEXT: vmov.f64 d6, d2
1479 ; CHECK-NEXT: vmov.f64 d7, d1
1480 ; CHECK-NEXT: vmov.f64 d1, d4
1481 ; CHECK-NEXT: vstrw.32 q3, [r1, #16]
1482 ; CHECK-NEXT: vmov.f64 d2, d5
1483 ; CHECK-NEXT: vstrw.32 q0, [r1]
1484 ; CHECK-NEXT: vstrw.32 q1, [r1, #32]
1487 %l1 = load <2 x double>, ptr %src, align 4
1488 %s2 = getelementptr <2 x double>, ptr %src, i32 1
1489 %l2 = load <2 x double>, ptr %s2, align 4
1490 %s3 = getelementptr <2 x double>, ptr %src, i32 2
1491 %l3 = load <2 x double>, ptr %s3, align 4
; %t1 concatenates %l1 and %l2; %t2 widens %l3 to the same width (upper lanes undef).
1492 %t1 = shufflevector <2 x double> %l1, <2 x double> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1493 %t2 = shufflevector <2 x double> %l3, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; Stride-3 interleave, giving l1[0], l2[0], l3[0], l1[1], l2[1], l3[1].
1494 %s = shufflevector <4 x double> %t1, <4 x double> %t2, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
1495 store <6 x double> %s, ptr %dst
; vst3_v4f64: three-way interleaving store of <4 x double> vectors.
; Loads three <4 x double> values from %src (vector indices 0, 1 and 2),
; interleaves them element by element, and stores the resulting
; <12 x double> to %dst.
; The assertion lines below are autogenerated (see the NOTE at the top of
; the file); regenerate them with the update script rather than editing
; them by hand. The expected output saves and restores d8-d15 around the
; body (vpush/vpop).
1499 define void @vst3_v4f64(ptr %src, ptr %dst) {
1500 ; CHECK-LABEL: vst3_v4f64:
1501 ; CHECK: @ %bb.0: @ %entry
1502 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
1503 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
1504 ; CHECK-NEXT: vldrw.u32 q7, [r0, #48]
1505 ; CHECK-NEXT: vldrw.u32 q6, [r0, #32]
1506 ; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
1507 ; CHECK-NEXT: vldrw.u32 q1, [r0]
1508 ; CHECK-NEXT: vmov.f64 d6, d15
1509 ; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
1510 ; CHECK-NEXT: vldrw.u32 q4, [r0, #64]
1511 ; CHECK-NEXT: vmov.f64 d15, d13
1512 ; CHECK-NEXT: vmov.f64 d7, d1
1513 ; CHECK-NEXT: vmov.f64 d10, d2
1514 ; CHECK-NEXT: vstrw.32 q3, [r1, #80]
1515 ; CHECK-NEXT: vmov.f64 d11, d12
1516 ; CHECK-NEXT: vmov.f64 d2, d8
1517 ; CHECK-NEXT: vstrw.32 q5, [r1]
1518 ; CHECK-NEXT: vmov.f64 d1, d5
1519 ; CHECK-NEXT: vstrw.32 q1, [r1, #16]
1520 ; CHECK-NEXT: vmov.f64 d8, d15
1521 ; CHECK-NEXT: vstrw.32 q0, [r1, #64]
1522 ; CHECK-NEXT: vmov.f64 d12, d4
1523 ; CHECK-NEXT: vstrw.32 q4, [r1, #32]
1524 ; CHECK-NEXT: vmov.f64 d13, d14
1525 ; CHECK-NEXT: vstrw.32 q6, [r1, #48]
1526 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1529 %l1 = load <4 x double>, ptr %src, align 4
1530 %s2 = getelementptr <4 x double>, ptr %src, i32 1
1531 %l2 = load <4 x double>, ptr %s2, align 4
1532 %s3 = getelementptr <4 x double>, ptr %src, i32 2
1533 %l3 = load <4 x double>, ptr %s3, align 4
; %t1 = l1 ++ l2 (lanes 0-7). %t2 = l3 widened to eight lanes with the upper
; four lanes undef, so both operands of the final shuffle have the same type.
1534 %t1 = shufflevector <4 x double> %l1, <4 x double> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1535 %t2 = shufflevector <4 x double> %l3, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; Interleave mask: for i = 0..3 emit l1[i], l2[i], l3[i]
; (indices 0-3 = l1, 4-7 = l2, 8-11 = l3).
1536 %s = shufflevector <8 x double> %t1, <8 x double> %t2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
1537 store <12 x double> %s, ptr %dst