1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s
; vld3_v2i32: load <6 x i32>, split it into three stride-3 <2 x i32> subsequences
; (elements {0,3}, {1,4}, {2,5}), add the three together, and store the <2 x i32> sum.
; The assertions above the IR body are autogenerated expected MVE codegen.
6 define void @vld3_v2i32(<6 x i32> *%src, <2 x i32> *%dst) {
7 ; CHECK-LABEL: vld3_v2i32:
8 ; CHECK: @ %bb.0: @ %entry
9 ; CHECK-NEXT: .save {r7, lr}
10 ; CHECK-NEXT: push {r7, lr}
11 ; CHECK-NEXT: vldrw.u32 q0, [r0]
12 ; CHECK-NEXT: ldrd r2, r0, [r0, #16]
13 ; CHECK-NEXT: vmov.f32 s6, s3
14 ; CHECK-NEXT: vmov r12, lr, d0
15 ; CHECK-NEXT: vmov r3, s6
16 ; CHECK-NEXT: add r2, r3
17 ; CHECK-NEXT: add.w r3, r12, lr
18 ; CHECK-NEXT: add r0, r2
19 ; CHECK-NEXT: vmov r2, s2
20 ; CHECK-NEXT: add r2, r3
21 ; CHECK-NEXT: strd r2, r0, [r1]
22 ; CHECK-NEXT: pop {r7, pc}
; IR body: one wide load, three deinterleaving shuffles, two adds, one store.
24 %l1 = load <6 x i32>, <6 x i32>* %src, align 4
25 %s1 = shufflevector <6 x i32> %l1, <6 x i32> undef, <2 x i32> <i32 0, i32 3>
26 %s2 = shufflevector <6 x i32> %l1, <6 x i32> undef, <2 x i32> <i32 1, i32 4>
27 %s3 = shufflevector <6 x i32> %l1, <6 x i32> undef, <2 x i32> <i32 2, i32 5>
28 %a1 = add <2 x i32> %s1, %s2
29 %a = add <2 x i32> %a1, %s3
30 store <2 x i32> %a, <2 x i32> *%dst
; vld3_v4i32: load <12 x i32>, deinterleave into three stride-3 <4 x i32> vectors
; (lanes {0,3,6,9}, {1,4,7,10}, {2,5,8,11}), add them, store the <4 x i32> sum.
; Expected codegen assembles the three factors with vmov.f32 lane moves plus vadd.i32.
34 define void @vld3_v4i32(<12 x i32> *%src, <4 x i32> *%dst) {
35 ; CHECK-LABEL: vld3_v4i32:
36 ; CHECK: @ %bb.0: @ %entry
37 ; CHECK-NEXT: .vsave {d8, d9}
38 ; CHECK-NEXT: vpush {d8, d9}
39 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
40 ; CHECK-NEXT: vldrw.u32 q1, [r0]
41 ; CHECK-NEXT: vldrw.u32 q4, [r0, #32]
42 ; CHECK-NEXT: vmov.f32 s10, s2
43 ; CHECK-NEXT: vmov.f32 s13, s0
44 ; CHECK-NEXT: vmov.f32 s14, s3
45 ; CHECK-NEXT: vmov.f32 s8, s4
46 ; CHECK-NEXT: vmov.f32 s9, s7
47 ; CHECK-NEXT: vmov.f32 s12, s5
48 ; CHECK-NEXT: vmov.f32 s15, s18
49 ; CHECK-NEXT: vmov.f32 s11, s17
50 ; CHECK-NEXT: vadd.i32 q2, q2, q3
51 ; CHECK-NEXT: vmov.f32 s0, s6
52 ; CHECK-NEXT: vmov.f32 s2, s16
53 ; CHECK-NEXT: vmov.f32 s3, s19
54 ; CHECK-NEXT: vadd.i32 q0, q2, q0
55 ; CHECK-NEXT: vstrw.32 q0, [r1]
56 ; CHECK-NEXT: vpop {d8, d9}
59 %l1 = load <12 x i32>, <12 x i32>* %src, align 4
60 %s1 = shufflevector <12 x i32> %l1, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
61 %s2 = shufflevector <12 x i32> %l1, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
62 %s3 = shufflevector <12 x i32> %l1, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
63 %a1 = add <4 x i32> %s1, %s2
64 %a = add <4 x i32> %a1, %s3
65 store <4 x i32> %a, <4 x i32> *%dst
; vld3_v8i32: same stride-3 deinterleave-and-sum pattern as vld3_v4i32, widened to
; <24 x i32> input / <8 x i32> output. Codegen processes the upper 48 bytes first
; (offsets #48..#80), then the lower 48 bytes, storing halves to [r1, #16] and [r1].
69 define void @vld3_v8i32(<24 x i32> *%src, <8 x i32> *%dst) {
70 ; CHECK-LABEL: vld3_v8i32:
71 ; CHECK: @ %bb.0: @ %entry
72 ; CHECK-NEXT: .vsave {d8, d9, d10, d11}
73 ; CHECK-NEXT: vpush {d8, d9, d10, d11}
74 ; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
75 ; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
76 ; CHECK-NEXT: vldrw.u32 q4, [r0, #80]
77 ; CHECK-NEXT: vmov.f32 s10, s2
78 ; CHECK-NEXT: vmov.f32 s13, s0
79 ; CHECK-NEXT: vmov.f32 s14, s3
80 ; CHECK-NEXT: vmov.f32 s8, s4
81 ; CHECK-NEXT: vmov.f32 s9, s7
82 ; CHECK-NEXT: vmov.f32 s12, s5
83 ; CHECK-NEXT: vmov.f32 s15, s18
84 ; CHECK-NEXT: vmov.f32 s11, s17
85 ; CHECK-NEXT: vadd.i32 q2, q2, q3
86 ; CHECK-NEXT: vmov.f32 s0, s6
87 ; CHECK-NEXT: vmov.f32 s2, s16
88 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
89 ; CHECK-NEXT: vmov.f32 s3, s19
90 ; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
91 ; CHECK-NEXT: vadd.i32 q0, q2, q0
92 ; CHECK-NEXT: vldrw.u32 q2, [r0]
93 ; CHECK-NEXT: vmov.f32 s17, s4
94 ; CHECK-NEXT: vstrw.32 q0, [r1, #16]
95 ; CHECK-NEXT: vmov.f32 s18, s7
96 ; CHECK-NEXT: vmov.f32 s22, s6
97 ; CHECK-NEXT: vmov.f32 s16, s9
98 ; CHECK-NEXT: vmov.f32 s19, s14
99 ; CHECK-NEXT: vmov.f32 s20, s8
100 ; CHECK-NEXT: vmov.f32 s21, s11
101 ; CHECK-NEXT: vmov.f32 s23, s13
102 ; CHECK-NEXT: vadd.i32 q4, q5, q4
103 ; CHECK-NEXT: vmov.f32 s4, s10
104 ; CHECK-NEXT: vmov.f32 s6, s12
105 ; CHECK-NEXT: vmov.f32 s7, s15
106 ; CHECK-NEXT: vadd.i32 q1, q4, q1
107 ; CHECK-NEXT: vstrw.32 q1, [r1]
108 ; CHECK-NEXT: vpop {d8, d9, d10, d11}
111 %l1 = load <24 x i32>, <24 x i32>* %src, align 4
112 %s1 = shufflevector <24 x i32> %l1, <24 x i32> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
113 %s2 = shufflevector <24 x i32> %l1, <24 x i32> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
114 %s3 = shufflevector <24 x i32> %l1, <24 x i32> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
115 %a1 = add <8 x i32> %s1, %s2
116 %a = add <8 x i32> %a1, %s3
117 store <8 x i32> %a, <8 x i32> *%dst
; vld3_v16i32: stride-3 deinterleave-and-sum over <48 x i32>, producing <16 x i32>.
; Codegen repeats the 3-register shuffle/add sequence four times (one per output
; q-register worth of lanes), using the full d8-d15 callee-saved bank as scratch.
121 define void @vld3_v16i32(<48 x i32> *%src, <16 x i32> *%dst) {
122 ; CHECK-LABEL: vld3_v16i32:
123 ; CHECK: @ %bb.0: @ %entry
124 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
125 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
126 ; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
127 ; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
128 ; CHECK-NEXT: vldrw.u32 q4, [r0, #80]
129 ; CHECK-NEXT: vldrw.u32 q6, [r0, #176]
130 ; CHECK-NEXT: vmov.f32 s10, s2
131 ; CHECK-NEXT: vmov.f32 s13, s0
132 ; CHECK-NEXT: vmov.f32 s14, s3
133 ; CHECK-NEXT: vmov.f32 s8, s4
134 ; CHECK-NEXT: vmov.f32 s9, s7
135 ; CHECK-NEXT: vmov.f32 s12, s5
136 ; CHECK-NEXT: vmov.f32 s15, s18
137 ; CHECK-NEXT: vmov.f32 s11, s17
138 ; CHECK-NEXT: vadd.i32 q2, q2, q3
139 ; CHECK-NEXT: vmov.f32 s0, s6
140 ; CHECK-NEXT: vmov.f32 s2, s16
141 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
142 ; CHECK-NEXT: vmov.f32 s3, s19
143 ; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
144 ; CHECK-NEXT: vadd.i32 q0, q2, q0
145 ; CHECK-NEXT: vldrw.u32 q2, [r0]
146 ; CHECK-NEXT: vmov.f32 s17, s4
147 ; CHECK-NEXT: vmov.f32 s18, s7
148 ; CHECK-NEXT: vmov.f32 s22, s6
149 ; CHECK-NEXT: vmov.f32 s16, s9
150 ; CHECK-NEXT: vmov.f32 s19, s14
151 ; CHECK-NEXT: vmov.f32 s20, s8
152 ; CHECK-NEXT: vmov.f32 s21, s11
153 ; CHECK-NEXT: vmov.f32 s23, s13
154 ; CHECK-NEXT: vmov.f32 s4, s10
155 ; CHECK-NEXT: vldrw.u32 q2, [r0, #160]
156 ; CHECK-NEXT: vmov.f32 s6, s12
157 ; CHECK-NEXT: vadd.i32 q4, q5, q4
158 ; CHECK-NEXT: vmov.f32 s7, s15
159 ; CHECK-NEXT: vldrw.u32 q3, [r0, #144]
160 ; CHECK-NEXT: vadd.i32 q1, q4, q1
161 ; CHECK-NEXT: vmov.f32 s18, s10
162 ; CHECK-NEXT: vmov.f32 s21, s8
163 ; CHECK-NEXT: vmov.f32 s22, s11
164 ; CHECK-NEXT: vmov.f32 s16, s12
165 ; CHECK-NEXT: vmov.f32 s17, s15
166 ; CHECK-NEXT: vmov.f32 s20, s13
167 ; CHECK-NEXT: vmov.f32 s23, s26
168 ; CHECK-NEXT: vmov.f32 s19, s25
169 ; CHECK-NEXT: vadd.i32 q4, q4, q5
170 ; CHECK-NEXT: vmov.f32 s8, s14
171 ; CHECK-NEXT: vmov.f32 s10, s24
172 ; CHECK-NEXT: vldrw.u32 q3, [r0, #112]
173 ; CHECK-NEXT: vmov.f32 s11, s27
174 ; CHECK-NEXT: vldrw.u32 q5, [r0, #128]
175 ; CHECK-NEXT: vadd.i32 q2, q4, q2
176 ; CHECK-NEXT: vldrw.u32 q4, [r0, #96]
177 ; CHECK-NEXT: vmov.f32 s25, s12
178 ; CHECK-NEXT: vstrw.32 q2, [r1, #48]
179 ; CHECK-NEXT: vmov.f32 s26, s15
180 ; CHECK-NEXT: vstrw.32 q0, [r1, #16]
181 ; CHECK-NEXT: vmov.f32 s30, s14
182 ; CHECK-NEXT: vstrw.32 q1, [r1]
183 ; CHECK-NEXT: vmov.f32 s24, s17
184 ; CHECK-NEXT: vmov.f32 s27, s22
185 ; CHECK-NEXT: vmov.f32 s28, s16
186 ; CHECK-NEXT: vmov.f32 s29, s19
187 ; CHECK-NEXT: vmov.f32 s31, s21
188 ; CHECK-NEXT: vadd.i32 q6, q7, q6
189 ; CHECK-NEXT: vmov.f32 s12, s18
190 ; CHECK-NEXT: vmov.f32 s14, s20
191 ; CHECK-NEXT: vmov.f32 s15, s23
192 ; CHECK-NEXT: vadd.i32 q3, q6, q3
193 ; CHECK-NEXT: vstrw.32 q3, [r1, #32]
194 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
197 %l1 = load <48 x i32>, <48 x i32>* %src, align 4
198 %s1 = shufflevector <48 x i32> %l1, <48 x i32> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
199 %s2 = shufflevector <48 x i32> %l1, <48 x i32> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
200 %s3 = shufflevector <48 x i32> %l1, <48 x i32> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
201 %a1 = add <16 x i32> %s1, %s2
202 %a = add <16 x i32> %a1, %s3
203 store <16 x i32> %a, <16 x i32> *%dst
; vld3_v2i16: <6 x i16> stride-3 deinterleave-and-sum producing <2 x i16>.
; Codegen spills the tail 4 bytes to the stack so the widening vldrh.u32 can
; re-load lanes 4-5; results are stored as two individual strh halfwords.
209 define void @vld3_v2i16(<6 x i16> *%src, <2 x i16> *%dst) {
210 ; CHECK-LABEL: vld3_v2i16:
211 ; CHECK: @ %bb.0: @ %entry
212 ; CHECK-NEXT: .pad #8
213 ; CHECK-NEXT: sub sp, #8
214 ; CHECK-NEXT: vldrh.u32 q0, [r0]
215 ; CHECK-NEXT: ldr r2, [r0, #8]
216 ; CHECK-NEXT: mov r3, sp
217 ; CHECK-NEXT: str r2, [sp]
218 ; CHECK-NEXT: vmov.f32 s6, s3
219 ; CHECK-NEXT: vmov.f32 s8, s1
220 ; CHECK-NEXT: vmov r0, s6
221 ; CHECK-NEXT: vldrh.u32 q1, [r3]
222 ; CHECK-NEXT: vmov.f32 s6, s4
223 ; CHECK-NEXT: vmov.f32 s4, s2
224 ; CHECK-NEXT: vmov.f32 s2, s5
225 ; CHECK-NEXT: vmov r2, s6
226 ; CHECK-NEXT: add r0, r2
227 ; CHECK-NEXT: vmov r2, s2
228 ; CHECK-NEXT: add r0, r2
229 ; CHECK-NEXT: strh r0, [r1, #2]
230 ; CHECK-NEXT: vmov r0, s8
231 ; CHECK-NEXT: vmov r2, s0
232 ; CHECK-NEXT: add r0, r2
233 ; CHECK-NEXT: vmov r2, s4
234 ; CHECK-NEXT: add r0, r2
235 ; CHECK-NEXT: strh r0, [r1]
236 ; CHECK-NEXT: add sp, #8
239 %l1 = load <6 x i16>, <6 x i16>* %src, align 4
240 %s1 = shufflevector <6 x i16> %l1, <6 x i16> undef, <2 x i32> <i32 0, i32 3>
241 %s2 = shufflevector <6 x i16> %l1, <6 x i16> undef, <2 x i32> <i32 1, i32 4>
242 %s3 = shufflevector <6 x i16> %l1, <6 x i16> undef, <2 x i32> <i32 2, i32 5>
243 %a1 = add <2 x i16> %s1, %s2
244 %a = add <2 x i16> %a1, %s3
245 store <2 x i16> %a, <2 x i16> *%dst
; vld3_v4i16: <12 x i16> stride-3 deinterleave-and-sum producing <4 x i16>.
; Codegen widens to i32 lanes, assembles the three factors with scalar vmov
; lane inserts (q[2]/q[0] and q[3]/q[1] pairs), adds, and truncating-stores.
249 define void @vld3_v4i16(<12 x i16> *%src, <4 x i16> *%dst) {
250 ; CHECK-LABEL: vld3_v4i16:
251 ; CHECK: @ %bb.0: @ %entry
252 ; CHECK-NEXT: .save {r4, r5, r6, lr}
253 ; CHECK-NEXT: push {r4, r5, r6, lr}
254 ; CHECK-NEXT: vldrw.u32 q0, [r0]
255 ; CHECK-NEXT: vldrh.u32 q1, [r0, #16]
256 ; CHECK-NEXT: vmov.u16 r5, q0[6]
257 ; CHECK-NEXT: vmov.u16 r6, q0[0]
258 ; CHECK-NEXT: vmov r0, r3, d2
259 ; CHECK-NEXT: vmov.u16 lr, q0[2]
260 ; CHECK-NEXT: vmov r2, r4, d3
261 ; CHECK-NEXT: vmov q1[2], q1[0], r6, r5
262 ; CHECK-NEXT: vmov.u16 r5, q0[7]
263 ; CHECK-NEXT: vmov.u16 r6, q0[1]
264 ; CHECK-NEXT: vmov q2[2], q2[0], r6, r5
265 ; CHECK-NEXT: vmov.u16 r5, q0[3]
266 ; CHECK-NEXT: vmov.u16 r6, q0[4]
267 ; CHECK-NEXT: vmov q1[3], q1[1], r5, r3
268 ; CHECK-NEXT: vmov q2[3], q2[1], r6, r2
269 ; CHECK-NEXT: vmov.u16 r12, q0[5]
270 ; CHECK-NEXT: vadd.i32 q0, q1, q2
271 ; CHECK-NEXT: vmov q1[2], q1[0], lr, r0
272 ; CHECK-NEXT: vmov q1[3], q1[1], r12, r4
273 ; CHECK-NEXT: vadd.i32 q0, q0, q1
274 ; CHECK-NEXT: vstrh.32 q0, [r1]
275 ; CHECK-NEXT: pop {r4, r5, r6, pc}
277 %l1 = load <12 x i16>, <12 x i16>* %src, align 4
278 %s1 = shufflevector <12 x i16> %l1, <12 x i16> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
279 %s2 = shufflevector <12 x i16> %l1, <12 x i16> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
280 %s3 = shufflevector <12 x i16> %l1, <12 x i16> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
281 %a1 = add <4 x i16> %s1, %s2
282 %a = add <4 x i16> %a1, %s3
283 store <4 x i16> %a, <4 x i16> *%dst
; vld3_v8i16: <24 x i16> stride-3 deinterleave-and-sum producing <8 x i16>.
; Codegen builds the three stride-3 factors using f16 lane-insert/extract
; (vmovx.f16 / vins.f16) plus vmovnb.i32 narrowing moves, then two vadd.i16.
287 define void @vld3_v8i16(<24 x i16> *%src, <8 x i16> *%dst) {
288 ; CHECK-LABEL: vld3_v8i16:
289 ; CHECK: @ %bb.0: @ %entry
290 ; CHECK-NEXT: .vsave {d8, d9, d10, d11}
291 ; CHECK-NEXT: vpush {d8, d9, d10, d11}
292 ; CHECK-NEXT: vldrw.u32 q1, [r0]
293 ; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
294 ; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
295 ; CHECK-NEXT: vmov.f32 s0, s5
296 ; CHECK-NEXT: vmovx.f16 s2, s6
297 ; CHECK-NEXT: vins.f16 s0, s2
298 ; CHECK-NEXT: vmovx.f16 s2, s9
299 ; CHECK-NEXT: vmov.f32 s1, s8
300 ; CHECK-NEXT: vmovx.f16 s5, s5
301 ; CHECK-NEXT: vins.f16 s1, s2
302 ; CHECK-NEXT: vmov.f32 s19, s14
303 ; CHECK-NEXT: vmovx.f16 s2, s15
304 ; CHECK-NEXT: vmov.f32 s18, s12
305 ; CHECK-NEXT: vins.f16 s19, s2
306 ; CHECK-NEXT: vmov.f32 s2, s11
307 ; CHECK-NEXT: vmov q5, q4
308 ; CHECK-NEXT: vmov.f32 s16, s4
309 ; CHECK-NEXT: vins.f16 s16, s5
310 ; CHECK-NEXT: vmovx.f16 s5, s8
311 ; CHECK-NEXT: vmov.f32 s17, s7
312 ; CHECK-NEXT: vmovx.f16 s4, s4
313 ; CHECK-NEXT: vins.f16 s17, s5
314 ; CHECK-NEXT: vmovx.f16 s5, s11
315 ; CHECK-NEXT: vmov.f32 s18, s10
316 ; CHECK-NEXT: vmov.u16 r0, q2[5]
317 ; CHECK-NEXT: vmovx.f16 s11, s13
318 ; CHECK-NEXT: vins.f16 s18, s5
319 ; CHECK-NEXT: vmovx.f16 s5, s7
320 ; CHECK-NEXT: vmovnb.i32 q5, q0
321 ; CHECK-NEXT: vmov.f32 s3, s19
322 ; CHECK-NEXT: vmovx.f16 s14, s14
323 ; CHECK-NEXT: vmov.f32 s19, s13
324 ; CHECK-NEXT: vins.f16 s4, s6
325 ; CHECK-NEXT: vins.f16 s5, s9
326 ; CHECK-NEXT: vins.f16 s10, s12
327 ; CHECK-NEXT: vins.f16 s11, s15
328 ; CHECK-NEXT: vins.f16 s19, s14
329 ; CHECK-NEXT: vmov.16 q1[4], r0
330 ; CHECK-NEXT: vmov q3, q2
331 ; CHECK-NEXT: vmovnb.i32 q3, q1
332 ; CHECK-NEXT: vmov.f32 s7, s11
333 ; CHECK-NEXT: vmov.f32 s6, s14
334 ; CHECK-NEXT: vmov.f32 s2, s22
335 ; CHECK-NEXT: vadd.i16 q1, q4, q1
336 ; CHECK-NEXT: vadd.i16 q0, q1, q0
337 ; CHECK-NEXT: vstrw.32 q0, [r1]
338 ; CHECK-NEXT: vpop {d8, d9, d10, d11}
341 %l1 = load <24 x i16>, <24 x i16>* %src, align 4
342 %s1 = shufflevector <24 x i16> %l1, <24 x i16> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
343 %s2 = shufflevector <24 x i16> %l1, <24 x i16> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
344 %s3 = shufflevector <24 x i16> %l1, <24 x i16> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
345 %a1 = add <8 x i16> %s1, %s2
346 %a = add <8 x i16> %a1, %s3
347 store <8 x i16> %a, <8 x i16> *%dst
; vld3_v16i16: <48 x i16> stride-3 deinterleave-and-sum producing <16 x i16>.
; Same vins/vmovx/vmovnb lane-shuffling scheme as vld3_v8i16, applied twice:
; first to bytes 48..95 (stored to [r1, #16]), then to bytes 0..47 (stored to [r1]).
351 define void @vld3_v16i16(<48 x i16> *%src, <16 x i16> *%dst) {
352 ; CHECK-LABEL: vld3_v16i16:
353 ; CHECK: @ %bb.0: @ %entry
354 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
355 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
356 ; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
357 ; CHECK-NEXT: vldrw.u32 q2, [r0, #64]
358 ; CHECK-NEXT: vldrw.u32 q4, [r0, #80]
359 ; CHECK-NEXT: vmov.f32 s0, s4
360 ; CHECK-NEXT: vmovx.f16 s2, s5
361 ; CHECK-NEXT: vins.f16 s0, s2
362 ; CHECK-NEXT: vmovx.f16 s2, s8
363 ; CHECK-NEXT: vmov.f32 s1, s7
364 ; CHECK-NEXT: vmovx.f16 s12, s11
365 ; CHECK-NEXT: vins.f16 s1, s2
366 ; CHECK-NEXT: vmov.f32 s2, s10
367 ; CHECK-NEXT: vmovx.f16 s14, s18
368 ; CHECK-NEXT: vmov.f32 s3, s17
369 ; CHECK-NEXT: vins.f16 s2, s12
370 ; CHECK-NEXT: vmovx.f16 s12, s6
371 ; CHECK-NEXT: vins.f16 s3, s14
372 ; CHECK-NEXT: vmovx.f16 s14, s19
373 ; CHECK-NEXT: vins.f16 s18, s14
374 ; CHECK-NEXT: vins.f16 s5, s12
375 ; CHECK-NEXT: vmovx.f16 s12, s9
376 ; CHECK-NEXT: vmov.f32 s13, s8
377 ; CHECK-NEXT: vmovx.f16 s4, s4
378 ; CHECK-NEXT: vins.f16 s13, s12
379 ; CHECK-NEXT: vmov.f32 s12, s5
380 ; CHECK-NEXT: vmovx.f16 s5, s7
381 ; CHECK-NEXT: vmov.u16 r2, q2[5]
382 ; CHECK-NEXT: vmov.f32 s14, s11
383 ; CHECK-NEXT: vmovx.f16 s11, s17
384 ; CHECK-NEXT: vmov.f32 s23, s18
385 ; CHECK-NEXT: vmov.f32 s22, s16
386 ; CHECK-NEXT: vins.f16 s4, s6
387 ; CHECK-NEXT: vins.f16 s5, s9
388 ; CHECK-NEXT: vmov q6, q5
389 ; CHECK-NEXT: vins.f16 s10, s16
390 ; CHECK-NEXT: vins.f16 s11, s19
391 ; CHECK-NEXT: vmovnb.i32 q6, q3
392 ; CHECK-NEXT: vmov.f32 s15, s18
393 ; CHECK-NEXT: vmov.16 q1[4], r2
394 ; CHECK-NEXT: vmov q4, q2
395 ; CHECK-NEXT: vmovnb.i32 q4, q1
396 ; CHECK-NEXT: vmov.f32 s7, s11
397 ; CHECK-NEXT: vmov.f32 s6, s18
398 ; CHECK-NEXT: vldrw.u32 q2, [r0]
399 ; CHECK-NEXT: vadd.i16 q0, q0, q1
400 ; CHECK-NEXT: vldrw.u32 q4, [r0, #32]
401 ; CHECK-NEXT: vmov.f32 s14, s26
402 ; CHECK-NEXT: vmovx.f16 s6, s10
403 ; CHECK-NEXT: vadd.i16 q0, q0, q3
404 ; CHECK-NEXT: vldrw.u32 q3, [r0, #16]
405 ; CHECK-NEXT: vmov.f32 s4, s9
406 ; CHECK-NEXT: vmovx.f16 s7, s19
407 ; CHECK-NEXT: vmov.f32 s27, s18
408 ; CHECK-NEXT: vins.f16 s4, s6
409 ; CHECK-NEXT: vmovx.f16 s6, s13
410 ; CHECK-NEXT: vmov.f32 s5, s12
411 ; CHECK-NEXT: vins.f16 s27, s7
412 ; CHECK-NEXT: vmov.f32 s26, s16
413 ; CHECK-NEXT: vins.f16 s5, s6
414 ; CHECK-NEXT: vmov.f32 s6, s15
415 ; CHECK-NEXT: vmov q7, q6
416 ; CHECK-NEXT: vmov.f32 s20, s8
417 ; CHECK-NEXT: vmovnb.i32 q7, q1
418 ; CHECK-NEXT: vmovx.f16 s6, s9
419 ; CHECK-NEXT: vins.f16 s20, s6
420 ; CHECK-NEXT: vmovx.f16 s6, s12
421 ; CHECK-NEXT: vmov.f32 s21, s11
422 ; CHECK-NEXT: vmovx.f16 s8, s8
423 ; CHECK-NEXT: vmovx.f16 s9, s11
424 ; CHECK-NEXT: vins.f16 s21, s6
425 ; CHECK-NEXT: vmovx.f16 s6, s15
426 ; CHECK-NEXT: vmov.u16 r0, q3[5]
427 ; CHECK-NEXT: vmovx.f16 s15, s17
428 ; CHECK-NEXT: vmov.f32 s22, s14
429 ; CHECK-NEXT: vins.f16 s8, s10
430 ; CHECK-NEXT: vins.f16 s9, s13
431 ; CHECK-NEXT: vins.f16 s14, s16
432 ; CHECK-NEXT: vins.f16 s15, s19
433 ; CHECK-NEXT: vins.f16 s22, s6
434 ; CHECK-NEXT: vmovx.f16 s6, s18
435 ; CHECK-NEXT: vmov.f32 s23, s17
436 ; CHECK-NEXT: vmov.16 q2[4], r0
437 ; CHECK-NEXT: vmov q4, q3
438 ; CHECK-NEXT: vins.f16 s23, s6
439 ; CHECK-NEXT: vmovnb.i32 q4, q2
440 ; CHECK-NEXT: vmov.f32 s11, s15
441 ; CHECK-NEXT: vmov.f32 s10, s18
442 ; CHECK-NEXT: vstrw.32 q0, [r1, #16]
443 ; CHECK-NEXT: vmov.f32 s6, s30
444 ; CHECK-NEXT: vadd.i16 q2, q5, q2
445 ; CHECK-NEXT: vmov.f32 s7, s27
446 ; CHECK-NEXT: vadd.i16 q1, q2, q1
447 ; CHECK-NEXT: vstrw.32 q1, [r1]
448 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
451 %l1 = load <48 x i16>, <48 x i16>* %src, align 4
452 %s1 = shufflevector <48 x i16> %l1, <48 x i16> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
453 %s2 = shufflevector <48 x i16> %l1, <48 x i16> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
454 %s3 = shufflevector <48 x i16> %l1, <48 x i16> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
455 %a1 = add <16 x i16> %s1, %s2
456 %a = add <16 x i16> %a1, %s3
457 store <16 x i16> %a, <16 x i16> *%dst
; vld3_v2i8: <6 x i8> stride-3 deinterleave-and-sum producing <2 x i8>.
; Codegen copies the 6 bytes to the stack, re-loads with a widening vldrb.u16,
; then does the two 3-way sums with scalar lane extracts and strb stores.
463 define void @vld3_v2i8(<6 x i8> *%src, <2 x i8> *%dst) {
464 ; CHECK-LABEL: vld3_v2i8:
465 ; CHECK: @ %bb.0: @ %entry
466 ; CHECK-NEXT: .pad #8
467 ; CHECK-NEXT: sub sp, #8
468 ; CHECK-NEXT: ldrd r2, r0, [r0]
469 ; CHECK-NEXT: strd r2, r0, [sp]
470 ; CHECK-NEXT: mov r0, sp
471 ; CHECK-NEXT: vldrb.u16 q0, [r0]
472 ; CHECK-NEXT: vmov.u16 r0, q0[4]
473 ; CHECK-NEXT: vmov.u16 r2, q0[3]
474 ; CHECK-NEXT: add r0, r2
475 ; CHECK-NEXT: vmov.u16 r2, q0[5]
476 ; CHECK-NEXT: add r0, r2
477 ; CHECK-NEXT: strb r0, [r1, #1]
478 ; CHECK-NEXT: vmov.u16 r0, q0[1]
479 ; CHECK-NEXT: vmov.u16 r2, q0[0]
480 ; CHECK-NEXT: add r0, r2
481 ; CHECK-NEXT: vmov.u16 r2, q0[2]
482 ; CHECK-NEXT: add r0, r2
483 ; CHECK-NEXT: strb r0, [r1]
484 ; CHECK-NEXT: add sp, #8
487 %l1 = load <6 x i8>, <6 x i8>* %src, align 4
488 %s1 = shufflevector <6 x i8> %l1, <6 x i8> undef, <2 x i32> <i32 0, i32 3>
489 %s2 = shufflevector <6 x i8> %l1, <6 x i8> undef, <2 x i32> <i32 1, i32 4>
490 %s3 = shufflevector <6 x i8> %l1, <6 x i8> undef, <2 x i32> <i32 2, i32 5>
491 %a1 = add <2 x i8> %s1, %s2
492 %a = add <2 x i8> %a1, %s3
493 store <2 x i8> %a, <2 x i8> *%dst
; vld3_v4i8: <12 x i8> stride-3 deinterleave-and-sum producing <4 x i8>.
; Codegen widens bytes to i16/i32 lanes (vldrb.u16, stack spill of the 4-byte
; tail), builds the factors with paired lane inserts, adds, then vstrb.32.
497 define void @vld3_v4i8(<12 x i8> *%src, <4 x i8> *%dst) {
498 ; CHECK-LABEL: vld3_v4i8:
499 ; CHECK: @ %bb.0: @ %entry
500 ; CHECK-NEXT: .save {r4, lr}
501 ; CHECK-NEXT: push {r4, lr}
502 ; CHECK-NEXT: .pad #8
503 ; CHECK-NEXT: sub sp, #8
504 ; CHECK-NEXT: vldrb.u16 q0, [r0]
505 ; CHECK-NEXT: ldr r0, [r0, #8]
506 ; CHECK-NEXT: str r0, [sp]
507 ; CHECK-NEXT: vmov.u16 r3, q0[6]
508 ; CHECK-NEXT: vmov.u16 r4, q0[0]
509 ; CHECK-NEXT: vmov q1[2], q1[0], r4, r3
510 ; CHECK-NEXT: vmov.u16 r3, q0[7]
511 ; CHECK-NEXT: vmov.u16 r4, q0[1]
512 ; CHECK-NEXT: vmov.u16 r12, q0[5]
513 ; CHECK-NEXT: vmov q2[2], q2[0], r4, r3
514 ; CHECK-NEXT: mov r3, sp
515 ; CHECK-NEXT: vmov.u16 lr, q0[2]
516 ; CHECK-NEXT: vmov.u16 r2, q0[3]
517 ; CHECK-NEXT: vmov.u16 r0, q0[4]
518 ; CHECK-NEXT: vldrb.u16 q0, [r3]
519 ; CHECK-NEXT: vmov.u16 r3, q0[2]
520 ; CHECK-NEXT: vmov q2[3], q2[1], r0, r3
521 ; CHECK-NEXT: vmov.u16 r0, q0[1]
522 ; CHECK-NEXT: vmov q1[3], q1[1], r2, r0
523 ; CHECK-NEXT: vmov.u16 r0, q0[0]
524 ; CHECK-NEXT: vadd.i32 q1, q1, q2
525 ; CHECK-NEXT: vmov q2[2], q2[0], lr, r0
526 ; CHECK-NEXT: vmov.u16 r0, q0[3]
527 ; CHECK-NEXT: vmov q2[3], q2[1], r12, r0
528 ; CHECK-NEXT: vadd.i32 q0, q1, q2
529 ; CHECK-NEXT: vstrb.32 q0, [r1]
530 ; CHECK-NEXT: add sp, #8
531 ; CHECK-NEXT: pop {r4, pc}
533 %l1 = load <12 x i8>, <12 x i8>* %src, align 4
534 %s1 = shufflevector <12 x i8> %l1, <12 x i8> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
535 %s2 = shufflevector <12 x i8> %l1, <12 x i8> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
536 %s3 = shufflevector <12 x i8> %l1, <12 x i8> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
537 %a1 = add <4 x i8> %s1, %s2
538 %a = add <4 x i8> %a1, %s3
539 store <4 x i8> %a, <4 x i8> *%dst
; vld3_v8i8: <24 x i8> stride-3 deinterleave-and-sum producing <8 x i8>.
; Codegen gathers bytes into i16 lanes via per-lane vmov.16 inserts (for the
; first 16 bytes) combined with f16 vins for the vldrb.u16-loaded tail, then
; two vadd.i16 and a narrowing vstrb.16 store.
543 define void @vld3_v8i8(<24 x i8> *%src, <8 x i8> *%dst) {
544 ; CHECK-LABEL: vld3_v8i8:
545 ; CHECK: @ %bb.0: @ %entry
546 ; CHECK-NEXT: .vsave {d8, d9}
547 ; CHECK-NEXT: vpush {d8, d9}
548 ; CHECK-NEXT: vldrw.u32 q0, [r0]
549 ; CHECK-NEXT: vldrb.u16 q1, [r0, #16]
550 ; CHECK-NEXT: vmov.u8 r2, q0[1]
551 ; CHECK-NEXT: vmov.u8 r0, q0[0]
552 ; CHECK-NEXT: vmov.16 q2[0], r2
553 ; CHECK-NEXT: vmov.u8 r2, q0[4]
554 ; CHECK-NEXT: vmov.16 q3[0], r0
555 ; CHECK-NEXT: vmov.u8 r0, q0[3]
556 ; CHECK-NEXT: vmov.16 q2[1], r2
557 ; CHECK-NEXT: vmov.u8 r2, q0[7]
558 ; CHECK-NEXT: vmov.16 q3[1], r0
559 ; CHECK-NEXT: vmov.u8 r0, q0[6]
560 ; CHECK-NEXT: vmov.16 q2[2], r2
561 ; CHECK-NEXT: vmov.u8 r2, q0[10]
562 ; CHECK-NEXT: vmov.16 q3[2], r0
563 ; CHECK-NEXT: vmov.u8 r0, q0[9]
564 ; CHECK-NEXT: vmov.16 q2[3], r2
565 ; CHECK-NEXT: vmov.u8 r2, q0[13]
566 ; CHECK-NEXT: vmov.16 q3[3], r0
567 ; CHECK-NEXT: vmov.u8 r0, q0[12]
568 ; CHECK-NEXT: vmov.16 q2[4], r2
569 ; CHECK-NEXT: vmov.16 q3[4], r0
570 ; CHECK-NEXT: vmov.u8 r0, q0[15]
571 ; CHECK-NEXT: vmovx.f16 s16, s6
572 ; CHECK-NEXT: vmov.f32 s18, s5
573 ; CHECK-NEXT: vmovx.f16 s11, s5
574 ; CHECK-NEXT: vmov.16 q3[5], r0
575 ; CHECK-NEXT: vins.f16 s18, s16
576 ; CHECK-NEXT: vins.f16 s10, s4
577 ; CHECK-NEXT: vins.f16 s11, s7
578 ; CHECK-NEXT: vmov.f32 s15, s18
579 ; CHECK-NEXT: vmov.u8 r0, q0[2]
580 ; CHECK-NEXT: vadd.i16 q2, q3, q2
581 ; CHECK-NEXT: vmov.16 q3[0], r0
582 ; CHECK-NEXT: vmov.u8 r0, q0[5]
583 ; CHECK-NEXT: vmov.16 q3[1], r0
584 ; CHECK-NEXT: vmov.u8 r0, q0[8]
585 ; CHECK-NEXT: vmov.16 q3[2], r0
586 ; CHECK-NEXT: vmov.u8 r0, q0[11]
587 ; CHECK-NEXT: vmov.16 q3[3], r0
588 ; CHECK-NEXT: vmov.u8 r0, q0[14]
589 ; CHECK-NEXT: vmov.16 q3[4], r0
590 ; CHECK-NEXT: vmov.u16 r0, q1[1]
591 ; CHECK-NEXT: vmovx.f16 s0, s7
592 ; CHECK-NEXT: vmov.f32 s2, s6
593 ; CHECK-NEXT: vins.f16 s2, s0
594 ; CHECK-NEXT: vmov.16 q3[5], r0
595 ; CHECK-NEXT: vmov.f32 s15, s2
596 ; CHECK-NEXT: vadd.i16 q0, q2, q3
597 ; CHECK-NEXT: vstrb.16 q0, [r1]
598 ; CHECK-NEXT: vpop {d8, d9}
601 %l1 = load <24 x i8>, <24 x i8>* %src, align 4
602 %s1 = shufflevector <24 x i8> %l1, <24 x i8> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
603 %s2 = shufflevector <24 x i8> %l1, <24 x i8> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
604 %s3 = shufflevector <24 x i8> %l1, <24 x i8> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
605 %a1 = add <8 x i8> %s1, %s2
606 %a = add <8 x i8> %a1, %s3
607 store <8 x i8> %a, <8 x i8> *%dst
; vld3_v16i8: <48 x i8> stride-3 deinterleave-and-sum producing <16 x i8>.
; Codegen assembles each of the three stride-3 factors byte-by-byte with
; vmov.u8 extracts and vmov.8 inserts across the three loaded q-registers,
; merges partial vectors with vmov.f32 lane copies, then two vadd.i8.
611 define void @vld3_v16i8(<48 x i8> *%src, <16 x i8> *%dst) {
612 ; CHECK-LABEL: vld3_v16i8:
613 ; CHECK: @ %bb.0: @ %entry
614 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
615 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
616 ; CHECK-NEXT: vldrw.u32 q1, [r0]
617 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
618 ; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
619 ; CHECK-NEXT: vmov.u8 r2, q1[1]
620 ; CHECK-NEXT: vmov.8 q3[0], r2
621 ; CHECK-NEXT: vmov.u8 r2, q1[4]
622 ; CHECK-NEXT: vmov.8 q3[1], r2
623 ; CHECK-NEXT: vmov.u8 r2, q1[7]
624 ; CHECK-NEXT: vmov.8 q3[2], r2
625 ; CHECK-NEXT: vmov.u8 r2, q1[10]
626 ; CHECK-NEXT: vmov.8 q3[3], r2
627 ; CHECK-NEXT: vmov.u8 r2, q1[13]
628 ; CHECK-NEXT: vmov.8 q3[4], r2
629 ; CHECK-NEXT: vmov.u8 r2, q0[0]
630 ; CHECK-NEXT: vmov.8 q3[5], r2
631 ; CHECK-NEXT: vmov.u8 r2, q0[3]
632 ; CHECK-NEXT: vmov.8 q3[6], r2
633 ; CHECK-NEXT: vmov.u8 r2, q0[6]
634 ; CHECK-NEXT: vmov.8 q3[7], r2
635 ; CHECK-NEXT: vmov.u8 r2, q0[9]
636 ; CHECK-NEXT: vmov.u8 r0, q2[5]
637 ; CHECK-NEXT: vmov.8 q3[8], r2
638 ; CHECK-NEXT: vmov.u8 r2, q0[12]
639 ; CHECK-NEXT: vmov.8 q4[12], r0
640 ; CHECK-NEXT: vmov.u8 r0, q2[8]
641 ; CHECK-NEXT: vmov.8 q3[9], r2
642 ; CHECK-NEXT: vmov.u8 r2, q0[15]
643 ; CHECK-NEXT: vmov.8 q4[13], r0
644 ; CHECK-NEXT: vmov.u8 r0, q2[11]
645 ; CHECK-NEXT: vmov.8 q3[10], r2
646 ; CHECK-NEXT: vmov.8 q4[14], r0
647 ; CHECK-NEXT: vmov.u8 r0, q2[14]
648 ; CHECK-NEXT: vmov.8 q4[15], r0
649 ; CHECK-NEXT: vmov q5, q3
650 ; CHECK-NEXT: vmov.u8 r0, q2[2]
651 ; CHECK-NEXT: vmov.f32 s15, s19
652 ; CHECK-NEXT: vmov.8 q5[11], r0
653 ; CHECK-NEXT: vmov.u8 r0, q1[0]
654 ; CHECK-NEXT: vmov.8 q4[0], r0
655 ; CHECK-NEXT: vmov.u8 r0, q1[3]
656 ; CHECK-NEXT: vmov.8 q4[1], r0
657 ; CHECK-NEXT: vmov.u8 r0, q1[6]
658 ; CHECK-NEXT: vmov.8 q4[2], r0
659 ; CHECK-NEXT: vmov.u8 r0, q1[9]
660 ; CHECK-NEXT: vmov.8 q4[3], r0
661 ; CHECK-NEXT: vmov.u8 r0, q1[12]
662 ; CHECK-NEXT: vmov.8 q4[4], r0
663 ; CHECK-NEXT: vmov.u8 r0, q1[15]
664 ; CHECK-NEXT: vmov.8 q4[5], r0
665 ; CHECK-NEXT: vmov.u8 r0, q0[2]
666 ; CHECK-NEXT: vmov.8 q4[6], r0
667 ; CHECK-NEXT: vmov.u8 r0, q0[5]
668 ; CHECK-NEXT: vmov.8 q4[7], r0
669 ; CHECK-NEXT: vmov.u8 r0, q0[8]
670 ; CHECK-NEXT: vmov.8 q4[8], r0
671 ; CHECK-NEXT: vmov.u8 r0, q0[11]
672 ; CHECK-NEXT: vmov.8 q4[9], r0
673 ; CHECK-NEXT: vmov.u8 r0, q0[14]
674 ; CHECK-NEXT: vmov.8 q4[10], r0
675 ; CHECK-NEXT: vmov.u8 r0, q2[4]
676 ; CHECK-NEXT: vmov.f32 s14, s22
677 ; CHECK-NEXT: vmov.8 q5[12], r0
678 ; CHECK-NEXT: vmov.u8 r0, q2[7]
679 ; CHECK-NEXT: vmov q6, q4
680 ; CHECK-NEXT: vmov.8 q5[13], r0
681 ; CHECK-NEXT: vmov.u8 r0, q2[10]
682 ; CHECK-NEXT: vmov.8 q5[14], r0
683 ; CHECK-NEXT: vmov.u8 r0, q2[13]
684 ; CHECK-NEXT: vmov.8 q5[15], r0
685 ; CHECK-NEXT: vmov.u8 r0, q2[1]
686 ; CHECK-NEXT: vmov.8 q6[11], r0
687 ; CHECK-NEXT: vmov.f32 s19, s23
688 ; CHECK-NEXT: vmov.f32 s18, s26
689 ; CHECK-NEXT: vmov.u8 r0, q1[2]
690 ; CHECK-NEXT: vadd.i8 q3, q4, q3
691 ; CHECK-NEXT: vmov.8 q4[0], r0
692 ; CHECK-NEXT: vmov.u8 r0, q1[5]
693 ; CHECK-NEXT: vmov.8 q4[1], r0
694 ; CHECK-NEXT: vmov.u8 r0, q1[8]
695 ; CHECK-NEXT: vmov.8 q4[2], r0
696 ; CHECK-NEXT: vmov.u8 r0, q1[11]
697 ; CHECK-NEXT: vmov.8 q4[3], r0
698 ; CHECK-NEXT: vmov.u8 r0, q1[14]
699 ; CHECK-NEXT: vmov.8 q4[4], r0
700 ; CHECK-NEXT: vmov.u8 r0, q0[1]
701 ; CHECK-NEXT: vmov.8 q4[5], r0
702 ; CHECK-NEXT: vmov.u8 r0, q0[4]
703 ; CHECK-NEXT: vmov.8 q4[6], r0
704 ; CHECK-NEXT: vmov.u8 r0, q2[6]
705 ; CHECK-NEXT: vmov.8 q1[12], r0
706 ; CHECK-NEXT: vmov.u8 r0, q2[9]
707 ; CHECK-NEXT: vmov.8 q1[13], r0
708 ; CHECK-NEXT: vmov.u8 r0, q2[12]
709 ; CHECK-NEXT: vmov.8 q1[14], r0
710 ; CHECK-NEXT: vmov.u8 r0, q2[15]
711 ; CHECK-NEXT: vmov.8 q1[15], r0
712 ; CHECK-NEXT: vmov.u8 r0, q0[10]
713 ; CHECK-NEXT: vmov.8 q5[8], r0
714 ; CHECK-NEXT: vmov.u8 r0, q0[13]
715 ; CHECK-NEXT: vmov.8 q5[9], r0
716 ; CHECK-NEXT: vmov.u8 r0, q2[0]
717 ; CHECK-NEXT: vmov.8 q5[10], r0
718 ; CHECK-NEXT: vmov.u8 r0, q2[3]
719 ; CHECK-NEXT: vmov.8 q5[11], r0
720 ; CHECK-NEXT: vmov.u8 r0, q0[7]
721 ; CHECK-NEXT: vmov.8 q4[7], r0
722 ; CHECK-NEXT: vmov.f32 s18, s22
723 ; CHECK-NEXT: vmov.f32 s19, s7
724 ; CHECK-NEXT: vadd.i8 q0, q3, q4
725 ; CHECK-NEXT: vstrw.32 q0, [r1]
726 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
729 %l1 = load <48 x i8>, <48 x i8>* %src, align 4
730 %s1 = shufflevector <48 x i8> %l1, <48 x i8> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
731 %s2 = shufflevector <48 x i8> %l1, <48 x i8> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
732 %s3 = shufflevector <48 x i8> %l1, <48 x i8> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
733 %a1 = add <16 x i8> %s1, %s2
734 %a = add <16 x i8> %a1, %s3
735 store <16 x i8> %a, <16 x i8> *%dst
; vld3_v2i64: <6 x i64> stride-3 deinterleave-and-sum producing <2 x i64>.
; MVE has no 64-bit vector add, so codegen moves the d-register halves to core
; registers and chains adds/adc pairs, rebuilding the result with vmov q0[...].
741 define void @vld3_v2i64(<6 x i64> *%src, <2 x i64> *%dst) {
742 ; CHECK-LABEL: vld3_v2i64:
743 ; CHECK: @ %bb.0: @ %entry
744 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
745 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
746 ; CHECK-NEXT: vldrw.u32 q0, [r0]
747 ; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
748 ; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
749 ; CHECK-NEXT: vmov.f32 s12, s2
750 ; CHECK-NEXT: vmov.f32 s13, s3
751 ; CHECK-NEXT: vmov.f32 s2, s4
752 ; CHECK-NEXT: vmov.f32 s3, s5
753 ; CHECK-NEXT: vmov r0, r3, d5
754 ; CHECK-NEXT: vmov r2, r4, d3
755 ; CHECK-NEXT: vmov r6, r7, d0
756 ; CHECK-NEXT: vmov r5, r8, d6
757 ; CHECK-NEXT: vmov lr, r12, d1
758 ; CHECK-NEXT: adds.w r0, r0, lr
759 ; CHECK-NEXT: adc.w r3, r3, r12
760 ; CHECK-NEXT: adds r0, r0, r2
761 ; CHECK-NEXT: adc.w r2, r3, r4
762 ; CHECK-NEXT: vmov r3, r4, d4
763 ; CHECK-NEXT: adds r6, r6, r5
764 ; CHECK-NEXT: adc.w r7, r7, r8
765 ; CHECK-NEXT: adds r3, r3, r6
766 ; CHECK-NEXT: adcs r7, r4
767 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r0
768 ; CHECK-NEXT: vmov q0[3], q0[1], r7, r2
769 ; CHECK-NEXT: vstrw.32 q0, [r1]
770 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
772 %l1 = load <6 x i64>, <6 x i64>* %src, align 4
773 %s1 = shufflevector <6 x i64> %l1, <6 x i64> undef, <2 x i32> <i32 0, i32 3>
774 %s2 = shufflevector <6 x i64> %l1, <6 x i64> undef, <2 x i32> <i32 1, i32 4>
775 %s3 = shufflevector <6 x i64> %l1, <6 x i64> undef, <2 x i32> <i32 2, i32 5>
776 %a1 = add <2 x i64> %s1, %s2
777 %a = add <2 x i64> %a1, %s3
778 store <2 x i64> %a, <2 x i64> *%dst
; vld3_v4i64: <12 x i64> stride-3 deinterleave-and-sum producing <4 x i64>.
; Same scalar adds/adc scheme as vld3_v2i64, doubled up: two <2 x i64> groups
; are summed in core registers and stored to [r1, #16] and [r1].
782 define void @vld3_v4i64(<12 x i64> *%src, <4 x i64> *%dst) {
783 ; CHECK-LABEL: vld3_v4i64:
784 ; CHECK: @ %bb.0: @ %entry
785 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
786 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
787 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12}
788 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12}
789 ; CHECK-NEXT: vldrw.u32 q0, [r0]
790 ; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
791 ; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
792 ; CHECK-NEXT: vldrw.u32 q5, [r0, #48]
793 ; CHECK-NEXT: vmov.f32 s4, s2
794 ; CHECK-NEXT: vldrw.u32 q4, [r0, #64]
795 ; CHECK-NEXT: vmov.f32 s5, s3
796 ; CHECK-NEXT: vmov.f32 s2, s12
797 ; CHECK-NEXT: vmov.f32 s3, s13
798 ; CHECK-NEXT: vmov r5, r4, d5
799 ; CHECK-NEXT: vmov r3, r8, d7
800 ; CHECK-NEXT: vldrw.u32 q3, [r0, #80]
801 ; CHECK-NEXT: vmov.f32 s24, s22
802 ; CHECK-NEXT: vmov.f32 s25, s23
803 ; CHECK-NEXT: vmov lr, r12, d1
804 ; CHECK-NEXT: vmov.f32 s2, s12
805 ; CHECK-NEXT: vmov.f32 s3, s13
806 ; CHECK-NEXT: vmov r6, r7, d12
807 ; CHECK-NEXT: adds.w r0, r5, lr
808 ; CHECK-NEXT: adc.w r5, r4, r12
809 ; CHECK-NEXT: adds.w lr, r0, r3
810 ; CHECK-NEXT: vmov r4, r2, d10
811 ; CHECK-NEXT: adc.w r12, r5, r8
812 ; CHECK-NEXT: vmov r5, r0, d8
813 ; CHECK-NEXT: adds r6, r6, r4
814 ; CHECK-NEXT: adcs r2, r7
815 ; CHECK-NEXT: adds r6, r6, r5
816 ; CHECK-NEXT: adc.w r8, r2, r0
817 ; CHECK-NEXT: vmov r7, r4, d1
818 ; CHECK-NEXT: vmov r2, r5, d9
819 ; CHECK-NEXT: vmov r3, r0, d0
820 ; CHECK-NEXT: adds r2, r2, r7
821 ; CHECK-NEXT: adc.w r7, r5, r4
822 ; CHECK-NEXT: vmov r5, r4, d7
823 ; CHECK-NEXT: adds r2, r2, r5
824 ; CHECK-NEXT: adcs r7, r4
825 ; CHECK-NEXT: vmov r5, r4, d2
826 ; CHECK-NEXT: vmov q1[2], q1[0], r6, r2
827 ; CHECK-NEXT: vmov q1[3], q1[1], r8, r7
828 ; CHECK-NEXT: vstrw.32 q1, [r1, #16]
829 ; CHECK-NEXT: adds r3, r3, r5
830 ; CHECK-NEXT: adcs r0, r4
831 ; CHECK-NEXT: vmov r4, r5, d4
832 ; CHECK-NEXT: adds r3, r3, r4
833 ; CHECK-NEXT: vmov q0[2], q0[0], r3, lr
834 ; CHECK-NEXT: adcs r0, r5
835 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r12
836 ; CHECK-NEXT: vstrw.32 q0, [r1]
837 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12}
838 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
840 %l1 = load <12 x i64>, <12 x i64>* %src, align 4
841 %s1 = shufflevector <12 x i64> %l1, <12 x i64> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
842 %s2 = shufflevector <12 x i64> %l1, <12 x i64> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
843 %s3 = shufflevector <12 x i64> %l1, <12 x i64> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
844 %a1 = add <4 x i64> %s1, %s2
845 %a = add <4 x i64> %a1, %s3
846 store <4 x i64> %a, <4 x i64> *%dst
; vld3_v2f32: loads 6 contiguous floats and deinterleaves them into three
; stride-3 <2 x float> parts (elements {0,3}, {1,4}, {2,5}), adds all three,
; and stores the <2 x float> sum. The CHECK lines were autogenerated by
; update_llc_test_checks.py -- regenerate them rather than hand-editing.
852 define void @vld3_v2f32(<6 x float> *%src, <2 x float> *%dst) {
853 ; CHECK-LABEL: vld3_v2f32:
854 ; CHECK: @ %bb.0: @ %entry
855 ; CHECK-NEXT: vldrw.u32 q2, [r0]
856 ; CHECK-NEXT: vldr s1, [r0, #16]
857 ; CHECK-NEXT: vldr s5, [r0, #20]
858 ; CHECK-NEXT: vmov.f32 s12, s8
859 ; CHECK-NEXT: vmov.f32 s13, s11
860 ; CHECK-NEXT: vmov.f32 s0, s9
861 ; CHECK-NEXT: vadd.f32 q0, q3, q0
862 ; CHECK-NEXT: vmov.f32 s4, s10
863 ; CHECK-NEXT: vadd.f32 q0, q0, q1
864 ; CHECK-NEXT: vstmia r1, {s0, s1}
; IR under test: one interleaved load, three stride-3 shuffles, two fadds.
867 %l1 = load <6 x float>, <6 x float>* %src, align 4
868 %s1 = shufflevector <6 x float> %l1, <6 x float> undef, <2 x i32> <i32 0, i32 3>
869 %s2 = shufflevector <6 x float> %l1, <6 x float> undef, <2 x i32> <i32 1, i32 4>
870 %s3 = shufflevector <6 x float> %l1, <6 x float> undef, <2 x i32> <i32 2, i32 5>
871 %a1 = fadd <2 x float> %s1, %s2
872 %a = fadd <2 x float> %a1, %s3
873 store <2 x float> %a, <2 x float> *%dst
; vld3_v4f32: loads 12 contiguous floats and deinterleaves them into three
; stride-3 <4 x float> parts ({0,3,6,9}, {1,4,7,10}, {2,5,8,11}), adds all
; three, and stores the <4 x float> sum. CHECK lines are autogenerated by
; update_llc_test_checks.py -- regenerate them rather than hand-editing.
877 define void @vld3_v4f32(<12 x float> *%src, <4 x float> *%dst) {
878 ; CHECK-LABEL: vld3_v4f32:
879 ; CHECK: @ %bb.0: @ %entry
880 ; CHECK-NEXT: .vsave {d8, d9}
881 ; CHECK-NEXT: vpush {d8, d9}
882 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
883 ; CHECK-NEXT: vldrw.u32 q1, [r0]
884 ; CHECK-NEXT: vldrw.u32 q4, [r0, #32]
885 ; CHECK-NEXT: vmov.f32 s10, s2
886 ; CHECK-NEXT: vmov.f32 s13, s0
887 ; CHECK-NEXT: vmov.f32 s14, s3
888 ; CHECK-NEXT: vmov.f32 s8, s4
889 ; CHECK-NEXT: vmov.f32 s9, s7
890 ; CHECK-NEXT: vmov.f32 s12, s5
891 ; CHECK-NEXT: vmov.f32 s15, s18
892 ; CHECK-NEXT: vmov.f32 s11, s17
893 ; CHECK-NEXT: vadd.f32 q2, q2, q3
894 ; CHECK-NEXT: vmov.f32 s0, s6
895 ; CHECK-NEXT: vmov.f32 s2, s16
896 ; CHECK-NEXT: vmov.f32 s3, s19
897 ; CHECK-NEXT: vadd.f32 q0, q2, q0
898 ; CHECK-NEXT: vstrw.32 q0, [r1]
899 ; CHECK-NEXT: vpop {d8, d9}
; IR under test: one interleaved load, three stride-3 shuffles, two fadds.
902 %l1 = load <12 x float>, <12 x float>* %src, align 4
903 %s1 = shufflevector <12 x float> %l1, <12 x float> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
904 %s2 = shufflevector <12 x float> %l1, <12 x float> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
905 %s3 = shufflevector <12 x float> %l1, <12 x float> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
906 %a1 = fadd <4 x float> %s1, %s2
907 %a = fadd <4 x float> %a1, %s3
908 store <4 x float> %a, <4 x float> *%dst
; vld3_v8f32: loads 24 contiguous floats and deinterleaves them into three
; stride-3 <8 x float> parts (indices 0,3,...,21 / 1,4,...,22 / 2,5,...,23),
; adds all three, and stores the <8 x float> sum (two 128-bit halves).
; CHECK lines are autogenerated by update_llc_test_checks.py -- regenerate
; them rather than hand-editing.
912 define void @vld3_v8f32(<24 x float> *%src, <8 x float> *%dst) {
913 ; CHECK-LABEL: vld3_v8f32:
914 ; CHECK: @ %bb.0: @ %entry
915 ; CHECK-NEXT: .vsave {d8, d9, d10, d11}
916 ; CHECK-NEXT: vpush {d8, d9, d10, d11}
917 ; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
918 ; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
919 ; CHECK-NEXT: vldrw.u32 q4, [r0, #80]
920 ; CHECK-NEXT: vmov.f32 s10, s2
921 ; CHECK-NEXT: vmov.f32 s13, s0
922 ; CHECK-NEXT: vmov.f32 s14, s3
923 ; CHECK-NEXT: vmov.f32 s8, s4
924 ; CHECK-NEXT: vmov.f32 s9, s7
925 ; CHECK-NEXT: vmov.f32 s12, s5
926 ; CHECK-NEXT: vmov.f32 s15, s18
927 ; CHECK-NEXT: vmov.f32 s11, s17
928 ; CHECK-NEXT: vadd.f32 q2, q2, q3
929 ; CHECK-NEXT: vmov.f32 s0, s6
930 ; CHECK-NEXT: vmov.f32 s2, s16
931 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
932 ; CHECK-NEXT: vmov.f32 s3, s19
933 ; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
934 ; CHECK-NEXT: vadd.f32 q0, q2, q0
935 ; CHECK-NEXT: vldrw.u32 q2, [r0]
936 ; CHECK-NEXT: vmov.f32 s17, s4
937 ; CHECK-NEXT: vstrw.32 q0, [r1, #16]
938 ; CHECK-NEXT: vmov.f32 s18, s7
939 ; CHECK-NEXT: vmov.f32 s22, s6
940 ; CHECK-NEXT: vmov.f32 s16, s9
941 ; CHECK-NEXT: vmov.f32 s19, s14
942 ; CHECK-NEXT: vmov.f32 s20, s8
943 ; CHECK-NEXT: vmov.f32 s21, s11
944 ; CHECK-NEXT: vmov.f32 s23, s13
945 ; CHECK-NEXT: vadd.f32 q4, q5, q4
946 ; CHECK-NEXT: vmov.f32 s4, s10
947 ; CHECK-NEXT: vmov.f32 s6, s12
948 ; CHECK-NEXT: vmov.f32 s7, s15
949 ; CHECK-NEXT: vadd.f32 q1, q4, q1
950 ; CHECK-NEXT: vstrw.32 q1, [r1]
951 ; CHECK-NEXT: vpop {d8, d9, d10, d11}
; IR under test: one interleaved load, three stride-3 shuffles, two fadds.
954 %l1 = load <24 x float>, <24 x float>* %src, align 4
955 %s1 = shufflevector <24 x float> %l1, <24 x float> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
956 %s2 = shufflevector <24 x float> %l1, <24 x float> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
957 %s3 = shufflevector <24 x float> %l1, <24 x float> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
958 %a1 = fadd <8 x float> %s1, %s2
959 %a = fadd <8 x float> %a1, %s3
960 store <8 x float> %a, <8 x float> *%dst
; vld3_v16f32: loads 48 contiguous floats and deinterleaves them into three
; stride-3 <16 x float> parts (indices 0,3,...,45 / 1,4,...,46 / 2,5,...,47),
; adds all three, and stores the <16 x float> sum (four 128-bit stores).
; CHECK lines are autogenerated by update_llc_test_checks.py -- regenerate
; them rather than hand-editing.
964 define void @vld3_v16f32(<48 x float> *%src, <16 x float> *%dst) {
965 ; CHECK-LABEL: vld3_v16f32:
966 ; CHECK: @ %bb.0: @ %entry
967 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
968 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
969 ; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
970 ; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
971 ; CHECK-NEXT: vldrw.u32 q4, [r0, #80]
972 ; CHECK-NEXT: vldrw.u32 q6, [r0, #176]
973 ; CHECK-NEXT: vmov.f32 s10, s2
974 ; CHECK-NEXT: vmov.f32 s13, s0
975 ; CHECK-NEXT: vmov.f32 s14, s3
976 ; CHECK-NEXT: vmov.f32 s8, s4
977 ; CHECK-NEXT: vmov.f32 s9, s7
978 ; CHECK-NEXT: vmov.f32 s12, s5
979 ; CHECK-NEXT: vmov.f32 s15, s18
980 ; CHECK-NEXT: vmov.f32 s11, s17
981 ; CHECK-NEXT: vadd.f32 q2, q2, q3
982 ; CHECK-NEXT: vmov.f32 s0, s6
983 ; CHECK-NEXT: vmov.f32 s2, s16
984 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
985 ; CHECK-NEXT: vmov.f32 s3, s19
986 ; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
987 ; CHECK-NEXT: vadd.f32 q0, q2, q0
988 ; CHECK-NEXT: vldrw.u32 q2, [r0]
989 ; CHECK-NEXT: vmov.f32 s17, s4
990 ; CHECK-NEXT: vmov.f32 s18, s7
991 ; CHECK-NEXT: vmov.f32 s22, s6
992 ; CHECK-NEXT: vmov.f32 s16, s9
993 ; CHECK-NEXT: vmov.f32 s19, s14
994 ; CHECK-NEXT: vmov.f32 s20, s8
995 ; CHECK-NEXT: vmov.f32 s21, s11
996 ; CHECK-NEXT: vmov.f32 s23, s13
997 ; CHECK-NEXT: vmov.f32 s4, s10
998 ; CHECK-NEXT: vldrw.u32 q2, [r0, #160]
999 ; CHECK-NEXT: vmov.f32 s6, s12
1000 ; CHECK-NEXT: vadd.f32 q4, q5, q4
1001 ; CHECK-NEXT: vmov.f32 s7, s15
1002 ; CHECK-NEXT: vldrw.u32 q3, [r0, #144]
1003 ; CHECK-NEXT: vadd.f32 q1, q4, q1
1004 ; CHECK-NEXT: vmov.f32 s18, s10
1005 ; CHECK-NEXT: vmov.f32 s21, s8
1006 ; CHECK-NEXT: vmov.f32 s22, s11
1007 ; CHECK-NEXT: vmov.f32 s16, s12
1008 ; CHECK-NEXT: vmov.f32 s17, s15
1009 ; CHECK-NEXT: vmov.f32 s20, s13
1010 ; CHECK-NEXT: vmov.f32 s23, s26
1011 ; CHECK-NEXT: vmov.f32 s19, s25
1012 ; CHECK-NEXT: vadd.f32 q4, q4, q5
1013 ; CHECK-NEXT: vmov.f32 s8, s14
1014 ; CHECK-NEXT: vmov.f32 s10, s24
1015 ; CHECK-NEXT: vldrw.u32 q3, [r0, #112]
1016 ; CHECK-NEXT: vmov.f32 s11, s27
1017 ; CHECK-NEXT: vldrw.u32 q5, [r0, #128]
1018 ; CHECK-NEXT: vadd.f32 q2, q4, q2
1019 ; CHECK-NEXT: vldrw.u32 q4, [r0, #96]
1020 ; CHECK-NEXT: vmov.f32 s25, s12
1021 ; CHECK-NEXT: vstrw.32 q2, [r1, #48]
1022 ; CHECK-NEXT: vmov.f32 s26, s15
1023 ; CHECK-NEXT: vstrw.32 q0, [r1, #16]
1024 ; CHECK-NEXT: vmov.f32 s30, s14
1025 ; CHECK-NEXT: vstrw.32 q1, [r1]
1026 ; CHECK-NEXT: vmov.f32 s24, s17
1027 ; CHECK-NEXT: vmov.f32 s27, s22
1028 ; CHECK-NEXT: vmov.f32 s28, s16
1029 ; CHECK-NEXT: vmov.f32 s29, s19
1030 ; CHECK-NEXT: vmov.f32 s31, s21
1031 ; CHECK-NEXT: vadd.f32 q6, q7, q6
1032 ; CHECK-NEXT: vmov.f32 s12, s18
1033 ; CHECK-NEXT: vmov.f32 s14, s20
1034 ; CHECK-NEXT: vmov.f32 s15, s23
1035 ; CHECK-NEXT: vadd.f32 q3, q6, q3
1036 ; CHECK-NEXT: vstrw.32 q3, [r1, #32]
1037 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; IR under test: one interleaved load, three stride-3 shuffles, two fadds.
1040 %l1 = load <48 x float>, <48 x float>* %src, align 4
1041 %s1 = shufflevector <48 x float> %l1, <48 x float> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
1042 %s2 = shufflevector <48 x float> %l1, <48 x float> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
1043 %s3 = shufflevector <48 x float> %l1, <48 x float> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
1044 %a1 = fadd <16 x float> %s1, %s2
1045 %a = fadd <16 x float> %a1, %s3
1046 store <16 x float> %a, <16 x float> *%dst
; vld3_v2f16: loads 6 contiguous half-precision values and deinterleaves them
; into three stride-3 <2 x half> parts ({0,3}, {1,4}, {2,5}), adds all three,
; and stores the <2 x half> sum (one 32-bit store). CHECK lines are
; autogenerated by update_llc_test_checks.py -- regenerate them rather than
; hand-editing.
1052 define void @vld3_v2f16(<6 x half> *%src, <2 x half> *%dst) {
1053 ; CHECK-LABEL: vld3_v2f16:
1054 ; CHECK: @ %bb.0: @ %entry
1055 ; CHECK-NEXT: ldrd r2, r3, [r0]
1056 ; CHECK-NEXT: ldr r0, [r0, #8]
1057 ; CHECK-NEXT: vmov.32 q0[0], r2
1058 ; CHECK-NEXT: vmov.32 q0[1], r3
1059 ; CHECK-NEXT: vmov.32 q0[2], r0
1060 ; CHECK-NEXT: vmovx.f16 s8, s0
1061 ; CHECK-NEXT: vmovx.f16 s4, s2
1062 ; CHECK-NEXT: vins.f16 s8, s2
1063 ; CHECK-NEXT: vmovx.f16 s2, s1
1064 ; CHECK-NEXT: vins.f16 s1, s4
1065 ; CHECK-NEXT: vins.f16 s0, s2
1066 ; CHECK-NEXT: vadd.f16 q1, q0, q2
1067 ; CHECK-NEXT: vmov.f32 s0, s1
1068 ; CHECK-NEXT: vadd.f16 q0, q1, q0
1069 ; CHECK-NEXT: vmov r0, s0
1070 ; CHECK-NEXT: str r0, [r1]
; IR under test: one interleaved load, three stride-3 shuffles, two fadds.
1073 %l1 = load <6 x half>, <6 x half>* %src, align 4
1074 %s1 = shufflevector <6 x half> %l1, <6 x half> undef, <2 x i32> <i32 0, i32 3>
1075 %s2 = shufflevector <6 x half> %l1, <6 x half> undef, <2 x i32> <i32 1, i32 4>
1076 %s3 = shufflevector <6 x half> %l1, <6 x half> undef, <2 x i32> <i32 2, i32 5>
1077 %a1 = fadd <2 x half> %s1, %s2
1078 %a = fadd <2 x half> %a1, %s3
1079 store <2 x half> %a, <2 x half> *%dst
; vld3_v4f16: loads 12 contiguous half-precision values and deinterleaves
; them into three stride-3 <4 x half> parts ({0,3,6,9}, {1,4,7,10},
; {2,5,8,11}), adds all three, and stores the <4 x half> sum (64 bits).
; CHECK lines are autogenerated by update_llc_test_checks.py -- regenerate
; them rather than hand-editing.
1083 define void @vld3_v4f16(<12 x half> *%src, <4 x half> *%dst) {
1084 ; CHECK-LABEL: vld3_v4f16:
1085 ; CHECK: @ %bb.0: @ %entry
1086 ; CHECK-NEXT: ldrd r2, r3, [r0, #16]
1087 ; CHECK-NEXT: vldrw.u32 q1, [r0]
1088 ; CHECK-NEXT: vmov.32 q2[0], r2
1089 ; CHECK-NEXT: vmovx.f16 s12, s4
1090 ; CHECK-NEXT: vmov.32 q2[1], r3
1091 ; CHECK-NEXT: vmovx.f16 s13, s7
1092 ; CHECK-NEXT: vmovx.f16 s0, s9
1093 ; CHECK-NEXT: vmov.f32 s1, s8
1094 ; CHECK-NEXT: vins.f16 s1, s0
1095 ; CHECK-NEXT: vmovx.f16 s0, s5
1096 ; CHECK-NEXT: vins.f16 s4, s0
1097 ; CHECK-NEXT: vmovx.f16 s0, s6
1098 ; CHECK-NEXT: vins.f16 s5, s0
1099 ; CHECK-NEXT: vmovx.f16 s0, s8
1100 ; CHECK-NEXT: vins.f16 s7, s0
1101 ; CHECK-NEXT: vmov.f32 s0, s5
1102 ; CHECK-NEXT: vins.f16 s12, s6
1103 ; CHECK-NEXT: vins.f16 s13, s9
1104 ; CHECK-NEXT: vmov.f32 s5, s7
1105 ; CHECK-NEXT: vadd.f16 q1, q1, q3
1106 ; CHECK-NEXT: vadd.f16 q0, q1, q0
1107 ; CHECK-NEXT: vmov r0, r2, d0
1108 ; CHECK-NEXT: strd r0, r2, [r1]
; IR under test: one interleaved load, three stride-3 shuffles, two fadds.
1111 %l1 = load <12 x half>, <12 x half>* %src, align 4
1112 %s1 = shufflevector <12 x half> %l1, <12 x half> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
1113 %s2 = shufflevector <12 x half> %l1, <12 x half> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
1114 %s3 = shufflevector <12 x half> %l1, <12 x half> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
1115 %a1 = fadd <4 x half> %s1, %s2
1116 %a = fadd <4 x half> %a1, %s3
1117 store <4 x half> %a, <4 x half> *%dst
; vld3_v8f16: loads 24 contiguous half-precision values and deinterleaves
; them into three stride-3 <8 x half> parts (indices 0,3,...,21 /
; 1,4,...,22 / 2,5,...,23), adds all three, and stores the <8 x half> sum.
; CHECK lines are autogenerated by update_llc_test_checks.py -- regenerate
; them rather than hand-editing.
1121 define void @vld3_v8f16(<24 x half> *%src, <8 x half> *%dst) {
1122 ; CHECK-LABEL: vld3_v8f16:
1123 ; CHECK: @ %bb.0: @ %entry
1124 ; CHECK-NEXT: .vsave {d8, d9, d10}
1125 ; CHECK-NEXT: vpush {d8, d9, d10}
1126 ; CHECK-NEXT: vldrw.u32 q0, [r0]
1127 ; CHECK-NEXT: vldrw.u32 q3, [r0, #16]
1128 ; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
1129 ; CHECK-NEXT: vmov.f32 s4, s1
1130 ; CHECK-NEXT: vmovx.f16 s6, s2
1131 ; CHECK-NEXT: vins.f16 s4, s6
1132 ; CHECK-NEXT: vmov.f32 s5, s12
1133 ; CHECK-NEXT: vmovx.f16 s6, s13
1134 ; CHECK-NEXT: vmov.f32 s7, s10
1135 ; CHECK-NEXT: vins.f16 s5, s6
1136 ; CHECK-NEXT: vmovx.f16 s6, s11
1137 ; CHECK-NEXT: vins.f16 s7, s6
1138 ; CHECK-NEXT: vmov.f32 s6, s15
1139 ; CHECK-NEXT: vmovx.f16 s16, s8
1140 ; CHECK-NEXT: vmovx.f16 s12, s12
1141 ; CHECK-NEXT: vmovx.f16 s15, s15
1142 ; CHECK-NEXT: vmov.f32 s20, s14
1143 ; CHECK-NEXT: vmovx.f16 s10, s10
1144 ; CHECK-NEXT: vmovx.f16 s17, s3
1145 ; CHECK-NEXT: vmovx.f16 s19, s9
1146 ; CHECK-NEXT: vmovx.f16 s18, s14
1147 ; CHECK-NEXT: vins.f16 s6, s16
1148 ; CHECK-NEXT: vmovx.f16 s16, s0
1149 ; CHECK-NEXT: vmovx.f16 s1, s1
1150 ; CHECK-NEXT: vins.f16 s20, s15
1151 ; CHECK-NEXT: vins.f16 s3, s12
1152 ; CHECK-NEXT: vins.f16 s9, s10
1153 ; CHECK-NEXT: vins.f16 s0, s1
1154 ; CHECK-NEXT: vins.f16 s16, s2
1155 ; CHECK-NEXT: vmov.f32 s1, s3
1156 ; CHECK-NEXT: vins.f16 s17, s13
1157 ; CHECK-NEXT: vins.f16 s19, s11
1158 ; CHECK-NEXT: vins.f16 s18, s8
1159 ; CHECK-NEXT: vmov.f32 s2, s20
1160 ; CHECK-NEXT: vmov.f32 s3, s9
1161 ; CHECK-NEXT: vadd.f16 q0, q0, q4
1162 ; CHECK-NEXT: vadd.f16 q0, q0, q1
1163 ; CHECK-NEXT: vstrw.32 q0, [r1]
1164 ; CHECK-NEXT: vpop {d8, d9, d10}
; IR under test: one interleaved load, three stride-3 shuffles, two fadds.
1167 %l1 = load <24 x half>, <24 x half>* %src, align 4
1168 %s1 = shufflevector <24 x half> %l1, <24 x half> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
1169 %s2 = shufflevector <24 x half> %l1, <24 x half> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
1170 %s3 = shufflevector <24 x half> %l1, <24 x half> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
1171 %a1 = fadd <8 x half> %s1, %s2
1172 %a = fadd <8 x half> %a1, %s3
1173 store <8 x half> %a, <8 x half> *%dst
; vld3_v16f16: loads 48 contiguous half-precision values and deinterleaves
; them into three stride-3 <16 x half> parts (indices 0,3,...,45 /
; 1,4,...,46 / 2,5,...,47), adds all three, and stores the <16 x half> sum
; (two 128-bit stores). CHECK lines are autogenerated by
; update_llc_test_checks.py -- regenerate them rather than hand-editing.
1177 define void @vld3_v16f16(<48 x half> *%src, <16 x half> *%dst) {
1178 ; CHECK-LABEL: vld3_v16f16:
1179 ; CHECK: @ %bb.0: @ %entry
1180 ; CHECK-NEXT: .vsave {d8, d9, d10}
1181 ; CHECK-NEXT: vpush {d8, d9, d10}
1182 ; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
1183 ; CHECK-NEXT: vldrw.u32 q3, [r0, #64]
1184 ; CHECK-NEXT: vldrw.u32 q2, [r0, #80]
1185 ; CHECK-NEXT: vmov.f32 s4, s1
1186 ; CHECK-NEXT: vmovx.f16 s6, s2
1187 ; CHECK-NEXT: vins.f16 s4, s6
1188 ; CHECK-NEXT: vmov.f32 s5, s12
1189 ; CHECK-NEXT: vmovx.f16 s6, s13
1190 ; CHECK-NEXT: vmov.f32 s7, s10
1191 ; CHECK-NEXT: vins.f16 s5, s6
1192 ; CHECK-NEXT: vmovx.f16 s6, s11
1193 ; CHECK-NEXT: vins.f16 s7, s6
1194 ; CHECK-NEXT: vmov.f32 s6, s15
1195 ; CHECK-NEXT: vmovx.f16 s16, s8
1196 ; CHECK-NEXT: vmovx.f16 s12, s12
1197 ; CHECK-NEXT: vmovx.f16 s15, s15
1198 ; CHECK-NEXT: vmov.f32 s20, s14
1199 ; CHECK-NEXT: vmovx.f16 s10, s10
1200 ; CHECK-NEXT: vmovx.f16 s17, s3
1201 ; CHECK-NEXT: vmovx.f16 s19, s9
1202 ; CHECK-NEXT: vmovx.f16 s18, s14
1203 ; CHECK-NEXT: vins.f16 s6, s16
1204 ; CHECK-NEXT: vmovx.f16 s16, s0
1205 ; CHECK-NEXT: vmovx.f16 s1, s1
1206 ; CHECK-NEXT: vins.f16 s20, s15
1207 ; CHECK-NEXT: vins.f16 s3, s12
1208 ; CHECK-NEXT: vins.f16 s9, s10
1209 ; CHECK-NEXT: vins.f16 s0, s1
1210 ; CHECK-NEXT: vins.f16 s16, s2
1211 ; CHECK-NEXT: vins.f16 s17, s13
1212 ; CHECK-NEXT: vmov.f32 s1, s3
1213 ; CHECK-NEXT: vins.f16 s19, s11
1214 ; CHECK-NEXT: vins.f16 s18, s8
1215 ; CHECK-NEXT: vmov.f32 s3, s9
1216 ; CHECK-NEXT: vldrw.u32 q3, [r0, #16]
1217 ; CHECK-NEXT: vmov.f32 s2, s20
1218 ; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
1219 ; CHECK-NEXT: vadd.f16 q0, q0, q4
1220 ; CHECK-NEXT: vmov.f32 s20, s14
1221 ; CHECK-NEXT: vadd.f16 q1, q0, q1
1222 ; CHECK-NEXT: vldrw.u32 q0, [r0]
1223 ; CHECK-NEXT: vstrw.32 q1, [r1, #16]
1224 ; CHECK-NEXT: vmov.f32 s5, s12
1225 ; CHECK-NEXT: vmov.f32 s4, s1
1226 ; CHECK-NEXT: vmovx.f16 s6, s2
1227 ; CHECK-NEXT: vins.f16 s4, s6
1228 ; CHECK-NEXT: vmovx.f16 s6, s13
1229 ; CHECK-NEXT: vins.f16 s5, s6
1230 ; CHECK-NEXT: vmov.f32 s7, s10
1231 ; CHECK-NEXT: vmovx.f16 s6, s11
1232 ; CHECK-NEXT: vmovx.f16 s16, s8
1233 ; CHECK-NEXT: vins.f16 s7, s6
1234 ; CHECK-NEXT: vmov.f32 s6, s15
1235 ; CHECK-NEXT: vmovx.f16 s15, s15
1236 ; CHECK-NEXT: vmovx.f16 s12, s12
1237 ; CHECK-NEXT: vmovx.f16 s10, s10
1238 ; CHECK-NEXT: vmovx.f16 s17, s3
1239 ; CHECK-NEXT: vmovx.f16 s19, s9
1240 ; CHECK-NEXT: vmovx.f16 s18, s14
1241 ; CHECK-NEXT: vins.f16 s6, s16
1242 ; CHECK-NEXT: vmovx.f16 s16, s0
1243 ; CHECK-NEXT: vmovx.f16 s1, s1
1244 ; CHECK-NEXT: vins.f16 s20, s15
1245 ; CHECK-NEXT: vins.f16 s3, s12
1246 ; CHECK-NEXT: vins.f16 s9, s10
1247 ; CHECK-NEXT: vins.f16 s0, s1
1248 ; CHECK-NEXT: vins.f16 s16, s2
1249 ; CHECK-NEXT: vmov.f32 s1, s3
1250 ; CHECK-NEXT: vins.f16 s17, s13
1251 ; CHECK-NEXT: vins.f16 s19, s11
1252 ; CHECK-NEXT: vins.f16 s18, s8
1253 ; CHECK-NEXT: vmov.f32 s2, s20
1254 ; CHECK-NEXT: vmov.f32 s3, s9
1255 ; CHECK-NEXT: vadd.f16 q0, q0, q4
1256 ; CHECK-NEXT: vadd.f16 q0, q0, q1
1257 ; CHECK-NEXT: vstrw.32 q0, [r1]
1258 ; CHECK-NEXT: vpop {d8, d9, d10}
; IR under test: one interleaved load, three stride-3 shuffles, two fadds.
1261 %l1 = load <48 x half>, <48 x half>* %src, align 4
1262 %s1 = shufflevector <48 x half> %l1, <48 x half> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
1263 %s2 = shufflevector <48 x half> %l1, <48 x half> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
1264 %s3 = shufflevector <48 x half> %l1, <48 x half> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
1265 %a1 = fadd <16 x half> %s1, %s2
1266 %a = fadd <16 x half> %a1, %s3
1267 store <16 x half> %a, <16 x half> *%dst
; vld3_v2f64: loads 6 contiguous doubles and deinterleaves them into three
; stride-3 <2 x double> parts ({0,3}, {1,4}, {2,5}), adds all three, and
; stores the <2 x double> sum (exercises scalar vadd.f64 with the +fp64
; feature from the RUN line). CHECK lines are autogenerated by
; update_llc_test_checks.py -- regenerate them rather than hand-editing.
1273 define void @vld3_v2f64(<6 x double> *%src, <2 x double> *%dst) {
1274 ; CHECK-LABEL: vld3_v2f64:
1275 ; CHECK: @ %bb.0: @ %entry
1276 ; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
1277 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
1278 ; CHECK-NEXT: vldrw.u32 q2, [r0]
1279 ; CHECK-NEXT: vadd.f64 d0, d3, d0
1280 ; CHECK-NEXT: vadd.f64 d3, d4, d5
1281 ; CHECK-NEXT: vadd.f64 d1, d0, d1
1282 ; CHECK-NEXT: vadd.f64 d0, d3, d2
1283 ; CHECK-NEXT: vstrw.32 q0, [r1]
; IR under test: one interleaved load, three stride-3 shuffles, two fadds.
1286 %l1 = load <6 x double>, <6 x double>* %src, align 4
1287 %s1 = shufflevector <6 x double> %l1, <6 x double> undef, <2 x i32> <i32 0, i32 3>
1288 %s2 = shufflevector <6 x double> %l1, <6 x double> undef, <2 x i32> <i32 1, i32 4>
1289 %s3 = shufflevector <6 x double> %l1, <6 x double> undef, <2 x i32> <i32 2, i32 5>
1290 %a1 = fadd <2 x double> %s1, %s2
1291 %a = fadd <2 x double> %a1, %s3
1292 store <2 x double> %a, <2 x double> *%dst
; vld3_v4f64: loads 12 contiguous doubles and deinterleaves them into three
; stride-3 <4 x double> parts ({0,3,6,9}, {1,4,7,10}, {2,5,8,11}), adds all
; three, and stores the <4 x double> sum (scalar vadd.f64 chains, two
; 128-bit stores). CHECK lines are autogenerated by
; update_llc_test_checks.py -- regenerate them rather than hand-editing.
1296 define void @vld3_v4f64(<12 x double> *%src, <4 x double> *%dst) {
1297 ; CHECK-LABEL: vld3_v4f64:
1298 ; CHECK: @ %bb.0: @ %entry
1299 ; CHECK-NEXT: .vsave {d8, d9}
1300 ; CHECK-NEXT: vpush {d8, d9}
1301 ; CHECK-NEXT: vldrw.u32 q1, [r0, #80]
1302 ; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
1303 ; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
1304 ; CHECK-NEXT: vldrw.u32 q3, [r0, #16]
1305 ; CHECK-NEXT: vadd.f64 d1, d1, d2
1306 ; CHECK-NEXT: vldrw.u32 q4, [r0]
1307 ; CHECK-NEXT: vadd.f64 d2, d4, d5
1308 ; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
1309 ; CHECK-NEXT: vadd.f64 d4, d7, d4
1310 ; CHECK-NEXT: vadd.f64 d7, d8, d9
1311 ; CHECK-NEXT: vadd.f64 d1, d1, d3
1312 ; CHECK-NEXT: vadd.f64 d0, d2, d0
1313 ; CHECK-NEXT: vadd.f64 d3, d4, d5
1314 ; CHECK-NEXT: vstrw.32 q0, [r1, #16]
1315 ; CHECK-NEXT: vadd.f64 d2, d7, d6
1316 ; CHECK-NEXT: vstrw.32 q1, [r1]
1317 ; CHECK-NEXT: vpop {d8, d9}
; IR under test: one interleaved load, three stride-3 shuffles, two fadds.
1320 %l1 = load <12 x double>, <12 x double>* %src, align 4
1321 %s1 = shufflevector <12 x double> %l1, <12 x double> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
1322 %s2 = shufflevector <12 x double> %l1, <12 x double> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
1323 %s3 = shufflevector <12 x double> %l1, <12 x double> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
1324 %a1 = fadd <4 x double> %s1, %s2
1325 %a = fadd <4 x double> %a1, %s3
1326 store <4 x double> %a, <4 x double> *%dst