1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -mve-max-interleave-factor=4 -verify-machineinstrs %s -o - | FileCheck %s
; vst4_v2i32: four <2 x i32> vectors are loaded from consecutive slots of %src,
; 4-way interleaved, and stored as a single <8 x i32> to %dst (align 4).
; The expected assembly below uses scalar loads (ldrd/ldm/ldr), vmov lane moves
; and two vstrw.32 stores; no vst40..vst43 instructions appear for this type.
6 define void @vst4_v2i32(<2 x i32> *%src, <8 x i32> *%dst) {
7 ; CHECK-LABEL: vst4_v2i32:
8 ; CHECK: @ %bb.0: @ %entry
9 ; CHECK-NEXT: .save {r4, r5, r6, lr}
10 ; CHECK-NEXT: push {r4, r5, r6, lr}
11 ; CHECK-NEXT: add.w r6, r0, #16
12 ; CHECK-NEXT: ldrd lr, r12, [r0]
13 ; CHECK-NEXT: ldrd r3, r2, [r0, #8]
14 ; CHECK-NEXT: ldm r6, {r4, r5, r6}
15 ; CHECK-NEXT: vmov q1[2], q1[0], lr, r3
16 ; CHECK-NEXT: ldr r0, [r0, #28]
17 ; CHECK-NEXT: vmov q1[3], q1[1], r12, r2
18 ; CHECK-NEXT: vmov q0[2], q0[0], r4, r6
19 ; CHECK-NEXT: vmov.f32 s8, s4
20 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r0
21 ; CHECK-NEXT: vmov.f32 s9, s6
22 ; CHECK-NEXT: vmov.f32 s4, s5
23 ; CHECK-NEXT: vmov.f32 s5, s7
24 ; CHECK-NEXT: vmov.f32 s10, s0
25 ; CHECK-NEXT: vmov.f32 s11, s2
26 ; CHECK-NEXT: vmov.f32 s6, s1
27 ; CHECK-NEXT: vstrw.32 q2, [r1]
28 ; CHECK-NEXT: vmov.f32 s7, s3
29 ; CHECK-NEXT: vstrw.32 q1, [r1, #16]
30 ; CHECK-NEXT: pop {r4, r5, r6, pc}
; Load the four <2 x i32> inputs from %src[0..3].
32 %s1 = getelementptr <2 x i32>, <2 x i32>* %src, i32 0
33 %l1 = load <2 x i32>, <2 x i32>* %s1, align 4
34 %s2 = getelementptr <2 x i32>, <2 x i32>* %src, i32 1
35 %l2 = load <2 x i32>, <2 x i32>* %s2, align 4
36 %s3 = getelementptr <2 x i32>, <2 x i32>* %src, i32 2
37 %l3 = load <2 x i32>, <2 x i32>* %s3, align 4
38 %s4 = getelementptr <2 x i32>, <2 x i32>* %src, i32 3
39 %l4 = load <2 x i32>, <2 x i32>* %s4, align 4
; Concatenate the inputs pairwise, then interleave: the final mask selects
; lane 0 of l1, l2, l3, l4 followed by lane 1 of l1, l2, l3, l4.
40 %t1 = shufflevector <2 x i32> %l1, <2 x i32> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
41 %t2 = shufflevector <2 x i32> %l3, <2 x i32> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
42 %s = shufflevector <4 x i32> %t1, <4 x i32> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
43 store <8 x i32> %s, <8 x i32> *%dst, align 4
; vst4_v4i32: four <4 x i32> vectors are loaded from consecutive slots of %src,
; 4-way interleaved, and stored as a single <16 x i32> to %dst (align 4).
; The expected assembly is four vldrw.u32 loads into q0-q3 followed by one
; vst40.32/vst41.32/vst42.32/vst43.32 sequence on [r1].
47 define void @vst4_v4i32(<4 x i32> *%src, <16 x i32> *%dst) {
48 ; CHECK-LABEL: vst4_v4i32:
49 ; CHECK: @ %bb.0: @ %entry
50 ; CHECK-NEXT: vldrw.u32 q3, [r0, #48]
51 ; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
52 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
53 ; CHECK-NEXT: vldrw.u32 q0, [r0]
54 ; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r1]
55 ; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r1]
56 ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r1]
57 ; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r1]
; Load the four <4 x i32> inputs from %src[0..3].
60 %s1 = getelementptr <4 x i32>, <4 x i32>* %src, i32 0
61 %l1 = load <4 x i32>, <4 x i32>* %s1, align 4
62 %s2 = getelementptr <4 x i32>, <4 x i32>* %src, i32 1
63 %l2 = load <4 x i32>, <4 x i32>* %s2, align 4
64 %s3 = getelementptr <4 x i32>, <4 x i32>* %src, i32 2
65 %l3 = load <4 x i32>, <4 x i32>* %s3, align 4
66 %s4 = getelementptr <4 x i32>, <4 x i32>* %src, i32 3
67 %l4 = load <4 x i32>, <4 x i32>* %s4, align 4
; Concatenate the inputs pairwise, then interleave: the final mask takes lane i
; of l1, l2, l3, l4 in turn for i = 0..3.
68 %t1 = shufflevector <4 x i32> %l1, <4 x i32> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
69 %t2 = shufflevector <4 x i32> %l3, <4 x i32> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
70 %s = shufflevector <8 x i32> %t1, <8 x i32> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
71 store <16 x i32> %s, <16 x i32> *%dst, align 4
; vst4_v8i32: four <8 x i32> vectors are loaded from consecutive slots of %src,
; 4-way interleaved, and stored as a single <32 x i32> to %dst (align 4).
; The expected assembly uses two vst40..vst43.32 groups (q4-q7, then q0-q3);
; the last store of the first group uses writeback ([r1]!) and d8-d15 are
; saved/restored with vpush/vpop.
75 define void @vst4_v8i32(<8 x i32> *%src, <32 x i32> *%dst) {
76 ; CHECK-LABEL: vst4_v8i32:
77 ; CHECK: @ %bb.0: @ %entry
78 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
79 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
80 ; CHECK-NEXT: vldrw.u32 q7, [r0, #96]
81 ; CHECK-NEXT: vldrw.u32 q6, [r0, #64]
82 ; CHECK-NEXT: vldrw.u32 q5, [r0, #32]
83 ; CHECK-NEXT: vldrw.u32 q4, [r0]
84 ; CHECK-NEXT: vldrw.u32 q3, [r0, #112]
85 ; CHECK-NEXT: vldrw.u32 q2, [r0, #80]
86 ; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
87 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
88 ; CHECK-NEXT: vst40.32 {q4, q5, q6, q7}, [r1]
89 ; CHECK-NEXT: vst41.32 {q4, q5, q6, q7}, [r1]
90 ; CHECK-NEXT: vst42.32 {q4, q5, q6, q7}, [r1]
91 ; CHECK-NEXT: vst43.32 {q4, q5, q6, q7}, [r1]!
92 ; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r1]
93 ; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r1]
94 ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r1]
95 ; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r1]
96 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; Load the four <8 x i32> inputs from %src[0..3].
99 %s1 = getelementptr <8 x i32>, <8 x i32>* %src, i32 0
100 %l1 = load <8 x i32>, <8 x i32>* %s1, align 4
101 %s2 = getelementptr <8 x i32>, <8 x i32>* %src, i32 1
102 %l2 = load <8 x i32>, <8 x i32>* %s2, align 4
103 %s3 = getelementptr <8 x i32>, <8 x i32>* %src, i32 2
104 %l3 = load <8 x i32>, <8 x i32>* %s3, align 4
105 %s4 = getelementptr <8 x i32>, <8 x i32>* %src, i32 3
106 %l4 = load <8 x i32>, <8 x i32>* %s4, align 4
; Concatenate the inputs pairwise, then interleave: the final mask takes lane i
; of l1, l2, l3, l4 in turn for i = 0..7.
107 %t1 = shufflevector <8 x i32> %l1, <8 x i32> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
108 %t2 = shufflevector <8 x i32> %l3, <8 x i32> %l4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
109 %s = shufflevector <16 x i32> %t1, <16 x i32> %t2, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
110 store <32 x i32> %s, <32 x i32> *%dst, align 4
; vst4_v16i32: four <16 x i32> vectors are loaded from consecutive slots of
; %src, 4-way interleaved, and stored as a single <64 x i32> to %dst (align 4).
; The expected assembly issues four vst40..vst43.32 groups. It also reserves
; 192 bytes of stack (.pad #192) and contains several 64-byte spill/reload
; pairs (vstmia/vldmia), so this test pins the current register-pressure
; behaviour for the largest i32 case.
114 define void @vst4_v16i32(<16 x i32> *%src, <64 x i32> *%dst) {
115 ; CHECK-LABEL: vst4_v16i32:
116 ; CHECK: @ %bb.0: @ %entry
117 ; CHECK-NEXT: .save {r4, r5}
118 ; CHECK-NEXT: push {r4, r5}
119 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
120 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
121 ; CHECK-NEXT: .pad #192
122 ; CHECK-NEXT: sub sp, #192
123 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
124 ; CHECK-NEXT: add r2, sp, #64
125 ; CHECK-NEXT: vldrw.u32 q1, [r0, #80]
126 ; CHECK-NEXT: vldrw.u32 q3, [r0, #208]
127 ; CHECK-NEXT: vldrw.u32 q2, [r0, #144]
128 ; CHECK-NEXT: vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
129 ; CHECK-NEXT: vldrw.u32 q1, [r0, #64]
130 ; CHECK-NEXT: add r2, sp, #128
131 ; CHECK-NEXT: vldrw.u32 q3, [r0, #192]
132 ; CHECK-NEXT: vldrw.u32 q2, [r0, #128]
133 ; CHECK-NEXT: vldrw.u32 q4, [r0, #240]
134 ; CHECK-NEXT: vstmia r2, {d2, d3, d4, d5, d6, d7, d8, d9} @ 64-byte Spill
135 ; CHECK-NEXT: add r2, sp, #128
136 ; CHECK-NEXT: vldrw.u32 q0, [r0]
137 ; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
138 ; CHECK-NEXT: add r2, sp, #128
139 ; CHECK-NEXT: vldrw.u32 q6, [r0, #176]
140 ; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
141 ; CHECK-NEXT: add r2, sp, #128
142 ; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
143 ; CHECK-NEXT: add r2, sp, #128
144 ; CHECK-NEXT: vldrw.u32 q5, [r0, #112]
145 ; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
146 ; CHECK-NEXT: add r2, sp, #128
147 ; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
148 ; CHECK-NEXT: add r2, sp, #128
149 ; CHECK-NEXT: vldrw.u32 q4, [r0, #48]
150 ; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
151 ; CHECK-NEXT: vldrw.u32 q2, [r0, #160]
152 ; CHECK-NEXT: vldrw.u32 q3, [r0, #224]
153 ; CHECK-NEXT: vldrw.u32 q1, [r0, #96]
154 ; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
155 ; CHECK-NEXT: vmov q6, q2
156 ; CHECK-NEXT: vmov q7, q3
157 ; CHECK-NEXT: vmov q5, q1
158 ; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
159 ; CHECK-NEXT: add r2, sp, #64
160 ; CHECK-NEXT: vldrw.u32 q4, [r0, #32]
161 ; CHECK-NEXT: mov r0, r1
162 ; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r1]
163 ; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r1]
164 ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r1]
165 ; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r0]!
166 ; CHECK-NEXT: vldmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
167 ; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r0]
168 ; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r0]
169 ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r0]
170 ; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r0]
171 ; CHECK-NEXT: add.w r0, r1, #192
172 ; CHECK-NEXT: adds r1, #128
173 ; CHECK-NEXT: vst40.32 {q4, q5, q6, q7}, [r1]
174 ; CHECK-NEXT: vst41.32 {q4, q5, q6, q7}, [r1]
175 ; CHECK-NEXT: vst42.32 {q4, q5, q6, q7}, [r1]
176 ; CHECK-NEXT: vst43.32 {q4, q5, q6, q7}, [r1]
177 ; CHECK-NEXT: add r1, sp, #128
178 ; CHECK-NEXT: vldmia r1, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
179 ; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r0]
180 ; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r0]
181 ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r0]
182 ; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r0]
183 ; CHECK-NEXT: add sp, #192
184 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
185 ; CHECK-NEXT: pop {r4, r5}
; Load the four <16 x i32> inputs from %src[0..3].
188 %s1 = getelementptr <16 x i32>, <16 x i32>* %src, i32 0
189 %l1 = load <16 x i32>, <16 x i32>* %s1, align 4
190 %s2 = getelementptr <16 x i32>, <16 x i32>* %src, i32 1
191 %l2 = load <16 x i32>, <16 x i32>* %s2, align 4
192 %s3 = getelementptr <16 x i32>, <16 x i32>* %src, i32 2
193 %l3 = load <16 x i32>, <16 x i32>* %s3, align 4
194 %s4 = getelementptr <16 x i32>, <16 x i32>* %src, i32 3
195 %l4 = load <16 x i32>, <16 x i32>* %s4, align 4
; Concatenate the inputs pairwise, then interleave: the final mask takes lane i
; of l1, l2, l3, l4 in turn for i = 0..15.
196 %t1 = shufflevector <16 x i32> %l1, <16 x i32> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
197 %t2 = shufflevector <16 x i32> %l3, <16 x i32> %l4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
198 %s = shufflevector <32 x i32> %t1, <32 x i32> %t2, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
199 store <64 x i32> %s, <64 x i32> *%dst, align 4
; vst4_v4i32_align1: same 4-way interleave of four <4 x i32> vectors as
; vst4_v4i32, but the final <16 x i32> store is only byte-aligned (align 1);
; the loads remain align 4.
; The expected assembly contains no vst40..vst43 instructions: the interleave
; is done with vmov.f32 lane moves and the result is written with four
; vstrb.8 stores.
203 define void @vst4_v4i32_align1(<4 x i32> *%src, <16 x i32> *%dst) {
204 ; CHECK-LABEL: vst4_v4i32_align1:
205 ; CHECK: @ %bb.0: @ %entry
206 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
207 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
208 ; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
209 ; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
210 ; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
211 ; CHECK-NEXT: vldrw.u32 q4, [r0]
212 ; CHECK-NEXT: vmov.f32 s14, s1
213 ; CHECK-NEXT: vmov.f32 s22, s0
214 ; CHECK-NEXT: vmov.f32 s26, s3
215 ; CHECK-NEXT: vmov.f32 s12, s17
216 ; CHECK-NEXT: vmov.f32 s13, s9
217 ; CHECK-NEXT: vmov.f32 s15, s5
218 ; CHECK-NEXT: vmov.f32 s20, s16
219 ; CHECK-NEXT: vstrb.8 q3, [r1, #16]
220 ; CHECK-NEXT: vmov.f32 s21, s8
221 ; CHECK-NEXT: vmov.f32 s23, s4
222 ; CHECK-NEXT: vmov.f32 s24, s19
223 ; CHECK-NEXT: vstrb.8 q5, [r1]
224 ; CHECK-NEXT: vmov.f32 s25, s11
225 ; CHECK-NEXT: vmov.f32 s27, s7
226 ; CHECK-NEXT: vmov.f32 s0, s18
227 ; CHECK-NEXT: vstrb.8 q6, [r1, #48]
228 ; CHECK-NEXT: vmov.f32 s1, s10
229 ; CHECK-NEXT: vmov.f32 s3, s6
230 ; CHECK-NEXT: vstrb.8 q0, [r1, #32]
231 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; Load the four <4 x i32> inputs from %src[0..3].
234 %s1 = getelementptr <4 x i32>, <4 x i32>* %src, i32 0
235 %l1 = load <4 x i32>, <4 x i32>* %s1, align 4
236 %s2 = getelementptr <4 x i32>, <4 x i32>* %src, i32 1
237 %l2 = load <4 x i32>, <4 x i32>* %s2, align 4
238 %s3 = getelementptr <4 x i32>, <4 x i32>* %src, i32 2
239 %l3 = load <4 x i32>, <4 x i32>* %s3, align 4
240 %s4 = getelementptr <4 x i32>, <4 x i32>* %src, i32 3
241 %l4 = load <4 x i32>, <4 x i32>* %s4, align 4
; Concatenate the inputs pairwise, then interleave lane i of l1, l2, l3, l4.
; The store below is the only under-aligned access in this function.
242 %t1 = shufflevector <4 x i32> %l1, <4 x i32> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
243 %t2 = shufflevector <4 x i32> %l3, <4 x i32> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
244 %s = shufflevector <8 x i32> %t1, <8 x i32> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
245 store <16 x i32> %s, <16 x i32> *%dst, align 1
; vst4_v2i16: four <2 x i16> vectors are loaded from consecutive slots of %src,
; 4-way interleaved, and stored as a single <8 x i16> to %dst (align 2).
; The expected assembly uses scalar ldrh loads, vmov.16 lane inserts into q1
; and a single vstrh.16 store; no vst40..vst43 instructions appear.
251 define void @vst4_v2i16(<2 x i16> *%src, <8 x i16> *%dst) {
252 ; CHECK-LABEL: vst4_v2i16:
253 ; CHECK: @ %bb.0: @ %entry
254 ; CHECK-NEXT: .save {r4, r5, r6, lr}
255 ; CHECK-NEXT: push {r4, r5, r6, lr}
256 ; CHECK-NEXT: ldrh r3, [r0, #2]
257 ; CHECK-NEXT: ldrh r2, [r0]
258 ; CHECK-NEXT: ldrh.w r12, [r0, #10]
259 ; CHECK-NEXT: ldrh.w lr, [r0, #4]
260 ; CHECK-NEXT: vmov q1[2], q1[0], r2, r3
261 ; CHECK-NEXT: ldrh r4, [r0, #12]
262 ; CHECK-NEXT: ldrh r5, [r0, #6]
263 ; CHECK-NEXT: ldrh r6, [r0, #14]
264 ; CHECK-NEXT: ldrh r0, [r0, #8]
265 ; CHECK-NEXT: vmov q0[2], q0[0], r0, r12
266 ; CHECK-NEXT: vmov r0, s4
267 ; CHECK-NEXT: vmov.16 q1[0], r0
268 ; CHECK-NEXT: vmov r0, s0
269 ; CHECK-NEXT: vmov.16 q1[1], lr
270 ; CHECK-NEXT: vmov.16 q1[2], r0
271 ; CHECK-NEXT: vmov.16 q1[3], r4
272 ; CHECK-NEXT: vmov.16 q1[4], r3
273 ; CHECK-NEXT: vmov.16 q1[5], r5
274 ; CHECK-NEXT: vmov.16 q1[6], r12
275 ; CHECK-NEXT: vmov.16 q1[7], r6
276 ; CHECK-NEXT: vstrh.16 q1, [r1]
277 ; CHECK-NEXT: pop {r4, r5, r6, pc}
; Load the four <2 x i16> inputs from %src[0..3].
279 %s1 = getelementptr <2 x i16>, <2 x i16>* %src, i32 0
280 %l1 = load <2 x i16>, <2 x i16>* %s1, align 4
281 %s2 = getelementptr <2 x i16>, <2 x i16>* %src, i32 1
282 %l2 = load <2 x i16>, <2 x i16>* %s2, align 4
283 %s3 = getelementptr <2 x i16>, <2 x i16>* %src, i32 2
284 %l3 = load <2 x i16>, <2 x i16>* %s3, align 4
285 %s4 = getelementptr <2 x i16>, <2 x i16>* %src, i32 3
286 %l4 = load <2 x i16>, <2 x i16>* %s4, align 4
; Concatenate the inputs pairwise, then interleave: the final mask selects
; lane 0 of l1, l2, l3, l4 followed by lane 1 of l1, l2, l3, l4.
; (The <N x i32> operand of each shufflevector is the shuffle mask.)
287 %t1 = shufflevector <2 x i16> %l1, <2 x i16> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
288 %t2 = shufflevector <2 x i16> %l3, <2 x i16> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
289 %s = shufflevector <4 x i16> %t1, <4 x i16> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
290 store <8 x i16> %s, <8 x i16> *%dst, align 2
; vst4_v4i16: four <4 x i16> vectors are loaded from consecutive slots of %src,
; 4-way interleaved, and stored as a single <16 x i16> to %dst (align 2).
; The expected assembly uses vldrh.u32 loads, moves lanes out to core
; registers, rebuilds two result vectors with vmov.16 lane inserts and writes
; them with two vstrh.16 stores; no vst40..vst43 instructions appear.
294 define void @vst4_v4i16(<4 x i16> *%src, <16 x i16> *%dst) {
295 ; CHECK-LABEL: vst4_v4i16:
296 ; CHECK: @ %bb.0: @ %entry
297 ; CHECK-NEXT: .save {r4, r5, r6, lr}
298 ; CHECK-NEXT: push {r4, r5, r6, lr}
299 ; CHECK-NEXT: .vsave {d8, d9}
300 ; CHECK-NEXT: vpush {d8, d9}
301 ; CHECK-NEXT: vldrh.u32 q0, [r0]
302 ; CHECK-NEXT: vldrh.u32 q3, [r0, #8]
303 ; CHECK-NEXT: vldrh.u32 q1, [r0, #16]
304 ; CHECK-NEXT: vldrh.u32 q4, [r0, #24]
305 ; CHECK-NEXT: vmov r3, r4, d1
306 ; CHECK-NEXT: vmov r5, r12, d0
307 ; CHECK-NEXT: vmov.16 q2[0], r3
308 ; CHECK-NEXT: vmov.16 q0[0], r5
309 ; CHECK-NEXT: vmov r0, r5, d7
310 ; CHECK-NEXT: vmov.16 q2[1], r0
311 ; CHECK-NEXT: vmov r2, lr, d3
312 ; CHECK-NEXT: vmov r0, r3, d9
313 ; CHECK-NEXT: vmov.16 q2[2], r2
314 ; CHECK-NEXT: vmov.16 q2[3], r0
315 ; CHECK-NEXT: vmov r0, r6, d8
316 ; CHECK-NEXT: vmov.16 q2[4], r4
317 ; CHECK-NEXT: vmov.16 q2[5], r5
318 ; CHECK-NEXT: vmov r4, r5, d6
319 ; CHECK-NEXT: vmov.16 q2[6], lr
320 ; CHECK-NEXT: vmov.16 q0[1], r4
321 ; CHECK-NEXT: vmov.16 q2[7], r3
322 ; CHECK-NEXT: vmov r3, r2, d2
323 ; CHECK-NEXT: vmov.16 q0[2], r3
324 ; CHECK-NEXT: vstrh.16 q2, [r1, #16]
325 ; CHECK-NEXT: vmov.16 q0[3], r0
326 ; CHECK-NEXT: vmov.16 q0[4], r12
327 ; CHECK-NEXT: vmov.16 q0[5], r5
328 ; CHECK-NEXT: vmov.16 q0[6], r2
329 ; CHECK-NEXT: vmov.16 q0[7], r6
330 ; CHECK-NEXT: vstrh.16 q0, [r1]
331 ; CHECK-NEXT: vpop {d8, d9}
332 ; CHECK-NEXT: pop {r4, r5, r6, pc}
; Load the four <4 x i16> inputs from %src[0..3].
334 %s1 = getelementptr <4 x i16>, <4 x i16>* %src, i32 0
335 %l1 = load <4 x i16>, <4 x i16>* %s1, align 4
336 %s2 = getelementptr <4 x i16>, <4 x i16>* %src, i32 1
337 %l2 = load <4 x i16>, <4 x i16>* %s2, align 4
338 %s3 = getelementptr <4 x i16>, <4 x i16>* %src, i32 2
339 %l3 = load <4 x i16>, <4 x i16>* %s3, align 4
340 %s4 = getelementptr <4 x i16>, <4 x i16>* %src, i32 3
341 %l4 = load <4 x i16>, <4 x i16>* %s4, align 4
; Concatenate the inputs pairwise, then interleave: the final mask takes lane i
; of l1, l2, l3, l4 in turn for i = 0..3.
342 %t1 = shufflevector <4 x i16> %l1, <4 x i16> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
343 %t2 = shufflevector <4 x i16> %l3, <4 x i16> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
344 %s = shufflevector <8 x i16> %t1, <8 x i16> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
345 store <16 x i16> %s, <16 x i16> *%dst, align 2
; vst4_v8i16: four <8 x i16> vectors are loaded from consecutive slots of %src,
; 4-way interleaved, and stored as a single <32 x i16> to %dst (align 2).
; The expected assembly is four vldrw.u32 loads into q0-q3 followed by one
; vst40.16/vst41.16/vst42.16/vst43.16 sequence on [r1].
349 define void @vst4_v8i16(<8 x i16> *%src, <32 x i16> *%dst) {
350 ; CHECK-LABEL: vst4_v8i16:
351 ; CHECK: @ %bb.0: @ %entry
352 ; CHECK-NEXT: vldrw.u32 q3, [r0, #48]
353 ; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
354 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
355 ; CHECK-NEXT: vldrw.u32 q0, [r0]
356 ; CHECK-NEXT: vst40.16 {q0, q1, q2, q3}, [r1]
357 ; CHECK-NEXT: vst41.16 {q0, q1, q2, q3}, [r1]
358 ; CHECK-NEXT: vst42.16 {q0, q1, q2, q3}, [r1]
359 ; CHECK-NEXT: vst43.16 {q0, q1, q2, q3}, [r1]
; Load the four <8 x i16> inputs from %src[0..3].
362 %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0
363 %l1 = load <8 x i16>, <8 x i16>* %s1, align 4
364 %s2 = getelementptr <8 x i16>, <8 x i16>* %src, i32 1
365 %l2 = load <8 x i16>, <8 x i16>* %s2, align 4
366 %s3 = getelementptr <8 x i16>, <8 x i16>* %src, i32 2
367 %l3 = load <8 x i16>, <8 x i16>* %s3, align 4
368 %s4 = getelementptr <8 x i16>, <8 x i16>* %src, i32 3
369 %l4 = load <8 x i16>, <8 x i16>* %s4, align 4
; Concatenate the inputs pairwise, then interleave: the final mask takes lane i
; of l1, l2, l3, l4 in turn for i = 0..7.
370 %t1 = shufflevector <8 x i16> %l1, <8 x i16> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
371 %t2 = shufflevector <8 x i16> %l3, <8 x i16> %l4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
372 %s = shufflevector <16 x i16> %t1, <16 x i16> %t2, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
373 store <32 x i16> %s, <32 x i16> *%dst, align 2
; vst4_v16i16: four <16 x i16> vectors are loaded from consecutive slots of
; %src, 4-way interleaved, and stored as a single <64 x i16> to %dst (align 2).
; The expected assembly uses two vst40..vst43.16 groups (q4-q7, then q0-q3);
; the last store of the first group uses writeback ([r1]!) and d8-d15 are
; saved/restored with vpush/vpop.
377 define void @vst4_v16i16(<16 x i16> *%src, <64 x i16> *%dst) {
378 ; CHECK-LABEL: vst4_v16i16:
379 ; CHECK: @ %bb.0: @ %entry
380 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
381 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
382 ; CHECK-NEXT: vldrw.u32 q7, [r0, #96]
383 ; CHECK-NEXT: vldrw.u32 q6, [r0, #64]
384 ; CHECK-NEXT: vldrw.u32 q5, [r0, #32]
385 ; CHECK-NEXT: vldrw.u32 q4, [r0]
386 ; CHECK-NEXT: vldrw.u32 q3, [r0, #112]
387 ; CHECK-NEXT: vldrw.u32 q2, [r0, #80]
388 ; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
389 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
390 ; CHECK-NEXT: vst40.16 {q4, q5, q6, q7}, [r1]
391 ; CHECK-NEXT: vst41.16 {q4, q5, q6, q7}, [r1]
392 ; CHECK-NEXT: vst42.16 {q4, q5, q6, q7}, [r1]
393 ; CHECK-NEXT: vst43.16 {q4, q5, q6, q7}, [r1]!
394 ; CHECK-NEXT: vst40.16 {q0, q1, q2, q3}, [r1]
395 ; CHECK-NEXT: vst41.16 {q0, q1, q2, q3}, [r1]
396 ; CHECK-NEXT: vst42.16 {q0, q1, q2, q3}, [r1]
397 ; CHECK-NEXT: vst43.16 {q0, q1, q2, q3}, [r1]
398 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; Load the four <16 x i16> inputs from %src[0..3].
401 %s1 = getelementptr <16 x i16>, <16 x i16>* %src, i32 0
402 %l1 = load <16 x i16>, <16 x i16>* %s1, align 4
403 %s2 = getelementptr <16 x i16>, <16 x i16>* %src, i32 1
404 %l2 = load <16 x i16>, <16 x i16>* %s2, align 4
405 %s3 = getelementptr <16 x i16>, <16 x i16>* %src, i32 2
406 %l3 = load <16 x i16>, <16 x i16>* %s3, align 4
407 %s4 = getelementptr <16 x i16>, <16 x i16>* %src, i32 3
408 %l4 = load <16 x i16>, <16 x i16>* %s4, align 4
; Concatenate the inputs pairwise, then interleave: the final mask takes lane i
; of l1, l2, l3, l4 in turn for i = 0..15.
409 %t1 = shufflevector <16 x i16> %l1, <16 x i16> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
410 %t2 = shufflevector <16 x i16> %l3, <16 x i16> %l4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
411 %s = shufflevector <32 x i16> %t1, <32 x i16> %t2, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
412 store <64 x i16> %s, <64 x i16> *%dst, align 2
; vst4_v8i16_align1: same 4-way interleave of four <8 x i16> vectors as
; vst4_v8i16, but the final <32 x i16> store is only byte-aligned (align 1);
; the loads remain align 4.
; The expected assembly contains no vst40..vst43 instructions: the interleave
; is built from vmovx.f16 / vins.f16 / vmov.f32 lane moves and the result is
; written with four vstrb.8 stores.
416 define void @vst4_v8i16_align1(<8 x i16> *%src, <32 x i16> *%dst) {
417 ; CHECK-LABEL: vst4_v8i16_align1:
418 ; CHECK: @ %bb.0: @ %entry
419 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
420 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
421 ; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
422 ; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
423 ; CHECK-NEXT: vldrw.u32 q4, [r0, #16]
424 ; CHECK-NEXT: vmovx.f16 s12, s5
425 ; CHECK-NEXT: vmovx.f16 s0, s9
426 ; CHECK-NEXT: vins.f16 s5, s9
427 ; CHECK-NEXT: vins.f16 s12, s0
428 ; CHECK-NEXT: vmov q0, q1
429 ; CHECK-NEXT: vmovx.f16 s27, s4
430 ; CHECK-NEXT: vins.f16 s4, s8
431 ; CHECK-NEXT: vmov.f32 s3, s12
432 ; CHECK-NEXT: vldrw.u32 q3, [r0]
433 ; CHECK-NEXT: vmov.f32 s5, s4
434 ; CHECK-NEXT: vmovx.f16 s8, s8
435 ; CHECK-NEXT: vmovx.f16 s0, s17
436 ; CHECK-NEXT: vmovx.f16 s2, s13
437 ; CHECK-NEXT: vins.f16 s27, s8
438 ; CHECK-NEXT: vmovx.f16 s4, s12
439 ; CHECK-NEXT: vmovx.f16 s8, s16
440 ; CHECK-NEXT: vins.f16 s13, s17
441 ; CHECK-NEXT: vins.f16 s12, s16
442 ; CHECK-NEXT: vmov q5, q3
443 ; CHECK-NEXT: vins.f16 s4, s8
444 ; CHECK-NEXT: vmov.f32 s22, s4
445 ; CHECK-NEXT: vmovx.f16 s4, s11
446 ; CHECK-NEXT: vmov.f32 s23, s27
447 ; CHECK-NEXT: vmovx.f16 s27, s7
448 ; CHECK-NEXT: vins.f16 s7, s11
449 ; CHECK-NEXT: vins.f16 s27, s4
450 ; CHECK-NEXT: vmovx.f16 s26, s15
451 ; CHECK-NEXT: vmovx.f16 s4, s19
452 ; CHECK-NEXT: vmov.f32 s25, s7
453 ; CHECK-NEXT: vins.f16 s26, s4
454 ; CHECK-NEXT: vmovx.f16 s7, s6
455 ; CHECK-NEXT: vmovx.f16 s4, s10
456 ; CHECK-NEXT: vins.f16 s6, s10
457 ; CHECK-NEXT: vmov.f32 s21, s5
458 ; CHECK-NEXT: vins.f16 s15, s19
459 ; CHECK-NEXT: vins.f16 s7, s4
460 ; CHECK-NEXT: vmov.f32 s5, s6
461 ; CHECK-NEXT: vmovx.f16 s6, s14
462 ; CHECK-NEXT: vmovx.f16 s4, s18
463 ; CHECK-NEXT: vins.f16 s14, s18
464 ; CHECK-NEXT: vins.f16 s2, s0
465 ; CHECK-NEXT: vmov.f32 s0, s13
466 ; CHECK-NEXT: vmov.f32 s24, s15
467 ; CHECK-NEXT: vins.f16 s6, s4
468 ; CHECK-NEXT: vmov.f32 s4, s14
469 ; CHECK-NEXT: vstrb.8 q6, [r1, #48]
470 ; CHECK-NEXT: vstrb.8 q1, [r1, #32]
471 ; CHECK-NEXT: vstrb.8 q0, [r1, #16]
472 ; CHECK-NEXT: vstrb.8 q5, [r1]
473 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; Load the four <8 x i16> inputs from %src[0..3].
476 %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0
477 %l1 = load <8 x i16>, <8 x i16>* %s1, align 4
478 %s2 = getelementptr <8 x i16>, <8 x i16>* %src, i32 1
479 %l2 = load <8 x i16>, <8 x i16>* %s2, align 4
480 %s3 = getelementptr <8 x i16>, <8 x i16>* %src, i32 2
481 %l3 = load <8 x i16>, <8 x i16>* %s3, align 4
482 %s4 = getelementptr <8 x i16>, <8 x i16>* %src, i32 3
483 %l4 = load <8 x i16>, <8 x i16>* %s4, align 4
; Concatenate the inputs pairwise, then interleave lane i of l1, l2, l3, l4.
; The store below is the only under-aligned access in this function.
484 %t1 = shufflevector <8 x i16> %l1, <8 x i16> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
485 %t2 = shufflevector <8 x i16> %l3, <8 x i16> %l4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
486 %s = shufflevector <16 x i16> %t1, <16 x i16> %t2, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
487 store <32 x i16> %s, <32 x i16> *%dst, align 1
; vst4_v2i8: four <2 x i8> vectors are loaded from consecutive slots of %src,
; 4-way interleaved, and stored as a single <8 x i8> to %dst (align 1).
; The expected assembly uses scalar ldrb loads, vmov.16 lane inserts into q0
; and a single vstrb.16 store; no vst40..vst43 instructions appear.
493 define void @vst4_v2i8(<2 x i8> *%src, <8 x i8> *%dst) {
494 ; CHECK-LABEL: vst4_v2i8:
495 ; CHECK: @ %bb.0: @ %entry
496 ; CHECK-NEXT: .save {r4, r5, r6, lr}
497 ; CHECK-NEXT: push {r4, r5, r6, lr}
498 ; CHECK-NEXT: ldrb r4, [r0, #5]
499 ; CHECK-NEXT: ldrb r5, [r0, #4]
500 ; CHECK-NEXT: ldrb r2, [r0]
501 ; CHECK-NEXT: ldrb r3, [r0, #1]
502 ; CHECK-NEXT: vmov q0[2], q0[0], r5, r4
503 ; CHECK-NEXT: vmov r5, s0
504 ; CHECK-NEXT: ldrb.w r12, [r0, #2]
505 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
506 ; CHECK-NEXT: ldrb.w lr, [r0, #3]
507 ; CHECK-NEXT: vmov r2, s0
508 ; CHECK-NEXT: ldrb r6, [r0, #7]
509 ; CHECK-NEXT: vmov.16 q0[0], r2
510 ; CHECK-NEXT: ldrb r0, [r0, #6]
511 ; CHECK-NEXT: vmov.16 q0[1], r12
512 ; CHECK-NEXT: vmov.16 q0[2], r5
513 ; CHECK-NEXT: vmov.16 q0[3], r0
514 ; CHECK-NEXT: vmov.16 q0[4], r3
515 ; CHECK-NEXT: vmov.16 q0[5], lr
516 ; CHECK-NEXT: vmov.16 q0[6], r4
517 ; CHECK-NEXT: vmov.16 q0[7], r6
518 ; CHECK-NEXT: vstrb.16 q0, [r1]
519 ; CHECK-NEXT: pop {r4, r5, r6, pc}
; Load the four <2 x i8> inputs from %src[0..3].
521 %s1 = getelementptr <2 x i8>, <2 x i8>* %src, i32 0
522 %l1 = load <2 x i8>, <2 x i8>* %s1, align 4
523 %s2 = getelementptr <2 x i8>, <2 x i8>* %src, i32 1
524 %l2 = load <2 x i8>, <2 x i8>* %s2, align 4
525 %s3 = getelementptr <2 x i8>, <2 x i8>* %src, i32 2
526 %l3 = load <2 x i8>, <2 x i8>* %s3, align 4
527 %s4 = getelementptr <2 x i8>, <2 x i8>* %src, i32 3
528 %l4 = load <2 x i8>, <2 x i8>* %s4, align 4
; Concatenate the inputs pairwise, then interleave: the final mask selects
; lane 0 of l1, l2, l3, l4 followed by lane 1 of l1, l2, l3, l4.
529 %t1 = shufflevector <2 x i8> %l1, <2 x i8> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
530 %t2 = shufflevector <2 x i8> %l3, <2 x i8> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
531 %s = shufflevector <4 x i8> %t1, <4 x i8> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
532 store <8 x i8> %s, <8 x i8> *%dst, align 1
; vst4_v4i8: four <4 x i8> vectors are loaded from consecutive slots of %src,
; 4-way interleaved, and stored as a single <16 x i8> to %dst (align 1).
; The expected assembly uses vldrb.u32 loads, moves lanes out to core
; registers, rebuilds the result in q0 with vmov.8 lane inserts and writes it
; with one vstrb.8 store; no vst40..vst43 instructions appear.
536 define void @vst4_v4i8(<4 x i8> *%src, <16 x i8> *%dst) {
537 ; CHECK-LABEL: vst4_v4i8:
538 ; CHECK: @ %bb.0: @ %entry
539 ; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
540 ; CHECK-NEXT: push {r4, r5, r6, r7, lr}
541 ; CHECK-NEXT: .pad #4
542 ; CHECK-NEXT: sub sp, #4
543 ; CHECK-NEXT: .vsave {d8, d9}
544 ; CHECK-NEXT: vpush {d8, d9}
545 ; CHECK-NEXT: vldrb.u32 q2, [r0]
546 ; CHECK-NEXT: vldrb.u32 q3, [r0, #4]
547 ; CHECK-NEXT: vldrb.u32 q1, [r0, #8]
548 ; CHECK-NEXT: vldrb.u32 q4, [r0, #12]
549 ; CHECK-NEXT: vmov r4, r5, d4
550 ; CHECK-NEXT: vmov.8 q0[0], r4
551 ; CHECK-NEXT: vmov r2, lr, d6
552 ; CHECK-NEXT: vmov.8 q0[1], r2
553 ; CHECK-NEXT: vmov r0, r4, d2
554 ; CHECK-NEXT: vmov r3, r12, d8
555 ; CHECK-NEXT: vmov.8 q0[2], r0
556 ; CHECK-NEXT: vmov.8 q0[3], r3
557 ; CHECK-NEXT: vmov r2, r7, d9
558 ; CHECK-NEXT: vmov.8 q0[4], r5
559 ; CHECK-NEXT: vmov r3, r5, d7
560 ; CHECK-NEXT: vmov.8 q0[5], lr
561 ; CHECK-NEXT: vmov.8 q0[6], r4
562 ; CHECK-NEXT: vmov r4, r0, d5
563 ; CHECK-NEXT: vmov.8 q0[7], r12
564 ; CHECK-NEXT: vmov.8 q0[8], r4
565 ; CHECK-NEXT: vmov r4, r6, d3
566 ; CHECK-NEXT: vmov.8 q0[9], r3
567 ; CHECK-NEXT: vmov.8 q0[10], r4
568 ; CHECK-NEXT: vmov.8 q0[11], r2
569 ; CHECK-NEXT: vmov.8 q0[12], r0
570 ; CHECK-NEXT: vmov.8 q0[13], r5
571 ; CHECK-NEXT: vmov.8 q0[14], r6
572 ; CHECK-NEXT: vmov.8 q0[15], r7
573 ; CHECK-NEXT: vstrb.8 q0, [r1]
574 ; CHECK-NEXT: vpop {d8, d9}
575 ; CHECK-NEXT: add sp, #4
576 ; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
; Load the four <4 x i8> inputs from %src[0..3].
578 %s1 = getelementptr <4 x i8>, <4 x i8>* %src, i32 0
579 %l1 = load <4 x i8>, <4 x i8>* %s1, align 4
580 %s2 = getelementptr <4 x i8>, <4 x i8>* %src, i32 1
581 %l2 = load <4 x i8>, <4 x i8>* %s2, align 4
582 %s3 = getelementptr <4 x i8>, <4 x i8>* %src, i32 2
583 %l3 = load <4 x i8>, <4 x i8>* %s3, align 4
584 %s4 = getelementptr <4 x i8>, <4 x i8>* %src, i32 3
585 %l4 = load <4 x i8>, <4 x i8>* %s4, align 4
; Concatenate the inputs pairwise, then interleave: the final mask takes lane i
; of l1, l2, l3, l4 in turn for i = 0..3.
586 %t1 = shufflevector <4 x i8> %l1, <4 x i8> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
587 %t2 = shufflevector <4 x i8> %l3, <4 x i8> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
588 %s = shufflevector <8 x i8> %t1, <8 x i8> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
589 store <16 x i8> %s, <16 x i8> *%dst, align 1
; vst4_v8i8: four <8 x i8> vectors are loaded from consecutive slots of %src,
; 4-way interleaved, and stored as a single <32 x i8> to %dst (align 1).
; The expected assembly uses vldrb.u16 loads, then builds two result vectors
; (q0 for the upper half, q5 for the lower half) one byte at a time with
; vmov.u16 extract / vmov.8 insert pairs, and writes them with two vstrb.8
; stores; no vst40..vst43 instructions appear.
593 define void @vst4_v8i8(<8 x i8> *%src, <32 x i8> *%dst) {
594 ; CHECK-LABEL: vst4_v8i8:
595 ; CHECK: @ %bb.0: @ %entry
596 ; CHECK-NEXT: .vsave {d8, d9, d10, d11}
597 ; CHECK-NEXT: vpush {d8, d9, d10, d11}
598 ; CHECK-NEXT: vldrb.u16 q1, [r0]
599 ; CHECK-NEXT: vldrb.u16 q2, [r0, #8]
600 ; CHECK-NEXT: vldrb.u16 q3, [r0, #16]
601 ; CHECK-NEXT: vldrb.u16 q4, [r0, #24]
602 ; CHECK-NEXT: vmov.u16 r2, q1[4]
603 ; CHECK-NEXT: vmov.8 q0[0], r2
604 ; CHECK-NEXT: vmov.u16 r2, q2[4]
605 ; CHECK-NEXT: vmov.8 q0[1], r2
606 ; CHECK-NEXT: vmov.u16 r2, q3[4]
607 ; CHECK-NEXT: vmov.8 q0[2], r2
608 ; CHECK-NEXT: vmov.u16 r0, q4[4]
609 ; CHECK-NEXT: vmov.8 q0[3], r0
610 ; CHECK-NEXT: vmov.u16 r0, q1[5]
611 ; CHECK-NEXT: vmov.8 q0[4], r0
612 ; CHECK-NEXT: vmov.u16 r0, q2[5]
613 ; CHECK-NEXT: vmov.8 q0[5], r0
614 ; CHECK-NEXT: vmov.u16 r0, q3[5]
615 ; CHECK-NEXT: vmov.8 q0[6], r0
616 ; CHECK-NEXT: vmov.u16 r0, q4[5]
617 ; CHECK-NEXT: vmov.8 q0[7], r0
618 ; CHECK-NEXT: vmov.u16 r0, q1[6]
619 ; CHECK-NEXT: vmov.8 q0[8], r0
620 ; CHECK-NEXT: vmov.u16 r0, q2[6]
621 ; CHECK-NEXT: vmov.8 q0[9], r0
622 ; CHECK-NEXT: vmov.u16 r0, q3[6]
623 ; CHECK-NEXT: vmov.8 q0[10], r0
624 ; CHECK-NEXT: vmov.u16 r0, q4[6]
625 ; CHECK-NEXT: vmov.8 q0[11], r0
626 ; CHECK-NEXT: vmov.u16 r0, q1[7]
627 ; CHECK-NEXT: vmov.8 q0[12], r0
628 ; CHECK-NEXT: vmov.u16 r0, q2[7]
629 ; CHECK-NEXT: vmov.8 q0[13], r0
630 ; CHECK-NEXT: vmov.u16 r0, q3[7]
631 ; CHECK-NEXT: vmov.8 q0[14], r0
632 ; CHECK-NEXT: vmov.u16 r0, q4[7]
633 ; CHECK-NEXT: vmov.8 q0[15], r0
634 ; CHECK-NEXT: vmov.u16 r0, q1[0]
635 ; CHECK-NEXT: vmov.8 q5[0], r0
636 ; CHECK-NEXT: vmov.u16 r0, q2[0]
637 ; CHECK-NEXT: vmov.8 q5[1], r0
638 ; CHECK-NEXT: vmov.u16 r0, q3[0]
639 ; CHECK-NEXT: vmov.8 q5[2], r0
640 ; CHECK-NEXT: vmov.u16 r0, q4[0]
641 ; CHECK-NEXT: vmov.8 q5[3], r0
642 ; CHECK-NEXT: vmov.u16 r0, q1[1]
643 ; CHECK-NEXT: vmov.8 q5[4], r0
644 ; CHECK-NEXT: vmov.u16 r0, q2[1]
645 ; CHECK-NEXT: vmov.8 q5[5], r0
646 ; CHECK-NEXT: vmov.u16 r0, q3[1]
647 ; CHECK-NEXT: vmov.8 q5[6], r0
648 ; CHECK-NEXT: vmov.u16 r0, q4[1]
649 ; CHECK-NEXT: vmov.8 q5[7], r0
650 ; CHECK-NEXT: vmov.u16 r0, q1[2]
651 ; CHECK-NEXT: vmov.8 q5[8], r0
652 ; CHECK-NEXT: vmov.u16 r0, q2[2]
653 ; CHECK-NEXT: vmov.8 q5[9], r0
654 ; CHECK-NEXT: vmov.u16 r0, q3[2]
655 ; CHECK-NEXT: vmov.8 q5[10], r0
656 ; CHECK-NEXT: vmov.u16 r0, q4[2]
657 ; CHECK-NEXT: vmov.8 q5[11], r0
658 ; CHECK-NEXT: vmov.u16 r0, q1[3]
659 ; CHECK-NEXT: vmov.8 q5[12], r0
660 ; CHECK-NEXT: vmov.u16 r0, q2[3]
661 ; CHECK-NEXT: vmov.8 q5[13], r0
662 ; CHECK-NEXT: vmov.u16 r0, q3[3]
663 ; CHECK-NEXT: vmov.8 q5[14], r0
664 ; CHECK-NEXT: vmov.u16 r0, q4[3]
665 ; CHECK-NEXT: vmov.8 q5[15], r0
666 ; CHECK-NEXT: vstrb.8 q0, [r1, #16]
667 ; CHECK-NEXT: vstrb.8 q5, [r1]
668 ; CHECK-NEXT: vpop {d8, d9, d10, d11}
; Load the four <8 x i8> inputs from %src[0..3].
671 %s1 = getelementptr <8 x i8>, <8 x i8>* %src, i32 0
672 %l1 = load <8 x i8>, <8 x i8>* %s1, align 4
673 %s2 = getelementptr <8 x i8>, <8 x i8>* %src, i32 1
674 %l2 = load <8 x i8>, <8 x i8>* %s2, align 4
675 %s3 = getelementptr <8 x i8>, <8 x i8>* %src, i32 2
676 %l3 = load <8 x i8>, <8 x i8>* %s3, align 4
677 %s4 = getelementptr <8 x i8>, <8 x i8>* %src, i32 3
678 %l4 = load <8 x i8>, <8 x i8>* %s4, align 4
; Concatenate the inputs pairwise, then interleave: the final mask takes lane i
; of l1, l2, l3, l4 in turn for i = 0..7.
679 %t1 = shufflevector <8 x i8> %l1, <8 x i8> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
680 %t2 = shufflevector <8 x i8> %l3, <8 x i8> %l4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
681 %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
682 store <32 x i8> %s, <32 x i8> *%dst, align 1
; vst4_v16i8: 4-way interleaving store of <16 x i8> vectors.
; Lane i of each of the four inputs lands in output lanes 4*i .. 4*i+3 of the
; <64 x i8> value stored to %dst (store alignment 1).
; Expected codegen: four vldrw.u32 loads followed by one
; vst40.8/vst41.8/vst42.8/vst43.8 group on [r1].
686 define void @vst4_v16i8(<16 x i8> *%src, <64 x i8> *%dst) {
687 ; CHECK-LABEL: vst4_v16i8:
688 ; CHECK: @ %bb.0: @ %entry
689 ; CHECK-NEXT: vldrw.u32 q3, [r0, #48]
690 ; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
691 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
692 ; CHECK-NEXT: vldrw.u32 q0, [r0]
693 ; CHECK-NEXT: vst40.8 {q0, q1, q2, q3}, [r1]
694 ; CHECK-NEXT: vst41.8 {q0, q1, q2, q3}, [r1]
695 ; CHECK-NEXT: vst42.8 {q0, q1, q2, q3}, [r1]
696 ; CHECK-NEXT: vst43.8 {q0, q1, q2, q3}, [r1]
; Load the four inputs from consecutive <16 x i8> slots of %src.
699 %s1 = getelementptr <16 x i8>, <16 x i8>* %src, i32 0
700 %l1 = load <16 x i8>, <16 x i8>* %s1, align 4
701 %s2 = getelementptr <16 x i8>, <16 x i8>* %src, i32 1
702 %l2 = load <16 x i8>, <16 x i8>* %s2, align 4
703 %s3 = getelementptr <16 x i8>, <16 x i8>* %src, i32 2
704 %l3 = load <16 x i8>, <16 x i8>* %s3, align 4
705 %s4 = getelementptr <16 x i8>, <16 x i8>* %src, i32 3
706 %l4 = load <16 x i8>, <16 x i8>* %s4, align 4
; Concatenate the inputs pairwise, then interleave all four with one shuffle.
707 %t1 = shufflevector <16 x i8> %l1, <16 x i8> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
708 %t2 = shufflevector <16 x i8> %l3, <16 x i8> %l4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
709 %s = shufflevector <32 x i8> %t1, <32 x i8> %t2, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
710 store <64 x i8> %s, <64 x i8> *%dst, align 1
; vst4_v2i64: 4-way interleaving store of <2 x i64> vectors.
; Lane i of each of the four inputs lands in output lanes 4*i .. 4*i+3 of the
; <8 x i64> value stored to %dst (store alignment 8).
; Expected codegen contains no vst4x instructions: the interleave is done with
; vmov.f32 lane moves and written out with four vstrw.32 stores.
716 define void @vst4_v2i64(<2 x i64> *%src, <8 x i64> *%dst) {
717 ; CHECK-LABEL: vst4_v2i64:
718 ; CHECK: @ %bb.0: @ %entry
719 ; CHECK-NEXT: .vsave {d8, d9, d10, d11}
720 ; CHECK-NEXT: vpush {d8, d9, d10, d11}
721 ; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
722 ; CHECK-NEXT: vldrw.u32 q4, [r0, #32]
723 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
724 ; CHECK-NEXT: vldrw.u32 q2, [r0]
725 ; CHECK-NEXT: vmov.f32 s14, s0
726 ; CHECK-NEXT: vmov.f32 s15, s1
727 ; CHECK-NEXT: vmov.f32 s22, s4
728 ; CHECK-NEXT: vmov.f32 s23, s5
729 ; CHECK-NEXT: vmov.f32 s12, s16
730 ; CHECK-NEXT: vmov.f32 s13, s17
731 ; CHECK-NEXT: vmov.f32 s20, s8
732 ; CHECK-NEXT: vstrw.32 q3, [r1, #16]
733 ; CHECK-NEXT: vmov.f32 s21, s9
734 ; CHECK-NEXT: vmov.f32 s0, s18
735 ; CHECK-NEXT: vstrw.32 q5, [r1]
736 ; CHECK-NEXT: vmov.f32 s1, s19
737 ; CHECK-NEXT: vmov.f32 s4, s10
738 ; CHECK-NEXT: vstrw.32 q0, [r1, #48]
739 ; CHECK-NEXT: vmov.f32 s5, s11
740 ; CHECK-NEXT: vstrw.32 q1, [r1, #32]
741 ; CHECK-NEXT: vpop {d8, d9, d10, d11}
; Load the four inputs from consecutive <2 x i64> slots of %src.
744 %s1 = getelementptr <2 x i64>, <2 x i64>* %src, i32 0
745 %l1 = load <2 x i64>, <2 x i64>* %s1, align 4
746 %s2 = getelementptr <2 x i64>, <2 x i64>* %src, i32 1
747 %l2 = load <2 x i64>, <2 x i64>* %s2, align 4
748 %s3 = getelementptr <2 x i64>, <2 x i64>* %src, i32 2
749 %l3 = load <2 x i64>, <2 x i64>* %s3, align 4
750 %s4 = getelementptr <2 x i64>, <2 x i64>* %src, i32 3
751 %l4 = load <2 x i64>, <2 x i64>* %s4, align 4
; Concatenate the inputs pairwise, then interleave all four with one shuffle.
752 %t1 = shufflevector <2 x i64> %l1, <2 x i64> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
753 %t2 = shufflevector <2 x i64> %l3, <2 x i64> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
754 %s = shufflevector <4 x i64> %t1, <4 x i64> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
755 store <8 x i64> %s, <8 x i64> *%dst, align 8
; vst4_v4i64: 4-way interleaving store of <4 x i64> vectors.
; Lane i of each of the four inputs lands in output lanes 4*i .. 4*i+3 of the
; <16 x i64> value stored to %dst (store alignment 8).
; Expected codegen contains no vst4x instructions: it reserves 64 bytes of
; stack for 16-byte spills/reloads, shuffles with vmov.f32/vmov.f64 moves and
; writes the result with eight vstrw.32 stores to r1.
759 define void @vst4_v4i64(<4 x i64> *%src, <16 x i64> *%dst) {
760 ; CHECK-LABEL: vst4_v4i64:
761 ; CHECK: @ %bb.0: @ %entry
762 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
763 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
764 ; CHECK-NEXT: .pad #64
765 ; CHECK-NEXT: sub sp, #64
766 ; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
767 ; CHECK-NEXT: vldrw.u32 q7, [r0]
768 ; CHECK-NEXT: vldrw.u32 q2, [r0, #96]
769 ; CHECK-NEXT: vldrw.u32 q3, [r0, #64]
770 ; CHECK-NEXT: vmov.f32 s6, s0
771 ; CHECK-NEXT: vldrw.u32 q5, [r0, #112]
772 ; CHECK-NEXT: vmov.f32 s7, s1
773 ; CHECK-NEXT: vldrw.u32 q4, [r0, #48]
774 ; CHECK-NEXT: vmov.f64 d13, d1
775 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
776 ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
777 ; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
778 ; CHECK-NEXT: vmov.f32 s4, s28
779 ; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
780 ; CHECK-NEXT: vmov.f32 s5, s29
781 ; CHECK-NEXT: vmov.f32 s24, s30
782 ; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill
783 ; CHECK-NEXT: vmov.f32 s25, s31
784 ; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload
785 ; CHECK-NEXT: vmov.f32 s6, s8
786 ; CHECK-NEXT: vstrw.32 q6, [sp, #48] @ 16-byte Spill
787 ; CHECK-NEXT: vmov.f32 s7, s9
788 ; CHECK-NEXT: vmov.f32 s4, s12
789 ; CHECK-NEXT: vmov.f32 s5, s13
790 ; CHECK-NEXT: vmov.f32 s8, s14
791 ; CHECK-NEXT: vstrw.32 q1, [r1, #16]
792 ; CHECK-NEXT: vmov.f32 s9, s15
793 ; CHECK-NEXT: vldrw.u32 q3, [sp, #32] @ 16-byte Reload
794 ; CHECK-NEXT: vmov.f64 d1, d15
795 ; CHECK-NEXT: vstrw.32 q2, [r1, #48]
796 ; CHECK-NEXT: vmov.f64 d13, d7
797 ; CHECK-NEXT: vmov.f32 s14, s20
798 ; CHECK-NEXT: vmov.f32 s15, s21
799 ; CHECK-NEXT: vmov.f32 s30, s16
800 ; CHECK-NEXT: vstrw.32 q3, [r1, #80]
801 ; CHECK-NEXT: vmov.f32 s31, s17
802 ; CHECK-NEXT: vldrw.u32 q3, [sp, #48] @ 16-byte Reload
803 ; CHECK-NEXT: vmov.f32 s16, s2
804 ; CHECK-NEXT: vstrw.32 q7, [r1, #64]
805 ; CHECK-NEXT: vmov.f32 s17, s3
806 ; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
807 ; CHECK-NEXT: vmov.f32 s20, s26
808 ; CHECK-NEXT: vstrw.32 q4, [r1, #96]
809 ; CHECK-NEXT: vmov.f32 s21, s27
810 ; CHECK-NEXT: vstrw.32 q3, [r1, #32]
811 ; CHECK-NEXT: vstrw.32 q5, [r1, #112]
812 ; CHECK-NEXT: vstrw.32 q0, [r1]
813 ; CHECK-NEXT: add sp, #64
814 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; Load the four inputs from consecutive <4 x i64> slots of %src.
817 %s1 = getelementptr <4 x i64>, <4 x i64>* %src, i32 0
818 %l1 = load <4 x i64>, <4 x i64>* %s1, align 4
819 %s2 = getelementptr <4 x i64>, <4 x i64>* %src, i32 1
820 %l2 = load <4 x i64>, <4 x i64>* %s2, align 4
821 %s3 = getelementptr <4 x i64>, <4 x i64>* %src, i32 2
822 %l3 = load <4 x i64>, <4 x i64>* %s3, align 4
823 %s4 = getelementptr <4 x i64>, <4 x i64>* %src, i32 3
824 %l4 = load <4 x i64>, <4 x i64>* %s4, align 4
; Concatenate the inputs pairwise, then interleave all four with one shuffle.
825 %t1 = shufflevector <4 x i64> %l1, <4 x i64> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
826 %t2 = shufflevector <4 x i64> %l3, <4 x i64> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
827 %s = shufflevector <8 x i64> %t1, <8 x i64> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
828 store <16 x i64> %s, <16 x i64> *%dst, align 8
; vst4_v2f32: 4-way interleaving store of <2 x float> vectors.
; Lane i of each of the four inputs lands in output lanes 4*i .. 4*i+3 of the
; <8 x float> value stored to %dst (store alignment 4).
; Expected codegen: eight scalar vldr loads that place each element directly
; in its final s-register, followed by two vstrw.32 stores.
834 define void @vst4_v2f32(<2 x float> *%src, <8 x float> *%dst) {
835 ; CHECK-LABEL: vst4_v2f32:
836 ; CHECK: @ %bb.0: @ %entry
837 ; CHECK-NEXT: vldr s0, [r0]
838 ; CHECK-NEXT: vldr s4, [r0, #4]
839 ; CHECK-NEXT: vldr s1, [r0, #8]
840 ; CHECK-NEXT: vldr s5, [r0, #12]
841 ; CHECK-NEXT: vldr s2, [r0, #16]
842 ; CHECK-NEXT: vldr s6, [r0, #20]
843 ; CHECK-NEXT: vldr s3, [r0, #24]
844 ; CHECK-NEXT: vldr s7, [r0, #28]
845 ; CHECK-NEXT: vstrw.32 q0, [r1]
846 ; CHECK-NEXT: vstrw.32 q1, [r1, #16]
; Load the four inputs from consecutive <2 x float> slots of %src.
849 %s1 = getelementptr <2 x float>, <2 x float>* %src, i32 0
850 %l1 = load <2 x float>, <2 x float>* %s1, align 4
851 %s2 = getelementptr <2 x float>, <2 x float>* %src, i32 1
852 %l2 = load <2 x float>, <2 x float>* %s2, align 4
853 %s3 = getelementptr <2 x float>, <2 x float>* %src, i32 2
854 %l3 = load <2 x float>, <2 x float>* %s3, align 4
855 %s4 = getelementptr <2 x float>, <2 x float>* %src, i32 3
856 %l4 = load <2 x float>, <2 x float>* %s4, align 4
; Concatenate the inputs pairwise, then interleave all four with one shuffle.
857 %t1 = shufflevector <2 x float> %l1, <2 x float> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
858 %t2 = shufflevector <2 x float> %l3, <2 x float> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
859 %s = shufflevector <4 x float> %t1, <4 x float> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
860 store <8 x float> %s, <8 x float> *%dst, align 4
; vst4_v4f32: 4-way interleaving store of <4 x float> vectors.
; Lane i of each of the four inputs lands in output lanes 4*i .. 4*i+3 of the
; <16 x float> value stored to %dst (store alignment 4).
; Expected codegen: four vldrw.u32 loads followed by one
; vst40.32/vst41.32/vst42.32/vst43.32 group on [r1].
864 define void @vst4_v4f32(<4 x float> *%src, <16 x float> *%dst) {
865 ; CHECK-LABEL: vst4_v4f32:
866 ; CHECK: @ %bb.0: @ %entry
867 ; CHECK-NEXT: vldrw.u32 q3, [r0, #48]
868 ; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
869 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
870 ; CHECK-NEXT: vldrw.u32 q0, [r0]
871 ; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r1]
872 ; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r1]
873 ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r1]
874 ; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r1]
; Load the four inputs from consecutive <4 x float> slots of %src.
877 %s1 = getelementptr <4 x float>, <4 x float>* %src, i32 0
878 %l1 = load <4 x float>, <4 x float>* %s1, align 4
879 %s2 = getelementptr <4 x float>, <4 x float>* %src, i32 1
880 %l2 = load <4 x float>, <4 x float>* %s2, align 4
881 %s3 = getelementptr <4 x float>, <4 x float>* %src, i32 2
882 %l3 = load <4 x float>, <4 x float>* %s3, align 4
883 %s4 = getelementptr <4 x float>, <4 x float>* %src, i32 3
884 %l4 = load <4 x float>, <4 x float>* %s4, align 4
; Concatenate the inputs pairwise, then interleave all four with one shuffle.
885 %t1 = shufflevector <4 x float> %l1, <4 x float> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
886 %t2 = shufflevector <4 x float> %l3, <4 x float> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
887 %s = shufflevector <8 x float> %t1, <8 x float> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
888 store <16 x float> %s, <16 x float> *%dst, align 4
; vst4_v8f32: 4-way interleaving store of <8 x float> vectors.
; Lane i of each of the four inputs lands in output lanes 4*i .. 4*i+3 of the
; <32 x float> value stored to %dst (store alignment 4).
; Expected codegen: eight vldrw.u32 loads and two vst40..vst43.32 groups; the
; last store of the first group uses the writeback form "[r1]!" so the second
; group continues from the updated r1. d8-d15 are pushed and popped.
892 define void @vst4_v8f32(<8 x float> *%src, <32 x float> *%dst) {
893 ; CHECK-LABEL: vst4_v8f32:
894 ; CHECK: @ %bb.0: @ %entry
895 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
896 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
897 ; CHECK-NEXT: vldrw.u32 q7, [r0, #96]
898 ; CHECK-NEXT: vldrw.u32 q6, [r0, #64]
899 ; CHECK-NEXT: vldrw.u32 q5, [r0, #32]
900 ; CHECK-NEXT: vldrw.u32 q4, [r0]
901 ; CHECK-NEXT: vldrw.u32 q3, [r0, #112]
902 ; CHECK-NEXT: vldrw.u32 q2, [r0, #80]
903 ; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
904 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
905 ; CHECK-NEXT: vst40.32 {q4, q5, q6, q7}, [r1]
906 ; CHECK-NEXT: vst41.32 {q4, q5, q6, q7}, [r1]
907 ; CHECK-NEXT: vst42.32 {q4, q5, q6, q7}, [r1]
908 ; CHECK-NEXT: vst43.32 {q4, q5, q6, q7}, [r1]!
909 ; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r1]
910 ; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r1]
911 ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r1]
912 ; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r1]
913 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; Load the four inputs from consecutive <8 x float> slots of %src.
916 %s1 = getelementptr <8 x float>, <8 x float>* %src, i32 0
917 %l1 = load <8 x float>, <8 x float>* %s1, align 4
918 %s2 = getelementptr <8 x float>, <8 x float>* %src, i32 1
919 %l2 = load <8 x float>, <8 x float>* %s2, align 4
920 %s3 = getelementptr <8 x float>, <8 x float>* %src, i32 2
921 %l3 = load <8 x float>, <8 x float>* %s3, align 4
922 %s4 = getelementptr <8 x float>, <8 x float>* %src, i32 3
923 %l4 = load <8 x float>, <8 x float>* %s4, align 4
; Concatenate the inputs pairwise, then interleave all four with one shuffle.
924 %t1 = shufflevector <8 x float> %l1, <8 x float> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
925 %t2 = shufflevector <8 x float> %l3, <8 x float> %l4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
926 %s = shufflevector <16 x float> %t1, <16 x float> %t2, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
927 store <32 x float> %s, <32 x float> *%dst, align 4
; vst4_v16f32: 4-way interleaving store of <16 x float> vectors.
; Lane i of each of the four inputs lands in output lanes 4*i .. 4*i+3 of the
; <64 x float> value stored to %dst (store alignment 4).
; Expected codegen: four vst40..vst43.32 groups, with 192 bytes of stack used
; for 64-byte vstmia/vldmia spills and reloads of the q-register quads.
; r4/r5 and d8-d15 are pushed and popped around the body.
931 define void @vst4_v16f32(<16 x float> *%src, <64 x float> *%dst) {
932 ; CHECK-LABEL: vst4_v16f32:
933 ; CHECK: @ %bb.0: @ %entry
934 ; CHECK-NEXT: .save {r4, r5}
935 ; CHECK-NEXT: push {r4, r5}
936 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
937 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
938 ; CHECK-NEXT: .pad #192
939 ; CHECK-NEXT: sub sp, #192
940 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
941 ; CHECK-NEXT: add r2, sp, #64
942 ; CHECK-NEXT: vldrw.u32 q1, [r0, #80]
943 ; CHECK-NEXT: vldrw.u32 q3, [r0, #208]
944 ; CHECK-NEXT: vldrw.u32 q2, [r0, #144]
945 ; CHECK-NEXT: vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
946 ; CHECK-NEXT: vldrw.u32 q1, [r0, #64]
947 ; CHECK-NEXT: add r2, sp, #128
948 ; CHECK-NEXT: vldrw.u32 q3, [r0, #192]
949 ; CHECK-NEXT: vldrw.u32 q2, [r0, #128]
950 ; CHECK-NEXT: vldrw.u32 q4, [r0, #240]
951 ; CHECK-NEXT: vstmia r2, {d2, d3, d4, d5, d6, d7, d8, d9} @ 64-byte Spill
952 ; CHECK-NEXT: add r2, sp, #128
953 ; CHECK-NEXT: vldrw.u32 q0, [r0]
954 ; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
955 ; CHECK-NEXT: add r2, sp, #128
956 ; CHECK-NEXT: vldrw.u32 q6, [r0, #176]
957 ; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
958 ; CHECK-NEXT: add r2, sp, #128
959 ; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
960 ; CHECK-NEXT: add r2, sp, #128
961 ; CHECK-NEXT: vldrw.u32 q5, [r0, #112]
962 ; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
963 ; CHECK-NEXT: add r2, sp, #128
964 ; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
965 ; CHECK-NEXT: add r2, sp, #128
966 ; CHECK-NEXT: vldrw.u32 q4, [r0, #48]
967 ; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
968 ; CHECK-NEXT: vldrw.u32 q2, [r0, #160]
969 ; CHECK-NEXT: vldrw.u32 q3, [r0, #224]
970 ; CHECK-NEXT: vldrw.u32 q1, [r0, #96]
971 ; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
972 ; CHECK-NEXT: vmov q6, q2
973 ; CHECK-NEXT: vmov q7, q3
974 ; CHECK-NEXT: vmov q5, q1
975 ; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
976 ; CHECK-NEXT: add r2, sp, #64
977 ; CHECK-NEXT: vldrw.u32 q4, [r0, #32]
978 ; CHECK-NEXT: mov r0, r1
979 ; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r1]
980 ; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r1]
981 ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r1]
982 ; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r0]!
983 ; CHECK-NEXT: vldmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
984 ; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r0]
985 ; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r0]
986 ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r0]
987 ; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r0]
988 ; CHECK-NEXT: add.w r0, r1, #192
989 ; CHECK-NEXT: adds r1, #128
990 ; CHECK-NEXT: vst40.32 {q4, q5, q6, q7}, [r1]
991 ; CHECK-NEXT: vst41.32 {q4, q5, q6, q7}, [r1]
992 ; CHECK-NEXT: vst42.32 {q4, q5, q6, q7}, [r1]
993 ; CHECK-NEXT: vst43.32 {q4, q5, q6, q7}, [r1]
994 ; CHECK-NEXT: add r1, sp, #128
995 ; CHECK-NEXT: vldmia r1, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
996 ; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r0]
997 ; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r0]
998 ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r0]
999 ; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r0]
1000 ; CHECK-NEXT: add sp, #192
1001 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1002 ; CHECK-NEXT: pop {r4, r5}
; Load the four inputs from consecutive <16 x float> slots of %src.
1005 %s1 = getelementptr <16 x float>, <16 x float>* %src, i32 0
1006 %l1 = load <16 x float>, <16 x float>* %s1, align 4
1007 %s2 = getelementptr <16 x float>, <16 x float>* %src, i32 1
1008 %l2 = load <16 x float>, <16 x float>* %s2, align 4
1009 %s3 = getelementptr <16 x float>, <16 x float>* %src, i32 2
1010 %l3 = load <16 x float>, <16 x float>* %s3, align 4
1011 %s4 = getelementptr <16 x float>, <16 x float>* %src, i32 3
1012 %l4 = load <16 x float>, <16 x float>* %s4, align 4
; Concatenate the inputs pairwise, then interleave all four with one shuffle.
1013 %t1 = shufflevector <16 x float> %l1, <16 x float> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1014 %t2 = shufflevector <16 x float> %l3, <16 x float> %l4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1015 %s = shufflevector <32 x float> %t1, <32 x float> %t2, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
1016 store <64 x float> %s, <64 x float> *%dst, align 4
; vst4_v4f32_align1: 4-way interleaving store of <4 x float> vectors where the
; final store of the <16 x float> result to %dst only has alignment 1.
; Lane i of each of the four inputs lands in output lanes 4*i .. 4*i+3.
; Expected codegen contains no vst4x instructions: the interleave is built
; with vmov.f32 lane moves and written out with four byte-wise vstrb.8 stores.
1020 define void @vst4_v4f32_align1(<4 x float> *%src, <16 x float> *%dst) {
1021 ; CHECK-LABEL: vst4_v4f32_align1:
1022 ; CHECK: @ %bb.0: @ %entry
1023 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
1024 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
1025 ; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
1026 ; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
1027 ; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
1028 ; CHECK-NEXT: vldrw.u32 q4, [r0]
1029 ; CHECK-NEXT: vmov.f32 s14, s1
1030 ; CHECK-NEXT: vmov.f32 s22, s0
1031 ; CHECK-NEXT: vmov.f32 s26, s3
1032 ; CHECK-NEXT: vmov.f32 s12, s17
1033 ; CHECK-NEXT: vmov.f32 s13, s9
1034 ; CHECK-NEXT: vmov.f32 s15, s5
1035 ; CHECK-NEXT: vmov.f32 s20, s16
1036 ; CHECK-NEXT: vstrb.8 q3, [r1, #16]
1037 ; CHECK-NEXT: vmov.f32 s21, s8
1038 ; CHECK-NEXT: vmov.f32 s23, s4
1039 ; CHECK-NEXT: vmov.f32 s24, s19
1040 ; CHECK-NEXT: vstrb.8 q5, [r1]
1041 ; CHECK-NEXT: vmov.f32 s25, s11
1042 ; CHECK-NEXT: vmov.f32 s27, s7
1043 ; CHECK-NEXT: vmov.f32 s0, s18
1044 ; CHECK-NEXT: vstrb.8 q6, [r1, #48]
1045 ; CHECK-NEXT: vmov.f32 s1, s10
1046 ; CHECK-NEXT: vmov.f32 s3, s6
1047 ; CHECK-NEXT: vstrb.8 q0, [r1, #32]
1048 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; Load the four inputs from consecutive <4 x float> slots of %src.
1051 %s1 = getelementptr <4 x float>, <4 x float>* %src, i32 0
1052 %l1 = load <4 x float>, <4 x float>* %s1, align 4
1053 %s2 = getelementptr <4 x float>, <4 x float>* %src, i32 1
1054 %l2 = load <4 x float>, <4 x float>* %s2, align 4
1055 %s3 = getelementptr <4 x float>, <4 x float>* %src, i32 2
1056 %l3 = load <4 x float>, <4 x float>* %s3, align 4
1057 %s4 = getelementptr <4 x float>, <4 x float>* %src, i32 3
1058 %l4 = load <4 x float>, <4 x float>* %s4, align 4
; Concatenate the inputs pairwise, then interleave all four with one shuffle.
1059 %t1 = shufflevector <4 x float> %l1, <4 x float> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1060 %t2 = shufflevector <4 x float> %l3, <4 x float> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1061 %s = shufflevector <8 x float> %t1, <8 x float> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
; Unlike vst4_v4f32, this store is only byte-aligned.
1062 store <16 x float> %s, <16 x float> *%dst, align 1
; vst4_v2f16: 4-way interleaving store of <2 x half> vectors.
; Lane i of each of the four inputs lands in output lanes 4*i .. 4*i+3 of the
; <8 x half> value stored to %dst (store alignment 2).
; Expected codegen: four scalar vldr loads, vmovx.f16/vins.f16 half-lane
; shuffling, and a single vstrh.16 store.
1068 define void @vst4_v2f16(<2 x half> *%src, <8 x half> *%dst) {
1069 ; CHECK-LABEL: vst4_v2f16:
1070 ; CHECK: @ %bb.0: @ %entry
1071 ; CHECK-NEXT: vldr s0, [r0]
1072 ; CHECK-NEXT: vldr s5, [r0, #4]
1073 ; CHECK-NEXT: vldr s4, [r0, #8]
1074 ; CHECK-NEXT: vmovx.f16 s2, s0
1075 ; CHECK-NEXT: vldr s1, [r0, #12]
1076 ; CHECK-NEXT: vmovx.f16 s6, s5
1077 ; CHECK-NEXT: vmovx.f16 s3, s4
1078 ; CHECK-NEXT: vins.f16 s2, s6
1079 ; CHECK-NEXT: vmovx.f16 s6, s1
1080 ; CHECK-NEXT: vins.f16 s4, s1
1081 ; CHECK-NEXT: vins.f16 s0, s5
1082 ; CHECK-NEXT: vins.f16 s3, s6
1083 ; CHECK-NEXT: vmov.f32 s1, s4
1084 ; CHECK-NEXT: vstrh.16 q0, [r1]
; Load the four inputs from consecutive <2 x half> slots of %src.
1087 %s1 = getelementptr <2 x half>, <2 x half>* %src, i32 0
1088 %l1 = load <2 x half>, <2 x half>* %s1, align 4
1089 %s2 = getelementptr <2 x half>, <2 x half>* %src, i32 1
1090 %l2 = load <2 x half>, <2 x half>* %s2, align 4
1091 %s3 = getelementptr <2 x half>, <2 x half>* %src, i32 2
1092 %l3 = load <2 x half>, <2 x half>* %s3, align 4
1093 %s4 = getelementptr <2 x half>, <2 x half>* %src, i32 3
1094 %l4 = load <2 x half>, <2 x half>* %s4, align 4
; Concatenate the inputs pairwise, then interleave all four with one shuffle.
1095 %t1 = shufflevector <2 x half> %l1, <2 x half> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1096 %t2 = shufflevector <2 x half> %l3, <2 x half> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1097 %s = shufflevector <4 x half> %t1, <4 x half> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
1098 store <8 x half> %s, <8 x half> *%dst, align 2
; vst4_v4f16: 4-way interleaving store of <4 x half> vectors.
; Lane i of each of the four inputs lands in output lanes 4*i .. 4*i+3 of the
; <16 x half> value stored to %dst (store alignment 2).
; Expected codegen contains no vst4x instructions: the inputs are loaded
; through core registers (ldr/ldm/ldrd), inserted with vmov.32, shuffled with
; vmovx.f16/vins.f16 and written out with two vstrh.16 stores.
1102 define void @vst4_v4f16(<4 x half> *%src, <16 x half> *%dst) {
1103 ; CHECK-LABEL: vst4_v4f16:
1104 ; CHECK: @ %bb.0: @ %entry
1105 ; CHECK-NEXT: .save {r7, lr}
1106 ; CHECK-NEXT: push {r7, lr}
1107 ; CHECK-NEXT: add.w lr, r0, #16
1108 ; CHECK-NEXT: ldr r2, [r0, #28]
1109 ; CHECK-NEXT: ldm.w lr, {r3, r12, lr}
1110 ; CHECK-NEXT: vmov.32 q1[0], lr
1111 ; CHECK-NEXT: vmov.32 q1[1], r2
1112 ; CHECK-NEXT: vmov.32 q0[0], r3
1113 ; CHECK-NEXT: vmov.32 q0[1], r12
1114 ; CHECK-NEXT: ldrd r2, r12, [r0]
1115 ; CHECK-NEXT: ldrd r3, r0, [r0, #8]
1116 ; CHECK-NEXT: vmovx.f16 s12, s0
1117 ; CHECK-NEXT: vmovx.f16 s2, s4
1118 ; CHECK-NEXT: vmov.f32 s3, s5
1119 ; CHECK-NEXT: vmov.32 q2[0], r3
1120 ; CHECK-NEXT: vins.f16 s0, s4
1121 ; CHECK-NEXT: vmov.32 q1[0], r2
1122 ; CHECK-NEXT: vmov.32 q2[1], r0
1123 ; CHECK-NEXT: vmov.32 q1[1], r12
1124 ; CHECK-NEXT: vins.f16 s12, s2
1125 ; CHECK-NEXT: vmovx.f16 s6, s4
1126 ; CHECK-NEXT: vmovx.f16 s2, s8
1127 ; CHECK-NEXT: vins.f16 s6, s2
1128 ; CHECK-NEXT: vmovx.f16 s11, s1
1129 ; CHECK-NEXT: vmovx.f16 s2, s3
1130 ; CHECK-NEXT: vmovx.f16 s10, s5
1131 ; CHECK-NEXT: vins.f16 s11, s2
1132 ; CHECK-NEXT: vmovx.f16 s2, s9
1133 ; CHECK-NEXT: vins.f16 s1, s3
1134 ; CHECK-NEXT: vins.f16 s5, s9
1135 ; CHECK-NEXT: vins.f16 s4, s8
1136 ; CHECK-NEXT: vmov.f32 s8, s5
1137 ; CHECK-NEXT: vins.f16 s10, s2
1138 ; CHECK-NEXT: vmov.f32 s9, s1
1139 ; CHECK-NEXT: vmov.f32 s5, s0
1140 ; CHECK-NEXT: vstrh.16 q2, [r1, #16]
1141 ; CHECK-NEXT: vmov.f32 s7, s12
1142 ; CHECK-NEXT: vstrh.16 q1, [r1]
1143 ; CHECK-NEXT: pop {r7, pc}
; Load the four inputs from consecutive <4 x half> slots of %src.
1145 %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0
1146 %l1 = load <4 x half>, <4 x half>* %s1, align 4
1147 %s2 = getelementptr <4 x half>, <4 x half>* %src, i32 1
1148 %l2 = load <4 x half>, <4 x half>* %s2, align 4
1149 %s3 = getelementptr <4 x half>, <4 x half>* %src, i32 2
1150 %l3 = load <4 x half>, <4 x half>* %s3, align 4
1151 %s4 = getelementptr <4 x half>, <4 x half>* %src, i32 3
1152 %l4 = load <4 x half>, <4 x half>* %s4, align 4
; Concatenate the inputs pairwise, then interleave all four with one shuffle.
1153 %t1 = shufflevector <4 x half> %l1, <4 x half> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1154 %t2 = shufflevector <4 x half> %l3, <4 x half> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1155 %s = shufflevector <8 x half> %t1, <8 x half> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
1156 store <16 x half> %s, <16 x half> *%dst, align 2
; vst4_v8f16: 4-way interleaving store of <8 x half> vectors.
; Lane i of each of the four inputs lands in output lanes 4*i .. 4*i+3 of the
; <32 x half> value stored to %dst (store alignment 2).
; Expected codegen: four vldrw.u32 loads followed by one
; vst40.16/vst41.16/vst42.16/vst43.16 group on [r1].
1160 define void @vst4_v8f16(<8 x half> *%src, <32 x half> *%dst) {
1161 ; CHECK-LABEL: vst4_v8f16:
1162 ; CHECK: @ %bb.0: @ %entry
1163 ; CHECK-NEXT: vldrw.u32 q3, [r0, #48]
1164 ; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
1165 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
1166 ; CHECK-NEXT: vldrw.u32 q0, [r0]
1167 ; CHECK-NEXT: vst40.16 {q0, q1, q2, q3}, [r1]
1168 ; CHECK-NEXT: vst41.16 {q0, q1, q2, q3}, [r1]
1169 ; CHECK-NEXT: vst42.16 {q0, q1, q2, q3}, [r1]
1170 ; CHECK-NEXT: vst43.16 {q0, q1, q2, q3}, [r1]
; Load the four inputs from consecutive <8 x half> slots of %src.
1173 %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0
1174 %l1 = load <8 x half>, <8 x half>* %s1, align 4
1175 %s2 = getelementptr <8 x half>, <8 x half>* %src, i32 1
1176 %l2 = load <8 x half>, <8 x half>* %s2, align 4
1177 %s3 = getelementptr <8 x half>, <8 x half>* %src, i32 2
1178 %l3 = load <8 x half>, <8 x half>* %s3, align 4
1179 %s4 = getelementptr <8 x half>, <8 x half>* %src, i32 3
1180 %l4 = load <8 x half>, <8 x half>* %s4, align 4
; Concatenate the inputs pairwise, then interleave all four with one shuffle.
1181 %t1 = shufflevector <8 x half> %l1, <8 x half> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1182 %t2 = shufflevector <8 x half> %l3, <8 x half> %l4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1183 %s = shufflevector <16 x half> %t1, <16 x half> %t2, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
1184 store <32 x half> %s, <32 x half> *%dst, align 2
; vst4_v16f16: 4-way interleaving store of <16 x half> vectors.
; Lane i of each of the four inputs lands in output lanes 4*i .. 4*i+3 of the
; <64 x half> value stored to %dst (store alignment 2).
; Expected codegen: eight vldrw.u32 loads and two vst40..vst43.16 groups; the
; last store of the first group uses the writeback form "[r1]!" so the second
; group continues from the updated r1. d8-d15 are pushed and popped.
1188 define void @vst4_v16f16(<16 x half> *%src, <64 x half> *%dst) {
1189 ; CHECK-LABEL: vst4_v16f16:
1190 ; CHECK: @ %bb.0: @ %entry
1191 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
1192 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
1193 ; CHECK-NEXT: vldrw.u32 q7, [r0, #96]
1194 ; CHECK-NEXT: vldrw.u32 q6, [r0, #64]
1195 ; CHECK-NEXT: vldrw.u32 q5, [r0, #32]
1196 ; CHECK-NEXT: vldrw.u32 q4, [r0]
1197 ; CHECK-NEXT: vldrw.u32 q3, [r0, #112]
1198 ; CHECK-NEXT: vldrw.u32 q2, [r0, #80]
1199 ; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
1200 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
1201 ; CHECK-NEXT: vst40.16 {q4, q5, q6, q7}, [r1]
1202 ; CHECK-NEXT: vst41.16 {q4, q5, q6, q7}, [r1]
1203 ; CHECK-NEXT: vst42.16 {q4, q5, q6, q7}, [r1]
1204 ; CHECK-NEXT: vst43.16 {q4, q5, q6, q7}, [r1]!
1205 ; CHECK-NEXT: vst40.16 {q0, q1, q2, q3}, [r1]
1206 ; CHECK-NEXT: vst41.16 {q0, q1, q2, q3}, [r1]
1207 ; CHECK-NEXT: vst42.16 {q0, q1, q2, q3}, [r1]
1208 ; CHECK-NEXT: vst43.16 {q0, q1, q2, q3}, [r1]
1209 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; Load the four inputs from consecutive <16 x half> slots of %src.
1212 %s1 = getelementptr <16 x half>, <16 x half>* %src, i32 0
1213 %l1 = load <16 x half>, <16 x half>* %s1, align 4
1214 %s2 = getelementptr <16 x half>, <16 x half>* %src, i32 1
1215 %l2 = load <16 x half>, <16 x half>* %s2, align 4
1216 %s3 = getelementptr <16 x half>, <16 x half>* %src, i32 2
1217 %l3 = load <16 x half>, <16 x half>* %s3, align 4
1218 %s4 = getelementptr <16 x half>, <16 x half>* %src, i32 3
1219 %l4 = load <16 x half>, <16 x half>* %s4, align 4
; Concatenate the inputs pairwise, then interleave all four with one shuffle.
1220 %t1 = shufflevector <16 x half> %l1, <16 x half> %l2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1221 %t2 = shufflevector <16 x half> %l3, <16 x half> %l4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1222 %s = shufflevector <32 x half> %t1, <32 x half> %t2, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
1223 store <64 x half> %s, <64 x half> *%dst, align 2
; vst4_v8f16_align1: 4-way interleaving store of <8 x half> vectors where the
; final store of the <32 x half> result to %dst only has alignment 1.
; Lane i of each of the four inputs lands in output lanes 4*i .. 4*i+3.
; Expected codegen contains no vst4x instructions: the interleave is built
; with vmovx.f16/vins.f16 half-lane moves plus vmov.f32/vmov q copies and
; written out with four byte-wise vstrb.8 stores.
1227 define void @vst4_v8f16_align1(<8 x half> *%src, <32 x half> *%dst) {
1228 ; CHECK-LABEL: vst4_v8f16_align1:
1229 ; CHECK: @ %bb.0: @ %entry
1230 ; CHECK-NEXT: .vsave {d9, d10, d11, d12, d13}
1231 ; CHECK-NEXT: vpush {d9, d10, d11, d12, d13}
1232 ; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
1233 ; CHECK-NEXT: vldrw.u32 q5, [r0, #48]
1234 ; CHECK-NEXT: vldrw.u32 q6, [r0, #16]
1235 ; CHECK-NEXT: vldrw.u32 q2, [r0]
1236 ; CHECK-NEXT: vmovx.f16 s0, s5
1237 ; CHECK-NEXT: vmovx.f16 s2, s21
1238 ; CHECK-NEXT: vins.f16 s0, s2
1239 ; CHECK-NEXT: vmovx.f16 s2, s9
1240 ; CHECK-NEXT: vmovx.f16 s12, s25
1241 ; CHECK-NEXT: vmovx.f16 s19, s4
1242 ; CHECK-NEXT: vins.f16 s2, s12
1243 ; CHECK-NEXT: vmovx.f16 s12, s20
1244 ; CHECK-NEXT: vins.f16 s19, s12
1245 ; CHECK-NEXT: vmovx.f16 s12, s8
1246 ; CHECK-NEXT: vmovx.f16 s14, s24
1247 ; CHECK-NEXT: vmovx.f16 s15, s7
1248 ; CHECK-NEXT: vins.f16 s12, s14
1249 ; CHECK-NEXT: vmovx.f16 s14, s23
1250 ; CHECK-NEXT: vins.f16 s15, s14
1251 ; CHECK-NEXT: vmovx.f16 s14, s11
1252 ; CHECK-NEXT: vmovx.f16 s1, s27
1253 ; CHECK-NEXT: vins.f16 s7, s23
1254 ; CHECK-NEXT: vins.f16 s14, s1
1255 ; CHECK-NEXT: vmovx.f16 s23, s6
1256 ; CHECK-NEXT: vmovx.f16 s1, s22
1257 ; CHECK-NEXT: vins.f16 s6, s22
1258 ; CHECK-NEXT: vins.f16 s5, s21
1259 ; CHECK-NEXT: vins.f16 s4, s20
1260 ; CHECK-NEXT: vins.f16 s23, s1
1261 ; CHECK-NEXT: vmovx.f16 s22, s10
1262 ; CHECK-NEXT: vins.f16 s10, s26
1263 ; CHECK-NEXT: vmovx.f16 s1, s26
1264 ; CHECK-NEXT: vins.f16 s9, s25
1265 ; CHECK-NEXT: vins.f16 s8, s24
1266 ; CHECK-NEXT: vins.f16 s11, s27
1267 ; CHECK-NEXT: vmov q6, q1
1268 ; CHECK-NEXT: vins.f16 s22, s1
1269 ; CHECK-NEXT: vmov.f32 s1, s25
1270 ; CHECK-NEXT: vmov q6, q2
1271 ; CHECK-NEXT: vmov.f32 s3, s0
1272 ; CHECK-NEXT: vmov.f32 s0, s9
1273 ; CHECK-NEXT: vmov.f32 s26, s12
1274 ; CHECK-NEXT: vstrb.8 q0, [r1, #16]
1275 ; CHECK-NEXT: vmov.f32 s25, s4
1276 ; CHECK-NEXT: vmov.f32 s27, s19
1277 ; CHECK-NEXT: vmov.f32 s13, s7
1278 ; CHECK-NEXT: vstrb.8 q6, [r1]
1279 ; CHECK-NEXT: vmov.f32 s12, s11
1280 ; CHECK-NEXT: vmov.f32 s21, s6
1281 ; CHECK-NEXT: vstrb.8 q3, [r1, #48]
1282 ; CHECK-NEXT: vmov.f32 s20, s10
1283 ; CHECK-NEXT: vstrb.8 q5, [r1, #32]
1284 ; CHECK-NEXT: vpop {d9, d10, d11, d12, d13}
; Load the four inputs from consecutive <8 x half> slots of %src.
1287 %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0
1288 %l1 = load <8 x half>, <8 x half>* %s1, align 4
1289 %s2 = getelementptr <8 x half>, <8 x half>* %src, i32 1
1290 %l2 = load <8 x half>, <8 x half>* %s2, align 4
1291 %s3 = getelementptr <8 x half>, <8 x half>* %src, i32 2
1292 %l3 = load <8 x half>, <8 x half>* %s3, align 4
1293 %s4 = getelementptr <8 x half>, <8 x half>* %src, i32 3
1294 %l4 = load <8 x half>, <8 x half>* %s4, align 4
; Concatenate the inputs pairwise, then interleave all four with one shuffle.
1295 %t1 = shufflevector <8 x half> %l1, <8 x half> %l2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1296 %t2 = shufflevector <8 x half> %l3, <8 x half> %l4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1297 %s = shufflevector <16 x half> %t1, <16 x half> %t2, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
; Unlike vst4_v8f16, this store is only byte-aligned.
1298 store <32 x half> %s, <32 x half> *%dst, align 1
; vst4_v2f64: factor-4 interleaving store of four <2 x double> vectors.
; Loads the vectors at indices 0..3 from %src, concatenates them pairwise and
; shuffles them so that %dst receives
;   { l1[0], l2[0], l3[0], l4[0], l1[1], l2[1], l3[1], l4[1] }.
; The expected instructions visible below contain no interleaving store: they
; are four vldrw.u32 loads from r0, vmov.f64 moves between d registers and
; four vstrw.32 stores to r1, with d8-d11 pushed and popped around them.
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing them
; by hand.
1304 define void @vst4_v2f64(<2 x double> *%src, <8 x double> *%dst) {
1305 ; CHECK-LABEL: vst4_v2f64:
1306 ; CHECK: @ %bb.0: @ %entry
1307 ; CHECK-NEXT: .vsave {d8, d9, d10, d11}
1308 ; CHECK-NEXT: vpush {d8, d9, d10, d11}
1309 ; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
1310 ; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
1311 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
1312 ; CHECK-NEXT: vldrw.u32 q3, [r0]
1313 ; CHECK-NEXT: vmov.f64 d9, d0
1314 ; CHECK-NEXT: vmov.f64 d8, d4
1315 ; CHECK-NEXT: vmov.f64 d11, d2
1316 ; CHECK-NEXT: vstrw.32 q4, [r1, #16]
1317 ; CHECK-NEXT: vmov.f64 d10, d6
1318 ; CHECK-NEXT: vmov.f64 d0, d5
1319 ; CHECK-NEXT: vstrw.32 q5, [r1]
1320 ; CHECK-NEXT: vmov.f64 d2, d7
1321 ; CHECK-NEXT: vstrw.32 q0, [r1, #48]
1322 ; CHECK-NEXT: vstrw.32 q1, [r1, #32]
1323 ; CHECK-NEXT: vpop {d8, d9, d10, d11}
; Load the four consecutive source vectors; each load is declared align 4.
1326 %s1 = getelementptr <2 x double>, <2 x double>* %src, i32 0
1327 %l1 = load <2 x double>, <2 x double>* %s1, align 4
1328 %s2 = getelementptr <2 x double>, <2 x double>* %src, i32 1
1329 %l2 = load <2 x double>, <2 x double>* %s2, align 4
1330 %s3 = getelementptr <2 x double>, <2 x double>* %src, i32 2
1331 %l3 = load <2 x double>, <2 x double>* %s3, align 4
1332 %s4 = getelementptr <2 x double>, <2 x double>* %src, i32 3
1333 %l4 = load <2 x double>, <2 x double>* %s4, align 4
; Identity masks: %t1 is l1 followed by l2, %t2 is l3 followed by l4.
1334 %t1 = shufflevector <2 x double> %l1, <2 x double> %l2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1335 %t2 = shufflevector <2 x double> %l3, <2 x double> %l4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; Even indices first, then odd: lane 0 of every source vector, then lane 1.
1336 %s = shufflevector <4 x double> %t1, <4 x double> %t2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
; The interleaved result is written to %dst with align 8.
1337 store <8 x double> %s, <8 x double> *%dst, align 8
; vst4_v4f64: factor-4 interleaving store of four <4 x double> vectors.
; Loads the vectors at indices 0..3 from %src, concatenates them pairwise and
; shuffles them so that %dst receives lane i of l1, l2, l3 and l4 in turn, for
; i = 0..3.
; The expected instructions visible below reserve 64 bytes of stack, save
; d8-d15, and perform eight vldrw.u32 loads from r0 and eight vstrw.32 stores
; to r1, with four q-register spills to the stack and four matching reloads.
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing them
; by hand.
1341 define void @vst4_v4f64(<4 x double> *%src, <16 x double> *%dst) {
1342 ; CHECK-LABEL: vst4_v4f64:
1343 ; CHECK: @ %bb.0: @ %entry
1344 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
1345 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
1346 ; CHECK-NEXT: .pad #64
1347 ; CHECK-NEXT: sub sp, #64
1348 ; CHECK-NEXT: vldrw.u32 q7, [r0, #80]
1349 ; CHECK-NEXT: vldrw.u32 q5, [r0, #32]
1350 ; CHECK-NEXT: vldrw.u32 q6, [r0]
1351 ; CHECK-NEXT: vldrw.u32 q1, [r0, #96]
1352 ; CHECK-NEXT: vstrw.32 q7, [sp, #32] @ 16-byte Spill
1353 ; CHECK-NEXT: vmov.f64 d15, d10
1354 ; CHECK-NEXT: vldrw.u32 q2, [r0, #64]
1355 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
1356 ; CHECK-NEXT: vldrw.u32 q3, [r0, #48]
1357 ; CHECK-NEXT: vldrw.u32 q4, [r0, #112]
1358 ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
1359 ; CHECK-NEXT: vmov.f64 d14, d12
1360 ; CHECK-NEXT: vstrw.32 q7, [sp, #48] @ 16-byte Spill
1361 ; CHECK-NEXT: vmov.f64 d14, d4
1362 ; CHECK-NEXT: vmov.f64 d15, d2
1363 ; CHECK-NEXT: vstrw.32 q7, [sp] @ 16-byte Spill
1364 ; CHECK-NEXT: vmov.f64 d4, d0
1365 ; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload
1366 ; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload
1367 ; CHECK-NEXT: vmov.f64 d10, d13
1368 ; CHECK-NEXT: vmov.f64 d2, d5
1369 ; CHECK-NEXT: vstrw.32 q5, [r1, #32]
1370 ; CHECK-NEXT: vmov.f64 d5, d6
1371 ; CHECK-NEXT: vstrw.32 q1, [r1, #48]
1372 ; CHECK-NEXT: vmov.f64 d13, d8
1373 ; CHECK-NEXT: vstrw.32 q2, [r1, #64]
1374 ; CHECK-NEXT: vmov.f64 d12, d0
1375 ; CHECK-NEXT: vmov.f64 d8, d1
1376 ; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload
1377 ; CHECK-NEXT: vstrw.32 q6, [r1, #80]
1378 ; CHECK-NEXT: vstrw.32 q0, [r1]
1379 ; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
1380 ; CHECK-NEXT: vmov.f64 d6, d15
1381 ; CHECK-NEXT: vstrw.32 q4, [r1, #112]
1382 ; CHECK-NEXT: vstrw.32 q0, [r1, #16]
1383 ; CHECK-NEXT: vstrw.32 q3, [r1, #96]
1384 ; CHECK-NEXT: add sp, #64
1385 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; Load the four consecutive source vectors; each load is declared align 4.
1388 %s1 = getelementptr <4 x double>, <4 x double>* %src, i32 0
1389 %l1 = load <4 x double>, <4 x double>* %s1, align 4
1390 %s2 = getelementptr <4 x double>, <4 x double>* %src, i32 1
1391 %l2 = load <4 x double>, <4 x double>* %s2, align 4
1392 %s3 = getelementptr <4 x double>, <4 x double>* %src, i32 2
1393 %l3 = load <4 x double>, <4 x double>* %s3, align 4
1394 %s4 = getelementptr <4 x double>, <4 x double>* %src, i32 3
1395 %l4 = load <4 x double>, <4 x double>* %s4, align 4
; Identity masks: %t1 is l1 followed by l2, %t2 is l3 followed by l4.
1396 %t1 = shufflevector <4 x double> %l1, <4 x double> %l2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1397 %t2 = shufflevector <4 x double> %l3, <4 x double> %l4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Stride-4 mask: indices i, i+4, i+8, i+12 select lane i of l1, l2, l3, l4.
1398 %s = shufflevector <8 x double> %t1, <8 x double> %t2, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
; The interleaved result is written to %dst with align 8.
1399 store <16 x double> %s, <16 x double> *%dst, align 8