; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve %s -o - | FileCheck %s
; Signed rounding halving add, i8 lanes: trunc((sext(a) + sext(b) + 1) >> 1).
define arm_aapcs_vfpcc <16 x i8> @vrhadd_s8(<16 x i8> %src1, <16 x i8> %src2) {
; CHECK-LABEL: vrhadd_s8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmovlt.s8 q2, q1
; CHECK-NEXT:    vmovlt.s8 q3, q0
; CHECK-NEXT:    vmovlb.s8 q1, q1
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vadd.i16 q2, q3, q2
; CHECK-NEXT:    vmov.i16 q3, #0x1
; CHECK-NEXT:    vadd.i16 q0, q0, q1
; CHECK-NEXT:    vadd.i16 q2, q2, q3
; CHECK-NEXT:    vadd.i16 q0, q0, q3
; CHECK-NEXT:    vshr.u16 q2, q2, #1
; CHECK-NEXT:    vshr.u16 q0, q0, #1
; CHECK-NEXT:    vmovnt.i16 q0, q2
; CHECK-NEXT:    bx lr
  %sextsrc1 = sext <16 x i8> %src1 to <16 x i16>
  %sextsrc2 = sext <16 x i8> %src2 to <16 x i16>
  %add1 = add <16 x i16> %sextsrc1, %sextsrc2
  %add2 = add <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %resulti16 = lshr <16 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <16 x i16> %resulti16 to <16 x i8>
  ret <16 x i8> %result
}
; Signed rounding halving add, i16 lanes: trunc((sext(a) + sext(b) + 1) >> 1).
define arm_aapcs_vfpcc <8 x i16> @vrhadd_s16(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: vrhadd_s16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmovlt.s16 q2, q1
; CHECK-NEXT:    vmovlt.s16 q3, q0
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vadd.i32 q2, q3, q2
; CHECK-NEXT:    vmov.i32 q3, #0x1
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vadd.i32 q2, q2, q3
; CHECK-NEXT:    vadd.i32 q0, q0, q3
; CHECK-NEXT:    vshr.u32 q2, q2, #1
; CHECK-NEXT:    vshr.u32 q0, q0, #1
; CHECK-NEXT:    vmovnt.i32 q0, q2
; CHECK-NEXT:    bx lr
  %sextsrc1 = sext <8 x i16> %src1 to <8 x i32>
  %sextsrc2 = sext <8 x i16> %src2 to <8 x i32>
  %add1 = add <8 x i32> %sextsrc1, %sextsrc2
  %add2 = add <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %resulti16 = lshr <8 x i32> %add2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  ret <8 x i16> %result
}
; Signed rounding halving add, i32 lanes via i64 widening; currently scalarized.
define arm_aapcs_vfpcc <4 x i32> @vrhadd_s32(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LABEL: vrhadd_s32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .vsave {d9}
; CHECK-NEXT:    vpush {d9}
; CHECK-NEXT:    vmov.f32 s8, s2
; CHECK-NEXT:    vmov r3, s4
; CHECK-NEXT:    vmov.f32 s10, s3
; CHECK-NEXT:    vmov.f32 s14, s1
; CHECK-NEXT:    vmov.f32 s18, s5
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov.f32 s8, s6
; CHECK-NEXT:    vmov.f32 s6, s7
; CHECK-NEXT:    vmov r2, s8
; CHECK-NEXT:    asrs r1, r0, #31
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    adds r0, #1
; CHECK-NEXT:    adc r1, r1, #0
; CHECK-NEXT:    lsrl r0, r1, #1
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    asrs r2, r1, #31
; CHECK-NEXT:    adds r1, r1, r3
; CHECK-NEXT:    adc.w r3, r2, r3, asr #31
; CHECK-NEXT:    adds r2, r1, #1
; CHECK-NEXT:    adc r1, r3, #0
; CHECK-NEXT:    vmov r3, s18
; CHECK-NEXT:    lsrl r2, r1, #1
; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    asrs r1, r0, #31
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    adds r0, #1
; CHECK-NEXT:    adc r1, r1, #0
; CHECK-NEXT:    lsrl r0, r1, #1
; CHECK-NEXT:    vmov r1, s14
; CHECK-NEXT:    asrs r2, r1, #31
; CHECK-NEXT:    adds r1, r1, r3
; CHECK-NEXT:    adc.w r3, r2, r3, asr #31
; CHECK-NEXT:    adds r2, r1, #1
; CHECK-NEXT:    adc r1, r3, #0
; CHECK-NEXT:    lsrl r2, r1, #1
; CHECK-NEXT:    vmov q0[3], q0[1], r2, r0
; CHECK-NEXT:    vpop {d9}
; CHECK-NEXT:    bx lr
  %sextsrc1 = sext <4 x i32> %src1 to <4 x i64>
  %sextsrc2 = sext <4 x i32> %src2 to <4 x i64>
  %add1 = add <4 x i64> %sextsrc1, %sextsrc2
  %add2 = add <4 x i64> %add1, <i64 1, i64 1, i64 1, i64 1>
  %resulti16 = lshr <4 x i64> %add2, <i64 1, i64 1, i64 1, i64 1>
  %result = trunc <4 x i64> %resulti16 to <4 x i32>
  ret <4 x i32> %result
}
; Signed halving add, i8 lanes: trunc((sext(a) + sext(b)) >> 1).
define arm_aapcs_vfpcc <16 x i8> @vhadd_s8(<16 x i8> %src1, <16 x i8> %src2) {
; CHECK-LABEL: vhadd_s8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmovlt.s8 q2, q1
; CHECK-NEXT:    vmovlt.s8 q3, q0
; CHECK-NEXT:    vmovlb.s8 q1, q1
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vadd.i16 q2, q3, q2
; CHECK-NEXT:    vadd.i16 q0, q0, q1
; CHECK-NEXT:    vshr.u16 q2, q2, #1
; CHECK-NEXT:    vshr.u16 q0, q0, #1
; CHECK-NEXT:    vmovnt.i16 q0, q2
; CHECK-NEXT:    bx lr
  %sextsrc1 = sext <16 x i8> %src1 to <16 x i16>
  %sextsrc2 = sext <16 x i8> %src2 to <16 x i16>
  %add = add <16 x i16> %sextsrc1, %sextsrc2
  %resulti16 = lshr <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <16 x i16> %resulti16 to <16 x i8>
  ret <16 x i8> %result
}
; Signed halving add, i16 lanes: trunc((sext(a) + sext(b)) >> 1).
define arm_aapcs_vfpcc <8 x i16> @vhadd_s16(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: vhadd_s16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmovlt.s16 q2, q1
; CHECK-NEXT:    vmovlt.s16 q3, q0
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vadd.i32 q2, q3, q2
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vshr.u32 q2, q2, #1
; CHECK-NEXT:    vshr.u32 q0, q0, #1
; CHECK-NEXT:    vmovnt.i32 q0, q2
; CHECK-NEXT:    bx lr
  %sextsrc1 = sext <8 x i16> %src1 to <8 x i32>
  %sextsrc2 = sext <8 x i16> %src2 to <8 x i32>
  %add = add <8 x i32> %sextsrc1, %sextsrc2
  %resulti16 = lshr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  ret <8 x i16> %result
}
; Signed halving add, i32 lanes via i64 widening; currently scalarized.
define arm_aapcs_vfpcc <4 x i32> @vhadd_s32(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LABEL: vhadd_s32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .vsave {d9}
; CHECK-NEXT:    vpush {d9}
; CHECK-NEXT:    vmov.f32 s8, s2
; CHECK-NEXT:    vmov r3, s4
; CHECK-NEXT:    vmov.f32 s10, s3
; CHECK-NEXT:    vmov.f32 s14, s1
; CHECK-NEXT:    vmov.f32 s18, s5
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov.f32 s8, s6
; CHECK-NEXT:    vmov.f32 s6, s7
; CHECK-NEXT:    vmov r2, s8
; CHECK-NEXT:    asrs r1, r0, #31
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    lsrl r0, r1, #1
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    adds r2, r1, r3
; CHECK-NEXT:    asr.w r12, r1, #31
; CHECK-NEXT:    adc.w r1, r12, r3, asr #31
; CHECK-NEXT:    lsrl r2, r1, #1
; CHECK-NEXT:    vmov r3, s18
; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    asrs r1, r0, #31
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    lsrl r0, r1, #1
; CHECK-NEXT:    vmov r1, s14
; CHECK-NEXT:    adds r2, r1, r3
; CHECK-NEXT:    asr.w r12, r1, #31
; CHECK-NEXT:    adc.w r1, r12, r3, asr #31
; CHECK-NEXT:    lsrl r2, r1, #1
; CHECK-NEXT:    vmov q0[3], q0[1], r2, r0
; CHECK-NEXT:    vpop {d9}
; CHECK-NEXT:    bx lr
  %sextsrc1 = sext <4 x i32> %src1 to <4 x i64>
  %sextsrc2 = sext <4 x i32> %src2 to <4 x i64>
  %add = add <4 x i64> %sextsrc1, %sextsrc2
  %resulti16 = lshr <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1>
  %result = trunc <4 x i64> %resulti16 to <4 x i32>
  ret <4 x i32> %result
}
; Unsigned rounding halving add, i8 lanes: trunc((zext(a) + zext(b) + 1) >> 1).
define arm_aapcs_vfpcc <16 x i8> @vrhadd_u8(<16 x i8> %src1, <16 x i8> %src2) {
; CHECK-LABEL: vrhadd_u8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmovlt.u8 q2, q1
; CHECK-NEXT:    vmovlt.u8 q3, q0
; CHECK-NEXT:    vmovlb.u8 q1, q1
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    vadd.i16 q2, q3, q2
; CHECK-NEXT:    vmov.i16 q3, #0x1
; CHECK-NEXT:    vadd.i16 q0, q0, q1
; CHECK-NEXT:    vadd.i16 q2, q2, q3
; CHECK-NEXT:    vadd.i16 q0, q0, q3
; CHECK-NEXT:    vshr.u16 q2, q2, #1
; CHECK-NEXT:    vshr.u16 q0, q0, #1
; CHECK-NEXT:    vmovnt.i16 q0, q2
; CHECK-NEXT:    bx lr
  %zextsrc1 = zext <16 x i8> %src1 to <16 x i16>
  %zextsrc2 = zext <16 x i8> %src2 to <16 x i16>
  %add1 = add <16 x i16> %zextsrc1, %zextsrc2
  %add2 = add <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %resulti16 = lshr <16 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <16 x i16> %resulti16 to <16 x i8>
  ret <16 x i8> %result
}
; Unsigned rounding halving add, i16 lanes: trunc((zext(a) + zext(b) + 1) >> 1).
define arm_aapcs_vfpcc <8 x i16> @vrhadd_u16(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: vrhadd_u16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmovlt.u16 q2, q1
; CHECK-NEXT:    vmovlt.u16 q3, q0
; CHECK-NEXT:    vmovlb.u16 q1, q1
; CHECK-NEXT:    vmovlb.u16 q0, q0
; CHECK-NEXT:    vadd.i32 q2, q3, q2
; CHECK-NEXT:    vmov.i32 q3, #0x1
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vadd.i32 q2, q2, q3
; CHECK-NEXT:    vadd.i32 q0, q0, q3
; CHECK-NEXT:    vshr.u32 q2, q2, #1
; CHECK-NEXT:    vshr.u32 q0, q0, #1
; CHECK-NEXT:    vmovnt.i32 q0, q2
; CHECK-NEXT:    bx lr
  %zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
  %zextsrc2 = zext <8 x i16> %src2 to <8 x i32>
  %add1 = add <8 x i32> %zextsrc1, %zextsrc2
  %add2 = add <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %resulti16 = lshr <8 x i32> %add2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  ret <8 x i16> %result
}
; Unsigned rounding halving add, i32 lanes via i64 widening; currently scalarized.
define arm_aapcs_vfpcc <4 x i32> @vrhadd_u32(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LABEL: vrhadd_u32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vmov.f32 s8, s6
; CHECK-NEXT:    vmov.i64 q4, #0xffffffff
; CHECK-NEXT:    vmov.f32 s10, s7
; CHECK-NEXT:    vmov.f32 s12, s2
; CHECK-NEXT:    vand q2, q2, q4
; CHECK-NEXT:    vmov.f32 s14, s3
; CHECK-NEXT:    vand q3, q3, q4
; CHECK-NEXT:    vmov r0, r1, d4
; CHECK-NEXT:    vmov r2, r3, d6
; CHECK-NEXT:    vmov.f32 s6, s5
; CHECK-NEXT:    vmov.f32 s2, s1
; CHECK-NEXT:    vand q1, q1, q4
; CHECK-NEXT:    vand q4, q0, q4
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    adds r0, #1
; CHECK-NEXT:    adc r1, r1, #0
; CHECK-NEXT:    vmov r3, r2, d8
; CHECK-NEXT:    lsrl r0, r1, #1
; CHECK-NEXT:    vmov r1, r12, d2
; CHECK-NEXT:    adds r1, r1, r3
; CHECK-NEXT:    adc.w r3, r2, r12
; CHECK-NEXT:    adds r2, r1, #1
; CHECK-NEXT:    adc r1, r3, #0
; CHECK-NEXT:    lsrl r2, r1, #1
; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
; CHECK-NEXT:    vmov r0, r1, d5
; CHECK-NEXT:    vmov r2, r3, d7
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    adds r0, #1
; CHECK-NEXT:    adc r1, r1, #0
; CHECK-NEXT:    vmov r3, r2, d9
; CHECK-NEXT:    lsrl r0, r1, #1
; CHECK-NEXT:    vmov r1, r12, d3
; CHECK-NEXT:    adds r1, r1, r3
; CHECK-NEXT:    adc.w r3, r2, r12
; CHECK-NEXT:    adds r2, r1, #1
; CHECK-NEXT:    adc r1, r3, #0
; CHECK-NEXT:    lsrl r2, r1, #1
; CHECK-NEXT:    vmov q0[3], q0[1], r2, r0
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
  %zextsrc1 = zext <4 x i32> %src1 to <4 x i64>
  %zextsrc2 = zext <4 x i32> %src2 to <4 x i64>
  %add1 = add <4 x i64> %zextsrc1, %zextsrc2
  %add2 = add <4 x i64> %add1, <i64 1, i64 1, i64 1, i64 1>
  %resulti16 = lshr <4 x i64> %add2, <i64 1, i64 1, i64 1, i64 1>
  %result = trunc <4 x i64> %resulti16 to <4 x i32>
  ret <4 x i32> %result
}
; Unsigned halving add, i8 lanes: trunc((zext(a) + zext(b)) >> 1).
define arm_aapcs_vfpcc <16 x i8> @vhadd_u8(<16 x i8> %src1, <16 x i8> %src2) {
; CHECK-LABEL: vhadd_u8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmovlt.u8 q2, q1
; CHECK-NEXT:    vmovlt.u8 q3, q0
; CHECK-NEXT:    vmovlb.u8 q1, q1
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    vadd.i16 q2, q3, q2
; CHECK-NEXT:    vadd.i16 q0, q0, q1
; CHECK-NEXT:    vshr.u16 q2, q2, #1
; CHECK-NEXT:    vshr.u16 q0, q0, #1
; CHECK-NEXT:    vmovnt.i16 q0, q2
; CHECK-NEXT:    bx lr
  %zextsrc1 = zext <16 x i8> %src1 to <16 x i16>
  %zextsrc2 = zext <16 x i8> %src2 to <16 x i16>
  %add = add <16 x i16> %zextsrc1, %zextsrc2
  %resulti16 = lshr <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <16 x i16> %resulti16 to <16 x i8>
  ret <16 x i8> %result
}
; Unsigned halving add, i16 lanes: trunc((zext(a) + zext(b)) >> 1).
define arm_aapcs_vfpcc <8 x i16> @vhadd_u16(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: vhadd_u16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmovlt.u16 q2, q1
; CHECK-NEXT:    vmovlt.u16 q3, q0
; CHECK-NEXT:    vmovlb.u16 q1, q1
; CHECK-NEXT:    vmovlb.u16 q0, q0
; CHECK-NEXT:    vadd.i32 q2, q3, q2
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vshr.u32 q2, q2, #1
; CHECK-NEXT:    vshr.u32 q0, q0, #1
; CHECK-NEXT:    vmovnt.i32 q0, q2
; CHECK-NEXT:    bx lr
  %zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
  %zextsrc2 = zext <8 x i16> %src2 to <8 x i32>
  %add = add <8 x i32> %zextsrc1, %zextsrc2
  %resulti16 = lshr <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %resulti16 to <8 x i16>
  ret <8 x i16> %result
}
; Unsigned halving add, i32 lanes via i64 widening; currently scalarized.
define arm_aapcs_vfpcc <4 x i32> @vhadd_u32(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LABEL: vhadd_u32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vmov.f32 s8, s6
; CHECK-NEXT:    vmov.i64 q4, #0xffffffff
; CHECK-NEXT:    vmov.f32 s10, s7
; CHECK-NEXT:    vmov.f32 s12, s2
; CHECK-NEXT:    vand q2, q2, q4
; CHECK-NEXT:    vmov.f32 s14, s3
; CHECK-NEXT:    vand q3, q3, q4
; CHECK-NEXT:    vmov r0, r1, d4
; CHECK-NEXT:    vmov r2, r3, d6
; CHECK-NEXT:    vmov.f32 s6, s5
; CHECK-NEXT:    vmov.f32 s2, s1
; CHECK-NEXT:    vand q1, q1, q4
; CHECK-NEXT:    vand q4, q0, q4
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    vmov r3, r2, d8
; CHECK-NEXT:    lsrl r0, r1, #1
; CHECK-NEXT:    vmov r1, r12, d2
; CHECK-NEXT:    adds r4, r3, r1
; CHECK-NEXT:    adc.w r1, r2, r12
; CHECK-NEXT:    vmov r2, r3, d7
; CHECK-NEXT:    lsrl r4, r1, #1
; CHECK-NEXT:    vmov q0[2], q0[0], r4, r0
; CHECK-NEXT:    vmov r0, r1, d5
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    vmov r3, r4, d9
; CHECK-NEXT:    lsrl r0, r1, #1
; CHECK-NEXT:    vmov r1, r12, d3
; CHECK-NEXT:    adds r2, r3, r1
; CHECK-NEXT:    adc.w r1, r4, r12
; CHECK-NEXT:    lsrl r2, r1, #1
; CHECK-NEXT:    vmov q0[3], q0[1], r2, r0
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    pop {r4, pc}
  %zextsrc1 = zext <4 x i32> %src1 to <4 x i64>
  %zextsrc2 = zext <4 x i32> %src2 to <4 x i64>
  %add = add <4 x i64> %zextsrc1, %zextsrc2
  %resulti16 = lshr <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1>
  %result = trunc <4 x i64> %resulti16 to <4 x i32>
  ret <4 x i32> %result
}
; Loop form of the signed i8 halving add over 1024 elements; %n is unused (trip count is fixed).
define void @vhadd_loop_s8(i8* nocapture readonly %x, i8* nocapture readonly %y, i8* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vhadd_loop_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:  .LBB12_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.s16 q0, [r0, #8]
; CHECK-NEXT:    vldrb.s16 q1, [r1, #8]
; CHECK-NEXT:    vadd.i16 q0, q1, q0
; CHECK-NEXT:    vldrb.s16 q1, [r1], #16
; CHECK-NEXT:    vshr.u16 q0, q0, #1
; CHECK-NEXT:    vstrb.16 q0, [r2, #8]
; CHECK-NEXT:    vldrb.s16 q0, [r0], #16
; CHECK-NEXT:    vadd.i16 q0, q1, q0
; CHECK-NEXT:    vshr.u16 q0, q0, #1
; CHECK-NEXT:    vstrb.16 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB12_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, i8* %x, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %1, align 1
  %2 = sext <16 x i8> %wide.load to <16 x i16>
  %3 = getelementptr inbounds i8, i8* %y, i32 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.load16 = load <16 x i8>, <16 x i8>* %4, align 1
  %5 = sext <16 x i8> %wide.load16 to <16 x i16>
  %6 = add nsw <16 x i16> %5, %2
  %7 = lshr <16 x i16> %6, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %8 = trunc <16 x i16> %7 to <16 x i8>
  %9 = getelementptr inbounds i8, i8* %z, i32 %index
  %10 = bitcast i8* %9 to <16 x i8>*
  store <16 x i8> %8, <16 x i8>* %10, align 1
  %index.next = add i32 %index, 16
  %11 = icmp eq i32 %index.next, 1024
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
; Loop form of the signed i16 halving add over 1024 elements; %n is unused (trip count is fixed).
define void @vhadd_loop_s16(i16* nocapture readonly %x, i16* nocapture readonly %y, i16* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vhadd_loop_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:  .LBB13_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.s32 q0, [r0, #8]
; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
; CHECK-NEXT:    vadd.i32 q0, q1, q0
; CHECK-NEXT:    vldrh.s32 q1, [r1], #16
; CHECK-NEXT:    vshr.u32 q0, q0, #1
; CHECK-NEXT:    vstrh.32 q0, [r2, #8]
; CHECK-NEXT:    vldrh.s32 q0, [r0], #16
; CHECK-NEXT:    vadd.i32 q0, q1, q0
; CHECK-NEXT:    vshr.u32 q0, q0, #1
; CHECK-NEXT:    vstrh.32 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB13_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, i16* %x, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.load = load <8 x i16>, <8 x i16>* %1, align 2
  %2 = sext <8 x i16> %wide.load to <8 x i32>
  %3 = getelementptr inbounds i16, i16* %y, i32 %index
  %4 = bitcast i16* %3 to <8 x i16>*
  %wide.load16 = load <8 x i16>, <8 x i16>* %4, align 2
  %5 = sext <8 x i16> %wide.load16 to <8 x i32>
  %6 = add nsw <8 x i32> %5, %2
  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <8 x i32> %7 to <8 x i16>
  %9 = getelementptr inbounds i16, i16* %z, i32 %index
  %10 = bitcast i16* %9 to <8 x i16>*
  store <8 x i16> %8, <8 x i16>* %10, align 2
  %index.next = add i32 %index, 8
  %11 = icmp eq i32 %index.next, 1024
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
; Loop form of the signed i32 halving add over 1024 elements; scalarized i64 arithmetic.
define void @vhadd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vhadd_loop_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    .vsave {d9}
; CHECK-NEXT:    vpush {d9}
; CHECK-NEXT:    mov.w lr, #256
; CHECK-NEXT:  .LBB14_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vmov.f32 s4, s2
; CHECK-NEXT:    vmov.f32 s2, s3
; CHECK-NEXT:    vmov.f32 s10, s1
; CHECK-NEXT:    vmov r3, s4
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vmov.f32 s8, s6
; CHECK-NEXT:    vmov.f32 s6, s7
; CHECK-NEXT:    vmov.f32 s18, s5
; CHECK-NEXT:    vmov r5, s8
; CHECK-NEXT:    asrs r4, r3, #31
; CHECK-NEXT:    adds.w r12, r3, r5
; CHECK-NEXT:    adc.w r3, r4, r5, asr #31
; CHECK-NEXT:    vmov r5, s4
; CHECK-NEXT:    lsrl r12, r3, #1
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    adds r6, r3, r5
; CHECK-NEXT:    asr.w r4, r3, #31
; CHECK-NEXT:    adc.w r3, r4, r5, asr #31
; CHECK-NEXT:    lsrl r6, r3, #1
; CHECK-NEXT:    vmov r5, s6
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vmov q3[2], q3[0], r6, r12
; CHECK-NEXT:    adds r4, r3, r5
; CHECK-NEXT:    asr.w r6, r3, #31
; CHECK-NEXT:    adc.w r3, r6, r5, asr #31
; CHECK-NEXT:    lsrl r4, r3, #1
; CHECK-NEXT:    vmov r5, s18
; CHECK-NEXT:    vmov r3, s10
; CHECK-NEXT:    adds r6, r3, r5
; CHECK-NEXT:    asr.w r12, r3, #31
; CHECK-NEXT:    adc.w r3, r12, r5, asr #31
; CHECK-NEXT:    lsrl r6, r3, #1
; CHECK-NEXT:    vmov q3[3], q3[1], r6, r4
; CHECK-NEXT:    vstrb.8 q3, [r2], #16
; CHECK-NEXT:    le lr, .LBB14_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    vpop {d9}
; CHECK-NEXT:    pop {r4, r5, r6, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = sext <4 x i32> %wide.load to <4 x i64>
  %3 = getelementptr inbounds i32, i32* %y, i32 %index
  %4 = bitcast i32* %3 to <4 x i32>*
  %wide.load16 = load <4 x i32>, <4 x i32>* %4, align 4
  %5 = sext <4 x i32> %wide.load16 to <4 x i64>
  %6 = add nsw <4 x i64> %5, %2
  %7 = lshr <4 x i64> %6, <i64 1, i64 1, i64 1, i64 1>
  %8 = trunc <4 x i64> %7 to <4 x i32>
  %9 = getelementptr inbounds i32, i32* %z, i32 %index
  %10 = bitcast i32* %9 to <4 x i32>*
  store <4 x i32> %8, <4 x i32>* %10, align 4
  %index.next = add i32 %index, 4
  %11 = icmp eq i32 %index.next, 1024
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
; Loop form of the unsigned i8 halving add over 1024 elements; folds to vhadd.u16.
define void @vhadd_loop_u8(i8* nocapture readonly %x, i8* nocapture readonly %y, i8* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vhadd_loop_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:  .LBB15_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u16 q0, [r0, #8]
; CHECK-NEXT:    vldrb.u16 q1, [r1, #8]
; CHECK-NEXT:    vhadd.u16 q0, q1, q0
; CHECK-NEXT:    vldrb.u16 q1, [r1], #16
; CHECK-NEXT:    vstrb.16 q0, [r2, #8]
; CHECK-NEXT:    vldrb.u16 q0, [r0], #16
; CHECK-NEXT:    vhadd.u16 q0, q1, q0
; CHECK-NEXT:    vstrb.16 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB15_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, i8* %x, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %1, align 1
  %2 = zext <16 x i8> %wide.load to <16 x i16>
  %3 = getelementptr inbounds i8, i8* %y, i32 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.load16 = load <16 x i8>, <16 x i8>* %4, align 1
  %5 = zext <16 x i8> %wide.load16 to <16 x i16>
  %6 = add nuw nsw <16 x i16> %5, %2
  %7 = lshr <16 x i16> %6, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %8 = trunc <16 x i16> %7 to <16 x i8>
  %9 = getelementptr inbounds i8, i8* %z, i32 %index
  %10 = bitcast i8* %9 to <16 x i8>*
  store <16 x i8> %8, <16 x i8>* %10, align 1
  %index.next = add i32 %index, 16
  %11 = icmp eq i32 %index.next, 1024
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
; Loop form of the unsigned i16 halving add over 1024 elements; folds to vhadd.u32.
define void @vhadd_loop_u16(i16* nocapture readonly %x, i16* nocapture readonly %y, i16* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vhadd_loop_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:  .LBB16_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u32 q0, [r0, #8]
; CHECK-NEXT:    vldrh.u32 q1, [r1, #8]
; CHECK-NEXT:    vhadd.u32 q0, q1, q0
; CHECK-NEXT:    vldrh.u32 q1, [r1], #16
; CHECK-NEXT:    vstrh.32 q0, [r2, #8]
; CHECK-NEXT:    vldrh.u32 q0, [r0], #16
; CHECK-NEXT:    vhadd.u32 q0, q1, q0
; CHECK-NEXT:    vstrh.32 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB16_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, i16* %x, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.load = load <8 x i16>, <8 x i16>* %1, align 2
  %2 = zext <8 x i16> %wide.load to <8 x i32>
  %3 = getelementptr inbounds i16, i16* %y, i32 %index
  %4 = bitcast i16* %3 to <8 x i16>*
  %wide.load16 = load <8 x i16>, <8 x i16>* %4, align 2
  %5 = zext <8 x i16> %wide.load16 to <8 x i32>
  %6 = add nuw nsw <8 x i32> %5, %2
  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <8 x i32> %7 to <8 x i16>
  %9 = getelementptr inbounds i16, i16* %z, i32 %index
  %10 = bitcast i16* %9 to <8 x i16>*
  store <8 x i16> %8, <8 x i16>* %10, align 2
  %index.next = add i32 %index, 8
  %11 = icmp eq i32 %index.next, 1024
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
; Loop form of the unsigned i32 halving add over 1024 elements; scalarized i64 arithmetic.
define void @vhadd_loop_u32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vhadd_loop_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    .pad #4
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    mov.w lr, #256
; CHECK-NEXT:    vmov.i64 q0, #0xffffffff
; CHECK-NEXT:  .LBB17_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q3, [r0], #16
; CHECK-NEXT:    vldrw.u32 q4, [r1], #16
; CHECK-NEXT:    vmov.f32 s4, s14
; CHECK-NEXT:    vmov.f32 s6, s15
; CHECK-NEXT:    vmov.f32 s8, s18
; CHECK-NEXT:    vand q1, q1, q0
; CHECK-NEXT:    vmov.f32 s10, s19
; CHECK-NEXT:    vand q2, q2, q0
; CHECK-NEXT:    vmov r3, r5, d2
; CHECK-NEXT:    vmov r4, r6, d4
; CHECK-NEXT:    vmov.f32 s14, s13
; CHECK-NEXT:    vmov.f32 s18, s17
; CHECK-NEXT:    vand q3, q3, q0
; CHECK-NEXT:    vand q5, q4, q0
; CHECK-NEXT:    adds.w r12, r4, r3
; CHECK-NEXT:    adc.w r3, r6, r5
; CHECK-NEXT:    vmov r5, r6, d10
; CHECK-NEXT:    lsrl r12, r3, #1
; CHECK-NEXT:    vmov r3, r7, d6
; CHECK-NEXT:    adds r4, r5, r3
; CHECK-NEXT:    adc.w r3, r6, r7
; CHECK-NEXT:    vmov r6, r5, d5
; CHECK-NEXT:    lsrl r4, r3, #1
; CHECK-NEXT:    vmov r3, r7, d3
; CHECK-NEXT:    vmov q4[2], q4[0], r4, r12
; CHECK-NEXT:    adds r6, r6, r3
; CHECK-NEXT:    adc.w r3, r5, r7
; CHECK-NEXT:    vmov r5, r7, d11
; CHECK-NEXT:    lsrl r6, r3, #1
; CHECK-NEXT:    vmov r3, r12, d7
; CHECK-NEXT:    adds r4, r5, r3
; CHECK-NEXT:    adc.w r3, r7, r12
; CHECK-NEXT:    lsrl r4, r3, #1
; CHECK-NEXT:    vmov q4[3], q4[1], r4, r6
; CHECK-NEXT:    vstrb.8 q4, [r2], #16
; CHECK-NEXT:    le lr, .LBB17_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = zext <4 x i32> %wide.load to <4 x i64>
  %3 = getelementptr inbounds i32, i32* %y, i32 %index
  %4 = bitcast i32* %3 to <4 x i32>*
  %wide.load16 = load <4 x i32>, <4 x i32>* %4, align 4
  %5 = zext <4 x i32> %wide.load16 to <4 x i64>
  %6 = add nuw nsw <4 x i64> %5, %2
  %7 = lshr <4 x i64> %6, <i64 1, i64 1, i64 1, i64 1>
  %8 = trunc <4 x i64> %7 to <4 x i32>
  %9 = getelementptr inbounds i32, i32* %z, i32 %index
  %10 = bitcast i32* %9 to <4 x i32>*
  store <4 x i32> %8, <4 x i32>* %10, align 4
  %index.next = add i32 %index, 4
  %11 = icmp eq i32 %index.next, 1024
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
; Loop form of the rounding halving add on zext'd i8 data (add-1 then shift), 1024 elements.
define void @vrhadd_loop_s8(i8* nocapture readonly %x, i8* nocapture readonly %y, i8* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vrhadd_loop_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:    vmov.i16 q0, #0x1
; CHECK-NEXT:  .LBB18_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u16 q1, [r1, #8]
; CHECK-NEXT:    vldrb.u16 q2, [r0, #8]
; CHECK-NEXT:    vadd.i16 q1, q2, q1
; CHECK-NEXT:    vldrb.u16 q2, [r0], #16
; CHECK-NEXT:    vadd.i16 q1, q1, q0
; CHECK-NEXT:    vshr.u16 q1, q1, #1
; CHECK-NEXT:    vstrb.16 q1, [r2, #8]
; CHECK-NEXT:    vldrb.u16 q1, [r1], #16
; CHECK-NEXT:    vadd.i16 q1, q2, q1
; CHECK-NEXT:    vadd.i16 q1, q1, q0
; CHECK-NEXT:    vshr.u16 q1, q1, #1
; CHECK-NEXT:    vstrb.16 q1, [r2], #16
; CHECK-NEXT:    le lr, .LBB18_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, i8* %x, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %1, align 1
  %2 = zext <16 x i8> %wide.load to <16 x i16>
  %3 = getelementptr inbounds i8, i8* %y, i32 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.load16 = load <16 x i8>, <16 x i8>* %4, align 1
  %5 = zext <16 x i8> %wide.load16 to <16 x i16>
  %6 = add nuw nsw <16 x i16> %2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %7 = add nuw nsw <16 x i16> %6, %5
  %8 = lshr <16 x i16> %7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %9 = trunc <16 x i16> %8 to <16 x i8>
  %10 = getelementptr inbounds i8, i8* %z, i32 %index
  %11 = bitcast i8* %10 to <16 x i8>*
  store <16 x i8> %9, <16 x i8>* %11, align 1
  %index.next = add i32 %index, 16
  %12 = icmp eq i32 %index.next, 1024
  br i1 %12, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
; Loop form of the rounding halving add on zext'd i16 data (add-1 then shift), 1024 elements.
define void @vrhadd_loop_s16(i16* nocapture readonly %x, i16* nocapture readonly %y, i16* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vrhadd_loop_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:    vmov.i32 q0, #0x1
; CHECK-NEXT:  .LBB19_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u32 q1, [r1, #8]
; CHECK-NEXT:    vldrh.u32 q2, [r0, #8]
; CHECK-NEXT:    vadd.i32 q1, q2, q1
; CHECK-NEXT:    vldrh.u32 q2, [r0], #16
; CHECK-NEXT:    vadd.i32 q1, q1, q0
; CHECK-NEXT:    vshr.u32 q1, q1, #1
; CHECK-NEXT:    vstrh.32 q1, [r2, #8]
; CHECK-NEXT:    vldrh.u32 q1, [r1], #16
; CHECK-NEXT:    vadd.i32 q1, q2, q1
; CHECK-NEXT:    vadd.i32 q1, q1, q0
; CHECK-NEXT:    vshr.u32 q1, q1, #1
; CHECK-NEXT:    vstrh.32 q1, [r2], #16
; CHECK-NEXT:    le lr, .LBB19_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, i16* %x, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.load = load <8 x i16>, <8 x i16>* %1, align 2
  %2 = zext <8 x i16> %wide.load to <8 x i32>
  %3 = getelementptr inbounds i16, i16* %y, i32 %index
  %4 = bitcast i16* %3 to <8 x i16>*
  %wide.load16 = load <8 x i16>, <8 x i16>* %4, align 2
  %5 = zext <8 x i16> %wide.load16 to <8 x i32>
  %6 = add nuw nsw <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = add nuw nsw <8 x i32> %6, %5
  %8 = lshr <8 x i32> %7, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %9 = trunc <8 x i32> %8 to <8 x i16>
  %10 = getelementptr inbounds i16, i16* %z, i32 %index
  %11 = bitcast i16* %10 to <8 x i16>*
  store <8 x i16> %9, <8 x i16>* %11, align 2
  %index.next = add i32 %index, 8
  %12 = icmp eq i32 %index.next, 1024
  br i1 %12, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
; Loop form of the rounding-halving-add pattern on i32 elements:
; z[i] = (x[i] + y[i] + 1) >> 1 for 1024 elements, widened through i64.
; The FileCheck body shows the i64 arithmetic is NOT matched to a single
; MVE instruction: each lane is extracted (vmov/vand with the 0xffffffff
; mask in q0) and handled with scalar adds/adc/lsrl 64-bit pairs.
; NOTE(review): despite the "_s32" name this IR widens with zext -- confirm
; whether a sext variant was intended; regenerating the FileCheck lines
; with utils/update_llc_test_checks.py is required if the IR is changed.
846 define void @vrhadd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) {
847 ; CHECK-LABEL: vrhadd_loop_s32:
848 ; CHECK: @ %bb.0: @ %entry
849 ; CHECK-NEXT: .save {r4, r5, r6, lr}
850 ; CHECK-NEXT: push {r4, r5, r6, lr}
851 ; CHECK-NEXT: .vsave {d8, d9, d10, d11}
852 ; CHECK-NEXT: vpush {d8, d9, d10, d11}
853 ; CHECK-NEXT: mov.w lr, #256
854 ; CHECK-NEXT: vmov.i64 q0, #0xffffffff
855 ; CHECK-NEXT: .LBB20_1: @ %vector.body
856 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
857 ; CHECK-NEXT: vldrw.u32 q3, [r1], #16
858 ; CHECK-NEXT: vldrw.u32 q4, [r0], #16
859 ; CHECK-NEXT: vmov.f32 s4, s14
860 ; CHECK-NEXT: vmov.f32 s6, s15
861 ; CHECK-NEXT: vmov.f32 s8, s18
862 ; CHECK-NEXT: vand q1, q1, q0
863 ; CHECK-NEXT: vmov.f32 s10, s19
864 ; CHECK-NEXT: vand q2, q2, q0
865 ; CHECK-NEXT: vmov r3, r12, d2
866 ; CHECK-NEXT: vmov r4, r5, d4
867 ; CHECK-NEXT: vmov.f32 s14, s13
868 ; CHECK-NEXT: vmov.f32 s18, s17
869 ; CHECK-NEXT: vand q3, q3, q0
870 ; CHECK-NEXT: vand q5, q4, q0
871 ; CHECK-NEXT: adds r3, r3, r4
872 ; CHECK-NEXT: adc.w r4, r5, r12
873 ; CHECK-NEXT: adds.w r12, r3, #1
874 ; CHECK-NEXT: adc r3, r4, #0
875 ; CHECK-NEXT: vmov r5, r6, d10
876 ; CHECK-NEXT: lsrl r12, r3, #1
877 ; CHECK-NEXT: vmov r3, r4, d6
878 ; CHECK-NEXT: adds r3, r3, r5
879 ; CHECK-NEXT: adcs r4, r6
880 ; CHECK-NEXT: adds r6, r3, #1
881 ; CHECK-NEXT: adc r3, r4, #0
882 ; CHECK-NEXT: vmov r5, r4, d5
883 ; CHECK-NEXT: lsrl r6, r3, #1
884 ; CHECK-NEXT: vmov q4[2], q4[0], r6, r12
885 ; CHECK-NEXT: vmov r3, r6, d3
886 ; CHECK-NEXT: adds r3, r3, r5
887 ; CHECK-NEXT: adcs r4, r6
888 ; CHECK-NEXT: adds.w r12, r3, #1
889 ; CHECK-NEXT: adc r3, r4, #0
890 ; CHECK-NEXT: vmov r5, r6, d11
891 ; CHECK-NEXT: lsrl r12, r3, #1
892 ; CHECK-NEXT: vmov r3, r4, d7
893 ; CHECK-NEXT: adds r3, r3, r5
894 ; CHECK-NEXT: adcs r4, r6
895 ; CHECK-NEXT: adds r6, r3, #1
896 ; CHECK-NEXT: adc r3, r4, #0
897 ; CHECK-NEXT: lsrl r6, r3, #1
898 ; CHECK-NEXT: vmov q4[3], q4[1], r6, r12
899 ; CHECK-NEXT: vstrb.8 q4, [r2], #16
900 ; CHECK-NEXT: le lr, .LBB20_1
901 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
902 ; CHECK-NEXT: vpop {d8, d9, d10, d11}
903 ; CHECK-NEXT: pop {r4, r5, r6, pc}
905 br label %vector.body
907 vector.body: ; preds = %vector.body, %entry
908 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
909 %0 = getelementptr inbounds i32, i32* %x, i32 %index
910 %1 = bitcast i32* %0 to <4 x i32>*
911 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
912 %2 = zext <4 x i32> %wide.load to <4 x i64>
913 %3 = getelementptr inbounds i32, i32* %y, i32 %index
914 %4 = bitcast i32* %3 to <4 x i32>*
915 %wide.load16 = load <4 x i32>, <4 x i32>* %4, align 4
916 %5 = zext <4 x i32> %wide.load16 to <4 x i64>
917 %6 = add nuw nsw <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1>
918 %7 = add nuw nsw <4 x i64> %6, %5
919 %8 = lshr <4 x i64> %7, <i64 1, i64 1, i64 1, i64 1>
920 %9 = trunc <4 x i64> %8 to <4 x i32>
921 %10 = getelementptr inbounds i32, i32* %z, i32 %index
922 %11 = bitcast i32* %10 to <4 x i32>*
923 store <4 x i32> %9, <4 x i32>* %11, align 4
924 %index.next = add i32 %index, 4
925 %12 = icmp eq i32 %index.next, 1024
926 br i1 %12, label %for.cond.cleanup, label %vector.body
928 for.cond.cleanup: ; preds = %vector.body
; Loop form of the unsigned rounding-halving-add pattern on i8 elements:
; z[i] = (x[i] + y[i] + 1) >> 1 for 1024 elements, zero-extended through
; i16 so the +1 and shift cannot overflow. The generated code widens with
; vldrb.u16 loads and narrows with vstrb.16 stores rather than matching a
; single rounding instruction. FileCheck lines are autogenerated by
; utils/update_llc_test_checks.py -- regenerate rather than hand-edit.
932 define void @vrhadd_loop_u8(i8* nocapture readonly %x, i8* nocapture readonly %y, i8* noalias nocapture %z, i32 %n) {
933 ; CHECK-LABEL: vrhadd_loop_u8:
934 ; CHECK: @ %bb.0: @ %entry
935 ; CHECK-NEXT: .save {r7, lr}
936 ; CHECK-NEXT: push {r7, lr}
937 ; CHECK-NEXT: mov.w lr, #64
938 ; CHECK-NEXT: vmov.i16 q0, #0x1
939 ; CHECK-NEXT: .LBB21_1: @ %vector.body
940 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
941 ; CHECK-NEXT: vldrb.u16 q1, [r1, #8]
942 ; CHECK-NEXT: vldrb.u16 q2, [r0, #8]
943 ; CHECK-NEXT: vadd.i16 q1, q2, q1
944 ; CHECK-NEXT: vldrb.u16 q2, [r0], #16
945 ; CHECK-NEXT: vadd.i16 q1, q1, q0
946 ; CHECK-NEXT: vshr.u16 q1, q1, #1
947 ; CHECK-NEXT: vstrb.16 q1, [r2, #8]
948 ; CHECK-NEXT: vldrb.u16 q1, [r1], #16
949 ; CHECK-NEXT: vadd.i16 q1, q2, q1
950 ; CHECK-NEXT: vadd.i16 q1, q1, q0
951 ; CHECK-NEXT: vshr.u16 q1, q1, #1
952 ; CHECK-NEXT: vstrb.16 q1, [r2], #16
953 ; CHECK-NEXT: le lr, .LBB21_1
954 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
955 ; CHECK-NEXT: pop {r7, pc}
957 br label %vector.body
959 vector.body: ; preds = %vector.body, %entry
960 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
961 %0 = getelementptr inbounds i8, i8* %x, i32 %index
962 %1 = bitcast i8* %0 to <16 x i8>*
963 %wide.load = load <16 x i8>, <16 x i8>* %1, align 1
964 %2 = zext <16 x i8> %wide.load to <16 x i16>
965 %3 = getelementptr inbounds i8, i8* %y, i32 %index
966 %4 = bitcast i8* %3 to <16 x i8>*
967 %wide.load16 = load <16 x i8>, <16 x i8>* %4, align 1
968 %5 = zext <16 x i8> %wide.load16 to <16 x i16>
969 %6 = add nuw nsw <16 x i16> %2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
970 %7 = add nuw nsw <16 x i16> %6, %5
971 %8 = lshr <16 x i16> %7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
972 %9 = trunc <16 x i16> %8 to <16 x i8>
973 %10 = getelementptr inbounds i8, i8* %z, i32 %index
974 %11 = bitcast i8* %10 to <16 x i8>*
975 store <16 x i8> %9, <16 x i8>* %11, align 1
976 %index.next = add i32 %index, 16
977 %12 = icmp eq i32 %index.next, 1024
978 br i1 %12, label %for.cond.cleanup, label %vector.body
980 for.cond.cleanup: ; preds = %vector.body
; Loop form of the unsigned rounding-halving-add pattern on i16 elements:
; z[i] = (x[i] + y[i] + 1) >> 1 for 1024 elements, zero-extended through
; i32. Generated code widens with vldrh.u32 loads and narrows with
; vstrh.32 stores; the trip count is 1024/8 = 128 iterations (mov.w lr).
; FileCheck lines are autogenerated by utils/update_llc_test_checks.py --
; regenerate rather than hand-edit.
984 define void @vrhadd_loop_u16(i16* nocapture readonly %x, i16* nocapture readonly %y, i16* noalias nocapture %z, i32 %n) {
985 ; CHECK-LABEL: vrhadd_loop_u16:
986 ; CHECK: @ %bb.0: @ %entry
987 ; CHECK-NEXT: .save {r7, lr}
988 ; CHECK-NEXT: push {r7, lr}
989 ; CHECK-NEXT: mov.w lr, #128
990 ; CHECK-NEXT: vmov.i32 q0, #0x1
991 ; CHECK-NEXT: .LBB22_1: @ %vector.body
992 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
993 ; CHECK-NEXT: vldrh.u32 q1, [r1, #8]
994 ; CHECK-NEXT: vldrh.u32 q2, [r0, #8]
995 ; CHECK-NEXT: vadd.i32 q1, q2, q1
996 ; CHECK-NEXT: vldrh.u32 q2, [r0], #16
997 ; CHECK-NEXT: vadd.i32 q1, q1, q0
998 ; CHECK-NEXT: vshr.u32 q1, q1, #1
999 ; CHECK-NEXT: vstrh.32 q1, [r2, #8]
1000 ; CHECK-NEXT: vldrh.u32 q1, [r1], #16
1001 ; CHECK-NEXT: vadd.i32 q1, q2, q1
1002 ; CHECK-NEXT: vadd.i32 q1, q1, q0
1003 ; CHECK-NEXT: vshr.u32 q1, q1, #1
1004 ; CHECK-NEXT: vstrh.32 q1, [r2], #16
1005 ; CHECK-NEXT: le lr, .LBB22_1
1006 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
1007 ; CHECK-NEXT: pop {r7, pc}
1009 br label %vector.body
1011 vector.body: ; preds = %vector.body, %entry
1012 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
1013 %0 = getelementptr inbounds i16, i16* %x, i32 %index
1014 %1 = bitcast i16* %0 to <8 x i16>*
1015 %wide.load = load <8 x i16>, <8 x i16>* %1, align 2
1016 %2 = zext <8 x i16> %wide.load to <8 x i32>
1017 %3 = getelementptr inbounds i16, i16* %y, i32 %index
1018 %4 = bitcast i16* %3 to <8 x i16>*
1019 %wide.load16 = load <8 x i16>, <8 x i16>* %4, align 2
1020 %5 = zext <8 x i16> %wide.load16 to <8 x i32>
1021 %6 = add nuw nsw <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1022 %7 = add nuw nsw <8 x i32> %6, %5
1023 %8 = lshr <8 x i32> %7, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1024 %9 = trunc <8 x i32> %8 to <8 x i16>
1025 %10 = getelementptr inbounds i16, i16* %z, i32 %index
1026 %11 = bitcast i16* %10 to <8 x i16>*
1027 store <8 x i16> %9, <8 x i16>* %11, align 2
1028 %index.next = add i32 %index, 8
1029 %12 = icmp eq i32 %index.next, 1024
1030 br i1 %12, label %for.cond.cleanup, label %vector.body
1032 for.cond.cleanup: ; preds = %vector.body
; Loop form of the unsigned rounding-halving-add pattern on i32 elements:
; z[i] = (x[i] + y[i] + 1) >> 1 for 1024 elements, zero-extended through
; i64. As in vrhadd_loop_s32 above the i64 arithmetic is not matched to a
; single MVE instruction: lanes are extracted (vmov/vand against the
; 0xffffffff mask in q0) and computed with scalar adds/adc/lsrl 64-bit
; pairs. FileCheck lines are autogenerated by
; utils/update_llc_test_checks.py -- regenerate rather than hand-edit.
1036 define void @vrhadd_loop_u32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) {
1037 ; CHECK-LABEL: vrhadd_loop_u32:
1038 ; CHECK: @ %bb.0: @ %entry
1039 ; CHECK-NEXT: .save {r4, r5, r6, lr}
1040 ; CHECK-NEXT: push {r4, r5, r6, lr}
1041 ; CHECK-NEXT: .vsave {d8, d9, d10, d11}
1042 ; CHECK-NEXT: vpush {d8, d9, d10, d11}
1043 ; CHECK-NEXT: mov.w lr, #256
1044 ; CHECK-NEXT: vmov.i64 q0, #0xffffffff
1045 ; CHECK-NEXT: .LBB23_1: @ %vector.body
1046 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1047 ; CHECK-NEXT: vldrw.u32 q3, [r1], #16
1048 ; CHECK-NEXT: vldrw.u32 q4, [r0], #16
1049 ; CHECK-NEXT: vmov.f32 s4, s14
1050 ; CHECK-NEXT: vmov.f32 s6, s15
1051 ; CHECK-NEXT: vmov.f32 s8, s18
1052 ; CHECK-NEXT: vand q1, q1, q0
1053 ; CHECK-NEXT: vmov.f32 s10, s19
1054 ; CHECK-NEXT: vand q2, q2, q0
1055 ; CHECK-NEXT: vmov r3, r12, d2
1056 ; CHECK-NEXT: vmov r4, r5, d4
1057 ; CHECK-NEXT: vmov.f32 s14, s13
1058 ; CHECK-NEXT: vmov.f32 s18, s17
1059 ; CHECK-NEXT: vand q3, q3, q0
1060 ; CHECK-NEXT: vand q5, q4, q0
1061 ; CHECK-NEXT: adds r3, r3, r4
1062 ; CHECK-NEXT: adc.w r4, r5, r12
1063 ; CHECK-NEXT: adds.w r12, r3, #1
1064 ; CHECK-NEXT: adc r3, r4, #0
1065 ; CHECK-NEXT: vmov r5, r6, d10
1066 ; CHECK-NEXT: lsrl r12, r3, #1
1067 ; CHECK-NEXT: vmov r3, r4, d6
1068 ; CHECK-NEXT: adds r3, r3, r5
1069 ; CHECK-NEXT: adcs r4, r6
1070 ; CHECK-NEXT: adds r6, r3, #1
1071 ; CHECK-NEXT: adc r3, r4, #0
1072 ; CHECK-NEXT: vmov r5, r4, d5
1073 ; CHECK-NEXT: lsrl r6, r3, #1
1074 ; CHECK-NEXT: vmov q4[2], q4[0], r6, r12
1075 ; CHECK-NEXT: vmov r3, r6, d3
1076 ; CHECK-NEXT: adds r3, r3, r5
1077 ; CHECK-NEXT: adcs r4, r6
1078 ; CHECK-NEXT: adds.w r12, r3, #1
1079 ; CHECK-NEXT: adc r3, r4, #0
1080 ; CHECK-NEXT: vmov r5, r6, d11
1081 ; CHECK-NEXT: lsrl r12, r3, #1
1082 ; CHECK-NEXT: vmov r3, r4, d7
1083 ; CHECK-NEXT: adds r3, r3, r5
1084 ; CHECK-NEXT: adcs r4, r6
1085 ; CHECK-NEXT: adds r6, r3, #1
1086 ; CHECK-NEXT: adc r3, r4, #0
1087 ; CHECK-NEXT: lsrl r6, r3, #1
1088 ; CHECK-NEXT: vmov q4[3], q4[1], r6, r12
1089 ; CHECK-NEXT: vstrb.8 q4, [r2], #16
1090 ; CHECK-NEXT: le lr, .LBB23_1
1091 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
1092 ; CHECK-NEXT: vpop {d8, d9, d10, d11}
1093 ; CHECK-NEXT: pop {r4, r5, r6, pc}
1095 br label %vector.body
1097 vector.body: ; preds = %vector.body, %entry
1098 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
1099 %0 = getelementptr inbounds i32, i32* %x, i32 %index
1100 %1 = bitcast i32* %0 to <4 x i32>*
1101 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
1102 %2 = zext <4 x i32> %wide.load to <4 x i64>
1103 %3 = getelementptr inbounds i32, i32* %y, i32 %index
1104 %4 = bitcast i32* %3 to <4 x i32>*
1105 %wide.load16 = load <4 x i32>, <4 x i32>* %4, align 4
1106 %5 = zext <4 x i32> %wide.load16 to <4 x i64>
1107 %6 = add nuw nsw <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1>
1108 %7 = add nuw nsw <4 x i64> %6, %5
1109 %8 = lshr <4 x i64> %7, <i64 1, i64 1, i64 1, i64 1>
1110 %9 = trunc <4 x i64> %8 to <4 x i32>
1111 %10 = getelementptr inbounds i32, i32* %z, i32 %index
1112 %11 = bitcast i32* %10 to <4 x i32>*
1113 store <4 x i32> %9, <4 x i32>* %11, align 4
1114 %index.next = add i32 %index, 4
1115 %12 = icmp eq i32 %index.next, 1024
1116 br i1 %12, label %for.cond.cleanup, label %vector.body
1118 for.cond.cleanup: ; preds = %vector.body