; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
; v4i32 signed halving add: sext to i64, add, lshr 1, trunc selects vhadd.s32.
define arm_aapcs_vfpcc <4 x i32> @vhadds_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
; CHECK-LABEL: vhadds_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vhadd.s32 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <4 x i32> %s0 to <4 x i64>
  %s1s = sext <4 x i32> %s1 to <4 x i64>
  %m = add <4 x i64> %s0s, %s1s
  %s = lshr <4 x i64> %m, <i64 1, i64 1, i64 1, i64 1>
  %s2 = trunc <4 x i64> %s to <4 x i32>
  ret <4 x i32> %s2
}
; v4i32 unsigned halving add: zext to i64, add, lshr 1, trunc selects vhadd.u32.
define arm_aapcs_vfpcc <4 x i32> @vhaddu_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
; CHECK-LABEL: vhaddu_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vhadd.u32 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <4 x i32> %s0 to <4 x i64>
  %s1s = zext <4 x i32> %s1 to <4 x i64>
  %m = add <4 x i64> %s0s, %s1s
  %s = lshr <4 x i64> %m, <i64 1, i64 1, i64 1, i64 1>
  %s2 = trunc <4 x i64> %s to <4 x i32>
  ret <4 x i32> %s2
}
; v4i16 is not a legal MVE type, so the lanes are extended and the add/shift
; stay as separate instructions instead of folding to vhadd.
define arm_aapcs_vfpcc <4 x i16> @vhadds_v4i16(<4 x i16> %s0, <4 x i16> %s1) {
; CHECK-LABEL: vhadds_v4i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vshr.u32 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <4 x i16> %s0 to <4 x i32>
  %s1s = sext <4 x i16> %s1 to <4 x i32>
  %m = add <4 x i32> %s0s, %s1s
  %s = lshr <4 x i32> %m, <i32 1, i32 1, i32 1, i32 1>
  %s2 = trunc <4 x i32> %s to <4 x i16>
  ret <4 x i16> %s2
}
; Unsigned variant of the illegal-type v4i16 case: zero-extend, add, shift.
define arm_aapcs_vfpcc <4 x i16> @vhaddu_v4i16(<4 x i16> %s0, <4 x i16> %s1) {
; CHECK-LABEL: vhaddu_v4i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u16 q1, q1
; CHECK-NEXT:    vmovlb.u16 q0, q0
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vshr.u32 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <4 x i16> %s0 to <4 x i32>
  %s1s = zext <4 x i16> %s1 to <4 x i32>
  %m = add <4 x i32> %s0s, %s1s
  %s = lshr <4 x i32> %m, <i32 1, i32 1, i32 1, i32 1>
  %s2 = trunc <4 x i32> %s to <4 x i16>
  ret <4 x i16> %s2
}
; v8i16 signed halving add folds to a single vhadd.s16.
define arm_aapcs_vfpcc <8 x i16> @vhadds_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vhadds_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vhadd.s16 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <8 x i16> %s0 to <8 x i32>
  %s1s = sext <8 x i16> %s1 to <8 x i32>
  %m = add <8 x i32> %s0s, %s1s
  %s = lshr <8 x i32> %m, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %s2 = trunc <8 x i32> %s to <8 x i16>
  ret <8 x i16> %s2
}
; v8i16 unsigned halving add folds to a single vhadd.u16.
define arm_aapcs_vfpcc <8 x i16> @vhaddu_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vhaddu_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vhadd.u16 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <8 x i16> %s0 to <8 x i32>
  %s1s = zext <8 x i16> %s1 to <8 x i32>
  %m = add <8 x i32> %s0s, %s1s
  %s = lshr <8 x i32> %m, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %s2 = trunc <8 x i32> %s to <8 x i16>
  ret <8 x i16> %s2
}
; v4i8 is illegal: lanes are sign-extended in two steps (i8->i16->i32) and the
; halving add is done with separate add/shift instructions.
define arm_aapcs_vfpcc <4 x i8> @vhadds_v4i8(<4 x i8> %s0, <4 x i8> %s1) {
; CHECK-LABEL: vhadds_v4i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q1, q1
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vmovlb.u16 q0, q0
; CHECK-NEXT:    vshr.u32 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <4 x i8> %s0 to <4 x i16>
  %s1s = sext <4 x i8> %s1 to <4 x i16>
  %m = add <4 x i16> %s0s, %s1s
  %s = lshr <4 x i16> %m, <i16 1, i16 1, i16 1, i16 1>
  %s2 = trunc <4 x i16> %s to <4 x i8>
  ret <4 x i8> %s2
}
; Unsigned v4i8: the zero-extension is done with a mask (vand #0xff) instead
; of widening moves, then add and shift.
define arm_aapcs_vfpcc <4 x i8> @vhaddu_v4i8(<4 x i8> %s0, <4 x i8> %s1) {
; CHECK-LABEL: vhaddu_v4i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i32 q2, #0xff
; CHECK-NEXT:    vand q1, q1, q2
; CHECK-NEXT:    vand q0, q0, q2
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vshr.u32 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <4 x i8> %s0 to <4 x i16>
  %s1s = zext <4 x i8> %s1 to <4 x i16>
  %m = add <4 x i16> %s0s, %s1s
  %s = lshr <4 x i16> %m, <i16 1, i16 1, i16 1, i16 1>
  %s2 = trunc <4 x i16> %s to <4 x i8>
  ret <4 x i8> %s2
}
; v8i8 is illegal: extend to i16 lanes, then add/shift rather than vhadd.
define arm_aapcs_vfpcc <8 x i8> @vhadds_v8i8(<8 x i8> %s0, <8 x i8> %s1) {
; CHECK-LABEL: vhadds_v8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q1, q1
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vadd.i16 q0, q0, q1
; CHECK-NEXT:    vshr.u16 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <8 x i8> %s0 to <8 x i16>
  %s1s = sext <8 x i8> %s1 to <8 x i16>
  %m = add <8 x i16> %s0s, %s1s
  %s = lshr <8 x i16> %m, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %s2 = trunc <8 x i16> %s to <8 x i8>
  ret <8 x i8> %s2
}
; Unsigned variant of the illegal-type v8i8 case.
define arm_aapcs_vfpcc <8 x i8> @vhaddu_v8i8(<8 x i8> %s0, <8 x i8> %s1) {
; CHECK-LABEL: vhaddu_v8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u8 q1, q1
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    vadd.i16 q0, q0, q1
; CHECK-NEXT:    vshr.u16 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <8 x i8> %s0 to <8 x i16>
  %s1s = zext <8 x i8> %s1 to <8 x i16>
  %m = add <8 x i16> %s0s, %s1s
  %s = lshr <8 x i16> %m, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %s2 = trunc <8 x i16> %s to <8 x i8>
  ret <8 x i8> %s2
}
; v16i8 signed halving add folds to a single vhadd.s8.
define arm_aapcs_vfpcc <16 x i8> @vhadds_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
; CHECK-LABEL: vhadds_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vhadd.s8 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <16 x i8> %s0 to <16 x i16>
  %s1s = sext <16 x i8> %s1 to <16 x i16>
  %m = add <16 x i16> %s0s, %s1s
  %s = lshr <16 x i16> %m, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %s2 = trunc <16 x i16> %s to <16 x i8>
  ret <16 x i8> %s2
}
; v16i8 unsigned halving add folds to a single vhadd.u8.
define arm_aapcs_vfpcc <16 x i8> @vhaddu_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
; CHECK-LABEL: vhaddu_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vhadd.u8 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <16 x i8> %s0 to <16 x i16>
  %s1s = zext <16 x i8> %s1 to <16 x i16>
  %m = add <16 x i16> %s0s, %s1s
  %s = lshr <16 x i16> %m, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %s2 = trunc <16 x i16> %s to <16 x i8>
  ret <16 x i8> %s2
}
; Rounding variant: (a + b + 1) >> 1 selects vrhadd.s32.
define arm_aapcs_vfpcc <4 x i32> @vrhadds_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
; CHECK-LABEL: vrhadds_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vrhadd.s32 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <4 x i32> %s0 to <4 x i64>
  %s1s = sext <4 x i32> %s1 to <4 x i64>
  %add = add <4 x i64> %s0s, %s1s
  %add2 = add <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1>
  %s = lshr <4 x i64> %add2, <i64 1, i64 1, i64 1, i64 1>
  %result = trunc <4 x i64> %s to <4 x i32>
  ret <4 x i32> %result
}
; Unsigned rounding halving add selects vrhadd.u32.
define arm_aapcs_vfpcc <4 x i32> @vrhaddu_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
; CHECK-LABEL: vrhaddu_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vrhadd.u32 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <4 x i32> %s0 to <4 x i64>
  %s1s = zext <4 x i32> %s1 to <4 x i64>
  %add = add <4 x i64> %s0s, %s1s
  %add2 = add <4 x i64> %add, <i64 1, i64 1, i64 1, i64 1>
  %s = lshr <4 x i64> %add2, <i64 1, i64 1, i64 1, i64 1>
  %result = trunc <4 x i64> %s to <4 x i32>
  ret <4 x i32> %result
}
; Illegal v4i16 rounding case: the +1 is materialised in a GPR and added with
; vadd.i32 before the shift, so no vrhadd is formed.
define arm_aapcs_vfpcc <4 x i16> @vrhadds_v4i16(<4 x i16> %s0, <4 x i16> %s1) {
; CHECK-LABEL: vrhadds_v4i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    movs r0, #1
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:    vshr.u32 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <4 x i16> %s0 to <4 x i32>
  %s1s = sext <4 x i16> %s1 to <4 x i32>
  %add = add <4 x i32> %s0s, %s1s
  %add2 = add <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
  %s = lshr <4 x i32> %add2, <i32 1, i32 1, i32 1, i32 1>
  %result = trunc <4 x i32> %s to <4 x i16>
  ret <4 x i16> %result
}
; Unsigned variant of the illegal v4i16 rounding case.
define arm_aapcs_vfpcc <4 x i16> @vrhaddu_v4i16(<4 x i16> %s0, <4 x i16> %s1) {
; CHECK-LABEL: vrhaddu_v4i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u16 q1, q1
; CHECK-NEXT:    vmovlb.u16 q0, q0
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    movs r0, #1
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:    vshr.u32 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <4 x i16> %s0 to <4 x i32>
  %s1s = zext <4 x i16> %s1 to <4 x i32>
  %add = add <4 x i32> %s0s, %s1s
  %add2 = add <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
  %s = lshr <4 x i32> %add2, <i32 1, i32 1, i32 1, i32 1>
  %result = trunc <4 x i32> %s to <4 x i16>
  ret <4 x i16> %result
}
; v8i16 signed rounding halving add folds to vrhadd.s16.
define arm_aapcs_vfpcc <8 x i16> @vrhadds_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vrhadds_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vrhadd.s16 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <8 x i16> %s0 to <8 x i32>
  %s1s = sext <8 x i16> %s1 to <8 x i32>
  %add = add <8 x i32> %s0s, %s1s
  %add2 = add <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %s = lshr <8 x i32> %add2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %s to <8 x i16>
  ret <8 x i16> %result
}
; v8i16 unsigned rounding halving add folds to vrhadd.u16.
define arm_aapcs_vfpcc <8 x i16> @vrhaddu_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vrhaddu_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vrhadd.u16 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <8 x i16> %s0 to <8 x i32>
  %s1s = zext <8 x i16> %s1 to <8 x i32>
  %add = add <8 x i32> %s0s, %s1s
  %add2 = add <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %s = lshr <8 x i32> %add2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %s to <8 x i16>
  ret <8 x i16> %result
}
; Illegal v4i8 rounding case: double widening moves, scalar +1, then shift.
define arm_aapcs_vfpcc <4 x i8> @vrhadds_v4i8(<4 x i8> %s0, <4 x i8> %s1) {
; CHECK-LABEL: vrhadds_v4i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q1, q1
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    movs r0, #1
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:    vmovlb.u16 q0, q0
; CHECK-NEXT:    vshr.u32 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <4 x i8> %s0 to <4 x i16>
  %s1s = sext <4 x i8> %s1 to <4 x i16>
  %add = add <4 x i16> %s0s, %s1s
  %add2 = add <4 x i16> %add, <i16 1, i16 1, i16 1, i16 1>
  %s = lshr <4 x i16> %add2, <i16 1, i16 1, i16 1, i16 1>
  %result = trunc <4 x i16> %s to <4 x i8>
  ret <4 x i8> %result
}
; Unsigned v4i8 rounding case: mask-based zero-extension plus scalar +1.
define arm_aapcs_vfpcc <4 x i8> @vrhaddu_v4i8(<4 x i8> %s0, <4 x i8> %s1) {
; CHECK-LABEL: vrhaddu_v4i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i32 q2, #0xff
; CHECK-NEXT:    movs r0, #1
; CHECK-NEXT:    vand q1, q1, q2
; CHECK-NEXT:    vand q0, q0, q2
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:    vshr.u32 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <4 x i8> %s0 to <4 x i16>
  %s1s = zext <4 x i8> %s1 to <4 x i16>
  %add = add <4 x i16> %s0s, %s1s
  %add2 = add <4 x i16> %add, <i16 1, i16 1, i16 1, i16 1>
  %s = lshr <4 x i16> %add2, <i16 1, i16 1, i16 1, i16 1>
  %result = trunc <4 x i16> %s to <4 x i8>
  ret <4 x i8> %result
}
; Illegal v8i8 rounding case: extend to i16 lanes, add, scalar +1, shift.
define arm_aapcs_vfpcc <8 x i8> @vrhadds_v8i8(<8 x i8> %s0, <8 x i8> %s1) {
; CHECK-LABEL: vrhadds_v8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q1, q1
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vadd.i16 q0, q0, q1
; CHECK-NEXT:    movs r0, #1
; CHECK-NEXT:    vadd.i16 q0, q0, r0
; CHECK-NEXT:    vshr.u16 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <8 x i8> %s0 to <8 x i16>
  %s1s = sext <8 x i8> %s1 to <8 x i16>
  %add = add <8 x i16> %s0s, %s1s
  %add2 = add <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %s = lshr <8 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <8 x i16> %s to <8 x i8>
  ret <8 x i8> %result
}
; Unsigned variant of the illegal v8i8 rounding case.
define arm_aapcs_vfpcc <8 x i8> @vrhaddu_v8i8(<8 x i8> %s0, <8 x i8> %s1) {
; CHECK-LABEL: vrhaddu_v8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u8 q1, q1
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    vadd.i16 q0, q0, q1
; CHECK-NEXT:    movs r0, #1
; CHECK-NEXT:    vadd.i16 q0, q0, r0
; CHECK-NEXT:    vshr.u16 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <8 x i8> %s0 to <8 x i16>
  %s1s = zext <8 x i8> %s1 to <8 x i16>
  %add = add <8 x i16> %s0s, %s1s
  %add2 = add <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %s = lshr <8 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <8 x i16> %s to <8 x i8>
  ret <8 x i8> %result
}
; v16i8 signed rounding halving add folds to vrhadd.s8.
define arm_aapcs_vfpcc <16 x i8> @vrhadds_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
; CHECK-LABEL: vrhadds_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vrhadd.s8 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <16 x i8> %s0 to <16 x i16>
  %s1s = sext <16 x i8> %s1 to <16 x i16>
  %add = add <16 x i16> %s0s, %s1s
  %add2 = add <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %s = lshr <16 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <16 x i16> %s to <16 x i8>
  ret <16 x i8> %result
}
; v16i8 unsigned rounding halving add folds to vrhadd.u8.
define arm_aapcs_vfpcc <16 x i8> @vrhaddu_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
; CHECK-LABEL: vrhaddu_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vrhadd.u8 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <16 x i8> %s0 to <16 x i16>
  %s1s = zext <16 x i8> %s1 to <16 x i16>
  %add = add <16 x i16> %s0s, %s1s
  %add2 = add <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %s = lshr <16 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <16 x i16> %s to <16 x i8>
  ret <16 x i8> %result
}
; Vectorized i8 loop: widening sext + add + lshr folds to vhadd.s8 inside a
; low-overhead loop (1024 elements / 16 lanes = 64 iterations).
define void @vhadd_loop_s8(i8* nocapture readonly %x, i8* nocapture readonly %y, i8* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vhadd_loop_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:  .LBB24_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
; CHECK-NEXT:    vhadd.s8 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB24_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, i8* %x, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %1, align 1
  %2 = sext <16 x i8> %wide.load to <16 x i16>
  %3 = getelementptr inbounds i8, i8* %y, i32 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.load16 = load <16 x i8>, <16 x i8>* %4, align 1
  %5 = sext <16 x i8> %wide.load16 to <16 x i16>
  %6 = add nsw <16 x i16> %5, %2
  %7 = lshr <16 x i16> %6, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %8 = trunc <16 x i16> %7 to <16 x i8>
  %9 = getelementptr inbounds i8, i8* %z, i32 %index
  %10 = bitcast i8* %9 to <16 x i8>*
  store <16 x i8> %8, <16 x i8>* %10, align 1
  %index.next = add i32 %index, 16
  %11 = icmp eq i32 %index.next, 1024
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
; Vectorized i16 loop: sext + add + lshr folds to vhadd.s16 (128 iterations).
define void @vhadd_loop_s16(i16* nocapture readonly %x, i16* nocapture readonly %y, i16* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vhadd_loop_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:  .LBB25_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
; CHECK-NEXT:    vhadd.s16 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB25_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, i16* %x, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.load = load <8 x i16>, <8 x i16>* %1, align 2
  %2 = sext <8 x i16> %wide.load to <8 x i32>
  %3 = getelementptr inbounds i16, i16* %y, i32 %index
  %4 = bitcast i16* %3 to <8 x i16>*
  %wide.load16 = load <8 x i16>, <8 x i16>* %4, align 2
  %5 = sext <8 x i16> %wide.load16 to <8 x i32>
  %6 = add nsw <8 x i32> %5, %2
  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <8 x i32> %7 to <8 x i16>
  %9 = getelementptr inbounds i16, i16* %z, i32 %index
  %10 = bitcast i16* %9 to <8 x i16>*
  store <8 x i16> %8, <8 x i16>* %10, align 2
  %index.next = add i32 %index, 8
  %11 = icmp eq i32 %index.next, 1024
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
; Vectorized i32 loop: sext to i64 + add + lshr folds to vhadd.s32 (256 iterations).
define void @vhadd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vhadd_loop_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #256
; CHECK-NEXT:  .LBB26_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    vhadd.s32 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB26_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = sext <4 x i32> %wide.load to <4 x i64>
  %3 = getelementptr inbounds i32, i32* %y, i32 %index
  %4 = bitcast i32* %3 to <4 x i32>*
  %wide.load16 = load <4 x i32>, <4 x i32>* %4, align 4
  %5 = sext <4 x i32> %wide.load16 to <4 x i64>
  %6 = add nsw <4 x i64> %5, %2
  %7 = lshr <4 x i64> %6, <i64 1, i64 1, i64 1, i64 1>
  %8 = trunc <4 x i64> %7 to <4 x i32>
  %9 = getelementptr inbounds i32, i32* %z, i32 %index
  %10 = bitcast i32* %9 to <4 x i32>*
  store <4 x i32> %8, <4 x i32>* %10, align 4
  %index.next = add i32 %index, 4
  %11 = icmp eq i32 %index.next, 1024
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
; Unsigned i8 loop: zext + add + lshr folds to vhadd.u8.
define void @vhadd_loop_u8(i8* nocapture readonly %x, i8* nocapture readonly %y, i8* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vhadd_loop_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:  .LBB27_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
; CHECK-NEXT:    vhadd.u8 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB27_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, i8* %x, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %1, align 1
  %2 = zext <16 x i8> %wide.load to <16 x i16>
  %3 = getelementptr inbounds i8, i8* %y, i32 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.load16 = load <16 x i8>, <16 x i8>* %4, align 1
  %5 = zext <16 x i8> %wide.load16 to <16 x i16>
  %6 = add nuw nsw <16 x i16> %5, %2
  %7 = lshr <16 x i16> %6, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %8 = trunc <16 x i16> %7 to <16 x i8>
  %9 = getelementptr inbounds i8, i8* %z, i32 %index
  %10 = bitcast i8* %9 to <16 x i8>*
  store <16 x i8> %8, <16 x i8>* %10, align 1
  %index.next = add i32 %index, 16
  %11 = icmp eq i32 %index.next, 1024
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
; Unsigned i16 loop: zext + add + lshr folds to vhadd.u16.
define void @vhadd_loop_u16(i16* nocapture readonly %x, i16* nocapture readonly %y, i16* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vhadd_loop_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:  .LBB28_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
; CHECK-NEXT:    vhadd.u16 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB28_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, i16* %x, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.load = load <8 x i16>, <8 x i16>* %1, align 2
  %2 = zext <8 x i16> %wide.load to <8 x i32>
  %3 = getelementptr inbounds i16, i16* %y, i32 %index
  %4 = bitcast i16* %3 to <8 x i16>*
  %wide.load16 = load <8 x i16>, <8 x i16>* %4, align 2
  %5 = zext <8 x i16> %wide.load16 to <8 x i32>
  %6 = add nuw nsw <8 x i32> %5, %2
  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <8 x i32> %7 to <8 x i16>
  %9 = getelementptr inbounds i16, i16* %z, i32 %index
  %10 = bitcast i16* %9 to <8 x i16>*
  store <8 x i16> %8, <8 x i16>* %10, align 2
  %index.next = add i32 %index, 8
  %11 = icmp eq i32 %index.next, 1024
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
; Unsigned i32 loop: zext to i64 + add + lshr folds to vhadd.u32.
define void @vhadd_loop_u32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vhadd_loop_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #256
; CHECK-NEXT:  .LBB29_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    vhadd.u32 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB29_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = zext <4 x i32> %wide.load to <4 x i64>
  %3 = getelementptr inbounds i32, i32* %y, i32 %index
  %4 = bitcast i32* %3 to <4 x i32>*
  %wide.load16 = load <4 x i32>, <4 x i32>* %4, align 4
  %5 = zext <4 x i32> %wide.load16 to <4 x i64>
  %6 = add nuw nsw <4 x i64> %5, %2
  %7 = lshr <4 x i64> %6, <i64 1, i64 1, i64 1, i64 1>
  %8 = trunc <4 x i64> %7 to <4 x i32>
  %9 = getelementptr inbounds i32, i32* %z, i32 %index
  %10 = bitcast i32* %9 to <4 x i32>*
  store <4 x i32> %8, <4 x i32>* %10, align 4
  %index.next = add i32 %index, 4
  %11 = icmp eq i32 %index.next, 1024
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
; Rounding i8 loop: (a + 1) + b >> 1 on zext'd lanes folds to vrhadd.u8
; (the IR is unsigned despite the _s8 name).
define void @vrhadd_loop_s8(i8* nocapture readonly %x, i8* nocapture readonly %y, i8* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vrhadd_loop_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:  .LBB30_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
; CHECK-NEXT:    vldrb.u8 q1, [r0], #16
; CHECK-NEXT:    vrhadd.u8 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB30_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, i8* %x, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %1, align 1
  %2 = zext <16 x i8> %wide.load to <16 x i16>
  %3 = getelementptr inbounds i8, i8* %y, i32 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.load16 = load <16 x i8>, <16 x i8>* %4, align 1
  %5 = zext <16 x i8> %wide.load16 to <16 x i16>
  %6 = add nuw nsw <16 x i16> %2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %7 = add nuw nsw <16 x i16> %6, %5
  %8 = lshr <16 x i16> %7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %9 = trunc <16 x i16> %8 to <16 x i8>
  %10 = getelementptr inbounds i8, i8* %z, i32 %index
  %11 = bitcast i8* %10 to <16 x i8>*
  store <16 x i8> %9, <16 x i8>* %11, align 1
  %index.next = add i32 %index, 16
  %12 = icmp eq i32 %index.next, 1024
  br i1 %12, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
; Rounding i16 loop (unsigned IR): folds to vrhadd.u16.
define void @vrhadd_loop_s16(i16* nocapture readonly %x, i16* nocapture readonly %y, i16* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vrhadd_loop_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:  .LBB31_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r1], #16
; CHECK-NEXT:    vldrh.u16 q1, [r0], #16
; CHECK-NEXT:    vrhadd.u16 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB31_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, i16* %x, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.load = load <8 x i16>, <8 x i16>* %1, align 2
  %2 = zext <8 x i16> %wide.load to <8 x i32>
  %3 = getelementptr inbounds i16, i16* %y, i32 %index
  %4 = bitcast i16* %3 to <8 x i16>*
  %wide.load16 = load <8 x i16>, <8 x i16>* %4, align 2
  %5 = zext <8 x i16> %wide.load16 to <8 x i32>
  %6 = add nuw nsw <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = add nuw nsw <8 x i32> %6, %5
  %8 = lshr <8 x i32> %7, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %9 = trunc <8 x i32> %8 to <8 x i16>
  %10 = getelementptr inbounds i16, i16* %z, i32 %index
  %11 = bitcast i16* %10 to <8 x i16>*
  store <8 x i16> %9, <8 x i16>* %11, align 2
  %index.next = add i32 %index, 8
  %12 = icmp eq i32 %index.next, 1024
  br i1 %12, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
; Rounding i32 loop (unsigned IR): folds to vrhadd.u32.
define void @vrhadd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vrhadd_loop_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #256
; CHECK-NEXT:  .LBB32_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vrhadd.u32 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB32_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = zext <4 x i32> %wide.load to <4 x i64>
  %3 = getelementptr inbounds i32, i32* %y, i32 %index
  %4 = bitcast i32* %3 to <4 x i32>*
  %wide.load16 = load <4 x i32>, <4 x i32>* %4, align 4
  %5 = zext <4 x i32> %wide.load16 to <4 x i64>
  %6 = add nuw nsw <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1>
  %7 = add nuw nsw <4 x i64> %6, %5
  %8 = lshr <4 x i64> %7, <i64 1, i64 1, i64 1, i64 1>
  %9 = trunc <4 x i64> %8 to <4 x i32>
  %10 = getelementptr inbounds i32, i32* %z, i32 %index
  %11 = bitcast i32* %10 to <4 x i32>*
  store <4 x i32> %9, <4 x i32>* %11, align 4
  %index.next = add i32 %index, 4
  %12 = icmp eq i32 %index.next, 1024
  br i1 %12, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
; Unsigned rounding i8 loop: folds to vrhadd.u8.
define void @vrhadd_loop_u8(i8* nocapture readonly %x, i8* nocapture readonly %y, i8* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vrhadd_loop_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:  .LBB33_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
; CHECK-NEXT:    vldrb.u8 q1, [r0], #16
; CHECK-NEXT:    vrhadd.u8 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB33_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, i8* %x, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %1, align 1
  %2 = zext <16 x i8> %wide.load to <16 x i16>
  %3 = getelementptr inbounds i8, i8* %y, i32 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.load16 = load <16 x i8>, <16 x i8>* %4, align 1
  %5 = zext <16 x i8> %wide.load16 to <16 x i16>
  %6 = add nuw nsw <16 x i16> %2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %7 = add nuw nsw <16 x i16> %6, %5
  %8 = lshr <16 x i16> %7, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %9 = trunc <16 x i16> %8 to <16 x i8>
  %10 = getelementptr inbounds i8, i8* %z, i32 %index
  %11 = bitcast i8* %10 to <16 x i8>*
  store <16 x i8> %9, <16 x i8>* %11, align 1
  %index.next = add i32 %index, 16
  %12 = icmp eq i32 %index.next, 1024
  br i1 %12, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
; Unsigned rounding i16 loop: folds to vrhadd.u16.
define void @vrhadd_loop_u16(i16* nocapture readonly %x, i16* nocapture readonly %y, i16* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vrhadd_loop_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:  .LBB34_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r1], #16
; CHECK-NEXT:    vldrh.u16 q1, [r0], #16
; CHECK-NEXT:    vrhadd.u16 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB34_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, i16* %x, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.load = load <8 x i16>, <8 x i16>* %1, align 2
  %2 = zext <8 x i16> %wide.load to <8 x i32>
  %3 = getelementptr inbounds i16, i16* %y, i32 %index
  %4 = bitcast i16* %3 to <8 x i16>*
  %wide.load16 = load <8 x i16>, <8 x i16>* %4, align 2
  %5 = zext <8 x i16> %wide.load16 to <8 x i32>
  %6 = add nuw nsw <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = add nuw nsw <8 x i32> %6, %5
  %8 = lshr <8 x i32> %7, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %9 = trunc <8 x i32> %8 to <8 x i16>
  %10 = getelementptr inbounds i16, i16* %z, i32 %index
  %11 = bitcast i16* %10 to <8 x i16>*
  store <8 x i16> %9, <8 x i16>* %11, align 2
  %index.next = add i32 %index, 8
  %12 = icmp eq i32 %index.next, 1024
  br i1 %12, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
; Unsigned rounding i32 loop: folds to vrhadd.u32.
define void @vrhadd_loop_u32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vrhadd_loop_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #256
; CHECK-NEXT:  .LBB35_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vrhadd.u32 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB35_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = zext <4 x i32> %wide.load to <4 x i64>
  %3 = getelementptr inbounds i32, i32* %y, i32 %index
  %4 = bitcast i32* %3 to <4 x i32>*
  %wide.load16 = load <4 x i32>, <4 x i32>* %4, align 4
  %5 = zext <4 x i32> %wide.load16 to <4 x i64>
  %6 = add nuw nsw <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1>
  %7 = add nuw nsw <4 x i64> %6, %5
  %8 = lshr <4 x i64> %7, <i64 1, i64 1, i64 1, i64 1>
  %9 = trunc <4 x i64> %8 to <4 x i32>
  %10 = getelementptr inbounds i32, i32* %z, i32 %index
  %11 = bitcast i32* %10 to <4 x i32>*
  store <4 x i32> %9, <4 x i32>* %11, align 4
  %index.next = add i32 %index, 4
  %12 = icmp eq i32 %index.next, 1024
  br i1 %12, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}