1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
; Signed halving add on <4 x i32>: both inputs are sign-extended to i64, added,
; shifted right by one and truncated. The expected output is a single vhadd.s32.
4 define arm_aapcs_vfpcc <4 x i32> @vhadds_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
5 ; CHECK-LABEL: vhadds_v4i32:
6 ; CHECK: @ %bb.0: @ %entry
7 ; CHECK-NEXT: vhadd.s32 q0, q0, q1
10 %s0s = sext <4 x i32> %s0 to <4 x i64>
11 %s1s = sext <4 x i32> %s1 to <4 x i64>
12 %m = add nsw <4 x i64> %s0s, %s1s
13 %s = lshr <4 x i64> %m, <i64 1, i64 1, i64 1, i64 1>
14 %s2 = trunc <4 x i64> %s to <4 x i32>
; Unsigned halving add on <4 x i32>: both inputs are zero-extended to i64, added,
; shifted right by one and truncated. The expected output is a single vhadd.u32.
18 define arm_aapcs_vfpcc <4 x i32> @vhaddu_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
19 ; CHECK-LABEL: vhaddu_v4i32:
20 ; CHECK: @ %bb.0: @ %entry
21 ; CHECK-NEXT: vhadd.u32 q0, q0, q1
24 %s0s = zext <4 x i32> %s0 to <4 x i64>
25 %s1s = zext <4 x i32> %s1 to <4 x i64>
26 %m = add nuw nsw <4 x i64> %s0s, %s1s
27 %s = lshr <4 x i64> %m, <i64 1, i64 1, i64 1, i64 1>
28 %s2 = trunc <4 x i64> %s to <4 x i32>
; Signed halving-add pattern on <4 x i16> (sext to i32, add, lshr by one, trunc).
; The expected output does not use vhadd here: the inputs are widened with
; vmovlb.s16 and the sum is formed with vadd.i32 followed by vshr.u32.
32 define arm_aapcs_vfpcc <4 x i16> @vhadds_v4i16(<4 x i16> %s0, <4 x i16> %s1) {
33 ; CHECK-LABEL: vhadds_v4i16:
34 ; CHECK: @ %bb.0: @ %entry
35 ; CHECK-NEXT: vmovlb.s16 q1, q1
36 ; CHECK-NEXT: vmovlb.s16 q0, q0
37 ; CHECK-NEXT: vadd.i32 q0, q0, q1
38 ; CHECK-NEXT: vshr.u32 q0, q0, #1
41 %s0s = sext <4 x i16> %s0 to <4 x i32>
42 %s1s = sext <4 x i16> %s1 to <4 x i32>
43 %m = add nsw <4 x i32> %s0s, %s1s
44 %s = lshr <4 x i32> %m, <i32 1, i32 1, i32 1, i32 1>
45 %s2 = trunc <4 x i32> %s to <4 x i16>
; Unsigned halving-add pattern on <4 x i16> (zext to i32, add, lshr by one, trunc).
; The expected output widens both inputs with vmovlb.u16 and then uses vhadd.u32.
49 define arm_aapcs_vfpcc <4 x i16> @vhaddu_v4i16(<4 x i16> %s0, <4 x i16> %s1) {
50 ; CHECK-LABEL: vhaddu_v4i16:
51 ; CHECK: @ %bb.0: @ %entry
52 ; CHECK-NEXT: vmovlb.u16 q1, q1
53 ; CHECK-NEXT: vmovlb.u16 q0, q0
54 ; CHECK-NEXT: vhadd.u32 q0, q0, q1
57 %s0s = zext <4 x i16> %s0 to <4 x i32>
58 %s1s = zext <4 x i16> %s1 to <4 x i32>
59 %m = add nuw nsw <4 x i32> %s0s, %s1s
60 %s = lshr <4 x i32> %m, <i32 1, i32 1, i32 1, i32 1>
61 %s2 = trunc <4 x i32> %s to <4 x i16>
; Signed halving add on <8 x i16>: sext to i32, add, lshr by one, trunc.
; The expected output is a single vhadd.s16.
65 define arm_aapcs_vfpcc <8 x i16> @vhadds_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
66 ; CHECK-LABEL: vhadds_v8i16:
67 ; CHECK: @ %bb.0: @ %entry
68 ; CHECK-NEXT: vhadd.s16 q0, q0, q1
71 %s0s = sext <8 x i16> %s0 to <8 x i32>
72 %s1s = sext <8 x i16> %s1 to <8 x i32>
73 %m = add nsw <8 x i32> %s0s, %s1s
74 %s = lshr <8 x i32> %m, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
75 %s2 = trunc <8 x i32> %s to <8 x i16>
; Unsigned halving add on <8 x i16>: zext to i32, add, lshr by one, trunc.
; The expected output is a single vhadd.u16.
79 define arm_aapcs_vfpcc <8 x i16> @vhaddu_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
80 ; CHECK-LABEL: vhaddu_v8i16:
81 ; CHECK: @ %bb.0: @ %entry
82 ; CHECK-NEXT: vhadd.u16 q0, q0, q1
85 %s0s = zext <8 x i16> %s0 to <8 x i32>
86 %s1s = zext <8 x i16> %s1 to <8 x i32>
87 %m = add nuw nsw <8 x i32> %s0s, %s1s
88 %s = lshr <8 x i32> %m, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
89 %s2 = trunc <8 x i32> %s to <8 x i16>
; Signed halving-add pattern on <4 x i8> (sext to i16, add, lshr by one, trunc).
; The expected output does not use vhadd: inputs are widened via vmovlb.s8 and
; vmovlb.s16, added with vadd.i32, then vmovlb.u16 and vshr.u32 produce the result.
93 define arm_aapcs_vfpcc <4 x i8> @vhadds_v4i8(<4 x i8> %s0, <4 x i8> %s1) {
94 ; CHECK-LABEL: vhadds_v4i8:
95 ; CHECK: @ %bb.0: @ %entry
96 ; CHECK-NEXT: vmovlb.s8 q1, q1
97 ; CHECK-NEXT: vmovlb.s8 q0, q0
98 ; CHECK-NEXT: vmovlb.s16 q1, q1
99 ; CHECK-NEXT: vmovlb.s16 q0, q0
100 ; CHECK-NEXT: vadd.i32 q0, q0, q1
101 ; CHECK-NEXT: vmovlb.u16 q0, q0
102 ; CHECK-NEXT: vshr.u32 q0, q0, #1
105 %s0s = sext <4 x i8> %s0 to <4 x i16>
106 %s1s = sext <4 x i8> %s1 to <4 x i16>
107 %m = add nsw <4 x i16> %s0s, %s1s
108 %s = lshr <4 x i16> %m, <i16 1, i16 1, i16 1, i16 1>
109 %s2 = trunc <4 x i16> %s to <4 x i8>
; Unsigned halving-add pattern on <4 x i8> (zext to i16, add, lshr by one, trunc).
; The expected output masks both inputs with 0xff (vmov.i32 + vand) and then
; uses vhadd.u32.
113 define arm_aapcs_vfpcc <4 x i8> @vhaddu_v4i8(<4 x i8> %s0, <4 x i8> %s1) {
114 ; CHECK-LABEL: vhaddu_v4i8:
115 ; CHECK: @ %bb.0: @ %entry
116 ; CHECK-NEXT: vmov.i32 q2, #0xff
117 ; CHECK-NEXT: vand q1, q1, q2
118 ; CHECK-NEXT: vand q0, q0, q2
119 ; CHECK-NEXT: vhadd.u32 q0, q0, q1
122 %s0s = zext <4 x i8> %s0 to <4 x i16>
123 %s1s = zext <4 x i8> %s1 to <4 x i16>
124 %m = add nuw nsw <4 x i16> %s0s, %s1s
125 %s = lshr <4 x i16> %m, <i16 1, i16 1, i16 1, i16 1>
126 %s2 = trunc <4 x i16> %s to <4 x i8>
; Signed halving-add pattern on <8 x i8> (sext to i16, add, lshr by one, trunc).
; The expected output does not use vhadd: inputs are widened with vmovlb.s8 and
; the result is formed with vadd.i16 followed by vshr.u16.
130 define arm_aapcs_vfpcc <8 x i8> @vhadds_v8i8(<8 x i8> %s0, <8 x i8> %s1) {
131 ; CHECK-LABEL: vhadds_v8i8:
132 ; CHECK: @ %bb.0: @ %entry
133 ; CHECK-NEXT: vmovlb.s8 q1, q1
134 ; CHECK-NEXT: vmovlb.s8 q0, q0
135 ; CHECK-NEXT: vadd.i16 q0, q0, q1
136 ; CHECK-NEXT: vshr.u16 q0, q0, #1
139 %s0s = sext <8 x i8> %s0 to <8 x i16>
140 %s1s = sext <8 x i8> %s1 to <8 x i16>
141 %m = add nsw <8 x i16> %s0s, %s1s
142 %s = lshr <8 x i16> %m, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
143 %s2 = trunc <8 x i16> %s to <8 x i8>
; Unsigned halving-add pattern on <8 x i8> (zext to i16, add, lshr by one, trunc).
; The expected output widens both inputs with vmovlb.u8 and then uses vhadd.u16.
147 define arm_aapcs_vfpcc <8 x i8> @vhaddu_v8i8(<8 x i8> %s0, <8 x i8> %s1) {
148 ; CHECK-LABEL: vhaddu_v8i8:
149 ; CHECK: @ %bb.0: @ %entry
150 ; CHECK-NEXT: vmovlb.u8 q1, q1
151 ; CHECK-NEXT: vmovlb.u8 q0, q0
152 ; CHECK-NEXT: vhadd.u16 q0, q0, q1
155 %s0s = zext <8 x i8> %s0 to <8 x i16>
156 %s1s = zext <8 x i8> %s1 to <8 x i16>
157 %m = add nuw nsw <8 x i16> %s0s, %s1s
158 %s = lshr <8 x i16> %m, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
159 %s2 = trunc <8 x i16> %s to <8 x i8>
; Signed halving add on <16 x i8>: sext to i16, add, lshr by one, trunc.
; The expected output is a single vhadd.s8.
163 define arm_aapcs_vfpcc <16 x i8> @vhadds_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
164 ; CHECK-LABEL: vhadds_v16i8:
165 ; CHECK: @ %bb.0: @ %entry
166 ; CHECK-NEXT: vhadd.s8 q0, q0, q1
169 %s0s = sext <16 x i8> %s0 to <16 x i16>
170 %s1s = sext <16 x i8> %s1 to <16 x i16>
171 %m = add nsw <16 x i16> %s0s, %s1s
172 %s = lshr <16 x i16> %m, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
173 %s2 = trunc <16 x i16> %s to <16 x i8>
; Unsigned halving add on <16 x i8>: zext to i16, add, lshr by one, trunc.
; The expected output is a single vhadd.u8.
177 define arm_aapcs_vfpcc <16 x i8> @vhaddu_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
178 ; CHECK-LABEL: vhaddu_v16i8:
179 ; CHECK: @ %bb.0: @ %entry
180 ; CHECK-NEXT: vhadd.u8 q0, q0, q1
183 %s0s = zext <16 x i8> %s0 to <16 x i16>
184 %s1s = zext <16 x i8> %s1 to <16 x i16>
185 %m = add nuw nsw <16 x i16> %s0s, %s1s
186 %s = lshr <16 x i16> %m, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
187 %s2 = trunc <16 x i16> %s to <16 x i8>
; Signed rounding halving add on <4 x i32>: sext to i64, add one, add the second
; operand, lshr by one, trunc. The expected output is a single vrhadd.s32.
191 define arm_aapcs_vfpcc <4 x i32> @vrhadds_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
192 ; CHECK-LABEL: vrhadds_v4i32:
193 ; CHECK: @ %bb.0: @ %entry
194 ; CHECK-NEXT: vrhadd.s32 q0, q0, q1
197 %s0s = sext <4 x i32> %s0 to <4 x i64>
198 %s1s = sext <4 x i32> %s1 to <4 x i64>
199 %add = add nsw <4 x i64> %s0s, <i64 1, i64 1, i64 1, i64 1>
200 %add2 = add nsw <4 x i64> %add, %s1s
201 %s = lshr <4 x i64> %add2, <i64 1, i64 1, i64 1, i64 1>
202 %result = trunc <4 x i64> %s to <4 x i32>
203 ret <4 x i32> %result
; Unsigned rounding halving add on <4 x i32>: zext to i64, add one, add the second
; operand, lshr by one, trunc. The expected output is a single vrhadd.u32.
206 define arm_aapcs_vfpcc <4 x i32> @vrhaddu_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
207 ; CHECK-LABEL: vrhaddu_v4i32:
208 ; CHECK: @ %bb.0: @ %entry
209 ; CHECK-NEXT: vrhadd.u32 q0, q0, q1
212 %s0s = zext <4 x i32> %s0 to <4 x i64>
213 %s1s = zext <4 x i32> %s1 to <4 x i64>
214 %add = add nuw nsw <4 x i64> %s0s, <i64 1, i64 1, i64 1, i64 1>
215 %add2 = add nuw nsw <4 x i64> %add, %s1s
216 %s = lshr <4 x i64> %add2, <i64 1, i64 1, i64 1, i64 1>
217 %result = trunc <4 x i64> %s to <4 x i32>
218 ret <4 x i32> %result
; Signed rounding halving-add pattern on <4 x i16> (sext to i32, +1, add, lshr, trunc).
; The expected output does not use vrhadd: inputs are widened with vmovlb.s16,
; summed with two vadd.i32 (the second adds the constant 1 from r0), then vshr.u32.
221 define arm_aapcs_vfpcc <4 x i16> @vrhadds_v4i16(<4 x i16> %s0, <4 x i16> %s1) {
222 ; CHECK-LABEL: vrhadds_v4i16:
223 ; CHECK: @ %bb.0: @ %entry
224 ; CHECK-NEXT: vmovlb.s16 q1, q1
225 ; CHECK-NEXT: vmovlb.s16 q0, q0
226 ; CHECK-NEXT: vadd.i32 q0, q0, q1
227 ; CHECK-NEXT: movs r0, #1
228 ; CHECK-NEXT: vadd.i32 q0, q0, r0
229 ; CHECK-NEXT: vshr.u32 q0, q0, #1
232 %s0s = sext <4 x i16> %s0 to <4 x i32>
233 %s1s = sext <4 x i16> %s1 to <4 x i32>
234 %add = add nsw <4 x i32> %s0s, <i32 1, i32 1, i32 1, i32 1>
235 %add2 = add nsw <4 x i32> %add, %s1s
236 %s = lshr <4 x i32> %add2, <i32 1, i32 1, i32 1, i32 1>
237 %result = trunc <4 x i32> %s to <4 x i16>
238 ret <4 x i16> %result
; Unsigned rounding halving-add pattern on <4 x i16> (zext to i32, +1, add, lshr, trunc).
; The expected output widens both inputs with vmovlb.u16 and then uses vrhadd.u32.
241 define arm_aapcs_vfpcc <4 x i16> @vrhaddu_v4i16(<4 x i16> %s0, <4 x i16> %s1) {
242 ; CHECK-LABEL: vrhaddu_v4i16:
243 ; CHECK: @ %bb.0: @ %entry
244 ; CHECK-NEXT: vmovlb.u16 q1, q1
245 ; CHECK-NEXT: vmovlb.u16 q0, q0
246 ; CHECK-NEXT: vrhadd.u32 q0, q0, q1
249 %s0s = zext <4 x i16> %s0 to <4 x i32>
250 %s1s = zext <4 x i16> %s1 to <4 x i32>
251 %add = add nuw nsw <4 x i32> %s0s, <i32 1, i32 1, i32 1, i32 1>
252 %add2 = add nuw nsw <4 x i32> %add, %s1s
253 %s = lshr <4 x i32> %add2, <i32 1, i32 1, i32 1, i32 1>
254 %result = trunc <4 x i32> %s to <4 x i16>
255 ret <4 x i16> %result
; Signed rounding halving add on <8 x i16>: sext to i32, add one, add the second
; operand, lshr by one, trunc. The expected output is a single vrhadd.s16.
258 define arm_aapcs_vfpcc <8 x i16> @vrhadds_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
259 ; CHECK-LABEL: vrhadds_v8i16:
260 ; CHECK: @ %bb.0: @ %entry
261 ; CHECK-NEXT: vrhadd.s16 q0, q0, q1
264 %s0s = sext <8 x i16> %s0 to <8 x i32>
265 %s1s = sext <8 x i16> %s1 to <8 x i32>
266 %add = add nsw <8 x i32> %s0s, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
267 %add2 = add nsw <8 x i32> %add, %s1s
268 %s = lshr <8 x i32> %add2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
269 %result = trunc <8 x i32> %s to <8 x i16>
270 ret <8 x i16> %result
; Unsigned rounding halving add on <8 x i16>: zext to i32, add one, add the second
; operand, lshr by one, trunc. The expected output is a single vrhadd.u16.
273 define arm_aapcs_vfpcc <8 x i16> @vrhaddu_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
274 ; CHECK-LABEL: vrhaddu_v8i16:
275 ; CHECK: @ %bb.0: @ %entry
276 ; CHECK-NEXT: vrhadd.u16 q0, q0, q1
279 %s0s = zext <8 x i16> %s0 to <8 x i32>
280 %s1s = zext <8 x i16> %s1 to <8 x i32>
281 %add = add nuw nsw <8 x i32> %s0s, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
282 %add2 = add nuw nsw <8 x i32> %add, %s1s
283 %s = lshr <8 x i32> %add2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
284 %result = trunc <8 x i32> %s to <8 x i16>
285 ret <8 x i16> %result
; Signed rounding halving-add pattern on <4 x i8> (sext to i16, +1, add, lshr, trunc).
; The expected output does not use vrhadd: inputs are widened via vmovlb.s8 and
; vmovlb.s16, summed with two vadd.i32 (the second adds the constant 1 from r0),
; then vmovlb.u16 and vshr.u32 produce the result.
288 define arm_aapcs_vfpcc <4 x i8> @vrhadds_v4i8(<4 x i8> %s0, <4 x i8> %s1) {
289 ; CHECK-LABEL: vrhadds_v4i8:
290 ; CHECK: @ %bb.0: @ %entry
291 ; CHECK-NEXT: vmovlb.s8 q1, q1
292 ; CHECK-NEXT: vmovlb.s8 q0, q0
293 ; CHECK-NEXT: vmovlb.s16 q1, q1
294 ; CHECK-NEXT: vmovlb.s16 q0, q0
295 ; CHECK-NEXT: vadd.i32 q0, q0, q1
296 ; CHECK-NEXT: movs r0, #1
297 ; CHECK-NEXT: vadd.i32 q0, q0, r0
298 ; CHECK-NEXT: vmovlb.u16 q0, q0
299 ; CHECK-NEXT: vshr.u32 q0, q0, #1
302 %s0s = sext <4 x i8> %s0 to <4 x i16>
303 %s1s = sext <4 x i8> %s1 to <4 x i16>
304 %add = add nsw <4 x i16> %s0s, <i16 1, i16 1, i16 1, i16 1>
305 %add2 = add nsw <4 x i16> %add, %s1s
306 %s = lshr <4 x i16> %add2, <i16 1, i16 1, i16 1, i16 1>
307 %result = trunc <4 x i16> %s to <4 x i8>
; Unsigned rounding halving-add pattern on <4 x i8> (zext to i16, +1, add, lshr, trunc).
; The expected output masks both inputs with 0xff (vmov.i32 + vand) and then
; uses vrhadd.u32.
311 define arm_aapcs_vfpcc <4 x i8> @vrhaddu_v4i8(<4 x i8> %s0, <4 x i8> %s1) {
312 ; CHECK-LABEL: vrhaddu_v4i8:
313 ; CHECK: @ %bb.0: @ %entry
314 ; CHECK-NEXT: vmov.i32 q2, #0xff
315 ; CHECK-NEXT: vand q1, q1, q2
316 ; CHECK-NEXT: vand q0, q0, q2
317 ; CHECK-NEXT: vrhadd.u32 q0, q0, q1
320 %s0s = zext <4 x i8> %s0 to <4 x i16>
321 %s1s = zext <4 x i8> %s1 to <4 x i16>
322 %add = add nuw nsw <4 x i16> %s0s, <i16 1, i16 1, i16 1, i16 1>
323 %add2 = add nuw nsw <4 x i16> %add, %s1s
324 %s = lshr <4 x i16> %add2, <i16 1, i16 1, i16 1, i16 1>
325 %result = trunc <4 x i16> %s to <4 x i8>
; Signed rounding halving-add pattern on <8 x i8> (sext to i16, +1, add, lshr, trunc).
; The expected output does not use vrhadd: inputs are widened with vmovlb.s8,
; summed with two vadd.i16 (the second adds the constant 1 from r0), then vshr.u16.
329 define arm_aapcs_vfpcc <8 x i8> @vrhadds_v8i8(<8 x i8> %s0, <8 x i8> %s1) {
330 ; CHECK-LABEL: vrhadds_v8i8:
331 ; CHECK: @ %bb.0: @ %entry
332 ; CHECK-NEXT: vmovlb.s8 q1, q1
333 ; CHECK-NEXT: vmovlb.s8 q0, q0
334 ; CHECK-NEXT: vadd.i16 q0, q0, q1
335 ; CHECK-NEXT: movs r0, #1
336 ; CHECK-NEXT: vadd.i16 q0, q0, r0
337 ; CHECK-NEXT: vshr.u16 q0, q0, #1
340 %s0s = sext <8 x i8> %s0 to <8 x i16>
341 %s1s = sext <8 x i8> %s1 to <8 x i16>
342 %add = add nsw <8 x i16> %s0s, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
343 %add2 = add nsw <8 x i16> %add, %s1s
344 %s = lshr <8 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
345 %result = trunc <8 x i16> %s to <8 x i8>
; Unsigned rounding halving-add pattern on <8 x i8> (zext to i16, +1, add, lshr, trunc).
; The expected output widens both inputs with vmovlb.u8 and then uses vrhadd.u16.
349 define arm_aapcs_vfpcc <8 x i8> @vrhaddu_v8i8(<8 x i8> %s0, <8 x i8> %s1) {
350 ; CHECK-LABEL: vrhaddu_v8i8:
351 ; CHECK: @ %bb.0: @ %entry
352 ; CHECK-NEXT: vmovlb.u8 q1, q1
353 ; CHECK-NEXT: vmovlb.u8 q0, q0
354 ; CHECK-NEXT: vrhadd.u16 q0, q0, q1
357 %s0s = zext <8 x i8> %s0 to <8 x i16>
358 %s1s = zext <8 x i8> %s1 to <8 x i16>
359 %add = add nuw nsw <8 x i16> %s0s, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
360 %add2 = add nuw nsw <8 x i16> %add, %s1s
361 %s = lshr <8 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
362 %result = trunc <8 x i16> %s to <8 x i8>
; Signed rounding halving add on <16 x i8>: sext to i16, add one, add the second
; operand, lshr by one, trunc. The expected output is a single vrhadd.s8.
366 define arm_aapcs_vfpcc <16 x i8> @vrhadds_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
367 ; CHECK-LABEL: vrhadds_v16i8:
368 ; CHECK: @ %bb.0: @ %entry
369 ; CHECK-NEXT: vrhadd.s8 q0, q0, q1
372 %s0s = sext <16 x i8> %s0 to <16 x i16>
373 %s1s = sext <16 x i8> %s1 to <16 x i16>
374 %add = add nsw <16 x i16> %s0s, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
375 %add2 = add nsw <16 x i16> %add, %s1s
376 %s = lshr <16 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
377 %result = trunc <16 x i16> %s to <16 x i8>
378 ret <16 x i8> %result
; Unsigned rounding halving add on <16 x i8>: zext to i16, add one, add the second
; operand, lshr by one, trunc. The expected output is a single vrhadd.u8.
381 define arm_aapcs_vfpcc <16 x i8> @vrhaddu_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
382 ; CHECK-LABEL: vrhaddu_v16i8:
383 ; CHECK: @ %bb.0: @ %entry
384 ; CHECK-NEXT: vrhadd.u8 q0, q0, q1
387 %s0s = zext <16 x i8> %s0 to <16 x i16>
388 %s1s = zext <16 x i8> %s1 to <16 x i16>
389 %add = add nuw nsw <16 x i16> %s0s, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
390 %add2 = add nuw nsw <16 x i16> %add, %s1s
391 %s = lshr <16 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
392 %result = trunc <16 x i16> %s to <16 x i8>
393 ret <16 x i8> %result
; Signed halving add inside a vector loop over i8 data: each iteration loads
; <16 x i8> from %x and %y, sign-extends, adds, shifts right by one, truncates
; and stores to %z. The index advances by 16 until it reaches 1024; the expected
; output sets lr to 64 and uses vhadd.s8 in a loop closed by `le lr`.
396 define void @vhadd_loop_s8(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture writeonly %z, i32 %n) {
397 ; CHECK-LABEL: vhadd_loop_s8:
398 ; CHECK: @ %bb.0: @ %entry
399 ; CHECK-NEXT: .save {r7, lr}
400 ; CHECK-NEXT: push {r7, lr}
401 ; CHECK-NEXT: mov.w lr, #64
402 ; CHECK-NEXT: .LBB24_1: @ %vector.body
403 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
404 ; CHECK-NEXT: vldrb.u8 q0, [r0], #16
405 ; CHECK-NEXT: vldrb.u8 q1, [r1], #16
406 ; CHECK-NEXT: vhadd.s8 q0, q1, q0
407 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
408 ; CHECK-NEXT: le lr, .LBB24_1
409 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
410 ; CHECK-NEXT: pop {r7, pc}
412 br label %vector.body
414 vector.body: ; preds = %vector.body, %entry
415 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
416 %0 = getelementptr inbounds i8, ptr %x, i32 %index
417 %wide.load = load <16 x i8>, ptr %0, align 1
418 %1 = sext <16 x i8> %wide.load to <16 x i16>
419 %2 = getelementptr inbounds i8, ptr %y, i32 %index
420 %wide.load16 = load <16 x i8>, ptr %2, align 1
421 %3 = sext <16 x i8> %wide.load16 to <16 x i16>
422 %4 = add nsw <16 x i16> %3, %1
423 %5 = lshr <16 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
424 %6 = trunc <16 x i16> %5 to <16 x i8>
425 %7 = getelementptr inbounds i8, ptr %z, i32 %index
426 store <16 x i8> %6, ptr %7, align 1
427 %index.next = add i32 %index, 16
428 %8 = icmp eq i32 %index.next, 1024
429 br i1 %8, label %for.cond.cleanup, label %vector.body
431 for.cond.cleanup: ; preds = %vector.body
; Signed halving add inside a vector loop over i16 data: each iteration loads
; <8 x i16> from %x and %y, sign-extends, adds, shifts right by one, truncates
; and stores to %z. The index advances by 8 until it reaches 1024; the expected
; output sets lr to 128 and uses vhadd.s16 in a loop closed by `le lr`.
435 define void @vhadd_loop_s16(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture writeonly %z, i32 %n) {
436 ; CHECK-LABEL: vhadd_loop_s16:
437 ; CHECK: @ %bb.0: @ %entry
438 ; CHECK-NEXT: .save {r7, lr}
439 ; CHECK-NEXT: push {r7, lr}
440 ; CHECK-NEXT: mov.w lr, #128
441 ; CHECK-NEXT: .LBB25_1: @ %vector.body
442 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
443 ; CHECK-NEXT: vldrh.u16 q0, [r0], #16
444 ; CHECK-NEXT: vldrh.u16 q1, [r1], #16
445 ; CHECK-NEXT: vhadd.s16 q0, q1, q0
446 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
447 ; CHECK-NEXT: le lr, .LBB25_1
448 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
449 ; CHECK-NEXT: pop {r7, pc}
451 br label %vector.body
453 vector.body: ; preds = %vector.body, %entry
454 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
455 %0 = getelementptr inbounds i16, ptr %x, i32 %index
456 %wide.load = load <8 x i16>, ptr %0, align 2
457 %1 = sext <8 x i16> %wide.load to <8 x i32>
458 %2 = getelementptr inbounds i16, ptr %y, i32 %index
459 %wide.load16 = load <8 x i16>, ptr %2, align 2
460 %3 = sext <8 x i16> %wide.load16 to <8 x i32>
461 %4 = add nsw <8 x i32> %3, %1
462 %5 = lshr <8 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
463 %6 = trunc <8 x i32> %5 to <8 x i16>
464 %7 = getelementptr inbounds i16, ptr %z, i32 %index
465 store <8 x i16> %6, ptr %7, align 2
466 %index.next = add i32 %index, 8
467 %8 = icmp eq i32 %index.next, 1024
468 br i1 %8, label %for.cond.cleanup, label %vector.body
470 for.cond.cleanup: ; preds = %vector.body
; Signed halving add inside a vector loop over i32 data: each iteration loads
; <4 x i32> from %x and %y, sign-extends to i64, adds, shifts right by one,
; truncates and stores to %z. The index advances by 4 until it reaches 1024; the
; expected output sets lr to 256 and uses vhadd.s32 in a loop closed by `le lr`.
474 define void @vhadd_loop_s32(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture writeonly %z, i32 %n) {
475 ; CHECK-LABEL: vhadd_loop_s32:
476 ; CHECK: @ %bb.0: @ %entry
477 ; CHECK-NEXT: .save {r7, lr}
478 ; CHECK-NEXT: push {r7, lr}
479 ; CHECK-NEXT: mov.w lr, #256
480 ; CHECK-NEXT: .LBB26_1: @ %vector.body
481 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
482 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
483 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16
484 ; CHECK-NEXT: vhadd.s32 q0, q1, q0
485 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
486 ; CHECK-NEXT: le lr, .LBB26_1
487 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
488 ; CHECK-NEXT: pop {r7, pc}
490 br label %vector.body
492 vector.body: ; preds = %vector.body, %entry
493 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
494 %0 = getelementptr inbounds i32, ptr %x, i32 %index
495 %wide.load = load <4 x i32>, ptr %0, align 4
496 %1 = sext <4 x i32> %wide.load to <4 x i64>
497 %2 = getelementptr inbounds i32, ptr %y, i32 %index
498 %wide.load16 = load <4 x i32>, ptr %2, align 4
499 %3 = sext <4 x i32> %wide.load16 to <4 x i64>
500 %4 = add nsw <4 x i64> %3, %1
501 %5 = lshr <4 x i64> %4, <i64 1, i64 1, i64 1, i64 1>
502 %6 = trunc <4 x i64> %5 to <4 x i32>
503 %7 = getelementptr inbounds i32, ptr %z, i32 %index
504 store <4 x i32> %6, ptr %7, align 4
505 %index.next = add i32 %index, 4
506 %8 = icmp eq i32 %index.next, 1024
507 br i1 %8, label %for.cond.cleanup, label %vector.body
509 for.cond.cleanup: ; preds = %vector.body
; Unsigned halving add inside a vector loop over i8 data: each iteration loads
; <16 x i8> from %x and %y, zero-extends, adds, shifts right by one, truncates
; and stores to %z. The index advances by 16 until it reaches 1024; the expected
; output sets lr to 64 and uses vhadd.u8 in a loop closed by `le lr`.
513 define void @vhadd_loop_u8(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture writeonly %z, i32 %n) {
514 ; CHECK-LABEL: vhadd_loop_u8:
515 ; CHECK: @ %bb.0: @ %entry
516 ; CHECK-NEXT: .save {r7, lr}
517 ; CHECK-NEXT: push {r7, lr}
518 ; CHECK-NEXT: mov.w lr, #64
519 ; CHECK-NEXT: .LBB27_1: @ %vector.body
520 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
521 ; CHECK-NEXT: vldrb.u8 q0, [r0], #16
522 ; CHECK-NEXT: vldrb.u8 q1, [r1], #16
523 ; CHECK-NEXT: vhadd.u8 q0, q1, q0
524 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
525 ; CHECK-NEXT: le lr, .LBB27_1
526 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
527 ; CHECK-NEXT: pop {r7, pc}
529 br label %vector.body
531 vector.body: ; preds = %vector.body, %entry
532 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
533 %0 = getelementptr inbounds i8, ptr %x, i32 %index
534 %wide.load = load <16 x i8>, ptr %0, align 1
535 %1 = zext <16 x i8> %wide.load to <16 x i16>
536 %2 = getelementptr inbounds i8, ptr %y, i32 %index
537 %wide.load16 = load <16 x i8>, ptr %2, align 1
538 %3 = zext <16 x i8> %wide.load16 to <16 x i16>
539 %4 = add nuw nsw <16 x i16> %3, %1
540 %5 = lshr <16 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
541 %6 = trunc <16 x i16> %5 to <16 x i8>
542 %7 = getelementptr inbounds i8, ptr %z, i32 %index
543 store <16 x i8> %6, ptr %7, align 1
544 %index.next = add i32 %index, 16
545 %8 = icmp eq i32 %index.next, 1024
546 br i1 %8, label %for.cond.cleanup, label %vector.body
548 for.cond.cleanup: ; preds = %vector.body
; Unsigned halving add inside a vector loop over i16 data: each iteration loads
; <8 x i16> from %x and %y, zero-extends, adds, shifts right by one, truncates
; and stores to %z. The index advances by 8 until it reaches 1024; the expected
; output sets lr to 128 and uses vhadd.u16 in a loop closed by `le lr`.
552 define void @vhadd_loop_u16(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture writeonly %z, i32 %n) {
553 ; CHECK-LABEL: vhadd_loop_u16:
554 ; CHECK: @ %bb.0: @ %entry
555 ; CHECK-NEXT: .save {r7, lr}
556 ; CHECK-NEXT: push {r7, lr}
557 ; CHECK-NEXT: mov.w lr, #128
558 ; CHECK-NEXT: .LBB28_1: @ %vector.body
559 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
560 ; CHECK-NEXT: vldrh.u16 q0, [r0], #16
561 ; CHECK-NEXT: vldrh.u16 q1, [r1], #16
562 ; CHECK-NEXT: vhadd.u16 q0, q1, q0
563 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
564 ; CHECK-NEXT: le lr, .LBB28_1
565 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
566 ; CHECK-NEXT: pop {r7, pc}
568 br label %vector.body
570 vector.body: ; preds = %vector.body, %entry
571 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
572 %0 = getelementptr inbounds i16, ptr %x, i32 %index
573 %wide.load = load <8 x i16>, ptr %0, align 2
574 %1 = zext <8 x i16> %wide.load to <8 x i32>
575 %2 = getelementptr inbounds i16, ptr %y, i32 %index
576 %wide.load16 = load <8 x i16>, ptr %2, align 2
577 %3 = zext <8 x i16> %wide.load16 to <8 x i32>
578 %4 = add nuw nsw <8 x i32> %3, %1
579 %5 = lshr <8 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
580 %6 = trunc <8 x i32> %5 to <8 x i16>
581 %7 = getelementptr inbounds i16, ptr %z, i32 %index
582 store <8 x i16> %6, ptr %7, align 2
583 %index.next = add i32 %index, 8
584 %8 = icmp eq i32 %index.next, 1024
585 br i1 %8, label %for.cond.cleanup, label %vector.body
587 for.cond.cleanup: ; preds = %vector.body
; Unsigned halving add inside a vector loop over i32 data: each iteration loads
; <4 x i32> from %x and %y, zero-extends to i64, adds, shifts right by one,
; truncates and stores to %z. The index advances by 4 until it reaches 1024; the
; expected output sets lr to 256 and uses vhadd.u32 in a loop closed by `le lr`.
591 define void @vhadd_loop_u32(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture writeonly %z, i32 %n) {
592 ; CHECK-LABEL: vhadd_loop_u32:
593 ; CHECK: @ %bb.0: @ %entry
594 ; CHECK-NEXT: .save {r7, lr}
595 ; CHECK-NEXT: push {r7, lr}
596 ; CHECK-NEXT: mov.w lr, #256
597 ; CHECK-NEXT: .LBB29_1: @ %vector.body
598 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
599 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
600 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16
601 ; CHECK-NEXT: vhadd.u32 q0, q1, q0
602 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
603 ; CHECK-NEXT: le lr, .LBB29_1
604 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
605 ; CHECK-NEXT: pop {r7, pc}
607 br label %vector.body
609 vector.body: ; preds = %vector.body, %entry
610 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
611 %0 = getelementptr inbounds i32, ptr %x, i32 %index
612 %wide.load = load <4 x i32>, ptr %0, align 4
613 %1 = zext <4 x i32> %wide.load to <4 x i64>
614 %2 = getelementptr inbounds i32, ptr %y, i32 %index
615 %wide.load16 = load <4 x i32>, ptr %2, align 4
616 %3 = zext <4 x i32> %wide.load16 to <4 x i64>
617 %4 = add nuw nsw <4 x i64> %3, %1
618 %5 = lshr <4 x i64> %4, <i64 1, i64 1, i64 1, i64 1>
619 %6 = trunc <4 x i64> %5 to <4 x i32>
620 %7 = getelementptr inbounds i32, ptr %z, i32 %index
621 store <4 x i32> %6, ptr %7, align 4
622 %index.next = add i32 %index, 4
623 %8 = icmp eq i32 %index.next, 1024
624 br i1 %8, label %for.cond.cleanup, label %vector.body
626 for.cond.cleanup: ; preds = %vector.body
; Signed rounding halving add inside a vector loop over i8 data: each iteration
; loads <16 x i8> from %x and %y, sign-extends, adds one plus the second operand,
; shifts right by one, truncates and stores to %z. The index advances by 16 until
; it reaches 1024.
; The extensions are sext (with nsw-only adds) so that this function exercises the
; signed instruction instead of repeating the body of vrhadd_loop_u8.
; NOTE(review): the expected vrhadd.s8 line was derived from vrhadds_v16i8 and
; vhadd_loop_s8; rerun utils/update_llc_test_checks.py to confirm the assertions.
630 define void @vrhadd_loop_s8(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture writeonly %z, i32 %n) {
631 ; CHECK-LABEL: vrhadd_loop_s8:
632 ; CHECK: @ %bb.0: @ %entry
633 ; CHECK-NEXT: .save {r7, lr}
634 ; CHECK-NEXT: push {r7, lr}
635 ; CHECK-NEXT: mov.w lr, #64
636 ; CHECK-NEXT: .LBB30_1: @ %vector.body
637 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
638 ; CHECK-NEXT: vldrb.u8 q0, [r1], #16
639 ; CHECK-NEXT: vldrb.u8 q1, [r0], #16
640 ; CHECK-NEXT: vrhadd.s8 q0, q1, q0
641 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
642 ; CHECK-NEXT: le lr, .LBB30_1
643 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
644 ; CHECK-NEXT: pop {r7, pc}
646 br label %vector.body
648 vector.body: ; preds = %vector.body, %entry
649 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
650 %0 = getelementptr inbounds i8, ptr %x, i32 %index
651 %wide.load = load <16 x i8>, ptr %0, align 1
652 %1 = sext <16 x i8> %wide.load to <16 x i16>
653 %2 = getelementptr inbounds i8, ptr %y, i32 %index
654 %wide.load16 = load <16 x i8>, ptr %2, align 1
655 %3 = sext <16 x i8> %wide.load16 to <16 x i16>
656 %4 = add nsw <16 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
657 %5 = add nsw <16 x i16> %4, %3
658 %6 = lshr <16 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
659 %7 = trunc <16 x i16> %6 to <16 x i8>
660 %8 = getelementptr inbounds i8, ptr %z, i32 %index
661 store <16 x i8> %7, ptr %8, align 1
662 %index.next = add i32 %index, 16
663 %9 = icmp eq i32 %index.next, 1024
664 br i1 %9, label %for.cond.cleanup, label %vector.body
666 for.cond.cleanup: ; preds = %vector.body
; Signed rounding halving add inside a vector loop over i16 data: each iteration
; loads <8 x i16> from %x and %y, sign-extends, adds one plus the second operand,
; shifts right by one, truncates and stores to %z. The index advances by 8 until
; it reaches 1024.
; The extensions are sext (with nsw-only adds) so that this function exercises the
; signed instruction instead of repeating the body of vrhadd_loop_u16.
; NOTE(review): the expected vrhadd.s16 line was derived from vrhadds_v8i16 and
; vhadd_loop_s16; rerun utils/update_llc_test_checks.py to confirm the assertions.
670 define void @vrhadd_loop_s16(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture writeonly %z, i32 %n) {
671 ; CHECK-LABEL: vrhadd_loop_s16:
672 ; CHECK: @ %bb.0: @ %entry
673 ; CHECK-NEXT: .save {r7, lr}
674 ; CHECK-NEXT: push {r7, lr}
675 ; CHECK-NEXT: mov.w lr, #128
676 ; CHECK-NEXT: .LBB31_1: @ %vector.body
677 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
678 ; CHECK-NEXT: vldrh.u16 q0, [r1], #16
679 ; CHECK-NEXT: vldrh.u16 q1, [r0], #16
680 ; CHECK-NEXT: vrhadd.s16 q0, q1, q0
681 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
682 ; CHECK-NEXT: le lr, .LBB31_1
683 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
684 ; CHECK-NEXT: pop {r7, pc}
686 br label %vector.body
688 vector.body: ; preds = %vector.body, %entry
689 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
690 %0 = getelementptr inbounds i16, ptr %x, i32 %index
691 %wide.load = load <8 x i16>, ptr %0, align 2
692 %1 = sext <8 x i16> %wide.load to <8 x i32>
693 %2 = getelementptr inbounds i16, ptr %y, i32 %index
694 %wide.load16 = load <8 x i16>, ptr %2, align 2
695 %3 = sext <8 x i16> %wide.load16 to <8 x i32>
696 %4 = add nsw <8 x i32> %1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
697 %5 = add nsw <8 x i32> %4, %3
698 %6 = lshr <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
699 %7 = trunc <8 x i32> %6 to <8 x i16>
700 %8 = getelementptr inbounds i16, ptr %z, i32 %index
701 store <8 x i16> %7, ptr %8, align 2
702 %index.next = add i32 %index, 8
703 %9 = icmp eq i32 %index.next, 1024
704 br i1 %9, label %for.cond.cleanup, label %vector.body
706 for.cond.cleanup: ; preds = %vector.body
; Signed rounding halving add inside a vector loop over i32 data: each iteration
; loads <4 x i32> from %x and %y, sign-extends to i64, adds one plus the second
; operand, shifts right by one, truncates and stores to %z. The index advances by
; 4 until it reaches 1024.
; The extensions are sext (with nsw-only adds) so that this function exercises the
; signed instruction instead of repeating the body of vrhadd_loop_u32.
; NOTE(review): the expected vrhadd.s32 line was derived from vrhadds_v4i32 and
; vhadd_loop_s32; rerun utils/update_llc_test_checks.py to confirm the assertions.
710 define void @vrhadd_loop_s32(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture writeonly %z, i32 %n) {
711 ; CHECK-LABEL: vrhadd_loop_s32:
712 ; CHECK: @ %bb.0: @ %entry
713 ; CHECK-NEXT: .save {r7, lr}
714 ; CHECK-NEXT: push {r7, lr}
715 ; CHECK-NEXT: mov.w lr, #256
716 ; CHECK-NEXT: .LBB32_1: @ %vector.body
717 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
718 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16
719 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16
720 ; CHECK-NEXT: vrhadd.s32 q0, q1, q0
721 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
722 ; CHECK-NEXT: le lr, .LBB32_1
723 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
724 ; CHECK-NEXT: pop {r7, pc}
726 br label %vector.body
728 vector.body: ; preds = %vector.body, %entry
729 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
730 %0 = getelementptr inbounds i32, ptr %x, i32 %index
731 %wide.load = load <4 x i32>, ptr %0, align 4
732 %1 = sext <4 x i32> %wide.load to <4 x i64>
733 %2 = getelementptr inbounds i32, ptr %y, i32 %index
734 %wide.load16 = load <4 x i32>, ptr %2, align 4
735 %3 = sext <4 x i32> %wide.load16 to <4 x i64>
736 %4 = add nsw <4 x i64> %1, <i64 1, i64 1, i64 1, i64 1>
737 %5 = add nsw <4 x i64> %4, %3
738 %6 = lshr <4 x i64> %5, <i64 1, i64 1, i64 1, i64 1>
739 %7 = trunc <4 x i64> %6 to <4 x i32>
740 %8 = getelementptr inbounds i32, ptr %z, i32 %index
741 store <4 x i32> %7, ptr %8, align 4
742 %index.next = add i32 %index, 4
743 %9 = icmp eq i32 %index.next, 1024
744 br i1 %9, label %for.cond.cleanup, label %vector.body
746 for.cond.cleanup: ; preds = %vector.body
; Unsigned rounding halving add inside a vector loop over i8 data: each iteration
; loads <16 x i8> from %x and %y, zero-extends, adds one plus the second operand,
; shifts right by one, truncates and stores to %z. The index advances by 16 until
; it reaches 1024; the expected output sets lr to 64 and uses vrhadd.u8 in a loop
; closed by `le lr`.
750 define void @vrhadd_loop_u8(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture writeonly %z, i32 %n) {
751 ; CHECK-LABEL: vrhadd_loop_u8:
752 ; CHECK: @ %bb.0: @ %entry
753 ; CHECK-NEXT: .save {r7, lr}
754 ; CHECK-NEXT: push {r7, lr}
755 ; CHECK-NEXT: mov.w lr, #64
756 ; CHECK-NEXT: .LBB33_1: @ %vector.body
757 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
758 ; CHECK-NEXT: vldrb.u8 q0, [r1], #16
759 ; CHECK-NEXT: vldrb.u8 q1, [r0], #16
760 ; CHECK-NEXT: vrhadd.u8 q0, q1, q0
761 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
762 ; CHECK-NEXT: le lr, .LBB33_1
763 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
764 ; CHECK-NEXT: pop {r7, pc}
766 br label %vector.body
768 vector.body: ; preds = %vector.body, %entry
769 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
770 %0 = getelementptr inbounds i8, ptr %x, i32 %index
771 %wide.load = load <16 x i8>, ptr %0, align 1
772 %1 = zext <16 x i8> %wide.load to <16 x i16>
773 %2 = getelementptr inbounds i8, ptr %y, i32 %index
774 %wide.load16 = load <16 x i8>, ptr %2, align 1
775 %3 = zext <16 x i8> %wide.load16 to <16 x i16>
776 %4 = add nuw nsw <16 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
777 %5 = add nuw nsw <16 x i16> %4, %3
778 %6 = lshr <16 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
779 %7 = trunc <16 x i16> %6 to <16 x i8>
780 %8 = getelementptr inbounds i8, ptr %z, i32 %index
781 store <16 x i8> %7, ptr %8, align 1
782 %index.next = add i32 %index, 16
783 %9 = icmp eq i32 %index.next, 1024
784 br i1 %9, label %for.cond.cleanup, label %vector.body
786 for.cond.cleanup: ; preds = %vector.body
; Unsigned rounding halving add inside a vector loop over i16 data: each iteration
; loads <8 x i16> from %x and %y, zero-extends, adds one plus the second operand,
; shifts right by one, truncates and stores to %z. The index advances by 8 until
; it reaches 1024; the expected output sets lr to 128 and uses vrhadd.u16 in a
; loop closed by `le lr`.
790 define void @vrhadd_loop_u16(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture writeonly %z, i32 %n) {
791 ; CHECK-LABEL: vrhadd_loop_u16:
792 ; CHECK: @ %bb.0: @ %entry
793 ; CHECK-NEXT: .save {r7, lr}
794 ; CHECK-NEXT: push {r7, lr}
795 ; CHECK-NEXT: mov.w lr, #128
796 ; CHECK-NEXT: .LBB34_1: @ %vector.body
797 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
798 ; CHECK-NEXT: vldrh.u16 q0, [r1], #16
799 ; CHECK-NEXT: vldrh.u16 q1, [r0], #16
800 ; CHECK-NEXT: vrhadd.u16 q0, q1, q0
801 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
802 ; CHECK-NEXT: le lr, .LBB34_1
803 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
804 ; CHECK-NEXT: pop {r7, pc}
806 br label %vector.body
808 vector.body: ; preds = %vector.body, %entry
809 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
810 %0 = getelementptr inbounds i16, ptr %x, i32 %index
811 %wide.load = load <8 x i16>, ptr %0, align 2
812 %1 = zext <8 x i16> %wide.load to <8 x i32>
813 %2 = getelementptr inbounds i16, ptr %y, i32 %index
814 %wide.load16 = load <8 x i16>, ptr %2, align 2
815 %3 = zext <8 x i16> %wide.load16 to <8 x i32>
816 %4 = add nuw nsw <8 x i32> %1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
817 %5 = add nuw nsw <8 x i32> %4, %3
818 %6 = lshr <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
819 %7 = trunc <8 x i32> %6 to <8 x i16>
820 %8 = getelementptr inbounds i16, ptr %z, i32 %index
821 store <8 x i16> %7, ptr %8, align 2
822 %index.next = add i32 %index, 8
823 %9 = icmp eq i32 %index.next, 1024
824 br i1 %9, label %for.cond.cleanup, label %vector.body
826 for.cond.cleanup: ; preds = %vector.body
; Unsigned rounding halving add inside a vector loop over i32 data: each iteration
; loads <4 x i32> from %x and %y, zero-extends to i64, adds one plus the second
; operand, shifts right by one, truncates and stores to %z. The index advances by
; 4 until it reaches 1024; the expected output sets lr to 256 and uses vrhadd.u32
; in a loop closed by `le lr`.
830 define void @vrhadd_loop_u32(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture writeonly %z, i32 %n) {
831 ; CHECK-LABEL: vrhadd_loop_u32:
832 ; CHECK: @ %bb.0: @ %entry
833 ; CHECK-NEXT: .save {r7, lr}
834 ; CHECK-NEXT: push {r7, lr}
835 ; CHECK-NEXT: mov.w lr, #256
836 ; CHECK-NEXT: .LBB35_1: @ %vector.body
837 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
838 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16
839 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16
840 ; CHECK-NEXT: vrhadd.u32 q0, q1, q0
841 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
842 ; CHECK-NEXT: le lr, .LBB35_1
843 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
844 ; CHECK-NEXT: pop {r7, pc}
846 br label %vector.body
848 vector.body: ; preds = %vector.body, %entry
849 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
850 %0 = getelementptr inbounds i32, ptr %x, i32 %index
851 %wide.load = load <4 x i32>, ptr %0, align 4
852 %1 = zext <4 x i32> %wide.load to <4 x i64>
853 %2 = getelementptr inbounds i32, ptr %y, i32 %index
854 %wide.load16 = load <4 x i32>, ptr %2, align 4
855 %3 = zext <4 x i32> %wide.load16 to <4 x i64>
856 %4 = add nuw nsw <4 x i64> %1, <i64 1, i64 1, i64 1, i64 1>
857 %5 = add nuw nsw <4 x i64> %4, %3
858 %6 = lshr <4 x i64> %5, <i64 1, i64 1, i64 1, i64 1>
859 %7 = trunc <4 x i64> %6 to <4 x i32>
860 %8 = getelementptr inbounds i32, ptr %z, i32 %index
861 store <4 x i32> %7, ptr %8, align 4
862 %index.next = add i32 %index, 4
863 %9 = icmp eq i32 %index.next, 1024
864 br i1 %9, label %for.cond.cleanup, label %vector.body
866 for.cond.cleanup: ; preds = %vector.body
; Signed halving add feeding an add reduction: the <16 x i8> inputs are
; sign-extended to i16, added, arithmetically shifted right by one, and the
; <16 x i16> result is summed with llvm.vector.reduce.add. The expected output is
; vhadd.s8 followed by vaddv.s8.
871 define arm_aapcs_vfpcc i16 @vhadds_reduce_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
872 ; CHECK-LABEL: vhadds_reduce_v16i8:
873 ; CHECK: @ %bb.0: @ %entry
874 ; CHECK-NEXT: vhadd.s8 q0, q0, q1
875 ; CHECK-NEXT: vaddv.s8 r0, q0
878 %s0s = sext <16 x i8> %s0 to <16 x i16>
879 %s1s = sext <16 x i8> %s1 to <16 x i16>
880 %add = add <16 x i16> %s0s, %s1s
881 %s = ashr <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
882 %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
; Unsigned halving add feeding an add reduction: the <16 x i8> inputs are
; zero-extended to i16, added, logically shifted right by one, and the
; <16 x i16> result is summed with llvm.vector.reduce.add. The expected output is
; vhadd.u8 followed by vaddv.u8.
886 define arm_aapcs_vfpcc i16 @vhaddu_reduce_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
887 ; CHECK-LABEL: vhaddu_reduce_v16i8:
888 ; CHECK: @ %bb.0: @ %entry
889 ; CHECK-NEXT: vhadd.u8 q0, q0, q1
890 ; CHECK-NEXT: vaddv.u8 r0, q0
893 %s0s = zext <16 x i8> %s0 to <16 x i16>
894 %s1s = zext <16 x i8> %s1 to <16 x i16>
895 %add = add <16 x i16> %s0s, %s1s
896 %s = lshr <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
897 %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
; Signed rounding halving add feeding an add reduction: the <16 x i8> inputs are
; sign-extended to i16, added, incremented by one, arithmetically shifted right
; by one, and the <16 x i16> result is summed with llvm.vector.reduce.add. The
; expected output is vrhadd.s8 followed by vaddv.s8.
901 define arm_aapcs_vfpcc i16 @vrhadds_reduce_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
902 ; CHECK-LABEL: vrhadds_reduce_v16i8:
903 ; CHECK: @ %bb.0: @ %entry
904 ; CHECK-NEXT: vrhadd.s8 q0, q0, q1
905 ; CHECK-NEXT: vaddv.s8 r0, q0
908 %s0s = sext <16 x i8> %s0 to <16 x i16>
909 %s1s = sext <16 x i8> %s1 to <16 x i16>
910 %add = add <16 x i16> %s0s, %s1s
911 %add2 = add <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
912 %s = ashr <16 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
913 %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
; Unsigned rounding halving add feeding an add reduction: the <16 x i8> inputs are
; zero-extended to i16, added, incremented by one, logically shifted right by
; one, and the <16 x i16> result is summed with llvm.vector.reduce.add. The
; expected output is vrhadd.u8 followed by vaddv.u8.
917 define arm_aapcs_vfpcc i16 @vrhaddu_reduce_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
918 ; CHECK-LABEL: vrhaddu_reduce_v16i8:
919 ; CHECK: @ %bb.0: @ %entry
920 ; CHECK-NEXT: vrhadd.u8 q0, q0, q1
921 ; CHECK-NEXT: vaddv.u8 r0, q0
924 %s0s = zext <16 x i8> %s0 to <16 x i16>
925 %s1s = zext <16 x i8> %s1 to <16 x i16>
926 %add = add <16 x i16> %s0s, %s1s
927 %add2 = add <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
928 %s = lshr <16 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
929 %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
; Add-reduction intrinsic over <16 x i16>, called by the *_reduce_v16i8 functions in this file.
933 declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)