1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=riscv32 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
3 ; RUN: llc -mtriple=riscv64 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
5 declare i8 @llvm.vector.reduce.add.v1i8(<1 x i8>)
7 define i8 @vreduce_add_v1i8(ptr %x) {
8 ; CHECK-LABEL: vreduce_add_v1i8:
10 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
11 ; CHECK-NEXT: vle8.v v8, (a0)
12 ; CHECK-NEXT: vmv.x.s a0, v8
14 %v = load <1 x i8>, ptr %x
15 %red = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> %v)
16 ret i8 %red
17 }
19 declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>)
21 define i8 @vreduce_add_v2i8(ptr %x) {
22 ; CHECK-LABEL: vreduce_add_v2i8:
24 ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
25 ; CHECK-NEXT: vle8.v v8, (a0)
26 ; CHECK-NEXT: vmv.s.x v9, zero
27 ; CHECK-NEXT: vredsum.vs v8, v8, v9
28 ; CHECK-NEXT: vmv.x.s a0, v8
30 %v = load <2 x i8>, ptr %x
31 %red = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %v)
32 ret i8 %red
33 }
35 declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>)
37 define i8 @vreduce_add_v4i8(ptr %x) {
38 ; CHECK-LABEL: vreduce_add_v4i8:
40 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
41 ; CHECK-NEXT: vle8.v v8, (a0)
42 ; CHECK-NEXT: vmv.s.x v9, zero
43 ; CHECK-NEXT: vredsum.vs v8, v8, v9
44 ; CHECK-NEXT: vmv.x.s a0, v8
46 %v = load <4 x i8>, ptr %x
47 %red = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %v)
48 ret i8 %red
49 }
51 declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
53 define i8 @vreduce_add_v8i8(ptr %x) {
54 ; CHECK-LABEL: vreduce_add_v8i8:
56 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
57 ; CHECK-NEXT: vle8.v v8, (a0)
58 ; CHECK-NEXT: vmv.s.x v9, zero
59 ; CHECK-NEXT: vredsum.vs v8, v8, v9
60 ; CHECK-NEXT: vmv.x.s a0, v8
62 %v = load <8 x i8>, ptr %x
63 %red = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %v)
64 ret i8 %red
65 }
67 declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
69 define i8 @vreduce_add_v16i8(ptr %x) {
70 ; CHECK-LABEL: vreduce_add_v16i8:
72 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
73 ; CHECK-NEXT: vle8.v v8, (a0)
74 ; CHECK-NEXT: vmv.s.x v9, zero
75 ; CHECK-NEXT: vredsum.vs v8, v8, v9
76 ; CHECK-NEXT: vmv.x.s a0, v8
78 %v = load <16 x i8>, ptr %x
79 %red = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %v)
80 ret i8 %red
81 }
83 declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>)
85 define i8 @vreduce_add_v32i8(ptr %x) {
86 ; CHECK-LABEL: vreduce_add_v32i8:
88 ; CHECK-NEXT: li a1, 32
89 ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
90 ; CHECK-NEXT: vle8.v v8, (a0)
91 ; CHECK-NEXT: vmv.s.x v10, zero
92 ; CHECK-NEXT: vredsum.vs v8, v8, v10
93 ; CHECK-NEXT: vmv.x.s a0, v8
95 %v = load <32 x i8>, ptr %x
96 %red = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %v)
97 ret i8 %red
98 }
100 declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>)
102 define i8 @vreduce_add_v64i8(ptr %x) {
103 ; CHECK-LABEL: vreduce_add_v64i8:
105 ; CHECK-NEXT: li a1, 64
106 ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
107 ; CHECK-NEXT: vle8.v v8, (a0)
108 ; CHECK-NEXT: vmv.s.x v12, zero
109 ; CHECK-NEXT: vredsum.vs v8, v8, v12
110 ; CHECK-NEXT: vmv.x.s a0, v8
112 %v = load <64 x i8>, ptr %x
113 %red = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %v)
114 ret i8 %red
115 }
117 declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>)
119 define i8 @vreduce_add_v128i8(ptr %x) {
120 ; CHECK-LABEL: vreduce_add_v128i8:
122 ; CHECK-NEXT: li a1, 128
123 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
124 ; CHECK-NEXT: vle8.v v8, (a0)
125 ; CHECK-NEXT: vmv.s.x v16, zero
126 ; CHECK-NEXT: vredsum.vs v8, v8, v16
127 ; CHECK-NEXT: vmv.x.s a0, v8
129 %v = load <128 x i8>, ptr %x
130 %red = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %v)
131 ret i8 %red
132 }
134 declare i8 @llvm.vector.reduce.add.v256i8(<256 x i8>)
136 define i8 @vreduce_add_v256i8(ptr %x) {
137 ; CHECK-LABEL: vreduce_add_v256i8:
139 ; CHECK-NEXT: li a1, 128
140 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
141 ; CHECK-NEXT: vle8.v v8, (a0)
142 ; CHECK-NEXT: addi a0, a0, 128
143 ; CHECK-NEXT: vle8.v v16, (a0)
144 ; CHECK-NEXT: vadd.vv v8, v8, v16
145 ; CHECK-NEXT: vmv.s.x v16, zero
146 ; CHECK-NEXT: vredsum.vs v8, v8, v16
147 ; CHECK-NEXT: vmv.x.s a0, v8
149 %v = load <256 x i8>, ptr %x
150 %red = call i8 @llvm.vector.reduce.add.v256i8(<256 x i8> %v)
151 ret i8 %red
152 }
154 declare i16 @llvm.vector.reduce.add.v1i16(<1 x i16>)
156 define i16 @vreduce_add_v1i16(ptr %x) {
157 ; CHECK-LABEL: vreduce_add_v1i16:
159 ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
160 ; CHECK-NEXT: vle16.v v8, (a0)
161 ; CHECK-NEXT: vmv.x.s a0, v8
163 %v = load <1 x i16>, ptr %x
164 %red = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %v)
165 ret i16 %red
166 }
168 define i16 @vwreduce_add_v1i16(ptr %x) {
169 ; CHECK-LABEL: vwreduce_add_v1i16:
171 ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
172 ; CHECK-NEXT: vle8.v v8, (a0)
173 ; CHECK-NEXT: vsext.vf2 v9, v8
174 ; CHECK-NEXT: vmv.x.s a0, v9
176 %v = load <1 x i8>, ptr %x
177 %e = sext <1 x i8> %v to <1 x i16>
178 %red = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %e)
179 ret i16 %red
180 }
182 define i16 @vwreduce_uadd_v1i16(ptr %x) {
183 ; CHECK-LABEL: vwreduce_uadd_v1i16:
185 ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
186 ; CHECK-NEXT: vle8.v v8, (a0)
187 ; CHECK-NEXT: vzext.vf2 v9, v8
188 ; CHECK-NEXT: vmv.x.s a0, v9
190 %v = load <1 x i8>, ptr %x
191 %e = zext <1 x i8> %v to <1 x i16>
192 %red = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %e)
193 ret i16 %red
194 }
196 declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>)
198 define i16 @vreduce_add_v2i16(ptr %x) {
199 ; CHECK-LABEL: vreduce_add_v2i16:
201 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
202 ; CHECK-NEXT: vle16.v v8, (a0)
203 ; CHECK-NEXT: vmv.s.x v9, zero
204 ; CHECK-NEXT: vredsum.vs v8, v8, v9
205 ; CHECK-NEXT: vmv.x.s a0, v8
207 %v = load <2 x i16>, ptr %x
208 %red = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %v)
209 ret i16 %red
210 }
212 define i16 @vwreduce_add_v2i16(ptr %x) {
213 ; CHECK-LABEL: vwreduce_add_v2i16:
215 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
216 ; CHECK-NEXT: vle8.v v8, (a0)
217 ; CHECK-NEXT: vmv.s.x v9, zero
218 ; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
219 ; CHECK-NEXT: vwredsum.vs v8, v8, v9
220 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
221 ; CHECK-NEXT: vmv.x.s a0, v8
223 %v = load <2 x i8>, ptr %x
224 %e = sext <2 x i8> %v to <2 x i16>
225 %red = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %e)
226 ret i16 %red
227 }
229 define i16 @vwreduce_uadd_v2i16(ptr %x) {
230 ; CHECK-LABEL: vwreduce_uadd_v2i16:
232 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
233 ; CHECK-NEXT: vle8.v v8, (a0)
234 ; CHECK-NEXT: vmv.s.x v9, zero
235 ; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
236 ; CHECK-NEXT: vwredsumu.vs v8, v8, v9
237 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
238 ; CHECK-NEXT: vmv.x.s a0, v8
240 %v = load <2 x i8>, ptr %x
241 %e = zext <2 x i8> %v to <2 x i16>
242 %red = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %e)
243 ret i16 %red
244 }
246 declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
248 define i16 @vreduce_add_v4i16(ptr %x) {
249 ; CHECK-LABEL: vreduce_add_v4i16:
251 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
252 ; CHECK-NEXT: vle16.v v8, (a0)
253 ; CHECK-NEXT: vmv.s.x v9, zero
254 ; CHECK-NEXT: vredsum.vs v8, v8, v9
255 ; CHECK-NEXT: vmv.x.s a0, v8
257 %v = load <4 x i16>, ptr %x
258 %red = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %v)
259 ret i16 %red
260 }
262 define i16 @vwreduce_add_v4i16(ptr %x) {
263 ; CHECK-LABEL: vwreduce_add_v4i16:
265 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
266 ; CHECK-NEXT: vle8.v v8, (a0)
267 ; CHECK-NEXT: vmv.s.x v9, zero
268 ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
269 ; CHECK-NEXT: vwredsum.vs v8, v8, v9
270 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
271 ; CHECK-NEXT: vmv.x.s a0, v8
273 %v = load <4 x i8>, ptr %x
274 %e = sext <4 x i8> %v to <4 x i16>
275 %red = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %e)
276 ret i16 %red
277 }
279 define i16 @vwreduce_uadd_v4i16(ptr %x) {
280 ; CHECK-LABEL: vwreduce_uadd_v4i16:
282 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
283 ; CHECK-NEXT: vle8.v v8, (a0)
284 ; CHECK-NEXT: vmv.s.x v9, zero
285 ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
286 ; CHECK-NEXT: vwredsumu.vs v8, v8, v9
287 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
288 ; CHECK-NEXT: vmv.x.s a0, v8
290 %v = load <4 x i8>, ptr %x
291 %e = zext <4 x i8> %v to <4 x i16>
292 %red = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %e)
293 ret i16 %red
294 }
296 declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
298 define i16 @vreduce_add_v8i16(ptr %x) {
299 ; CHECK-LABEL: vreduce_add_v8i16:
301 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
302 ; CHECK-NEXT: vle16.v v8, (a0)
303 ; CHECK-NEXT: vmv.s.x v9, zero
304 ; CHECK-NEXT: vredsum.vs v8, v8, v9
305 ; CHECK-NEXT: vmv.x.s a0, v8
307 %v = load <8 x i16>, ptr %x
308 %red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v)
309 ret i16 %red
310 }
312 define i16 @vwreduce_add_v8i16(ptr %x) {
313 ; CHECK-LABEL: vwreduce_add_v8i16:
315 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
316 ; CHECK-NEXT: vle8.v v8, (a0)
317 ; CHECK-NEXT: vmv.s.x v9, zero
318 ; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
319 ; CHECK-NEXT: vwredsum.vs v8, v8, v9
320 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
321 ; CHECK-NEXT: vmv.x.s a0, v8
323 %v = load <8 x i8>, ptr %x
324 %e = sext <8 x i8> %v to <8 x i16>
325 %red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %e)
326 ret i16 %red
327 }
329 define i16 @vwreduce_uadd_v8i16(ptr %x) {
330 ; CHECK-LABEL: vwreduce_uadd_v8i16:
332 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
333 ; CHECK-NEXT: vle8.v v8, (a0)
334 ; CHECK-NEXT: vmv.s.x v9, zero
335 ; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
336 ; CHECK-NEXT: vwredsumu.vs v8, v8, v9
337 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
338 ; CHECK-NEXT: vmv.x.s a0, v8
340 %v = load <8 x i8>, ptr %x
341 %e = zext <8 x i8> %v to <8 x i16>
342 %red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %e)
343 ret i16 %red
344 }
346 declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
348 define i16 @vreduce_add_v16i16(ptr %x) {
349 ; CHECK-LABEL: vreduce_add_v16i16:
351 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
352 ; CHECK-NEXT: vle16.v v8, (a0)
353 ; CHECK-NEXT: vmv.s.x v10, zero
354 ; CHECK-NEXT: vredsum.vs v8, v8, v10
355 ; CHECK-NEXT: vmv.x.s a0, v8
357 %v = load <16 x i16>, ptr %x
358 %red = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %v)
359 ret i16 %red
360 }
362 define i16 @vwreduce_add_v16i16(ptr %x) {
363 ; CHECK-LABEL: vwreduce_add_v16i16:
365 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
366 ; CHECK-NEXT: vle8.v v8, (a0)
367 ; CHECK-NEXT: vmv.s.x v9, zero
368 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma
369 ; CHECK-NEXT: vwredsum.vs v8, v8, v9
370 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
371 ; CHECK-NEXT: vmv.x.s a0, v8
373 %v = load <16 x i8>, ptr %x
374 %e = sext <16 x i8> %v to <16 x i16>
375 %red = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %e)
376 ret i16 %red
377 }
379 define i16 @vwreduce_uadd_v16i16(ptr %x) {
380 ; CHECK-LABEL: vwreduce_uadd_v16i16:
382 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
383 ; CHECK-NEXT: vle8.v v8, (a0)
384 ; CHECK-NEXT: vmv.s.x v9, zero
385 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma
386 ; CHECK-NEXT: vwredsumu.vs v8, v8, v9
387 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
388 ; CHECK-NEXT: vmv.x.s a0, v8
390 %v = load <16 x i8>, ptr %x
391 %e = zext <16 x i8> %v to <16 x i16>
392 %red = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %e)
393 ret i16 %red
394 }
396 declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)
398 define i16 @vreduce_add_v32i16(ptr %x) {
399 ; CHECK-LABEL: vreduce_add_v32i16:
401 ; CHECK-NEXT: li a1, 32
402 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
403 ; CHECK-NEXT: vle16.v v8, (a0)
404 ; CHECK-NEXT: vmv.s.x v12, zero
405 ; CHECK-NEXT: vredsum.vs v8, v8, v12
406 ; CHECK-NEXT: vmv.x.s a0, v8
408 %v = load <32 x i16>, ptr %x
409 %red = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %v)
410 ret i16 %red
411 }
413 define i16 @vwreduce_add_v32i16(ptr %x) {
414 ; CHECK-LABEL: vwreduce_add_v32i16:
416 ; CHECK-NEXT: li a1, 32
417 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
418 ; CHECK-NEXT: vle8.v v8, (a0)
419 ; CHECK-NEXT: vmv.s.x v10, zero
420 ; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma
421 ; CHECK-NEXT: vwredsum.vs v8, v8, v10
422 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
423 ; CHECK-NEXT: vmv.x.s a0, v8
425 %v = load <32 x i8>, ptr %x
426 %e = sext <32 x i8> %v to <32 x i16>
427 %red = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %e)
428 ret i16 %red
429 }
431 define i16 @vwreduce_uadd_v32i16(ptr %x) {
432 ; CHECK-LABEL: vwreduce_uadd_v32i16:
434 ; CHECK-NEXT: li a1, 32
435 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
436 ; CHECK-NEXT: vle8.v v8, (a0)
437 ; CHECK-NEXT: vmv.s.x v10, zero
438 ; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma
439 ; CHECK-NEXT: vwredsumu.vs v8, v8, v10
440 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
441 ; CHECK-NEXT: vmv.x.s a0, v8
443 %v = load <32 x i8>, ptr %x
444 %e = zext <32 x i8> %v to <32 x i16>
445 %red = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %e)
446 ret i16 %red
447 }
449 declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>)
451 define i16 @vreduce_add_v64i16(ptr %x) {
452 ; CHECK-LABEL: vreduce_add_v64i16:
454 ; CHECK-NEXT: li a1, 64
455 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
456 ; CHECK-NEXT: vle16.v v8, (a0)
457 ; CHECK-NEXT: vmv.s.x v16, zero
458 ; CHECK-NEXT: vredsum.vs v8, v8, v16
459 ; CHECK-NEXT: vmv.x.s a0, v8
461 %v = load <64 x i16>, ptr %x
462 %red = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %v)
463 ret i16 %red
464 }
466 define i16 @vwreduce_add_v64i16(ptr %x) {
467 ; CHECK-LABEL: vwreduce_add_v64i16:
469 ; CHECK-NEXT: li a1, 64
470 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
471 ; CHECK-NEXT: vle8.v v8, (a0)
472 ; CHECK-NEXT: vmv.s.x v12, zero
473 ; CHECK-NEXT: vsetvli zero, zero, e8, m4, ta, ma
474 ; CHECK-NEXT: vwredsum.vs v8, v8, v12
475 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
476 ; CHECK-NEXT: vmv.x.s a0, v8
478 %v = load <64 x i8>, ptr %x
479 %e = sext <64 x i8> %v to <64 x i16>
480 %red = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %e)
481 ret i16 %red
482 }
484 define i16 @vwreduce_uadd_v64i16(ptr %x) {
485 ; CHECK-LABEL: vwreduce_uadd_v64i16:
487 ; CHECK-NEXT: li a1, 64
488 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
489 ; CHECK-NEXT: vle8.v v8, (a0)
490 ; CHECK-NEXT: vmv.s.x v12, zero
491 ; CHECK-NEXT: vsetvli zero, zero, e8, m4, ta, ma
492 ; CHECK-NEXT: vwredsumu.vs v8, v8, v12
493 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
494 ; CHECK-NEXT: vmv.x.s a0, v8
496 %v = load <64 x i8>, ptr %x
497 %e = zext <64 x i8> %v to <64 x i16>
498 %red = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %e)
499 ret i16 %red
500 }
502 declare i16 @llvm.vector.reduce.add.v128i16(<128 x i16>)
504 define i16 @vreduce_add_v128i16(ptr %x) {
505 ; CHECK-LABEL: vreduce_add_v128i16:
507 ; CHECK-NEXT: li a1, 64
508 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
509 ; CHECK-NEXT: vle16.v v8, (a0)
510 ; CHECK-NEXT: addi a0, a0, 128
511 ; CHECK-NEXT: vle16.v v16, (a0)
512 ; CHECK-NEXT: vadd.vv v8, v8, v16
513 ; CHECK-NEXT: vmv.s.x v16, zero
514 ; CHECK-NEXT: vredsum.vs v8, v8, v16
515 ; CHECK-NEXT: vmv.x.s a0, v8
517 %v = load <128 x i16>, ptr %x
518 %red = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> %v)
519 ret i16 %red
520 }
522 define i16 @vwreduce_add_v128i16(ptr %x) {
523 ; CHECK-LABEL: vwreduce_add_v128i16:
525 ; CHECK-NEXT: li a1, 128
526 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
527 ; CHECK-NEXT: vle8.v v8, (a0)
528 ; CHECK-NEXT: li a0, 64
529 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
530 ; CHECK-NEXT: vslidedown.vx v16, v8, a0
531 ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
532 ; CHECK-NEXT: vwadd.vv v24, v8, v16
533 ; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma
534 ; CHECK-NEXT: vmv.s.x v8, zero
535 ; CHECK-NEXT: vredsum.vs v8, v24, v8
536 ; CHECK-NEXT: vmv.x.s a0, v8
538 %v = load <128 x i8>, ptr %x
539 %e = sext <128 x i8> %v to <128 x i16>
540 %red = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> %e)
541 ret i16 %red
542 }
544 define i16 @vwreduce_uadd_v128i16(ptr %x) {
545 ; CHECK-LABEL: vwreduce_uadd_v128i16:
547 ; CHECK-NEXT: li a1, 128
548 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
549 ; CHECK-NEXT: vle8.v v8, (a0)
550 ; CHECK-NEXT: li a0, 64
551 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
552 ; CHECK-NEXT: vslidedown.vx v16, v8, a0
553 ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
554 ; CHECK-NEXT: vwaddu.vv v24, v8, v16
555 ; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma
556 ; CHECK-NEXT: vmv.s.x v8, zero
557 ; CHECK-NEXT: vredsum.vs v8, v24, v8
558 ; CHECK-NEXT: vmv.x.s a0, v8
560 %v = load <128 x i8>, ptr %x
561 %e = zext <128 x i8> %v to <128 x i16>
562 %red = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> %e)
563 ret i16 %red
564 }
566 declare i32 @llvm.vector.reduce.add.v1i32(<1 x i32>)
568 define i32 @vreduce_add_v1i32(ptr %x) {
569 ; CHECK-LABEL: vreduce_add_v1i32:
571 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
572 ; CHECK-NEXT: vle32.v v8, (a0)
573 ; CHECK-NEXT: vmv.x.s a0, v8
575 %v = load <1 x i32>, ptr %x
576 %red = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %v)
577 ret i32 %red
578 }
580 define i32 @vwreduce_add_v1i32(ptr %x) {
581 ; CHECK-LABEL: vwreduce_add_v1i32:
583 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
584 ; CHECK-NEXT: vle16.v v8, (a0)
585 ; CHECK-NEXT: vsext.vf2 v9, v8
586 ; CHECK-NEXT: vmv.x.s a0, v9
588 %v = load <1 x i16>, ptr %x
589 %e = sext <1 x i16> %v to <1 x i32>
590 %red = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %e)
591 ret i32 %red
592 }
594 define i32 @vwreduce_uadd_v1i32(ptr %x) {
595 ; CHECK-LABEL: vwreduce_uadd_v1i32:
597 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
598 ; CHECK-NEXT: vle16.v v8, (a0)
599 ; CHECK-NEXT: vzext.vf2 v9, v8
600 ; CHECK-NEXT: vmv.x.s a0, v9
602 %v = load <1 x i16>, ptr %x
603 %e = zext <1 x i16> %v to <1 x i32>
604 %red = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %e)
605 ret i32 %red
606 }
608 declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
610 define i32 @vreduce_add_v2i32(ptr %x) {
611 ; CHECK-LABEL: vreduce_add_v2i32:
613 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
614 ; CHECK-NEXT: vle32.v v8, (a0)
615 ; CHECK-NEXT: vmv.s.x v9, zero
616 ; CHECK-NEXT: vredsum.vs v8, v8, v9
617 ; CHECK-NEXT: vmv.x.s a0, v8
619 %v = load <2 x i32>, ptr %x
620 %red = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %v)
621 ret i32 %red
622 }
624 define i32 @vwreduce_add_v2i32(ptr %x) {
625 ; CHECK-LABEL: vwreduce_add_v2i32:
627 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
628 ; CHECK-NEXT: vle16.v v8, (a0)
629 ; CHECK-NEXT: vmv.s.x v9, zero
630 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
631 ; CHECK-NEXT: vwredsum.vs v8, v8, v9
632 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
633 ; CHECK-NEXT: vmv.x.s a0, v8
635 %v = load <2 x i16>, ptr %x
636 %e = sext <2 x i16> %v to <2 x i32>
637 %red = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %e)
638 ret i32 %red
639 }
641 define i32 @vwreduce_uadd_v2i32(ptr %x) {
642 ; CHECK-LABEL: vwreduce_uadd_v2i32:
644 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
645 ; CHECK-NEXT: vle16.v v8, (a0)
646 ; CHECK-NEXT: vmv.s.x v9, zero
647 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
648 ; CHECK-NEXT: vwredsumu.vs v8, v8, v9
649 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
650 ; CHECK-NEXT: vmv.x.s a0, v8
652 %v = load <2 x i16>, ptr %x
653 %e = zext <2 x i16> %v to <2 x i32>
654 %red = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %e)
655 ret i32 %red
656 }
658 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
660 define i32 @vreduce_add_v4i32(ptr %x) {
661 ; CHECK-LABEL: vreduce_add_v4i32:
663 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
664 ; CHECK-NEXT: vle32.v v8, (a0)
665 ; CHECK-NEXT: vmv.s.x v9, zero
666 ; CHECK-NEXT: vredsum.vs v8, v8, v9
667 ; CHECK-NEXT: vmv.x.s a0, v8
669 %v = load <4 x i32>, ptr %x
670 %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)
671 ret i32 %red
672 }
674 define i32 @vwreduce_add_v4i32(ptr %x) {
675 ; CHECK-LABEL: vwreduce_add_v4i32:
677 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
678 ; CHECK-NEXT: vle16.v v8, (a0)
679 ; CHECK-NEXT: vmv.s.x v9, zero
680 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
681 ; CHECK-NEXT: vwredsum.vs v8, v8, v9
682 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
683 ; CHECK-NEXT: vmv.x.s a0, v8
685 %v = load <4 x i16>, ptr %x
686 %e = sext <4 x i16> %v to <4 x i32>
687 %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %e)
688 ret i32 %red
689 }
691 define i32 @vwreduce_uadd_v4i32(ptr %x) {
692 ; CHECK-LABEL: vwreduce_uadd_v4i32:
694 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
695 ; CHECK-NEXT: vle16.v v8, (a0)
696 ; CHECK-NEXT: vmv.s.x v9, zero
697 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
698 ; CHECK-NEXT: vwredsumu.vs v8, v8, v9
699 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
700 ; CHECK-NEXT: vmv.x.s a0, v8
702 %v = load <4 x i16>, ptr %x
703 %e = zext <4 x i16> %v to <4 x i32>
704 %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %e)
705 ret i32 %red
706 }
708 declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
710 define i32 @vreduce_add_v8i32(ptr %x) {
711 ; CHECK-LABEL: vreduce_add_v8i32:
713 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
714 ; CHECK-NEXT: vle32.v v8, (a0)
715 ; CHECK-NEXT: vmv.s.x v10, zero
716 ; CHECK-NEXT: vredsum.vs v8, v8, v10
717 ; CHECK-NEXT: vmv.x.s a0, v8
719 %v = load <8 x i32>, ptr %x
720 %red = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %v)
721 ret i32 %red
722 }
724 define i32 @vwreduce_add_v8i32(ptr %x) {
725 ; CHECK-LABEL: vwreduce_add_v8i32:
727 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
728 ; CHECK-NEXT: vle16.v v8, (a0)
729 ; CHECK-NEXT: vmv.s.x v9, zero
730 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
731 ; CHECK-NEXT: vwredsum.vs v8, v8, v9
732 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
733 ; CHECK-NEXT: vmv.x.s a0, v8
735 %v = load <8 x i16>, ptr %x
736 %e = sext <8 x i16> %v to <8 x i32>
737 %red = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %e)
738 ret i32 %red
739 }
741 define i32 @vwreduce_uadd_v8i32(ptr %x) {
742 ; CHECK-LABEL: vwreduce_uadd_v8i32:
744 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
745 ; CHECK-NEXT: vle16.v v8, (a0)
746 ; CHECK-NEXT: vmv.s.x v9, zero
747 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
748 ; CHECK-NEXT: vwredsumu.vs v8, v8, v9
749 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
750 ; CHECK-NEXT: vmv.x.s a0, v8
752 %v = load <8 x i16>, ptr %x
753 %e = zext <8 x i16> %v to <8 x i32>
754 %red = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %e)
755 ret i32 %red
756 }
758 declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
760 define i32 @vreduce_add_v16i32(ptr %x) {
761 ; CHECK-LABEL: vreduce_add_v16i32:
763 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
764 ; CHECK-NEXT: vle32.v v8, (a0)
765 ; CHECK-NEXT: vmv.s.x v12, zero
766 ; CHECK-NEXT: vredsum.vs v8, v8, v12
767 ; CHECK-NEXT: vmv.x.s a0, v8
769 %v = load <16 x i32>, ptr %x
770 %red = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v)
771 ret i32 %red
772 }
774 define i32 @vwreduce_add_v16i32(ptr %x) {
775 ; CHECK-LABEL: vwreduce_add_v16i32:
777 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
778 ; CHECK-NEXT: vle16.v v8, (a0)
779 ; CHECK-NEXT: vmv.s.x v10, zero
780 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
781 ; CHECK-NEXT: vwredsum.vs v8, v8, v10
782 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
783 ; CHECK-NEXT: vmv.x.s a0, v8
785 %v = load <16 x i16>, ptr %x
786 %e = sext <16 x i16> %v to <16 x i32>
787 %red = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %e)
788 ret i32 %red
789 }
791 define i32 @vwreduce_uadd_v16i32(ptr %x) {
792 ; CHECK-LABEL: vwreduce_uadd_v16i32:
794 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
795 ; CHECK-NEXT: vle16.v v8, (a0)
796 ; CHECK-NEXT: vmv.s.x v10, zero
797 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
798 ; CHECK-NEXT: vwredsumu.vs v8, v8, v10
799 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
800 ; CHECK-NEXT: vmv.x.s a0, v8
802 %v = load <16 x i16>, ptr %x
803 %e = zext <16 x i16> %v to <16 x i32>
804 %red = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %e)
805 ret i32 %red
806 }
808 declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)
810 define i32 @vreduce_add_v32i32(ptr %x) {
811 ; CHECK-LABEL: vreduce_add_v32i32:
813 ; CHECK-NEXT: li a1, 32
814 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
815 ; CHECK-NEXT: vle32.v v8, (a0)
816 ; CHECK-NEXT: vmv.s.x v16, zero
817 ; CHECK-NEXT: vredsum.vs v8, v8, v16
818 ; CHECK-NEXT: vmv.x.s a0, v8
820 %v = load <32 x i32>, ptr %x
821 %red = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %v)
822 ret i32 %red
823 }
825 define i32 @vwreduce_add_v32i32(ptr %x) {
826 ; CHECK-LABEL: vwreduce_add_v32i32:
828 ; CHECK-NEXT: li a1, 32
829 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
830 ; CHECK-NEXT: vle16.v v8, (a0)
831 ; CHECK-NEXT: vmv.s.x v12, zero
832 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
833 ; CHECK-NEXT: vwredsum.vs v8, v8, v12
834 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
835 ; CHECK-NEXT: vmv.x.s a0, v8
837 %v = load <32 x i16>, ptr %x
838 %e = sext <32 x i16> %v to <32 x i32>
839 %red = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %e)
840 ret i32 %red
841 }
843 define i32 @vwreduce_uadd_v32i32(ptr %x) {
844 ; CHECK-LABEL: vwreduce_uadd_v32i32:
846 ; CHECK-NEXT: li a1, 32
847 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
848 ; CHECK-NEXT: vle16.v v8, (a0)
849 ; CHECK-NEXT: vmv.s.x v12, zero
850 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
851 ; CHECK-NEXT: vwredsumu.vs v8, v8, v12
852 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
853 ; CHECK-NEXT: vmv.x.s a0, v8
855 %v = load <32 x i16>, ptr %x
856 %e = zext <32 x i16> %v to <32 x i32>
857 %red = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %e)
858 ret i32 %red
859 }
861 declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>)
863 define i32 @vreduce_add_v64i32(ptr %x) {
864 ; CHECK-LABEL: vreduce_add_v64i32:
866 ; CHECK-NEXT: li a1, 32
867 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
868 ; CHECK-NEXT: vle32.v v8, (a0)
869 ; CHECK-NEXT: addi a0, a0, 128
870 ; CHECK-NEXT: vle32.v v16, (a0)
871 ; CHECK-NEXT: vadd.vv v8, v8, v16
872 ; CHECK-NEXT: vmv.s.x v16, zero
873 ; CHECK-NEXT: vredsum.vs v8, v8, v16
874 ; CHECK-NEXT: vmv.x.s a0, v8
876 %v = load <64 x i32>, ptr %x
877 %red = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %v)
878 ret i32 %red
879 }
881 define i32 @vwreduce_add_v64i32(ptr %x) {
882 ; CHECK-LABEL: vwreduce_add_v64i32:
884 ; CHECK-NEXT: li a1, 64
885 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
886 ; CHECK-NEXT: vle16.v v8, (a0)
887 ; CHECK-NEXT: li a0, 32
888 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
889 ; CHECK-NEXT: vslidedown.vx v16, v8, a0
890 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
891 ; CHECK-NEXT: vwadd.vv v24, v8, v16
892 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
893 ; CHECK-NEXT: vmv.s.x v8, zero
894 ; CHECK-NEXT: vredsum.vs v8, v24, v8
895 ; CHECK-NEXT: vmv.x.s a0, v8
897 %v = load <64 x i16>, ptr %x
898 %e = sext <64 x i16> %v to <64 x i32>
899 %red = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %e)
900 ret i32 %red
901 }
903 define i32 @vwreduce_uadd_v64i32(ptr %x) {
904 ; CHECK-LABEL: vwreduce_uadd_v64i32:
906 ; CHECK-NEXT: li a1, 64
907 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
908 ; CHECK-NEXT: vle16.v v8, (a0)
909 ; CHECK-NEXT: li a0, 32
910 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
911 ; CHECK-NEXT: vslidedown.vx v16, v8, a0
912 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
913 ; CHECK-NEXT: vwaddu.vv v24, v8, v16
914 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
915 ; CHECK-NEXT: vmv.s.x v8, zero
916 ; CHECK-NEXT: vredsum.vs v8, v24, v8
917 ; CHECK-NEXT: vmv.x.s a0, v8
919 %v = load <64 x i16>, ptr %x
920 %e = zext <64 x i16> %v to <64 x i32>
921 %red = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %e)
922 ret i32 %red
923 }
925 declare i64 @llvm.vector.reduce.add.v1i64(<1 x i64>)
927 define i64 @vreduce_add_v1i64(ptr %x) {
928 ; RV32-LABEL: vreduce_add_v1i64:
930 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
931 ; RV32-NEXT: vle64.v v8, (a0)
932 ; RV32-NEXT: li a0, 32
933 ; RV32-NEXT: vsrl.vx v9, v8, a0
934 ; RV32-NEXT: vmv.x.s a1, v9
935 ; RV32-NEXT: vmv.x.s a0, v8
938 ; RV64-LABEL: vreduce_add_v1i64:
940 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
941 ; RV64-NEXT: vle64.v v8, (a0)
942 ; RV64-NEXT: vmv.x.s a0, v8
944 %v = load <1 x i64>, ptr %x
945 %red = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %v)
946 ret i64 %red
947 }
949 define i64 @vwreduce_add_v1i64(ptr %x) {
950 ; RV32-LABEL: vwreduce_add_v1i64:
952 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
953 ; RV32-NEXT: vle32.v v8, (a0)
954 ; RV32-NEXT: vsext.vf2 v9, v8
955 ; RV32-NEXT: li a0, 32
956 ; RV32-NEXT: vsrl.vx v8, v9, a0
957 ; RV32-NEXT: vmv.x.s a1, v8
958 ; RV32-NEXT: vmv.x.s a0, v9
961 ; RV64-LABEL: vwreduce_add_v1i64:
963 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
964 ; RV64-NEXT: vle32.v v8, (a0)
965 ; RV64-NEXT: vsext.vf2 v9, v8
966 ; RV64-NEXT: vmv.x.s a0, v9
968 %v = load <1 x i32>, ptr %x
969 %e = sext <1 x i32> %v to <1 x i64>
970 %red = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %e)
971 ret i64 %red
972 }
974 define i64 @vwreduce_uadd_v1i64(ptr %x) {
975 ; RV32-LABEL: vwreduce_uadd_v1i64:
977 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
978 ; RV32-NEXT: vle32.v v8, (a0)
979 ; RV32-NEXT: vzext.vf2 v9, v8
980 ; RV32-NEXT: li a0, 32
981 ; RV32-NEXT: vsrl.vx v8, v9, a0
982 ; RV32-NEXT: vmv.x.s a1, v8
983 ; RV32-NEXT: vmv.x.s a0, v9
986 ; RV64-LABEL: vwreduce_uadd_v1i64:
988 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
989 ; RV64-NEXT: vle32.v v8, (a0)
990 ; RV64-NEXT: vzext.vf2 v9, v8
991 ; RV64-NEXT: vmv.x.s a0, v9
993 %v = load <1 x i32>, ptr %x
994 %e = zext <1 x i32> %v to <1 x i64>
995 %red = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %e)
996 ret i64 %red
997 }
999 declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
1001 define i64 @vreduce_add_v2i64(ptr %x) {
1002 ; RV32-LABEL: vreduce_add_v2i64:
1004 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
1005 ; RV32-NEXT: vle64.v v8, (a0)
1006 ; RV32-NEXT: vmv.s.x v9, zero
1007 ; RV32-NEXT: vredsum.vs v8, v8, v9
1008 ; RV32-NEXT: vmv.x.s a0, v8
1009 ; RV32-NEXT: li a1, 32
1010 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1011 ; RV32-NEXT: vsrl.vx v8, v8, a1
1012 ; RV32-NEXT: vmv.x.s a1, v8
1015 ; RV64-LABEL: vreduce_add_v2i64:
1017 ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
1018 ; RV64-NEXT: vle64.v v8, (a0)
1019 ; RV64-NEXT: vmv.s.x v9, zero
1020 ; RV64-NEXT: vredsum.vs v8, v8, v9
1021 ; RV64-NEXT: vmv.x.s a0, v8
1023 %v = load <2 x i64>, ptr %x
1024 %red = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %v)
1025 ret i64 %red
1026 }
1028 define i64 @vwreduce_add_v2i64(ptr %x) {
1029 ; RV32-LABEL: vwreduce_add_v2i64:
1031 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
1032 ; RV32-NEXT: vle32.v v8, (a0)
1033 ; RV32-NEXT: vmv.s.x v9, zero
1034 ; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
1035 ; RV32-NEXT: vwredsum.vs v8, v8, v9
1036 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1037 ; RV32-NEXT: vmv.x.s a0, v8
1038 ; RV32-NEXT: li a1, 32
1039 ; RV32-NEXT: vsrl.vx v8, v8, a1
1040 ; RV32-NEXT: vmv.x.s a1, v8
1043 ; RV64-LABEL: vwreduce_add_v2i64:
1045 ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
1046 ; RV64-NEXT: vle32.v v8, (a0)
1047 ; RV64-NEXT: vmv.s.x v9, zero
1048 ; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
1049 ; RV64-NEXT: vwredsum.vs v8, v8, v9
1050 ; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, ma
1051 ; RV64-NEXT: vmv.x.s a0, v8
1053 %v = load <2 x i32>, ptr %x
1054 %e = sext <2 x i32> %v to <2 x i64>
1055 %red = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %e)
1056 ret i64 %red
1057 }
1059 define i64 @vwreduce_uadd_v2i64(ptr %x) {
1060 ; RV32-LABEL: vwreduce_uadd_v2i64:
1062 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
1063 ; RV32-NEXT: vle32.v v8, (a0)
1064 ; RV32-NEXT: vmv.s.x v9, zero
1065 ; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
1066 ; RV32-NEXT: vwredsumu.vs v8, v8, v9
1067 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1068 ; RV32-NEXT: vmv.x.s a0, v8
1069 ; RV32-NEXT: li a1, 32
1070 ; RV32-NEXT: vsrl.vx v8, v8, a1
1071 ; RV32-NEXT: vmv.x.s a1, v8
1074 ; RV64-LABEL: vwreduce_uadd_v2i64:
1076 ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
1077 ; RV64-NEXT: vle32.v v8, (a0)
1078 ; RV64-NEXT: vmv.s.x v9, zero
1079 ; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
1080 ; RV64-NEXT: vwredsumu.vs v8, v8, v9
1081 ; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, ma
1082 ; RV64-NEXT: vmv.x.s a0, v8
1084 %v = load <2 x i32>, ptr %x
1085 %e = zext <2 x i32> %v to <2 x i64>
1086 %red = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %e)
1087 ret i64 %red
1088 }
1090 declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
1092 define i64 @vreduce_add_v4i64(ptr %x) {
1093 ; RV32-LABEL: vreduce_add_v4i64:
1095 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
1096 ; RV32-NEXT: vle64.v v8, (a0)
1097 ; RV32-NEXT: vmv.s.x v10, zero
1098 ; RV32-NEXT: vredsum.vs v8, v8, v10
1099 ; RV32-NEXT: vmv.x.s a0, v8
1100 ; RV32-NEXT: li a1, 32
1101 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1102 ; RV32-NEXT: vsrl.vx v8, v8, a1
1103 ; RV32-NEXT: vmv.x.s a1, v8
1106 ; RV64-LABEL: vreduce_add_v4i64:
1108 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
1109 ; RV64-NEXT: vle64.v v8, (a0)
1110 ; RV64-NEXT: vmv.s.x v10, zero
1111 ; RV64-NEXT: vredsum.vs v8, v8, v10
1112 ; RV64-NEXT: vmv.x.s a0, v8
1114 %v = load <4 x i64>, ptr %x
1115 %red = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %v)
1116 ret i64 %red
1117 }
1119 define i64 @vwreduce_add_v4i64(ptr %x) {
1120 ; RV32-LABEL: vwreduce_add_v4i64:
1122 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
1123 ; RV32-NEXT: vle32.v v8, (a0)
1124 ; RV32-NEXT: vmv.s.x v9, zero
1125 ; RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma
1126 ; RV32-NEXT: vwredsum.vs v8, v8, v9
1127 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1128 ; RV32-NEXT: vmv.x.s a0, v8
1129 ; RV32-NEXT: li a1, 32
1130 ; RV32-NEXT: vsrl.vx v8, v8, a1
1131 ; RV32-NEXT: vmv.x.s a1, v8
1134 ; RV64-LABEL: vwreduce_add_v4i64:
1136 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
1137 ; RV64-NEXT: vle32.v v8, (a0)
1138 ; RV64-NEXT: vmv.s.x v9, zero
1139 ; RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma
1140 ; RV64-NEXT: vwredsum.vs v8, v8, v9
1141 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1142 ; RV64-NEXT: vmv.x.s a0, v8
1144 %v = load <4 x i32>, ptr %x
1145 %e = sext <4 x i32> %v to <4 x i64>
1146 %red = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %e)
1147 ret i64 %red
1148 }
1150 define i64 @vwreduce_uadd_v4i64(ptr %x) {
1151 ; RV32-LABEL: vwreduce_uadd_v4i64:
1153 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
1154 ; RV32-NEXT: vle32.v v8, (a0)
1155 ; RV32-NEXT: vmv.s.x v9, zero
1156 ; RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma
1157 ; RV32-NEXT: vwredsumu.vs v8, v8, v9
1158 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1159 ; RV32-NEXT: vmv.x.s a0, v8
1160 ; RV32-NEXT: li a1, 32
1161 ; RV32-NEXT: vsrl.vx v8, v8, a1
1162 ; RV32-NEXT: vmv.x.s a1, v8
1165 ; RV64-LABEL: vwreduce_uadd_v4i64:
1167 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
1168 ; RV64-NEXT: vle32.v v8, (a0)
1169 ; RV64-NEXT: vmv.s.x v9, zero
1170 ; RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma
1171 ; RV64-NEXT: vwredsumu.vs v8, v8, v9
1172 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1173 ; RV64-NEXT: vmv.x.s a0, v8
1175 %v = load <4 x i32>, ptr %x
1176 %e = zext <4 x i32> %v to <4 x i64>
1177 %red = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %e)
1178 ret i64 %red
1179 }
1181 declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
1183 define i64 @vreduce_add_v8i64(ptr %x) {
1184 ; RV32-LABEL: vreduce_add_v8i64:
1186 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
1187 ; RV32-NEXT: vle64.v v8, (a0)
1188 ; RV32-NEXT: vmv.s.x v12, zero
1189 ; RV32-NEXT: vredsum.vs v8, v8, v12
1190 ; RV32-NEXT: vmv.x.s a0, v8
1191 ; RV32-NEXT: li a1, 32
1192 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1193 ; RV32-NEXT: vsrl.vx v8, v8, a1
1194 ; RV32-NEXT: vmv.x.s a1, v8
1197 ; RV64-LABEL: vreduce_add_v8i64:
1199 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
1200 ; RV64-NEXT: vle64.v v8, (a0)
1201 ; RV64-NEXT: vmv.s.x v12, zero
1202 ; RV64-NEXT: vredsum.vs v8, v8, v12
1203 ; RV64-NEXT: vmv.x.s a0, v8
1205 %v = load <8 x i64>, ptr %x
1206 %red = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %v)
1207 ret i64 %red
1208 }
1210 define i64 @vwreduce_add_v8i64(ptr %x) {
1211 ; RV32-LABEL: vwreduce_add_v8i64:
1213 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
1214 ; RV32-NEXT: vle32.v v8, (a0)
1215 ; RV32-NEXT: vmv.s.x v10, zero
1216 ; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma
1217 ; RV32-NEXT: vwredsum.vs v8, v8, v10
1218 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1219 ; RV32-NEXT: vmv.x.s a0, v8
1220 ; RV32-NEXT: li a1, 32
1221 ; RV32-NEXT: vsrl.vx v8, v8, a1
1222 ; RV32-NEXT: vmv.x.s a1, v8
1225 ; RV64-LABEL: vwreduce_add_v8i64:
1227 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
1228 ; RV64-NEXT: vle32.v v8, (a0)
1229 ; RV64-NEXT: vmv.s.x v10, zero
1230 ; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma
1231 ; RV64-NEXT: vwredsum.vs v8, v8, v10
1232 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1233 ; RV64-NEXT: vmv.x.s a0, v8
1235 %v = load <8 x i32>, ptr %x
1236 %e = sext <8 x i32> %v to <8 x i64>
1237 %red = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %e)
1238 ret i64 %red
1239 }
1241 define i64 @vwreduce_uadd_v8i64(ptr %x) {
1242 ; RV32-LABEL: vwreduce_uadd_v8i64:
1244 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
1245 ; RV32-NEXT: vle32.v v8, (a0)
1246 ; RV32-NEXT: vmv.s.x v10, zero
1247 ; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma
1248 ; RV32-NEXT: vwredsumu.vs v8, v8, v10
1249 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1250 ; RV32-NEXT: vmv.x.s a0, v8
1251 ; RV32-NEXT: li a1, 32
1252 ; RV32-NEXT: vsrl.vx v8, v8, a1
1253 ; RV32-NEXT: vmv.x.s a1, v8
1256 ; RV64-LABEL: vwreduce_uadd_v8i64:
1258 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
1259 ; RV64-NEXT: vle32.v v8, (a0)
1260 ; RV64-NEXT: vmv.s.x v10, zero
1261 ; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma
1262 ; RV64-NEXT: vwredsumu.vs v8, v8, v10
1263 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1264 ; RV64-NEXT: vmv.x.s a0, v8
1266 %v = load <8 x i32>, ptr %x
1267 %e = zext <8 x i32> %v to <8 x i64>
1268 %red = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %e)
1269 ret i64 %red
1270 }
1272 declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
1274 define i64 @vreduce_add_v16i64(ptr %x) {
1275 ; RV32-LABEL: vreduce_add_v16i64:
1277 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
1278 ; RV32-NEXT: vle64.v v8, (a0)
1279 ; RV32-NEXT: vmv.s.x v16, zero
1280 ; RV32-NEXT: vredsum.vs v8, v8, v16
1281 ; RV32-NEXT: vmv.x.s a0, v8
1282 ; RV32-NEXT: li a1, 32
1283 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1284 ; RV32-NEXT: vsrl.vx v8, v8, a1
1285 ; RV32-NEXT: vmv.x.s a1, v8
1288 ; RV64-LABEL: vreduce_add_v16i64:
1290 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
1291 ; RV64-NEXT: vle64.v v8, (a0)
1292 ; RV64-NEXT: vmv.s.x v16, zero
1293 ; RV64-NEXT: vredsum.vs v8, v8, v16
1294 ; RV64-NEXT: vmv.x.s a0, v8
1296 %v = load <16 x i64>, ptr %x
1297 %red = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %v)
1298 ret i64 %red
1299 }
1301 define i64 @vwreduce_add_v16i64(ptr %x) {
1302 ; RV32-LABEL: vwreduce_add_v16i64:
1304 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
1305 ; RV32-NEXT: vle32.v v8, (a0)
1306 ; RV32-NEXT: vmv.s.x v12, zero
1307 ; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma
1308 ; RV32-NEXT: vwredsum.vs v8, v8, v12
1309 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1310 ; RV32-NEXT: vmv.x.s a0, v8
1311 ; RV32-NEXT: li a1, 32
1312 ; RV32-NEXT: vsrl.vx v8, v8, a1
1313 ; RV32-NEXT: vmv.x.s a1, v8
1316 ; RV64-LABEL: vwreduce_add_v16i64:
1318 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
1319 ; RV64-NEXT: vle32.v v8, (a0)
1320 ; RV64-NEXT: vmv.s.x v12, zero
1321 ; RV64-NEXT: vsetvli zero, zero, e32, m4, ta, ma
1322 ; RV64-NEXT: vwredsum.vs v8, v8, v12
1323 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1324 ; RV64-NEXT: vmv.x.s a0, v8
1326 %v = load <16 x i32>, ptr %x
1327 %e = sext <16 x i32> %v to <16 x i64>
1328 %red = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %e)
1329 ret i64 %red
1330 }
1332 define i64 @vwreduce_uadd_v16i64(ptr %x) {
1333 ; RV32-LABEL: vwreduce_uadd_v16i64:
1335 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
1336 ; RV32-NEXT: vle32.v v8, (a0)
1337 ; RV32-NEXT: vmv.s.x v12, zero
1338 ; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma
1339 ; RV32-NEXT: vwredsumu.vs v8, v8, v12
1340 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1341 ; RV32-NEXT: vmv.x.s a0, v8
1342 ; RV32-NEXT: li a1, 32
1343 ; RV32-NEXT: vsrl.vx v8, v8, a1
1344 ; RV32-NEXT: vmv.x.s a1, v8
1347 ; RV64-LABEL: vwreduce_uadd_v16i64:
1349 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
1350 ; RV64-NEXT: vle32.v v8, (a0)
1351 ; RV64-NEXT: vmv.s.x v12, zero
1352 ; RV64-NEXT: vsetvli zero, zero, e32, m4, ta, ma
1353 ; RV64-NEXT: vwredsumu.vs v8, v8, v12
1354 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1355 ; RV64-NEXT: vmv.x.s a0, v8
1357 %v = load <16 x i32>, ptr %x
1358 %e = zext <16 x i32> %v to <16 x i64>
1359 %red = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %e)
1360 ret i64 %red
1361 }
1363 declare i64 @llvm.vector.reduce.add.v32i64(<32 x i64>)
1365 define i64 @vreduce_add_v32i64(ptr %x) {
1366 ; RV32-LABEL: vreduce_add_v32i64:
1368 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
1369 ; RV32-NEXT: vle64.v v8, (a0)
1370 ; RV32-NEXT: addi a0, a0, 128
1371 ; RV32-NEXT: vle64.v v16, (a0)
1372 ; RV32-NEXT: vadd.vv v8, v8, v16
1373 ; RV32-NEXT: vmv.s.x v16, zero
1374 ; RV32-NEXT: vredsum.vs v8, v8, v16
1375 ; RV32-NEXT: vmv.x.s a0, v8
1376 ; RV32-NEXT: li a1, 32
1377 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1378 ; RV32-NEXT: vsrl.vx v8, v8, a1
1379 ; RV32-NEXT: vmv.x.s a1, v8
1382 ; RV64-LABEL: vreduce_add_v32i64:
1384 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
1385 ; RV64-NEXT: vle64.v v8, (a0)
1386 ; RV64-NEXT: addi a0, a0, 128
1387 ; RV64-NEXT: vle64.v v16, (a0)
1388 ; RV64-NEXT: vadd.vv v8, v8, v16
1389 ; RV64-NEXT: vmv.s.x v16, zero
1390 ; RV64-NEXT: vredsum.vs v8, v8, v16
1391 ; RV64-NEXT: vmv.x.s a0, v8
1393 %v = load <32 x i64>, ptr %x
1394 %red = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> %v)
1395 ret i64 %red
1396 }
1398 define i64 @vwreduce_add_v32i64(ptr %x) {
1399 ; RV32-LABEL: vwreduce_add_v32i64:
1401 ; RV32-NEXT: li a1, 32
1402 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
1403 ; RV32-NEXT: vle32.v v8, (a0)
1404 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
1405 ; RV32-NEXT: vslidedown.vi v16, v8, 16
1406 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
1407 ; RV32-NEXT: vwadd.vv v24, v8, v16
1408 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
1409 ; RV32-NEXT: vmv.s.x v8, zero
1410 ; RV32-NEXT: vredsum.vs v8, v24, v8
1411 ; RV32-NEXT: vmv.x.s a0, v8
1412 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1413 ; RV32-NEXT: vsrl.vx v8, v8, a1
1414 ; RV32-NEXT: vmv.x.s a1, v8
1417 ; RV64-LABEL: vwreduce_add_v32i64:
1419 ; RV64-NEXT: li a1, 32
1420 ; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma
1421 ; RV64-NEXT: vle32.v v8, (a0)
1422 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma
1423 ; RV64-NEXT: vslidedown.vi v16, v8, 16
1424 ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
1425 ; RV64-NEXT: vwadd.vv v24, v8, v16
1426 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
1427 ; RV64-NEXT: vmv.s.x v8, zero
1428 ; RV64-NEXT: vredsum.vs v8, v24, v8
1429 ; RV64-NEXT: vmv.x.s a0, v8
1431 %v = load <32 x i32>, ptr %x
1432 %e = sext <32 x i32> %v to <32 x i64>
1433 %red = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> %e)
1434 ret i64 %red
1435 }
1437 define i64 @vwreduce_uadd_v32i64(ptr %x) {
1438 ; RV32-LABEL: vwreduce_uadd_v32i64:
1440 ; RV32-NEXT: li a1, 32
1441 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
1442 ; RV32-NEXT: vle32.v v8, (a0)
1443 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
1444 ; RV32-NEXT: vslidedown.vi v16, v8, 16
1445 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
1446 ; RV32-NEXT: vwaddu.vv v24, v8, v16
1447 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
1448 ; RV32-NEXT: vmv.s.x v8, zero
1449 ; RV32-NEXT: vredsum.vs v8, v24, v8
1450 ; RV32-NEXT: vmv.x.s a0, v8
1451 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1452 ; RV32-NEXT: vsrl.vx v8, v8, a1
1453 ; RV32-NEXT: vmv.x.s a1, v8
1456 ; RV64-LABEL: vwreduce_uadd_v32i64:
1458 ; RV64-NEXT: li a1, 32
1459 ; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma
1460 ; RV64-NEXT: vle32.v v8, (a0)
1461 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma
1462 ; RV64-NEXT: vslidedown.vi v16, v8, 16
1463 ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
1464 ; RV64-NEXT: vwaddu.vv v24, v8, v16
1465 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
1466 ; RV64-NEXT: vmv.s.x v8, zero
1467 ; RV64-NEXT: vredsum.vs v8, v24, v8
1468 ; RV64-NEXT: vmv.x.s a0, v8
1470 %v = load <32 x i32>, ptr %x
1471 %e = zext <32 x i32> %v to <32 x i64>
1472 %red = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> %e)
1473 ret i64 %red
1474 }
1476 declare i64 @llvm.vector.reduce.add.v64i64(<64 x i64>)
1478 define i64 @vreduce_add_v64i64(ptr %x) nounwind {
1479 ; RV32-LABEL: vreduce_add_v64i64:
1481 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
1482 ; RV32-NEXT: vle64.v v8, (a0)
1483 ; RV32-NEXT: addi a1, a0, 384
1484 ; RV32-NEXT: vle64.v v16, (a1)
1485 ; RV32-NEXT: addi a1, a0, 256
1486 ; RV32-NEXT: addi a0, a0, 128
1487 ; RV32-NEXT: vle64.v v24, (a0)
1488 ; RV32-NEXT: vle64.v v0, (a1)
1489 ; RV32-NEXT: vadd.vv v16, v24, v16
1490 ; RV32-NEXT: vadd.vv v8, v8, v0
1491 ; RV32-NEXT: vadd.vv v8, v8, v16
1492 ; RV32-NEXT: vmv.s.x v16, zero
1493 ; RV32-NEXT: vredsum.vs v8, v8, v16
1494 ; RV32-NEXT: vmv.x.s a0, v8
1495 ; RV32-NEXT: li a1, 32
1496 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1497 ; RV32-NEXT: vsrl.vx v8, v8, a1
1498 ; RV32-NEXT: vmv.x.s a1, v8
1501 ; RV64-LABEL: vreduce_add_v64i64:
1503 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
1504 ; RV64-NEXT: vle64.v v8, (a0)
1505 ; RV64-NEXT: addi a1, a0, 384
1506 ; RV64-NEXT: vle64.v v16, (a1)
1507 ; RV64-NEXT: addi a1, a0, 256
1508 ; RV64-NEXT: addi a0, a0, 128
1509 ; RV64-NEXT: vle64.v v24, (a0)
1510 ; RV64-NEXT: vle64.v v0, (a1)
1511 ; RV64-NEXT: vadd.vv v16, v24, v16
1512 ; RV64-NEXT: vadd.vv v8, v8, v0
1513 ; RV64-NEXT: vadd.vv v8, v8, v16
1514 ; RV64-NEXT: vmv.s.x v16, zero
1515 ; RV64-NEXT: vredsum.vs v8, v8, v16
1516 ; RV64-NEXT: vmv.x.s a0, v8
1518 %v = load <64 x i64>, ptr %x
1519 %red = call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> %v)
1520 ret i64 %red
1521 }
1523 define i64 @vwreduce_add_v64i64(ptr %x) {
1524 ; RV32-LABEL: vwreduce_add_v64i64:
1526 ; RV32-NEXT: addi sp, sp, -16
1527 ; RV32-NEXT: .cfi_def_cfa_offset 16
1528 ; RV32-NEXT: csrr a1, vlenb
1529 ; RV32-NEXT: slli a1, a1, 4
1530 ; RV32-NEXT: sub sp, sp, a1
1531 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
1532 ; RV32-NEXT: addi a1, a0, 128
1533 ; RV32-NEXT: li a2, 32
1534 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
1535 ; RV32-NEXT: vle32.v v8, (a0)
1536 ; RV32-NEXT: addi a0, sp, 16
1537 ; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
1538 ; RV32-NEXT: vle32.v v16, (a1)
1539 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
1540 ; RV32-NEXT: vslidedown.vi v24, v8, 16
1541 ; RV32-NEXT: vslidedown.vi v0, v16, 16
1542 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
1543 ; RV32-NEXT: vmv4r.v v8, v0
1544 ; RV32-NEXT: vwadd.vv v0, v24, v8
1545 ; RV32-NEXT: csrr a0, vlenb
1546 ; RV32-NEXT: slli a0, a0, 3
1547 ; RV32-NEXT: add a0, sp, a0
1548 ; RV32-NEXT: addi a0, a0, 16
1549 ; RV32-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
1550 ; RV32-NEXT: addi a0, sp, 16
1551 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
1552 ; RV32-NEXT: vwadd.vv v0, v8, v16
1553 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
1554 ; RV32-NEXT: csrr a0, vlenb
1555 ; RV32-NEXT: slli a0, a0, 3
1556 ; RV32-NEXT: add a0, sp, a0
1557 ; RV32-NEXT: addi a0, a0, 16
1558 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
1559 ; RV32-NEXT: vadd.vv v8, v0, v8
1560 ; RV32-NEXT: vmv.s.x v16, zero
1561 ; RV32-NEXT: vredsum.vs v8, v8, v16
1562 ; RV32-NEXT: vmv.x.s a0, v8
1563 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1564 ; RV32-NEXT: vsrl.vx v8, v8, a2
1565 ; RV32-NEXT: vmv.x.s a1, v8
1566 ; RV32-NEXT: csrr a2, vlenb
1567 ; RV32-NEXT: slli a2, a2, 4
1568 ; RV32-NEXT: add sp, sp, a2
1569 ; RV32-NEXT: addi sp, sp, 16
1572 ; RV64-LABEL: vwreduce_add_v64i64:
1574 ; RV64-NEXT: addi sp, sp, -16
1575 ; RV64-NEXT: .cfi_def_cfa_offset 16
1576 ; RV64-NEXT: csrr a1, vlenb
1577 ; RV64-NEXT: slli a1, a1, 4
1578 ; RV64-NEXT: sub sp, sp, a1
1579 ; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
1580 ; RV64-NEXT: addi a1, a0, 128
1581 ; RV64-NEXT: li a2, 32
1582 ; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma
1583 ; RV64-NEXT: vle32.v v8, (a0)
1584 ; RV64-NEXT: addi a0, sp, 16
1585 ; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
1586 ; RV64-NEXT: vle32.v v16, (a1)
1587 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma
1588 ; RV64-NEXT: vslidedown.vi v24, v8, 16
1589 ; RV64-NEXT: vslidedown.vi v0, v16, 16
1590 ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
1591 ; RV64-NEXT: vmv4r.v v8, v0
1592 ; RV64-NEXT: vwadd.vv v0, v24, v8
1593 ; RV64-NEXT: csrr a0, vlenb
1594 ; RV64-NEXT: slli a0, a0, 3
1595 ; RV64-NEXT: add a0, sp, a0
1596 ; RV64-NEXT: addi a0, a0, 16
1597 ; RV64-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
1598 ; RV64-NEXT: addi a0, sp, 16
1599 ; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
1600 ; RV64-NEXT: vwadd.vv v0, v8, v16
1601 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
1602 ; RV64-NEXT: csrr a0, vlenb
1603 ; RV64-NEXT: slli a0, a0, 3
1604 ; RV64-NEXT: add a0, sp, a0
1605 ; RV64-NEXT: addi a0, a0, 16
1606 ; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
1607 ; RV64-NEXT: vadd.vv v8, v0, v8
1608 ; RV64-NEXT: vmv.s.x v16, zero
1609 ; RV64-NEXT: vredsum.vs v8, v8, v16
1610 ; RV64-NEXT: vmv.x.s a0, v8
1611 ; RV64-NEXT: csrr a1, vlenb
1612 ; RV64-NEXT: slli a1, a1, 4
1613 ; RV64-NEXT: add sp, sp, a1
1614 ; RV64-NEXT: addi sp, sp, 16
1616 %v = load <64 x i32>, ptr %x
1617 %e = sext <64 x i32> %v to <64 x i64>
1618 %red = call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> %e)
1619 ret i64 %red
1620 }
1622 define i64 @vwreduce_uadd_v64i64(ptr %x) {
1623 ; RV32-LABEL: vwreduce_uadd_v64i64:
1625 ; RV32-NEXT: addi sp, sp, -16
1626 ; RV32-NEXT: .cfi_def_cfa_offset 16
1627 ; RV32-NEXT: csrr a1, vlenb
1628 ; RV32-NEXT: slli a1, a1, 4
1629 ; RV32-NEXT: sub sp, sp, a1
1630 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
1631 ; RV32-NEXT: addi a1, a0, 128
1632 ; RV32-NEXT: li a2, 32
1633 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
1634 ; RV32-NEXT: vle32.v v8, (a0)
1635 ; RV32-NEXT: addi a0, sp, 16
1636 ; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
1637 ; RV32-NEXT: vle32.v v16, (a1)
1638 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
1639 ; RV32-NEXT: vslidedown.vi v24, v8, 16
1640 ; RV32-NEXT: vslidedown.vi v0, v16, 16
1641 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
1642 ; RV32-NEXT: vmv4r.v v8, v0
1643 ; RV32-NEXT: vwaddu.vv v0, v24, v8
1644 ; RV32-NEXT: csrr a0, vlenb
1645 ; RV32-NEXT: slli a0, a0, 3
1646 ; RV32-NEXT: add a0, sp, a0
1647 ; RV32-NEXT: addi a0, a0, 16
1648 ; RV32-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
1649 ; RV32-NEXT: addi a0, sp, 16
1650 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
1651 ; RV32-NEXT: vwaddu.vv v0, v8, v16
1652 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
1653 ; RV32-NEXT: csrr a0, vlenb
1654 ; RV32-NEXT: slli a0, a0, 3
1655 ; RV32-NEXT: add a0, sp, a0
1656 ; RV32-NEXT: addi a0, a0, 16
1657 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
1658 ; RV32-NEXT: vadd.vv v8, v0, v8
1659 ; RV32-NEXT: vmv.s.x v16, zero
1660 ; RV32-NEXT: vredsum.vs v8, v8, v16
1661 ; RV32-NEXT: vmv.x.s a0, v8
1662 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1663 ; RV32-NEXT: vsrl.vx v8, v8, a2
1664 ; RV32-NEXT: vmv.x.s a1, v8
1665 ; RV32-NEXT: csrr a2, vlenb
1666 ; RV32-NEXT: slli a2, a2, 4
1667 ; RV32-NEXT: add sp, sp, a2
1668 ; RV32-NEXT: addi sp, sp, 16
1671 ; RV64-LABEL: vwreduce_uadd_v64i64:
1673 ; RV64-NEXT: addi sp, sp, -16
1674 ; RV64-NEXT: .cfi_def_cfa_offset 16
1675 ; RV64-NEXT: csrr a1, vlenb
1676 ; RV64-NEXT: slli a1, a1, 4
1677 ; RV64-NEXT: sub sp, sp, a1
1678 ; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
1679 ; RV64-NEXT: addi a1, a0, 128
1680 ; RV64-NEXT: li a2, 32
1681 ; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma
1682 ; RV64-NEXT: vle32.v v8, (a0)
1683 ; RV64-NEXT: addi a0, sp, 16
1684 ; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
1685 ; RV64-NEXT: vle32.v v16, (a1)
1686 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma
1687 ; RV64-NEXT: vslidedown.vi v24, v8, 16
1688 ; RV64-NEXT: vslidedown.vi v0, v16, 16
1689 ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
1690 ; RV64-NEXT: vmv4r.v v8, v0
1691 ; RV64-NEXT: vwaddu.vv v0, v24, v8
1692 ; RV64-NEXT: csrr a0, vlenb
1693 ; RV64-NEXT: slli a0, a0, 3
1694 ; RV64-NEXT: add a0, sp, a0
1695 ; RV64-NEXT: addi a0, a0, 16
1696 ; RV64-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
1697 ; RV64-NEXT: addi a0, sp, 16
1698 ; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
1699 ; RV64-NEXT: vwaddu.vv v0, v8, v16
1700 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
1701 ; RV64-NEXT: csrr a0, vlenb
1702 ; RV64-NEXT: slli a0, a0, 3
1703 ; RV64-NEXT: add a0, sp, a0
1704 ; RV64-NEXT: addi a0, a0, 16
1705 ; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
1706 ; RV64-NEXT: vadd.vv v8, v0, v8
1707 ; RV64-NEXT: vmv.s.x v16, zero
1708 ; RV64-NEXT: vredsum.vs v8, v8, v16
1709 ; RV64-NEXT: vmv.x.s a0, v8
1710 ; RV64-NEXT: csrr a1, vlenb
1711 ; RV64-NEXT: slli a1, a1, 4
1712 ; RV64-NEXT: add sp, sp, a1
1713 ; RV64-NEXT: addi sp, sp, 16
1715 %v = load <64 x i32>, ptr %x
1716 %e = zext <64 x i32> %v to <64 x i64>
1717 %red = call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> %e)
1718 ret i64 %red
1719 }
1721 declare i8 @llvm.vector.reduce.and.v1i8(<1 x i8>)
1723 define i8 @vreduce_and_v1i8(ptr %x) {
1724 ; CHECK-LABEL: vreduce_and_v1i8:
1726 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
1727 ; CHECK-NEXT: vle8.v v8, (a0)
1728 ; CHECK-NEXT: vmv.x.s a0, v8
1730 %v = load <1 x i8>, ptr %x
1731 %red = call i8 @llvm.vector.reduce.and.v1i8(<1 x i8> %v)
1732 ret i8 %red
1733 }
1735 declare i8 @llvm.vector.reduce.and.v2i8(<2 x i8>)
1737 define i8 @vreduce_and_v2i8(ptr %x) {
1738 ; CHECK-LABEL: vreduce_and_v2i8:
1740 ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
1741 ; CHECK-NEXT: vle8.v v8, (a0)
1742 ; CHECK-NEXT: vredand.vs v8, v8, v8
1743 ; CHECK-NEXT: vmv.x.s a0, v8
1745 %v = load <2 x i8>, ptr %x
1746 %red = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> %v)
1747 ret i8 %red
1748 }
1750 declare i8 @llvm.vector.reduce.and.v4i8(<4 x i8>)
1752 define i8 @vreduce_and_v4i8(ptr %x) {
1753 ; CHECK-LABEL: vreduce_and_v4i8:
1755 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
1756 ; CHECK-NEXT: vle8.v v8, (a0)
1757 ; CHECK-NEXT: vredand.vs v8, v8, v8
1758 ; CHECK-NEXT: vmv.x.s a0, v8
1760 %v = load <4 x i8>, ptr %x
1761 %red = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> %v)
1762 ret i8 %red
1763 }
1765 declare i8 @llvm.vector.reduce.and.v8i8(<8 x i8>)
1767 define i8 @vreduce_and_v8i8(ptr %x) {
1768 ; CHECK-LABEL: vreduce_and_v8i8:
1770 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
1771 ; CHECK-NEXT: vle8.v v8, (a0)
1772 ; CHECK-NEXT: vredand.vs v8, v8, v8
1773 ; CHECK-NEXT: vmv.x.s a0, v8
1775 %v = load <8 x i8>, ptr %x
1776 %red = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %v)
1777 ret i8 %red
1778 }
1780 declare i8 @llvm.vector.reduce.and.v16i8(<16 x i8>)
1782 define i8 @vreduce_and_v16i8(ptr %x) {
1783 ; CHECK-LABEL: vreduce_and_v16i8:
1785 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
1786 ; CHECK-NEXT: vle8.v v8, (a0)
1787 ; CHECK-NEXT: vredand.vs v8, v8, v8
1788 ; CHECK-NEXT: vmv.x.s a0, v8
1790 %v = load <16 x i8>, ptr %x
1791 %red = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %v)
1792 ret i8 %red
1793 }
1795 declare i8 @llvm.vector.reduce.and.v32i8(<32 x i8>)
1797 define i8 @vreduce_and_v32i8(ptr %x) {
1798 ; CHECK-LABEL: vreduce_and_v32i8:
1800 ; CHECK-NEXT: li a1, 32
1801 ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
1802 ; CHECK-NEXT: vle8.v v8, (a0)
1803 ; CHECK-NEXT: vredand.vs v8, v8, v8
1804 ; CHECK-NEXT: vmv.x.s a0, v8
1806 %v = load <32 x i8>, ptr %x
1807 %red = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %v)
1808 ret i8 %red
1809 }
1811 declare i8 @llvm.vector.reduce.and.v64i8(<64 x i8>)
1813 define i8 @vreduce_and_v64i8(ptr %x) {
1814 ; CHECK-LABEL: vreduce_and_v64i8:
1816 ; CHECK-NEXT: li a1, 64
1817 ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
1818 ; CHECK-NEXT: vle8.v v8, (a0)
1819 ; CHECK-NEXT: vredand.vs v8, v8, v8
1820 ; CHECK-NEXT: vmv.x.s a0, v8
1822 %v = load <64 x i8>, ptr %x
1823 %red = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> %v)
1824 ret i8 %red
1825 }
1827 declare i8 @llvm.vector.reduce.and.v128i8(<128 x i8>)
1829 define i8 @vreduce_and_v128i8(ptr %x) {
1830 ; CHECK-LABEL: vreduce_and_v128i8:
1832 ; CHECK-NEXT: li a1, 128
1833 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
1834 ; CHECK-NEXT: vle8.v v8, (a0)
1835 ; CHECK-NEXT: vredand.vs v8, v8, v8
1836 ; CHECK-NEXT: vmv.x.s a0, v8
1838 %v = load <128 x i8>, ptr %x
1839 %red = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> %v)
1840 ret i8 %red
1841 }
1843 declare i8 @llvm.vector.reduce.and.v256i8(<256 x i8>)
1845 define i8 @vreduce_and_v256i8(ptr %x) {
1846 ; CHECK-LABEL: vreduce_and_v256i8:
1848 ; CHECK-NEXT: li a1, 128
1849 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
1850 ; CHECK-NEXT: vle8.v v8, (a0)
1851 ; CHECK-NEXT: addi a0, a0, 128
1852 ; CHECK-NEXT: vle8.v v16, (a0)
1853 ; CHECK-NEXT: vand.vv v8, v8, v16
1854 ; CHECK-NEXT: vredand.vs v8, v8, v8
1855 ; CHECK-NEXT: vmv.x.s a0, v8
1857 %v = load <256 x i8>, ptr %x
1858 %red = call i8 @llvm.vector.reduce.and.v256i8(<256 x i8> %v)
1859 ret i8 %red
1860 }
1862 declare i16 @llvm.vector.reduce.and.v1i16(<1 x i16>)
1864 define i16 @vreduce_and_v1i16(ptr %x) {
1865 ; CHECK-LABEL: vreduce_and_v1i16:
1867 ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
1868 ; CHECK-NEXT: vle16.v v8, (a0)
1869 ; CHECK-NEXT: vmv.x.s a0, v8
1871 %v = load <1 x i16>, ptr %x
1872 %red = call i16 @llvm.vector.reduce.and.v1i16(<1 x i16> %v)
1876 declare i16 @llvm.vector.reduce.and.v2i16(<2 x i16>)
1878 define i16 @vreduce_and_v2i16(ptr %x) {
1879 ; CHECK-LABEL: vreduce_and_v2i16:
1881 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
1882 ; CHECK-NEXT: vle16.v v8, (a0)
1883 ; CHECK-NEXT: vredand.vs v8, v8, v8
1884 ; CHECK-NEXT: vmv.x.s a0, v8
1886 %v = load <2 x i16>, ptr %x
1887 %red = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> %v)
1891 declare i16 @llvm.vector.reduce.and.v4i16(<4 x i16>)
1893 define i16 @vreduce_and_v4i16(ptr %x) {
1894 ; CHECK-LABEL: vreduce_and_v4i16:
1896 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
1897 ; CHECK-NEXT: vle16.v v8, (a0)
1898 ; CHECK-NEXT: vredand.vs v8, v8, v8
1899 ; CHECK-NEXT: vmv.x.s a0, v8
1901 %v = load <4 x i16>, ptr %x
1902 %red = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %v)
1906 declare i16 @llvm.vector.reduce.and.v8i16(<8 x i16>)
1908 define i16 @vreduce_and_v8i16(ptr %x) {
1909 ; CHECK-LABEL: vreduce_and_v8i16:
1911 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
1912 ; CHECK-NEXT: vle16.v v8, (a0)
1913 ; CHECK-NEXT: vredand.vs v8, v8, v8
1914 ; CHECK-NEXT: vmv.x.s a0, v8
1916 %v = load <8 x i16>, ptr %x
1917 %red = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %v)
1921 declare i16 @llvm.vector.reduce.and.v16i16(<16 x i16>)
1923 define i16 @vreduce_and_v16i16(ptr %x) {
1924 ; CHECK-LABEL: vreduce_and_v16i16:
1926 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
1927 ; CHECK-NEXT: vle16.v v8, (a0)
1928 ; CHECK-NEXT: vredand.vs v8, v8, v8
1929 ; CHECK-NEXT: vmv.x.s a0, v8
1931 %v = load <16 x i16>, ptr %x
1932 %red = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %v)
1936 declare i16 @llvm.vector.reduce.and.v32i16(<32 x i16>)
1938 define i16 @vreduce_and_v32i16(ptr %x) {
1939 ; CHECK-LABEL: vreduce_and_v32i16:
1941 ; CHECK-NEXT: li a1, 32
1942 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
1943 ; CHECK-NEXT: vle16.v v8, (a0)
1944 ; CHECK-NEXT: vredand.vs v8, v8, v8
1945 ; CHECK-NEXT: vmv.x.s a0, v8
1947 %v = load <32 x i16>, ptr %x
1948 %red = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> %v)
1952 declare i16 @llvm.vector.reduce.and.v64i16(<64 x i16>)
1954 define i16 @vreduce_and_v64i16(ptr %x) {
1955 ; CHECK-LABEL: vreduce_and_v64i16:
1957 ; CHECK-NEXT: li a1, 64
1958 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
1959 ; CHECK-NEXT: vle16.v v8, (a0)
1960 ; CHECK-NEXT: vredand.vs v8, v8, v8
1961 ; CHECK-NEXT: vmv.x.s a0, v8
1963 %v = load <64 x i16>, ptr %x
1964 %red = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> %v)
1968 declare i16 @llvm.vector.reduce.and.v128i16(<128 x i16>)
1970 define i16 @vreduce_and_v128i16(ptr %x) {
1971 ; CHECK-LABEL: vreduce_and_v128i16:
1973 ; CHECK-NEXT: li a1, 64
1974 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
1975 ; CHECK-NEXT: vle16.v v8, (a0)
1976 ; CHECK-NEXT: addi a0, a0, 128
1977 ; CHECK-NEXT: vle16.v v16, (a0)
1978 ; CHECK-NEXT: vand.vv v8, v8, v16
1979 ; CHECK-NEXT: vredand.vs v8, v8, v8
1980 ; CHECK-NEXT: vmv.x.s a0, v8
1982 %v = load <128 x i16>, ptr %x
1983 %red = call i16 @llvm.vector.reduce.and.v128i16(<128 x i16> %v)
1987 declare i32 @llvm.vector.reduce.and.v1i32(<1 x i32>)
1989 define i32 @vreduce_and_v1i32(ptr %x) {
1990 ; CHECK-LABEL: vreduce_and_v1i32:
1992 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
1993 ; CHECK-NEXT: vle32.v v8, (a0)
1994 ; CHECK-NEXT: vmv.x.s a0, v8
1996 %v = load <1 x i32>, ptr %x
1997 %red = call i32 @llvm.vector.reduce.and.v1i32(<1 x i32> %v)
2001 declare i32 @llvm.vector.reduce.and.v2i32(<2 x i32>)
2003 define i32 @vreduce_and_v2i32(ptr %x) {
2004 ; CHECK-LABEL: vreduce_and_v2i32:
2006 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
2007 ; CHECK-NEXT: vle32.v v8, (a0)
2008 ; CHECK-NEXT: vredand.vs v8, v8, v8
2009 ; CHECK-NEXT: vmv.x.s a0, v8
2011 %v = load <2 x i32>, ptr %x
2012 %red = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %v)
2016 declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>)
2018 define i32 @vreduce_and_v4i32(ptr %x) {
2019 ; CHECK-LABEL: vreduce_and_v4i32:
2021 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
2022 ; CHECK-NEXT: vle32.v v8, (a0)
2023 ; CHECK-NEXT: vredand.vs v8, v8, v8
2024 ; CHECK-NEXT: vmv.x.s a0, v8
2026 %v = load <4 x i32>, ptr %x
2027 %red = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %v)
2031 declare i32 @llvm.vector.reduce.and.v8i32(<8 x i32>)
2033 define i32 @vreduce_and_v8i32(ptr %x) {
2034 ; CHECK-LABEL: vreduce_and_v8i32:
2036 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
2037 ; CHECK-NEXT: vle32.v v8, (a0)
2038 ; CHECK-NEXT: vredand.vs v8, v8, v8
2039 ; CHECK-NEXT: vmv.x.s a0, v8
2041 %v = load <8 x i32>, ptr %x
2042 %red = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %v)
2046 declare i32 @llvm.vector.reduce.and.v16i32(<16 x i32>)
2048 define i32 @vreduce_and_v16i32(ptr %x) {
2049 ; CHECK-LABEL: vreduce_and_v16i32:
2051 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
2052 ; CHECK-NEXT: vle32.v v8, (a0)
2053 ; CHECK-NEXT: vredand.vs v8, v8, v8
2054 ; CHECK-NEXT: vmv.x.s a0, v8
2056 %v = load <16 x i32>, ptr %x
2057 %red = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %v)
2061 declare i32 @llvm.vector.reduce.and.v32i32(<32 x i32>)
2063 define i32 @vreduce_and_v32i32(ptr %x) {
2064 ; CHECK-LABEL: vreduce_and_v32i32:
2066 ; CHECK-NEXT: li a1, 32
2067 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
2068 ; CHECK-NEXT: vle32.v v8, (a0)
2069 ; CHECK-NEXT: vredand.vs v8, v8, v8
2070 ; CHECK-NEXT: vmv.x.s a0, v8
2072 %v = load <32 x i32>, ptr %x
2073 %red = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> %v)
2077 declare i32 @llvm.vector.reduce.and.v64i32(<64 x i32>)
2079 define i32 @vreduce_and_v64i32(ptr %x) {
2080 ; CHECK-LABEL: vreduce_and_v64i32:
2082 ; CHECK-NEXT: li a1, 32
2083 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
2084 ; CHECK-NEXT: vle32.v v8, (a0)
2085 ; CHECK-NEXT: addi a0, a0, 128
2086 ; CHECK-NEXT: vle32.v v16, (a0)
2087 ; CHECK-NEXT: vand.vv v8, v8, v16
2088 ; CHECK-NEXT: vredand.vs v8, v8, v8
2089 ; CHECK-NEXT: vmv.x.s a0, v8
2091 %v = load <64 x i32>, ptr %x
2092 %red = call i32 @llvm.vector.reduce.and.v64i32(<64 x i32> %v)
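; For the i64 reductions, RV32 returns the result in a0/a1: the low word comes
; from vmv.x.s and the high word from a 32-bit vsrl.vx followed by another
; vmv.x.s, while RV64 reads the whole 64-bit result with a single vmv.x.s.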
2096 declare i64 @llvm.vector.reduce.and.v1i64(<1 x i64>)
2098 define i64 @vreduce_and_v1i64(ptr %x) {
2099 ; RV32-LABEL: vreduce_and_v1i64:
2101 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
2102 ; RV32-NEXT: vle64.v v8, (a0)
2103 ; RV32-NEXT: li a0, 32
2104 ; RV32-NEXT: vsrl.vx v9, v8, a0
2105 ; RV32-NEXT: vmv.x.s a1, v9
2106 ; RV32-NEXT: vmv.x.s a0, v8
2109 ; RV64-LABEL: vreduce_and_v1i64:
2111 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
2112 ; RV64-NEXT: vle64.v v8, (a0)
2113 ; RV64-NEXT: vmv.x.s a0, v8
2115 %v = load <1 x i64>, ptr %x
2116 %red = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> %v)
2120 declare i64 @llvm.vector.reduce.and.v2i64(<2 x i64>)
2122 define i64 @vreduce_and_v2i64(ptr %x) {
2123 ; RV32-LABEL: vreduce_and_v2i64:
2125 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
2126 ; RV32-NEXT: vle64.v v8, (a0)
2127 ; RV32-NEXT: vredand.vs v8, v8, v8
2128 ; RV32-NEXT: li a0, 32
2129 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
2130 ; RV32-NEXT: vsrl.vx v9, v8, a0
2131 ; RV32-NEXT: vmv.x.s a1, v9
2132 ; RV32-NEXT: vmv.x.s a0, v8
2135 ; RV64-LABEL: vreduce_and_v2i64:
2137 ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
2138 ; RV64-NEXT: vle64.v v8, (a0)
2139 ; RV64-NEXT: vredand.vs v8, v8, v8
2140 ; RV64-NEXT: vmv.x.s a0, v8
2142 %v = load <2 x i64>, ptr %x
2143 %red = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %v)
2147 declare i64 @llvm.vector.reduce.and.v4i64(<4 x i64>)
2149 define i64 @vreduce_and_v4i64(ptr %x) {
2150 ; RV32-LABEL: vreduce_and_v4i64:
2152 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
2153 ; RV32-NEXT: vle64.v v8, (a0)
2154 ; RV32-NEXT: vredand.vs v8, v8, v8
2155 ; RV32-NEXT: vmv.x.s a0, v8
2156 ; RV32-NEXT: li a1, 32
2157 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
2158 ; RV32-NEXT: vsrl.vx v8, v8, a1
2159 ; RV32-NEXT: vmv.x.s a1, v8
2162 ; RV64-LABEL: vreduce_and_v4i64:
2164 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
2165 ; RV64-NEXT: vle64.v v8, (a0)
2166 ; RV64-NEXT: vredand.vs v8, v8, v8
2167 ; RV64-NEXT: vmv.x.s a0, v8
2169 %v = load <4 x i64>, ptr %x
2170 %red = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %v)
2174 declare i64 @llvm.vector.reduce.and.v8i64(<8 x i64>)
2176 define i64 @vreduce_and_v8i64(ptr %x) {
2177 ; RV32-LABEL: vreduce_and_v8i64:
2179 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
2180 ; RV32-NEXT: vle64.v v8, (a0)
2181 ; RV32-NEXT: vredand.vs v8, v8, v8
2182 ; RV32-NEXT: vmv.x.s a0, v8
2183 ; RV32-NEXT: li a1, 32
2184 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
2185 ; RV32-NEXT: vsrl.vx v8, v8, a1
2186 ; RV32-NEXT: vmv.x.s a1, v8
2189 ; RV64-LABEL: vreduce_and_v8i64:
2191 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
2192 ; RV64-NEXT: vle64.v v8, (a0)
2193 ; RV64-NEXT: vredand.vs v8, v8, v8
2194 ; RV64-NEXT: vmv.x.s a0, v8
2196 %v = load <8 x i64>, ptr %x
2197 %red = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> %v)
2201 declare i64 @llvm.vector.reduce.and.v16i64(<16 x i64>)
2203 define i64 @vreduce_and_v16i64(ptr %x) {
2204 ; RV32-LABEL: vreduce_and_v16i64:
2206 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
2207 ; RV32-NEXT: vle64.v v8, (a0)
2208 ; RV32-NEXT: vredand.vs v8, v8, v8
2209 ; RV32-NEXT: vmv.x.s a0, v8
2210 ; RV32-NEXT: li a1, 32
2211 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
2212 ; RV32-NEXT: vsrl.vx v8, v8, a1
2213 ; RV32-NEXT: vmv.x.s a1, v8
2216 ; RV64-LABEL: vreduce_and_v16i64:
2218 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
2219 ; RV64-NEXT: vle64.v v8, (a0)
2220 ; RV64-NEXT: vredand.vs v8, v8, v8
2221 ; RV64-NEXT: vmv.x.s a0, v8
2223 %v = load <16 x i64>, ptr %x
2224 %red = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> %v)
2228 declare i64 @llvm.vector.reduce.and.v32i64(<32 x i64>)
2230 define i64 @vreduce_and_v32i64(ptr %x) {
2231 ; RV32-LABEL: vreduce_and_v32i64:
2233 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
2234 ; RV32-NEXT: vle64.v v8, (a0)
2235 ; RV32-NEXT: addi a0, a0, 128
2236 ; RV32-NEXT: vle64.v v16, (a0)
2237 ; RV32-NEXT: vand.vv v8, v8, v16
2238 ; RV32-NEXT: vredand.vs v8, v8, v8
2239 ; RV32-NEXT: vmv.x.s a0, v8
2240 ; RV32-NEXT: li a1, 32
2241 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
2242 ; RV32-NEXT: vsrl.vx v8, v8, a1
2243 ; RV32-NEXT: vmv.x.s a1, v8
2246 ; RV64-LABEL: vreduce_and_v32i64:
2248 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
2249 ; RV64-NEXT: vle64.v v8, (a0)
2250 ; RV64-NEXT: addi a0, a0, 128
2251 ; RV64-NEXT: vle64.v v16, (a0)
2252 ; RV64-NEXT: vand.vv v8, v8, v16
2253 ; RV64-NEXT: vredand.vs v8, v8, v8
2254 ; RV64-NEXT: vmv.x.s a0, v8
2256 %v = load <32 x i64>, ptr %x
2257 %red = call i64 @llvm.vector.reduce.and.v32i64(<32 x i64> %v)
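; <64 x i64> is lowered as four 16-element chunks at byte offsets 0, 128, 256
; and 384; the chunks are combined pairwise with vand.vv before the reduction.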
2261 declare i64 @llvm.vector.reduce.and.v64i64(<64 x i64>)
2263 define i64 @vreduce_and_v64i64(ptr %x) nounwind {
2264 ; RV32-LABEL: vreduce_and_v64i64:
2266 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
2267 ; RV32-NEXT: vle64.v v8, (a0)
2268 ; RV32-NEXT: addi a1, a0, 384
2269 ; RV32-NEXT: vle64.v v16, (a1)
2270 ; RV32-NEXT: addi a1, a0, 256
2271 ; RV32-NEXT: addi a0, a0, 128
2272 ; RV32-NEXT: vle64.v v24, (a0)
2273 ; RV32-NEXT: vle64.v v0, (a1)
2274 ; RV32-NEXT: vand.vv v16, v24, v16
2275 ; RV32-NEXT: vand.vv v8, v8, v0
2276 ; RV32-NEXT: vand.vv v8, v8, v16
2277 ; RV32-NEXT: vredand.vs v8, v8, v8
2278 ; RV32-NEXT: vmv.x.s a0, v8
2279 ; RV32-NEXT: li a1, 32
2280 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
2281 ; RV32-NEXT: vsrl.vx v8, v8, a1
2282 ; RV32-NEXT: vmv.x.s a1, v8
2285 ; RV64-LABEL: vreduce_and_v64i64:
2287 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
2288 ; RV64-NEXT: vle64.v v8, (a0)
2289 ; RV64-NEXT: addi a1, a0, 384
2290 ; RV64-NEXT: vle64.v v16, (a1)
2291 ; RV64-NEXT: addi a1, a0, 256
2292 ; RV64-NEXT: addi a0, a0, 128
2293 ; RV64-NEXT: vle64.v v24, (a0)
2294 ; RV64-NEXT: vle64.v v0, (a1)
2295 ; RV64-NEXT: vand.vv v16, v24, v16
2296 ; RV64-NEXT: vand.vv v8, v8, v0
2297 ; RV64-NEXT: vand.vv v8, v8, v16
2298 ; RV64-NEXT: vredand.vs v8, v8, v8
2299 ; RV64-NEXT: vmv.x.s a0, v8
2301 %v = load <64 x i64>, ptr %x
2302 %red = call i64 @llvm.vector.reduce.and.v64i64(<64 x i64> %v)
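; The or reductions below mirror the and cases above, using vredor.vs for the
; reduction and vor.vv to combine split halves of oversized vectors.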
2306 declare i8 @llvm.vector.reduce.or.v1i8(<1 x i8>)
2308 define i8 @vreduce_or_v1i8(ptr %x) {
2309 ; CHECK-LABEL: vreduce_or_v1i8:
2311 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
2312 ; CHECK-NEXT: vle8.v v8, (a0)
2313 ; CHECK-NEXT: vmv.x.s a0, v8
2315 %v = load <1 x i8>, ptr %x
2316 %red = call i8 @llvm.vector.reduce.or.v1i8(<1 x i8> %v)
2320 declare i8 @llvm.vector.reduce.or.v2i8(<2 x i8>)
2322 define i8 @vreduce_or_v2i8(ptr %x) {
2323 ; CHECK-LABEL: vreduce_or_v2i8:
2325 ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
2326 ; CHECK-NEXT: vle8.v v8, (a0)
2327 ; CHECK-NEXT: vredor.vs v8, v8, v8
2328 ; CHECK-NEXT: vmv.x.s a0, v8
2330 %v = load <2 x i8>, ptr %x
2331 %red = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> %v)
2335 declare i8 @llvm.vector.reduce.or.v4i8(<4 x i8>)
2337 define i8 @vreduce_or_v4i8(ptr %x) {
2338 ; CHECK-LABEL: vreduce_or_v4i8:
2340 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
2341 ; CHECK-NEXT: vle8.v v8, (a0)
2342 ; CHECK-NEXT: vredor.vs v8, v8, v8
2343 ; CHECK-NEXT: vmv.x.s a0, v8
2345 %v = load <4 x i8>, ptr %x
2346 %red = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> %v)
2350 declare i8 @llvm.vector.reduce.or.v8i8(<8 x i8>)
2352 define i8 @vreduce_or_v8i8(ptr %x) {
2353 ; CHECK-LABEL: vreduce_or_v8i8:
2355 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
2356 ; CHECK-NEXT: vle8.v v8, (a0)
2357 ; CHECK-NEXT: vredor.vs v8, v8, v8
2358 ; CHECK-NEXT: vmv.x.s a0, v8
2360 %v = load <8 x i8>, ptr %x
2361 %red = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %v)
2365 declare i8 @llvm.vector.reduce.or.v16i8(<16 x i8>)
2367 define i8 @vreduce_or_v16i8(ptr %x) {
2368 ; CHECK-LABEL: vreduce_or_v16i8:
2370 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
2371 ; CHECK-NEXT: vle8.v v8, (a0)
2372 ; CHECK-NEXT: vredor.vs v8, v8, v8
2373 ; CHECK-NEXT: vmv.x.s a0, v8
2375 %v = load <16 x i8>, ptr %x
2376 %red = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %v)
2380 declare i8 @llvm.vector.reduce.or.v32i8(<32 x i8>)
2382 define i8 @vreduce_or_v32i8(ptr %x) {
2383 ; CHECK-LABEL: vreduce_or_v32i8:
2385 ; CHECK-NEXT: li a1, 32
2386 ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
2387 ; CHECK-NEXT: vle8.v v8, (a0)
2388 ; CHECK-NEXT: vredor.vs v8, v8, v8
2389 ; CHECK-NEXT: vmv.x.s a0, v8
2391 %v = load <32 x i8>, ptr %x
2392 %red = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %v)
2396 declare i8 @llvm.vector.reduce.or.v64i8(<64 x i8>)
2398 define i8 @vreduce_or_v64i8(ptr %x) {
2399 ; CHECK-LABEL: vreduce_or_v64i8:
2401 ; CHECK-NEXT: li a1, 64
2402 ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
2403 ; CHECK-NEXT: vle8.v v8, (a0)
2404 ; CHECK-NEXT: vredor.vs v8, v8, v8
2405 ; CHECK-NEXT: vmv.x.s a0, v8
2407 %v = load <64 x i8>, ptr %x
2408 %red = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> %v)
2412 declare i8 @llvm.vector.reduce.or.v128i8(<128 x i8>)
2414 define i8 @vreduce_or_v128i8(ptr %x) {
2415 ; CHECK-LABEL: vreduce_or_v128i8:
2417 ; CHECK-NEXT: li a1, 128
2418 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
2419 ; CHECK-NEXT: vle8.v v8, (a0)
2420 ; CHECK-NEXT: vredor.vs v8, v8, v8
2421 ; CHECK-NEXT: vmv.x.s a0, v8
2423 %v = load <128 x i8>, ptr %x
2424 %red = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> %v)
2428 declare i8 @llvm.vector.reduce.or.v256i8(<256 x i8>)
2430 define i8 @vreduce_or_v256i8(ptr %x) {
2431 ; CHECK-LABEL: vreduce_or_v256i8:
2433 ; CHECK-NEXT: li a1, 128
2434 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
2435 ; CHECK-NEXT: vle8.v v8, (a0)
2436 ; CHECK-NEXT: addi a0, a0, 128
2437 ; CHECK-NEXT: vle8.v v16, (a0)
2438 ; CHECK-NEXT: vor.vv v8, v8, v16
2439 ; CHECK-NEXT: vredor.vs v8, v8, v8
2440 ; CHECK-NEXT: vmv.x.s a0, v8
2442 %v = load <256 x i8>, ptr %x
2443 %red = call i8 @llvm.vector.reduce.or.v256i8(<256 x i8> %v)
2447 declare i16 @llvm.vector.reduce.or.v1i16(<1 x i16>)
2449 define i16 @vreduce_or_v1i16(ptr %x) {
2450 ; CHECK-LABEL: vreduce_or_v1i16:
2452 ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
2453 ; CHECK-NEXT: vle16.v v8, (a0)
2454 ; CHECK-NEXT: vmv.x.s a0, v8
2456 %v = load <1 x i16>, ptr %x
2457 %red = call i16 @llvm.vector.reduce.or.v1i16(<1 x i16> %v)
2461 declare i16 @llvm.vector.reduce.or.v2i16(<2 x i16>)
2463 define i16 @vreduce_or_v2i16(ptr %x) {
2464 ; CHECK-LABEL: vreduce_or_v2i16:
2466 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
2467 ; CHECK-NEXT: vle16.v v8, (a0)
2468 ; CHECK-NEXT: vredor.vs v8, v8, v8
2469 ; CHECK-NEXT: vmv.x.s a0, v8
2471 %v = load <2 x i16>, ptr %x
2472 %red = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> %v)
2476 declare i16 @llvm.vector.reduce.or.v4i16(<4 x i16>)
2478 define i16 @vreduce_or_v4i16(ptr %x) {
2479 ; CHECK-LABEL: vreduce_or_v4i16:
2481 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
2482 ; CHECK-NEXT: vle16.v v8, (a0)
2483 ; CHECK-NEXT: vredor.vs v8, v8, v8
2484 ; CHECK-NEXT: vmv.x.s a0, v8
2486 %v = load <4 x i16>, ptr %x
2487 %red = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %v)
2491 declare i16 @llvm.vector.reduce.or.v8i16(<8 x i16>)
2493 define i16 @vreduce_or_v8i16(ptr %x) {
2494 ; CHECK-LABEL: vreduce_or_v8i16:
2496 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
2497 ; CHECK-NEXT: vle16.v v8, (a0)
2498 ; CHECK-NEXT: vredor.vs v8, v8, v8
2499 ; CHECK-NEXT: vmv.x.s a0, v8
2501 %v = load <8 x i16>, ptr %x
2502 %red = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %v)
2506 declare i16 @llvm.vector.reduce.or.v16i16(<16 x i16>)
2508 define i16 @vreduce_or_v16i16(ptr %x) {
2509 ; CHECK-LABEL: vreduce_or_v16i16:
2511 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
2512 ; CHECK-NEXT: vle16.v v8, (a0)
2513 ; CHECK-NEXT: vredor.vs v8, v8, v8
2514 ; CHECK-NEXT: vmv.x.s a0, v8
2516 %v = load <16 x i16>, ptr %x
2517 %red = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %v)
2521 declare i16 @llvm.vector.reduce.or.v32i16(<32 x i16>)
2523 define i16 @vreduce_or_v32i16(ptr %x) {
2524 ; CHECK-LABEL: vreduce_or_v32i16:
2526 ; CHECK-NEXT: li a1, 32
2527 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
2528 ; CHECK-NEXT: vle16.v v8, (a0)
2529 ; CHECK-NEXT: vredor.vs v8, v8, v8
2530 ; CHECK-NEXT: vmv.x.s a0, v8
2532 %v = load <32 x i16>, ptr %x
2533 %red = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> %v)
2537 declare i16 @llvm.vector.reduce.or.v64i16(<64 x i16>)
2539 define i16 @vreduce_or_v64i16(ptr %x) {
2540 ; CHECK-LABEL: vreduce_or_v64i16:
2542 ; CHECK-NEXT: li a1, 64
2543 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
2544 ; CHECK-NEXT: vle16.v v8, (a0)
2545 ; CHECK-NEXT: vredor.vs v8, v8, v8
2546 ; CHECK-NEXT: vmv.x.s a0, v8
2548 %v = load <64 x i16>, ptr %x
2549 %red = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> %v)
2553 declare i16 @llvm.vector.reduce.or.v128i16(<128 x i16>)
2555 define i16 @vreduce_or_v128i16(ptr %x) {
2556 ; CHECK-LABEL: vreduce_or_v128i16:
2558 ; CHECK-NEXT: li a1, 64
2559 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
2560 ; CHECK-NEXT: vle16.v v8, (a0)
2561 ; CHECK-NEXT: addi a0, a0, 128
2562 ; CHECK-NEXT: vle16.v v16, (a0)
2563 ; CHECK-NEXT: vor.vv v8, v8, v16
2564 ; CHECK-NEXT: vredor.vs v8, v8, v8
2565 ; CHECK-NEXT: vmv.x.s a0, v8
2567 %v = load <128 x i16>, ptr %x
2568 %red = call i16 @llvm.vector.reduce.or.v128i16(<128 x i16> %v)
2572 declare i32 @llvm.vector.reduce.or.v1i32(<1 x i32>)
2574 define i32 @vreduce_or_v1i32(ptr %x) {
2575 ; CHECK-LABEL: vreduce_or_v1i32:
2577 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
2578 ; CHECK-NEXT: vle32.v v8, (a0)
2579 ; CHECK-NEXT: vmv.x.s a0, v8
2581 %v = load <1 x i32>, ptr %x
2582 %red = call i32 @llvm.vector.reduce.or.v1i32(<1 x i32> %v)
2586 declare i32 @llvm.vector.reduce.or.v2i32(<2 x i32>)
2588 define i32 @vreduce_or_v2i32(ptr %x) {
2589 ; CHECK-LABEL: vreduce_or_v2i32:
2591 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
2592 ; CHECK-NEXT: vle32.v v8, (a0)
2593 ; CHECK-NEXT: vredor.vs v8, v8, v8
2594 ; CHECK-NEXT: vmv.x.s a0, v8
2596 %v = load <2 x i32>, ptr %x
2597 %red = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %v)
2601 declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>)
2603 define i32 @vreduce_or_v4i32(ptr %x) {
2604 ; CHECK-LABEL: vreduce_or_v4i32:
2606 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
2607 ; CHECK-NEXT: vle32.v v8, (a0)
2608 ; CHECK-NEXT: vredor.vs v8, v8, v8
2609 ; CHECK-NEXT: vmv.x.s a0, v8
2611 %v = load <4 x i32>, ptr %x
2612 %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %v)
2616 declare i32 @llvm.vector.reduce.or.v8i32(<8 x i32>)
2618 define i32 @vreduce_or_v8i32(ptr %x) {
2619 ; CHECK-LABEL: vreduce_or_v8i32:
2621 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
2622 ; CHECK-NEXT: vle32.v v8, (a0)
2623 ; CHECK-NEXT: vredor.vs v8, v8, v8
2624 ; CHECK-NEXT: vmv.x.s a0, v8
2626 %v = load <8 x i32>, ptr %x
2627 %red = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %v)
2631 declare i32 @llvm.vector.reduce.or.v16i32(<16 x i32>)
2633 define i32 @vreduce_or_v16i32(ptr %x) {
2634 ; CHECK-LABEL: vreduce_or_v16i32:
2636 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
2637 ; CHECK-NEXT: vle32.v v8, (a0)
2638 ; CHECK-NEXT: vredor.vs v8, v8, v8
2639 ; CHECK-NEXT: vmv.x.s a0, v8
2641 %v = load <16 x i32>, ptr %x
2642 %red = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %v)
2646 declare i32 @llvm.vector.reduce.or.v32i32(<32 x i32>)
2648 define i32 @vreduce_or_v32i32(ptr %x) {
2649 ; CHECK-LABEL: vreduce_or_v32i32:
2651 ; CHECK-NEXT: li a1, 32
2652 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
2653 ; CHECK-NEXT: vle32.v v8, (a0)
2654 ; CHECK-NEXT: vredor.vs v8, v8, v8
2655 ; CHECK-NEXT: vmv.x.s a0, v8
2657 %v = load <32 x i32>, ptr %x
2658 %red = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> %v)
2662 declare i32 @llvm.vector.reduce.or.v64i32(<64 x i32>)
2664 define i32 @vreduce_or_v64i32(ptr %x) {
2665 ; CHECK-LABEL: vreduce_or_v64i32:
2667 ; CHECK-NEXT: li a1, 32
2668 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
2669 ; CHECK-NEXT: vle32.v v8, (a0)
2670 ; CHECK-NEXT: addi a0, a0, 128
2671 ; CHECK-NEXT: vle32.v v16, (a0)
2672 ; CHECK-NEXT: vor.vv v8, v8, v16
2673 ; CHECK-NEXT: vredor.vs v8, v8, v8
2674 ; CHECK-NEXT: vmv.x.s a0, v8
2676 %v = load <64 x i32>, ptr %x
2677 %red = call i32 @llvm.vector.reduce.or.v64i32(<64 x i32> %v)
2681 declare i64 @llvm.vector.reduce.or.v1i64(<1 x i64>)
2683 define i64 @vreduce_or_v1i64(ptr %x) {
2684 ; RV32-LABEL: vreduce_or_v1i64:
2686 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
2687 ; RV32-NEXT: vle64.v v8, (a0)
2688 ; RV32-NEXT: li a0, 32
2689 ; RV32-NEXT: vsrl.vx v9, v8, a0
2690 ; RV32-NEXT: vmv.x.s a1, v9
2691 ; RV32-NEXT: vmv.x.s a0, v8
2694 ; RV64-LABEL: vreduce_or_v1i64:
2696 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
2697 ; RV64-NEXT: vle64.v v8, (a0)
2698 ; RV64-NEXT: vmv.x.s a0, v8
2700 %v = load <1 x i64>, ptr %x
2701 %red = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> %v)
2705 declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>)
2707 define i64 @vreduce_or_v2i64(ptr %x) {
2708 ; RV32-LABEL: vreduce_or_v2i64:
2710 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
2711 ; RV32-NEXT: vle64.v v8, (a0)
2712 ; RV32-NEXT: vredor.vs v8, v8, v8
2713 ; RV32-NEXT: li a0, 32
2714 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
2715 ; RV32-NEXT: vsrl.vx v9, v8, a0
2716 ; RV32-NEXT: vmv.x.s a1, v9
2717 ; RV32-NEXT: vmv.x.s a0, v8
2720 ; RV64-LABEL: vreduce_or_v2i64:
2722 ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
2723 ; RV64-NEXT: vle64.v v8, (a0)
2724 ; RV64-NEXT: vredor.vs v8, v8, v8
2725 ; RV64-NEXT: vmv.x.s a0, v8
2727 %v = load <2 x i64>, ptr %x
2728 %red = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %v)
2732 declare i64 @llvm.vector.reduce.or.v4i64(<4 x i64>)
2734 define i64 @vreduce_or_v4i64(ptr %x) {
2735 ; RV32-LABEL: vreduce_or_v4i64:
2737 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
2738 ; RV32-NEXT: vle64.v v8, (a0)
2739 ; RV32-NEXT: vredor.vs v8, v8, v8
2740 ; RV32-NEXT: vmv.x.s a0, v8
2741 ; RV32-NEXT: li a1, 32
2742 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
2743 ; RV32-NEXT: vsrl.vx v8, v8, a1
2744 ; RV32-NEXT: vmv.x.s a1, v8
2747 ; RV64-LABEL: vreduce_or_v4i64:
2749 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
2750 ; RV64-NEXT: vle64.v v8, (a0)
2751 ; RV64-NEXT: vredor.vs v8, v8, v8
2752 ; RV64-NEXT: vmv.x.s a0, v8
2754 %v = load <4 x i64>, ptr %x
2755 %red = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %v)
2759 declare i64 @llvm.vector.reduce.or.v8i64(<8 x i64>)
2761 define i64 @vreduce_or_v8i64(ptr %x) {
2762 ; RV32-LABEL: vreduce_or_v8i64:
2764 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
2765 ; RV32-NEXT: vle64.v v8, (a0)
2766 ; RV32-NEXT: vredor.vs v8, v8, v8
2767 ; RV32-NEXT: vmv.x.s a0, v8
2768 ; RV32-NEXT: li a1, 32
2769 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
2770 ; RV32-NEXT: vsrl.vx v8, v8, a1
2771 ; RV32-NEXT: vmv.x.s a1, v8
2774 ; RV64-LABEL: vreduce_or_v8i64:
2776 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
2777 ; RV64-NEXT: vle64.v v8, (a0)
2778 ; RV64-NEXT: vredor.vs v8, v8, v8
2779 ; RV64-NEXT: vmv.x.s a0, v8
2781 %v = load <8 x i64>, ptr %x
2782 %red = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> %v)
2786 declare i64 @llvm.vector.reduce.or.v16i64(<16 x i64>)
2788 define i64 @vreduce_or_v16i64(ptr %x) {
2789 ; RV32-LABEL: vreduce_or_v16i64:
2791 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
2792 ; RV32-NEXT: vle64.v v8, (a0)
2793 ; RV32-NEXT: vredor.vs v8, v8, v8
2794 ; RV32-NEXT: vmv.x.s a0, v8
2795 ; RV32-NEXT: li a1, 32
2796 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
2797 ; RV32-NEXT: vsrl.vx v8, v8, a1
2798 ; RV32-NEXT: vmv.x.s a1, v8
2801 ; RV64-LABEL: vreduce_or_v16i64:
2803 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
2804 ; RV64-NEXT: vle64.v v8, (a0)
2805 ; RV64-NEXT: vredor.vs v8, v8, v8
2806 ; RV64-NEXT: vmv.x.s a0, v8
2808 %v = load <16 x i64>, ptr %x
2809 %red = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> %v)
2813 declare i64 @llvm.vector.reduce.or.v32i64(<32 x i64>)
2815 define i64 @vreduce_or_v32i64(ptr %x) {
2816 ; RV32-LABEL: vreduce_or_v32i64:
2818 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
2819 ; RV32-NEXT: vle64.v v8, (a0)
2820 ; RV32-NEXT: addi a0, a0, 128
2821 ; RV32-NEXT: vle64.v v16, (a0)
2822 ; RV32-NEXT: vor.vv v8, v8, v16
2823 ; RV32-NEXT: vredor.vs v8, v8, v8
2824 ; RV32-NEXT: vmv.x.s a0, v8
2825 ; RV32-NEXT: li a1, 32
2826 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
2827 ; RV32-NEXT: vsrl.vx v8, v8, a1
2828 ; RV32-NEXT: vmv.x.s a1, v8
2831 ; RV64-LABEL: vreduce_or_v32i64:
2833 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
2834 ; RV64-NEXT: vle64.v v8, (a0)
2835 ; RV64-NEXT: addi a0, a0, 128
2836 ; RV64-NEXT: vle64.v v16, (a0)
2837 ; RV64-NEXT: vor.vv v8, v8, v16
2838 ; RV64-NEXT: vredor.vs v8, v8, v8
2839 ; RV64-NEXT: vmv.x.s a0, v8
2841 %v = load <32 x i64>, ptr %x
2842 %red = call i64 @llvm.vector.reduce.or.v32i64(<32 x i64> %v)
2846 declare i64 @llvm.vector.reduce.or.v64i64(<64 x i64>)
2848 define i64 @vreduce_or_v64i64(ptr %x) nounwind {
2849 ; RV32-LABEL: vreduce_or_v64i64:
2851 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
2852 ; RV32-NEXT: vle64.v v8, (a0)
2853 ; RV32-NEXT: addi a1, a0, 384
2854 ; RV32-NEXT: vle64.v v16, (a1)
2855 ; RV32-NEXT: addi a1, a0, 256
2856 ; RV32-NEXT: addi a0, a0, 128
2857 ; RV32-NEXT: vle64.v v24, (a0)
2858 ; RV32-NEXT: vle64.v v0, (a1)
2859 ; RV32-NEXT: vor.vv v16, v24, v16
2860 ; RV32-NEXT: vor.vv v8, v8, v0
2861 ; RV32-NEXT: vor.vv v8, v8, v16
2862 ; RV32-NEXT: vredor.vs v8, v8, v8
2863 ; RV32-NEXT: vmv.x.s a0, v8
2864 ; RV32-NEXT: li a1, 32
2865 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
2866 ; RV32-NEXT: vsrl.vx v8, v8, a1
2867 ; RV32-NEXT: vmv.x.s a1, v8
2870 ; RV64-LABEL: vreduce_or_v64i64:
2872 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
2873 ; RV64-NEXT: vle64.v v8, (a0)
2874 ; RV64-NEXT: addi a1, a0, 384
2875 ; RV64-NEXT: vle64.v v16, (a1)
2876 ; RV64-NEXT: addi a1, a0, 256
2877 ; RV64-NEXT: addi a0, a0, 128
2878 ; RV64-NEXT: vle64.v v24, (a0)
2879 ; RV64-NEXT: vle64.v v0, (a1)
2880 ; RV64-NEXT: vor.vv v16, v24, v16
2881 ; RV64-NEXT: vor.vv v8, v8, v0
2882 ; RV64-NEXT: vor.vv v8, v8, v16
2883 ; RV64-NEXT: vredor.vs v8, v8, v8
2884 ; RV64-NEXT: vmv.x.s a0, v8
2886 %v = load <64 x i64>, ptr %x
2887 %red = call i64 @llvm.vector.reduce.or.v64i64(<64 x i64> %v)
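; The xor reductions seed the accumulator with an explicit zero scalar
; (vmv.s.x) instead of reusing the source vector as the start operand,
; presumably because x ^ x != x, whereas the idempotent and/or reductions
; above can safely pass the source vector as its own start value.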
2891 declare i8 @llvm.vector.reduce.xor.v1i8(<1 x i8>)
2893 define i8 @vreduce_xor_v1i8(ptr %x) {
2894 ; CHECK-LABEL: vreduce_xor_v1i8:
2896 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
2897 ; CHECK-NEXT: vle8.v v8, (a0)
2898 ; CHECK-NEXT: vmv.x.s a0, v8
2900 %v = load <1 x i8>, ptr %x
2901 %red = call i8 @llvm.vector.reduce.xor.v1i8(<1 x i8> %v)
2905 declare i8 @llvm.vector.reduce.xor.v2i8(<2 x i8>)
2907 define i8 @vreduce_xor_v2i8(ptr %x) {
2908 ; CHECK-LABEL: vreduce_xor_v2i8:
2910 ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
2911 ; CHECK-NEXT: vle8.v v8, (a0)
2912 ; CHECK-NEXT: vmv.s.x v9, zero
2913 ; CHECK-NEXT: vredxor.vs v8, v8, v9
2914 ; CHECK-NEXT: vmv.x.s a0, v8
2916 %v = load <2 x i8>, ptr %x
2917 %red = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> %v)
2921 declare i8 @llvm.vector.reduce.xor.v4i8(<4 x i8>)
2923 define i8 @vreduce_xor_v4i8(ptr %x) {
2924 ; CHECK-LABEL: vreduce_xor_v4i8:
2926 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
2927 ; CHECK-NEXT: vle8.v v8, (a0)
2928 ; CHECK-NEXT: vmv.s.x v9, zero
2929 ; CHECK-NEXT: vredxor.vs v8, v8, v9
2930 ; CHECK-NEXT: vmv.x.s a0, v8
2932 %v = load <4 x i8>, ptr %x
2933 %red = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> %v)
2937 declare i8 @llvm.vector.reduce.xor.v8i8(<8 x i8>)
2939 define i8 @vreduce_xor_v8i8(ptr %x) {
2940 ; CHECK-LABEL: vreduce_xor_v8i8:
2942 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
2943 ; CHECK-NEXT: vle8.v v8, (a0)
2944 ; CHECK-NEXT: vmv.s.x v9, zero
2945 ; CHECK-NEXT: vredxor.vs v8, v8, v9
2946 ; CHECK-NEXT: vmv.x.s a0, v8
2948 %v = load <8 x i8>, ptr %x
2949 %red = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %v)
2953 declare i8 @llvm.vector.reduce.xor.v16i8(<16 x i8>)
2955 define i8 @vreduce_xor_v16i8(ptr %x) {
2956 ; CHECK-LABEL: vreduce_xor_v16i8:
2958 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
2959 ; CHECK-NEXT: vle8.v v8, (a0)
2960 ; CHECK-NEXT: vmv.s.x v9, zero
2961 ; CHECK-NEXT: vredxor.vs v8, v8, v9
2962 ; CHECK-NEXT: vmv.x.s a0, v8
2964 %v = load <16 x i8>, ptr %x
2965 %red = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %v)
2969 declare i8 @llvm.vector.reduce.xor.v32i8(<32 x i8>)
2971 define i8 @vreduce_xor_v32i8(ptr %x) {
2972 ; CHECK-LABEL: vreduce_xor_v32i8:
2974 ; CHECK-NEXT: li a1, 32
2975 ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
2976 ; CHECK-NEXT: vle8.v v8, (a0)
2977 ; CHECK-NEXT: vmv.s.x v10, zero
2978 ; CHECK-NEXT: vredxor.vs v8, v8, v10
2979 ; CHECK-NEXT: vmv.x.s a0, v8
2981 %v = load <32 x i8>, ptr %x
2982 %red = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> %v)
2986 declare i8 @llvm.vector.reduce.xor.v64i8(<64 x i8>)
2988 define i8 @vreduce_xor_v64i8(ptr %x) {
2989 ; CHECK-LABEL: vreduce_xor_v64i8:
2991 ; CHECK-NEXT: li a1, 64
2992 ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
2993 ; CHECK-NEXT: vle8.v v8, (a0)
2994 ; CHECK-NEXT: vmv.s.x v12, zero
2995 ; CHECK-NEXT: vredxor.vs v8, v8, v12
2996 ; CHECK-NEXT: vmv.x.s a0, v8
2998 %v = load <64 x i8>, ptr %x
2999 %red = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> %v)
3003 declare i8 @llvm.vector.reduce.xor.v128i8(<128 x i8>)
3005 define i8 @vreduce_xor_v128i8(ptr %x) {
3006 ; CHECK-LABEL: vreduce_xor_v128i8:
3008 ; CHECK-NEXT: li a1, 128
3009 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
3010 ; CHECK-NEXT: vle8.v v8, (a0)
3011 ; CHECK-NEXT: vmv.s.x v16, zero
3012 ; CHECK-NEXT: vredxor.vs v8, v8, v16
3013 ; CHECK-NEXT: vmv.x.s a0, v8
3015 %v = load <128 x i8>, ptr %x
3016 %red = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> %v)
3020 declare i8 @llvm.vector.reduce.xor.v256i8(<256 x i8>)
3022 define i8 @vreduce_xor_v256i8(ptr %x) {
3023 ; CHECK-LABEL: vreduce_xor_v256i8:
3025 ; CHECK-NEXT: li a1, 128
3026 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
3027 ; CHECK-NEXT: vle8.v v8, (a0)
3028 ; CHECK-NEXT: addi a0, a0, 128
3029 ; CHECK-NEXT: vle8.v v16, (a0)
3030 ; CHECK-NEXT: vxor.vv v8, v8, v16
3031 ; CHECK-NEXT: vmv.s.x v16, zero
3032 ; CHECK-NEXT: vredxor.vs v8, v8, v16
3033 ; CHECK-NEXT: vmv.x.s a0, v8
3035 %v = load <256 x i8>, ptr %x
3036 %red = call i8 @llvm.vector.reduce.xor.v256i8(<256 x i8> %v)
3040 declare i16 @llvm.vector.reduce.xor.v1i16(<1 x i16>)
3042 define i16 @vreduce_xor_v1i16(ptr %x) {
3043 ; CHECK-LABEL: vreduce_xor_v1i16:
3045 ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
3046 ; CHECK-NEXT: vle16.v v8, (a0)
3047 ; CHECK-NEXT: vmv.x.s a0, v8
3049 %v = load <1 x i16>, ptr %x
3050 %red = call i16 @llvm.vector.reduce.xor.v1i16(<1 x i16> %v)
3054 declare i16 @llvm.vector.reduce.xor.v2i16(<2 x i16>)
3056 define i16 @vreduce_xor_v2i16(ptr %x) {
3057 ; CHECK-LABEL: vreduce_xor_v2i16:
3059 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
3060 ; CHECK-NEXT: vle16.v v8, (a0)
3061 ; CHECK-NEXT: vmv.s.x v9, zero
3062 ; CHECK-NEXT: vredxor.vs v8, v8, v9
3063 ; CHECK-NEXT: vmv.x.s a0, v8
3065 %v = load <2 x i16>, ptr %x
3066 %red = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> %v)
3070 declare i16 @llvm.vector.reduce.xor.v4i16(<4 x i16>)
3072 define i16 @vreduce_xor_v4i16(ptr %x) {
3073 ; CHECK-LABEL: vreduce_xor_v4i16:
3075 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
3076 ; CHECK-NEXT: vle16.v v8, (a0)
3077 ; CHECK-NEXT: vmv.s.x v9, zero
3078 ; CHECK-NEXT: vredxor.vs v8, v8, v9
3079 ; CHECK-NEXT: vmv.x.s a0, v8
3081 %v = load <4 x i16>, ptr %x
3082 %red = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %v)
3086 declare i16 @llvm.vector.reduce.xor.v8i16(<8 x i16>)
3088 define i16 @vreduce_xor_v8i16(ptr %x) {
3089 ; CHECK-LABEL: vreduce_xor_v8i16:
3091 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
3092 ; CHECK-NEXT: vle16.v v8, (a0)
3093 ; CHECK-NEXT: vmv.s.x v9, zero
3094 ; CHECK-NEXT: vredxor.vs v8, v8, v9
3095 ; CHECK-NEXT: vmv.x.s a0, v8
3097 %v = load <8 x i16>, ptr %x
3098 %red = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> %v)
3102 declare i16 @llvm.vector.reduce.xor.v16i16(<16 x i16>)
3104 define i16 @vreduce_xor_v16i16(ptr %x) {
3105 ; CHECK-LABEL: vreduce_xor_v16i16:
3107 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
3108 ; CHECK-NEXT: vle16.v v8, (a0)
3109 ; CHECK-NEXT: vmv.s.x v10, zero
3110 ; CHECK-NEXT: vredxor.vs v8, v8, v10
3111 ; CHECK-NEXT: vmv.x.s a0, v8
3113 %v = load <16 x i16>, ptr %x
3114 %red = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> %v)
3118 declare i16 @llvm.vector.reduce.xor.v32i16(<32 x i16>)
3120 define i16 @vreduce_xor_v32i16(ptr %x) {
3121 ; CHECK-LABEL: vreduce_xor_v32i16:
3123 ; CHECK-NEXT: li a1, 32
3124 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
3125 ; CHECK-NEXT: vle16.v v8, (a0)
3126 ; CHECK-NEXT: vmv.s.x v12, zero
3127 ; CHECK-NEXT: vredxor.vs v8, v8, v12
3128 ; CHECK-NEXT: vmv.x.s a0, v8
3130 %v = load <32 x i16>, ptr %x
3131 %red = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> %v)
3135 declare i16 @llvm.vector.reduce.xor.v64i16(<64 x i16>)
3137 define i16 @vreduce_xor_v64i16(ptr %x) {
3138 ; CHECK-LABEL: vreduce_xor_v64i16:
3140 ; CHECK-NEXT: li a1, 64
3141 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
3142 ; CHECK-NEXT: vle16.v v8, (a0)
3143 ; CHECK-NEXT: vmv.s.x v16, zero
3144 ; CHECK-NEXT: vredxor.vs v8, v8, v16
3145 ; CHECK-NEXT: vmv.x.s a0, v8
3147 %v = load <64 x i16>, ptr %x
3148 %red = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> %v)
3152 declare i16 @llvm.vector.reduce.xor.v128i16(<128 x i16>)
3154 define i16 @vreduce_xor_v128i16(ptr %x) {
3155 ; CHECK-LABEL: vreduce_xor_v128i16:
3157 ; CHECK-NEXT: li a1, 64
3158 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
3159 ; CHECK-NEXT: vle16.v v8, (a0)
3160 ; CHECK-NEXT: addi a0, a0, 128
3161 ; CHECK-NEXT: vle16.v v16, (a0)
3162 ; CHECK-NEXT: vxor.vv v8, v8, v16
3163 ; CHECK-NEXT: vmv.s.x v16, zero
3164 ; CHECK-NEXT: vredxor.vs v8, v8, v16
3165 ; CHECK-NEXT: vmv.x.s a0, v8
3167 %v = load <128 x i16>, ptr %x
3168 %red = call i16 @llvm.vector.reduce.xor.v128i16(<128 x i16> %v)
3172 declare i32 @llvm.vector.reduce.xor.v1i32(<1 x i32>)
3174 define i32 @vreduce_xor_v1i32(ptr %x) {
3175 ; CHECK-LABEL: vreduce_xor_v1i32:
3177 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
3178 ; CHECK-NEXT: vle32.v v8, (a0)
3179 ; CHECK-NEXT: vmv.x.s a0, v8
3181 %v = load <1 x i32>, ptr %x
3182 %red = call i32 @llvm.vector.reduce.xor.v1i32(<1 x i32> %v)
3186 declare i32 @llvm.vector.reduce.xor.v2i32(<2 x i32>)
3188 define i32 @vreduce_xor_v2i32(ptr %x) {
3189 ; CHECK-LABEL: vreduce_xor_v2i32:
3191 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
3192 ; CHECK-NEXT: vle32.v v8, (a0)
3193 ; CHECK-NEXT: vmv.s.x v9, zero
3194 ; CHECK-NEXT: vredxor.vs v8, v8, v9
3195 ; CHECK-NEXT: vmv.x.s a0, v8
3197 %v = load <2 x i32>, ptr %x
3198 %red = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %v)
3202 declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>)
3204 define i32 @vreduce_xor_v4i32(ptr %x) {
3205 ; CHECK-LABEL: vreduce_xor_v4i32:
3207 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
3208 ; CHECK-NEXT: vle32.v v8, (a0)
3209 ; CHECK-NEXT: vmv.s.x v9, zero
3210 ; CHECK-NEXT: vredxor.vs v8, v8, v9
3211 ; CHECK-NEXT: vmv.x.s a0, v8
3213 %v = load <4 x i32>, ptr %x
3214 %red = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %v)
3218 declare i32 @llvm.vector.reduce.xor.v8i32(<8 x i32>)
3220 define i32 @vreduce_xor_v8i32(ptr %x) {
3221 ; CHECK-LABEL: vreduce_xor_v8i32:
3223 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
3224 ; CHECK-NEXT: vle32.v v8, (a0)
3225 ; CHECK-NEXT: vmv.s.x v10, zero
3226 ; CHECK-NEXT: vredxor.vs v8, v8, v10
3227 ; CHECK-NEXT: vmv.x.s a0, v8
3229 %v = load <8 x i32>, ptr %x
3230 %red = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %v)
3234 declare i32 @llvm.vector.reduce.xor.v16i32(<16 x i32>)
3236 define i32 @vreduce_xor_v16i32(ptr %x) {
3237 ; CHECK-LABEL: vreduce_xor_v16i32:
3239 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
3240 ; CHECK-NEXT: vle32.v v8, (a0)
3241 ; CHECK-NEXT: vmv.s.x v12, zero
3242 ; CHECK-NEXT: vredxor.vs v8, v8, v12
3243 ; CHECK-NEXT: vmv.x.s a0, v8
3245 %v = load <16 x i32>, ptr %x
3246 %red = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> %v)
3250 declare i32 @llvm.vector.reduce.xor.v32i32(<32 x i32>)
3252 define i32 @vreduce_xor_v32i32(ptr %x) {
3253 ; CHECK-LABEL: vreduce_xor_v32i32:
3255 ; CHECK-NEXT: li a1, 32
3256 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
3257 ; CHECK-NEXT: vle32.v v8, (a0)
3258 ; CHECK-NEXT: vmv.s.x v16, zero
3259 ; CHECK-NEXT: vredxor.vs v8, v8, v16
3260 ; CHECK-NEXT: vmv.x.s a0, v8
3262 %v = load <32 x i32>, ptr %x
3263 %red = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> %v)
3267 declare i32 @llvm.vector.reduce.xor.v64i32(<64 x i32>)
3269 define i32 @vreduce_xor_v64i32(ptr %x) {
3270 ; CHECK-LABEL: vreduce_xor_v64i32:
3272 ; CHECK-NEXT: li a1, 32
3273 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
3274 ; CHECK-NEXT: vle32.v v8, (a0)
3275 ; CHECK-NEXT: addi a0, a0, 128
3276 ; CHECK-NEXT: vle32.v v16, (a0)
3277 ; CHECK-NEXT: vxor.vv v8, v8, v16
3278 ; CHECK-NEXT: vmv.s.x v16, zero
3279 ; CHECK-NEXT: vredxor.vs v8, v8, v16
3280 ; CHECK-NEXT: vmv.x.s a0, v8
3282 %v = load <64 x i32>, ptr %x
3283 %red = call i32 @llvm.vector.reduce.xor.v64i32(<64 x i32> %v)
3287 declare i64 @llvm.vector.reduce.xor.v1i64(<1 x i64>)
3289 define i64 @vreduce_xor_v1i64(ptr %x) {
3290 ; RV32-LABEL: vreduce_xor_v1i64:
3292 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
3293 ; RV32-NEXT: vle64.v v8, (a0)
3294 ; RV32-NEXT: li a0, 32
3295 ; RV32-NEXT: vsrl.vx v9, v8, a0
3296 ; RV32-NEXT: vmv.x.s a1, v9
3297 ; RV32-NEXT: vmv.x.s a0, v8
3300 ; RV64-LABEL: vreduce_xor_v1i64:
3302 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
3303 ; RV64-NEXT: vle64.v v8, (a0)
3304 ; RV64-NEXT: vmv.x.s a0, v8
3306 %v = load <1 x i64>, ptr %x
3307 %red = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> %v)
3311 declare i64 @llvm.vector.reduce.xor.v2i64(<2 x i64>)
3313 define i64 @vreduce_xor_v2i64(ptr %x) {
3314 ; RV32-LABEL: vreduce_xor_v2i64:
3316 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
3317 ; RV32-NEXT: vle64.v v8, (a0)
3318 ; RV32-NEXT: vmv.s.x v9, zero
3319 ; RV32-NEXT: vredxor.vs v8, v8, v9
3320 ; RV32-NEXT: vmv.x.s a0, v8
3321 ; RV32-NEXT: li a1, 32
3322 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
3323 ; RV32-NEXT: vsrl.vx v8, v8, a1
3324 ; RV32-NEXT: vmv.x.s a1, v8
3327 ; RV64-LABEL: vreduce_xor_v2i64:
3329 ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
3330 ; RV64-NEXT: vle64.v v8, (a0)
3331 ; RV64-NEXT: vmv.s.x v9, zero
3332 ; RV64-NEXT: vredxor.vs v8, v8, v9
3333 ; RV64-NEXT: vmv.x.s a0, v8
3335 %v = load <2 x i64>, ptr %x
3336 %red = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %v)
3340 declare i64 @llvm.vector.reduce.xor.v4i64(<4 x i64>)
3342 define i64 @vreduce_xor_v4i64(ptr %x) {
3343 ; RV32-LABEL: vreduce_xor_v4i64:
3345 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
3346 ; RV32-NEXT: vle64.v v8, (a0)
3347 ; RV32-NEXT: vmv.s.x v10, zero
3348 ; RV32-NEXT: vredxor.vs v8, v8, v10
3349 ; RV32-NEXT: vmv.x.s a0, v8
3350 ; RV32-NEXT: li a1, 32
3351 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
3352 ; RV32-NEXT: vsrl.vx v8, v8, a1
3353 ; RV32-NEXT: vmv.x.s a1, v8
3356 ; RV64-LABEL: vreduce_xor_v4i64:
3358 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
3359 ; RV64-NEXT: vle64.v v8, (a0)
3360 ; RV64-NEXT: vmv.s.x v10, zero
3361 ; RV64-NEXT: vredxor.vs v8, v8, v10
3362 ; RV64-NEXT: vmv.x.s a0, v8
3364 %v = load <4 x i64>, ptr %x
3365 %red = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %v)
3369 declare i64 @llvm.vector.reduce.xor.v8i64(<8 x i64>)
3371 define i64 @vreduce_xor_v8i64(ptr %x) {
3372 ; RV32-LABEL: vreduce_xor_v8i64:
3374 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
3375 ; RV32-NEXT: vle64.v v8, (a0)
3376 ; RV32-NEXT: vmv.s.x v12, zero
3377 ; RV32-NEXT: vredxor.vs v8, v8, v12
3378 ; RV32-NEXT: vmv.x.s a0, v8
3379 ; RV32-NEXT: li a1, 32
3380 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
3381 ; RV32-NEXT: vsrl.vx v8, v8, a1
3382 ; RV32-NEXT: vmv.x.s a1, v8
3385 ; RV64-LABEL: vreduce_xor_v8i64:
3387 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
3388 ; RV64-NEXT: vle64.v v8, (a0)
3389 ; RV64-NEXT: vmv.s.x v12, zero
3390 ; RV64-NEXT: vredxor.vs v8, v8, v12
3391 ; RV64-NEXT: vmv.x.s a0, v8
3393 %v = load <8 x i64>, ptr %x
3394 %red = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> %v)
3398 declare i64 @llvm.vector.reduce.xor.v16i64(<16 x i64>)
3400 define i64 @vreduce_xor_v16i64(ptr %x) {
3401 ; RV32-LABEL: vreduce_xor_v16i64:
3403 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
3404 ; RV32-NEXT: vle64.v v8, (a0)
3405 ; RV32-NEXT: vmv.s.x v16, zero
3406 ; RV32-NEXT: vredxor.vs v8, v8, v16
3407 ; RV32-NEXT: vmv.x.s a0, v8
3408 ; RV32-NEXT: li a1, 32
3409 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
3410 ; RV32-NEXT: vsrl.vx v8, v8, a1
3411 ; RV32-NEXT: vmv.x.s a1, v8
3414 ; RV64-LABEL: vreduce_xor_v16i64:
3416 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
3417 ; RV64-NEXT: vle64.v v8, (a0)
3418 ; RV64-NEXT: vmv.s.x v16, zero
3419 ; RV64-NEXT: vredxor.vs v8, v8, v16
3420 ; RV64-NEXT: vmv.x.s a0, v8
3422 %v = load <16 x i64>, ptr %x
3423 %red = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> %v)
3427 declare i64 @llvm.vector.reduce.xor.v32i64(<32 x i64>)
3429 define i64 @vreduce_xor_v32i64(ptr %x) {
3430 ; RV32-LABEL: vreduce_xor_v32i64:
3432 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
3433 ; RV32-NEXT: vle64.v v8, (a0)
3434 ; RV32-NEXT: addi a0, a0, 128
3435 ; RV32-NEXT: vle64.v v16, (a0)
3436 ; RV32-NEXT: vxor.vv v8, v8, v16
3437 ; RV32-NEXT: vmv.s.x v16, zero
3438 ; RV32-NEXT: vredxor.vs v8, v8, v16
3439 ; RV32-NEXT: vmv.x.s a0, v8
3440 ; RV32-NEXT: li a1, 32
3441 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
3442 ; RV32-NEXT: vsrl.vx v8, v8, a1
3443 ; RV32-NEXT: vmv.x.s a1, v8
3446 ; RV64-LABEL: vreduce_xor_v32i64:
3448 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
3449 ; RV64-NEXT: vle64.v v8, (a0)
3450 ; RV64-NEXT: addi a0, a0, 128
3451 ; RV64-NEXT: vle64.v v16, (a0)
3452 ; RV64-NEXT: vxor.vv v8, v8, v16
3453 ; RV64-NEXT: vmv.s.x v16, zero
3454 ; RV64-NEXT: vredxor.vs v8, v8, v16
3455 ; RV64-NEXT: vmv.x.s a0, v8
3457 %v = load <32 x i64>, ptr %x
3458 %red = call i64 @llvm.vector.reduce.xor.v32i64(<32 x i64> %v)
3462 declare i64 @llvm.vector.reduce.xor.v64i64(<64 x i64>)
3464 define i64 @vreduce_xor_v64i64(ptr %x) nounwind {
3465 ; RV32-LABEL: vreduce_xor_v64i64:
3467 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
3468 ; RV32-NEXT: vle64.v v8, (a0)
3469 ; RV32-NEXT: addi a1, a0, 384
3470 ; RV32-NEXT: vle64.v v16, (a1)
3471 ; RV32-NEXT: addi a1, a0, 256
3472 ; RV32-NEXT: addi a0, a0, 128
3473 ; RV32-NEXT: vle64.v v24, (a0)
3474 ; RV32-NEXT: vle64.v v0, (a1)
3475 ; RV32-NEXT: vxor.vv v16, v24, v16
3476 ; RV32-NEXT: vxor.vv v8, v8, v0
3477 ; RV32-NEXT: vxor.vv v8, v8, v16
3478 ; RV32-NEXT: vmv.s.x v16, zero
3479 ; RV32-NEXT: vredxor.vs v8, v8, v16
3480 ; RV32-NEXT: vmv.x.s a0, v8
3481 ; RV32-NEXT: li a1, 32
3482 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
3483 ; RV32-NEXT: vsrl.vx v8, v8, a1
3484 ; RV32-NEXT: vmv.x.s a1, v8
3487 ; RV64-LABEL: vreduce_xor_v64i64:
3489 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
3490 ; RV64-NEXT: vle64.v v8, (a0)
3491 ; RV64-NEXT: addi a1, a0, 384
3492 ; RV64-NEXT: vle64.v v16, (a1)
3493 ; RV64-NEXT: addi a1, a0, 256
3494 ; RV64-NEXT: addi a0, a0, 128
3495 ; RV64-NEXT: vle64.v v24, (a0)
3496 ; RV64-NEXT: vle64.v v0, (a1)
3497 ; RV64-NEXT: vxor.vv v16, v24, v16
3498 ; RV64-NEXT: vxor.vv v8, v8, v0
3499 ; RV64-NEXT: vxor.vv v8, v8, v16
3500 ; RV64-NEXT: vmv.s.x v16, zero
3501 ; RV64-NEXT: vredxor.vs v8, v8, v16
3502 ; RV64-NEXT: vmv.x.s a0, v8
3504 %v = load <64 x i64>, ptr %x
3505 %red = call i64 @llvm.vector.reduce.xor.v64i64(<64 x i64> %v)
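; The signed-minimum reductions follow the and/or pattern: the source vector is
; reused as the start operand of vredmin.vs, and oversized vectors are first
; combined with vmin.vv.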
3509 declare i8 @llvm.vector.reduce.smin.v1i8(<1 x i8>)
3511 define i8 @vreduce_smin_v1i8(ptr %x) {
3512 ; CHECK-LABEL: vreduce_smin_v1i8:
3514 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
3515 ; CHECK-NEXT: vle8.v v8, (a0)
3516 ; CHECK-NEXT: vmv.x.s a0, v8
3518 %v = load <1 x i8>, ptr %x
3519 %red = call i8 @llvm.vector.reduce.smin.v1i8(<1 x i8> %v)
3523 declare i8 @llvm.vector.reduce.smin.v2i8(<2 x i8>)
3525 define i8 @vreduce_smin_v2i8(ptr %x) {
3526 ; CHECK-LABEL: vreduce_smin_v2i8:
3528 ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
3529 ; CHECK-NEXT: vle8.v v8, (a0)
3530 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3531 ; CHECK-NEXT: vmv.x.s a0, v8
3533 %v = load <2 x i8>, ptr %x
3534 %red = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> %v)
3538 declare i8 @llvm.vector.reduce.smin.v4i8(<4 x i8>)
3540 define i8 @vreduce_smin_v4i8(ptr %x) {
3541 ; CHECK-LABEL: vreduce_smin_v4i8:
3543 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
3544 ; CHECK-NEXT: vle8.v v8, (a0)
3545 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3546 ; CHECK-NEXT: vmv.x.s a0, v8
3548 %v = load <4 x i8>, ptr %x
3549 %red = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> %v)
3553 declare i8 @llvm.vector.reduce.smin.v8i8(<8 x i8>)
3555 define i8 @vreduce_smin_v8i8(ptr %x) {
3556 ; CHECK-LABEL: vreduce_smin_v8i8:
3558 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
3559 ; CHECK-NEXT: vle8.v v8, (a0)
3560 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3561 ; CHECK-NEXT: vmv.x.s a0, v8
3563 %v = load <8 x i8>, ptr %x
3564 %red = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %v)
3568 declare i8 @llvm.vector.reduce.smin.v16i8(<16 x i8>)
3570 define i8 @vreduce_smin_v16i8(ptr %x) {
3571 ; CHECK-LABEL: vreduce_smin_v16i8:
3573 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
3574 ; CHECK-NEXT: vle8.v v8, (a0)
3575 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3576 ; CHECK-NEXT: vmv.x.s a0, v8
3578 %v = load <16 x i8>, ptr %x
3579 %red = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %v)
3583 declare i8 @llvm.vector.reduce.smin.v32i8(<32 x i8>)
3585 define i8 @vreduce_smin_v32i8(ptr %x) {
3586 ; CHECK-LABEL: vreduce_smin_v32i8:
3588 ; CHECK-NEXT: li a1, 32
3589 ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
3590 ; CHECK-NEXT: vle8.v v8, (a0)
3591 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3592 ; CHECK-NEXT: vmv.x.s a0, v8
3594 %v = load <32 x i8>, ptr %x
3595 %red = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %v)
3599 declare i8 @llvm.vector.reduce.smin.v64i8(<64 x i8>)
3601 define i8 @vreduce_smin_v64i8(ptr %x) {
3602 ; CHECK-LABEL: vreduce_smin_v64i8:
3604 ; CHECK-NEXT: li a1, 64
3605 ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
3606 ; CHECK-NEXT: vle8.v v8, (a0)
3607 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3608 ; CHECK-NEXT: vmv.x.s a0, v8
3610 %v = load <64 x i8>, ptr %x
3611 %red = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> %v)
3615 declare i8 @llvm.vector.reduce.smin.v128i8(<128 x i8>)
3617 define i8 @vreduce_smin_v128i8(ptr %x) {
3618 ; CHECK-LABEL: vreduce_smin_v128i8:
3620 ; CHECK-NEXT: li a1, 128
3621 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
3622 ; CHECK-NEXT: vle8.v v8, (a0)
3623 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3624 ; CHECK-NEXT: vmv.x.s a0, v8
3626 %v = load <128 x i8>, ptr %x
3627 %red = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> %v)
3631 declare i8 @llvm.vector.reduce.smin.v256i8(<256 x i8>)
3633 define i8 @vreduce_smin_v256i8(ptr %x) {
3634 ; CHECK-LABEL: vreduce_smin_v256i8:
3636 ; CHECK-NEXT: li a1, 128
3637 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
3638 ; CHECK-NEXT: vle8.v v8, (a0)
3639 ; CHECK-NEXT: addi a0, a0, 128
3640 ; CHECK-NEXT: vle8.v v16, (a0)
3641 ; CHECK-NEXT: vmin.vv v8, v8, v16
3642 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3643 ; CHECK-NEXT: vmv.x.s a0, v8
3645 %v = load <256 x i8>, ptr %x
3646 %red = call i8 @llvm.vector.reduce.smin.v256i8(<256 x i8> %v)
3650 declare i16 @llvm.vector.reduce.smin.v1i16(<1 x i16>)
3652 define i16 @vreduce_smin_v1i16(ptr %x) {
3653 ; CHECK-LABEL: vreduce_smin_v1i16:
3655 ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
3656 ; CHECK-NEXT: vle16.v v8, (a0)
3657 ; CHECK-NEXT: vmv.x.s a0, v8
3659 %v = load <1 x i16>, ptr %x
3660 %red = call i16 @llvm.vector.reduce.smin.v1i16(<1 x i16> %v)
3664 declare i16 @llvm.vector.reduce.smin.v2i16(<2 x i16>)
3666 define i16 @vreduce_smin_v2i16(ptr %x) {
3667 ; CHECK-LABEL: vreduce_smin_v2i16:
3669 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
3670 ; CHECK-NEXT: vle16.v v8, (a0)
3671 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3672 ; CHECK-NEXT: vmv.x.s a0, v8
3674 %v = load <2 x i16>, ptr %x
3675 %red = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> %v)
3679 declare i16 @llvm.vector.reduce.smin.v4i16(<4 x i16>)
3681 define i16 @vreduce_smin_v4i16(ptr %x) {
3682 ; CHECK-LABEL: vreduce_smin_v4i16:
3684 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
3685 ; CHECK-NEXT: vle16.v v8, (a0)
3686 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3687 ; CHECK-NEXT: vmv.x.s a0, v8
3689 %v = load <4 x i16>, ptr %x
3690 %red = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %v)
3694 declare i16 @llvm.vector.reduce.smin.v8i16(<8 x i16>)
3696 define i16 @vreduce_smin_v8i16(ptr %x) {
3697 ; CHECK-LABEL: vreduce_smin_v8i16:
3699 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
3700 ; CHECK-NEXT: vle16.v v8, (a0)
3701 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3702 ; CHECK-NEXT: vmv.x.s a0, v8
3704 %v = load <8 x i16>, ptr %x
3705 %red = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %v)
3709 declare i16 @llvm.vector.reduce.smin.v16i16(<16 x i16>)
3711 define i16 @vreduce_smin_v16i16(ptr %x) {
3712 ; CHECK-LABEL: vreduce_smin_v16i16:
3714 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
3715 ; CHECK-NEXT: vle16.v v8, (a0)
3716 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3717 ; CHECK-NEXT: vmv.x.s a0, v8
3719 %v = load <16 x i16>, ptr %x
3720 %red = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %v)
3724 declare i16 @llvm.vector.reduce.smin.v32i16(<32 x i16>)
3726 define i16 @vreduce_smin_v32i16(ptr %x) {
3727 ; CHECK-LABEL: vreduce_smin_v32i16:
3729 ; CHECK-NEXT: li a1, 32
3730 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
3731 ; CHECK-NEXT: vle16.v v8, (a0)
3732 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3733 ; CHECK-NEXT: vmv.x.s a0, v8
3735 %v = load <32 x i16>, ptr %x
3736 %red = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> %v)
3740 declare i16 @llvm.vector.reduce.smin.v64i16(<64 x i16>)
3742 define i16 @vreduce_smin_v64i16(ptr %x) {
3743 ; CHECK-LABEL: vreduce_smin_v64i16:
3745 ; CHECK-NEXT: li a1, 64
3746 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
3747 ; CHECK-NEXT: vle16.v v8, (a0)
3748 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3749 ; CHECK-NEXT: vmv.x.s a0, v8
3751 %v = load <64 x i16>, ptr %x
3752 %red = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> %v)
3756 declare i16 @llvm.vector.reduce.smin.v128i16(<128 x i16>)
3758 define i16 @vreduce_smin_v128i16(ptr %x) {
3759 ; CHECK-LABEL: vreduce_smin_v128i16:
3761 ; CHECK-NEXT: li a1, 64
3762 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
3763 ; CHECK-NEXT: vle16.v v8, (a0)
3764 ; CHECK-NEXT: addi a0, a0, 128
3765 ; CHECK-NEXT: vle16.v v16, (a0)
3766 ; CHECK-NEXT: vmin.vv v8, v8, v16
3767 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3768 ; CHECK-NEXT: vmv.x.s a0, v8
3770 %v = load <128 x i16>, ptr %x
3771 %red = call i16 @llvm.vector.reduce.smin.v128i16(<128 x i16> %v)
3775 declare i32 @llvm.vector.reduce.smin.v1i32(<1 x i32>)
3777 define i32 @vreduce_smin_v1i32(ptr %x) {
3778 ; CHECK-LABEL: vreduce_smin_v1i32:
3780 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
3781 ; CHECK-NEXT: vle32.v v8, (a0)
3782 ; CHECK-NEXT: vmv.x.s a0, v8
3784 %v = load <1 x i32>, ptr %x
3785 %red = call i32 @llvm.vector.reduce.smin.v1i32(<1 x i32> %v)
3789 declare i32 @llvm.vector.reduce.smin.v2i32(<2 x i32>)
3791 define i32 @vreduce_smin_v2i32(ptr %x) {
3792 ; CHECK-LABEL: vreduce_smin_v2i32:
3794 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
3795 ; CHECK-NEXT: vle32.v v8, (a0)
3796 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3797 ; CHECK-NEXT: vmv.x.s a0, v8
3799 %v = load <2 x i32>, ptr %x
3800 %red = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %v)
3804 declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>)
3806 define i32 @vreduce_smin_v4i32(ptr %x) {
3807 ; CHECK-LABEL: vreduce_smin_v4i32:
3809 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
3810 ; CHECK-NEXT: vle32.v v8, (a0)
3811 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3812 ; CHECK-NEXT: vmv.x.s a0, v8
3814 %v = load <4 x i32>, ptr %x
3815 %red = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %v)
3819 declare i32 @llvm.vector.reduce.smin.v8i32(<8 x i32>)
3821 define i32 @vreduce_smin_v8i32(ptr %x) {
3822 ; CHECK-LABEL: vreduce_smin_v8i32:
3824 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
3825 ; CHECK-NEXT: vle32.v v8, (a0)
3826 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3827 ; CHECK-NEXT: vmv.x.s a0, v8
3829 %v = load <8 x i32>, ptr %x
3830 %red = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %v)
3834 declare i32 @llvm.vector.reduce.smin.v16i32(<16 x i32>)
3836 define i32 @vreduce_smin_v16i32(ptr %x) {
3837 ; CHECK-LABEL: vreduce_smin_v16i32:
3839 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
3840 ; CHECK-NEXT: vle32.v v8, (a0)
3841 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3842 ; CHECK-NEXT: vmv.x.s a0, v8
3844 %v = load <16 x i32>, ptr %x
3845 %red = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> %v)
3849 declare i32 @llvm.vector.reduce.smin.v32i32(<32 x i32>)
3851 define i32 @vreduce_smin_v32i32(ptr %x) {
3852 ; CHECK-LABEL: vreduce_smin_v32i32:
3854 ; CHECK-NEXT: li a1, 32
3855 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
3856 ; CHECK-NEXT: vle32.v v8, (a0)
3857 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3858 ; CHECK-NEXT: vmv.x.s a0, v8
3860 %v = load <32 x i32>, ptr %x
3861 %red = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> %v)
3865 declare i32 @llvm.vector.reduce.smin.v64i32(<64 x i32>)
3867 define i32 @vreduce_smin_v64i32(ptr %x) {
3868 ; CHECK-LABEL: vreduce_smin_v64i32:
3870 ; CHECK-NEXT: li a1, 32
3871 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
3872 ; CHECK-NEXT: vle32.v v8, (a0)
3873 ; CHECK-NEXT: addi a0, a0, 128
3874 ; CHECK-NEXT: vle32.v v16, (a0)
3875 ; CHECK-NEXT: vmin.vv v8, v8, v16
3876 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3877 ; CHECK-NEXT: vmv.x.s a0, v8
3879 %v = load <64 x i32>, ptr %x
3880 %red = call i32 @llvm.vector.reduce.smin.v64i32(<64 x i32> %v)
3884 declare i64 @llvm.vector.reduce.smin.v1i64(<1 x i64>)
3886 define i64 @vreduce_smin_v1i64(ptr %x) {
3887 ; RV32-LABEL: vreduce_smin_v1i64:
3889 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
3890 ; RV32-NEXT: vle64.v v8, (a0)
3891 ; RV32-NEXT: li a0, 32
3892 ; RV32-NEXT: vsrl.vx v9, v8, a0
3893 ; RV32-NEXT: vmv.x.s a1, v9
3894 ; RV32-NEXT: vmv.x.s a0, v8
3897 ; RV64-LABEL: vreduce_smin_v1i64:
3899 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
3900 ; RV64-NEXT: vle64.v v8, (a0)
3901 ; RV64-NEXT: vmv.x.s a0, v8
3903 %v = load <1 x i64>, ptr %x
3904 %red = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> %v)
3908 declare i64 @llvm.vector.reduce.smin.v2i64(<2 x i64>)
3910 define i64 @vreduce_smin_v2i64(ptr %x) {
3911 ; RV32-LABEL: vreduce_smin_v2i64:
3913 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
3914 ; RV32-NEXT: vle64.v v8, (a0)
3915 ; RV32-NEXT: vredmin.vs v8, v8, v8
3916 ; RV32-NEXT: li a0, 32
3917 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
3918 ; RV32-NEXT: vsrl.vx v9, v8, a0
3919 ; RV32-NEXT: vmv.x.s a1, v9
3920 ; RV32-NEXT: vmv.x.s a0, v8
3923 ; RV64-LABEL: vreduce_smin_v2i64:
3925 ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
3926 ; RV64-NEXT: vle64.v v8, (a0)
3927 ; RV64-NEXT: vredmin.vs v8, v8, v8
3928 ; RV64-NEXT: vmv.x.s a0, v8
3930 %v = load <2 x i64>, ptr %x
3931 %red = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %v)
3935 declare i64 @llvm.vector.reduce.smin.v4i64(<4 x i64>)
3937 define i64 @vreduce_smin_v4i64(ptr %x) {
3938 ; RV32-LABEL: vreduce_smin_v4i64:
3940 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
3941 ; RV32-NEXT: vle64.v v8, (a0)
3942 ; RV32-NEXT: vredmin.vs v8, v8, v8
3943 ; RV32-NEXT: vmv.x.s a0, v8
3944 ; RV32-NEXT: li a1, 32
3945 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
3946 ; RV32-NEXT: vsrl.vx v8, v8, a1
3947 ; RV32-NEXT: vmv.x.s a1, v8
3950 ; RV64-LABEL: vreduce_smin_v4i64:
3952 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
3953 ; RV64-NEXT: vle64.v v8, (a0)
3954 ; RV64-NEXT: vredmin.vs v8, v8, v8
3955 ; RV64-NEXT: vmv.x.s a0, v8
3957 %v = load <4 x i64>, ptr %x
3958 %red = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %v)
3962 declare i64 @llvm.vector.reduce.smin.v8i64(<8 x i64>)
3964 define i64 @vreduce_smin_v8i64(ptr %x) {
3965 ; RV32-LABEL: vreduce_smin_v8i64:
3967 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
3968 ; RV32-NEXT: vle64.v v8, (a0)
3969 ; RV32-NEXT: vredmin.vs v8, v8, v8
3970 ; RV32-NEXT: vmv.x.s a0, v8
3971 ; RV32-NEXT: li a1, 32
3972 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
3973 ; RV32-NEXT: vsrl.vx v8, v8, a1
3974 ; RV32-NEXT: vmv.x.s a1, v8
3977 ; RV64-LABEL: vreduce_smin_v8i64:
3979 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
3980 ; RV64-NEXT: vle64.v v8, (a0)
3981 ; RV64-NEXT: vredmin.vs v8, v8, v8
3982 ; RV64-NEXT: vmv.x.s a0, v8
3984 %v = load <8 x i64>, ptr %x
3985 %red = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> %v)
3989 declare i64 @llvm.vector.reduce.smin.v16i64(<16 x i64>)
3991 define i64 @vreduce_smin_v16i64(ptr %x) {
3992 ; RV32-LABEL: vreduce_smin_v16i64:
3994 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
3995 ; RV32-NEXT: vle64.v v8, (a0)
3996 ; RV32-NEXT: vredmin.vs v8, v8, v8
3997 ; RV32-NEXT: vmv.x.s a0, v8
3998 ; RV32-NEXT: li a1, 32
3999 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
4000 ; RV32-NEXT: vsrl.vx v8, v8, a1
4001 ; RV32-NEXT: vmv.x.s a1, v8
4004 ; RV64-LABEL: vreduce_smin_v16i64:
4006 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
4007 ; RV64-NEXT: vle64.v v8, (a0)
4008 ; RV64-NEXT: vredmin.vs v8, v8, v8
4009 ; RV64-NEXT: vmv.x.s a0, v8
4011 %v = load <16 x i64>, ptr %x
4012 %red = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> %v)
4016 declare i64 @llvm.vector.reduce.smin.v32i64(<32 x i64>)
4018 define i64 @vreduce_smin_v32i64(ptr %x) {
4019 ; RV32-LABEL: vreduce_smin_v32i64:
4021 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
4022 ; RV32-NEXT: vle64.v v8, (a0)
4023 ; RV32-NEXT: addi a0, a0, 128
4024 ; RV32-NEXT: vle64.v v16, (a0)
4025 ; RV32-NEXT: vmin.vv v8, v8, v16
4026 ; RV32-NEXT: vredmin.vs v8, v8, v8
4027 ; RV32-NEXT: vmv.x.s a0, v8
4028 ; RV32-NEXT: li a1, 32
4029 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
4030 ; RV32-NEXT: vsrl.vx v8, v8, a1
4031 ; RV32-NEXT: vmv.x.s a1, v8
4034 ; RV64-LABEL: vreduce_smin_v32i64:
4036 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
4037 ; RV64-NEXT: vle64.v v8, (a0)
4038 ; RV64-NEXT: addi a0, a0, 128
4039 ; RV64-NEXT: vle64.v v16, (a0)
4040 ; RV64-NEXT: vmin.vv v8, v8, v16
4041 ; RV64-NEXT: vredmin.vs v8, v8, v8
4042 ; RV64-NEXT: vmv.x.s a0, v8
4044 %v = load <32 x i64>, ptr %x
4045 %red = call i64 @llvm.vector.reduce.smin.v32i64(<32 x i64> %v)
4049 declare i64 @llvm.vector.reduce.smin.v64i64(<64 x i64>)
4051 define i64 @vreduce_smin_v64i64(ptr %x) nounwind {
4052 ; RV32-LABEL: vreduce_smin_v64i64:
4054 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
4055 ; RV32-NEXT: vle64.v v8, (a0)
4056 ; RV32-NEXT: addi a1, a0, 384
4057 ; RV32-NEXT: vle64.v v16, (a1)
4058 ; RV32-NEXT: addi a1, a0, 256
4059 ; RV32-NEXT: addi a0, a0, 128
4060 ; RV32-NEXT: vle64.v v24, (a0)
4061 ; RV32-NEXT: vle64.v v0, (a1)
4062 ; RV32-NEXT: vmin.vv v16, v24, v16
4063 ; RV32-NEXT: vmin.vv v8, v8, v0
4064 ; RV32-NEXT: vmin.vv v8, v8, v16
4065 ; RV32-NEXT: vredmin.vs v8, v8, v8
4066 ; RV32-NEXT: vmv.x.s a0, v8
4067 ; RV32-NEXT: li a1, 32
4068 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
4069 ; RV32-NEXT: vsrl.vx v8, v8, a1
4070 ; RV32-NEXT: vmv.x.s a1, v8
4073 ; RV64-LABEL: vreduce_smin_v64i64:
4075 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
4076 ; RV64-NEXT: vle64.v v8, (a0)
4077 ; RV64-NEXT: addi a1, a0, 384
4078 ; RV64-NEXT: vle64.v v16, (a1)
4079 ; RV64-NEXT: addi a1, a0, 256
4080 ; RV64-NEXT: addi a0, a0, 128
4081 ; RV64-NEXT: vle64.v v24, (a0)
4082 ; RV64-NEXT: vle64.v v0, (a1)
4083 ; RV64-NEXT: vmin.vv v16, v24, v16
4084 ; RV64-NEXT: vmin.vv v8, v8, v0
4085 ; RV64-NEXT: vmin.vv v8, v8, v16
4086 ; RV64-NEXT: vredmin.vs v8, v8, v8
4087 ; RV64-NEXT: vmv.x.s a0, v8
4089 %v = load <64 x i64>, ptr %x
4090 %red = call i64 @llvm.vector.reduce.smin.v64i64(<64 x i64> %v)
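; Signed maximum reductions. As with smin, the <1 x iN> cases need no
; reduction instruction and lower to a plain vmv.x.s extract; larger vectors
; use vredmax.vs, and vectors wider than a single m8 register group are first
; folded together with vmax.vv.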
4094 declare i8 @llvm.vector.reduce.smax.v1i8(<1 x i8>)
4096 define i8 @vreduce_smax_v1i8(ptr %x) {
4097 ; CHECK-LABEL: vreduce_smax_v1i8:
4099 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
4100 ; CHECK-NEXT: vle8.v v8, (a0)
4101 ; CHECK-NEXT: vmv.x.s a0, v8
4103 %v = load <1 x i8>, ptr %x
4104 %red = call i8 @llvm.vector.reduce.smax.v1i8(<1 x i8> %v)
4108 declare i8 @llvm.vector.reduce.smax.v2i8(<2 x i8>)
4110 define i8 @vreduce_smax_v2i8(ptr %x) {
4111 ; CHECK-LABEL: vreduce_smax_v2i8:
4113 ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
4114 ; CHECK-NEXT: vle8.v v8, (a0)
4115 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4116 ; CHECK-NEXT: vmv.x.s a0, v8
4118 %v = load <2 x i8>, ptr %x
4119 %red = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> %v)
4123 declare i8 @llvm.vector.reduce.smax.v4i8(<4 x i8>)
4125 define i8 @vreduce_smax_v4i8(ptr %x) {
4126 ; CHECK-LABEL: vreduce_smax_v4i8:
4128 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
4129 ; CHECK-NEXT: vle8.v v8, (a0)
4130 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4131 ; CHECK-NEXT: vmv.x.s a0, v8
4133 %v = load <4 x i8>, ptr %x
4134 %red = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> %v)
4138 declare i8 @llvm.vector.reduce.smax.v8i8(<8 x i8>)
4140 define i8 @vreduce_smax_v8i8(ptr %x) {
4141 ; CHECK-LABEL: vreduce_smax_v8i8:
4143 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
4144 ; CHECK-NEXT: vle8.v v8, (a0)
4145 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4146 ; CHECK-NEXT: vmv.x.s a0, v8
4148 %v = load <8 x i8>, ptr %x
4149 %red = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %v)
4153 declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>)
4155 define i8 @vreduce_smax_v16i8(ptr %x) {
4156 ; CHECK-LABEL: vreduce_smax_v16i8:
4158 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
4159 ; CHECK-NEXT: vle8.v v8, (a0)
4160 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4161 ; CHECK-NEXT: vmv.x.s a0, v8
4163 %v = load <16 x i8>, ptr %x
4164 %red = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %v)
4168 declare i8 @llvm.vector.reduce.smax.v32i8(<32 x i8>)
4170 define i8 @vreduce_smax_v32i8(ptr %x) {
4171 ; CHECK-LABEL: vreduce_smax_v32i8:
4173 ; CHECK-NEXT: li a1, 32
4174 ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
4175 ; CHECK-NEXT: vle8.v v8, (a0)
4176 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4177 ; CHECK-NEXT: vmv.x.s a0, v8
4179 %v = load <32 x i8>, ptr %x
4180 %red = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %v)
4184 declare i8 @llvm.vector.reduce.smax.v64i8(<64 x i8>)
4186 define i8 @vreduce_smax_v64i8(ptr %x) {
4187 ; CHECK-LABEL: vreduce_smax_v64i8:
4189 ; CHECK-NEXT: li a1, 64
4190 ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
4191 ; CHECK-NEXT: vle8.v v8, (a0)
4192 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4193 ; CHECK-NEXT: vmv.x.s a0, v8
4195 %v = load <64 x i8>, ptr %x
4196 %red = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> %v)
4200 declare i8 @llvm.vector.reduce.smax.v128i8(<128 x i8>)
4202 define i8 @vreduce_smax_v128i8(ptr %x) {
4203 ; CHECK-LABEL: vreduce_smax_v128i8:
4205 ; CHECK-NEXT: li a1, 128
4206 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
4207 ; CHECK-NEXT: vle8.v v8, (a0)
4208 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4209 ; CHECK-NEXT: vmv.x.s a0, v8
4211 %v = load <128 x i8>, ptr %x
4212 %red = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> %v)
4216 declare i8 @llvm.vector.reduce.smax.v256i8(<256 x i8>)
4218 define i8 @vreduce_smax_v256i8(ptr %x) {
4219 ; CHECK-LABEL: vreduce_smax_v256i8:
4221 ; CHECK-NEXT: li a1, 128
4222 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
4223 ; CHECK-NEXT: vle8.v v8, (a0)
4224 ; CHECK-NEXT: addi a0, a0, 128
4225 ; CHECK-NEXT: vle8.v v16, (a0)
4226 ; CHECK-NEXT: vmax.vv v8, v8, v16
4227 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4228 ; CHECK-NEXT: vmv.x.s a0, v8
4230 %v = load <256 x i8>, ptr %x
4231 %red = call i8 @llvm.vector.reduce.smax.v256i8(<256 x i8> %v)
4235 declare i16 @llvm.vector.reduce.smax.v1i16(<1 x i16>)
4237 define i16 @vreduce_smax_v1i16(ptr %x) {
4238 ; CHECK-LABEL: vreduce_smax_v1i16:
4240 ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
4241 ; CHECK-NEXT: vle16.v v8, (a0)
4242 ; CHECK-NEXT: vmv.x.s a0, v8
4244 %v = load <1 x i16>, ptr %x
4245 %red = call i16 @llvm.vector.reduce.smax.v1i16(<1 x i16> %v)
4249 declare i16 @llvm.vector.reduce.smax.v2i16(<2 x i16>)
4251 define i16 @vreduce_smax_v2i16(ptr %x) {
4252 ; CHECK-LABEL: vreduce_smax_v2i16:
4254 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
4255 ; CHECK-NEXT: vle16.v v8, (a0)
4256 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4257 ; CHECK-NEXT: vmv.x.s a0, v8
4259 %v = load <2 x i16>, ptr %x
4260 %red = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> %v)
4264 declare i16 @llvm.vector.reduce.smax.v4i16(<4 x i16>)
4266 define i16 @vreduce_smax_v4i16(ptr %x) {
4267 ; CHECK-LABEL: vreduce_smax_v4i16:
4269 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
4270 ; CHECK-NEXT: vle16.v v8, (a0)
4271 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4272 ; CHECK-NEXT: vmv.x.s a0, v8
4274 %v = load <4 x i16>, ptr %x
4275 %red = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %v)
4279 declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>)
4281 define i16 @vreduce_smax_v8i16(ptr %x) {
4282 ; CHECK-LABEL: vreduce_smax_v8i16:
4284 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
4285 ; CHECK-NEXT: vle16.v v8, (a0)
4286 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4287 ; CHECK-NEXT: vmv.x.s a0, v8
4289 %v = load <8 x i16>, ptr %x
4290 %red = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %v)
4294 declare i16 @llvm.vector.reduce.smax.v16i16(<16 x i16>)
4296 define i16 @vreduce_smax_v16i16(ptr %x) {
4297 ; CHECK-LABEL: vreduce_smax_v16i16:
4299 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
4300 ; CHECK-NEXT: vle16.v v8, (a0)
4301 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4302 ; CHECK-NEXT: vmv.x.s a0, v8
4304 %v = load <16 x i16>, ptr %x
4305 %red = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %v)
4309 declare i16 @llvm.vector.reduce.smax.v32i16(<32 x i16>)
4311 define i16 @vreduce_smax_v32i16(ptr %x) {
4312 ; CHECK-LABEL: vreduce_smax_v32i16:
4314 ; CHECK-NEXT: li a1, 32
4315 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
4316 ; CHECK-NEXT: vle16.v v8, (a0)
4317 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4318 ; CHECK-NEXT: vmv.x.s a0, v8
4320 %v = load <32 x i16>, ptr %x
4321 %red = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> %v)
4325 declare i16 @llvm.vector.reduce.smax.v64i16(<64 x i16>)
4327 define i16 @vreduce_smax_v64i16(ptr %x) {
4328 ; CHECK-LABEL: vreduce_smax_v64i16:
4330 ; CHECK-NEXT: li a1, 64
4331 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
4332 ; CHECK-NEXT: vle16.v v8, (a0)
4333 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4334 ; CHECK-NEXT: vmv.x.s a0, v8
4336 %v = load <64 x i16>, ptr %x
4337 %red = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> %v)
4341 declare i16 @llvm.vector.reduce.smax.v128i16(<128 x i16>)
4343 define i16 @vreduce_smax_v128i16(ptr %x) {
4344 ; CHECK-LABEL: vreduce_smax_v128i16:
4346 ; CHECK-NEXT: li a1, 64
4347 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
4348 ; CHECK-NEXT: vle16.v v8, (a0)
4349 ; CHECK-NEXT: addi a0, a0, 128
4350 ; CHECK-NEXT: vle16.v v16, (a0)
4351 ; CHECK-NEXT: vmax.vv v8, v8, v16
4352 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4353 ; CHECK-NEXT: vmv.x.s a0, v8
4355 %v = load <128 x i16>, ptr %x
4356 %red = call i16 @llvm.vector.reduce.smax.v128i16(<128 x i16> %v)
4360 declare i32 @llvm.vector.reduce.smax.v1i32(<1 x i32>)
4362 define i32 @vreduce_smax_v1i32(ptr %x) {
4363 ; CHECK-LABEL: vreduce_smax_v1i32:
4365 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
4366 ; CHECK-NEXT: vle32.v v8, (a0)
4367 ; CHECK-NEXT: vmv.x.s a0, v8
4369 %v = load <1 x i32>, ptr %x
4370 %red = call i32 @llvm.vector.reduce.smax.v1i32(<1 x i32> %v)
4374 declare i32 @llvm.vector.reduce.smax.v2i32(<2 x i32>)
4376 define i32 @vreduce_smax_v2i32(ptr %x) {
4377 ; CHECK-LABEL: vreduce_smax_v2i32:
4379 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
4380 ; CHECK-NEXT: vle32.v v8, (a0)
4381 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4382 ; CHECK-NEXT: vmv.x.s a0, v8
4384 %v = load <2 x i32>, ptr %x
4385 %red = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %v)
4389 declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)
4391 define i32 @vreduce_smax_v4i32(ptr %x) {
4392 ; CHECK-LABEL: vreduce_smax_v4i32:
4394 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
4395 ; CHECK-NEXT: vle32.v v8, (a0)
4396 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4397 ; CHECK-NEXT: vmv.x.s a0, v8
4399 %v = load <4 x i32>, ptr %x
4400 %red = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %v)
4404 declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32>)
4406 define i32 @vreduce_smax_v8i32(ptr %x) {
4407 ; CHECK-LABEL: vreduce_smax_v8i32:
4409 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
4410 ; CHECK-NEXT: vle32.v v8, (a0)
4411 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4412 ; CHECK-NEXT: vmv.x.s a0, v8
4414 %v = load <8 x i32>, ptr %x
4415 %red = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %v)
4419 declare i32 @llvm.vector.reduce.smax.v16i32(<16 x i32>)
4421 define i32 @vreduce_smax_v16i32(ptr %x) {
4422 ; CHECK-LABEL: vreduce_smax_v16i32:
4424 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
4425 ; CHECK-NEXT: vle32.v v8, (a0)
4426 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4427 ; CHECK-NEXT: vmv.x.s a0, v8
4429 %v = load <16 x i32>, ptr %x
4430 %red = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> %v)
4434 declare i32 @llvm.vector.reduce.smax.v32i32(<32 x i32>)
4436 define i32 @vreduce_smax_v32i32(ptr %x) {
4437 ; CHECK-LABEL: vreduce_smax_v32i32:
4439 ; CHECK-NEXT: li a1, 32
4440 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
4441 ; CHECK-NEXT: vle32.v v8, (a0)
4442 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4443 ; CHECK-NEXT: vmv.x.s a0, v8
4445 %v = load <32 x i32>, ptr %x
4446 %red = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> %v)
4450 declare i32 @llvm.vector.reduce.smax.v64i32(<64 x i32>)
4452 define i32 @vreduce_smax_v64i32(ptr %x) {
4453 ; CHECK-LABEL: vreduce_smax_v64i32:
4455 ; CHECK-NEXT: li a1, 32
4456 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
4457 ; CHECK-NEXT: vle32.v v8, (a0)
4458 ; CHECK-NEXT: addi a0, a0, 128
4459 ; CHECK-NEXT: vle32.v v16, (a0)
4460 ; CHECK-NEXT: vmax.vv v8, v8, v16
4461 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4462 ; CHECK-NEXT: vmv.x.s a0, v8
4464 %v = load <64 x i32>, ptr %x
4465 %red = call i32 @llvm.vector.reduce.smax.v64i32(<64 x i32> %v)
4469 declare i64 @llvm.vector.reduce.smax.v1i64(<1 x i64>)
4471 define i64 @vreduce_smax_v1i64(ptr %x) {
4472 ; RV32-LABEL: vreduce_smax_v1i64:
4474 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
4475 ; RV32-NEXT: vle64.v v8, (a0)
4476 ; RV32-NEXT: li a0, 32
4477 ; RV32-NEXT: vsrl.vx v9, v8, a0
4478 ; RV32-NEXT: vmv.x.s a1, v9
4479 ; RV32-NEXT: vmv.x.s a0, v8
4482 ; RV64-LABEL: vreduce_smax_v1i64:
4484 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
4485 ; RV64-NEXT: vle64.v v8, (a0)
4486 ; RV64-NEXT: vmv.x.s a0, v8
4488 %v = load <1 x i64>, ptr %x
4489 %red = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> %v)
4493 declare i64 @llvm.vector.reduce.smax.v2i64(<2 x i64>)
4495 define i64 @vreduce_smax_v2i64(ptr %x) {
4496 ; RV32-LABEL: vreduce_smax_v2i64:
4498 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
4499 ; RV32-NEXT: vle64.v v8, (a0)
4500 ; RV32-NEXT: vredmax.vs v8, v8, v8
4501 ; RV32-NEXT: li a0, 32
4502 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
4503 ; RV32-NEXT: vsrl.vx v9, v8, a0
4504 ; RV32-NEXT: vmv.x.s a1, v9
4505 ; RV32-NEXT: vmv.x.s a0, v8
4508 ; RV64-LABEL: vreduce_smax_v2i64:
4510 ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
4511 ; RV64-NEXT: vle64.v v8, (a0)
4512 ; RV64-NEXT: vredmax.vs v8, v8, v8
4513 ; RV64-NEXT: vmv.x.s a0, v8
4515 %v = load <2 x i64>, ptr %x
4516 %red = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %v)
4520 declare i64 @llvm.vector.reduce.smax.v4i64(<4 x i64>)
4522 define i64 @vreduce_smax_v4i64(ptr %x) {
4523 ; RV32-LABEL: vreduce_smax_v4i64:
4525 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
4526 ; RV32-NEXT: vle64.v v8, (a0)
4527 ; RV32-NEXT: vredmax.vs v8, v8, v8
4528 ; RV32-NEXT: vmv.x.s a0, v8
4529 ; RV32-NEXT: li a1, 32
4530 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
4531 ; RV32-NEXT: vsrl.vx v8, v8, a1
4532 ; RV32-NEXT: vmv.x.s a1, v8
4535 ; RV64-LABEL: vreduce_smax_v4i64:
4537 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
4538 ; RV64-NEXT: vle64.v v8, (a0)
4539 ; RV64-NEXT: vredmax.vs v8, v8, v8
4540 ; RV64-NEXT: vmv.x.s a0, v8
4542 %v = load <4 x i64>, ptr %x
4543 %red = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %v)
4547 declare i64 @llvm.vector.reduce.smax.v8i64(<8 x i64>)
4549 define i64 @vreduce_smax_v8i64(ptr %x) {
4550 ; RV32-LABEL: vreduce_smax_v8i64:
4552 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
4553 ; RV32-NEXT: vle64.v v8, (a0)
4554 ; RV32-NEXT: vredmax.vs v8, v8, v8
4555 ; RV32-NEXT: vmv.x.s a0, v8
4556 ; RV32-NEXT: li a1, 32
4557 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
4558 ; RV32-NEXT: vsrl.vx v8, v8, a1
4559 ; RV32-NEXT: vmv.x.s a1, v8
4562 ; RV64-LABEL: vreduce_smax_v8i64:
4564 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
4565 ; RV64-NEXT: vle64.v v8, (a0)
4566 ; RV64-NEXT: vredmax.vs v8, v8, v8
4567 ; RV64-NEXT: vmv.x.s a0, v8
4569 %v = load <8 x i64>, ptr %x
4570 %red = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> %v)
4574 declare i64 @llvm.vector.reduce.smax.v16i64(<16 x i64>)
4576 define i64 @vreduce_smax_v16i64(ptr %x) {
4577 ; RV32-LABEL: vreduce_smax_v16i64:
4579 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
4580 ; RV32-NEXT: vle64.v v8, (a0)
4581 ; RV32-NEXT: vredmax.vs v8, v8, v8
4582 ; RV32-NEXT: vmv.x.s a0, v8
4583 ; RV32-NEXT: li a1, 32
4584 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
4585 ; RV32-NEXT: vsrl.vx v8, v8, a1
4586 ; RV32-NEXT: vmv.x.s a1, v8
4589 ; RV64-LABEL: vreduce_smax_v16i64:
4591 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
4592 ; RV64-NEXT: vle64.v v8, (a0)
4593 ; RV64-NEXT: vredmax.vs v8, v8, v8
4594 ; RV64-NEXT: vmv.x.s a0, v8
4596 %v = load <16 x i64>, ptr %x
4597 %red = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> %v)
4601 declare i64 @llvm.vector.reduce.smax.v32i64(<32 x i64>)
4603 define i64 @vreduce_smax_v32i64(ptr %x) {
4604 ; RV32-LABEL: vreduce_smax_v32i64:
4606 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
4607 ; RV32-NEXT: vle64.v v8, (a0)
4608 ; RV32-NEXT: addi a0, a0, 128
4609 ; RV32-NEXT: vle64.v v16, (a0)
4610 ; RV32-NEXT: vmax.vv v8, v8, v16
4611 ; RV32-NEXT: vredmax.vs v8, v8, v8
4612 ; RV32-NEXT: vmv.x.s a0, v8
4613 ; RV32-NEXT: li a1, 32
4614 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
4615 ; RV32-NEXT: vsrl.vx v8, v8, a1
4616 ; RV32-NEXT: vmv.x.s a1, v8
4619 ; RV64-LABEL: vreduce_smax_v32i64:
4621 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
4622 ; RV64-NEXT: vle64.v v8, (a0)
4623 ; RV64-NEXT: addi a0, a0, 128
4624 ; RV64-NEXT: vle64.v v16, (a0)
4625 ; RV64-NEXT: vmax.vv v8, v8, v16
4626 ; RV64-NEXT: vredmax.vs v8, v8, v8
4627 ; RV64-NEXT: vmv.x.s a0, v8
4629 %v = load <32 x i64>, ptr %x
4630 %red = call i64 @llvm.vector.reduce.smax.v32i64(<32 x i64> %v)
4634 declare i64 @llvm.vector.reduce.smax.v64i64(<64 x i64>)
4636 define i64 @vreduce_smax_v64i64(ptr %x) nounwind {
4637 ; RV32-LABEL: vreduce_smax_v64i64:
4639 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
4640 ; RV32-NEXT: vle64.v v8, (a0)
4641 ; RV32-NEXT: addi a1, a0, 384
4642 ; RV32-NEXT: vle64.v v16, (a1)
4643 ; RV32-NEXT: addi a1, a0, 256
4644 ; RV32-NEXT: addi a0, a0, 128
4645 ; RV32-NEXT: vle64.v v24, (a0)
4646 ; RV32-NEXT: vle64.v v0, (a1)
4647 ; RV32-NEXT: vmax.vv v16, v24, v16
4648 ; RV32-NEXT: vmax.vv v8, v8, v0
4649 ; RV32-NEXT: vmax.vv v8, v8, v16
4650 ; RV32-NEXT: vredmax.vs v8, v8, v8
4651 ; RV32-NEXT: vmv.x.s a0, v8
4652 ; RV32-NEXT: li a1, 32
4653 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
4654 ; RV32-NEXT: vsrl.vx v8, v8, a1
4655 ; RV32-NEXT: vmv.x.s a1, v8
4658 ; RV64-LABEL: vreduce_smax_v64i64:
4660 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
4661 ; RV64-NEXT: vle64.v v8, (a0)
4662 ; RV64-NEXT: addi a1, a0, 384
4663 ; RV64-NEXT: vle64.v v16, (a1)
4664 ; RV64-NEXT: addi a1, a0, 256
4665 ; RV64-NEXT: addi a0, a0, 128
4666 ; RV64-NEXT: vle64.v v24, (a0)
4667 ; RV64-NEXT: vle64.v v0, (a1)
4668 ; RV64-NEXT: vmax.vv v16, v24, v16
4669 ; RV64-NEXT: vmax.vv v8, v8, v0
4670 ; RV64-NEXT: vmax.vv v8, v8, v16
4671 ; RV64-NEXT: vredmax.vs v8, v8, v8
4672 ; RV64-NEXT: vmv.x.s a0, v8
4674 %v = load <64 x i64>, ptr %x
4675 %red = call i64 @llvm.vector.reduce.smax.v64i64(<64 x i64> %v)
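; Unsigned minimum reductions, using vredminu.vs and vminu.vv for the split
; wide cases.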
4679 declare i8 @llvm.vector.reduce.umin.v1i8(<1 x i8>)
4681 define i8 @vreduce_umin_v1i8(ptr %x) {
4682 ; CHECK-LABEL: vreduce_umin_v1i8:
4684 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
4685 ; CHECK-NEXT: vle8.v v8, (a0)
4686 ; CHECK-NEXT: vmv.x.s a0, v8
4688 %v = load <1 x i8>, ptr %x
4689 %red = call i8 @llvm.vector.reduce.umin.v1i8(<1 x i8> %v)
4693 declare i8 @llvm.vector.reduce.umin.v2i8(<2 x i8>)
4695 define i8 @vreduce_umin_v2i8(ptr %x) {
4696 ; CHECK-LABEL: vreduce_umin_v2i8:
4698 ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
4699 ; CHECK-NEXT: vle8.v v8, (a0)
4700 ; CHECK-NEXT: vredminu.vs v8, v8, v8
4701 ; CHECK-NEXT: vmv.x.s a0, v8
4703 %v = load <2 x i8>, ptr %x
4704 %red = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> %v)
4708 declare i8 @llvm.vector.reduce.umin.v4i8(<4 x i8>)
4710 define i8 @vreduce_umin_v4i8(ptr %x) {
4711 ; CHECK-LABEL: vreduce_umin_v4i8:
4713 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
4714 ; CHECK-NEXT: vle8.v v8, (a0)
4715 ; CHECK-NEXT: vredminu.vs v8, v8, v8
4716 ; CHECK-NEXT: vmv.x.s a0, v8
4718 %v = load <4 x i8>, ptr %x
4719 %red = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> %v)
4723 declare i8 @llvm.vector.reduce.umin.v8i8(<8 x i8>)
4725 define i8 @vreduce_umin_v8i8(ptr %x) {
4726 ; CHECK-LABEL: vreduce_umin_v8i8:
4728 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
4729 ; CHECK-NEXT: vle8.v v8, (a0)
4730 ; CHECK-NEXT: vredminu.vs v8, v8, v8
4731 ; CHECK-NEXT: vmv.x.s a0, v8
4733 %v = load <8 x i8>, ptr %x
4734 %red = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %v)
4738 declare i8 @llvm.vector.reduce.umin.v16i8(<16 x i8>)
4740 define i8 @vreduce_umin_v16i8(ptr %x) {
4741 ; CHECK-LABEL: vreduce_umin_v16i8:
4743 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
4744 ; CHECK-NEXT: vle8.v v8, (a0)
4745 ; CHECK-NEXT: vredminu.vs v8, v8, v8
4746 ; CHECK-NEXT: vmv.x.s a0, v8
4748 %v = load <16 x i8>, ptr %x
4749 %red = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %v)
4753 declare i8 @llvm.vector.reduce.umin.v32i8(<32 x i8>)
4755 define i8 @vreduce_umin_v32i8(ptr %x) {
4756 ; CHECK-LABEL: vreduce_umin_v32i8:
4758 ; CHECK-NEXT: li a1, 32
4759 ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
4760 ; CHECK-NEXT: vle8.v v8, (a0)
4761 ; CHECK-NEXT: vredminu.vs v8, v8, v8
4762 ; CHECK-NEXT: vmv.x.s a0, v8
4764 %v = load <32 x i8>, ptr %x
4765 %red = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %v)
4769 declare i8 @llvm.vector.reduce.umin.v64i8(<64 x i8>)
4771 define i8 @vreduce_umin_v64i8(ptr %x) {
4772 ; CHECK-LABEL: vreduce_umin_v64i8:
4774 ; CHECK-NEXT: li a1, 64
4775 ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
4776 ; CHECK-NEXT: vle8.v v8, (a0)
4777 ; CHECK-NEXT: vredminu.vs v8, v8, v8
4778 ; CHECK-NEXT: vmv.x.s a0, v8
4780 %v = load <64 x i8>, ptr %x
4781 %red = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> %v)
4785 declare i8 @llvm.vector.reduce.umin.v128i8(<128 x i8>)
4787 define i8 @vreduce_umin_v128i8(ptr %x) {
4788 ; CHECK-LABEL: vreduce_umin_v128i8:
4790 ; CHECK-NEXT: li a1, 128
4791 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
4792 ; CHECK-NEXT: vle8.v v8, (a0)
4793 ; CHECK-NEXT: vredminu.vs v8, v8, v8
4794 ; CHECK-NEXT: vmv.x.s a0, v8
4796 %v = load <128 x i8>, ptr %x
4797 %red = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> %v)
4801 declare i8 @llvm.vector.reduce.umin.v256i8(<256 x i8>)
4803 define i8 @vreduce_umin_v256i8(ptr %x) {
4804 ; CHECK-LABEL: vreduce_umin_v256i8:
4806 ; CHECK-NEXT: li a1, 128
4807 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
4808 ; CHECK-NEXT: vle8.v v8, (a0)
4809 ; CHECK-NEXT: addi a0, a0, 128
4810 ; CHECK-NEXT: vle8.v v16, (a0)
4811 ; CHECK-NEXT: vminu.vv v8, v8, v16
4812 ; CHECK-NEXT: vredminu.vs v8, v8, v8
4813 ; CHECK-NEXT: vmv.x.s a0, v8
4815 %v = load <256 x i8>, ptr %x
4816 %red = call i8 @llvm.vector.reduce.umin.v256i8(<256 x i8> %v)
4820 declare i16 @llvm.vector.reduce.umin.v1i16(<1 x i16>)
4822 define i16 @vreduce_umin_v1i16(ptr %x) {
4823 ; CHECK-LABEL: vreduce_umin_v1i16:
4825 ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
4826 ; CHECK-NEXT: vle16.v v8, (a0)
4827 ; CHECK-NEXT: vmv.x.s a0, v8
4829 %v = load <1 x i16>, ptr %x
4830 %red = call i16 @llvm.vector.reduce.umin.v1i16(<1 x i16> %v)
4834 declare i16 @llvm.vector.reduce.umin.v2i16(<2 x i16>)
4836 define i16 @vreduce_umin_v2i16(ptr %x) {
4837 ; CHECK-LABEL: vreduce_umin_v2i16:
4839 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
4840 ; CHECK-NEXT: vle16.v v8, (a0)
4841 ; CHECK-NEXT: vredminu.vs v8, v8, v8
4842 ; CHECK-NEXT: vmv.x.s a0, v8
4844 %v = load <2 x i16>, ptr %x
4845 %red = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> %v)
4849 declare i16 @llvm.vector.reduce.umin.v4i16(<4 x i16>)
4851 define i16 @vreduce_umin_v4i16(ptr %x) {
4852 ; CHECK-LABEL: vreduce_umin_v4i16:
4854 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
4855 ; CHECK-NEXT: vle16.v v8, (a0)
4856 ; CHECK-NEXT: vredminu.vs v8, v8, v8
4857 ; CHECK-NEXT: vmv.x.s a0, v8
4859 %v = load <4 x i16>, ptr %x
4860 %red = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %v)
4864 declare i16 @llvm.vector.reduce.umin.v8i16(<8 x i16>)
4866 define i16 @vreduce_umin_v8i16(ptr %x) {
4867 ; CHECK-LABEL: vreduce_umin_v8i16:
4869 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
4870 ; CHECK-NEXT: vle16.v v8, (a0)
4871 ; CHECK-NEXT: vredminu.vs v8, v8, v8
4872 ; CHECK-NEXT: vmv.x.s a0, v8
4874 %v = load <8 x i16>, ptr %x
4875 %red = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %v)
4879 declare i16 @llvm.vector.reduce.umin.v16i16(<16 x i16>)
4881 define i16 @vreduce_umin_v16i16(ptr %x) {
4882 ; CHECK-LABEL: vreduce_umin_v16i16:
4884 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
4885 ; CHECK-NEXT: vle16.v v8, (a0)
4886 ; CHECK-NEXT: vredminu.vs v8, v8, v8
4887 ; CHECK-NEXT: vmv.x.s a0, v8
4889 %v = load <16 x i16>, ptr %x
4890 %red = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %v)
4894 declare i16 @llvm.vector.reduce.umin.v32i16(<32 x i16>)
4896 define i16 @vreduce_umin_v32i16(ptr %x) {
4897 ; CHECK-LABEL: vreduce_umin_v32i16:
4899 ; CHECK-NEXT: li a1, 32
4900 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
4901 ; CHECK-NEXT: vle16.v v8, (a0)
4902 ; CHECK-NEXT: vredminu.vs v8, v8, v8
4903 ; CHECK-NEXT: vmv.x.s a0, v8
4905 %v = load <32 x i16>, ptr %x
4906 %red = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> %v)
4910 declare i16 @llvm.vector.reduce.umin.v64i16(<64 x i16>)
4912 define i16 @vreduce_umin_v64i16(ptr %x) {
4913 ; CHECK-LABEL: vreduce_umin_v64i16:
4915 ; CHECK-NEXT: li a1, 64
4916 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
4917 ; CHECK-NEXT: vle16.v v8, (a0)
4918 ; CHECK-NEXT: vredminu.vs v8, v8, v8
4919 ; CHECK-NEXT: vmv.x.s a0, v8
4921 %v = load <64 x i16>, ptr %x
4922 %red = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> %v)
4926 declare i16 @llvm.vector.reduce.umin.v128i16(<128 x i16>)
4928 define i16 @vreduce_umin_v128i16(ptr %x) {
4929 ; CHECK-LABEL: vreduce_umin_v128i16:
4931 ; CHECK-NEXT: li a1, 64
4932 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
4933 ; CHECK-NEXT: vle16.v v8, (a0)
4934 ; CHECK-NEXT: addi a0, a0, 128
4935 ; CHECK-NEXT: vle16.v v16, (a0)
4936 ; CHECK-NEXT: vminu.vv v8, v8, v16
4937 ; CHECK-NEXT: vredminu.vs v8, v8, v8
4938 ; CHECK-NEXT: vmv.x.s a0, v8
4940 %v = load <128 x i16>, ptr %x
4941 %red = call i16 @llvm.vector.reduce.umin.v128i16(<128 x i16> %v)
4945 declare i32 @llvm.vector.reduce.umin.v1i32(<1 x i32>)
4947 define i32 @vreduce_umin_v1i32(ptr %x) {
4948 ; CHECK-LABEL: vreduce_umin_v1i32:
4950 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
4951 ; CHECK-NEXT: vle32.v v8, (a0)
4952 ; CHECK-NEXT: vmv.x.s a0, v8
4954 %v = load <1 x i32>, ptr %x
4955 %red = call i32 @llvm.vector.reduce.umin.v1i32(<1 x i32> %v)
4959 declare i32 @llvm.vector.reduce.umin.v2i32(<2 x i32>)
4961 define i32 @vreduce_umin_v2i32(ptr %x) {
4962 ; CHECK-LABEL: vreduce_umin_v2i32:
4964 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
4965 ; CHECK-NEXT: vle32.v v8, (a0)
4966 ; CHECK-NEXT: vredminu.vs v8, v8, v8
4967 ; CHECK-NEXT: vmv.x.s a0, v8
4969 %v = load <2 x i32>, ptr %x
4970 %red = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> %v)
4974 declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>)
4976 define i32 @vreduce_umin_v4i32(ptr %x) {
4977 ; CHECK-LABEL: vreduce_umin_v4i32:
4979 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
4980 ; CHECK-NEXT: vle32.v v8, (a0)
4981 ; CHECK-NEXT: vredminu.vs v8, v8, v8
4982 ; CHECK-NEXT: vmv.x.s a0, v8
4984 %v = load <4 x i32>, ptr %x
4985 %red = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %v)
4989 declare i32 @llvm.vector.reduce.umin.v8i32(<8 x i32>)
4991 define i32 @vreduce_umin_v8i32(ptr %x) {
4992 ; CHECK-LABEL: vreduce_umin_v8i32:
4994 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
4995 ; CHECK-NEXT: vle32.v v8, (a0)
4996 ; CHECK-NEXT: vredminu.vs v8, v8, v8
4997 ; CHECK-NEXT: vmv.x.s a0, v8
4999 %v = load <8 x i32>, ptr %x
5000 %red = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %v)
5004 declare i32 @llvm.vector.reduce.umin.v16i32(<16 x i32>)
5006 define i32 @vreduce_umin_v16i32(ptr %x) {
5007 ; CHECK-LABEL: vreduce_umin_v16i32:
5009 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
5010 ; CHECK-NEXT: vle32.v v8, (a0)
5011 ; CHECK-NEXT: vredminu.vs v8, v8, v8
5012 ; CHECK-NEXT: vmv.x.s a0, v8
5014 %v = load <16 x i32>, ptr %x
5015 %red = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> %v)
5019 declare i32 @llvm.vector.reduce.umin.v32i32(<32 x i32>)
5021 define i32 @vreduce_umin_v32i32(ptr %x) {
5022 ; CHECK-LABEL: vreduce_umin_v32i32:
5024 ; CHECK-NEXT: li a1, 32
5025 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
5026 ; CHECK-NEXT: vle32.v v8, (a0)
5027 ; CHECK-NEXT: vredminu.vs v8, v8, v8
5028 ; CHECK-NEXT: vmv.x.s a0, v8
5030 %v = load <32 x i32>, ptr %x
5031 %red = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> %v)
5035 declare i32 @llvm.vector.reduce.umin.v64i32(<64 x i32>)
5037 define i32 @vreduce_umin_v64i32(ptr %x) {
5038 ; CHECK-LABEL: vreduce_umin_v64i32:
5040 ; CHECK-NEXT: li a1, 32
5041 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
5042 ; CHECK-NEXT: vle32.v v8, (a0)
5043 ; CHECK-NEXT: addi a0, a0, 128
5044 ; CHECK-NEXT: vle32.v v16, (a0)
5045 ; CHECK-NEXT: vminu.vv v8, v8, v16
5046 ; CHECK-NEXT: vredminu.vs v8, v8, v8
5047 ; CHECK-NEXT: vmv.x.s a0, v8
5049 %v = load <64 x i32>, ptr %x
5050 %red = call i32 @llvm.vector.reduce.umin.v64i32(<64 x i32> %v)
5054 declare i64 @llvm.vector.reduce.umin.v1i64(<1 x i64>)
5056 define i64 @vreduce_umin_v1i64(ptr %x) {
5057 ; RV32-LABEL: vreduce_umin_v1i64:
5059 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
5060 ; RV32-NEXT: vle64.v v8, (a0)
5061 ; RV32-NEXT: li a0, 32
5062 ; RV32-NEXT: vsrl.vx v9, v8, a0
5063 ; RV32-NEXT: vmv.x.s a1, v9
5064 ; RV32-NEXT: vmv.x.s a0, v8
5067 ; RV64-LABEL: vreduce_umin_v1i64:
5069 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
5070 ; RV64-NEXT: vle64.v v8, (a0)
5071 ; RV64-NEXT: vmv.x.s a0, v8
5073 %v = load <1 x i64>, ptr %x
5074 %red = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> %v)
5078 declare i64 @llvm.vector.reduce.umin.v2i64(<2 x i64>)
5080 define i64 @vreduce_umin_v2i64(ptr %x) {
5081 ; RV32-LABEL: vreduce_umin_v2i64:
5083 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
5084 ; RV32-NEXT: vle64.v v8, (a0)
5085 ; RV32-NEXT: vredminu.vs v8, v8, v8
5086 ; RV32-NEXT: li a0, 32
5087 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
5088 ; RV32-NEXT: vsrl.vx v9, v8, a0
5089 ; RV32-NEXT: vmv.x.s a1, v9
5090 ; RV32-NEXT: vmv.x.s a0, v8
5093 ; RV64-LABEL: vreduce_umin_v2i64:
5095 ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
5096 ; RV64-NEXT: vle64.v v8, (a0)
5097 ; RV64-NEXT: vredminu.vs v8, v8, v8
5098 ; RV64-NEXT: vmv.x.s a0, v8
5100 %v = load <2 x i64>, ptr %x
5101 %red = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %v)
5105 declare i64 @llvm.vector.reduce.umin.v4i64(<4 x i64>)
5107 define i64 @vreduce_umin_v4i64(ptr %x) {
5108 ; RV32-LABEL: vreduce_umin_v4i64:
5110 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
5111 ; RV32-NEXT: vle64.v v8, (a0)
5112 ; RV32-NEXT: vredminu.vs v8, v8, v8
5113 ; RV32-NEXT: vmv.x.s a0, v8
5114 ; RV32-NEXT: li a1, 32
5115 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
5116 ; RV32-NEXT: vsrl.vx v8, v8, a1
5117 ; RV32-NEXT: vmv.x.s a1, v8
5120 ; RV64-LABEL: vreduce_umin_v4i64:
5122 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
5123 ; RV64-NEXT: vle64.v v8, (a0)
5124 ; RV64-NEXT: vredminu.vs v8, v8, v8
5125 ; RV64-NEXT: vmv.x.s a0, v8
5127 %v = load <4 x i64>, ptr %x
5128 %red = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %v)
5132 declare i64 @llvm.vector.reduce.umin.v8i64(<8 x i64>)
5134 define i64 @vreduce_umin_v8i64(ptr %x) {
5135 ; RV32-LABEL: vreduce_umin_v8i64:
5137 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
5138 ; RV32-NEXT: vle64.v v8, (a0)
5139 ; RV32-NEXT: vredminu.vs v8, v8, v8
5140 ; RV32-NEXT: vmv.x.s a0, v8
5141 ; RV32-NEXT: li a1, 32
5142 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
5143 ; RV32-NEXT: vsrl.vx v8, v8, a1
5144 ; RV32-NEXT: vmv.x.s a1, v8
5147 ; RV64-LABEL: vreduce_umin_v8i64:
5149 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
5150 ; RV64-NEXT: vle64.v v8, (a0)
5151 ; RV64-NEXT: vredminu.vs v8, v8, v8
5152 ; RV64-NEXT: vmv.x.s a0, v8
5154 %v = load <8 x i64>, ptr %x
5155 %red = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> %v)
5159 declare i64 @llvm.vector.reduce.umin.v16i64(<16 x i64>)
5161 define i64 @vreduce_umin_v16i64(ptr %x) {
5162 ; RV32-LABEL: vreduce_umin_v16i64:
5164 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
5165 ; RV32-NEXT: vle64.v v8, (a0)
5166 ; RV32-NEXT: vredminu.vs v8, v8, v8
5167 ; RV32-NEXT: vmv.x.s a0, v8
5168 ; RV32-NEXT: li a1, 32
5169 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
5170 ; RV32-NEXT: vsrl.vx v8, v8, a1
5171 ; RV32-NEXT: vmv.x.s a1, v8
5174 ; RV64-LABEL: vreduce_umin_v16i64:
5176 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
5177 ; RV64-NEXT: vle64.v v8, (a0)
5178 ; RV64-NEXT: vredminu.vs v8, v8, v8
5179 ; RV64-NEXT: vmv.x.s a0, v8
5181 %v = load <16 x i64>, ptr %x
5182 %red = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> %v)
5186 declare i64 @llvm.vector.reduce.umin.v32i64(<32 x i64>)
5188 define i64 @vreduce_umin_v32i64(ptr %x) {
5189 ; RV32-LABEL: vreduce_umin_v32i64:
5191 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
5192 ; RV32-NEXT: vle64.v v8, (a0)
5193 ; RV32-NEXT: addi a0, a0, 128
5194 ; RV32-NEXT: vle64.v v16, (a0)
5195 ; RV32-NEXT: vminu.vv v8, v8, v16
5196 ; RV32-NEXT: vredminu.vs v8, v8, v8
5197 ; RV32-NEXT: vmv.x.s a0, v8
5198 ; RV32-NEXT: li a1, 32
5199 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
5200 ; RV32-NEXT: vsrl.vx v8, v8, a1
5201 ; RV32-NEXT: vmv.x.s a1, v8
5204 ; RV64-LABEL: vreduce_umin_v32i64:
5206 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
5207 ; RV64-NEXT: vle64.v v8, (a0)
5208 ; RV64-NEXT: addi a0, a0, 128
5209 ; RV64-NEXT: vle64.v v16, (a0)
5210 ; RV64-NEXT: vminu.vv v8, v8, v16
5211 ; RV64-NEXT: vredminu.vs v8, v8, v8
5212 ; RV64-NEXT: vmv.x.s a0, v8
5214 %v = load <32 x i64>, ptr %x
5215 %red = call i64 @llvm.vector.reduce.umin.v32i64(<32 x i64> %v)
5219 declare i64 @llvm.vector.reduce.umin.v64i64(<64 x i64>)
5221 define i64 @vreduce_umin_v64i64(ptr %x) nounwind {
5222 ; RV32-LABEL: vreduce_umin_v64i64:
5224 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
5225 ; RV32-NEXT: vle64.v v8, (a0)
5226 ; RV32-NEXT: addi a1, a0, 384
5227 ; RV32-NEXT: vle64.v v16, (a1)
5228 ; RV32-NEXT: addi a1, a0, 256
5229 ; RV32-NEXT: addi a0, a0, 128
5230 ; RV32-NEXT: vle64.v v24, (a0)
5231 ; RV32-NEXT: vle64.v v0, (a1)
5232 ; RV32-NEXT: vminu.vv v16, v24, v16
5233 ; RV32-NEXT: vminu.vv v8, v8, v0
5234 ; RV32-NEXT: vminu.vv v8, v8, v16
5235 ; RV32-NEXT: vredminu.vs v8, v8, v8
5236 ; RV32-NEXT: vmv.x.s a0, v8
5237 ; RV32-NEXT: li a1, 32
5238 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
5239 ; RV32-NEXT: vsrl.vx v8, v8, a1
5240 ; RV32-NEXT: vmv.x.s a1, v8
5243 ; RV64-LABEL: vreduce_umin_v64i64:
5245 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
5246 ; RV64-NEXT: vle64.v v8, (a0)
5247 ; RV64-NEXT: addi a1, a0, 384
5248 ; RV64-NEXT: vle64.v v16, (a1)
5249 ; RV64-NEXT: addi a1, a0, 256
5250 ; RV64-NEXT: addi a0, a0, 128
5251 ; RV64-NEXT: vle64.v v24, (a0)
5252 ; RV64-NEXT: vle64.v v0, (a1)
5253 ; RV64-NEXT: vminu.vv v16, v24, v16
5254 ; RV64-NEXT: vminu.vv v8, v8, v0
5255 ; RV64-NEXT: vminu.vv v8, v8, v16
5256 ; RV64-NEXT: vredminu.vs v8, v8, v8
5257 ; RV64-NEXT: vmv.x.s a0, v8
5259 %v = load <64 x i64>, ptr %x
5260 %red = call i64 @llvm.vector.reduce.umin.v64i64(<64 x i64> %v)
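; Unsigned maximum reductions, using vredmaxu.vs and vmaxu.vv for the split
; wide cases.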
5264 declare i8 @llvm.vector.reduce.umax.v1i8(<1 x i8>)
5266 define i8 @vreduce_umax_v1i8(ptr %x) {
5267 ; CHECK-LABEL: vreduce_umax_v1i8:
5269 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
5270 ; CHECK-NEXT: vle8.v v8, (a0)
5271 ; CHECK-NEXT: vmv.x.s a0, v8
5273 %v = load <1 x i8>, ptr %x
5274 %red = call i8 @llvm.vector.reduce.umax.v1i8(<1 x i8> %v)
5278 declare i8 @llvm.vector.reduce.umax.v2i8(<2 x i8>)
5280 define i8 @vreduce_umax_v2i8(ptr %x) {
5281 ; CHECK-LABEL: vreduce_umax_v2i8:
5283 ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
5284 ; CHECK-NEXT: vle8.v v8, (a0)
5285 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5286 ; CHECK-NEXT: vmv.x.s a0, v8
5288 %v = load <2 x i8>, ptr %x
5289 %red = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> %v)
5293 declare i8 @llvm.vector.reduce.umax.v4i8(<4 x i8>)
5295 define i8 @vreduce_umax_v4i8(ptr %x) {
5296 ; CHECK-LABEL: vreduce_umax_v4i8:
5298 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
5299 ; CHECK-NEXT: vle8.v v8, (a0)
5300 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5301 ; CHECK-NEXT: vmv.x.s a0, v8
5303 %v = load <4 x i8>, ptr %x
5304 %red = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> %v)
5308 declare i8 @llvm.vector.reduce.umax.v8i8(<8 x i8>)
5310 define i8 @vreduce_umax_v8i8(ptr %x) {
5311 ; CHECK-LABEL: vreduce_umax_v8i8:
5313 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
5314 ; CHECK-NEXT: vle8.v v8, (a0)
5315 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5316 ; CHECK-NEXT: vmv.x.s a0, v8
5318 %v = load <8 x i8>, ptr %x
5319 %red = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %v)
5323 declare i8 @llvm.vector.reduce.umax.v16i8(<16 x i8>)
5325 define i8 @vreduce_umax_v16i8(ptr %x) {
5326 ; CHECK-LABEL: vreduce_umax_v16i8:
5328 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
5329 ; CHECK-NEXT: vle8.v v8, (a0)
5330 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5331 ; CHECK-NEXT: vmv.x.s a0, v8
5333 %v = load <16 x i8>, ptr %x
5334 %red = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %v)
5338 declare i8 @llvm.vector.reduce.umax.v32i8(<32 x i8>)
5340 define i8 @vreduce_umax_v32i8(ptr %x) {
5341 ; CHECK-LABEL: vreduce_umax_v32i8:
5343 ; CHECK-NEXT: li a1, 32
5344 ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
5345 ; CHECK-NEXT: vle8.v v8, (a0)
5346 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5347 ; CHECK-NEXT: vmv.x.s a0, v8
5349 %v = load <32 x i8>, ptr %x
5350 %red = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %v)
5354 declare i8 @llvm.vector.reduce.umax.v64i8(<64 x i8>)
5356 define i8 @vreduce_umax_v64i8(ptr %x) {
5357 ; CHECK-LABEL: vreduce_umax_v64i8:
5359 ; CHECK-NEXT: li a1, 64
5360 ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
5361 ; CHECK-NEXT: vle8.v v8, (a0)
5362 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5363 ; CHECK-NEXT: vmv.x.s a0, v8
5365 %v = load <64 x i8>, ptr %x
5366 %red = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> %v)
5370 declare i8 @llvm.vector.reduce.umax.v128i8(<128 x i8>)
5372 define i8 @vreduce_umax_v128i8(ptr %x) {
5373 ; CHECK-LABEL: vreduce_umax_v128i8:
5375 ; CHECK-NEXT: li a1, 128
5376 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
5377 ; CHECK-NEXT: vle8.v v8, (a0)
5378 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5379 ; CHECK-NEXT: vmv.x.s a0, v8
5381 %v = load <128 x i8>, ptr %x
5382 %red = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> %v)
5386 declare i8 @llvm.vector.reduce.umax.v256i8(<256 x i8>)
5388 define i8 @vreduce_umax_v256i8(ptr %x) {
5389 ; CHECK-LABEL: vreduce_umax_v256i8:
5391 ; CHECK-NEXT: li a1, 128
5392 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
5393 ; CHECK-NEXT: vle8.v v8, (a0)
5394 ; CHECK-NEXT: addi a0, a0, 128
5395 ; CHECK-NEXT: vle8.v v16, (a0)
5396 ; CHECK-NEXT: vmaxu.vv v8, v8, v16
5397 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5398 ; CHECK-NEXT: vmv.x.s a0, v8
5400 %v = load <256 x i8>, ptr %x
5401 %red = call i8 @llvm.vector.reduce.umax.v256i8(<256 x i8> %v)
5405 declare i16 @llvm.vector.reduce.umax.v1i16(<1 x i16>)
5407 define i16 @vreduce_umax_v1i16(ptr %x) {
5408 ; CHECK-LABEL: vreduce_umax_v1i16:
5410 ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
5411 ; CHECK-NEXT: vle16.v v8, (a0)
5412 ; CHECK-NEXT: vmv.x.s a0, v8
5414 %v = load <1 x i16>, ptr %x
5415 %red = call i16 @llvm.vector.reduce.umax.v1i16(<1 x i16> %v)
5419 declare i16 @llvm.vector.reduce.umax.v2i16(<2 x i16>)
5421 define i16 @vreduce_umax_v2i16(ptr %x) {
5422 ; CHECK-LABEL: vreduce_umax_v2i16:
5424 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
5425 ; CHECK-NEXT: vle16.v v8, (a0)
5426 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5427 ; CHECK-NEXT: vmv.x.s a0, v8
5429 %v = load <2 x i16>, ptr %x
5430 %red = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> %v)
5434 declare i16 @llvm.vector.reduce.umax.v4i16(<4 x i16>)
5436 define i16 @vreduce_umax_v4i16(ptr %x) {
5437 ; CHECK-LABEL: vreduce_umax_v4i16:
5439 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
5440 ; CHECK-NEXT: vle16.v v8, (a0)
5441 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5442 ; CHECK-NEXT: vmv.x.s a0, v8
5444 %v = load <4 x i16>, ptr %x
5445 %red = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %v)
5449 declare i16 @llvm.vector.reduce.umax.v8i16(<8 x i16>)
5451 define i16 @vreduce_umax_v8i16(ptr %x) {
5452 ; CHECK-LABEL: vreduce_umax_v8i16:
5454 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
5455 ; CHECK-NEXT: vle16.v v8, (a0)
5456 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5457 ; CHECK-NEXT: vmv.x.s a0, v8
5459 %v = load <8 x i16>, ptr %x
5460 %red = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %v)
5464 declare i16 @llvm.vector.reduce.umax.v16i16(<16 x i16>)
5466 define i16 @vreduce_umax_v16i16(ptr %x) {
5467 ; CHECK-LABEL: vreduce_umax_v16i16:
5469 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
5470 ; CHECK-NEXT: vle16.v v8, (a0)
5471 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5472 ; CHECK-NEXT: vmv.x.s a0, v8
5474 %v = load <16 x i16>, ptr %x
5475 %red = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %v)
5479 declare i16 @llvm.vector.reduce.umax.v32i16(<32 x i16>)
5481 define i16 @vreduce_umax_v32i16(ptr %x) {
5482 ; CHECK-LABEL: vreduce_umax_v32i16:
5484 ; CHECK-NEXT: li a1, 32
5485 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
5486 ; CHECK-NEXT: vle16.v v8, (a0)
5487 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5488 ; CHECK-NEXT: vmv.x.s a0, v8
5490 %v = load <32 x i16>, ptr %x
5491 %red = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> %v)
5495 declare i16 @llvm.vector.reduce.umax.v64i16(<64 x i16>)
5497 define i16 @vreduce_umax_v64i16(ptr %x) {
5498 ; CHECK-LABEL: vreduce_umax_v64i16:
5500 ; CHECK-NEXT: li a1, 64
5501 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
5502 ; CHECK-NEXT: vle16.v v8, (a0)
5503 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5504 ; CHECK-NEXT: vmv.x.s a0, v8
5506 %v = load <64 x i16>, ptr %x
5507 %red = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> %v)
5511 declare i16 @llvm.vector.reduce.umax.v128i16(<128 x i16>)
5513 define i16 @vreduce_umax_v128i16(ptr %x) {
5514 ; CHECK-LABEL: vreduce_umax_v128i16:
5516 ; CHECK-NEXT: li a1, 64
5517 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
5518 ; CHECK-NEXT: vle16.v v8, (a0)
5519 ; CHECK-NEXT: addi a0, a0, 128
5520 ; CHECK-NEXT: vle16.v v16, (a0)
5521 ; CHECK-NEXT: vmaxu.vv v8, v8, v16
5522 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5523 ; CHECK-NEXT: vmv.x.s a0, v8
5525 %v = load <128 x i16>, ptr %x
5526 %red = call i16 @llvm.vector.reduce.umax.v128i16(<128 x i16> %v)
5530 declare i32 @llvm.vector.reduce.umax.v1i32(<1 x i32>)
5532 define i32 @vreduce_umax_v1i32(ptr %x) {
5533 ; CHECK-LABEL: vreduce_umax_v1i32:
5535 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
5536 ; CHECK-NEXT: vle32.v v8, (a0)
5537 ; CHECK-NEXT: vmv.x.s a0, v8
5539 %v = load <1 x i32>, ptr %x
5540 %red = call i32 @llvm.vector.reduce.umax.v1i32(<1 x i32> %v)
5544 declare i32 @llvm.vector.reduce.umax.v2i32(<2 x i32>)
5546 define i32 @vreduce_umax_v2i32(ptr %x) {
5547 ; CHECK-LABEL: vreduce_umax_v2i32:
5549 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
5550 ; CHECK-NEXT: vle32.v v8, (a0)
5551 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5552 ; CHECK-NEXT: vmv.x.s a0, v8
5554 %v = load <2 x i32>, ptr %x
5555 %red = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %v)
5559 declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
5561 define i32 @vreduce_umax_v4i32(ptr %x) {
5562 ; CHECK-LABEL: vreduce_umax_v4i32:
5564 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
5565 ; CHECK-NEXT: vle32.v v8, (a0)
5566 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5567 ; CHECK-NEXT: vmv.x.s a0, v8
5569 %v = load <4 x i32>, ptr %x
5570 %red = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %v)
5574 declare i32 @llvm.vector.reduce.umax.v8i32(<8 x i32>)
5576 define i32 @vreduce_umax_v8i32(ptr %x) {
5577 ; CHECK-LABEL: vreduce_umax_v8i32:
5579 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
5580 ; CHECK-NEXT: vle32.v v8, (a0)
5581 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5582 ; CHECK-NEXT: vmv.x.s a0, v8
5584 %v = load <8 x i32>, ptr %x
5585 %red = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %v)
5589 declare i32 @llvm.vector.reduce.umax.v16i32(<16 x i32>)
5591 define i32 @vreduce_umax_v16i32(ptr %x) {
5592 ; CHECK-LABEL: vreduce_umax_v16i32:
5594 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
5595 ; CHECK-NEXT: vle32.v v8, (a0)
5596 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5597 ; CHECK-NEXT: vmv.x.s a0, v8
5599 %v = load <16 x i32>, ptr %x
5600 %red = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> %v)
5604 declare i32 @llvm.vector.reduce.umax.v32i32(<32 x i32>)
5606 define i32 @vreduce_umax_v32i32(ptr %x) {
5607 ; CHECK-LABEL: vreduce_umax_v32i32:
5609 ; CHECK-NEXT: li a1, 32
5610 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
5611 ; CHECK-NEXT: vle32.v v8, (a0)
5612 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5613 ; CHECK-NEXT: vmv.x.s a0, v8
5615 %v = load <32 x i32>, ptr %x
5616 %red = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> %v)
5620 declare i32 @llvm.vector.reduce.umax.v64i32(<64 x i32>)
5622 define i32 @vreduce_umax_v64i32(ptr %x) {
5623 ; CHECK-LABEL: vreduce_umax_v64i32:
5625 ; CHECK-NEXT: li a1, 32
5626 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
5627 ; CHECK-NEXT: vle32.v v8, (a0)
5628 ; CHECK-NEXT: addi a0, a0, 128
5629 ; CHECK-NEXT: vle32.v v16, (a0)
5630 ; CHECK-NEXT: vmaxu.vv v8, v8, v16
5631 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5632 ; CHECK-NEXT: vmv.x.s a0, v8
5634 %v = load <64 x i32>, ptr %x
5635 %red = call i32 @llvm.vector.reduce.umax.v64i32(<64 x i32> %v)
5639 declare i64 @llvm.vector.reduce.umax.v1i64(<1 x i64>)
5641 define i64 @vreduce_umax_v1i64(ptr %x) {
5642 ; RV32-LABEL: vreduce_umax_v1i64:
5644 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
5645 ; RV32-NEXT: vle64.v v8, (a0)
5646 ; RV32-NEXT: li a0, 32
5647 ; RV32-NEXT: vsrl.vx v9, v8, a0
5648 ; RV32-NEXT: vmv.x.s a1, v9
5649 ; RV32-NEXT: vmv.x.s a0, v8
5652 ; RV64-LABEL: vreduce_umax_v1i64:
5654 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
5655 ; RV64-NEXT: vle64.v v8, (a0)
5656 ; RV64-NEXT: vmv.x.s a0, v8
5658 %v = load <1 x i64>, ptr %x
5659 %red = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> %v)
5663 declare i64 @llvm.vector.reduce.umax.v2i64(<2 x i64>)
5665 define i64 @vreduce_umax_v2i64(ptr %x) {
5666 ; RV32-LABEL: vreduce_umax_v2i64:
5668 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
5669 ; RV32-NEXT: vle64.v v8, (a0)
5670 ; RV32-NEXT: vredmaxu.vs v8, v8, v8
5671 ; RV32-NEXT: li a0, 32
5672 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
5673 ; RV32-NEXT: vsrl.vx v9, v8, a0
5674 ; RV32-NEXT: vmv.x.s a1, v9
5675 ; RV32-NEXT: vmv.x.s a0, v8
5678 ; RV64-LABEL: vreduce_umax_v2i64:
5680 ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
5681 ; RV64-NEXT: vle64.v v8, (a0)
5682 ; RV64-NEXT: vredmaxu.vs v8, v8, v8
5683 ; RV64-NEXT: vmv.x.s a0, v8
5685 %v = load <2 x i64>, ptr %x
5686 %red = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %v)
5690 declare i64 @llvm.vector.reduce.umax.v4i64(<4 x i64>)
5692 define i64 @vreduce_umax_v4i64(ptr %x) {
5693 ; RV32-LABEL: vreduce_umax_v4i64:
5695 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
5696 ; RV32-NEXT: vle64.v v8, (a0)
5697 ; RV32-NEXT: vredmaxu.vs v8, v8, v8
5698 ; RV32-NEXT: vmv.x.s a0, v8
5699 ; RV32-NEXT: li a1, 32
5700 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
5701 ; RV32-NEXT: vsrl.vx v8, v8, a1
5702 ; RV32-NEXT: vmv.x.s a1, v8
5705 ; RV64-LABEL: vreduce_umax_v4i64:
5707 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
5708 ; RV64-NEXT: vle64.v v8, (a0)
5709 ; RV64-NEXT: vredmaxu.vs v8, v8, v8
5710 ; RV64-NEXT: vmv.x.s a0, v8
5712 %v = load <4 x i64>, ptr %x
5713 %red = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %v)
5717 declare i64 @llvm.vector.reduce.umax.v8i64(<8 x i64>)
5719 define i64 @vreduce_umax_v8i64(ptr %x) {
5720 ; RV32-LABEL: vreduce_umax_v8i64:
5722 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
5723 ; RV32-NEXT: vle64.v v8, (a0)
5724 ; RV32-NEXT: vredmaxu.vs v8, v8, v8
5725 ; RV32-NEXT: vmv.x.s a0, v8
5726 ; RV32-NEXT: li a1, 32
5727 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
5728 ; RV32-NEXT: vsrl.vx v8, v8, a1
5729 ; RV32-NEXT: vmv.x.s a1, v8
5732 ; RV64-LABEL: vreduce_umax_v8i64:
5734 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
5735 ; RV64-NEXT: vle64.v v8, (a0)
5736 ; RV64-NEXT: vredmaxu.vs v8, v8, v8
5737 ; RV64-NEXT: vmv.x.s a0, v8
5739 %v = load <8 x i64>, ptr %x
5740 %red = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> %v)
5744 declare i64 @llvm.vector.reduce.umax.v16i64(<16 x i64>)
5746 define i64 @vreduce_umax_v16i64(ptr %x) {
5747 ; RV32-LABEL: vreduce_umax_v16i64:
5749 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
5750 ; RV32-NEXT: vle64.v v8, (a0)
5751 ; RV32-NEXT: vredmaxu.vs v8, v8, v8
5752 ; RV32-NEXT: vmv.x.s a0, v8
5753 ; RV32-NEXT: li a1, 32
5754 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
5755 ; RV32-NEXT: vsrl.vx v8, v8, a1
5756 ; RV32-NEXT: vmv.x.s a1, v8
5759 ; RV64-LABEL: vreduce_umax_v16i64:
5761 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
5762 ; RV64-NEXT: vle64.v v8, (a0)
5763 ; RV64-NEXT: vredmaxu.vs v8, v8, v8
5764 ; RV64-NEXT: vmv.x.s a0, v8
5766 %v = load <16 x i64>, ptr %x
5767 %red = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> %v)
5771 declare i64 @llvm.vector.reduce.umax.v32i64(<32 x i64>)
5773 define i64 @vreduce_umax_v32i64(ptr %x) {
5774 ; RV32-LABEL: vreduce_umax_v32i64:
5776 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
5777 ; RV32-NEXT: vle64.v v8, (a0)
5778 ; RV32-NEXT: addi a0, a0, 128
5779 ; RV32-NEXT: vle64.v v16, (a0)
5780 ; RV32-NEXT: vmaxu.vv v8, v8, v16
5781 ; RV32-NEXT: vredmaxu.vs v8, v8, v8
5782 ; RV32-NEXT: vmv.x.s a0, v8
5783 ; RV32-NEXT: li a1, 32
5784 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
5785 ; RV32-NEXT: vsrl.vx v8, v8, a1
5786 ; RV32-NEXT: vmv.x.s a1, v8
5789 ; RV64-LABEL: vreduce_umax_v32i64:
5791 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
5792 ; RV64-NEXT: vle64.v v8, (a0)
5793 ; RV64-NEXT: addi a0, a0, 128
5794 ; RV64-NEXT: vle64.v v16, (a0)
5795 ; RV64-NEXT: vmaxu.vv v8, v8, v16
5796 ; RV64-NEXT: vredmaxu.vs v8, v8, v8
5797 ; RV64-NEXT: vmv.x.s a0, v8
5799 %v = load <32 x i64>, ptr %x
5800 %red = call i64 @llvm.vector.reduce.umax.v32i64(<32 x i64> %v)
5804 declare i64 @llvm.vector.reduce.umax.v64i64(<64 x i64>)
5806 define i64 @vreduce_umax_v64i64(ptr %x) nounwind {
5807 ; RV32-LABEL: vreduce_umax_v64i64:
5809 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
5810 ; RV32-NEXT: vle64.v v8, (a0)
5811 ; RV32-NEXT: addi a1, a0, 384
5812 ; RV32-NEXT: vle64.v v16, (a1)
5813 ; RV32-NEXT: addi a1, a0, 256
5814 ; RV32-NEXT: addi a0, a0, 128
5815 ; RV32-NEXT: vle64.v v24, (a0)
5816 ; RV32-NEXT: vle64.v v0, (a1)
5817 ; RV32-NEXT: vmaxu.vv v16, v24, v16
5818 ; RV32-NEXT: vmaxu.vv v8, v8, v0
5819 ; RV32-NEXT: vmaxu.vv v8, v8, v16
5820 ; RV32-NEXT: vredmaxu.vs v8, v8, v8
5821 ; RV32-NEXT: vmv.x.s a0, v8
5822 ; RV32-NEXT: li a1, 32
5823 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
5824 ; RV32-NEXT: vsrl.vx v8, v8, a1
5825 ; RV32-NEXT: vmv.x.s a1, v8
5828 ; RV64-LABEL: vreduce_umax_v64i64:
5830 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
5831 ; RV64-NEXT: vle64.v v8, (a0)
5832 ; RV64-NEXT: addi a1, a0, 384
5833 ; RV64-NEXT: vle64.v v16, (a1)
5834 ; RV64-NEXT: addi a1, a0, 256
5835 ; RV64-NEXT: addi a0, a0, 128
5836 ; RV64-NEXT: vle64.v v24, (a0)
5837 ; RV64-NEXT: vle64.v v0, (a1)
5838 ; RV64-NEXT: vmaxu.vv v16, v24, v16
5839 ; RV64-NEXT: vmaxu.vv v8, v8, v0
5840 ; RV64-NEXT: vmaxu.vv v8, v8, v16
5841 ; RV64-NEXT: vredmaxu.vs v8, v8, v8
5842 ; RV64-NEXT: vmv.x.s a0, v8
5844 %v = load <64 x i64>, ptr %x
5845 %red = call i64 @llvm.vector.reduce.umax.v64i64(<64 x i64> %v)
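; Multiply reductions. RVV has no multiply reduction instruction, so these
; lower to a log2(VL) ladder of vslidedown/vrgather and vmul steps (the
; 2-element case simply reloads the second element and uses vmul.vx), ending
; with a vmv.x.s extract.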
5849 declare i8 @llvm.vector.reduce.mul.v1i8(<1 x i8>)
5851 define i8 @vreduce_mul_v1i8(ptr %x) {
5852 ; CHECK-LABEL: vreduce_mul_v1i8:
5854 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
5855 ; CHECK-NEXT: vle8.v v8, (a0)
5856 ; CHECK-NEXT: vmv.x.s a0, v8
5858 %v = load <1 x i8>, ptr %x
5859 %red = call i8 @llvm.vector.reduce.mul.v1i8(<1 x i8> %v)
5863 declare i8 @llvm.vector.reduce.mul.v2i8(<2 x i8>)
5865 define i8 @vreduce_mul_v2i8(ptr %x) {
5866 ; CHECK-LABEL: vreduce_mul_v2i8:
5868 ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
5869 ; CHECK-NEXT: vle8.v v8, (a0)
5870 ; CHECK-NEXT: lb a0, 1(a0)
5871 ; CHECK-NEXT: vmul.vx v8, v8, a0
5872 ; CHECK-NEXT: vmv.x.s a0, v8
5874 %v = load <2 x i8>, ptr %x
5875 %red = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> %v)
5879 declare i8 @llvm.vector.reduce.mul.v4i8(<4 x i8>)
5881 define i8 @vreduce_mul_v4i8(ptr %x) {
5882 ; CHECK-LABEL: vreduce_mul_v4i8:
5884 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
5885 ; CHECK-NEXT: vle8.v v8, (a0)
5886 ; CHECK-NEXT: vslidedown.vi v9, v8, 2
5887 ; CHECK-NEXT: vmul.vv v8, v8, v9
5888 ; CHECK-NEXT: vrgather.vi v9, v8, 1
5889 ; CHECK-NEXT: vmul.vv v8, v8, v9
5890 ; CHECK-NEXT: vmv.x.s a0, v8
5892 %v = load <4 x i8>, ptr %x
5893 %red = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> %v)
5897 declare i8 @llvm.vector.reduce.mul.v8i8(<8 x i8>)
5899 define i8 @vreduce_mul_v8i8(ptr %x) {
5900 ; CHECK-LABEL: vreduce_mul_v8i8:
5902 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
5903 ; CHECK-NEXT: vle8.v v8, (a0)
5904 ; CHECK-NEXT: vslidedown.vi v9, v8, 4
5905 ; CHECK-NEXT: vmul.vv v8, v8, v9
5906 ; CHECK-NEXT: vslidedown.vi v9, v8, 2
5907 ; CHECK-NEXT: vmul.vv v8, v8, v9
5908 ; CHECK-NEXT: vrgather.vi v9, v8, 1
5909 ; CHECK-NEXT: vmul.vv v8, v8, v9
5910 ; CHECK-NEXT: vmv.x.s a0, v8
5912 %v = load <8 x i8>, ptr %x
5913 %red = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> %v)
5917 declare i8 @llvm.vector.reduce.mul.v16i8(<16 x i8>)
5919 define i8 @vreduce_mul_v16i8(ptr %x) {
5920 ; CHECK-LABEL: vreduce_mul_v16i8:
5922 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
5923 ; CHECK-NEXT: vle8.v v8, (a0)
5924 ; CHECK-NEXT: vslidedown.vi v9, v8, 8
5925 ; CHECK-NEXT: vmul.vv v8, v8, v9
5926 ; CHECK-NEXT: vslidedown.vi v9, v8, 4
5927 ; CHECK-NEXT: vmul.vv v8, v8, v9
5928 ; CHECK-NEXT: vslidedown.vi v9, v8, 2
5929 ; CHECK-NEXT: vmul.vv v8, v8, v9
5930 ; CHECK-NEXT: vrgather.vi v9, v8, 1
5931 ; CHECK-NEXT: vmul.vv v8, v8, v9
5932 ; CHECK-NEXT: vmv.x.s a0, v8
5934 %v = load <16 x i8>, ptr %x
5935 %red = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> %v)
declare i8 @llvm.vector.reduce.mul.v32i8(<32 x i8>)

define i8 @vreduce_mul_v32i8(ptr %x) {
; CHECK-LABEL: vreduce_mul_v32i8:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 32
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: vslidedown.vi v10, v8, 16
; CHECK-NEXT: vmul.vv v8, v8, v10
; CHECK-NEXT: vslidedown.vi v10, v8, 8
; CHECK-NEXT: vmul.vv v8, v8, v10
; CHECK-NEXT: vslidedown.vi v10, v8, 4
; CHECK-NEXT: vmul.vv v8, v8, v10
; CHECK-NEXT: vslidedown.vi v10, v8, 2
; CHECK-NEXT: vmul.vv v8, v8, v10
; CHECK-NEXT: vrgather.vi v10, v8, 1
; CHECK-NEXT: vmul.vv v8, v8, v10
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <32 x i8>, ptr %x
%red = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> %v)
ret i8 %red
}

declare i8 @llvm.vector.reduce.mul.v64i8(<64 x i8>)

define i8 @vreduce_mul_v64i8(ptr %x) {
; CHECK-LABEL: vreduce_mul_v64i8:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 64
; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: li a0, 32
; CHECK-NEXT: vslidedown.vx v12, v8, a0
; CHECK-NEXT: vmul.vv v8, v8, v12
; CHECK-NEXT: vslidedown.vi v12, v8, 16
; CHECK-NEXT: vmul.vv v8, v8, v12
; CHECK-NEXT: vslidedown.vi v12, v8, 8
; CHECK-NEXT: vmul.vv v8, v8, v12
; CHECK-NEXT: vslidedown.vi v12, v8, 4
; CHECK-NEXT: vmul.vv v8, v8, v12
; CHECK-NEXT: vslidedown.vi v12, v8, 2
; CHECK-NEXT: vmul.vv v8, v8, v12
; CHECK-NEXT: vrgather.vi v12, v8, 1
; CHECK-NEXT: vmul.vv v8, v8, v12
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <64 x i8>, ptr %x
%red = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> %v)
ret i8 %red
}

declare i8 @llvm.vector.reduce.mul.v128i8(<128 x i8>)

define i8 @vreduce_mul_v128i8(ptr %x) {
; CHECK-LABEL: vreduce_mul_v128i8:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 128
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: li a0, 64
; CHECK-NEXT: vslidedown.vx v16, v8, a0
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: li a0, 32
; CHECK-NEXT: vslidedown.vx v16, v8, a0
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 16
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 8
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 4
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 2
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vrgather.vi v16, v8, 1
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <128 x i8>, ptr %x
%red = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> %v)
ret i8 %red
}

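; Vectors that no longer fit in a single LMUL=8 register group (<256 x i8>
; and the wider element-type equivalents below) are first split into separate
; m8 loads combined with vmul.vv before the usual vslidedown/vmul tree runs.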
declare i8 @llvm.vector.reduce.mul.v256i8(<256 x i8>)

define i8 @vreduce_mul_v256i8(ptr %x) {
; CHECK-LABEL: vreduce_mul_v256i8:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 128
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: addi a0, a0, 128
; CHECK-NEXT: vle8.v v16, (a0)
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: li a0, 64
; CHECK-NEXT: vslidedown.vx v16, v8, a0
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: li a0, 32
; CHECK-NEXT: vslidedown.vx v16, v8, a0
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 16
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 8
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 4
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 2
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vrgather.vi v16, v8, 1
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <256 x i8>, ptr %x
%red = call i8 @llvm.vector.reduce.mul.v256i8(<256 x i8> %v)
ret i8 %red
}

declare i16 @llvm.vector.reduce.mul.v1i16(<1 x i16>)

define i16 @vreduce_mul_v1i16(ptr %x) {
; CHECK-LABEL: vreduce_mul_v1i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <1 x i16>, ptr %x
%red = call i16 @llvm.vector.reduce.mul.v1i16(<1 x i16> %v)
ret i16 %red
}

declare i16 @llvm.vector.reduce.mul.v2i16(<2 x i16>)

define i16 @vreduce_mul_v2i16(ptr %x) {
; CHECK-LABEL: vreduce_mul_v2i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: lh a0, 2(a0)
; CHECK-NEXT: vmul.vx v8, v8, a0
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <2 x i16>, ptr %x
%red = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> %v)
ret i16 %red
}

declare i16 @llvm.vector.reduce.mul.v4i16(<4 x i16>)

define i16 @vreduce_mul_v4i16(ptr %x) {
; CHECK-LABEL: vreduce_mul_v4i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: vslidedown.vi v9, v8, 2
; CHECK-NEXT: vmul.vv v8, v8, v9
; CHECK-NEXT: vrgather.vi v9, v8, 1
; CHECK-NEXT: vmul.vv v8, v8, v9
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <4 x i16>, ptr %x
%red = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> %v)
ret i16 %red
}

declare i16 @llvm.vector.reduce.mul.v8i16(<8 x i16>)

define i16 @vreduce_mul_v8i16(ptr %x) {
; CHECK-LABEL: vreduce_mul_v8i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: vslidedown.vi v9, v8, 4
; CHECK-NEXT: vmul.vv v8, v8, v9
; CHECK-NEXT: vslidedown.vi v9, v8, 2
; CHECK-NEXT: vmul.vv v8, v8, v9
; CHECK-NEXT: vrgather.vi v9, v8, 1
; CHECK-NEXT: vmul.vv v8, v8, v9
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <8 x i16>, ptr %x
%red = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> %v)
ret i16 %red
}

declare i16 @llvm.vector.reduce.mul.v16i16(<16 x i16>)

define i16 @vreduce_mul_v16i16(ptr %x) {
; CHECK-LABEL: vreduce_mul_v16i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: vslidedown.vi v10, v8, 8
; CHECK-NEXT: vmul.vv v8, v8, v10
; CHECK-NEXT: vslidedown.vi v10, v8, 4
; CHECK-NEXT: vmul.vv v8, v8, v10
; CHECK-NEXT: vslidedown.vi v10, v8, 2
; CHECK-NEXT: vmul.vv v8, v8, v10
; CHECK-NEXT: vrgather.vi v10, v8, 1
; CHECK-NEXT: vmul.vv v8, v8, v10
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i16>, ptr %x
%red = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> %v)
ret i16 %red
}

declare i16 @llvm.vector.reduce.mul.v32i16(<32 x i16>)

define i16 @vreduce_mul_v32i16(ptr %x) {
; CHECK-LABEL: vreduce_mul_v32i16:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 32
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: vslidedown.vi v12, v8, 16
; CHECK-NEXT: vmul.vv v8, v8, v12
; CHECK-NEXT: vslidedown.vi v12, v8, 8
; CHECK-NEXT: vmul.vv v8, v8, v12
; CHECK-NEXT: vslidedown.vi v12, v8, 4
; CHECK-NEXT: vmul.vv v8, v8, v12
; CHECK-NEXT: vslidedown.vi v12, v8, 2
; CHECK-NEXT: vmul.vv v8, v8, v12
; CHECK-NEXT: vrgather.vi v12, v8, 1
; CHECK-NEXT: vmul.vv v8, v8, v12
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <32 x i16>, ptr %x
%red = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> %v)
ret i16 %red
}

declare i16 @llvm.vector.reduce.mul.v64i16(<64 x i16>)

define i16 @vreduce_mul_v64i16(ptr %x) {
; CHECK-LABEL: vreduce_mul_v64i16:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 64
; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: li a0, 32
; CHECK-NEXT: vslidedown.vx v16, v8, a0
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 16
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 8
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 4
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 2
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vrgather.vi v16, v8, 1
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <64 x i16>, ptr %x
%red = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> %v)
ret i16 %red
}

declare i16 @llvm.vector.reduce.mul.v128i16(<128 x i16>)

define i16 @vreduce_mul_v128i16(ptr %x) {
; CHECK-LABEL: vreduce_mul_v128i16:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 64
; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: addi a0, a0, 128
; CHECK-NEXT: vle16.v v16, (a0)
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: li a0, 32
; CHECK-NEXT: vslidedown.vx v16, v8, a0
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 16
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 8
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 4
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 2
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vrgather.vi v16, v8, 1
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <128 x i16>, ptr %x
%red = call i16 @llvm.vector.reduce.mul.v128i16(<128 x i16> %v)
ret i16 %red
}

declare i32 @llvm.vector.reduce.mul.v1i32(<1 x i32>)

define i32 @vreduce_mul_v1i32(ptr %x) {
; CHECK-LABEL: vreduce_mul_v1i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <1 x i32>, ptr %x
%red = call i32 @llvm.vector.reduce.mul.v1i32(<1 x i32> %v)
ret i32 %red
}

declare i32 @llvm.vector.reduce.mul.v2i32(<2 x i32>)

define i32 @vreduce_mul_v2i32(ptr %x) {
; CHECK-LABEL: vreduce_mul_v2i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: lw a0, 4(a0)
; CHECK-NEXT: vmul.vx v8, v8, a0
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <2 x i32>, ptr %x
%red = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> %v)
ret i32 %red
}

declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>)

define i32 @vreduce_mul_v4i32(ptr %x) {
; CHECK-LABEL: vreduce_mul_v4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vslidedown.vi v9, v8, 2
; CHECK-NEXT: vmul.vv v8, v8, v9
; CHECK-NEXT: vrgather.vi v9, v8, 1
; CHECK-NEXT: vmul.vv v8, v8, v9
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <4 x i32>, ptr %x
%red = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %v)
ret i32 %red
}

declare i32 @llvm.vector.reduce.mul.v8i32(<8 x i32>)

define i32 @vreduce_mul_v8i32(ptr %x) {
; CHECK-LABEL: vreduce_mul_v8i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vslidedown.vi v10, v8, 4
; CHECK-NEXT: vmul.vv v8, v8, v10
; CHECK-NEXT: vslidedown.vi v10, v8, 2
; CHECK-NEXT: vmul.vv v8, v8, v10
; CHECK-NEXT: vrgather.vi v10, v8, 1
; CHECK-NEXT: vmul.vv v8, v8, v10
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <8 x i32>, ptr %x
%red = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %v)
ret i32 %red
}

declare i32 @llvm.vector.reduce.mul.v16i32(<16 x i32>)

define i32 @vreduce_mul_v16i32(ptr %x) {
; CHECK-LABEL: vreduce_mul_v16i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vslidedown.vi v12, v8, 8
; CHECK-NEXT: vmul.vv v8, v8, v12
; CHECK-NEXT: vslidedown.vi v12, v8, 4
; CHECK-NEXT: vmul.vv v8, v8, v12
; CHECK-NEXT: vslidedown.vi v12, v8, 2
; CHECK-NEXT: vmul.vv v8, v8, v12
; CHECK-NEXT: vrgather.vi v12, v8, 1
; CHECK-NEXT: vmul.vv v8, v8, v12
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %x
%red = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> %v)
ret i32 %red
}

declare i32 @llvm.vector.reduce.mul.v32i32(<32 x i32>)

define i32 @vreduce_mul_v32i32(ptr %x) {
; CHECK-LABEL: vreduce_mul_v32i32:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 32
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vslidedown.vi v16, v8, 16
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 8
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 4
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 2
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vrgather.vi v16, v8, 1
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <32 x i32>, ptr %x
%red = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> %v)
ret i32 %red
}

declare i32 @llvm.vector.reduce.mul.v64i32(<64 x i32>)

define i32 @vreduce_mul_v64i32(ptr %x) {
; CHECK-LABEL: vreduce_mul_v64i32:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 32
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: addi a0, a0, 128
; CHECK-NEXT: vle32.v v16, (a0)
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 16
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 8
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 4
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 2
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vrgather.vi v16, v8, 1
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <64 x i32>, ptr %x
%red = call i32 @llvm.vector.reduce.mul.v64i32(<64 x i32> %v)
ret i32 %red
}

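; For the i64 multiply reductions on RV32 the scalar result is returned in
; a0/a1; the checks extract the high half either with a 32-bit vsrl.vx on the
; reduced element or, in the larger cases, with an e32 vslidedown to lane 1.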
declare i64 @llvm.vector.reduce.mul.v1i64(<1 x i64>)

define i64 @vreduce_mul_v1i64(ptr %x) {
; RV32-LABEL: vreduce_mul_v1i64:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT: vle64.v v8, (a0)
; RV32-NEXT: li a0, 32
; RV32-NEXT: vsrl.vx v9, v8, a0
; RV32-NEXT: vmv.x.s a1, v9
; RV32-NEXT: vmv.x.s a0, v8
; RV32-NEXT: ret
;
; RV64-LABEL: vreduce_mul_v1i64:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; RV64-NEXT: vle64.v v8, (a0)
; RV64-NEXT: vmv.x.s a0, v8
; RV64-NEXT: ret
%v = load <1 x i64>, ptr %x
%red = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> %v)
ret i64 %red
}

declare i64 @llvm.vector.reduce.mul.v2i64(<2 x i64>)

define i64 @vreduce_mul_v2i64(ptr %x) {
; RV32-LABEL: vreduce_mul_v2i64:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT: vle64.v v8, (a0)
; RV32-NEXT: addi a0, a0, 8
; RV32-NEXT: vlse64.v v9, (a0), zero
; RV32-NEXT: vmul.vv v8, v8, v9
; RV32-NEXT: vmv.x.s a0, v8
; RV32-NEXT: li a1, 32
; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT: vsrl.vx v8, v8, a1
; RV32-NEXT: vmv.x.s a1, v8
; RV32-NEXT: ret
;
; RV64-LABEL: vreduce_mul_v2i64:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV64-NEXT: vle64.v v8, (a0)
; RV64-NEXT: ld a0, 8(a0)
; RV64-NEXT: vmul.vx v8, v8, a0
; RV64-NEXT: vmv.x.s a0, v8
; RV64-NEXT: ret
%v = load <2 x i64>, ptr %x
%red = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> %v)
ret i64 %red
}

declare i64 @llvm.vector.reduce.mul.v4i64(<4 x i64>)

define i64 @vreduce_mul_v4i64(ptr %x) {
; RV32-LABEL: vreduce_mul_v4i64:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; RV32-NEXT: vle64.v v8, (a0)
; RV32-NEXT: vslidedown.vi v10, v8, 2
; RV32-NEXT: vmul.vv v8, v8, v10
; RV32-NEXT: vrgather.vi v10, v8, 1
; RV32-NEXT: vmul.vv v8, v8, v10
; RV32-NEXT: vmv.x.s a0, v8
; RV32-NEXT: li a1, 32
; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
; RV32-NEXT: vsrl.vx v8, v8, a1
; RV32-NEXT: vmv.x.s a1, v8
; RV32-NEXT: ret
;
; RV64-LABEL: vreduce_mul_v4i64:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; RV64-NEXT: vle64.v v8, (a0)
; RV64-NEXT: vslidedown.vi v10, v8, 2
; RV64-NEXT: vmul.vv v8, v8, v10
; RV64-NEXT: vrgather.vi v10, v8, 1
; RV64-NEXT: vmul.vv v8, v8, v10
; RV64-NEXT: vmv.x.s a0, v8
; RV64-NEXT: ret
%v = load <4 x i64>, ptr %x
%red = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> %v)
ret i64 %red
}

declare i64 @llvm.vector.reduce.mul.v8i64(<8 x i64>)

define i64 @vreduce_mul_v8i64(ptr %x) {
; RV32-LABEL: vreduce_mul_v8i64:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
; RV32-NEXT: vle64.v v8, (a0)
; RV32-NEXT: vslidedown.vi v12, v8, 4
; RV32-NEXT: vmul.vv v8, v8, v12
; RV32-NEXT: vslidedown.vi v12, v8, 2
; RV32-NEXT: vmul.vv v8, v8, v12
; RV32-NEXT: vrgather.vi v12, v8, 1
; RV32-NEXT: vmul.vv v8, v8, v12
; RV32-NEXT: vmv.x.s a0, v8
; RV32-NEXT: li a1, 32
; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma
; RV32-NEXT: vsrl.vx v8, v8, a1
; RV32-NEXT: vmv.x.s a1, v8
; RV32-NEXT: ret
;
; RV64-LABEL: vreduce_mul_v8i64:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
; RV64-NEXT: vle64.v v8, (a0)
; RV64-NEXT: vslidedown.vi v12, v8, 4
; RV64-NEXT: vmul.vv v8, v8, v12
; RV64-NEXT: vslidedown.vi v12, v8, 2
; RV64-NEXT: vmul.vv v8, v8, v12
; RV64-NEXT: vrgather.vi v12, v8, 1
; RV64-NEXT: vmul.vv v8, v8, v12
; RV64-NEXT: vmv.x.s a0, v8
; RV64-NEXT: ret
%v = load <8 x i64>, ptr %x
%red = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> %v)
ret i64 %red
}

declare i64 @llvm.vector.reduce.mul.v16i64(<16 x i64>)

define i64 @vreduce_mul_v16i64(ptr %x) {
; RV32-LABEL: vreduce_mul_v16i64:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vle64.v v8, (a0)
; RV32-NEXT: vslidedown.vi v16, v8, 8
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vslidedown.vi v16, v8, 4
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vslidedown.vi v16, v8, 2
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vrgather.vi v16, v8, 1
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vmv.x.s a0, v8
; RV32-NEXT: li a1, 32
; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma
; RV32-NEXT: vsrl.vx v8, v8, a1
; RV32-NEXT: vmv.x.s a1, v8
; RV32-NEXT: ret
;
; RV64-LABEL: vreduce_mul_v16i64:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vle64.v v8, (a0)
; RV64-NEXT: vslidedown.vi v16, v8, 8
; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: vslidedown.vi v16, v8, 4
; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: vslidedown.vi v16, v8, 2
; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: vrgather.vi v16, v8, 1
; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: vmv.x.s a0, v8
; RV64-NEXT: ret
%v = load <16 x i64>, ptr %x
%red = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> %v)
ret i64 %red
}

declare i64 @llvm.vector.reduce.mul.v32i64(<32 x i64>)

define i64 @vreduce_mul_v32i64(ptr %x) {
; RV32-LABEL: vreduce_mul_v32i64:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vle64.v v8, (a0)
; RV32-NEXT: addi a0, a0, 128
; RV32-NEXT: vle64.v v16, (a0)
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vslidedown.vi v16, v8, 8
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vslidedown.vi v16, v8, 4
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vslidedown.vi v16, v8, 2
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vrgather.vi v16, v8, 1
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; RV32-NEXT: vmv.x.s a0, v8
; RV32-NEXT: vslidedown.vi v8, v8, 1
; RV32-NEXT: vmv.x.s a1, v8
; RV32-NEXT: ret
;
; RV64-LABEL: vreduce_mul_v32i64:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vle64.v v8, (a0)
; RV64-NEXT: addi a0, a0, 128
; RV64-NEXT: vle64.v v16, (a0)
; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: vslidedown.vi v16, v8, 8
; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: vslidedown.vi v16, v8, 4
; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: vslidedown.vi v16, v8, 2
; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: vrgather.vi v16, v8, 1
; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: vmv.x.s a0, v8
; RV64-NEXT: ret
%v = load <32 x i64>, ptr %x
%red = call i64 @llvm.vector.reduce.mul.v32i64(<32 x i64> %v)
ret i64 %red
}

declare i64 @llvm.vector.reduce.mul.v64i64(<64 x i64>)

define i64 @vreduce_mul_v64i64(ptr %x) nounwind {
; RV32-LABEL: vreduce_mul_v64i64:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vle64.v v8, (a0)
; RV32-NEXT: addi a1, a0, 384
; RV32-NEXT: vle64.v v16, (a1)
; RV32-NEXT: addi a1, a0, 256
; RV32-NEXT: addi a0, a0, 128
; RV32-NEXT: vle64.v v24, (a0)
; RV32-NEXT: vle64.v v0, (a1)
; RV32-NEXT: vmul.vv v16, v24, v16
; RV32-NEXT: vmul.vv v8, v8, v0
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vslidedown.vi v16, v8, 8
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vslidedown.vi v16, v8, 4
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vslidedown.vi v16, v8, 2
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vrgather.vi v16, v8, 1
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; RV32-NEXT: vmv.x.s a0, v8
; RV32-NEXT: vslidedown.vi v8, v8, 1
; RV32-NEXT: vmv.x.s a1, v8
; RV32-NEXT: ret
;
; RV64-LABEL: vreduce_mul_v64i64:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vle64.v v8, (a0)
; RV64-NEXT: addi a1, a0, 384
; RV64-NEXT: vle64.v v16, (a1)
; RV64-NEXT: addi a1, a0, 256
; RV64-NEXT: addi a0, a0, 128
; RV64-NEXT: vle64.v v24, (a0)
; RV64-NEXT: vle64.v v0, (a1)
; RV64-NEXT: vmul.vv v16, v24, v16
; RV64-NEXT: vmul.vv v8, v8, v0
; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: vslidedown.vi v16, v8, 8
; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: vslidedown.vi v16, v8, 4
; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: vslidedown.vi v16, v8, 2
; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: vrgather.vi v16, v8, 1
; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: vmv.x.s a0, v8
; RV64-NEXT: ret
%v = load <64 x i64>, ptr %x
%red = call i64 @llvm.vector.reduce.mul.v64i64(<64 x i64> %v)