1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=riscv32 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
3 ; RUN: llc -mtriple=riscv64 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
5 declare i8 @llvm.vector.reduce.add.v1i8(<1 x i8>)
7 define i8 @vreduce_add_v1i8(ptr %x) {
8 ; CHECK-LABEL: vreduce_add_v1i8:
10 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
11 ; CHECK-NEXT: vle8.v v8, (a0)
12 ; CHECK-NEXT: vmv.x.s a0, v8
14 %v = load <1 x i8>, ptr %x
15 %red = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> %v)
19 declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>)
21 define i8 @vreduce_add_v2i8(ptr %x) {
22 ; CHECK-LABEL: vreduce_add_v2i8:
24 ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
25 ; CHECK-NEXT: vle8.v v8, (a0)
26 ; CHECK-NEXT: vmv.s.x v9, zero
27 ; CHECK-NEXT: vredsum.vs v8, v8, v9
28 ; CHECK-NEXT: vmv.x.s a0, v8
30 %v = load <2 x i8>, ptr %x
31 %red = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %v)
35 declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>)
37 define i8 @vreduce_add_v4i8(ptr %x) {
38 ; CHECK-LABEL: vreduce_add_v4i8:
40 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
41 ; CHECK-NEXT: vle8.v v8, (a0)
42 ; CHECK-NEXT: vmv.s.x v9, zero
43 ; CHECK-NEXT: vredsum.vs v8, v8, v9
44 ; CHECK-NEXT: vmv.x.s a0, v8
46 %v = load <4 x i8>, ptr %x
47 %red = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %v)
51 declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
53 define i8 @vreduce_add_v8i8(ptr %x) {
54 ; CHECK-LABEL: vreduce_add_v8i8:
56 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
57 ; CHECK-NEXT: vle8.v v8, (a0)
58 ; CHECK-NEXT: vmv.s.x v9, zero
59 ; CHECK-NEXT: vredsum.vs v8, v8, v9
60 ; CHECK-NEXT: vmv.x.s a0, v8
62 %v = load <8 x i8>, ptr %x
63 %red = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %v)
67 declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
69 define i8 @vreduce_add_v16i8(ptr %x) {
70 ; CHECK-LABEL: vreduce_add_v16i8:
72 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
73 ; CHECK-NEXT: vle8.v v8, (a0)
74 ; CHECK-NEXT: vmv.s.x v9, zero
75 ; CHECK-NEXT: vredsum.vs v8, v8, v9
76 ; CHECK-NEXT: vmv.x.s a0, v8
78 %v = load <16 x i8>, ptr %x
79 %red = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %v)
83 declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>)
85 define i8 @vreduce_add_v32i8(ptr %x) {
86 ; CHECK-LABEL: vreduce_add_v32i8:
88 ; CHECK-NEXT: li a1, 32
89 ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
90 ; CHECK-NEXT: vle8.v v8, (a0)
91 ; CHECK-NEXT: vmv.s.x v10, zero
92 ; CHECK-NEXT: vredsum.vs v8, v8, v10
93 ; CHECK-NEXT: vmv.x.s a0, v8
95 %v = load <32 x i8>, ptr %x
96 %red = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %v)
100 declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>)
102 define i8 @vreduce_add_v64i8(ptr %x) {
103 ; CHECK-LABEL: vreduce_add_v64i8:
105 ; CHECK-NEXT: li a1, 64
106 ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
107 ; CHECK-NEXT: vle8.v v8, (a0)
108 ; CHECK-NEXT: vmv.s.x v12, zero
109 ; CHECK-NEXT: vredsum.vs v8, v8, v12
110 ; CHECK-NEXT: vmv.x.s a0, v8
112 %v = load <64 x i8>, ptr %x
113 %red = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %v)
117 declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>)
119 define i8 @vreduce_add_v128i8(ptr %x) {
120 ; CHECK-LABEL: vreduce_add_v128i8:
122 ; CHECK-NEXT: li a1, 128
123 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
124 ; CHECK-NEXT: vle8.v v8, (a0)
125 ; CHECK-NEXT: vmv.s.x v16, zero
126 ; CHECK-NEXT: vredsum.vs v8, v8, v16
127 ; CHECK-NEXT: vmv.x.s a0, v8
129 %v = load <128 x i8>, ptr %x
130 %red = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %v)
134 declare i8 @llvm.vector.reduce.add.v256i8(<256 x i8>)
136 define i8 @vreduce_add_v256i8(ptr %x) {
137 ; CHECK-LABEL: vreduce_add_v256i8:
139 ; CHECK-NEXT: li a1, 128
140 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
141 ; CHECK-NEXT: vle8.v v8, (a0)
142 ; CHECK-NEXT: addi a0, a0, 128
143 ; CHECK-NEXT: vle8.v v16, (a0)
144 ; CHECK-NEXT: vadd.vv v8, v8, v16
145 ; CHECK-NEXT: vmv.s.x v16, zero
146 ; CHECK-NEXT: vredsum.vs v8, v8, v16
147 ; CHECK-NEXT: vmv.x.s a0, v8
149 %v = load <256 x i8>, ptr %x
150 %red = call i8 @llvm.vector.reduce.add.v256i8(<256 x i8> %v)
154 declare i16 @llvm.vector.reduce.add.v1i16(<1 x i16>)
156 define i16 @vreduce_add_v1i16(ptr %x) {
157 ; CHECK-LABEL: vreduce_add_v1i16:
159 ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
160 ; CHECK-NEXT: vle16.v v8, (a0)
161 ; CHECK-NEXT: vmv.x.s a0, v8
163 %v = load <1 x i16>, ptr %x
164 %red = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %v)
168 define i16 @vwreduce_add_v1i16(ptr %x) {
169 ; CHECK-LABEL: vwreduce_add_v1i16:
171 ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
172 ; CHECK-NEXT: vle8.v v8, (a0)
173 ; CHECK-NEXT: vsext.vf2 v9, v8
174 ; CHECK-NEXT: vmv.x.s a0, v9
176 %v = load <1 x i8>, ptr %x
177 %e = sext <1 x i8> %v to <1 x i16>
178 %red = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %e)
182 define i16 @vwreduce_uadd_v1i16(ptr %x) {
183 ; CHECK-LABEL: vwreduce_uadd_v1i16:
185 ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
186 ; CHECK-NEXT: vle8.v v8, (a0)
187 ; CHECK-NEXT: vzext.vf2 v9, v8
188 ; CHECK-NEXT: vmv.x.s a0, v9
190 %v = load <1 x i8>, ptr %x
191 %e = zext <1 x i8> %v to <1 x i16>
192 %red = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %e)
196 declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>)
198 define i16 @vreduce_add_v2i16(ptr %x) {
199 ; CHECK-LABEL: vreduce_add_v2i16:
201 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
202 ; CHECK-NEXT: vle16.v v8, (a0)
203 ; CHECK-NEXT: vmv.s.x v9, zero
204 ; CHECK-NEXT: vredsum.vs v8, v8, v9
205 ; CHECK-NEXT: vmv.x.s a0, v8
207 %v = load <2 x i16>, ptr %x
208 %red = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %v)
212 define i16 @vwreduce_add_v2i16(ptr %x) {
213 ; CHECK-LABEL: vwreduce_add_v2i16:
215 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
216 ; CHECK-NEXT: vle8.v v8, (a0)
217 ; CHECK-NEXT: vmv.s.x v9, zero
218 ; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
219 ; CHECK-NEXT: vwredsum.vs v8, v8, v9
220 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
221 ; CHECK-NEXT: vmv.x.s a0, v8
223 %v = load <2 x i8>, ptr %x
224 %e = sext <2 x i8> %v to <2 x i16>
225 %red = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %e)
229 define i16 @vwreduce_uadd_v2i16(ptr %x) {
230 ; CHECK-LABEL: vwreduce_uadd_v2i16:
232 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
233 ; CHECK-NEXT: vle8.v v8, (a0)
234 ; CHECK-NEXT: vmv.s.x v9, zero
235 ; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
236 ; CHECK-NEXT: vwredsumu.vs v8, v8, v9
237 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
238 ; CHECK-NEXT: vmv.x.s a0, v8
240 %v = load <2 x i8>, ptr %x
241 %e = zext <2 x i8> %v to <2 x i16>
242 %red = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %e)
246 declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
248 define i16 @vreduce_add_v4i16(ptr %x) {
249 ; CHECK-LABEL: vreduce_add_v4i16:
251 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
252 ; CHECK-NEXT: vle16.v v8, (a0)
253 ; CHECK-NEXT: vmv.s.x v9, zero
254 ; CHECK-NEXT: vredsum.vs v8, v8, v9
255 ; CHECK-NEXT: vmv.x.s a0, v8
257 %v = load <4 x i16>, ptr %x
258 %red = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %v)
262 define i16 @vwreduce_add_v4i16(ptr %x) {
263 ; CHECK-LABEL: vwreduce_add_v4i16:
265 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
266 ; CHECK-NEXT: vle8.v v8, (a0)
267 ; CHECK-NEXT: vmv.s.x v9, zero
268 ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
269 ; CHECK-NEXT: vwredsum.vs v8, v8, v9
270 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
271 ; CHECK-NEXT: vmv.x.s a0, v8
273 %v = load <4 x i8>, ptr %x
274 %e = sext <4 x i8> %v to <4 x i16>
275 %red = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %e)
279 define i16 @vwreduce_uadd_v4i16(ptr %x) {
280 ; CHECK-LABEL: vwreduce_uadd_v4i16:
282 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
283 ; CHECK-NEXT: vle8.v v8, (a0)
284 ; CHECK-NEXT: vmv.s.x v9, zero
285 ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
286 ; CHECK-NEXT: vwredsumu.vs v8, v8, v9
287 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
288 ; CHECK-NEXT: vmv.x.s a0, v8
290 %v = load <4 x i8>, ptr %x
291 %e = zext <4 x i8> %v to <4 x i16>
292 %red = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %e)
296 declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
298 define i16 @vreduce_add_v8i16(ptr %x) {
299 ; CHECK-LABEL: vreduce_add_v8i16:
301 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
302 ; CHECK-NEXT: vle16.v v8, (a0)
303 ; CHECK-NEXT: vmv.s.x v9, zero
304 ; CHECK-NEXT: vredsum.vs v8, v8, v9
305 ; CHECK-NEXT: vmv.x.s a0, v8
307 %v = load <8 x i16>, ptr %x
308 %red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v)
312 define i16 @vwreduce_add_v8i16(ptr %x) {
313 ; CHECK-LABEL: vwreduce_add_v8i16:
315 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
316 ; CHECK-NEXT: vle8.v v8, (a0)
317 ; CHECK-NEXT: vmv.s.x v9, zero
318 ; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
319 ; CHECK-NEXT: vwredsum.vs v8, v8, v9
320 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
321 ; CHECK-NEXT: vmv.x.s a0, v8
323 %v = load <8 x i8>, ptr %x
324 %e = sext <8 x i8> %v to <8 x i16>
325 %red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %e)
329 define i16 @vwreduce_uadd_v8i16(ptr %x) {
330 ; CHECK-LABEL: vwreduce_uadd_v8i16:
332 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
333 ; CHECK-NEXT: vle8.v v8, (a0)
334 ; CHECK-NEXT: vmv.s.x v9, zero
335 ; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
336 ; CHECK-NEXT: vwredsumu.vs v8, v8, v9
337 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
338 ; CHECK-NEXT: vmv.x.s a0, v8
340 %v = load <8 x i8>, ptr %x
341 %e = zext <8 x i8> %v to <8 x i16>
342 %red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %e)
346 declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
348 define i16 @vreduce_add_v16i16(ptr %x) {
349 ; CHECK-LABEL: vreduce_add_v16i16:
351 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
352 ; CHECK-NEXT: vle16.v v8, (a0)
353 ; CHECK-NEXT: vmv.s.x v10, zero
354 ; CHECK-NEXT: vredsum.vs v8, v8, v10
355 ; CHECK-NEXT: vmv.x.s a0, v8
357 %v = load <16 x i16>, ptr %x
358 %red = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %v)
362 define i16 @vwreduce_add_v16i16(ptr %x) {
363 ; CHECK-LABEL: vwreduce_add_v16i16:
365 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
366 ; CHECK-NEXT: vle8.v v8, (a0)
367 ; CHECK-NEXT: vmv.s.x v9, zero
368 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma
369 ; CHECK-NEXT: vwredsum.vs v8, v8, v9
370 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
371 ; CHECK-NEXT: vmv.x.s a0, v8
373 %v = load <16 x i8>, ptr %x
374 %e = sext <16 x i8> %v to <16 x i16>
375 %red = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %e)
379 define i16 @vwreduce_uadd_v16i16(ptr %x) {
380 ; CHECK-LABEL: vwreduce_uadd_v16i16:
382 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
383 ; CHECK-NEXT: vle8.v v8, (a0)
384 ; CHECK-NEXT: vmv.s.x v9, zero
385 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma
386 ; CHECK-NEXT: vwredsumu.vs v8, v8, v9
387 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
388 ; CHECK-NEXT: vmv.x.s a0, v8
390 %v = load <16 x i8>, ptr %x
391 %e = zext <16 x i8> %v to <16 x i16>
392 %red = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %e)
396 declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)
398 define i16 @vreduce_add_v32i16(ptr %x) {
399 ; CHECK-LABEL: vreduce_add_v32i16:
401 ; CHECK-NEXT: li a1, 32
402 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
403 ; CHECK-NEXT: vle16.v v8, (a0)
404 ; CHECK-NEXT: vmv.s.x v12, zero
405 ; CHECK-NEXT: vredsum.vs v8, v8, v12
406 ; CHECK-NEXT: vmv.x.s a0, v8
408 %v = load <32 x i16>, ptr %x
409 %red = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %v)
413 define i16 @vwreduce_add_v32i16(ptr %x) {
414 ; CHECK-LABEL: vwreduce_add_v32i16:
416 ; CHECK-NEXT: li a1, 32
417 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
418 ; CHECK-NEXT: vle8.v v8, (a0)
419 ; CHECK-NEXT: vmv.s.x v10, zero
420 ; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma
421 ; CHECK-NEXT: vwredsum.vs v8, v8, v10
422 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
423 ; CHECK-NEXT: vmv.x.s a0, v8
425 %v = load <32 x i8>, ptr %x
426 %e = sext <32 x i8> %v to <32 x i16>
427 %red = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %e)
431 define i16 @vwreduce_uadd_v32i16(ptr %x) {
432 ; CHECK-LABEL: vwreduce_uadd_v32i16:
434 ; CHECK-NEXT: li a1, 32
435 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
436 ; CHECK-NEXT: vle8.v v8, (a0)
437 ; CHECK-NEXT: vmv.s.x v10, zero
438 ; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma
439 ; CHECK-NEXT: vwredsumu.vs v8, v8, v10
440 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
441 ; CHECK-NEXT: vmv.x.s a0, v8
443 %v = load <32 x i8>, ptr %x
444 %e = zext <32 x i8> %v to <32 x i16>
445 %red = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %e)
449 declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>)
451 define i16 @vreduce_add_v64i16(ptr %x) {
452 ; CHECK-LABEL: vreduce_add_v64i16:
454 ; CHECK-NEXT: li a1, 64
455 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
456 ; CHECK-NEXT: vle16.v v8, (a0)
457 ; CHECK-NEXT: vmv.s.x v16, zero
458 ; CHECK-NEXT: vredsum.vs v8, v8, v16
459 ; CHECK-NEXT: vmv.x.s a0, v8
461 %v = load <64 x i16>, ptr %x
462 %red = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %v)
466 define i16 @vwreduce_add_v64i16(ptr %x) {
467 ; CHECK-LABEL: vwreduce_add_v64i16:
469 ; CHECK-NEXT: li a1, 64
470 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
471 ; CHECK-NEXT: vle8.v v8, (a0)
472 ; CHECK-NEXT: vmv.s.x v12, zero
473 ; CHECK-NEXT: vsetvli zero, zero, e8, m4, ta, ma
474 ; CHECK-NEXT: vwredsum.vs v8, v8, v12
475 ; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma
476 ; CHECK-NEXT: vmv.x.s a0, v8
478 %v = load <64 x i8>, ptr %x
479 %e = sext <64 x i8> %v to <64 x i16>
480 %red = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %e)
484 define i16 @vwreduce_uadd_v64i16(ptr %x) {
485 ; CHECK-LABEL: vwreduce_uadd_v64i16:
487 ; CHECK-NEXT: li a1, 64
488 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
489 ; CHECK-NEXT: vle8.v v8, (a0)
490 ; CHECK-NEXT: vmv.s.x v12, zero
491 ; CHECK-NEXT: vsetvli zero, zero, e8, m4, ta, ma
492 ; CHECK-NEXT: vwredsumu.vs v8, v8, v12
493 ; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma
494 ; CHECK-NEXT: vmv.x.s a0, v8
496 %v = load <64 x i8>, ptr %x
497 %e = zext <64 x i8> %v to <64 x i16>
498 %red = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %e)
502 declare i16 @llvm.vector.reduce.add.v128i16(<128 x i16>)
504 define i16 @vreduce_add_v128i16(ptr %x) {
505 ; CHECK-LABEL: vreduce_add_v128i16:
507 ; CHECK-NEXT: li a1, 64
508 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
509 ; CHECK-NEXT: vle16.v v8, (a0)
510 ; CHECK-NEXT: addi a0, a0, 128
511 ; CHECK-NEXT: vle16.v v16, (a0)
512 ; CHECK-NEXT: vadd.vv v8, v8, v16
513 ; CHECK-NEXT: vmv.s.x v16, zero
514 ; CHECK-NEXT: vredsum.vs v8, v8, v16
515 ; CHECK-NEXT: vmv.x.s a0, v8
517 %v = load <128 x i16>, ptr %x
518 %red = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> %v)
522 define i16 @vwreduce_add_v128i16(ptr %x) {
523 ; CHECK-LABEL: vwreduce_add_v128i16:
525 ; CHECK-NEXT: li a1, 128
526 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
527 ; CHECK-NEXT: vle8.v v8, (a0)
528 ; CHECK-NEXT: li a0, 64
529 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
530 ; CHECK-NEXT: vslidedown.vx v16, v8, a0
531 ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
532 ; CHECK-NEXT: vwadd.vv v24, v8, v16
533 ; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma
534 ; CHECK-NEXT: vmv.s.x v8, zero
535 ; CHECK-NEXT: vredsum.vs v8, v24, v8
536 ; CHECK-NEXT: vmv.x.s a0, v8
538 %v = load <128 x i8>, ptr %x
539 %e = sext <128 x i8> %v to <128 x i16>
540 %red = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> %e)
544 define i16 @vwreduce_uadd_v128i16(ptr %x) {
545 ; CHECK-LABEL: vwreduce_uadd_v128i16:
547 ; CHECK-NEXT: li a1, 128
548 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
549 ; CHECK-NEXT: vle8.v v8, (a0)
550 ; CHECK-NEXT: li a0, 64
551 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
552 ; CHECK-NEXT: vslidedown.vx v16, v8, a0
553 ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
554 ; CHECK-NEXT: vwaddu.vv v24, v8, v16
555 ; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma
556 ; CHECK-NEXT: vmv.s.x v8, zero
557 ; CHECK-NEXT: vredsum.vs v8, v24, v8
558 ; CHECK-NEXT: vmv.x.s a0, v8
560 %v = load <128 x i8>, ptr %x
561 %e = zext <128 x i8> %v to <128 x i16>
562 %red = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> %e)
566 declare i32 @llvm.vector.reduce.add.v1i32(<1 x i32>)
568 define i32 @vreduce_add_v1i32(ptr %x) {
569 ; CHECK-LABEL: vreduce_add_v1i32:
571 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
572 ; CHECK-NEXT: vle32.v v8, (a0)
573 ; CHECK-NEXT: vmv.x.s a0, v8
575 %v = load <1 x i32>, ptr %x
576 %red = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %v)
580 define i32 @vwreduce_add_v1i32(ptr %x) {
581 ; CHECK-LABEL: vwreduce_add_v1i32:
583 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
584 ; CHECK-NEXT: vle16.v v8, (a0)
585 ; CHECK-NEXT: vsext.vf2 v9, v8
586 ; CHECK-NEXT: vmv.x.s a0, v9
588 %v = load <1 x i16>, ptr %x
589 %e = sext <1 x i16> %v to <1 x i32>
590 %red = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %e)
594 define i32 @vwreduce_uadd_v1i32(ptr %x) {
595 ; CHECK-LABEL: vwreduce_uadd_v1i32:
597 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
598 ; CHECK-NEXT: vle16.v v8, (a0)
599 ; CHECK-NEXT: vzext.vf2 v9, v8
600 ; CHECK-NEXT: vmv.x.s a0, v9
602 %v = load <1 x i16>, ptr %x
603 %e = zext <1 x i16> %v to <1 x i32>
604 %red = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %e)
608 declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
610 define i32 @vreduce_add_v2i32(ptr %x) {
611 ; CHECK-LABEL: vreduce_add_v2i32:
613 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
614 ; CHECK-NEXT: vle32.v v8, (a0)
615 ; CHECK-NEXT: vmv.s.x v9, zero
616 ; CHECK-NEXT: vredsum.vs v8, v8, v9
617 ; CHECK-NEXT: vmv.x.s a0, v8
619 %v = load <2 x i32>, ptr %x
620 %red = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %v)
624 define i32 @vwreduce_add_v2i32(ptr %x) {
625 ; CHECK-LABEL: vwreduce_add_v2i32:
627 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
628 ; CHECK-NEXT: vle16.v v8, (a0)
629 ; CHECK-NEXT: vmv.s.x v9, zero
630 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
631 ; CHECK-NEXT: vwredsum.vs v8, v8, v9
632 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
633 ; CHECK-NEXT: vmv.x.s a0, v8
635 %v = load <2 x i16>, ptr %x
636 %e = sext <2 x i16> %v to <2 x i32>
637 %red = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %e)
641 define i32 @vwreduce_uadd_v2i32(ptr %x) {
642 ; CHECK-LABEL: vwreduce_uadd_v2i32:
644 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
645 ; CHECK-NEXT: vle16.v v8, (a0)
646 ; CHECK-NEXT: vmv.s.x v9, zero
647 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
648 ; CHECK-NEXT: vwredsumu.vs v8, v8, v9
649 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
650 ; CHECK-NEXT: vmv.x.s a0, v8
652 %v = load <2 x i16>, ptr %x
653 %e = zext <2 x i16> %v to <2 x i32>
654 %red = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %e)
658 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
660 define i32 @vreduce_add_v4i32(ptr %x) {
661 ; CHECK-LABEL: vreduce_add_v4i32:
663 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
664 ; CHECK-NEXT: vle32.v v8, (a0)
665 ; CHECK-NEXT: vmv.s.x v9, zero
666 ; CHECK-NEXT: vredsum.vs v8, v8, v9
667 ; CHECK-NEXT: vmv.x.s a0, v8
669 %v = load <4 x i32>, ptr %x
670 %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)
674 define i32 @vwreduce_add_v4i32(ptr %x) {
675 ; CHECK-LABEL: vwreduce_add_v4i32:
677 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
678 ; CHECK-NEXT: vle16.v v8, (a0)
679 ; CHECK-NEXT: vmv.s.x v9, zero
680 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
681 ; CHECK-NEXT: vwredsum.vs v8, v8, v9
682 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
683 ; CHECK-NEXT: vmv.x.s a0, v8
685 %v = load <4 x i16>, ptr %x
686 %e = sext <4 x i16> %v to <4 x i32>
687 %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %e)
691 define i32 @vwreduce_uadd_v4i32(ptr %x) {
692 ; CHECK-LABEL: vwreduce_uadd_v4i32:
694 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
695 ; CHECK-NEXT: vle16.v v8, (a0)
696 ; CHECK-NEXT: vmv.s.x v9, zero
697 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
698 ; CHECK-NEXT: vwredsumu.vs v8, v8, v9
699 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
700 ; CHECK-NEXT: vmv.x.s a0, v8
702 %v = load <4 x i16>, ptr %x
703 %e = zext <4 x i16> %v to <4 x i32>
704 %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %e)
708 declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
710 define i32 @vreduce_add_v8i32(ptr %x) {
711 ; CHECK-LABEL: vreduce_add_v8i32:
713 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
714 ; CHECK-NEXT: vle32.v v8, (a0)
715 ; CHECK-NEXT: vmv.s.x v10, zero
716 ; CHECK-NEXT: vredsum.vs v8, v8, v10
717 ; CHECK-NEXT: vmv.x.s a0, v8
719 %v = load <8 x i32>, ptr %x
720 %red = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %v)
724 define i32 @vwreduce_add_v8i32(ptr %x) {
725 ; CHECK-LABEL: vwreduce_add_v8i32:
727 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
728 ; CHECK-NEXT: vle16.v v8, (a0)
729 ; CHECK-NEXT: vmv.s.x v9, zero
730 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
731 ; CHECK-NEXT: vwredsum.vs v8, v8, v9
732 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
733 ; CHECK-NEXT: vmv.x.s a0, v8
735 %v = load <8 x i16>, ptr %x
736 %e = sext <8 x i16> %v to <8 x i32>
737 %red = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %e)
741 define i32 @vwreduce_uadd_v8i32(ptr %x) {
742 ; CHECK-LABEL: vwreduce_uadd_v8i32:
744 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
745 ; CHECK-NEXT: vle16.v v8, (a0)
746 ; CHECK-NEXT: vmv.s.x v9, zero
747 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
748 ; CHECK-NEXT: vwredsumu.vs v8, v8, v9
749 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
750 ; CHECK-NEXT: vmv.x.s a0, v8
752 %v = load <8 x i16>, ptr %x
753 %e = zext <8 x i16> %v to <8 x i32>
754 %red = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %e)
758 declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
760 define i32 @vreduce_add_v16i32(ptr %x) {
761 ; CHECK-LABEL: vreduce_add_v16i32:
763 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
764 ; CHECK-NEXT: vle32.v v8, (a0)
765 ; CHECK-NEXT: vmv.s.x v12, zero
766 ; CHECK-NEXT: vredsum.vs v8, v8, v12
767 ; CHECK-NEXT: vmv.x.s a0, v8
769 %v = load <16 x i32>, ptr %x
770 %red = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v)
774 define i32 @vwreduce_add_v16i32(ptr %x) {
775 ; CHECK-LABEL: vwreduce_add_v16i32:
777 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
778 ; CHECK-NEXT: vle16.v v8, (a0)
779 ; CHECK-NEXT: vmv.s.x v10, zero
780 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
781 ; CHECK-NEXT: vwredsum.vs v8, v8, v10
782 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
783 ; CHECK-NEXT: vmv.x.s a0, v8
785 %v = load <16 x i16>, ptr %x
786 %e = sext <16 x i16> %v to <16 x i32>
787 %red = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %e)
791 define i32 @vwreduce_uadd_v16i32(ptr %x) {
792 ; CHECK-LABEL: vwreduce_uadd_v16i32:
794 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
795 ; CHECK-NEXT: vle16.v v8, (a0)
796 ; CHECK-NEXT: vmv.s.x v10, zero
797 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
798 ; CHECK-NEXT: vwredsumu.vs v8, v8, v10
799 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
800 ; CHECK-NEXT: vmv.x.s a0, v8
802 %v = load <16 x i16>, ptr %x
803 %e = zext <16 x i16> %v to <16 x i32>
804 %red = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %e)
808 declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)
810 define i32 @vreduce_add_v32i32(ptr %x) {
811 ; CHECK-LABEL: vreduce_add_v32i32:
813 ; CHECK-NEXT: li a1, 32
814 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
815 ; CHECK-NEXT: vle32.v v8, (a0)
816 ; CHECK-NEXT: vmv.s.x v16, zero
817 ; CHECK-NEXT: vredsum.vs v8, v8, v16
818 ; CHECK-NEXT: vmv.x.s a0, v8
820 %v = load <32 x i32>, ptr %x
821 %red = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %v)
825 define i32 @vwreduce_add_v32i32(ptr %x) {
826 ; CHECK-LABEL: vwreduce_add_v32i32:
828 ; CHECK-NEXT: li a1, 32
829 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
830 ; CHECK-NEXT: vle16.v v8, (a0)
831 ; CHECK-NEXT: vmv.s.x v12, zero
832 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
833 ; CHECK-NEXT: vwredsum.vs v8, v8, v12
834 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
835 ; CHECK-NEXT: vmv.x.s a0, v8
837 %v = load <32 x i16>, ptr %x
838 %e = sext <32 x i16> %v to <32 x i32>
839 %red = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %e)
843 define i32 @vwreduce_uadd_v32i32(ptr %x) {
844 ; CHECK-LABEL: vwreduce_uadd_v32i32:
846 ; CHECK-NEXT: li a1, 32
847 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
848 ; CHECK-NEXT: vle16.v v8, (a0)
849 ; CHECK-NEXT: vmv.s.x v12, zero
850 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
851 ; CHECK-NEXT: vwredsumu.vs v8, v8, v12
852 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
853 ; CHECK-NEXT: vmv.x.s a0, v8
855 %v = load <32 x i16>, ptr %x
856 %e = zext <32 x i16> %v to <32 x i32>
857 %red = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %e)
861 declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>)
863 define i32 @vreduce_add_v64i32(ptr %x) {
864 ; CHECK-LABEL: vreduce_add_v64i32:
866 ; CHECK-NEXT: li a1, 32
867 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
868 ; CHECK-NEXT: vle32.v v8, (a0)
869 ; CHECK-NEXT: addi a0, a0, 128
870 ; CHECK-NEXT: vle32.v v16, (a0)
871 ; CHECK-NEXT: vadd.vv v8, v8, v16
872 ; CHECK-NEXT: vmv.s.x v16, zero
873 ; CHECK-NEXT: vredsum.vs v8, v8, v16
874 ; CHECK-NEXT: vmv.x.s a0, v8
876 %v = load <64 x i32>, ptr %x
877 %red = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %v)
881 define i32 @vwreduce_add_v64i32(ptr %x) {
882 ; CHECK-LABEL: vwreduce_add_v64i32:
884 ; CHECK-NEXT: li a1, 64
885 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
886 ; CHECK-NEXT: vle16.v v8, (a0)
887 ; CHECK-NEXT: li a0, 32
888 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
889 ; CHECK-NEXT: vslidedown.vx v16, v8, a0
890 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
891 ; CHECK-NEXT: vwadd.vv v24, v8, v16
892 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
893 ; CHECK-NEXT: vmv.s.x v8, zero
894 ; CHECK-NEXT: vredsum.vs v8, v24, v8
895 ; CHECK-NEXT: vmv.x.s a0, v8
897 %v = load <64 x i16>, ptr %x
898 %e = sext <64 x i16> %v to <64 x i32>
899 %red = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %e)
903 define i32 @vwreduce_uadd_v64i32(ptr %x) {
904 ; CHECK-LABEL: vwreduce_uadd_v64i32:
906 ; CHECK-NEXT: li a1, 64
907 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
908 ; CHECK-NEXT: vle16.v v8, (a0)
909 ; CHECK-NEXT: li a0, 32
910 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
911 ; CHECK-NEXT: vslidedown.vx v16, v8, a0
912 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
913 ; CHECK-NEXT: vwaddu.vv v24, v8, v16
914 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
915 ; CHECK-NEXT: vmv.s.x v8, zero
916 ; CHECK-NEXT: vredsum.vs v8, v24, v8
917 ; CHECK-NEXT: vmv.x.s a0, v8
919 %v = load <64 x i16>, ptr %x
920 %e = zext <64 x i16> %v to <64 x i32>
921 %red = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %e)
925 declare i64 @llvm.vector.reduce.add.v1i64(<1 x i64>)
927 define i64 @vreduce_add_v1i64(ptr %x) {
928 ; RV32-LABEL: vreduce_add_v1i64:
930 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
931 ; RV32-NEXT: vle64.v v8, (a0)
932 ; RV32-NEXT: li a0, 32
933 ; RV32-NEXT: vsrl.vx v9, v8, a0
934 ; RV32-NEXT: vmv.x.s a1, v9
935 ; RV32-NEXT: vmv.x.s a0, v8
938 ; RV64-LABEL: vreduce_add_v1i64:
940 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
941 ; RV64-NEXT: vle64.v v8, (a0)
942 ; RV64-NEXT: vmv.x.s a0, v8
944 %v = load <1 x i64>, ptr %x
945 %red = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %v)
949 define i64 @vwreduce_add_v1i64(ptr %x) {
950 ; RV32-LABEL: vwreduce_add_v1i64:
952 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
953 ; RV32-NEXT: vle32.v v8, (a0)
954 ; RV32-NEXT: vsext.vf2 v9, v8
955 ; RV32-NEXT: li a0, 32
956 ; RV32-NEXT: vsrl.vx v8, v9, a0
957 ; RV32-NEXT: vmv.x.s a1, v8
958 ; RV32-NEXT: vmv.x.s a0, v9
961 ; RV64-LABEL: vwreduce_add_v1i64:
963 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
964 ; RV64-NEXT: vle32.v v8, (a0)
965 ; RV64-NEXT: vsext.vf2 v9, v8
966 ; RV64-NEXT: vmv.x.s a0, v9
968 %v = load <1 x i32>, ptr %x
969 %e = sext <1 x i32> %v to <1 x i64>
970 %red = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %e)
974 define i64 @vwreduce_uadd_v1i64(ptr %x) {
975 ; RV32-LABEL: vwreduce_uadd_v1i64:
977 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
978 ; RV32-NEXT: vle32.v v8, (a0)
979 ; RV32-NEXT: vzext.vf2 v9, v8
980 ; RV32-NEXT: li a0, 32
981 ; RV32-NEXT: vsrl.vx v8, v9, a0
982 ; RV32-NEXT: vmv.x.s a1, v8
983 ; RV32-NEXT: vmv.x.s a0, v9
986 ; RV64-LABEL: vwreduce_uadd_v1i64:
988 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
989 ; RV64-NEXT: vle32.v v8, (a0)
990 ; RV64-NEXT: vzext.vf2 v9, v8
991 ; RV64-NEXT: vmv.x.s a0, v9
993 %v = load <1 x i32>, ptr %x
994 %e = zext <1 x i32> %v to <1 x i64>
995 %red = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %e)
999 declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
1001 define i64 @vreduce_add_v2i64(ptr %x) {
1002 ; RV32-LABEL: vreduce_add_v2i64:
1004 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
1005 ; RV32-NEXT: vle64.v v8, (a0)
1006 ; RV32-NEXT: vmv.s.x v9, zero
1007 ; RV32-NEXT: vredsum.vs v8, v8, v9
1008 ; RV32-NEXT: vmv.x.s a0, v8
1009 ; RV32-NEXT: li a1, 32
1010 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1011 ; RV32-NEXT: vsrl.vx v8, v8, a1
1012 ; RV32-NEXT: vmv.x.s a1, v8
1015 ; RV64-LABEL: vreduce_add_v2i64:
1017 ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
1018 ; RV64-NEXT: vle64.v v8, (a0)
1019 ; RV64-NEXT: vmv.s.x v9, zero
1020 ; RV64-NEXT: vredsum.vs v8, v8, v9
1021 ; RV64-NEXT: vmv.x.s a0, v8
1023 %v = load <2 x i64>, ptr %x
1024 %red = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %v)
1028 define i64 @vwreduce_add_v2i64(ptr %x) {
1029 ; RV32-LABEL: vwreduce_add_v2i64:
1031 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
1032 ; RV32-NEXT: vle32.v v8, (a0)
1033 ; RV32-NEXT: vmv.s.x v9, zero
1034 ; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
1035 ; RV32-NEXT: vwredsum.vs v8, v8, v9
1036 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1037 ; RV32-NEXT: vmv.x.s a0, v8
1038 ; RV32-NEXT: li a1, 32
1039 ; RV32-NEXT: vsrl.vx v8, v8, a1
1040 ; RV32-NEXT: vmv.x.s a1, v8
1043 ; RV64-LABEL: vwreduce_add_v2i64:
1045 ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
1046 ; RV64-NEXT: vle32.v v8, (a0)
1047 ; RV64-NEXT: vmv.s.x v9, zero
1048 ; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
1049 ; RV64-NEXT: vwredsum.vs v8, v8, v9
1050 ; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, ma
1051 ; RV64-NEXT: vmv.x.s a0, v8
1053 %v = load <2 x i32>, ptr %x
1054 %e = sext <2 x i32> %v to <2 x i64>
1055 %red = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %e)
1059 define i64 @vwreduce_uadd_v2i64(ptr %x) {
1060 ; RV32-LABEL: vwreduce_uadd_v2i64:
1062 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
1063 ; RV32-NEXT: vle32.v v8, (a0)
1064 ; RV32-NEXT: vmv.s.x v9, zero
1065 ; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
1066 ; RV32-NEXT: vwredsumu.vs v8, v8, v9
1067 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1068 ; RV32-NEXT: vmv.x.s a0, v8
1069 ; RV32-NEXT: li a1, 32
1070 ; RV32-NEXT: vsrl.vx v8, v8, a1
1071 ; RV32-NEXT: vmv.x.s a1, v8
1074 ; RV64-LABEL: vwreduce_uadd_v2i64:
1076 ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
1077 ; RV64-NEXT: vle32.v v8, (a0)
1078 ; RV64-NEXT: vmv.s.x v9, zero
1079 ; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
1080 ; RV64-NEXT: vwredsumu.vs v8, v8, v9
1081 ; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, ma
1082 ; RV64-NEXT: vmv.x.s a0, v8
1084 %v = load <2 x i32>, ptr %x
1085 %e = zext <2 x i32> %v to <2 x i64>
1086 %red = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %e)
1090 declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
1092 define i64 @vreduce_add_v4i64(ptr %x) {
1093 ; RV32-LABEL: vreduce_add_v4i64:
1095 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
1096 ; RV32-NEXT: vle64.v v8, (a0)
1097 ; RV32-NEXT: vmv.s.x v10, zero
1098 ; RV32-NEXT: vredsum.vs v8, v8, v10
1099 ; RV32-NEXT: vmv.x.s a0, v8
1100 ; RV32-NEXT: li a1, 32
1101 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1102 ; RV32-NEXT: vsrl.vx v8, v8, a1
1103 ; RV32-NEXT: vmv.x.s a1, v8
1106 ; RV64-LABEL: vreduce_add_v4i64:
1108 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
1109 ; RV64-NEXT: vle64.v v8, (a0)
1110 ; RV64-NEXT: vmv.s.x v10, zero
1111 ; RV64-NEXT: vredsum.vs v8, v8, v10
1112 ; RV64-NEXT: vmv.x.s a0, v8
1114 %v = load <4 x i64>, ptr %x
1115 %red = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %v)
1119 define i64 @vwreduce_add_v4i64(ptr %x) {
1120 ; RV32-LABEL: vwreduce_add_v4i64:
1122 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
1123 ; RV32-NEXT: vle32.v v8, (a0)
1124 ; RV32-NEXT: vmv.s.x v9, zero
1125 ; RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma
1126 ; RV32-NEXT: vwredsum.vs v8, v8, v9
1127 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1128 ; RV32-NEXT: vmv.x.s a0, v8
1129 ; RV32-NEXT: li a1, 32
1130 ; RV32-NEXT: vsrl.vx v8, v8, a1
1131 ; RV32-NEXT: vmv.x.s a1, v8
1134 ; RV64-LABEL: vwreduce_add_v4i64:
1136 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
1137 ; RV64-NEXT: vle32.v v8, (a0)
1138 ; RV64-NEXT: vmv.s.x v9, zero
1139 ; RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma
1140 ; RV64-NEXT: vwredsum.vs v8, v8, v9
1141 ; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma
1142 ; RV64-NEXT: vmv.x.s a0, v8
1144 %v = load <4 x i32>, ptr %x
1145 %e = sext <4 x i32> %v to <4 x i64>
1146 %red = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %e)
1150 define i64 @vwreduce_uadd_v4i64(ptr %x) {
1151 ; RV32-LABEL: vwreduce_uadd_v4i64:
1153 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
1154 ; RV32-NEXT: vle32.v v8, (a0)
1155 ; RV32-NEXT: vmv.s.x v9, zero
1156 ; RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma
1157 ; RV32-NEXT: vwredsumu.vs v8, v8, v9
1158 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1159 ; RV32-NEXT: vmv.x.s a0, v8
1160 ; RV32-NEXT: li a1, 32
1161 ; RV32-NEXT: vsrl.vx v8, v8, a1
1162 ; RV32-NEXT: vmv.x.s a1, v8
1165 ; RV64-LABEL: vwreduce_uadd_v4i64:
1167 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
1168 ; RV64-NEXT: vle32.v v8, (a0)
1169 ; RV64-NEXT: vmv.s.x v9, zero
1170 ; RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma
1171 ; RV64-NEXT: vwredsumu.vs v8, v8, v9
1172 ; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma
1173 ; RV64-NEXT: vmv.x.s a0, v8
1175 %v = load <4 x i32>, ptr %x
1176 %e = zext <4 x i32> %v to <4 x i64>
1177 %red = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %e)
1181 declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
1183 define i64 @vreduce_add_v8i64(ptr %x) {
1184 ; RV32-LABEL: vreduce_add_v8i64:
1186 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
1187 ; RV32-NEXT: vle64.v v8, (a0)
1188 ; RV32-NEXT: vmv.s.x v12, zero
1189 ; RV32-NEXT: vredsum.vs v8, v8, v12
1190 ; RV32-NEXT: vmv.x.s a0, v8
1191 ; RV32-NEXT: li a1, 32
1192 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1193 ; RV32-NEXT: vsrl.vx v8, v8, a1
1194 ; RV32-NEXT: vmv.x.s a1, v8
1197 ; RV64-LABEL: vreduce_add_v8i64:
1199 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
1200 ; RV64-NEXT: vle64.v v8, (a0)
1201 ; RV64-NEXT: vmv.s.x v12, zero
1202 ; RV64-NEXT: vredsum.vs v8, v8, v12
1203 ; RV64-NEXT: vmv.x.s a0, v8
1205 %v = load <8 x i64>, ptr %x
1206 %red = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %v)
1210 define i64 @vwreduce_add_v8i64(ptr %x) {
1211 ; RV32-LABEL: vwreduce_add_v8i64:
1213 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
1214 ; RV32-NEXT: vle32.v v8, (a0)
1215 ; RV32-NEXT: vmv.s.x v10, zero
1216 ; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma
1217 ; RV32-NEXT: vwredsum.vs v8, v8, v10
1218 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1219 ; RV32-NEXT: vmv.x.s a0, v8
1220 ; RV32-NEXT: li a1, 32
1221 ; RV32-NEXT: vsrl.vx v8, v8, a1
1222 ; RV32-NEXT: vmv.x.s a1, v8
1225 ; RV64-LABEL: vwreduce_add_v8i64:
1227 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
1228 ; RV64-NEXT: vle32.v v8, (a0)
1229 ; RV64-NEXT: vmv.s.x v10, zero
1230 ; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma
1231 ; RV64-NEXT: vwredsum.vs v8, v8, v10
1232 ; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, ma
1233 ; RV64-NEXT: vmv.x.s a0, v8
1235 %v = load <8 x i32>, ptr %x
1236 %e = sext <8 x i32> %v to <8 x i64>
1237 %red = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %e)
1241 define i64 @vwreduce_uadd_v8i64(ptr %x) {
1242 ; RV32-LABEL: vwreduce_uadd_v8i64:
1244 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
1245 ; RV32-NEXT: vle32.v v8, (a0)
1246 ; RV32-NEXT: vmv.s.x v10, zero
1247 ; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma
1248 ; RV32-NEXT: vwredsumu.vs v8, v8, v10
1249 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1250 ; RV32-NEXT: vmv.x.s a0, v8
1251 ; RV32-NEXT: li a1, 32
1252 ; RV32-NEXT: vsrl.vx v8, v8, a1
1253 ; RV32-NEXT: vmv.x.s a1, v8
1256 ; RV64-LABEL: vwreduce_uadd_v8i64:
1258 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
1259 ; RV64-NEXT: vle32.v v8, (a0)
1260 ; RV64-NEXT: vmv.s.x v10, zero
1261 ; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma
1262 ; RV64-NEXT: vwredsumu.vs v8, v8, v10
1263 ; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, ma
1264 ; RV64-NEXT: vmv.x.s a0, v8
1266 %v = load <8 x i32>, ptr %x
1267 %e = zext <8 x i32> %v to <8 x i64>
1268 %red = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %e)
1272 declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
1274 define i64 @vreduce_add_v16i64(ptr %x) {
1275 ; RV32-LABEL: vreduce_add_v16i64:
1277 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
1278 ; RV32-NEXT: vle64.v v8, (a0)
1279 ; RV32-NEXT: vmv.s.x v16, zero
1280 ; RV32-NEXT: vredsum.vs v8, v8, v16
1281 ; RV32-NEXT: vmv.x.s a0, v8
1282 ; RV32-NEXT: li a1, 32
1283 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1284 ; RV32-NEXT: vsrl.vx v8, v8, a1
1285 ; RV32-NEXT: vmv.x.s a1, v8
1288 ; RV64-LABEL: vreduce_add_v16i64:
1290 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
1291 ; RV64-NEXT: vle64.v v8, (a0)
1292 ; RV64-NEXT: vmv.s.x v16, zero
1293 ; RV64-NEXT: vredsum.vs v8, v8, v16
1294 ; RV64-NEXT: vmv.x.s a0, v8
1296 %v = load <16 x i64>, ptr %x
1297 %red = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %v)
1301 define i64 @vwreduce_add_v16i64(ptr %x) {
1302 ; RV32-LABEL: vwreduce_add_v16i64:
1304 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
1305 ; RV32-NEXT: vle32.v v8, (a0)
1306 ; RV32-NEXT: vmv.s.x v12, zero
1307 ; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma
1308 ; RV32-NEXT: vwredsum.vs v8, v8, v12
1309 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1310 ; RV32-NEXT: vmv.x.s a0, v8
1311 ; RV32-NEXT: li a1, 32
1312 ; RV32-NEXT: vsrl.vx v8, v8, a1
1313 ; RV32-NEXT: vmv.x.s a1, v8
1316 ; RV64-LABEL: vwreduce_add_v16i64:
1318 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
1319 ; RV64-NEXT: vle32.v v8, (a0)
1320 ; RV64-NEXT: vmv.s.x v12, zero
1321 ; RV64-NEXT: vsetvli zero, zero, e32, m4, ta, ma
1322 ; RV64-NEXT: vwredsum.vs v8, v8, v12
1323 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
1324 ; RV64-NEXT: vmv.x.s a0, v8
1326 %v = load <16 x i32>, ptr %x
1327 %e = sext <16 x i32> %v to <16 x i64>
1328 %red = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %e)
1332 define i64 @vwreduce_uadd_v16i64(ptr %x) {
1333 ; RV32-LABEL: vwreduce_uadd_v16i64:
1335 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
1336 ; RV32-NEXT: vle32.v v8, (a0)
1337 ; RV32-NEXT: vmv.s.x v12, zero
1338 ; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma
1339 ; RV32-NEXT: vwredsumu.vs v8, v8, v12
1340 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1341 ; RV32-NEXT: vmv.x.s a0, v8
1342 ; RV32-NEXT: li a1, 32
1343 ; RV32-NEXT: vsrl.vx v8, v8, a1
1344 ; RV32-NEXT: vmv.x.s a1, v8
1347 ; RV64-LABEL: vwreduce_uadd_v16i64:
1349 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
1350 ; RV64-NEXT: vle32.v v8, (a0)
1351 ; RV64-NEXT: vmv.s.x v12, zero
1352 ; RV64-NEXT: vsetvli zero, zero, e32, m4, ta, ma
1353 ; RV64-NEXT: vwredsumu.vs v8, v8, v12
1354 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
1355 ; RV64-NEXT: vmv.x.s a0, v8
1357 %v = load <16 x i32>, ptr %x
1358 %e = zext <16 x i32> %v to <16 x i64>
1359 %red = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %e)
1363 declare i64 @llvm.vector.reduce.add.v32i64(<32 x i64>)
1365 define i64 @vreduce_add_v32i64(ptr %x) {
1366 ; RV32-LABEL: vreduce_add_v32i64:
1368 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
1369 ; RV32-NEXT: vle64.v v8, (a0)
1370 ; RV32-NEXT: addi a0, a0, 128
1371 ; RV32-NEXT: vle64.v v16, (a0)
1372 ; RV32-NEXT: vadd.vv v8, v8, v16
1373 ; RV32-NEXT: vmv.s.x v16, zero
1374 ; RV32-NEXT: vredsum.vs v8, v8, v16
1375 ; RV32-NEXT: vmv.x.s a0, v8
1376 ; RV32-NEXT: li a1, 32
1377 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1378 ; RV32-NEXT: vsrl.vx v8, v8, a1
1379 ; RV32-NEXT: vmv.x.s a1, v8
1382 ; RV64-LABEL: vreduce_add_v32i64:
1384 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
1385 ; RV64-NEXT: vle64.v v8, (a0)
1386 ; RV64-NEXT: addi a0, a0, 128
1387 ; RV64-NEXT: vle64.v v16, (a0)
1388 ; RV64-NEXT: vadd.vv v8, v8, v16
1389 ; RV64-NEXT: vmv.s.x v16, zero
1390 ; RV64-NEXT: vredsum.vs v8, v8, v16
1391 ; RV64-NEXT: vmv.x.s a0, v8
1393 %v = load <32 x i64>, ptr %x
1394 %red = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> %v)
1398 define i64 @vwreduce_add_v32i64(ptr %x) {
1399 ; RV32-LABEL: vwreduce_add_v32i64:
1401 ; RV32-NEXT: li a1, 32
1402 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
1403 ; RV32-NEXT: vle32.v v8, (a0)
1404 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
1405 ; RV32-NEXT: vslidedown.vi v16, v8, 16
1406 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
1407 ; RV32-NEXT: vwadd.vv v24, v8, v16
1408 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
1409 ; RV32-NEXT: vmv.s.x v8, zero
1410 ; RV32-NEXT: vredsum.vs v8, v24, v8
1411 ; RV32-NEXT: vmv.x.s a0, v8
1412 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1413 ; RV32-NEXT: vsrl.vx v8, v8, a1
1414 ; RV32-NEXT: vmv.x.s a1, v8
1417 ; RV64-LABEL: vwreduce_add_v32i64:
1419 ; RV64-NEXT: li a1, 32
1420 ; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma
1421 ; RV64-NEXT: vle32.v v8, (a0)
1422 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma
1423 ; RV64-NEXT: vslidedown.vi v16, v8, 16
1424 ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
1425 ; RV64-NEXT: vwadd.vv v24, v8, v16
1426 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
1427 ; RV64-NEXT: vmv.s.x v8, zero
1428 ; RV64-NEXT: vredsum.vs v8, v24, v8
1429 ; RV64-NEXT: vmv.x.s a0, v8
1431 %v = load <32 x i32>, ptr %x
1432 %e = sext <32 x i32> %v to <32 x i64>
1433 %red = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> %e)
1437 define i64 @vwreduce_uadd_v32i64(ptr %x) {
1438 ; RV32-LABEL: vwreduce_uadd_v32i64:
1440 ; RV32-NEXT: li a1, 32
1441 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
1442 ; RV32-NEXT: vle32.v v8, (a0)
1443 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
1444 ; RV32-NEXT: vslidedown.vi v16, v8, 16
1445 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
1446 ; RV32-NEXT: vwaddu.vv v24, v8, v16
1447 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
1448 ; RV32-NEXT: vmv.s.x v8, zero
1449 ; RV32-NEXT: vredsum.vs v8, v24, v8
1450 ; RV32-NEXT: vmv.x.s a0, v8
1451 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1452 ; RV32-NEXT: vsrl.vx v8, v8, a1
1453 ; RV32-NEXT: vmv.x.s a1, v8
1456 ; RV64-LABEL: vwreduce_uadd_v32i64:
1458 ; RV64-NEXT: li a1, 32
1459 ; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma
1460 ; RV64-NEXT: vle32.v v8, (a0)
1461 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma
1462 ; RV64-NEXT: vslidedown.vi v16, v8, 16
1463 ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
1464 ; RV64-NEXT: vwaddu.vv v24, v8, v16
1465 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
1466 ; RV64-NEXT: vmv.s.x v8, zero
1467 ; RV64-NEXT: vredsum.vs v8, v24, v8
1468 ; RV64-NEXT: vmv.x.s a0, v8
1470 %v = load <32 x i32>, ptr %x
1471 %e = zext <32 x i32> %v to <32 x i64>
1472 %red = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> %e)
1476 declare i64 @llvm.vector.reduce.add.v64i64(<64 x i64>)
1478 define i64 @vreduce_add_v64i64(ptr %x) nounwind {
1479 ; RV32-LABEL: vreduce_add_v64i64:
1481 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
1482 ; RV32-NEXT: vle64.v v8, (a0)
1483 ; RV32-NEXT: addi a1, a0, 384
1484 ; RV32-NEXT: vle64.v v16, (a1)
1485 ; RV32-NEXT: addi a1, a0, 256
1486 ; RV32-NEXT: addi a0, a0, 128
1487 ; RV32-NEXT: vle64.v v24, (a0)
1488 ; RV32-NEXT: vle64.v v0, (a1)
1489 ; RV32-NEXT: vadd.vv v16, v24, v16
1490 ; RV32-NEXT: vadd.vv v8, v8, v0
1491 ; RV32-NEXT: vadd.vv v8, v8, v16
1492 ; RV32-NEXT: vmv.s.x v16, zero
1493 ; RV32-NEXT: vredsum.vs v8, v8, v16
1494 ; RV32-NEXT: vmv.x.s a0, v8
1495 ; RV32-NEXT: li a1, 32
1496 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1497 ; RV32-NEXT: vsrl.vx v8, v8, a1
1498 ; RV32-NEXT: vmv.x.s a1, v8
1501 ; RV64-LABEL: vreduce_add_v64i64:
1503 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
1504 ; RV64-NEXT: vle64.v v8, (a0)
1505 ; RV64-NEXT: addi a1, a0, 384
1506 ; RV64-NEXT: vle64.v v16, (a1)
1507 ; RV64-NEXT: addi a1, a0, 256
1508 ; RV64-NEXT: addi a0, a0, 128
1509 ; RV64-NEXT: vle64.v v24, (a0)
1510 ; RV64-NEXT: vle64.v v0, (a1)
1511 ; RV64-NEXT: vadd.vv v16, v24, v16
1512 ; RV64-NEXT: vadd.vv v8, v8, v0
1513 ; RV64-NEXT: vadd.vv v8, v8, v16
1514 ; RV64-NEXT: vmv.s.x v16, zero
1515 ; RV64-NEXT: vredsum.vs v8, v8, v16
1516 ; RV64-NEXT: vmv.x.s a0, v8
1518 %v = load <64 x i64>, ptr %x
1519 %red = call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> %v)
1523 define i64 @vwreduce_add_v64i64(ptr %x) {
1524 ; RV32-LABEL: vwreduce_add_v64i64:
1526 ; RV32-NEXT: addi sp, sp, -16
1527 ; RV32-NEXT: .cfi_def_cfa_offset 16
1528 ; RV32-NEXT: csrr a1, vlenb
1529 ; RV32-NEXT: slli a1, a1, 4
1530 ; RV32-NEXT: sub sp, sp, a1
1531 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
1532 ; RV32-NEXT: addi a1, a0, 128
1533 ; RV32-NEXT: li a2, 32
1534 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
1535 ; RV32-NEXT: vle32.v v8, (a0)
1536 ; RV32-NEXT: addi a0, sp, 16
1537 ; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
1538 ; RV32-NEXT: vle32.v v16, (a1)
1539 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
1540 ; RV32-NEXT: vslidedown.vi v24, v8, 16
1541 ; RV32-NEXT: vslidedown.vi v0, v16, 16
1542 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
1543 ; RV32-NEXT: vwadd.vv v8, v24, v0
1544 ; RV32-NEXT: csrr a0, vlenb
1545 ; RV32-NEXT: slli a0, a0, 3
1546 ; RV32-NEXT: add a0, sp, a0
1547 ; RV32-NEXT: addi a0, a0, 16
1548 ; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
1549 ; RV32-NEXT: addi a0, sp, 16
1550 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
1551 ; RV32-NEXT: vwadd.vv v0, v8, v16
1552 ; RV32-NEXT: csrr a0, vlenb
1553 ; RV32-NEXT: slli a0, a0, 3
1554 ; RV32-NEXT: add a0, sp, a0
1555 ; RV32-NEXT: addi a0, a0, 16
1556 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
1557 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
1558 ; RV32-NEXT: vadd.vv v8, v0, v8
1559 ; RV32-NEXT: vmv.s.x v16, zero
1560 ; RV32-NEXT: vredsum.vs v8, v8, v16
1561 ; RV32-NEXT: vmv.x.s a0, v8
1562 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1563 ; RV32-NEXT: vsrl.vx v8, v8, a2
1564 ; RV32-NEXT: vmv.x.s a1, v8
1565 ; RV32-NEXT: csrr a2, vlenb
1566 ; RV32-NEXT: slli a2, a2, 4
1567 ; RV32-NEXT: add sp, sp, a2
1568 ; RV32-NEXT: addi sp, sp, 16
1571 ; RV64-LABEL: vwreduce_add_v64i64:
1573 ; RV64-NEXT: addi sp, sp, -16
1574 ; RV64-NEXT: .cfi_def_cfa_offset 16
1575 ; RV64-NEXT: csrr a1, vlenb
1576 ; RV64-NEXT: slli a1, a1, 4
1577 ; RV64-NEXT: sub sp, sp, a1
1578 ; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
1579 ; RV64-NEXT: addi a1, a0, 128
1580 ; RV64-NEXT: li a2, 32
1581 ; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma
1582 ; RV64-NEXT: vle32.v v8, (a0)
1583 ; RV64-NEXT: addi a0, sp, 16
1584 ; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
1585 ; RV64-NEXT: vle32.v v16, (a1)
1586 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma
1587 ; RV64-NEXT: vslidedown.vi v24, v8, 16
1588 ; RV64-NEXT: vslidedown.vi v0, v16, 16
1589 ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
1590 ; RV64-NEXT: vwadd.vv v8, v24, v0
1591 ; RV64-NEXT: csrr a0, vlenb
1592 ; RV64-NEXT: slli a0, a0, 3
1593 ; RV64-NEXT: add a0, sp, a0
1594 ; RV64-NEXT: addi a0, a0, 16
1595 ; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
1596 ; RV64-NEXT: addi a0, sp, 16
1597 ; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
1598 ; RV64-NEXT: vwadd.vv v0, v8, v16
1599 ; RV64-NEXT: csrr a0, vlenb
1600 ; RV64-NEXT: slli a0, a0, 3
1601 ; RV64-NEXT: add a0, sp, a0
1602 ; RV64-NEXT: addi a0, a0, 16
1603 ; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
1604 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
1605 ; RV64-NEXT: vadd.vv v8, v0, v8
1606 ; RV64-NEXT: vmv.s.x v16, zero
1607 ; RV64-NEXT: vredsum.vs v8, v8, v16
1608 ; RV64-NEXT: vmv.x.s a0, v8
1609 ; RV64-NEXT: csrr a1, vlenb
1610 ; RV64-NEXT: slli a1, a1, 4
1611 ; RV64-NEXT: add sp, sp, a1
1612 ; RV64-NEXT: addi sp, sp, 16
1614 %v = load <64 x i32>, ptr %x
1615 %e = sext <64 x i32> %v to <64 x i64>
1616 %red = call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> %e)
1620 define i64 @vwreduce_uadd_v64i64(ptr %x) {
1621 ; RV32-LABEL: vwreduce_uadd_v64i64:
1623 ; RV32-NEXT: addi sp, sp, -16
1624 ; RV32-NEXT: .cfi_def_cfa_offset 16
1625 ; RV32-NEXT: csrr a1, vlenb
1626 ; RV32-NEXT: slli a1, a1, 4
1627 ; RV32-NEXT: sub sp, sp, a1
1628 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
1629 ; RV32-NEXT: addi a1, a0, 128
1630 ; RV32-NEXT: li a2, 32
1631 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
1632 ; RV32-NEXT: vle32.v v8, (a0)
1633 ; RV32-NEXT: addi a0, sp, 16
1634 ; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
1635 ; RV32-NEXT: vle32.v v16, (a1)
1636 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
1637 ; RV32-NEXT: vslidedown.vi v24, v8, 16
1638 ; RV32-NEXT: vslidedown.vi v0, v16, 16
1639 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
1640 ; RV32-NEXT: vwaddu.vv v8, v24, v0
1641 ; RV32-NEXT: csrr a0, vlenb
1642 ; RV32-NEXT: slli a0, a0, 3
1643 ; RV32-NEXT: add a0, sp, a0
1644 ; RV32-NEXT: addi a0, a0, 16
1645 ; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
1646 ; RV32-NEXT: addi a0, sp, 16
1647 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
1648 ; RV32-NEXT: vwaddu.vv v0, v8, v16
1649 ; RV32-NEXT: csrr a0, vlenb
1650 ; RV32-NEXT: slli a0, a0, 3
1651 ; RV32-NEXT: add a0, sp, a0
1652 ; RV32-NEXT: addi a0, a0, 16
1653 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
1654 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
1655 ; RV32-NEXT: vadd.vv v8, v0, v8
1656 ; RV32-NEXT: vmv.s.x v16, zero
1657 ; RV32-NEXT: vredsum.vs v8, v8, v16
1658 ; RV32-NEXT: vmv.x.s a0, v8
1659 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
1660 ; RV32-NEXT: vsrl.vx v8, v8, a2
1661 ; RV32-NEXT: vmv.x.s a1, v8
1662 ; RV32-NEXT: csrr a2, vlenb
1663 ; RV32-NEXT: slli a2, a2, 4
1664 ; RV32-NEXT: add sp, sp, a2
1665 ; RV32-NEXT: addi sp, sp, 16
1668 ; RV64-LABEL: vwreduce_uadd_v64i64:
1670 ; RV64-NEXT: addi sp, sp, -16
1671 ; RV64-NEXT: .cfi_def_cfa_offset 16
1672 ; RV64-NEXT: csrr a1, vlenb
1673 ; RV64-NEXT: slli a1, a1, 4
1674 ; RV64-NEXT: sub sp, sp, a1
1675 ; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
1676 ; RV64-NEXT: addi a1, a0, 128
1677 ; RV64-NEXT: li a2, 32
1678 ; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma
1679 ; RV64-NEXT: vle32.v v8, (a0)
1680 ; RV64-NEXT: addi a0, sp, 16
1681 ; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
1682 ; RV64-NEXT: vle32.v v16, (a1)
1683 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma
1684 ; RV64-NEXT: vslidedown.vi v24, v8, 16
1685 ; RV64-NEXT: vslidedown.vi v0, v16, 16
1686 ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
1687 ; RV64-NEXT: vwaddu.vv v8, v24, v0
1688 ; RV64-NEXT: csrr a0, vlenb
1689 ; RV64-NEXT: slli a0, a0, 3
1690 ; RV64-NEXT: add a0, sp, a0
1691 ; RV64-NEXT: addi a0, a0, 16
1692 ; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
1693 ; RV64-NEXT: addi a0, sp, 16
1694 ; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
1695 ; RV64-NEXT: vwaddu.vv v0, v8, v16
1696 ; RV64-NEXT: csrr a0, vlenb
1697 ; RV64-NEXT: slli a0, a0, 3
1698 ; RV64-NEXT: add a0, sp, a0
1699 ; RV64-NEXT: addi a0, a0, 16
1700 ; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
1701 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
1702 ; RV64-NEXT: vadd.vv v8, v0, v8
1703 ; RV64-NEXT: vmv.s.x v16, zero
1704 ; RV64-NEXT: vredsum.vs v8, v8, v16
1705 ; RV64-NEXT: vmv.x.s a0, v8
1706 ; RV64-NEXT: csrr a1, vlenb
1707 ; RV64-NEXT: slli a1, a1, 4
1708 ; RV64-NEXT: add sp, sp, a1
1709 ; RV64-NEXT: addi sp, sp, 16
1711 %v = load <64 x i32>, ptr %x
1712 %e = zext <64 x i32> %v to <64 x i64>
1713 %red = call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> %e)
1717 declare i8 @llvm.vector.reduce.and.v1i8(<1 x i8>)
1719 define i8 @vreduce_and_v1i8(ptr %x) {
1720 ; CHECK-LABEL: vreduce_and_v1i8:
1722 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
1723 ; CHECK-NEXT: vle8.v v8, (a0)
1724 ; CHECK-NEXT: vmv.x.s a0, v8
1726 %v = load <1 x i8>, ptr %x
1727 %red = call i8 @llvm.vector.reduce.and.v1i8(<1 x i8> %v)
1731 declare i8 @llvm.vector.reduce.and.v2i8(<2 x i8>)
1733 define i8 @vreduce_and_v2i8(ptr %x) {
1734 ; CHECK-LABEL: vreduce_and_v2i8:
1736 ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
1737 ; CHECK-NEXT: vle8.v v8, (a0)
1738 ; CHECK-NEXT: vredand.vs v8, v8, v8
1739 ; CHECK-NEXT: vmv.x.s a0, v8
1741 %v = load <2 x i8>, ptr %x
1742 %red = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> %v)
1746 declare i8 @llvm.vector.reduce.and.v4i8(<4 x i8>)
1748 define i8 @vreduce_and_v4i8(ptr %x) {
1749 ; CHECK-LABEL: vreduce_and_v4i8:
1751 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
1752 ; CHECK-NEXT: vle8.v v8, (a0)
1753 ; CHECK-NEXT: vredand.vs v8, v8, v8
1754 ; CHECK-NEXT: vmv.x.s a0, v8
1756 %v = load <4 x i8>, ptr %x
1757 %red = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> %v)
1761 declare i8 @llvm.vector.reduce.and.v8i8(<8 x i8>)
1763 define i8 @vreduce_and_v8i8(ptr %x) {
1764 ; CHECK-LABEL: vreduce_and_v8i8:
1766 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
1767 ; CHECK-NEXT: vle8.v v8, (a0)
1768 ; CHECK-NEXT: vredand.vs v8, v8, v8
1769 ; CHECK-NEXT: vmv.x.s a0, v8
1771 %v = load <8 x i8>, ptr %x
1772 %red = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %v)
1776 declare i8 @llvm.vector.reduce.and.v16i8(<16 x i8>)
1778 define i8 @vreduce_and_v16i8(ptr %x) {
1779 ; CHECK-LABEL: vreduce_and_v16i8:
1781 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
1782 ; CHECK-NEXT: vle8.v v8, (a0)
1783 ; CHECK-NEXT: vredand.vs v8, v8, v8
1784 ; CHECK-NEXT: vmv.x.s a0, v8
1786 %v = load <16 x i8>, ptr %x
1787 %red = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %v)
1791 declare i8 @llvm.vector.reduce.and.v32i8(<32 x i8>)
1793 define i8 @vreduce_and_v32i8(ptr %x) {
1794 ; CHECK-LABEL: vreduce_and_v32i8:
1796 ; CHECK-NEXT: li a1, 32
1797 ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
1798 ; CHECK-NEXT: vle8.v v8, (a0)
1799 ; CHECK-NEXT: vredand.vs v8, v8, v8
1800 ; CHECK-NEXT: vmv.x.s a0, v8
1802 %v = load <32 x i8>, ptr %x
1803 %red = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %v)
1807 declare i8 @llvm.vector.reduce.and.v64i8(<64 x i8>)
1809 define i8 @vreduce_and_v64i8(ptr %x) {
1810 ; CHECK-LABEL: vreduce_and_v64i8:
1812 ; CHECK-NEXT: li a1, 64
1813 ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
1814 ; CHECK-NEXT: vle8.v v8, (a0)
1815 ; CHECK-NEXT: vredand.vs v8, v8, v8
1816 ; CHECK-NEXT: vmv.x.s a0, v8
1818 %v = load <64 x i8>, ptr %x
1819 %red = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> %v)
1823 declare i8 @llvm.vector.reduce.and.v128i8(<128 x i8>)
1825 define i8 @vreduce_and_v128i8(ptr %x) {
1826 ; CHECK-LABEL: vreduce_and_v128i8:
1828 ; CHECK-NEXT: li a1, 128
1829 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
1830 ; CHECK-NEXT: vle8.v v8, (a0)
1831 ; CHECK-NEXT: vredand.vs v8, v8, v8
1832 ; CHECK-NEXT: vmv.x.s a0, v8
1834 %v = load <128 x i8>, ptr %x
1835 %red = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> %v)
1839 declare i8 @llvm.vector.reduce.and.v256i8(<256 x i8>)
1841 define i8 @vreduce_and_v256i8(ptr %x) {
1842 ; CHECK-LABEL: vreduce_and_v256i8:
1844 ; CHECK-NEXT: li a1, 128
1845 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
1846 ; CHECK-NEXT: vle8.v v8, (a0)
1847 ; CHECK-NEXT: addi a0, a0, 128
1848 ; CHECK-NEXT: vle8.v v16, (a0)
1849 ; CHECK-NEXT: vand.vv v8, v8, v16
1850 ; CHECK-NEXT: vredand.vs v8, v8, v8
1851 ; CHECK-NEXT: vmv.x.s a0, v8
1853 %v = load <256 x i8>, ptr %x
1854 %red = call i8 @llvm.vector.reduce.and.v256i8(<256 x i8> %v)
1858 declare i16 @llvm.vector.reduce.and.v1i16(<1 x i16>)
1860 define i16 @vreduce_and_v1i16(ptr %x) {
1861 ; CHECK-LABEL: vreduce_and_v1i16:
1863 ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
1864 ; CHECK-NEXT: vle16.v v8, (a0)
1865 ; CHECK-NEXT: vmv.x.s a0, v8
1867 %v = load <1 x i16>, ptr %x
1868 %red = call i16 @llvm.vector.reduce.and.v1i16(<1 x i16> %v)
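; Note: the <1 x ...> cases need no reduction instruction at all; element 0 is
; simply read out with vmv.x.s.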
1872 declare i16 @llvm.vector.reduce.and.v2i16(<2 x i16>)
1874 define i16 @vreduce_and_v2i16(ptr %x) {
1875 ; CHECK-LABEL: vreduce_and_v2i16:
1877 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
1878 ; CHECK-NEXT: vle16.v v8, (a0)
1879 ; CHECK-NEXT: vredand.vs v8, v8, v8
1880 ; CHECK-NEXT: vmv.x.s a0, v8
1882 %v = load <2 x i16>, ptr %x
1883 %red = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> %v)
1887 declare i16 @llvm.vector.reduce.and.v4i16(<4 x i16>)
1889 define i16 @vreduce_and_v4i16(ptr %x) {
1890 ; CHECK-LABEL: vreduce_and_v4i16:
1892 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
1893 ; CHECK-NEXT: vle16.v v8, (a0)
1894 ; CHECK-NEXT: vredand.vs v8, v8, v8
1895 ; CHECK-NEXT: vmv.x.s a0, v8
1897 %v = load <4 x i16>, ptr %x
1898 %red = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %v)
1902 declare i16 @llvm.vector.reduce.and.v8i16(<8 x i16>)
1904 define i16 @vreduce_and_v8i16(ptr %x) {
1905 ; CHECK-LABEL: vreduce_and_v8i16:
1907 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
1908 ; CHECK-NEXT: vle16.v v8, (a0)
1909 ; CHECK-NEXT: vredand.vs v8, v8, v8
1910 ; CHECK-NEXT: vmv.x.s a0, v8
1912 %v = load <8 x i16>, ptr %x
1913 %red = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %v)
1917 declare i16 @llvm.vector.reduce.and.v16i16(<16 x i16>)
1919 define i16 @vreduce_and_v16i16(ptr %x) {
1920 ; CHECK-LABEL: vreduce_and_v16i16:
1922 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
1923 ; CHECK-NEXT: vle16.v v8, (a0)
1924 ; CHECK-NEXT: vredand.vs v8, v8, v8
1925 ; CHECK-NEXT: vmv.x.s a0, v8
1927 %v = load <16 x i16>, ptr %x
1928 %red = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %v)
1932 declare i16 @llvm.vector.reduce.and.v32i16(<32 x i16>)
1934 define i16 @vreduce_and_v32i16(ptr %x) {
1935 ; CHECK-LABEL: vreduce_and_v32i16:
1937 ; CHECK-NEXT: li a1, 32
1938 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
1939 ; CHECK-NEXT: vle16.v v8, (a0)
1940 ; CHECK-NEXT: vredand.vs v8, v8, v8
1941 ; CHECK-NEXT: vmv.x.s a0, v8
1943 %v = load <32 x i16>, ptr %x
1944 %red = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> %v)
1948 declare i16 @llvm.vector.reduce.and.v64i16(<64 x i16>)
1950 define i16 @vreduce_and_v64i16(ptr %x) {
1951 ; CHECK-LABEL: vreduce_and_v64i16:
1953 ; CHECK-NEXT: li a1, 64
1954 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
1955 ; CHECK-NEXT: vle16.v v8, (a0)
1956 ; CHECK-NEXT: vredand.vs v8, v8, v8
1957 ; CHECK-NEXT: vmv.x.s a0, v8
1959 %v = load <64 x i16>, ptr %x
1960 %red = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> %v)
1964 declare i16 @llvm.vector.reduce.and.v128i16(<128 x i16>)
1966 define i16 @vreduce_and_v128i16(ptr %x) {
1967 ; CHECK-LABEL: vreduce_and_v128i16:
1969 ; CHECK-NEXT: li a1, 64
1970 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
1971 ; CHECK-NEXT: vle16.v v8, (a0)
1972 ; CHECK-NEXT: addi a0, a0, 128
1973 ; CHECK-NEXT: vle16.v v16, (a0)
1974 ; CHECK-NEXT: vand.vv v8, v8, v16
1975 ; CHECK-NEXT: vredand.vs v8, v8, v8
1976 ; CHECK-NEXT: vmv.x.s a0, v8
1978 %v = load <128 x i16>, ptr %x
1979 %red = call i16 @llvm.vector.reduce.and.v128i16(<128 x i16> %v)
1983 declare i32 @llvm.vector.reduce.and.v1i32(<1 x i32>)
1985 define i32 @vreduce_and_v1i32(ptr %x) {
1986 ; CHECK-LABEL: vreduce_and_v1i32:
1988 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
1989 ; CHECK-NEXT: vle32.v v8, (a0)
1990 ; CHECK-NEXT: vmv.x.s a0, v8
1992 %v = load <1 x i32>, ptr %x
1993 %red = call i32 @llvm.vector.reduce.and.v1i32(<1 x i32> %v)
1997 declare i32 @llvm.vector.reduce.and.v2i32(<2 x i32>)
1999 define i32 @vreduce_and_v2i32(ptr %x) {
2000 ; CHECK-LABEL: vreduce_and_v2i32:
2002 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
2003 ; CHECK-NEXT: vle32.v v8, (a0)
2004 ; CHECK-NEXT: vredand.vs v8, v8, v8
2005 ; CHECK-NEXT: vmv.x.s a0, v8
2007 %v = load <2 x i32>, ptr %x
2008 %red = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %v)
2012 declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>)
2014 define i32 @vreduce_and_v4i32(ptr %x) {
2015 ; CHECK-LABEL: vreduce_and_v4i32:
2017 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
2018 ; CHECK-NEXT: vle32.v v8, (a0)
2019 ; CHECK-NEXT: vredand.vs v8, v8, v8
2020 ; CHECK-NEXT: vmv.x.s a0, v8
2022 %v = load <4 x i32>, ptr %x
2023 %red = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %v)
2027 declare i32 @llvm.vector.reduce.and.v8i32(<8 x i32>)
2029 define i32 @vreduce_and_v8i32(ptr %x) {
2030 ; CHECK-LABEL: vreduce_and_v8i32:
2032 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
2033 ; CHECK-NEXT: vle32.v v8, (a0)
2034 ; CHECK-NEXT: vredand.vs v8, v8, v8
2035 ; CHECK-NEXT: vmv.x.s a0, v8
2037 %v = load <8 x i32>, ptr %x
2038 %red = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %v)
2042 declare i32 @llvm.vector.reduce.and.v16i32(<16 x i32>)
2044 define i32 @vreduce_and_v16i32(ptr %x) {
2045 ; CHECK-LABEL: vreduce_and_v16i32:
2047 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
2048 ; CHECK-NEXT: vle32.v v8, (a0)
2049 ; CHECK-NEXT: vredand.vs v8, v8, v8
2050 ; CHECK-NEXT: vmv.x.s a0, v8
2052 %v = load <16 x i32>, ptr %x
2053 %red = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %v)
2057 declare i32 @llvm.vector.reduce.and.v32i32(<32 x i32>)
2059 define i32 @vreduce_and_v32i32(ptr %x) {
2060 ; CHECK-LABEL: vreduce_and_v32i32:
2062 ; CHECK-NEXT: li a1, 32
2063 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
2064 ; CHECK-NEXT: vle32.v v8, (a0)
2065 ; CHECK-NEXT: vredand.vs v8, v8, v8
2066 ; CHECK-NEXT: vmv.x.s a0, v8
2068 %v = load <32 x i32>, ptr %x
2069 %red = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> %v)
2073 declare i32 @llvm.vector.reduce.and.v64i32(<64 x i32>)
2075 define i32 @vreduce_and_v64i32(ptr %x) {
2076 ; CHECK-LABEL: vreduce_and_v64i32:
2078 ; CHECK-NEXT: li a1, 32
2079 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
2080 ; CHECK-NEXT: vle32.v v8, (a0)
2081 ; CHECK-NEXT: addi a0, a0, 128
2082 ; CHECK-NEXT: vle32.v v16, (a0)
2083 ; CHECK-NEXT: vand.vv v8, v8, v16
2084 ; CHECK-NEXT: vredand.vs v8, v8, v8
2085 ; CHECK-NEXT: vmv.x.s a0, v8
2087 %v = load <64 x i32>, ptr %x
2088 %red = call i32 @llvm.vector.reduce.and.v64i32(<64 x i32> %v)
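; Note: in the i64 tests that follow, rv32 (ILP32) returns an i64 in the a0/a1
; register pair, so the RV32 lowering extracts the low word with vmv.x.s and the
; high word with a 32-bit vsrl.vx plus a second vmv.x.s; the caller effectively
; reassembles ((i64)a1 << 32) | a0. RV64 returns the whole value in a0.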
2092 declare i64 @llvm.vector.reduce.and.v1i64(<1 x i64>)
2094 define i64 @vreduce_and_v1i64(ptr %x) {
2095 ; RV32-LABEL: vreduce_and_v1i64:
2097 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
2098 ; RV32-NEXT: vle64.v v8, (a0)
2099 ; RV32-NEXT: li a0, 32
2100 ; RV32-NEXT: vsrl.vx v9, v8, a0
2101 ; RV32-NEXT: vmv.x.s a1, v9
2102 ; RV32-NEXT: vmv.x.s a0, v8
2105 ; RV64-LABEL: vreduce_and_v1i64:
2107 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
2108 ; RV64-NEXT: vle64.v v8, (a0)
2109 ; RV64-NEXT: vmv.x.s a0, v8
2111 %v = load <1 x i64>, ptr %x
2112 %red = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> %v)
2116 declare i64 @llvm.vector.reduce.and.v2i64(<2 x i64>)
2118 define i64 @vreduce_and_v2i64(ptr %x) {
2119 ; RV32-LABEL: vreduce_and_v2i64:
2121 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
2122 ; RV32-NEXT: vle64.v v8, (a0)
2123 ; RV32-NEXT: vredand.vs v8, v8, v8
2124 ; RV32-NEXT: li a0, 32
2125 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
2126 ; RV32-NEXT: vsrl.vx v9, v8, a0
2127 ; RV32-NEXT: vmv.x.s a1, v9
2128 ; RV32-NEXT: vmv.x.s a0, v8
2131 ; RV64-LABEL: vreduce_and_v2i64:
2133 ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
2134 ; RV64-NEXT: vle64.v v8, (a0)
2135 ; RV64-NEXT: vredand.vs v8, v8, v8
2136 ; RV64-NEXT: vmv.x.s a0, v8
2138 %v = load <2 x i64>, ptr %x
2139 %red = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %v)
2143 declare i64 @llvm.vector.reduce.and.v4i64(<4 x i64>)
2145 define i64 @vreduce_and_v4i64(ptr %x) {
2146 ; RV32-LABEL: vreduce_and_v4i64:
2148 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
2149 ; RV32-NEXT: vle64.v v8, (a0)
2150 ; RV32-NEXT: vredand.vs v8, v8, v8
2151 ; RV32-NEXT: vmv.x.s a0, v8
2152 ; RV32-NEXT: li a1, 32
2153 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
2154 ; RV32-NEXT: vsrl.vx v8, v8, a1
2155 ; RV32-NEXT: vmv.x.s a1, v8
2158 ; RV64-LABEL: vreduce_and_v4i64:
2160 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
2161 ; RV64-NEXT: vle64.v v8, (a0)
2162 ; RV64-NEXT: vredand.vs v8, v8, v8
2163 ; RV64-NEXT: vmv.x.s a0, v8
2165 %v = load <4 x i64>, ptr %x
2166 %red = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %v)
2170 declare i64 @llvm.vector.reduce.and.v8i64(<8 x i64>)
2172 define i64 @vreduce_and_v8i64(ptr %x) {
2173 ; RV32-LABEL: vreduce_and_v8i64:
2175 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
2176 ; RV32-NEXT: vle64.v v8, (a0)
2177 ; RV32-NEXT: vredand.vs v8, v8, v8
2178 ; RV32-NEXT: vmv.x.s a0, v8
2179 ; RV32-NEXT: li a1, 32
2180 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
2181 ; RV32-NEXT: vsrl.vx v8, v8, a1
2182 ; RV32-NEXT: vmv.x.s a1, v8
2185 ; RV64-LABEL: vreduce_and_v8i64:
2187 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
2188 ; RV64-NEXT: vle64.v v8, (a0)
2189 ; RV64-NEXT: vredand.vs v8, v8, v8
2190 ; RV64-NEXT: vmv.x.s a0, v8
2192 %v = load <8 x i64>, ptr %x
2193 %red = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> %v)
2197 declare i64 @llvm.vector.reduce.and.v16i64(<16 x i64>)
2199 define i64 @vreduce_and_v16i64(ptr %x) {
2200 ; RV32-LABEL: vreduce_and_v16i64:
2202 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
2203 ; RV32-NEXT: vle64.v v8, (a0)
2204 ; RV32-NEXT: vredand.vs v8, v8, v8
2205 ; RV32-NEXT: vmv.x.s a0, v8
2206 ; RV32-NEXT: li a1, 32
2207 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
2208 ; RV32-NEXT: vsrl.vx v8, v8, a1
2209 ; RV32-NEXT: vmv.x.s a1, v8
2212 ; RV64-LABEL: vreduce_and_v16i64:
2214 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
2215 ; RV64-NEXT: vle64.v v8, (a0)
2216 ; RV64-NEXT: vredand.vs v8, v8, v8
2217 ; RV64-NEXT: vmv.x.s a0, v8
2219 %v = load <16 x i64>, ptr %x
2220 %red = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> %v)
2224 declare i64 @llvm.vector.reduce.and.v32i64(<32 x i64>)
2226 define i64 @vreduce_and_v32i64(ptr %x) {
2227 ; RV32-LABEL: vreduce_and_v32i64:
2229 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
2230 ; RV32-NEXT: vle64.v v8, (a0)
2231 ; RV32-NEXT: addi a0, a0, 128
2232 ; RV32-NEXT: vle64.v v16, (a0)
2233 ; RV32-NEXT: vand.vv v8, v8, v16
2234 ; RV32-NEXT: vredand.vs v8, v8, v8
2235 ; RV32-NEXT: vmv.x.s a0, v8
2236 ; RV32-NEXT: li a1, 32
2237 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
2238 ; RV32-NEXT: vsrl.vx v8, v8, a1
2239 ; RV32-NEXT: vmv.x.s a1, v8
2242 ; RV64-LABEL: vreduce_and_v32i64:
2244 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
2245 ; RV64-NEXT: vle64.v v8, (a0)
2246 ; RV64-NEXT: addi a0, a0, 128
2247 ; RV64-NEXT: vle64.v v16, (a0)
2248 ; RV64-NEXT: vand.vv v8, v8, v16
2249 ; RV64-NEXT: vredand.vs v8, v8, v8
2250 ; RV64-NEXT: vmv.x.s a0, v8
2252 %v = load <32 x i64>, ptr %x
2253 %red = call i64 @llvm.vector.reduce.and.v32i64(<32 x i64> %v)
2257 declare i64 @llvm.vector.reduce.and.v64i64(<64 x i64>)
2259 define i64 @vreduce_and_v64i64(ptr %x) nounwind {
2260 ; RV32-LABEL: vreduce_and_v64i64:
2262 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
2263 ; RV32-NEXT: vle64.v v8, (a0)
2264 ; RV32-NEXT: addi a1, a0, 384
2265 ; RV32-NEXT: vle64.v v16, (a1)
2266 ; RV32-NEXT: addi a1, a0, 256
2267 ; RV32-NEXT: addi a0, a0, 128
2268 ; RV32-NEXT: vle64.v v24, (a0)
2269 ; RV32-NEXT: vle64.v v0, (a1)
2270 ; RV32-NEXT: vand.vv v16, v24, v16
2271 ; RV32-NEXT: vand.vv v8, v8, v0
2272 ; RV32-NEXT: vand.vv v8, v8, v16
2273 ; RV32-NEXT: vredand.vs v8, v8, v8
2274 ; RV32-NEXT: vmv.x.s a0, v8
2275 ; RV32-NEXT: li a1, 32
2276 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
2277 ; RV32-NEXT: vsrl.vx v8, v8, a1
2278 ; RV32-NEXT: vmv.x.s a1, v8
2281 ; RV64-LABEL: vreduce_and_v64i64:
2283 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
2284 ; RV64-NEXT: vle64.v v8, (a0)
2285 ; RV64-NEXT: addi a1, a0, 256
2286 ; RV64-NEXT: addi a2, a0, 384
2287 ; RV64-NEXT: vle64.v v16, (a2)
2288 ; RV64-NEXT: addi a0, a0, 128
2289 ; RV64-NEXT: vle64.v v24, (a0)
2290 ; RV64-NEXT: vle64.v v0, (a1)
2291 ; RV64-NEXT: vand.vv v16, v24, v16
2292 ; RV64-NEXT: vand.vv v8, v8, v0
2293 ; RV64-NEXT: vand.vv v8, v8, v16
2294 ; RV64-NEXT: vredand.vs v8, v8, v8
2295 ; RV64-NEXT: vmv.x.s a0, v8
2297 %v = load <64 x i64>, ptr %x
2298 %red = call i64 @llvm.vector.reduce.and.v64i64(<64 x i64> %v)
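; The OR reductions below mirror the AND tests: OR is idempotent, so vredor.vs can
; reuse the source vector as its scalar operand, and inputs wider than one LMUL=8
; group are pre-combined with vor.vv.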
2302 declare i8 @llvm.vector.reduce.or.v1i8(<1 x i8>)
2304 define i8 @vreduce_or_v1i8(ptr %x) {
2305 ; CHECK-LABEL: vreduce_or_v1i8:
2307 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
2308 ; CHECK-NEXT: vle8.v v8, (a0)
2309 ; CHECK-NEXT: vmv.x.s a0, v8
2311 %v = load <1 x i8>, ptr %x
2312 %red = call i8 @llvm.vector.reduce.or.v1i8(<1 x i8> %v)
2316 declare i8 @llvm.vector.reduce.or.v2i8(<2 x i8>)
2318 define i8 @vreduce_or_v2i8(ptr %x) {
2319 ; CHECK-LABEL: vreduce_or_v2i8:
2321 ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
2322 ; CHECK-NEXT: vle8.v v8, (a0)
2323 ; CHECK-NEXT: vredor.vs v8, v8, v8
2324 ; CHECK-NEXT: vmv.x.s a0, v8
2326 %v = load <2 x i8>, ptr %x
2327 %red = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> %v)
2331 declare i8 @llvm.vector.reduce.or.v4i8(<4 x i8>)
2333 define i8 @vreduce_or_v4i8(ptr %x) {
2334 ; CHECK-LABEL: vreduce_or_v4i8:
2336 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
2337 ; CHECK-NEXT: vle8.v v8, (a0)
2338 ; CHECK-NEXT: vredor.vs v8, v8, v8
2339 ; CHECK-NEXT: vmv.x.s a0, v8
2341 %v = load <4 x i8>, ptr %x
2342 %red = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> %v)
2346 declare i8 @llvm.vector.reduce.or.v8i8(<8 x i8>)
2348 define i8 @vreduce_or_v8i8(ptr %x) {
2349 ; CHECK-LABEL: vreduce_or_v8i8:
2351 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
2352 ; CHECK-NEXT: vle8.v v8, (a0)
2353 ; CHECK-NEXT: vredor.vs v8, v8, v8
2354 ; CHECK-NEXT: vmv.x.s a0, v8
2356 %v = load <8 x i8>, ptr %x
2357 %red = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %v)
2361 declare i8 @llvm.vector.reduce.or.v16i8(<16 x i8>)
2363 define i8 @vreduce_or_v16i8(ptr %x) {
2364 ; CHECK-LABEL: vreduce_or_v16i8:
2366 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
2367 ; CHECK-NEXT: vle8.v v8, (a0)
2368 ; CHECK-NEXT: vredor.vs v8, v8, v8
2369 ; CHECK-NEXT: vmv.x.s a0, v8
2371 %v = load <16 x i8>, ptr %x
2372 %red = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %v)
2376 declare i8 @llvm.vector.reduce.or.v32i8(<32 x i8>)
2378 define i8 @vreduce_or_v32i8(ptr %x) {
2379 ; CHECK-LABEL: vreduce_or_v32i8:
2381 ; CHECK-NEXT: li a1, 32
2382 ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
2383 ; CHECK-NEXT: vle8.v v8, (a0)
2384 ; CHECK-NEXT: vredor.vs v8, v8, v8
2385 ; CHECK-NEXT: vmv.x.s a0, v8
2387 %v = load <32 x i8>, ptr %x
2388 %red = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %v)
2392 declare i8 @llvm.vector.reduce.or.v64i8(<64 x i8>)
2394 define i8 @vreduce_or_v64i8(ptr %x) {
2395 ; CHECK-LABEL: vreduce_or_v64i8:
2397 ; CHECK-NEXT: li a1, 64
2398 ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
2399 ; CHECK-NEXT: vle8.v v8, (a0)
2400 ; CHECK-NEXT: vredor.vs v8, v8, v8
2401 ; CHECK-NEXT: vmv.x.s a0, v8
2403 %v = load <64 x i8>, ptr %x
2404 %red = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> %v)
2408 declare i8 @llvm.vector.reduce.or.v128i8(<128 x i8>)
2410 define i8 @vreduce_or_v128i8(ptr %x) {
2411 ; CHECK-LABEL: vreduce_or_v128i8:
2413 ; CHECK-NEXT: li a1, 128
2414 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
2415 ; CHECK-NEXT: vle8.v v8, (a0)
2416 ; CHECK-NEXT: vredor.vs v8, v8, v8
2417 ; CHECK-NEXT: vmv.x.s a0, v8
2419 %v = load <128 x i8>, ptr %x
2420 %red = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> %v)
2424 declare i8 @llvm.vector.reduce.or.v256i8(<256 x i8>)
2426 define i8 @vreduce_or_v256i8(ptr %x) {
2427 ; CHECK-LABEL: vreduce_or_v256i8:
2429 ; CHECK-NEXT: li a1, 128
2430 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
2431 ; CHECK-NEXT: vle8.v v8, (a0)
2432 ; CHECK-NEXT: addi a0, a0, 128
2433 ; CHECK-NEXT: vle8.v v16, (a0)
2434 ; CHECK-NEXT: vor.vv v8, v8, v16
2435 ; CHECK-NEXT: vredor.vs v8, v8, v8
2436 ; CHECK-NEXT: vmv.x.s a0, v8
2438 %v = load <256 x i8>, ptr %x
2439 %red = call i8 @llvm.vector.reduce.or.v256i8(<256 x i8> %v)
2443 declare i16 @llvm.vector.reduce.or.v1i16(<1 x i16>)
2445 define i16 @vreduce_or_v1i16(ptr %x) {
2446 ; CHECK-LABEL: vreduce_or_v1i16:
2448 ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
2449 ; CHECK-NEXT: vle16.v v8, (a0)
2450 ; CHECK-NEXT: vmv.x.s a0, v8
2452 %v = load <1 x i16>, ptr %x
2453 %red = call i16 @llvm.vector.reduce.or.v1i16(<1 x i16> %v)
2457 declare i16 @llvm.vector.reduce.or.v2i16(<2 x i16>)
2459 define i16 @vreduce_or_v2i16(ptr %x) {
2460 ; CHECK-LABEL: vreduce_or_v2i16:
2462 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
2463 ; CHECK-NEXT: vle16.v v8, (a0)
2464 ; CHECK-NEXT: vredor.vs v8, v8, v8
2465 ; CHECK-NEXT: vmv.x.s a0, v8
2467 %v = load <2 x i16>, ptr %x
2468 %red = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> %v)
2472 declare i16 @llvm.vector.reduce.or.v4i16(<4 x i16>)
2474 define i16 @vreduce_or_v4i16(ptr %x) {
2475 ; CHECK-LABEL: vreduce_or_v4i16:
2477 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
2478 ; CHECK-NEXT: vle16.v v8, (a0)
2479 ; CHECK-NEXT: vredor.vs v8, v8, v8
2480 ; CHECK-NEXT: vmv.x.s a0, v8
2482 %v = load <4 x i16>, ptr %x
2483 %red = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %v)
2487 declare i16 @llvm.vector.reduce.or.v8i16(<8 x i16>)
2489 define i16 @vreduce_or_v8i16(ptr %x) {
2490 ; CHECK-LABEL: vreduce_or_v8i16:
2492 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
2493 ; CHECK-NEXT: vle16.v v8, (a0)
2494 ; CHECK-NEXT: vredor.vs v8, v8, v8
2495 ; CHECK-NEXT: vmv.x.s a0, v8
2497 %v = load <8 x i16>, ptr %x
2498 %red = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %v)
2502 declare i16 @llvm.vector.reduce.or.v16i16(<16 x i16>)
2504 define i16 @vreduce_or_v16i16(ptr %x) {
2505 ; CHECK-LABEL: vreduce_or_v16i16:
2507 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
2508 ; CHECK-NEXT: vle16.v v8, (a0)
2509 ; CHECK-NEXT: vredor.vs v8, v8, v8
2510 ; CHECK-NEXT: vmv.x.s a0, v8
2512 %v = load <16 x i16>, ptr %x
2513 %red = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %v)
2517 declare i16 @llvm.vector.reduce.or.v32i16(<32 x i16>)
2519 define i16 @vreduce_or_v32i16(ptr %x) {
2520 ; CHECK-LABEL: vreduce_or_v32i16:
2522 ; CHECK-NEXT: li a1, 32
2523 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
2524 ; CHECK-NEXT: vle16.v v8, (a0)
2525 ; CHECK-NEXT: vredor.vs v8, v8, v8
2526 ; CHECK-NEXT: vmv.x.s a0, v8
2528 %v = load <32 x i16>, ptr %x
2529 %red = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> %v)
2533 declare i16 @llvm.vector.reduce.or.v64i16(<64 x i16>)
2535 define i16 @vreduce_or_v64i16(ptr %x) {
2536 ; CHECK-LABEL: vreduce_or_v64i16:
2538 ; CHECK-NEXT: li a1, 64
2539 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
2540 ; CHECK-NEXT: vle16.v v8, (a0)
2541 ; CHECK-NEXT: vredor.vs v8, v8, v8
2542 ; CHECK-NEXT: vmv.x.s a0, v8
2544 %v = load <64 x i16>, ptr %x
2545 %red = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> %v)
2549 declare i16 @llvm.vector.reduce.or.v128i16(<128 x i16>)
2551 define i16 @vreduce_or_v128i16(ptr %x) {
2552 ; CHECK-LABEL: vreduce_or_v128i16:
2554 ; CHECK-NEXT: li a1, 64
2555 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
2556 ; CHECK-NEXT: vle16.v v8, (a0)
2557 ; CHECK-NEXT: addi a0, a0, 128
2558 ; CHECK-NEXT: vle16.v v16, (a0)
2559 ; CHECK-NEXT: vor.vv v8, v8, v16
2560 ; CHECK-NEXT: vredor.vs v8, v8, v8
2561 ; CHECK-NEXT: vmv.x.s a0, v8
2563 %v = load <128 x i16>, ptr %x
2564 %red = call i16 @llvm.vector.reduce.or.v128i16(<128 x i16> %v)
2568 declare i32 @llvm.vector.reduce.or.v1i32(<1 x i32>)
2570 define i32 @vreduce_or_v1i32(ptr %x) {
2571 ; CHECK-LABEL: vreduce_or_v1i32:
2573 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
2574 ; CHECK-NEXT: vle32.v v8, (a0)
2575 ; CHECK-NEXT: vmv.x.s a0, v8
2577 %v = load <1 x i32>, ptr %x
2578 %red = call i32 @llvm.vector.reduce.or.v1i32(<1 x i32> %v)
2582 declare i32 @llvm.vector.reduce.or.v2i32(<2 x i32>)
2584 define i32 @vreduce_or_v2i32(ptr %x) {
2585 ; CHECK-LABEL: vreduce_or_v2i32:
2587 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
2588 ; CHECK-NEXT: vle32.v v8, (a0)
2589 ; CHECK-NEXT: vredor.vs v8, v8, v8
2590 ; CHECK-NEXT: vmv.x.s a0, v8
2592 %v = load <2 x i32>, ptr %x
2593 %red = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %v)
2597 declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>)
2599 define i32 @vreduce_or_v4i32(ptr %x) {
2600 ; CHECK-LABEL: vreduce_or_v4i32:
2602 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
2603 ; CHECK-NEXT: vle32.v v8, (a0)
2604 ; CHECK-NEXT: vredor.vs v8, v8, v8
2605 ; CHECK-NEXT: vmv.x.s a0, v8
2607 %v = load <4 x i32>, ptr %x
2608 %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %v)
2612 declare i32 @llvm.vector.reduce.or.v8i32(<8 x i32>)
2614 define i32 @vreduce_or_v8i32(ptr %x) {
2615 ; CHECK-LABEL: vreduce_or_v8i32:
2617 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
2618 ; CHECK-NEXT: vle32.v v8, (a0)
2619 ; CHECK-NEXT: vredor.vs v8, v8, v8
2620 ; CHECK-NEXT: vmv.x.s a0, v8
2622 %v = load <8 x i32>, ptr %x
2623 %red = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %v)
2627 declare i32 @llvm.vector.reduce.or.v16i32(<16 x i32>)
2629 define i32 @vreduce_or_v16i32(ptr %x) {
2630 ; CHECK-LABEL: vreduce_or_v16i32:
2632 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
2633 ; CHECK-NEXT: vle32.v v8, (a0)
2634 ; CHECK-NEXT: vredor.vs v8, v8, v8
2635 ; CHECK-NEXT: vmv.x.s a0, v8
2637 %v = load <16 x i32>, ptr %x
2638 %red = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %v)
2642 declare i32 @llvm.vector.reduce.or.v32i32(<32 x i32>)
2644 define i32 @vreduce_or_v32i32(ptr %x) {
2645 ; CHECK-LABEL: vreduce_or_v32i32:
2647 ; CHECK-NEXT: li a1, 32
2648 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
2649 ; CHECK-NEXT: vle32.v v8, (a0)
2650 ; CHECK-NEXT: vredor.vs v8, v8, v8
2651 ; CHECK-NEXT: vmv.x.s a0, v8
2653 %v = load <32 x i32>, ptr %x
2654 %red = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> %v)
2658 declare i32 @llvm.vector.reduce.or.v64i32(<64 x i32>)
2660 define i32 @vreduce_or_v64i32(ptr %x) {
2661 ; CHECK-LABEL: vreduce_or_v64i32:
2663 ; CHECK-NEXT: li a1, 32
2664 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
2665 ; CHECK-NEXT: vle32.v v8, (a0)
2666 ; CHECK-NEXT: addi a0, a0, 128
2667 ; CHECK-NEXT: vle32.v v16, (a0)
2668 ; CHECK-NEXT: vor.vv v8, v8, v16
2669 ; CHECK-NEXT: vredor.vs v8, v8, v8
2670 ; CHECK-NEXT: vmv.x.s a0, v8
2672 %v = load <64 x i32>, ptr %x
2673 %red = call i32 @llvm.vector.reduce.or.v64i32(<64 x i32> %v)
2677 declare i64 @llvm.vector.reduce.or.v1i64(<1 x i64>)
2679 define i64 @vreduce_or_v1i64(ptr %x) {
2680 ; RV32-LABEL: vreduce_or_v1i64:
2682 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
2683 ; RV32-NEXT: vle64.v v8, (a0)
2684 ; RV32-NEXT: li a0, 32
2685 ; RV32-NEXT: vsrl.vx v9, v8, a0
2686 ; RV32-NEXT: vmv.x.s a1, v9
2687 ; RV32-NEXT: vmv.x.s a0, v8
2690 ; RV64-LABEL: vreduce_or_v1i64:
2692 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
2693 ; RV64-NEXT: vle64.v v8, (a0)
2694 ; RV64-NEXT: vmv.x.s a0, v8
2696 %v = load <1 x i64>, ptr %x
2697 %red = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> %v)
2701 declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>)
2703 define i64 @vreduce_or_v2i64(ptr %x) {
2704 ; RV32-LABEL: vreduce_or_v2i64:
2706 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
2707 ; RV32-NEXT: vle64.v v8, (a0)
2708 ; RV32-NEXT: vredor.vs v8, v8, v8
2709 ; RV32-NEXT: li a0, 32
2710 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
2711 ; RV32-NEXT: vsrl.vx v9, v8, a0
2712 ; RV32-NEXT: vmv.x.s a1, v9
2713 ; RV32-NEXT: vmv.x.s a0, v8
2716 ; RV64-LABEL: vreduce_or_v2i64:
2718 ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
2719 ; RV64-NEXT: vle64.v v8, (a0)
2720 ; RV64-NEXT: vredor.vs v8, v8, v8
2721 ; RV64-NEXT: vmv.x.s a0, v8
2723 %v = load <2 x i64>, ptr %x
2724 %red = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %v)
2728 declare i64 @llvm.vector.reduce.or.v4i64(<4 x i64>)
2730 define i64 @vreduce_or_v4i64(ptr %x) {
2731 ; RV32-LABEL: vreduce_or_v4i64:
2733 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
2734 ; RV32-NEXT: vle64.v v8, (a0)
2735 ; RV32-NEXT: vredor.vs v8, v8, v8
2736 ; RV32-NEXT: vmv.x.s a0, v8
2737 ; RV32-NEXT: li a1, 32
2738 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
2739 ; RV32-NEXT: vsrl.vx v8, v8, a1
2740 ; RV32-NEXT: vmv.x.s a1, v8
2743 ; RV64-LABEL: vreduce_or_v4i64:
2745 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
2746 ; RV64-NEXT: vle64.v v8, (a0)
2747 ; RV64-NEXT: vredor.vs v8, v8, v8
2748 ; RV64-NEXT: vmv.x.s a0, v8
2750 %v = load <4 x i64>, ptr %x
2751 %red = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %v)
2755 declare i64 @llvm.vector.reduce.or.v8i64(<8 x i64>)
2757 define i64 @vreduce_or_v8i64(ptr %x) {
2758 ; RV32-LABEL: vreduce_or_v8i64:
2760 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
2761 ; RV32-NEXT: vle64.v v8, (a0)
2762 ; RV32-NEXT: vredor.vs v8, v8, v8
2763 ; RV32-NEXT: vmv.x.s a0, v8
2764 ; RV32-NEXT: li a1, 32
2765 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
2766 ; RV32-NEXT: vsrl.vx v8, v8, a1
2767 ; RV32-NEXT: vmv.x.s a1, v8
2770 ; RV64-LABEL: vreduce_or_v8i64:
2772 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
2773 ; RV64-NEXT: vle64.v v8, (a0)
2774 ; RV64-NEXT: vredor.vs v8, v8, v8
2775 ; RV64-NEXT: vmv.x.s a0, v8
2777 %v = load <8 x i64>, ptr %x
2778 %red = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> %v)
2782 declare i64 @llvm.vector.reduce.or.v16i64(<16 x i64>)
2784 define i64 @vreduce_or_v16i64(ptr %x) {
2785 ; RV32-LABEL: vreduce_or_v16i64:
2787 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
2788 ; RV32-NEXT: vle64.v v8, (a0)
2789 ; RV32-NEXT: vredor.vs v8, v8, v8
2790 ; RV32-NEXT: vmv.x.s a0, v8
2791 ; RV32-NEXT: li a1, 32
2792 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
2793 ; RV32-NEXT: vsrl.vx v8, v8, a1
2794 ; RV32-NEXT: vmv.x.s a1, v8
2797 ; RV64-LABEL: vreduce_or_v16i64:
2799 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
2800 ; RV64-NEXT: vle64.v v8, (a0)
2801 ; RV64-NEXT: vredor.vs v8, v8, v8
2802 ; RV64-NEXT: vmv.x.s a0, v8
2804 %v = load <16 x i64>, ptr %x
2805 %red = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> %v)
2809 declare i64 @llvm.vector.reduce.or.v32i64(<32 x i64>)
2811 define i64 @vreduce_or_v32i64(ptr %x) {
2812 ; RV32-LABEL: vreduce_or_v32i64:
2814 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
2815 ; RV32-NEXT: vle64.v v8, (a0)
2816 ; RV32-NEXT: addi a0, a0, 128
2817 ; RV32-NEXT: vle64.v v16, (a0)
2818 ; RV32-NEXT: vor.vv v8, v8, v16
2819 ; RV32-NEXT: vredor.vs v8, v8, v8
2820 ; RV32-NEXT: vmv.x.s a0, v8
2821 ; RV32-NEXT: li a1, 32
2822 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
2823 ; RV32-NEXT: vsrl.vx v8, v8, a1
2824 ; RV32-NEXT: vmv.x.s a1, v8
2827 ; RV64-LABEL: vreduce_or_v32i64:
2829 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
2830 ; RV64-NEXT: vle64.v v8, (a0)
2831 ; RV64-NEXT: addi a0, a0, 128
2832 ; RV64-NEXT: vle64.v v16, (a0)
2833 ; RV64-NEXT: vor.vv v8, v8, v16
2834 ; RV64-NEXT: vredor.vs v8, v8, v8
2835 ; RV64-NEXT: vmv.x.s a0, v8
2837 %v = load <32 x i64>, ptr %x
2838 %red = call i64 @llvm.vector.reduce.or.v32i64(<32 x i64> %v)
2842 declare i64 @llvm.vector.reduce.or.v64i64(<64 x i64>)
2844 define i64 @vreduce_or_v64i64(ptr %x) nounwind {
2845 ; RV32-LABEL: vreduce_or_v64i64:
2847 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
2848 ; RV32-NEXT: vle64.v v8, (a0)
2849 ; RV32-NEXT: addi a1, a0, 384
2850 ; RV32-NEXT: vle64.v v16, (a1)
2851 ; RV32-NEXT: addi a1, a0, 256
2852 ; RV32-NEXT: addi a0, a0, 128
2853 ; RV32-NEXT: vle64.v v24, (a0)
2854 ; RV32-NEXT: vle64.v v0, (a1)
2855 ; RV32-NEXT: vor.vv v16, v24, v16
2856 ; RV32-NEXT: vor.vv v8, v8, v0
2857 ; RV32-NEXT: vor.vv v8, v8, v16
2858 ; RV32-NEXT: vredor.vs v8, v8, v8
2859 ; RV32-NEXT: vmv.x.s a0, v8
2860 ; RV32-NEXT: li a1, 32
2861 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
2862 ; RV32-NEXT: vsrl.vx v8, v8, a1
2863 ; RV32-NEXT: vmv.x.s a1, v8
2866 ; RV64-LABEL: vreduce_or_v64i64:
2868 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
2869 ; RV64-NEXT: vle64.v v8, (a0)
2870 ; RV64-NEXT: addi a1, a0, 256
2871 ; RV64-NEXT: addi a2, a0, 384
2872 ; RV64-NEXT: vle64.v v16, (a2)
2873 ; RV64-NEXT: addi a0, a0, 128
2874 ; RV64-NEXT: vle64.v v24, (a0)
2875 ; RV64-NEXT: vle64.v v0, (a1)
2876 ; RV64-NEXT: vor.vv v16, v24, v16
2877 ; RV64-NEXT: vor.vv v8, v8, v0
2878 ; RV64-NEXT: vor.vv v8, v8, v16
2879 ; RV64-NEXT: vredor.vs v8, v8, v8
2880 ; RV64-NEXT: vmv.x.s a0, v8
2882 %v = load <64 x i64>, ptr %x
2883 %red = call i64 @llvm.vector.reduce.or.v64i64(<64 x i64> %v)
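; XOR is not idempotent, so the tests below seed the reduction explicitly:
; vmv.s.x vN, zero materializes the xor identity (0) that vredxor.vs uses as its
; start value instead of reusing the source vector.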
2887 declare i8 @llvm.vector.reduce.xor.v1i8(<1 x i8>)
2889 define i8 @vreduce_xor_v1i8(ptr %x) {
2890 ; CHECK-LABEL: vreduce_xor_v1i8:
2892 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
2893 ; CHECK-NEXT: vle8.v v8, (a0)
2894 ; CHECK-NEXT: vmv.x.s a0, v8
2896 %v = load <1 x i8>, ptr %x
2897 %red = call i8 @llvm.vector.reduce.xor.v1i8(<1 x i8> %v)
2901 declare i8 @llvm.vector.reduce.xor.v2i8(<2 x i8>)
2903 define i8 @vreduce_xor_v2i8(ptr %x) {
2904 ; CHECK-LABEL: vreduce_xor_v2i8:
2906 ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
2907 ; CHECK-NEXT: vle8.v v8, (a0)
2908 ; CHECK-NEXT: vmv.s.x v9, zero
2909 ; CHECK-NEXT: vredxor.vs v8, v8, v9
2910 ; CHECK-NEXT: vmv.x.s a0, v8
2912 %v = load <2 x i8>, ptr %x
2913 %red = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> %v)
2917 declare i8 @llvm.vector.reduce.xor.v4i8(<4 x i8>)
2919 define i8 @vreduce_xor_v4i8(ptr %x) {
2920 ; CHECK-LABEL: vreduce_xor_v4i8:
2922 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
2923 ; CHECK-NEXT: vle8.v v8, (a0)
2924 ; CHECK-NEXT: vmv.s.x v9, zero
2925 ; CHECK-NEXT: vredxor.vs v8, v8, v9
2926 ; CHECK-NEXT: vmv.x.s a0, v8
2928 %v = load <4 x i8>, ptr %x
2929 %red = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> %v)
2933 declare i8 @llvm.vector.reduce.xor.v8i8(<8 x i8>)
2935 define i8 @vreduce_xor_v8i8(ptr %x) {
2936 ; CHECK-LABEL: vreduce_xor_v8i8:
2938 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
2939 ; CHECK-NEXT: vle8.v v8, (a0)
2940 ; CHECK-NEXT: vmv.s.x v9, zero
2941 ; CHECK-NEXT: vredxor.vs v8, v8, v9
2942 ; CHECK-NEXT: vmv.x.s a0, v8
2944 %v = load <8 x i8>, ptr %x
2945 %red = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %v)
2949 declare i8 @llvm.vector.reduce.xor.v16i8(<16 x i8>)
2951 define i8 @vreduce_xor_v16i8(ptr %x) {
2952 ; CHECK-LABEL: vreduce_xor_v16i8:
2954 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
2955 ; CHECK-NEXT: vle8.v v8, (a0)
2956 ; CHECK-NEXT: vmv.s.x v9, zero
2957 ; CHECK-NEXT: vredxor.vs v8, v8, v9
2958 ; CHECK-NEXT: vmv.x.s a0, v8
2960 %v = load <16 x i8>, ptr %x
2961 %red = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %v)
2965 declare i8 @llvm.vector.reduce.xor.v32i8(<32 x i8>)
2967 define i8 @vreduce_xor_v32i8(ptr %x) {
2968 ; CHECK-LABEL: vreduce_xor_v32i8:
2970 ; CHECK-NEXT: li a1, 32
2971 ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
2972 ; CHECK-NEXT: vle8.v v8, (a0)
2973 ; CHECK-NEXT: vmv.s.x v10, zero
2974 ; CHECK-NEXT: vredxor.vs v8, v8, v10
2975 ; CHECK-NEXT: vmv.x.s a0, v8
2977 %v = load <32 x i8>, ptr %x
2978 %red = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> %v)
2982 declare i8 @llvm.vector.reduce.xor.v64i8(<64 x i8>)
2984 define i8 @vreduce_xor_v64i8(ptr %x) {
2985 ; CHECK-LABEL: vreduce_xor_v64i8:
2987 ; CHECK-NEXT: li a1, 64
2988 ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
2989 ; CHECK-NEXT: vle8.v v8, (a0)
2990 ; CHECK-NEXT: vmv.s.x v12, zero
2991 ; CHECK-NEXT: vredxor.vs v8, v8, v12
2992 ; CHECK-NEXT: vmv.x.s a0, v8
2994 %v = load <64 x i8>, ptr %x
2995 %red = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> %v)
2999 declare i8 @llvm.vector.reduce.xor.v128i8(<128 x i8>)
3001 define i8 @vreduce_xor_v128i8(ptr %x) {
3002 ; CHECK-LABEL: vreduce_xor_v128i8:
3004 ; CHECK-NEXT: li a1, 128
3005 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
3006 ; CHECK-NEXT: vle8.v v8, (a0)
3007 ; CHECK-NEXT: vmv.s.x v16, zero
3008 ; CHECK-NEXT: vredxor.vs v8, v8, v16
3009 ; CHECK-NEXT: vmv.x.s a0, v8
3011 %v = load <128 x i8>, ptr %x
3012 %red = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> %v)
3016 declare i8 @llvm.vector.reduce.xor.v256i8(<256 x i8>)
3018 define i8 @vreduce_xor_v256i8(ptr %x) {
3019 ; CHECK-LABEL: vreduce_xor_v256i8:
3021 ; CHECK-NEXT: li a1, 128
3022 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
3023 ; CHECK-NEXT: vle8.v v8, (a0)
3024 ; CHECK-NEXT: addi a0, a0, 128
3025 ; CHECK-NEXT: vle8.v v16, (a0)
3026 ; CHECK-NEXT: vxor.vv v8, v8, v16
3027 ; CHECK-NEXT: vmv.s.x v16, zero
3028 ; CHECK-NEXT: vredxor.vs v8, v8, v16
3029 ; CHECK-NEXT: vmv.x.s a0, v8
3031 %v = load <256 x i8>, ptr %x
3032 %red = call i8 @llvm.vector.reduce.xor.v256i8(<256 x i8> %v)
3036 declare i16 @llvm.vector.reduce.xor.v1i16(<1 x i16>)
3038 define i16 @vreduce_xor_v1i16(ptr %x) {
3039 ; CHECK-LABEL: vreduce_xor_v1i16:
3041 ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
3042 ; CHECK-NEXT: vle16.v v8, (a0)
3043 ; CHECK-NEXT: vmv.x.s a0, v8
3045 %v = load <1 x i16>, ptr %x
3046 %red = call i16 @llvm.vector.reduce.xor.v1i16(<1 x i16> %v)
3050 declare i16 @llvm.vector.reduce.xor.v2i16(<2 x i16>)
3052 define i16 @vreduce_xor_v2i16(ptr %x) {
3053 ; CHECK-LABEL: vreduce_xor_v2i16:
3055 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
3056 ; CHECK-NEXT: vle16.v v8, (a0)
3057 ; CHECK-NEXT: vmv.s.x v9, zero
3058 ; CHECK-NEXT: vredxor.vs v8, v8, v9
3059 ; CHECK-NEXT: vmv.x.s a0, v8
3061 %v = load <2 x i16>, ptr %x
3062 %red = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> %v)
3066 declare i16 @llvm.vector.reduce.xor.v4i16(<4 x i16>)
3068 define i16 @vreduce_xor_v4i16(ptr %x) {
3069 ; CHECK-LABEL: vreduce_xor_v4i16:
3071 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
3072 ; CHECK-NEXT: vle16.v v8, (a0)
3073 ; CHECK-NEXT: vmv.s.x v9, zero
3074 ; CHECK-NEXT: vredxor.vs v8, v8, v9
3075 ; CHECK-NEXT: vmv.x.s a0, v8
3077 %v = load <4 x i16>, ptr %x
3078 %red = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %v)
3082 declare i16 @llvm.vector.reduce.xor.v8i16(<8 x i16>)
3084 define i16 @vreduce_xor_v8i16(ptr %x) {
3085 ; CHECK-LABEL: vreduce_xor_v8i16:
3087 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
3088 ; CHECK-NEXT: vle16.v v8, (a0)
3089 ; CHECK-NEXT: vmv.s.x v9, zero
3090 ; CHECK-NEXT: vredxor.vs v8, v8, v9
3091 ; CHECK-NEXT: vmv.x.s a0, v8
3093 %v = load <8 x i16>, ptr %x
3094 %red = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> %v)
3098 declare i16 @llvm.vector.reduce.xor.v16i16(<16 x i16>)
3100 define i16 @vreduce_xor_v16i16(ptr %x) {
3101 ; CHECK-LABEL: vreduce_xor_v16i16:
3103 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
3104 ; CHECK-NEXT: vle16.v v8, (a0)
3105 ; CHECK-NEXT: vmv.s.x v10, zero
3106 ; CHECK-NEXT: vredxor.vs v8, v8, v10
3107 ; CHECK-NEXT: vmv.x.s a0, v8
3109 %v = load <16 x i16>, ptr %x
3110 %red = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> %v)
3114 declare i16 @llvm.vector.reduce.xor.v32i16(<32 x i16>)
3116 define i16 @vreduce_xor_v32i16(ptr %x) {
3117 ; CHECK-LABEL: vreduce_xor_v32i16:
3119 ; CHECK-NEXT: li a1, 32
3120 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
3121 ; CHECK-NEXT: vle16.v v8, (a0)
3122 ; CHECK-NEXT: vmv.s.x v12, zero
3123 ; CHECK-NEXT: vredxor.vs v8, v8, v12
3124 ; CHECK-NEXT: vmv.x.s a0, v8
3126 %v = load <32 x i16>, ptr %x
3127 %red = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> %v)
3131 declare i16 @llvm.vector.reduce.xor.v64i16(<64 x i16>)
3133 define i16 @vreduce_xor_v64i16(ptr %x) {
3134 ; CHECK-LABEL: vreduce_xor_v64i16:
3136 ; CHECK-NEXT: li a1, 64
3137 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
3138 ; CHECK-NEXT: vle16.v v8, (a0)
3139 ; CHECK-NEXT: vmv.s.x v16, zero
3140 ; CHECK-NEXT: vredxor.vs v8, v8, v16
3141 ; CHECK-NEXT: vmv.x.s a0, v8
3143 %v = load <64 x i16>, ptr %x
3144 %red = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> %v)
3148 declare i16 @llvm.vector.reduce.xor.v128i16(<128 x i16>)
3150 define i16 @vreduce_xor_v128i16(ptr %x) {
3151 ; CHECK-LABEL: vreduce_xor_v128i16:
3153 ; CHECK-NEXT: li a1, 64
3154 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
3155 ; CHECK-NEXT: vle16.v v8, (a0)
3156 ; CHECK-NEXT: addi a0, a0, 128
3157 ; CHECK-NEXT: vle16.v v16, (a0)
3158 ; CHECK-NEXT: vxor.vv v8, v8, v16
3159 ; CHECK-NEXT: vmv.s.x v16, zero
3160 ; CHECK-NEXT: vredxor.vs v8, v8, v16
3161 ; CHECK-NEXT: vmv.x.s a0, v8
3163 %v = load <128 x i16>, ptr %x
3164 %red = call i16 @llvm.vector.reduce.xor.v128i16(<128 x i16> %v)
3168 declare i32 @llvm.vector.reduce.xor.v1i32(<1 x i32>)
3170 define i32 @vreduce_xor_v1i32(ptr %x) {
3171 ; CHECK-LABEL: vreduce_xor_v1i32:
3173 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
3174 ; CHECK-NEXT: vle32.v v8, (a0)
3175 ; CHECK-NEXT: vmv.x.s a0, v8
3177 %v = load <1 x i32>, ptr %x
3178 %red = call i32 @llvm.vector.reduce.xor.v1i32(<1 x i32> %v)
3182 declare i32 @llvm.vector.reduce.xor.v2i32(<2 x i32>)
3184 define i32 @vreduce_xor_v2i32(ptr %x) {
3185 ; CHECK-LABEL: vreduce_xor_v2i32:
3187 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
3188 ; CHECK-NEXT: vle32.v v8, (a0)
3189 ; CHECK-NEXT: vmv.s.x v9, zero
3190 ; CHECK-NEXT: vredxor.vs v8, v8, v9
3191 ; CHECK-NEXT: vmv.x.s a0, v8
3193 %v = load <2 x i32>, ptr %x
3194 %red = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %v)
3198 declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>)
3200 define i32 @vreduce_xor_v4i32(ptr %x) {
3201 ; CHECK-LABEL: vreduce_xor_v4i32:
3203 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
3204 ; CHECK-NEXT: vle32.v v8, (a0)
3205 ; CHECK-NEXT: vmv.s.x v9, zero
3206 ; CHECK-NEXT: vredxor.vs v8, v8, v9
3207 ; CHECK-NEXT: vmv.x.s a0, v8
3209 %v = load <4 x i32>, ptr %x
3210 %red = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %v)
3214 declare i32 @llvm.vector.reduce.xor.v8i32(<8 x i32>)
3216 define i32 @vreduce_xor_v8i32(ptr %x) {
3217 ; CHECK-LABEL: vreduce_xor_v8i32:
3219 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
3220 ; CHECK-NEXT: vle32.v v8, (a0)
3221 ; CHECK-NEXT: vmv.s.x v10, zero
3222 ; CHECK-NEXT: vredxor.vs v8, v8, v10
3223 ; CHECK-NEXT: vmv.x.s a0, v8
3225 %v = load <8 x i32>, ptr %x
3226 %red = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %v)
3230 declare i32 @llvm.vector.reduce.xor.v16i32(<16 x i32>)
3232 define i32 @vreduce_xor_v16i32(ptr %x) {
3233 ; CHECK-LABEL: vreduce_xor_v16i32:
3235 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
3236 ; CHECK-NEXT: vle32.v v8, (a0)
3237 ; CHECK-NEXT: vmv.s.x v12, zero
3238 ; CHECK-NEXT: vredxor.vs v8, v8, v12
3239 ; CHECK-NEXT: vmv.x.s a0, v8
3241 %v = load <16 x i32>, ptr %x
3242 %red = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> %v)
3246 declare i32 @llvm.vector.reduce.xor.v32i32(<32 x i32>)
3248 define i32 @vreduce_xor_v32i32(ptr %x) {
3249 ; CHECK-LABEL: vreduce_xor_v32i32:
3251 ; CHECK-NEXT: li a1, 32
3252 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
3253 ; CHECK-NEXT: vle32.v v8, (a0)
3254 ; CHECK-NEXT: vmv.s.x v16, zero
3255 ; CHECK-NEXT: vredxor.vs v8, v8, v16
3256 ; CHECK-NEXT: vmv.x.s a0, v8
3258 %v = load <32 x i32>, ptr %x
3259 %red = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> %v)
3263 declare i32 @llvm.vector.reduce.xor.v64i32(<64 x i32>)
3265 define i32 @vreduce_xor_v64i32(ptr %x) {
3266 ; CHECK-LABEL: vreduce_xor_v64i32:
3268 ; CHECK-NEXT: li a1, 32
3269 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
3270 ; CHECK-NEXT: vle32.v v8, (a0)
3271 ; CHECK-NEXT: addi a0, a0, 128
3272 ; CHECK-NEXT: vle32.v v16, (a0)
3273 ; CHECK-NEXT: vxor.vv v8, v8, v16
3274 ; CHECK-NEXT: vmv.s.x v16, zero
3275 ; CHECK-NEXT: vredxor.vs v8, v8, v16
3276 ; CHECK-NEXT: vmv.x.s a0, v8
3278 %v = load <64 x i32>, ptr %x
3279 %red = call i32 @llvm.vector.reduce.xor.v64i32(<64 x i32> %v)
3283 declare i64 @llvm.vector.reduce.xor.v1i64(<1 x i64>)
3285 define i64 @vreduce_xor_v1i64(ptr %x) {
3286 ; RV32-LABEL: vreduce_xor_v1i64:
3288 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
3289 ; RV32-NEXT: vle64.v v8, (a0)
3290 ; RV32-NEXT: li a0, 32
3291 ; RV32-NEXT: vsrl.vx v9, v8, a0
3292 ; RV32-NEXT: vmv.x.s a1, v9
3293 ; RV32-NEXT: vmv.x.s a0, v8
3296 ; RV64-LABEL: vreduce_xor_v1i64:
3298 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
3299 ; RV64-NEXT: vle64.v v8, (a0)
3300 ; RV64-NEXT: vmv.x.s a0, v8
3302 %v = load <1 x i64>, ptr %x
3303 %red = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> %v)
3307 declare i64 @llvm.vector.reduce.xor.v2i64(<2 x i64>)
3309 define i64 @vreduce_xor_v2i64(ptr %x) {
3310 ; RV32-LABEL: vreduce_xor_v2i64:
3312 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
3313 ; RV32-NEXT: vle64.v v8, (a0)
3314 ; RV32-NEXT: vmv.s.x v9, zero
3315 ; RV32-NEXT: vredxor.vs v8, v8, v9
3316 ; RV32-NEXT: vmv.x.s a0, v8
3317 ; RV32-NEXT: li a1, 32
3318 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
3319 ; RV32-NEXT: vsrl.vx v8, v8, a1
3320 ; RV32-NEXT: vmv.x.s a1, v8
3323 ; RV64-LABEL: vreduce_xor_v2i64:
3325 ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
3326 ; RV64-NEXT: vle64.v v8, (a0)
3327 ; RV64-NEXT: vmv.s.x v9, zero
3328 ; RV64-NEXT: vredxor.vs v8, v8, v9
3329 ; RV64-NEXT: vmv.x.s a0, v8
3331 %v = load <2 x i64>, ptr %x
3332 %red = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %v)
3336 declare i64 @llvm.vector.reduce.xor.v4i64(<4 x i64>)
3338 define i64 @vreduce_xor_v4i64(ptr %x) {
3339 ; RV32-LABEL: vreduce_xor_v4i64:
3341 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
3342 ; RV32-NEXT: vle64.v v8, (a0)
3343 ; RV32-NEXT: vmv.s.x v10, zero
3344 ; RV32-NEXT: vredxor.vs v8, v8, v10
3345 ; RV32-NEXT: vmv.x.s a0, v8
3346 ; RV32-NEXT: li a1, 32
3347 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
3348 ; RV32-NEXT: vsrl.vx v8, v8, a1
3349 ; RV32-NEXT: vmv.x.s a1, v8
3352 ; RV64-LABEL: vreduce_xor_v4i64:
3354 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
3355 ; RV64-NEXT: vle64.v v8, (a0)
3356 ; RV64-NEXT: vmv.s.x v10, zero
3357 ; RV64-NEXT: vredxor.vs v8, v8, v10
3358 ; RV64-NEXT: vmv.x.s a0, v8
3360 %v = load <4 x i64>, ptr %x
3361 %red = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %v)
3365 declare i64 @llvm.vector.reduce.xor.v8i64(<8 x i64>)
3367 define i64 @vreduce_xor_v8i64(ptr %x) {
3368 ; RV32-LABEL: vreduce_xor_v8i64:
3370 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
3371 ; RV32-NEXT: vle64.v v8, (a0)
3372 ; RV32-NEXT: vmv.s.x v12, zero
3373 ; RV32-NEXT: vredxor.vs v8, v8, v12
3374 ; RV32-NEXT: vmv.x.s a0, v8
3375 ; RV32-NEXT: li a1, 32
3376 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
3377 ; RV32-NEXT: vsrl.vx v8, v8, a1
3378 ; RV32-NEXT: vmv.x.s a1, v8
3381 ; RV64-LABEL: vreduce_xor_v8i64:
3383 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
3384 ; RV64-NEXT: vle64.v v8, (a0)
3385 ; RV64-NEXT: vmv.s.x v12, zero
3386 ; RV64-NEXT: vredxor.vs v8, v8, v12
3387 ; RV64-NEXT: vmv.x.s a0, v8
3389 %v = load <8 x i64>, ptr %x
3390 %red = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> %v)
3394 declare i64 @llvm.vector.reduce.xor.v16i64(<16 x i64>)
3396 define i64 @vreduce_xor_v16i64(ptr %x) {
3397 ; RV32-LABEL: vreduce_xor_v16i64:
3399 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
3400 ; RV32-NEXT: vle64.v v8, (a0)
3401 ; RV32-NEXT: vmv.s.x v16, zero
3402 ; RV32-NEXT: vredxor.vs v8, v8, v16
3403 ; RV32-NEXT: vmv.x.s a0, v8
3404 ; RV32-NEXT: li a1, 32
3405 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
3406 ; RV32-NEXT: vsrl.vx v8, v8, a1
3407 ; RV32-NEXT: vmv.x.s a1, v8
3410 ; RV64-LABEL: vreduce_xor_v16i64:
3412 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
3413 ; RV64-NEXT: vle64.v v8, (a0)
3414 ; RV64-NEXT: vmv.s.x v16, zero
3415 ; RV64-NEXT: vredxor.vs v8, v8, v16
3416 ; RV64-NEXT: vmv.x.s a0, v8
3418 %v = load <16 x i64>, ptr %x
3419 %red = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> %v)
3423 declare i64 @llvm.vector.reduce.xor.v32i64(<32 x i64>)
3425 define i64 @vreduce_xor_v32i64(ptr %x) {
3426 ; RV32-LABEL: vreduce_xor_v32i64:
3428 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
3429 ; RV32-NEXT: vle64.v v8, (a0)
3430 ; RV32-NEXT: addi a0, a0, 128
3431 ; RV32-NEXT: vle64.v v16, (a0)
3432 ; RV32-NEXT: vxor.vv v8, v8, v16
3433 ; RV32-NEXT: vmv.s.x v16, zero
3434 ; RV32-NEXT: vredxor.vs v8, v8, v16
3435 ; RV32-NEXT: vmv.x.s a0, v8
3436 ; RV32-NEXT: li a1, 32
3437 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
3438 ; RV32-NEXT: vsrl.vx v8, v8, a1
3439 ; RV32-NEXT: vmv.x.s a1, v8
3442 ; RV64-LABEL: vreduce_xor_v32i64:
3444 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
3445 ; RV64-NEXT: vle64.v v8, (a0)
3446 ; RV64-NEXT: addi a0, a0, 128
3447 ; RV64-NEXT: vle64.v v16, (a0)
3448 ; RV64-NEXT: vxor.vv v8, v8, v16
3449 ; RV64-NEXT: vmv.s.x v16, zero
3450 ; RV64-NEXT: vredxor.vs v8, v8, v16
3451 ; RV64-NEXT: vmv.x.s a0, v8
3453 %v = load <32 x i64>, ptr %x
3454 %red = call i64 @llvm.vector.reduce.xor.v32i64(<32 x i64> %v)
3458 declare i64 @llvm.vector.reduce.xor.v64i64(<64 x i64>)
3460 define i64 @vreduce_xor_v64i64(ptr %x) nounwind {
3461 ; RV32-LABEL: vreduce_xor_v64i64:
3463 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
3464 ; RV32-NEXT: vle64.v v8, (a0)
3465 ; RV32-NEXT: addi a1, a0, 384
3466 ; RV32-NEXT: vle64.v v16, (a1)
3467 ; RV32-NEXT: addi a1, a0, 256
3468 ; RV32-NEXT: addi a0, a0, 128
3469 ; RV32-NEXT: vle64.v v24, (a0)
3470 ; RV32-NEXT: vle64.v v0, (a1)
3471 ; RV32-NEXT: vxor.vv v16, v24, v16
3472 ; RV32-NEXT: vxor.vv v8, v8, v0
3473 ; RV32-NEXT: vxor.vv v8, v8, v16
3474 ; RV32-NEXT: vmv.s.x v16, zero
3475 ; RV32-NEXT: vredxor.vs v8, v8, v16
3476 ; RV32-NEXT: vmv.x.s a0, v8
3477 ; RV32-NEXT: li a1, 32
3478 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
3479 ; RV32-NEXT: vsrl.vx v8, v8, a1
3480 ; RV32-NEXT: vmv.x.s a1, v8
3483 ; RV64-LABEL: vreduce_xor_v64i64:
3485 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
3486 ; RV64-NEXT: vle64.v v8, (a0)
3487 ; RV64-NEXT: addi a1, a0, 384
3488 ; RV64-NEXT: vle64.v v16, (a1)
3489 ; RV64-NEXT: addi a1, a0, 256
3490 ; RV64-NEXT: addi a0, a0, 128
3491 ; RV64-NEXT: vle64.v v24, (a0)
3492 ; RV64-NEXT: vle64.v v0, (a1)
3493 ; RV64-NEXT: vxor.vv v16, v24, v16
3494 ; RV64-NEXT: vxor.vv v8, v8, v0
3495 ; RV64-NEXT: vxor.vv v8, v8, v16
3496 ; RV64-NEXT: vmv.s.x v16, zero
3497 ; RV64-NEXT: vredxor.vs v8, v8, v16
3498 ; RV64-NEXT: vmv.x.s a0, v8
3500 %v = load <64 x i64>, ptr %x
3501 %red = call i64 @llvm.vector.reduce.xor.v64i64(<64 x i64> %v)
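; smin is idempotent again, so vredmin.vs reuses the source vector as its scalar
; operand; inputs wider than one LMUL=8 group are first narrowed with vmin.vv.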
3505 declare i8 @llvm.vector.reduce.smin.v1i8(<1 x i8>)
3507 define i8 @vreduce_smin_v1i8(ptr %x) {
3508 ; CHECK-LABEL: vreduce_smin_v1i8:
3510 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
3511 ; CHECK-NEXT: vle8.v v8, (a0)
3512 ; CHECK-NEXT: vmv.x.s a0, v8
3514 %v = load <1 x i8>, ptr %x
3515 %red = call i8 @llvm.vector.reduce.smin.v1i8(<1 x i8> %v)
3519 declare i8 @llvm.vector.reduce.smin.v2i8(<2 x i8>)
3521 define i8 @vreduce_smin_v2i8(ptr %x) {
3522 ; CHECK-LABEL: vreduce_smin_v2i8:
3524 ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
3525 ; CHECK-NEXT: vle8.v v8, (a0)
3526 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3527 ; CHECK-NEXT: vmv.x.s a0, v8
3529 %v = load <2 x i8>, ptr %x
3530 %red = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> %v)
3534 declare i8 @llvm.vector.reduce.smin.v4i8(<4 x i8>)
3536 define i8 @vreduce_smin_v4i8(ptr %x) {
3537 ; CHECK-LABEL: vreduce_smin_v4i8:
3539 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
3540 ; CHECK-NEXT: vle8.v v8, (a0)
3541 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3542 ; CHECK-NEXT: vmv.x.s a0, v8
3544 %v = load <4 x i8>, ptr %x
3545 %red = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> %v)
3549 declare i8 @llvm.vector.reduce.smin.v8i8(<8 x i8>)
3551 define i8 @vreduce_smin_v8i8(ptr %x) {
3552 ; CHECK-LABEL: vreduce_smin_v8i8:
3554 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
3555 ; CHECK-NEXT: vle8.v v8, (a0)
3556 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3557 ; CHECK-NEXT: vmv.x.s a0, v8
3559 %v = load <8 x i8>, ptr %x
3560 %red = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %v)
3564 declare i8 @llvm.vector.reduce.smin.v16i8(<16 x i8>)
3566 define i8 @vreduce_smin_v16i8(ptr %x) {
3567 ; CHECK-LABEL: vreduce_smin_v16i8:
3569 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
3570 ; CHECK-NEXT: vle8.v v8, (a0)
3571 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3572 ; CHECK-NEXT: vmv.x.s a0, v8
3574 %v = load <16 x i8>, ptr %x
3575 %red = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %v)
3579 declare i8 @llvm.vector.reduce.smin.v32i8(<32 x i8>)
3581 define i8 @vreduce_smin_v32i8(ptr %x) {
3582 ; CHECK-LABEL: vreduce_smin_v32i8:
3584 ; CHECK-NEXT: li a1, 32
3585 ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
3586 ; CHECK-NEXT: vle8.v v8, (a0)
3587 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3588 ; CHECK-NEXT: vmv.x.s a0, v8
3590 %v = load <32 x i8>, ptr %x
3591 %red = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %v)
3595 declare i8 @llvm.vector.reduce.smin.v64i8(<64 x i8>)
3597 define i8 @vreduce_smin_v64i8(ptr %x) {
3598 ; CHECK-LABEL: vreduce_smin_v64i8:
3600 ; CHECK-NEXT: li a1, 64
3601 ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
3602 ; CHECK-NEXT: vle8.v v8, (a0)
3603 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3604 ; CHECK-NEXT: vmv.x.s a0, v8
3606 %v = load <64 x i8>, ptr %x
3607 %red = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> %v)
3611 declare i8 @llvm.vector.reduce.smin.v128i8(<128 x i8>)
3613 define i8 @vreduce_smin_v128i8(ptr %x) {
3614 ; CHECK-LABEL: vreduce_smin_v128i8:
3616 ; CHECK-NEXT: li a1, 128
3617 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
3618 ; CHECK-NEXT: vle8.v v8, (a0)
3619 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3620 ; CHECK-NEXT: vmv.x.s a0, v8
3622 %v = load <128 x i8>, ptr %x
3623 %red = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> %v)
3627 declare i8 @llvm.vector.reduce.smin.v256i8(<256 x i8>)
3629 define i8 @vreduce_smin_v256i8(ptr %x) {
3630 ; CHECK-LABEL: vreduce_smin_v256i8:
3632 ; CHECK-NEXT: li a1, 128
3633 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
3634 ; CHECK-NEXT: vle8.v v8, (a0)
3635 ; CHECK-NEXT: addi a0, a0, 128
3636 ; CHECK-NEXT: vle8.v v16, (a0)
3637 ; CHECK-NEXT: vmin.vv v8, v8, v16
3638 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3639 ; CHECK-NEXT: vmv.x.s a0, v8
3641 %v = load <256 x i8>, ptr %x
3642 %red = call i8 @llvm.vector.reduce.smin.v256i8(<256 x i8> %v)
3646 declare i16 @llvm.vector.reduce.smin.v1i16(<1 x i16>)
3648 define i16 @vreduce_smin_v1i16(ptr %x) {
3649 ; CHECK-LABEL: vreduce_smin_v1i16:
3651 ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
3652 ; CHECK-NEXT: vle16.v v8, (a0)
3653 ; CHECK-NEXT: vmv.x.s a0, v8
3655 %v = load <1 x i16>, ptr %x
3656 %red = call i16 @llvm.vector.reduce.smin.v1i16(<1 x i16> %v)
3660 declare i16 @llvm.vector.reduce.smin.v2i16(<2 x i16>)
3662 define i16 @vreduce_smin_v2i16(ptr %x) {
3663 ; CHECK-LABEL: vreduce_smin_v2i16:
3665 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
3666 ; CHECK-NEXT: vle16.v v8, (a0)
3667 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3668 ; CHECK-NEXT: vmv.x.s a0, v8
3670 %v = load <2 x i16>, ptr %x
3671 %red = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> %v)
3675 declare i16 @llvm.vector.reduce.smin.v4i16(<4 x i16>)
3677 define i16 @vreduce_smin_v4i16(ptr %x) {
3678 ; CHECK-LABEL: vreduce_smin_v4i16:
3680 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
3681 ; CHECK-NEXT: vle16.v v8, (a0)
3682 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3683 ; CHECK-NEXT: vmv.x.s a0, v8
3685 %v = load <4 x i16>, ptr %x
3686 %red = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %v)
3690 declare i16 @llvm.vector.reduce.smin.v8i16(<8 x i16>)
3692 define i16 @vreduce_smin_v8i16(ptr %x) {
3693 ; CHECK-LABEL: vreduce_smin_v8i16:
3695 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
3696 ; CHECK-NEXT: vle16.v v8, (a0)
3697 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3698 ; CHECK-NEXT: vmv.x.s a0, v8
3700 %v = load <8 x i16>, ptr %x
3701 %red = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %v)
3705 declare i16 @llvm.vector.reduce.smin.v16i16(<16 x i16>)
3707 define i16 @vreduce_smin_v16i16(ptr %x) {
3708 ; CHECK-LABEL: vreduce_smin_v16i16:
3710 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
3711 ; CHECK-NEXT: vle16.v v8, (a0)
3712 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3713 ; CHECK-NEXT: vmv.x.s a0, v8
3715 %v = load <16 x i16>, ptr %x
3716 %red = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %v)
3720 declare i16 @llvm.vector.reduce.smin.v32i16(<32 x i16>)
3722 define i16 @vreduce_smin_v32i16(ptr %x) {
3723 ; CHECK-LABEL: vreduce_smin_v32i16:
3725 ; CHECK-NEXT: li a1, 32
3726 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
3727 ; CHECK-NEXT: vle16.v v8, (a0)
3728 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3729 ; CHECK-NEXT: vmv.x.s a0, v8
3731 %v = load <32 x i16>, ptr %x
3732 %red = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> %v)
3736 declare i16 @llvm.vector.reduce.smin.v64i16(<64 x i16>)
3738 define i16 @vreduce_smin_v64i16(ptr %x) {
3739 ; CHECK-LABEL: vreduce_smin_v64i16:
3741 ; CHECK-NEXT: li a1, 64
3742 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
3743 ; CHECK-NEXT: vle16.v v8, (a0)
3744 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3745 ; CHECK-NEXT: vmv.x.s a0, v8
3747 %v = load <64 x i16>, ptr %x
3748 %red = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> %v)
3752 declare i16 @llvm.vector.reduce.smin.v128i16(<128 x i16>)
3754 define i16 @vreduce_smin_v128i16(ptr %x) {
3755 ; CHECK-LABEL: vreduce_smin_v128i16:
3757 ; CHECK-NEXT: li a1, 64
3758 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
3759 ; CHECK-NEXT: vle16.v v8, (a0)
3760 ; CHECK-NEXT: addi a0, a0, 128
3761 ; CHECK-NEXT: vle16.v v16, (a0)
3762 ; CHECK-NEXT: vmin.vv v8, v8, v16
3763 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3764 ; CHECK-NEXT: vmv.x.s a0, v8
3766 %v = load <128 x i16>, ptr %x
3767 %red = call i16 @llvm.vector.reduce.smin.v128i16(<128 x i16> %v)
3771 declare i32 @llvm.vector.reduce.smin.v1i32(<1 x i32>)
3773 define i32 @vreduce_smin_v1i32(ptr %x) {
3774 ; CHECK-LABEL: vreduce_smin_v1i32:
3776 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
3777 ; CHECK-NEXT: vle32.v v8, (a0)
3778 ; CHECK-NEXT: vmv.x.s a0, v8
3780 %v = load <1 x i32>, ptr %x
3781 %red = call i32 @llvm.vector.reduce.smin.v1i32(<1 x i32> %v)
3785 declare i32 @llvm.vector.reduce.smin.v2i32(<2 x i32>)
3787 define i32 @vreduce_smin_v2i32(ptr %x) {
3788 ; CHECK-LABEL: vreduce_smin_v2i32:
3790 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
3791 ; CHECK-NEXT: vle32.v v8, (a0)
3792 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3793 ; CHECK-NEXT: vmv.x.s a0, v8
3795 %v = load <2 x i32>, ptr %x
3796 %red = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %v)
3800 declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>)
3802 define i32 @vreduce_smin_v4i32(ptr %x) {
3803 ; CHECK-LABEL: vreduce_smin_v4i32:
3805 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
3806 ; CHECK-NEXT: vle32.v v8, (a0)
3807 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3808 ; CHECK-NEXT: vmv.x.s a0, v8
3810 %v = load <4 x i32>, ptr %x
3811 %red = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %v)
3815 declare i32 @llvm.vector.reduce.smin.v8i32(<8 x i32>)
3817 define i32 @vreduce_smin_v8i32(ptr %x) {
3818 ; CHECK-LABEL: vreduce_smin_v8i32:
3820 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
3821 ; CHECK-NEXT: vle32.v v8, (a0)
3822 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3823 ; CHECK-NEXT: vmv.x.s a0, v8
3825 %v = load <8 x i32>, ptr %x
3826 %red = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %v)
3830 declare i32 @llvm.vector.reduce.smin.v16i32(<16 x i32>)
3832 define i32 @vreduce_smin_v16i32(ptr %x) {
3833 ; CHECK-LABEL: vreduce_smin_v16i32:
3835 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
3836 ; CHECK-NEXT: vle32.v v8, (a0)
3837 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3838 ; CHECK-NEXT: vmv.x.s a0, v8
3840 %v = load <16 x i32>, ptr %x
3841 %red = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> %v)
3845 declare i32 @llvm.vector.reduce.smin.v32i32(<32 x i32>)
3847 define i32 @vreduce_smin_v32i32(ptr %x) {
3848 ; CHECK-LABEL: vreduce_smin_v32i32:
3850 ; CHECK-NEXT: li a1, 32
3851 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
3852 ; CHECK-NEXT: vle32.v v8, (a0)
3853 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3854 ; CHECK-NEXT: vmv.x.s a0, v8
3856 %v = load <32 x i32>, ptr %x
3857 %red = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> %v)
3861 declare i32 @llvm.vector.reduce.smin.v64i32(<64 x i32>)
3863 define i32 @vreduce_smin_v64i32(ptr %x) {
3864 ; CHECK-LABEL: vreduce_smin_v64i32:
3866 ; CHECK-NEXT: li a1, 32
3867 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
3868 ; CHECK-NEXT: vle32.v v8, (a0)
3869 ; CHECK-NEXT: addi a0, a0, 128
3870 ; CHECK-NEXT: vle32.v v16, (a0)
3871 ; CHECK-NEXT: vmin.vv v8, v8, v16
3872 ; CHECK-NEXT: vredmin.vs v8, v8, v8
3873 ; CHECK-NEXT: vmv.x.s a0, v8
3875 %v = load <64 x i32>, ptr %x
3876 %red = call i32 @llvm.vector.reduce.smin.v64i32(<64 x i32> %v)
3880 declare i64 @llvm.vector.reduce.smin.v1i64(<1 x i64>)
3882 define i64 @vreduce_smin_v1i64(ptr %x) {
3883 ; RV32-LABEL: vreduce_smin_v1i64:
3885 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
3886 ; RV32-NEXT: vle64.v v8, (a0)
3887 ; RV32-NEXT: li a0, 32
3888 ; RV32-NEXT: vsrl.vx v9, v8, a0
3889 ; RV32-NEXT: vmv.x.s a1, v9
3890 ; RV32-NEXT: vmv.x.s a0, v8
3893 ; RV64-LABEL: vreduce_smin_v1i64:
3895 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
3896 ; RV64-NEXT: vle64.v v8, (a0)
3897 ; RV64-NEXT: vmv.x.s a0, v8
3899 %v = load <1 x i64>, ptr %x
3900 %red = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> %v)
3904 declare i64 @llvm.vector.reduce.smin.v2i64(<2 x i64>)
3906 define i64 @vreduce_smin_v2i64(ptr %x) {
3907 ; RV32-LABEL: vreduce_smin_v2i64:
3909 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
3910 ; RV32-NEXT: vle64.v v8, (a0)
3911 ; RV32-NEXT: vredmin.vs v8, v8, v8
3912 ; RV32-NEXT: li a0, 32
3913 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
3914 ; RV32-NEXT: vsrl.vx v9, v8, a0
3915 ; RV32-NEXT: vmv.x.s a1, v9
3916 ; RV32-NEXT: vmv.x.s a0, v8
3919 ; RV64-LABEL: vreduce_smin_v2i64:
3921 ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
3922 ; RV64-NEXT: vle64.v v8, (a0)
3923 ; RV64-NEXT: vredmin.vs v8, v8, v8
3924 ; RV64-NEXT: vmv.x.s a0, v8
3926 %v = load <2 x i64>, ptr %x
3927 %red = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %v)
3931 declare i64 @llvm.vector.reduce.smin.v4i64(<4 x i64>)
3933 define i64 @vreduce_smin_v4i64(ptr %x) {
3934 ; RV32-LABEL: vreduce_smin_v4i64:
3936 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
3937 ; RV32-NEXT: vle64.v v8, (a0)
3938 ; RV32-NEXT: vredmin.vs v8, v8, v8
3939 ; RV32-NEXT: vmv.x.s a0, v8
3940 ; RV32-NEXT: li a1, 32
3941 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
3942 ; RV32-NEXT: vsrl.vx v8, v8, a1
3943 ; RV32-NEXT: vmv.x.s a1, v8
3946 ; RV64-LABEL: vreduce_smin_v4i64:
3948 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
3949 ; RV64-NEXT: vle64.v v8, (a0)
3950 ; RV64-NEXT: vredmin.vs v8, v8, v8
3951 ; RV64-NEXT: vmv.x.s a0, v8
3953 %v = load <4 x i64>, ptr %x
3954 %red = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %v)
3958 declare i64 @llvm.vector.reduce.smin.v8i64(<8 x i64>)
3960 define i64 @vreduce_smin_v8i64(ptr %x) {
3961 ; RV32-LABEL: vreduce_smin_v8i64:
3963 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
3964 ; RV32-NEXT: vle64.v v8, (a0)
3965 ; RV32-NEXT: vredmin.vs v8, v8, v8
3966 ; RV32-NEXT: vmv.x.s a0, v8
3967 ; RV32-NEXT: li a1, 32
3968 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
3969 ; RV32-NEXT: vsrl.vx v8, v8, a1
3970 ; RV32-NEXT: vmv.x.s a1, v8
3973 ; RV64-LABEL: vreduce_smin_v8i64:
3975 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
3976 ; RV64-NEXT: vle64.v v8, (a0)
3977 ; RV64-NEXT: vredmin.vs v8, v8, v8
3978 ; RV64-NEXT: vmv.x.s a0, v8
3980 %v = load <8 x i64>, ptr %x
3981 %red = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> %v)
3985 declare i64 @llvm.vector.reduce.smin.v16i64(<16 x i64>)
3987 define i64 @vreduce_smin_v16i64(ptr %x) {
3988 ; RV32-LABEL: vreduce_smin_v16i64:
3990 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
3991 ; RV32-NEXT: vle64.v v8, (a0)
3992 ; RV32-NEXT: vredmin.vs v8, v8, v8
3993 ; RV32-NEXT: vmv.x.s a0, v8
3994 ; RV32-NEXT: li a1, 32
3995 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
3996 ; RV32-NEXT: vsrl.vx v8, v8, a1
3997 ; RV32-NEXT: vmv.x.s a1, v8
4000 ; RV64-LABEL: vreduce_smin_v16i64:
4002 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
4003 ; RV64-NEXT: vle64.v v8, (a0)
4004 ; RV64-NEXT: vredmin.vs v8, v8, v8
4005 ; RV64-NEXT: vmv.x.s a0, v8
4007 %v = load <16 x i64>, ptr %x
4008 %red = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> %v)
4012 declare i64 @llvm.vector.reduce.smin.v32i64(<32 x i64>)
4014 define i64 @vreduce_smin_v32i64(ptr %x) {
4015 ; RV32-LABEL: vreduce_smin_v32i64:
4017 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
4018 ; RV32-NEXT: vle64.v v8, (a0)
4019 ; RV32-NEXT: addi a0, a0, 128
4020 ; RV32-NEXT: vle64.v v16, (a0)
4021 ; RV32-NEXT: vmin.vv v8, v8, v16
4022 ; RV32-NEXT: vredmin.vs v8, v8, v8
4023 ; RV32-NEXT: vmv.x.s a0, v8
4024 ; RV32-NEXT: li a1, 32
4025 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
4026 ; RV32-NEXT: vsrl.vx v8, v8, a1
4027 ; RV32-NEXT: vmv.x.s a1, v8
4030 ; RV64-LABEL: vreduce_smin_v32i64:
4032 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
4033 ; RV64-NEXT: vle64.v v8, (a0)
4034 ; RV64-NEXT: addi a0, a0, 128
4035 ; RV64-NEXT: vle64.v v16, (a0)
4036 ; RV64-NEXT: vmin.vv v8, v8, v16
4037 ; RV64-NEXT: vredmin.vs v8, v8, v8
4038 ; RV64-NEXT: vmv.x.s a0, v8
4040 %v = load <32 x i64>, ptr %x
4041 %red = call i64 @llvm.vector.reduce.smin.v32i64(<32 x i64> %v)
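; A <64 x i64> vector spans four LMUL=8 register groups, so the v64i64 test
; below loads four 16-element chunks at byte offsets 0, 128, 256 and 384 and
; folds them with vmin.vv before the single vredmin.vs; the later smax/umin/
; umax v64i64 tests follow the same shape.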
4045 declare i64 @llvm.vector.reduce.smin.v64i64(<64 x i64>)
4047 define i64 @vreduce_smin_v64i64(ptr %x) nounwind {
4048 ; RV32-LABEL: vreduce_smin_v64i64:
4050 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
4051 ; RV32-NEXT: vle64.v v8, (a0)
4052 ; RV32-NEXT: addi a1, a0, 384
4053 ; RV32-NEXT: vle64.v v16, (a1)
4054 ; RV32-NEXT: addi a1, a0, 256
4055 ; RV32-NEXT: addi a0, a0, 128
4056 ; RV32-NEXT: vle64.v v24, (a0)
4057 ; RV32-NEXT: vle64.v v0, (a1)
4058 ; RV32-NEXT: vmin.vv v16, v24, v16
4059 ; RV32-NEXT: vmin.vv v8, v8, v0
4060 ; RV32-NEXT: vmin.vv v8, v8, v16
4061 ; RV32-NEXT: vredmin.vs v8, v8, v8
4062 ; RV32-NEXT: vmv.x.s a0, v8
4063 ; RV32-NEXT: li a1, 32
4064 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
4065 ; RV32-NEXT: vsrl.vx v8, v8, a1
4066 ; RV32-NEXT: vmv.x.s a1, v8
4069 ; RV64-LABEL: vreduce_smin_v64i64:
4071 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
4072 ; RV64-NEXT: vle64.v v8, (a0)
4073 ; RV64-NEXT: addi a1, a0, 256
4074 ; RV64-NEXT: addi a2, a0, 384
4075 ; RV64-NEXT: vle64.v v16, (a2)
4076 ; RV64-NEXT: addi a0, a0, 128
4077 ; RV64-NEXT: vle64.v v24, (a0)
4078 ; RV64-NEXT: vle64.v v0, (a1)
4079 ; RV64-NEXT: vmin.vv v16, v24, v16
4080 ; RV64-NEXT: vmin.vv v8, v8, v0
4081 ; RV64-NEXT: vmin.vv v8, v8, v16
4082 ; RV64-NEXT: vredmin.vs v8, v8, v8
4083 ; RV64-NEXT: vmv.x.s a0, v8
4085 %v = load <64 x i64>, ptr %x
4086 %red = call i64 @llvm.vector.reduce.smin.v64i64(<64 x i64> %v)
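; Signed maximum reduction tests. Single-element vectors are read back directly
; with vmv.x.s; larger vectors use vredmax.vs, with vmax.vv pre-folding any
; vector wider than one LMUL=8 register group.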
4090 declare i8 @llvm.vector.reduce.smax.v1i8(<1 x i8>)
4092 define i8 @vreduce_smax_v1i8(ptr %x) {
4093 ; CHECK-LABEL: vreduce_smax_v1i8:
4095 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
4096 ; CHECK-NEXT: vle8.v v8, (a0)
4097 ; CHECK-NEXT: vmv.x.s a0, v8
4099 %v = load <1 x i8>, ptr %x
4100 %red = call i8 @llvm.vector.reduce.smax.v1i8(<1 x i8> %v)
4104 declare i8 @llvm.vector.reduce.smax.v2i8(<2 x i8>)
4106 define i8 @vreduce_smax_v2i8(ptr %x) {
4107 ; CHECK-LABEL: vreduce_smax_v2i8:
4109 ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
4110 ; CHECK-NEXT: vle8.v v8, (a0)
4111 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4112 ; CHECK-NEXT: vmv.x.s a0, v8
4114 %v = load <2 x i8>, ptr %x
4115 %red = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> %v)
4119 declare i8 @llvm.vector.reduce.smax.v4i8(<4 x i8>)
4121 define i8 @vreduce_smax_v4i8(ptr %x) {
4122 ; CHECK-LABEL: vreduce_smax_v4i8:
4124 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
4125 ; CHECK-NEXT: vle8.v v8, (a0)
4126 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4127 ; CHECK-NEXT: vmv.x.s a0, v8
4129 %v = load <4 x i8>, ptr %x
4130 %red = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> %v)
4134 declare i8 @llvm.vector.reduce.smax.v8i8(<8 x i8>)
4136 define i8 @vreduce_smax_v8i8(ptr %x) {
4137 ; CHECK-LABEL: vreduce_smax_v8i8:
4139 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
4140 ; CHECK-NEXT: vle8.v v8, (a0)
4141 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4142 ; CHECK-NEXT: vmv.x.s a0, v8
4144 %v = load <8 x i8>, ptr %x
4145 %red = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %v)
4149 declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>)
4151 define i8 @vreduce_smax_v16i8(ptr %x) {
4152 ; CHECK-LABEL: vreduce_smax_v16i8:
4154 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
4155 ; CHECK-NEXT: vle8.v v8, (a0)
4156 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4157 ; CHECK-NEXT: vmv.x.s a0, v8
4159 %v = load <16 x i8>, ptr %x
4160 %red = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %v)
4164 declare i8 @llvm.vector.reduce.smax.v32i8(<32 x i8>)
4166 define i8 @vreduce_smax_v32i8(ptr %x) {
4167 ; CHECK-LABEL: vreduce_smax_v32i8:
4169 ; CHECK-NEXT: li a1, 32
4170 ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
4171 ; CHECK-NEXT: vle8.v v8, (a0)
4172 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4173 ; CHECK-NEXT: vmv.x.s a0, v8
4175 %v = load <32 x i8>, ptr %x
4176 %red = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %v)
4180 declare i8 @llvm.vector.reduce.smax.v64i8(<64 x i8>)
4182 define i8 @vreduce_smax_v64i8(ptr %x) {
4183 ; CHECK-LABEL: vreduce_smax_v64i8:
4185 ; CHECK-NEXT: li a1, 64
4186 ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
4187 ; CHECK-NEXT: vle8.v v8, (a0)
4188 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4189 ; CHECK-NEXT: vmv.x.s a0, v8
4191 %v = load <64 x i8>, ptr %x
4192 %red = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> %v)
4196 declare i8 @llvm.vector.reduce.smax.v128i8(<128 x i8>)
4198 define i8 @vreduce_smax_v128i8(ptr %x) {
4199 ; CHECK-LABEL: vreduce_smax_v128i8:
4201 ; CHECK-NEXT: li a1, 128
4202 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
4203 ; CHECK-NEXT: vle8.v v8, (a0)
4204 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4205 ; CHECK-NEXT: vmv.x.s a0, v8
4207 %v = load <128 x i8>, ptr %x
4208 %red = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> %v)
4212 declare i8 @llvm.vector.reduce.smax.v256i8(<256 x i8>)
4214 define i8 @vreduce_smax_v256i8(ptr %x) {
4215 ; CHECK-LABEL: vreduce_smax_v256i8:
4217 ; CHECK-NEXT: li a1, 128
4218 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
4219 ; CHECK-NEXT: vle8.v v8, (a0)
4220 ; CHECK-NEXT: addi a0, a0, 128
4221 ; CHECK-NEXT: vle8.v v16, (a0)
4222 ; CHECK-NEXT: vmax.vv v8, v8, v16
4223 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4224 ; CHECK-NEXT: vmv.x.s a0, v8
4226 %v = load <256 x i8>, ptr %x
4227 %red = call i8 @llvm.vector.reduce.smax.v256i8(<256 x i8> %v)
4231 declare i16 @llvm.vector.reduce.smax.v1i16(<1 x i16>)
4233 define i16 @vreduce_smax_v1i16(ptr %x) {
4234 ; CHECK-LABEL: vreduce_smax_v1i16:
4236 ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
4237 ; CHECK-NEXT: vle16.v v8, (a0)
4238 ; CHECK-NEXT: vmv.x.s a0, v8
4240 %v = load <1 x i16>, ptr %x
4241 %red = call i16 @llvm.vector.reduce.smax.v1i16(<1 x i16> %v)
4245 declare i16 @llvm.vector.reduce.smax.v2i16(<2 x i16>)
4247 define i16 @vreduce_smax_v2i16(ptr %x) {
4248 ; CHECK-LABEL: vreduce_smax_v2i16:
4250 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
4251 ; CHECK-NEXT: vle16.v v8, (a0)
4252 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4253 ; CHECK-NEXT: vmv.x.s a0, v8
4255 %v = load <2 x i16>, ptr %x
4256 %red = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> %v)
4260 declare i16 @llvm.vector.reduce.smax.v4i16(<4 x i16>)
4262 define i16 @vreduce_smax_v4i16(ptr %x) {
4263 ; CHECK-LABEL: vreduce_smax_v4i16:
4265 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
4266 ; CHECK-NEXT: vle16.v v8, (a0)
4267 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4268 ; CHECK-NEXT: vmv.x.s a0, v8
4270 %v = load <4 x i16>, ptr %x
4271 %red = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %v)
4275 declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>)
4277 define i16 @vreduce_smax_v8i16(ptr %x) {
4278 ; CHECK-LABEL: vreduce_smax_v8i16:
4280 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
4281 ; CHECK-NEXT: vle16.v v8, (a0)
4282 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4283 ; CHECK-NEXT: vmv.x.s a0, v8
4285 %v = load <8 x i16>, ptr %x
4286 %red = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %v)
4290 declare i16 @llvm.vector.reduce.smax.v16i16(<16 x i16>)
4292 define i16 @vreduce_smax_v16i16(ptr %x) {
4293 ; CHECK-LABEL: vreduce_smax_v16i16:
4295 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
4296 ; CHECK-NEXT: vle16.v v8, (a0)
4297 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4298 ; CHECK-NEXT: vmv.x.s a0, v8
4300 %v = load <16 x i16>, ptr %x
4301 %red = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %v)
4305 declare i16 @llvm.vector.reduce.smax.v32i16(<32 x i16>)
4307 define i16 @vreduce_smax_v32i16(ptr %x) {
4308 ; CHECK-LABEL: vreduce_smax_v32i16:
4310 ; CHECK-NEXT: li a1, 32
4311 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
4312 ; CHECK-NEXT: vle16.v v8, (a0)
4313 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4314 ; CHECK-NEXT: vmv.x.s a0, v8
4316 %v = load <32 x i16>, ptr %x
4317 %red = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> %v)
4321 declare i16 @llvm.vector.reduce.smax.v64i16(<64 x i16>)
4323 define i16 @vreduce_smax_v64i16(ptr %x) {
4324 ; CHECK-LABEL: vreduce_smax_v64i16:
4326 ; CHECK-NEXT: li a1, 64
4327 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
4328 ; CHECK-NEXT: vle16.v v8, (a0)
4329 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4330 ; CHECK-NEXT: vmv.x.s a0, v8
4332 %v = load <64 x i16>, ptr %x
4333 %red = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> %v)
4337 declare i16 @llvm.vector.reduce.smax.v128i16(<128 x i16>)
4339 define i16 @vreduce_smax_v128i16(ptr %x) {
4340 ; CHECK-LABEL: vreduce_smax_v128i16:
4342 ; CHECK-NEXT: li a1, 64
4343 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
4344 ; CHECK-NEXT: vle16.v v8, (a0)
4345 ; CHECK-NEXT: addi a0, a0, 128
4346 ; CHECK-NEXT: vle16.v v16, (a0)
4347 ; CHECK-NEXT: vmax.vv v8, v8, v16
4348 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4349 ; CHECK-NEXT: vmv.x.s a0, v8
4351 %v = load <128 x i16>, ptr %x
4352 %red = call i16 @llvm.vector.reduce.smax.v128i16(<128 x i16> %v)
4356 declare i32 @llvm.vector.reduce.smax.v1i32(<1 x i32>)
4358 define i32 @vreduce_smax_v1i32(ptr %x) {
4359 ; CHECK-LABEL: vreduce_smax_v1i32:
4361 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
4362 ; CHECK-NEXT: vle32.v v8, (a0)
4363 ; CHECK-NEXT: vmv.x.s a0, v8
4365 %v = load <1 x i32>, ptr %x
4366 %red = call i32 @llvm.vector.reduce.smax.v1i32(<1 x i32> %v)
4370 declare i32 @llvm.vector.reduce.smax.v2i32(<2 x i32>)
4372 define i32 @vreduce_smax_v2i32(ptr %x) {
4373 ; CHECK-LABEL: vreduce_smax_v2i32:
4375 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
4376 ; CHECK-NEXT: vle32.v v8, (a0)
4377 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4378 ; CHECK-NEXT: vmv.x.s a0, v8
4380 %v = load <2 x i32>, ptr %x
4381 %red = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %v)
4385 declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)
4387 define i32 @vreduce_smax_v4i32(ptr %x) {
4388 ; CHECK-LABEL: vreduce_smax_v4i32:
4390 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
4391 ; CHECK-NEXT: vle32.v v8, (a0)
4392 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4393 ; CHECK-NEXT: vmv.x.s a0, v8
4395 %v = load <4 x i32>, ptr %x
4396 %red = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %v)
4400 declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32>)
4402 define i32 @vreduce_smax_v8i32(ptr %x) {
4403 ; CHECK-LABEL: vreduce_smax_v8i32:
4405 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
4406 ; CHECK-NEXT: vle32.v v8, (a0)
4407 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4408 ; CHECK-NEXT: vmv.x.s a0, v8
4410 %v = load <8 x i32>, ptr %x
4411 %red = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %v)
4415 declare i32 @llvm.vector.reduce.smax.v16i32(<16 x i32>)
4417 define i32 @vreduce_smax_v16i32(ptr %x) {
4418 ; CHECK-LABEL: vreduce_smax_v16i32:
4420 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
4421 ; CHECK-NEXT: vle32.v v8, (a0)
4422 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4423 ; CHECK-NEXT: vmv.x.s a0, v8
4425 %v = load <16 x i32>, ptr %x
4426 %red = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> %v)
4430 declare i32 @llvm.vector.reduce.smax.v32i32(<32 x i32>)
4432 define i32 @vreduce_smax_v32i32(ptr %x) {
4433 ; CHECK-LABEL: vreduce_smax_v32i32:
4435 ; CHECK-NEXT: li a1, 32
4436 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
4437 ; CHECK-NEXT: vle32.v v8, (a0)
4438 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4439 ; CHECK-NEXT: vmv.x.s a0, v8
4441 %v = load <32 x i32>, ptr %x
4442 %red = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> %v)
4446 declare i32 @llvm.vector.reduce.smax.v64i32(<64 x i32>)
4448 define i32 @vreduce_smax_v64i32(ptr %x) {
4449 ; CHECK-LABEL: vreduce_smax_v64i32:
4451 ; CHECK-NEXT: li a1, 32
4452 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
4453 ; CHECK-NEXT: vle32.v v8, (a0)
4454 ; CHECK-NEXT: addi a0, a0, 128
4455 ; CHECK-NEXT: vle32.v v16, (a0)
4456 ; CHECK-NEXT: vmax.vv v8, v8, v16
4457 ; CHECK-NEXT: vredmax.vs v8, v8, v8
4458 ; CHECK-NEXT: vmv.x.s a0, v8
4460 %v = load <64 x i32>, ptr %x
4461 %red = call i32 @llvm.vector.reduce.smax.v64i32(<64 x i32> %v)
4465 declare i64 @llvm.vector.reduce.smax.v1i64(<1 x i64>)
4467 define i64 @vreduce_smax_v1i64(ptr %x) {
4468 ; RV32-LABEL: vreduce_smax_v1i64:
4470 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
4471 ; RV32-NEXT: vle64.v v8, (a0)
4472 ; RV32-NEXT: li a0, 32
4473 ; RV32-NEXT: vsrl.vx v9, v8, a0
4474 ; RV32-NEXT: vmv.x.s a1, v9
4475 ; RV32-NEXT: vmv.x.s a0, v8
4478 ; RV64-LABEL: vreduce_smax_v1i64:
4480 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
4481 ; RV64-NEXT: vle64.v v8, (a0)
4482 ; RV64-NEXT: vmv.x.s a0, v8
4484 %v = load <1 x i64>, ptr %x
4485 %red = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> %v)
4489 declare i64 @llvm.vector.reduce.smax.v2i64(<2 x i64>)
4491 define i64 @vreduce_smax_v2i64(ptr %x) {
4492 ; RV32-LABEL: vreduce_smax_v2i64:
4494 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
4495 ; RV32-NEXT: vle64.v v8, (a0)
4496 ; RV32-NEXT: vredmax.vs v8, v8, v8
4497 ; RV32-NEXT: li a0, 32
4498 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
4499 ; RV32-NEXT: vsrl.vx v9, v8, a0
4500 ; RV32-NEXT: vmv.x.s a1, v9
4501 ; RV32-NEXT: vmv.x.s a0, v8
4504 ; RV64-LABEL: vreduce_smax_v2i64:
4506 ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
4507 ; RV64-NEXT: vle64.v v8, (a0)
4508 ; RV64-NEXT: vredmax.vs v8, v8, v8
4509 ; RV64-NEXT: vmv.x.s a0, v8
4511 %v = load <2 x i64>, ptr %x
4512 %red = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %v)
4516 declare i64 @llvm.vector.reduce.smax.v4i64(<4 x i64>)
4518 define i64 @vreduce_smax_v4i64(ptr %x) {
4519 ; RV32-LABEL: vreduce_smax_v4i64:
4521 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
4522 ; RV32-NEXT: vle64.v v8, (a0)
4523 ; RV32-NEXT: vredmax.vs v8, v8, v8
4524 ; RV32-NEXT: vmv.x.s a0, v8
4525 ; RV32-NEXT: li a1, 32
4526 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
4527 ; RV32-NEXT: vsrl.vx v8, v8, a1
4528 ; RV32-NEXT: vmv.x.s a1, v8
4531 ; RV64-LABEL: vreduce_smax_v4i64:
4533 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
4534 ; RV64-NEXT: vle64.v v8, (a0)
4535 ; RV64-NEXT: vredmax.vs v8, v8, v8
4536 ; RV64-NEXT: vmv.x.s a0, v8
4538 %v = load <4 x i64>, ptr %x
4539 %red = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %v)
4543 declare i64 @llvm.vector.reduce.smax.v8i64(<8 x i64>)
4545 define i64 @vreduce_smax_v8i64(ptr %x) {
4546 ; RV32-LABEL: vreduce_smax_v8i64:
4548 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
4549 ; RV32-NEXT: vle64.v v8, (a0)
4550 ; RV32-NEXT: vredmax.vs v8, v8, v8
4551 ; RV32-NEXT: vmv.x.s a0, v8
4552 ; RV32-NEXT: li a1, 32
4553 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
4554 ; RV32-NEXT: vsrl.vx v8, v8, a1
4555 ; RV32-NEXT: vmv.x.s a1, v8
4558 ; RV64-LABEL: vreduce_smax_v8i64:
4560 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
4561 ; RV64-NEXT: vle64.v v8, (a0)
4562 ; RV64-NEXT: vredmax.vs v8, v8, v8
4563 ; RV64-NEXT: vmv.x.s a0, v8
4565 %v = load <8 x i64>, ptr %x
4566 %red = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> %v)
4570 declare i64 @llvm.vector.reduce.smax.v16i64(<16 x i64>)
4572 define i64 @vreduce_smax_v16i64(ptr %x) {
4573 ; RV32-LABEL: vreduce_smax_v16i64:
4575 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
4576 ; RV32-NEXT: vle64.v v8, (a0)
4577 ; RV32-NEXT: vredmax.vs v8, v8, v8
4578 ; RV32-NEXT: vmv.x.s a0, v8
4579 ; RV32-NEXT: li a1, 32
4580 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
4581 ; RV32-NEXT: vsrl.vx v8, v8, a1
4582 ; RV32-NEXT: vmv.x.s a1, v8
4585 ; RV64-LABEL: vreduce_smax_v16i64:
4587 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
4588 ; RV64-NEXT: vle64.v v8, (a0)
4589 ; RV64-NEXT: vredmax.vs v8, v8, v8
4590 ; RV64-NEXT: vmv.x.s a0, v8
4592 %v = load <16 x i64>, ptr %x
4593 %red = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> %v)
4597 declare i64 @llvm.vector.reduce.smax.v32i64(<32 x i64>)
4599 define i64 @vreduce_smax_v32i64(ptr %x) {
4600 ; RV32-LABEL: vreduce_smax_v32i64:
4602 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
4603 ; RV32-NEXT: vle64.v v8, (a0)
4604 ; RV32-NEXT: addi a0, a0, 128
4605 ; RV32-NEXT: vle64.v v16, (a0)
4606 ; RV32-NEXT: vmax.vv v8, v8, v16
4607 ; RV32-NEXT: vredmax.vs v8, v8, v8
4608 ; RV32-NEXT: vmv.x.s a0, v8
4609 ; RV32-NEXT: li a1, 32
4610 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
4611 ; RV32-NEXT: vsrl.vx v8, v8, a1
4612 ; RV32-NEXT: vmv.x.s a1, v8
4615 ; RV64-LABEL: vreduce_smax_v32i64:
4617 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
4618 ; RV64-NEXT: vle64.v v8, (a0)
4619 ; RV64-NEXT: addi a0, a0, 128
4620 ; RV64-NEXT: vle64.v v16, (a0)
4621 ; RV64-NEXT: vmax.vv v8, v8, v16
4622 ; RV64-NEXT: vredmax.vs v8, v8, v8
4623 ; RV64-NEXT: vmv.x.s a0, v8
4625 %v = load <32 x i64>, ptr %x
4626 %red = call i64 @llvm.vector.reduce.smax.v32i64(<32 x i64> %v)
4630 declare i64 @llvm.vector.reduce.smax.v64i64(<64 x i64>)
4632 define i64 @vreduce_smax_v64i64(ptr %x) nounwind {
4633 ; RV32-LABEL: vreduce_smax_v64i64:
4635 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
4636 ; RV32-NEXT: vle64.v v8, (a0)
4637 ; RV32-NEXT: addi a1, a0, 384
4638 ; RV32-NEXT: vle64.v v16, (a1)
4639 ; RV32-NEXT: addi a1, a0, 256
4640 ; RV32-NEXT: addi a0, a0, 128
4641 ; RV32-NEXT: vle64.v v24, (a0)
4642 ; RV32-NEXT: vle64.v v0, (a1)
4643 ; RV32-NEXT: vmax.vv v16, v24, v16
4644 ; RV32-NEXT: vmax.vv v8, v8, v0
4645 ; RV32-NEXT: vmax.vv v8, v8, v16
4646 ; RV32-NEXT: vredmax.vs v8, v8, v8
4647 ; RV32-NEXT: vmv.x.s a0, v8
4648 ; RV32-NEXT: li a1, 32
4649 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
4650 ; RV32-NEXT: vsrl.vx v8, v8, a1
4651 ; RV32-NEXT: vmv.x.s a1, v8
4654 ; RV64-LABEL: vreduce_smax_v64i64:
4656 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
4657 ; RV64-NEXT: vle64.v v8, (a0)
4658 ; RV64-NEXT: addi a1, a0, 256
4659 ; RV64-NEXT: addi a2, a0, 384
4660 ; RV64-NEXT: vle64.v v16, (a2)
4661 ; RV64-NEXT: addi a0, a0, 128
4662 ; RV64-NEXT: vle64.v v24, (a0)
4663 ; RV64-NEXT: vle64.v v0, (a1)
4664 ; RV64-NEXT: vmax.vv v16, v24, v16
4665 ; RV64-NEXT: vmax.vv v8, v8, v0
4666 ; RV64-NEXT: vmax.vv v8, v8, v16
4667 ; RV64-NEXT: vredmax.vs v8, v8, v8
4668 ; RV64-NEXT: vmv.x.s a0, v8
4670 %v = load <64 x i64>, ptr %x
4671 %red = call i64 @llvm.vector.reduce.smax.v64i64(<64 x i64> %v)
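; Unsigned minimum reduction tests, mirroring the smin/smax tests above with
; vredminu.vs and vminu.vv.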
4675 declare i8 @llvm.vector.reduce.umin.v1i8(<1 x i8>)
4677 define i8 @vreduce_umin_v1i8(ptr %x) {
4678 ; CHECK-LABEL: vreduce_umin_v1i8:
4680 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
4681 ; CHECK-NEXT: vle8.v v8, (a0)
4682 ; CHECK-NEXT: vmv.x.s a0, v8
4684 %v = load <1 x i8>, ptr %x
4685 %red = call i8 @llvm.vector.reduce.umin.v1i8(<1 x i8> %v)
4689 declare i8 @llvm.vector.reduce.umin.v2i8(<2 x i8>)
4691 define i8 @vreduce_umin_v2i8(ptr %x) {
4692 ; CHECK-LABEL: vreduce_umin_v2i8:
4694 ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
4695 ; CHECK-NEXT: vle8.v v8, (a0)
4696 ; CHECK-NEXT: vredminu.vs v8, v8, v8
4697 ; CHECK-NEXT: vmv.x.s a0, v8
4699 %v = load <2 x i8>, ptr %x
4700 %red = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> %v)
4704 declare i8 @llvm.vector.reduce.umin.v4i8(<4 x i8>)
4706 define i8 @vreduce_umin_v4i8(ptr %x) {
4707 ; CHECK-LABEL: vreduce_umin_v4i8:
4709 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
4710 ; CHECK-NEXT: vle8.v v8, (a0)
4711 ; CHECK-NEXT: vredminu.vs v8, v8, v8
4712 ; CHECK-NEXT: vmv.x.s a0, v8
4714 %v = load <4 x i8>, ptr %x
4715 %red = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> %v)
4719 declare i8 @llvm.vector.reduce.umin.v8i8(<8 x i8>)
4721 define i8 @vreduce_umin_v8i8(ptr %x) {
4722 ; CHECK-LABEL: vreduce_umin_v8i8:
4724 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
4725 ; CHECK-NEXT: vle8.v v8, (a0)
4726 ; CHECK-NEXT: vredminu.vs v8, v8, v8
4727 ; CHECK-NEXT: vmv.x.s a0, v8
4729 %v = load <8 x i8>, ptr %x
4730 %red = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %v)
4734 declare i8 @llvm.vector.reduce.umin.v16i8(<16 x i8>)
4736 define i8 @vreduce_umin_v16i8(ptr %x) {
4737 ; CHECK-LABEL: vreduce_umin_v16i8:
4739 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
4740 ; CHECK-NEXT: vle8.v v8, (a0)
4741 ; CHECK-NEXT: vredminu.vs v8, v8, v8
4742 ; CHECK-NEXT: vmv.x.s a0, v8
4744 %v = load <16 x i8>, ptr %x
4745 %red = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %v)
4749 declare i8 @llvm.vector.reduce.umin.v32i8(<32 x i8>)
4751 define i8 @vreduce_umin_v32i8(ptr %x) {
4752 ; CHECK-LABEL: vreduce_umin_v32i8:
4754 ; CHECK-NEXT: li a1, 32
4755 ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
4756 ; CHECK-NEXT: vle8.v v8, (a0)
4757 ; CHECK-NEXT: vredminu.vs v8, v8, v8
4758 ; CHECK-NEXT: vmv.x.s a0, v8
4760 %v = load <32 x i8>, ptr %x
4761 %red = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %v)
4765 declare i8 @llvm.vector.reduce.umin.v64i8(<64 x i8>)
4767 define i8 @vreduce_umin_v64i8(ptr %x) {
4768 ; CHECK-LABEL: vreduce_umin_v64i8:
4770 ; CHECK-NEXT: li a1, 64
4771 ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
4772 ; CHECK-NEXT: vle8.v v8, (a0)
4773 ; CHECK-NEXT: vredminu.vs v8, v8, v8
4774 ; CHECK-NEXT: vmv.x.s a0, v8
4776 %v = load <64 x i8>, ptr %x
4777 %red = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> %v)
4781 declare i8 @llvm.vector.reduce.umin.v128i8(<128 x i8>)
4783 define i8 @vreduce_umin_v128i8(ptr %x) {
4784 ; CHECK-LABEL: vreduce_umin_v128i8:
4786 ; CHECK-NEXT: li a1, 128
4787 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
4788 ; CHECK-NEXT: vle8.v v8, (a0)
4789 ; CHECK-NEXT: vredminu.vs v8, v8, v8
4790 ; CHECK-NEXT: vmv.x.s a0, v8
4792 %v = load <128 x i8>, ptr %x
4793 %red = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> %v)
4797 declare i8 @llvm.vector.reduce.umin.v256i8(<256 x i8>)
4799 define i8 @vreduce_umin_v256i8(ptr %x) {
4800 ; CHECK-LABEL: vreduce_umin_v256i8:
4802 ; CHECK-NEXT: li a1, 128
4803 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
4804 ; CHECK-NEXT: vle8.v v8, (a0)
4805 ; CHECK-NEXT: addi a0, a0, 128
4806 ; CHECK-NEXT: vle8.v v16, (a0)
4807 ; CHECK-NEXT: vminu.vv v8, v8, v16
4808 ; CHECK-NEXT: vredminu.vs v8, v8, v8
4809 ; CHECK-NEXT: vmv.x.s a0, v8
4811 %v = load <256 x i8>, ptr %x
4812 %red = call i8 @llvm.vector.reduce.umin.v256i8(<256 x i8> %v)
4816 declare i16 @llvm.vector.reduce.umin.v1i16(<1 x i16>)
4818 define i16 @vreduce_umin_v1i16(ptr %x) {
4819 ; CHECK-LABEL: vreduce_umin_v1i16:
4821 ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
4822 ; CHECK-NEXT: vle16.v v8, (a0)
4823 ; CHECK-NEXT: vmv.x.s a0, v8
4825 %v = load <1 x i16>, ptr %x
4826 %red = call i16 @llvm.vector.reduce.umin.v1i16(<1 x i16> %v)
4830 declare i16 @llvm.vector.reduce.umin.v2i16(<2 x i16>)
4832 define i16 @vreduce_umin_v2i16(ptr %x) {
4833 ; CHECK-LABEL: vreduce_umin_v2i16:
4835 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
4836 ; CHECK-NEXT: vle16.v v8, (a0)
4837 ; CHECK-NEXT: vredminu.vs v8, v8, v8
4838 ; CHECK-NEXT: vmv.x.s a0, v8
4840 %v = load <2 x i16>, ptr %x
4841 %red = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> %v)
4845 declare i16 @llvm.vector.reduce.umin.v4i16(<4 x i16>)
4847 define i16 @vreduce_umin_v4i16(ptr %x) {
4848 ; CHECK-LABEL: vreduce_umin_v4i16:
4850 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
4851 ; CHECK-NEXT: vle16.v v8, (a0)
4852 ; CHECK-NEXT: vredminu.vs v8, v8, v8
4853 ; CHECK-NEXT: vmv.x.s a0, v8
4855 %v = load <4 x i16>, ptr %x
4856 %red = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %v)
4860 declare i16 @llvm.vector.reduce.umin.v8i16(<8 x i16>)
4862 define i16 @vreduce_umin_v8i16(ptr %x) {
4863 ; CHECK-LABEL: vreduce_umin_v8i16:
4865 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
4866 ; CHECK-NEXT: vle16.v v8, (a0)
4867 ; CHECK-NEXT: vredminu.vs v8, v8, v8
4868 ; CHECK-NEXT: vmv.x.s a0, v8
4870 %v = load <8 x i16>, ptr %x
4871 %red = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %v)
4875 declare i16 @llvm.vector.reduce.umin.v16i16(<16 x i16>)
4877 define i16 @vreduce_umin_v16i16(ptr %x) {
4878 ; CHECK-LABEL: vreduce_umin_v16i16:
4880 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
4881 ; CHECK-NEXT: vle16.v v8, (a0)
4882 ; CHECK-NEXT: vredminu.vs v8, v8, v8
4883 ; CHECK-NEXT: vmv.x.s a0, v8
4885 %v = load <16 x i16>, ptr %x
4886 %red = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %v)
4890 declare i16 @llvm.vector.reduce.umin.v32i16(<32 x i16>)
4892 define i16 @vreduce_umin_v32i16(ptr %x) {
4893 ; CHECK-LABEL: vreduce_umin_v32i16:
4895 ; CHECK-NEXT: li a1, 32
4896 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
4897 ; CHECK-NEXT: vle16.v v8, (a0)
4898 ; CHECK-NEXT: vredminu.vs v8, v8, v8
4899 ; CHECK-NEXT: vmv.x.s a0, v8
4901 %v = load <32 x i16>, ptr %x
4902 %red = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> %v)
4906 declare i16 @llvm.vector.reduce.umin.v64i16(<64 x i16>)
4908 define i16 @vreduce_umin_v64i16(ptr %x) {
4909 ; CHECK-LABEL: vreduce_umin_v64i16:
4911 ; CHECK-NEXT: li a1, 64
4912 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
4913 ; CHECK-NEXT: vle16.v v8, (a0)
4914 ; CHECK-NEXT: vredminu.vs v8, v8, v8
4915 ; CHECK-NEXT: vmv.x.s a0, v8
4917 %v = load <64 x i16>, ptr %x
4918 %red = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> %v)
4922 declare i16 @llvm.vector.reduce.umin.v128i16(<128 x i16>)
4924 define i16 @vreduce_umin_v128i16(ptr %x) {
4925 ; CHECK-LABEL: vreduce_umin_v128i16:
4927 ; CHECK-NEXT: li a1, 64
4928 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
4929 ; CHECK-NEXT: vle16.v v8, (a0)
4930 ; CHECK-NEXT: addi a0, a0, 128
4931 ; CHECK-NEXT: vle16.v v16, (a0)
4932 ; CHECK-NEXT: vminu.vv v8, v8, v16
4933 ; CHECK-NEXT: vredminu.vs v8, v8, v8
4934 ; CHECK-NEXT: vmv.x.s a0, v8
4936 %v = load <128 x i16>, ptr %x
4937 %red = call i16 @llvm.vector.reduce.umin.v128i16(<128 x i16> %v)
4941 declare i32 @llvm.vector.reduce.umin.v1i32(<1 x i32>)
4943 define i32 @vreduce_umin_v1i32(ptr %x) {
4944 ; CHECK-LABEL: vreduce_umin_v1i32:
4946 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
4947 ; CHECK-NEXT: vle32.v v8, (a0)
4948 ; CHECK-NEXT: vmv.x.s a0, v8
4950 %v = load <1 x i32>, ptr %x
4951 %red = call i32 @llvm.vector.reduce.umin.v1i32(<1 x i32> %v)
4955 declare i32 @llvm.vector.reduce.umin.v2i32(<2 x i32>)
4957 define i32 @vreduce_umin_v2i32(ptr %x) {
4958 ; CHECK-LABEL: vreduce_umin_v2i32:
4960 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
4961 ; CHECK-NEXT: vle32.v v8, (a0)
4962 ; CHECK-NEXT: vredminu.vs v8, v8, v8
4963 ; CHECK-NEXT: vmv.x.s a0, v8
4965 %v = load <2 x i32>, ptr %x
4966 %red = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> %v)
4970 declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>)
4972 define i32 @vreduce_umin_v4i32(ptr %x) {
4973 ; CHECK-LABEL: vreduce_umin_v4i32:
4975 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
4976 ; CHECK-NEXT: vle32.v v8, (a0)
4977 ; CHECK-NEXT: vredminu.vs v8, v8, v8
4978 ; CHECK-NEXT: vmv.x.s a0, v8
4980 %v = load <4 x i32>, ptr %x
4981 %red = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %v)
4985 declare i32 @llvm.vector.reduce.umin.v8i32(<8 x i32>)
4987 define i32 @vreduce_umin_v8i32(ptr %x) {
4988 ; CHECK-LABEL: vreduce_umin_v8i32:
4990 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
4991 ; CHECK-NEXT: vle32.v v8, (a0)
4992 ; CHECK-NEXT: vredminu.vs v8, v8, v8
4993 ; CHECK-NEXT: vmv.x.s a0, v8
4995 %v = load <8 x i32>, ptr %x
4996 %red = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %v)
5000 declare i32 @llvm.vector.reduce.umin.v16i32(<16 x i32>)
5002 define i32 @vreduce_umin_v16i32(ptr %x) {
5003 ; CHECK-LABEL: vreduce_umin_v16i32:
5005 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
5006 ; CHECK-NEXT: vle32.v v8, (a0)
5007 ; CHECK-NEXT: vredminu.vs v8, v8, v8
5008 ; CHECK-NEXT: vmv.x.s a0, v8
5010 %v = load <16 x i32>, ptr %x
5011 %red = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> %v)
5015 declare i32 @llvm.vector.reduce.umin.v32i32(<32 x i32>)
5017 define i32 @vreduce_umin_v32i32(ptr %x) {
5018 ; CHECK-LABEL: vreduce_umin_v32i32:
5020 ; CHECK-NEXT: li a1, 32
5021 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
5022 ; CHECK-NEXT: vle32.v v8, (a0)
5023 ; CHECK-NEXT: vredminu.vs v8, v8, v8
5024 ; CHECK-NEXT: vmv.x.s a0, v8
5026 %v = load <32 x i32>, ptr %x
5027 %red = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> %v)
5031 declare i32 @llvm.vector.reduce.umin.v64i32(<64 x i32>)
5033 define i32 @vreduce_umin_v64i32(ptr %x) {
5034 ; CHECK-LABEL: vreduce_umin_v64i32:
5036 ; CHECK-NEXT: li a1, 32
5037 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
5038 ; CHECK-NEXT: vle32.v v8, (a0)
5039 ; CHECK-NEXT: addi a0, a0, 128
5040 ; CHECK-NEXT: vle32.v v16, (a0)
5041 ; CHECK-NEXT: vminu.vv v8, v8, v16
5042 ; CHECK-NEXT: vredminu.vs v8, v8, v8
5043 ; CHECK-NEXT: vmv.x.s a0, v8
5045 %v = load <64 x i32>, ptr %x
5046 %red = call i32 @llvm.vector.reduce.umin.v64i32(<64 x i32> %v)
5050 declare i64 @llvm.vector.reduce.umin.v1i64(<1 x i64>)
5052 define i64 @vreduce_umin_v1i64(ptr %x) {
5053 ; RV32-LABEL: vreduce_umin_v1i64:
5055 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
5056 ; RV32-NEXT: vle64.v v8, (a0)
5057 ; RV32-NEXT: li a0, 32
5058 ; RV32-NEXT: vsrl.vx v9, v8, a0
5059 ; RV32-NEXT: vmv.x.s a1, v9
5060 ; RV32-NEXT: vmv.x.s a0, v8
5063 ; RV64-LABEL: vreduce_umin_v1i64:
5065 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
5066 ; RV64-NEXT: vle64.v v8, (a0)
5067 ; RV64-NEXT: vmv.x.s a0, v8
5069 %v = load <1 x i64>, ptr %x
5070 %red = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> %v)
5074 declare i64 @llvm.vector.reduce.umin.v2i64(<2 x i64>)
5076 define i64 @vreduce_umin_v2i64(ptr %x) {
5077 ; RV32-LABEL: vreduce_umin_v2i64:
5079 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
5080 ; RV32-NEXT: vle64.v v8, (a0)
5081 ; RV32-NEXT: vredminu.vs v8, v8, v8
5082 ; RV32-NEXT: li a0, 32
5083 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
5084 ; RV32-NEXT: vsrl.vx v9, v8, a0
5085 ; RV32-NEXT: vmv.x.s a1, v9
5086 ; RV32-NEXT: vmv.x.s a0, v8
5089 ; RV64-LABEL: vreduce_umin_v2i64:
5091 ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
5092 ; RV64-NEXT: vle64.v v8, (a0)
5093 ; RV64-NEXT: vredminu.vs v8, v8, v8
5094 ; RV64-NEXT: vmv.x.s a0, v8
5096 %v = load <2 x i64>, ptr %x
5097 %red = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %v)
5101 declare i64 @llvm.vector.reduce.umin.v4i64(<4 x i64>)
5103 define i64 @vreduce_umin_v4i64(ptr %x) {
5104 ; RV32-LABEL: vreduce_umin_v4i64:
5106 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
5107 ; RV32-NEXT: vle64.v v8, (a0)
5108 ; RV32-NEXT: vredminu.vs v8, v8, v8
5109 ; RV32-NEXT: vmv.x.s a0, v8
5110 ; RV32-NEXT: li a1, 32
5111 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
5112 ; RV32-NEXT: vsrl.vx v8, v8, a1
5113 ; RV32-NEXT: vmv.x.s a1, v8
5116 ; RV64-LABEL: vreduce_umin_v4i64:
5118 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
5119 ; RV64-NEXT: vle64.v v8, (a0)
5120 ; RV64-NEXT: vredminu.vs v8, v8, v8
5121 ; RV64-NEXT: vmv.x.s a0, v8
5123 %v = load <4 x i64>, ptr %x
5124 %red = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %v)
5128 declare i64 @llvm.vector.reduce.umin.v8i64(<8 x i64>)
5130 define i64 @vreduce_umin_v8i64(ptr %x) {
5131 ; RV32-LABEL: vreduce_umin_v8i64:
5133 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
5134 ; RV32-NEXT: vle64.v v8, (a0)
5135 ; RV32-NEXT: vredminu.vs v8, v8, v8
5136 ; RV32-NEXT: vmv.x.s a0, v8
5137 ; RV32-NEXT: li a1, 32
5138 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
5139 ; RV32-NEXT: vsrl.vx v8, v8, a1
5140 ; RV32-NEXT: vmv.x.s a1, v8
5143 ; RV64-LABEL: vreduce_umin_v8i64:
5145 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
5146 ; RV64-NEXT: vle64.v v8, (a0)
5147 ; RV64-NEXT: vredminu.vs v8, v8, v8
5148 ; RV64-NEXT: vmv.x.s a0, v8
5150 %v = load <8 x i64>, ptr %x
5151 %red = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> %v)
5155 declare i64 @llvm.vector.reduce.umin.v16i64(<16 x i64>)
5157 define i64 @vreduce_umin_v16i64(ptr %x) {
5158 ; RV32-LABEL: vreduce_umin_v16i64:
5160 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
5161 ; RV32-NEXT: vle64.v v8, (a0)
5162 ; RV32-NEXT: vredminu.vs v8, v8, v8
5163 ; RV32-NEXT: vmv.x.s a0, v8
5164 ; RV32-NEXT: li a1, 32
5165 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
5166 ; RV32-NEXT: vsrl.vx v8, v8, a1
5167 ; RV32-NEXT: vmv.x.s a1, v8
5170 ; RV64-LABEL: vreduce_umin_v16i64:
5172 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
5173 ; RV64-NEXT: vle64.v v8, (a0)
5174 ; RV64-NEXT: vredminu.vs v8, v8, v8
5175 ; RV64-NEXT: vmv.x.s a0, v8
5177 %v = load <16 x i64>, ptr %x
5178 %red = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> %v)
5182 declare i64 @llvm.vector.reduce.umin.v32i64(<32 x i64>)
5184 define i64 @vreduce_umin_v32i64(ptr %x) {
5185 ; RV32-LABEL: vreduce_umin_v32i64:
5187 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
5188 ; RV32-NEXT: vle64.v v8, (a0)
5189 ; RV32-NEXT: addi a0, a0, 128
5190 ; RV32-NEXT: vle64.v v16, (a0)
5191 ; RV32-NEXT: vminu.vv v8, v8, v16
5192 ; RV32-NEXT: vredminu.vs v8, v8, v8
5193 ; RV32-NEXT: vmv.x.s a0, v8
5194 ; RV32-NEXT: li a1, 32
5195 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
5196 ; RV32-NEXT: vsrl.vx v8, v8, a1
5197 ; RV32-NEXT: vmv.x.s a1, v8
5200 ; RV64-LABEL: vreduce_umin_v32i64:
5202 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
5203 ; RV64-NEXT: vle64.v v8, (a0)
5204 ; RV64-NEXT: addi a0, a0, 128
5205 ; RV64-NEXT: vle64.v v16, (a0)
5206 ; RV64-NEXT: vminu.vv v8, v8, v16
5207 ; RV64-NEXT: vredminu.vs v8, v8, v8
5208 ; RV64-NEXT: vmv.x.s a0, v8
5210 %v = load <32 x i64>, ptr %x
5211 %red = call i64 @llvm.vector.reduce.umin.v32i64(<32 x i64> %v)
5215 declare i64 @llvm.vector.reduce.umin.v64i64(<64 x i64>)
5217 define i64 @vreduce_umin_v64i64(ptr %x) nounwind {
5218 ; RV32-LABEL: vreduce_umin_v64i64:
5220 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
5221 ; RV32-NEXT: vle64.v v8, (a0)
5222 ; RV32-NEXT: addi a1, a0, 384
5223 ; RV32-NEXT: vle64.v v16, (a1)
5224 ; RV32-NEXT: addi a1, a0, 256
5225 ; RV32-NEXT: addi a0, a0, 128
5226 ; RV32-NEXT: vle64.v v24, (a0)
5227 ; RV32-NEXT: vle64.v v0, (a1)
5228 ; RV32-NEXT: vminu.vv v16, v24, v16
5229 ; RV32-NEXT: vminu.vv v8, v8, v0
5230 ; RV32-NEXT: vminu.vv v8, v8, v16
5231 ; RV32-NEXT: vredminu.vs v8, v8, v8
5232 ; RV32-NEXT: vmv.x.s a0, v8
5233 ; RV32-NEXT: li a1, 32
5234 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
5235 ; RV32-NEXT: vsrl.vx v8, v8, a1
5236 ; RV32-NEXT: vmv.x.s a1, v8
5239 ; RV64-LABEL: vreduce_umin_v64i64:
5241 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
5242 ; RV64-NEXT: vle64.v v8, (a0)
5243 ; RV64-NEXT: addi a1, a0, 256
5244 ; RV64-NEXT: addi a2, a0, 384
5245 ; RV64-NEXT: vle64.v v16, (a2)
5246 ; RV64-NEXT: addi a0, a0, 128
5247 ; RV64-NEXT: vle64.v v24, (a0)
5248 ; RV64-NEXT: vle64.v v0, (a1)
5249 ; RV64-NEXT: vminu.vv v16, v24, v16
5250 ; RV64-NEXT: vminu.vv v8, v8, v0
5251 ; RV64-NEXT: vminu.vv v8, v8, v16
5252 ; RV64-NEXT: vredminu.vs v8, v8, v8
5253 ; RV64-NEXT: vmv.x.s a0, v8
5255 %v = load <64 x i64>, ptr %x
5256 %red = call i64 @llvm.vector.reduce.umin.v64i64(<64 x i64> %v)
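; Unsigned maximum reduction tests, using vredmaxu.vs and vmaxu.vv but
; otherwise identical in structure to the umin tests.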
5260 declare i8 @llvm.vector.reduce.umax.v1i8(<1 x i8>)
5262 define i8 @vreduce_umax_v1i8(ptr %x) {
5263 ; CHECK-LABEL: vreduce_umax_v1i8:
5265 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
5266 ; CHECK-NEXT: vle8.v v8, (a0)
5267 ; CHECK-NEXT: vmv.x.s a0, v8
5269 %v = load <1 x i8>, ptr %x
5270 %red = call i8 @llvm.vector.reduce.umax.v1i8(<1 x i8> %v)
5274 declare i8 @llvm.vector.reduce.umax.v2i8(<2 x i8>)
5276 define i8 @vreduce_umax_v2i8(ptr %x) {
5277 ; CHECK-LABEL: vreduce_umax_v2i8:
5279 ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
5280 ; CHECK-NEXT: vle8.v v8, (a0)
5281 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5282 ; CHECK-NEXT: vmv.x.s a0, v8
5284 %v = load <2 x i8>, ptr %x
5285 %red = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> %v)
5289 declare i8 @llvm.vector.reduce.umax.v4i8(<4 x i8>)
5291 define i8 @vreduce_umax_v4i8(ptr %x) {
5292 ; CHECK-LABEL: vreduce_umax_v4i8:
5294 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
5295 ; CHECK-NEXT: vle8.v v8, (a0)
5296 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5297 ; CHECK-NEXT: vmv.x.s a0, v8
5299 %v = load <4 x i8>, ptr %x
5300 %red = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> %v)
5304 declare i8 @llvm.vector.reduce.umax.v8i8(<8 x i8>)
5306 define i8 @vreduce_umax_v8i8(ptr %x) {
5307 ; CHECK-LABEL: vreduce_umax_v8i8:
5309 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
5310 ; CHECK-NEXT: vle8.v v8, (a0)
5311 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5312 ; CHECK-NEXT: vmv.x.s a0, v8
5314 %v = load <8 x i8>, ptr %x
5315 %red = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %v)
5319 declare i8 @llvm.vector.reduce.umax.v16i8(<16 x i8>)
5321 define i8 @vreduce_umax_v16i8(ptr %x) {
5322 ; CHECK-LABEL: vreduce_umax_v16i8:
5324 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
5325 ; CHECK-NEXT: vle8.v v8, (a0)
5326 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5327 ; CHECK-NEXT: vmv.x.s a0, v8
5329 %v = load <16 x i8>, ptr %x
5330 %red = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %v)
5334 declare i8 @llvm.vector.reduce.umax.v32i8(<32 x i8>)
5336 define i8 @vreduce_umax_v32i8(ptr %x) {
5337 ; CHECK-LABEL: vreduce_umax_v32i8:
5339 ; CHECK-NEXT: li a1, 32
5340 ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
5341 ; CHECK-NEXT: vle8.v v8, (a0)
5342 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5343 ; CHECK-NEXT: vmv.x.s a0, v8
5345 %v = load <32 x i8>, ptr %x
5346 %red = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %v)
5350 declare i8 @llvm.vector.reduce.umax.v64i8(<64 x i8>)
5352 define i8 @vreduce_umax_v64i8(ptr %x) {
5353 ; CHECK-LABEL: vreduce_umax_v64i8:
5355 ; CHECK-NEXT: li a1, 64
5356 ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
5357 ; CHECK-NEXT: vle8.v v8, (a0)
5358 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5359 ; CHECK-NEXT: vmv.x.s a0, v8
5361 %v = load <64 x i8>, ptr %x
5362 %red = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> %v)
5366 declare i8 @llvm.vector.reduce.umax.v128i8(<128 x i8>)
5368 define i8 @vreduce_umax_v128i8(ptr %x) {
5369 ; CHECK-LABEL: vreduce_umax_v128i8:
5371 ; CHECK-NEXT: li a1, 128
5372 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
5373 ; CHECK-NEXT: vle8.v v8, (a0)
5374 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5375 ; CHECK-NEXT: vmv.x.s a0, v8
5377 %v = load <128 x i8>, ptr %x
5378 %red = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> %v)
5382 declare i8 @llvm.vector.reduce.umax.v256i8(<256 x i8>)
5384 define i8 @vreduce_umax_v256i8(ptr %x) {
5385 ; CHECK-LABEL: vreduce_umax_v256i8:
5387 ; CHECK-NEXT: li a1, 128
5388 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
5389 ; CHECK-NEXT: vle8.v v8, (a0)
5390 ; CHECK-NEXT: addi a0, a0, 128
5391 ; CHECK-NEXT: vle8.v v16, (a0)
5392 ; CHECK-NEXT: vmaxu.vv v8, v8, v16
5393 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5394 ; CHECK-NEXT: vmv.x.s a0, v8
5396 %v = load <256 x i8>, ptr %x
5397 %red = call i8 @llvm.vector.reduce.umax.v256i8(<256 x i8> %v)
5401 declare i16 @llvm.vector.reduce.umax.v1i16(<1 x i16>)
5403 define i16 @vreduce_umax_v1i16(ptr %x) {
5404 ; CHECK-LABEL: vreduce_umax_v1i16:
5406 ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
5407 ; CHECK-NEXT: vle16.v v8, (a0)
5408 ; CHECK-NEXT: vmv.x.s a0, v8
5410 %v = load <1 x i16>, ptr %x
5411 %red = call i16 @llvm.vector.reduce.umax.v1i16(<1 x i16> %v)
5415 declare i16 @llvm.vector.reduce.umax.v2i16(<2 x i16>)
5417 define i16 @vreduce_umax_v2i16(ptr %x) {
5418 ; CHECK-LABEL: vreduce_umax_v2i16:
5420 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
5421 ; CHECK-NEXT: vle16.v v8, (a0)
5422 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5423 ; CHECK-NEXT: vmv.x.s a0, v8
5425 %v = load <2 x i16>, ptr %x
5426 %red = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> %v)
5430 declare i16 @llvm.vector.reduce.umax.v4i16(<4 x i16>)
5432 define i16 @vreduce_umax_v4i16(ptr %x) {
5433 ; CHECK-LABEL: vreduce_umax_v4i16:
5435 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
5436 ; CHECK-NEXT: vle16.v v8, (a0)
5437 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5438 ; CHECK-NEXT: vmv.x.s a0, v8
5440 %v = load <4 x i16>, ptr %x
5441 %red = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %v)
5445 declare i16 @llvm.vector.reduce.umax.v8i16(<8 x i16>)
5447 define i16 @vreduce_umax_v8i16(ptr %x) {
5448 ; CHECK-LABEL: vreduce_umax_v8i16:
5450 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
5451 ; CHECK-NEXT: vle16.v v8, (a0)
5452 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5453 ; CHECK-NEXT: vmv.x.s a0, v8
5455 %v = load <8 x i16>, ptr %x
5456 %red = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %v)
5460 declare i16 @llvm.vector.reduce.umax.v16i16(<16 x i16>)
5462 define i16 @vreduce_umax_v16i16(ptr %x) {
5463 ; CHECK-LABEL: vreduce_umax_v16i16:
5465 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
5466 ; CHECK-NEXT: vle16.v v8, (a0)
5467 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5468 ; CHECK-NEXT: vmv.x.s a0, v8
5470 %v = load <16 x i16>, ptr %x
5471 %red = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %v)
5475 declare i16 @llvm.vector.reduce.umax.v32i16(<32 x i16>)
5477 define i16 @vreduce_umax_v32i16(ptr %x) {
5478 ; CHECK-LABEL: vreduce_umax_v32i16:
5480 ; CHECK-NEXT: li a1, 32
5481 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
5482 ; CHECK-NEXT: vle16.v v8, (a0)
5483 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5484 ; CHECK-NEXT: vmv.x.s a0, v8
5486 %v = load <32 x i16>, ptr %x
5487 %red = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> %v)
5491 declare i16 @llvm.vector.reduce.umax.v64i16(<64 x i16>)
5493 define i16 @vreduce_umax_v64i16(ptr %x) {
5494 ; CHECK-LABEL: vreduce_umax_v64i16:
5496 ; CHECK-NEXT: li a1, 64
5497 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
5498 ; CHECK-NEXT: vle16.v v8, (a0)
5499 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5500 ; CHECK-NEXT: vmv.x.s a0, v8
5502 %v = load <64 x i16>, ptr %x
5503 %red = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> %v)
5507 declare i16 @llvm.vector.reduce.umax.v128i16(<128 x i16>)
5509 define i16 @vreduce_umax_v128i16(ptr %x) {
5510 ; CHECK-LABEL: vreduce_umax_v128i16:
5512 ; CHECK-NEXT: li a1, 64
5513 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
5514 ; CHECK-NEXT: vle16.v v8, (a0)
5515 ; CHECK-NEXT: addi a0, a0, 128
5516 ; CHECK-NEXT: vle16.v v16, (a0)
5517 ; CHECK-NEXT: vmaxu.vv v8, v8, v16
5518 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5519 ; CHECK-NEXT: vmv.x.s a0, v8
5521 %v = load <128 x i16>, ptr %x
5522 %red = call i16 @llvm.vector.reduce.umax.v128i16(<128 x i16> %v)
5526 declare i32 @llvm.vector.reduce.umax.v1i32(<1 x i32>)
5528 define i32 @vreduce_umax_v1i32(ptr %x) {
5529 ; CHECK-LABEL: vreduce_umax_v1i32:
5531 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
5532 ; CHECK-NEXT: vle32.v v8, (a0)
5533 ; CHECK-NEXT: vmv.x.s a0, v8
5535 %v = load <1 x i32>, ptr %x
5536 %red = call i32 @llvm.vector.reduce.umax.v1i32(<1 x i32> %v)
5540 declare i32 @llvm.vector.reduce.umax.v2i32(<2 x i32>)
5542 define i32 @vreduce_umax_v2i32(ptr %x) {
5543 ; CHECK-LABEL: vreduce_umax_v2i32:
5545 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
5546 ; CHECK-NEXT: vle32.v v8, (a0)
5547 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5548 ; CHECK-NEXT: vmv.x.s a0, v8
5550 %v = load <2 x i32>, ptr %x
5551 %red = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %v)
5555 declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
5557 define i32 @vreduce_umax_v4i32(ptr %x) {
5558 ; CHECK-LABEL: vreduce_umax_v4i32:
5560 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
5561 ; CHECK-NEXT: vle32.v v8, (a0)
5562 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5563 ; CHECK-NEXT: vmv.x.s a0, v8
5565 %v = load <4 x i32>, ptr %x
5566 %red = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %v)
5570 declare i32 @llvm.vector.reduce.umax.v8i32(<8 x i32>)
5572 define i32 @vreduce_umax_v8i32(ptr %x) {
5573 ; CHECK-LABEL: vreduce_umax_v8i32:
5575 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
5576 ; CHECK-NEXT: vle32.v v8, (a0)
5577 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5578 ; CHECK-NEXT: vmv.x.s a0, v8
5580 %v = load <8 x i32>, ptr %x
5581 %red = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %v)
5585 declare i32 @llvm.vector.reduce.umax.v16i32(<16 x i32>)
5587 define i32 @vreduce_umax_v16i32(ptr %x) {
5588 ; CHECK-LABEL: vreduce_umax_v16i32:
5590 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
5591 ; CHECK-NEXT: vle32.v v8, (a0)
5592 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5593 ; CHECK-NEXT: vmv.x.s a0, v8
5595 %v = load <16 x i32>, ptr %x
5596 %red = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> %v)
5600 declare i32 @llvm.vector.reduce.umax.v32i32(<32 x i32>)
5602 define i32 @vreduce_umax_v32i32(ptr %x) {
5603 ; CHECK-LABEL: vreduce_umax_v32i32:
5605 ; CHECK-NEXT: li a1, 32
5606 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
5607 ; CHECK-NEXT: vle32.v v8, (a0)
5608 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5609 ; CHECK-NEXT: vmv.x.s a0, v8
5611 %v = load <32 x i32>, ptr %x
5612 %red = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> %v)
5616 declare i32 @llvm.vector.reduce.umax.v64i32(<64 x i32>)
5618 define i32 @vreduce_umax_v64i32(ptr %x) {
5619 ; CHECK-LABEL: vreduce_umax_v64i32:
5621 ; CHECK-NEXT: li a1, 32
5622 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
5623 ; CHECK-NEXT: vle32.v v8, (a0)
5624 ; CHECK-NEXT: addi a0, a0, 128
5625 ; CHECK-NEXT: vle32.v v16, (a0)
5626 ; CHECK-NEXT: vmaxu.vv v8, v8, v16
5627 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
5628 ; CHECK-NEXT: vmv.x.s a0, v8
5630 %v = load <64 x i32>, ptr %x
5631 %red = call i32 @llvm.vector.reduce.umax.v64i32(<64 x i32> %v)
5635 declare i64 @llvm.vector.reduce.umax.v1i64(<1 x i64>)
5637 define i64 @vreduce_umax_v1i64(ptr %x) {
5638 ; RV32-LABEL: vreduce_umax_v1i64:
5640 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
5641 ; RV32-NEXT: vle64.v v8, (a0)
5642 ; RV32-NEXT: li a0, 32
5643 ; RV32-NEXT: vsrl.vx v9, v8, a0
5644 ; RV32-NEXT: vmv.x.s a1, v9
5645 ; RV32-NEXT: vmv.x.s a0, v8
5648 ; RV64-LABEL: vreduce_umax_v1i64:
5650 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
5651 ; RV64-NEXT: vle64.v v8, (a0)
5652 ; RV64-NEXT: vmv.x.s a0, v8
5654 %v = load <1 x i64>, ptr %x
5655 %red = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> %v)
5659 declare i64 @llvm.vector.reduce.umax.v2i64(<2 x i64>)
5661 define i64 @vreduce_umax_v2i64(ptr %x) {
5662 ; RV32-LABEL: vreduce_umax_v2i64:
5664 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
5665 ; RV32-NEXT: vle64.v v8, (a0)
5666 ; RV32-NEXT: vredmaxu.vs v8, v8, v8
5667 ; RV32-NEXT: li a0, 32
5668 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
5669 ; RV32-NEXT: vsrl.vx v9, v8, a0
5670 ; RV32-NEXT: vmv.x.s a1, v9
5671 ; RV32-NEXT: vmv.x.s a0, v8
5674 ; RV64-LABEL: vreduce_umax_v2i64:
5676 ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
5677 ; RV64-NEXT: vle64.v v8, (a0)
5678 ; RV64-NEXT: vredmaxu.vs v8, v8, v8
5679 ; RV64-NEXT: vmv.x.s a0, v8
5681 %v = load <2 x i64>, ptr %x
5682 %red = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %v)
5686 declare i64 @llvm.vector.reduce.umax.v4i64(<4 x i64>)
5688 define i64 @vreduce_umax_v4i64(ptr %x) {
5689 ; RV32-LABEL: vreduce_umax_v4i64:
5691 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
5692 ; RV32-NEXT: vle64.v v8, (a0)
5693 ; RV32-NEXT: vredmaxu.vs v8, v8, v8
5694 ; RV32-NEXT: vmv.x.s a0, v8
5695 ; RV32-NEXT: li a1, 32
5696 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
5697 ; RV32-NEXT: vsrl.vx v8, v8, a1
5698 ; RV32-NEXT: vmv.x.s a1, v8
5701 ; RV64-LABEL: vreduce_umax_v4i64:
5703 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
5704 ; RV64-NEXT: vle64.v v8, (a0)
5705 ; RV64-NEXT: vredmaxu.vs v8, v8, v8
5706 ; RV64-NEXT: vmv.x.s a0, v8
5708 %v = load <4 x i64>, ptr %x
5709 %red = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %v)
5713 declare i64 @llvm.vector.reduce.umax.v8i64(<8 x i64>)
5715 define i64 @vreduce_umax_v8i64(ptr %x) {
5716 ; RV32-LABEL: vreduce_umax_v8i64:
5718 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
5719 ; RV32-NEXT: vle64.v v8, (a0)
5720 ; RV32-NEXT: vredmaxu.vs v8, v8, v8
5721 ; RV32-NEXT: vmv.x.s a0, v8
5722 ; RV32-NEXT: li a1, 32
5723 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
5724 ; RV32-NEXT: vsrl.vx v8, v8, a1
5725 ; RV32-NEXT: vmv.x.s a1, v8
5728 ; RV64-LABEL: vreduce_umax_v8i64:
5730 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
5731 ; RV64-NEXT: vle64.v v8, (a0)
5732 ; RV64-NEXT: vredmaxu.vs v8, v8, v8
5733 ; RV64-NEXT: vmv.x.s a0, v8
5735 %v = load <8 x i64>, ptr %x
5736 %red = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> %v)
5740 declare i64 @llvm.vector.reduce.umax.v16i64(<16 x i64>)
5742 define i64 @vreduce_umax_v16i64(ptr %x) {
5743 ; RV32-LABEL: vreduce_umax_v16i64:
5745 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
5746 ; RV32-NEXT: vle64.v v8, (a0)
5747 ; RV32-NEXT: vredmaxu.vs v8, v8, v8
5748 ; RV32-NEXT: vmv.x.s a0, v8
5749 ; RV32-NEXT: li a1, 32
5750 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
5751 ; RV32-NEXT: vsrl.vx v8, v8, a1
5752 ; RV32-NEXT: vmv.x.s a1, v8
5755 ; RV64-LABEL: vreduce_umax_v16i64:
5757 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
5758 ; RV64-NEXT: vle64.v v8, (a0)
5759 ; RV64-NEXT: vredmaxu.vs v8, v8, v8
5760 ; RV64-NEXT: vmv.x.s a0, v8
5762 %v = load <16 x i64>, ptr %x
5763 %red = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> %v)
5767 declare i64 @llvm.vector.reduce.umax.v32i64(<32 x i64>)
5769 define i64 @vreduce_umax_v32i64(ptr %x) {
5770 ; RV32-LABEL: vreduce_umax_v32i64:
5772 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
5773 ; RV32-NEXT: vle64.v v8, (a0)
5774 ; RV32-NEXT: addi a0, a0, 128
5775 ; RV32-NEXT: vle64.v v16, (a0)
5776 ; RV32-NEXT: vmaxu.vv v8, v8, v16
5777 ; RV32-NEXT: vredmaxu.vs v8, v8, v8
5778 ; RV32-NEXT: vmv.x.s a0, v8
5779 ; RV32-NEXT: li a1, 32
5780 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
5781 ; RV32-NEXT: vsrl.vx v8, v8, a1
5782 ; RV32-NEXT: vmv.x.s a1, v8
5785 ; RV64-LABEL: vreduce_umax_v32i64:
5787 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
5788 ; RV64-NEXT: vle64.v v8, (a0)
5789 ; RV64-NEXT: addi a0, a0, 128
5790 ; RV64-NEXT: vle64.v v16, (a0)
5791 ; RV64-NEXT: vmaxu.vv v8, v8, v16
5792 ; RV64-NEXT: vredmaxu.vs v8, v8, v8
5793 ; RV64-NEXT: vmv.x.s a0, v8
5795 %v = load <32 x i64>, ptr %x
5796 %red = call i64 @llvm.vector.reduce.umax.v32i64(<32 x i64> %v)
5800 declare i64 @llvm.vector.reduce.umax.v64i64(<64 x i64>)
5802 define i64 @vreduce_umax_v64i64(ptr %x) nounwind {
5803 ; RV32-LABEL: vreduce_umax_v64i64:
5805 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
5806 ; RV32-NEXT: vle64.v v8, (a0)
5807 ; RV32-NEXT: addi a1, a0, 384
5808 ; RV32-NEXT: vle64.v v16, (a1)
5809 ; RV32-NEXT: addi a1, a0, 256
5810 ; RV32-NEXT: addi a0, a0, 128
5811 ; RV32-NEXT: vle64.v v24, (a0)
5812 ; RV32-NEXT: vle64.v v0, (a1)
5813 ; RV32-NEXT: vmaxu.vv v16, v24, v16
5814 ; RV32-NEXT: vmaxu.vv v8, v8, v0
5815 ; RV32-NEXT: vmaxu.vv v8, v8, v16
5816 ; RV32-NEXT: vredmaxu.vs v8, v8, v8
5817 ; RV32-NEXT: vmv.x.s a0, v8
5818 ; RV32-NEXT: li a1, 32
5819 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
5820 ; RV32-NEXT: vsrl.vx v8, v8, a1
5821 ; RV32-NEXT: vmv.x.s a1, v8
5824 ; RV64-LABEL: vreduce_umax_v64i64:
5826 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
5827 ; RV64-NEXT: vle64.v v8, (a0)
5828 ; RV64-NEXT: addi a1, a0, 256
5829 ; RV64-NEXT: addi a2, a0, 384
5830 ; RV64-NEXT: vle64.v v16, (a2)
5831 ; RV64-NEXT: addi a0, a0, 128
5832 ; RV64-NEXT: vle64.v v24, (a0)
5833 ; RV64-NEXT: vle64.v v0, (a1)
5834 ; RV64-NEXT: vmaxu.vv v16, v24, v16
5835 ; RV64-NEXT: vmaxu.vv v8, v8, v0
5836 ; RV64-NEXT: vmaxu.vv v8, v8, v16
5837 ; RV64-NEXT: vredmaxu.vs v8, v8, v8
5838 ; RV64-NEXT: vmv.x.s a0, v8
5840 %v = load <64 x i64>, ptr %x
5841 %red = call i64 @llvm.vector.reduce.umax.v64i64(<64 x i64> %v)
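; There is no multiply reduction instruction in RVV, so the mul reductions are
; lowered to a halving sequence: vslidedown by half the remaining element
; count followed by vmul.vv, repeated down to two elements, then a vrgather.vi
; of element 1 and a final vmul.vv before vmv.x.s extracts the product (the
; <2 x i8> case simply reloads the second element with lbu and uses vmul.vx).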
5845 declare i8 @llvm.vector.reduce.mul.v1i8(<1 x i8>)
5847 define i8 @vreduce_mul_v1i8(ptr %x) {
5848 ; CHECK-LABEL: vreduce_mul_v1i8:
5850 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
5851 ; CHECK-NEXT: vle8.v v8, (a0)
5852 ; CHECK-NEXT: vmv.x.s a0, v8
5854 %v = load <1 x i8>, ptr %x
5855 %red = call i8 @llvm.vector.reduce.mul.v1i8(<1 x i8> %v)
5859 declare i8 @llvm.vector.reduce.mul.v2i8(<2 x i8>)
5861 define i8 @vreduce_mul_v2i8(ptr %x) {
5862 ; CHECK-LABEL: vreduce_mul_v2i8:
5864 ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
5865 ; CHECK-NEXT: vle8.v v8, (a0)
5866 ; CHECK-NEXT: lbu a0, 1(a0)
5867 ; CHECK-NEXT: vmul.vx v8, v8, a0
5868 ; CHECK-NEXT: vmv.x.s a0, v8
5870 %v = load <2 x i8>, ptr %x
5871 %red = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> %v)
5875 declare i8 @llvm.vector.reduce.mul.v4i8(<4 x i8>)
5877 define i8 @vreduce_mul_v4i8(ptr %x) {
5878 ; CHECK-LABEL: vreduce_mul_v4i8:
5880 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
5881 ; CHECK-NEXT: vle8.v v8, (a0)
5882 ; CHECK-NEXT: vslidedown.vi v9, v8, 2
5883 ; CHECK-NEXT: vmul.vv v8, v8, v9
5884 ; CHECK-NEXT: vrgather.vi v9, v8, 1
5885 ; CHECK-NEXT: vmul.vv v8, v8, v9
5886 ; CHECK-NEXT: vmv.x.s a0, v8
5888 %v = load <4 x i8>, ptr %x
5889 %red = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> %v)
5893 declare i8 @llvm.vector.reduce.mul.v8i8(<8 x i8>)
5895 define i8 @vreduce_mul_v8i8(ptr %x) {
5896 ; CHECK-LABEL: vreduce_mul_v8i8:
5898 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
5899 ; CHECK-NEXT: vle8.v v8, (a0)
5900 ; CHECK-NEXT: vslidedown.vi v9, v8, 4
5901 ; CHECK-NEXT: vmul.vv v8, v8, v9
5902 ; CHECK-NEXT: vslidedown.vi v9, v8, 2
5903 ; CHECK-NEXT: vmul.vv v8, v8, v9
5904 ; CHECK-NEXT: vrgather.vi v9, v8, 1
5905 ; CHECK-NEXT: vmul.vv v8, v8, v9
5906 ; CHECK-NEXT: vmv.x.s a0, v8
5908 %v = load <8 x i8>, ptr %x
5909 %red = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> %v)
5913 declare i8 @llvm.vector.reduce.mul.v16i8(<16 x i8>)
5915 define i8 @vreduce_mul_v16i8(ptr %x) {
5916 ; CHECK-LABEL: vreduce_mul_v16i8:
5918 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
5919 ; CHECK-NEXT: vle8.v v8, (a0)
5920 ; CHECK-NEXT: vslidedown.vi v9, v8, 8
5921 ; CHECK-NEXT: vmul.vv v8, v8, v9
5922 ; CHECK-NEXT: vslidedown.vi v9, v8, 4
5923 ; CHECK-NEXT: vmul.vv v8, v8, v9
5924 ; CHECK-NEXT: vslidedown.vi v9, v8, 2
5925 ; CHECK-NEXT: vmul.vv v8, v8, v9
5926 ; CHECK-NEXT: vrgather.vi v9, v8, 1
5927 ; CHECK-NEXT: vmul.vv v8, v8, v9
5928 ; CHECK-NEXT: vmv.x.s a0, v8
5930 %v = load <16 x i8>, ptr %x
5931 %red = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> %v)

declare i8 @llvm.vector.reduce.mul.v32i8(<32 x i8>)

define i8 @vreduce_mul_v32i8(ptr %x) {
; CHECK-LABEL: vreduce_mul_v32i8:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 32
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: vslidedown.vi v10, v8, 16
; CHECK-NEXT: vmul.vv v8, v8, v10
; CHECK-NEXT: vslidedown.vi v10, v8, 8
; CHECK-NEXT: vmul.vv v8, v8, v10
; CHECK-NEXT: vslidedown.vi v10, v8, 4
; CHECK-NEXT: vmul.vv v8, v8, v10
; CHECK-NEXT: vslidedown.vi v10, v8, 2
; CHECK-NEXT: vmul.vv v8, v8, v10
; CHECK-NEXT: vrgather.vi v10, v8, 1
; CHECK-NEXT: vmul.vv v8, v8, v10
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <32 x i8>, ptr %x
%red = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> %v)
ret i8 %red
}

declare i8 @llvm.vector.reduce.mul.v64i8(<64 x i8>)

define i8 @vreduce_mul_v64i8(ptr %x) {
; CHECK-LABEL: vreduce_mul_v64i8:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 64
; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: li a0, 32
; CHECK-NEXT: vslidedown.vx v12, v8, a0
; CHECK-NEXT: vmul.vv v8, v8, v12
; CHECK-NEXT: vslidedown.vi v12, v8, 16
; CHECK-NEXT: vmul.vv v8, v8, v12
; CHECK-NEXT: vslidedown.vi v12, v8, 8
; CHECK-NEXT: vmul.vv v8, v8, v12
; CHECK-NEXT: vslidedown.vi v12, v8, 4
; CHECK-NEXT: vmul.vv v8, v8, v12
; CHECK-NEXT: vslidedown.vi v12, v8, 2
; CHECK-NEXT: vmul.vv v8, v8, v12
; CHECK-NEXT: vrgather.vi v12, v8, 1
; CHECK-NEXT: vmul.vv v8, v8, v12
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <64 x i8>, ptr %x
%red = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> %v)
ret i8 %red
}

declare i8 @llvm.vector.reduce.mul.v128i8(<128 x i8>)

define i8 @vreduce_mul_v128i8(ptr %x) {
; CHECK-LABEL: vreduce_mul_v128i8:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 128
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: li a0, 64
; CHECK-NEXT: vslidedown.vx v16, v8, a0
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: li a0, 32
; CHECK-NEXT: vslidedown.vx v16, v8, a0
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 16
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 8
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 4
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 2
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vrgather.vi v16, v8, 1
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <128 x i8>, ptr %x
%red = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> %v)
ret i8 %red
}

declare i8 @llvm.vector.reduce.mul.v256i8(<256 x i8>)

define i8 @vreduce_mul_v256i8(ptr %x) {
; CHECK-LABEL: vreduce_mul_v256i8:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 128
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: addi a0, a0, 128
; CHECK-NEXT: vle8.v v16, (a0)
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: li a0, 64
; CHECK-NEXT: vslidedown.vx v16, v8, a0
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: li a0, 32
; CHECK-NEXT: vslidedown.vx v16, v8, a0
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 16
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 8
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 4
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 2
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vrgather.vi v16, v8, 1
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <256 x i8>, ptr %x
%red = call i8 @llvm.vector.reduce.mul.v256i8(<256 x i8> %v)
ret i8 %red
}

declare i16 @llvm.vector.reduce.mul.v1i16(<1 x i16>)

define i16 @vreduce_mul_v1i16(ptr %x) {
; CHECK-LABEL: vreduce_mul_v1i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <1 x i16>, ptr %x
%red = call i16 @llvm.vector.reduce.mul.v1i16(<1 x i16> %v)
ret i16 %red
}

declare i16 @llvm.vector.reduce.mul.v2i16(<2 x i16>)

define i16 @vreduce_mul_v2i16(ptr %x) {
; CHECK-LABEL: vreduce_mul_v2i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: lh a0, 2(a0)
; CHECK-NEXT: vmul.vx v8, v8, a0
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <2 x i16>, ptr %x
%red = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> %v)
ret i16 %red
}

declare i16 @llvm.vector.reduce.mul.v4i16(<4 x i16>)

define i16 @vreduce_mul_v4i16(ptr %x) {
; CHECK-LABEL: vreduce_mul_v4i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: vslidedown.vi v9, v8, 2
; CHECK-NEXT: vmul.vv v8, v8, v9
; CHECK-NEXT: vrgather.vi v9, v8, 1
; CHECK-NEXT: vmul.vv v8, v8, v9
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <4 x i16>, ptr %x
%red = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> %v)
ret i16 %red
}

declare i16 @llvm.vector.reduce.mul.v8i16(<8 x i16>)

define i16 @vreduce_mul_v8i16(ptr %x) {
; CHECK-LABEL: vreduce_mul_v8i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: vslidedown.vi v9, v8, 4
; CHECK-NEXT: vmul.vv v8, v8, v9
; CHECK-NEXT: vslidedown.vi v9, v8, 2
; CHECK-NEXT: vmul.vv v8, v8, v9
; CHECK-NEXT: vrgather.vi v9, v8, 1
; CHECK-NEXT: vmul.vv v8, v8, v9
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <8 x i16>, ptr %x
%red = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> %v)
ret i16 %red
}

declare i16 @llvm.vector.reduce.mul.v16i16(<16 x i16>)

define i16 @vreduce_mul_v16i16(ptr %x) {
; CHECK-LABEL: vreduce_mul_v16i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: vslidedown.vi v10, v8, 8
; CHECK-NEXT: vmul.vv v8, v8, v10
; CHECK-NEXT: vslidedown.vi v10, v8, 4
; CHECK-NEXT: vmul.vv v8, v8, v10
; CHECK-NEXT: vslidedown.vi v10, v8, 2
; CHECK-NEXT: vmul.vv v8, v8, v10
; CHECK-NEXT: vrgather.vi v10, v8, 1
; CHECK-NEXT: vmul.vv v8, v8, v10
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i16>, ptr %x
%red = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> %v)
ret i16 %red
}

declare i16 @llvm.vector.reduce.mul.v32i16(<32 x i16>)

define i16 @vreduce_mul_v32i16(ptr %x) {
; CHECK-LABEL: vreduce_mul_v32i16:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 32
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: vslidedown.vi v12, v8, 16
; CHECK-NEXT: vmul.vv v8, v8, v12
; CHECK-NEXT: vslidedown.vi v12, v8, 8
; CHECK-NEXT: vmul.vv v8, v8, v12
; CHECK-NEXT: vslidedown.vi v12, v8, 4
; CHECK-NEXT: vmul.vv v8, v8, v12
; CHECK-NEXT: vslidedown.vi v12, v8, 2
; CHECK-NEXT: vmul.vv v8, v8, v12
; CHECK-NEXT: vrgather.vi v12, v8, 1
; CHECK-NEXT: vmul.vv v8, v8, v12
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <32 x i16>, ptr %x
%red = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> %v)
ret i16 %red
}

declare i16 @llvm.vector.reduce.mul.v64i16(<64 x i16>)

define i16 @vreduce_mul_v64i16(ptr %x) {
; CHECK-LABEL: vreduce_mul_v64i16:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 64
; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: li a0, 32
; CHECK-NEXT: vslidedown.vx v16, v8, a0
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 16
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 8
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 4
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 2
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vrgather.vi v16, v8, 1
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <64 x i16>, ptr %x
%red = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> %v)
ret i16 %red
}

declare i16 @llvm.vector.reduce.mul.v128i16(<128 x i16>)

define i16 @vreduce_mul_v128i16(ptr %x) {
; CHECK-LABEL: vreduce_mul_v128i16:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 64
; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: addi a0, a0, 128
; CHECK-NEXT: vle16.v v16, (a0)
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: li a0, 32
; CHECK-NEXT: vslidedown.vx v16, v8, a0
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 16
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 8
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 4
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 2
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vrgather.vi v16, v8, 1
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <128 x i16>, ptr %x
%red = call i16 @llvm.vector.reduce.mul.v128i16(<128 x i16> %v)
ret i16 %red
}

declare i32 @llvm.vector.reduce.mul.v1i32(<1 x i32>)

define i32 @vreduce_mul_v1i32(ptr %x) {
; CHECK-LABEL: vreduce_mul_v1i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <1 x i32>, ptr %x
%red = call i32 @llvm.vector.reduce.mul.v1i32(<1 x i32> %v)
ret i32 %red
}

declare i32 @llvm.vector.reduce.mul.v2i32(<2 x i32>)

define i32 @vreduce_mul_v2i32(ptr %x) {
; CHECK-LABEL: vreduce_mul_v2i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: lw a0, 4(a0)
; CHECK-NEXT: vmul.vx v8, v8, a0
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <2 x i32>, ptr %x
%red = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> %v)
ret i32 %red
}

declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>)

define i32 @vreduce_mul_v4i32(ptr %x) {
; CHECK-LABEL: vreduce_mul_v4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vslidedown.vi v9, v8, 2
; CHECK-NEXT: vmul.vv v8, v8, v9
; CHECK-NEXT: vrgather.vi v9, v8, 1
; CHECK-NEXT: vmul.vv v8, v8, v9
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <4 x i32>, ptr %x
%red = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %v)
ret i32 %red
}

declare i32 @llvm.vector.reduce.mul.v8i32(<8 x i32>)

define i32 @vreduce_mul_v8i32(ptr %x) {
; CHECK-LABEL: vreduce_mul_v8i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vslidedown.vi v10, v8, 4
; CHECK-NEXT: vmul.vv v8, v8, v10
; CHECK-NEXT: vslidedown.vi v10, v8, 2
; CHECK-NEXT: vmul.vv v8, v8, v10
; CHECK-NEXT: vrgather.vi v10, v8, 1
; CHECK-NEXT: vmul.vv v8, v8, v10
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <8 x i32>, ptr %x
%red = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %v)
ret i32 %red
}

declare i32 @llvm.vector.reduce.mul.v16i32(<16 x i32>)

define i32 @vreduce_mul_v16i32(ptr %x) {
; CHECK-LABEL: vreduce_mul_v16i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vslidedown.vi v12, v8, 8
; CHECK-NEXT: vmul.vv v8, v8, v12
; CHECK-NEXT: vslidedown.vi v12, v8, 4
; CHECK-NEXT: vmul.vv v8, v8, v12
; CHECK-NEXT: vslidedown.vi v12, v8, 2
; CHECK-NEXT: vmul.vv v8, v8, v12
; CHECK-NEXT: vrgather.vi v12, v8, 1
; CHECK-NEXT: vmul.vv v8, v8, v12
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <16 x i32>, ptr %x
%red = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> %v)
ret i32 %red
}

declare i32 @llvm.vector.reduce.mul.v32i32(<32 x i32>)

define i32 @vreduce_mul_v32i32(ptr %x) {
; CHECK-LABEL: vreduce_mul_v32i32:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 32
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vslidedown.vi v16, v8, 16
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 8
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 4
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 2
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vrgather.vi v16, v8, 1
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <32 x i32>, ptr %x
%red = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> %v)
ret i32 %red
}

declare i32 @llvm.vector.reduce.mul.v64i32(<64 x i32>)

define i32 @vreduce_mul_v64i32(ptr %x) {
; CHECK-LABEL: vreduce_mul_v64i32:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 32
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: addi a0, a0, 128
; CHECK-NEXT: vle32.v v16, (a0)
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 16
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 8
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 4
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vslidedown.vi v16, v8, 2
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vrgather.vi v16, v8, 1
; CHECK-NEXT: vmul.vv v8, v8, v16
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%v = load <64 x i32>, ptr %x
%red = call i32 @llvm.vector.reduce.mul.v64i32(<64 x i32> %v)
ret i32 %red
}
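
; For the i64 multiply reductions below, RV32 returns the 64-bit result split
; across a0 (low word) and a1 (high word), so the RV32 checks extract the high
; word either with a 32-bit vsrl.vx or, for the widest cases, with a vslidedown
; to element 1 after switching to e32.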

declare i64 @llvm.vector.reduce.mul.v1i64(<1 x i64>)

define i64 @vreduce_mul_v1i64(ptr %x) {
; RV32-LABEL: vreduce_mul_v1i64:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT: vle64.v v8, (a0)
; RV32-NEXT: li a0, 32
; RV32-NEXT: vsrl.vx v9, v8, a0
; RV32-NEXT: vmv.x.s a1, v9
; RV32-NEXT: vmv.x.s a0, v8
; RV32-NEXT: ret
;
; RV64-LABEL: vreduce_mul_v1i64:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; RV64-NEXT: vle64.v v8, (a0)
; RV64-NEXT: vmv.x.s a0, v8
; RV64-NEXT: ret
%v = load <1 x i64>, ptr %x
%red = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> %v)
ret i64 %red
}

declare i64 @llvm.vector.reduce.mul.v2i64(<2 x i64>)

define i64 @vreduce_mul_v2i64(ptr %x) {
; RV32-LABEL: vreduce_mul_v2i64:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT: vle64.v v8, (a0)
; RV32-NEXT: addi a0, a0, 8
; RV32-NEXT: vlse64.v v9, (a0), zero
; RV32-NEXT: vmul.vv v8, v8, v9
; RV32-NEXT: vmv.x.s a0, v8
; RV32-NEXT: li a1, 32
; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT: vsrl.vx v8, v8, a1
; RV32-NEXT: vmv.x.s a1, v8
; RV32-NEXT: ret
;
; RV64-LABEL: vreduce_mul_v2i64:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV64-NEXT: vle64.v v8, (a0)
; RV64-NEXT: ld a0, 8(a0)
; RV64-NEXT: vmul.vx v8, v8, a0
; RV64-NEXT: vmv.x.s a0, v8
; RV64-NEXT: ret
%v = load <2 x i64>, ptr %x
%red = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> %v)
ret i64 %red
}

declare i64 @llvm.vector.reduce.mul.v4i64(<4 x i64>)

define i64 @vreduce_mul_v4i64(ptr %x) {
; RV32-LABEL: vreduce_mul_v4i64:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; RV32-NEXT: vle64.v v8, (a0)
; RV32-NEXT: vslidedown.vi v10, v8, 2
; RV32-NEXT: vmul.vv v8, v8, v10
; RV32-NEXT: vrgather.vi v10, v8, 1
; RV32-NEXT: vmul.vv v8, v8, v10
; RV32-NEXT: vmv.x.s a0, v8
; RV32-NEXT: li a1, 32
; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
; RV32-NEXT: vsrl.vx v8, v8, a1
; RV32-NEXT: vmv.x.s a1, v8
; RV32-NEXT: ret
;
; RV64-LABEL: vreduce_mul_v4i64:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; RV64-NEXT: vle64.v v8, (a0)
; RV64-NEXT: vslidedown.vi v10, v8, 2
; RV64-NEXT: vmul.vv v8, v8, v10
; RV64-NEXT: vrgather.vi v10, v8, 1
; RV64-NEXT: vmul.vv v8, v8, v10
; RV64-NEXT: vmv.x.s a0, v8
; RV64-NEXT: ret
%v = load <4 x i64>, ptr %x
%red = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> %v)
ret i64 %red
}

declare i64 @llvm.vector.reduce.mul.v8i64(<8 x i64>)

define i64 @vreduce_mul_v8i64(ptr %x) {
; RV32-LABEL: vreduce_mul_v8i64:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
; RV32-NEXT: vle64.v v8, (a0)
; RV32-NEXT: vslidedown.vi v12, v8, 4
; RV32-NEXT: vmul.vv v8, v8, v12
; RV32-NEXT: vslidedown.vi v12, v8, 2
; RV32-NEXT: vmul.vv v8, v8, v12
; RV32-NEXT: vrgather.vi v12, v8, 1
; RV32-NEXT: vmul.vv v8, v8, v12
; RV32-NEXT: vmv.x.s a0, v8
; RV32-NEXT: li a1, 32
; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma
; RV32-NEXT: vsrl.vx v8, v8, a1
; RV32-NEXT: vmv.x.s a1, v8
; RV32-NEXT: ret
;
; RV64-LABEL: vreduce_mul_v8i64:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
; RV64-NEXT: vle64.v v8, (a0)
; RV64-NEXT: vslidedown.vi v12, v8, 4
; RV64-NEXT: vmul.vv v8, v8, v12
; RV64-NEXT: vslidedown.vi v12, v8, 2
; RV64-NEXT: vmul.vv v8, v8, v12
; RV64-NEXT: vrgather.vi v12, v8, 1
; RV64-NEXT: vmul.vv v8, v8, v12
; RV64-NEXT: vmv.x.s a0, v8
; RV64-NEXT: ret
%v = load <8 x i64>, ptr %x
%red = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> %v)
ret i64 %red
}

declare i64 @llvm.vector.reduce.mul.v16i64(<16 x i64>)

define i64 @vreduce_mul_v16i64(ptr %x) {
; RV32-LABEL: vreduce_mul_v16i64:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vle64.v v8, (a0)
; RV32-NEXT: vslidedown.vi v16, v8, 8
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vslidedown.vi v16, v8, 4
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vslidedown.vi v16, v8, 2
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vrgather.vi v16, v8, 1
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vmv.x.s a0, v8
; RV32-NEXT: li a1, 32
; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma
; RV32-NEXT: vsrl.vx v8, v8, a1
; RV32-NEXT: vmv.x.s a1, v8
; RV32-NEXT: ret
;
; RV64-LABEL: vreduce_mul_v16i64:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vle64.v v8, (a0)
; RV64-NEXT: vslidedown.vi v16, v8, 8
; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: vslidedown.vi v16, v8, 4
; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: vslidedown.vi v16, v8, 2
; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: vrgather.vi v16, v8, 1
; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: vmv.x.s a0, v8
; RV64-NEXT: ret
%v = load <16 x i64>, ptr %x
%red = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> %v)
ret i64 %red
}

declare i64 @llvm.vector.reduce.mul.v32i64(<32 x i64>)

define i64 @vreduce_mul_v32i64(ptr %x) {
; RV32-LABEL: vreduce_mul_v32i64:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vle64.v v8, (a0)
; RV32-NEXT: addi a0, a0, 128
; RV32-NEXT: vle64.v v16, (a0)
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vslidedown.vi v16, v8, 8
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vslidedown.vi v16, v8, 4
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vslidedown.vi v16, v8, 2
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vrgather.vi v16, v8, 1
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; RV32-NEXT: vmv.x.s a0, v8
; RV32-NEXT: vslidedown.vi v8, v8, 1
; RV32-NEXT: vmv.x.s a1, v8
; RV32-NEXT: ret
;
; RV64-LABEL: vreduce_mul_v32i64:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vle64.v v8, (a0)
; RV64-NEXT: addi a0, a0, 128
; RV64-NEXT: vle64.v v16, (a0)
; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: vslidedown.vi v16, v8, 8
; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: vslidedown.vi v16, v8, 4
; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: vslidedown.vi v16, v8, 2
; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: vrgather.vi v16, v8, 1
; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: vmv.x.s a0, v8
; RV64-NEXT: ret
%v = load <32 x i64>, ptr %x
%red = call i64 @llvm.vector.reduce.mul.v32i64(<32 x i64> %v)
ret i64 %red
}

declare i64 @llvm.vector.reduce.mul.v64i64(<64 x i64>)

define i64 @vreduce_mul_v64i64(ptr %x) nounwind {
; RV32-LABEL: vreduce_mul_v64i64:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vle64.v v8, (a0)
; RV32-NEXT: addi a1, a0, 384
; RV32-NEXT: vle64.v v16, (a1)
; RV32-NEXT: addi a1, a0, 256
; RV32-NEXT: addi a0, a0, 128
; RV32-NEXT: vle64.v v24, (a0)
; RV32-NEXT: vle64.v v0, (a1)
; RV32-NEXT: vmul.vv v16, v24, v16
; RV32-NEXT: vmul.vv v8, v8, v0
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vslidedown.vi v16, v8, 8
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vslidedown.vi v16, v8, 4
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vslidedown.vi v16, v8, 2
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vrgather.vi v16, v8, 1
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; RV32-NEXT: vmv.x.s a0, v8
; RV32-NEXT: vslidedown.vi v8, v8, 1
; RV32-NEXT: vmv.x.s a1, v8
; RV32-NEXT: ret
;
; RV64-LABEL: vreduce_mul_v64i64:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vle64.v v8, (a0)
; RV64-NEXT: addi a1, a0, 384
; RV64-NEXT: vle64.v v16, (a1)
; RV64-NEXT: addi a1, a0, 256
; RV64-NEXT: addi a0, a0, 128
; RV64-NEXT: vle64.v v24, (a0)
; RV64-NEXT: vle64.v v0, (a1)
; RV64-NEXT: vmul.vv v16, v24, v16
; RV64-NEXT: vmul.vv v8, v8, v0
; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: vslidedown.vi v16, v8, 8
; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: vslidedown.vi v16, v8, 4
; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: vslidedown.vi v16, v8, 2
; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: vrgather.vi v16, v8, 1
; RV64-NEXT: vmul.vv v8, v8, v16
; RV64-NEXT: vmv.x.s a0, v8
; RV64-NEXT: ret
%v = load <64 x i64>, ptr %x
%red = call i64 @llvm.vector.reduce.mul.v64i64(<64 x i64> %v)