; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+v,+m,+zbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
; RUN: llc -mtriple=riscv64 -mattr=+v,+m,+zbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64

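; The tests below build reductions out of explicit extractelement + scalar-op
; chains and check that they are matched into a single vector reduction
; instruction (vredsum.vs, vredxor.vs, vredand.vs, vredor.vs, vredmax.vs,
; vredmin.vs, vredmaxu.vs, vredminu.vs, vfredusum.vs) rather than being
; lowered element by element.
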
define i32 @reduce_sum_2xi32(<2 x i32> %v) {
; CHECK-LABEL: reduce_sum_2xi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %e0 = extractelement <2 x i32> %v, i32 0
  %e1 = extractelement <2 x i32> %v, i32 1
  %add0 = add i32 %e0, %e1
  ret i32 %add0
}

define i32 @reduce_sum_4xi32(<4 x i32> %v) {
; CHECK-LABEL: reduce_sum_4xi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %e0 = extractelement <4 x i32> %v, i32 0
  %e1 = extractelement <4 x i32> %v, i32 1
  %e2 = extractelement <4 x i32> %v, i32 2
  %e3 = extractelement <4 x i32> %v, i32 3
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  ret i32 %add2
}

define i32 @reduce_sum_8xi32(<8 x i32> %v) {
; CHECK-LABEL: reduce_sum_8xi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %e0 = extractelement <8 x i32> %v, i32 0
  %e1 = extractelement <8 x i32> %v, i32 1
  %e2 = extractelement <8 x i32> %v, i32 2
  %e3 = extractelement <8 x i32> %v, i32 3
  %e4 = extractelement <8 x i32> %v, i32 4
  %e5 = extractelement <8 x i32> %v, i32 5
  %e6 = extractelement <8 x i32> %v, i32 6
  %e7 = extractelement <8 x i32> %v, i32 7
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  %add6 = add i32 %add5, %e7
  ret i32 %add6
}

define i32 @reduce_sum_16xi32(<16 x i32> %v) {
; CHECK-LABEL: reduce_sum_16xi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v12
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %e6 = extractelement <16 x i32> %v, i32 6
  %e7 = extractelement <16 x i32> %v, i32 7
  %e8 = extractelement <16 x i32> %v, i32 8
  %e9 = extractelement <16 x i32> %v, i32 9
  %e10 = extractelement <16 x i32> %v, i32 10
  %e11 = extractelement <16 x i32> %v, i32 11
  %e12 = extractelement <16 x i32> %v, i32 12
  %e13 = extractelement <16 x i32> %v, i32 13
  %e14 = extractelement <16 x i32> %v, i32 14
  %e15 = extractelement <16 x i32> %v, i32 15
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  %add6 = add i32 %add5, %e7
  %add7 = add i32 %add6, %e8
  %add8 = add i32 %add7, %e9
  %add9 = add i32 %add8, %e10
  %add10 = add i32 %add9, %e11
  %add11 = add i32 %add10, %e12
  %add12 = add i32 %add11, %e13
  %add13 = add i32 %add12, %e14
  %add14 = add i32 %add13, %e15
  ret i32 %add14
}

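; The reduce_sum_16xi32_prefixN tests sum only the first N elements of a
; loaded <16 x i32>. When N is not a power of two, the expected code either
; slides a zero into the lane just past the prefix or masks the tail lanes to
; zero before performing a full-width vredsum.
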
define i32 @reduce_sum_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %add0 = add i32 %e0, %e1
  ret i32 %add0
}

define i32 @reduce_sum_16xi32_prefix3(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vslideup.vi v8, v9, 3
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  ret i32 %add1
}

define i32 @reduce_sum_16xi32_prefix4(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  ret i32 %add2
}

define i32 @reduce_sum_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, -32
; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT:    vmv.s.x v0, a1
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.v.i v10, -1
; CHECK-NEXT:    vmerge.vim v10, v10, 0, v0
; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
; CHECK-NEXT:    vsext.vf4 v12, v10
; CHECK-NEXT:    vand.vv v8, v8, v12
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  ret i32 %add3
}

define i32 @reduce_sum_16xi32_prefix6(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 192
; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT:    vmv.s.x v0, a1
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.v.i v10, -1
; CHECK-NEXT:    vmerge.vim v10, v10, 0, v0
; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
; CHECK-NEXT:    vsext.vf4 v12, v10
; CHECK-NEXT:    vand.vv v8, v8, v12
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  ret i32 %add4
}

define i32 @reduce_sum_16xi32_prefix7(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix7:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vslideup.vi v8, v10, 7
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %e6 = extractelement <16 x i32> %v, i32 6
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  ret i32 %add5
}

define i32 @reduce_sum_16xi32_prefix8(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %e6 = extractelement <16 x i32> %v, i32 6
  %e7 = extractelement <16 x i32> %v, i32 7
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  %add6 = add i32 %add5, %e7
  ret i32 %add6
}

define i32 @reduce_sum_16xi32_prefix9(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix9:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    li a0, -512
; CHECK-NEXT:    vmv.s.x v0, a0
; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
; CHECK-NEXT:    vmv.v.i v12, -1
; CHECK-NEXT:    vmerge.vim v12, v12, 0, v0
; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
; CHECK-NEXT:    vsext.vf4 v16, v12
; CHECK-NEXT:    vand.vv v8, v8, v16
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v12
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %e6 = extractelement <16 x i32> %v, i32 6
  %e7 = extractelement <16 x i32> %v, i32 7
  %e8 = extractelement <16 x i32> %v, i32 8
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  %add6 = add i32 %add5, %e7
  %add7 = add i32 %add6, %e8
  ret i32 %add7
}

define i32 @reduce_sum_16xi32_prefix13(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix13:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    lui a0, 14
; CHECK-NEXT:    vmv.s.x v0, a0
; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
; CHECK-NEXT:    vmv.v.i v12, -1
; CHECK-NEXT:    vmerge.vim v12, v12, 0, v0
; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
; CHECK-NEXT:    vsext.vf4 v16, v12
; CHECK-NEXT:    vand.vv v8, v8, v16
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v12
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %e6 = extractelement <16 x i32> %v, i32 6
  %e7 = extractelement <16 x i32> %v, i32 7
  %e8 = extractelement <16 x i32> %v, i32 8
  %e9 = extractelement <16 x i32> %v, i32 9
  %e10 = extractelement <16 x i32> %v, i32 10
  %e11 = extractelement <16 x i32> %v, i32 11
  %e12 = extractelement <16 x i32> %v, i32 12
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  %add6 = add i32 %add5, %e7
  %add7 = add i32 %add6, %e8
  %add8 = add i32 %add7, %e9
  %add9 = add i32 %add8, %e10
  %add10 = add i32 %add9, %e11
  %add11 = add i32 %add10, %e12
  ret i32 %add11
}

define i32 @reduce_sum_16xi32_prefix14(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix14:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    lui a0, 12
; CHECK-NEXT:    vmv.s.x v0, a0
; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
; CHECK-NEXT:    vmv.v.i v12, -1
; CHECK-NEXT:    vmerge.vim v12, v12, 0, v0
; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
; CHECK-NEXT:    vsext.vf4 v16, v12
; CHECK-NEXT:    vand.vv v8, v8, v16
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v12
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %e6 = extractelement <16 x i32> %v, i32 6
  %e7 = extractelement <16 x i32> %v, i32 7
  %e8 = extractelement <16 x i32> %v, i32 8
  %e9 = extractelement <16 x i32> %v, i32 9
  %e10 = extractelement <16 x i32> %v, i32 10
  %e11 = extractelement <16 x i32> %v, i32 11
  %e12 = extractelement <16 x i32> %v, i32 12
  %e13 = extractelement <16 x i32> %v, i32 13
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  %add6 = add i32 %add5, %e7
  %add7 = add i32 %add6, %e8
  %add8 = add i32 %add7, %e9
  %add9 = add i32 %add8, %e10
  %add10 = add i32 %add9, %e11
  %add11 = add i32 %add10, %e12
  %add12 = add i32 %add11, %e13
  ret i32 %add12
}

define i32 @reduce_sum_16xi32_prefix15(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix15:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vslideup.vi v8, v12, 15
; CHECK-NEXT:    vredsum.vs v8, v8, v12
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %e5 = extractelement <16 x i32> %v, i32 5
  %e6 = extractelement <16 x i32> %v, i32 6
  %e7 = extractelement <16 x i32> %v, i32 7
  %e8 = extractelement <16 x i32> %v, i32 8
  %e9 = extractelement <16 x i32> %v, i32 9
  %e10 = extractelement <16 x i32> %v, i32 10
  %e11 = extractelement <16 x i32> %v, i32 11
  %e12 = extractelement <16 x i32> %v, i32 12
  %e13 = extractelement <16 x i32> %v, i32 13
  %e14 = extractelement <16 x i32> %v, i32 14
  %add0 = add i32 %e0, %e1
  %add1 = add i32 %add0, %e2
  %add2 = add i32 %add1, %e3
  %add3 = add i32 %add2, %e4
  %add4 = add i32 %add3, %e5
  %add5 = add i32 %add4, %e6
  %add6 = add i32 %add5, %e7
  %add7 = add i32 %add6, %e8
  %add8 = add i32 %add7, %e9
  %add9 = add i32 %add8, %e10
  %add10 = add i32 %add9, %e11
  %add11 = add i32 %add10, %e12
  %add12 = add i32 %add11, %e13
  %add13 = add i32 %add12, %e14
  ret i32 %add13
}

; Check that we can match with the operand order reversed, but the
; reduction order unchanged.
define i32 @reduce_sum_4xi32_op_order(<4 x i32> %v) {
; CHECK-LABEL: reduce_sum_4xi32_op_order:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %e0 = extractelement <4 x i32> %v, i32 0
  %e1 = extractelement <4 x i32> %v, i32 1
  %e2 = extractelement <4 x i32> %v, i32 2
  %e3 = extractelement <4 x i32> %v, i32 3
  %add0 = add i32 %e1, %e0
  %add1 = add i32 %e2, %add0
  %add2 = add i32 %add1, %e3
  ret i32 %add2
}

; Negative test - Reduction order isn't compatible with the current
; incremental matching scheme.
define i32 @reduce_sum_4xi32_reduce_order(<4 x i32> %v) {
; RV32-LABEL: reduce_sum_4xi32_reduce_order:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    vslidedown.vi v9, v8, 1
; RV32-NEXT:    vmv.x.s a1, v9
; RV32-NEXT:    vslidedown.vi v9, v8, 2
; RV32-NEXT:    vmv.x.s a2, v9
; RV32-NEXT:    vslidedown.vi v8, v8, 3
; RV32-NEXT:    vmv.x.s a3, v8
; RV32-NEXT:    add a1, a1, a2
; RV32-NEXT:    add a0, a0, a3
; RV32-NEXT:    add a0, a0, a1
; RV32-NEXT:    ret
;
; RV64-LABEL: reduce_sum_4xi32_reduce_order:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    vslidedown.vi v9, v8, 1
; RV64-NEXT:    vmv.x.s a1, v9
; RV64-NEXT:    vslidedown.vi v9, v8, 2
; RV64-NEXT:    vmv.x.s a2, v9
; RV64-NEXT:    vslidedown.vi v8, v8, 3
; RV64-NEXT:    vmv.x.s a3, v8
; RV64-NEXT:    add a1, a1, a2
; RV64-NEXT:    add a0, a0, a3
; RV64-NEXT:    addw a0, a0, a1
; RV64-NEXT:    ret
  %e0 = extractelement <4 x i32> %v, i32 0
  %e1 = extractelement <4 x i32> %v, i32 1
  %e2 = extractelement <4 x i32> %v, i32 2
  %e3 = extractelement <4 x i32> %v, i32 3
  %add0 = add i32 %e1, %e2
  %add1 = add i32 %e0, %add0
  %add2 = add i32 %add1, %e3
  ret i32 %add2
}

;; Most of the corner cases are exercised above; the following just
;; makes sure that other opcodes work as expected.

define i32 @reduce_xor_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_xor_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredxor.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %xor0 = xor i32 %e0, %e1
  ret i32 %xor0
}

define i32 @reduce_xor_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_xor_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, -32
; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT:    vmv.s.x v0, a1
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.v.i v10, -1
; CHECK-NEXT:    vmerge.vim v10, v10, 0, v0
; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
; CHECK-NEXT:    vsext.vf4 v12, v10
; CHECK-NEXT:    vand.vv v8, v8, v12
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredxor.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %xor0 = xor i32 %e0, %e1
  %xor1 = xor i32 %xor0, %e2
  %xor2 = xor i32 %xor1, %e3
  %xor3 = xor i32 %xor2, %e4
  ret i32 %xor3
}

define i32 @reduce_and_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_and_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vredand.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %and0 = and i32 %e0, %e1
  ret i32 %and0
}

define i32 @reduce_and_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_and_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT:    vmv.v.i v10, -1
; CHECK-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
; CHECK-NEXT:    vslideup.vi v8, v10, 5
; CHECK-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
; CHECK-NEXT:    vslideup.vi v8, v10, 6
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vslideup.vi v8, v10, 7
; CHECK-NEXT:    vredand.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %and0 = and i32 %e0, %e1
  %and1 = and i32 %and0, %e2
  %and2 = and i32 %and1, %e3
  %and3 = and i32 %and2, %e4
  ret i32 %and3
}

define i32 @reduce_or_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_or_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vredor.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %or0 = or i32 %e0, %e1
  ret i32 %or0
}

define i32 @reduce_or_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_or_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, -32
; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT:    vmv.s.x v0, a1
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.v.i v10, -1
; CHECK-NEXT:    vmerge.vim v10, v10, 0, v0
; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
; CHECK-NEXT:    vsext.vf4 v12, v10
; CHECK-NEXT:    vand.vv v8, v8, v12
; CHECK-NEXT:    vredor.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %or0 = or i32 %e0, %e1
  %or1 = or i32 %or0, %e2
  %or2 = or i32 %or1, %e3
  %or3 = or i32 %or2, %e4
  ret i32 %or3
}

declare i32 @llvm.smax.i32(i32 %a, i32 %b)
declare i32 @llvm.smin.i32(i32 %a, i32 %b)
declare i32 @llvm.umax.i32(i32 %a, i32 %b)
declare i32 @llvm.umin.i32(i32 %a, i32 %b)

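; For the min/max reductions below, the lanes past the prefix are expected to
; be filled with the identity value of the operation (INT_MIN for smax,
; INT_MAX for smin, zero for umax, all-ones for umin) before a full-width
; reduction.
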
define i32 @reduce_smax_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_smax_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vredmax.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %smax0 = call i32 @llvm.smax.i32(i32 %e0, i32 %e1)
  ret i32 %smax0
}

define i32 @reduce_smax_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_smax_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    lui a0, 524288
; CHECK-NEXT:    vmv.s.x v10, a0
; CHECK-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
; CHECK-NEXT:    vslideup.vi v8, v10, 5
; CHECK-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
; CHECK-NEXT:    vslideup.vi v8, v10, 6
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vslideup.vi v8, v10, 7
; CHECK-NEXT:    vredmax.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %smax0 = call i32 @llvm.smax.i32(i32 %e0, i32 %e1)
  %smax1 = call i32 @llvm.smax.i32(i32 %smax0, i32 %e2)
  %smax2 = call i32 @llvm.smax.i32(i32 %smax1, i32 %e3)
  %smax3 = call i32 @llvm.smax.i32(i32 %smax2, i32 %e4)
  ret i32 %smax3
}

define i32 @reduce_smin_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_smin_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vredmin.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %smin0 = call i32 @llvm.smin.i32(i32 %e0, i32 %e1)
  ret i32 %smin0
}

define i32 @reduce_smin_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_smin_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    lui a1, 524288
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    addi a1, a1, -1
; CHECK-NEXT:    vmv.s.x v10, a1
; CHECK-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
; CHECK-NEXT:    vslideup.vi v8, v10, 5
; CHECK-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
; CHECK-NEXT:    vslideup.vi v8, v10, 6
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vslideup.vi v8, v10, 7
; CHECK-NEXT:    vredmin.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %smin0 = call i32 @llvm.smin.i32(i32 %e0, i32 %e1)
  %smin1 = call i32 @llvm.smin.i32(i32 %smin0, i32 %e2)
  %smin2 = call i32 @llvm.smin.i32(i32 %smin1, i32 %e3)
  %smin3 = call i32 @llvm.smin.i32(i32 %smin2, i32 %e4)
  ret i32 %smin3
}

define i32 @reduce_umax_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_umax_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %umax0 = call i32 @llvm.umax.i32(i32 %e0, i32 %e1)
  ret i32 %umax0
}

define i32 @reduce_umax_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_umax_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, -32
; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT:    vmv.s.x v0, a1
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.v.i v10, -1
; CHECK-NEXT:    vmerge.vim v10, v10, 0, v0
; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
; CHECK-NEXT:    vsext.vf4 v12, v10
; CHECK-NEXT:    vand.vv v8, v8, v12
; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %umax0 = call i32 @llvm.umax.i32(i32 %e0, i32 %e1)
  %umax1 = call i32 @llvm.umax.i32(i32 %umax0, i32 %e2)
  %umax2 = call i32 @llvm.umax.i32(i32 %umax1, i32 %e3)
  %umax3 = call i32 @llvm.umax.i32(i32 %umax2, i32 %e4)
  ret i32 %umax3
}

define i32 @reduce_umin_16xi32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_umin_16xi32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vredminu.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %umin0 = call i32 @llvm.umin.i32(i32 %e0, i32 %e1)
  ret i32 %umin0
}

define i32 @reduce_umin_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_umin_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT:    vmv.v.i v10, -1
; CHECK-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
; CHECK-NEXT:    vslideup.vi v8, v10, 5
; CHECK-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
; CHECK-NEXT:    vslideup.vi v8, v10, 6
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vslideup.vi v8, v10, 7
; CHECK-NEXT:    vredminu.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %p, align 256
  %e0 = extractelement <16 x i32> %v, i32 0
  %e1 = extractelement <16 x i32> %v, i32 1
  %e2 = extractelement <16 x i32> %v, i32 2
  %e3 = extractelement <16 x i32> %v, i32 3
  %e4 = extractelement <16 x i32> %v, i32 4
  %umin0 = call i32 @llvm.umin.i32(i32 %e0, i32 %e1)
  %umin1 = call i32 @llvm.umin.i32(i32 %umin0, i32 %e2)
  %umin2 = call i32 @llvm.umin.i32(i32 %umin1, i32 %e3)
  %umin3 = call i32 @llvm.umin.i32(i32 %umin2, i32 %e4)
  ret i32 %umin3
}

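; Floating point reduction formation additionally requires the fadd chain to
; allow reassociation; the tests further below exercise which fast-math flag
; combinations allow the transform.
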
define float @reduce_fadd_16xf32_prefix2(ptr %p) {
; CHECK-LABEL: reduce_fadd_16xf32_prefix2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vfredusum.vs v8, v8, v9
; CHECK-NEXT:    vfmv.f.s fa0, v8
; CHECK-NEXT:    ret
  %v = load <16 x float>, ptr %p, align 256
  %e0 = extractelement <16 x float> %v, i32 0
  %e1 = extractelement <16 x float> %v, i32 1
  %fadd0 = fadd fast float %e0, %e1
  ret float %fadd0
}

define float @reduce_fadd_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_fadd_16xi32_prefix5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    lui a0, 524288
; CHECK-NEXT:    vmv.s.x v10, a0
; CHECK-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
; CHECK-NEXT:    vslideup.vi v8, v10, 5
; CHECK-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
; CHECK-NEXT:    vslideup.vi v8, v10, 6
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vslideup.vi v8, v10, 7
; CHECK-NEXT:    vfredusum.vs v8, v8, v10
; CHECK-NEXT:    vfmv.f.s fa0, v8
; CHECK-NEXT:    ret
  %v = load <16 x float>, ptr %p, align 256
  %e0 = extractelement <16 x float> %v, i32 0
  %e1 = extractelement <16 x float> %v, i32 1
  %e2 = extractelement <16 x float> %v, i32 2
  %e3 = extractelement <16 x float> %v, i32 3
  %e4 = extractelement <16 x float> %v, i32 4
  %fadd0 = fadd fast float %e0, %e1
  %fadd1 = fadd fast float %fadd0, %e2
  %fadd2 = fadd fast float %fadd1, %e3
  %fadd3 = fadd fast float %fadd2, %e4
  ret float %fadd3
}

;; Corner case tests for fadd associativity

; Negative test, not associative. Would need strict opcode.
define float @reduce_fadd_2xf32_non_associative(ptr %p) {
; CHECK-LABEL: reduce_fadd_2xf32_non_associative:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vfmv.f.s fa5, v8
; CHECK-NEXT:    vslidedown.vi v8, v8, 1
; CHECK-NEXT:    vfmv.f.s fa4, v8
; CHECK-NEXT:    fadd.s fa0, fa5, fa4
; CHECK-NEXT:    ret
  %v = load <2 x float>, ptr %p, align 256
  %e0 = extractelement <2 x float> %v, i32 0
  %e1 = extractelement <2 x float> %v, i32 1
  %fadd0 = fadd float %e0, %e1
  ret float %fadd0
}

; Positive test - minimal set of fast math flags
define float @reduce_fadd_2xf32_reassoc_only(ptr %p) {
; CHECK-LABEL: reduce_fadd_2xf32_reassoc_only:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    lui a0, 524288
; CHECK-NEXT:    vmv.s.x v9, a0
; CHECK-NEXT:    vfredusum.vs v8, v8, v9
; CHECK-NEXT:    vfmv.f.s fa0, v8
; CHECK-NEXT:    ret
  %v = load <2 x float>, ptr %p, align 256
  %e0 = extractelement <2 x float> %v, i32 0
  %e1 = extractelement <2 x float> %v, i32 1
  %fadd0 = fadd reassoc float %e0, %e1
  ret float %fadd0
}

; Negative test - wrong fast math flag.
define float @reduce_fadd_2xf32_ninf_only(ptr %p) {
; CHECK-LABEL: reduce_fadd_2xf32_ninf_only:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vfmv.f.s fa5, v8
; CHECK-NEXT:    vslidedown.vi v8, v8, 1
; CHECK-NEXT:    vfmv.f.s fa4, v8
; CHECK-NEXT:    fadd.s fa0, fa5, fa4
; CHECK-NEXT:    ret
  %v = load <2 x float>, ptr %p, align 256
  %e0 = extractelement <2 x float> %v, i32 0
  %e1 = extractelement <2 x float> %v, i32 1
  %fadd0 = fadd ninf float %e0, %e1
  ret float %fadd0
}

; Negative test - last fadd is not associative
define float @reduce_fadd_4xi32_non_associative(ptr %p) {
; CHECK-LABEL: reduce_fadd_4xi32_non_associative:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vslidedown.vi v9, v8, 3
; CHECK-NEXT:    vfmv.f.s fa5, v9
; CHECK-NEXT:    lui a0, 524288
; CHECK-NEXT:    vmv.s.x v9, a0
; CHECK-NEXT:    vslideup.vi v8, v9, 3
; CHECK-NEXT:    vfredusum.vs v8, v8, v9
; CHECK-NEXT:    vfmv.f.s fa4, v8
; CHECK-NEXT:    fadd.s fa0, fa4, fa5
; CHECK-NEXT:    ret
  %v = load <4 x float>, ptr %p, align 256
  %e0 = extractelement <4 x float> %v, i32 0
  %e1 = extractelement <4 x float> %v, i32 1
  %e2 = extractelement <4 x float> %v, i32 2
  %e3 = extractelement <4 x float> %v, i32 3
  %fadd0 = fadd fast float %e0, %e1
  %fadd1 = fadd fast float %fadd0, %e2
  %fadd2 = fadd float %fadd1, %e3
  ret float %fadd2
}

; Negative test - first fadd is not associative
; We could form a reduce for elements 2 and 3.
define float @reduce_fadd_4xi32_non_associative2(ptr %p) {
; CHECK-LABEL: reduce_fadd_4xi32_non_associative2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vfmv.f.s fa5, v8
; CHECK-NEXT:    vslidedown.vi v9, v8, 1
; CHECK-NEXT:    vfmv.f.s fa4, v9
; CHECK-NEXT:    vslidedown.vi v9, v8, 2
; CHECK-NEXT:    vfmv.f.s fa3, v9
; CHECK-NEXT:    vslidedown.vi v8, v8, 3
; CHECK-NEXT:    vfmv.f.s fa2, v8
; CHECK-NEXT:    fadd.s fa5, fa5, fa4
; CHECK-NEXT:    fadd.s fa4, fa3, fa2
; CHECK-NEXT:    fadd.s fa0, fa5, fa4
; CHECK-NEXT:    ret
  %v = load <4 x float>, ptr %p, align 256
  %e0 = extractelement <4 x float> %v, i32 0
  %e1 = extractelement <4 x float> %v, i32 1
  %e2 = extractelement <4 x float> %v, i32 2
  %e3 = extractelement <4 x float> %v, i32 3
  %fadd0 = fadd float %e0, %e1
  %fadd1 = fadd fast float %fadd0, %e2
  %fadd2 = fadd fast float %fadd1, %e3
  ret float %fadd2
}