1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
3 ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
4 ; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
5 ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
; add_f32: two `fast` llvm.vector.reduce.fadd calls (start value -0.0) over %a
; and %b, summed with a `fast` fadd. The expected output adds the two vectors
; once (vfadd.vv) and then performs a single vfredusum.vs.
7 define float @add_f32(<4 x float> %a, <4 x float> %b) {
8 ; CHECK-LABEL: add_f32:
10 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
11 ; CHECK-NEXT: vfadd.vv v8, v8, v9
12 ; CHECK-NEXT: vmv.s.x v9, zero
13 ; CHECK-NEXT: vfredusum.vs v8, v8, v9
14 ; CHECK-NEXT: vfmv.f.s fa0, v8
16 %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
17 %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
18 %r = fadd fast float %r1, %r2
; fmul_f32: two `fast` llvm.vector.reduce.fmul calls (start value 1.0) over %a
; and %b, multiplied with a `fast` fmul. In the expected output each vector is
; reduced on its own with vslidedown.vi / vrgather.vi + vfmul.vv steps, and the
; two scalar results are multiplied with fmul.s.
22 define float @fmul_f32(<4 x float> %a, <4 x float> %b) {
23 ; CHECK-LABEL: fmul_f32:
25 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
26 ; CHECK-NEXT: vslidedown.vi v10, v8, 2
27 ; CHECK-NEXT: vfmul.vv v8, v8, v10
28 ; CHECK-NEXT: vrgather.vi v10, v8, 1
29 ; CHECK-NEXT: vfmul.vv v8, v8, v10
30 ; CHECK-NEXT: vfmv.f.s fa5, v8
31 ; CHECK-NEXT: vslidedown.vi v8, v9, 2
32 ; CHECK-NEXT: vfmul.vv v8, v9, v8
33 ; CHECK-NEXT: vrgather.vi v9, v8, 1
34 ; CHECK-NEXT: vfmul.vv v8, v8, v9
35 ; CHECK-NEXT: vfmv.f.s fa4, v8
36 ; CHECK-NEXT: fmul.s fa0, fa5, fa4
38 %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
39 %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b)
40 %r = fmul fast float %r1, %r2
; fmin_f32: two `fast` llvm.vector.reduce.fmin calls over %a and %b, combined
; through llvm.minnum.f32 (the minnum call itself carries no fast-math flags).
; The expected output takes the element-wise minimum once (vfmin.vv) and then
; performs a single vfredmin.vs with v8 as both vector and scalar operand.
45 define float @fmin_f32(<4 x float> %a, <4 x float> %b) {
46 ; CHECK-LABEL: fmin_f32:
48 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
49 ; CHECK-NEXT: vfmin.vv v8, v8, v9
50 ; CHECK-NEXT: vfredmin.vs v8, v8, v8
51 ; CHECK-NEXT: vfmv.f.s fa0, v8
53 %r1 = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a)
54 %r2 = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %b)
55 %r = call float @llvm.minnum.f32(float %r1, float %r2)
; fmax_f32: two `fast` llvm.vector.reduce.fmax calls over %a and %b, combined
; through llvm.maxnum.f32 (the maxnum call itself carries no fast-math flags).
; The expected output takes the element-wise maximum once (vfmax.vv) and then
; performs a single vfredmax.vs with v8 as both vector and scalar operand.
59 define float @fmax_f32(<4 x float> %a, <4 x float> %b) {
60 ; CHECK-LABEL: fmax_f32:
62 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
63 ; CHECK-NEXT: vfmax.vv v8, v8, v9
64 ; CHECK-NEXT: vfredmax.vs v8, v8, v8
65 ; CHECK-NEXT: vfmv.f.s fa0, v8
67 %r1 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
68 %r2 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %b)
69 %r = call float @llvm.maxnum.f32(float %r1, float %r2)
; add_i32: two llvm.vector.reduce.add calls over the <4 x i32> inputs %a and %b.
; The expected output adds the two vectors once (vadd.vv) and then performs a
; single vredsum.vs whose scalar operand v9 is written from `zero` by vmv.s.x.
74 define i32 @add_i32(<4 x i32> %a, <4 x i32> %b) {
75 ; CHECK-LABEL: add_i32:
77 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
78 ; CHECK-NEXT: vadd.vv v8, v8, v9
79 ; CHECK-NEXT: vmv.s.x v9, zero
80 ; CHECK-NEXT: vredsum.vs v8, v8, v9
81 ; CHECK-NEXT: vmv.x.s a0, v8
83 %r1 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %a)
84 %r2 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %b)
; add_ext_i16: both <16 x i8> inputs are zero-extended to <16 x i16>, each is
; reduced with llvm.vector.reduce.add, and the two i16 results are added.
; The expected output performs one widening add at e8 (vwaddu.vv) and then a
; single vredsum.vs at e16/m2 on the widened result.
89 define i16 @add_ext_i16(<16 x i8> %a, <16 x i8> %b) {
90 ; CHECK-LABEL: add_ext_i16:
92 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
93 ; CHECK-NEXT: vwaddu.vv v10, v8, v9
94 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
95 ; CHECK-NEXT: vmv.s.x v8, zero
96 ; CHECK-NEXT: vredsum.vs v8, v10, v8
97 ; CHECK-NEXT: vmv.x.s a0, v8
99 %ae = zext <16 x i8> %a to <16 x i16>
100 %be = zext <16 x i8> %b to <16 x i16>
101 %r1 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %ae)
102 %r2 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %be)
103 %r = add i16 %r1, %r2
; add_ext_v32i16: like add_ext_i16, but the inputs have different element
; counts (%a is <32 x i8>, %b is <16 x i8>). Each is zero-extended to i16
; elements and reduced with llvm.vector.reduce.add; the results are added.
; The expected output chains two vwredsumu.vs instructions: the first (vl=16)
; reduces v10 with v11 (written from `zero`) as scalar operand, and its result
; in v10 is the scalar operand of the second (vl=32) over v8.
107 define i16 @add_ext_v32i16(<32 x i8> %a, <16 x i8> %b) {
108 ; CHECK-LABEL: add_ext_v32i16:
110 ; CHECK-NEXT: vsetivli zero, 16, e16, m1, ta, ma
111 ; CHECK-NEXT: vmv.s.x v11, zero
112 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
113 ; CHECK-NEXT: vwredsumu.vs v10, v10, v11
114 ; CHECK-NEXT: li a0, 32
115 ; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
116 ; CHECK-NEXT: vwredsumu.vs v8, v8, v10
117 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
118 ; CHECK-NEXT: vmv.x.s a0, v8
120 %ae = zext <32 x i8> %a to <32 x i16>
121 %be = zext <16 x i8> %b to <16 x i16>
122 %r1 = call i16 @llvm.vector.reduce.add.i16.v32i16(<32 x i16> %ae)
123 %r2 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %be)
124 %r = add i16 %r1, %r2
; mul_i32: two llvm.vector.reduce.mul calls over %a and %b, multiplied with a
; scalar mul. In the expected output each vector is reduced on its own with
; vslidedown.vi / vrgather.vi + vmul.vv steps. The riscv32 and riscv64
; expectations are listed separately; in the lines shown they differ only in
; the final scalar multiply (`mul` versus `mulw`).
128 define i32 @mul_i32(<4 x i32> %a, <4 x i32> %b) {
129 ; RV32-LABEL: mul_i32:
131 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
132 ; RV32-NEXT: vslidedown.vi v10, v8, 2
133 ; RV32-NEXT: vmul.vv v8, v8, v10
134 ; RV32-NEXT: vrgather.vi v10, v8, 1
135 ; RV32-NEXT: vmul.vv v8, v8, v10
136 ; RV32-NEXT: vmv.x.s a0, v8
137 ; RV32-NEXT: vslidedown.vi v8, v9, 2
138 ; RV32-NEXT: vmul.vv v8, v9, v8
139 ; RV32-NEXT: vrgather.vi v9, v8, 1
140 ; RV32-NEXT: vmul.vv v8, v8, v9
141 ; RV32-NEXT: vmv.x.s a1, v8
142 ; RV32-NEXT: mul a0, a0, a1
145 ; RV64-LABEL: mul_i32:
147 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
148 ; RV64-NEXT: vslidedown.vi v10, v8, 2
149 ; RV64-NEXT: vmul.vv v8, v8, v10
150 ; RV64-NEXT: vrgather.vi v10, v8, 1
151 ; RV64-NEXT: vmul.vv v8, v8, v10
152 ; RV64-NEXT: vmv.x.s a0, v8
153 ; RV64-NEXT: vslidedown.vi v8, v9, 2
154 ; RV64-NEXT: vmul.vv v8, v9, v8
155 ; RV64-NEXT: vrgather.vi v9, v8, 1
156 ; RV64-NEXT: vmul.vv v8, v8, v9
157 ; RV64-NEXT: vmv.x.s a1, v8
158 ; RV64-NEXT: mulw a0, a0, a1
160 %r1 = call i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32> %a)
161 %r2 = call i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32> %b)
162 %r = mul i32 %r1, %r2
; and_i32: two llvm.vector.reduce.and calls over %a and %b, combined with a
; scalar `and`. The expected output ANDs the two vectors once (vand.vv) and
; then performs a single vredand.vs with v8 as both vector and scalar operand.
166 define i32 @and_i32(<4 x i32> %a, <4 x i32> %b) {
167 ; CHECK-LABEL: and_i32:
169 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
170 ; CHECK-NEXT: vand.vv v8, v8, v9
171 ; CHECK-NEXT: vredand.vs v8, v8, v8
172 ; CHECK-NEXT: vmv.x.s a0, v8
174 %r1 = call i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32> %a)
175 %r2 = call i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32> %b)
176 %r = and i32 %r1, %r2
; or_i32: two llvm.vector.reduce.or calls over the <4 x i32> inputs %a and %b.
; The expected output ORs the two vectors once (vor.vv) and then performs a
; single vredor.vs with v8 as both vector and scalar operand.
180 define i32 @or_i32(<4 x i32> %a, <4 x i32> %b) {
181 ; CHECK-LABEL: or_i32:
183 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
184 ; CHECK-NEXT: vor.vv v8, v8, v9
185 ; CHECK-NEXT: vredor.vs v8, v8, v8
186 ; CHECK-NEXT: vmv.x.s a0, v8
188 %r1 = call i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32> %a)
189 %r2 = call i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32> %b)
; xor_i32: two llvm.vector.reduce.xor calls over %a and %b, combined with a
; scalar `xor`. The expected output XORs the two vectors once (vxor.vv) and
; then performs a single vredxor.vs whose scalar operand v9 is written from
; `zero` by vmv.s.x.
194 define i32 @xor_i32(<4 x i32> %a, <4 x i32> %b) {
195 ; CHECK-LABEL: xor_i32:
197 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
198 ; CHECK-NEXT: vxor.vv v8, v8, v9
199 ; CHECK-NEXT: vmv.s.x v9, zero
200 ; CHECK-NEXT: vredxor.vs v8, v8, v9
201 ; CHECK-NEXT: vmv.x.s a0, v8
203 %r1 = call i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32> %a)
204 %r2 = call i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32> %b)
205 %r = xor i32 %r1, %r2
; umin_i32: two llvm.vector.reduce.umin calls over %a and %b, combined through
; llvm.umin.i32. The expected output takes the element-wise unsigned minimum
; once (vminu.vv) and then performs a single vredminu.vs with v8 as both
; vector and scalar operand.
209 define i32 @umin_i32(<4 x i32> %a, <4 x i32> %b) {
210 ; CHECK-LABEL: umin_i32:
212 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
213 ; CHECK-NEXT: vminu.vv v8, v8, v9
214 ; CHECK-NEXT: vredminu.vs v8, v8, v8
215 ; CHECK-NEXT: vmv.x.s a0, v8
217 %r1 = call i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32> %a)
218 %r2 = call i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32> %b)
219 %r = call i32 @llvm.umin.i32(i32 %r1, i32 %r2)
; umax_i32: two llvm.vector.reduce.umax calls over %a and %b, combined through
; llvm.umax.i32. The expected output takes the element-wise unsigned maximum
; once (vmaxu.vv) and then performs a single vredmaxu.vs with v8 as both
; vector and scalar operand.
223 define i32 @umax_i32(<4 x i32> %a, <4 x i32> %b) {
224 ; CHECK-LABEL: umax_i32:
226 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
227 ; CHECK-NEXT: vmaxu.vv v8, v8, v9
228 ; CHECK-NEXT: vredmaxu.vs v8, v8, v8
229 ; CHECK-NEXT: vmv.x.s a0, v8
231 %r1 = call i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32> %a)
232 %r2 = call i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32> %b)
233 %r = call i32 @llvm.umax.i32(i32 %r1, i32 %r2)
; smin_i32: two llvm.vector.reduce.smin calls over %a and %b, combined through
; llvm.smin.i32. The expected output takes the element-wise signed minimum
; once (vmin.vv) and then performs a single vredmin.vs with v8 as both vector
; and scalar operand.
237 define i32 @smin_i32(<4 x i32> %a, <4 x i32> %b) {
238 ; CHECK-LABEL: smin_i32:
240 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
241 ; CHECK-NEXT: vmin.vv v8, v8, v9
242 ; CHECK-NEXT: vredmin.vs v8, v8, v8
243 ; CHECK-NEXT: vmv.x.s a0, v8
245 %r1 = call i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32> %a)
246 %r2 = call i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32> %b)
247 %r = call i32 @llvm.smin.i32(i32 %r1, i32 %r2)
; smax_i32: two llvm.vector.reduce.smax calls over %a and %b, combined through
; llvm.smax.i32. The expected output takes the element-wise signed maximum
; once (vmax.vv) and then performs a single vredmax.vs with v8 as both vector
; and scalar operand.
251 define i32 @smax_i32(<4 x i32> %a, <4 x i32> %b) {
252 ; CHECK-LABEL: smax_i32:
254 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
255 ; CHECK-NEXT: vmax.vv v8, v8, v9
256 ; CHECK-NEXT: vredmax.vs v8, v8, v8
257 ; CHECK-NEXT: vmv.x.s a0, v8
259 %r1 = call i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32> %a)
260 %r2 = call i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32> %b)
261 %r = call i32 @llvm.smax.i32(i32 %r1, i32 %r2)
; Declarations of the intrinsics called by the test functions above: the
; llvm.vector.reduce.* reductions first, then the scalar floating-point
; minnum/maxnum and integer umin/umax/smin/smax intrinsics used to combine
; two reduction results.
265 declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
266 declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
267 declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
268 declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
269 declare i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32>)
270 declare i16 @llvm.vector.reduce.add.i16.v32i16(<32 x i16>)
271 declare i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16>)
272 declare i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32>)
273 declare i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32>)
274 declare i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32>)
275 declare i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32>)
276 declare i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32>)
277 declare i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32>)
278 declare i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32>)
279 declare i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32>)
280 declare float @llvm.minnum.f32(float, float)
281 declare float @llvm.maxnum.f32(float, float)
282 declare i32 @llvm.umin.i32(i32, i32)
283 declare i32 @llvm.umax.i32(i32, i32)
284 declare i32 @llvm.smin.i32(i32, i32)
285 declare i32 @llvm.smax.i32(i32, i32)