; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+v,+zvfh,+optimized-zero-stride-load \
; RUN:   -verify-machineinstrs < %s \
; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-RV32,CHECK-OPT
; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+v,+zvfh,+optimized-zero-stride-load \
; RUN:   -verify-machineinstrs < %s \
; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,CHECK-OPT
; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+v,+zvfh \
; RUN:   -verify-machineinstrs < %s \
; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-RV32,CHECK-NO-OPT
; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+v,+zvfh \
; RUN:   -verify-machineinstrs < %s \
; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,CHECK-NO-OPT
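
; CHECK-OPT and CHECK-NO-OPT distinguish the runs with and without
; +optimized-zero-stride-load; they only diverge for the zero-strided
; tests near the end of the file.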

declare <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i8(ptr, i8, <2 x i1>, i32)
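
; The stride operand type can be narrower or wider than XLEN. For an i64
; stride on RV32 only the low half (a1) is used.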
define <2 x i8> @strided_vpload_v2i8_i8(ptr %ptr, i8 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2i8_i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e8, mf8, ta, ma
; CHECK-NEXT:    vlse8.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i8(ptr %ptr, i8 %stride, <2 x i1> %m, i32 %evl)
  ret <2 x i8> %load
}

declare <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i16(ptr, i16, <2 x i1>, i32)

define <2 x i8> @strided_vpload_v2i8_i16(ptr %ptr, i16 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2i8_i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e8, mf8, ta, ma
; CHECK-NEXT:    vlse8.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i16(ptr %ptr, i16 %stride, <2 x i1> %m, i32 %evl)
  ret <2 x i8> %load
}

declare <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr, i64, <2 x i1>, i32)

define <2 x i8> @strided_vpload_v2i8_i64(ptr %ptr, i64 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-RV32-LABEL: strided_vpload_v2i8_i64:
; CHECK-RV32:       # %bb.0:
; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, mf8, ta, ma
; CHECK-RV32-NEXT:    vlse8.v v8, (a0), a1, v0.t
; CHECK-RV32-NEXT:    ret
;
; CHECK-RV64-LABEL: strided_vpload_v2i8_i64:
; CHECK-RV64:       # %bb.0:
; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, mf8, ta, ma
; CHECK-RV64-NEXT:    vlse8.v v8, (a0), a1, v0.t
; CHECK-RV64-NEXT:    ret
  %load = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr %ptr, i64 %stride, <2 x i1> %m, i32 %evl)
  ret <2 x i8> %load
}

declare <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i32(ptr, i32, <2 x i1>, i32)

define <2 x i8> @strided_vpload_v2i8(ptr %ptr, i32 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e8, mf8, ta, ma
; CHECK-NEXT:    vlse8.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i32(ptr %ptr, i32 %stride, <2 x i1> %m, i32 %evl)
  ret <2 x i8> %load
}

declare <4 x i8> @llvm.experimental.vp.strided.load.v4i8.p0.i32(ptr, i32, <4 x i1>, i32)

define <4 x i8> @strided_vpload_v4i8(ptr %ptr, i32 signext %stride, <4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v4i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e8, mf4, ta, ma
; CHECK-NEXT:    vlse8.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <4 x i8> @llvm.experimental.vp.strided.load.v4i8.p0.i32(ptr %ptr, i32 %stride, <4 x i1> %m, i32 %evl)
  ret <4 x i8> %load
}
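
; An all-ones mask produces an unmasked load (no v0.t operand).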
define <4 x i8> @strided_vpload_v4i8_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v4i8_allones_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e8, mf4, ta, ma
; CHECK-NEXT:    vlse8.v v8, (a0), a1
; CHECK-NEXT:    ret
  %load = call <4 x i8> @llvm.experimental.vp.strided.load.v4i8.p0.i32(ptr %ptr, i32 %stride, <4 x i1> splat (i1 true), i32 %evl)
  ret <4 x i8> %load
}

declare <8 x i8> @llvm.experimental.vp.strided.load.v8i8.p0.i32(ptr, i32, <8 x i1>, i32)

define <8 x i8> @strided_vpload_v8i8(ptr %ptr, i32 signext %stride, <8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e8, mf2, ta, ma
; CHECK-NEXT:    vlse8.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <8 x i8> @llvm.experimental.vp.strided.load.v8i8.p0.i32(ptr %ptr, i32 %stride, <8 x i1> %m, i32 %evl)
  ret <8 x i8> %load
}
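
; A constant stride equal to the element size in bytes is selected as a
; unit-stride vle rather than a strided vlse.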
define <8 x i8> @strided_vpload_v8i8_unit_stride(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8i8_unit_stride:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0), v0.t
; CHECK-NEXT:    ret
  %load = call <8 x i8> @llvm.experimental.vp.strided.load.v8i8.p0.i32(ptr %ptr, i32 1, <8 x i1> %m, i32 %evl)
  ret <8 x i8> %load
}

declare <2 x i16> @llvm.experimental.vp.strided.load.v2i16.p0.i32(ptr, i32, <2 x i1>, i32)

define <2 x i16> @strided_vpload_v2i16(ptr %ptr, i32 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e16, mf4, ta, ma
; CHECK-NEXT:    vlse16.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <2 x i16> @llvm.experimental.vp.strided.load.v2i16.p0.i32(ptr %ptr, i32 %stride, <2 x i1> %m, i32 %evl)
  ret <2 x i16> %load
}

declare <4 x i16> @llvm.experimental.vp.strided.load.v4i16.p0.i32(ptr, i32, <4 x i1>, i32)

define <4 x i16> @strided_vpload_v4i16(ptr %ptr, i32 signext %stride, <4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v4i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e16, mf2, ta, ma
; CHECK-NEXT:    vlse16.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <4 x i16> @llvm.experimental.vp.strided.load.v4i16.p0.i32(ptr %ptr, i32 %stride, <4 x i1> %m, i32 %evl)
  ret <4 x i16> %load
}

declare <8 x i16> @llvm.experimental.vp.strided.load.v8i16.p0.i32(ptr, i32, <8 x i1>, i32)

define <8 x i16> @strided_vpload_v8i16(ptr %ptr, i32 signext %stride, <8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
; CHECK-NEXT:    vlse16.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <8 x i16> @llvm.experimental.vp.strided.load.v8i16.p0.i32(ptr %ptr, i32 %stride, <8 x i1> %m, i32 %evl)
  ret <8 x i16> %load
}

define <8 x i16> @strided_vpload_v8i16_unit_stride(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8i16_unit_stride:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0), v0.t
; CHECK-NEXT:    ret
  %load = call <8 x i16> @llvm.experimental.vp.strided.load.v8i16.p0.i32(ptr %ptr, i32 2, <8 x i1> %m, i32 %evl)
  ret <8 x i16> %load
}

define <8 x i16> @strided_vpload_v8i16_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8i16_allones_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
; CHECK-NEXT:    vlse16.v v8, (a0), a1
; CHECK-NEXT:    ret
  %load = call <8 x i16> @llvm.experimental.vp.strided.load.v8i16.p0.i32(ptr %ptr, i32 %stride, <8 x i1> splat (i1 true), i32 %evl)
  ret <8 x i16> %load
}

declare <2 x i32> @llvm.experimental.vp.strided.load.v2i32.p0.i32(ptr, i32, <2 x i1>, i32)

define <2 x i32> @strided_vpload_v2i32(ptr %ptr, i32 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e32, mf2, ta, ma
; CHECK-NEXT:    vlse32.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <2 x i32> @llvm.experimental.vp.strided.load.v2i32.p0.i32(ptr %ptr, i32 %stride, <2 x i1> %m, i32 %evl)
  ret <2 x i32> %load
}

declare <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i32(ptr, i32, <4 x i1>, i32)

define <4 x i32> @strided_vpload_v4i32(ptr %ptr, i32 signext %stride, <4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v4i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
; CHECK-NEXT:    vlse32.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i32(ptr %ptr, i32 %stride, <4 x i1> %m, i32 %evl)
  ret <4 x i32> %load
}

define <4 x i32> @strided_vpload_v4i32_unit_stride(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v4i32_unit_stride:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0), v0.t
; CHECK-NEXT:    ret
  %load = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i32(ptr %ptr, i32 4, <4 x i1> %m, i32 %evl)
  ret <4 x i32> %load
}

declare <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i32(ptr, i32, <8 x i1>, i32)

define <8 x i32> @strided_vpload_v8i32(ptr %ptr, i32 signext %stride, <8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e32, m2, ta, ma
; CHECK-NEXT:    vlse32.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i32(ptr %ptr, i32 %stride, <8 x i1> %m, i32 %evl)
  ret <8 x i32> %load
}

define <8 x i32> @strided_vpload_v8i32_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8i32_allones_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e32, m2, ta, ma
; CHECK-NEXT:    vlse32.v v8, (a0), a1
; CHECK-NEXT:    ret
  %load = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i32(ptr %ptr, i32 %stride, <8 x i1> splat (i1 true), i32 %evl)
  ret <8 x i32> %load
}

declare <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i32(ptr, i32, <2 x i1>, i32)

define <2 x i64> @strided_vpload_v2i64(ptr %ptr, i32 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2i64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i32(ptr %ptr, i32 %stride, <2 x i1> %m, i32 %evl)
  ret <2 x i64> %load
}

define <2 x i64> @strided_vpload_v2i64_unit_stride(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2i64_unit_stride:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
; CHECK-NEXT:    vle64.v v8, (a0), v0.t
; CHECK-NEXT:    ret
  %load = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i32(ptr %ptr, i32 8, <2 x i1> %m, i32 %evl)
  ret <2 x i64> %load
}

declare <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i32(ptr, i32, <4 x i1>, i32)

define <4 x i64> @strided_vpload_v4i64(ptr %ptr, i32 signext %stride, <4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v4i64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i32(ptr %ptr, i32 %stride, <4 x i1> %m, i32 %evl)
  ret <4 x i64> %load
}

define <4 x i64> @strided_vpload_v4i64_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v4i64_allones_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a1
; CHECK-NEXT:    ret
  %load = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i32(ptr %ptr, i32 %stride, <4 x i1> splat (i1 true), i32 %evl)
  ret <4 x i64> %load
}

declare <8 x i64> @llvm.experimental.vp.strided.load.v8i64.p0.i32(ptr, i32, <8 x i1>, i32)

define <8 x i64> @strided_vpload_v8i64(ptr %ptr, i32 signext %stride, <8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8i64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <8 x i64> @llvm.experimental.vp.strided.load.v8i64.p0.i32(ptr %ptr, i32 %stride, <8 x i1> %m, i32 %evl)
  ret <8 x i64> %load
}

declare <2 x half> @llvm.experimental.vp.strided.load.v2f16.p0.i32(ptr, i32, <2 x i1>, i32)

define <2 x half> @strided_vpload_v2f16(ptr %ptr, i32 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e16, mf4, ta, ma
; CHECK-NEXT:    vlse16.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <2 x half> @llvm.experimental.vp.strided.load.v2f16.p0.i32(ptr %ptr, i32 %stride, <2 x i1> %m, i32 %evl)
  ret <2 x half> %load
}

define <2 x half> @strided_vpload_v2f16_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2f16_allones_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e16, mf4, ta, ma
; CHECK-NEXT:    vlse16.v v8, (a0), a1
; CHECK-NEXT:    ret
  %load = call <2 x half> @llvm.experimental.vp.strided.load.v2f16.p0.i32(ptr %ptr, i32 %stride, <2 x i1> splat (i1 true), i32 %evl)
  ret <2 x half> %load
}

declare <4 x half> @llvm.experimental.vp.strided.load.v4f16.p0.i32(ptr, i32, <4 x i1>, i32)

define <4 x half> @strided_vpload_v4f16(ptr %ptr, i32 signext %stride, <4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v4f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e16, mf2, ta, ma
; CHECK-NEXT:    vlse16.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <4 x half> @llvm.experimental.vp.strided.load.v4f16.p0.i32(ptr %ptr, i32 %stride, <4 x i1> %m, i32 %evl)
  ret <4 x half> %load
}

declare <8 x half> @llvm.experimental.vp.strided.load.v8f16.p0.i32(ptr, i32, <8 x i1>, i32)

define <8 x half> @strided_vpload_v8f16(ptr %ptr, i32 signext %stride, <8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
; CHECK-NEXT:    vlse16.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <8 x half> @llvm.experimental.vp.strided.load.v8f16.p0.i32(ptr %ptr, i32 %stride, <8 x i1> %m, i32 %evl)
  ret <8 x half> %load
}

define <8 x half> @strided_vpload_v8f16_unit_stride(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8f16_unit_stride:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0), v0.t
; CHECK-NEXT:    ret
  %load = call <8 x half> @llvm.experimental.vp.strided.load.v8f16.p0.i32(ptr %ptr, i32 2, <8 x i1> %m, i32 %evl)
  ret <8 x half> %load
}

declare <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i32(ptr, i32, <2 x i1>, i32)

define <2 x float> @strided_vpload_v2f32(ptr %ptr, i32 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e32, mf2, ta, ma
; CHECK-NEXT:    vlse32.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i32(ptr %ptr, i32 %stride, <2 x i1> %m, i32 %evl)
  ret <2 x float> %load
}

declare <4 x float> @llvm.experimental.vp.strided.load.v4f32.p0.i32(ptr, i32, <4 x i1>, i32)

define <4 x float> @strided_vpload_v4f32(ptr %ptr, i32 signext %stride, <4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v4f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
; CHECK-NEXT:    vlse32.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <4 x float> @llvm.experimental.vp.strided.load.v4f32.p0.i32(ptr %ptr, i32 %stride, <4 x i1> %m, i32 %evl)
  ret <4 x float> %load
}

define <4 x float> @strided_vpload_v4f32_unit_stride(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v4f32_unit_stride:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0), v0.t
; CHECK-NEXT:    ret
  %load = call <4 x float> @llvm.experimental.vp.strided.load.v4f32.p0.i32(ptr %ptr, i32 4, <4 x i1> %m, i32 %evl)
  ret <4 x float> %load
}

declare <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i32(ptr, i32, <8 x i1>, i32)

define <8 x float> @strided_vpload_v8f32(ptr %ptr, i32 signext %stride, <8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e32, m2, ta, ma
; CHECK-NEXT:    vlse32.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i32(ptr %ptr, i32 %stride, <8 x i1> %m, i32 %evl)
  ret <8 x float> %load
}

define <8 x float> @strided_vpload_v8f32_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8f32_allones_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e32, m2, ta, ma
; CHECK-NEXT:    vlse32.v v8, (a0), a1
; CHECK-NEXT:    ret
  %load = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i32(ptr %ptr, i32 %stride, <8 x i1> splat (i1 true), i32 %evl)
  ret <8 x float> %load
}

declare <2 x double> @llvm.experimental.vp.strided.load.v2f64.p0.i32(ptr, i32, <2 x i1>, i32)

define <2 x double> @strided_vpload_v2f64(ptr %ptr, i32 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2f64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <2 x double> @llvm.experimental.vp.strided.load.v2f64.p0.i32(ptr %ptr, i32 %stride, <2 x i1> %m, i32 %evl)
  ret <2 x double> %load
}

define <2 x double> @strided_vpload_v2f64_unit_stride(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2f64_unit_stride:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
; CHECK-NEXT:    vle64.v v8, (a0), v0.t
; CHECK-NEXT:    ret
  %load = call <2 x double> @llvm.experimental.vp.strided.load.v2f64.p0.i32(ptr %ptr, i32 8, <2 x i1> %m, i32 %evl)
  ret <2 x double> %load
}

declare <4 x double> @llvm.experimental.vp.strided.load.v4f64.p0.i32(ptr, i32, <4 x i1>, i32)

define <4 x double> @strided_vpload_v4f64(ptr %ptr, i32 signext %stride, <4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v4f64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <4 x double> @llvm.experimental.vp.strided.load.v4f64.p0.i32(ptr %ptr, i32 %stride, <4 x i1> %m, i32 %evl)
  ret <4 x double> %load
}

define <4 x double> @strided_vpload_v4f64_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v4f64_allones_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a1
; CHECK-NEXT:    ret
  %load = call <4 x double> @llvm.experimental.vp.strided.load.v4f64.p0.i32(ptr %ptr, i32 %stride, <4 x i1> splat (i1 true), i32 %evl)
  ret <4 x double> %load
}

declare <8 x double> @llvm.experimental.vp.strided.load.v8f64.p0.i32(ptr, i32, <8 x i1>, i32)

define <8 x double> @strided_vpload_v8f64(ptr %ptr, i32 signext %stride, <8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8f64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <8 x double> @llvm.experimental.vp.strided.load.v8f64.p0.i32(ptr %ptr, i32 %stride, <8 x i1> %m, i32 %evl)
  ret <8 x double> %load
}
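
; <3 x double> is not a power of two, so it is widened to <4 x double>
; (LMUL=2) while the original EVL is used unchanged.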
define <3 x double> @strided_vpload_v3f64(ptr %ptr, i32 signext %stride, <3 x i1> %mask, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v3f64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %v = call <3 x double> @llvm.experimental.vp.strided.load.v3f64.p0.i32(ptr %ptr, i32 %stride, <3 x i1> %mask, i32 %evl)
  ret <3 x double> %v
}

define <3 x double> @strided_vpload_v3f64_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v3f64_allones_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a1
; CHECK-NEXT:    ret
  %v = call <3 x double> @llvm.experimental.vp.strided.load.v3f64.p0.i32(ptr %ptr, i32 %stride, <3 x i1> splat (i1 true), i32 %evl)
  ret <3 x double> %v
}

declare <3 x double> @llvm.experimental.vp.strided.load.v3f64.p0.i32(ptr, i32, <3 x i1>, i32)
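
; <32 x double> does not fit in a single register group, so the load is split
; into two <16 x double> halves; the EVL and base address of the second half
; are derived from the EVL of the first.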
define <32 x double> @strided_vpload_v32f64(ptr %ptr, i32 signext %stride, <32 x i1> %m, i32 zeroext %evl) nounwind {
; CHECK-LABEL: strided_vpload_v32f64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a4, 16
; CHECK-NEXT:    vmv1r.v v9, v0
; CHECK-NEXT:    mv a3, a2
; CHECK-NEXT:    bltu a2, a4, .LBB40_2
; CHECK-NEXT:  # %bb.1:
; CHECK-NEXT:    li a3, 16
; CHECK-NEXT:  .LBB40_2:
; CHECK-NEXT:    mul a4, a3, a1
; CHECK-NEXT:    add a4, a0, a4
; CHECK-NEXT:    addi a5, a2, -16
; CHECK-NEXT:    sltu a2, a2, a5
; CHECK-NEXT:    addi a2, a2, -1
; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
; CHECK-NEXT:    vslidedown.vi v8, v9, 2
; CHECK-NEXT:    and a2, a2, a5
; CHECK-NEXT:    vmv1r.v v0, v8
; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
; CHECK-NEXT:    vlse64.v v16, (a4), a1, v0.t
; CHECK-NEXT:    vmv1r.v v0, v9
; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <32 x double> @llvm.experimental.vp.strided.load.v32f64.p0.i32(ptr %ptr, i32 %stride, <32 x i1> %m, i32 %evl)
  ret <32 x double> %load
}

define <32 x double> @strided_vpload_v32f64_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) nounwind {
; CHECK-LABEL: strided_vpload_v32f64_allones_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a4, 16
; CHECK-NEXT:    mv a3, a2
; CHECK-NEXT:    bltu a2, a4, .LBB41_2
; CHECK-NEXT:  # %bb.1:
; CHECK-NEXT:    li a3, 16
; CHECK-NEXT:  .LBB41_2:
; CHECK-NEXT:    mul a4, a3, a1
; CHECK-NEXT:    add a4, a0, a4
; CHECK-NEXT:    addi a5, a2, -16
; CHECK-NEXT:    sltu a2, a2, a5
; CHECK-NEXT:    addi a2, a2, -1
; CHECK-NEXT:    and a2, a2, a5
; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
; CHECK-NEXT:    vlse64.v v16, (a4), a1
; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a1
; CHECK-NEXT:    ret
  %load = call <32 x double> @llvm.experimental.vp.strided.load.v32f64.p0.i32(ptr %ptr, i32 %stride, <32 x i1> splat (i1 true), i32 %evl)
  ret <32 x double> %load
}

declare <32 x double> @llvm.experimental.vp.strided.load.v32f64.p0.i32(ptr, i32, <32 x i1>, i32)

; Widening + splitting (with HiIsEmpty == true)
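; The <33 x double> result is returned indirectly, so the split parts are
; stored through the pointer in a0 at offsets 0, 128, and 256.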
define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask, i32 zeroext %evl) {
; CHECK-RV32-LABEL: strided_load_v33f64:
; CHECK-RV32:       # %bb.0:
; CHECK-RV32-NEXT:    li a5, 32
; CHECK-RV32-NEXT:    vmv1r.v v8, v0
; CHECK-RV32-NEXT:    mv a3, a4
; CHECK-RV32-NEXT:    bltu a4, a5, .LBB42_2
; CHECK-RV32-NEXT:  # %bb.1:
; CHECK-RV32-NEXT:    li a3, 32
; CHECK-RV32-NEXT:  .LBB42_2:
; CHECK-RV32-NEXT:    mul a6, a3, a2
; CHECK-RV32-NEXT:    addi a5, a4, -32
; CHECK-RV32-NEXT:    sltu a7, a4, a5
; CHECK-RV32-NEXT:    addi a7, a7, -1
; CHECK-RV32-NEXT:    and a7, a7, a5
; CHECK-RV32-NEXT:    li a5, 16
; CHECK-RV32-NEXT:    add a6, a1, a6
; CHECK-RV32-NEXT:    bltu a7, a5, .LBB42_4
; CHECK-RV32-NEXT:  # %bb.3:
; CHECK-RV32-NEXT:    li a7, 16
; CHECK-RV32-NEXT:  .LBB42_4:
; CHECK-RV32-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
; CHECK-RV32-NEXT:    vslidedown.vi v0, v8, 4
; CHECK-RV32-NEXT:    vsetvli zero, a7, e64, m8, ta, ma
; CHECK-RV32-NEXT:    vlse64.v v16, (a6), a2, v0.t
; CHECK-RV32-NEXT:    addi a6, a3, -16
; CHECK-RV32-NEXT:    sltu a3, a3, a6
; CHECK-RV32-NEXT:    addi a3, a3, -1
; CHECK-RV32-NEXT:    and a3, a3, a6
; CHECK-RV32-NEXT:    bltu a4, a5, .LBB42_6
; CHECK-RV32-NEXT:  # %bb.5:
; CHECK-RV32-NEXT:    li a4, 16
; CHECK-RV32-NEXT:  .LBB42_6:
; CHECK-RV32-NEXT:    mul a5, a4, a2
; CHECK-RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
; CHECK-RV32-NEXT:    vslidedown.vi v0, v8, 2
; CHECK-RV32-NEXT:    add a5, a1, a5
; CHECK-RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
; CHECK-RV32-NEXT:    vlse64.v v24, (a5), a2, v0.t
; CHECK-RV32-NEXT:    vmv1r.v v0, v8
; CHECK-RV32-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
; CHECK-RV32-NEXT:    vlse64.v v8, (a1), a2, v0.t
; CHECK-RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; CHECK-RV32-NEXT:    vse64.v v8, (a0)
; CHECK-RV32-NEXT:    addi a1, a0, 128
; CHECK-RV32-NEXT:    vse64.v v24, (a1)
; CHECK-RV32-NEXT:    addi a0, a0, 256
; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; CHECK-RV32-NEXT:    vse64.v v16, (a0)
; CHECK-RV32-NEXT:    ret
;
; CHECK-RV64-LABEL: strided_load_v33f64:
; CHECK-RV64:       # %bb.0:
; CHECK-RV64-NEXT:    li a5, 32
; CHECK-RV64-NEXT:    vmv1r.v v8, v0
; CHECK-RV64-NEXT:    mv a4, a3
; CHECK-RV64-NEXT:    bltu a3, a5, .LBB42_2
; CHECK-RV64-NEXT:  # %bb.1:
; CHECK-RV64-NEXT:    li a4, 32
; CHECK-RV64-NEXT:  .LBB42_2:
; CHECK-RV64-NEXT:    mul a6, a4, a2
; CHECK-RV64-NEXT:    addi a5, a3, -32
; CHECK-RV64-NEXT:    sltu a7, a3, a5
; CHECK-RV64-NEXT:    addi a7, a7, -1
; CHECK-RV64-NEXT:    and a7, a7, a5
; CHECK-RV64-NEXT:    li a5, 16
; CHECK-RV64-NEXT:    add a6, a1, a6
; CHECK-RV64-NEXT:    bltu a7, a5, .LBB42_4
; CHECK-RV64-NEXT:  # %bb.3:
; CHECK-RV64-NEXT:    li a7, 16
; CHECK-RV64-NEXT:  .LBB42_4:
; CHECK-RV64-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
; CHECK-RV64-NEXT:    vslidedown.vi v0, v8, 4
; CHECK-RV64-NEXT:    vsetvli zero, a7, e64, m8, ta, ma
; CHECK-RV64-NEXT:    vlse64.v v16, (a6), a2, v0.t
; CHECK-RV64-NEXT:    addi a6, a4, -16
; CHECK-RV64-NEXT:    sltu a4, a4, a6
; CHECK-RV64-NEXT:    addi a4, a4, -1
; CHECK-RV64-NEXT:    and a4, a4, a6
; CHECK-RV64-NEXT:    bltu a3, a5, .LBB42_6
; CHECK-RV64-NEXT:  # %bb.5:
; CHECK-RV64-NEXT:    li a3, 16
; CHECK-RV64-NEXT:  .LBB42_6:
; CHECK-RV64-NEXT:    mul a5, a3, a2
; CHECK-RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
; CHECK-RV64-NEXT:    vslidedown.vi v0, v8, 2
; CHECK-RV64-NEXT:    add a5, a1, a5
; CHECK-RV64-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
; CHECK-RV64-NEXT:    vlse64.v v24, (a5), a2, v0.t
; CHECK-RV64-NEXT:    vmv1r.v v0, v8
; CHECK-RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
; CHECK-RV64-NEXT:    vlse64.v v8, (a1), a2, v0.t
; CHECK-RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; CHECK-RV64-NEXT:    vse64.v v8, (a0)
; CHECK-RV64-NEXT:    addi a1, a0, 128
; CHECK-RV64-NEXT:    vse64.v v24, (a1)
; CHECK-RV64-NEXT:    addi a0, a0, 256
; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; CHECK-RV64-NEXT:    vse64.v v16, (a0)
; CHECK-RV64-NEXT:    ret
  %v = call <33 x double> @llvm.experimental.vp.strided.load.v33f64.p0.i64(ptr %ptr, i64 %stride, <33 x i1> %mask, i32 %evl)
  ret <33 x double> %v
}

declare <33 x double> @llvm.experimental.vp.strided.load.v33f64.p0.i64(ptr, i64, <33 x i1>, i32)

; TODO: Use accurate evl.
; Test unmasked integer zero strided
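; With +optimized-zero-stride-load the zero-stride load stays a vlse with an
; x0 stride; without it the scalar is loaded once and splatted.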
define <4 x i8> @zero_strided_unmasked_vpload_4i8_i8(ptr %ptr) {
; CHECK-OPT-LABEL: zero_strided_unmasked_vpload_4i8_i8:
; CHECK-OPT:       # %bb.0:
; CHECK-OPT-NEXT:    vsetivli zero, 3, e8, mf4, ta, ma
; CHECK-OPT-NEXT:    vlse8.v v8, (a0), zero
; CHECK-OPT-NEXT:    ret
;
; CHECK-NO-OPT-LABEL: zero_strided_unmasked_vpload_4i8_i8:
; CHECK-NO-OPT:       # %bb.0:
; CHECK-NO-OPT-NEXT:    lbu a0, 0(a0)
; CHECK-NO-OPT-NEXT:    vsetivli zero, 3, e8, mf4, ta, ma
; CHECK-NO-OPT-NEXT:    vmv.v.x v8, a0
; CHECK-NO-OPT-NEXT:    ret
  %load = call <4 x i8> @llvm.experimental.vp.strided.load.4i8.p0.i8(ptr %ptr, i8 0, <4 x i1> splat (i1 true), i32 3)
  ret <4 x i8> %load
}

; TODO: Use accurate evl.
; Test unmasked float zero strided
define <4 x half> @zero_strided_unmasked_vpload_4f16(ptr %ptr) {
; CHECK-OPT-LABEL: zero_strided_unmasked_vpload_4f16:
; CHECK-OPT:       # %bb.0:
; CHECK-OPT-NEXT:    vsetivli zero, 3, e16, mf2, ta, ma
; CHECK-OPT-NEXT:    vlse16.v v8, (a0), zero
; CHECK-OPT-NEXT:    ret
;
; CHECK-NO-OPT-LABEL: zero_strided_unmasked_vpload_4f16:
; CHECK-NO-OPT:       # %bb.0:
; CHECK-NO-OPT-NEXT:    flh fa5, 0(a0)
; CHECK-NO-OPT-NEXT:    vsetivli zero, 3, e16, mf2, ta, ma
; CHECK-NO-OPT-NEXT:    vfmv.v.f v8, fa5
; CHECK-NO-OPT-NEXT:    ret
  %load = call <4 x half> @llvm.experimental.vp.strided.load.4f16.p0.i32(ptr %ptr, i32 0, <4 x i1> splat (i1 true), i32 3)
  ret <4 x half> %load
}
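
; A zero-strided load feeding an add: on RV64 the scalar is loaded with ld and
; folded into vadd.vx, while on RV32 the zero-strided vlse64 is kept because
; the i64 scalar does not fit in a single GPR.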
define <4 x i64> @zero_strided_vadd.vx(<4 x i64> %v, ptr %ptr) {
; CHECK-RV32-LABEL: zero_strided_vadd.vx:
; CHECK-RV32:       # %bb.0:
; CHECK-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-RV32-NEXT:    vlse64.v v10, (a0), zero
; CHECK-RV32-NEXT:    vadd.vv v8, v8, v10
; CHECK-RV32-NEXT:    ret
;
; CHECK-RV64-LABEL: zero_strided_vadd.vx:
; CHECK-RV64:       # %bb.0:
; CHECK-RV64-NEXT:    ld a0, 0(a0)
; CHECK-RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-RV64-NEXT:    vadd.vx v8, v8, a0
; CHECK-RV64-NEXT:    ret
  %load = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i32(ptr %ptr, i32 0, <4 x i1> splat (i1 true), i32 4)
  %w = add <4 x i64> %v, %load
  ret <4 x i64> %w
}