; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+v,+zvfh \
; RUN: -verify-machineinstrs < %s \
; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV32
; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+v,+zvfh \
; RUN: -verify-machineinstrs < %s \
; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV64

declare <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i8(ptr, i8, <2 x i1>, i32)

define <2 x i8> @strided_vpload_v2i8_i8(ptr %ptr, i8 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2i8_i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a2, e8, mf8, ta, ma
; CHECK-NEXT: vlse8.v v8, (a0), a1, v0.t
; CHECK-NEXT: ret
  %load = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i8(ptr %ptr, i8 %stride, <2 x i1> %m, i32 %evl)
  ret <2 x i8> %load
}

declare <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i16(ptr, i16, <2 x i1>, i32)

define <2 x i8> @strided_vpload_v2i8_i16(ptr %ptr, i16 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2i8_i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a2, e8, mf8, ta, ma
; CHECK-NEXT: vlse8.v v8, (a0), a1, v0.t
; CHECK-NEXT: ret
  %load = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i16(ptr %ptr, i16 %stride, <2 x i1> %m, i32 %evl)
  ret <2 x i8> %load
}

declare <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr, i64, <2 x i1>, i32)

define <2 x i8> @strided_vpload_v2i8_i64(ptr %ptr, i64 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-RV32-LABEL: strided_vpload_v2i8_i64:
; CHECK-RV32: # %bb.0:
; CHECK-RV32-NEXT: vsetvli zero, a3, e8, mf8, ta, ma
; CHECK-RV32-NEXT: vlse8.v v8, (a0), a1, v0.t
; CHECK-RV32-NEXT: ret
;
; CHECK-RV64-LABEL: strided_vpload_v2i8_i64:
; CHECK-RV64: # %bb.0:
; CHECK-RV64-NEXT: vsetvli zero, a2, e8, mf8, ta, ma
; CHECK-RV64-NEXT: vlse8.v v8, (a0), a1, v0.t
; CHECK-RV64-NEXT: ret
  %load = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr %ptr, i64 %stride, <2 x i1> %m, i32 %evl)
  ret <2 x i8> %load
}

declare <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i32(ptr, i32, <2 x i1>, i32)

define <2 x i8> @strided_vpload_v2i8(ptr %ptr, i32 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a2, e8, mf8, ta, ma
; CHECK-NEXT: vlse8.v v8, (a0), a1, v0.t
; CHECK-NEXT: ret
  %load = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i32(ptr %ptr, i32 %stride, <2 x i1> %m, i32 %evl)
  ret <2 x i8> %load
}

declare <4 x i8> @llvm.experimental.vp.strided.load.v4i8.p0.i32(ptr, i32, <4 x i1>, i32)

define <4 x i8> @strided_vpload_v4i8(ptr %ptr, i32 signext %stride, <4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v4i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a2, e8, mf4, ta, ma
; CHECK-NEXT: vlse8.v v8, (a0), a1, v0.t
; CHECK-NEXT: ret
  %load = call <4 x i8> @llvm.experimental.vp.strided.load.v4i8.p0.i32(ptr %ptr, i32 %stride, <4 x i1> %m, i32 %evl)
  ret <4 x i8> %load
}

define <4 x i8> @strided_vpload_v4i8_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v4i8_allones_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a2, e8, mf4, ta, ma
; CHECK-NEXT: vlse8.v v8, (a0), a1
; CHECK-NEXT: ret
  %a = insertelement <4 x i1> poison, i1 true, i32 0
  %b = shufflevector <4 x i1> %a, <4 x i1> poison, <4 x i32> zeroinitializer
  %load = call <4 x i8> @llvm.experimental.vp.strided.load.v4i8.p0.i32(ptr %ptr, i32 %stride, <4 x i1> %b, i32 %evl)
  ret <4 x i8> %load
}

declare <8 x i8> @llvm.experimental.vp.strided.load.v8i8.p0.i32(ptr, i32, <8 x i1>, i32)

define <8 x i8> @strided_vpload_v8i8(ptr %ptr, i32 signext %stride, <8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a2, e8, mf2, ta, ma
; CHECK-NEXT: vlse8.v v8, (a0), a1, v0.t
; CHECK-NEXT: ret
  %load = call <8 x i8> @llvm.experimental.vp.strided.load.v8i8.p0.i32(ptr %ptr, i32 %stride, <8 x i1> %m, i32 %evl)
  ret <8 x i8> %load
}

define <8 x i8> @strided_vpload_v8i8_unit_stride(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8i8_unit_stride:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
; CHECK-NEXT: vle8.v v8, (a0), v0.t
; CHECK-NEXT: ret
  %load = call <8 x i8> @llvm.experimental.vp.strided.load.v8i8.p0.i32(ptr %ptr, i32 1, <8 x i1> %m, i32 %evl)
  ret <8 x i8> %load
}

declare <2 x i16> @llvm.experimental.vp.strided.load.v2i16.p0.i32(ptr, i32, <2 x i1>, i32)

define <2 x i16> @strided_vpload_v2i16(ptr %ptr, i32 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a2, e16, mf4, ta, ma
; CHECK-NEXT: vlse16.v v8, (a0), a1, v0.t
; CHECK-NEXT: ret
  %load = call <2 x i16> @llvm.experimental.vp.strided.load.v2i16.p0.i32(ptr %ptr, i32 %stride, <2 x i1> %m, i32 %evl)
  ret <2 x i16> %load
}

declare <4 x i16> @llvm.experimental.vp.strided.load.v4i16.p0.i32(ptr, i32, <4 x i1>, i32)

define <4 x i16> @strided_vpload_v4i16(ptr %ptr, i32 signext %stride, <4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v4i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma
; CHECK-NEXT: vlse16.v v8, (a0), a1, v0.t
; CHECK-NEXT: ret
  %load = call <4 x i16> @llvm.experimental.vp.strided.load.v4i16.p0.i32(ptr %ptr, i32 %stride, <4 x i1> %m, i32 %evl)
  ret <4 x i16> %load
}

declare <8 x i16> @llvm.experimental.vp.strided.load.v8i16.p0.i32(ptr, i32, <8 x i1>, i32)

define <8 x i16> @strided_vpload_v8i16(ptr %ptr, i32 signext %stride, <8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma
; CHECK-NEXT: vlse16.v v8, (a0), a1, v0.t
; CHECK-NEXT: ret
  %load = call <8 x i16> @llvm.experimental.vp.strided.load.v8i16.p0.i32(ptr %ptr, i32 %stride, <8 x i1> %m, i32 %evl)
  ret <8 x i16> %load
}

define <8 x i16> @strided_vpload_v8i16_unit_stride(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8i16_unit_stride:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; CHECK-NEXT: vle16.v v8, (a0), v0.t
; CHECK-NEXT: ret
  %load = call <8 x i16> @llvm.experimental.vp.strided.load.v8i16.p0.i32(ptr %ptr, i32 2, <8 x i1> %m, i32 %evl)
  ret <8 x i16> %load
}

define <8 x i16> @strided_vpload_v8i16_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8i16_allones_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma
; CHECK-NEXT: vlse16.v v8, (a0), a1
; CHECK-NEXT: ret
  %a = insertelement <8 x i1> poison, i1 true, i32 0
  %b = shufflevector <8 x i1> %a, <8 x i1> poison, <8 x i32> zeroinitializer
  %load = call <8 x i16> @llvm.experimental.vp.strided.load.v8i16.p0.i32(ptr %ptr, i32 %stride, <8 x i1> %b, i32 %evl)
  ret <8 x i16> %load
}

declare <2 x i32> @llvm.experimental.vp.strided.load.v2i32.p0.i32(ptr, i32, <2 x i1>, i32)

define <2 x i32> @strided_vpload_v2i32(ptr %ptr, i32 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a2, e32, mf2, ta, ma
; CHECK-NEXT: vlse32.v v8, (a0), a1, v0.t
; CHECK-NEXT: ret
  %load = call <2 x i32> @llvm.experimental.vp.strided.load.v2i32.p0.i32(ptr %ptr, i32 %stride, <2 x i1> %m, i32 %evl)
  ret <2 x i32> %load
}

declare <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i32(ptr, i32, <4 x i1>, i32)

define <4 x i32> @strided_vpload_v4i32(ptr %ptr, i32 signext %stride, <4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma
; CHECK-NEXT: vlse32.v v8, (a0), a1, v0.t
; CHECK-NEXT: ret
  %load = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i32(ptr %ptr, i32 %stride, <4 x i1> %m, i32 %evl)
  ret <4 x i32> %load
}

define <4 x i32> @strided_vpload_v4i32_unit_stride(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v4i32_unit_stride:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
; CHECK-NEXT: vle32.v v8, (a0), v0.t
; CHECK-NEXT: ret
  %load = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i32(ptr %ptr, i32 4, <4 x i1> %m, i32 %evl)
  ret <4 x i32> %load
}

declare <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i32(ptr, i32, <8 x i1>, i32)

define <8 x i32> @strided_vpload_v8i32(ptr %ptr, i32 signext %stride, <8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a2, e32, m2, ta, ma
; CHECK-NEXT: vlse32.v v8, (a0), a1, v0.t
; CHECK-NEXT: ret
  %load = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i32(ptr %ptr, i32 %stride, <8 x i1> %m, i32 %evl)
  ret <8 x i32> %load
}

define <8 x i32> @strided_vpload_v8i32_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8i32_allones_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a2, e32, m2, ta, ma
; CHECK-NEXT: vlse32.v v8, (a0), a1
; CHECK-NEXT: ret
  %a = insertelement <8 x i1> poison, i1 true, i32 0
  %b = shufflevector <8 x i1> %a, <8 x i1> poison, <8 x i32> zeroinitializer
  %load = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i32(ptr %ptr, i32 %stride, <8 x i1> %b, i32 %evl)
  ret <8 x i32> %load
}

declare <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i32(ptr, i32, <2 x i1>, i32)

define <2 x i64> @strided_vpload_v2i64(ptr %ptr, i32 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, ma
; CHECK-NEXT: vlse64.v v8, (a0), a1, v0.t
; CHECK-NEXT: ret
  %load = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i32(ptr %ptr, i32 %stride, <2 x i1> %m, i32 %evl)
  ret <2 x i64> %load
}

define <2 x i64> @strided_vpload_v2i64_unit_stride(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2i64_unit_stride:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
; CHECK-NEXT: vle64.v v8, (a0), v0.t
; CHECK-NEXT: ret
  %load = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i32(ptr %ptr, i32 8, <2 x i1> %m, i32 %evl)
  ret <2 x i64> %load
}

declare <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i32(ptr, i32, <4 x i1>, i32)

define <4 x i64> @strided_vpload_v4i64(ptr %ptr, i32 signext %stride, <4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v4i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, ma
; CHECK-NEXT: vlse64.v v8, (a0), a1, v0.t
; CHECK-NEXT: ret
  %load = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i32(ptr %ptr, i32 %stride, <4 x i1> %m, i32 %evl)
  ret <4 x i64> %load
}

define <4 x i64> @strided_vpload_v4i64_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v4i64_allones_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, ma
; CHECK-NEXT: vlse64.v v8, (a0), a1
; CHECK-NEXT: ret
  %a = insertelement <4 x i1> poison, i1 true, i32 0
  %b = shufflevector <4 x i1> %a, <4 x i1> poison, <4 x i32> zeroinitializer
  %load = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i32(ptr %ptr, i32 %stride, <4 x i1> %b, i32 %evl)
  ret <4 x i64> %load
}

declare <8 x i64> @llvm.experimental.vp.strided.load.v8i64.p0.i32(ptr, i32, <8 x i1>, i32)

define <8 x i64> @strided_vpload_v8i64(ptr %ptr, i32 signext %stride, <8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, ma
; CHECK-NEXT: vlse64.v v8, (a0), a1, v0.t
; CHECK-NEXT: ret
  %load = call <8 x i64> @llvm.experimental.vp.strided.load.v8i64.p0.i32(ptr %ptr, i32 %stride, <8 x i1> %m, i32 %evl)
  ret <8 x i64> %load
}

declare <2 x half> @llvm.experimental.vp.strided.load.v2f16.p0.i32(ptr, i32, <2 x i1>, i32)

define <2 x half> @strided_vpload_v2f16(ptr %ptr, i32 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2f16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a2, e16, mf4, ta, ma
; CHECK-NEXT: vlse16.v v8, (a0), a1, v0.t
; CHECK-NEXT: ret
  %load = call <2 x half> @llvm.experimental.vp.strided.load.v2f16.p0.i32(ptr %ptr, i32 %stride, <2 x i1> %m, i32 %evl)
  ret <2 x half> %load
}

define <2 x half> @strided_vpload_v2f16_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2f16_allones_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a2, e16, mf4, ta, ma
; CHECK-NEXT: vlse16.v v8, (a0), a1
; CHECK-NEXT: ret
  %a = insertelement <2 x i1> poison, i1 true, i32 0
  %b = shufflevector <2 x i1> %a, <2 x i1> poison, <2 x i32> zeroinitializer
  %load = call <2 x half> @llvm.experimental.vp.strided.load.v2f16.p0.i32(ptr %ptr, i32 %stride, <2 x i1> %b, i32 %evl)
  ret <2 x half> %load
}

declare <4 x half> @llvm.experimental.vp.strided.load.v4f16.p0.i32(ptr, i32, <4 x i1>, i32)

define <4 x half> @strided_vpload_v4f16(ptr %ptr, i32 signext %stride, <4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v4f16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma
; CHECK-NEXT: vlse16.v v8, (a0), a1, v0.t
; CHECK-NEXT: ret
  %load = call <4 x half> @llvm.experimental.vp.strided.load.v4f16.p0.i32(ptr %ptr, i32 %stride, <4 x i1> %m, i32 %evl)
  ret <4 x half> %load
}

declare <8 x half> @llvm.experimental.vp.strided.load.v8f16.p0.i32(ptr, i32, <8 x i1>, i32)

define <8 x half> @strided_vpload_v8f16(ptr %ptr, i32 signext %stride, <8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8f16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma
; CHECK-NEXT: vlse16.v v8, (a0), a1, v0.t
; CHECK-NEXT: ret
  %load = call <8 x half> @llvm.experimental.vp.strided.load.v8f16.p0.i32(ptr %ptr, i32 %stride, <8 x i1> %m, i32 %evl)
  ret <8 x half> %load
}

define <8 x half> @strided_vpload_v8f16_unit_stride(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8f16_unit_stride:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; CHECK-NEXT: vle16.v v8, (a0), v0.t
; CHECK-NEXT: ret
  %load = call <8 x half> @llvm.experimental.vp.strided.load.v8f16.p0.i32(ptr %ptr, i32 2, <8 x i1> %m, i32 %evl)
  ret <8 x half> %load
}

declare <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i32(ptr, i32, <2 x i1>, i32)

define <2 x float> @strided_vpload_v2f32(ptr %ptr, i32 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a2, e32, mf2, ta, ma
; CHECK-NEXT: vlse32.v v8, (a0), a1, v0.t
; CHECK-NEXT: ret
  %load = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i32(ptr %ptr, i32 %stride, <2 x i1> %m, i32 %evl)
  ret <2 x float> %load
}

declare <4 x float> @llvm.experimental.vp.strided.load.v4f32.p0.i32(ptr, i32, <4 x i1>, i32)

define <4 x float> @strided_vpload_v4f32(ptr %ptr, i32 signext %stride, <4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v4f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma
; CHECK-NEXT: vlse32.v v8, (a0), a1, v0.t
; CHECK-NEXT: ret
  %load = call <4 x float> @llvm.experimental.vp.strided.load.v4f32.p0.i32(ptr %ptr, i32 %stride, <4 x i1> %m, i32 %evl)
  ret <4 x float> %load
}

define <4 x float> @strided_vpload_v4f32_unit_stride(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v4f32_unit_stride:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
; CHECK-NEXT: vle32.v v8, (a0), v0.t
; CHECK-NEXT: ret
  %load = call <4 x float> @llvm.experimental.vp.strided.load.v4f32.p0.i32(ptr %ptr, i32 4, <4 x i1> %m, i32 %evl)
  ret <4 x float> %load
}

declare <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i32(ptr, i32, <8 x i1>, i32)

define <8 x float> @strided_vpload_v8f32(ptr %ptr, i32 signext %stride, <8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a2, e32, m2, ta, ma
; CHECK-NEXT: vlse32.v v8, (a0), a1, v0.t
; CHECK-NEXT: ret
  %load = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i32(ptr %ptr, i32 %stride, <8 x i1> %m, i32 %evl)
  ret <8 x float> %load
}

define <8 x float> @strided_vpload_v8f32_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8f32_allones_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a2, e32, m2, ta, ma
; CHECK-NEXT: vlse32.v v8, (a0), a1
; CHECK-NEXT: ret
  %a = insertelement <8 x i1> poison, i1 true, i32 0
  %b = shufflevector <8 x i1> %a, <8 x i1> poison, <8 x i32> zeroinitializer
  %load = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i32(ptr %ptr, i32 %stride, <8 x i1> %b, i32 %evl)
  ret <8 x float> %load
}

declare <2 x double> @llvm.experimental.vp.strided.load.v2f64.p0.i32(ptr, i32, <2 x i1>, i32)

define <2 x double> @strided_vpload_v2f64(ptr %ptr, i32 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2f64:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, ma
; CHECK-NEXT: vlse64.v v8, (a0), a1, v0.t
; CHECK-NEXT: ret
  %load = call <2 x double> @llvm.experimental.vp.strided.load.v2f64.p0.i32(ptr %ptr, i32 %stride, <2 x i1> %m, i32 %evl)
  ret <2 x double> %load
}

define <2 x double> @strided_vpload_v2f64_unit_stride(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2f64_unit_stride:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
; CHECK-NEXT: vle64.v v8, (a0), v0.t
; CHECK-NEXT: ret
  %load = call <2 x double> @llvm.experimental.vp.strided.load.v2f64.p0.i32(ptr %ptr, i32 8, <2 x i1> %m, i32 %evl)
  ret <2 x double> %load
}

declare <4 x double> @llvm.experimental.vp.strided.load.v4f64.p0.i32(ptr, i32, <4 x i1>, i32)

define <4 x double> @strided_vpload_v4f64(ptr %ptr, i32 signext %stride, <4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v4f64:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, ma
; CHECK-NEXT: vlse64.v v8, (a0), a1, v0.t
; CHECK-NEXT: ret
  %load = call <4 x double> @llvm.experimental.vp.strided.load.v4f64.p0.i32(ptr %ptr, i32 %stride, <4 x i1> %m, i32 %evl)
  ret <4 x double> %load
}

define <4 x double> @strided_vpload_v4f64_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v4f64_allones_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, ma
; CHECK-NEXT: vlse64.v v8, (a0), a1
; CHECK-NEXT: ret
  %a = insertelement <4 x i1> poison, i1 true, i32 0
  %b = shufflevector <4 x i1> %a, <4 x i1> poison, <4 x i32> zeroinitializer
  %load = call <4 x double> @llvm.experimental.vp.strided.load.v4f64.p0.i32(ptr %ptr, i32 %stride, <4 x i1> %b, i32 %evl)
  ret <4 x double> %load
}

declare <8 x double> @llvm.experimental.vp.strided.load.v8f64.p0.i32(ptr, i32, <8 x i1>, i32)

define <8 x double> @strided_vpload_v8f64(ptr %ptr, i32 signext %stride, <8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8f64:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, ma
; CHECK-NEXT: vlse64.v v8, (a0), a1, v0.t
; CHECK-NEXT: ret
  %load = call <8 x double> @llvm.experimental.vp.strided.load.v8f64.p0.i32(ptr %ptr, i32 %stride, <8 x i1> %m, i32 %evl)
  ret <8 x double> %load
}

define <3 x double> @strided_vpload_v3f64(ptr %ptr, i32 signext %stride, <3 x i1> %mask, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v3f64:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, ma
; CHECK-NEXT: vlse64.v v8, (a0), a1, v0.t
; CHECK-NEXT: ret
  %v = call <3 x double> @llvm.experimental.vp.strided.load.v3f64.p0.i32(ptr %ptr, i32 %stride, <3 x i1> %mask, i32 %evl)
  ret <3 x double> %v
}

define <3 x double> @strided_vpload_v3f64_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v3f64_allones_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, ma
; CHECK-NEXT: vlse64.v v8, (a0), a1
; CHECK-NEXT: ret
  %one = insertelement <3 x i1> poison, i1 true, i32 0
  %allones = shufflevector <3 x i1> %one, <3 x i1> poison, <3 x i32> zeroinitializer
  %v = call <3 x double> @llvm.experimental.vp.strided.load.v3f64.p0.i32(ptr %ptr, i32 %stride, <3 x i1> %allones, i32 %evl)
  ret <3 x double> %v
}

declare <3 x double> @llvm.experimental.vp.strided.load.v3f64.p0.i32(ptr, i32, <3 x i1>, i32)

define <32 x double> @strided_vpload_v32f64(ptr %ptr, i32 signext %stride, <32 x i1> %m, i32 zeroext %evl) nounwind {
; CHECK-LABEL: strided_vpload_v32f64:
; CHECK: # %bb.0:
; CHECK-NEXT: li a4, 16
; CHECK-NEXT: vmv1r.v v9, v0
; CHECK-NEXT: mv a3, a2
; CHECK-NEXT: bltu a2, a4, .LBB40_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: li a3, 16
; CHECK-NEXT: .LBB40_2:
; CHECK-NEXT: mul a4, a3, a1
; CHECK-NEXT: add a4, a0, a4
; CHECK-NEXT: addi a5, a2, -16
; CHECK-NEXT: sltu a2, a2, a5
; CHECK-NEXT: addi a2, a2, -1
; CHECK-NEXT: and a2, a2, a5
; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v9, 2
; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: vlse64.v v16, (a4), a1, v0.t
; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; CHECK-NEXT: vmv1r.v v0, v9
; CHECK-NEXT: vlse64.v v8, (a0), a1, v0.t
; CHECK-NEXT: ret
  %load = call <32 x double> @llvm.experimental.vp.strided.load.v32f64.p0.i32(ptr %ptr, i32 %stride, <32 x i1> %m, i32 %evl)
  ret <32 x double> %load
}

define <32 x double> @strided_vpload_v32f64_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) nounwind {
; CHECK-LABEL: strided_vpload_v32f64_allones_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: li a4, 16
; CHECK-NEXT: mv a3, a2
; CHECK-NEXT: bltu a2, a4, .LBB41_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: li a3, 16
; CHECK-NEXT: .LBB41_2:
; CHECK-NEXT: mul a4, a3, a1
; CHECK-NEXT: add a4, a0, a4
; CHECK-NEXT: addi a5, a2, -16
; CHECK-NEXT: sltu a2, a2, a5
; CHECK-NEXT: addi a2, a2, -1
; CHECK-NEXT: and a2, a2, a5
; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; CHECK-NEXT: vlse64.v v16, (a4), a1
; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; CHECK-NEXT: vlse64.v v8, (a0), a1
; CHECK-NEXT: ret
  %one = insertelement <32 x i1> poison, i1 true, i32 0
  %allones = shufflevector <32 x i1> %one, <32 x i1> poison, <32 x i32> zeroinitializer
  %load = call <32 x double> @llvm.experimental.vp.strided.load.v32f64.p0.i32(ptr %ptr, i32 %stride, <32 x i1> %allones, i32 %evl)
  ret <32 x double> %load
}

declare <32 x double> @llvm.experimental.vp.strided.load.v32f64.p0.i32(ptr, i32, <32 x i1>, i32)

; Widening + splitting (with HiIsEmpty == true)
define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask, i32 zeroext %evl) {
; CHECK-RV32-LABEL: strided_load_v33f64:
; CHECK-RV32: # %bb.0:
; CHECK-RV32-NEXT: li a5, 32
; CHECK-RV32-NEXT: vmv1r.v v8, v0
; CHECK-RV32-NEXT: mv a3, a4
; CHECK-RV32-NEXT: bltu a4, a5, .LBB42_2
; CHECK-RV32-NEXT: # %bb.1:
; CHECK-RV32-NEXT: li a3, 32
; CHECK-RV32-NEXT: .LBB42_2:
; CHECK-RV32-NEXT: mul a5, a3, a2
; CHECK-RV32-NEXT: addi a6, a4, -32
; CHECK-RV32-NEXT: sltu a4, a4, a6
; CHECK-RV32-NEXT: addi a4, a4, -1
; CHECK-RV32-NEXT: and a6, a4, a6
; CHECK-RV32-NEXT: li a4, 16
; CHECK-RV32-NEXT: add a5, a1, a5
; CHECK-RV32-NEXT: bltu a6, a4, .LBB42_4
; CHECK-RV32-NEXT: # %bb.3:
; CHECK-RV32-NEXT: li a6, 16
; CHECK-RV32-NEXT: .LBB42_4:
; CHECK-RV32-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; CHECK-RV32-NEXT: vslidedown.vi v0, v8, 4
; CHECK-RV32-NEXT: vsetvli zero, a6, e64, m8, ta, ma
; CHECK-RV32-NEXT: vlse64.v v16, (a5), a2, v0.t
; CHECK-RV32-NEXT: addi a5, a3, -16
; CHECK-RV32-NEXT: sltu a6, a3, a5
; CHECK-RV32-NEXT: addi a6, a6, -1
; CHECK-RV32-NEXT: and a5, a6, a5
; CHECK-RV32-NEXT: bltu a3, a4, .LBB42_6
; CHECK-RV32-NEXT: # %bb.5:
; CHECK-RV32-NEXT: li a3, 16
; CHECK-RV32-NEXT: .LBB42_6:
; CHECK-RV32-NEXT: mul a4, a3, a2
; CHECK-RV32-NEXT: add a4, a1, a4
; CHECK-RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; CHECK-RV32-NEXT: vslidedown.vi v0, v8, 2
; CHECK-RV32-NEXT: vsetvli zero, a5, e64, m8, ta, ma
; CHECK-RV32-NEXT: vlse64.v v24, (a4), a2, v0.t
; CHECK-RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; CHECK-RV32-NEXT: vmv1r.v v0, v8
; CHECK-RV32-NEXT: vlse64.v v8, (a1), a2, v0.t
; CHECK-RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-RV32-NEXT: vse64.v v8, (a0)
; CHECK-RV32-NEXT: addi a1, a0, 128
; CHECK-RV32-NEXT: vse64.v v24, (a1)
; CHECK-RV32-NEXT: addi a0, a0, 256
; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; CHECK-RV32-NEXT: vse64.v v16, (a0)
; CHECK-RV32-NEXT: ret
;
; CHECK-RV64-LABEL: strided_load_v33f64:
; CHECK-RV64: # %bb.0:
; CHECK-RV64-NEXT: li a5, 32
; CHECK-RV64-NEXT: vmv1r.v v8, v0
; CHECK-RV64-NEXT: mv a4, a3
; CHECK-RV64-NEXT: bltu a3, a5, .LBB42_2
; CHECK-RV64-NEXT: # %bb.1:
; CHECK-RV64-NEXT: li a4, 32
; CHECK-RV64-NEXT: .LBB42_2:
; CHECK-RV64-NEXT: mul a5, a4, a2
; CHECK-RV64-NEXT: addi a6, a3, -32
; CHECK-RV64-NEXT: sltu a3, a3, a6
; CHECK-RV64-NEXT: addi a3, a3, -1
; CHECK-RV64-NEXT: and a6, a3, a6
; CHECK-RV64-NEXT: li a3, 16
; CHECK-RV64-NEXT: add a5, a1, a5
; CHECK-RV64-NEXT: bltu a6, a3, .LBB42_4
; CHECK-RV64-NEXT: # %bb.3:
; CHECK-RV64-NEXT: li a6, 16
; CHECK-RV64-NEXT: .LBB42_4:
; CHECK-RV64-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; CHECK-RV64-NEXT: vslidedown.vi v0, v8, 4
; CHECK-RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma
; CHECK-RV64-NEXT: vlse64.v v16, (a5), a2, v0.t
; CHECK-RV64-NEXT: addi a5, a4, -16
; CHECK-RV64-NEXT: sltu a6, a4, a5
; CHECK-RV64-NEXT: addi a6, a6, -1
; CHECK-RV64-NEXT: and a5, a6, a5
; CHECK-RV64-NEXT: bltu a4, a3, .LBB42_6
; CHECK-RV64-NEXT: # %bb.5:
; CHECK-RV64-NEXT: li a4, 16
; CHECK-RV64-NEXT: .LBB42_6:
; CHECK-RV64-NEXT: mul a3, a4, a2
; CHECK-RV64-NEXT: add a3, a1, a3
; CHECK-RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; CHECK-RV64-NEXT: vslidedown.vi v0, v8, 2
; CHECK-RV64-NEXT: vsetvli zero, a5, e64, m8, ta, ma
; CHECK-RV64-NEXT: vlse64.v v24, (a3), a2, v0.t
; CHECK-RV64-NEXT: vsetvli zero, a4, e64, m8, ta, ma
; CHECK-RV64-NEXT: vmv1r.v v0, v8
; CHECK-RV64-NEXT: vlse64.v v8, (a1), a2, v0.t
; CHECK-RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-RV64-NEXT: vse64.v v8, (a0)
; CHECK-RV64-NEXT: addi a1, a0, 128
; CHECK-RV64-NEXT: vse64.v v24, (a1)
; CHECK-RV64-NEXT: addi a0, a0, 256
; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; CHECK-RV64-NEXT: vse64.v v16, (a0)
; CHECK-RV64-NEXT: ret
  %v = call <33 x double> @llvm.experimental.vp.strided.load.v33f64.p0.i64(ptr %ptr, i64 %stride, <33 x i1> %mask, i32 %evl)
  ret <33 x double> %v
}

declare <33 x double> @llvm.experimental.vp.strided.load.v33f64.p0.i64(ptr, i64, <33 x i1>, i32)