; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+v,+zvfh,+zvfbfmin,+optimized-zero-stride-load \
; RUN:   -verify-machineinstrs < %s \
; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-RV32,CHECK-OPT
; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+v,+zvfh,+zvfbfmin,+optimized-zero-stride-load \
; RUN:   -verify-machineinstrs < %s \
; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,CHECK-OPT
; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+v,+zvfh,+zvfbfmin \
; RUN:   -verify-machineinstrs < %s \
; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-RV32,CHECK-NO-OPT,CHECK-NO-OPT-ZVFH
; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+v,+zvfh,+zvfbfmin \
; RUN:   -verify-machineinstrs < %s \
; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,CHECK-NO-OPT,CHECK-NO-OPT-ZVFH
; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+v,+zvfhmin,+zvfbfmin,+optimized-zero-stride-load \
; RUN:   -verify-machineinstrs < %s \
; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-RV32,CHECK-OPT
; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+v,+zvfhmin,+zvfbfmin,+optimized-zero-stride-load \
; RUN:   -verify-machineinstrs < %s \
; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,CHECK-OPT
; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+v,+zvfhmin,+zvfbfmin \
; RUN:   -verify-machineinstrs < %s \
; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-RV32,CHECK-NO-OPT,CHECK-NO-OPT-ZVFHMIN
; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+v,+zvfhmin,+zvfbfmin \
; RUN:   -verify-machineinstrs < %s \
; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,CHECK-NO-OPT,CHECK-NO-OPT-ZVFHMIN
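
; The CHECK-OPT prefixes cover the configurations built with
; +optimized-zero-stride-load; the CHECK-NO-OPT prefixes cover the same
; targets without that feature.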

declare <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i8(ptr, i8, <2 x i1>, i32)

define <2 x i8> @strided_vpload_v2i8_i8(ptr %ptr, i8 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2i8_i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e8, mf8, ta, ma
; CHECK-NEXT:    vlse8.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i8(ptr %ptr, i8 %stride, <2 x i1> %m, i32 %evl)
  ret <2 x i8> %load
}

declare <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i16(ptr, i16, <2 x i1>, i32)

define <2 x i8> @strided_vpload_v2i8_i16(ptr %ptr, i16 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2i8_i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e8, mf8, ta, ma
; CHECK-NEXT:    vlse8.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i16(ptr %ptr, i16 %stride, <2 x i1> %m, i32 %evl)
  ret <2 x i8> %load
}

declare <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr, i64, <2 x i1>, i32)

define <2 x i8> @strided_vpload_v2i8_i64(ptr %ptr, i64 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-RV32-LABEL: strided_vpload_v2i8_i64:
; CHECK-RV32:       # %bb.0:
; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, mf8, ta, ma
; CHECK-RV32-NEXT:    vlse8.v v8, (a0), a1, v0.t
; CHECK-RV32-NEXT:    ret
;
; CHECK-RV64-LABEL: strided_vpload_v2i8_i64:
; CHECK-RV64:       # %bb.0:
; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, mf8, ta, ma
; CHECK-RV64-NEXT:    vlse8.v v8, (a0), a1, v0.t
; CHECK-RV64-NEXT:    ret
  %load = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr %ptr, i64 %stride, <2 x i1> %m, i32 %evl)
  ret <2 x i8> %load
}

declare <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i32(ptr, i32, <2 x i1>, i32)

define <2 x i8> @strided_vpload_v2i8(ptr %ptr, i32 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e8, mf8, ta, ma
; CHECK-NEXT:    vlse8.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i32(ptr %ptr, i32 %stride, <2 x i1> %m, i32 %evl)
  ret <2 x i8> %load
}

declare <4 x i8> @llvm.experimental.vp.strided.load.v4i8.p0.i32(ptr, i32, <4 x i1>, i32)

define <4 x i8> @strided_vpload_v4i8(ptr %ptr, i32 signext %stride, <4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v4i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e8, mf4, ta, ma
; CHECK-NEXT:    vlse8.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <4 x i8> @llvm.experimental.vp.strided.load.v4i8.p0.i32(ptr %ptr, i32 %stride, <4 x i1> %m, i32 %evl)
  ret <4 x i8> %load
}

define <4 x i8> @strided_vpload_v4i8_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v4i8_allones_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e8, mf4, ta, ma
; CHECK-NEXT:    vlse8.v v8, (a0), a1
; CHECK-NEXT:    ret
  %load = call <4 x i8> @llvm.experimental.vp.strided.load.v4i8.p0.i32(ptr %ptr, i32 %stride, <4 x i1> splat (i1 true), i32 %evl)
  ret <4 x i8> %load
}

declare <8 x i8> @llvm.experimental.vp.strided.load.v8i8.p0.i32(ptr, i32, <8 x i1>, i32)

define <8 x i8> @strided_vpload_v8i8(ptr %ptr, i32 signext %stride, <8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e8, mf2, ta, ma
; CHECK-NEXT:    vlse8.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <8 x i8> @llvm.experimental.vp.strided.load.v8i8.p0.i32(ptr %ptr, i32 %stride, <8 x i1> %m, i32 %evl)
  ret <8 x i8> %load
}
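
; A constant stride equal to the element size is treated as a unit-stride
; access, so the unit-stride tests below select vle* rather than vlse*.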
define <8 x i8> @strided_vpload_v8i8_unit_stride(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8i8_unit_stride:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0), v0.t
; CHECK-NEXT:    ret
  %load = call <8 x i8> @llvm.experimental.vp.strided.load.v8i8.p0.i32(ptr %ptr, i32 1, <8 x i1> %m, i32 %evl)
  ret <8 x i8> %load
}

declare <2 x i16> @llvm.experimental.vp.strided.load.v2i16.p0.i32(ptr, i32, <2 x i1>, i32)

define <2 x i16> @strided_vpload_v2i16(ptr %ptr, i32 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e16, mf4, ta, ma
; CHECK-NEXT:    vlse16.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <2 x i16> @llvm.experimental.vp.strided.load.v2i16.p0.i32(ptr %ptr, i32 %stride, <2 x i1> %m, i32 %evl)
  ret <2 x i16> %load
}

declare <4 x i16> @llvm.experimental.vp.strided.load.v4i16.p0.i32(ptr, i32, <4 x i1>, i32)

define <4 x i16> @strided_vpload_v4i16(ptr %ptr, i32 signext %stride, <4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v4i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e16, mf2, ta, ma
; CHECK-NEXT:    vlse16.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <4 x i16> @llvm.experimental.vp.strided.load.v4i16.p0.i32(ptr %ptr, i32 %stride, <4 x i1> %m, i32 %evl)
  ret <4 x i16> %load
}

declare <8 x i16> @llvm.experimental.vp.strided.load.v8i16.p0.i32(ptr, i32, <8 x i1>, i32)

define <8 x i16> @strided_vpload_v8i16(ptr %ptr, i32 signext %stride, <8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
; CHECK-NEXT:    vlse16.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <8 x i16> @llvm.experimental.vp.strided.load.v8i16.p0.i32(ptr %ptr, i32 %stride, <8 x i1> %m, i32 %evl)
  ret <8 x i16> %load
}

define <8 x i16> @strided_vpload_v8i16_unit_stride(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8i16_unit_stride:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0), v0.t
; CHECK-NEXT:    ret
  %load = call <8 x i16> @llvm.experimental.vp.strided.load.v8i16.p0.i32(ptr %ptr, i32 2, <8 x i1> %m, i32 %evl)
  ret <8 x i16> %load
}

define <8 x i16> @strided_vpload_v8i16_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8i16_allones_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
; CHECK-NEXT:    vlse16.v v8, (a0), a1
; CHECK-NEXT:    ret
  %load = call <8 x i16> @llvm.experimental.vp.strided.load.v8i16.p0.i32(ptr %ptr, i32 %stride, <8 x i1> splat (i1 true), i32 %evl)
  ret <8 x i16> %load
}

declare <2 x i32> @llvm.experimental.vp.strided.load.v2i32.p0.i32(ptr, i32, <2 x i1>, i32)

define <2 x i32> @strided_vpload_v2i32(ptr %ptr, i32 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e32, mf2, ta, ma
; CHECK-NEXT:    vlse32.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <2 x i32> @llvm.experimental.vp.strided.load.v2i32.p0.i32(ptr %ptr, i32 %stride, <2 x i1> %m, i32 %evl)
  ret <2 x i32> %load
}

declare <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i32(ptr, i32, <4 x i1>, i32)

define <4 x i32> @strided_vpload_v4i32(ptr %ptr, i32 signext %stride, <4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v4i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
; CHECK-NEXT:    vlse32.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i32(ptr %ptr, i32 %stride, <4 x i1> %m, i32 %evl)
  ret <4 x i32> %load
}

define <4 x i32> @strided_vpload_v4i32_unit_stride(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v4i32_unit_stride:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0), v0.t
; CHECK-NEXT:    ret
  %load = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i32(ptr %ptr, i32 4, <4 x i1> %m, i32 %evl)
  ret <4 x i32> %load
}

declare <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i32(ptr, i32, <8 x i1>, i32)

define <8 x i32> @strided_vpload_v8i32(ptr %ptr, i32 signext %stride, <8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e32, m2, ta, ma
; CHECK-NEXT:    vlse32.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i32(ptr %ptr, i32 %stride, <8 x i1> %m, i32 %evl)
  ret <8 x i32> %load
}

define <8 x i32> @strided_vpload_v8i32_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8i32_allones_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e32, m2, ta, ma
; CHECK-NEXT:    vlse32.v v8, (a0), a1
; CHECK-NEXT:    ret
  %load = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i32(ptr %ptr, i32 %stride, <8 x i1> splat (i1 true), i32 %evl)
  ret <8 x i32> %load
}

declare <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i32(ptr, i32, <2 x i1>, i32)

define <2 x i64> @strided_vpload_v2i64(ptr %ptr, i32 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2i64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i32(ptr %ptr, i32 %stride, <2 x i1> %m, i32 %evl)
  ret <2 x i64> %load
}

define <2 x i64> @strided_vpload_v2i64_unit_stride(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2i64_unit_stride:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
; CHECK-NEXT:    vle64.v v8, (a0), v0.t
; CHECK-NEXT:    ret
  %load = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i32(ptr %ptr, i32 8, <2 x i1> %m, i32 %evl)
  ret <2 x i64> %load
}

declare <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i32(ptr, i32, <4 x i1>, i32)

define <4 x i64> @strided_vpload_v4i64(ptr %ptr, i32 signext %stride, <4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v4i64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i32(ptr %ptr, i32 %stride, <4 x i1> %m, i32 %evl)
  ret <4 x i64> %load
}

define <4 x i64> @strided_vpload_v4i64_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v4i64_allones_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a1
; CHECK-NEXT:    ret
  %load = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i32(ptr %ptr, i32 %stride, <4 x i1> splat (i1 true), i32 %evl)
  ret <4 x i64> %load
}

declare <8 x i64> @llvm.experimental.vp.strided.load.v8i64.p0.i32(ptr, i32, <8 x i1>, i32)

define <8 x i64> @strided_vpload_v8i64(ptr %ptr, i32 signext %stride, <8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8i64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <8 x i64> @llvm.experimental.vp.strided.load.v8i64.p0.i32(ptr %ptr, i32 %stride, <8 x i1> %m, i32 %evl)
  ret <8 x i64> %load
}
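
; bfloat and half strided loads use the same e16 vlse16.v/vle16.v lowering as
; i16; zvfbfmin and zvfh/zvfhmin only need to make the element types legal.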
declare <2 x bfloat> @llvm.experimental.vp.strided.load.v2bf16.p0.i32(ptr, i32, <2 x i1>, i32)

define <2 x bfloat> @strided_vpload_v2bf16(ptr %ptr, i32 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2bf16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e16, mf4, ta, ma
; CHECK-NEXT:    vlse16.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <2 x bfloat> @llvm.experimental.vp.strided.load.v2bf16.p0.i32(ptr %ptr, i32 %stride, <2 x i1> %m, i32 %evl)
  ret <2 x bfloat> %load
}

define <2 x bfloat> @strided_vpload_v2bf16_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2bf16_allones_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e16, mf4, ta, ma
; CHECK-NEXT:    vlse16.v v8, (a0), a1
; CHECK-NEXT:    ret
  %load = call <2 x bfloat> @llvm.experimental.vp.strided.load.v2bf16.p0.i32(ptr %ptr, i32 %stride, <2 x i1> splat (i1 true), i32 %evl)
  ret <2 x bfloat> %load
}

declare <4 x bfloat> @llvm.experimental.vp.strided.load.v4bf16.p0.i32(ptr, i32, <4 x i1>, i32)

define <4 x bfloat> @strided_vpload_v4bf16(ptr %ptr, i32 signext %stride, <4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v4bf16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e16, mf2, ta, ma
; CHECK-NEXT:    vlse16.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <4 x bfloat> @llvm.experimental.vp.strided.load.v4bf16.p0.i32(ptr %ptr, i32 %stride, <4 x i1> %m, i32 %evl)
  ret <4 x bfloat> %load
}

declare <8 x bfloat> @llvm.experimental.vp.strided.load.v8bf16.p0.i32(ptr, i32, <8 x i1>, i32)

define <8 x bfloat> @strided_vpload_v8bf16(ptr %ptr, i32 signext %stride, <8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8bf16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
; CHECK-NEXT:    vlse16.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <8 x bfloat> @llvm.experimental.vp.strided.load.v8bf16.p0.i32(ptr %ptr, i32 %stride, <8 x i1> %m, i32 %evl)
  ret <8 x bfloat> %load
}

define <8 x bfloat> @strided_vpload_v8bf16_unit_stride(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8bf16_unit_stride:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0), v0.t
; CHECK-NEXT:    ret
  %load = call <8 x bfloat> @llvm.experimental.vp.strided.load.v8bf16.p0.i32(ptr %ptr, i32 2, <8 x i1> %m, i32 %evl)
  ret <8 x bfloat> %load
}

declare <2 x half> @llvm.experimental.vp.strided.load.v2f16.p0.i32(ptr, i32, <2 x i1>, i32)

define <2 x half> @strided_vpload_v2f16(ptr %ptr, i32 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e16, mf4, ta, ma
; CHECK-NEXT:    vlse16.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <2 x half> @llvm.experimental.vp.strided.load.v2f16.p0.i32(ptr %ptr, i32 %stride, <2 x i1> %m, i32 %evl)
  ret <2 x half> %load
}

define <2 x half> @strided_vpload_v2f16_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2f16_allones_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e16, mf4, ta, ma
; CHECK-NEXT:    vlse16.v v8, (a0), a1
; CHECK-NEXT:    ret
  %load = call <2 x half> @llvm.experimental.vp.strided.load.v2f16.p0.i32(ptr %ptr, i32 %stride, <2 x i1> splat (i1 true), i32 %evl)
  ret <2 x half> %load
}

declare <4 x half> @llvm.experimental.vp.strided.load.v4f16.p0.i32(ptr, i32, <4 x i1>, i32)

define <4 x half> @strided_vpload_v4f16(ptr %ptr, i32 signext %stride, <4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v4f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e16, mf2, ta, ma
; CHECK-NEXT:    vlse16.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <4 x half> @llvm.experimental.vp.strided.load.v4f16.p0.i32(ptr %ptr, i32 %stride, <4 x i1> %m, i32 %evl)
  ret <4 x half> %load
}

declare <8 x half> @llvm.experimental.vp.strided.load.v8f16.p0.i32(ptr, i32, <8 x i1>, i32)

define <8 x half> @strided_vpload_v8f16(ptr %ptr, i32 signext %stride, <8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
; CHECK-NEXT:    vlse16.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <8 x half> @llvm.experimental.vp.strided.load.v8f16.p0.i32(ptr %ptr, i32 %stride, <8 x i1> %m, i32 %evl)
  ret <8 x half> %load
}

define <8 x half> @strided_vpload_v8f16_unit_stride(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8f16_unit_stride:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0), v0.t
; CHECK-NEXT:    ret
  %load = call <8 x half> @llvm.experimental.vp.strided.load.v8f16.p0.i32(ptr %ptr, i32 2, <8 x i1> %m, i32 %evl)
  ret <8 x half> %load
}

declare <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i32(ptr, i32, <2 x i1>, i32)

define <2 x float> @strided_vpload_v2f32(ptr %ptr, i32 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e32, mf2, ta, ma
; CHECK-NEXT:    vlse32.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i32(ptr %ptr, i32 %stride, <2 x i1> %m, i32 %evl)
  ret <2 x float> %load
}

declare <4 x float> @llvm.experimental.vp.strided.load.v4f32.p0.i32(ptr, i32, <4 x i1>, i32)

define <4 x float> @strided_vpload_v4f32(ptr %ptr, i32 signext %stride, <4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v4f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
; CHECK-NEXT:    vlse32.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <4 x float> @llvm.experimental.vp.strided.load.v4f32.p0.i32(ptr %ptr, i32 %stride, <4 x i1> %m, i32 %evl)
  ret <4 x float> %load
}

define <4 x float> @strided_vpload_v4f32_unit_stride(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v4f32_unit_stride:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0), v0.t
; CHECK-NEXT:    ret
  %load = call <4 x float> @llvm.experimental.vp.strided.load.v4f32.p0.i32(ptr %ptr, i32 4, <4 x i1> %m, i32 %evl)
  ret <4 x float> %load
}

declare <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i32(ptr, i32, <8 x i1>, i32)

define <8 x float> @strided_vpload_v8f32(ptr %ptr, i32 signext %stride, <8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e32, m2, ta, ma
; CHECK-NEXT:    vlse32.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i32(ptr %ptr, i32 %stride, <8 x i1> %m, i32 %evl)
  ret <8 x float> %load
}

define <8 x float> @strided_vpload_v8f32_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8f32_allones_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e32, m2, ta, ma
; CHECK-NEXT:    vlse32.v v8, (a0), a1
; CHECK-NEXT:    ret
  %load = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i32(ptr %ptr, i32 %stride, <8 x i1> splat (i1 true), i32 %evl)
  ret <8 x float> %load
}

declare <2 x double> @llvm.experimental.vp.strided.load.v2f64.p0.i32(ptr, i32, <2 x i1>, i32)

define <2 x double> @strided_vpload_v2f64(ptr %ptr, i32 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2f64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <2 x double> @llvm.experimental.vp.strided.load.v2f64.p0.i32(ptr %ptr, i32 %stride, <2 x i1> %m, i32 %evl)
  ret <2 x double> %load
}

define <2 x double> @strided_vpload_v2f64_unit_stride(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v2f64_unit_stride:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
; CHECK-NEXT:    vle64.v v8, (a0), v0.t
; CHECK-NEXT:    ret
  %load = call <2 x double> @llvm.experimental.vp.strided.load.v2f64.p0.i32(ptr %ptr, i32 8, <2 x i1> %m, i32 %evl)
  ret <2 x double> %load
}

declare <4 x double> @llvm.experimental.vp.strided.load.v4f64.p0.i32(ptr, i32, <4 x i1>, i32)

define <4 x double> @strided_vpload_v4f64(ptr %ptr, i32 signext %stride, <4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v4f64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <4 x double> @llvm.experimental.vp.strided.load.v4f64.p0.i32(ptr %ptr, i32 %stride, <4 x i1> %m, i32 %evl)
  ret <4 x double> %load
}

define <4 x double> @strided_vpload_v4f64_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v4f64_allones_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a1
; CHECK-NEXT:    ret
  %load = call <4 x double> @llvm.experimental.vp.strided.load.v4f64.p0.i32(ptr %ptr, i32 %stride, <4 x i1> splat (i1 true), i32 %evl)
  ret <4 x double> %load
}

declare <8 x double> @llvm.experimental.vp.strided.load.v8f64.p0.i32(ptr, i32, <8 x i1>, i32)

define <8 x double> @strided_vpload_v8f64(ptr %ptr, i32 signext %stride, <8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v8f64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <8 x double> @llvm.experimental.vp.strided.load.v8f64.p0.i32(ptr %ptr, i32 %stride, <8 x i1> %m, i32 %evl)
  ret <8 x double> %load
}

define <3 x double> @strided_vpload_v3f64(ptr %ptr, i32 signext %stride, <3 x i1> %mask, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v3f64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %v = call <3 x double> @llvm.experimental.vp.strided.load.v3f64.p0.i32(ptr %ptr, i32 %stride, <3 x i1> %mask, i32 %evl)
  ret <3 x double> %v
}

define <3 x double> @strided_vpload_v3f64_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) {
; CHECK-LABEL: strided_vpload_v3f64_allones_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a1
; CHECK-NEXT:    ret
  %v = call <3 x double> @llvm.experimental.vp.strided.load.v3f64.p0.i32(ptr %ptr, i32 %stride, <3 x i1> splat (i1 true), i32 %evl)
  ret <3 x double> %v
}

declare <3 x double> @llvm.experimental.vp.strided.load.v3f64.p0.i32(ptr, i32, <3 x i1>, i32)
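
; The <32 x double> result does not fit a single register group, so the load is
; split into two m8 vlse64.v operations: the first uses min(evl, 16) and the
; second the remaining EVL, computed branchlessly with sltu/addi/and.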
define <32 x double> @strided_vpload_v32f64(ptr %ptr, i32 signext %stride, <32 x i1> %m, i32 zeroext %evl) nounwind {
; CHECK-LABEL: strided_vpload_v32f64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmv1r.v v9, v0
; CHECK-NEXT:    li a4, 16
; CHECK-NEXT:    mv a3, a2
; CHECK-NEXT:    bltu a2, a4, .LBB45_2
; CHECK-NEXT:  # %bb.1:
; CHECK-NEXT:    li a3, 16
; CHECK-NEXT:  .LBB45_2:
; CHECK-NEXT:    mul a4, a3, a1
; CHECK-NEXT:    addi a5, a2, -16
; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
; CHECK-NEXT:    vslidedown.vi v8, v9, 2
; CHECK-NEXT:    add a4, a0, a4
; CHECK-NEXT:    sltu a2, a2, a5
; CHECK-NEXT:    addi a2, a2, -1
; CHECK-NEXT:    and a2, a2, a5
; CHECK-NEXT:    vmv1r.v v0, v8
; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
; CHECK-NEXT:    vlse64.v v16, (a4), a1, v0.t
; CHECK-NEXT:    vmv1r.v v0, v9
; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a1, v0.t
; CHECK-NEXT:    ret
  %load = call <32 x double> @llvm.experimental.vp.strided.load.v32f64.p0.i32(ptr %ptr, i32 %stride, <32 x i1> %m, i32 %evl)
  ret <32 x double> %load
}

define <32 x double> @strided_vpload_v32f64_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) nounwind {
; CHECK-LABEL: strided_vpload_v32f64_allones_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a4, 16
; CHECK-NEXT:    mv a3, a2
; CHECK-NEXT:    bltu a2, a4, .LBB46_2
; CHECK-NEXT:  # %bb.1:
; CHECK-NEXT:    li a3, 16
; CHECK-NEXT:  .LBB46_2:
; CHECK-NEXT:    mul a4, a3, a1
; CHECK-NEXT:    addi a5, a2, -16
; CHECK-NEXT:    add a4, a0, a4
; CHECK-NEXT:    sltu a2, a2, a5
; CHECK-NEXT:    addi a2, a2, -1
; CHECK-NEXT:    and a2, a2, a5
; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
; CHECK-NEXT:    vlse64.v v16, (a4), a1
; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a1
; CHECK-NEXT:    ret
  %load = call <32 x double> @llvm.experimental.vp.strided.load.v32f64.p0.i32(ptr %ptr, i32 %stride, <32 x i1> splat (i1 true), i32 %evl)
  ret <32 x double> %load
}

declare <32 x double> @llvm.experimental.vp.strided.load.v32f64.p0.i32(ptr, i32, <32 x i1>, i32)

; Widening + splitting (with HiIsEmpty == true)
define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask, i32 zeroext %evl) {
; CHECK-RV32-LABEL: strided_load_v33f64:
; CHECK-RV32:       # %bb.0:
; CHECK-RV32-NEXT:    vmv1r.v v8, v0
; CHECK-RV32-NEXT:    li a5, 32
; CHECK-RV32-NEXT:    mv a3, a4
; CHECK-RV32-NEXT:    bltu a4, a5, .LBB47_2
; CHECK-RV32-NEXT:  # %bb.1:
; CHECK-RV32-NEXT:    li a3, 32
; CHECK-RV32-NEXT:  .LBB47_2:
; CHECK-RV32-NEXT:    mul a6, a3, a2
; CHECK-RV32-NEXT:    addi a5, a4, -32
; CHECK-RV32-NEXT:    sltu a7, a4, a5
; CHECK-RV32-NEXT:    addi a7, a7, -1
; CHECK-RV32-NEXT:    and a7, a7, a5
; CHECK-RV32-NEXT:    li a5, 16
; CHECK-RV32-NEXT:    add a6, a1, a6
; CHECK-RV32-NEXT:    bltu a7, a5, .LBB47_4
; CHECK-RV32-NEXT:  # %bb.3:
; CHECK-RV32-NEXT:    li a7, 16
; CHECK-RV32-NEXT:  .LBB47_4:
; CHECK-RV32-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
; CHECK-RV32-NEXT:    vslidedown.vi v0, v8, 4
; CHECK-RV32-NEXT:    vsetvli zero, a7, e64, m8, ta, ma
; CHECK-RV32-NEXT:    vlse64.v v16, (a6), a2, v0.t
; CHECK-RV32-NEXT:    addi a6, a3, -16
; CHECK-RV32-NEXT:    sltu a3, a3, a6
; CHECK-RV32-NEXT:    addi a3, a3, -1
; CHECK-RV32-NEXT:    and a3, a3, a6
; CHECK-RV32-NEXT:    bltu a4, a5, .LBB47_6
; CHECK-RV32-NEXT:  # %bb.5:
; CHECK-RV32-NEXT:    li a4, 16
; CHECK-RV32-NEXT:  .LBB47_6:
; CHECK-RV32-NEXT:    mul a5, a4, a2
; CHECK-RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
; CHECK-RV32-NEXT:    vslidedown.vi v0, v8, 2
; CHECK-RV32-NEXT:    add a5, a1, a5
; CHECK-RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
; CHECK-RV32-NEXT:    vlse64.v v24, (a5), a2, v0.t
; CHECK-RV32-NEXT:    vmv1r.v v0, v8
; CHECK-RV32-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
; CHECK-RV32-NEXT:    vlse64.v v8, (a1), a2, v0.t
; CHECK-RV32-NEXT:    addi a1, a0, 128
; CHECK-RV32-NEXT:    addi a2, a0, 256
; CHECK-RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; CHECK-RV32-NEXT:    vse64.v v8, (a0)
; CHECK-RV32-NEXT:    vse64.v v24, (a1)
; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; CHECK-RV32-NEXT:    vse64.v v16, (a2)
; CHECK-RV32-NEXT:    ret
;
; CHECK-RV64-LABEL: strided_load_v33f64:
; CHECK-RV64:       # %bb.0:
; CHECK-RV64-NEXT:    vmv1r.v v8, v0
; CHECK-RV64-NEXT:    li a5, 32
; CHECK-RV64-NEXT:    mv a4, a3
; CHECK-RV64-NEXT:    bltu a3, a5, .LBB47_2
; CHECK-RV64-NEXT:  # %bb.1:
; CHECK-RV64-NEXT:    li a4, 32
; CHECK-RV64-NEXT:  .LBB47_2:
; CHECK-RV64-NEXT:    mul a6, a4, a2
; CHECK-RV64-NEXT:    addi a5, a3, -32
; CHECK-RV64-NEXT:    sltu a7, a3, a5
; CHECK-RV64-NEXT:    addi a7, a7, -1
; CHECK-RV64-NEXT:    and a7, a7, a5
; CHECK-RV64-NEXT:    li a5, 16
; CHECK-RV64-NEXT:    add a6, a1, a6
; CHECK-RV64-NEXT:    bltu a7, a5, .LBB47_4
; CHECK-RV64-NEXT:  # %bb.3:
; CHECK-RV64-NEXT:    li a7, 16
; CHECK-RV64-NEXT:  .LBB47_4:
; CHECK-RV64-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
; CHECK-RV64-NEXT:    vslidedown.vi v0, v8, 4
; CHECK-RV64-NEXT:    vsetvli zero, a7, e64, m8, ta, ma
; CHECK-RV64-NEXT:    vlse64.v v16, (a6), a2, v0.t
; CHECK-RV64-NEXT:    addi a6, a4, -16
; CHECK-RV64-NEXT:    sltu a4, a4, a6
; CHECK-RV64-NEXT:    addi a4, a4, -1
; CHECK-RV64-NEXT:    and a4, a4, a6
; CHECK-RV64-NEXT:    bltu a3, a5, .LBB47_6
; CHECK-RV64-NEXT:  # %bb.5:
; CHECK-RV64-NEXT:    li a3, 16
; CHECK-RV64-NEXT:  .LBB47_6:
; CHECK-RV64-NEXT:    mul a5, a3, a2
; CHECK-RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
; CHECK-RV64-NEXT:    vslidedown.vi v0, v8, 2
; CHECK-RV64-NEXT:    add a5, a1, a5
; CHECK-RV64-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
; CHECK-RV64-NEXT:    vlse64.v v24, (a5), a2, v0.t
; CHECK-RV64-NEXT:    vmv1r.v v0, v8
; CHECK-RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
; CHECK-RV64-NEXT:    vlse64.v v8, (a1), a2, v0.t
; CHECK-RV64-NEXT:    addi a1, a0, 128
; CHECK-RV64-NEXT:    addi a2, a0, 256
; CHECK-RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; CHECK-RV64-NEXT:    vse64.v v8, (a0)
; CHECK-RV64-NEXT:    vse64.v v24, (a1)
; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; CHECK-RV64-NEXT:    vse64.v v16, (a2)
; CHECK-RV64-NEXT:    ret
  %v = call <33 x double> @llvm.experimental.vp.strided.load.v33f64.p0.i64(ptr %ptr, i64 %stride, <33 x i1> %mask, i32 %evl)
  ret <33 x double> %v
}

declare <33 x double> @llvm.experimental.vp.strided.load.v33f64.p0.i64(ptr, i64, <33 x i1>, i32)
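
; With +optimized-zero-stride-load a zero stride stays as a vlse with zero as
; the stride operand; without it the load is expanded to a scalar load plus a
; splat, as the tests below check.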
; Test unmasked integer zero strided
define <4 x i8> @zero_strided_unmasked_vpload_4i8_i8(ptr %ptr) {
; CHECK-OPT-LABEL: zero_strided_unmasked_vpload_4i8_i8:
; CHECK-OPT:       # %bb.0:
; CHECK-OPT-NEXT:    vsetivli zero, 3, e8, mf4, ta, ma
; CHECK-OPT-NEXT:    vlse8.v v8, (a0), zero
; CHECK-OPT-NEXT:    ret
;
; CHECK-NO-OPT-LABEL: zero_strided_unmasked_vpload_4i8_i8:
; CHECK-NO-OPT:       # %bb.0:
; CHECK-NO-OPT-NEXT:    lbu a0, 0(a0)
; CHECK-NO-OPT-NEXT:    vsetivli zero, 3, e8, mf4, ta, ma
; CHECK-NO-OPT-NEXT:    vmv.v.x v8, a0
; CHECK-NO-OPT-NEXT:    ret
  %load = call <4 x i8> @llvm.experimental.vp.strided.load.4i8.p0.i8(ptr %ptr, i8 0, <4 x i1> splat (i1 true), i32 3)
  ret <4 x i8> %load
}

; Test unmasked float zero strided
define <4 x half> @zero_strided_unmasked_vpload_4f16(ptr %ptr) {
; CHECK-OPT-LABEL: zero_strided_unmasked_vpload_4f16:
; CHECK-OPT:       # %bb.0:
; CHECK-OPT-NEXT:    vsetivli zero, 3, e16, mf2, ta, ma
; CHECK-OPT-NEXT:    vlse16.v v8, (a0), zero
; CHECK-OPT-NEXT:    ret
;
; CHECK-NO-OPT-ZVFH-LABEL: zero_strided_unmasked_vpload_4f16:
; CHECK-NO-OPT-ZVFH:       # %bb.0:
; CHECK-NO-OPT-ZVFH-NEXT:    flh fa5, 0(a0)
; CHECK-NO-OPT-ZVFH-NEXT:    vsetivli zero, 3, e16, mf2, ta, ma
; CHECK-NO-OPT-ZVFH-NEXT:    vfmv.v.f v8, fa5
; CHECK-NO-OPT-ZVFH-NEXT:    ret
;
; CHECK-NO-OPT-ZVFHMIN-LABEL: zero_strided_unmasked_vpload_4f16:
; CHECK-NO-OPT-ZVFHMIN:       # %bb.0:
; CHECK-NO-OPT-ZVFHMIN-NEXT:    lh a0, 0(a0)
; CHECK-NO-OPT-ZVFHMIN-NEXT:    vsetivli zero, 3, e16, mf2, ta, ma
; CHECK-NO-OPT-ZVFHMIN-NEXT:    vmv.v.x v8, a0
; CHECK-NO-OPT-ZVFHMIN-NEXT:    ret
  %load = call <4 x half> @llvm.experimental.vp.strided.load.4f16.p0.i32(ptr %ptr, i32 0, <4 x i1> splat (i1 true), i32 3)
  ret <4 x half> %load
}
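
; On RV64 the zero-strided load below becomes a scalar load feeding vadd.vx;
; on RV32 it stays a zero-strided vlse64.v feeding vadd.vv.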
define <4 x i64> @zero_strided_vadd.vx(<4 x i64> %v, ptr %ptr) {
; CHECK-RV32-LABEL: zero_strided_vadd.vx:
; CHECK-RV32:       # %bb.0:
; CHECK-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-RV32-NEXT:    vlse64.v v10, (a0), zero
; CHECK-RV32-NEXT:    vadd.vv v8, v8, v10
; CHECK-RV32-NEXT:    ret
;
; CHECK-RV64-LABEL: zero_strided_vadd.vx:
; CHECK-RV64:       # %bb.0:
; CHECK-RV64-NEXT:    ld a0, 0(a0)
; CHECK-RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-RV64-NEXT:    vadd.vx v8, v8, a0
; CHECK-RV64-NEXT:    ret
  %load = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i32(ptr %ptr, i32 0, <4 x i1> splat (i1 true), i32 4)
  %w = add <4 x i64> %v, %load
  ret <4 x i64> %w
}