1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=riscv32 -mattr=+m,+v -verify-machineinstrs < %s \
3 ; RUN: | FileCheck %s --check-prefixes=SLOW,RV32-SLOW
4 ; RUN: llc -mtriple=riscv64 -mattr=+m,+v -verify-machineinstrs < %s \
5 ; RUN: | FileCheck %s --check-prefixes=SLOW,RV64-SLOW
6 ; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+fast-unaligned-access -verify-machineinstrs < %s \
7 ; RUN: | FileCheck %s --check-prefixes=FAST,RV32-FAST
8 ; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+fast-unaligned-access -verify-machineinstrs < %s \
9 ; RUN: | FileCheck %s --check-prefixes=FAST,RV64-FAST
; Plain load of a <4 x i32> vector whose pointer is only byte aligned
; (align 1).
; The runs without +fast-unaligned-access (SLOW prefix) expect the 16 bytes
; to be loaded as e8 elements with vle8.v; the runs with the feature
; (FAST prefix) expect a single e32 vle32.v.
11 define <4 x i32> @load_v4i32_align1(ptr %ptr) {
12 ; SLOW-LABEL: load_v4i32_align1:
14 ; SLOW-NEXT: vsetivli zero, 16, e8, m1, ta, ma
15 ; SLOW-NEXT: vle8.v v8, (a0)
18 ; FAST-LABEL: load_v4i32_align1:
20 ; FAST-NEXT: vsetivli zero, 4, e32, m1, ta, ma
21 ; FAST-NEXT: vle32.v v8, (a0)
23 %z = load <4 x i32>, ptr %ptr, align 1
; Same as the align 1 load test, but the pointer is 2-byte aligned (align 2),
; which is still below the 4-byte element size.
; The expected output is identical to the align 1 case: an e8 vle8.v for the
; SLOW prefix and an e32 vle32.v for the FAST prefix.
27 define <4 x i32> @load_v4i32_align2(ptr %ptr) {
28 ; SLOW-LABEL: load_v4i32_align2:
30 ; SLOW-NEXT: vsetivli zero, 16, e8, m1, ta, ma
31 ; SLOW-NEXT: vle8.v v8, (a0)
34 ; FAST-LABEL: load_v4i32_align2:
36 ; FAST-NEXT: vsetivli zero, 4, e32, m1, ta, ma
37 ; FAST-NEXT: vle32.v v8, (a0)
39 %z = load <4 x i32>, ptr %ptr, align 2
; Plain store of a <4 x i32> vector to a byte-aligned pointer (align 1).
; The SLOW prefix expects the value to be written as 16 e8 elements with
; vse8.v; the FAST prefix expects a single e32 vse32.v.
43 define void @store_v4i32_align1(<4 x i32> %x, ptr %ptr) {
44 ; SLOW-LABEL: store_v4i32_align1:
46 ; SLOW-NEXT: vsetivli zero, 16, e8, m1, ta, ma
47 ; SLOW-NEXT: vse8.v v8, (a0)
50 ; FAST-LABEL: store_v4i32_align1:
52 ; FAST-NEXT: vsetivli zero, 4, e32, m1, ta, ma
53 ; FAST-NEXT: vse32.v v8, (a0)
55 store <4 x i32> %x, ptr %ptr, align 1
; Same as the align 1 store test, but with a 2-byte aligned pointer (align 2).
; The expected output matches the align 1 case: vse8.v over 16 e8 elements
; for the SLOW prefix and an e32 vse32.v for the FAST prefix.
59 define void @store_v4i32_align2(<4 x i32> %x, ptr %ptr) {
60 ; SLOW-LABEL: store_v4i32_align2:
62 ; SLOW-NEXT: vsetivli zero, 16, e8, m1, ta, ma
63 ; SLOW-NEXT: vse8.v v8, (a0)
66 ; FAST-LABEL: store_v4i32_align2:
68 ; FAST-NEXT: vsetivli zero, 4, e32, m1, ta, ma
69 ; FAST-NEXT: vse32.v v8, (a0)
71 store <4 x i32> %x, ptr %ptr, align 2
; Masked gather of two i16 elements where the intrinsic's alignment argument
; is 1, i.e. below the 2-byte element size.
; Expected output for the slow-unaligned runs (RV32-SLOW / RV64-SLOW): the
; mask is moved to a scalar register and tested one bit at a time; each
; enabled lane extracts its pointer, reads the element with two lbu byte
; loads combined by slli 8 / or, and inserts it into the passthru vector (v9).
; Expected output for the fast-unaligned runs (RV32-FAST / RV64-FAST): a
; single masked indexed load (vluxei32.v on RV32, vluxei64.v on RV64).
75 declare <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i16>)
77 define <2 x i16> @mgather_v2i16_align1(<2 x ptr> %ptrs, <2 x i1> %m, <2 x i16> %passthru) {
78 ; RV32-SLOW-LABEL: mgather_v2i16_align1:
80 ; RV32-SLOW-NEXT: vsetivli zero, 1, e8, m1, ta, ma
81 ; RV32-SLOW-NEXT: vmv.x.s a0, v0
82 ; RV32-SLOW-NEXT: andi a1, a0, 1
83 ; RV32-SLOW-NEXT: bnez a1, .LBB4_3
84 ; RV32-SLOW-NEXT: # %bb.1: # %else
85 ; RV32-SLOW-NEXT: andi a0, a0, 2
86 ; RV32-SLOW-NEXT: bnez a0, .LBB4_4
87 ; RV32-SLOW-NEXT: .LBB4_2: # %else2
88 ; RV32-SLOW-NEXT: vmv1r.v v8, v9
90 ; RV32-SLOW-NEXT: .LBB4_3: # %cond.load
91 ; RV32-SLOW-NEXT: vsetvli zero, zero, e32, m4, ta, ma
92 ; RV32-SLOW-NEXT: vmv.x.s a1, v8
93 ; RV32-SLOW-NEXT: lbu a2, 1(a1)
94 ; RV32-SLOW-NEXT: lbu a1, 0(a1)
95 ; RV32-SLOW-NEXT: slli a2, a2, 8
96 ; RV32-SLOW-NEXT: or a1, a2, a1
97 ; RV32-SLOW-NEXT: vsetivli zero, 2, e16, m2, tu, ma
98 ; RV32-SLOW-NEXT: vmv.s.x v9, a1
99 ; RV32-SLOW-NEXT: andi a0, a0, 2
100 ; RV32-SLOW-NEXT: beqz a0, .LBB4_2
101 ; RV32-SLOW-NEXT: .LBB4_4: # %cond.load1
102 ; RV32-SLOW-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
103 ; RV32-SLOW-NEXT: vslidedown.vi v8, v8, 1
104 ; RV32-SLOW-NEXT: vmv.x.s a0, v8
105 ; RV32-SLOW-NEXT: lbu a1, 1(a0)
106 ; RV32-SLOW-NEXT: lbu a0, 0(a0)
107 ; RV32-SLOW-NEXT: slli a1, a1, 8
108 ; RV32-SLOW-NEXT: or a0, a1, a0
109 ; RV32-SLOW-NEXT: vmv.s.x v8, a0
110 ; RV32-SLOW-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
111 ; RV32-SLOW-NEXT: vslideup.vi v9, v8, 1
112 ; RV32-SLOW-NEXT: vmv1r.v v8, v9
113 ; RV32-SLOW-NEXT: ret
115 ; RV64-SLOW-LABEL: mgather_v2i16_align1:
116 ; RV64-SLOW: # %bb.0:
117 ; RV64-SLOW-NEXT: vsetivli zero, 1, e8, m1, ta, ma
118 ; RV64-SLOW-NEXT: vmv.x.s a0, v0
119 ; RV64-SLOW-NEXT: andi a1, a0, 1
120 ; RV64-SLOW-NEXT: bnez a1, .LBB4_3
121 ; RV64-SLOW-NEXT: # %bb.1: # %else
122 ; RV64-SLOW-NEXT: andi a0, a0, 2
123 ; RV64-SLOW-NEXT: bnez a0, .LBB4_4
124 ; RV64-SLOW-NEXT: .LBB4_2: # %else2
125 ; RV64-SLOW-NEXT: vmv1r.v v8, v9
126 ; RV64-SLOW-NEXT: ret
127 ; RV64-SLOW-NEXT: .LBB4_3: # %cond.load
128 ; RV64-SLOW-NEXT: vsetvli zero, zero, e64, m8, ta, ma
129 ; RV64-SLOW-NEXT: vmv.x.s a1, v8
130 ; RV64-SLOW-NEXT: lbu a2, 1(a1)
131 ; RV64-SLOW-NEXT: lbu a1, 0(a1)
132 ; RV64-SLOW-NEXT: slli a2, a2, 8
133 ; RV64-SLOW-NEXT: or a1, a2, a1
134 ; RV64-SLOW-NEXT: vsetivli zero, 2, e16, m2, tu, ma
135 ; RV64-SLOW-NEXT: vmv.s.x v9, a1
136 ; RV64-SLOW-NEXT: andi a0, a0, 2
137 ; RV64-SLOW-NEXT: beqz a0, .LBB4_2
138 ; RV64-SLOW-NEXT: .LBB4_4: # %cond.load1
139 ; RV64-SLOW-NEXT: vsetivli zero, 1, e64, m1, ta, ma
140 ; RV64-SLOW-NEXT: vslidedown.vi v8, v8, 1
141 ; RV64-SLOW-NEXT: vmv.x.s a0, v8
142 ; RV64-SLOW-NEXT: lbu a1, 1(a0)
143 ; RV64-SLOW-NEXT: lbu a0, 0(a0)
144 ; RV64-SLOW-NEXT: slli a1, a1, 8
145 ; RV64-SLOW-NEXT: or a0, a1, a0
146 ; RV64-SLOW-NEXT: vmv.s.x v8, a0
147 ; RV64-SLOW-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
148 ; RV64-SLOW-NEXT: vslideup.vi v9, v8, 1
149 ; RV64-SLOW-NEXT: vmv1r.v v8, v9
150 ; RV64-SLOW-NEXT: ret
152 ; RV32-FAST-LABEL: mgather_v2i16_align1:
153 ; RV32-FAST: # %bb.0:
154 ; RV32-FAST-NEXT: vsetivli zero, 2, e16, mf4, ta, mu
155 ; RV32-FAST-NEXT: vluxei32.v v9, (zero), v8, v0.t
156 ; RV32-FAST-NEXT: vmv1r.v v8, v9
157 ; RV32-FAST-NEXT: ret
159 ; RV64-FAST-LABEL: mgather_v2i16_align1:
160 ; RV64-FAST: # %bb.0:
161 ; RV64-FAST-NEXT: vsetivli zero, 2, e16, mf4, ta, mu
162 ; RV64-FAST-NEXT: vluxei64.v v9, (zero), v8, v0.t
163 ; RV64-FAST-NEXT: vmv1r.v v8, v9
164 ; RV64-FAST-NEXT: ret
; The i32 1 operand below is the alignment under test.
165 %v = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> %ptrs, i32 1, <2 x i1> %m, <2 x i16> %passthru)
; Masked gather of two i64 elements where the intrinsic's alignment argument
; is 4, i.e. half of the 8-byte element size.
; Expected output for the slow-unaligned runs: lanes are handled one at a
; time behind per-bit mask branches. RV32-SLOW reads each element with two
; lw loads and builds it with vslide1down.vx; RV64-SLOW reads it with two
; lwu loads combined by slli 32 / or and inserts it with vmv.s.x.
; Expected output for the fast-unaligned runs: a single masked indexed load
; (vluxei32.v on RV32, vluxei64.v on RV64).
169 declare <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i64>)
171 define <2 x i64> @mgather_v2i64_align4(<2 x ptr> %ptrs, <2 x i1> %m, <2 x i64> %passthru) {
172 ; RV32-SLOW-LABEL: mgather_v2i64_align4:
173 ; RV32-SLOW: # %bb.0:
174 ; RV32-SLOW-NEXT: vsetivli zero, 1, e8, m1, ta, ma
175 ; RV32-SLOW-NEXT: vmv.x.s a0, v0
176 ; RV32-SLOW-NEXT: andi a1, a0, 1
177 ; RV32-SLOW-NEXT: bnez a1, .LBB5_3
178 ; RV32-SLOW-NEXT: # %bb.1: # %else
179 ; RV32-SLOW-NEXT: andi a0, a0, 2
180 ; RV32-SLOW-NEXT: bnez a0, .LBB5_4
181 ; RV32-SLOW-NEXT: .LBB5_2: # %else2
182 ; RV32-SLOW-NEXT: vmv1r.v v8, v9
183 ; RV32-SLOW-NEXT: ret
184 ; RV32-SLOW-NEXT: .LBB5_3: # %cond.load
185 ; RV32-SLOW-NEXT: vsetivli zero, 2, e32, m1, tu, ma
186 ; RV32-SLOW-NEXT: vmv.x.s a1, v8
187 ; RV32-SLOW-NEXT: lw a2, 0(a1)
188 ; RV32-SLOW-NEXT: lw a1, 4(a1)
189 ; RV32-SLOW-NEXT: vslide1down.vx v9, v9, a2
190 ; RV32-SLOW-NEXT: vslide1down.vx v9, v9, a1
191 ; RV32-SLOW-NEXT: andi a0, a0, 2
192 ; RV32-SLOW-NEXT: beqz a0, .LBB5_2
193 ; RV32-SLOW-NEXT: .LBB5_4: # %cond.load1
194 ; RV32-SLOW-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
195 ; RV32-SLOW-NEXT: vslidedown.vi v8, v8, 1
196 ; RV32-SLOW-NEXT: vmv.x.s a0, v8
197 ; RV32-SLOW-NEXT: lw a1, 0(a0)
198 ; RV32-SLOW-NEXT: lw a0, 4(a0)
199 ; RV32-SLOW-NEXT: vsetivli zero, 2, e32, m1, ta, ma
200 ; RV32-SLOW-NEXT: vslide1down.vx v8, v8, a1
201 ; RV32-SLOW-NEXT: vslide1down.vx v8, v8, a0
202 ; RV32-SLOW-NEXT: vsetivli zero, 2, e64, m1, ta, ma
203 ; RV32-SLOW-NEXT: vslideup.vi v9, v8, 1
204 ; RV32-SLOW-NEXT: vmv1r.v v8, v9
205 ; RV32-SLOW-NEXT: ret
207 ; RV64-SLOW-LABEL: mgather_v2i64_align4:
208 ; RV64-SLOW: # %bb.0:
209 ; RV64-SLOW-NEXT: vsetivli zero, 1, e8, m1, ta, ma
210 ; RV64-SLOW-NEXT: vmv.x.s a0, v0
211 ; RV64-SLOW-NEXT: andi a1, a0, 1
212 ; RV64-SLOW-NEXT: bnez a1, .LBB5_3
213 ; RV64-SLOW-NEXT: # %bb.1: # %else
214 ; RV64-SLOW-NEXT: andi a0, a0, 2
215 ; RV64-SLOW-NEXT: bnez a0, .LBB5_4
216 ; RV64-SLOW-NEXT: .LBB5_2: # %else2
217 ; RV64-SLOW-NEXT: vmv1r.v v8, v9
218 ; RV64-SLOW-NEXT: ret
219 ; RV64-SLOW-NEXT: .LBB5_3: # %cond.load
220 ; RV64-SLOW-NEXT: vsetivli zero, 2, e64, m8, tu, ma
221 ; RV64-SLOW-NEXT: vmv.x.s a1, v8
222 ; RV64-SLOW-NEXT: lwu a2, 4(a1)
223 ; RV64-SLOW-NEXT: lwu a1, 0(a1)
224 ; RV64-SLOW-NEXT: slli a2, a2, 32
225 ; RV64-SLOW-NEXT: or a1, a2, a1
226 ; RV64-SLOW-NEXT: vmv.s.x v9, a1
227 ; RV64-SLOW-NEXT: andi a0, a0, 2
228 ; RV64-SLOW-NEXT: beqz a0, .LBB5_2
229 ; RV64-SLOW-NEXT: .LBB5_4: # %cond.load1
230 ; RV64-SLOW-NEXT: vsetivli zero, 1, e64, m1, ta, ma
231 ; RV64-SLOW-NEXT: vslidedown.vi v8, v8, 1
232 ; RV64-SLOW-NEXT: vmv.x.s a0, v8
233 ; RV64-SLOW-NEXT: lwu a1, 4(a0)
234 ; RV64-SLOW-NEXT: lwu a0, 0(a0)
235 ; RV64-SLOW-NEXT: slli a1, a1, 32
236 ; RV64-SLOW-NEXT: or a0, a1, a0
237 ; RV64-SLOW-NEXT: vmv.s.x v8, a0
238 ; RV64-SLOW-NEXT: vsetivli zero, 2, e64, m1, ta, ma
239 ; RV64-SLOW-NEXT: vslideup.vi v9, v8, 1
240 ; RV64-SLOW-NEXT: vmv1r.v v8, v9
241 ; RV64-SLOW-NEXT: ret
243 ; RV32-FAST-LABEL: mgather_v2i64_align4:
244 ; RV32-FAST: # %bb.0:
245 ; RV32-FAST-NEXT: vsetivli zero, 2, e64, m1, ta, mu
246 ; RV32-FAST-NEXT: vluxei32.v v9, (zero), v8, v0.t
247 ; RV32-FAST-NEXT: vmv.v.v v8, v9
248 ; RV32-FAST-NEXT: ret
250 ; RV64-FAST-LABEL: mgather_v2i64_align4:
251 ; RV64-FAST: # %bb.0:
252 ; RV64-FAST-NEXT: vsetivli zero, 2, e64, m1, ta, mu
253 ; RV64-FAST-NEXT: vluxei64.v v9, (zero), v8, v0.t
254 ; RV64-FAST-NEXT: vmv.v.v v8, v9
255 ; RV64-FAST-NEXT: ret
; The i32 4 operand below is the alignment under test.
256 %v = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %m, <2 x i64> %passthru)
; Masked scatter of four i16 elements where the intrinsic's alignment
; argument is 1, i.e. below the 2-byte element size.
; Expected output for the slow-unaligned runs: the mask is moved to a scalar
; register and each of the four bits is tested in turn; every enabled lane
; extracts its value and pointer (vslidedown.vi / vmv.x.s) and writes the
; element as two sb byte stores, the high byte obtained with srli 8.
; Expected output for the fast-unaligned runs: a single masked indexed store
; (vsoxei32.v on RV32, vsoxei64.v on RV64).
260 declare void @llvm.masked.scatter.v4i16.v4p0(<4 x i16>, <4 x ptr>, i32, <4 x i1>)
262 define void @mscatter_v4i16_align1(<4 x i16> %val, <4 x ptr> %ptrs, <4 x i1> %m) {
263 ; RV32-SLOW-LABEL: mscatter_v4i16_align1:
264 ; RV32-SLOW: # %bb.0:
265 ; RV32-SLOW-NEXT: vsetivli zero, 1, e8, m1, ta, ma
266 ; RV32-SLOW-NEXT: vmv.x.s a0, v0
267 ; RV32-SLOW-NEXT: andi a1, a0, 1
268 ; RV32-SLOW-NEXT: bnez a1, .LBB6_5
269 ; RV32-SLOW-NEXT: # %bb.1: # %else
270 ; RV32-SLOW-NEXT: andi a1, a0, 2
271 ; RV32-SLOW-NEXT: bnez a1, .LBB6_6
272 ; RV32-SLOW-NEXT: .LBB6_2: # %else2
273 ; RV32-SLOW-NEXT: andi a1, a0, 4
274 ; RV32-SLOW-NEXT: bnez a1, .LBB6_7
275 ; RV32-SLOW-NEXT: .LBB6_3: # %else4
276 ; RV32-SLOW-NEXT: andi a0, a0, 8
277 ; RV32-SLOW-NEXT: bnez a0, .LBB6_8
278 ; RV32-SLOW-NEXT: .LBB6_4: # %else6
279 ; RV32-SLOW-NEXT: ret
280 ; RV32-SLOW-NEXT: .LBB6_5: # %cond.store
281 ; RV32-SLOW-NEXT: vsetvli zero, zero, e16, m2, ta, ma
282 ; RV32-SLOW-NEXT: vmv.x.s a1, v8
283 ; RV32-SLOW-NEXT: vsetvli zero, zero, e32, m4, ta, ma
284 ; RV32-SLOW-NEXT: vmv.x.s a2, v9
285 ; RV32-SLOW-NEXT: sb a1, 0(a2)
286 ; RV32-SLOW-NEXT: srli a1, a1, 8
287 ; RV32-SLOW-NEXT: sb a1, 1(a2)
288 ; RV32-SLOW-NEXT: andi a1, a0, 2
289 ; RV32-SLOW-NEXT: beqz a1, .LBB6_2
290 ; RV32-SLOW-NEXT: .LBB6_6: # %cond.store1
291 ; RV32-SLOW-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
292 ; RV32-SLOW-NEXT: vslidedown.vi v10, v8, 1
293 ; RV32-SLOW-NEXT: vmv.x.s a1, v10
294 ; RV32-SLOW-NEXT: vsetvli zero, zero, e32, m1, ta, ma
295 ; RV32-SLOW-NEXT: vslidedown.vi v10, v9, 1
296 ; RV32-SLOW-NEXT: vmv.x.s a2, v10
297 ; RV32-SLOW-NEXT: sb a1, 0(a2)
298 ; RV32-SLOW-NEXT: srli a1, a1, 8
299 ; RV32-SLOW-NEXT: sb a1, 1(a2)
300 ; RV32-SLOW-NEXT: andi a1, a0, 4
301 ; RV32-SLOW-NEXT: beqz a1, .LBB6_3
302 ; RV32-SLOW-NEXT: .LBB6_7: # %cond.store3
303 ; RV32-SLOW-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
304 ; RV32-SLOW-NEXT: vslidedown.vi v10, v8, 2
305 ; RV32-SLOW-NEXT: vmv.x.s a1, v10
306 ; RV32-SLOW-NEXT: vsetvli zero, zero, e32, m1, ta, ma
307 ; RV32-SLOW-NEXT: vslidedown.vi v10, v9, 2
308 ; RV32-SLOW-NEXT: vmv.x.s a2, v10
309 ; RV32-SLOW-NEXT: sb a1, 0(a2)
310 ; RV32-SLOW-NEXT: srli a1, a1, 8
311 ; RV32-SLOW-NEXT: sb a1, 1(a2)
312 ; RV32-SLOW-NEXT: andi a0, a0, 8
313 ; RV32-SLOW-NEXT: beqz a0, .LBB6_4
314 ; RV32-SLOW-NEXT: .LBB6_8: # %cond.store5
315 ; RV32-SLOW-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
316 ; RV32-SLOW-NEXT: vslidedown.vi v8, v8, 3
317 ; RV32-SLOW-NEXT: vmv.x.s a0, v8
318 ; RV32-SLOW-NEXT: vsetvli zero, zero, e32, m1, ta, ma
319 ; RV32-SLOW-NEXT: vslidedown.vi v8, v9, 3
320 ; RV32-SLOW-NEXT: vmv.x.s a1, v8
321 ; RV32-SLOW-NEXT: sb a0, 0(a1)
322 ; RV32-SLOW-NEXT: srli a0, a0, 8
323 ; RV32-SLOW-NEXT: sb a0, 1(a1)
324 ; RV32-SLOW-NEXT: ret
326 ; RV64-SLOW-LABEL: mscatter_v4i16_align1:
327 ; RV64-SLOW: # %bb.0:
328 ; RV64-SLOW-NEXT: vsetivli zero, 1, e8, m1, ta, ma
329 ; RV64-SLOW-NEXT: vmv.x.s a0, v0
330 ; RV64-SLOW-NEXT: andi a1, a0, 1
331 ; RV64-SLOW-NEXT: bnez a1, .LBB6_5
332 ; RV64-SLOW-NEXT: # %bb.1: # %else
333 ; RV64-SLOW-NEXT: andi a1, a0, 2
334 ; RV64-SLOW-NEXT: bnez a1, .LBB6_6
335 ; RV64-SLOW-NEXT: .LBB6_2: # %else2
336 ; RV64-SLOW-NEXT: andi a1, a0, 4
337 ; RV64-SLOW-NEXT: bnez a1, .LBB6_7
338 ; RV64-SLOW-NEXT: .LBB6_3: # %else4
339 ; RV64-SLOW-NEXT: andi a0, a0, 8
340 ; RV64-SLOW-NEXT: bnez a0, .LBB6_8
341 ; RV64-SLOW-NEXT: .LBB6_4: # %else6
342 ; RV64-SLOW-NEXT: ret
343 ; RV64-SLOW-NEXT: .LBB6_5: # %cond.store
344 ; RV64-SLOW-NEXT: vsetvli zero, zero, e16, m2, ta, ma
345 ; RV64-SLOW-NEXT: vmv.x.s a1, v8
346 ; RV64-SLOW-NEXT: vsetvli zero, zero, e64, m8, ta, ma
347 ; RV64-SLOW-NEXT: vmv.x.s a2, v10
348 ; RV64-SLOW-NEXT: srli a3, a1, 8
349 ; RV64-SLOW-NEXT: sb a3, 1(a2)
350 ; RV64-SLOW-NEXT: sb a1, 0(a2)
351 ; RV64-SLOW-NEXT: andi a1, a0, 2
352 ; RV64-SLOW-NEXT: beqz a1, .LBB6_2
353 ; RV64-SLOW-NEXT: .LBB6_6: # %cond.store1
354 ; RV64-SLOW-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
355 ; RV64-SLOW-NEXT: vslidedown.vi v9, v8, 1
356 ; RV64-SLOW-NEXT: vmv.x.s a1, v9
357 ; RV64-SLOW-NEXT: vsetivli zero, 1, e64, m1, ta, ma
358 ; RV64-SLOW-NEXT: vslidedown.vi v9, v10, 1
359 ; RV64-SLOW-NEXT: vmv.x.s a2, v9
360 ; RV64-SLOW-NEXT: srli a3, a1, 8
361 ; RV64-SLOW-NEXT: sb a3, 1(a2)
362 ; RV64-SLOW-NEXT: sb a1, 0(a2)
363 ; RV64-SLOW-NEXT: andi a1, a0, 4
364 ; RV64-SLOW-NEXT: beqz a1, .LBB6_3
365 ; RV64-SLOW-NEXT: .LBB6_7: # %cond.store3
366 ; RV64-SLOW-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
367 ; RV64-SLOW-NEXT: vslidedown.vi v9, v8, 2
368 ; RV64-SLOW-NEXT: vmv.x.s a1, v9
369 ; RV64-SLOW-NEXT: vsetvli zero, zero, e64, m2, ta, ma
370 ; RV64-SLOW-NEXT: vslidedown.vi v12, v10, 2
371 ; RV64-SLOW-NEXT: vmv.x.s a2, v12
372 ; RV64-SLOW-NEXT: srli a3, a1, 8
373 ; RV64-SLOW-NEXT: sb a3, 1(a2)
374 ; RV64-SLOW-NEXT: sb a1, 0(a2)
375 ; RV64-SLOW-NEXT: andi a0, a0, 8
376 ; RV64-SLOW-NEXT: beqz a0, .LBB6_4
377 ; RV64-SLOW-NEXT: .LBB6_8: # %cond.store5
378 ; RV64-SLOW-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
379 ; RV64-SLOW-NEXT: vslidedown.vi v8, v8, 3
380 ; RV64-SLOW-NEXT: vmv.x.s a0, v8
381 ; RV64-SLOW-NEXT: vsetvli zero, zero, e64, m2, ta, ma
382 ; RV64-SLOW-NEXT: vslidedown.vi v8, v10, 3
383 ; RV64-SLOW-NEXT: vmv.x.s a1, v8
384 ; RV64-SLOW-NEXT: srli a2, a0, 8
385 ; RV64-SLOW-NEXT: sb a2, 1(a1)
386 ; RV64-SLOW-NEXT: sb a0, 0(a1)
387 ; RV64-SLOW-NEXT: ret
389 ; RV32-FAST-LABEL: mscatter_v4i16_align1:
390 ; RV32-FAST: # %bb.0:
391 ; RV32-FAST-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
392 ; RV32-FAST-NEXT: vsoxei32.v v8, (zero), v9, v0.t
393 ; RV32-FAST-NEXT: ret
395 ; RV64-FAST-LABEL: mscatter_v4i16_align1:
396 ; RV64-FAST: # %bb.0:
397 ; RV64-FAST-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
398 ; RV64-FAST-NEXT: vsoxei64.v v8, (zero), v10, v0.t
399 ; RV64-FAST-NEXT: ret
; The i32 1 operand below is the alignment under test.
400 call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %m)
; Masked scatter of two i32 elements where the intrinsic's alignment argument
; is 2, i.e. half of the 4-byte element size.
; Expected output for the slow-unaligned runs: lanes are handled one at a
; time behind per-bit mask branches, and each enabled element is written as
; two sh halfword stores, the upper half obtained with srli 16.
; Expected output for the fast-unaligned runs: a single masked indexed store
; (vsoxei32.v on RV32, vsoxei64.v on RV64).
404 declare void @llvm.masked.scatter.v2i32.v2p0(<2 x i32>, <2 x ptr>, i32, <2 x i1>)
406 define void @mscatter_v2i32_align2(<2 x i32> %val, <2 x ptr> %ptrs, <2 x i1> %m) {
407 ; RV32-SLOW-LABEL: mscatter_v2i32_align2:
408 ; RV32-SLOW: # %bb.0:
409 ; RV32-SLOW-NEXT: vsetivli zero, 1, e8, m1, ta, ma
410 ; RV32-SLOW-NEXT: vmv.x.s a0, v0
411 ; RV32-SLOW-NEXT: andi a1, a0, 1
412 ; RV32-SLOW-NEXT: bnez a1, .LBB7_3
413 ; RV32-SLOW-NEXT: # %bb.1: # %else
414 ; RV32-SLOW-NEXT: andi a0, a0, 2
415 ; RV32-SLOW-NEXT: bnez a0, .LBB7_4
416 ; RV32-SLOW-NEXT: .LBB7_2: # %else2
417 ; RV32-SLOW-NEXT: ret
418 ; RV32-SLOW-NEXT: .LBB7_3: # %cond.store
419 ; RV32-SLOW-NEXT: vsetvli zero, zero, e32, m4, ta, ma
420 ; RV32-SLOW-NEXT: vmv.x.s a1, v8
421 ; RV32-SLOW-NEXT: vmv.x.s a2, v9
422 ; RV32-SLOW-NEXT: sh a1, 0(a2)
423 ; RV32-SLOW-NEXT: srli a1, a1, 16
424 ; RV32-SLOW-NEXT: sh a1, 2(a2)
425 ; RV32-SLOW-NEXT: andi a0, a0, 2
426 ; RV32-SLOW-NEXT: beqz a0, .LBB7_2
427 ; RV32-SLOW-NEXT: .LBB7_4: # %cond.store1
428 ; RV32-SLOW-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
429 ; RV32-SLOW-NEXT: vslidedown.vi v8, v8, 1
430 ; RV32-SLOW-NEXT: vmv.x.s a0, v8
431 ; RV32-SLOW-NEXT: vslidedown.vi v8, v9, 1
432 ; RV32-SLOW-NEXT: vmv.x.s a1, v8
433 ; RV32-SLOW-NEXT: sh a0, 0(a1)
434 ; RV32-SLOW-NEXT: srli a0, a0, 16
435 ; RV32-SLOW-NEXT: sh a0, 2(a1)
436 ; RV32-SLOW-NEXT: ret
438 ; RV64-SLOW-LABEL: mscatter_v2i32_align2:
439 ; RV64-SLOW: # %bb.0:
440 ; RV64-SLOW-NEXT: vsetivli zero, 1, e8, m1, ta, ma
441 ; RV64-SLOW-NEXT: vmv.x.s a0, v0
442 ; RV64-SLOW-NEXT: andi a1, a0, 1
443 ; RV64-SLOW-NEXT: bnez a1, .LBB7_3
444 ; RV64-SLOW-NEXT: # %bb.1: # %else
445 ; RV64-SLOW-NEXT: andi a0, a0, 2
446 ; RV64-SLOW-NEXT: bnez a0, .LBB7_4
447 ; RV64-SLOW-NEXT: .LBB7_2: # %else2
448 ; RV64-SLOW-NEXT: ret
449 ; RV64-SLOW-NEXT: .LBB7_3: # %cond.store
450 ; RV64-SLOW-NEXT: vsetvli zero, zero, e32, m4, ta, ma
451 ; RV64-SLOW-NEXT: vmv.x.s a1, v8
452 ; RV64-SLOW-NEXT: vsetvli zero, zero, e64, m8, ta, ma
453 ; RV64-SLOW-NEXT: vmv.x.s a2, v9
454 ; RV64-SLOW-NEXT: sh a1, 0(a2)
455 ; RV64-SLOW-NEXT: srli a1, a1, 16
456 ; RV64-SLOW-NEXT: sh a1, 2(a2)
457 ; RV64-SLOW-NEXT: andi a0, a0, 2
458 ; RV64-SLOW-NEXT: beqz a0, .LBB7_2
459 ; RV64-SLOW-NEXT: .LBB7_4: # %cond.store1
460 ; RV64-SLOW-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
461 ; RV64-SLOW-NEXT: vslidedown.vi v8, v8, 1
462 ; RV64-SLOW-NEXT: vmv.x.s a0, v8
463 ; RV64-SLOW-NEXT: vsetvli zero, zero, e64, m1, ta, ma
464 ; RV64-SLOW-NEXT: vslidedown.vi v8, v9, 1
465 ; RV64-SLOW-NEXT: vmv.x.s a1, v8
466 ; RV64-SLOW-NEXT: sh a0, 0(a1)
467 ; RV64-SLOW-NEXT: srli a0, a0, 16
468 ; RV64-SLOW-NEXT: sh a0, 2(a1)
469 ; RV64-SLOW-NEXT: ret
471 ; RV32-FAST-LABEL: mscatter_v2i32_align2:
472 ; RV32-FAST: # %bb.0:
473 ; RV32-FAST-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
474 ; RV32-FAST-NEXT: vsoxei32.v v8, (zero), v9, v0.t
475 ; RV32-FAST-NEXT: ret
477 ; RV64-FAST-LABEL: mscatter_v2i32_align2:
478 ; RV64-FAST: # %bb.0:
479 ; RV64-FAST-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
480 ; RV64-FAST-NEXT: vsoxei64.v v8, (zero), v9, v0.t
481 ; RV64-FAST-NEXT: ret
; The i32 2 operand below is the alignment under test.
482 call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> %val, <2 x ptr> %ptrs, i32 2, <2 x i1> %m)
; Masked load of two i32 elements from %a with an alignment argument of 1;
; the result is stored to %res_ptr. The mask is computed as (%m == 0) and the
; passthru operand is undef, so the value of disabled lanes is unspecified.
; Expected output for the slow-unaligned runs: the compare result is moved to
; a scalar register and tested one bit at a time; each enabled element is
; assembled from four single-byte loads combined with slli / or (the RV64
; checks use lb rather than lbu for the most significant byte) and then
; placed in the result vector, which is finally written with vse32.v.
; Expected output for the fast-unaligned runs (shared FAST prefix): a masked
; vle32.v followed by vse32.v.
486 declare <2 x i32> @llvm.masked.load.v2i32(ptr, i32, <2 x i1>, <2 x i32>)
488 define void @masked_load_v2i32_align1(ptr %a, <2 x i32> %m, ptr %res_ptr) nounwind {
489 ; RV32-SLOW-LABEL: masked_load_v2i32_align1:
490 ; RV32-SLOW: # %bb.0:
491 ; RV32-SLOW-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
492 ; RV32-SLOW-NEXT: vmseq.vi v8, v8, 0
493 ; RV32-SLOW-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
494 ; RV32-SLOW-NEXT: vmv.x.s a2, v8
495 ; RV32-SLOW-NEXT: andi a3, a2, 1
496 ; RV32-SLOW-NEXT: # implicit-def: $v8
497 ; RV32-SLOW-NEXT: beqz a3, .LBB8_2
498 ; RV32-SLOW-NEXT: # %bb.1: # %cond.load
499 ; RV32-SLOW-NEXT: lbu a3, 1(a0)
500 ; RV32-SLOW-NEXT: lbu a4, 0(a0)
501 ; RV32-SLOW-NEXT: lbu a5, 2(a0)
502 ; RV32-SLOW-NEXT: lbu a6, 3(a0)
503 ; RV32-SLOW-NEXT: slli a3, a3, 8
504 ; RV32-SLOW-NEXT: or a3, a3, a4
505 ; RV32-SLOW-NEXT: slli a5, a5, 16
506 ; RV32-SLOW-NEXT: slli a6, a6, 24
507 ; RV32-SLOW-NEXT: or a4, a6, a5
508 ; RV32-SLOW-NEXT: or a3, a4, a3
509 ; RV32-SLOW-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
510 ; RV32-SLOW-NEXT: vmv.v.x v8, a3
511 ; RV32-SLOW-NEXT: .LBB8_2: # %else
512 ; RV32-SLOW-NEXT: andi a2, a2, 2
513 ; RV32-SLOW-NEXT: beqz a2, .LBB8_4
514 ; RV32-SLOW-NEXT: # %bb.3: # %cond.load1
515 ; RV32-SLOW-NEXT: lbu a2, 5(a0)
516 ; RV32-SLOW-NEXT: lbu a3, 4(a0)
517 ; RV32-SLOW-NEXT: lbu a4, 6(a0)
518 ; RV32-SLOW-NEXT: lbu a0, 7(a0)
519 ; RV32-SLOW-NEXT: slli a2, a2, 8
520 ; RV32-SLOW-NEXT: or a2, a2, a3
521 ; RV32-SLOW-NEXT: slli a4, a4, 16
522 ; RV32-SLOW-NEXT: slli a0, a0, 24
523 ; RV32-SLOW-NEXT: or a0, a0, a4
524 ; RV32-SLOW-NEXT: or a0, a0, a2
525 ; RV32-SLOW-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
526 ; RV32-SLOW-NEXT: vmv.s.x v9, a0
527 ; RV32-SLOW-NEXT: vslideup.vi v8, v9, 1
528 ; RV32-SLOW-NEXT: .LBB8_4: # %else2
529 ; RV32-SLOW-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
530 ; RV32-SLOW-NEXT: vse32.v v8, (a1)
531 ; RV32-SLOW-NEXT: ret
533 ; RV64-SLOW-LABEL: masked_load_v2i32_align1:
534 ; RV64-SLOW: # %bb.0:
535 ; RV64-SLOW-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
536 ; RV64-SLOW-NEXT: vmseq.vi v8, v8, 0
537 ; RV64-SLOW-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
538 ; RV64-SLOW-NEXT: vmv.x.s a2, v8
539 ; RV64-SLOW-NEXT: andi a3, a2, 1
540 ; RV64-SLOW-NEXT: # implicit-def: $v8
541 ; RV64-SLOW-NEXT: beqz a3, .LBB8_2
542 ; RV64-SLOW-NEXT: # %bb.1: # %cond.load
543 ; RV64-SLOW-NEXT: lbu a3, 1(a0)
544 ; RV64-SLOW-NEXT: lbu a4, 0(a0)
545 ; RV64-SLOW-NEXT: lbu a5, 2(a0)
546 ; RV64-SLOW-NEXT: lb a6, 3(a0)
547 ; RV64-SLOW-NEXT: slli a3, a3, 8
548 ; RV64-SLOW-NEXT: or a3, a3, a4
549 ; RV64-SLOW-NEXT: slli a5, a5, 16
550 ; RV64-SLOW-NEXT: slli a6, a6, 24
551 ; RV64-SLOW-NEXT: or a4, a6, a5
552 ; RV64-SLOW-NEXT: or a3, a4, a3
553 ; RV64-SLOW-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
554 ; RV64-SLOW-NEXT: vmv.v.x v8, a3
555 ; RV64-SLOW-NEXT: .LBB8_2: # %else
556 ; RV64-SLOW-NEXT: andi a2, a2, 2
557 ; RV64-SLOW-NEXT: beqz a2, .LBB8_4
558 ; RV64-SLOW-NEXT: # %bb.3: # %cond.load1
559 ; RV64-SLOW-NEXT: lbu a2, 5(a0)
560 ; RV64-SLOW-NEXT: lbu a3, 4(a0)
561 ; RV64-SLOW-NEXT: lbu a4, 6(a0)
562 ; RV64-SLOW-NEXT: lb a0, 7(a0)
563 ; RV64-SLOW-NEXT: slli a2, a2, 8
564 ; RV64-SLOW-NEXT: or a2, a2, a3
565 ; RV64-SLOW-NEXT: slli a4, a4, 16
566 ; RV64-SLOW-NEXT: slli a0, a0, 24
567 ; RV64-SLOW-NEXT: or a0, a0, a4
568 ; RV64-SLOW-NEXT: or a0, a0, a2
569 ; RV64-SLOW-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
570 ; RV64-SLOW-NEXT: vmv.s.x v9, a0
571 ; RV64-SLOW-NEXT: vslideup.vi v8, v9, 1
572 ; RV64-SLOW-NEXT: .LBB8_4: # %else2
573 ; RV64-SLOW-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
574 ; RV64-SLOW-NEXT: vse32.v v8, (a1)
575 ; RV64-SLOW-NEXT: ret
577 ; FAST-LABEL: masked_load_v2i32_align1:
579 ; FAST-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
580 ; FAST-NEXT: vmseq.vi v0, v8, 0
581 ; FAST-NEXT: vle32.v v8, (a0), v0.t
582 ; FAST-NEXT: vse32.v v8, (a1)
; A lane is enabled when the corresponding element of %m equals zero.
584 %mask = icmp eq <2 x i32> %m, zeroinitializer
585 %load = call <2 x i32> @llvm.masked.load.v2i32(ptr %a, i32 1, <2 x i1> %mask, <2 x i32> undef)
586 store <2 x i32> %load, ptr %res_ptr
; Masked store of two i32 elements to %a with an alignment argument of 2;
; the mask is computed as (%m == 0).
; Expected output for the slow-unaligned runs (shared SLOW prefix): the
; compare result is moved to a scalar register and tested one bit at a time;
; each enabled element is written as two sh halfword stores, the upper half
; obtained with srli 16.
; Expected output for the fast-unaligned runs (shared FAST prefix): a single
; masked vse32.v.
590 declare void @llvm.masked.store.v2i32.p0(<2 x i32>, ptr, i32, <2 x i1>)
592 define void @masked_store_v2i32_align2(<2 x i32> %val, ptr %a, <2 x i32> %m) nounwind {
593 ; SLOW-LABEL: masked_store_v2i32_align2:
595 ; SLOW-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
596 ; SLOW-NEXT: vmseq.vi v9, v9, 0
597 ; SLOW-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
598 ; SLOW-NEXT: vmv.x.s a1, v9
599 ; SLOW-NEXT: andi a2, a1, 1
600 ; SLOW-NEXT: bnez a2, .LBB9_3
601 ; SLOW-NEXT: # %bb.1: # %else
602 ; SLOW-NEXT: andi a1, a1, 2
603 ; SLOW-NEXT: bnez a1, .LBB9_4
604 ; SLOW-NEXT: .LBB9_2: # %else2
606 ; SLOW-NEXT: .LBB9_3: # %cond.store
607 ; SLOW-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
608 ; SLOW-NEXT: vmv.x.s a2, v8
609 ; SLOW-NEXT: sh a2, 0(a0)
610 ; SLOW-NEXT: srli a2, a2, 16
611 ; SLOW-NEXT: sh a2, 2(a0)
612 ; SLOW-NEXT: andi a1, a1, 2
613 ; SLOW-NEXT: beqz a1, .LBB9_2
614 ; SLOW-NEXT: .LBB9_4: # %cond.store1
615 ; SLOW-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
616 ; SLOW-NEXT: vslidedown.vi v8, v8, 1
617 ; SLOW-NEXT: vmv.x.s a1, v8
618 ; SLOW-NEXT: sh a1, 4(a0)
619 ; SLOW-NEXT: srli a1, a1, 16
620 ; SLOW-NEXT: sh a1, 6(a0)
623 ; FAST-LABEL: masked_store_v2i32_align2:
625 ; FAST-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
626 ; FAST-NEXT: vmseq.vi v0, v9, 0
627 ; FAST-NEXT: vse32.v v8, (a0), v0.t
; A lane is enabled when the corresponding element of %m equals zero.
629 %mask = icmp eq <2 x i32> %m, zeroinitializer
630 call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %a, i32 2, <2 x i1> %mask)