; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64

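; This file checks codegen for @llvm.masked.store on fixed-length floating-point
; vectors (f16/f32/f64, 1 through 128 elements). Each test loads a mask source
; vector, compares it for equality with zero to build the <N x i1> mask, loads the
; value vector, and performs the masked store. To refresh the CHECK lines, rerun
; the script named in the NOTE above on this file (typically
; utils/update_llc_test_checks.py <path-to-this-test>).
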
define void @masked_store_v1f16(<1 x half>* %val_ptr, <1 x half>* %a, <1 x half>* %m_ptr) nounwind {
; CHECK-LABEL: masked_store_v1f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
; CHECK-NEXT:    vle16.v v8, (a2)
; CHECK-NEXT:    vle16.v v9, (a0)
; CHECK-NEXT:    fmv.h.x fa5, zero
; CHECK-NEXT:    vmfeq.vf v0, v8, fa5
; CHECK-NEXT:    vse16.v v9, (a1), v0.t
; CHECK-NEXT:    ret
  %m = load <1 x half>, <1 x half>* %m_ptr
  %mask = fcmp oeq <1 x half> %m, zeroinitializer
  %val = load <1 x half>, <1 x half>* %val_ptr
  call void @llvm.masked.store.v1f16.p0v1f16(<1 x half> %val, <1 x half>* %a, i32 8, <1 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v1f16.p0v1f16(<1 x half>, <1 x half>*, i32, <1 x i1>)

define void @masked_store_v1f32(<1 x float>* %val_ptr, <1 x float>* %a, <1 x float>* %m_ptr) nounwind {
; CHECK-LABEL: masked_store_v1f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a2)
; CHECK-NEXT:    vle32.v v9, (a0)
; CHECK-NEXT:    fmv.w.x fa5, zero
; CHECK-NEXT:    vmfeq.vf v0, v8, fa5
; CHECK-NEXT:    vse32.v v9, (a1), v0.t
; CHECK-NEXT:    ret
  %m = load <1 x float>, <1 x float>* %m_ptr
  %mask = fcmp oeq <1 x float> %m, zeroinitializer
  %val = load <1 x float>, <1 x float>* %val_ptr
  call void @llvm.masked.store.v1f32.p0v1f32(<1 x float> %val, <1 x float>* %a, i32 8, <1 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v1f32.p0v1f32(<1 x float>, <1 x float>*, i32, <1 x i1>)

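; For f64 elements the RV32 and RV64 code differs only in how +0.0 is materialized
; for the compare: RV64 moves integer zero into an FPR with fmv.d.x, while RV32 has
; no fmv.d.x and converts integer zero with fcvt.d.w instead, hence the separate
; RV32/RV64 check prefixes below.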
define void @masked_store_v1f64(<1 x double>* %val_ptr, <1 x double>* %a, <1 x double>* %m_ptr) nounwind {
; RV32-LABEL: masked_store_v1f64:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT:    vle64.v v8, (a2)
; RV32-NEXT:    vle64.v v9, (a0)
; RV32-NEXT:    fcvt.d.w fa5, zero
; RV32-NEXT:    vmfeq.vf v0, v8, fa5
; RV32-NEXT:    vse64.v v9, (a1), v0.t
; RV32-NEXT:    ret
;
; RV64-LABEL: masked_store_v1f64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV64-NEXT:    vle64.v v8, (a2)
; RV64-NEXT:    vle64.v v9, (a0)
; RV64-NEXT:    fmv.d.x fa5, zero
; RV64-NEXT:    vmfeq.vf v0, v8, fa5
; RV64-NEXT:    vse64.v v9, (a1), v0.t
; RV64-NEXT:    ret
  %m = load <1 x double>, <1 x double>* %m_ptr
  %mask = fcmp oeq <1 x double> %m, zeroinitializer
  %val = load <1 x double>, <1 x double>* %val_ptr
  call void @llvm.masked.store.v1f64.p0v1f64(<1 x double> %val, <1 x double>* %a, i32 8, <1 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v1f64.p0v1f64(<1 x double>, <1 x double>*, i32, <1 x i1>)

define void @masked_store_v2f16(<2 x half>* %val_ptr, <2 x half>* %a, <2 x half>* %m_ptr) nounwind {
; CHECK-LABEL: masked_store_v2f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
; CHECK-NEXT:    vle16.v v8, (a2)
; CHECK-NEXT:    vle16.v v9, (a0)
; CHECK-NEXT:    fmv.h.x fa5, zero
; CHECK-NEXT:    vmfeq.vf v0, v8, fa5
; CHECK-NEXT:    vse16.v v9, (a1), v0.t
; CHECK-NEXT:    ret
  %m = load <2 x half>, <2 x half>* %m_ptr
  %mask = fcmp oeq <2 x half> %m, zeroinitializer
  %val = load <2 x half>, <2 x half>* %val_ptr
  call void @llvm.masked.store.v2f16.p0v2f16(<2 x half> %val, <2 x half>* %a, i32 8, <2 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v2f16.p0v2f16(<2 x half>, <2 x half>*, i32, <2 x i1>)

define void @masked_store_v2f32(<2 x float>* %val_ptr, <2 x float>* %a, <2 x float>* %m_ptr) nounwind {
; CHECK-LABEL: masked_store_v2f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a2)
; CHECK-NEXT:    vle32.v v9, (a0)
; CHECK-NEXT:    fmv.w.x fa5, zero
; CHECK-NEXT:    vmfeq.vf v0, v8, fa5
; CHECK-NEXT:    vse32.v v9, (a1), v0.t
; CHECK-NEXT:    ret
  %m = load <2 x float>, <2 x float>* %m_ptr
  %mask = fcmp oeq <2 x float> %m, zeroinitializer
  %val = load <2 x float>, <2 x float>* %val_ptr
  call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %a, i32 8, <2 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v2f32.p0v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>)

define void @masked_store_v2f64(<2 x double>* %val_ptr, <2 x double>* %a, <2 x double>* %m_ptr) nounwind {
; RV32-LABEL: masked_store_v2f64:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT:    vle64.v v8, (a2)
; RV32-NEXT:    vle64.v v9, (a0)
; RV32-NEXT:    fcvt.d.w fa5, zero
; RV32-NEXT:    vmfeq.vf v0, v8, fa5
; RV32-NEXT:    vse64.v v9, (a1), v0.t
; RV32-NEXT:    ret
;
; RV64-LABEL: masked_store_v2f64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV64-NEXT:    vle64.v v8, (a2)
; RV64-NEXT:    vle64.v v9, (a0)
; RV64-NEXT:    fmv.d.x fa5, zero
; RV64-NEXT:    vmfeq.vf v0, v8, fa5
; RV64-NEXT:    vse64.v v9, (a1), v0.t
; RV64-NEXT:    ret
  %m = load <2 x double>, <2 x double>* %m_ptr
  %mask = fcmp oeq <2 x double> %m, zeroinitializer
  %val = load <2 x double>, <2 x double>* %val_ptr
  call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> %val, <2 x double>* %a, i32 8, <2 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v2f64.p0v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>)

define void @masked_store_v4f16(<4 x half>* %val_ptr, <4 x half>* %a, <4 x half>* %m_ptr) nounwind {
; CHECK-LABEL: masked_store_v4f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a2)
; CHECK-NEXT:    vle16.v v9, (a0)
; CHECK-NEXT:    fmv.h.x fa5, zero
; CHECK-NEXT:    vmfeq.vf v0, v8, fa5
; CHECK-NEXT:    vse16.v v9, (a1), v0.t
; CHECK-NEXT:    ret
  %m = load <4 x half>, <4 x half>* %m_ptr
  %mask = fcmp oeq <4 x half> %m, zeroinitializer
  %val = load <4 x half>, <4 x half>* %val_ptr
  call void @llvm.masked.store.v4f16.p0v4f16(<4 x half> %val, <4 x half>* %a, i32 8, <4 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v4f16.p0v4f16(<4 x half>, <4 x half>*, i32, <4 x i1>)

define void @masked_store_v4f32(<4 x float>* %val_ptr, <4 x float>* %a, <4 x float>* %m_ptr) nounwind {
; CHECK-LABEL: masked_store_v4f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v8, (a2)
; CHECK-NEXT:    vle32.v v9, (a0)
; CHECK-NEXT:    fmv.w.x fa5, zero
; CHECK-NEXT:    vmfeq.vf v0, v8, fa5
; CHECK-NEXT:    vse32.v v9, (a1), v0.t
; CHECK-NEXT:    ret
  %m = load <4 x float>, <4 x float>* %m_ptr
  %mask = fcmp oeq <4 x float> %m, zeroinitializer
  %val = load <4 x float>, <4 x float>* %val_ptr
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %val, <4 x float>* %a, i32 8, <4 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)

define void @masked_store_v4f64(<4 x double>* %val_ptr, <4 x double>* %a, <4 x double>* %m_ptr) nounwind {
; RV32-LABEL: masked_store_v4f64:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; RV32-NEXT:    vle64.v v8, (a2)
; RV32-NEXT:    vle64.v v10, (a0)
; RV32-NEXT:    fcvt.d.w fa5, zero
; RV32-NEXT:    vmfeq.vf v0, v8, fa5
; RV32-NEXT:    vse64.v v10, (a1), v0.t
; RV32-NEXT:    ret
;
; RV64-LABEL: masked_store_v4f64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; RV64-NEXT:    vle64.v v8, (a2)
; RV64-NEXT:    vle64.v v10, (a0)
; RV64-NEXT:    fmv.d.x fa5, zero
; RV64-NEXT:    vmfeq.vf v0, v8, fa5
; RV64-NEXT:    vse64.v v10, (a1), v0.t
; RV64-NEXT:    ret
  %m = load <4 x double>, <4 x double>* %m_ptr
  %mask = fcmp oeq <4 x double> %m, zeroinitializer
  %val = load <4 x double>, <4 x double>* %val_ptr
  call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %val, <4 x double>* %a, i32 8, <4 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v4f64.p0v4f64(<4 x double>, <4 x double>*, i32, <4 x i1>)

define void @masked_store_v8f16(<8 x half>* %val_ptr, <8 x half>* %a, <8 x half>* %m_ptr) nounwind {
; CHECK-LABEL: masked_store_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT:    vle16.v v8, (a2)
; CHECK-NEXT:    vle16.v v9, (a0)
; CHECK-NEXT:    fmv.h.x fa5, zero
; CHECK-NEXT:    vmfeq.vf v0, v8, fa5
; CHECK-NEXT:    vse16.v v9, (a1), v0.t
; CHECK-NEXT:    ret
  %m = load <8 x half>, <8 x half>* %m_ptr
  %mask = fcmp oeq <8 x half> %m, zeroinitializer
  %val = load <8 x half>, <8 x half>* %val_ptr
  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %val, <8 x half>* %a, i32 8, <8 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, <8 x half>*, i32, <8 x i1>)

define void @masked_store_v8f32(<8 x float>* %val_ptr, <8 x float>* %a, <8 x float>* %m_ptr) nounwind {
; CHECK-LABEL: masked_store_v8f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a2)
; CHECK-NEXT:    vle32.v v10, (a0)
; CHECK-NEXT:    fmv.w.x fa5, zero
; CHECK-NEXT:    vmfeq.vf v0, v8, fa5
; CHECK-NEXT:    vse32.v v10, (a1), v0.t
; CHECK-NEXT:    ret
  %m = load <8 x float>, <8 x float>* %m_ptr
  %mask = fcmp oeq <8 x float> %m, zeroinitializer
  %val = load <8 x float>, <8 x float>* %val_ptr
  call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> %val, <8 x float>* %a, i32 8, <8 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v8f32.p0v8f32(<8 x float>, <8 x float>*, i32, <8 x i1>)

define void @masked_store_v8f64(<8 x double>* %val_ptr, <8 x double>* %a, <8 x double>* %m_ptr) nounwind {
; RV32-LABEL: masked_store_v8f64:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
; RV32-NEXT:    vle64.v v8, (a2)
; RV32-NEXT:    vle64.v v12, (a0)
; RV32-NEXT:    fcvt.d.w fa5, zero
; RV32-NEXT:    vmfeq.vf v0, v8, fa5
; RV32-NEXT:    vse64.v v12, (a1), v0.t
; RV32-NEXT:    ret
;
; RV64-LABEL: masked_store_v8f64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
; RV64-NEXT:    vle64.v v8, (a2)
; RV64-NEXT:    vle64.v v12, (a0)
; RV64-NEXT:    fmv.d.x fa5, zero
; RV64-NEXT:    vmfeq.vf v0, v8, fa5
; RV64-NEXT:    vse64.v v12, (a1), v0.t
; RV64-NEXT:    ret
  %m = load <8 x double>, <8 x double>* %m_ptr
  %mask = fcmp oeq <8 x double> %m, zeroinitializer
  %val = load <8 x double>, <8 x double>* %val_ptr
  call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %val, <8 x double>* %a, i32 8, <8 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v8f64.p0v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>)

define void @masked_store_v16f16(<16 x half>* %val_ptr, <16 x half>* %a, <16 x half>* %m_ptr) nounwind {
; CHECK-LABEL: masked_store_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a2)
; CHECK-NEXT:    vle16.v v10, (a0)
; CHECK-NEXT:    fmv.h.x fa5, zero
; CHECK-NEXT:    vmfeq.vf v0, v8, fa5
; CHECK-NEXT:    vse16.v v10, (a1), v0.t
; CHECK-NEXT:    ret
  %m = load <16 x half>, <16 x half>* %m_ptr
  %mask = fcmp oeq <16 x half> %m, zeroinitializer
  %val = load <16 x half>, <16 x half>* %val_ptr
  call void @llvm.masked.store.v16f16.p0v16f16(<16 x half> %val, <16 x half>* %a, i32 8, <16 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v16f16.p0v16f16(<16 x half>, <16 x half>*, i32, <16 x i1>)

define void @masked_store_v16f32(<16 x float>* %val_ptr, <16 x float>* %a, <16 x float>* %m_ptr) nounwind {
; CHECK-LABEL: masked_store_v16f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
; CHECK-NEXT:    vle32.v v8, (a2)
; CHECK-NEXT:    vle32.v v12, (a0)
; CHECK-NEXT:    fmv.w.x fa5, zero
; CHECK-NEXT:    vmfeq.vf v0, v8, fa5
; CHECK-NEXT:    vse32.v v12, (a1), v0.t
; CHECK-NEXT:    ret
  %m = load <16 x float>, <16 x float>* %m_ptr
  %mask = fcmp oeq <16 x float> %m, zeroinitializer
  %val = load <16 x float>, <16 x float>* %val_ptr
  call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> %val, <16 x float>* %a, i32 8, <16 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v16f32.p0v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)

define void @masked_store_v16f64(<16 x double>* %val_ptr, <16 x double>* %a, <16 x double>* %m_ptr) nounwind {
; RV32-LABEL: masked_store_v16f64:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT:    vle64.v v8, (a2)
; RV32-NEXT:    vle64.v v16, (a0)
; RV32-NEXT:    fcvt.d.w fa5, zero
; RV32-NEXT:    vmfeq.vf v0, v8, fa5
; RV32-NEXT:    vse64.v v16, (a1), v0.t
; RV32-NEXT:    ret
;
; RV64-LABEL: masked_store_v16f64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT:    vle64.v v8, (a2)
; RV64-NEXT:    vle64.v v16, (a0)
; RV64-NEXT:    fmv.d.x fa5, zero
; RV64-NEXT:    vmfeq.vf v0, v8, fa5
; RV64-NEXT:    vse64.v v16, (a1), v0.t
; RV64-NEXT:    ret
  %m = load <16 x double>, <16 x double>* %m_ptr
  %mask = fcmp oeq <16 x double> %m, zeroinitializer
  %val = load <16 x double>, <16 x double>* %val_ptr
  call void @llvm.masked.store.v16f64.p0v16f64(<16 x double> %val, <16 x double>* %a, i32 8, <16 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v16f64.p0v16f64(<16 x double>, <16 x double>*, i32, <16 x i1>)

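; From 32 elements upward the vector length no longer fits vsetivli's 5-bit
; unsigned immediate (maximum 31), so it is materialized with li and vsetvli is
; used instead.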
define void @masked_store_v32f16(<32 x half>* %val_ptr, <32 x half>* %a, <32 x half>* %m_ptr) nounwind {
; CHECK-LABEL: masked_store_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a3, 32
; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT:    vle16.v v8, (a2)
; CHECK-NEXT:    vle16.v v12, (a0)
; CHECK-NEXT:    fmv.h.x fa5, zero
; CHECK-NEXT:    vmfeq.vf v0, v8, fa5
; CHECK-NEXT:    vse16.v v12, (a1), v0.t
; CHECK-NEXT:    ret
  %m = load <32 x half>, <32 x half>* %m_ptr
  %mask = fcmp oeq <32 x half> %m, zeroinitializer
  %val = load <32 x half>, <32 x half>* %val_ptr
  call void @llvm.masked.store.v32f16.p0v32f16(<32 x half> %val, <32 x half>* %a, i32 8, <32 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v32f16.p0v32f16(<32 x half>, <32 x half>*, i32, <32 x i1>)

define void @masked_store_v32f32(<32 x float>* %val_ptr, <32 x float>* %a, <32 x float>* %m_ptr) nounwind {
; CHECK-LABEL: masked_store_v32f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a3, 32
; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
; CHECK-NEXT:    vle32.v v8, (a2)
; CHECK-NEXT:    vle32.v v16, (a0)
; CHECK-NEXT:    fmv.w.x fa5, zero
; CHECK-NEXT:    vmfeq.vf v0, v8, fa5
; CHECK-NEXT:    vse32.v v16, (a1), v0.t
; CHECK-NEXT:    ret
  %m = load <32 x float>, <32 x float>* %m_ptr
  %mask = fcmp oeq <32 x float> %m, zeroinitializer
  %val = load <32 x float>, <32 x float>* %val_ptr
  call void @llvm.masked.store.v32f32.p0v32f32(<32 x float> %val, <32 x float>* %a, i32 8, <32 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v32f32.p0v32f32(<32 x float>, <32 x float>*, i32, <32 x i1>)

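; <32 x double> is wider than a single LMUL=8 register group at the 128-bit minimum
; VLEN assumed for fixed-length vectors here, so the store is split into two
; 16-element halves, with vector register groups spilled to and reloaded from the
; stack around the first half.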
define void @masked_store_v32f64(<32 x double>* %val_ptr, <32 x double>* %a, <32 x double>* %m_ptr) nounwind {
; RV32-LABEL: masked_store_v32f64:
; RV32:       # %bb.0:
; RV32-NEXT:    addi sp, sp, -16
; RV32-NEXT:    csrr a3, vlenb
; RV32-NEXT:    slli a3, a3, 4
; RV32-NEXT:    sub sp, sp, a3
; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT:    vle64.v v8, (a2)
; RV32-NEXT:    addi a2, a2, 128
; RV32-NEXT:    vle64.v v16, (a2)
; RV32-NEXT:    csrr a2, vlenb
; RV32-NEXT:    slli a2, a2, 3
; RV32-NEXT:    add a2, sp, a2
; RV32-NEXT:    addi a2, a2, 16
; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
; RV32-NEXT:    fcvt.d.w fa5, zero
; RV32-NEXT:    vmfeq.vf v0, v8, fa5
; RV32-NEXT:    vle64.v v24, (a0)
; RV32-NEXT:    addi a0, a0, 128
; RV32-NEXT:    vle64.v v8, (a0)
; RV32-NEXT:    addi a0, sp, 16
; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
; RV32-NEXT:    csrr a0, vlenb
; RV32-NEXT:    slli a0, a0, 3
; RV32-NEXT:    add a0, sp, a0
; RV32-NEXT:    addi a0, a0, 16
; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT:    vmfeq.vf v8, v16, fa5
; RV32-NEXT:    vse64.v v24, (a1), v0.t
; RV32-NEXT:    addi a0, a1, 128
; RV32-NEXT:    vmv1r.v v0, v8
; RV32-NEXT:    addi a1, sp, 16
; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
; RV32-NEXT:    vse64.v v8, (a0), v0.t
; RV32-NEXT:    csrr a0, vlenb
; RV32-NEXT:    slli a0, a0, 4
; RV32-NEXT:    add sp, sp, a0
; RV32-NEXT:    addi sp, sp, 16
; RV32-NEXT:    ret
;
; RV64-LABEL: masked_store_v32f64:
; RV64:       # %bb.0:
; RV64-NEXT:    addi sp, sp, -16
; RV64-NEXT:    csrr a3, vlenb
; RV64-NEXT:    slli a3, a3, 4
; RV64-NEXT:    sub sp, sp, a3
; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT:    vle64.v v8, (a2)
; RV64-NEXT:    addi a2, a2, 128
; RV64-NEXT:    vle64.v v16, (a2)
; RV64-NEXT:    csrr a2, vlenb
; RV64-NEXT:    slli a2, a2, 3
; RV64-NEXT:    add a2, sp, a2
; RV64-NEXT:    addi a2, a2, 16
; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
; RV64-NEXT:    fmv.d.x fa5, zero
; RV64-NEXT:    vmfeq.vf v0, v8, fa5
; RV64-NEXT:    vle64.v v24, (a0)
; RV64-NEXT:    addi a0, a0, 128
; RV64-NEXT:    vle64.v v8, (a0)
; RV64-NEXT:    addi a0, sp, 16
; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
; RV64-NEXT:    csrr a0, vlenb
; RV64-NEXT:    slli a0, a0, 3
; RV64-NEXT:    add a0, sp, a0
; RV64-NEXT:    addi a0, a0, 16
; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV64-NEXT:    vmfeq.vf v8, v16, fa5
; RV64-NEXT:    vse64.v v24, (a1), v0.t
; RV64-NEXT:    addi a0, a1, 128
; RV64-NEXT:    vmv1r.v v0, v8
; RV64-NEXT:    addi a1, sp, 16
; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
; RV64-NEXT:    vse64.v v8, (a0), v0.t
; RV64-NEXT:    csrr a0, vlenb
; RV64-NEXT:    slli a0, a0, 4
; RV64-NEXT:    add sp, sp, a0
; RV64-NEXT:    addi sp, sp, 16
; RV64-NEXT:    ret
  %m = load <32 x double>, <32 x double>* %m_ptr
  %mask = fcmp oeq <32 x double> %m, zeroinitializer
  %val = load <32 x double>, <32 x double>* %val_ptr
  call void @llvm.masked.store.v32f64.p0v32f64(<32 x double> %val, <32 x double>* %a, i32 8, <32 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v32f64.p0v32f64(<32 x double>, <32 x double>*, i32, <32 x i1>)

define void @masked_store_v64f16(<64 x half>* %val_ptr, <64 x half>* %a, <64 x half>* %m_ptr) nounwind {
; CHECK-LABEL: masked_store_v64f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a3, 64
; CHECK-NEXT:    vsetvli zero, a3, e16, m8, ta, ma
; CHECK-NEXT:    vle16.v v8, (a2)
; CHECK-NEXT:    vle16.v v16, (a0)
; CHECK-NEXT:    fmv.h.x fa5, zero
; CHECK-NEXT:    vmfeq.vf v0, v8, fa5
; CHECK-NEXT:    vse16.v v16, (a1), v0.t
; CHECK-NEXT:    ret
  %m = load <64 x half>, <64 x half>* %m_ptr
  %mask = fcmp oeq <64 x half> %m, zeroinitializer
  %val = load <64 x half>, <64 x half>* %val_ptr
  call void @llvm.masked.store.v64f16.p0v64f16(<64 x half> %val, <64 x half>* %a, i32 8, <64 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v64f16.p0v64f16(<64 x half>, <64 x half>*, i32, <64 x i1>)

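; <64 x float> is likewise split into two 32-element halves with vector register
; spills and reloads around the first half.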
define void @masked_store_v64f32(<64 x float>* %val_ptr, <64 x float>* %a, <64 x float>* %m_ptr) nounwind {
; CHECK-LABEL: masked_store_v64f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    addi sp, sp, -16
; CHECK-NEXT:    csrr a3, vlenb
; CHECK-NEXT:    slli a3, a3, 4
; CHECK-NEXT:    sub sp, sp, a3
; CHECK-NEXT:    li a3, 32
; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
; CHECK-NEXT:    vle32.v v8, (a2)
; CHECK-NEXT:    addi a2, a2, 128
; CHECK-NEXT:    vle32.v v16, (a2)
; CHECK-NEXT:    csrr a2, vlenb
; CHECK-NEXT:    slli a2, a2, 3
; CHECK-NEXT:    add a2, sp, a2
; CHECK-NEXT:    addi a2, a2, 16
; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
; CHECK-NEXT:    fmv.w.x fa5, zero
; CHECK-NEXT:    vmfeq.vf v0, v8, fa5
; CHECK-NEXT:    vle32.v v24, (a0)
; CHECK-NEXT:    addi a0, a0, 128
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    addi a0, sp, 16
; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT:    csrr a0, vlenb
; CHECK-NEXT:    slli a0, a0, 3
; CHECK-NEXT:    add a0, sp, a0
; CHECK-NEXT:    addi a0, a0, 16
; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-NEXT:    vmfeq.vf v8, v16, fa5
; CHECK-NEXT:    vse32.v v24, (a1), v0.t
; CHECK-NEXT:    addi a0, a1, 128
; CHECK-NEXT:    vmv1r.v v0, v8
; CHECK-NEXT:    addi a1, sp, 16
; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
; CHECK-NEXT:    vse32.v v8, (a0), v0.t
; CHECK-NEXT:    csrr a0, vlenb
; CHECK-NEXT:    slli a0, a0, 4
; CHECK-NEXT:    add sp, sp, a0
; CHECK-NEXT:    addi sp, sp, 16
; CHECK-NEXT:    ret
  %m = load <64 x float>, <64 x float>* %m_ptr
  %mask = fcmp oeq <64 x float> %m, zeroinitializer
  %val = load <64 x float>, <64 x float>* %val_ptr
  call void @llvm.masked.store.v64f32.p0v64f32(<64 x float> %val, <64 x float>* %a, i32 8, <64 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v64f32.p0v64f32(<64 x float>, <64 x float>*, i32, <64 x i1>)

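; <128 x half> follows the same pattern, split into two 64-element halves.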
define void @masked_store_v128f16(<128 x half>* %val_ptr, <128 x half>* %a, <128 x half>* %m_ptr) nounwind {
; CHECK-LABEL: masked_store_v128f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    addi sp, sp, -16
; CHECK-NEXT:    csrr a3, vlenb
; CHECK-NEXT:    slli a3, a3, 4
; CHECK-NEXT:    sub sp, sp, a3
; CHECK-NEXT:    li a3, 64
; CHECK-NEXT:    vsetvli zero, a3, e16, m8, ta, ma
; CHECK-NEXT:    vle16.v v8, (a2)
; CHECK-NEXT:    addi a2, a2, 128
; CHECK-NEXT:    vle16.v v16, (a2)
; CHECK-NEXT:    csrr a2, vlenb
; CHECK-NEXT:    slli a2, a2, 3
; CHECK-NEXT:    add a2, sp, a2
; CHECK-NEXT:    addi a2, a2, 16
; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
; CHECK-NEXT:    fmv.h.x fa5, zero
; CHECK-NEXT:    vmfeq.vf v0, v8, fa5
; CHECK-NEXT:    vle16.v v24, (a0)
; CHECK-NEXT:    addi a0, a0, 128
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    addi a0, sp, 16
; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT:    csrr a0, vlenb
; CHECK-NEXT:    slli a0, a0, 3
; CHECK-NEXT:    add a0, sp, a0
; CHECK-NEXT:    addi a0, a0, 16
; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-NEXT:    vmfeq.vf v8, v16, fa5
; CHECK-NEXT:    vse16.v v24, (a1), v0.t
; CHECK-NEXT:    addi a0, a1, 128
; CHECK-NEXT:    vmv1r.v v0, v8
; CHECK-NEXT:    addi a1, sp, 16
; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
; CHECK-NEXT:    vse16.v v8, (a0), v0.t
; CHECK-NEXT:    csrr a0, vlenb
; CHECK-NEXT:    slli a0, a0, 4
; CHECK-NEXT:    add sp, sp, a0
; CHECK-NEXT:    addi sp, sp, 16
; CHECK-NEXT:    ret
  %m = load <128 x half>, <128 x half>* %m_ptr
  %mask = fcmp oeq <128 x half> %m, zeroinitializer
  %val = load <128 x half>, <128 x half>* %val_ptr
  call void @llvm.masked.store.v128f16.p0v128f16(<128 x half> %val, <128 x half>* %a, i32 8, <128 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v128f16.p0v128f16(<128 x half>, <128 x half>*, i32, <128 x i1>)