; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfhmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,RV32VLA
; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfhmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,RV64VLA
; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfh,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,RV32VLA
; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfh,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,RV64VLA

; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfhmin,+zvfbfmin -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,RV32VLA
; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfhmin,+zvfbfmin -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,RV64VLA
; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfh,+zvfbfmin -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,RV32VLA
; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfh,+zvfbfmin -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,RV64VLA

; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v,+zvfhmin,+zvfbfmin -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS,RV32VLS %s
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvfhmin,+zvfbfmin -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS,RV32VLS %s
; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v,+zvfh,+zvfbfmin -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS,RV32VLS %s
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvfh,+zvfbfmin -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS,RV64VLS %s
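; The VLA configurations treat the vector length as unknown at compile time,
; while the VLS configurations pin VLEN to 128 bits via
; -riscv-v-vector-bits-max=128, which allows whole-register loads/stores and
; addressing of individual registers within a register group.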
define <vscale x 8 x i32> @insert_nxv8i32_v2i32_0(<vscale x 8 x i32> %vec, ptr %svp) {
; VLA-LABEL: insert_nxv8i32_v2i32_0:
; VLA: # %bb.0:
; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; VLA-NEXT: vle32.v v12, (a0)
; VLA-NEXT: vsetivli zero, 2, e32, m4, tu, ma
; VLA-NEXT: vmv.v.v v8, v12
; VLA-NEXT: ret
;
; VLS-LABEL: insert_nxv8i32_v2i32_0:
; VLS: # %bb.0:
; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; VLS-NEXT: vle32.v v12, (a0)
; VLS-NEXT: vsetivli zero, 2, e32, m1, tu, ma
; VLS-NEXT: vmv.v.v v8, v12
; VLS-NEXT: ret
%sv = load <2 x i32>, ptr %svp
%v = call <vscale x 8 x i32> @llvm.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32> %vec, <2 x i32> %sv, i64 0)
ret <vscale x 8 x i32> %v
}

define <vscale x 8 x i32> @insert_nxv8i32_v2i32_2(<vscale x 8 x i32> %vec, ptr %svp) {
; VLA-LABEL: insert_nxv8i32_v2i32_2:
; VLA: # %bb.0:
; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; VLA-NEXT: vle32.v v12, (a0)
; VLA-NEXT: vsetivli zero, 4, e32, m4, tu, ma
; VLA-NEXT: vslideup.vi v8, v12, 2
; VLA-NEXT: ret
;
; VLS-LABEL: insert_nxv8i32_v2i32_2:
; VLS: # %bb.0:
; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; VLS-NEXT: vle32.v v12, (a0)
; VLS-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; VLS-NEXT: vslideup.vi v8, v12, 2
; VLS-NEXT: ret
%sv = load <2 x i32>, ptr %svp
%v = call <vscale x 8 x i32> @llvm.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32> %vec, <2 x i32> %sv, i64 2)
ret <vscale x 8 x i32> %v
}
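; With VLEN fixed at 128, <vscale x 8 x i32> occupies the LMUL=4 group v8-v11
; with four elements per register, so an insert at element 6 only needs to
; touch v9: a slideup by 2 within that single register.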
define <vscale x 8 x i32> @insert_nxv8i32_v2i32_6(<vscale x 8 x i32> %vec, ptr %svp) {
; VLA-LABEL: insert_nxv8i32_v2i32_6:
; VLA: # %bb.0:
; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; VLA-NEXT: vle32.v v12, (a0)
; VLA-NEXT: vsetivli zero, 8, e32, m4, tu, ma
; VLA-NEXT: vslideup.vi v8, v12, 6
; VLA-NEXT: ret
;
; VLS-LABEL: insert_nxv8i32_v2i32_6:
; VLS: # %bb.0:
; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; VLS-NEXT: vle32.v v12, (a0)
; VLS-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; VLS-NEXT: vslideup.vi v9, v12, 2
; VLS-NEXT: ret
%sv = load <2 x i32>, ptr %svp
%v = call <vscale x 8 x i32> @llvm.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32> %vec, <2 x i32> %sv, i64 6)
ret <vscale x 8 x i32> %v
}
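; An <8 x i32> subvector is 256 bits, exactly two registers under the VLS
; configuration, so inserting it at a register-aligned index becomes a plain
; whole-register load (vl2re32.v) into the right part of the group.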
define <vscale x 8 x i32> @insert_nxv8i32_v8i32_0(<vscale x 8 x i32> %vec, ptr %svp) {
; VLA-LABEL: insert_nxv8i32_v8i32_0:
; VLA: # %bb.0:
; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; VLA-NEXT: vle32.v v12, (a0)
; VLA-NEXT: vsetivli zero, 8, e32, m4, tu, ma
; VLA-NEXT: vmv.v.v v8, v12
; VLA-NEXT: ret
;
; VLS-LABEL: insert_nxv8i32_v8i32_0:
; VLS: # %bb.0:
; VLS-NEXT: vl2re32.v v8, (a0)
; VLS-NEXT: ret
%sv = load <8 x i32>, ptr %svp
%v = call <vscale x 8 x i32> @llvm.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32> %vec, <8 x i32> %sv, i64 0)
ret <vscale x 8 x i32> %v
}

define <vscale x 8 x i32> @insert_nxv8i32_v8i32_8(<vscale x 8 x i32> %vec, ptr %svp) {
; VLA-LABEL: insert_nxv8i32_v8i32_8:
; VLA: # %bb.0:
; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; VLA-NEXT: vle32.v v12, (a0)
; VLA-NEXT: vsetivli zero, 16, e32, m4, tu, ma
; VLA-NEXT: vslideup.vi v8, v12, 8
; VLA-NEXT: ret
;
; VLS-LABEL: insert_nxv8i32_v8i32_8:
; VLS: # %bb.0:
; VLS-NEXT: vl2re32.v v10, (a0)
; VLS-NEXT: ret
%sv = load <8 x i32>, ptr %svp
%v = call <vscale x 8 x i32> @llvm.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32> %vec, <8 x i32> %sv, i64 8)
ret <vscale x 8 x i32> %v
}

define <vscale x 8 x i32> @insert_nxv8i32_undef_v2i32_0(ptr %svp) {
; CHECK-LABEL: insert_nxv8i32_undef_v2i32_0:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: ret
%sv = load <2 x i32>, ptr %svp
%v = call <vscale x 8 x i32> @llvm.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32> undef, <2 x i32> %sv, i64 0)
ret <vscale x 8 x i32> %v
}

define <vscale x 2 x i32> @insert_nxv8i32_v4i32_0(<vscale x 2 x i32> %vec, <4 x i32> %subvec) {
; VLA-LABEL: insert_nxv8i32_v4i32_0:
; VLA: # %bb.0:
; VLA-NEXT: vsetivli zero, 4, e32, m1, tu, ma
; VLA-NEXT: vmv.v.v v8, v9
; VLA-NEXT: ret
;
; VLS-LABEL: insert_nxv8i32_v4i32_0:
; VLS: # %bb.0:
; VLS-NEXT: vmv1r.v v8, v9
; VLS-NEXT: ret
%v = call <vscale x 2 x i32> @llvm.vector.insert.nxv2i32.v4i32(<vscale x 2 x i32> %vec, <4 x i32> %subvec, i64 0)
ret <vscale x 2 x i32> %v
}

define <4 x i32> @insert_v4i32_v4i32_0(<4 x i32> %vec, <4 x i32> %subvec) {
; CHECK-LABEL: insert_v4i32_v4i32_0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
%v = call <4 x i32> @llvm.vector.insert.v4i32.v4i32(<4 x i32> %vec, <4 x i32> %subvec, i64 0)
ret <4 x i32> %v
}
define void @insert_v4i32_v2i32_0(ptr %vp, ptr %svp) {
; VLA-LABEL: insert_v4i32_v2i32_0:
; VLA: # %bb.0:
; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; VLA-NEXT: vle32.v v8, (a1)
; VLA-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; VLA-NEXT: vle32.v v9, (a0)
; VLA-NEXT: vsetivli zero, 2, e32, m1, tu, ma
; VLA-NEXT: vmv.v.v v9, v8
; VLA-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; VLA-NEXT: vse32.v v9, (a0)
; VLA-NEXT: ret
;
; VLS-LABEL: insert_v4i32_v2i32_0:
; VLS: # %bb.0:
; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; VLS-NEXT: vle32.v v8, (a1)
; VLS-NEXT: vl1re32.v v9, (a0)
; VLS-NEXT: vsetivli zero, 2, e32, m1, tu, ma
; VLS-NEXT: vmv.v.v v9, v8
; VLS-NEXT: vs1r.v v9, (a0)
; VLS-NEXT: ret
%sv = load <2 x i32>, ptr %svp
%vec = load <4 x i32>, ptr %vp
%v = call <4 x i32> @llvm.vector.insert.v2i32.v4i32(<4 x i32> %vec, <2 x i32> %sv, i64 0)
store <4 x i32> %v, ptr %vp
ret void
}

define void @insert_v4i32_v2i32_2(ptr %vp, ptr %svp) {
; VLA-LABEL: insert_v4i32_v2i32_2:
; VLA: # %bb.0:
; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; VLA-NEXT: vle32.v v8, (a1)
; VLA-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; VLA-NEXT: vle32.v v9, (a0)
; VLA-NEXT: vslideup.vi v9, v8, 2
; VLA-NEXT: vse32.v v9, (a0)
; VLA-NEXT: ret
;
; VLS-LABEL: insert_v4i32_v2i32_2:
; VLS: # %bb.0:
; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; VLS-NEXT: vle32.v v8, (a1)
; VLS-NEXT: vl1re32.v v9, (a0)
; VLS-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; VLS-NEXT: vslideup.vi v9, v8, 2
; VLS-NEXT: vs1r.v v9, (a0)
; VLS-NEXT: ret
%sv = load <2 x i32>, ptr %svp
%vec = load <4 x i32>, ptr %vp
%v = call <4 x i32> @llvm.vector.insert.v2i32.v4i32(<4 x i32> %vec, <2 x i32> %sv, i64 2)
store <4 x i32> %v, ptr %vp
ret void
}

define void @insert_v4i32_undef_v2i32_0(ptr %vp, ptr %svp) {
; VLA-LABEL: insert_v4i32_undef_v2i32_0:
; VLA: # %bb.0:
; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; VLA-NEXT: vle32.v v8, (a1)
; VLA-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; VLA-NEXT: vse32.v v8, (a0)
; VLA-NEXT: ret
;
; VLS-LABEL: insert_v4i32_undef_v2i32_0:
; VLS: # %bb.0:
; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; VLS-NEXT: vle32.v v8, (a1)
; VLS-NEXT: vs1r.v v8, (a0)
; VLS-NEXT: ret
%sv = load <2 x i32>, ptr %svp
%v = call <4 x i32> @llvm.vector.insert.v2i32.v4i32(<4 x i32> undef, <2 x i32> %sv, i64 0)
store <4 x i32> %v, ptr %vp
ret void
}

; This tests the code path in RISCVISelDAGToDAG::Select where we select an
; insert_subvector with a fixed vector and fixed subvector type. The phi here is
; used to prevent the fixed insert_subvector from being combined away into a
; scalable insert_subvector.
define <4 x i32> @insert_v4i32_undef_v2i32_0_phi(<2 x i32> %subvec, i1 %cond) {
; CHECK-LABEL: insert_v4i32_undef_v2i32_0_phi:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: andi a0, a0, 1
; CHECK-NEXT: bnez a0, .LBB11_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vmv.v.i v8, 0
; CHECK-NEXT: .LBB11_2: # %bar
; CHECK-NEXT: ret
entry:
br i1 %cond, label %foo, label %bar
foo:
%v = call <4 x i32> @llvm.vector.insert.v2i32.v4i32(<4 x i32> undef, <2 x i32> %subvec, i64 0)
br label %bar
bar:
%w = phi <4 x i32> [%v, %foo], [zeroinitializer, %entry]
ret <4 x i32> %w
}

define void @insert_v8i32_v2i32_0(ptr %vp, ptr %svp) {
; VLA-LABEL: insert_v8i32_v2i32_0:
; VLA: # %bb.0:
; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; VLA-NEXT: vle32.v v8, (a1)
; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; VLA-NEXT: vle32.v v10, (a0)
; VLA-NEXT: vsetivli zero, 2, e32, m2, tu, ma
; VLA-NEXT: vmv.v.v v10, v8
; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; VLA-NEXT: vse32.v v10, (a0)
; VLA-NEXT: ret
;
; VLS-LABEL: insert_v8i32_v2i32_0:
; VLS: # %bb.0:
; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; VLS-NEXT: vle32.v v8, (a1)
; VLS-NEXT: vl2re32.v v10, (a0)
; VLS-NEXT: vsetivli zero, 2, e32, m1, tu, ma
; VLS-NEXT: vmv.v.v v10, v8
; VLS-NEXT: vs2r.v v10, (a0)
; VLS-NEXT: ret
%sv = load <2 x i32>, ptr %svp
%vec = load <8 x i32>, ptr %vp
%v = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> %vec, <2 x i32> %sv, i64 0)
store <8 x i32> %v, ptr %vp
ret void
}

define void @insert_v8i32_v2i32_2(ptr %vp, ptr %svp) {
; VLA-LABEL: insert_v8i32_v2i32_2:
; VLA: # %bb.0:
; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; VLA-NEXT: vle32.v v8, (a0)
; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; VLA-NEXT: vle32.v v10, (a1)
; VLA-NEXT: vsetivli zero, 4, e32, m2, tu, ma
; VLA-NEXT: vslideup.vi v8, v10, 2
; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; VLA-NEXT: vse32.v v8, (a0)
; VLA-NEXT: ret
;
; VLS-LABEL: insert_v8i32_v2i32_2:
; VLS: # %bb.0:
; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; VLS-NEXT: vle32.v v8, (a1)
; VLS-NEXT: vl2re32.v v10, (a0)
; VLS-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; VLS-NEXT: vslideup.vi v10, v8, 2
; VLS-NEXT: vs2r.v v10, (a0)
; VLS-NEXT: ret
%sv = load <2 x i32>, ptr %svp
%vec = load <8 x i32>, ptr %vp
%v = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> %vec, <2 x i32> %sv, i64 2)
store <8 x i32> %v, ptr %vp
ret void
}
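; Under VLS, the <8 x i32> value spans the two-register group v10-v11, so an
; insert at element 6 is a slideup by 2 confined to the high register v11.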
define void @insert_v8i32_v2i32_6(ptr %vp, ptr %svp) {
; VLA-LABEL: insert_v8i32_v2i32_6:
; VLA: # %bb.0:
; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; VLA-NEXT: vle32.v v8, (a0)
; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; VLA-NEXT: vle32.v v10, (a1)
; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; VLA-NEXT: vslideup.vi v8, v10, 6
; VLA-NEXT: vse32.v v8, (a0)
; VLA-NEXT: ret
;
; VLS-LABEL: insert_v8i32_v2i32_6:
; VLS: # %bb.0:
; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; VLS-NEXT: vle32.v v8, (a1)
; VLS-NEXT: vl2re32.v v10, (a0)
; VLS-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; VLS-NEXT: vslideup.vi v11, v8, 2
; VLS-NEXT: vs2r.v v10, (a0)
; VLS-NEXT: ret
%sv = load <2 x i32>, ptr %svp
%vec = load <8 x i32>, ptr %vp
%v = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> %vec, <2 x i32> %sv, i64 6)
store <8 x i32> %v, ptr %vp
ret void
}

define void @insert_v8i32_undef_v2i32_6(ptr %vp, ptr %svp) {
; VLA-LABEL: insert_v8i32_undef_v2i32_6:
; VLA: # %bb.0:
; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; VLA-NEXT: vle32.v v8, (a1)
; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; VLA-NEXT: vslideup.vi v10, v8, 6
; VLA-NEXT: vse32.v v10, (a0)
; VLA-NEXT: ret
;
; VLS-LABEL: insert_v8i32_undef_v2i32_6:
; VLS: # %bb.0:
; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; VLS-NEXT: vle32.v v8, (a1)
; VLS-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; VLS-NEXT: vslideup.vi v9, v8, 2
; VLS-NEXT: vs2r.v v8, (a0)
; VLS-NEXT: ret
%sv = load <2 x i32>, ptr %svp
%v = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> undef, <2 x i32> %sv, i64 6)
store <8 x i32> %v, ptr %vp
ret void
}

define void @insert_v4i16_v2i16_0(ptr %vp, ptr %svp) {
; CHECK-LABEL: insert_v4i16_v2i16_0:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; CHECK-NEXT: vle16.v v9, (a1)
; CHECK-NEXT: vsetivli zero, 2, e16, mf2, tu, ma
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT: vse16.v v8, (a0)
; CHECK-NEXT: ret
%v = load <4 x i16>, ptr %vp
%sv = load <2 x i16>, ptr %svp
%c = call <4 x i16> @llvm.vector.insert.v2i16.v4i16(<4 x i16> %v, <2 x i16> %sv, i64 0)
store <4 x i16> %c, ptr %vp
ret void
}

define void @insert_v4i16_v2i16_2(ptr %vp, ptr %svp) {
; CHECK-LABEL: insert_v4i16_v2i16_2:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; CHECK-NEXT: vle16.v v9, (a1)
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 2
; CHECK-NEXT: vse16.v v8, (a0)
; CHECK-NEXT: ret
%v = load <4 x i16>, ptr %vp
%sv = load <2 x i16>, ptr %svp
%c = call <4 x i16> @llvm.vector.insert.v2i16.v4i16(<4 x i16> %v, <2 x i16> %sv, i64 2)
store <4 x i16> %c, ptr %vp
ret void
}

define void @insert_v32i1_v8i1_0(ptr %vp, ptr %svp) {
; VLA-LABEL: insert_v32i1_v8i1_0:
; VLA: # %bb.0:
; VLA-NEXT: li a2, 32
; VLA-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; VLA-NEXT: vlm.v v8, (a0)
; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; VLA-NEXT: vlm.v v9, (a1)
; VLA-NEXT: vsetivli zero, 1, e8, mf4, tu, ma
; VLA-NEXT: vmv.v.v v8, v9
; VLA-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; VLA-NEXT: vsm.v v8, (a0)
; VLA-NEXT: ret
;
; VLS-LABEL: insert_v32i1_v8i1_0:
; VLS: # %bb.0:
; VLS-NEXT: vsetvli a2, zero, e8, m2, ta, ma
; VLS-NEXT: vlm.v v8, (a0)
; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; VLS-NEXT: vlm.v v9, (a1)
; VLS-NEXT: vsetivli zero, 1, e8, mf4, tu, ma
; VLS-NEXT: vmv.v.v v8, v9
; VLS-NEXT: vsetvli a1, zero, e8, m2, ta, ma
; VLS-NEXT: vsm.v v8, (a0)
; VLS-NEXT: ret
%v = load <32 x i1>, ptr %vp
%sv = load <8 x i1>, ptr %svp
%c = call <32 x i1> @llvm.vector.insert.v8i1.v32i1(<32 x i1> %v, <8 x i1> %sv, i64 0)
store <32 x i1> %c, ptr %vp
ret void
}
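; Masks are bit-packed, so inserting <8 x i1> at element 16 is a byte insert
; at offset 2: a one-byte vslideup with VL=3 and a tail-undisturbed policy so
; the remaining mask bytes are preserved.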
define void @insert_v32i1_v8i1_16(ptr %vp, ptr %svp) {
; VLA-LABEL: insert_v32i1_v8i1_16:
; VLA: # %bb.0:
; VLA-NEXT: li a2, 32
; VLA-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; VLA-NEXT: vlm.v v8, (a0)
; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; VLA-NEXT: vlm.v v9, (a1)
; VLA-NEXT: vsetivli zero, 3, e8, mf4, tu, ma
; VLA-NEXT: vslideup.vi v8, v9, 2
; VLA-NEXT: vsetvli zero, a2, e8, m2, ta, ma
; VLA-NEXT: vsm.v v8, (a0)
; VLA-NEXT: ret
;
; VLS-LABEL: insert_v32i1_v8i1_16:
; VLS: # %bb.0:
; VLS-NEXT: vsetvli a2, zero, e8, m2, ta, ma
; VLS-NEXT: vlm.v v8, (a0)
; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; VLS-NEXT: vlm.v v9, (a1)
; VLS-NEXT: vsetivli zero, 3, e8, mf4, tu, ma
; VLS-NEXT: vslideup.vi v8, v9, 2
; VLS-NEXT: vsetvli a1, zero, e8, m2, ta, ma
; VLS-NEXT: vsm.v v8, (a0)
; VLS-NEXT: ret
%v = load <32 x i1>, ptr %vp
%sv = load <8 x i1>, ptr %svp
%c = call <32 x i1> @llvm.vector.insert.v8i1.v32i1(<32 x i1> %v, <8 x i1> %sv, i64 16)
store <32 x i1> %c, ptr %vp
ret void
}
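; Inserting at sub-byte granularity cannot be done on the packed mask
; directly, so both masks are widened to i8 vectors with vmerge, the insert
; is done there, and the result is narrowed back with vmsne.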
define void @insert_v8i1_v4i1_0(ptr %vp, ptr %svp) {
; CHECK-LABEL: insert_v8i1_v4i1_0:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vlm.v v0, (a0)
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT: vlm.v v8, (a1)
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vmv.v.i v9, 0
; CHECK-NEXT: vmerge.vim v9, v9, 1, v0
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT: vmv.v.i v10, 0
; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: vmerge.vim v8, v10, 1, v0
; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma
; CHECK-NEXT: vmv.v.v v9, v8
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vmsne.vi v8, v9, 0
; CHECK-NEXT: vsm.v v8, (a0)
; CHECK-NEXT: ret
%v = load <8 x i1>, ptr %vp
%sv = load <4 x i1>, ptr %svp
%c = call <8 x i1> @llvm.vector.insert.v4i1.v8i1(<8 x i1> %v, <4 x i1> %sv, i64 0)
store <8 x i1> %c, ptr %vp
ret void
}

define void @insert_v8i1_v4i1_4(ptr %vp, ptr %svp) {
; CHECK-LABEL: insert_v8i1_v4i1_4:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vlm.v v0, (a0)
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT: vlm.v v8, (a1)
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vmv.v.i v9, 0
; CHECK-NEXT: vmerge.vim v9, v9, 1, v0
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT: vmv.v.i v10, 0
; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: vmerge.vim v8, v10, 1, v0
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vslideup.vi v9, v8, 4
; CHECK-NEXT: vmsne.vi v8, v9, 0
; CHECK-NEXT: vsm.v v8, (a0)
; CHECK-NEXT: ret
%v = load <8 x i1>, ptr %vp
%sv = load <4 x i1>, ptr %svp
%c = call <8 x i1> @llvm.vector.insert.v4i1.v8i1(<8 x i1> %v, <4 x i1> %sv, i64 4)
store <8 x i1> %c, ptr %vp
ret void
}

define <vscale x 2 x i16> @insert_nxv2i16_v2i16_0(<vscale x 2 x i16> %v, ptr %svp) {
; CHECK-LABEL: insert_nxv2i16_v2i16_0:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; CHECK-NEXT: vle16.v v9, (a0)
; CHECK-NEXT: vsetivli zero, 2, e16, mf2, tu, ma
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
%sv = load <2 x i16>, ptr %svp
%c = call <vscale x 2 x i16> @llvm.vector.insert.v2i16.nxv2i16(<vscale x 2 x i16> %v, <2 x i16> %sv, i64 0)
ret <vscale x 2 x i16> %c
}

define <vscale x 2 x i16> @insert_nxv2i16_v2i16_2(<vscale x 2 x i16> %v, ptr %svp) {
; CHECK-LABEL: insert_nxv2i16_v2i16_2:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; CHECK-NEXT: vle16.v v9, (a0)
; CHECK-NEXT: vsetivli zero, 6, e16, mf2, tu, ma
; CHECK-NEXT: vslideup.vi v8, v9, 4
; CHECK-NEXT: ret
%sv = load <2 x i16>, ptr %svp
%c = call <vscale x 2 x i16> @llvm.vector.insert.v2i16.nxv2i16(<vscale x 2 x i16> %v, <2 x i16> %sv, i64 4)
ret <vscale x 2 x i16> %c
}

define <vscale x 2 x i1> @insert_nxv2i1_v4i1_0(<vscale x 2 x i1> %v, ptr %svp) {
; VLA-LABEL: insert_nxv2i1_v4i1_0:
; VLA: # %bb.0:
; VLA-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
; VLA-NEXT: vlm.v v8, (a0)
; VLA-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
; VLA-NEXT: vmv.v.i v9, 0
; VLA-NEXT: vmerge.vim v9, v9, 1, v0
; VLA-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
; VLA-NEXT: vmv.v.i v10, 0
; VLA-NEXT: vmv1r.v v0, v8
; VLA-NEXT: vmerge.vim v8, v10, 1, v0
; VLA-NEXT: vsetvli zero, zero, e8, mf4, tu, ma
; VLA-NEXT: vmv.v.v v9, v8
; VLA-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
; VLA-NEXT: vmsne.vi v0, v9, 0
; VLA-NEXT: ret
;
; VLS-LABEL: insert_nxv2i1_v4i1_0:
; VLS: # %bb.0:
; VLS-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
; VLS-NEXT: vlm.v v8, (a0)
; VLS-NEXT: vmv.v.i v9, 0
; VLS-NEXT: vmerge.vim v10, v9, 1, v0
; VLS-NEXT: vmv1r.v v0, v8
; VLS-NEXT: vmerge.vim v8, v9, 1, v0
; VLS-NEXT: vsetvli zero, zero, e8, mf4, tu, ma
; VLS-NEXT: vmv.v.v v10, v8
; VLS-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
; VLS-NEXT: vmsne.vi v0, v10, 0
; VLS-NEXT: ret
%sv = load <4 x i1>, ptr %svp
%c = call <vscale x 2 x i1> @llvm.vector.insert.v4i1.nxv2i1(<vscale x 2 x i1> %v, <4 x i1> %sv, i64 0)
ret <vscale x 2 x i1> %c
}

define <vscale x 8 x i1> @insert_nxv8i1_v4i1_0(<vscale x 8 x i1> %v, ptr %svp) {
; CHECK-LABEL: insert_nxv8i1_v4i1_0:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vlm.v v8, (a0)
; CHECK-NEXT: vsetivli zero, 1, e8, mf8, tu, ma
; CHECK-NEXT: vmv.v.v v0, v8
; CHECK-NEXT: ret
%sv = load <8 x i1>, ptr %svp
%c = call <vscale x 8 x i1> @llvm.vector.insert.v8i1.nxv8i1(<vscale x 8 x i1> %v, <8 x i1> %sv, i64 0)
ret <vscale x 8 x i1> %c
}

define <vscale x 8 x i1> @insert_nxv8i1_v8i1_16(<vscale x 8 x i1> %v, ptr %svp) {
; CHECK-LABEL: insert_nxv8i1_v8i1_16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vlm.v v8, (a0)
; CHECK-NEXT: vsetivli zero, 3, e8, mf8, tu, ma
; CHECK-NEXT: vslideup.vi v0, v8, 2
; CHECK-NEXT: ret
%sv = load <8 x i1>, ptr %svp
%c = call <vscale x 8 x i1> @llvm.vector.insert.v8i1.nxv8i1(<vscale x 8 x i1> %v, <8 x i1> %sv, i64 16)
ret <vscale x 8 x i1> %c
}

declare <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64>, <2 x i64>, i64)
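; <vscale x 16 x i64> is twice the largest legal LMUL=8 type, so it is split
; into two nxv8i64 halves. Inserting a fixed <2 x i64> at a small index is
; guaranteed to land in the low half and can be done in registers.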
define void @insert_v2i64_nxv16i64(ptr %psv0, ptr %psv1, ptr %out) {
; VLA-LABEL: insert_v2i64_nxv16i64:
; VLA: # %bb.0:
; VLA-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; VLA-NEXT: vle64.v v8, (a0)
; VLA-NEXT: vle64.v v16, (a1)
; VLA-NEXT: vsetivli zero, 6, e64, m8, tu, ma
; VLA-NEXT: vslideup.vi v8, v16, 4
; VLA-NEXT: vs8r.v v8, (a2)
; VLA-NEXT: ret
;
; VLS-LABEL: insert_v2i64_nxv16i64:
; VLS: # %bb.0:
; VLS-NEXT: vl1re64.v v8, (a0)
; VLS-NEXT: vl1re64.v v10, (a1)
; VLS-NEXT: vs8r.v v8, (a2)
; VLS-NEXT: ret
%sv0 = load <2 x i64>, ptr %psv0
%sv1 = load <2 x i64>, ptr %psv1
%v0 = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv0, i64 0)
%v = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> %v0, <2 x i64> %sv1, i64 4)
store <vscale x 16 x i64> %v, ptr %out
ret void
}

define void @insert_v2i64_nxv16i64_lo0(ptr %psv, ptr %out) {
; VLA-LABEL: insert_v2i64_nxv16i64_lo0:
; VLA: # %bb.0:
; VLA-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; VLA-NEXT: vle64.v v8, (a0)
; VLA-NEXT: vs8r.v v8, (a1)
; VLA-NEXT: ret
;
; VLS-LABEL: insert_v2i64_nxv16i64_lo0:
; VLS: # %bb.0:
; VLS-NEXT: vl1re64.v v8, (a0)
; VLS-NEXT: vs8r.v v8, (a1)
; VLS-NEXT: ret
%sv = load <2 x i64>, ptr %psv
%v = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv, i64 0)
store <vscale x 16 x i64> %v, ptr %out
ret void
}

define void @insert_v2i64_nxv16i64_lo2(ptr %psv, ptr %out) {
; VLA-LABEL: insert_v2i64_nxv16i64_lo2:
; VLA: # %bb.0:
; VLA-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; VLA-NEXT: vle64.v v8, (a0)
; VLA-NEXT: vsetivli zero, 4, e64, m8, ta, ma
; VLA-NEXT: vslideup.vi v16, v8, 2
; VLA-NEXT: vs8r.v v16, (a1)
; VLA-NEXT: ret
;
; VLS-LABEL: insert_v2i64_nxv16i64_lo2:
; VLS: # %bb.0:
; VLS-NEXT: vl1re64.v v9, (a0)
; VLS-NEXT: vs8r.v v8, (a1)
; VLS-NEXT: ret
%sv = load <2 x i64>, ptr %psv
%v = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv, i64 2)
store <vscale x 16 x i64> %v, ptr %out
ret void
}

; Check we don't mistakenly optimize this: we don't know whether this is
; inserted into the low or high split vector.
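; The insert at index 8 therefore goes through a stack temporary: the
; subvector is stored at the byte offset of element 8, and the result is
; read back and stored out as two nxv8i64 halves.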
define void @insert_v2i64_nxv16i64_hi(ptr %psv, ptr %out) {
; RV32VLA-LABEL: insert_v2i64_nxv16i64_hi:
; RV32VLA: # %bb.0:
; RV32VLA-NEXT: addi sp, sp, -80
; RV32VLA-NEXT: .cfi_def_cfa_offset 80
; RV32VLA-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
; RV32VLA-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
; RV32VLA-NEXT: .cfi_offset ra, -4
; RV32VLA-NEXT: .cfi_offset s0, -8
; RV32VLA-NEXT: addi s0, sp, 80
; RV32VLA-NEXT: .cfi_def_cfa s0, 0
; RV32VLA-NEXT: csrr a2, vlenb
; RV32VLA-NEXT: slli a2, a2, 4
; RV32VLA-NEXT: sub sp, sp, a2
; RV32VLA-NEXT: andi sp, sp, -64
; RV32VLA-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV32VLA-NEXT: vle64.v v8, (a0)
; RV32VLA-NEXT: addi a0, sp, 128
; RV32VLA-NEXT: csrr a2, vlenb
; RV32VLA-NEXT: addi a3, sp, 64
; RV32VLA-NEXT: slli a2, a2, 3
; RV32VLA-NEXT: vse64.v v8, (a0)
; RV32VLA-NEXT: add a0, a3, a2
; RV32VLA-NEXT: vl8re64.v v8, (a0)
; RV32VLA-NEXT: vl8re64.v v16, (a3)
; RV32VLA-NEXT: add a2, a1, a2
; RV32VLA-NEXT: vs8r.v v8, (a2)
; RV32VLA-NEXT: vs8r.v v16, (a1)
; RV32VLA-NEXT: addi sp, s0, -80
; RV32VLA-NEXT: .cfi_def_cfa sp, 80
; RV32VLA-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
; RV32VLA-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
; RV32VLA-NEXT: .cfi_restore ra
; RV32VLA-NEXT: .cfi_restore s0
; RV32VLA-NEXT: addi sp, sp, 80
; RV32VLA-NEXT: .cfi_def_cfa_offset 0
; RV32VLA-NEXT: ret
;
; RV64VLA-LABEL: insert_v2i64_nxv16i64_hi:
; RV64VLA: # %bb.0:
; RV64VLA-NEXT: addi sp, sp, -80
; RV64VLA-NEXT: .cfi_def_cfa_offset 80
; RV64VLA-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
; RV64VLA-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
; RV64VLA-NEXT: .cfi_offset ra, -8
; RV64VLA-NEXT: .cfi_offset s0, -16
; RV64VLA-NEXT: addi s0, sp, 80
; RV64VLA-NEXT: .cfi_def_cfa s0, 0
; RV64VLA-NEXT: csrr a2, vlenb
; RV64VLA-NEXT: slli a2, a2, 4
; RV64VLA-NEXT: sub sp, sp, a2
; RV64VLA-NEXT: andi sp, sp, -64
; RV64VLA-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV64VLA-NEXT: vle64.v v8, (a0)
; RV64VLA-NEXT: addi a0, sp, 128
; RV64VLA-NEXT: csrr a2, vlenb
; RV64VLA-NEXT: addi a3, sp, 64
; RV64VLA-NEXT: slli a2, a2, 3
; RV64VLA-NEXT: vse64.v v8, (a0)
; RV64VLA-NEXT: add a0, a3, a2
; RV64VLA-NEXT: vl8re64.v v8, (a0)
; RV64VLA-NEXT: vl8re64.v v16, (a3)
; RV64VLA-NEXT: add a2, a1, a2
; RV64VLA-NEXT: vs8r.v v8, (a2)
; RV64VLA-NEXT: vs8r.v v16, (a1)
; RV64VLA-NEXT: addi sp, s0, -80
; RV64VLA-NEXT: .cfi_def_cfa sp, 80
; RV64VLA-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; RV64VLA-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; RV64VLA-NEXT: .cfi_restore ra
; RV64VLA-NEXT: .cfi_restore s0
; RV64VLA-NEXT: addi sp, sp, 80
; RV64VLA-NEXT: .cfi_def_cfa_offset 0
; RV64VLA-NEXT: ret
;
; RV32VLS-LABEL: insert_v2i64_nxv16i64_hi:
; RV32VLS: # %bb.0:
; RV32VLS-NEXT: addi sp, sp, -80
; RV32VLS-NEXT: .cfi_def_cfa_offset 80
; RV32VLS-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
; RV32VLS-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
; RV32VLS-NEXT: .cfi_offset ra, -4
; RV32VLS-NEXT: .cfi_offset s0, -8
; RV32VLS-NEXT: addi s0, sp, 80
; RV32VLS-NEXT: .cfi_def_cfa s0, 0
; RV32VLS-NEXT: addi sp, sp, -256
; RV32VLS-NEXT: andi sp, sp, -64
; RV32VLS-NEXT: vl1re64.v v8, (a0)
; RV32VLS-NEXT: addi a0, sp, 128
; RV32VLS-NEXT: vs1r.v v8, (a0)
; RV32VLS-NEXT: addi a0, sp, 192
; RV32VLS-NEXT: vl8re64.v v8, (a0)
; RV32VLS-NEXT: addi a0, sp, 64
; RV32VLS-NEXT: vl8re64.v v16, (a0)
; RV32VLS-NEXT: addi a0, a1, 128
; RV32VLS-NEXT: vs8r.v v8, (a0)
; RV32VLS-NEXT: vs8r.v v16, (a1)
; RV32VLS-NEXT: addi sp, s0, -80
; RV32VLS-NEXT: .cfi_def_cfa sp, 80
; RV32VLS-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
; RV32VLS-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
; RV32VLS-NEXT: .cfi_restore ra
; RV32VLS-NEXT: .cfi_restore s0
; RV32VLS-NEXT: addi sp, sp, 80
; RV32VLS-NEXT: .cfi_def_cfa_offset 0
; RV32VLS-NEXT: ret
;
; RV64VLS-LABEL: insert_v2i64_nxv16i64_hi:
; RV64VLS: # %bb.0:
; RV64VLS-NEXT: addi sp, sp, -80
; RV64VLS-NEXT: .cfi_def_cfa_offset 80
; RV64VLS-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
; RV64VLS-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
; RV64VLS-NEXT: .cfi_offset ra, -8
; RV64VLS-NEXT: .cfi_offset s0, -16
; RV64VLS-NEXT: addi s0, sp, 80
; RV64VLS-NEXT: .cfi_def_cfa s0, 0
; RV64VLS-NEXT: addi sp, sp, -256
; RV64VLS-NEXT: andi sp, sp, -64
; RV64VLS-NEXT: vl1re64.v v8, (a0)
; RV64VLS-NEXT: addi a0, sp, 128
; RV64VLS-NEXT: vs1r.v v8, (a0)
; RV64VLS-NEXT: addi a0, sp, 192
; RV64VLS-NEXT: vl8re64.v v8, (a0)
; RV64VLS-NEXT: addi a0, sp, 64
; RV64VLS-NEXT: vl8re64.v v16, (a0)
; RV64VLS-NEXT: addi a0, a1, 128
; RV64VLS-NEXT: vs8r.v v8, (a0)
; RV64VLS-NEXT: vs8r.v v16, (a1)
; RV64VLS-NEXT: addi sp, s0, -80
; RV64VLS-NEXT: .cfi_def_cfa sp, 80
; RV64VLS-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; RV64VLS-NEXT: ld s0, 72(sp) # 8-byte Folded Reload
; RV64VLS-NEXT: .cfi_restore ra
; RV64VLS-NEXT: .cfi_restore s0
; RV64VLS-NEXT: addi sp, sp, 80
; RV64VLS-NEXT: .cfi_def_cfa_offset 0
; RV64VLS-NEXT: ret
%sv = load <2 x i64>, ptr %psv
%v = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv, i64 8)
store <vscale x 16 x i64> %v, ptr %out
ret void
}
define <vscale x 8 x bfloat> @insert_nxv8bf16_v2bf16_0(<vscale x 8 x bfloat> %vec, ptr %svp) {
; VLA-LABEL: insert_nxv8bf16_v2bf16_0:
; VLA: # %bb.0:
; VLA-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; VLA-NEXT: vle16.v v10, (a0)
; VLA-NEXT: vsetivli zero, 2, e16, m2, tu, ma
; VLA-NEXT: vmv.v.v v8, v10
; VLA-NEXT: ret
;
; VLS-LABEL: insert_nxv8bf16_v2bf16_0:
; VLS: # %bb.0:
; VLS-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; VLS-NEXT: vle16.v v10, (a0)
; VLS-NEXT: vsetivli zero, 2, e16, m1, tu, ma
; VLS-NEXT: vmv.v.v v8, v10
; VLS-NEXT: ret
%sv = load <2 x bfloat>, ptr %svp
%v = call <vscale x 8 x bfloat> @llvm.vector.insert.v2bf16.nxv8bf16(<vscale x 8 x bfloat> %vec, <2 x bfloat> %sv, i64 0)
ret <vscale x 8 x bfloat> %v
}

define <vscale x 8 x bfloat> @insert_nxv8bf16_v2bf16_2(<vscale x 8 x bfloat> %vec, ptr %svp) {
; VLA-LABEL: insert_nxv8bf16_v2bf16_2:
; VLA: # %bb.0:
; VLA-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; VLA-NEXT: vle16.v v10, (a0)
; VLA-NEXT: vsetivli zero, 4, e16, m2, tu, ma
; VLA-NEXT: vslideup.vi v8, v10, 2
; VLA-NEXT: ret
;
; VLS-LABEL: insert_nxv8bf16_v2bf16_2:
; VLS: # %bb.0:
; VLS-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; VLS-NEXT: vle16.v v10, (a0)
; VLS-NEXT: vsetivli zero, 4, e16, m1, tu, ma
; VLS-NEXT: vslideup.vi v8, v10, 2
; VLS-NEXT: ret
%sv = load <2 x bfloat>, ptr %svp
%v = call <vscale x 8 x bfloat> @llvm.vector.insert.v2bf16.nxv8bf16(<vscale x 8 x bfloat> %vec, <2 x bfloat> %sv, i64 2)
ret <vscale x 8 x bfloat> %v
}

define <vscale x 8 x half> @insert_nxv8f16_v2f16_0(<vscale x 8 x half> %vec, ptr %svp) {
; VLA-LABEL: insert_nxv8f16_v2f16_0:
; VLA: # %bb.0:
; VLA-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; VLA-NEXT: vle16.v v10, (a0)
; VLA-NEXT: vsetivli zero, 2, e16, m2, tu, ma
; VLA-NEXT: vmv.v.v v8, v10
; VLA-NEXT: ret
;
; VLS-LABEL: insert_nxv8f16_v2f16_0:
; VLS: # %bb.0:
; VLS-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; VLS-NEXT: vle16.v v10, (a0)
; VLS-NEXT: vsetivli zero, 2, e16, m1, tu, ma
; VLS-NEXT: vmv.v.v v8, v10
; VLS-NEXT: ret
%sv = load <2 x half>, ptr %svp
%v = call <vscale x 8 x half> @llvm.vector.insert.v2f16.nxv8f16(<vscale x 8 x half> %vec, <2 x half> %sv, i64 0)
ret <vscale x 8 x half> %v
}

define <vscale x 8 x half> @insert_nxv8f16_v2f16_2(<vscale x 8 x half> %vec, ptr %svp) {
; VLA-LABEL: insert_nxv8f16_v2f16_2:
; VLA: # %bb.0:
; VLA-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; VLA-NEXT: vle16.v v10, (a0)
; VLA-NEXT: vsetivli zero, 4, e16, m2, tu, ma
; VLA-NEXT: vslideup.vi v8, v10, 2
; VLA-NEXT: ret
;
; VLS-LABEL: insert_nxv8f16_v2f16_2:
; VLS: # %bb.0:
; VLS-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; VLS-NEXT: vle16.v v10, (a0)
; VLS-NEXT: vsetivli zero, 4, e16, m1, tu, ma
; VLS-NEXT: vslideup.vi v8, v10, 2
; VLS-NEXT: ret
%sv = load <2 x half>, ptr %svp
%v = call <vscale x 8 x half> @llvm.vector.insert.v2f16.nxv8f16(<vscale x 8 x half> %vec, <2 x half> %sv, i64 2)
ret <vscale x 8 x half> %v
}

declare <8 x i1> @llvm.vector.insert.v4i1.v8i1(<8 x i1>, <4 x i1>, i64)
declare <32 x i1> @llvm.vector.insert.v8i1.v32i1(<32 x i1>, <8 x i1>, i64)

declare <4 x i16> @llvm.vector.insert.v2i16.v4i16(<4 x i16>, <2 x i16>, i64)

declare <4 x i32> @llvm.vector.insert.v2i32.v4i32(<4 x i32>, <2 x i32>, i64)
declare <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32>, <2 x i32>, i64)

declare <vscale x 2 x i1> @llvm.vector.insert.v4i1.nxv2i1(<vscale x 2 x i1>, <4 x i1>, i64)
declare <vscale x 8 x i1> @llvm.vector.insert.v8i1.nxv8i1(<vscale x 8 x i1>, <8 x i1>, i64)

declare <vscale x 2 x i16> @llvm.vector.insert.v2i16.nxv2i16(<vscale x 2 x i16>, <2 x i16>, i64)

declare <vscale x 8 x i32> @llvm.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32>, <2 x i32>, i64)
declare <vscale x 8 x i32> @llvm.vector.insert.v4i32.nxv8i32(<vscale x 8 x i32>, <4 x i32>, i64)
declare <vscale x 8 x i32> @llvm.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32>, <8 x i32>, i64)

; We emit insert_subvectors of fixed vectors at index 0 into undefs as a
; copy_to_regclass or insert_subreg, depending on the register classes of the
; vector types. Make sure that we use the correct type and not the shrunken
; LMUL=1 type, otherwise we will end up with an invalid extract_subvector when
; converting it from scalable->fixed, e.g. we get this for VLEN=128:
;
; t14: nxv2i32 = insert_subvector undef:nxv2i32, t4, Constant:i64<0>
; t15: v8i32 = extract_subvector t14, Constant:i64<0>
declare <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32>, i64)
define <4 x i32> @insert_extract_v8i32_v2i32_0(<2 x i32> %v) {
; CHECK-LABEL: insert_extract_v8i32_v2i32_0:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
%1 = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> poison, <2 x i32> %v, i64 0)
%2 = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> %1, i64 0)
ret <4 x i32> %2
}