; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+m,+v -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,LMULMAX2
; RUN: llc -mtriple=riscv32 -mattr=+m,+v -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,LMULMAX1
; RUN: llc -mtriple=riscv64 -mattr=+m,+v -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,LMULMAX2
; RUN: llc -mtriple=riscv64 -mattr=+m,+v -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,LMULMAX1

; RUN: llc -mtriple=riscv32 -mattr=+m,+v -riscv-v-fixed-length-vector-lmul-max=2 -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,LMULMAX2
; RUN: llc -mtriple=riscv32 -mattr=+m,+v -riscv-v-fixed-length-vector-lmul-max=1 -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,LMULMAX1
; RUN: llc -mtriple=riscv64 -mattr=+m,+v -riscv-v-fixed-length-vector-lmul-max=2 -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,LMULMAX2
; RUN: llc -mtriple=riscv64 -mattr=+m,+v -riscv-v-fixed-length-vector-lmul-max=1 -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,LMULMAX1

define <vscale x 8 x i32> @insert_nxv8i32_v2i32_0(<vscale x 8 x i32> %vec, ptr %svp) {
; CHECK-LABEL: insert_nxv8i32_v2i32_0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v12, (a0)
; CHECK-NEXT:    vsetivli zero, 2, e32, m4, tu, ma
; CHECK-NEXT:    vmv.v.v v8, v12
; CHECK-NEXT:    ret
  %sv = load <2 x i32>, ptr %svp
  %v = call <vscale x 8 x i32> @llvm.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32> %vec, <2 x i32> %sv, i64 0)
  ret <vscale x 8 x i32> %v
}

define <vscale x 8 x i32> @insert_nxv8i32_v2i32_2(<vscale x 8 x i32> %vec, ptr %svp) {
; CHECK-LABEL: insert_nxv8i32_v2i32_2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v12, (a0)
; CHECK-NEXT:    vsetivli zero, 4, e32, m4, tu, ma
; CHECK-NEXT:    vslideup.vi v8, v12, 2
; CHECK-NEXT:    ret
  %sv = load <2 x i32>, ptr %svp
  %v = call <vscale x 8 x i32> @llvm.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32> %vec, <2 x i32> %sv, i64 2)
  ret <vscale x 8 x i32> %v
}

define <vscale x 8 x i32> @insert_nxv8i32_v2i32_6(<vscale x 8 x i32> %vec, ptr %svp) {
; CHECK-LABEL: insert_nxv8i32_v2i32_6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v12, (a0)
; CHECK-NEXT:    vsetivli zero, 8, e32, m4, tu, ma
; CHECK-NEXT:    vslideup.vi v8, v12, 6
; CHECK-NEXT:    ret
  %sv = load <2 x i32>, ptr %svp
  %v = call <vscale x 8 x i32> @llvm.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32> %vec, <2 x i32> %sv, i64 6)
  ret <vscale x 8 x i32> %v
}

define <vscale x 8 x i32> @insert_nxv8i32_v8i32_0(<vscale x 8 x i32> %vec, ptr %svp) {
; LMULMAX2-LABEL: insert_nxv8i32_v8i32_0:
; LMULMAX2:       # %bb.0:
; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; LMULMAX2-NEXT:    vle32.v v12, (a0)
; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m4, tu, ma
; LMULMAX2-NEXT:    vmv.v.v v8, v12
; LMULMAX2-NEXT:    ret
;
; LMULMAX1-LABEL: insert_nxv8i32_v8i32_0:
; LMULMAX1:       # %bb.0:
; LMULMAX1-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; LMULMAX1-NEXT:    vle32.v v12, (a0)
; LMULMAX1-NEXT:    addi a0, a0, 16
; LMULMAX1-NEXT:    vle32.v v16, (a0)
; LMULMAX1-NEXT:    vsetivli zero, 4, e32, m4, tu, ma
; LMULMAX1-NEXT:    vmv.v.v v8, v12
; LMULMAX1-NEXT:    vsetivli zero, 8, e32, m4, tu, ma
; LMULMAX1-NEXT:    vslideup.vi v8, v16, 4
; LMULMAX1-NEXT:    ret
  %sv = load <8 x i32>, ptr %svp
  %v = call <vscale x 8 x i32> @llvm.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32> %vec, <8 x i32> %sv, i64 0)
  ret <vscale x 8 x i32> %v
}

define <vscale x 8 x i32> @insert_nxv8i32_v8i32_8(<vscale x 8 x i32> %vec, ptr %svp) {
; LMULMAX2-LABEL: insert_nxv8i32_v8i32_8:
; LMULMAX2:       # %bb.0:
; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; LMULMAX2-NEXT:    vle32.v v12, (a0)
; LMULMAX2-NEXT:    vsetivli zero, 16, e32, m4, tu, ma
; LMULMAX2-NEXT:    vslideup.vi v8, v12, 8
; LMULMAX2-NEXT:    ret
;
; LMULMAX1-LABEL: insert_nxv8i32_v8i32_8:
; LMULMAX1:       # %bb.0:
; LMULMAX1-NEXT:    addi a1, a0, 16
; LMULMAX1-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; LMULMAX1-NEXT:    vle32.v v12, (a1)
; LMULMAX1-NEXT:    vle32.v v16, (a0)
; LMULMAX1-NEXT:    vsetivli zero, 12, e32, m4, tu, ma
; LMULMAX1-NEXT:    vslideup.vi v8, v16, 8
; LMULMAX1-NEXT:    vsetivli zero, 16, e32, m4, tu, ma
; LMULMAX1-NEXT:    vslideup.vi v8, v12, 12
; LMULMAX1-NEXT:    ret
  %sv = load <8 x i32>, ptr %svp
  %v = call <vscale x 8 x i32> @llvm.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32> %vec, <8 x i32> %sv, i64 8)
  ret <vscale x 8 x i32> %v
}

define <vscale x 8 x i32> @insert_nxv8i32_undef_v2i32_0(ptr %svp) {
; CHECK-LABEL: insert_nxv8i32_undef_v2i32_0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    ret
  %sv = load <2 x i32>, ptr %svp
  %v = call <vscale x 8 x i32> @llvm.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32> undef, <2 x i32> %sv, i64 0)
  ret <vscale x 8 x i32> %v
}

define void @insert_v4i32_v2i32_0(ptr %vp, ptr %svp) {
; CHECK-LABEL: insert_v4i32_v2i32_0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a1)
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v9, (a0)
; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
; CHECK-NEXT:    vmv.v.v v9, v8
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vse32.v v9, (a0)
; CHECK-NEXT:    ret
  %sv = load <2 x i32>, ptr %svp
  %vec = load <4 x i32>, ptr %vp
  %v = call <4 x i32> @llvm.vector.insert.v2i32.v4i32(<4 x i32> %vec, <2 x i32> %sv, i64 0)
  store <4 x i32> %v, ptr %vp
  ret void
}

define void @insert_v4i32_v2i32_2(ptr %vp, ptr %svp) {
; CHECK-LABEL: insert_v4i32_v2i32_2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a1)
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v9, (a0)
; CHECK-NEXT:    vslideup.vi v9, v8, 2
; CHECK-NEXT:    vse32.v v9, (a0)
; CHECK-NEXT:    ret
  %sv = load <2 x i32>, ptr %svp
  %vec = load <4 x i32>, ptr %vp
  %v = call <4 x i32> @llvm.vector.insert.v2i32.v4i32(<4 x i32> %vec, <2 x i32> %sv, i64 2)
  store <4 x i32> %v, ptr %vp
  ret void
}

define void @insert_v4i32_undef_v2i32_0(ptr %vp, ptr %svp) {
; CHECK-LABEL: insert_v4i32_undef_v2i32_0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a1)
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vse32.v v8, (a0)
; CHECK-NEXT:    ret
  %sv = load <2 x i32>, ptr %svp
  %v = call <4 x i32> @llvm.vector.insert.v2i32.v4i32(<4 x i32> undef, <2 x i32> %sv, i64 0)
  store <4 x i32> %v, ptr %vp
  ret void
}

define void @insert_v8i32_v2i32_0(ptr %vp, ptr %svp) {
; LMULMAX2-LABEL: insert_v8i32_v2i32_0:
; LMULMAX2:       # %bb.0:
; LMULMAX2-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; LMULMAX2-NEXT:    vle32.v v8, (a1)
; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; LMULMAX2-NEXT:    vle32.v v10, (a0)
; LMULMAX2-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
; LMULMAX2-NEXT:    vmv.v.v v10, v8
; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; LMULMAX2-NEXT:    vse32.v v10, (a0)
; LMULMAX2-NEXT:    ret
;
; LMULMAX1-LABEL: insert_v8i32_v2i32_0:
; LMULMAX1:       # %bb.0:
; LMULMAX1-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; LMULMAX1-NEXT:    vle32.v v8, (a1)
; LMULMAX1-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; LMULMAX1-NEXT:    vle32.v v9, (a0)
; LMULMAX1-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
; LMULMAX1-NEXT:    vmv.v.v v9, v8
; LMULMAX1-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; LMULMAX1-NEXT:    vse32.v v9, (a0)
; LMULMAX1-NEXT:    ret
  %sv = load <2 x i32>, ptr %svp
  %vec = load <8 x i32>, ptr %vp
  %v = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> %vec, <2 x i32> %sv, i64 0)
  store <8 x i32> %v, ptr %vp
  ret void
}

define void @insert_v8i32_v2i32_2(ptr %vp, ptr %svp) {
; LMULMAX2-LABEL: insert_v8i32_v2i32_2:
; LMULMAX2:       # %bb.0:
; LMULMAX2-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; LMULMAX2-NEXT:    vle32.v v8, (a1)
; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; LMULMAX2-NEXT:    vle32.v v10, (a0)
; LMULMAX2-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
; LMULMAX2-NEXT:    vslideup.vi v10, v8, 2
; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; LMULMAX2-NEXT:    vse32.v v10, (a0)
; LMULMAX2-NEXT:    ret
;
; LMULMAX1-LABEL: insert_v8i32_v2i32_2:
; LMULMAX1:       # %bb.0:
; LMULMAX1-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; LMULMAX1-NEXT:    vle32.v v8, (a1)
; LMULMAX1-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; LMULMAX1-NEXT:    vle32.v v9, (a0)
; LMULMAX1-NEXT:    vslideup.vi v9, v8, 2
; LMULMAX1-NEXT:    vse32.v v9, (a0)
; LMULMAX1-NEXT:    ret
  %sv = load <2 x i32>, ptr %svp
  %vec = load <8 x i32>, ptr %vp
  %v = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> %vec, <2 x i32> %sv, i64 2)
  store <8 x i32> %v, ptr %vp
  ret void
}

define void @insert_v8i32_v2i32_6(ptr %vp, ptr %svp) {
; LMULMAX2-LABEL: insert_v8i32_v2i32_6:
; LMULMAX2:       # %bb.0:
; LMULMAX2-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; LMULMAX2-NEXT:    vle32.v v8, (a1)
; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; LMULMAX2-NEXT:    vle32.v v10, (a0)
; LMULMAX2-NEXT:    vslideup.vi v10, v8, 6
; LMULMAX2-NEXT:    vse32.v v10, (a0)
; LMULMAX2-NEXT:    ret
;
; LMULMAX1-LABEL: insert_v8i32_v2i32_6:
; LMULMAX1:       # %bb.0:
; LMULMAX1-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; LMULMAX1-NEXT:    vle32.v v8, (a1)
; LMULMAX1-NEXT:    addi a0, a0, 16
; LMULMAX1-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; LMULMAX1-NEXT:    vle32.v v9, (a0)
; LMULMAX1-NEXT:    vslideup.vi v9, v8, 2
; LMULMAX1-NEXT:    vse32.v v9, (a0)
; LMULMAX1-NEXT:    ret
  %sv = load <2 x i32>, ptr %svp
  %vec = load <8 x i32>, ptr %vp
  %v = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> %vec, <2 x i32> %sv, i64 6)
  store <8 x i32> %v, ptr %vp
  ret void
}

define void @insert_v8i32_undef_v2i32_6(ptr %vp, ptr %svp) {
; LMULMAX2-LABEL: insert_v8i32_undef_v2i32_6:
; LMULMAX2:       # %bb.0:
; LMULMAX2-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; LMULMAX2-NEXT:    vle32.v v8, (a1)
; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; LMULMAX2-NEXT:    vslideup.vi v10, v8, 6
; LMULMAX2-NEXT:    vse32.v v10, (a0)
; LMULMAX2-NEXT:    ret
;
; LMULMAX1-LABEL: insert_v8i32_undef_v2i32_6:
; LMULMAX1:       # %bb.0:
; LMULMAX1-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; LMULMAX1-NEXT:    vle32.v v8, (a1)
; LMULMAX1-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; LMULMAX1-NEXT:    vslideup.vi v9, v8, 2
; LMULMAX1-NEXT:    addi a0, a0, 16
; LMULMAX1-NEXT:    vse32.v v9, (a0)
; LMULMAX1-NEXT:    ret
  %sv = load <2 x i32>, ptr %svp
  %v = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> undef, <2 x i32> %sv, i64 6)
  store <8 x i32> %v, ptr %vp
  ret void
}

define void @insert_v4i16_v2i16_0(ptr %vp, ptr %svp) {
; CHECK-LABEL: insert_v4i16_v2i16_0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
; CHECK-NEXT:    vle16.v v9, (a1)
; CHECK-NEXT:    vsetivli zero, 2, e16, mf2, tu, ma
; CHECK-NEXT:    vmv.v.v v8, v9
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vse16.v v8, (a0)
; CHECK-NEXT:    ret
  %v = load <4 x i16>, ptr %vp
  %sv = load <2 x i16>, ptr %svp
  %c = call <4 x i16> @llvm.vector.insert.v2i16.v4i16(<4 x i16> %v, <2 x i16> %sv, i64 0)
  store <4 x i16> %c, ptr %vp
  ret void
}

define void @insert_v4i16_v2i16_2(ptr %vp, ptr %svp) {
; CHECK-LABEL: insert_v4i16_v2i16_2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
; CHECK-NEXT:    vle16.v v9, (a1)
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vslideup.vi v8, v9, 2
; CHECK-NEXT:    vse16.v v8, (a0)
; CHECK-NEXT:    ret
  %v = load <4 x i16>, ptr %vp
  %sv = load <2 x i16>, ptr %svp
  %c = call <4 x i16> @llvm.vector.insert.v2i16.v4i16(<4 x i16> %v, <2 x i16> %sv, i64 2)
  store <4 x i16> %c, ptr %vp
  ret void
}

define void @insert_v32i1_v8i1_0(ptr %vp, ptr %svp) {
; LMULMAX2-LABEL: insert_v32i1_v8i1_0:
; LMULMAX2:       # %bb.0:
; LMULMAX2-NEXT:    li a2, 32
; LMULMAX2-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
; LMULMAX2-NEXT:    vlm.v v8, (a0)
; LMULMAX2-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; LMULMAX2-NEXT:    vlm.v v9, (a1)
; LMULMAX2-NEXT:    vsetivli zero, 1, e8, mf4, tu, ma
; LMULMAX2-NEXT:    vmv.v.v v8, v9
; LMULMAX2-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
; LMULMAX2-NEXT:    vsm.v v8, (a0)
; LMULMAX2-NEXT:    ret
;
; LMULMAX1-LABEL: insert_v32i1_v8i1_0:
; LMULMAX1:       # %bb.0:
; LMULMAX1-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
; LMULMAX1-NEXT:    vlm.v v8, (a0)
; LMULMAX1-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; LMULMAX1-NEXT:    vlm.v v9, (a1)
; LMULMAX1-NEXT:    vsetivli zero, 1, e8, mf8, tu, ma
; LMULMAX1-NEXT:    vmv.v.v v8, v9
; LMULMAX1-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
; LMULMAX1-NEXT:    vsm.v v8, (a0)
; LMULMAX1-NEXT:    ret
  %v = load <32 x i1>, ptr %vp
  %sv = load <8 x i1>, ptr %svp
  %c = call <32 x i1> @llvm.vector.insert.v8i1.v32i1(<32 x i1> %v, <8 x i1> %sv, i64 0)
  store <32 x i1> %c, ptr %vp
  ret void
}

define void @insert_v32i1_v8i1_16(ptr %vp, ptr %svp) {
; LMULMAX2-LABEL: insert_v32i1_v8i1_16:
; LMULMAX2:       # %bb.0:
; LMULMAX2-NEXT:    li a2, 32
; LMULMAX2-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
; LMULMAX2-NEXT:    vlm.v v8, (a0)
; LMULMAX2-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; LMULMAX2-NEXT:    vlm.v v9, (a1)
; LMULMAX2-NEXT:    vsetivli zero, 3, e8, mf4, tu, ma
; LMULMAX2-NEXT:    vslideup.vi v8, v9, 2
; LMULMAX2-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
; LMULMAX2-NEXT:    vsm.v v8, (a0)
; LMULMAX2-NEXT:    ret
;
; LMULMAX1-LABEL: insert_v32i1_v8i1_16:
; LMULMAX1:       # %bb.0:
; LMULMAX1-NEXT:    addi a0, a0, 2
; LMULMAX1-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
; LMULMAX1-NEXT:    vlm.v v8, (a0)
; LMULMAX1-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; LMULMAX1-NEXT:    vlm.v v9, (a1)
; LMULMAX1-NEXT:    vsetivli zero, 1, e8, mf8, tu, ma
; LMULMAX1-NEXT:    vmv.v.v v8, v9
; LMULMAX1-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
; LMULMAX1-NEXT:    vsm.v v8, (a0)
; LMULMAX1-NEXT:    ret
  %v = load <32 x i1>, ptr %vp
  %sv = load <8 x i1>, ptr %svp
  %c = call <32 x i1> @llvm.vector.insert.v8i1.v32i1(<32 x i1> %v, <8 x i1> %sv, i64 16)
  store <32 x i1> %c, ptr %vp
  ret void
}

define void @insert_v8i1_v4i1_0(ptr %vp, ptr %svp) {
; CHECK-LABEL: insert_v8i1_v4i1_0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT:    vlm.v v0, (a0)
; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT:    vlm.v v8, (a1)
; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT:    vmv.v.i v9, 0
; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT:    vmv.v.i v10, 0
; CHECK-NEXT:    vmv1r.v v0, v8
; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
; CHECK-NEXT:    vmv.v.v v9, v8
; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT:    vmsne.vi v8, v9, 0
; CHECK-NEXT:    vsm.v v8, (a0)
; CHECK-NEXT:    ret
  %v = load <8 x i1>, ptr %vp
  %sv = load <4 x i1>, ptr %svp
  %c = call <8 x i1> @llvm.vector.insert.v4i1.v8i1(<8 x i1> %v, <4 x i1> %sv, i64 0)
  store <8 x i1> %c, ptr %vp
  ret void
}

define void @insert_v8i1_v4i1_4(ptr %vp, ptr %svp) {
; CHECK-LABEL: insert_v8i1_v4i1_4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT:    vlm.v v0, (a0)
; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT:    vlm.v v8, (a1)
; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT:    vmv.v.i v9, 0
; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT:    vmv.v.i v10, 0
; CHECK-NEXT:    vmv1r.v v0, v8
; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT:    vslideup.vi v9, v8, 4
; CHECK-NEXT:    vmsne.vi v8, v9, 0
; CHECK-NEXT:    vsm.v v8, (a0)
; CHECK-NEXT:    ret
  %v = load <8 x i1>, ptr %vp
  %sv = load <4 x i1>, ptr %svp
  %c = call <8 x i1> @llvm.vector.insert.v4i1.v8i1(<8 x i1> %v, <4 x i1> %sv, i64 4)
  store <8 x i1> %c, ptr %vp
  ret void
}

define <vscale x 2 x i16> @insert_nxv2i16_v2i16_0(<vscale x 2 x i16> %v, ptr %svp) {
; CHECK-LABEL: insert_nxv2i16_v2i16_0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
; CHECK-NEXT:    vle16.v v9, (a0)
; CHECK-NEXT:    vsetivli zero, 2, e16, mf2, tu, ma
; CHECK-NEXT:    vmv.v.v v8, v9
; CHECK-NEXT:    ret
  %sv = load <2 x i16>, ptr %svp
  %c = call <vscale x 2 x i16> @llvm.vector.insert.v2i16.nxv2i16(<vscale x 2 x i16> %v, <2 x i16> %sv, i64 0)
  ret <vscale x 2 x i16> %c
}

define <vscale x 2 x i16> @insert_nxv2i16_v2i16_2(<vscale x 2 x i16> %v, ptr %svp) {
; CHECK-LABEL: insert_nxv2i16_v2i16_2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
; CHECK-NEXT:    vle16.v v9, (a0)
; CHECK-NEXT:    vsetivli zero, 6, e16, mf2, tu, ma
; CHECK-NEXT:    vslideup.vi v8, v9, 4
; CHECK-NEXT:    ret
  %sv = load <2 x i16>, ptr %svp
  %c = call <vscale x 2 x i16> @llvm.vector.insert.v2i16.nxv2i16(<vscale x 2 x i16> %v, <2 x i16> %sv, i64 4)
  ret <vscale x 2 x i16> %c
}

define <vscale x 2 x i1> @insert_nxv2i1_v4i1_0(<vscale x 2 x i1> %v, ptr %svp) {
; CHECK-LABEL: insert_nxv2i1_v4i1_0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT:    vlm.v v8, (a0)
; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
; CHECK-NEXT:    vmv.v.i v9, 0
; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT:    vmv.v.i v10, 0
; CHECK-NEXT:    vmv1r.v v0, v8
; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, tu, ma
; CHECK-NEXT:    vmv.v.v v9, v8
; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
; CHECK-NEXT:    vmsne.vi v0, v9, 0
; CHECK-NEXT:    ret
  %sv = load <4 x i1>, ptr %svp
  %c = call <vscale x 2 x i1> @llvm.vector.insert.v4i1.nxv2i1(<vscale x 2 x i1> %v, <4 x i1> %sv, i64 0)
  ret <vscale x 2 x i1> %c
}

define <vscale x 8 x i1> @insert_nxv8i1_v4i1_0(<vscale x 8 x i1> %v, ptr %svp) {
; CHECK-LABEL: insert_nxv8i1_v4i1_0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT:    vlm.v v8, (a0)
; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, tu, ma
; CHECK-NEXT:    vmv.v.v v0, v8
; CHECK-NEXT:    ret
  %sv = load <8 x i1>, ptr %svp
  %c = call <vscale x 8 x i1> @llvm.vector.insert.v8i1.nxv8i1(<vscale x 8 x i1> %v, <8 x i1> %sv, i64 0)
  ret <vscale x 8 x i1> %c
}

define <vscale x 8 x i1> @insert_nxv8i1_v8i1_16(<vscale x 8 x i1> %v, ptr %svp) {
; CHECK-LABEL: insert_nxv8i1_v8i1_16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT:    vlm.v v8, (a0)
; CHECK-NEXT:    vsetivli zero, 3, e8, mf8, tu, ma
; CHECK-NEXT:    vslideup.vi v0, v8, 2
; CHECK-NEXT:    ret
  %sv = load <8 x i1>, ptr %svp
  %c = call <vscale x 8 x i1> @llvm.vector.insert.v8i1.nxv8i1(<vscale x 8 x i1> %v, <8 x i1> %sv, i64 16)
  ret <vscale x 8 x i1> %c
}

declare <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64>, <2 x i64>, i64)

define void @insert_v2i64_nxv16i64(ptr %psv0, ptr %psv1, <vscale x 16 x i64>* %out) {
; CHECK-LABEL: insert_v2i64_nxv16i64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vle64.v v8, (a0)
; CHECK-NEXT:    vle64.v v16, (a1)
; CHECK-NEXT:    vsetivli zero, 6, e64, m8, tu, ma
; CHECK-NEXT:    vslideup.vi v8, v16, 4
; CHECK-NEXT:    vs8r.v v8, (a2)
; CHECK-NEXT:    ret
  %sv0 = load <2 x i64>, ptr %psv0
  %sv1 = load <2 x i64>, ptr %psv1
  %v0 = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv0, i64 0)
  %v = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> %v0, <2 x i64> %sv1, i64 4)
  store <vscale x 16 x i64> %v, <vscale x 16 x i64>* %out
  ret void
}

define void @insert_v2i64_nxv16i64_lo0(ptr %psv, <vscale x 16 x i64>* %out) {
; CHECK-LABEL: insert_v2i64_nxv16i64_lo0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vle64.v v8, (a0)
; CHECK-NEXT:    vs8r.v v8, (a1)
; CHECK-NEXT:    ret
  %sv = load <2 x i64>, ptr %psv
  %v = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv, i64 0)
  store <vscale x 16 x i64> %v, <vscale x 16 x i64>* %out
  ret void
}

define void @insert_v2i64_nxv16i64_lo2(ptr %psv, <vscale x 16 x i64>* %out) {
; CHECK-LABEL: insert_v2i64_nxv16i64_lo2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vle64.v v8, (a0)
; CHECK-NEXT:    vsetivli zero, 4, e64, m8, ta, ma
; CHECK-NEXT:    vslideup.vi v16, v8, 2
; CHECK-NEXT:    vs8r.v v16, (a1)
; CHECK-NEXT:    ret
  %sv = load <2 x i64>, ptr %psv
  %v = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv, i64 2)
  store <vscale x 16 x i64> %v, <vscale x 16 x i64>* %out
  ret void
}

; Check we don't mistakenly optimize this: we don't know whether this is
; inserted into the low or high split vector.
define void @insert_v2i64_nxv16i64_hi(ptr %psv, <vscale x 16 x i64>* %out) {
; RV32-LABEL: insert_v2i64_nxv16i64_hi:
; RV32:       # %bb.0:
; RV32-NEXT:    addi sp, sp, -80
; RV32-NEXT:    .cfi_def_cfa_offset 80
; RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
; RV32-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
; RV32-NEXT:    .cfi_offset ra, -4
; RV32-NEXT:    .cfi_offset s0, -8
; RV32-NEXT:    addi s0, sp, 80
; RV32-NEXT:    .cfi_def_cfa s0, 0
; RV32-NEXT:    csrr a2, vlenb
; RV32-NEXT:    slli a2, a2, 4
; RV32-NEXT:    sub sp, sp, a2
; RV32-NEXT:    andi sp, sp, -64
; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT:    vle64.v v8, (a0)
; RV32-NEXT:    addi a0, sp, 128
; RV32-NEXT:    vse64.v v8, (a0)
; RV32-NEXT:    csrr a0, vlenb
; RV32-NEXT:    slli a0, a0, 3
; RV32-NEXT:    addi a2, sp, 64
; RV32-NEXT:    add a3, a2, a0
; RV32-NEXT:    vl8re64.v v8, (a3)
; RV32-NEXT:    vl8re64.v v16, (a2)
; RV32-NEXT:    add a0, a1, a0
; RV32-NEXT:    vs8r.v v8, (a0)
; RV32-NEXT:    vs8r.v v16, (a1)
; RV32-NEXT:    addi sp, s0, -80
; RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
; RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
; RV32-NEXT:    addi sp, sp, 80
; RV32-NEXT:    ret
;
; RV64-LABEL: insert_v2i64_nxv16i64_hi:
; RV64:       # %bb.0:
; RV64-NEXT:    addi sp, sp, -80
; RV64-NEXT:    .cfi_def_cfa_offset 80
; RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
; RV64-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
; RV64-NEXT:    .cfi_offset ra, -8
; RV64-NEXT:    .cfi_offset s0, -16
; RV64-NEXT:    addi s0, sp, 80
; RV64-NEXT:    .cfi_def_cfa s0, 0
; RV64-NEXT:    csrr a2, vlenb
; RV64-NEXT:    slli a2, a2, 4
; RV64-NEXT:    sub sp, sp, a2
; RV64-NEXT:    andi sp, sp, -64
; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV64-NEXT:    vle64.v v8, (a0)
; RV64-NEXT:    addi a0, sp, 128
; RV64-NEXT:    vse64.v v8, (a0)
; RV64-NEXT:    csrr a0, vlenb
; RV64-NEXT:    slli a0, a0, 3
; RV64-NEXT:    addi a2, sp, 64
; RV64-NEXT:    add a3, a2, a0
; RV64-NEXT:    vl8re64.v v8, (a3)
; RV64-NEXT:    vl8re64.v v16, (a2)
; RV64-NEXT:    add a0, a1, a0
; RV64-NEXT:    vs8r.v v8, (a0)
; RV64-NEXT:    vs8r.v v16, (a1)
; RV64-NEXT:    addi sp, s0, -80
; RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
; RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
; RV64-NEXT:    addi sp, sp, 80
; RV64-NEXT:    ret
  %sv = load <2 x i64>, ptr %psv
  %v = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv, i64 8)
  store <vscale x 16 x i64> %v, <vscale x 16 x i64>* %out
  ret void
}

declare <8 x i1> @llvm.vector.insert.v4i1.v8i1(<8 x i1>, <4 x i1>, i64)
declare <32 x i1> @llvm.vector.insert.v8i1.v32i1(<32 x i1>, <8 x i1>, i64)

declare <4 x i16> @llvm.vector.insert.v2i16.v4i16(<4 x i16>, <2 x i16>, i64)

declare <4 x i32> @llvm.vector.insert.v2i32.v4i32(<4 x i32>, <2 x i32>, i64)
declare <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32>, <2 x i32>, i64)

declare <vscale x 2 x i1> @llvm.vector.insert.v4i1.nxv2i1(<vscale x 2 x i1>, <4 x i1>, i64)
declare <vscale x 8 x i1> @llvm.vector.insert.v8i1.nxv8i1(<vscale x 8 x i1>, <8 x i1>, i64)

declare <vscale x 2 x i16> @llvm.vector.insert.v2i16.nxv2i16(<vscale x 2 x i16>, <2 x i16>, i64)

declare <vscale x 8 x i32> @llvm.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32>, <2 x i32>, i64)
declare <vscale x 8 x i32> @llvm.vector.insert.v4i32.nxv8i32(<vscale x 8 x i32>, <4 x i32>, i64)
declare <vscale x 8 x i32> @llvm.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32>, <8 x i32>, i64)

; We emit insert_subvectors of fixed vectors at index 0 into undefs as a
; copy_to_regclass or insert_subreg, depending on the register classes of the
; vector types. Make sure that we use the correct type and not the shrunken
; LMUL=1 type, otherwise we will end up with an invalid extract_subvector when
; converting it from scalable->fixed, e.g. we get this for VLEN=128:
;
;   t14: nxv2i32 = insert_subvector undef:nxv2i32, t4, Constant:i64<0>
;     t15: v8i32 = extract_subvector t14, Constant:i64<0>
declare <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32>, i64)
define <4 x i32> @insert_extract_v8i32_v2i32_0(<2 x i32> %v) {
; CHECK-LABEL: insert_extract_v8i32_v2i32_0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret
  %1 = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> poison, <2 x i32> %v, i64 0)
  %2 = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> %1, i64 0)