; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfh,+zvfbfmin | FileCheck %s
; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfh,+zvfbfmin | FileCheck %s
; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin | FileCheck %s
; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin | FileCheck %s
; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvbb,+zvfh,+zvfbfmin | FileCheck %s --check-prefix=ZVBB
; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvbb,+zvfh,+zvfbfmin | FileCheck %s --check-prefix=ZVBB
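; Interleave two scalable vectors via @llvm.vector.interleave2. The plain V
; checks expect the vwaddu.vv/vwmaccu.vx (or vrgatherei16) lowering; the ZVBB
; checks expect the vwsll-based lowering where the element width allows it.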
define <vscale x 32 x i1> @vector_interleave_nxv32i1_nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b) {
; CHECK-LABEL: vector_interleave_nxv32i1_nxv16i1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
; CHECK-NEXT:    vmv1r.v v9, v0
; CHECK-NEXT:    vmv1r.v v0, v8
; CHECK-NEXT:    vmv.v.i v10, 0
; CHECK-NEXT:    li a0, -1
; CHECK-NEXT:    csrr a1, vlenb
; CHECK-NEXT:    vmerge.vim v12, v10, 1, v0
; CHECK-NEXT:    vmv1r.v v0, v9
; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
; CHECK-NEXT:    srli a1, a1, 2
; CHECK-NEXT:    vwaddu.vv v16, v8, v12
; CHECK-NEXT:    vwmaccu.vx v16, a0, v12
; CHECK-NEXT:    vmsne.vi v8, v18, 0
; CHECK-NEXT:    vmsne.vi v0, v16, 0
; CHECK-NEXT:    add a0, a1, a1
; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
; CHECK-NEXT:    vslideup.vx v0, v8, a1
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: vector_interleave_nxv32i1_nxv16i1:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetvli a0, zero, e8, m2, ta, mu
; ZVBB-NEXT:    vmv1r.v v9, v0
; ZVBB-NEXT:    vmv1r.v v0, v8
; ZVBB-NEXT:    vmv.v.i v10, 0
; ZVBB-NEXT:    csrr a1, vlenb
; ZVBB-NEXT:    vmerge.vim v10, v10, 1, v0
; ZVBB-NEXT:    srli a1, a1, 2
; ZVBB-NEXT:    vwsll.vi v12, v10, 8
; ZVBB-NEXT:    vmv1r.v v0, v9
; ZVBB-NEXT:    vwaddu.wx v12, v12, a0, v0.t
; ZVBB-NEXT:    vmsne.vi v8, v14, 0
; ZVBB-NEXT:    vmsne.vi v0, v12, 0
; ZVBB-NEXT:    add a0, a1, a1
; ZVBB-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
; ZVBB-NEXT:    vslideup.vx v0, v8, a1
; ZVBB-NEXT:    ret
  %res = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b)
  ret <vscale x 32 x i1> %res
}

define <vscale x 32 x i8> @vector_interleave_nxv32i8_nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: vector_interleave_nxv32i8_nxv16i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
; CHECK-NEXT:    vwaddu.vv v12, v8, v10
; CHECK-NEXT:    li a0, -1
; CHECK-NEXT:    vwmaccu.vx v12, a0, v10
; CHECK-NEXT:    vmv4r.v v8, v12
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: vector_interleave_nxv32i8_nxv16i8:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
; ZVBB-NEXT:    vwsll.vi v12, v10, 8
; ZVBB-NEXT:    vwaddu.wv v12, v12, v8
; ZVBB-NEXT:    vmv4r.v v8, v12
; ZVBB-NEXT:    ret
  %res = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
  ret <vscale x 32 x i8> %res
}

define <vscale x 16 x i16> @vector_interleave_nxv16i16_nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: vector_interleave_nxv16i16_nxv8i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
; CHECK-NEXT:    vwaddu.vv v12, v8, v10
; CHECK-NEXT:    li a0, -1
; CHECK-NEXT:    vwmaccu.vx v12, a0, v10
; CHECK-NEXT:    vmv4r.v v8, v12
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: vector_interleave_nxv16i16_nxv8i16:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
; ZVBB-NEXT:    vwsll.vi v12, v10, 16
; ZVBB-NEXT:    vwaddu.wv v12, v12, v8
; ZVBB-NEXT:    vmv4r.v v8, v12
; ZVBB-NEXT:    ret
  %res = call <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
  ret <vscale x 16 x i16> %res
}

define <vscale x 8 x i32> @vector_interleave_nxv8i32_nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: vector_interleave_nxv8i32_nxv4i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
; CHECK-NEXT:    vwaddu.vv v12, v8, v10
; CHECK-NEXT:    li a0, -1
; CHECK-NEXT:    vwmaccu.vx v12, a0, v10
; CHECK-NEXT:    vmv4r.v v8, v12
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: vector_interleave_nxv8i32_nxv4i32:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    li a0, 32
; ZVBB-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
; ZVBB-NEXT:    vwsll.vx v12, v10, a0
; ZVBB-NEXT:    vwaddu.wv v12, v12, v8
; ZVBB-NEXT:    vmv4r.v v8, v12
; ZVBB-NEXT:    ret
  %res = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
  ret <vscale x 8 x i32> %res
}

define <vscale x 4 x i64> @vector_interleave_nxv4i64_nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: vector_interleave_nxv4i64_nxv2i64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    csrr a0, vlenb
; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, mu
; CHECK-NEXT:    vid.v v12
; CHECK-NEXT:    srli a0, a0, 2
; CHECK-NEXT:    vand.vi v13, v12, 1
; CHECK-NEXT:    vmsne.vi v0, v13, 0
; CHECK-NEXT:    vsrl.vi v16, v12, 1
; CHECK-NEXT:    vadd.vx v16, v16, a0, v0.t
; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
; CHECK-NEXT:    vrgatherei16.vv v12, v8, v16
; CHECK-NEXT:    vmv.v.v v8, v12
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: vector_interleave_nxv4i64_nxv2i64:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    csrr a0, vlenb
; ZVBB-NEXT:    vsetvli a1, zero, e16, m1, ta, mu
; ZVBB-NEXT:    vid.v v12
; ZVBB-NEXT:    srli a0, a0, 2
; ZVBB-NEXT:    vand.vi v13, v12, 1
; ZVBB-NEXT:    vmsne.vi v0, v13, 0
; ZVBB-NEXT:    vsrl.vi v16, v12, 1
; ZVBB-NEXT:    vadd.vx v16, v16, a0, v0.t
; ZVBB-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
; ZVBB-NEXT:    vrgatherei16.vv v12, v8, v16
; ZVBB-NEXT:    vmv.v.v v8, v12
; ZVBB-NEXT:    ret
  %res = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
  ret <vscale x 4 x i64> %res
}

declare <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
declare <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
declare <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)

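; The same interleaves with operands that already fill an LMUL=8 register
; group: the result no longer fits in one group, so the checks below expect it
; to be produced in two halves (v8/v16, or a pair of mask registers for i1).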
define <vscale x 128 x i1> @vector_interleave_nxv128i1_nxv64i1(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b) {
; CHECK-LABEL: vector_interleave_nxv128i1_nxv64i1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
; CHECK-NEXT:    vmv1r.v v9, v0
; CHECK-NEXT:    vmv1r.v v0, v8
; CHECK-NEXT:    vmv.v.i v24, 0
; CHECK-NEXT:    li a0, -1
; CHECK-NEXT:    vmerge.vim v16, v24, 1, v0
; CHECK-NEXT:    vmv1r.v v0, v9
; CHECK-NEXT:    vmerge.vim v24, v24, 1, v0
; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
; CHECK-NEXT:    vwaddu.vv v8, v24, v16
; CHECK-NEXT:    vwaddu.vv v0, v28, v20
; CHECK-NEXT:    vwmaccu.vx v8, a0, v16
; CHECK-NEXT:    vwmaccu.vx v0, a0, v20
; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
; CHECK-NEXT:    vmsne.vi v16, v8, 0
; CHECK-NEXT:    vmsne.vi v8, v0, 0
; CHECK-NEXT:    vmv1r.v v0, v16
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: vector_interleave_nxv128i1_nxv64i1:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
; ZVBB-NEXT:    vmv.v.i v24, 0
; ZVBB-NEXT:    vmerge.vim v16, v24, 1, v0
; ZVBB-NEXT:    vmv1r.v v0, v8
; ZVBB-NEXT:    vmerge.vim v24, v24, 1, v0
; ZVBB-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
; ZVBB-NEXT:    vwsll.vi v8, v24, 8
; ZVBB-NEXT:    vwsll.vi v0, v28, 8
; ZVBB-NEXT:    vwaddu.wv v8, v8, v16
; ZVBB-NEXT:    vwaddu.wv v0, v0, v20
; ZVBB-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
; ZVBB-NEXT:    vmsne.vi v16, v8, 0
; ZVBB-NEXT:    vmsne.vi v8, v0, 0
; ZVBB-NEXT:    vmv1r.v v0, v16
; ZVBB-NEXT:    ret
  %res = call <vscale x 128 x i1> @llvm.vector.interleave2.nxv128i1(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b)
  ret <vscale x 128 x i1> %res
}

define <vscale x 128 x i8> @vector_interleave_nxv128i8_nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b) {
; CHECK-LABEL: vector_interleave_nxv128i8_nxv64i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
; CHECK-NEXT:    vmv8r.v v24, v8
; CHECK-NEXT:    vwaddu.vv v8, v24, v16
; CHECK-NEXT:    li a0, -1
; CHECK-NEXT:    vwaddu.vv v0, v28, v20
; CHECK-NEXT:    vwmaccu.vx v8, a0, v16
; CHECK-NEXT:    vwmaccu.vx v0, a0, v20
; CHECK-NEXT:    vmv8r.v v16, v0
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: vector_interleave_nxv128i8_nxv64i8:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
; ZVBB-NEXT:    vmv8r.v v24, v8
; ZVBB-NEXT:    vwsll.vi v8, v16, 8
; ZVBB-NEXT:    vwsll.vi v0, v20, 8
; ZVBB-NEXT:    vwaddu.wv v8, v8, v24
; ZVBB-NEXT:    vwaddu.wv v0, v0, v28
; ZVBB-NEXT:    vmv8r.v v16, v0
; ZVBB-NEXT:    ret
  %res = call <vscale x 128 x i8> @llvm.vector.interleave2.nxv128i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b)
  ret <vscale x 128 x i8> %res
}

define <vscale x 64 x i16> @vector_interleave_nxv64i16_nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b) {
; CHECK-LABEL: vector_interleave_nxv64i16_nxv32i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
; CHECK-NEXT:    vmv8r.v v24, v8
; CHECK-NEXT:    vwaddu.vv v8, v24, v16
; CHECK-NEXT:    li a0, -1
; CHECK-NEXT:    vwaddu.vv v0, v28, v20
; CHECK-NEXT:    vwmaccu.vx v8, a0, v16
; CHECK-NEXT:    vwmaccu.vx v0, a0, v20
; CHECK-NEXT:    vmv8r.v v16, v0
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: vector_interleave_nxv64i16_nxv32i16:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
; ZVBB-NEXT:    vmv8r.v v24, v8
; ZVBB-NEXT:    vwsll.vi v8, v16, 16
; ZVBB-NEXT:    vwsll.vi v0, v20, 16
; ZVBB-NEXT:    vwaddu.wv v8, v8, v24
; ZVBB-NEXT:    vwaddu.wv v0, v0, v28
; ZVBB-NEXT:    vmv8r.v v16, v0
; ZVBB-NEXT:    ret
  %res = call <vscale x 64 x i16> @llvm.vector.interleave2.nxv64i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b)
  ret <vscale x 64 x i16> %res
}

define <vscale x 32 x i32> @vector_interleave_nxv32i32_nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b) {
; CHECK-LABEL: vector_interleave_nxv32i32_nxv16i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
; CHECK-NEXT:    vmv8r.v v24, v8
; CHECK-NEXT:    vwaddu.vv v8, v24, v16
; CHECK-NEXT:    li a0, -1
; CHECK-NEXT:    vwaddu.vv v0, v28, v20
; CHECK-NEXT:    vwmaccu.vx v8, a0, v16
; CHECK-NEXT:    vwmaccu.vx v0, a0, v20
; CHECK-NEXT:    vmv8r.v v16, v0
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: vector_interleave_nxv32i32_nxv16i32:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
; ZVBB-NEXT:    vmv8r.v v24, v8
; ZVBB-NEXT:    li a0, 32
; ZVBB-NEXT:    vwsll.vx v8, v16, a0
; ZVBB-NEXT:    vwsll.vx v0, v20, a0
; ZVBB-NEXT:    vwaddu.wv v8, v8, v24
; ZVBB-NEXT:    vwaddu.wv v0, v0, v28
; ZVBB-NEXT:    vmv8r.v v16, v0
; ZVBB-NEXT:    ret
  %res = call <vscale x 32 x i32> @llvm.vector.interleave2.nxv32i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b)
  ret <vscale x 32 x i32> %res
}

define <vscale x 16 x i64> @vector_interleave_nxv16i64_nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b) {
; CHECK-LABEL: vector_interleave_nxv16i64_nxv8i64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    csrr a0, vlenb
; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, mu
; CHECK-NEXT:    vid.v v6
; CHECK-NEXT:    vmv8r.v v24, v8
; CHECK-NEXT:    srli a0, a0, 1
; CHECK-NEXT:    vmv4r.v v28, v16
; CHECK-NEXT:    vmv4r.v v16, v12
; CHECK-NEXT:    vand.vi v8, v6, 1
; CHECK-NEXT:    vmsne.vi v0, v8, 0
; CHECK-NEXT:    vsrl.vi v6, v6, 1
; CHECK-NEXT:    vadd.vx v6, v6, a0, v0.t
; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
; CHECK-NEXT:    vrgatherei16.vv v8, v24, v6
; CHECK-NEXT:    vrgatherei16.vv v24, v16, v6
; CHECK-NEXT:    vmv.v.v v16, v24
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: vector_interleave_nxv16i64_nxv8i64:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    csrr a0, vlenb
; ZVBB-NEXT:    vsetvli a1, zero, e16, m2, ta, mu
; ZVBB-NEXT:    vid.v v6
; ZVBB-NEXT:    vmv8r.v v24, v8
; ZVBB-NEXT:    srli a0, a0, 1
; ZVBB-NEXT:    vmv4r.v v28, v16
; ZVBB-NEXT:    vmv4r.v v16, v12
; ZVBB-NEXT:    vand.vi v8, v6, 1
; ZVBB-NEXT:    vmsne.vi v0, v8, 0
; ZVBB-NEXT:    vsrl.vi v6, v6, 1
; ZVBB-NEXT:    vadd.vx v6, v6, a0, v0.t
; ZVBB-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
; ZVBB-NEXT:    vrgatherei16.vv v8, v24, v6
; ZVBB-NEXT:    vrgatherei16.vv v24, v16, v6
; ZVBB-NEXT:    vmv.v.v v16, v24
; ZVBB-NEXT:    ret
  %res = call <vscale x 16 x i64> @llvm.vector.interleave2.nxv16i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b)
  ret <vscale x 16 x i64> %res
}

declare <vscale x 128 x i1> @llvm.vector.interleave2.nxv128i1(<vscale x 64 x i1>, <vscale x 64 x i1>)
declare <vscale x 128 x i8> @llvm.vector.interleave2.nxv128i8(<vscale x 64 x i8>, <vscale x 64 x i8>)
declare <vscale x 64 x i16> @llvm.vector.interleave2.nxv64i16(<vscale x 32 x i16>, <vscale x 32 x i16>)
declare <vscale x 32 x i32> @llvm.vector.interleave2.nxv32i32(<vscale x 16 x i32>, <vscale x 16 x i32>)
declare <vscale x 16 x i64> @llvm.vector.interleave2.nxv16i64(<vscale x 8 x i64>, <vscale x 8 x i64>)

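; Floating-point interleaves (bfloat, half, float, double).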
define <vscale x 4 x bfloat> @vector_interleave_nxv4bf16_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) {
; CHECK-LABEL: vector_interleave_nxv4bf16_nxv2bf16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
; CHECK-NEXT:    vwaddu.vv v10, v8, v9
; CHECK-NEXT:    li a0, -1
; CHECK-NEXT:    csrr a1, vlenb
; CHECK-NEXT:    vwmaccu.vx v10, a0, v9
; CHECK-NEXT:    srli a1, a1, 2
; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
; CHECK-NEXT:    vslidedown.vx v8, v10, a1
; CHECK-NEXT:    add a0, a1, a1
; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
; CHECK-NEXT:    vslideup.vx v10, v8, a1
; CHECK-NEXT:    vmv.v.v v8, v10
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: vector_interleave_nxv4bf16_nxv2bf16:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
; ZVBB-NEXT:    vwsll.vi v10, v9, 16
; ZVBB-NEXT:    csrr a0, vlenb
; ZVBB-NEXT:    vwaddu.wv v10, v10, v8
; ZVBB-NEXT:    srli a0, a0, 2
; ZVBB-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
; ZVBB-NEXT:    vslidedown.vx v8, v10, a0
; ZVBB-NEXT:    add a1, a0, a0
; ZVBB-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
; ZVBB-NEXT:    vslideup.vx v10, v8, a0
; ZVBB-NEXT:    vmv.v.v v8, v10
; ZVBB-NEXT:    ret
  %res = call <vscale x 4 x bfloat> @llvm.vector.interleave2.nxv4bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b)
  ret <vscale x 4 x bfloat> %res
}

define <vscale x 8 x bfloat> @vector_interleave_nxv8bf16_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) {
; CHECK-LABEL: vector_interleave_nxv8bf16_nxv4bf16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
; CHECK-NEXT:    vwaddu.vv v10, v8, v9
; CHECK-NEXT:    li a0, -1
; CHECK-NEXT:    vwmaccu.vx v10, a0, v9
; CHECK-NEXT:    vmv2r.v v8, v10
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: vector_interleave_nxv8bf16_nxv4bf16:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
; ZVBB-NEXT:    vwsll.vi v10, v9, 16
; ZVBB-NEXT:    vwaddu.wv v10, v10, v8
; ZVBB-NEXT:    vmv2r.v v8, v10
; ZVBB-NEXT:    ret
  %res = call <vscale x 8 x bfloat> @llvm.vector.interleave2.nxv8bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b)
  ret <vscale x 8 x bfloat> %res
}

define <vscale x 4 x half> @vector_interleave_nxv4f16_nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b) {
; CHECK-LABEL: vector_interleave_nxv4f16_nxv2f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
; CHECK-NEXT:    vwaddu.vv v10, v8, v9
; CHECK-NEXT:    li a0, -1
; CHECK-NEXT:    csrr a1, vlenb
; CHECK-NEXT:    vwmaccu.vx v10, a0, v9
; CHECK-NEXT:    srli a1, a1, 2
; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
; CHECK-NEXT:    vslidedown.vx v8, v10, a1
; CHECK-NEXT:    add a0, a1, a1
; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
; CHECK-NEXT:    vslideup.vx v10, v8, a1
; CHECK-NEXT:    vmv.v.v v8, v10
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: vector_interleave_nxv4f16_nxv2f16:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
; ZVBB-NEXT:    vwsll.vi v10, v9, 16
; ZVBB-NEXT:    csrr a0, vlenb
; ZVBB-NEXT:    vwaddu.wv v10, v10, v8
; ZVBB-NEXT:    srli a0, a0, 2
; ZVBB-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
; ZVBB-NEXT:    vslidedown.vx v8, v10, a0
; ZVBB-NEXT:    add a1, a0, a0
; ZVBB-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
; ZVBB-NEXT:    vslideup.vx v10, v8, a0
; ZVBB-NEXT:    vmv.v.v v8, v10
; ZVBB-NEXT:    ret
  %res = call <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b)
  ret <vscale x 4 x half> %res
}

define <vscale x 8 x half> @vector_interleave_nxv8f16_nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b) {
; CHECK-LABEL: vector_interleave_nxv8f16_nxv4f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
; CHECK-NEXT:    vwaddu.vv v10, v8, v9
; CHECK-NEXT:    li a0, -1
; CHECK-NEXT:    vwmaccu.vx v10, a0, v9
; CHECK-NEXT:    vmv2r.v v8, v10
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: vector_interleave_nxv8f16_nxv4f16:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
; ZVBB-NEXT:    vwsll.vi v10, v9, 16
; ZVBB-NEXT:    vwaddu.wv v10, v10, v8
; ZVBB-NEXT:    vmv2r.v v8, v10
; ZVBB-NEXT:    ret
  %res = call <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b)
  ret <vscale x 8 x half> %res
}

define <vscale x 4 x float> @vector_interleave_nxv4f32_nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b) {
; CHECK-LABEL: vector_interleave_nxv4f32_nxv2f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
; CHECK-NEXT:    vwaddu.vv v10, v8, v9
; CHECK-NEXT:    li a0, -1
; CHECK-NEXT:    vwmaccu.vx v10, a0, v9
; CHECK-NEXT:    vmv2r.v v8, v10
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: vector_interleave_nxv4f32_nxv2f32:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    li a0, 32
; ZVBB-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
; ZVBB-NEXT:    vwsll.vx v10, v9, a0
; ZVBB-NEXT:    vwaddu.wv v10, v10, v8
; ZVBB-NEXT:    vmv2r.v v8, v10
; ZVBB-NEXT:    ret
  %res = call <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b)
  ret <vscale x 4 x float> %res
}

define <vscale x 16 x bfloat> @vector_interleave_nxv16bf16_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
; CHECK-LABEL: vector_interleave_nxv16bf16_nxv8bf16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
; CHECK-NEXT:    vwaddu.vv v12, v8, v10
; CHECK-NEXT:    li a0, -1
; CHECK-NEXT:    vwmaccu.vx v12, a0, v10
; CHECK-NEXT:    vmv4r.v v8, v12
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: vector_interleave_nxv16bf16_nxv8bf16:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
; ZVBB-NEXT:    vwsll.vi v12, v10, 16
; ZVBB-NEXT:    vwaddu.wv v12, v12, v8
; ZVBB-NEXT:    vmv4r.v v8, v12
; ZVBB-NEXT:    ret
  %res = call <vscale x 16 x bfloat> @llvm.vector.interleave2.nxv16bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
  ret <vscale x 16 x bfloat> %res
}

define <vscale x 16 x half> @vector_interleave_nxv16f16_nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b) {
; CHECK-LABEL: vector_interleave_nxv16f16_nxv8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
; CHECK-NEXT:    vwaddu.vv v12, v8, v10
; CHECK-NEXT:    li a0, -1
; CHECK-NEXT:    vwmaccu.vx v12, a0, v10
; CHECK-NEXT:    vmv4r.v v8, v12
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: vector_interleave_nxv16f16_nxv8f16:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
; ZVBB-NEXT:    vwsll.vi v12, v10, 16
; ZVBB-NEXT:    vwaddu.wv v12, v12, v8
; ZVBB-NEXT:    vmv4r.v v8, v12
; ZVBB-NEXT:    ret
  %res = call <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b)
  ret <vscale x 16 x half> %res
}

define <vscale x 8 x float> @vector_interleave_nxv8f32_nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
; CHECK-LABEL: vector_interleave_nxv8f32_nxv4f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
; CHECK-NEXT:    vwaddu.vv v12, v8, v10
; CHECK-NEXT:    li a0, -1
; CHECK-NEXT:    vwmaccu.vx v12, a0, v10
; CHECK-NEXT:    vmv4r.v v8, v12
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: vector_interleave_nxv8f32_nxv4f32:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    li a0, 32
; ZVBB-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
; ZVBB-NEXT:    vwsll.vx v12, v10, a0
; ZVBB-NEXT:    vwaddu.wv v12, v12, v8
; ZVBB-NEXT:    vmv4r.v v8, v12
; ZVBB-NEXT:    ret
  %res = call <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b)
  ret <vscale x 8 x float> %res
}

define <vscale x 4 x double> @vector_interleave_nxv4f64_nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b) {
; CHECK-LABEL: vector_interleave_nxv4f64_nxv2f64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    csrr a0, vlenb
; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, mu
; CHECK-NEXT:    vid.v v12
; CHECK-NEXT:    srli a0, a0, 2
; CHECK-NEXT:    vand.vi v13, v12, 1
; CHECK-NEXT:    vmsne.vi v0, v13, 0
; CHECK-NEXT:    vsrl.vi v16, v12, 1
; CHECK-NEXT:    vadd.vx v16, v16, a0, v0.t
; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
; CHECK-NEXT:    vrgatherei16.vv v12, v8, v16
; CHECK-NEXT:    vmv.v.v v8, v12
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: vector_interleave_nxv4f64_nxv2f64:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    csrr a0, vlenb
; ZVBB-NEXT:    vsetvli a1, zero, e16, m1, ta, mu
; ZVBB-NEXT:    vid.v v12
; ZVBB-NEXT:    srli a0, a0, 2
; ZVBB-NEXT:    vand.vi v13, v12, 1
; ZVBB-NEXT:    vmsne.vi v0, v13, 0
; ZVBB-NEXT:    vsrl.vi v16, v12, 1
; ZVBB-NEXT:    vadd.vx v16, v16, a0, v0.t
; ZVBB-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
; ZVBB-NEXT:    vrgatherei16.vv v12, v8, v16
; ZVBB-NEXT:    vmv.v.v v8, v12
; ZVBB-NEXT:    ret
  %res = call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b)
  ret <vscale x 4 x double> %res
}

declare <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)
declare <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)
declare <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
declare <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
declare <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
declare <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)

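; Floating-point interleaves whose results span two LMUL=8 register groups.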
define <vscale x 64 x bfloat> @vector_interleave_nxv64bf16_nxv32bf16(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b) {
; CHECK-LABEL: vector_interleave_nxv64bf16_nxv32bf16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
; CHECK-NEXT:    vmv8r.v v24, v8
; CHECK-NEXT:    vwaddu.vv v8, v24, v16
; CHECK-NEXT:    li a0, -1
; CHECK-NEXT:    vwaddu.vv v0, v28, v20
; CHECK-NEXT:    vwmaccu.vx v8, a0, v16
; CHECK-NEXT:    vwmaccu.vx v0, a0, v20
; CHECK-NEXT:    vmv8r.v v16, v0
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: vector_interleave_nxv64bf16_nxv32bf16:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
; ZVBB-NEXT:    vmv8r.v v24, v8
; ZVBB-NEXT:    vwsll.vi v8, v16, 16
; ZVBB-NEXT:    vwsll.vi v0, v20, 16
; ZVBB-NEXT:    vwaddu.wv v8, v8, v24
; ZVBB-NEXT:    vwaddu.wv v0, v0, v28
; ZVBB-NEXT:    vmv8r.v v16, v0
; ZVBB-NEXT:    ret
  %res = call <vscale x 64 x bfloat> @llvm.vector.interleave2.nxv64bf16(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b)
  ret <vscale x 64 x bfloat> %res
}

define <vscale x 64 x half> @vector_interleave_nxv64f16_nxv32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b) {
; CHECK-LABEL: vector_interleave_nxv64f16_nxv32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
; CHECK-NEXT:    vmv8r.v v24, v8
; CHECK-NEXT:    vwaddu.vv v8, v24, v16
; CHECK-NEXT:    li a0, -1
; CHECK-NEXT:    vwaddu.vv v0, v28, v20
; CHECK-NEXT:    vwmaccu.vx v8, a0, v16
; CHECK-NEXT:    vwmaccu.vx v0, a0, v20
; CHECK-NEXT:    vmv8r.v v16, v0
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: vector_interleave_nxv64f16_nxv32f16:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
; ZVBB-NEXT:    vmv8r.v v24, v8
; ZVBB-NEXT:    vwsll.vi v8, v16, 16
; ZVBB-NEXT:    vwsll.vi v0, v20, 16
; ZVBB-NEXT:    vwaddu.wv v8, v8, v24
; ZVBB-NEXT:    vwaddu.wv v0, v0, v28
; ZVBB-NEXT:    vmv8r.v v16, v0
; ZVBB-NEXT:    ret
  %res = call <vscale x 64 x half> @llvm.vector.interleave2.nxv64f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b)
  ret <vscale x 64 x half> %res
}

define <vscale x 32 x float> @vector_interleave_nxv32f32_nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b) {
; CHECK-LABEL: vector_interleave_nxv32f32_nxv16f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
; CHECK-NEXT:    vmv8r.v v24, v8
; CHECK-NEXT:    vwaddu.vv v8, v24, v16
; CHECK-NEXT:    li a0, -1
; CHECK-NEXT:    vwaddu.vv v0, v28, v20
; CHECK-NEXT:    vwmaccu.vx v8, a0, v16
; CHECK-NEXT:    vwmaccu.vx v0, a0, v20
; CHECK-NEXT:    vmv8r.v v16, v0
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: vector_interleave_nxv32f32_nxv16f32:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
; ZVBB-NEXT:    vmv8r.v v24, v8
; ZVBB-NEXT:    li a0, 32
; ZVBB-NEXT:    vwsll.vx v8, v16, a0
; ZVBB-NEXT:    vwsll.vx v0, v20, a0
; ZVBB-NEXT:    vwaddu.wv v8, v8, v24
; ZVBB-NEXT:    vwaddu.wv v0, v0, v28
; ZVBB-NEXT:    vmv8r.v v16, v0
; ZVBB-NEXT:    ret
  %res = call <vscale x 32 x float> @llvm.vector.interleave2.nxv32f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b)
  ret <vscale x 32 x float> %res
}

define <vscale x 16 x double> @vector_interleave_nxv16f64_nxv8f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b) {
; CHECK-LABEL: vector_interleave_nxv16f64_nxv8f64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    csrr a0, vlenb
; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, mu
; CHECK-NEXT:    vid.v v6
; CHECK-NEXT:    vmv8r.v v24, v8
; CHECK-NEXT:    srli a0, a0, 1
; CHECK-NEXT:    vmv4r.v v28, v16
; CHECK-NEXT:    vmv4r.v v16, v12
; CHECK-NEXT:    vand.vi v8, v6, 1
; CHECK-NEXT:    vmsne.vi v0, v8, 0
; CHECK-NEXT:    vsrl.vi v6, v6, 1
; CHECK-NEXT:    vadd.vx v6, v6, a0, v0.t
; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
; CHECK-NEXT:    vrgatherei16.vv v8, v24, v6
; CHECK-NEXT:    vrgatherei16.vv v24, v16, v6
; CHECK-NEXT:    vmv.v.v v16, v24
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: vector_interleave_nxv16f64_nxv8f64:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    csrr a0, vlenb
; ZVBB-NEXT:    vsetvli a1, zero, e16, m2, ta, mu
; ZVBB-NEXT:    vid.v v6
; ZVBB-NEXT:    vmv8r.v v24, v8
; ZVBB-NEXT:    srli a0, a0, 1
; ZVBB-NEXT:    vmv4r.v v28, v16
; ZVBB-NEXT:    vmv4r.v v16, v12
; ZVBB-NEXT:    vand.vi v8, v6, 1
; ZVBB-NEXT:    vmsne.vi v0, v8, 0
; ZVBB-NEXT:    vsrl.vi v6, v6, 1
; ZVBB-NEXT:    vadd.vx v6, v6, a0, v0.t
; ZVBB-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
; ZVBB-NEXT:    vrgatherei16.vv v8, v24, v6
; ZVBB-NEXT:    vrgatherei16.vv v24, v16, v6
; ZVBB-NEXT:    vmv.v.v v16, v24
; ZVBB-NEXT:    ret
  %res = call <vscale x 16 x double> @llvm.vector.interleave2.nxv16f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b)
  ret <vscale x 16 x double> %res
}

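; Interleaves where one operand is poison: the result is formed with a zero
; extend, plus a shift by the element width when the non-poison operand lands
; in the odd elements.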
define <vscale x 8 x i32> @vector_interleave_nxv8i32_nxv4i32_poison(<vscale x 4 x i32> %a) {
; CHECK-LABEL: vector_interleave_nxv8i32_nxv4i32_poison:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
; CHECK-NEXT:    vzext.vf2 v12, v8
; CHECK-NEXT:    vmv.v.v v8, v12
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: vector_interleave_nxv8i32_nxv4i32_poison:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
; ZVBB-NEXT:    vzext.vf2 v12, v8
; ZVBB-NEXT:    vmv.v.v v8, v12
; ZVBB-NEXT:    ret
  %res = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> poison)
  ret <vscale x 8 x i32> %res
}

define <vscale x 8 x i32> @vector_interleave_nxv8i32_nxv4i32_poison2(<vscale x 4 x i32> %a) {
; CHECK-LABEL: vector_interleave_nxv8i32_nxv4i32_poison2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
; CHECK-NEXT:    vzext.vf2 v12, v8
; CHECK-NEXT:    li a0, 32
; CHECK-NEXT:    vsll.vx v8, v12, a0
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: vector_interleave_nxv8i32_nxv4i32_poison2:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    li a0, 32
; ZVBB-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
; ZVBB-NEXT:    vwsll.vx v12, v8, a0
; ZVBB-NEXT:    vmv4r.v v8, v12
; ZVBB-NEXT:    ret
  %res = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a)
  ret <vscale x 8 x i32> %res
}

declare <vscale x 64 x half> @llvm.vector.interleave2.nxv64f16(<vscale x 32 x half>, <vscale x 32 x half>)
declare <vscale x 32 x float> @llvm.vector.interleave2.nxv32f32(<vscale x 16 x float>, <vscale x 16 x float>)
declare <vscale x 16 x double> @llvm.vector.interleave2.nxv16f64(<vscale x 8 x double>, <vscale x 8 x double>)