1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=riscv32 -mattr=+v,+m,+zvl128b -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V128,RV32-V128
3 ; RUN: llc -mtriple=riscv64 -mattr=+v,+m,+zvl128b -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V128,RV64-V128
4 ; RUN: llc -mtriple=riscv32 -mattr=+v,+m,+zvl512b -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V512,RV32-V512
5 ; RUN: llc -mtriple=riscv64 -mattr=+v,+m,+zvl512b -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V512,RV64-V512
7 ; Test optimizing interleaves to widening arithmetic.
9 define <4 x i8> @interleave_v2i8(<2 x i8> %x, <2 x i8> %y) {
10 ; CHECK-LABEL: interleave_v2i8:
12 ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
13 ; CHECK-NEXT: vwaddu.vv v10, v8, v9
14 ; CHECK-NEXT: li a0, -1
15 ; CHECK-NEXT: vwmaccu.vx v10, a0, v9
16 ; CHECK-NEXT: vmv1r.v v8, v10
18 %a = shufflevector <2 x i8> %x, <2 x i8> %y, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
22 define <4 x i16> @interleave_v2i16(<2 x i16> %x, <2 x i16> %y) {
23 ; CHECK-LABEL: interleave_v2i16:
25 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
26 ; CHECK-NEXT: vwaddu.vv v10, v8, v9
27 ; CHECK-NEXT: li a0, -1
28 ; CHECK-NEXT: vwmaccu.vx v10, a0, v9
29 ; CHECK-NEXT: vmv1r.v v8, v10
31 %a = shufflevector <2 x i16> %x, <2 x i16> %y, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
35 ; Vector order switched for coverage.
36 define <4 x i32> @interleave_v2i32(<2 x i32> %x, <2 x i32> %y) {
37 ; CHECK-LABEL: interleave_v2i32:
39 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
40 ; CHECK-NEXT: vwaddu.vv v10, v9, v8
41 ; CHECK-NEXT: li a0, -1
42 ; CHECK-NEXT: vwmaccu.vx v10, a0, v8
43 ; CHECK-NEXT: vmv1r.v v8, v10
45 %a = shufflevector <2 x i32> %x, <2 x i32> %y, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
49 ; One vXi64 test case to verify that we don't optimize it.
50 ; FIXME: Is there better codegen we can do here?
51 define <4 x i64> @interleave_v2i64(<2 x i64> %x, <2 x i64> %y) {
52 ; V128-LABEL: interleave_v2i64:
54 ; V128-NEXT: vmv1r.v v12, v9
55 ; V128-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
57 ; V128-NEXT: vsrl.vi v14, v9, 1
58 ; V128-NEXT: vsetvli zero, zero, e64, m2, ta, ma
59 ; V128-NEXT: vrgatherei16.vv v10, v8, v14
60 ; V128-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
61 ; V128-NEXT: vmv.v.i v0, 10
62 ; V128-NEXT: vsetivli zero, 4, e64, m2, ta, mu
63 ; V128-NEXT: vrgatherei16.vv v10, v12, v14, v0.t
64 ; V128-NEXT: vmv.v.v v8, v10
67 ; RV32-V512-LABEL: interleave_v2i64:
69 ; RV32-V512-NEXT: vsetivli zero, 4, e16, mf4, ta, ma
70 ; RV32-V512-NEXT: vid.v v10
71 ; RV32-V512-NEXT: vsrl.vi v11, v10, 1
72 ; RV32-V512-NEXT: vsetvli zero, zero, e64, m1, ta, mu
73 ; RV32-V512-NEXT: vmv.v.i v0, 10
74 ; RV32-V512-NEXT: vrgatherei16.vv v10, v8, v11
75 ; RV32-V512-NEXT: vrgatherei16.vv v10, v9, v11, v0.t
76 ; RV32-V512-NEXT: vmv.v.v v8, v10
79 ; RV64-V512-LABEL: interleave_v2i64:
81 ; RV64-V512-NEXT: vsetivli zero, 4, e64, m1, ta, mu
82 ; RV64-V512-NEXT: vid.v v10
83 ; RV64-V512-NEXT: vsrl.vi v11, v10, 1
84 ; RV64-V512-NEXT: vmv.v.i v0, 10
85 ; RV64-V512-NEXT: vrgather.vv v10, v8, v11
86 ; RV64-V512-NEXT: vrgather.vv v10, v9, v11, v0.t
87 ; RV64-V512-NEXT: vmv.v.v v8, v10
89 %a = shufflevector <2 x i64> %x, <2 x i64> %y, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
93 ; Vector order switched for coverage.
94 define <8 x i8> @interleave_v4i8(<4 x i8> %x, <4 x i8> %y) {
95 ; V128-LABEL: interleave_v4i8:
97 ; V128-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
98 ; V128-NEXT: vwaddu.vv v10, v9, v8
99 ; V128-NEXT: li a0, -1
100 ; V128-NEXT: vwmaccu.vx v10, a0, v8
101 ; V128-NEXT: vmv1r.v v8, v10
104 ; V512-LABEL: interleave_v4i8:
106 ; V512-NEXT: vsetivli zero, 4, e8, mf8, ta, ma
107 ; V512-NEXT: vwaddu.vv v10, v9, v8
108 ; V512-NEXT: li a0, -1
109 ; V512-NEXT: vwmaccu.vx v10, a0, v8
110 ; V512-NEXT: vmv1r.v v8, v10
112 %a = shufflevector <4 x i8> %x, <4 x i8> %y, <8 x i32> <i32 4, i32 0, i32 5, i32 1, i32 6, i32 2, i32 7, i32 3>
116 ; Undef elements for coverage
117 define <8 x i16> @interleave_v4i16(<4 x i16> %x, <4 x i16> %y) {
118 ; V128-LABEL: interleave_v4i16:
120 ; V128-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
121 ; V128-NEXT: vwaddu.vv v10, v8, v9
122 ; V128-NEXT: li a0, -1
123 ; V128-NEXT: vwmaccu.vx v10, a0, v9
124 ; V128-NEXT: vmv1r.v v8, v10
127 ; V512-LABEL: interleave_v4i16:
129 ; V512-NEXT: vsetivli zero, 4, e16, mf4, ta, ma
130 ; V512-NEXT: vwaddu.vv v10, v8, v9
131 ; V512-NEXT: li a0, -1
132 ; V512-NEXT: vwmaccu.vx v10, a0, v9
133 ; V512-NEXT: vmv1r.v v8, v10
135 %a = shufflevector <4 x i16> %x, <4 x i16> %y, <8 x i32> <i32 0, i32 4, i32 undef, i32 5, i32 2, i32 undef, i32 3, i32 7>
139 define <8 x i32> @interleave_v4i32(<4 x i32> %x, <4 x i32> %y) {
140 ; V128-LABEL: interleave_v4i32:
142 ; V128-NEXT: vsetivli zero, 4, e32, m1, ta, ma
143 ; V128-NEXT: vwaddu.vv v10, v8, v9
144 ; V128-NEXT: li a0, -1
145 ; V128-NEXT: vwmaccu.vx v10, a0, v9
146 ; V128-NEXT: vmv2r.v v8, v10
149 ; V512-LABEL: interleave_v4i32:
151 ; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, ma
152 ; V512-NEXT: vwaddu.vv v10, v8, v9
153 ; V512-NEXT: li a0, -1
154 ; V512-NEXT: vwmaccu.vx v10, a0, v9
155 ; V512-NEXT: vmv1r.v v8, v10
157 %a = shufflevector <4 x i32> %x, <4 x i32> %y, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
161 ; %y should be slid down by 2
162 define <4 x i32> @interleave_v4i32_offset_2(<4 x i32> %x, <4 x i32> %y) {
163 ; V128-LABEL: interleave_v4i32_offset_2:
165 ; V128-NEXT: vsetivli zero, 2, e32, m1, ta, ma
166 ; V128-NEXT: vslidedown.vi v10, v9, 2
167 ; V128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
168 ; V128-NEXT: vwaddu.vv v9, v8, v10
169 ; V128-NEXT: li a0, -1
170 ; V128-NEXT: vwmaccu.vx v9, a0, v10
171 ; V128-NEXT: vmv1r.v v8, v9
174 ; V512-LABEL: interleave_v4i32_offset_2:
176 ; V512-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
177 ; V512-NEXT: vslidedown.vi v10, v9, 2
178 ; V512-NEXT: vwaddu.vv v9, v8, v10
179 ; V512-NEXT: li a0, -1
180 ; V512-NEXT: vwmaccu.vx v9, a0, v10
181 ; V512-NEXT: vmv1r.v v8, v9
183 %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 6, i32 1, i32 7>
187 ; %y should be slid down by 1
188 define <4 x i32> @interleave_v4i32_offset_1(<4 x i32> %x, <4 x i32> %y) {
189 ; V128-LABEL: interleave_v4i32_offset_1:
191 ; V128-NEXT: vsetivli zero, 4, e32, m1, ta, mu
192 ; V128-NEXT: vid.v v10
193 ; V128-NEXT: vsrl.vi v11, v10, 1
194 ; V128-NEXT: vrgather.vv v10, v8, v11
195 ; V128-NEXT: vmv.v.i v0, 10
196 ; V128-NEXT: vadd.vi v8, v11, 1
197 ; V128-NEXT: vrgather.vv v10, v9, v8, v0.t
198 ; V128-NEXT: vmv.v.v v8, v10
201 ; V512-LABEL: interleave_v4i32_offset_1:
203 ; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, mu
204 ; V512-NEXT: vid.v v10
205 ; V512-NEXT: vsrl.vi v11, v10, 1
206 ; V512-NEXT: vrgather.vv v10, v8, v11
207 ; V512-NEXT: vmv.v.i v0, 10
208 ; V512-NEXT: vadd.vi v8, v11, 1
209 ; V512-NEXT: vrgather.vv v10, v9, v8, v0.t
210 ; V512-NEXT: vmv1r.v v8, v10
212 %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 5, i32 1, i32 6>
216 define <16 x i8> @interleave_v8i8(<8 x i8> %x, <8 x i8> %y) {
217 ; V128-LABEL: interleave_v8i8:
219 ; V128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
220 ; V128-NEXT: vwaddu.vv v10, v8, v9
221 ; V128-NEXT: li a0, -1
222 ; V128-NEXT: vwmaccu.vx v10, a0, v9
223 ; V128-NEXT: vmv1r.v v8, v10
226 ; V512-LABEL: interleave_v8i8:
228 ; V512-NEXT: vsetivli zero, 8, e8, mf8, ta, ma
229 ; V512-NEXT: vwaddu.vv v10, v8, v9
230 ; V512-NEXT: li a0, -1
231 ; V512-NEXT: vwmaccu.vx v10, a0, v9
232 ; V512-NEXT: vmv1r.v v8, v10
234 %a = shufflevector <8 x i8> %x, <8 x i8> %y, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
238 ; Vector order switched for coverage.
239 define <16 x i16> @interleave_v8i16(<8 x i16> %x, <8 x i16> %y) {
240 ; V128-LABEL: interleave_v8i16:
242 ; V128-NEXT: vsetivli zero, 8, e16, m1, ta, ma
243 ; V128-NEXT: vwaddu.vv v10, v9, v8
244 ; V128-NEXT: li a0, -1
245 ; V128-NEXT: vwmaccu.vx v10, a0, v8
246 ; V128-NEXT: vmv2r.v v8, v10
249 ; V512-LABEL: interleave_v8i16:
251 ; V512-NEXT: vsetivli zero, 8, e16, mf4, ta, ma
252 ; V512-NEXT: vwaddu.vv v10, v9, v8
253 ; V512-NEXT: li a0, -1
254 ; V512-NEXT: vwmaccu.vx v10, a0, v8
255 ; V512-NEXT: vmv1r.v v8, v10
257 %a = shufflevector <8 x i16> %x, <8 x i16> %y, <16 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3, i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>
261 define <16 x i32> @interleave_v8i32(<8 x i32> %x, <8 x i32> %y) {
262 ; V128-LABEL: interleave_v8i32:
264 ; V128-NEXT: vsetivli zero, 8, e32, m2, ta, ma
265 ; V128-NEXT: vwaddu.vv v12, v8, v10
266 ; V128-NEXT: li a0, -1
267 ; V128-NEXT: vwmaccu.vx v12, a0, v10
268 ; V128-NEXT: vmv4r.v v8, v12
271 ; V512-LABEL: interleave_v8i32:
273 ; V512-NEXT: vsetivli zero, 8, e32, mf2, ta, ma
274 ; V512-NEXT: vwaddu.vv v10, v8, v9
275 ; V512-NEXT: li a0, -1
276 ; V512-NEXT: vwmaccu.vx v10, a0, v9
277 ; V512-NEXT: vmv1r.v v8, v10
279 %a = shufflevector <8 x i32> %x, <8 x i32> %y, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
283 define <32 x i8> @interleave_v16i8(<16 x i8> %x, <16 x i8> %y) {
284 ; V128-LABEL: interleave_v16i8:
286 ; V128-NEXT: vsetivli zero, 16, e8, m1, ta, ma
287 ; V128-NEXT: vwaddu.vv v10, v8, v9
288 ; V128-NEXT: li a0, -1
289 ; V128-NEXT: vwmaccu.vx v10, a0, v9
290 ; V128-NEXT: vmv2r.v v8, v10
293 ; V512-LABEL: interleave_v16i8:
295 ; V512-NEXT: vsetivli zero, 16, e8, mf4, ta, ma
296 ; V512-NEXT: vwaddu.vv v10, v8, v9
297 ; V512-NEXT: li a0, -1
298 ; V512-NEXT: vwmaccu.vx v10, a0, v9
299 ; V512-NEXT: vmv1r.v v8, v10
301 %a = shufflevector <16 x i8> %x, <16 x i8> %y, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
305 define <32 x i16> @interleave_v16i16(<16 x i16> %x, <16 x i16> %y) {
306 ; V128-LABEL: interleave_v16i16:
308 ; V128-NEXT: vsetivli zero, 16, e16, m2, ta, ma
309 ; V128-NEXT: vwaddu.vv v12, v8, v10
310 ; V128-NEXT: li a0, -1
311 ; V128-NEXT: vwmaccu.vx v12, a0, v10
312 ; V128-NEXT: vmv4r.v v8, v12
315 ; V512-LABEL: interleave_v16i16:
317 ; V512-NEXT: vsetivli zero, 16, e16, mf2, ta, ma
318 ; V512-NEXT: vwaddu.vv v10, v8, v9
319 ; V512-NEXT: li a0, -1
320 ; V512-NEXT: vwmaccu.vx v10, a0, v9
321 ; V512-NEXT: vmv1r.v v8, v10
323 %a = shufflevector <16 x i16> %x, <16 x i16> %y, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
327 define <32 x i32> @interleave_v16i32(<16 x i32> %x, <16 x i32> %y) {
328 ; V128-LABEL: interleave_v16i32:
330 ; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
331 ; V128-NEXT: vwaddu.vv v16, v8, v12
332 ; V128-NEXT: li a0, -1
333 ; V128-NEXT: vwmaccu.vx v16, a0, v12
334 ; V128-NEXT: vmv8r.v v8, v16
337 ; V512-LABEL: interleave_v16i32:
339 ; V512-NEXT: vsetivli zero, 16, e32, m1, ta, ma
340 ; V512-NEXT: vwaddu.vv v10, v8, v9
341 ; V512-NEXT: li a0, -1
342 ; V512-NEXT: vwmaccu.vx v10, a0, v9
343 ; V512-NEXT: vmv2r.v v8, v10
345 %a = shufflevector <16 x i32> %x, <16 x i32> %y, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
349 define <64 x i8> @interleave_v32i8(<32 x i8> %x, <32 x i8> %y) {
350 ; V128-LABEL: interleave_v32i8:
352 ; V128-NEXT: li a0, 32
353 ; V128-NEXT: vsetvli zero, a0, e8, m2, ta, ma
354 ; V128-NEXT: vwaddu.vv v12, v8, v10
355 ; V128-NEXT: li a0, -1
356 ; V128-NEXT: vwmaccu.vx v12, a0, v10
357 ; V128-NEXT: vmv4r.v v8, v12
360 ; V512-LABEL: interleave_v32i8:
362 ; V512-NEXT: li a0, 32
363 ; V512-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
364 ; V512-NEXT: vwaddu.vv v10, v8, v9
365 ; V512-NEXT: li a0, -1
366 ; V512-NEXT: vwmaccu.vx v10, a0, v9
367 ; V512-NEXT: vmv1r.v v8, v10
369 %a = shufflevector <32 x i8> %x, <32 x i8> %y, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
373 define <64 x i16> @interleave_v32i16(<32 x i16> %x, <32 x i16> %y) {
374 ; V128-LABEL: interleave_v32i16:
376 ; V128-NEXT: li a0, 32
377 ; V128-NEXT: vsetvli zero, a0, e16, m4, ta, ma
378 ; V128-NEXT: vwaddu.vv v16, v8, v12
379 ; V128-NEXT: li a0, -1
380 ; V128-NEXT: vwmaccu.vx v16, a0, v12
381 ; V128-NEXT: vmv8r.v v8, v16
384 ; V512-LABEL: interleave_v32i16:
386 ; V512-NEXT: li a0, 32
387 ; V512-NEXT: vsetvli zero, a0, e16, m1, ta, ma
388 ; V512-NEXT: vwaddu.vv v10, v8, v9
389 ; V512-NEXT: li a0, -1
390 ; V512-NEXT: vwmaccu.vx v10, a0, v9
391 ; V512-NEXT: vmv2r.v v8, v10
393 %a = shufflevector <32 x i16> %x, <32 x i16> %y, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
397 define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) {
398 ; V128-LABEL: interleave_v32i32:
400 ; V128-NEXT: addi sp, sp, -16
401 ; V128-NEXT: .cfi_def_cfa_offset 16
402 ; V128-NEXT: csrr a0, vlenb
403 ; V128-NEXT: slli a0, a0, 2
404 ; V128-NEXT: sub sp, sp, a0
405 ; V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
406 ; V128-NEXT: lui a0, %hi(.LCPI17_0)
407 ; V128-NEXT: addi a0, a0, %lo(.LCPI17_0)
408 ; V128-NEXT: li a1, 32
409 ; V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu
410 ; V128-NEXT: vle16.v v4, (a0)
411 ; V128-NEXT: lui a0, %hi(.LCPI17_1)
412 ; V128-NEXT: addi a0, a0, %lo(.LCPI17_1)
413 ; V128-NEXT: vle16.v v24, (a0)
414 ; V128-NEXT: addi a0, sp, 16
415 ; V128-NEXT: vs4r.v v24, (a0) # Unknown-size Folded Spill
416 ; V128-NEXT: lui a0, 699051
417 ; V128-NEXT: addi a0, a0, -1366
418 ; V128-NEXT: vmv.s.x v0, a0
419 ; V128-NEXT: vrgatherei16.vv v24, v8, v4
420 ; V128-NEXT: addi a0, sp, 16
421 ; V128-NEXT: vl4r.v v12, (a0) # Unknown-size Folded Reload
422 ; V128-NEXT: vrgatherei16.vv v24, v16, v12, v0.t
423 ; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
424 ; V128-NEXT: vwaddu.vv v0, v8, v16
425 ; V128-NEXT: li a0, -1
426 ; V128-NEXT: vwmaccu.vx v0, a0, v16
427 ; V128-NEXT: vmv8r.v v8, v0
428 ; V128-NEXT: vmv8r.v v16, v24
429 ; V128-NEXT: csrr a0, vlenb
430 ; V128-NEXT: slli a0, a0, 2
431 ; V128-NEXT: add sp, sp, a0
432 ; V128-NEXT: addi sp, sp, 16
435 ; V512-LABEL: interleave_v32i32:
437 ; V512-NEXT: li a0, 32
438 ; V512-NEXT: vsetvli zero, a0, e32, m2, ta, ma
439 ; V512-NEXT: vwaddu.vv v12, v8, v10
440 ; V512-NEXT: li a0, -1
441 ; V512-NEXT: vwmaccu.vx v12, a0, v10
442 ; V512-NEXT: vmv4r.v v8, v12
444 %a = shufflevector <32 x i32> %x, <32 x i32> %y, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
448 define <4 x i8> @unary_interleave_v4i8(<4 x i8> %x) {
449 ; V128-LABEL: unary_interleave_v4i8:
451 ; V128-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
452 ; V128-NEXT: vslidedown.vi v10, v8, 2
453 ; V128-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
454 ; V128-NEXT: vwaddu.vv v9, v8, v10
455 ; V128-NEXT: li a0, -1
456 ; V128-NEXT: vwmaccu.vx v9, a0, v10
457 ; V128-NEXT: vmv1r.v v8, v9
460 ; V512-LABEL: unary_interleave_v4i8:
462 ; V512-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
463 ; V512-NEXT: vslidedown.vi v10, v8, 2
464 ; V512-NEXT: vwaddu.vv v9, v8, v10
465 ; V512-NEXT: li a0, -1
466 ; V512-NEXT: vwmaccu.vx v9, a0, v10
467 ; V512-NEXT: vmv1r.v v8, v9
469 %a = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
473 ; This shouldn't be interleaved
474 define <4 x i8> @unary_interleave_v4i8_invalid(<4 x i8> %x) {
475 ; V128-LABEL: unary_interleave_v4i8_invalid:
477 ; V128-NEXT: lui a0, 16
478 ; V128-NEXT: addi a0, a0, 768
479 ; V128-NEXT: vsetivli zero, 4, e32, m1, ta, ma
480 ; V128-NEXT: vmv.s.x v10, a0
481 ; V128-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
482 ; V128-NEXT: vrgather.vv v9, v8, v10
483 ; V128-NEXT: vmv1r.v v8, v9
486 ; V512-LABEL: unary_interleave_v4i8_invalid:
488 ; V512-NEXT: lui a0, 16
489 ; V512-NEXT: addi a0, a0, 768
490 ; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, ma
491 ; V512-NEXT: vmv.s.x v10, a0
492 ; V512-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
493 ; V512-NEXT: vrgather.vv v9, v8, v10
494 ; V512-NEXT: vmv1r.v v8, v9
496 %a = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> <i32 0, i32 3, i32 1, i32 4>
500 define <4 x i16> @unary_interleave_v4i16(<4 x i16> %x) {
501 ; V128-LABEL: unary_interleave_v4i16:
503 ; V128-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
504 ; V128-NEXT: vslidedown.vi v10, v8, 2
505 ; V128-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
506 ; V128-NEXT: vwaddu.vv v9, v8, v10
507 ; V128-NEXT: li a0, -1
508 ; V128-NEXT: vwmaccu.vx v9, a0, v10
509 ; V128-NEXT: vmv1r.v v8, v9
512 ; V512-LABEL: unary_interleave_v4i16:
514 ; V512-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
515 ; V512-NEXT: vslidedown.vi v10, v8, 2
516 ; V512-NEXT: vwaddu.vv v9, v8, v10
517 ; V512-NEXT: li a0, -1
518 ; V512-NEXT: vwmaccu.vx v9, a0, v10
519 ; V512-NEXT: vmv1r.v v8, v9
521 %a = shufflevector <4 x i16> %x, <4 x i16> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
525 define <4 x i32> @unary_interleave_v4i32(<4 x i32> %x) {
526 ; V128-LABEL: unary_interleave_v4i32:
528 ; V128-NEXT: vsetivli zero, 2, e32, m1, ta, ma
529 ; V128-NEXT: vslidedown.vi v10, v8, 2
530 ; V128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
531 ; V128-NEXT: vwaddu.vv v9, v8, v10
532 ; V128-NEXT: li a0, -1
533 ; V128-NEXT: vwmaccu.vx v9, a0, v10
534 ; V128-NEXT: vmv1r.v v8, v9
537 ; V512-LABEL: unary_interleave_v4i32:
539 ; V512-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
540 ; V512-NEXT: vslidedown.vi v10, v8, 2
541 ; V512-NEXT: vwaddu.vv v9, v8, v10
542 ; V512-NEXT: li a0, -1
543 ; V512-NEXT: vwmaccu.vx v9, a0, v10
544 ; V512-NEXT: vmv1r.v v8, v9
546 %a = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
550 ; FIXME: Is there better codegen we can do here?
551 define <4 x i64> @unary_interleave_v4i64(<4 x i64> %x) {
552 ; V128-LABEL: unary_interleave_v4i64:
554 ; V128-NEXT: lui a0, 12304
555 ; V128-NEXT: addi a0, a0, 512
556 ; V128-NEXT: vsetivli zero, 4, e32, m1, ta, ma
557 ; V128-NEXT: vmv.s.x v10, a0
558 ; V128-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
559 ; V128-NEXT: vsext.vf2 v12, v10
560 ; V128-NEXT: vsetvli zero, zero, e64, m2, ta, ma
561 ; V128-NEXT: vrgatherei16.vv v10, v8, v12
562 ; V128-NEXT: vmv.v.v v8, v10
565 ; RV32-V512-LABEL: unary_interleave_v4i64:
566 ; RV32-V512: # %bb.0:
567 ; RV32-V512-NEXT: lui a0, 12304
568 ; RV32-V512-NEXT: addi a0, a0, 512
569 ; RV32-V512-NEXT: vsetivli zero, 4, e32, mf2, ta, ma
570 ; RV32-V512-NEXT: vmv.s.x v9, a0
571 ; RV32-V512-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
572 ; RV32-V512-NEXT: vsext.vf2 v10, v9
573 ; RV32-V512-NEXT: vsetvli zero, zero, e64, m1, ta, ma
574 ; RV32-V512-NEXT: vrgatherei16.vv v9, v8, v10
575 ; RV32-V512-NEXT: vmv.v.v v8, v9
576 ; RV32-V512-NEXT: ret
578 ; RV64-V512-LABEL: unary_interleave_v4i64:
579 ; RV64-V512: # %bb.0:
580 ; RV64-V512-NEXT: lui a0, 12304
581 ; RV64-V512-NEXT: addi a0, a0, 512
582 ; RV64-V512-NEXT: vsetivli zero, 4, e64, m1, ta, ma
583 ; RV64-V512-NEXT: vmv.s.x v9, a0
584 ; RV64-V512-NEXT: vsext.vf8 v10, v9
585 ; RV64-V512-NEXT: vrgather.vv v9, v8, v10
586 ; RV64-V512-NEXT: vmv.v.v v8, v9
587 ; RV64-V512-NEXT: ret
588 %a = shufflevector <4 x i64> %x, <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
592 define <8 x i8> @unary_interleave_v8i8(<8 x i8> %x) {
593 ; V128-LABEL: unary_interleave_v8i8:
595 ; V128-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
596 ; V128-NEXT: vslidedown.vi v10, v8, 4
597 ; V128-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
598 ; V128-NEXT: vwaddu.vv v9, v8, v10
599 ; V128-NEXT: li a0, -1
600 ; V128-NEXT: vwmaccu.vx v9, a0, v10
601 ; V128-NEXT: vmv1r.v v8, v9
604 ; V512-LABEL: unary_interleave_v8i8:
606 ; V512-NEXT: vsetivli zero, 4, e8, mf8, ta, ma
607 ; V512-NEXT: vslidedown.vi v10, v8, 4
608 ; V512-NEXT: vwaddu.vv v9, v8, v10
609 ; V512-NEXT: li a0, -1
610 ; V512-NEXT: vwmaccu.vx v9, a0, v10
611 ; V512-NEXT: vmv1r.v v8, v9
613 %a = shufflevector <8 x i8> %x, <8 x i8> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 undef, i32 6, i32 3, i32 7>
617 define <8 x i16> @unary_interleave_v8i16(<8 x i16> %x) {
618 ; V128-LABEL: unary_interleave_v8i16:
620 ; V128-NEXT: vsetivli zero, 4, e16, m1, ta, ma
621 ; V128-NEXT: vslidedown.vi v10, v8, 4
622 ; V128-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
623 ; V128-NEXT: vwaddu.vv v9, v10, v8
624 ; V128-NEXT: li a0, -1
625 ; V128-NEXT: vwmaccu.vx v9, a0, v8
626 ; V128-NEXT: vmv1r.v v8, v9
629 ; V512-LABEL: unary_interleave_v8i16:
631 ; V512-NEXT: vsetivli zero, 4, e16, mf4, ta, ma
632 ; V512-NEXT: vslidedown.vi v10, v8, 4
633 ; V512-NEXT: vwaddu.vv v9, v10, v8
634 ; V512-NEXT: li a0, -1
635 ; V512-NEXT: vwmaccu.vx v9, a0, v8
636 ; V512-NEXT: vmv1r.v v8, v9
638 %a = shufflevector <8 x i16> %x, <8 x i16> poison, <8 x i32> <i32 4, i32 undef, i32 5, i32 1, i32 6, i32 2, i32 7, i32 3>
642 define <8 x i32> @unary_interleave_v8i32(<8 x i32> %x) {
643 ; V128-LABEL: unary_interleave_v8i32:
645 ; V128-NEXT: vsetivli zero, 4, e32, m2, ta, ma
646 ; V128-NEXT: vslidedown.vi v12, v8, 4
647 ; V128-NEXT: vsetivli zero, 4, e32, m1, ta, ma
648 ; V128-NEXT: vwaddu.vv v10, v8, v12
649 ; V128-NEXT: li a0, -1
650 ; V128-NEXT: vwmaccu.vx v10, a0, v12
651 ; V128-NEXT: vmv2r.v v8, v10
654 ; V512-LABEL: unary_interleave_v8i32:
656 ; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, ma
657 ; V512-NEXT: vslidedown.vi v10, v8, 4
658 ; V512-NEXT: vwaddu.vv v9, v8, v10
659 ; V512-NEXT: li a0, -1
660 ; V512-NEXT: vwmaccu.vx v9, a0, v10
661 ; V512-NEXT: vmv1r.v v8, v9
663 %a = shufflevector <8 x i32> %x, <8 x i32> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
667 ; This interleaves the first 2 elements of a vector in opposite order, with
668 ; undefs for the remaining elements. We used to miscompile this.
669 define <4 x i8> @unary_interleave_10uu_v4i8(<4 x i8> %x) {
670 ; CHECK-LABEL: unary_interleave_10uu_v4i8:
672 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
673 ; CHECK-NEXT: vsrl.vi v9, v8, 8
674 ; CHECK-NEXT: vsll.vi v8, v8, 8
675 ; CHECK-NEXT: vor.vv v8, v8, v9
677 %a = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> <i32 1, i32 0, i32 undef, i32 undef>
681 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: