; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+m,+v -verify-machineinstrs < %s \
; RUN: | FileCheck %s --check-prefixes=CHECK,RV32
; RUN: llc -mtriple=riscv64 -mattr=+m,+v -verify-machineinstrs < %s \
; RUN: | FileCheck %s --check-prefixes=CHECK,RV64

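; Check how the fastcc calling convention lowers scalable-vector returns,
; parameters, and calls: in v8-v23 when the values fit, otherwise indirectly
; through memory.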
define fastcc <vscale x 4 x i8> @ret_nxv4i8(ptr %p) {
; CHECK-LABEL: ret_nxv4i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: ret
  %v = load <vscale x 4 x i8>, ptr %p
  ret <vscale x 4 x i8> %v
}

define fastcc <vscale x 4 x i32> @ret_nxv4i32(ptr %p) {
; CHECK-LABEL: ret_nxv4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vl2re32.v v8, (a0)
; CHECK-NEXT: ret
  %v = load <vscale x 4 x i32>, ptr %p
  ret <vscale x 4 x i32> %v
}

define fastcc <vscale x 8 x i32> @ret_nxv8i32(ptr %p) {
; CHECK-LABEL: ret_nxv8i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vl4re32.v v8, (a0)
; CHECK-NEXT: ret
  %v = load <vscale x 8 x i32>, ptr %p
  ret <vscale x 8 x i32> %v
}

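; An nxv16i64 return needs two LMUL-8 register groups (v8 and v16), so the
; second half is loaded from %p plus vlenb * 8 bytes.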
define fastcc <vscale x 16 x i64> @ret_nxv16i64(ptr %p) {
; CHECK-LABEL: ret_nxv16i64:
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, a0, a1
; CHECK-NEXT: vl8re64.v v16, (a1)
; CHECK-NEXT: vl8re64.v v8, (a0)
; CHECK-NEXT: ret
  %v = load <vscale x 16 x i64>, ptr %p
  ret <vscale x 16 x i64> %v
}

define fastcc <vscale x 8 x i1> @ret_mask_nxv8i1(ptr %p) {
; CHECK-LABEL: ret_mask_nxv8i1:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
; CHECK-NEXT: vlm.v v0, (a0)
; CHECK-NEXT: ret
  %v = load <vscale x 8 x i1>, ptr %p
  ret <vscale x 8 x i1> %v
}

define fastcc <vscale x 32 x i1> @ret_mask_nxv32i1(ptr %p) {
; CHECK-LABEL: ret_mask_nxv32i1:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma
; CHECK-NEXT: vlm.v v0, (a0)
; CHECK-NEXT: ret
  %v = load <vscale x 32 x i1>, ptr %p
  ret <vscale x 32 x i1> %v
}

; Return the vector via registers v8-v23
define fastcc <vscale x 64 x i32> @ret_split_nxv64i32(ptr %x) {
; CHECK-LABEL: ret_split_nxv64i32:
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: slli a3, a2, 3
; CHECK-NEXT: slli a4, a2, 5
; CHECK-NEXT: sub a4, a4, a3
; CHECK-NEXT: add a5, a1, a4
; CHECK-NEXT: vl8re32.v v8, (a5)
; CHECK-NEXT: add a5, a1, a3
; CHECK-NEXT: slli a2, a2, 4
; CHECK-NEXT: vl8re32.v v16, (a1)
; CHECK-NEXT: add a1, a1, a2
; CHECK-NEXT: vl8re32.v v24, (a1)
; CHECK-NEXT: vl8re32.v v0, (a5)
; CHECK-NEXT: vs8r.v v16, (a0)
; CHECK-NEXT: add a2, a0, a2
; CHECK-NEXT: vs8r.v v24, (a2)
; CHECK-NEXT: add a3, a0, a3
; CHECK-NEXT: vs8r.v v0, (a3)
; CHECK-NEXT: add a0, a0, a4
; CHECK-NEXT: vs8r.v v8, (a0)
; CHECK-NEXT: ret
  %v = load <vscale x 64 x i32>, ptr %x
  ret <vscale x 64 x i32> %v
}

; Return the vector fully via the stack
define fastcc <vscale x 128 x i32> @ret_split_nxv128i32(ptr %x) {
; CHECK-LABEL: ret_split_nxv128i32:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: slli a2, a2, 5
; CHECK-NEXT: sub sp, sp, a2
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: slli a3, a2, 3
; CHECK-NEXT: slli a4, a2, 5
; CHECK-NEXT: sub a5, a4, a3
; CHECK-NEXT: add a6, a1, a5
; CHECK-NEXT: vl8re32.v v8, (a6)
; CHECK-NEXT: csrr a6, vlenb
; CHECK-NEXT: li a7, 24
; CHECK-NEXT: mul a6, a6, a7
; CHECK-NEXT: add a6, sp, a6
; CHECK-NEXT: addi a6, a6, 16
; CHECK-NEXT: vs8r.v v8, (a6) # Unknown-size Folded Spill
; CHECK-NEXT: slli a6, a2, 4
; CHECK-NEXT: slli a7, a2, 6
; CHECK-NEXT: sub t0, a7, a6
; CHECK-NEXT: add t1, a1, t0
; CHECK-NEXT: vl8re32.v v8, (t1)
; CHECK-NEXT: csrr t1, vlenb
; CHECK-NEXT: slli t1, t1, 4
; CHECK-NEXT: add t1, sp, t1
; CHECK-NEXT: addi t1, t1, 16
; CHECK-NEXT: vs8r.v v8, (t1) # Unknown-size Folded Spill
; CHECK-NEXT: sub a7, a7, a3
; CHECK-NEXT: add t1, a1, a7
; CHECK-NEXT: vl8re32.v v8, (t1)
; CHECK-NEXT: csrr t1, vlenb
; CHECK-NEXT: slli t1, t1, 3
; CHECK-NEXT: add t1, sp, t1
; CHECK-NEXT: addi t1, t1, 16
; CHECK-NEXT: vs8r.v v8, (t1) # Unknown-size Folded Spill
; CHECK-NEXT: add t1, a1, a3
; CHECK-NEXT: vl8re32.v v8, (t1)
; CHECK-NEXT: addi t1, sp, 16
; CHECK-NEXT: vs8r.v v8, (t1) # Unknown-size Folded Spill
; CHECK-NEXT: add t1, a1, a6
; CHECK-NEXT: add t2, a1, a4
; CHECK-NEXT: li t3, 40
; CHECK-NEXT: mul a2, a2, t3
; CHECK-NEXT: add t3, a1, a2
; CHECK-NEXT: vl8re32.v v8, (a1)
; CHECK-NEXT: vl8re32.v v0, (t1)
; CHECK-NEXT: vl8re32.v v16, (t3)
; CHECK-NEXT: vl8re32.v v24, (t2)
; CHECK-NEXT: vs8r.v v8, (a0)
; CHECK-NEXT: add a2, a0, a2
; CHECK-NEXT: vs8r.v v16, (a2)
; CHECK-NEXT: add a4, a0, a4
; CHECK-NEXT: vs8r.v v24, (a4)
; CHECK-NEXT: add a6, a0, a6
; CHECK-NEXT: vs8r.v v0, (a6)
; CHECK-NEXT: add a3, a0, a3
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vs8r.v v8, (a3)
; CHECK-NEXT: add a7, a0, a7
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vs8r.v v8, (a7)
; CHECK-NEXT: add t0, a0, t0
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 4
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vs8r.v v8, (t0)
; CHECK-NEXT: add a0, a0, a5
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: li a2, 24
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vs8r.v v8, (a0)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 5
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
  %v = load <vscale x 128 x i32>, ptr %x
  ret <vscale x 128 x i32> %v
}

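; Parameters that fit within v8-v23 are passed directly in vector registers;
; the result is computed in place and returned in v8 (or v0 for masks).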
define fastcc <vscale x 4 x i8> @ret_nxv4i8_param_nxv4i8_nxv4i8(<vscale x 4 x i8> %v, <vscale x 4 x i8> %w) {
; CHECK-LABEL: ret_nxv4i8_param_nxv4i8_nxv4i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
; CHECK-NEXT: vadd.vv v8, v8, v9
; CHECK-NEXT: ret
  %r = add <vscale x 4 x i8> %v, %w
  ret <vscale x 4 x i8> %r
}

define fastcc <vscale x 4 x i64> @ret_nxv4i64_param_nxv4i64_nxv4i64(<vscale x 4 x i64> %v, <vscale x 4 x i64> %w) {
; CHECK-LABEL: ret_nxv4i64_param_nxv4i64_nxv4i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
; CHECK-NEXT: vadd.vv v8, v8, v12
; CHECK-NEXT: ret
  %r = add <vscale x 4 x i64> %v, %w
  ret <vscale x 4 x i64> %r
}

define fastcc <vscale x 8 x i1> @ret_nxv8i1_param_nxv8i1_nxv8i1(<vscale x 8 x i1> %v, <vscale x 8 x i1> %w) {
; CHECK-LABEL: ret_nxv8i1_param_nxv8i1_nxv8i1:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma
; CHECK-NEXT: vmxor.mm v0, v0, v8
; CHECK-NEXT: ret
  %r = xor <vscale x 8 x i1> %v, %w
  ret <vscale x 8 x i1> %r
}

define fastcc <vscale x 32 x i1> @ret_nxv32i1_param_nxv32i1_nxv32i1(<vscale x 32 x i1> %v, <vscale x 32 x i1> %w) {
; CHECK-LABEL: ret_nxv32i1_param_nxv32i1_nxv32i1:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma
; CHECK-NEXT: vmand.mm v0, v0, v8
; CHECK-NEXT: ret
  %r = and <vscale x 32 x i1> %v, %w
  ret <vscale x 32 x i1> %r
}

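; Three nxv32i32 arguments do not all fit in v8-v23: the first is passed in
; v8-v23 and the other two are passed indirectly by pointer (here in a0 and a2),
; with the i32 argument in a4.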
define fastcc <vscale x 32 x i32> @ret_nxv32i32_param_nxv32i32_nxv32i32_nxv32i32_i32(<vscale x 32 x i32> %x, <vscale x 32 x i32> %y, <vscale x 32 x i32> %z, i32 %w) {
; CHECK-LABEL: ret_nxv32i32_param_nxv32i32_nxv32i32_nxv32i32_i32:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 4
; CHECK-NEXT: sub sp, sp, a1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a3, a2, a1
; CHECK-NEXT: add a1, a0, a1
; CHECK-NEXT: vl8re32.v v24, (a0)
; CHECK-NEXT: vl8re32.v v0, (a1)
; CHECK-NEXT: vl8re32.v v16, (a3)
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: vl8re32.v v16, (a2)
; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma
; CHECK-NEXT: vadd.vv v24, v8, v24
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vadd.vv v0, v8, v0
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vadd.vv v8, v0, v8
; CHECK-NEXT: vadd.vv v24, v24, v16
; CHECK-NEXT: vadd.vx v16, v8, a4
; CHECK-NEXT: vadd.vx v8, v24, a4
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
  %r = add <vscale x 32 x i32> %x, %y
  %s = add <vscale x 32 x i32> %r, %z
  %head = insertelement <vscale x 32 x i32> poison, i32 %w, i32 0
  %splat = shufflevector <vscale x 32 x i32> %head, <vscale x 32 x i32> poison, <vscale x 32 x i32> zeroinitializer
  %t = add <vscale x 32 x i32> %s, %splat
  ret <vscale x 32 x i32> %t
}

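; Callees used to check how fastcc passes large scalable vector arguments at
; call sites.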
declare <vscale x 32 x i32> @ext2(<vscale x 32 x i32>, <vscale x 32 x i32>, i32, i32)
declare <vscale x 32 x i32> @ext3(<vscale x 32 x i32>, <vscale x 32 x i32>, <vscale x 32 x i32>, i32, i32)

define fastcc <vscale x 32 x i32> @ret_nxv32i32_call_nxv32i32_nxv32i32_i32(<vscale x 32 x i32> %x, <vscale x 32 x i32> %y, i32 %w) {
; RV32-LABEL: ret_nxv32i32_call_nxv32i32_nxv32i32_i32:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -144
; RV32-NEXT: .cfi_def_cfa_offset 144
; RV32-NEXT: sw ra, 140(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s0, 136(sp) # 4-byte Folded Spill
; RV32-NEXT: .cfi_offset ra, -4
; RV32-NEXT: .cfi_offset s0, -8
; RV32-NEXT: addi s0, sp, 144
; RV32-NEXT: .cfi_def_cfa s0, 0
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 4
; RV32-NEXT: sub sp, sp, a1
; RV32-NEXT: andi sp, sp, -128
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 3
; RV32-NEXT: add a3, a0, a1
; RV32-NEXT: vl8re32.v v24, (a3)
; RV32-NEXT: vl8re32.v v0, (a0)
; RV32-NEXT: addi a0, sp, 128
; RV32-NEXT: vs8r.v v8, (a0)
; RV32-NEXT: add a1, a0, a1
; RV32-NEXT: addi a0, sp, 128
; RV32-NEXT: li a3, 2
; RV32-NEXT: vs8r.v v16, (a1)
; RV32-NEXT: vmv8r.v v8, v0
; RV32-NEXT: vmv8r.v v16, v24
; RV32-NEXT: call ext2
; RV32-NEXT: addi sp, s0, -144
; RV32-NEXT: lw ra, 140(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s0, 136(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 144
; RV32-NEXT: ret
;
; RV64-LABEL: ret_nxv32i32_call_nxv32i32_nxv32i32_i32:
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -144
; RV64-NEXT: .cfi_def_cfa_offset 144
; RV64-NEXT: sd ra, 136(sp) # 8-byte Folded Spill
; RV64-NEXT: sd s0, 128(sp) # 8-byte Folded Spill
; RV64-NEXT: .cfi_offset ra, -8
; RV64-NEXT: .cfi_offset s0, -16
; RV64-NEXT: addi s0, sp, 144
; RV64-NEXT: .cfi_def_cfa s0, 0
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: slli a1, a1, 4
; RV64-NEXT: sub sp, sp, a1
; RV64-NEXT: andi sp, sp, -128
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: slli a1, a1, 3
; RV64-NEXT: add a3, a0, a1
; RV64-NEXT: vl8re32.v v24, (a3)
; RV64-NEXT: vl8re32.v v0, (a0)
; RV64-NEXT: addi a0, sp, 128
; RV64-NEXT: vs8r.v v8, (a0)
; RV64-NEXT: add a1, a0, a1
; RV64-NEXT: addi a0, sp, 128
; RV64-NEXT: li a3, 2
; RV64-NEXT: vs8r.v v16, (a1)
; RV64-NEXT: vmv8r.v v8, v0
; RV64-NEXT: vmv8r.v v16, v24
; RV64-NEXT: call ext2
; RV64-NEXT: addi sp, s0, -144
; RV64-NEXT: ld ra, 136(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s0, 128(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 144
; RV64-NEXT: ret
  %t = call fastcc <vscale x 32 x i32> @ext2(<vscale x 32 x i32> %y, <vscale x 32 x i32> %x, i32 %w, i32 2)
  ret <vscale x 32 x i32> %t
}

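; Same as above but with three nxv32i32 arguments: two of them must be
; materialized in memory and passed by pointer, and the extra register
; pressure forces folded spills before the call to ext3.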
define fastcc <vscale x 32 x i32> @ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_i32(<vscale x 32 x i32> %x, <vscale x 32 x i32> %y, <vscale x 32 x i32> %z, i32 %w) {
; RV32-LABEL: ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_i32:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -144
; RV32-NEXT: .cfi_def_cfa_offset 144
; RV32-NEXT: sw ra, 140(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s0, 136(sp) # 4-byte Folded Spill
; RV32-NEXT: .cfi_offset ra, -4
; RV32-NEXT: .cfi_offset s0, -8
; RV32-NEXT: addi s0, sp, 144
; RV32-NEXT: .cfi_def_cfa s0, 0
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a3, 48
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: sub sp, sp, a1
; RV32-NEXT: andi sp, sp, -128
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 3
; RV32-NEXT: add a3, a2, a1
; RV32-NEXT: vl8re32.v v24, (a3)
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 128
; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
; RV32-NEXT: add a3, a0, a1
; RV32-NEXT: vl8re32.v v24, (a3)
; RV32-NEXT: addi a3, sp, 128
; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
; RV32-NEXT: vl8re32.v v0, (a2)
; RV32-NEXT: vl8re32.v v24, (a0)
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 128
; RV32-NEXT: vs8r.v v8, (a0)
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: slli a2, a2, 5
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 128
; RV32-NEXT: vs8r.v v24, (a2)
; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: vs8r.v v16, (a0)
; RV32-NEXT: add a1, a2, a1
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 128
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: slli a2, a2, 4
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 128
; RV32-NEXT: li a5, 42
; RV32-NEXT: addi a3, sp, 128
; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
; RV32-NEXT: vs8r.v v8, (a1)
; RV32-NEXT: vmv8r.v v8, v0
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 128
; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; RV32-NEXT: call ext3
; RV32-NEXT: addi sp, s0, -144
; RV32-NEXT: lw ra, 140(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s0, 136(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 144
; RV32-NEXT: ret
;
; RV64-LABEL: ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_i32:
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -144
; RV64-NEXT: .cfi_def_cfa_offset 144
; RV64-NEXT: sd ra, 136(sp) # 8-byte Folded Spill
; RV64-NEXT: sd s0, 128(sp) # 8-byte Folded Spill
; RV64-NEXT: .cfi_offset ra, -8
; RV64-NEXT: .cfi_offset s0, -16
; RV64-NEXT: addi s0, sp, 144
; RV64-NEXT: .cfi_def_cfa s0, 0
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: li a3, 48
; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: sub sp, sp, a1
; RV64-NEXT: andi sp, sp, -128
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: slli a1, a1, 3
; RV64-NEXT: add a3, a2, a1
; RV64-NEXT: vl8re32.v v24, (a3)
; RV64-NEXT: csrr a3, vlenb
; RV64-NEXT: slli a3, a3, 3
; RV64-NEXT: add a3, sp, a3
; RV64-NEXT: addi a3, a3, 128
; RV64-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
; RV64-NEXT: add a3, a0, a1
; RV64-NEXT: vl8re32.v v24, (a3)
; RV64-NEXT: addi a3, sp, 128
; RV64-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill
; RV64-NEXT: vl8re32.v v0, (a2)
; RV64-NEXT: vl8re32.v v24, (a0)
; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: slli a0, a0, 4
; RV64-NEXT: add a0, sp, a0
; RV64-NEXT: addi a0, a0, 128
; RV64-NEXT: vs8r.v v8, (a0)
; RV64-NEXT: csrr a2, vlenb
; RV64-NEXT: slli a2, a2, 5
; RV64-NEXT: add a2, sp, a2
; RV64-NEXT: addi a2, a2, 128
; RV64-NEXT: vs8r.v v24, (a2)
; RV64-NEXT: add a0, a0, a1
; RV64-NEXT: vs8r.v v16, (a0)
; RV64-NEXT: add a1, a2, a1
; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: slli a0, a0, 5
; RV64-NEXT: add a0, sp, a0
; RV64-NEXT: addi a0, a0, 128
; RV64-NEXT: csrr a2, vlenb
; RV64-NEXT: slli a2, a2, 4
; RV64-NEXT: add a2, sp, a2
; RV64-NEXT: addi a2, a2, 128
; RV64-NEXT: li a5, 42
; RV64-NEXT: addi a3, sp, 128
; RV64-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload
; RV64-NEXT: vs8r.v v8, (a1)
; RV64-NEXT: vmv8r.v v8, v0
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: slli a1, a1, 3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 128
; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
; RV64-NEXT: call ext3
; RV64-NEXT: addi sp, s0, -144
; RV64-NEXT: ld ra, 136(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s0, 128(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 144
; RV64-NEXT: ret
  %t = call fastcc <vscale x 32 x i32> @ext3(<vscale x 32 x i32> %z, <vscale x 32 x i32> %y, <vscale x 32 x i32> %x, i32 %w, i32 42)
  ret <vscale x 32 x i32> %t
}

; A test case where the normal calling convention would pass directly via the
; stack, but with fastcc can pass indirectly with the extra GPR registers
; allowed.
define fastcc <vscale x 32 x i32> @vector_arg_indirect_stack(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, <vscale x 32 x i32> %x, <vscale x 32 x i32> %y, <vscale x 32 x i32> %z, i32 %8) {
; CHECK-LABEL: vector_arg_indirect_stack:
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, t5, a0
; CHECK-NEXT: vl8re32.v v24, (t5)
; CHECK-NEXT: vl8re32.v v0, (a0)
; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma
; CHECK-NEXT: vadd.vv v8, v8, v24
; CHECK-NEXT: vadd.vv v16, v16, v0
; CHECK-NEXT: ret
  %s = add <vscale x 32 x i32> %x, %z
  ret <vscale x 32 x i32> %s
}

; Calling the function above. Ensure we pass the arguments correctly.
define fastcc <vscale x 32 x i32> @pass_vector_arg_indirect_stack(<vscale x 32 x i32> %x, <vscale x 32 x i32> %y, <vscale x 32 x i32> %z) {
; RV32-LABEL: pass_vector_arg_indirect_stack:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -144
; RV32-NEXT: .cfi_def_cfa_offset 144
; RV32-NEXT: sw ra, 140(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s0, 136(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s1, 132(sp) # 4-byte Folded Spill
; RV32-NEXT: .cfi_offset ra, -4
; RV32-NEXT: .cfi_offset s0, -8
; RV32-NEXT: .cfi_offset s1, -12
; RV32-NEXT: addi s0, sp, 144
; RV32-NEXT: .cfi_def_cfa s0, 0
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: sub sp, sp, a0
; RV32-NEXT: andi sp, sp, -128
; RV32-NEXT: mv s1, sp
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.i v8, 0
; RV32-NEXT: addi a1, s1, 128
; RV32-NEXT: vs8r.v v8, (a1)
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: slli a2, a2, 4
; RV32-NEXT: add a2, s1, a2
; RV32-NEXT: addi a2, a2, 128
; RV32-NEXT: vs8r.v v8, (a2)
; RV32-NEXT: li a3, 8
; RV32-NEXT: sw a3, 0(sp)
; RV32-NEXT: add a1, a1, a0
; RV32-NEXT: vs8r.v v8, (a1)
; RV32-NEXT: add a0, a2, a0
; RV32-NEXT: li a1, 1
; RV32-NEXT: li a2, 2
; RV32-NEXT: li a3, 3
; RV32-NEXT: li a4, 4
; RV32-NEXT: li a5, 5
; RV32-NEXT: li a6, 6
; RV32-NEXT: li a7, 7
; RV32-NEXT: csrr t3, vlenb
; RV32-NEXT: slli t3, t3, 4
; RV32-NEXT: add t3, s1, t3
; RV32-NEXT: addi t3, t3, 128
; RV32-NEXT: addi t5, s1, 128
; RV32-NEXT: vs8r.v v8, (a0)
; RV32-NEXT: li a0, 0
; RV32-NEXT: vmv.v.i v16, 0
; RV32-NEXT: call vector_arg_indirect_stack
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: addi sp, s0, -144
; RV32-NEXT: lw ra, 140(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s0, 136(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s1, 132(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 144
; RV32-NEXT: ret
;
; RV64-LABEL: pass_vector_arg_indirect_stack:
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -160
; RV64-NEXT: .cfi_def_cfa_offset 160
; RV64-NEXT: sd ra, 152(sp) # 8-byte Folded Spill
; RV64-NEXT: sd s0, 144(sp) # 8-byte Folded Spill
; RV64-NEXT: sd s1, 136(sp) # 8-byte Folded Spill
; RV64-NEXT: .cfi_offset ra, -8
; RV64-NEXT: .cfi_offset s0, -16
; RV64-NEXT: .cfi_offset s1, -24
; RV64-NEXT: addi s0, sp, 160
; RV64-NEXT: .cfi_def_cfa s0, 0
; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: slli a0, a0, 5
; RV64-NEXT: sub sp, sp, a0
; RV64-NEXT: andi sp, sp, -128
; RV64-NEXT: mv s1, sp
; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: slli a0, a0, 3
; RV64-NEXT: addi sp, sp, -16
; RV64-NEXT: vsetvli a1, zero, e32, m8, ta, ma
; RV64-NEXT: vmv.v.i v8, 0
; RV64-NEXT: addi a1, s1, 128
; RV64-NEXT: vs8r.v v8, (a1)
; RV64-NEXT: csrr a2, vlenb
; RV64-NEXT: slli a2, a2, 4
; RV64-NEXT: add a2, s1, a2
; RV64-NEXT: addi a2, a2, 128
; RV64-NEXT: vs8r.v v8, (a2)
; RV64-NEXT: li a3, 8
; RV64-NEXT: sd a3, 0(sp)
; RV64-NEXT: add a1, a1, a0
; RV64-NEXT: vs8r.v v8, (a1)
; RV64-NEXT: add a0, a2, a0
; RV64-NEXT: li a1, 1
; RV64-NEXT: li a2, 2
; RV64-NEXT: li a3, 3
; RV64-NEXT: li a4, 4
; RV64-NEXT: li a5, 5
; RV64-NEXT: li a6, 6
; RV64-NEXT: li a7, 7
; RV64-NEXT: csrr t3, vlenb
; RV64-NEXT: slli t3, t3, 4
; RV64-NEXT: add t3, s1, t3
; RV64-NEXT: addi t3, t3, 128
; RV64-NEXT: addi t5, s1, 128
; RV64-NEXT: vs8r.v v8, (a0)
; RV64-NEXT: li a0, 0
; RV64-NEXT: vmv.v.i v16, 0
; RV64-NEXT: call vector_arg_indirect_stack
; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: addi sp, s0, -160
; RV64-NEXT: ld ra, 152(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s0, 144(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s1, 136(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 160
; RV64-NEXT: ret
  %s = call fastcc <vscale x 32 x i32> @vector_arg_indirect_stack(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, <vscale x 32 x i32> zeroinitializer, <vscale x 32 x i32> zeroinitializer, <vscale x 32 x i32> zeroinitializer, i32 8)
  ret <vscale x 32 x i32> %s
}