; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,RV32
; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,RV64
; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+fast-unaligned-access -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV64,RV64-MISALIGN
; RUN: llc -mtriple=riscv64 -mattr=+f,+zfh,+zve64f,+zvl128b,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,ZVE64F

; The two loads are contiguous and should be folded into one
define void @widen_2xv4i16(ptr %x, ptr %z) {
; CHECK-LABEL: widen_2xv4i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    vse16.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 8
  %b = load <4 x i16>, ptr %b.gep
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

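; The three contiguous loads are not folded into a single wide load here; they
; are concatenated with vslideups instead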
define void @widen_3xv4i16(ptr %x, ptr %z) {
; CHECK-LABEL: widen_3xv4i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    addi a2, a0, 8
; CHECK-NEXT:    vle16.v v10, (a2)
; CHECK-NEXT:    addi a0, a0, 16
; CHECK-NEXT:    vle16.v v12, (a0)
; CHECK-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
; CHECK-NEXT:    vslideup.vi v8, v10, 4
; CHECK-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
; CHECK-NEXT:    vslideup.vi v8, v12, 8
; CHECK-NEXT:    vse16.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 8
  %b = load <4 x i16>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 8
  %c = load <4 x i16>, ptr %c.gep
  %d.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %d.1 = shufflevector <4 x i16> %c, <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
  %d.2 = shufflevector <8 x i16> %d.0, <8 x i16> %d.1, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  store <12 x i16> %d.2, ptr %z
  ret void
}

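; The four contiguous loads should be folded into a single 16-element load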
define void @widen_4xv4i16(ptr %x, ptr %z) {
; CHECK-LABEL: widen_4xv4i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    vse16.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 8
  %b = load <4 x i16>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 8
  %c = load <4 x i16>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 8
  %d = load <4 x i16>, ptr %d.gep
  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i16> %e.2, ptr %z
  ret void
}

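; The unaligned loads are only folded into one wide load when fast unaligned
; access is available; otherwise they are loaded as bytes and concatenated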
define void @widen_4xv4i16_unaligned(ptr %x, ptr %z) {
; CHECK-NO-MISALIGN-LABEL: widen_4xv4i16_unaligned:
; CHECK-NO-MISALIGN:       # %bb.0:
; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NO-MISALIGN-NEXT:    vle8.v v8, (a0)
; CHECK-NO-MISALIGN-NEXT:    addi a2, a0, 8
; CHECK-NO-MISALIGN-NEXT:    vle8.v v10, (a2)
; CHECK-NO-MISALIGN-NEXT:    addi a2, a0, 16
; CHECK-NO-MISALIGN-NEXT:    vle8.v v12, (a2)
; CHECK-NO-MISALIGN-NEXT:    addi a0, a0, 24
; CHECK-NO-MISALIGN-NEXT:    vle8.v v14, (a0)
; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v10, 4
; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v12, 8
; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v14, 12
; CHECK-NO-MISALIGN-NEXT:    vse16.v v8, (a1)
; CHECK-NO-MISALIGN-NEXT:    ret
;
; RV64-MISALIGN-LABEL: widen_4xv4i16_unaligned:
; RV64-MISALIGN:       # %bb.0:
; RV64-MISALIGN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; RV64-MISALIGN-NEXT:    vle16.v v8, (a0)
; RV64-MISALIGN-NEXT:    vse16.v v8, (a1)
; RV64-MISALIGN-NEXT:    ret
  %a = load <4 x i16>, ptr %x, align 1
  %b.gep = getelementptr i8, ptr %x, i64 8
  %b = load <4 x i16>, ptr %b.gep, align 1
  %c.gep = getelementptr i8, ptr %b.gep, i64 8
  %c = load <4 x i16>, ptr %c.gep, align 1
  %d.gep = getelementptr i8, ptr %c.gep, i64 8
  %d = load <4 x i16>, ptr %d.gep, align 1
  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i16> %e.2, ptr %z
  ret void
}

; Should be a strided load - with type coercion to i64
define void @strided_constant(ptr %x, ptr %z) {
; CHECK-LABEL: strided_constant:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a2, 16
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 16
  %b = load <4 x i16>, ptr %b.gep
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Should be a strided load
define void @strided_constant_64(ptr %x, ptr %z) {
; CHECK-LABEL: strided_constant_64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a2, 64
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 64
  %b = load <4 x i16>, ptr %b.gep
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Vector is too large to fit into a single strided load
define void @strided_constant_v4i32(ptr %x, ptr %z) {
; CHECK-LABEL: strided_constant_v4i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 32
; CHECK-NEXT:    vle32.v v10, (a0)
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vslideup.vi v8, v10, 4
; CHECK-NEXT:    vse32.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i32>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 32
  %b = load <4 x i32>, ptr %b.gep
  %c = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i32> %c, ptr %z
  ret void
}

; Interestingly, can be a stride 0 load
define void @strided_constant_0(ptr %x, ptr %z) {
; CHECK-LABEL: strided_constant_0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT:    vmv1r.v v9, v8
; CHECK-NEXT:    vslideup.vi v9, v8, 4
; CHECK-NEXT:    vse16.v v9, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b = load <4 x i16>, ptr %x
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Stride isn't consistent, so shouldn't be combined
define void @strided_constant_mismatch_4xv4i16(ptr %x, ptr %z) {
; CHECK-LABEL: strided_constant_mismatch_4xv4i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    addi a2, a0, 2
; CHECK-NEXT:    vle16.v v10, (a2)
; CHECK-NEXT:    addi a2, a0, 6
; CHECK-NEXT:    vle16.v v12, (a2)
; CHECK-NEXT:    addi a0, a0, 8
; CHECK-NEXT:    vle16.v v14, (a0)
; CHECK-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
; CHECK-NEXT:    vslideup.vi v8, v10, 4
; CHECK-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
; CHECK-NEXT:    vslideup.vi v8, v12, 8
; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NEXT:    vslideup.vi v8, v14, 12
; CHECK-NEXT:    vse16.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 2
  %b = load <4 x i16>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 4
  %c = load <4 x i16>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 2
  %d = load <4 x i16>, ptr %d.gep
  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i16> %e.2, ptr %z
  ret void
}

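; A runtime stride should also become a strided load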
define void @strided_runtime(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_runtime:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

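; Same, with four loads sharing the runtime stride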
define void @strided_runtime_4xv4i16(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_runtime_4xv4i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 %s
  %c = load <4 x i16>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 %s
  %d = load <4 x i16>, ptr %d.gep
  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i16> %e.2, ptr %z
  ret void
}

; Stride isn't consistent, so shouldn't be combined
define void @strided_runtime_mismatch_4xv4i16(ptr %x, ptr %z, i64 %s, i64 %t) {
; RV32-LABEL: strided_runtime_mismatch_4xv4i16:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; RV32-NEXT:    vle16.v v8, (a0)
; RV32-NEXT:    add a0, a0, a2
; RV32-NEXT:    vle16.v v10, (a0)
; RV32-NEXT:    add a0, a0, a4
; RV32-NEXT:    vle16.v v12, (a0)
; RV32-NEXT:    add a0, a0, a2
; RV32-NEXT:    vle16.v v14, (a0)
; RV32-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
; RV32-NEXT:    vslideup.vi v8, v10, 4
; RV32-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
; RV32-NEXT:    vslideup.vi v8, v12, 8
; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; RV32-NEXT:    vslideup.vi v8, v14, 12
; RV32-NEXT:    vse16.v v8, (a1)
; RV32-NEXT:    ret
;
; RV64-LABEL: strided_runtime_mismatch_4xv4i16:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; RV64-NEXT:    vle16.v v8, (a0)
; RV64-NEXT:    add a0, a0, a2
; RV64-NEXT:    vle16.v v10, (a0)
; RV64-NEXT:    add a0, a0, a3
; RV64-NEXT:    vle16.v v12, (a0)
; RV64-NEXT:    add a0, a0, a2
; RV64-NEXT:    vle16.v v14, (a0)
; RV64-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
; RV64-NEXT:    vslideup.vi v8, v10, 4
; RV64-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
; RV64-NEXT:    vslideup.vi v8, v12, 8
; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; RV64-NEXT:    vslideup.vi v8, v14, 12
; RV64-NEXT:    vse16.v v8, (a1)
; RV64-NEXT:    ret
;
; ZVE64F-LABEL: strided_runtime_mismatch_4xv4i16:
; ZVE64F:       # %bb.0:
; ZVE64F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; ZVE64F-NEXT:    vle16.v v8, (a0)
; ZVE64F-NEXT:    add a0, a0, a2
; ZVE64F-NEXT:    vle16.v v10, (a0)
; ZVE64F-NEXT:    add a0, a0, a3
; ZVE64F-NEXT:    vle16.v v12, (a0)
; ZVE64F-NEXT:    add a0, a0, a2
; ZVE64F-NEXT:    vle16.v v14, (a0)
; ZVE64F-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
; ZVE64F-NEXT:    vslideup.vi v8, v10, 4
; ZVE64F-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
; ZVE64F-NEXT:    vslideup.vi v8, v12, 8
; ZVE64F-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; ZVE64F-NEXT:    vslideup.vi v8, v14, 12
; ZVE64F-NEXT:    vse16.v v8, (a1)
; ZVE64F-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 %t
  %c = load <4 x i16>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 %s
  %d = load <4 x i16>, ptr %d.gep
  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i16> %e.2, ptr %z
  ret void
}

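; The combine also applies to half vectors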
define void @strided_runtime_4xv4f16(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_runtime_4xv4f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x half>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x half>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 %s
  %c = load <4 x half>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 %s
  %d = load <4 x half>, ptr %d.gep
  %e.0 = shufflevector <4 x half> %a, <4 x half> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x half> %c, <4 x half> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x half> %e.0, <8 x half> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x half> %e.2, ptr %z
  ret void
}

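; Likewise for <2 x float> vectors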
define void @strided_runtime_4xv2f32(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_runtime_4xv2f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <2 x float>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <2 x float>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 %s
  %c = load <2 x float>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 %s
  %d = load <2 x float>, ptr %d.gep
  %e.0 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.1 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.2 = shufflevector <4 x float> %e.0, <4 x float> %e.1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %e.2, ptr %z
  ret void
}

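; Unaligned loads with a runtime stride should only be combined into a strided
; load when fast unaligned access is available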
define void @strided_unaligned(ptr %x, ptr %z, i64 %s) {
; CHECK-NO-MISALIGN-LABEL: strided_unaligned:
; CHECK-NO-MISALIGN:       # %bb.0:
; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NO-MISALIGN-NEXT:    vle8.v v8, (a0)
; CHECK-NO-MISALIGN-NEXT:    add a0, a0, a2
; CHECK-NO-MISALIGN-NEXT:    vle8.v v9, (a0)
; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v9, 4
; CHECK-NO-MISALIGN-NEXT:    vse16.v v8, (a1)
; CHECK-NO-MISALIGN-NEXT:    ret
;
; RV64-MISALIGN-LABEL: strided_unaligned:
; RV64-MISALIGN:       # %bb.0:
; RV64-MISALIGN-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV64-MISALIGN-NEXT:    vlse64.v v8, (a0), a2
; RV64-MISALIGN-NEXT:    vse64.v v8, (a1)
; RV64-MISALIGN-NEXT:    ret
  %a = load <4 x i16>, ptr %x, align 1
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep, align 1
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Should use the most restrictive common alignment
define void @strided_mismatched_alignments(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_mismatched_alignments:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x, align 8
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep, align 16
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

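; 8-byte alignment is enough for the combined 64-bit strided load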
define void @strided_ok_alignments_8(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_ok_alignments_8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x, align 8
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep, align 8
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

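; Same with 16-byte alignment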
define void @strided_ok_alignments_16(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_ok_alignments_16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x, align 16
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep, align 16
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Shouldn't be combined because one of the loads is not simple
define void @strided_non_simple_load(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_non_simple_load:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    add a0, a0, a2
; CHECK-NEXT:    vle16.v v9, (a0)
; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT:    vslideup.vi v8, v9, 4
; CHECK-NEXT:    vse16.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load volatile <4 x i16>, ptr %b.gep
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Shouldn't be combined because one of the operands is not a load
define void @strided_non_load(ptr %x, ptr %z, <4 x i16> %b) {
; CHECK-LABEL: strided_non_load:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle16.v v9, (a0)
; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT:    vslideup.vi v9, v8, 4
; CHECK-NEXT:    vse16.v v9, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

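; Constant negative offsets should become a strided load with a negative stride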
define void @strided_constant_neg_4xv2f32(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_constant_neg_4xv2f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a2, -64
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <2 x float>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 -64
  %b = load <2 x float>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 -64
  %c = load <2 x float>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 -64
  %d = load <2 x float>, ptr %d.gep
  %e.0 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.1 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.2 = shufflevector <4 x float> %e.0, <4 x float> %e.1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %e.2, ptr %z
  ret void
}

; This is a strided load with a negative stride
define void @reverse_strided_constant_pos_4xv2f32(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: reverse_strided_constant_pos_4xv2f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    addi a0, a0, 192
; CHECK-NEXT:    li a2, -64
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %x.1 = getelementptr i8, ptr %x, i64 64
  %x.2 = getelementptr i8, ptr %x.1, i64 64
  %x.3 = getelementptr i8, ptr %x.2, i64 64
  %a = load <2 x float>, ptr %x.3
  %b = load <2 x float>, ptr %x.2
  %c = load <2 x float>, ptr %x.1
  %d = load <2 x float>, ptr %x
  %e.0 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.1 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.2 = shufflevector <4 x float> %e.0, <4 x float> %e.1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %e.2, ptr %z
  ret void
}

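; Loading in reverse with negative offsets gives a positive stride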
define void @reverse_strided_constant_neg_4xv2f32(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: reverse_strided_constant_neg_4xv2f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    addi a0, a0, -192
; CHECK-NEXT:    li a2, 64
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %x.1 = getelementptr i8, ptr %x, i64 -64
  %x.2 = getelementptr i8, ptr %x.1, i64 -64
  %x.3 = getelementptr i8, ptr %x.2, i64 -64
  %a = load <2 x float>, ptr %x.3
  %b = load <2 x float>, ptr %x.2
  %c = load <2 x float>, ptr %x.1
  %d = load <2 x float>, ptr %x
  %e.0 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.1 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.2 = shufflevector <4 x float> %e.0, <4 x float> %e.1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %e.2, ptr %z
  ret void
}

; This is a strided load with a negative stride
define void @reverse_strided_runtime_4xv2f32(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: reverse_strided_runtime_4xv2f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    add a0, a0, a2
; CHECK-NEXT:    add a3, a2, a2
; CHECK-NEXT:    add a0, a0, a3
; CHECK-NEXT:    neg a2, a2
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %x.1 = getelementptr i8, ptr %x, i64 %s
  %x.2 = getelementptr i8, ptr %x.1, i64 %s
  %x.3 = getelementptr i8, ptr %x.2, i64 %s
  %a = load <2 x float>, ptr %x.3
  %b = load <2 x float>, ptr %x.2
  %c = load <2 x float>, ptr %x.1
  %d = load <2 x float>, ptr %x
  %e.0 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.1 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.2 = shufflevector <4 x float> %e.0, <4 x float> %e.1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %e.2, ptr %z
  ret void
}