; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,RV32
; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,RV64
; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+unaligned-vector-mem -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV64,RV64-MISALIGN
; RUN: llc -mtriple=riscv32 -mattr=+f,+zfh,+zve64f,+zvl128b,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,ZVE64F
; RUN: llc -mtriple=riscv64 -mattr=+f,+zfh,+zve64f,+zvl128b,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,ZVE64F

; The two loads are contiguous and should be folded into one
define void @widen_2xv4i16(ptr %x, ptr %z) {
; CHECK-LABEL: widen_2xv4i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: vse16.v v8, (a1)
; CHECK-NEXT: ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 8
  %b = load <4 x i16>, ptr %b.gep
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}
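
; These loads are contiguous too, but the odd-sized 12 x i16 result is
; currently assembled from three small loads and slideups rather than folded
; into one wide load.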
define void @widen_3xv4i16(ptr %x, ptr %z) {
; CHECK-LABEL: widen_3xv4i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: addi a2, a0, 8
; CHECK-NEXT: vle16.v v9, (a2)
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vle16.v v10, (a0)
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 4
; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v10, 8
; CHECK-NEXT: vsetivli zero, 12, e16, m2, ta, ma
; CHECK-NEXT: vse16.v v8, (a1)
; CHECK-NEXT: ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 8
  %b = load <4 x i16>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 8
  %c = load <4 x i16>, ptr %c.gep
  %d.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %d.1 = shufflevector <4 x i16> %c, <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
  %d.2 = shufflevector <8 x i16> %d.0, <8 x i16> %d.1, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  store <12 x i16> %d.2, ptr %z
  ret void
}
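
; All four contiguous loads fold into a single 16 x i16 wide load.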
define void @widen_4xv4i16(ptr %x, ptr %z) {
; CHECK-LABEL: widen_4xv4i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: vse16.v v8, (a1)
; CHECK-NEXT: ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 8
  %b = load <4 x i16>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 8
  %c = load <4 x i16>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 8
  %d = load <4 x i16>, ptr %d.gep
  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i16> %e.2, ptr %z
  ret void
}
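
; The same pattern with align 1 loads only folds into one wide load when
; unaligned vector memory accesses are available (+unaligned-vector-mem).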
define void @widen_4xv4i16_unaligned(ptr %x, ptr %z) {
; CHECK-NO-MISALIGN-LABEL: widen_4xv4i16_unaligned:
; CHECK-NO-MISALIGN: # %bb.0:
; CHECK-NO-MISALIGN-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NO-MISALIGN-NEXT: vle8.v v8, (a0)
; CHECK-NO-MISALIGN-NEXT: addi a2, a0, 8
; CHECK-NO-MISALIGN-NEXT: addi a3, a0, 16
; CHECK-NO-MISALIGN-NEXT: vle8.v v10, (a3)
; CHECK-NO-MISALIGN-NEXT: addi a0, a0, 24
; CHECK-NO-MISALIGN-NEXT: vle8.v v9, (a0)
; CHECK-NO-MISALIGN-NEXT: vle8.v v11, (a2)
; CHECK-NO-MISALIGN-NEXT: vslideup.vi v10, v9, 4
; CHECK-NO-MISALIGN-NEXT: vslideup.vi v8, v11, 4
; CHECK-NO-MISALIGN-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NO-MISALIGN-NEXT: vslideup.vi v8, v10, 8
; CHECK-NO-MISALIGN-NEXT: vse16.v v8, (a1)
; CHECK-NO-MISALIGN-NEXT: ret
;
; RV64-MISALIGN-LABEL: widen_4xv4i16_unaligned:
; RV64-MISALIGN: # %bb.0:
; RV64-MISALIGN-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; RV64-MISALIGN-NEXT: vle16.v v8, (a0)
; RV64-MISALIGN-NEXT: vse16.v v8, (a1)
; RV64-MISALIGN-NEXT: ret
  %a = load <4 x i16>, ptr %x, align 1
  %b.gep = getelementptr i8, ptr %x, i64 8
  %b = load <4 x i16>, ptr %b.gep, align 1
  %c.gep = getelementptr i8, ptr %b.gep, i64 8
  %c = load <4 x i16>, ptr %c.gep, align 1
  %d.gep = getelementptr i8, ptr %c.gep, i64 8
  %d = load <4 x i16>, ptr %d.gep, align 1
  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i16> %e.2, ptr %z
  ret void
}

; Should be a strided load - with type coercion to i64
define void @strided_constant(ptr %x, ptr %z) {
; CHECK-LABEL: strided_constant:
; CHECK: # %bb.0:
; CHECK-NEXT: li a2, 16
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT: vlse64.v v8, (a0), a2
; CHECK-NEXT: vse64.v v8, (a1)
; CHECK-NEXT: ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 16
  %b = load <4 x i16>, ptr %b.gep
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Should be a strided load
define void @strided_constant_64(ptr %x, ptr %z) {
; CHECK-LABEL: strided_constant_64:
; CHECK: # %bb.0:
; CHECK-NEXT: li a2, 64
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT: vlse64.v v8, (a0), a2
; CHECK-NEXT: vse64.v v8, (a1)
; CHECK-NEXT: ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 64
  %b = load <4 x i16>, ptr %b.gep
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Vector is too large to fit into a single strided load
define void @strided_constant_v4i32(ptr %x, ptr %z) {
; CHECK-LABEL: strided_constant_v4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: addi a0, a0, 32
; CHECK-NEXT: vle32.v v10, (a0)
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v10, 4
; CHECK-NEXT: vse32.v v8, (a1)
; CHECK-NEXT: ret
  %a = load <4 x i32>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 32
  %b = load <4 x i32>, ptr %b.gep
  %c = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i32> %c, ptr %z
  ret void
}

; Interestingly, can be a stride 0 load
define void @strided_constant_0(ptr %x, ptr %z) {
; CHECK-LABEL: strided_constant_0:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: vmv1r.v v9, v8
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vi v9, v8, 4
; CHECK-NEXT: vse16.v v9, (a1)
; CHECK-NEXT: ret
  %a = load <4 x i16>, ptr %x
  %b = load <4 x i16>, ptr %x
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Stride isn't consistent, so shouldn't be combined
define void @strided_constant_mismatch_4xv4i16(ptr %x, ptr %z) {
; CHECK-LABEL: strided_constant_mismatch_4xv4i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: addi a2, a0, 2
; CHECK-NEXT: addi a3, a0, 6
; CHECK-NEXT: vle16.v v10, (a3)
; CHECK-NEXT: addi a0, a0, 8
; CHECK-NEXT: vle16.v v9, (a0)
; CHECK-NEXT: vle16.v v11, (a2)
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vi v10, v9, 4
; CHECK-NEXT: vslideup.vi v8, v11, 4
; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v10, 8
; CHECK-NEXT: vse16.v v8, (a1)
; CHECK-NEXT: ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 2
  %b = load <4 x i16>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 4
  %c = load <4 x i16>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 2
  %d = load <4 x i16>, ptr %d.gep
  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i16> %e.2, ptr %z
  ret void
}
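
; Same as strided_constant, but the stride is only known at runtime.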
define void @strided_runtime(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_runtime:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT: vlse64.v v8, (a0), a2
; CHECK-NEXT: vse64.v v8, (a1)
; CHECK-NEXT: ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}
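
; Four loads sharing the same runtime stride fold into a single strided load.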
define void @strided_runtime_4xv4i16(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_runtime_4xv4i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT: vlse64.v v8, (a0), a2
; CHECK-NEXT: vse64.v v8, (a1)
; CHECK-NEXT: ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 %s
  %c = load <4 x i16>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 %s
  %d = load <4 x i16>, ptr %d.gep
  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i16> %e.2, ptr %z
  ret void
}

; Stride isn't consistent, so shouldn't be combined
define void @strided_runtime_mismatch_4xv4i16(ptr %x, ptr %z, i64 %s, i64 %t) {
; RV32-LABEL: strided_runtime_mismatch_4xv4i16:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; RV32-NEXT: vle16.v v8, (a0)
; RV32-NEXT: add a0, a0, a2
; RV32-NEXT: add a4, a0, a4
; RV32-NEXT: vle16.v v10, (a4)
; RV32-NEXT: add a2, a4, a2
; RV32-NEXT: vle16.v v9, (a2)
; RV32-NEXT: vle16.v v11, (a0)
; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; RV32-NEXT: vslideup.vi v10, v9, 4
; RV32-NEXT: vslideup.vi v8, v11, 4
; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; RV32-NEXT: vslideup.vi v8, v10, 8
; RV32-NEXT: vse16.v v8, (a1)
; RV32-NEXT: ret
;
; RV64-LABEL: strided_runtime_mismatch_4xv4i16:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; RV64-NEXT: vle16.v v8, (a0)
; RV64-NEXT: add a0, a0, a2
; RV64-NEXT: add a3, a0, a3
; RV64-NEXT: vle16.v v10, (a3)
; RV64-NEXT: add a2, a3, a2
; RV64-NEXT: vle16.v v9, (a2)
; RV64-NEXT: vle16.v v11, (a0)
; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; RV64-NEXT: vslideup.vi v10, v9, 4
; RV64-NEXT: vslideup.vi v8, v11, 4
; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; RV64-NEXT: vslideup.vi v8, v10, 8
; RV64-NEXT: vse16.v v8, (a1)
; RV64-NEXT: ret
;
; ZVE64F-LABEL: strided_runtime_mismatch_4xv4i16:
; ZVE64F: # %bb.0:
; ZVE64F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; ZVE64F-NEXT: vle16.v v8, (a0)
; ZVE64F-NEXT: add a0, a0, a2
; ZVE64F-NEXT: add a3, a0, a3
; ZVE64F-NEXT: vle16.v v10, (a3)
; ZVE64F-NEXT: add a2, a3, a2
; ZVE64F-NEXT: vle16.v v9, (a2)
; ZVE64F-NEXT: vle16.v v11, (a0)
; ZVE64F-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; ZVE64F-NEXT: vslideup.vi v10, v9, 4
; ZVE64F-NEXT: vslideup.vi v8, v11, 4
; ZVE64F-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; ZVE64F-NEXT: vslideup.vi v8, v10, 8
; ZVE64F-NEXT: vse16.v v8, (a1)
; ZVE64F-NEXT: ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 %t
  %c = load <4 x i16>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 %s
  %d = load <4 x i16>, ptr %d.gep
  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i16> %e.2, ptr %z
  ret void
}
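
; The combine applies to half vectors as well; each <4 x half> piece is
; coerced to an e64 element.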
define void @strided_runtime_4xv4f16(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_runtime_4xv4f16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT: vlse64.v v8, (a0), a2
; CHECK-NEXT: vse64.v v8, (a1)
; CHECK-NEXT: ret
  %a = load <4 x half>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x half>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 %s
  %c = load <4 x half>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 %s
  %d = load <4 x half>, ptr %d.gep
  %e.0 = shufflevector <4 x half> %a, <4 x half> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x half> %c, <4 x half> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x half> %e.0, <8 x half> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x half> %e.2, ptr %z
  ret void
}
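
; Likewise for <2 x float> pieces, which are also 8 bytes and coerce to e64.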
define void @strided_runtime_4xv2f32(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_runtime_4xv2f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT: vlse64.v v8, (a0), a2
; CHECK-NEXT: vse64.v v8, (a1)
; CHECK-NEXT: ret
  %a = load <2 x float>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <2 x float>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 %s
  %c = load <2 x float>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 %s
  %d = load <2 x float>, ptr %d.gep
  %e.0 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.1 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.2 = shufflevector <4 x float> %e.0, <4 x float> %e.1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %e.2, ptr %z
  ret void
}
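
; Unaligned loads only fold into a single strided load when unaligned vector
; memory accesses are available.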
define void @strided_unaligned(ptr %x, ptr %z, i64 %s) {
; CHECK-NO-MISALIGN-LABEL: strided_unaligned:
; CHECK-NO-MISALIGN: # %bb.0:
; CHECK-NO-MISALIGN-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NO-MISALIGN-NEXT: vle8.v v8, (a0)
; CHECK-NO-MISALIGN-NEXT: add a0, a0, a2
; CHECK-NO-MISALIGN-NEXT: vle8.v v9, (a0)
; CHECK-NO-MISALIGN-NEXT: vslideup.vi v8, v9, 4
; CHECK-NO-MISALIGN-NEXT: vse16.v v8, (a1)
; CHECK-NO-MISALIGN-NEXT: ret
;
; RV64-MISALIGN-LABEL: strided_unaligned:
; RV64-MISALIGN: # %bb.0:
; RV64-MISALIGN-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV64-MISALIGN-NEXT: vlse64.v v8, (a0), a2
; RV64-MISALIGN-NEXT: vse64.v v8, (a1)
; RV64-MISALIGN-NEXT: ret
  %a = load <4 x i16>, ptr %x, align 1
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep, align 1
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Should use the most restrictive common alignment
define void @strided_mismatched_alignments(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_mismatched_alignments:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT: vlse64.v v8, (a0), a2
; CHECK-NEXT: vse64.v v8, (a1)
; CHECK-NEXT: ret
  %a = load <4 x i16>, ptr %x, align 8
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep, align 16
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}
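
; An alignment of 8 is sufficient for the coerced e64 strided load.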
define void @strided_ok_alignments_8(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_ok_alignments_8:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT: vlse64.v v8, (a0), a2
; CHECK-NEXT: vse64.v v8, (a1)
; CHECK-NEXT: ret
  %a = load <4 x i16>, ptr %x, align 8
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep, align 8
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}
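
; Likewise for an alignment of 16.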
define void @strided_ok_alignments_16(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_ok_alignments_16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT: vlse64.v v8, (a0), a2
; CHECK-NEXT: vse64.v v8, (a1)
; CHECK-NEXT: ret
  %a = load <4 x i16>, ptr %x, align 16
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep, align 16
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Shouldn't be combined because one of the loads is not simple
define void @strided_non_simple_load(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_non_simple_load:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: add a0, a0, a2
; CHECK-NEXT: vle16.v v9, (a0)
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 4
; CHECK-NEXT: vse16.v v8, (a1)
; CHECK-NEXT: ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load volatile <4 x i16>, ptr %b.gep
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Shouldn't be combined because one of the operands is not a load
define void @strided_non_load(ptr %x, ptr %z, <4 x i16> %b) {
; CHECK-LABEL: strided_non_load:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT: vle16.v v9, (a0)
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vi v9, v8, 4
; CHECK-NEXT: vse16.v v9, (a1)
; CHECK-NEXT: ret
  %a = load <4 x i16>, ptr %x
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}
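
; A constant negative stride also folds into a single strided load.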
define void @strided_constant_neg_4xv2f32(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_constant_neg_4xv2f32:
; CHECK: # %bb.0:
; CHECK-NEXT: li a2, -64
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT: vlse64.v v8, (a0), a2
; CHECK-NEXT: vse64.v v8, (a1)
; CHECK-NEXT: ret
  %a = load <2 x float>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 -64
  %b = load <2 x float>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 -64
  %c = load <2 x float>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 -64
  %d = load <2 x float>, ptr %d.gep
  %e.0 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.1 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.2 = shufflevector <4 x float> %e.0, <4 x float> %e.1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %e.2, ptr %z
  ret void
}

; This is a strided load with a negative stride
define void @reverse_strided_constant_pos_4xv2f32(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: reverse_strided_constant_pos_4xv2f32:
; CHECK: # %bb.0:
; CHECK-NEXT: addi a0, a0, 192
; CHECK-NEXT: li a2, -64
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT: vlse64.v v8, (a0), a2
; CHECK-NEXT: vse64.v v8, (a1)
; CHECK-NEXT: ret
  %x.1 = getelementptr i8, ptr %x, i64 64
  %x.2 = getelementptr i8, ptr %x.1, i64 64
  %x.3 = getelementptr i8, ptr %x.2, i64 64
  %a = load <2 x float>, ptr %x.3
  %b = load <2 x float>, ptr %x.2
  %c = load <2 x float>, ptr %x.1
  %d = load <2 x float>, ptr %x
  %e.0 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.1 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.2 = shufflevector <4 x float> %e.0, <4 x float> %e.1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %e.2, ptr %z
  ret void
}
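
; With negative offsets, the reversed access pattern yields a positive stride.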
define void @reverse_strided_constant_neg_4xv2f32(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: reverse_strided_constant_neg_4xv2f32:
; CHECK: # %bb.0:
; CHECK-NEXT: addi a0, a0, -192
; CHECK-NEXT: li a2, 64
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT: vlse64.v v8, (a0), a2
; CHECK-NEXT: vse64.v v8, (a1)
; CHECK-NEXT: ret
  %x.1 = getelementptr i8, ptr %x, i64 -64
  %x.2 = getelementptr i8, ptr %x.1, i64 -64
  %x.3 = getelementptr i8, ptr %x.2, i64 -64
  %a = load <2 x float>, ptr %x.3
  %b = load <2 x float>, ptr %x.2
  %c = load <2 x float>, ptr %x.1
  %d = load <2 x float>, ptr %x
  %e.0 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.1 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.2 = shufflevector <4 x float> %e.0, <4 x float> %e.1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %e.2, ptr %z
  ret void
}

; This is a strided load with a negative stride
define void @reverse_strided_runtime_4xv2f32(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: reverse_strided_runtime_4xv2f32:
; CHECK: # %bb.0:
; CHECK-NEXT: add a0, a0, a2
; CHECK-NEXT: add a3, a2, a2
; CHECK-NEXT: add a0, a0, a3
; CHECK-NEXT: neg a2, a2
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT: vlse64.v v8, (a0), a2
; CHECK-NEXT: vse64.v v8, (a1)
; CHECK-NEXT: ret
  %x.1 = getelementptr i8, ptr %x, i64 %s
  %x.2 = getelementptr i8, ptr %x.1, i64 %s
  %x.3 = getelementptr i8, ptr %x.2, i64 %s
  %a = load <2 x float>, ptr %x.3
  %b = load <2 x float>, ptr %x.2
  %c = load <2 x float>, ptr %x.1
  %d = load <2 x float>, ptr %x
  %e.0 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.1 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.2 = shufflevector <4 x float> %e.0, <4 x float> %e.1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %e.2, ptr %z
  ret void
}

; The middle end sometimes produces this pattern of shuffles, where the
; intermediate shuffles are the full result vector size padded with poison
; elements.
define <16 x i8> @widen_4xv4i8_immediate_expand(ptr %p, i64 %s) {
; CHECK-LABEL: widen_4xv4i8_immediate_expand:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vlse32.v v8, (a0), a1
; CHECK-NEXT: ret
  %a = load <4 x i8>, ptr %p
  %b.ptr = getelementptr i8, ptr %p, i64 %s
  %b = load <4 x i8>, ptr %b.ptr
  %c.ptr = getelementptr i8, ptr %b.ptr, i64 %s
  %c = load <4 x i8>, ptr %c.ptr
  %d.ptr = getelementptr i8, ptr %c.ptr, i64 %s
  %d = load <4 x i8>, ptr %d.ptr

  %ab = shufflevector <4 x i8> %a, <4 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %cx = shufflevector <4 x i8> %c, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %dx = shufflevector <4 x i8> %d, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %abcx = shufflevector <16 x i8> %ab, <16 x i8> %cx, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
  %abcd = shufflevector <16 x i8> %abcx, <16 x i8> %dx, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
  ret <16 x i8> %abcd
}