; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=riscv32 -mattr=+v,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,RV32
; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,RV64
; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+unaligned-vector-mem -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV64,RV64-MISALIGN
; RUN: llc -mtriple=riscv64 -mattr=+f,+zfh,+zve64f,+zvl128b,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,ZVE64F

; The two loads are contiguous and should be folded into one
define void @widen_2xv4i16(ptr %x, ptr %z) {
; CHECK-LABEL: widen_2xv4i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vle64.v v8, (a0)
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 8
  %b = load <4 x i16>, ptr %b.gep
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

define void @widen_3xv4i16(ptr %x, ptr %z) {
; CHECK-LABEL: widen_3xv4i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    addi a2, a0, 8
; CHECK-NEXT:    vle16.v v9, (a2)
; CHECK-NEXT:    addi a0, a0, 16
; CHECK-NEXT:    vle16.v v10, (a0)
; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT:    vslideup.vi v8, v9, 4
; CHECK-NEXT:    vsetivli zero, 12, e16, m2, ta, ma
; CHECK-NEXT:    vslideup.vi v8, v10, 8
; CHECK-NEXT:    vse16.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 8
  %b = load <4 x i16>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 8
  %c = load <4 x i16>, ptr %c.gep
  %d.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %d.1 = shufflevector <4 x i16> %c, <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
  %d.2 = shufflevector <8 x i16> %d.0, <8 x i16> %d.1, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  store <12 x i16> %d.2, ptr %z
  ret void
}

define void @widen_4xv4i16(ptr %x, ptr %z) {
; CHECK-LABEL: widen_4xv4i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vle64.v v8, (a0)
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 8
  %b = load <4 x i16>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 8
  %c = load <4 x i16>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 8
  %d = load <4 x i16>, ptr %d.gep
  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i16> %e.2, ptr %z
  ret void
}

define void @widen_4xv4i16_unaligned(ptr %x, ptr %z) {
; CHECK-NO-MISALIGN-LABEL: widen_4xv4i16_unaligned:
; CHECK-NO-MISALIGN:       # %bb.0:
; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NO-MISALIGN-NEXT:    vle8.v v8, (a0)
; CHECK-NO-MISALIGN-NEXT:    addi a2, a0, 16
; CHECK-NO-MISALIGN-NEXT:    vle8.v v10, (a2)
; CHECK-NO-MISALIGN-NEXT:    addi a2, a0, 8
; CHECK-NO-MISALIGN-NEXT:    addi a0, a0, 24
; CHECK-NO-MISALIGN-NEXT:    vle8.v v9, (a0)
; CHECK-NO-MISALIGN-NEXT:    vle8.v v11, (a2)
; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v10, v9, 4
; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v11, 4
; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v10, 8
; CHECK-NO-MISALIGN-NEXT:    vse16.v v8, (a1)
; CHECK-NO-MISALIGN-NEXT:    ret
;
; RV64-MISALIGN-LABEL: widen_4xv4i16_unaligned:
; RV64-MISALIGN:       # %bb.0:
; RV64-MISALIGN-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; RV64-MISALIGN-NEXT:    vle64.v v8, (a0)
; RV64-MISALIGN-NEXT:    vse64.v v8, (a1)
; RV64-MISALIGN-NEXT:    ret
  %a = load <4 x i16>, ptr %x, align 1
  %b.gep = getelementptr i8, ptr %x, i64 8
  %b = load <4 x i16>, ptr %b.gep, align 1
  %c.gep = getelementptr i8, ptr %b.gep, i64 8
  %c = load <4 x i16>, ptr %c.gep, align 1
  %d.gep = getelementptr i8, ptr %c.gep, i64 8
  %d = load <4 x i16>, ptr %d.gep, align 1
  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i16> %e.2, ptr %z
  ret void
}

; Should be a strided load - with type coercion to i64
define void @strided_constant(ptr %x, ptr %z) {
; CHECK-LABEL: strided_constant:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a2, 16
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 16
  %b = load <4 x i16>, ptr %b.gep
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Should be a strided load
define void @strided_constant_64(ptr %x, ptr %z) {
; CHECK-LABEL: strided_constant_64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a2, 64
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 64
  %b = load <4 x i16>, ptr %b.gep
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Vector is too large to fit into a single strided load
define void @strided_constant_v4i32(ptr %x, ptr %z) {
; CHECK-LABEL: strided_constant_v4i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 32
; CHECK-NEXT:    vle32.v v10, (a0)
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vslideup.vi v8, v10, 4
; CHECK-NEXT:    vse32.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i32>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 32
  %b = load <4 x i32>, ptr %b.gep
  %c = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i32> %c, ptr %z
  ret void
}

; Interestingly, can be a stride 0 load
define void @strided_constant_0(ptr %x, ptr %z) {
; CHECK-LABEL: strided_constant_0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    vmv1r.v v9, v8
; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT:    vslideup.vi v9, v8, 4
; CHECK-NEXT:    vse16.v v9, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b = load <4 x i16>, ptr %x
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Stride isn't consistent, so shouldn't be combined
define void @strided_constant_mismatch_4xv4i16(ptr %x, ptr %z) {
; CHECK-LABEL: strided_constant_mismatch_4xv4i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    addi a2, a0, 6
; CHECK-NEXT:    vle16.v v10, (a2)
; CHECK-NEXT:    addi a2, a0, 2
; CHECK-NEXT:    addi a0, a0, 8
; CHECK-NEXT:    vle16.v v9, (a0)
; CHECK-NEXT:    vle16.v v11, (a2)
; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT:    vslideup.vi v10, v9, 4
; CHECK-NEXT:    vslideup.vi v8, v11, 4
; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NEXT:    vslideup.vi v8, v10, 8
; CHECK-NEXT:    vse16.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 2
  %b = load <4 x i16>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 4
  %c = load <4 x i16>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 2
  %d = load <4 x i16>, ptr %d.gep
  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i16> %e.2, ptr %z
  ret void
}

define void @strided_runtime(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_runtime:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

define void @strided_runtime_4xv4i16(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_runtime_4xv4i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 %s
  %c = load <4 x i16>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 %s
  %d = load <4 x i16>, ptr %d.gep
  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i16> %e.2, ptr %z
  ret void
}

; Stride isn't consistent, so shouldn't be combined
define void @strided_runtime_mismatch_4xv4i16(ptr %x, ptr %z, i64 %s, i64 %t) {
; RV32-LABEL: strided_runtime_mismatch_4xv4i16:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; RV32-NEXT:    vle16.v v8, (a0)
; RV32-NEXT:    add a0, a0, a2
; RV32-NEXT:    add a4, a0, a4
; RV32-NEXT:    vle16.v v10, (a4)
; RV32-NEXT:    add a2, a4, a2
; RV32-NEXT:    vle16.v v9, (a2)
; RV32-NEXT:    vle16.v v11, (a0)
; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; RV32-NEXT:    vslideup.vi v10, v9, 4
; RV32-NEXT:    vslideup.vi v8, v11, 4
; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; RV32-NEXT:    vslideup.vi v8, v10, 8
; RV32-NEXT:    vse16.v v8, (a1)
; RV32-NEXT:    ret
;
; RV64-LABEL: strided_runtime_mismatch_4xv4i16:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; RV64-NEXT:    vle16.v v8, (a0)
; RV64-NEXT:    add a0, a0, a2
; RV64-NEXT:    add a3, a0, a3
; RV64-NEXT:    vle16.v v10, (a3)
; RV64-NEXT:    add a2, a3, a2
; RV64-NEXT:    vle16.v v9, (a2)
; RV64-NEXT:    vle16.v v11, (a0)
; RV64-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; RV64-NEXT:    vslideup.vi v10, v9, 4
; RV64-NEXT:    vslideup.vi v8, v11, 4
; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; RV64-NEXT:    vslideup.vi v8, v10, 8
; RV64-NEXT:    vse16.v v8, (a1)
; RV64-NEXT:    ret
;
; ZVE64F-LABEL: strided_runtime_mismatch_4xv4i16:
; ZVE64F:       # %bb.0:
; ZVE64F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; ZVE64F-NEXT:    vle16.v v8, (a0)
; ZVE64F-NEXT:    add a0, a0, a2
; ZVE64F-NEXT:    add a3, a0, a3
; ZVE64F-NEXT:    vle16.v v10, (a3)
; ZVE64F-NEXT:    add a2, a3, a2
; ZVE64F-NEXT:    vle16.v v9, (a2)
; ZVE64F-NEXT:    vle16.v v11, (a0)
; ZVE64F-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; ZVE64F-NEXT:    vslideup.vi v10, v9, 4
; ZVE64F-NEXT:    vslideup.vi v8, v11, 4
; ZVE64F-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; ZVE64F-NEXT:    vslideup.vi v8, v10, 8
; ZVE64F-NEXT:    vse16.v v8, (a1)
; ZVE64F-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 %t
  %c = load <4 x i16>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 %s
  %d = load <4 x i16>, ptr %d.gep
  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i16> %e.2, ptr %z
  ret void
}

define void @strided_runtime_4xv4f16(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_runtime_4xv4f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x half>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x half>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 %s
  %c = load <4 x half>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 %s
  %d = load <4 x half>, ptr %d.gep
  %e.0 = shufflevector <4 x half> %a, <4 x half> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x half> %c, <4 x half> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x half> %e.0, <8 x half> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x half> %e.2, ptr %z
  ret void
}

define void @strided_runtime_4xv2f32(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_runtime_4xv2f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <2 x float>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <2 x float>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 %s
  %c = load <2 x float>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 %s
  %d = load <2 x float>, ptr %d.gep
  %e.0 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.1 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.2 = shufflevector <4 x float> %e.0, <4 x float> %e.1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %e.2, ptr %z
  ret void
}

define void @strided_unaligned(ptr %x, ptr %z, i64 %s) {
; CHECK-NO-MISALIGN-LABEL: strided_unaligned:
; CHECK-NO-MISALIGN:       # %bb.0:
; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NO-MISALIGN-NEXT:    vle8.v v8, (a0)
; CHECK-NO-MISALIGN-NEXT:    add a0, a0, a2
; CHECK-NO-MISALIGN-NEXT:    vle8.v v9, (a0)
; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v9, 4
; CHECK-NO-MISALIGN-NEXT:    vse16.v v8, (a1)
; CHECK-NO-MISALIGN-NEXT:    ret
;
; RV64-MISALIGN-LABEL: strided_unaligned:
; RV64-MISALIGN:       # %bb.0:
; RV64-MISALIGN-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV64-MISALIGN-NEXT:    vlse64.v v8, (a0), a2
; RV64-MISALIGN-NEXT:    vse64.v v8, (a1)
; RV64-MISALIGN-NEXT:    ret
  %a = load <4 x i16>, ptr %x, align 1
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep, align 1
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Should use the most restrictive common alignment
define void @strided_mismatched_alignments(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_mismatched_alignments:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x, align 8
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep, align 16
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

define void @strided_ok_alignments_8(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_ok_alignments_8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x, align 8
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep, align 8
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

define void @strided_ok_alignments_16(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_ok_alignments_16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x, align 16
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep, align 16
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Shouldn't be combined because one of the loads is not simple
define void @strided_non_simple_load(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_non_simple_load:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    add a0, a0, a2
; CHECK-NEXT:    vle16.v v9, (a0)
; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT:    vslideup.vi v8, v9, 4
; CHECK-NEXT:    vse16.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load volatile <4 x i16>, ptr %b.gep
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Shouldn't be combined because one of the operands is not a load
define void @strided_non_load(ptr %x, ptr %z, <4 x i16> %b) {
; CHECK-LABEL: strided_non_load:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle16.v v9, (a0)
; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT:    vslideup.vi v9, v8, 4
; CHECK-NEXT:    vse16.v v9, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

define void @strided_constant_neg_4xv2f32(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_constant_neg_4xv2f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a2, -64
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <2 x float>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 -64
  %b = load <2 x float>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 -64
  %c = load <2 x float>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 -64
  %d = load <2 x float>, ptr %d.gep
  %e.0 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.1 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.2 = shufflevector <4 x float> %e.0, <4 x float> %e.1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %e.2, ptr %z
  ret void
}

; This is a strided load with a negative stride
define void @reverse_strided_constant_pos_4xv2f32(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: reverse_strided_constant_pos_4xv2f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    addi a0, a0, 192
; CHECK-NEXT:    li a2, -64
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %x.1 = getelementptr i8, ptr %x, i64 64
  %x.2 = getelementptr i8, ptr %x.1, i64 64
  %x.3 = getelementptr i8, ptr %x.2, i64 64
  %a = load <2 x float>, ptr %x.3
  %b = load <2 x float>, ptr %x.2
  %c = load <2 x float>, ptr %x.1
  %d = load <2 x float>, ptr %x
  %e.0 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.1 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.2 = shufflevector <4 x float> %e.0, <4 x float> %e.1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %e.2, ptr %z
  ret void
}

define void @reverse_strided_constant_neg_4xv2f32(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: reverse_strided_constant_neg_4xv2f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    addi a0, a0, -192
; CHECK-NEXT:    li a2, 64
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %x.1 = getelementptr i8, ptr %x, i64 -64
  %x.2 = getelementptr i8, ptr %x.1, i64 -64
  %x.3 = getelementptr i8, ptr %x.2, i64 -64
  %a = load <2 x float>, ptr %x.3
  %b = load <2 x float>, ptr %x.2
  %c = load <2 x float>, ptr %x.1
  %d = load <2 x float>, ptr %x
  %e.0 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.1 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.2 = shufflevector <4 x float> %e.0, <4 x float> %e.1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %e.2, ptr %z
  ret void
}

; This is a strided load with a negative stride
define void @reverse_strided_runtime_4xv2f32(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: reverse_strided_runtime_4xv2f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    add a0, a0, a2
; CHECK-NEXT:    add a3, a2, a2
; CHECK-NEXT:    add a0, a0, a3
; CHECK-NEXT:    neg a2, a2
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %x.1 = getelementptr i8, ptr %x, i64 %s
  %x.2 = getelementptr i8, ptr %x.1, i64 %s
  %x.3 = getelementptr i8, ptr %x.2, i64 %s
  %a = load <2 x float>, ptr %x.3
  %b = load <2 x float>, ptr %x.2
  %c = load <2 x float>, ptr %x.1
  %d = load <2 x float>, ptr %x
  %e.0 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.1 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.2 = shufflevector <4 x float> %e.0, <4 x float> %e.1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %e.2, ptr %z
  ret void
}

; The middle end sometimes produces this pattern of shuffles, where the
; intermediate shuffles are the full result vector size padded with poison
; elements.
define <16 x i8> @widen_4xv4i8_immediate_expand(ptr %p, i64 %s) {
; CHECK-LABEL: widen_4xv4i8_immediate_expand:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vlse32.v v8, (a0), a1
; CHECK-NEXT:    ret
  %a = load <4 x i8>, ptr %p
  %b.ptr = getelementptr i8, ptr %p, i64 %s
  %b = load <4 x i8>, ptr %b.ptr
  %c.ptr = getelementptr i8, ptr %b.ptr, i64 %s
  %c = load <4 x i8>, ptr %c.ptr
  %d.ptr = getelementptr i8, ptr %c.ptr, i64 %s
  %d = load <4 x i8>, ptr %d.ptr

  %ab = shufflevector <4 x i8> %a, <4 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %cx = shufflevector <4 x i8> %c, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %dx = shufflevector <4 x i8> %d, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %abcx = shufflevector <16 x i8> %ab, <16 x i8> %cx, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
  %abcd = shufflevector <16 x i8> %abcx, <16 x i8> %dx, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
  ret <16 x i8> %abcd
}