; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvl256b | FileCheck %s --check-prefixes=CHECK,V
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+f,+zve32f,+zvl256b | FileCheck %s --check-prefixes=CHECK,ZVE32F
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+optimized-zero-stride-load,+zvl256b | FileCheck %s --check-prefixes=CHECK,OPTIMIZED
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+f,+zve32f,+optimized-zero-stride-load,+zvl256b | FileCheck %s --check-prefixes=CHECK,OPTIMIZED
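
; The +v runs have ELEN=64; the +zve32f runs have ELEN=32, so i64 elements and
; vectors of pointers are not legal vector types there. The two
; +optimized-zero-stride-load runs check that a broadcast from memory can stay
; a zero-stride vlse instead of becoming a scalar load plus splat.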

%struct.foo = type { i32, i32, i32, i32 }

; void gather(signed char * __restrict A, signed char * __restrict B) {
;   for (int i = 0; i != 1024; ++i)
;       A[i] += B[i * 5];
; }
define void @gather(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addi a2, a0, 1024
; CHECK-NEXT: li a4, 32
; CHECK-NEXT: li a3, 5
; CHECK-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-NEXT: .LBB0_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vlse8.v v8, (a1), a3
; CHECK-NEXT: vle8.v v9, (a0)
; CHECK-NEXT: vadd.vv v8, v9, v8
; CHECK-NEXT: vse8.v v8, (a0)
; CHECK-NEXT: addi a0, a0, 32
; CHECK-NEXT: addi a1, a1, 160
; CHECK-NEXT: bne a0, a2, .LBB0_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
  br label %vector.body

vector.body: ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = add <32 x i8> %wide.load, %wide.masked.gather
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body
  ret void
}
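
; Same pattern as @gather, but with a constant non-uniform mask: the strided
; load stays a vlse8 under v0.t, and inactive lanes keep the maskedoff value
; via the vmv1r copy and the mask-undisturbed (mu) vsetvli policy.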

define void @gather_masked(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, <32 x i8> %maskedoff) {
; CHECK-LABEL: gather_masked:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addi a2, a0, 1024
; CHECK-NEXT: lui a3, 983765
; CHECK-NEXT: addi a3, a3, 873
; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT: vmv.s.x v0, a3
; CHECK-NEXT: li a3, 32
; CHECK-NEXT: li a4, 5
; CHECK-NEXT: .LBB1_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmv1r.v v9, v8
; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, mu
; CHECK-NEXT: vlse8.v v9, (a1), a4, v0.t
; CHECK-NEXT: vle8.v v10, (a0)
; CHECK-NEXT: vadd.vv v9, v10, v9
; CHECK-NEXT: vse8.v v9, (a0)
; CHECK-NEXT: addi a0, a0, 32
; CHECK-NEXT: addi a1, a1, 160
; CHECK-NEXT: bne a0, a2, .LBB1_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
  br label %vector.body

vector.body: ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <32 x i8> %maskedoff)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = add <32 x i8> %wide.load, %wide.masked.gather
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body
  ret void
}
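
; The reversed induction vector (<31, ..., 0>, scaled by 5) becomes a negative
; stride: the base is biased to B+155 (= 31 * 5) and the stride is -5.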

define void @gather_negative_stride(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_negative_stride:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addi a1, a1, 155
; CHECK-NEXT: addi a2, a0, 1024
; CHECK-NEXT: li a4, 32
; CHECK-NEXT: li a3, -5
; CHECK-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-NEXT: .LBB2_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vlse8.v v8, (a1), a3
; CHECK-NEXT: vle8.v v9, (a0)
; CHECK-NEXT: vadd.vv v8, v9, v8
; CHECK-NEXT: vse8.v v8, (a0)
; CHECK-NEXT: addi a0, a0, 32
; CHECK-NEXT: addi a1, a1, 160
; CHECK-NEXT: bne a0, a2, .LBB2_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
  br label %vector.body

vector.body: ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 31, i64 30, i64 29, i64 28, i64 27, i64 26, i64 25, i64 24, i64 23, i64 22, i64 21, i64 20, i64 19, i64 18, i64 17, i64 16, i64 15, i64 14, i64 13, i64 12, i64 11, i64 10, i64 9, i64 8, i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = add <32 x i8> %wide.load, %wide.masked.gather
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body
  ret void
}
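
; Every lane reads the same address. All four configurations fold the
; broadcast into a scalar lbu plus vadd.vx here; contrast
; @gather_zero_stride_unfold, where a non-commutative use blocks the .vx form.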

define void @gather_zero_stride(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_zero_stride:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addi a2, a0, 1024
; CHECK-NEXT: li a3, 32
; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-NEXT: .LBB3_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: lbu a3, 0(a1)
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: vadd.vx v8, v8, a3
; CHECK-NEXT: vse8.v v8, (a0)
; CHECK-NEXT: addi a0, a0, 32
; CHECK-NEXT: addi a1, a1, 160
; CHECK-NEXT: bne a0, a2, .LBB3_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
  br label %vector.body

vector.body: ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = add <32 x i8> %wide.load, %wide.masked.gather
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body
  ret void
}
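
; Same as @gather_zero_stride, with i32 elements: scalar lw plus vadd.vx.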

define void @gather_zero_stride_i32(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_zero_stride_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addi a2, a0, 1024
; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT: .LBB4_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: lw a3, 0(a1)
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vadd.vx v8, v8, a3
; CHECK-NEXT: vse32.v v8, (a0)
; CHECK-NEXT: addi a0, a0, 8
; CHECK-NEXT: addi a1, a1, 160
; CHECK-NEXT: bne a0, a2, .LBB4_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
  br label %vector.body

vector.body: ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <8 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds i8, ptr %B, <8 x i64> %i
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <8 x i32>, ptr %i2, align 4
  %i4 = add <8 x i32> %wide.load, %wide.masked.gather
  store <8 x i32> %i4, ptr %i2, align 4
  %index.next = add nuw i64 %index, 8
  %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body
  ret void
}
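
; With the broadcast on the left of a non-commutative udiv, the splat cannot
; fold into a .vx operand: V and ZVE32F materialize it with vmv.v.x, while the
; +optimized-zero-stride-load runs keep it as a zero-stride vlse8.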

define void @gather_zero_stride_unfold(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; V-LABEL: gather_zero_stride_unfold:
; V: # %bb.0: # %entry
; V-NEXT: addi a2, a0, 1024
; V-NEXT: li a3, 32
; V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; V-NEXT: .LBB5_1: # %vector.body
; V-NEXT: # =>This Inner Loop Header: Depth=1
; V-NEXT: lbu a3, 0(a1)
; V-NEXT: vle8.v v8, (a0)
; V-NEXT: vmv.v.x v9, a3
; V-NEXT: vdivu.vv v8, v9, v8
; V-NEXT: vse8.v v8, (a0)
; V-NEXT: addi a0, a0, 32
; V-NEXT: addi a1, a1, 160
; V-NEXT: bne a0, a2, .LBB5_1
; V-NEXT: # %bb.2: # %for.cond.cleanup
; V-NEXT: ret
;
; ZVE32F-LABEL: gather_zero_stride_unfold:
; ZVE32F: # %bb.0: # %entry
; ZVE32F-NEXT: addi a2, a0, 1024
; ZVE32F-NEXT: li a3, 32
; ZVE32F-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; ZVE32F-NEXT: .LBB5_1: # %vector.body
; ZVE32F-NEXT: # =>This Inner Loop Header: Depth=1
; ZVE32F-NEXT: lbu a3, 0(a1)
; ZVE32F-NEXT: vle8.v v8, (a0)
; ZVE32F-NEXT: vmv.v.x v9, a3
; ZVE32F-NEXT: vdivu.vv v8, v9, v8
; ZVE32F-NEXT: vse8.v v8, (a0)
; ZVE32F-NEXT: addi a0, a0, 32
; ZVE32F-NEXT: addi a1, a1, 160
; ZVE32F-NEXT: bne a0, a2, .LBB5_1
; ZVE32F-NEXT: # %bb.2: # %for.cond.cleanup
; ZVE32F-NEXT: ret
;
; OPTIMIZED-LABEL: gather_zero_stride_unfold:
; OPTIMIZED: # %bb.0: # %entry
; OPTIMIZED-NEXT: addi a2, a0, 1024
; OPTIMIZED-NEXT: li a3, 32
; OPTIMIZED-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; OPTIMIZED-NEXT: .LBB5_1: # %vector.body
; OPTIMIZED-NEXT: # =>This Inner Loop Header: Depth=1
; OPTIMIZED-NEXT: vlse8.v v8, (a1), zero
; OPTIMIZED-NEXT: vle8.v v9, (a0)
; OPTIMIZED-NEXT: vdivu.vv v8, v8, v9
; OPTIMIZED-NEXT: vse8.v v8, (a0)
; OPTIMIZED-NEXT: addi a0, a0, 32
; OPTIMIZED-NEXT: addi a1, a1, 160
; OPTIMIZED-NEXT: bne a0, a2, .LBB5_1
; OPTIMIZED-NEXT: # %bb.2: # %for.cond.cleanup
; OPTIMIZED-NEXT: ret
entry:
  br label %vector.body

vector.body: ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = udiv <32 x i8> %wide.masked.gather, %wide.load
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body
  ret void
}

;void scatter(signed char * __restrict A, signed char * __restrict B) {
;  for (int i = 0; i < 1024; ++i)
;      A[i * 5] += B[i];
;}
define void @scatter(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: scatter:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addi a2, a1, 1024
; CHECK-NEXT: li a4, 32
; CHECK-NEXT: li a3, 5
; CHECK-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-NEXT: .LBB6_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle8.v v8, (a1)
; CHECK-NEXT: vlse8.v v9, (a0), a3
; CHECK-NEXT: vadd.vv v8, v9, v8
; CHECK-NEXT: vsse8.v v8, (a0), a3
; CHECK-NEXT: addi a1, a1, 32
; CHECK-NEXT: addi a0, a0, 160
; CHECK-NEXT: bne a1, a2, .LBB6_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
  br label %vector.body

vector.body: ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = getelementptr inbounds i8, ptr %B, i64 %index
  %wide.load = load <32 x i8>, ptr %i, align 1
  %i2 = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i3 = getelementptr inbounds i8, ptr %A, <32 x i64> %i2
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i3, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
  %i4 = add <32 x i8> %wide.masked.gather, %wide.load
  call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> %i4, <32 x ptr> %i3, i32 1, <32 x i1> splat (i1 true))
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i5 = icmp eq i64 %index.next, 1024
  br i1 %i5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body
  ret void
}
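
; Masked variant of @scatter: both the strided load and the strided store of
; the read-modify-write stay under the v0.t mask.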

define void @scatter_masked(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, <32 x i8> %maskedoff) {
; CHECK-LABEL: scatter_masked:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addi a2, a1, 1024
; CHECK-NEXT: li a3, 32
; CHECK-NEXT: lui a4, 983765
; CHECK-NEXT: addi a4, a4, 873
; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT: vmv.s.x v0, a4
; CHECK-NEXT: li a4, 5
; CHECK-NEXT: .LBB7_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, mu
; CHECK-NEXT: vle8.v v9, (a1)
; CHECK-NEXT: vmv1r.v v10, v8
; CHECK-NEXT: vlse8.v v10, (a0), a4, v0.t
; CHECK-NEXT: vadd.vv v9, v10, v9
; CHECK-NEXT: vsse8.v v9, (a0), a4, v0.t
; CHECK-NEXT: addi a1, a1, 32
; CHECK-NEXT: addi a0, a0, 160
; CHECK-NEXT: bne a1, a2, .LBB7_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
  br label %vector.body

vector.body: ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = getelementptr inbounds i8, ptr %B, i64 %index
  %wide.load = load <32 x i8>, ptr %i, align 1
  %i2 = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i3 = getelementptr inbounds i8, ptr %A, <32 x i64> %i2
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i3, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <32 x i8> %maskedoff)
  %i4 = add <32 x i8> %wide.masked.gather, %wide.load
  call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> %i4, <32 x ptr> %i3, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>)
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i5 = icmp eq i64 %index.next, 1024
  br i1 %i5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body
  ret void
}

; void gather_pow2(int * __restrict A, int * __restrict B) {
;   for (int i = 0; i != 1024; ++i)
;       A[i] += B[i * 4];
; }
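
; The strided side uses e32 with a 16-byte stride, but the contiguous side is
; only byte-aligned (align 1 in the IR), so it is accessed as 32 e8 elements;
; hence the vsetvli toggling inside the loop.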

define void @gather_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_pow2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lui a2, 1
; CHECK-NEXT: add a2, a0, a2
; CHECK-NEXT: li a3, 16
; CHECK-NEXT: li a4, 32
; CHECK-NEXT: .LBB8_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT: vlse32.v v8, (a1), a3
; CHECK-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-NEXT: vle8.v v9, (a0)
; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT: vadd.vv v8, v9, v8
; CHECK-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-NEXT: vse8.v v8, (a0)
; CHECK-NEXT: addi a0, a0, 32
; CHECK-NEXT: addi a1, a1, 128
; CHECK-NEXT: bne a0, a2, .LBB8_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
  br label %vector.body

vector.body: ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = shl nsw <8 x i64> %vec.ind, splat (i64 2)
  %i1 = getelementptr inbounds i32, ptr %B, <8 x i64> %i
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i2 = getelementptr inbounds i32, ptr %A, i64 %index
  %wide.load = load <8 x i32>, ptr %i2, align 1
  %i4 = add <8 x i32> %wide.load, %wide.masked.gather
  store <8 x i32> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 8
  %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 8)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body
  ret void
}

;void scatter_pow2(int * __restrict A, int * __restrict B) {
;  for (int i = 0; i < 1024; ++i)
;      A[i * 4] += B[i];
;}
define void @scatter_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: scatter_pow2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lui a2, 1
; CHECK-NEXT: add a2, a1, a2
; CHECK-NEXT: li a3, 32
; CHECK-NEXT: li a4, 16
; CHECK-NEXT: .LBB9_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-NEXT: vle8.v v8, (a1)
; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT: vlse32.v v9, (a0), a4
; CHECK-NEXT: vadd.vv v8, v9, v8
; CHECK-NEXT: vsse32.v v8, (a0), a4
; CHECK-NEXT: addi a1, a1, 32
; CHECK-NEXT: addi a0, a0, 128
; CHECK-NEXT: bne a1, a2, .LBB9_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
  br label %vector.body

vector.body: ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = getelementptr inbounds i32, ptr %B, i64 %index
  %wide.load = load <8 x i32>, ptr %i, align 1
  %i2 = shl nuw nsw <8 x i64> %vec.ind, splat (i64 2)
  %i3 = getelementptr inbounds i32, ptr %A, <8 x i64> %i2
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i3, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i4 = add <8 x i32> %wide.masked.gather, %wide.load
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i4, <8 x ptr> %i3, i32 4, <8 x i1> splat (i1 true))
  %index.next = add nuw i64 %index, 8
  %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 8)
  %i5 = icmp eq i64 %index.next, 1024
  br i1 %i5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body
  ret void
}

;struct foo {
;  int a, b, c, d;
;};
;
;void struct_gather(int * __restrict A, struct foo * __restrict B) {
;  for (int i = 0; i < 1024; ++i)
;      A[i] += B[i].b;
;}
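
; &B[i].b is at byte offset 4 within a 16-byte struct, so both gathers use a
; 16-byte stride; the loop is unrolled by two, with the base pre-biased by
; 132 (= 8 * 16 + 4) and the first half addressed at -128 from it.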

define void @struct_gather(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: struct_gather:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addi a1, a1, 132
; CHECK-NEXT: lui a2, 1
; CHECK-NEXT: add a2, a0, a2
; CHECK-NEXT: li a3, 16
; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT: .LBB10_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: addi a4, a0, 32
; CHECK-NEXT: addi a5, a1, -128
; CHECK-NEXT: vlse32.v v8, (a5), a3
; CHECK-NEXT: vlse32.v v9, (a1), a3
; CHECK-NEXT: vle32.v v10, (a0)
; CHECK-NEXT: vle32.v v11, (a4)
; CHECK-NEXT: vadd.vv v8, v10, v8
; CHECK-NEXT: vadd.vv v9, v11, v9
; CHECK-NEXT: vse32.v v8, (a0)
; CHECK-NEXT: vse32.v v9, (a4)
; CHECK-NEXT: addi a0, a0, 64
; CHECK-NEXT: addi a1, a1, 256
; CHECK-NEXT: bne a0, a2, .LBB10_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
  br label %vector.body

vector.body: ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
  %step.add = add <8 x i64> %vec.ind, splat (i64 8)
  %i = getelementptr inbounds %struct.foo, ptr %B, <8 x i64> %vec.ind, i32 1
  %i1 = getelementptr inbounds %struct.foo, ptr %B, <8 x i64> %step.add, i32 1
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %wide.masked.gather9 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i2 = getelementptr inbounds i32, ptr %A, i64 %index
  %wide.load = load <8 x i32>, ptr %i2, align 4
  %i4 = getelementptr inbounds i32, ptr %i2, i64 8
  %wide.load10 = load <8 x i32>, ptr %i4, align 4
  %i6 = add nsw <8 x i32> %wide.load, %wide.masked.gather
  %i7 = add nsw <8 x i32> %wide.load10, %wide.masked.gather9
  store <8 x i32> %i6, ptr %i2, align 4
  store <8 x i32> %i7, ptr %i4, align 4
  %index.next = add nuw i64 %index, 16
  %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 16)
  %i10 = icmp eq i64 %index.next, 1024
  br i1 %i10, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body
  ret void
}

;void gather_unroll(int * __restrict A, int * __restrict B) {
;  for (int i = 0; i < 1024; i+= 4 ) {
;    A[i] += B[i * 4];
;    A[i+1] += B[(i+1) * 4];
;    A[i+2] += B[(i+2) * 4];
;    A[i+3] += B[(i+3) * 4];
;  }
;}
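
; All four interleaved accesses become strided operations: stride 64
; (16 x i32) on the B side and stride 16 (4 x i32) on the A side.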

define void @gather_unroll(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_unroll:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: li a2, 256
; CHECK-NEXT: li a3, 64
; CHECK-NEXT: li a4, 16
; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT: .LBB11_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vlse32.v v8, (a1), a3
; CHECK-NEXT: vlse32.v v9, (a0), a4
; CHECK-NEXT: vadd.vv v8, v9, v8
; CHECK-NEXT: vsse32.v v8, (a0), a4
; CHECK-NEXT: addi a5, a1, 16
; CHECK-NEXT: vlse32.v v8, (a5), a3
; CHECK-NEXT: addi a5, a0, 4
; CHECK-NEXT: vlse32.v v9, (a5), a4
; CHECK-NEXT: vadd.vv v8, v9, v8
; CHECK-NEXT: vsse32.v v8, (a5), a4
; CHECK-NEXT: addi a5, a1, 32
; CHECK-NEXT: vlse32.v v8, (a5), a3
; CHECK-NEXT: addi a5, a0, 8
; CHECK-NEXT: vlse32.v v9, (a5), a4
; CHECK-NEXT: vadd.vv v8, v9, v8
; CHECK-NEXT: vsse32.v v8, (a5), a4
; CHECK-NEXT: addi a5, a1, 48
; CHECK-NEXT: vlse32.v v8, (a5), a3
; CHECK-NEXT: addi a5, a0, 12
; CHECK-NEXT: vlse32.v v9, (a5), a4
; CHECK-NEXT: vadd.vv v8, v9, v8
; CHECK-NEXT: vsse32.v v8, (a5), a4
; CHECK-NEXT: addi a2, a2, -8
; CHECK-NEXT: addi a1, a1, 512
; CHECK-NEXT: addi a0, a0, 128
; CHECK-NEXT: bnez a2, .LBB11_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
  br label %vector.body

vector.body: ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ <i64 0, i64 4, i64 8, i64 12, i64 16, i64 20, i64 24, i64 28>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = shl nuw nsw <8 x i64> %vec.ind, splat (i64 2)
  %i1 = getelementptr inbounds i32, ptr %B, <8 x i64> %i
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i2 = getelementptr inbounds i32, ptr %A, <8 x i64> %vec.ind
  %wide.masked.gather52 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i2, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i3 = add nsw <8 x i32> %wide.masked.gather52, %wide.masked.gather
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i3, <8 x ptr> %i2, i32 4, <8 x i1> splat (i1 true))
  %i4 = or disjoint <8 x i64> %vec.ind, splat (i64 1)
  %i5 = shl nsw <8 x i64> %i4, splat (i64 2)
  %i6 = getelementptr inbounds i32, ptr %B, <8 x i64> %i5
  %wide.masked.gather53 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i6, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i7 = getelementptr inbounds i32, ptr %A, <8 x i64> %i4
  %wide.masked.gather54 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i7, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i8 = add nsw <8 x i32> %wide.masked.gather54, %wide.masked.gather53
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i8, <8 x ptr> %i7, i32 4, <8 x i1> splat (i1 true))
  %i9 = or disjoint <8 x i64> %vec.ind, splat (i64 2)
  %i10 = shl nsw <8 x i64> %i9, splat (i64 2)
  %i11 = getelementptr inbounds i32, ptr %B, <8 x i64> %i10
  %wide.masked.gather55 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i11, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i12 = getelementptr inbounds i32, ptr %A, <8 x i64> %i9
  %wide.masked.gather56 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i12, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i13 = add nsw <8 x i32> %wide.masked.gather56, %wide.masked.gather55
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i13, <8 x ptr> %i12, i32 4, <8 x i1> splat (i1 true))
  %i14 = or disjoint <8 x i64> %vec.ind, <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>
  %i15 = shl nsw <8 x i64> %i14, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  %i16 = getelementptr inbounds i32, ptr %B, <8 x i64> %i15
  %wide.masked.gather57 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i16, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i17 = getelementptr inbounds i32, ptr %A, <8 x i64> %i14
  %wide.masked.gather58 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i17, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i18 = add nsw <8 x i32> %wide.masked.gather58, %wide.masked.gather57
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i18, <8 x ptr> %i17, i32 4, <8 x i1> splat (i1 true))
  %index.next = add nuw i64 %index, 8
  %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 32)
  %i19 = icmp eq i64 %index.next, 256
  br i1 %i19, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body
  ret void
}

declare <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr>, i32 immarg, <32 x i1>, <32 x i8>)
declare <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr>, i32 immarg, <8 x i1>, <8 x i32>)
declare void @llvm.masked.scatter.v32i8.v32p0(<32 x i8>, <32 x ptr>, i32 immarg, <32 x i1>)
declare void @llvm.masked.scatter.v8i32.v8p0(<8 x i32>, <8 x ptr>, i32 immarg, <8 x i1>)

; Make sure we don't crash in getTgtMemIntrinsic for a vector of pointers.
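; With ZVE32F, i64 is not a legal vector element type, so the gather of
; pointers is scalarized into ld/sd pairs; with V it becomes a 40-byte-stride
; vlse64.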
define void @gather_of_pointers(ptr noalias nocapture %arg, ptr noalias nocapture readonly %arg1) {
; V-LABEL: gather_of_pointers:
; V: # %bb.0: # %bb
; V-NEXT: lui a2, 2
; V-NEXT: add a2, a0, a2
; V-NEXT: li a3, 40
; V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; V-NEXT: .LBB12_1: # %bb2
; V-NEXT: # =>This Inner Loop Header: Depth=1
; V-NEXT: addi a4, a1, 80
; V-NEXT: vlse64.v v8, (a1), a3
; V-NEXT: vlse64.v v9, (a4), a3
; V-NEXT: addi a4, a0, 16
; V-NEXT: vse64.v v8, (a0)
; V-NEXT: vse64.v v9, (a4)
; V-NEXT: addi a0, a0, 32
; V-NEXT: addi a1, a1, 160
; V-NEXT: bne a0, a2, .LBB12_1
; V-NEXT: # %bb.2: # %bb18
; V-NEXT: ret
;
; ZVE32F-LABEL: gather_of_pointers:
; ZVE32F: # %bb.0: # %bb
; ZVE32F-NEXT: li a2, 0
; ZVE32F-NEXT: lui a3, 2
; ZVE32F-NEXT: add a3, a0, a3
; ZVE32F-NEXT: li a4, 1
; ZVE32F-NEXT: li a5, 40
; ZVE32F-NEXT: .LBB12_1: # %bb2
; ZVE32F-NEXT: # =>This Inner Loop Header: Depth=1
; ZVE32F-NEXT: mul a6, a4, a5
; ZVE32F-NEXT: add a6, a1, a6
; ZVE32F-NEXT: mul a7, a2, a5
; ZVE32F-NEXT: add a7, a1, a7
; ZVE32F-NEXT: ld t0, 0(a6)
; ZVE32F-NEXT: ld t1, 0(a7)
; ZVE32F-NEXT: ld a6, 80(a6)
; ZVE32F-NEXT: ld a7, 80(a7)
; ZVE32F-NEXT: sd t0, 8(a0)
; ZVE32F-NEXT: sd t1, 0(a0)
; ZVE32F-NEXT: sd a6, 24(a0)
; ZVE32F-NEXT: sd a7, 16(a0)
; ZVE32F-NEXT: addi a2, a2, 4
; ZVE32F-NEXT: addi a0, a0, 32
; ZVE32F-NEXT: addi a4, a4, 4
; ZVE32F-NEXT: bne a0, a3, .LBB12_1
; ZVE32F-NEXT: # %bb.2: # %bb18
; ZVE32F-NEXT: ret
bb:
  br label %bb2

bb2: ; preds = %bb2, %bb
  %i = phi i64 [ 0, %bb ], [ %i15, %bb2 ]
  %i3 = phi <2 x i64> [ <i64 0, i64 1>, %bb ], [ %i16, %bb2 ]
  %i4 = mul nuw nsw <2 x i64> %i3, splat (i64 5)
  %i5 = mul <2 x i64> %i3, splat (i64 5)
  %i6 = add <2 x i64> %i5, <i64 10, i64 10>
  %i7 = getelementptr inbounds ptr, ptr %arg1, <2 x i64> %i4
  %i8 = getelementptr inbounds ptr, ptr %arg1, <2 x i64> %i6
  %i9 = call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> %i7, i32 8, <2 x i1> splat (i1 true), <2 x ptr> undef)
  %i10 = call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> %i8, i32 8, <2 x i1> splat (i1 true), <2 x ptr> undef)
  %i11 = getelementptr inbounds ptr, ptr %arg, i64 %i
  store <2 x ptr> %i9, ptr %i11, align 8
  %i13 = getelementptr inbounds ptr, ptr %i11, i64 2
  store <2 x ptr> %i10, ptr %i13, align 8
  %i15 = add nuw i64 %i, 4
  %i16 = add <2 x i64> %i3, <i64 4, i64 4>
  %i17 = icmp eq i64 %i15, 1024
  br i1 %i17, label %bb18, label %bb2

bb18: ; preds = %bb2
  ret void
}

declare <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr>, i32 immarg, <2 x i1>, <2 x ptr>)

; Make sure we don't crash in getTgtMemIntrinsic for a vector of pointers.
define void @scatter_of_pointers(ptr noalias nocapture %arg, ptr noalias nocapture readonly %arg1) {
; V-LABEL: scatter_of_pointers:
; V: # %bb.0: # %bb
; V-NEXT: lui a2, 2
; V-NEXT: add a2, a1, a2
; V-NEXT: li a3, 40
; V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; V-NEXT: .LBB13_1: # %bb2
; V-NEXT: # =>This Inner Loop Header: Depth=1
; V-NEXT: addi a4, a1, 16
; V-NEXT: vle64.v v8, (a1)
; V-NEXT: vle64.v v9, (a4)
; V-NEXT: addi a4, a0, 80
; V-NEXT: vsse64.v v8, (a0), a3
; V-NEXT: vsse64.v v9, (a4), a3
; V-NEXT: addi a1, a1, 32
; V-NEXT: addi a0, a0, 160
; V-NEXT: bne a1, a2, .LBB13_1
; V-NEXT: # %bb.2: # %bb18
; V-NEXT: ret
;
; ZVE32F-LABEL: scatter_of_pointers:
; ZVE32F: # %bb.0: # %bb
; ZVE32F-NEXT: li a2, 0
; ZVE32F-NEXT: lui a3, 2
; ZVE32F-NEXT: add a3, a1, a3
; ZVE32F-NEXT: li a4, 1
; ZVE32F-NEXT: li a5, 40
; ZVE32F-NEXT: .LBB13_1: # %bb2
; ZVE32F-NEXT: # =>This Inner Loop Header: Depth=1
; ZVE32F-NEXT: ld a6, 8(a1)
; ZVE32F-NEXT: ld a7, 0(a1)
; ZVE32F-NEXT: ld t0, 24(a1)
; ZVE32F-NEXT: ld t1, 16(a1)
; ZVE32F-NEXT: mul t2, a4, a5
; ZVE32F-NEXT: add t2, a0, t2
; ZVE32F-NEXT: mul t3, a2, a5
; ZVE32F-NEXT: add t3, a0, t3
; ZVE32F-NEXT: sd a7, 0(t3)
; ZVE32F-NEXT: sd a6, 0(t2)
; ZVE32F-NEXT: sd t1, 80(t3)
; ZVE32F-NEXT: sd t0, 80(t2)
; ZVE32F-NEXT: addi a2, a2, 4
; ZVE32F-NEXT: addi a1, a1, 32
; ZVE32F-NEXT: addi a4, a4, 4
; ZVE32F-NEXT: bne a1, a3, .LBB13_1
; ZVE32F-NEXT: # %bb.2: # %bb18
; ZVE32F-NEXT: ret
bb:
  br label %bb2

bb2: ; preds = %bb2, %bb
  %i = phi i64 [ 0, %bb ], [ %i15, %bb2 ]
  %i3 = phi <2 x i64> [ <i64 0, i64 1>, %bb ], [ %i16, %bb2 ]
  %i4 = getelementptr inbounds ptr, ptr %arg1, i64 %i
  %i6 = load <2 x ptr>, ptr %i4, align 8
  %i7 = getelementptr inbounds ptr, ptr %i4, i64 2
  %i9 = load <2 x ptr>, ptr %i7, align 8
  %i10 = mul nuw nsw <2 x i64> %i3, splat (i64 5)
  %i11 = mul <2 x i64> %i3, splat (i64 5)
  %i12 = add <2 x i64> %i11, <i64 10, i64 10>
  %i13 = getelementptr inbounds ptr, ptr %arg, <2 x i64> %i10
  %i14 = getelementptr inbounds ptr, ptr %arg, <2 x i64> %i12
  call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> %i6, <2 x ptr> %i13, i32 8, <2 x i1> splat (i1 true))
  call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> %i9, <2 x ptr> %i14, i32 8, <2 x i1> splat (i1 true))
  %i15 = add nuw i64 %i, 4
  %i16 = add <2 x i64> %i3, <i64 4, i64 4>
  %i17 = icmp eq i64 %i15, 1024
  br i1 %i17, label %bb18, label %bb2

bb18: ; preds = %bb2
  ret void
}

declare void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr>, <2 x ptr>, i32 immarg, <2 x i1>)
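
; Strided gather with a runtime start index: the vector body splats the start
; value into the induction vector and a scalar loop (bb35) handles the
; remainder.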

define void @strided_load_startval_add_with_splat(ptr noalias nocapture %arg, ptr noalias nocapture readonly %arg1, i32 signext %arg2) {
; CHECK-LABEL: strided_load_startval_add_with_splat:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: li a3, 1024
; CHECK-NEXT: beq a2, a3, .LBB14_7
; CHECK-NEXT: # %bb.1: # %bb3
; CHECK-NEXT: li a3, 1023
; CHECK-NEXT: subw a5, a3, a2
; CHECK-NEXT: li a6, 31
; CHECK-NEXT: mv a4, a2
; CHECK-NEXT: bltu a5, a6, .LBB14_5
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: slli a5, a5, 32
; CHECK-NEXT: srli a5, a5, 32
; CHECK-NEXT: addi a5, a5, 1
; CHECK-NEXT: andi a6, a5, -32
; CHECK-NEXT: add a4, a6, a2
; CHECK-NEXT: slli t0, a2, 2
; CHECK-NEXT: add a7, a0, a2
; CHECK-NEXT: add a2, a1, a2
; CHECK-NEXT: add a2, a2, t0
; CHECK-NEXT: add t0, a4, a0
; CHECK-NEXT: li t2, 32
; CHECK-NEXT: li t1, 5
; CHECK-NEXT: vsetvli zero, t2, e8, m1, ta, ma
; CHECK-NEXT: .LBB14_3: # %bb15
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vlse8.v v8, (a2), t1
; CHECK-NEXT: vle8.v v9, (a7)
; CHECK-NEXT: vadd.vv v8, v9, v8
; CHECK-NEXT: vse8.v v8, (a7)
; CHECK-NEXT: addi a7, a7, 32
; CHECK-NEXT: addi a2, a2, 160
; CHECK-NEXT: bne a7, t0, .LBB14_3
; CHECK-NEXT: # %bb.4: # %bb30
; CHECK-NEXT: beq a5, a6, .LBB14_7
; CHECK-NEXT: .LBB14_5: # %bb32
; CHECK-NEXT: add a2, a0, a4
; CHECK-NEXT: slli a5, a4, 2
; CHECK-NEXT: add a1, a1, a4
; CHECK-NEXT: add a1, a1, a5
; CHECK-NEXT: subw a3, a3, a4
; CHECK-NEXT: slli a3, a3, 32
; CHECK-NEXT: srli a3, a3, 32
; CHECK-NEXT: add a0, a4, a0
; CHECK-NEXT: add a0, a0, a3
; CHECK-NEXT: addi a0, a0, 1
; CHECK-NEXT: .LBB14_6: # %bb35
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: lbu a3, 0(a1)
; CHECK-NEXT: lbu a4, 0(a2)
; CHECK-NEXT: add a3, a4, a3
; CHECK-NEXT: sb a3, 0(a2)
; CHECK-NEXT: addi a2, a2, 1
; CHECK-NEXT: addi a1, a1, 5
; CHECK-NEXT: bne a2, a0, .LBB14_6
; CHECK-NEXT: .LBB14_7: # %bb34
; CHECK-NEXT: ret
bb:
  %i = icmp eq i32 %arg2, 1024
  br i1 %i, label %bb34, label %bb3

bb3: ; preds = %bb
  %i4 = sext i32 %arg2 to i64
  %i5 = sub i32 1023, %arg2
  %i6 = zext i32 %i5 to i64
  %i7 = add nuw nsw i64 %i6, 1
  %i8 = icmp ult i32 %i5, 31
  br i1 %i8, label %bb32, label %bb9

bb9: ; preds = %bb3
  %i10 = and i64 %i7, 8589934560
  %i11 = add nsw i64 %i10, %i4
  %i12 = insertelement <32 x i64> poison, i64 %i4, i64 0
  %i13 = shufflevector <32 x i64> %i12, <32 x i64> poison, <32 x i32> zeroinitializer
  %i14 = add <32 x i64> %i13, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>
  br label %bb15

bb15: ; preds = %bb15, %bb9
  %i16 = phi i64 [ 0, %bb9 ], [ %i27, %bb15 ]
  %i17 = phi <32 x i64> [ %i14, %bb9 ], [ %i28, %bb15 ]
  %i18 = add i64 %i16, %i4
  %i19 = mul nsw <32 x i64> %i17, splat (i64 5)
  %i20 = getelementptr inbounds i8, ptr %arg1, <32 x i64> %i19
  %i21 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i20, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
  %i22 = getelementptr inbounds i8, ptr %arg, i64 %i18
  %i24 = load <32 x i8>, ptr %i22, align 1
  %i25 = add <32 x i8> %i24, %i21
  store <32 x i8> %i25, ptr %i22, align 1
  %i27 = add nuw i64 %i16, 32
  %i28 = add <32 x i64> %i17, splat (i64 32)
  %i29 = icmp eq i64 %i27, %i10
  br i1 %i29, label %bb30, label %bb15

bb30: ; preds = %bb15
  %i31 = icmp eq i64 %i7, %i10
  br i1 %i31, label %bb34, label %bb32

bb32: ; preds = %bb30, %bb3
  %i33 = phi i64 [ %i4, %bb3 ], [ %i11, %bb30 ]
  br label %bb35

bb34: ; preds = %bb35, %bb30, %bb
  ret void

bb35: ; preds = %bb35, %bb32
  %i36 = phi i64 [ %i43, %bb35 ], [ %i33, %bb32 ]
  %i37 = mul nsw i64 %i36, 5
  %i38 = getelementptr inbounds i8, ptr %arg1, i64 %i37
  %i39 = load i8, ptr %i38, align 1
  %i40 = getelementptr inbounds i8, ptr %arg, i64 %i36
  %i41 = load i8, ptr %i40, align 1
  %i42 = add i8 %i41, %i39
  store i8 %i42, ptr %i40, align 1
  %i43 = add nsw i64 %i36, 1
  %i44 = trunc i64 %i43 to i32
  %i45 = icmp eq i32 %i44, 1024
  br i1 %i45, label %bb34, label %bb35
}

declare <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr>, i32 immarg, <16 x i1>, <16 x i8>)
declare void @llvm.masked.scatter.v16i8.v16p0(<16 x i8>, <16 x ptr>, i32 immarg, <16 x i1>)
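
; The trip count (%arg2 * 16) is always a multiple of the vector factor, so
; there is no scalar remainder loop; the 16 x e8 body fits in an mf2 group.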

define void @gather_no_scalar_remainder(ptr noalias nocapture noundef %arg, ptr noalias nocapture noundef readonly %arg1, i64 noundef %arg2) {
; CHECK-LABEL: gather_no_scalar_remainder:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: slli a2, a2, 4
; CHECK-NEXT: beqz a2, .LBB15_3
; CHECK-NEXT: # %bb.1: # %bb2
; CHECK-NEXT: addi a2, a2, -16
; CHECK-NEXT: andi a2, a2, -16
; CHECK-NEXT: add a2, a2, a0
; CHECK-NEXT: addi a2, a2, 16
; CHECK-NEXT: li a3, 5
; CHECK-NEXT: vsetivli zero, 16, e8, mf2, ta, ma
; CHECK-NEXT: .LBB15_2: # %bb4
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vlse8.v v8, (a1), a3
; CHECK-NEXT: vle8.v v9, (a0)
; CHECK-NEXT: vadd.vv v8, v9, v8
; CHECK-NEXT: vse8.v v8, (a0)
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: addi a1, a1, 80
; CHECK-NEXT: bne a0, a2, .LBB15_2
; CHECK-NEXT: .LBB15_3: # %bb16
; CHECK-NEXT: ret
bb:
  %i = shl i64 %arg2, 4
  %i3 = icmp eq i64 %i, 0
  br i1 %i3, label %bb16, label %bb2

bb2: ; preds = %bb
  br label %bb4

bb4: ; preds = %bb4, %bb2
  %i5 = phi i64 [ %i13, %bb4 ], [ 0, %bb2 ]
  %i6 = phi <16 x i64> [ %i14, %bb4 ], [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, %bb2 ]
  %i7 = mul <16 x i64> %i6, splat (i64 5)
  %i8 = getelementptr inbounds i8, ptr %arg1, <16 x i64> %i7
  %i9 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %i8, i32 1, <16 x i1> splat (i1 true), <16 x i8> undef)
  %i10 = getelementptr inbounds i8, ptr %arg, i64 %i5
  %i11 = load <16 x i8>, ptr %i10, align 1
  %i12 = add <16 x i8> %i11, %i9
  store <16 x i8> %i12, ptr %i10, align 1
  %i13 = add nuw i64 %i5, 16
  %i14 = add <16 x i64> %i6, splat (i64 16)
  %i15 = icmp eq i64 %i13, %i
  br i1 %i15, label %bb16, label %bb4

bb16: ; preds = %bb4, %bb
  ret void
}
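
; Floating-point version of @gather_zero_stride: the broadcast becomes a
; scalar flw feeding vfadd.vf.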

define void @gather_zero_stride_fp(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_zero_stride_fp:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lui a2, 1
; CHECK-NEXT: add a2, a0, a2
; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT: .LBB16_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: flw fa5, 0(a1)
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vfadd.vf v8, v8, fa5
; CHECK-NEXT: vse32.v v8, (a0)
; CHECK-NEXT: addi a0, a0, 128
; CHECK-NEXT: addi a1, a1, 640
; CHECK-NEXT: bne a0, a2, .LBB16_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
  br label %vector.body

vector.body: ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <8 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds float, ptr %B, <8 x i64> %i
  %wide.masked.gather = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x float> undef)
  %i2 = getelementptr inbounds float, ptr %A, i64 %index
  %wide.load = load <8 x float>, ptr %i2, align 4
  %i4 = fadd <8 x float> %wide.load, %wide.masked.gather
  store <8 x float> %i4, ptr %i2, align 4
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body
  ret void
}

declare <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr>, i32 immarg, <8 x i1>, <8 x float>)