; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvl256b | FileCheck %s --check-prefixes=CHECK,V
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+f,+zve32f,+zvl256b | FileCheck %s --check-prefixes=CHECK,ZVE32F
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+no-optimized-zero-stride-load,+zvl256b | FileCheck %s --check-prefixes=CHECK,NOT-OPTIMIZED
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+f,+zve32f,+no-optimized-zero-stride-load,+zvl256b | FileCheck %s --check-prefixes=CHECK,NOT-OPTIMIZED
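
; These tests check that fixed-vector masked.gather/masked.scatter whose
; addresses form a constant-stride sequence are lowered to strided loads and
; stores (vlse/vsse). The NOT-OPTIMIZED runs add +no-optimized-zero-stride-load,
; so splat loads that would otherwise use a zero-stride vlse are materialized
; with a scalar load plus vmv.v.x instead.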
%struct.foo = type { i32, i32, i32, i32 }

; void gather(signed char * __restrict A, signed char * __restrict B) {
;   for (int i = 0; i != 1024; ++i)
;       A[i] += B[i * 5];
; }
define void @gather(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    addi a2, a0, 1024
; CHECK-NEXT:    li a4, 32
; CHECK-NEXT:    li a3, 5
; CHECK-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
; CHECK-NEXT:  .LBB0_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vlse8.v v8, (a1), a3
; CHECK-NEXT:    vle8.v v9, (a0)
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vse8.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 32
; CHECK-NEXT:    addi a1, a1, 160
; CHECK-NEXT:    bne a0, a2, .LBB0_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> undef)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = add <32 x i8> %wide.load, %wide.masked.gather
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
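
; Masked variant of @gather. The constant lane mask 0xf02d5369 is materialized
; with lui/addi and moved into v0, the vsetvli switches to mask-undisturbed
; (mu), and the strided load becomes a masked vlse8; %maskedoff is copied into
; the destination first (vmv1r.v) so inactive lanes keep the passthru value.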
define void @gather_masked(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, <32 x i8> %maskedoff) {
; CHECK-LABEL: gather_masked:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    addi a2, a0, 1024
; CHECK-NEXT:    lui a3, 983765
; CHECK-NEXT:    addi a3, a3, 873
; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT:    vmv.s.x v0, a3
; CHECK-NEXT:    li a3, 32
; CHECK-NEXT:    li a4, 5
; CHECK-NEXT:  .LBB1_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vsetvli zero, a3, e8, m1, ta, mu
; CHECK-NEXT:    vmv1r.v v9, v8
; CHECK-NEXT:    vlse8.v v9, (a1), a4, v0.t
; CHECK-NEXT:    vle8.v v10, (a0)
; CHECK-NEXT:    vadd.vv v9, v10, v9
; CHECK-NEXT:    vse8.v v9, (a0)
; CHECK-NEXT:    addi a0, a0, 32
; CHECK-NEXT:    addi a1, a1, 160
; CHECK-NEXT:    bne a0, a2, .LBB1_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <32 x i8> %maskedoff)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = add <32 x i8> %wide.load, %wide.masked.gather
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
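
; Here the induction vector counts down from 31 to 0, so consecutive lanes step
; backwards through B: the gather becomes a strided load with stride -5
; starting at B + 31*5 (hence the addi a1, a1, 155).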
define void @gather_negative_stride(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_negative_stride:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    addi a1, a1, 155
; CHECK-NEXT:    addi a2, a0, 1024
; CHECK-NEXT:    li a4, 32
; CHECK-NEXT:    li a3, -5
; CHECK-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
; CHECK-NEXT:  .LBB2_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vlse8.v v8, (a1), a3
; CHECK-NEXT:    vle8.v v9, (a0)
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vse8.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 32
; CHECK-NEXT:    addi a1, a1, 160
; CHECK-NEXT:    bne a0, a2, .LBB2_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 31, i64 30, i64 29, i64 28, i64 27, i64 26, i64 25, i64 24, i64 23, i64 22, i64 21, i64 20, i64 19, i64 18, i64 17, i64 16, i64 15, i64 14, i64 13, i64 12, i64 11, i64 10, i64 9, i64 8, i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> undef)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = add <32 x i8> %wide.load, %wide.masked.gather
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
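
; Every lane reads B[0] (the induction vector is all zeroes). Since the
; splatted value feeds an add, it can stay scalar: a single lbu plus vadd.vx,
; with no strided load at all.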
define void @gather_zero_stride(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_zero_stride:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    addi a2, a0, 1024
; CHECK-NEXT:    li a3, 32
; CHECK-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
; CHECK-NEXT:  .LBB3_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    lbu a3, 0(a1)
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vadd.vx v8, v8, a3
; CHECK-NEXT:    vse8.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 32
; CHECK-NEXT:    addi a1, a1, 160
; CHECK-NEXT:    bne a0, a2, .LBB3_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> undef)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = add <32 x i8> %wide.load, %wide.masked.gather
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
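
; Zero-stride gather again, but the splat is the dividend of a udiv, and RVV
; has no reversed divide-by-scalar form, so the value must live in a vector
; register. V and ZVE32F use a zero-stride vlse8; under
; +no-optimized-zero-stride-load the splat is built with lbu + vmv.v.x instead.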
define void @gather_zero_stride_unfold(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; V-LABEL: gather_zero_stride_unfold:
; V:       # %bb.0: # %entry
; V-NEXT:    addi a2, a0, 1024
; V-NEXT:    li a3, 32
; V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
; V-NEXT:  .LBB4_1: # %vector.body
; V-NEXT:    # =>This Inner Loop Header: Depth=1
; V-NEXT:    vlse8.v v8, (a1), zero
; V-NEXT:    vle8.v v9, (a0)
; V-NEXT:    vdivu.vv v8, v8, v9
; V-NEXT:    vse8.v v8, (a0)
; V-NEXT:    addi a0, a0, 32
; V-NEXT:    addi a1, a1, 160
; V-NEXT:    bne a0, a2, .LBB4_1
; V-NEXT:  # %bb.2: # %for.cond.cleanup
; V-NEXT:    ret
;
; ZVE32F-LABEL: gather_zero_stride_unfold:
; ZVE32F:       # %bb.0: # %entry
; ZVE32F-NEXT:    addi a2, a0, 1024
; ZVE32F-NEXT:    li a3, 32
; ZVE32F-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
; ZVE32F-NEXT:  .LBB4_1: # %vector.body
; ZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1
; ZVE32F-NEXT:    vlse8.v v8, (a1), zero
; ZVE32F-NEXT:    vle8.v v9, (a0)
; ZVE32F-NEXT:    vdivu.vv v8, v8, v9
; ZVE32F-NEXT:    vse8.v v8, (a0)
; ZVE32F-NEXT:    addi a0, a0, 32
; ZVE32F-NEXT:    addi a1, a1, 160
; ZVE32F-NEXT:    bne a0, a2, .LBB4_1
; ZVE32F-NEXT:  # %bb.2: # %for.cond.cleanup
; ZVE32F-NEXT:    ret
;
; NOT-OPTIMIZED-LABEL: gather_zero_stride_unfold:
; NOT-OPTIMIZED:       # %bb.0: # %entry
; NOT-OPTIMIZED-NEXT:    addi a2, a0, 1024
; NOT-OPTIMIZED-NEXT:    li a3, 32
; NOT-OPTIMIZED-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
; NOT-OPTIMIZED-NEXT:  .LBB4_1: # %vector.body
; NOT-OPTIMIZED-NEXT:    # =>This Inner Loop Header: Depth=1
; NOT-OPTIMIZED-NEXT:    lbu a3, 0(a1)
; NOT-OPTIMIZED-NEXT:    vle8.v v8, (a0)
; NOT-OPTIMIZED-NEXT:    vmv.v.x v9, a3
; NOT-OPTIMIZED-NEXT:    vdivu.vv v8, v9, v8
; NOT-OPTIMIZED-NEXT:    vse8.v v8, (a0)
; NOT-OPTIMIZED-NEXT:    addi a0, a0, 32
; NOT-OPTIMIZED-NEXT:    addi a1, a1, 160
; NOT-OPTIMIZED-NEXT:    bne a0, a2, .LBB4_1
; NOT-OPTIMIZED-NEXT:  # %bb.2: # %for.cond.cleanup
; NOT-OPTIMIZED-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> undef)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = udiv <32 x i8> %wide.masked.gather, %wide.load
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

;void scatter(signed char * __restrict A, signed char * __restrict B) {
;  for (int i = 0; i < 1024; ++i)
;      A[i * 5] += B[i];
;}
define void @scatter(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: scatter:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    addi a2, a1, 1024
; CHECK-NEXT:    li a4, 32
; CHECK-NEXT:    li a3, 5
; CHECK-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
; CHECK-NEXT:  .LBB5_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vle8.v v8, (a1)
; CHECK-NEXT:    vlse8.v v9, (a0), a3
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsse8.v v8, (a0), a3
; CHECK-NEXT:    addi a1, a1, 32
; CHECK-NEXT:    addi a0, a0, 160
; CHECK-NEXT:    bne a1, a2, .LBB5_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = getelementptr inbounds i8, ptr %B, i64 %index
  %wide.load = load <32 x i8>, ptr %i, align 1
  %i2 = mul nuw nsw <32 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
  %i3 = getelementptr inbounds i8, ptr %A, <32 x i64> %i2
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i3, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> undef)
  %i4 = add <32 x i8> %wide.masked.gather, %wide.load
  call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> %i4, <32 x ptr> %i3, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %i5 = icmp eq i64 %index.next, 1024
  br i1 %i5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
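
; Masked variant of @scatter: the same 0xf02d5369 lane mask guards both the
; strided load and the strided store (vlse8/vsse8 with v0.t).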
define void @scatter_masked(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, <32 x i8> %maskedoff) {
; CHECK-LABEL: scatter_masked:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    addi a2, a1, 1024
; CHECK-NEXT:    li a3, 32
; CHECK-NEXT:    lui a4, 983765
; CHECK-NEXT:    addi a4, a4, 873
; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT:    vmv.s.x v0, a4
; CHECK-NEXT:    li a4, 5
; CHECK-NEXT:  .LBB6_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vsetvli zero, a3, e8, m1, ta, mu
; CHECK-NEXT:    vle8.v v9, (a1)
; CHECK-NEXT:    vmv1r.v v10, v8
; CHECK-NEXT:    vlse8.v v10, (a0), a4, v0.t
; CHECK-NEXT:    vadd.vv v9, v10, v9
; CHECK-NEXT:    vsse8.v v9, (a0), a4, v0.t
; CHECK-NEXT:    addi a1, a1, 32
; CHECK-NEXT:    addi a0, a0, 160
; CHECK-NEXT:    bne a1, a2, .LBB6_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = getelementptr inbounds i8, ptr %B, i64 %index
  %wide.load = load <32 x i8>, ptr %i, align 1
  %i2 = mul nuw nsw <32 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
  %i3 = getelementptr inbounds i8, ptr %A, <32 x i64> %i2
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i3, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <32 x i8> %maskedoff)
  %i4 = add <32 x i8> %wide.masked.gather, %wide.load
  call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> %i4, <32 x ptr> %i3, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>)
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %i5 = icmp eq i64 %index.next, 1024
  br i1 %i5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; void gather_pow2(signed char * __restrict A, signed char * __restrict B) {
;   for (int i = 0; i != 1024; ++i)
;       A[i] += B[i * 4];
; }
define void @gather_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_pow2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    lui a2, 1
; CHECK-NEXT:    add a2, a0, a2
; CHECK-NEXT:    li a3, 16
; CHECK-NEXT:    li a4, 32
; CHECK-NEXT:  .LBB7_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT:    vlse32.v v8, (a1), a3
; CHECK-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
; CHECK-NEXT:    vle8.v v9, (a0)
; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
; CHECK-NEXT:    vse8.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 32
; CHECK-NEXT:    addi a1, a1, 128
; CHECK-NEXT:    bne a0, a2, .LBB7_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = shl nsw <8 x i64> %vec.ind, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  %i1 = getelementptr inbounds i32, ptr %B, <8 x i64> %i
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  %i2 = getelementptr inbounds i32, ptr %A, i64 %index
  %wide.load = load <8 x i32>, ptr %i2, align 1
  %i4 = add <8 x i32> %wide.load, %wide.masked.gather
  store <8 x i32> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 8
  %vec.ind.next = add <8 x i64> %vec.ind, <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

;void scatter_pow2(signed char * __restrict A, signed char * __restrict B) {
;  for (int i = 0; i < 1024; ++i)
;      A[i * 4] += B[i];
;}
define void @scatter_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: scatter_pow2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    lui a2, 1
; CHECK-NEXT:    add a2, a1, a2
; CHECK-NEXT:    li a3, 32
; CHECK-NEXT:    li a4, 16
; CHECK-NEXT:  .LBB8_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
; CHECK-NEXT:    vle8.v v8, (a1)
; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT:    vlse32.v v9, (a0), a4
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsse32.v v8, (a0), a4
; CHECK-NEXT:    addi a1, a1, 32
; CHECK-NEXT:    addi a0, a0, 128
; CHECK-NEXT:    bne a1, a2, .LBB8_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = getelementptr inbounds i32, ptr %B, i64 %index
  %wide.load = load <8 x i32>, ptr %i, align 1
  %i2 = shl nuw nsw <8 x i64> %vec.ind, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  %i3 = getelementptr inbounds i32, ptr %A, <8 x i64> %i2
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i3, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  %i4 = add <8 x i32> %wide.masked.gather, %wide.load
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i4, <8 x ptr> %i3, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  %index.next = add nuw i64 %index, 8
  %vec.ind.next = add <8 x i64> %vec.ind, <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
  %i5 = icmp eq i64 %index.next, 1024
  br i1 %i5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; struct foo {
;   int a, b, c, d;
; };
;
;void struct_gather(int * __restrict A, struct foo * __restrict B) {
;  for (int i = 0; i < 1024; ++i)
;      A[i] += B[i].b;
;}
define void @struct_gather(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: struct_gather:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    addi a1, a1, 132
; CHECK-NEXT:    lui a2, 1
; CHECK-NEXT:    add a2, a0, a2
; CHECK-NEXT:    li a3, 16
; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT:  .LBB9_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    addi a4, a0, 32
; CHECK-NEXT:    addi a5, a1, -128
; CHECK-NEXT:    vlse32.v v8, (a5), a3
; CHECK-NEXT:    vlse32.v v9, (a1), a3
; CHECK-NEXT:    vle32.v v10, (a0)
; CHECK-NEXT:    vle32.v v11, (a4)
; CHECK-NEXT:    vadd.vv v8, v10, v8
; CHECK-NEXT:    vadd.vv v9, v11, v9
; CHECK-NEXT:    vse32.v v8, (a0)
; CHECK-NEXT:    vse32.v v9, (a4)
; CHECK-NEXT:    addi a0, a0, 64
; CHECK-NEXT:    addi a1, a1, 256
; CHECK-NEXT:    bne a0, a2, .LBB9_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
  %step.add = add <8 x i64> %vec.ind, <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
  %i = getelementptr inbounds %struct.foo, ptr %B, <8 x i64> %vec.ind, i32 1
  %i1 = getelementptr inbounds %struct.foo, ptr %B, <8 x i64> %step.add, i32 1
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  %wide.masked.gather9 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  %i2 = getelementptr inbounds i32, ptr %A, i64 %index
  %wide.load = load <8 x i32>, ptr %i2, align 4
  %i4 = getelementptr inbounds i32, ptr %i2, i64 8
  %wide.load10 = load <8 x i32>, ptr %i4, align 4
  %i6 = add nsw <8 x i32> %wide.load, %wide.masked.gather
  %i7 = add nsw <8 x i32> %wide.load10, %wide.masked.gather9
  store <8 x i32> %i6, ptr %i2, align 4
  store <8 x i32> %i7, ptr %i4, align 4
  %index.next = add nuw i64 %index, 16
  %vec.ind.next = add <8 x i64> %vec.ind, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
  %i10 = icmp eq i64 %index.next, 1024
  br i1 %i10, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

;void gather_unroll(int * __restrict A, int * __restrict B) {
;  for (int i = 0; i < 1024; i+= 4 ) {
;    A[i] += B[i * 4];
;    A[i+1] += B[(i+1) * 4];
;    A[i+2] += B[(i+2) * 4];
;    A[i+3] += B[(i+3) * 4];
;  }
;}
define void @gather_unroll(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_unroll:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    li a2, 256
; CHECK-NEXT:    li a3, 64
; CHECK-NEXT:    li a4, 16
; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT:  .LBB10_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vlse32.v v8, (a1), a3
; CHECK-NEXT:    vlse32.v v9, (a0), a4
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsse32.v v8, (a0), a4
; CHECK-NEXT:    addi a5, a1, 16
; CHECK-NEXT:    vlse32.v v8, (a5), a3
; CHECK-NEXT:    addi a5, a0, 4
; CHECK-NEXT:    vlse32.v v9, (a5), a4
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsse32.v v8, (a5), a4
; CHECK-NEXT:    addi a5, a1, 32
; CHECK-NEXT:    vlse32.v v8, (a5), a3
; CHECK-NEXT:    addi a5, a0, 8
; CHECK-NEXT:    vlse32.v v9, (a5), a4
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsse32.v v8, (a5), a4
; CHECK-NEXT:    addi a5, a1, 48
; CHECK-NEXT:    vlse32.v v8, (a5), a3
; CHECK-NEXT:    addi a5, a0, 12
; CHECK-NEXT:    vlse32.v v9, (a5), a4
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsse32.v v8, (a5), a4
; CHECK-NEXT:    addi a2, a2, -8
; CHECK-NEXT:    addi a1, a1, 512
; CHECK-NEXT:    addi a0, a0, 128
; CHECK-NEXT:    bnez a2, .LBB10_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ <i64 0, i64 4, i64 8, i64 12, i64 16, i64 20, i64 24, i64 28>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = shl nuw nsw <8 x i64> %vec.ind, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  %i1 = getelementptr inbounds i32, ptr %B, <8 x i64> %i
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  %i2 = getelementptr inbounds i32, ptr %A, <8 x i64> %vec.ind
  %wide.masked.gather52 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i2, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  %i3 = add nsw <8 x i32> %wide.masked.gather52, %wide.masked.gather
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i3, <8 x ptr> %i2, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  %i4 = or disjoint <8 x i64> %vec.ind, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
  %i5 = shl nsw <8 x i64> %i4, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  %i6 = getelementptr inbounds i32, ptr %B, <8 x i64> %i5
  %wide.masked.gather53 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i6, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  %i7 = getelementptr inbounds i32, ptr %A, <8 x i64> %i4
  %wide.masked.gather54 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i7, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  %i8 = add nsw <8 x i32> %wide.masked.gather54, %wide.masked.gather53
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i8, <8 x ptr> %i7, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  %i9 = or disjoint <8 x i64> %vec.ind, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  %i10 = shl nsw <8 x i64> %i9, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  %i11 = getelementptr inbounds i32, ptr %B, <8 x i64> %i10
  %wide.masked.gather55 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i11, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  %i12 = getelementptr inbounds i32, ptr %A, <8 x i64> %i9
  %wide.masked.gather56 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i12, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  %i13 = add nsw <8 x i32> %wide.masked.gather56, %wide.masked.gather55
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i13, <8 x ptr> %i12, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  %i14 = or disjoint <8 x i64> %vec.ind, <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>
  %i15 = shl nsw <8 x i64> %i14, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  %i16 = getelementptr inbounds i32, ptr %B, <8 x i64> %i15
  %wide.masked.gather57 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i16, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  %i17 = getelementptr inbounds i32, ptr %A, <8 x i64> %i14
  %wide.masked.gather58 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i17, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  %i18 = add nsw <8 x i32> %wide.masked.gather58, %wide.masked.gather57
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i18, <8 x ptr> %i17, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  %index.next = add nuw i64 %index, 8
  %vec.ind.next = add <8 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %i19 = icmp eq i64 %index.next, 256
  br i1 %i19, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

declare <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr>, i32 immarg, <32 x i1>, <32 x i8>)
declare <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr>, i32 immarg, <8 x i1>, <8 x i32>)
declare void @llvm.masked.scatter.v32i8.v32p0(<32 x i8>, <32 x ptr>, i32 immarg, <32 x i1>)
declare void @llvm.masked.scatter.v8i32.v8p0(<8 x i32>, <8 x ptr>, i32 immarg, <8 x i1>)

; Make sure we don't crash in getTgtMemIntrinsic for a vector of pointers.
define void @gather_of_pointers(ptr noalias nocapture %arg, ptr noalias nocapture readonly %arg1) {
; V-LABEL: gather_of_pointers:
; V:       # %bb.0: # %bb
; V-NEXT:    lui a2, 2
; V-NEXT:    add a2, a0, a2
; V-NEXT:    li a3, 40
; V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; V-NEXT:  .LBB11_1: # %bb2
; V-NEXT:    # =>This Inner Loop Header: Depth=1
; V-NEXT:    addi a4, a1, 80
; V-NEXT:    vlse64.v v8, (a1), a3
; V-NEXT:    vlse64.v v9, (a4), a3
; V-NEXT:    addi a4, a0, 16
; V-NEXT:    vse64.v v8, (a0)
; V-NEXT:    vse64.v v9, (a4)
; V-NEXT:    addi a0, a0, 32
; V-NEXT:    addi a1, a1, 160
; V-NEXT:    bne a0, a2, .LBB11_1
; V-NEXT:  # %bb.2: # %bb18
; V-NEXT:    ret
;
; ZVE32F-LABEL: gather_of_pointers:
; ZVE32F:       # %bb.0: # %bb
; ZVE32F-NEXT:    li a2, 0
; ZVE32F-NEXT:    lui a3, 2
; ZVE32F-NEXT:    add a3, a0, a3
; ZVE32F-NEXT:    li a4, 1
; ZVE32F-NEXT:    li a5, 40
; ZVE32F-NEXT:  .LBB11_1: # %bb2
; ZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1
; ZVE32F-NEXT:    mul a6, a4, a5
; ZVE32F-NEXT:    add a6, a1, a6
; ZVE32F-NEXT:    mul a7, a2, a5
; ZVE32F-NEXT:    add a7, a1, a7
; ZVE32F-NEXT:    ld t0, 0(a6)
; ZVE32F-NEXT:    ld t1, 0(a7)
; ZVE32F-NEXT:    ld a6, 80(a6)
; ZVE32F-NEXT:    ld a7, 80(a7)
; ZVE32F-NEXT:    sd t0, 8(a0)
; ZVE32F-NEXT:    sd t1, 0(a0)
; ZVE32F-NEXT:    sd a6, 24(a0)
; ZVE32F-NEXT:    sd a7, 16(a0)
; ZVE32F-NEXT:    addi a2, a2, 4
; ZVE32F-NEXT:    addi a0, a0, 32
; ZVE32F-NEXT:    addi a4, a4, 4
; ZVE32F-NEXT:    bne a0, a3, .LBB11_1
; ZVE32F-NEXT:  # %bb.2: # %bb18
; ZVE32F-NEXT:    ret
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %i = phi i64 [ 0, %bb ], [ %i15, %bb2 ]
  %i3 = phi <2 x i64> [ <i64 0, i64 1>, %bb ], [ %i16, %bb2 ]
  %i4 = mul nuw nsw <2 x i64> %i3, <i64 5, i64 5>
  %i5 = mul <2 x i64> %i3, <i64 5, i64 5>
  %i6 = add <2 x i64> %i5, <i64 10, i64 10>
  %i7 = getelementptr inbounds ptr, ptr %arg1, <2 x i64> %i4
  %i8 = getelementptr inbounds ptr, ptr %arg1, <2 x i64> %i6
  %i9 = call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> %i7, i32 8, <2 x i1> <i1 true, i1 true>, <2 x ptr> undef)
  %i10 = call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> %i8, i32 8, <2 x i1> <i1 true, i1 true>, <2 x ptr> undef)
  %i11 = getelementptr inbounds ptr, ptr %arg, i64 %i
  store <2 x ptr> %i9, ptr %i11, align 8
  %i13 = getelementptr inbounds ptr, ptr %i11, i64 2
  store <2 x ptr> %i10, ptr %i13, align 8
  %i15 = add nuw i64 %i, 4
  %i16 = add <2 x i64> %i3, <i64 4, i64 4>
  %i17 = icmp eq i64 %i15, 1024
  br i1 %i17, label %bb18, label %bb2

bb18:                                             ; preds = %bb2
  ret void
}

declare <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr>, i32 immarg, <2 x i1>, <2 x ptr>)

; Make sure we don't crash in getTgtMemIntrinsic for a vector of pointers.
define void @scatter_of_pointers(ptr noalias nocapture %arg, ptr noalias nocapture readonly %arg1) {
; V-LABEL: scatter_of_pointers:
; V:       # %bb.0: # %bb
; V-NEXT:    lui a2, 2
; V-NEXT:    add a2, a1, a2
; V-NEXT:    li a3, 40
; V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; V-NEXT:  .LBB12_1: # %bb2
; V-NEXT:    # =>This Inner Loop Header: Depth=1
; V-NEXT:    addi a4, a1, 16
; V-NEXT:    vle64.v v8, (a1)
; V-NEXT:    vle64.v v9, (a4)
; V-NEXT:    addi a4, a0, 80
; V-NEXT:    vsse64.v v8, (a0), a3
; V-NEXT:    vsse64.v v9, (a4), a3
; V-NEXT:    addi a1, a1, 32
; V-NEXT:    addi a0, a0, 160
; V-NEXT:    bne a1, a2, .LBB12_1
; V-NEXT:  # %bb.2: # %bb18
; V-NEXT:    ret
;
; ZVE32F-LABEL: scatter_of_pointers:
; ZVE32F:       # %bb.0: # %bb
; ZVE32F-NEXT:    li a2, 0
; ZVE32F-NEXT:    lui a3, 2
; ZVE32F-NEXT:    add a3, a1, a3
; ZVE32F-NEXT:    li a4, 1
; ZVE32F-NEXT:    li a5, 40
; ZVE32F-NEXT:  .LBB12_1: # %bb2
; ZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1
; ZVE32F-NEXT:    ld a6, 8(a1)
; ZVE32F-NEXT:    ld a7, 0(a1)
; ZVE32F-NEXT:    ld t0, 24(a1)
; ZVE32F-NEXT:    ld t1, 16(a1)
; ZVE32F-NEXT:    mul t2, a4, a5
; ZVE32F-NEXT:    add t2, a0, t2
; ZVE32F-NEXT:    mul t3, a2, a5
; ZVE32F-NEXT:    add t3, a0, t3
; ZVE32F-NEXT:    sd a7, 0(t3)
; ZVE32F-NEXT:    sd a6, 0(t2)
; ZVE32F-NEXT:    sd t1, 80(t3)
; ZVE32F-NEXT:    sd t0, 80(t2)
; ZVE32F-NEXT:    addi a2, a2, 4
; ZVE32F-NEXT:    addi a1, a1, 32
; ZVE32F-NEXT:    addi a4, a4, 4
; ZVE32F-NEXT:    bne a1, a3, .LBB12_1
; ZVE32F-NEXT:  # %bb.2: # %bb18
; ZVE32F-NEXT:    ret
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %i = phi i64 [ 0, %bb ], [ %i15, %bb2 ]
  %i3 = phi <2 x i64> [ <i64 0, i64 1>, %bb ], [ %i16, %bb2 ]
  %i4 = getelementptr inbounds ptr, ptr %arg1, i64 %i
  %i6 = load <2 x ptr>, ptr %i4, align 8
  %i7 = getelementptr inbounds ptr, ptr %i4, i64 2
  %i9 = load <2 x ptr>, ptr %i7, align 8
  %i10 = mul nuw nsw <2 x i64> %i3, <i64 5, i64 5>
  %i11 = mul <2 x i64> %i3, <i64 5, i64 5>
  %i12 = add <2 x i64> %i11, <i64 10, i64 10>
  %i13 = getelementptr inbounds ptr, ptr %arg, <2 x i64> %i10
  %i14 = getelementptr inbounds ptr, ptr %arg, <2 x i64> %i12
  call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> %i6, <2 x ptr> %i13, i32 8, <2 x i1> <i1 true, i1 true>)
  call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> %i9, <2 x ptr> %i14, i32 8, <2 x i1> <i1 true, i1 true>)
  %i15 = add nuw i64 %i, 4
  %i16 = add <2 x i64> %i3, <i64 4, i64 4>
  %i17 = icmp eq i64 %i15, 1024
  br i1 %i17, label %bb18, label %bb2

bb18:                                             ; preds = %bb2
  ret void
}

declare void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr>, <2 x ptr>, i32 immarg, <2 x i1>)
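
; The loop's start index arrives as a runtime argument, so the vectorized body
; only covers the iterations that fit a multiple of 32 past the start value,
; and a scalar remainder loop (bb35) finishes off the rest. The strided vlse8
; with stride 5 must still be formed for the vector part.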
define void @strided_load_startval_add_with_splat(ptr noalias nocapture %arg, ptr noalias nocapture readonly %arg1, i32 signext %arg2) {
; CHECK-LABEL: strided_load_startval_add_with_splat:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    li a3, 1024
; CHECK-NEXT:    beq a2, a3, .LBB13_7
; CHECK-NEXT:  # %bb.1: # %bb3
; CHECK-NEXT:    li a3, 1023
; CHECK-NEXT:    subw a5, a3, a2
; CHECK-NEXT:    li a6, 31
; CHECK-NEXT:    mv a4, a2
; CHECK-NEXT:    bltu a5, a6, .LBB13_5
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    slli a5, a5, 32
; CHECK-NEXT:    srli a5, a5, 32
; CHECK-NEXT:    addi a5, a5, 1
; CHECK-NEXT:    andi a6, a5, -32
; CHECK-NEXT:    add a4, a6, a2
; CHECK-NEXT:    slli t0, a2, 2
; CHECK-NEXT:    add a7, a0, a2
; CHECK-NEXT:    add a2, a1, a2
; CHECK-NEXT:    add a2, a2, t0
; CHECK-NEXT:    add t0, a4, a0
; CHECK-NEXT:    li t2, 32
; CHECK-NEXT:    li t1, 5
; CHECK-NEXT:    vsetvli zero, t2, e8, m1, ta, ma
; CHECK-NEXT:  .LBB13_3: # %bb15
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vlse8.v v8, (a2), t1
; CHECK-NEXT:    vle8.v v9, (a7)
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vse8.v v8, (a7)
; CHECK-NEXT:    addi a7, a7, 32
; CHECK-NEXT:    addi a2, a2, 160
; CHECK-NEXT:    bne a7, t0, .LBB13_3
; CHECK-NEXT:  # %bb.4: # %bb30
; CHECK-NEXT:    beq a5, a6, .LBB13_7
; CHECK-NEXT:  .LBB13_5: # %bb32
; CHECK-NEXT:    add a2, a0, a4
; CHECK-NEXT:    slli a5, a4, 2
; CHECK-NEXT:    add a1, a1, a4
; CHECK-NEXT:    add a1, a1, a5
; CHECK-NEXT:    subw a3, a3, a4
; CHECK-NEXT:    slli a3, a3, 32
; CHECK-NEXT:    srli a3, a3, 32
; CHECK-NEXT:    add a0, a4, a0
; CHECK-NEXT:    add a0, a0, a3
; CHECK-NEXT:    addi a0, a0, 1
; CHECK-NEXT:  .LBB13_6: # %bb35
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    lbu a3, 0(a1)
; CHECK-NEXT:    lbu a4, 0(a2)
; CHECK-NEXT:    add a3, a4, a3
; CHECK-NEXT:    sb a3, 0(a2)
; CHECK-NEXT:    addi a2, a2, 1
; CHECK-NEXT:    addi a1, a1, 5
; CHECK-NEXT:    bne a2, a0, .LBB13_6
; CHECK-NEXT:  .LBB13_7: # %bb34
; CHECK-NEXT:    ret
bb:
  %i = icmp eq i32 %arg2, 1024
  br i1 %i, label %bb34, label %bb3

bb3:                                              ; preds = %bb
  %i4 = sext i32 %arg2 to i64
  %i5 = sub i32 1023, %arg2
  %i6 = zext i32 %i5 to i64
  %i7 = add nuw nsw i64 %i6, 1
  %i8 = icmp ult i32 %i5, 31
  br i1 %i8, label %bb32, label %bb9

bb9:                                              ; preds = %bb3
  %i10 = and i64 %i7, 8589934560
  %i11 = add nsw i64 %i10, %i4
  %i12 = insertelement <32 x i64> poison, i64 %i4, i64 0
  %i13 = shufflevector <32 x i64> %i12, <32 x i64> poison, <32 x i32> zeroinitializer
  %i14 = add <32 x i64> %i13, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>
  br label %bb15

bb15:                                             ; preds = %bb15, %bb9
  %i16 = phi i64 [ 0, %bb9 ], [ %i27, %bb15 ]
  %i17 = phi <32 x i64> [ %i14, %bb9 ], [ %i28, %bb15 ]
  %i18 = add i64 %i16, %i4
  %i19 = mul nsw <32 x i64> %i17, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
  %i20 = getelementptr inbounds i8, ptr %arg1, <32 x i64> %i19
  %i21 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i20, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> undef)
  %i22 = getelementptr inbounds i8, ptr %arg, i64 %i18
  %i24 = load <32 x i8>, ptr %i22, align 1
  %i25 = add <32 x i8> %i24, %i21
  store <32 x i8> %i25, ptr %i22, align 1
  %i27 = add nuw i64 %i16, 32
  %i28 = add <32 x i64> %i17, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %i29 = icmp eq i64 %i27, %i10
  br i1 %i29, label %bb30, label %bb15

bb30:                                             ; preds = %bb15
  %i31 = icmp eq i64 %i7, %i10
  br i1 %i31, label %bb34, label %bb32

bb32:                                             ; preds = %bb30, %bb3
  %i33 = phi i64 [ %i4, %bb3 ], [ %i11, %bb30 ]
  br label %bb35

bb34:                                             ; preds = %bb35, %bb30, %bb
  ret void

bb35:                                             ; preds = %bb35, %bb32
  %i36 = phi i64 [ %i43, %bb35 ], [ %i33, %bb32 ]
  %i37 = mul nsw i64 %i36, 5
  %i38 = getelementptr inbounds i8, ptr %arg1, i64 %i37
  %i39 = load i8, ptr %i38, align 1
  %i40 = getelementptr inbounds i8, ptr %arg, i64 %i36
  %i41 = load i8, ptr %i40, align 1
  %i42 = add i8 %i41, %i39
  store i8 %i42, ptr %i40, align 1
  %i43 = add nsw i64 %i36, 1
  %i44 = trunc i64 %i43 to i32
  %i45 = icmp eq i32 %i44, 1024
  br i1 %i45, label %bb34, label %bb35
}

declare <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr>, i32 immarg, <16 x i1>, <16 x i8>)
declare void @llvm.masked.scatter.v16i8.v16p0(<16 x i8>, <16 x ptr>, i32 immarg, <16 x i1>)
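
; The trip count %arg2 << 4 is always a multiple of the vector factor 16, so no
; scalar remainder loop is needed: the whole loop becomes a strided vlse8 with
; stride 5, here with 16 elements at e8/mf2 (128 bits at zvl256b).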
define void @gather_no_scalar_remainder(ptr noalias nocapture noundef %arg, ptr noalias nocapture noundef readonly %arg1, i64 noundef %arg2) {
; CHECK-LABEL: gather_no_scalar_remainder:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    slli a2, a2, 4
; CHECK-NEXT:    beqz a2, .LBB14_3
; CHECK-NEXT:  # %bb.1: # %bb2
; CHECK-NEXT:    addi a2, a2, -16
; CHECK-NEXT:    andi a2, a2, -16
; CHECK-NEXT:    add a2, a2, a0
; CHECK-NEXT:    addi a2, a2, 16
; CHECK-NEXT:    li a3, 5
; CHECK-NEXT:    vsetivli zero, 16, e8, mf2, ta, ma
; CHECK-NEXT:  .LBB14_2: # %bb4
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vlse8.v v8, (a1), a3
; CHECK-NEXT:    vle8.v v9, (a0)
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vse8.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 16
; CHECK-NEXT:    addi a1, a1, 80
; CHECK-NEXT:    bne a0, a2, .LBB14_2
; CHECK-NEXT:  .LBB14_3: # %bb16
; CHECK-NEXT:    ret
bb:
  %i = shl i64 %arg2, 4
  %i3 = icmp eq i64 %i, 0
  br i1 %i3, label %bb16, label %bb2

bb2:                                              ; preds = %bb
  br label %bb4

bb4:                                              ; preds = %bb4, %bb2
  %i5 = phi i64 [ %i13, %bb4 ], [ 0, %bb2 ]
  %i6 = phi <16 x i64> [ %i14, %bb4 ], [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, %bb2 ]
  %i7 = mul <16 x i64> %i6, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
  %i8 = getelementptr inbounds i8, ptr %arg1, <16 x i64> %i7
  %i9 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %i8, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
  %i10 = getelementptr inbounds i8, ptr %arg, i64 %i5
  %i11 = load <16 x i8>, ptr %i10, align 1
  %i12 = add <16 x i8> %i11, %i9
  store <16 x i8> %i12, ptr %i10, align 1
  %i13 = add nuw i64 %i5, 16
  %i14 = add <16 x i64> %i6, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
  %i15 = icmp eq i64 %i13, %i
  br i1 %i15, label %bb16, label %bb4

bb16:                                             ; preds = %bb4, %bb
  ret void
}