; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvl256b | FileCheck %s --check-prefixes=CHECK,V
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+f,+zve32f,+zvl256b | FileCheck %s --check-prefixes=CHECK,ZVE32F
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+no-optimized-zero-stride-load,+zvl256b | FileCheck %s --check-prefixes=CHECK,NOT-OPTIMIZED
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+f,+zve32f,+no-optimized-zero-stride-load,+zvl256b | FileCheck %s --check-prefixes=CHECK,NOT-OPTIMIZED

%struct.foo = type { i32, i32, i32, i32 }

; void gather(signed char * __restrict A, signed char * __restrict B) {
;   for (int i = 0; i != 1024; ++i)
;     A[i] += B[i * 5];
; }
define void @gather(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: li a3, 32
; CHECK-NEXT: li a4, 5
; CHECK-NEXT: .LBB0_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-NEXT: vlse8.v v8, (a1), a4
; CHECK-NEXT: vle8.v v9, (a0)
; CHECK-NEXT: vadd.vv v8, v9, v8
; CHECK-NEXT: vse8.v v8, (a0)
; CHECK-NEXT: addi a2, a2, -32
; CHECK-NEXT: addi a0, a0, 32
; CHECK-NEXT: addi a1, a1, 160
; CHECK-NEXT: bnez a2, .LBB0_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
  br label %vector.body

vector.body: ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> undef)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = add <32 x i8> %wide.load, %wide.masked.gather
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body
  ret void
}

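; A rough scalar picture of the masked variant below (illustrative sketch, not
; from the original source; "mask" names the fixed 32-lane i1 pattern in the
; IR, whose bits form the constant 0xF02D5369 that the lui/addi pair
; materializes into v0, bit i corresponding to lane i):
; void gather_masked(signed char *A, signed char *B, signed char maskedoff[32]) {
;   for (int i = 0; i != 1024; ++i)
;     A[i] += mask[i % 32] ? B[i * 5] : maskedoff[i % 32];
; }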
define void @gather_masked(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, <32 x i8> %maskedoff) {
; V-LABEL: gather_masked:
; V: # %bb.0: # %entry
; V-NEXT: li a2, 1024
; V-NEXT: lui a3, 983765
; V-NEXT: addi a3, a3, 873
; V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
; V-NEXT: vmv.s.x v0, a3
; V-NEXT: li a3, 32
; V-NEXT: li a4, 5
; V-NEXT: .LBB1_1: # %vector.body
; V-NEXT: # =>This Inner Loop Header: Depth=1
; V-NEXT: vsetvli zero, a3, e8, m1, ta, mu
; V-NEXT: vmv1r.v v9, v8
; V-NEXT: vlse8.v v9, (a1), a4, v0.t
; V-NEXT: vle8.v v10, (a0)
; V-NEXT: vadd.vv v9, v10, v9
; V-NEXT: vse8.v v9, (a0)
; V-NEXT: addi a2, a2, -32
; V-NEXT: addi a0, a0, 32
; V-NEXT: addi a1, a1, 160
; V-NEXT: bnez a2, .LBB1_1
; V-NEXT: # %bb.2: # %for.cond.cleanup
; V-NEXT: ret
;
; ZVE32F-LABEL: gather_masked:
; ZVE32F: # %bb.0: # %entry
; ZVE32F-NEXT: li a2, 1024
; ZVE32F-NEXT: lui a3, 983765
; ZVE32F-NEXT: addi a3, a3, 873
; ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; ZVE32F-NEXT: vmv.s.x v0, a3
; ZVE32F-NEXT: li a3, 32
; ZVE32F-NEXT: li a4, 5
; ZVE32F-NEXT: .LBB1_1: # %vector.body
; ZVE32F-NEXT: # =>This Inner Loop Header: Depth=1
; ZVE32F-NEXT: vsetvli zero, a3, e8, m1, ta, mu
; ZVE32F-NEXT: vmv1r.v v9, v8
; ZVE32F-NEXT: vlse8.v v9, (a1), a4, v0.t
; ZVE32F-NEXT: vle8.v v10, (a0)
; ZVE32F-NEXT: vadd.vv v9, v10, v9
; ZVE32F-NEXT: vse8.v v9, (a0)
; ZVE32F-NEXT: addi a2, a2, -32
; ZVE32F-NEXT: addi a0, a0, 32
; ZVE32F-NEXT: addi a1, a1, 160
; ZVE32F-NEXT: bnez a2, .LBB1_1
; ZVE32F-NEXT: # %bb.2: # %for.cond.cleanup
; ZVE32F-NEXT: ret
entry:
  br label %vector.body

vector.body: ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <32 x i8> %maskedoff)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = add <32 x i8> %wide.load, %wide.masked.gather
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body
  ret void
}

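; Illustrative scalar sketch (not from the original source): within each
; 32-element chunk the lanes are visited in reverse order, which the backend
; turns into a strided load with stride -5 starting at the chunk's last element:
; for (int i = 0; i != 1024; i += 32)
;   for (int j = 0; j != 32; ++j)
;     A[i + j] += B[(i + 31 - j) * 5];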
define void @gather_negative_stride(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_negative_stride:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addi a1, a1, 155
; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: li a3, 32
; CHECK-NEXT: li a4, -5
; CHECK-NEXT: .LBB2_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-NEXT: vlse8.v v8, (a1), a4
; CHECK-NEXT: vle8.v v9, (a0)
; CHECK-NEXT: vadd.vv v8, v9, v8
; CHECK-NEXT: vse8.v v8, (a0)
; CHECK-NEXT: addi a2, a2, -32
; CHECK-NEXT: addi a0, a0, 32
; CHECK-NEXT: addi a1, a1, 160
; CHECK-NEXT: bnez a2, .LBB2_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
  br label %vector.body

vector.body: ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 31, i64 30, i64 29, i64 28, i64 27, i64 26, i64 25, i64 24, i64 23, i64 22, i64 21, i64 20, i64 19, i64 18, i64 17, i64 16, i64 15, i64 14, i64 13, i64 12, i64 11, i64 10, i64 9, i64 8, i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> undef)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = add <32 x i8> %wide.load, %wide.masked.gather
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body
  ret void
}

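; Illustrative scalar sketch (not from the original source): the index vector
; is a splat, so all 32 lanes of a chunk read the same element and the gather
; folds to a scalar load broadcast through vadd.vx:
; for (int i = 0; i != 1024; i += 32)
;   for (int j = 0; j != 32; ++j)
;     A[i + j] += B[i * 5];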
define void @gather_zero_stride(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_zero_stride:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: li a3, 32
; CHECK-NEXT: .LBB3_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: lbu a4, 0(a1)
; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: vadd.vx v8, v8, a4
; CHECK-NEXT: vse8.v v8, (a0)
; CHECK-NEXT: addi a2, a2, -32
; CHECK-NEXT: addi a0, a0, 32
; CHECK-NEXT: addi a1, a1, 160
; CHECK-NEXT: bnez a2, .LBB3_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
  br label %vector.body

vector.body: ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> undef)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = add <32 x i8> %wide.load, %wide.masked.gather
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body
  ret void
}

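; Illustrative scalar sketch (not from the original source): same splat index,
; but the gathered value is the dividend of an unsigned division, so the
; zero-stride load cannot fold into a .vx form and stays a vlse with stride 0
; (or a broadcast when that optimization is disabled):
; for (int i = 0; i != 1024; i += 32)
;   for (int j = 0; j != 32; ++j)
;     A[i + j] = B[i * 5] / A[i + j];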
define void @gather_zero_stride_unfold(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; V-LABEL: gather_zero_stride_unfold:
; V: # %bb.0: # %entry
; V-NEXT: li a2, 1024
; V-NEXT: li a3, 32
; V-NEXT: .LBB4_1: # %vector.body
; V-NEXT: # =>This Inner Loop Header: Depth=1
; V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; V-NEXT: vlse8.v v8, (a1), zero
; V-NEXT: vle8.v v9, (a0)
; V-NEXT: vdivu.vv v8, v8, v9
; V-NEXT: vse8.v v8, (a0)
; V-NEXT: addi a2, a2, -32
; V-NEXT: addi a0, a0, 32
; V-NEXT: addi a1, a1, 160
; V-NEXT: bnez a2, .LBB4_1
; V-NEXT: # %bb.2: # %for.cond.cleanup
; V-NEXT: ret
;
; ZVE32F-LABEL: gather_zero_stride_unfold:
; ZVE32F: # %bb.0: # %entry
; ZVE32F-NEXT: li a2, 1024
; ZVE32F-NEXT: li a3, 32
; ZVE32F-NEXT: .LBB4_1: # %vector.body
; ZVE32F-NEXT: # =>This Inner Loop Header: Depth=1
; ZVE32F-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; ZVE32F-NEXT: vlse8.v v8, (a1), zero
; ZVE32F-NEXT: vle8.v v9, (a0)
; ZVE32F-NEXT: vdivu.vv v8, v8, v9
; ZVE32F-NEXT: vse8.v v8, (a0)
; ZVE32F-NEXT: addi a2, a2, -32
; ZVE32F-NEXT: addi a0, a0, 32
; ZVE32F-NEXT: addi a1, a1, 160
; ZVE32F-NEXT: bnez a2, .LBB4_1
; ZVE32F-NEXT: # %bb.2: # %for.cond.cleanup
; ZVE32F-NEXT: ret
;
; NOT-OPTIMIZED-LABEL: gather_zero_stride_unfold:
; NOT-OPTIMIZED: # %bb.0: # %entry
; NOT-OPTIMIZED-NEXT: li a2, 1024
; NOT-OPTIMIZED-NEXT: li a3, 32
; NOT-OPTIMIZED-NEXT: .LBB4_1: # %vector.body
; NOT-OPTIMIZED-NEXT: # =>This Inner Loop Header: Depth=1
; NOT-OPTIMIZED-NEXT: lbu a4, 0(a1)
; NOT-OPTIMIZED-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; NOT-OPTIMIZED-NEXT: vle8.v v8, (a0)
; NOT-OPTIMIZED-NEXT: vmv.v.x v9, a4
; NOT-OPTIMIZED-NEXT: vdivu.vv v8, v9, v8
; NOT-OPTIMIZED-NEXT: vse8.v v8, (a0)
; NOT-OPTIMIZED-NEXT: addi a2, a2, -32
; NOT-OPTIMIZED-NEXT: addi a0, a0, 32
; NOT-OPTIMIZED-NEXT: addi a1, a1, 160
; NOT-OPTIMIZED-NEXT: bnez a2, .LBB4_1
; NOT-OPTIMIZED-NEXT: # %bb.2: # %for.cond.cleanup
; NOT-OPTIMIZED-NEXT: ret
entry:
  br label %vector.body

vector.body: ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> undef)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = udiv <32 x i8> %wide.masked.gather, %wide.load
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body
  ret void
}

;void scatter(signed char * __restrict A, signed char * __restrict B) {
;  for (int i = 0; i < 1024; ++i)
;    A[i * 5] += B[i];
;}
define void @scatter(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: scatter:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: li a3, 32
; CHECK-NEXT: li a4, 5
; CHECK-NEXT: .LBB5_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-NEXT: vle8.v v8, (a1)
; CHECK-NEXT: vlse8.v v9, (a0), a4
; CHECK-NEXT: vadd.vv v8, v9, v8
; CHECK-NEXT: vsse8.v v8, (a0), a4
; CHECK-NEXT: addi a2, a2, -32
; CHECK-NEXT: addi a1, a1, 32
; CHECK-NEXT: addi a0, a0, 160
; CHECK-NEXT: bnez a2, .LBB5_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
  br label %vector.body

vector.body: ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = getelementptr inbounds i8, ptr %B, i64 %index
  %wide.load = load <32 x i8>, ptr %i, align 1
  %i2 = mul nuw nsw <32 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
  %i3 = getelementptr inbounds i8, ptr %A, <32 x i64> %i2
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i3, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> undef)
  %i4 = add <32 x i8> %wide.masked.gather, %wide.load
  call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> %i4, <32 x ptr> %i3, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %i5 = icmp eq i64 %index.next, 1024
  br i1 %i5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body
  ret void
}

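; Illustrative scalar sketch of the masked variant below (not from the
; original source; "mask" is the same fixed 32-lane pattern as in
; @gather_masked, i.e. the bits of 0xF02D5369):
; for (int i = 0; i != 1024; ++i)
;   if (mask[i % 32])
;     A[i * 5] += B[i];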
define void @scatter_masked(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, <32 x i8> %maskedoff) {
; V-LABEL: scatter_masked:
; V: # %bb.0: # %entry
; V-NEXT: li a2, 1024
; V-NEXT: li a3, 32
; V-NEXT: lui a4, 983765
; V-NEXT: addi a4, a4, 873
; V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
; V-NEXT: vmv.s.x v0, a4
; V-NEXT: li a4, 5
; V-NEXT: .LBB6_1: # %vector.body
; V-NEXT: # =>This Inner Loop Header: Depth=1
; V-NEXT: vsetvli zero, a3, e8, m1, ta, mu
; V-NEXT: vle8.v v9, (a1)
; V-NEXT: vmv1r.v v10, v8
; V-NEXT: vlse8.v v10, (a0), a4, v0.t
; V-NEXT: vadd.vv v9, v10, v9
; V-NEXT: vsse8.v v9, (a0), a4, v0.t
; V-NEXT: addi a2, a2, -32
; V-NEXT: addi a1, a1, 32
; V-NEXT: addi a0, a0, 160
; V-NEXT: bnez a2, .LBB6_1
; V-NEXT: # %bb.2: # %for.cond.cleanup
; V-NEXT: ret
;
; ZVE32F-LABEL: scatter_masked:
; ZVE32F: # %bb.0: # %entry
; ZVE32F-NEXT: li a2, 1024
; ZVE32F-NEXT: li a3, 32
; ZVE32F-NEXT: lui a4, 983765
; ZVE32F-NEXT: addi a4, a4, 873
; ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; ZVE32F-NEXT: vmv.s.x v0, a4
; ZVE32F-NEXT: li a4, 5
; ZVE32F-NEXT: .LBB6_1: # %vector.body
; ZVE32F-NEXT: # =>This Inner Loop Header: Depth=1
; ZVE32F-NEXT: vsetvli zero, a3, e8, m1, ta, mu
; ZVE32F-NEXT: vle8.v v9, (a1)
; ZVE32F-NEXT: vmv1r.v v10, v8
; ZVE32F-NEXT: vlse8.v v10, (a0), a4, v0.t
; ZVE32F-NEXT: vadd.vv v9, v10, v9
; ZVE32F-NEXT: vsse8.v v9, (a0), a4, v0.t
; ZVE32F-NEXT: addi a2, a2, -32
; ZVE32F-NEXT: addi a1, a1, 32
; ZVE32F-NEXT: addi a0, a0, 160
; ZVE32F-NEXT: bnez a2, .LBB6_1
; ZVE32F-NEXT: # %bb.2: # %for.cond.cleanup
; ZVE32F-NEXT: ret
entry:
  br label %vector.body

vector.body: ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = getelementptr inbounds i8, ptr %B, i64 %index
  %wide.load = load <32 x i8>, ptr %i, align 1
  %i2 = mul nuw nsw <32 x i64> %vec.ind, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
  %i3 = getelementptr inbounds i8, ptr %A, <32 x i64> %i2
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i3, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <32 x i8> %maskedoff)
  %i4 = add <32 x i8> %wide.masked.gather, %wide.load
  call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> %i4, <32 x ptr> %i3, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>)
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %i5 = icmp eq i64 %index.next, 1024
  br i1 %i5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body
  ret void
}

; void gather_pow2(signed char * __restrict A, signed char * __restrict B) {
;   for (int i = 0; i != 1024; ++i)
;     A[i] += B[i * 4];
; }
define void @gather_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_pow2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: li a3, 16
; CHECK-NEXT: li a4, 32
; CHECK-NEXT: .LBB7_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT: vlse32.v v8, (a1), a3
; CHECK-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-NEXT: vle8.v v9, (a0)
; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT: vadd.vv v8, v9, v8
; CHECK-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-NEXT: vse8.v v8, (a0)
; CHECK-NEXT: addi a2, a2, -8
; CHECK-NEXT: addi a0, a0, 32
; CHECK-NEXT: addi a1, a1, 128
; CHECK-NEXT: bnez a2, .LBB7_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
  br label %vector.body

vector.body: ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = shl nsw <8 x i64> %vec.ind, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  %i1 = getelementptr inbounds i32, ptr %B, <8 x i64> %i
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  %i2 = getelementptr inbounds i32, ptr %A, i64 %index
  %wide.load = load <8 x i32>, ptr %i2, align 1
  %i4 = add <8 x i32> %wide.load, %wide.masked.gather
  store <8 x i32> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 8
  %vec.ind.next = add <8 x i64> %vec.ind, <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body
  ret void
}

;void scatter_pow2(signed char * __restrict A, signed char * __restrict B) {
;  for (int i = 0; i < 1024; ++i)
;    A[i * 4] += B[i];
;}
define void @scatter_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: scatter_pow2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: li a3, 32
; CHECK-NEXT: li a4, 16
; CHECK-NEXT: .LBB8_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-NEXT: vle8.v v8, (a1)
; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT: vlse32.v v9, (a0), a4
; CHECK-NEXT: vadd.vv v8, v9, v8
; CHECK-NEXT: vsse32.v v8, (a0), a4
; CHECK-NEXT: addi a2, a2, -8
; CHECK-NEXT: addi a1, a1, 32
; CHECK-NEXT: addi a0, a0, 128
; CHECK-NEXT: bnez a2, .LBB8_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
  br label %vector.body

vector.body: ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = getelementptr inbounds i32, ptr %B, i64 %index
  %wide.load = load <8 x i32>, ptr %i, align 1
  %i2 = shl nuw nsw <8 x i64> %vec.ind, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  %i3 = getelementptr inbounds i32, ptr %A, <8 x i64> %i2
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i3, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  %i4 = add <8 x i32> %wide.masked.gather, %wide.load
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i4, <8 x ptr> %i3, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  %index.next = add nuw i64 %index, 8
  %vec.ind.next = add <8 x i64> %vec.ind, <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
  %i5 = icmp eq i64 %index.next, 1024
  br i1 %i5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body
  ret void
}

;struct foo {
;  int a, b, c, d;
;};
;
;void struct_gather(int * __restrict A, struct foo * __restrict B) {
;  for (int i = 0; i < 1024; ++i)
;    A[i] += B[i].b;
;}
define void @struct_gather(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: struct_gather:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addi a1, a1, 132
; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: li a3, 16
; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT: .LBB9_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: addi a4, a0, 32
; CHECK-NEXT: addi a5, a1, -128
; CHECK-NEXT: vlse32.v v8, (a5), a3
; CHECK-NEXT: vlse32.v v9, (a1), a3
; CHECK-NEXT: vle32.v v10, (a0)
; CHECK-NEXT: vle32.v v11, (a4)
; CHECK-NEXT: vadd.vv v8, v10, v8
; CHECK-NEXT: vadd.vv v9, v11, v9
; CHECK-NEXT: vse32.v v8, (a0)
; CHECK-NEXT: vse32.v v9, (a4)
; CHECK-NEXT: addi a2, a2, -16
; CHECK-NEXT: addi a0, a0, 64
; CHECK-NEXT: addi a1, a1, 256
; CHECK-NEXT: bnez a2, .LBB9_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
  br label %vector.body

vector.body: ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
  %step.add = add <8 x i64> %vec.ind, <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
  %i = getelementptr inbounds %struct.foo, ptr %B, <8 x i64> %vec.ind, i32 1
  %i1 = getelementptr inbounds %struct.foo, ptr %B, <8 x i64> %step.add, i32 1
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  %wide.masked.gather9 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  %i2 = getelementptr inbounds i32, ptr %A, i64 %index
  %wide.load = load <8 x i32>, ptr %i2, align 4
  %i4 = getelementptr inbounds i32, ptr %i2, i64 8
  %wide.load10 = load <8 x i32>, ptr %i4, align 4
  %i6 = add nsw <8 x i32> %wide.load, %wide.masked.gather
  %i7 = add nsw <8 x i32> %wide.load10, %wide.masked.gather9
  store <8 x i32> %i6, ptr %i2, align 4
  store <8 x i32> %i7, ptr %i4, align 4
  %index.next = add nuw i64 %index, 16
  %vec.ind.next = add <8 x i64> %vec.ind, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
  %i10 = icmp eq i64 %index.next, 1024
  br i1 %i10, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body
  ret void
}

;void gather_unroll(int * __restrict A, int * __restrict B) {
;  for (int i = 0; i < 1024; i+= 4 ) {
;    A[i] += B[i * 4];
;    A[i+1] += B[(i+1) * 4];
;    A[i+2] += B[(i+2) * 4];
;    A[i+3] += B[(i+3) * 4];
;  }
;}
define void @gather_unroll(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_unroll:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: li a2, 256
; CHECK-NEXT: li a3, 64
; CHECK-NEXT: li a4, 16
; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT: .LBB10_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vlse32.v v8, (a1), a3
; CHECK-NEXT: vlse32.v v9, (a0), a4
; CHECK-NEXT: vadd.vv v8, v9, v8
; CHECK-NEXT: vsse32.v v8, (a0), a4
; CHECK-NEXT: addi a5, a1, 16
; CHECK-NEXT: vlse32.v v8, (a5), a3
; CHECK-NEXT: addi a5, a0, 4
; CHECK-NEXT: vlse32.v v9, (a5), a4
; CHECK-NEXT: vadd.vv v8, v9, v8
; CHECK-NEXT: vsse32.v v8, (a5), a4
; CHECK-NEXT: addi a5, a1, 32
; CHECK-NEXT: vlse32.v v8, (a5), a3
; CHECK-NEXT: addi a5, a0, 8
; CHECK-NEXT: vlse32.v v9, (a5), a4
; CHECK-NEXT: vadd.vv v8, v9, v8
; CHECK-NEXT: vsse32.v v8, (a5), a4
; CHECK-NEXT: addi a5, a1, 48
; CHECK-NEXT: vlse32.v v8, (a5), a3
; CHECK-NEXT: addi a5, a0, 12
; CHECK-NEXT: vlse32.v v9, (a5), a4
; CHECK-NEXT: vadd.vv v8, v9, v8
; CHECK-NEXT: vsse32.v v8, (a5), a4
; CHECK-NEXT: addi a2, a2, -8
; CHECK-NEXT: addi a1, a1, 512
; CHECK-NEXT: addi a0, a0, 128
; CHECK-NEXT: bnez a2, .LBB10_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
  br label %vector.body

vector.body: ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ <i64 0, i64 4, i64 8, i64 12, i64 16, i64 20, i64 24, i64 28>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = shl nuw nsw <8 x i64> %vec.ind, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  %i1 = getelementptr inbounds i32, ptr %B, <8 x i64> %i
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  %i2 = getelementptr inbounds i32, ptr %A, <8 x i64> %vec.ind
  %wide.masked.gather52 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i2, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  %i3 = add nsw <8 x i32> %wide.masked.gather52, %wide.masked.gather
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i3, <8 x ptr> %i2, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  %i4 = or <8 x i64> %vec.ind, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
  %i5 = shl nsw <8 x i64> %i4, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  %i6 = getelementptr inbounds i32, ptr %B, <8 x i64> %i5
  %wide.masked.gather53 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i6, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  %i7 = getelementptr inbounds i32, ptr %A, <8 x i64> %i4
  %wide.masked.gather54 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i7, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  %i8 = add nsw <8 x i32> %wide.masked.gather54, %wide.masked.gather53
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i8, <8 x ptr> %i7, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  %i9 = or <8 x i64> %vec.ind, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  %i10 = shl nsw <8 x i64> %i9, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  %i11 = getelementptr inbounds i32, ptr %B, <8 x i64> %i10
  %wide.masked.gather55 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i11, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  %i12 = getelementptr inbounds i32, ptr %A, <8 x i64> %i9
  %wide.masked.gather56 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i12, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  %i13 = add nsw <8 x i32> %wide.masked.gather56, %wide.masked.gather55
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i13, <8 x ptr> %i12, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  %i14 = or <8 x i64> %vec.ind, <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>
  %i15 = shl nsw <8 x i64> %i14, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  %i16 = getelementptr inbounds i32, ptr %B, <8 x i64> %i15
  %wide.masked.gather57 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i16, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  %i17 = getelementptr inbounds i32, ptr %A, <8 x i64> %i14
  %wide.masked.gather58 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i17, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  %i18 = add nsw <8 x i32> %wide.masked.gather58, %wide.masked.gather57
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i18, <8 x ptr> %i17, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  %index.next = add nuw i64 %index, 8
  %vec.ind.next = add <8 x i64> %vec.ind, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %i19 = icmp eq i64 %index.next, 256
  br i1 %i19, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body
  ret void
}

declare <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr>, i32 immarg, <32 x i1>, <32 x i8>)
declare <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr>, i32 immarg, <8 x i1>, <8 x i32>)
declare void @llvm.masked.scatter.v32i8.v32p0(<32 x i8>, <32 x ptr>, i32 immarg, <32 x i1>)
declare void @llvm.masked.scatter.v8i32.v8p0(<8 x i32>, <8 x ptr>, i32 immarg, <8 x i1>)

; Make sure we don't crash in getTgtMemIntrinsic for a vector of pointers.
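; Illustrative scalar sketch (not from the original source): a strided gather
; of pointer-sized elements (8 bytes each, hence the 40-byte stride in the
; assembly) stored contiguously:
; for (long i = 0; i != 1024; ++i)
;   arg[i] = arg1[i * 5];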
define void @gather_of_pointers(ptr noalias nocapture %arg, ptr noalias nocapture readonly %arg1) {
; V-LABEL: gather_of_pointers:
; V: # %bb.0: # %bb
; V-NEXT: li a2, 1024
; V-NEXT: li a3, 40
; V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; V-NEXT: .LBB11_1: # %bb2
; V-NEXT: # =>This Inner Loop Header: Depth=1
; V-NEXT: addi a4, a1, 80
; V-NEXT: vlse64.v v8, (a1), a3
; V-NEXT: vlse64.v v9, (a4), a3
; V-NEXT: addi a4, a0, 16
; V-NEXT: vse64.v v8, (a0)
; V-NEXT: vse64.v v9, (a4)
; V-NEXT: addi a2, a2, -4
; V-NEXT: addi a0, a0, 32
; V-NEXT: addi a1, a1, 160
; V-NEXT: bnez a2, .LBB11_1
; V-NEXT: # %bb.2: # %bb18
; V-NEXT: ret
;
; ZVE32F-LABEL: gather_of_pointers:
; ZVE32F: # %bb.0: # %bb
; ZVE32F-NEXT: li a2, 0
; ZVE32F-NEXT: li a3, 1
; ZVE32F-NEXT: li a4, 1024
; ZVE32F-NEXT: li a5, 40
; ZVE32F-NEXT: .LBB11_1: # %bb2
; ZVE32F-NEXT: # =>This Inner Loop Header: Depth=1
; ZVE32F-NEXT: mul a6, a3, a5
; ZVE32F-NEXT: add a6, a1, a6
; ZVE32F-NEXT: mul a7, a2, a5
; ZVE32F-NEXT: add a7, a1, a7
; ZVE32F-NEXT: ld t0, 0(a6)
; ZVE32F-NEXT: ld t1, 0(a7)
; ZVE32F-NEXT: ld a6, 80(a6)
; ZVE32F-NEXT: ld a7, 80(a7)
; ZVE32F-NEXT: sd t0, 8(a0)
; ZVE32F-NEXT: sd t1, 0(a0)
; ZVE32F-NEXT: sd a6, 24(a0)
; ZVE32F-NEXT: sd a7, 16(a0)
; ZVE32F-NEXT: addi a2, a2, 4
; ZVE32F-NEXT: addi a3, a3, 4
; ZVE32F-NEXT: addi a4, a4, -4
; ZVE32F-NEXT: addi a0, a0, 32
; ZVE32F-NEXT: bnez a4, .LBB11_1
; ZVE32F-NEXT: # %bb.2: # %bb18
; ZVE32F-NEXT: ret
bb:
  br label %bb2

bb2: ; preds = %bb2, %bb
  %i = phi i64 [ 0, %bb ], [ %i15, %bb2 ]
  %i3 = phi <2 x i64> [ <i64 0, i64 1>, %bb ], [ %i16, %bb2 ]
  %i4 = mul nuw nsw <2 x i64> %i3, <i64 5, i64 5>
  %i5 = mul <2 x i64> %i3, <i64 5, i64 5>
  %i6 = add <2 x i64> %i5, <i64 10, i64 10>
  %i7 = getelementptr inbounds ptr, ptr %arg1, <2 x i64> %i4
  %i8 = getelementptr inbounds ptr, ptr %arg1, <2 x i64> %i6
  %i9 = call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> %i7, i32 8, <2 x i1> <i1 true, i1 true>, <2 x ptr> undef)
  %i10 = call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> %i8, i32 8, <2 x i1> <i1 true, i1 true>, <2 x ptr> undef)
  %i11 = getelementptr inbounds ptr, ptr %arg, i64 %i
  store <2 x ptr> %i9, ptr %i11, align 8
  %i13 = getelementptr inbounds ptr, ptr %i11, i64 2
  store <2 x ptr> %i10, ptr %i13, align 8
  %i15 = add nuw i64 %i, 4
  %i16 = add <2 x i64> %i3, <i64 4, i64 4>
  %i17 = icmp eq i64 %i15, 1024
  br i1 %i17, label %bb18, label %bb2

bb18: ; preds = %bb2
  ret void
}

declare <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr>, i32 immarg, <2 x i1>, <2 x ptr>)

; Make sure we don't crash in getTgtMemIntrinsic for a vector of pointers.
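; Illustrative scalar sketch (not from the original source): the mirror image
; of @gather_of_pointers, a contiguous load of pointers scattered with a
; 40-byte stride:
; for (long i = 0; i != 1024; ++i)
;   arg[i * 5] = arg1[i];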
define void @scatter_of_pointers(ptr noalias nocapture %arg, ptr noalias nocapture readonly %arg1) {
; V-LABEL: scatter_of_pointers:
; V: # %bb.0: # %bb
; V-NEXT: li a2, 1024
; V-NEXT: li a3, 40
; V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; V-NEXT: .LBB12_1: # %bb2
; V-NEXT: # =>This Inner Loop Header: Depth=1
; V-NEXT: addi a4, a1, 16
; V-NEXT: vle64.v v8, (a1)
; V-NEXT: vle64.v v9, (a4)
; V-NEXT: addi a4, a0, 80
; V-NEXT: vsse64.v v8, (a0), a3
; V-NEXT: vsse64.v v9, (a4), a3
; V-NEXT: addi a2, a2, -4
; V-NEXT: addi a1, a1, 32
; V-NEXT: addi a0, a0, 160
; V-NEXT: bnez a2, .LBB12_1
; V-NEXT: # %bb.2: # %bb18
; V-NEXT: ret
;
; ZVE32F-LABEL: scatter_of_pointers:
; ZVE32F: # %bb.0: # %bb
; ZVE32F-NEXT: li a2, 0
; ZVE32F-NEXT: li a3, 1
; ZVE32F-NEXT: li a4, 1024
; ZVE32F-NEXT: li a5, 40
; ZVE32F-NEXT: .LBB12_1: # %bb2
; ZVE32F-NEXT: # =>This Inner Loop Header: Depth=1
; ZVE32F-NEXT: ld a6, 8(a1)
; ZVE32F-NEXT: ld a7, 0(a1)
; ZVE32F-NEXT: ld t0, 24(a1)
; ZVE32F-NEXT: ld t1, 16(a1)
; ZVE32F-NEXT: mul t2, a3, a5
; ZVE32F-NEXT: add t2, a0, t2
; ZVE32F-NEXT: mul t3, a2, a5
; ZVE32F-NEXT: add t3, a0, t3
; ZVE32F-NEXT: sd a7, 0(t3)
; ZVE32F-NEXT: sd a6, 0(t2)
; ZVE32F-NEXT: sd t1, 80(t3)
; ZVE32F-NEXT: sd t0, 80(t2)
; ZVE32F-NEXT: addi a2, a2, 4
; ZVE32F-NEXT: addi a3, a3, 4
; ZVE32F-NEXT: addi a4, a4, -4
; ZVE32F-NEXT: addi a1, a1, 32
; ZVE32F-NEXT: bnez a4, .LBB12_1
; ZVE32F-NEXT: # %bb.2: # %bb18
; ZVE32F-NEXT: ret
bb:
  br label %bb2

bb2: ; preds = %bb2, %bb
  %i = phi i64 [ 0, %bb ], [ %i15, %bb2 ]
  %i3 = phi <2 x i64> [ <i64 0, i64 1>, %bb ], [ %i16, %bb2 ]
  %i4 = getelementptr inbounds ptr, ptr %arg1, i64 %i
  %i6 = load <2 x ptr>, ptr %i4, align 8
  %i7 = getelementptr inbounds ptr, ptr %i4, i64 2
  %i9 = load <2 x ptr>, ptr %i7, align 8
  %i10 = mul nuw nsw <2 x i64> %i3, <i64 5, i64 5>
  %i11 = mul <2 x i64> %i3, <i64 5, i64 5>
  %i12 = add <2 x i64> %i11, <i64 10, i64 10>
  %i13 = getelementptr inbounds ptr, ptr %arg, <2 x i64> %i10
  %i14 = getelementptr inbounds ptr, ptr %arg, <2 x i64> %i12
  call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> %i6, <2 x ptr> %i13, i32 8, <2 x i1> <i1 true, i1 true>)
  call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> %i9, <2 x ptr> %i14, i32 8, <2 x i1> <i1 true, i1 true>)
  %i15 = add nuw i64 %i, 4
  %i16 = add <2 x i64> %i3, <i64 4, i64 4>
  %i17 = icmp eq i64 %i15, 1024
  br i1 %i17, label %bb18, label %bb2

bb18: ; preds = %bb2
  ret void
}

declare void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr>, <2 x ptr>, i32 immarg, <2 x i1>)

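; Illustrative scalar sketch (not from the original source): the same strided
; kernel as @gather, but starting at a runtime index %arg2, which leaves a
; scalar remainder loop after the vector body:
; for (int i = arg2; i != 1024; ++i)
;   A[i] += B[i * 5];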
define void @strided_load_startval_add_with_splat(ptr noalias nocapture %arg, ptr noalias nocapture readonly %arg1, i32 signext %arg2) {
; CHECK-LABEL: strided_load_startval_add_with_splat:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: li a3, 1024
; CHECK-NEXT: beq a2, a3, .LBB13_7
; CHECK-NEXT: # %bb.1: # %bb3
; CHECK-NEXT: li a4, 1023
; CHECK-NEXT: subw a4, a4, a2
; CHECK-NEXT: li a5, 31
; CHECK-NEXT: mv a3, a2
; CHECK-NEXT: bltu a4, a5, .LBB13_5
; CHECK-NEXT: # %bb.2: # %bb9
; CHECK-NEXT: slli a4, a4, 32
; CHECK-NEXT: srli a4, a4, 32
; CHECK-NEXT: addi a4, a4, 1
; CHECK-NEXT: andi a5, a4, -32
; CHECK-NEXT: add a3, a5, a2
; CHECK-NEXT: slli a7, a2, 2
; CHECK-NEXT: add a6, a0, a2
; CHECK-NEXT: add a2, a1, a2
; CHECK-NEXT: add a2, a2, a7
; CHECK-NEXT: li a7, 32
; CHECK-NEXT: li t0, 5
; CHECK-NEXT: mv t1, a5
; CHECK-NEXT: .LBB13_3: # %bb15
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vsetvli zero, a7, e8, m1, ta, ma
; CHECK-NEXT: vlse8.v v8, (a2), t0
; CHECK-NEXT: vle8.v v9, (a6)
; CHECK-NEXT: vadd.vv v8, v9, v8
; CHECK-NEXT: vse8.v v8, (a6)
; CHECK-NEXT: addi t1, t1, -32
; CHECK-NEXT: addi a6, a6, 32
; CHECK-NEXT: addi a2, a2, 160
; CHECK-NEXT: bnez t1, .LBB13_3
; CHECK-NEXT: # %bb.4: # %bb30
; CHECK-NEXT: beq a4, a5, .LBB13_7
; CHECK-NEXT: .LBB13_5: # %bb32
; CHECK-NEXT: addi a2, a3, -1024
; CHECK-NEXT: add a0, a0, a3
; CHECK-NEXT: slli a4, a3, 2
; CHECK-NEXT: add a1, a1, a3
; CHECK-NEXT: add a1, a1, a4
; CHECK-NEXT: .LBB13_6: # %bb35
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: lbu a3, 0(a1)
; CHECK-NEXT: lbu a4, 0(a0)
; CHECK-NEXT: add a3, a4, a3
; CHECK-NEXT: sb a3, 0(a0)
; CHECK-NEXT: addiw a2, a2, 1
; CHECK-NEXT: addi a0, a0, 1
; CHECK-NEXT: addi a1, a1, 5
; CHECK-NEXT: bnez a2, .LBB13_6
; CHECK-NEXT: .LBB13_7: # %bb34
; CHECK-NEXT: ret
bb:
  %i = icmp eq i32 %arg2, 1024
  br i1 %i, label %bb34, label %bb3

bb3: ; preds = %bb
  %i4 = sext i32 %arg2 to i64
  %i5 = sub i32 1023, %arg2
  %i6 = zext i32 %i5 to i64
  %i7 = add nuw nsw i64 %i6, 1
  %i8 = icmp ult i32 %i5, 31
  br i1 %i8, label %bb32, label %bb9

bb9: ; preds = %bb3
  %i10 = and i64 %i7, 8589934560
  %i11 = add nsw i64 %i10, %i4
  %i12 = insertelement <32 x i64> poison, i64 %i4, i64 0
  %i13 = shufflevector <32 x i64> %i12, <32 x i64> poison, <32 x i32> zeroinitializer
  %i14 = add <32 x i64> %i13, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>
  br label %bb15

bb15: ; preds = %bb15, %bb9
  %i16 = phi i64 [ 0, %bb9 ], [ %i27, %bb15 ]
  %i17 = phi <32 x i64> [ %i14, %bb9 ], [ %i28, %bb15 ]
  %i18 = add i64 %i16, %i4
  %i19 = mul nsw <32 x i64> %i17, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
  %i20 = getelementptr inbounds i8, ptr %arg1, <32 x i64> %i19
  %i21 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i20, i32 1, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> undef)
  %i22 = getelementptr inbounds i8, ptr %arg, i64 %i18
  %i24 = load <32 x i8>, ptr %i22, align 1
  %i25 = add <32 x i8> %i24, %i21
  store <32 x i8> %i25, ptr %i22, align 1
  %i27 = add nuw i64 %i16, 32
  %i28 = add <32 x i64> %i17, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %i29 = icmp eq i64 %i27, %i10
  br i1 %i29, label %bb30, label %bb15

bb30: ; preds = %bb15
  %i31 = icmp eq i64 %i7, %i10
  br i1 %i31, label %bb34, label %bb32

bb32: ; preds = %bb30, %bb3
  %i33 = phi i64 [ %i4, %bb3 ], [ %i11, %bb30 ]
  br label %bb35

bb34: ; preds = %bb35, %bb30, %bb
  ret void

bb35: ; preds = %bb35, %bb32
  %i36 = phi i64 [ %i43, %bb35 ], [ %i33, %bb32 ]
  %i37 = mul nsw i64 %i36, 5
  %i38 = getelementptr inbounds i8, ptr %arg1, i64 %i37
  %i39 = load i8, ptr %i38, align 1
  %i40 = getelementptr inbounds i8, ptr %arg, i64 %i36
  %i41 = load i8, ptr %i40, align 1
  %i42 = add i8 %i41, %i39
  store i8 %i42, ptr %i40, align 1
  %i43 = add nsw i64 %i36, 1
  %i44 = trunc i64 %i43 to i32
  %i45 = icmp eq i32 %i44, 1024
  br i1 %i45, label %bb34, label %bb35
}

declare <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr>, i32 immarg, <16 x i1>, <16 x i8>)
declare void @llvm.masked.scatter.v16i8.v16p0(<16 x i8>, <16 x ptr>, i32 immarg, <16 x i1>)

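; Illustrative scalar sketch (not from the original source): the trip count is
; %arg2 * 16, a multiple of the vectorization factor, so the vector loop needs
; no scalar remainder:
; for (long i = 0; i != arg2 * 16; ++i)
;   A[i] += B[i * 5];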
define void @gather_no_scalar_remainder(ptr noalias nocapture noundef %arg, ptr noalias nocapture noundef readonly %arg1, i64 noundef %arg2) {
; CHECK-LABEL: gather_no_scalar_remainder:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: slli a2, a2, 4
; CHECK-NEXT: beqz a2, .LBB14_3
; CHECK-NEXT: # %bb.1: # %bb2
; CHECK-NEXT: li a3, 5
; CHECK-NEXT: vsetivli zero, 16, e8, mf2, ta, ma
; CHECK-NEXT: .LBB14_2: # %bb4
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vlse8.v v8, (a1), a3
; CHECK-NEXT: vle8.v v9, (a0)
; CHECK-NEXT: vadd.vv v8, v9, v8
; CHECK-NEXT: vse8.v v8, (a0)
; CHECK-NEXT: addi a2, a2, -16
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: addi a1, a1, 80
; CHECK-NEXT: bnez a2, .LBB14_2
; CHECK-NEXT: .LBB14_3: # %bb16
; CHECK-NEXT: ret
bb:
  %i = shl i64 %arg2, 4
  %i3 = icmp eq i64 %i, 0
  br i1 %i3, label %bb16, label %bb2

bb2: ; preds = %bb
  br label %bb4

bb4: ; preds = %bb4, %bb2
  %i5 = phi i64 [ %i13, %bb4 ], [ 0, %bb2 ]
  %i6 = phi <16 x i64> [ %i14, %bb4 ], [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, %bb2 ]
  %i7 = mul <16 x i64> %i6, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
  %i8 = getelementptr inbounds i8, ptr %arg1, <16 x i64> %i7
  %i9 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %i8, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
  %i10 = getelementptr inbounds i8, ptr %arg, i64 %i5
  %i11 = load <16 x i8>, ptr %i10, align 1
  %i12 = add <16 x i8> %i11, %i9
  store <16 x i8> %i12, ptr %i10, align 1
  %i13 = add nuw i64 %i5, 16
  %i14 = add <16 x i64> %i6, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
  %i15 = icmp eq i64 %i13, %i
  br i1 %i15, label %bb16, label %bb4

bb16: ; preds = %bb4, %bb
  ret void
}