1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt %s -S -riscv-gather-scatter-lowering -mtriple=riscv64 -mattr=+m,+v,+zvl256b | FileCheck %s --check-prefixes=CHECK,V
3 ; RUN: opt %s -S -riscv-gather-scatter-lowering -mtriple=riscv64 -mattr=+m,+f,+zve32f,+zvl256b | FileCheck %s --check-prefixes=CHECK,ZVE32F
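
; The RISCVGatherScatterLowering pass rewrites llvm.masked.gather and
; llvm.masked.scatter calls whose addresses form a constant-stride sequence
; into llvm.experimental.vp.strided.load/store plus a scalar pointer
; induction. As a rough sketch (not one of the checked functions below), a
; gather such as
;
;   %ptrs = getelementptr i8, ptr %base, <4 x i64> <i64 0, i64 5, i64 10, i64 15>
;   %v = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> splat (i1 true), <4 x i8> undef)
;
; becomes a strided VP load from %base with a 5-byte stride, followed by a
; vp.select that merges the loaded lanes with the original passthru:
;
;   %l = call <4 x i8> @llvm.experimental.vp.strided.load.v4i8.p0.i64(ptr %base, i64 5, <4 x i1> splat (i1 true), i32 4)
;   %v = call <4 x i8> @llvm.vp.select.v4i8(<4 x i1> splat (i1 true), <4 x i8> %l, <4 x i8> undef, i32 4)
;
; The V and ZVE32F check prefixes only diverge for the pointer-element tests
; further down; everything else is common to both run lines.
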
5 %struct.foo = type { i32, i32, i32, i32 }
; void gather(signed char * __restrict A, signed char * __restrict B) {
;   for (int i = 0; i != 1024; ++i)
;     A[i] += B[i * 5];
; }
11 define void @gather(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
12 ; CHECK-LABEL: @gather(
14 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
16 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
17 ; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
18 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[VEC_IND_SCALAR]]
19 ; CHECK-NEXT: [[TMP1:%.*]] = call <32 x i8> @llvm.experimental.vp.strided.load.v32i8.p0.i64(ptr [[TMP0]], i64 5, <32 x i1> splat (i1 true), i32 32)
20 ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.vp.select.v32i8(<32 x i1> splat (i1 true), <32 x i8> [[TMP1]], <32 x i8> undef, i32 32)
21 ; CHECK-NEXT: [[I2:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
22 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[I2]], align 1
23 ; CHECK-NEXT: [[I4:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
24 ; CHECK-NEXT: store <32 x i8> [[I4]], ptr [[I2]], align 1
25 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
26 ; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 160
27 ; CHECK-NEXT: [[I6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
28 ; CHECK-NEXT: br i1 [[I6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
29 ; CHECK: for.cond.cleanup:
30 ; CHECK-NEXT: ret void
35 vector.body: ; preds = %vector.body, %entry
36 %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
37 %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
38 %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
39 %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
40 %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
41 %i2 = getelementptr inbounds i8, ptr %A, i64 %index
42 %wide.load = load <32 x i8>, ptr %i2, align 1
43 %i4 = add <32 x i8> %wide.load, %wide.masked.gather
44 store <32 x i8> %i4, ptr %i2, align 1
45 %index.next = add nuw i64 %index, 32
46 %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
47 %i6 = icmp eq i64 %index.next, 1024
48 br i1 %i6, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body
  ret void
}
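
; Same as @gather, but with a non-uniform mask and an explicit passthru; both
; are carried over unchanged to the strided VP load and the vp.select.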
54 define void @gather_masked(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, <32 x i8> %maskedoff) {
55 ; CHECK-LABEL: @gather_masked(
57 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
59 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
60 ; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
61 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[VEC_IND_SCALAR]]
62 ; CHECK-NEXT: [[TMP1:%.*]] = call <32 x i8> @llvm.experimental.vp.strided.load.v32i8.p0.i64(ptr [[TMP0]], i64 5, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, i32 32)
63 ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.vp.select.v32i8(<32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <32 x i8> [[TMP1]], <32 x i8> [[MASKEDOFF:%.*]], i32 32)
64 ; CHECK-NEXT: [[I2:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
65 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[I2]], align 1
66 ; CHECK-NEXT: [[I4:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
67 ; CHECK-NEXT: store <32 x i8> [[I4]], ptr [[I2]], align 1
68 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
69 ; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 160
70 ; CHECK-NEXT: [[I6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
71 ; CHECK-NEXT: br i1 [[I6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
72 ; CHECK: for.cond.cleanup:
73 ; CHECK-NEXT: ret void
78 vector.body: ; preds = %vector.body, %entry
79 %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
80 %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
81 %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
82 %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
83 %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <32 x i8> %maskedoff)
84 %i2 = getelementptr inbounds i8, ptr %A, i64 %index
85 %wide.load = load <32 x i8>, ptr %i2, align 1
86 %i4 = add <32 x i8> %wide.load, %wide.masked.gather
87 store <32 x i8> %i4, ptr %i2, align 1
88 %index.next = add nuw i64 %index, 32
89 %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
90 %i6 = icmp eq i64 %index.next, 1024
91 br i1 %i6, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body
  ret void
}
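
; The vector induction counts down here, so the recovered stride is negative
; (-5) and the initial scalar offset starts at the last element (31 * 5 = 155).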
97 define void @gather_negative_stride(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
98 ; CHECK-LABEL: @gather_negative_stride(
100 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
101 ; CHECK: vector.body:
102 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
103 ; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 155, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
104 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[VEC_IND_SCALAR]]
105 ; CHECK-NEXT: [[TMP1:%.*]] = call <32 x i8> @llvm.experimental.vp.strided.load.v32i8.p0.i64(ptr [[TMP0]], i64 -5, <32 x i1> splat (i1 true), i32 32)
106 ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.vp.select.v32i8(<32 x i1> splat (i1 true), <32 x i8> [[TMP1]], <32 x i8> undef, i32 32)
107 ; CHECK-NEXT: [[I2:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
108 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[I2]], align 1
109 ; CHECK-NEXT: [[I4:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
110 ; CHECK-NEXT: store <32 x i8> [[I4]], ptr [[I2]], align 1
111 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
112 ; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 160
113 ; CHECK-NEXT: [[I6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
114 ; CHECK-NEXT: br i1 [[I6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
115 ; CHECK: for.cond.cleanup:
116 ; CHECK-NEXT: ret void
119 br label %vector.body
121 vector.body: ; preds = %vector.body, %entry
122 %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
123 %vec.ind = phi <32 x i64> [ <i64 31, i64 30, i64 29, i64 28, i64 27, i64 26, i64 25, i64 24, i64 23, i64 22, i64 21, i64 20, i64 19, i64 18, i64 17, i64 16, i64 15, i64 14, i64 13, i64 12, i64 11, i64 10, i64 9, i64 8, i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, %entry ], [ %vec.ind.next, %vector.body ]
124 %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
125 %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
126 %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
127 %i2 = getelementptr inbounds i8, ptr %A, i64 %index
128 %wide.load = load <32 x i8>, ptr %i2, align 1
129 %i4 = add <32 x i8> %wide.load, %wide.masked.gather
130 store <32 x i8> %i4, ptr %i2, align 1
131 %index.next = add nuw i64 %index, 32
132 %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
133 %i6 = icmp eq i64 %index.next, 1024
134 br i1 %i6, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body
  ret void
}
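
; A zero step between lanes is still recognised: the gather becomes a strided
; VP load with stride 0, i.e. a broadcast of the element at the scalar pointer.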
140 define void @gather_zero_stride(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
141 ; CHECK-LABEL: @gather_zero_stride(
143 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
144 ; CHECK: vector.body:
145 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
146 ; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
147 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[VEC_IND_SCALAR]]
148 ; CHECK-NEXT: [[TMP1:%.*]] = call <32 x i8> @llvm.experimental.vp.strided.load.v32i8.p0.i64(ptr [[TMP0]], i64 0, <32 x i1> splat (i1 true), i32 32)
149 ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.vp.select.v32i8(<32 x i1> splat (i1 true), <32 x i8> [[TMP1]], <32 x i8> undef, i32 32)
150 ; CHECK-NEXT: [[I2:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
151 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[I2]], align 1
152 ; CHECK-NEXT: [[I4:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
153 ; CHECK-NEXT: store <32 x i8> [[I4]], ptr [[I2]], align 1
154 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
155 ; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 160
156 ; CHECK-NEXT: [[I6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
157 ; CHECK-NEXT: br i1 [[I6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
158 ; CHECK: for.cond.cleanup:
159 ; CHECK-NEXT: ret void
162 br label %vector.body
164 vector.body: ; preds = %vector.body, %entry
165 %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
166 %vec.ind = phi <32 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ]
167 %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
168 %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
169 %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
170 %i2 = getelementptr inbounds i8, ptr %A, i64 %index
171 %wide.load = load <32 x i8>, ptr %i2, align 1
172 %i4 = add <32 x i8> %wide.load, %wide.masked.gather
173 store <32 x i8> %i4, ptr %i2, align 1
174 %index.next = add nuw i64 %index, 32
175 %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
176 %i6 = icmp eq i64 %index.next, 1024
177 br i1 %i6, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body
  ret void
}
;void scatter(signed char * __restrict A, signed char * __restrict B) {
;   for (int i = 0; i < 1024; ++i)
;     A[i * 5] += B[i];
;}
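;
; The contiguous load from B stays an ordinary vector load; only the indexed
; accesses to A are rewritten, into a strided VP load/store pair with a 5-byte
; stride.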
187 define void @scatter(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
188 ; CHECK-LABEL: @scatter(
190 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
191 ; CHECK: vector.body:
192 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
193 ; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
194 ; CHECK-NEXT: [[I:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[INDEX]]
195 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[I]], align 1
196 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[VEC_IND_SCALAR]]
197 ; CHECK-NEXT: [[TMP1:%.*]] = call <32 x i8> @llvm.experimental.vp.strided.load.v32i8.p0.i64(ptr [[TMP0]], i64 5, <32 x i1> splat (i1 true), i32 32)
198 ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.vp.select.v32i8(<32 x i1> splat (i1 true), <32 x i8> [[TMP1]], <32 x i8> undef, i32 32)
199 ; CHECK-NEXT: [[I4:%.*]] = add <32 x i8> [[WIDE_MASKED_GATHER]], [[WIDE_LOAD]]
200 ; CHECK-NEXT: call void @llvm.experimental.vp.strided.store.v32i8.p0.i64(<32 x i8> [[I4]], ptr [[TMP0]], i64 5, <32 x i1> splat (i1 true), i32 32)
201 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
202 ; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 160
203 ; CHECK-NEXT: [[I5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
204 ; CHECK-NEXT: br i1 [[I5]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
205 ; CHECK: for.cond.cleanup:
206 ; CHECK-NEXT: ret void
209 br label %vector.body
211 vector.body: ; preds = %vector.body, %entry
212 %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
213 %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
214 %i = getelementptr inbounds i8, ptr %B, i64 %index
215 %wide.load = load <32 x i8>, ptr %i, align 1
216 %i2 = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
217 %i3 = getelementptr inbounds i8, ptr %A, <32 x i64> %i2
218 %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i3, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
219 %i4 = add <32 x i8> %wide.masked.gather, %wide.load
220 call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> %i4, <32 x ptr> %i3, i32 1, <32 x i1> splat (i1 true))
221 %index.next = add nuw i64 %index, 32
222 %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
223 %i5 = icmp eq i64 %index.next, 1024
224 br i1 %i5, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body
  ret void
}
230 define void @scatter_masked(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, <32 x i8> %maskedoff) {
231 ; CHECK-LABEL: @scatter_masked(
233 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
234 ; CHECK: vector.body:
235 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
236 ; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
237 ; CHECK-NEXT: [[I:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[INDEX]]
238 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[I]], align 1
239 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[VEC_IND_SCALAR]]
240 ; CHECK-NEXT: [[TMP1:%.*]] = call <32 x i8> @llvm.experimental.vp.strided.load.v32i8.p0.i64(ptr [[TMP0]], i64 5, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, i32 32)
241 ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.vp.select.v32i8(<32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <32 x i8> [[TMP1]], <32 x i8> [[MASKEDOFF:%.*]], i32 32)
242 ; CHECK-NEXT: [[I4:%.*]] = add <32 x i8> [[WIDE_MASKED_GATHER]], [[WIDE_LOAD]]
243 ; CHECK-NEXT: call void @llvm.experimental.vp.strided.store.v32i8.p0.i64(<32 x i8> [[I4]], ptr [[TMP0]], i64 5, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, i32 32)
244 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
245 ; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 160
246 ; CHECK-NEXT: [[I5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
247 ; CHECK-NEXT: br i1 [[I5]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
248 ; CHECK: for.cond.cleanup:
249 ; CHECK-NEXT: ret void
252 br label %vector.body
254 vector.body: ; preds = %vector.body, %entry
255 %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
256 %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
257 %i = getelementptr inbounds i8, ptr %B, i64 %index
258 %wide.load = load <32 x i8>, ptr %i, align 1
259 %i2 = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
260 %i3 = getelementptr inbounds i8, ptr %A, <32 x i64> %i2
261 %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i3, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <32 x i8> %maskedoff)
262 %i4 = add <32 x i8> %wide.masked.gather, %wide.load
263 call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> %i4, <32 x ptr> %i3, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>)
264 %index.next = add nuw i64 %index, 32
265 %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
266 %i5 = icmp eq i64 %index.next, 1024
267 br i1 %i5, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body
  ret void
}
; void gather_pow2(signed char * __restrict A, signed char * __restrict B) {
;   for (int i = 0; i != 1024; ++i)
;     A[i] += B[i * 4];
; }
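;
; Here the elements are i32 and the index is scaled by shl 2, so the recovered
; byte stride is 4 * 4 = 16.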
277 define void @gather_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
278 ; CHECK-LABEL: @gather_pow2(
280 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
281 ; CHECK: vector.body:
282 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
283 ; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
284 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[VEC_IND_SCALAR]]
285 ; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP0]], i64 16, <8 x i1> splat (i1 true), i32 8)
286 ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> splat (i1 true), <8 x i32> [[TMP1]], <8 x i32> undef, i32 8)
287 ; CHECK-NEXT: [[I2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
288 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[I2]], align 1
289 ; CHECK-NEXT: [[I4:%.*]] = add <8 x i32> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
290 ; CHECK-NEXT: store <8 x i32> [[I4]], ptr [[I2]], align 1
291 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
292 ; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 32
293 ; CHECK-NEXT: [[I6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
294 ; CHECK-NEXT: br i1 [[I6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
295 ; CHECK: for.cond.cleanup:
296 ; CHECK-NEXT: ret void
299 br label %vector.body
301 vector.body: ; preds = %vector.body, %entry
302 %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
303 %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
304 %i = shl nsw <8 x i64> %vec.ind, splat (i64 2)
305 %i1 = getelementptr inbounds i32, ptr %B, <8 x i64> %i
306 %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
307 %i2 = getelementptr inbounds i32, ptr %A, i64 %index
308 %wide.load = load <8 x i32>, ptr %i2, align 1
309 %i4 = add <8 x i32> %wide.load, %wide.masked.gather
310 store <8 x i32> %i4, ptr %i2, align 1
311 %index.next = add nuw i64 %index, 8
312 %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 8)
313 %i6 = icmp eq i64 %index.next, 1024
314 br i1 %i6, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body
  ret void
}
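
; The shift amount is only known at run time: the byte stride becomes
; (1 << %shift) * 4 and the scalar index advances by 8 << %shift elements per
; iteration.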
320 define void @gather_unknown_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, i64 %shift) {
321 ; CHECK-LABEL: @gather_unknown_pow2(
323 ; CHECK-NEXT: [[STEP:%.*]] = shl i64 8, [[SHIFT:%.*]]
324 ; CHECK-NEXT: [[STRIDE:%.*]] = shl i64 1, [[SHIFT]]
325 ; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[STRIDE]], 4
326 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
327 ; CHECK: vector.body:
328 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
329 ; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
330 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[VEC_IND_SCALAR]]
331 ; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP1]], i64 [[TMP0]], <8 x i1> splat (i1 true), i32 8)
332 ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> splat (i1 true), <8 x i32> [[TMP2]], <8 x i32> undef, i32 8)
333 ; CHECK-NEXT: [[I2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
334 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[I2]], align 1
335 ; CHECK-NEXT: [[I4:%.*]] = add <8 x i32> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
336 ; CHECK-NEXT: store <8 x i32> [[I4]], ptr [[I2]], align 1
337 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
338 ; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], [[STEP]]
339 ; CHECK-NEXT: [[I6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
340 ; CHECK-NEXT: br i1 [[I6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
341 ; CHECK: for.cond.cleanup:
342 ; CHECK-NEXT: ret void
345 %.splatinsert = insertelement <8 x i64> poison, i64 %shift, i64 0
346 %.splat = shufflevector <8 x i64> %.splatinsert, <8 x i64> poison, <8 x i32> zeroinitializer
347 br label %vector.body
349 vector.body: ; preds = %vector.body, %entry
350 %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
351 %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
352 %i = shl nsw <8 x i64> %vec.ind, %.splat
353 %i1 = getelementptr inbounds i32, ptr %B, <8 x i64> %i
354 %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
355 %i2 = getelementptr inbounds i32, ptr %A, i64 %index
356 %wide.load = load <8 x i32>, ptr %i2, align 1
357 %i4 = add <8 x i32> %wide.load, %wide.masked.gather
358 store <8 x i32> %i4, ptr %i2, align 1
359 %index.next = add nuw i64 %index, 8
360 %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 8)
361 %i6 = icmp eq i64 %index.next, 1024
362 br i1 %i6, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body
  ret void
}
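
; Negative test: the splat is the first operand of the shl, i.e. the indices
; are %shift << i rather than i << %shift. That is not a strided sequence, so
; the masked.gather is left untouched.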
368 define void @negative_shl_non_commute(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, i64 %shift) {
369 ; CHECK-LABEL: @negative_shl_non_commute(
371 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[SHIFT:%.*]], i64 0
372 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
373 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
374 ; CHECK: vector.body:
375 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
376 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
377 ; CHECK-NEXT: [[I:%.*]] = shl nsw <8 x i64> [[DOTSPLAT]], [[VEC_IND]]
378 ; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], <8 x i64> [[I]]
379 ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[I1]], i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
380 ; CHECK-NEXT: [[I2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
381 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[I2]], align 1
382 ; CHECK-NEXT: [[I4:%.*]] = add <8 x i32> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
383 ; CHECK-NEXT: store <8 x i32> [[I4]], ptr [[I2]], align 1
384 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
385 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
386 ; CHECK-NEXT: [[I6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
387 ; CHECK-NEXT: br i1 [[I6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
388 ; CHECK: for.cond.cleanup:
389 ; CHECK-NEXT: ret void
392 %.splatinsert = insertelement <8 x i64> poison, i64 %shift, i64 0
393 %.splat = shufflevector <8 x i64> %.splatinsert, <8 x i64> poison, <8 x i32> zeroinitializer
394 br label %vector.body
396 vector.body: ; preds = %vector.body, %entry
397 %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
398 %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
399 %i = shl nsw <8 x i64> %.splat, %vec.ind
400 %i1 = getelementptr inbounds i32, ptr %B, <8 x i64> %i
401 %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
402 %i2 = getelementptr inbounds i32, ptr %A, i64 %index
403 %wide.load = load <8 x i32>, ptr %i2, align 1
404 %i4 = add <8 x i32> %wide.load, %wide.masked.gather
405 store <8 x i32> %i4, ptr %i2, align 1
406 %index.next = add nuw i64 %index, 8
407 %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 8)
408 %i6 = icmp eq i64 %index.next, 1024
409 br i1 %i6, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body
  ret void
}
;void scatter_pow2(signed char * __restrict A, signed char * __restrict B) {
;   for (int i = 0; i < 1024; ++i)
;     A[i * 4] += B[i];
;}
419 define void @scatter_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
420 ; CHECK-LABEL: @scatter_pow2(
422 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
423 ; CHECK: vector.body:
424 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
425 ; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
426 ; CHECK-NEXT: [[I:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
427 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[I]], align 1
428 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[VEC_IND_SCALAR]]
429 ; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP0]], i64 16, <8 x i1> splat (i1 true), i32 8)
430 ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> splat (i1 true), <8 x i32> [[TMP1]], <8 x i32> undef, i32 8)
431 ; CHECK-NEXT: [[I4:%.*]] = add <8 x i32> [[WIDE_MASKED_GATHER]], [[WIDE_LOAD]]
432 ; CHECK-NEXT: call void @llvm.experimental.vp.strided.store.v8i32.p0.i64(<8 x i32> [[I4]], ptr [[TMP0]], i64 16, <8 x i1> splat (i1 true), i32 8)
433 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
434 ; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 32
435 ; CHECK-NEXT: [[I5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
436 ; CHECK-NEXT: br i1 [[I5]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
437 ; CHECK: for.cond.cleanup:
438 ; CHECK-NEXT: ret void
441 br label %vector.body
443 vector.body: ; preds = %vector.body, %entry
444 %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
445 %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
446 %i = getelementptr inbounds i32, ptr %B, i64 %index
447 %wide.load = load <8 x i32>, ptr %i, align 1
448 %i2 = shl nuw nsw <8 x i64> %vec.ind, splat (i64 2)
449 %i3 = getelementptr inbounds i32, ptr %A, <8 x i64> %i2
450 %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i3, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
451 %i4 = add <8 x i32> %wide.masked.gather, %wide.load
452 call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i4, <8 x ptr> %i3, i32 4, <8 x i1> splat (i1 true))
453 %index.next = add nuw i64 %index, 8
454 %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 8)
455 %i5 = icmp eq i64 %index.next, 1024
456 br i1 %i5, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body
  ret void
}
;void struct_gather(int * __restrict A, struct foo * __restrict B) {
;   for (int i = 0; i < 1024; ++i)
;     A[i] += B[i].b;
;}
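;
; %struct.foo is 16 bytes, so reading the second i32 field of consecutive
; elements is a 16-byte-stride access; both unrolled gathers become strided VP
; loads.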
470 define void @struct_gather(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
471 ; CHECK-LABEL: @struct_gather(
473 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
474 ; CHECK: vector.body:
475 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
476 ; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
477 ; CHECK-NEXT: [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 8, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR2:%.*]], [[VECTOR_BODY]] ]
478 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr [[B:%.*]], i64 [[VEC_IND_SCALAR]], i32 1
479 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [[STRUCT_FOO]], ptr [[B]], i64 [[VEC_IND_SCALAR1]], i32 1
480 ; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP0]], i64 16, <8 x i1> splat (i1 true), i32 8)
481 ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> splat (i1 true), <8 x i32> [[TMP2]], <8 x i32> undef, i32 8)
482 ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP1]], i64 16, <8 x i1> splat (i1 true), i32 8)
483 ; CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> splat (i1 true), <8 x i32> [[TMP3]], <8 x i32> undef, i32 8)
484 ; CHECK-NEXT: [[I2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
485 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[I2]], align 4
486 ; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds i32, ptr [[I2]], i64 8
487 ; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x i32>, ptr [[I4]], align 4
488 ; CHECK-NEXT: [[I6:%.*]] = add nsw <8 x i32> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
489 ; CHECK-NEXT: [[I7:%.*]] = add nsw <8 x i32> [[WIDE_LOAD10]], [[WIDE_MASKED_GATHER9]]
490 ; CHECK-NEXT: store <8 x i32> [[I6]], ptr [[I2]], align 4
491 ; CHECK-NEXT: store <8 x i32> [[I7]], ptr [[I4]], align 4
492 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
493 ; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 16
494 ; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR2]] = add i64 [[VEC_IND_SCALAR1]], 16
495 ; CHECK-NEXT: [[I10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
496 ; CHECK-NEXT: br i1 [[I10]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
497 ; CHECK: for.cond.cleanup:
498 ; CHECK-NEXT: ret void
501 br label %vector.body
503 vector.body: ; preds = %vector.body, %entry
504 %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
505 %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
506 %step.add = add <8 x i64> %vec.ind, splat (i64 8)
507 %i = getelementptr inbounds %struct.foo, ptr %B, <8 x i64> %vec.ind, i32 1
508 %i1 = getelementptr inbounds %struct.foo, ptr %B, <8 x i64> %step.add, i32 1
509 %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
510 %wide.masked.gather9 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
511 %i2 = getelementptr inbounds i32, ptr %A, i64 %index
512 %wide.load = load <8 x i32>, ptr %i2, align 4
513 %i4 = getelementptr inbounds i32, ptr %i2, i64 8
514 %wide.load10 = load <8 x i32>, ptr %i4, align 4
515 %i6 = add nsw <8 x i32> %wide.load, %wide.masked.gather
516 %i7 = add nsw <8 x i32> %wide.load10, %wide.masked.gather9
517 store <8 x i32> %i6, ptr %i2, align 4
518 store <8 x i32> %i7, ptr %i4, align 4
519 %index.next = add nuw i64 %index, 16
520 %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 16)
521 %i10 = icmp eq i64 %index.next, 1024
522 br i1 %i10, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body
  ret void
}
;void gather_unroll(int * __restrict A, int * __restrict B) {
;  for (int i = 0; i < 1024; i+= 4 ) {
;    A[i] += B[i * 4];
;    A[i+1] += B[(i+1) * 4];
;    A[i+2] += B[(i+2) * 4];
;    A[i+3] += B[(i+3) * 4];
;  }
;}
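;
; The four unrolled statements each become a strided load from B (64-byte
; stride) plus a strided load/store pair on A (16-byte stride), every access
; with its own scalar induction.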
536 define void @gather_unroll(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
537 ; CHECK-LABEL: @gather_unroll(
539 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
540 ; CHECK: vector.body:
541 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
542 ; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
543 ; CHECK-NEXT: [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR2:%.*]], [[VECTOR_BODY]] ]
544 ; CHECK-NEXT: [[VEC_IND_SCALAR3:%.*]] = phi i64 [ 4, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR4:%.*]], [[VECTOR_BODY]] ]
545 ; CHECK-NEXT: [[VEC_IND_SCALAR5:%.*]] = phi i64 [ 1, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR6:%.*]], [[VECTOR_BODY]] ]
546 ; CHECK-NEXT: [[VEC_IND_SCALAR7:%.*]] = phi i64 [ 8, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR8:%.*]], [[VECTOR_BODY]] ]
547 ; CHECK-NEXT: [[VEC_IND_SCALAR9:%.*]] = phi i64 [ 2, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR10:%.*]], [[VECTOR_BODY]] ]
548 ; CHECK-NEXT: [[VEC_IND_SCALAR11:%.*]] = phi i64 [ 12, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR12:%.*]], [[VECTOR_BODY]] ]
549 ; CHECK-NEXT: [[VEC_IND_SCALAR13:%.*]] = phi i64 [ 3, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR14:%.*]], [[VECTOR_BODY]] ]
550 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[VEC_IND_SCALAR]]
551 ; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP0]], i64 64, <8 x i1> splat (i1 true), i32 8)
552 ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> splat (i1 true), <8 x i32> [[TMP1]], <8 x i32> undef, i32 8)
553 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[VEC_IND_SCALAR1]]
554 ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP2]], i64 16, <8 x i1> splat (i1 true), i32 8)
555 ; CHECK-NEXT: [[WIDE_MASKED_GATHER52:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> splat (i1 true), <8 x i32> [[TMP3]], <8 x i32> undef, i32 8)
556 ; CHECK-NEXT: [[I3:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_GATHER52]], [[WIDE_MASKED_GATHER]]
557 ; CHECK-NEXT: call void @llvm.experimental.vp.strided.store.v8i32.p0.i64(<8 x i32> [[I3]], ptr [[TMP2]], i64 16, <8 x i1> splat (i1 true), i32 8)
558 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[B]], i64 [[VEC_IND_SCALAR3]]
559 ; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP4]], i64 64, <8 x i1> splat (i1 true), i32 8)
560 ; CHECK-NEXT: [[WIDE_MASKED_GATHER53:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> splat (i1 true), <8 x i32> [[TMP5]], <8 x i32> undef, i32 8)
561 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[A]], i64 [[VEC_IND_SCALAR5]]
562 ; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP6]], i64 16, <8 x i1> splat (i1 true), i32 8)
563 ; CHECK-NEXT: [[WIDE_MASKED_GATHER54:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> splat (i1 true), <8 x i32> [[TMP7]], <8 x i32> undef, i32 8)
564 ; CHECK-NEXT: [[I8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_GATHER54]], [[WIDE_MASKED_GATHER53]]
565 ; CHECK-NEXT: call void @llvm.experimental.vp.strided.store.v8i32.p0.i64(<8 x i32> [[I8]], ptr [[TMP6]], i64 16, <8 x i1> splat (i1 true), i32 8)
566 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[B]], i64 [[VEC_IND_SCALAR7]]
567 ; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP8]], i64 64, <8 x i1> splat (i1 true), i32 8)
568 ; CHECK-NEXT: [[WIDE_MASKED_GATHER55:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> splat (i1 true), <8 x i32> [[TMP9]], <8 x i32> undef, i32 8)
569 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[A]], i64 [[VEC_IND_SCALAR9]]
570 ; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP10]], i64 16, <8 x i1> splat (i1 true), i32 8)
571 ; CHECK-NEXT: [[WIDE_MASKED_GATHER56:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> splat (i1 true), <8 x i32> [[TMP11]], <8 x i32> undef, i32 8)
572 ; CHECK-NEXT: [[I13:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_GATHER56]], [[WIDE_MASKED_GATHER55]]
573 ; CHECK-NEXT: call void @llvm.experimental.vp.strided.store.v8i32.p0.i64(<8 x i32> [[I13]], ptr [[TMP10]], i64 16, <8 x i1> splat (i1 true), i32 8)
574 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[B]], i64 [[VEC_IND_SCALAR11]]
575 ; CHECK-NEXT: [[TMP13:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP12]], i64 64, <8 x i1> splat (i1 true), i32 8)
576 ; CHECK-NEXT: [[WIDE_MASKED_GATHER57:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> splat (i1 true), <8 x i32> [[TMP13]], <8 x i32> undef, i32 8)
577 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[A]], i64 [[VEC_IND_SCALAR13]]
578 ; CHECK-NEXT: [[TMP15:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP14]], i64 16, <8 x i1> splat (i1 true), i32 8)
579 ; CHECK-NEXT: [[WIDE_MASKED_GATHER58:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> splat (i1 true), <8 x i32> [[TMP15]], <8 x i32> undef, i32 8)
580 ; CHECK-NEXT: [[I18:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_GATHER58]], [[WIDE_MASKED_GATHER57]]
581 ; CHECK-NEXT: call void @llvm.experimental.vp.strided.store.v8i32.p0.i64(<8 x i32> [[I18]], ptr [[TMP14]], i64 16, <8 x i1> splat (i1 true), i32 8)
582 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
583 ; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 128
584 ; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR2]] = add i64 [[VEC_IND_SCALAR1]], 32
585 ; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR4]] = add i64 [[VEC_IND_SCALAR3]], 128
586 ; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR6]] = add i64 [[VEC_IND_SCALAR5]], 32
587 ; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR8]] = add i64 [[VEC_IND_SCALAR7]], 128
588 ; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR10]] = add i64 [[VEC_IND_SCALAR9]], 32
589 ; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR12]] = add i64 [[VEC_IND_SCALAR11]], 128
590 ; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR14]] = add i64 [[VEC_IND_SCALAR13]], 32
591 ; CHECK-NEXT: [[I19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
592 ; CHECK-NEXT: br i1 [[I19]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
593 ; CHECK: for.cond.cleanup:
594 ; CHECK-NEXT: ret void
597 br label %vector.body
599 vector.body: ; preds = %vector.body, %entry
600 %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
601 %vec.ind = phi <8 x i64> [ <i64 0, i64 4, i64 8, i64 12, i64 16, i64 20, i64 24, i64 28>, %entry ], [ %vec.ind.next, %vector.body ]
602 %i = shl nuw nsw <8 x i64> %vec.ind, splat (i64 2)
603 %i1 = getelementptr inbounds i32, ptr %B, <8 x i64> %i
604 %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
605 %i2 = getelementptr inbounds i32, ptr %A, <8 x i64> %vec.ind
606 %wide.masked.gather52 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i2, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
607 %i3 = add nsw <8 x i32> %wide.masked.gather52, %wide.masked.gather
608 call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i3, <8 x ptr> %i2, i32 4, <8 x i1> splat (i1 true))
609 %i4 = or disjoint <8 x i64> %vec.ind, splat (i64 1)
610 %i5 = shl nsw <8 x i64> %i4, splat (i64 2)
611 %i6 = getelementptr inbounds i32, ptr %B, <8 x i64> %i5
612 %wide.masked.gather53 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i6, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
613 %i7 = getelementptr inbounds i32, ptr %A, <8 x i64> %i4
614 %wide.masked.gather54 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i7, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
615 %i8 = add nsw <8 x i32> %wide.masked.gather54, %wide.masked.gather53
616 call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i8, <8 x ptr> %i7, i32 4, <8 x i1> splat (i1 true))
617 %i9 = or disjoint <8 x i64> %vec.ind, splat (i64 2)
618 %i10 = shl nsw <8 x i64> %i9, splat (i64 2)
619 %i11 = getelementptr inbounds i32, ptr %B, <8 x i64> %i10
620 %wide.masked.gather55 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i11, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
621 %i12 = getelementptr inbounds i32, ptr %A, <8 x i64> %i9
622 %wide.masked.gather56 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i12, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
623 %i13 = add nsw <8 x i32> %wide.masked.gather56, %wide.masked.gather55
624 call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i13, <8 x ptr> %i12, i32 4, <8 x i1> splat (i1 true))
625 %i14 = or disjoint <8 x i64> %vec.ind, splat (i64 3)
626 %i15 = shl nsw <8 x i64> %i14, splat (i64 2)
627 %i16 = getelementptr inbounds i32, ptr %B, <8 x i64> %i15
628 %wide.masked.gather57 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i16, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
629 %i17 = getelementptr inbounds i32, ptr %A, <8 x i64> %i14
630 %wide.masked.gather58 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i17, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
631 %i18 = add nsw <8 x i32> %wide.masked.gather58, %wide.masked.gather57
632 call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i18, <8 x ptr> %i17, i32 4, <8 x i1> splat (i1 true))
633 %index.next = add nuw i64 %index, 8
634 %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 32)
635 %i19 = icmp eq i64 %index.next, 256
636 br i1 %i19, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body
  ret void
}
642 declare <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr>, i32 immarg, <32 x i1>, <32 x i8>)
643 declare <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr>, i32 immarg, <8 x i1>, <8 x i32>)
644 declare void @llvm.masked.scatter.v32i8.v32p0(<32 x i8>, <32 x ptr>, i32 immarg, <32 x i1>)
645 declare void @llvm.masked.scatter.v8i32.v8p0(<8 x i32>, <8 x ptr>, i32 immarg, <8 x i1>)
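
; The next two tests use <2 x ptr> elements. With +v the accesses become
; 40-byte-stride VP loads/stores; with only +zve32f the 64-bit pointer
; elements are not legal vector elements, so the ZVE32F checks keep the
; original masked.gather/masked.scatter.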
647 ; Make sure we don't crash in getTgtMemIntrinsic for a vector of pointers.
648 define void @gather_of_pointers(ptr noalias nocapture %arg, ptr noalias nocapture readonly %arg1) {
649 ; V-LABEL: @gather_of_pointers(
651 ; V-NEXT: br label [[BB2:%.*]]
653 ; V-NEXT: [[I:%.*]] = phi i64 [ 0, [[BB:%.*]] ], [ [[I15:%.*]], [[BB2]] ]
654 ; V-NEXT: [[I3_SCALAR:%.*]] = phi i64 [ 0, [[BB]] ], [ [[I16_SCALAR:%.*]], [[BB2]] ]
655 ; V-NEXT: [[I3_SCALAR1:%.*]] = phi i64 [ 10, [[BB]] ], [ [[I16_SCALAR2:%.*]], [[BB2]] ]
656 ; V-NEXT: [[TMP0:%.*]] = getelementptr ptr, ptr [[ARG1:%.*]], i64 [[I3_SCALAR]]
657 ; V-NEXT: [[TMP1:%.*]] = getelementptr ptr, ptr [[ARG1]], i64 [[I3_SCALAR1]]
658 ; V-NEXT: [[TMP2:%.*]] = call <2 x ptr> @llvm.experimental.vp.strided.load.v2p0.p0.i64(ptr [[TMP0]], i64 40, <2 x i1> splat (i1 true), i32 2)
659 ; V-NEXT: [[I9:%.*]] = call <2 x ptr> @llvm.vp.select.v2p0(<2 x i1> splat (i1 true), <2 x ptr> [[TMP2]], <2 x ptr> undef, i32 2)
660 ; V-NEXT: [[TMP3:%.*]] = call <2 x ptr> @llvm.experimental.vp.strided.load.v2p0.p0.i64(ptr [[TMP1]], i64 40, <2 x i1> splat (i1 true), i32 2)
661 ; V-NEXT: [[I10:%.*]] = call <2 x ptr> @llvm.vp.select.v2p0(<2 x i1> splat (i1 true), <2 x ptr> [[TMP3]], <2 x ptr> undef, i32 2)
662 ; V-NEXT: [[I11:%.*]] = getelementptr inbounds ptr, ptr [[ARG:%.*]], i64 [[I]]
663 ; V-NEXT: store <2 x ptr> [[I9]], ptr [[I11]], align 8
664 ; V-NEXT: [[I13:%.*]] = getelementptr inbounds ptr, ptr [[I11]], i64 2
665 ; V-NEXT: store <2 x ptr> [[I10]], ptr [[I13]], align 8
666 ; V-NEXT: [[I15]] = add nuw i64 [[I]], 4
667 ; V-NEXT: [[I16_SCALAR]] = add i64 [[I3_SCALAR]], 20
668 ; V-NEXT: [[I16_SCALAR2]] = add i64 [[I3_SCALAR1]], 20
669 ; V-NEXT: [[I17:%.*]] = icmp eq i64 [[I15]], 1024
670 ; V-NEXT: br i1 [[I17]], label [[BB18:%.*]], label [[BB2]]
674 ; ZVE32F-LABEL: @gather_of_pointers(
676 ; ZVE32F-NEXT: br label [[BB2:%.*]]
678 ; ZVE32F-NEXT: [[I:%.*]] = phi i64 [ 0, [[BB:%.*]] ], [ [[I15:%.*]], [[BB2]] ]
679 ; ZVE32F-NEXT: [[I3:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[BB]] ], [ [[I16:%.*]], [[BB2]] ]
680 ; ZVE32F-NEXT: [[I4:%.*]] = mul nuw nsw <2 x i64> [[I3]], splat (i64 5)
681 ; ZVE32F-NEXT: [[I5:%.*]] = mul <2 x i64> [[I3]], splat (i64 5)
682 ; ZVE32F-NEXT: [[I6:%.*]] = add <2 x i64> [[I5]], splat (i64 10)
683 ; ZVE32F-NEXT: [[I7:%.*]] = getelementptr inbounds ptr, ptr [[ARG1:%.*]], <2 x i64> [[I4]]
684 ; ZVE32F-NEXT: [[I8:%.*]] = getelementptr inbounds ptr, ptr [[ARG1]], <2 x i64> [[I6]]
685 ; ZVE32F-NEXT: [[I9:%.*]] = call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> [[I7]], i32 8, <2 x i1> splat (i1 true), <2 x ptr> undef)
686 ; ZVE32F-NEXT: [[I10:%.*]] = call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> [[I8]], i32 8, <2 x i1> splat (i1 true), <2 x ptr> undef)
687 ; ZVE32F-NEXT: [[I11:%.*]] = getelementptr inbounds ptr, ptr [[ARG:%.*]], i64 [[I]]
688 ; ZVE32F-NEXT: store <2 x ptr> [[I9]], ptr [[I11]], align 8
689 ; ZVE32F-NEXT: [[I13:%.*]] = getelementptr inbounds ptr, ptr [[I11]], i64 2
690 ; ZVE32F-NEXT: store <2 x ptr> [[I10]], ptr [[I13]], align 8
691 ; ZVE32F-NEXT: [[I15]] = add nuw i64 [[I]], 4
692 ; ZVE32F-NEXT: [[I16]] = add <2 x i64> [[I3]], splat (i64 4)
693 ; ZVE32F-NEXT: [[I17:%.*]] = icmp eq i64 [[I15]], 1024
694 ; ZVE32F-NEXT: br i1 [[I17]], label [[BB18:%.*]], label [[BB2]]
696 ; ZVE32F-NEXT: ret void
701 bb2: ; preds = %bb2, %bb
702 %i = phi i64 [ 0, %bb ], [ %i15, %bb2 ]
703 %i3 = phi <2 x i64> [ <i64 0, i64 1>, %bb ], [ %i16, %bb2 ]
704 %i4 = mul nuw nsw <2 x i64> %i3, splat (i64 5)
705 %i5 = mul <2 x i64> %i3, splat (i64 5)
706 %i6 = add <2 x i64> %i5, <i64 10, i64 10>
707 %i7 = getelementptr inbounds ptr, ptr %arg1, <2 x i64> %i4
708 %i8 = getelementptr inbounds ptr, ptr %arg1, <2 x i64> %i6
709 %i9 = call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> %i7, i32 8, <2 x i1> splat (i1 true), <2 x ptr> undef)
710 %i10 = call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> %i8, i32 8, <2 x i1> splat (i1 true), <2 x ptr> undef)
711 %i11 = getelementptr inbounds ptr, ptr %arg, i64 %i
712 store <2 x ptr> %i9, ptr %i11, align 8
713 %i13 = getelementptr inbounds ptr, ptr %i11, i64 2
714 store <2 x ptr> %i10, ptr %i13, align 8
715 %i15 = add nuw i64 %i, 4
716 %i16 = add <2 x i64> %i3, <i64 4, i64 4>
717 %i17 = icmp eq i64 %i15, 1024
br i1 %i17, label %bb18, label %bb2

bb18: ; preds = %bb2
  ret void
}
724 declare <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr>, i32 immarg, <2 x i1>, <2 x ptr>)
726 ; Make sure we don't crash in getTgtMemIntrinsic for a vector of pointers.
727 define void @scatter_of_pointers(ptr noalias nocapture %arg, ptr noalias nocapture readonly %arg1) {
728 ; V-LABEL: @scatter_of_pointers(
730 ; V-NEXT: br label [[BB2:%.*]]
732 ; V-NEXT: [[I:%.*]] = phi i64 [ 0, [[BB:%.*]] ], [ [[I15:%.*]], [[BB2]] ]
733 ; V-NEXT: [[I3_SCALAR:%.*]] = phi i64 [ 0, [[BB]] ], [ [[I16_SCALAR:%.*]], [[BB2]] ]
734 ; V-NEXT: [[I3_SCALAR1:%.*]] = phi i64 [ 10, [[BB]] ], [ [[I16_SCALAR2:%.*]], [[BB2]] ]
735 ; V-NEXT: [[I4:%.*]] = getelementptr inbounds ptr, ptr [[ARG1:%.*]], i64 [[I]]
736 ; V-NEXT: [[I6:%.*]] = load <2 x ptr>, ptr [[I4]], align 8
737 ; V-NEXT: [[I7:%.*]] = getelementptr inbounds ptr, ptr [[I4]], i64 2
738 ; V-NEXT: [[I9:%.*]] = load <2 x ptr>, ptr [[I7]], align 8
739 ; V-NEXT: [[TMP0:%.*]] = getelementptr ptr, ptr [[ARG:%.*]], i64 [[I3_SCALAR]]
740 ; V-NEXT: [[TMP1:%.*]] = getelementptr ptr, ptr [[ARG]], i64 [[I3_SCALAR1]]
741 ; V-NEXT: call void @llvm.experimental.vp.strided.store.v2p0.p0.i64(<2 x ptr> [[I6]], ptr [[TMP0]], i64 40, <2 x i1> splat (i1 true), i32 2)
742 ; V-NEXT: call void @llvm.experimental.vp.strided.store.v2p0.p0.i64(<2 x ptr> [[I9]], ptr [[TMP1]], i64 40, <2 x i1> splat (i1 true), i32 2)
743 ; V-NEXT: [[I15]] = add nuw i64 [[I]], 4
744 ; V-NEXT: [[I16_SCALAR]] = add i64 [[I3_SCALAR]], 20
745 ; V-NEXT: [[I16_SCALAR2]] = add i64 [[I3_SCALAR1]], 20
746 ; V-NEXT: [[I17:%.*]] = icmp eq i64 [[I15]], 1024
747 ; V-NEXT: br i1 [[I17]], label [[BB18:%.*]], label [[BB2]]
751 ; ZVE32F-LABEL: @scatter_of_pointers(
753 ; ZVE32F-NEXT: br label [[BB2:%.*]]
755 ; ZVE32F-NEXT: [[I:%.*]] = phi i64 [ 0, [[BB:%.*]] ], [ [[I15:%.*]], [[BB2]] ]
756 ; ZVE32F-NEXT: [[I3:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[BB]] ], [ [[I16:%.*]], [[BB2]] ]
757 ; ZVE32F-NEXT: [[I4:%.*]] = getelementptr inbounds ptr, ptr [[ARG1:%.*]], i64 [[I]]
758 ; ZVE32F-NEXT: [[I6:%.*]] = load <2 x ptr>, ptr [[I4]], align 8
759 ; ZVE32F-NEXT: [[I7:%.*]] = getelementptr inbounds ptr, ptr [[I4]], i64 2
760 ; ZVE32F-NEXT: [[I9:%.*]] = load <2 x ptr>, ptr [[I7]], align 8
761 ; ZVE32F-NEXT: [[I10:%.*]] = mul nuw nsw <2 x i64> [[I3]], splat (i64 5)
762 ; ZVE32F-NEXT: [[I11:%.*]] = mul <2 x i64> [[I3]], splat (i64 5)
763 ; ZVE32F-NEXT: [[I12:%.*]] = add <2 x i64> [[I11]], splat (i64 10)
764 ; ZVE32F-NEXT: [[I13:%.*]] = getelementptr inbounds ptr, ptr [[ARG:%.*]], <2 x i64> [[I10]]
765 ; ZVE32F-NEXT: [[I14:%.*]] = getelementptr inbounds ptr, ptr [[ARG]], <2 x i64> [[I12]]
766 ; ZVE32F-NEXT: call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> [[I6]], <2 x ptr> [[I13]], i32 8, <2 x i1> splat (i1 true))
767 ; ZVE32F-NEXT: call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> [[I9]], <2 x ptr> [[I14]], i32 8, <2 x i1> splat (i1 true))
768 ; ZVE32F-NEXT: [[I15]] = add nuw i64 [[I]], 4
769 ; ZVE32F-NEXT: [[I16]] = add <2 x i64> [[I3]], splat (i64 4)
770 ; ZVE32F-NEXT: [[I17:%.*]] = icmp eq i64 [[I15]], 1024
771 ; ZVE32F-NEXT: br i1 [[I17]], label [[BB18:%.*]], label [[BB2]]
773 ; ZVE32F-NEXT: ret void
778 bb2: ; preds = %bb2, %bb
779 %i = phi i64 [ 0, %bb ], [ %i15, %bb2 ]
780 %i3 = phi <2 x i64> [ <i64 0, i64 1>, %bb ], [ %i16, %bb2 ]
781 %i4 = getelementptr inbounds ptr, ptr %arg1, i64 %i
782 %i6 = load <2 x ptr>, ptr %i4, align 8
783 %i7 = getelementptr inbounds ptr, ptr %i4, i64 2
784 %i9 = load <2 x ptr>, ptr %i7, align 8
785 %i10 = mul nuw nsw <2 x i64> %i3, splat (i64 5)
786 %i11 = mul <2 x i64> %i3, splat (i64 5)
787 %i12 = add <2 x i64> %i11, <i64 10, i64 10>
788 %i13 = getelementptr inbounds ptr, ptr %arg, <2 x i64> %i10
789 %i14 = getelementptr inbounds ptr, ptr %arg, <2 x i64> %i12
790 call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> %i6, <2 x ptr> %i13, i32 8, <2 x i1> splat (i1 true))
791 call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> %i9, <2 x ptr> %i14, i32 8, <2 x i1> splat (i1 true))
792 %i15 = add nuw i64 %i, 4
793 %i16 = add <2 x i64> %i3, <i64 4, i64 4>
794 %i17 = icmp eq i64 %i15, 1024
br i1 %i17, label %bb18, label %bb2

bb18: ; preds = %bb2
  ret void
}
801 declare void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr>, <2 x ptr>, i32 immarg, <2 x i1>)
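
; The induction starts at a splat of a runtime value; the start offset is
; folded into the initial scalar offset ([[START]] = sext(%arg2) * 5) and the
; vector loop then advances by 160 bytes per iteration, with a scalar loop for
; the remainder.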
803 define void @strided_load_startval_add_with_splat(ptr noalias nocapture %arg, ptr noalias nocapture readonly %arg1, i32 signext %arg2) {
804 ; CHECK-LABEL: @strided_load_startval_add_with_splat(
806 ; CHECK-NEXT: [[I:%.*]] = icmp eq i32 [[ARG2:%.*]], 1024
807 ; CHECK-NEXT: br i1 [[I]], label [[BB34:%.*]], label [[BB3:%.*]]
809 ; CHECK-NEXT: [[I4:%.*]] = sext i32 [[ARG2]] to i64
810 ; CHECK-NEXT: [[I5:%.*]] = sub i32 1023, [[ARG2]]
811 ; CHECK-NEXT: [[I6:%.*]] = zext i32 [[I5]] to i64
812 ; CHECK-NEXT: [[I7:%.*]] = add nuw nsw i64 [[I6]], 1
813 ; CHECK-NEXT: [[I8:%.*]] = icmp ult i32 [[I5]], 31
814 ; CHECK-NEXT: br i1 [[I8]], label [[BB32:%.*]], label [[BB9:%.*]]
816 ; CHECK-NEXT: [[I10:%.*]] = and i64 [[I7]], 8589934560
817 ; CHECK-NEXT: [[I11:%.*]] = add nsw i64 [[I10]], [[I4]]
818 ; CHECK-NEXT: [[START:%.*]] = mul i64 [[I4]], 5
819 ; CHECK-NEXT: br label [[BB15:%.*]]
821 ; CHECK-NEXT: [[I16:%.*]] = phi i64 [ 0, [[BB9]] ], [ [[I27:%.*]], [[BB15]] ]
822 ; CHECK-NEXT: [[I17_SCALAR:%.*]] = phi i64 [ [[START]], [[BB9]] ], [ [[I28_SCALAR:%.*]], [[BB15]] ]
823 ; CHECK-NEXT: [[I18:%.*]] = add i64 [[I16]], [[I4]]
824 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[ARG1:%.*]], i64 [[I17_SCALAR]]
825 ; CHECK-NEXT: [[TMP1:%.*]] = call <32 x i8> @llvm.experimental.vp.strided.load.v32i8.p0.i64(ptr [[TMP0]], i64 5, <32 x i1> splat (i1 true), i32 32)
826 ; CHECK-NEXT: [[I21:%.*]] = call <32 x i8> @llvm.vp.select.v32i8(<32 x i1> splat (i1 true), <32 x i8> [[TMP1]], <32 x i8> undef, i32 32)
827 ; CHECK-NEXT: [[I22:%.*]] = getelementptr inbounds i8, ptr [[ARG:%.*]], i64 [[I18]]
828 ; CHECK-NEXT: [[I24:%.*]] = load <32 x i8>, ptr [[I22]], align 1
829 ; CHECK-NEXT: [[I25:%.*]] = add <32 x i8> [[I24]], [[I21]]
830 ; CHECK-NEXT: store <32 x i8> [[I25]], ptr [[I22]], align 1
831 ; CHECK-NEXT: [[I27]] = add nuw i64 [[I16]], 32
832 ; CHECK-NEXT: [[I28_SCALAR]] = add i64 [[I17_SCALAR]], 160
833 ; CHECK-NEXT: [[I29:%.*]] = icmp eq i64 [[I27]], [[I10]]
834 ; CHECK-NEXT: br i1 [[I29]], label [[BB30:%.*]], label [[BB15]]
836 ; CHECK-NEXT: [[I31:%.*]] = icmp eq i64 [[I7]], [[I10]]
837 ; CHECK-NEXT: br i1 [[I31]], label [[BB34]], label [[BB32]]
839 ; CHECK-NEXT: [[I33:%.*]] = phi i64 [ [[I4]], [[BB3]] ], [ [[I11]], [[BB30]] ]
840 ; CHECK-NEXT: br label [[BB35:%.*]]
842 ; CHECK-NEXT: ret void
844 ; CHECK-NEXT: [[I36:%.*]] = phi i64 [ [[I43:%.*]], [[BB35]] ], [ [[I33]], [[BB32]] ]
845 ; CHECK-NEXT: [[I37:%.*]] = mul nsw i64 [[I36]], 5
846 ; CHECK-NEXT: [[I38:%.*]] = getelementptr inbounds i8, ptr [[ARG1]], i64 [[I37]]
847 ; CHECK-NEXT: [[I39:%.*]] = load i8, ptr [[I38]], align 1
848 ; CHECK-NEXT: [[I40:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 [[I36]]
849 ; CHECK-NEXT: [[I41:%.*]] = load i8, ptr [[I40]], align 1
850 ; CHECK-NEXT: [[I42:%.*]] = add i8 [[I41]], [[I39]]
851 ; CHECK-NEXT: store i8 [[I42]], ptr [[I40]], align 1
852 ; CHECK-NEXT: [[I43]] = add nsw i64 [[I36]], 1
853 ; CHECK-NEXT: [[I44:%.*]] = trunc i64 [[I43]] to i32
854 ; CHECK-NEXT: [[I45:%.*]] = icmp eq i32 [[I44]], 1024
855 ; CHECK-NEXT: br i1 [[I45]], label [[BB34]], label [[BB35]]
858 %i = icmp eq i32 %arg2, 1024
859 br i1 %i, label %bb34, label %bb3
862 %i4 = sext i32 %arg2 to i64
863 %i5 = sub i32 1023, %arg2
864 %i6 = zext i32 %i5 to i64
865 %i7 = add nuw nsw i64 %i6, 1
866 %i8 = icmp ult i32 %i5, 31
867 br i1 %i8, label %bb32, label %bb9
870 %i10 = and i64 %i7, 8589934560
871 %i11 = add nsw i64 %i10, %i4
872 %i12 = insertelement <32 x i64> poison, i64 %i4, i64 0
873 %i13 = shufflevector <32 x i64> %i12, <32 x i64> poison, <32 x i32> zeroinitializer
874 %i14 = add <32 x i64> %i13, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>
877 bb15: ; preds = %bb15, %bb9
878 %i16 = phi i64 [ 0, %bb9 ], [ %i27, %bb15 ]
879 %i17 = phi <32 x i64> [ %i14, %bb9 ], [ %i28, %bb15 ]
880 %i18 = add i64 %i16, %i4
881 %i19 = mul nsw <32 x i64> %i17, splat (i64 5)
882 %i20 = getelementptr inbounds i8, ptr %arg1, <32 x i64> %i19
883 %i21 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i20, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
884 %i22 = getelementptr inbounds i8, ptr %arg, i64 %i18
885 %i24 = load <32 x i8>, ptr %i22, align 1
886 %i25 = add <32 x i8> %i24, %i21
887 store <32 x i8> %i25, ptr %i22, align 1
888 %i27 = add nuw i64 %i16, 32
889 %i28 = add <32 x i64> %i17, splat (i64 32)
890 %i29 = icmp eq i64 %i27, %i10
891 br i1 %i29, label %bb30, label %bb15
893 bb30: ; preds = %bb15
894 %i31 = icmp eq i64 %i7, %i10
895 br i1 %i31, label %bb34, label %bb32
897 bb32: ; preds = %bb30, %bb3
898 %i33 = phi i64 [ %i4, %bb3 ], [ %i11, %bb30 ]
bb34: ; preds = %bb35, %bb30, %bb
  ret void
904 bb35: ; preds = %bb35, %bb32
905 %i36 = phi i64 [ %i43, %bb35 ], [ %i33, %bb32 ]
906 %i37 = mul nsw i64 %i36, 5
907 %i38 = getelementptr inbounds i8, ptr %arg1, i64 %i37
908 %i39 = load i8, ptr %i38, align 1
909 %i40 = getelementptr inbounds i8, ptr %arg, i64 %i36
910 %i41 = load i8, ptr %i40, align 1
911 %i42 = add i8 %i41, %i39
912 store i8 %i42, ptr %i40, align 1
913 %i43 = add nsw i64 %i36, 1
914 %i44 = trunc i64 %i43 to i32
915 %i45 = icmp eq i32 %i44, 1024
br i1 %i45, label %bb34, label %bb35
}
919 declare <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr>, i32 immarg, <16 x i1>, <16 x i8>)
920 declare void @llvm.masked.scatter.v16i8.v16p0(<16 x i8>, <16 x ptr>, i32 immarg, <16 x i1>)
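
; Same stride-5 pattern with a <16 x i8> factor and a runtime trip count
; (%arg2 * 16); the scalar index steps by 80 = 16 * 5 per iteration.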
922 define void @gather_no_scalar_remainder(ptr noalias nocapture noundef %arg, ptr noalias nocapture noundef readonly %arg1, i64 noundef %arg2) {
923 ; CHECK-LABEL: @gather_no_scalar_remainder(
925 ; CHECK-NEXT: [[I:%.*]] = shl i64 [[ARG2:%.*]], 4
926 ; CHECK-NEXT: [[I3:%.*]] = icmp eq i64 [[I]], 0
927 ; CHECK-NEXT: br i1 [[I3]], label [[BB16:%.*]], label [[BB2:%.*]]
929 ; CHECK-NEXT: br label [[BB4:%.*]]
931 ; CHECK-NEXT: [[I5:%.*]] = phi i64 [ [[I13:%.*]], [[BB4]] ], [ 0, [[BB2]] ]
932 ; CHECK-NEXT: [[I6_SCALAR:%.*]] = phi i64 [ 0, [[BB2]] ], [ [[I14_SCALAR:%.*]], [[BB4]] ]
933 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[ARG1:%.*]], i64 [[I6_SCALAR]]
934 ; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.experimental.vp.strided.load.v16i8.p0.i64(ptr [[TMP0]], i64 5, <16 x i1> splat (i1 true), i32 16)
935 ; CHECK-NEXT: [[I9:%.*]] = call <16 x i8> @llvm.vp.select.v16i8(<16 x i1> splat (i1 true), <16 x i8> [[TMP1]], <16 x i8> undef, i32 16)
936 ; CHECK-NEXT: [[I10:%.*]] = getelementptr inbounds i8, ptr [[ARG:%.*]], i64 [[I5]]
937 ; CHECK-NEXT: [[I11:%.*]] = load <16 x i8>, ptr [[I10]], align 1
938 ; CHECK-NEXT: [[I12:%.*]] = add <16 x i8> [[I11]], [[I9]]
939 ; CHECK-NEXT: store <16 x i8> [[I12]], ptr [[I10]], align 1
940 ; CHECK-NEXT: [[I13]] = add nuw i64 [[I5]], 16
941 ; CHECK-NEXT: [[I14_SCALAR]] = add i64 [[I6_SCALAR]], 80
942 ; CHECK-NEXT: [[I15:%.*]] = icmp eq i64 [[I13]], [[I]]
943 ; CHECK-NEXT: br i1 [[I15]], label [[BB16]], label [[BB4]]
945 ; CHECK-NEXT: ret void
948 %i = shl i64 %arg2, 4
949 %i3 = icmp eq i64 %i, 0
950 br i1 %i3, label %bb16, label %bb2
955 bb4: ; preds = %bb4, %bb2
956 %i5 = phi i64 [ %i13, %bb4 ], [ 0, %bb2 ]
957 %i6 = phi <16 x i64> [ %i14, %bb4 ], [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, %bb2 ]
958 %i7 = mul <16 x i64> %i6, splat (i64 5)
959 %i8 = getelementptr inbounds i8, ptr %arg1, <16 x i64> %i7
960 %i9 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %i8, i32 1, <16 x i1> splat (i1 true), <16 x i8> undef)
961 %i10 = getelementptr inbounds i8, ptr %arg, i64 %i5
962 %i11 = load <16 x i8>, ptr %i10, align 1
963 %i12 = add <16 x i8> %i11, %i9
964 store <16 x i8> %i12, ptr %i10, align 1
965 %i13 = add nuw i64 %i5, 16
966 %i14 = add <16 x i64> %i6, splat (i64 16)
967 %i15 = icmp eq i64 %i13, %i
968 br i1 %i15, label %bb16, label %bb4
bb16: ; preds = %bb4, %bb
  ret void
}
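
; The base pointer is broadcast and the indices are the constants 0, 64, ...,
; 448, so the whole gather collapses to a single strided load with a 64-byte
; stride starting at %a.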
974 define <8 x i8> @broadcast_ptr_base(ptr %a) {
975 ; CHECK-LABEL: @broadcast_ptr_base(
977 ; CHECK-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.experimental.vp.strided.load.v8i8.p0.i64(ptr [[A:%.*]], i64 64, <8 x i1> splat (i1 true), i32 8)
978 ; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.vp.select.v8i8(<8 x i1> splat (i1 true), <8 x i8> [[TMP0]], <8 x i8> poison, i32 8)
979 ; CHECK-NEXT: ret <8 x i8> [[TMP1]]
982 %0 = insertelement <8 x ptr> poison, ptr %a, i64 0
983 %1 = shufflevector <8 x ptr> %0, <8 x ptr> poison, <8 x i32> zeroinitializer
984 %2 = getelementptr i8, <8 x ptr> %1, <8 x i64> <i64 0, i64 64, i64 128, i64 192, i64 256, i64 320, i64 384, i64 448>
%3 = tail call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> %2, i32 1, <8 x i1> splat (i1 true), <8 x i8> poison)
  ret <8 x i8> %3
}
989 declare <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr>, i32 immarg, <8 x i1>, <8 x i8>)
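
; Negative test: with an i16 index vector the pass does not form a strided
; access, and the masked.gather is left unchanged.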
991 define void @gather_narrow_idx(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
992 ; CHECK-LABEL: @gather_narrow_idx(
994 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
995 ; CHECK: vector.body:
996 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
997 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <32 x i16> [ <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 20, i16 21, i16 22, i16 23, i16 24, i16 25, i16 26, i16 27, i16 28, i16 29, i16 30, i16 31>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
998 ; CHECK-NEXT: [[I:%.*]] = mul nuw nsw <32 x i16> [[VEC_IND]], splat (i16 5)
999 ; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], <32 x i16> [[I]]
1000 ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> [[I1]], i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
1001 ; CHECK-NEXT: [[I2:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
1002 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[I2]], align 1
1003 ; CHECK-NEXT: [[I4:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
1004 ; CHECK-NEXT: store <32 x i8> [[I4]], ptr [[I2]], align 1
1005 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
1006 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <32 x i16> [[VEC_IND]], splat (i16 32)
1007 ; CHECK-NEXT: [[I6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
1008 ; CHECK-NEXT: br i1 [[I6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
1009 ; CHECK: for.cond.cleanup:
1010 ; CHECK-NEXT: ret void
1013 br label %vector.body
1015 vector.body: ; preds = %vector.body, %entry
1016 %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
1017 %vec.ind = phi <32 x i16> [ <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 20, i16 21, i16 22, i16 23, i16 24, i16 25, i16 26, i16 27, i16 28, i16 29, i16 30, i16 31>, %entry ], [ %vec.ind.next, %vector.body ]
1018 %i = mul nuw nsw <32 x i16> %vec.ind, splat (i16 5)
1019 %i1 = getelementptr inbounds i8, ptr %B, <32 x i16> %i
1020 %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
1021 %i2 = getelementptr inbounds i8, ptr %A, i64 %index
1022 %wide.load = load <32 x i8>, ptr %i2, align 1
1023 %i4 = add <32 x i8> %wide.load, %wide.masked.gather
1024 store <32 x i8> %i4, ptr %i2, align 1
1025 %index.next = add nuw i64 %index, 32
1026 %vec.ind.next = add <32 x i16> %vec.ind, splat (i16 32)
1027 %i6 = icmp eq i64 %index.next, 1024
1028 br i1 %i6, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body
  ret void
}