1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt < %s -passes=loop-vectorize,simplifycfg -mcpu=knl -S | FileCheck %s -check-prefix=AVX512
3 ; RUN: opt < %s -passes=loop-vectorize,simplifycfg -mcpu=knl -force-vector-width=2 -force-target-max-vector-interleave=1 -S | FileCheck %s -check-prefix=FVW2
; When the vector width is forced (-force-vector-width), it is sometimes more
; profitable to generate scalarized and predicated stores instead of a masked
; scatter. Interleaving is disabled to simplify the CHECK lines in that scenario.
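;
; As an illustration (a sketch only, not checked by this test), a conditional
; store under FVW2 may be emitted as a scalarized, predicated sequence:
;   %m = extractelement <2 x i1> %mask, i32 0
;   br i1 %m, label %pred.store.if, label %pred.store.continue
; instead of a single call to @llvm.masked.scatter.v2f32.v2p0.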
9 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
10 target triple = "x86_64-pc_linux"
;void foo1(float * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger, int * __restrict__ index) {
16 ; for (int i=0; i < SIZE; ++i) {
17 ; if (trigger[i] > 0) {
18 ; out[i] = in[index[i]] + (float) 0.5;
23 ; Function Attrs: nounwind uwtable
24 define void @foo1(ptr noalias %in, ptr noalias %out, ptr noalias %trigger, ptr noalias %index) {
25 ; AVX512-LABEL: @foo1(
27 ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
28 ; AVX512: vector.body:
29 ; AVX512-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
30 ; AVX512-NEXT: [[TMP0:%.*]] = add i64 [[INDEX1]], 0
31 ; AVX512-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
32 ; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
33 ; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP2]], align 4
34 ; AVX512-NEXT: [[TMP3:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD]], zeroinitializer
35 ; AVX512-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[INDEX:%.*]], i64 [[TMP0]]
36 ; AVX512-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
37 ; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP5]], i32 4, <16 x i1> [[TMP3]], <16 x i32> poison)
38 ; AVX512-NEXT: [[TMP6:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD]] to <16 x i64>
39 ; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], <16 x i64> [[TMP6]]
40 ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> [[TMP7]], i32 4, <16 x i1> [[TMP3]], <16 x float> poison)
41 ; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
42 ; AVX512-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[OUT:%.*]], i64 [[TMP0]]
43 ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[TMP9]], i32 0
44 ; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP8]], ptr [[TMP10]], i32 4, <16 x i1> [[TMP3]])
45 ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16
46 ; AVX512-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
47 ; AVX512-NEXT: br i1 [[TMP11]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
49 ; AVX512-NEXT: ret void
53 ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]]
55 ; FVW2-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
56 ; FVW2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX1]], 0
57 ; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
58 ; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
59 ; FVW2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
60 ; FVW2-NEXT: [[TMP3:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], zeroinitializer
61 ; FVW2-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[INDEX:%.*]], i64 [[TMP0]]
62 ; FVW2-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
63 ; FVW2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr [[TMP5]], i32 4, <2 x i1> [[TMP3]], <2 x i32> poison)
64 ; FVW2-NEXT: [[TMP6:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD]] to <2 x i64>
65 ; FVW2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], <2 x i64> [[TMP6]]
66 ; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> [[TMP7]], i32 4, <2 x i1> [[TMP3]], <2 x float> poison)
67 ; FVW2-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
68 ; FVW2-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[OUT:%.*]], i64 [[TMP0]]
69 ; FVW2-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[TMP9]], i32 0
70 ; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0(<2 x float> [[TMP8]], ptr [[TMP10]], i32 4, <2 x i1> [[TMP3]])
71 ; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 2
72 ; FVW2-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
73 ; FVW2-NEXT: br i1 [[TMP11]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
81 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
82 %arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv
83 %0 = load i32, ptr %arrayidx, align 4
84 %cmp1 = icmp sgt i32 %0, 0
85 br i1 %cmp1, label %if.then, label %for.inc
88 %arrayidx3 = getelementptr inbounds i32, ptr %index, i64 %indvars.iv
89 %1 = load i32, ptr %arrayidx3, align 4
90 %idxprom4 = sext i32 %1 to i64
91 %arrayidx5 = getelementptr inbounds float, ptr %in, i64 %idxprom4
92 %2 = load float, ptr %arrayidx5, align 4
93 %add = fadd float %2, 5.000000e-01
94 %arrayidx7 = getelementptr inbounds float, ptr %out, i64 %indvars.iv
95 store float %add, ptr %arrayidx7, align 4
99 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
100 %exitcond.not = icmp eq i64 %indvars.iv.next, 4096
101 br i1 %exitcond.not, label %for.end, label %for.body
;void foo2 (In * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger) {
110 ; for (int i=0; i<SIZE; i += 16) {
111 ; if (trigger[i] > 0) {
112 ; out[i] = in[i].b + (float) 0.5;
117 %struct.In = type { float, float }
119 define void @foo2(ptr noalias %in, ptr noalias %out, ptr noalias %trigger, ptr noalias %index) #0 {
120 ; AVX512-LABEL: @foo2(
121 ; AVX512-NEXT: entry:
122 ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
123 ; AVX512: vector.body:
124 ; AVX512-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
125 ; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
126 ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
127 ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> poison)
128 ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
129 ; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
130 ; AVX512-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> poison)
131 ; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
132 ; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], <16 x i64> [[VEC_IND]]
133 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[TMP3]], <16 x ptr> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
134 ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16
135 ; AVX512-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256>
136 ; AVX512-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
137 ; AVX512-NEXT: br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
139 ; AVX512-NEXT: ret void
143 ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]]
145 ; FVW2-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
146 ; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE3]] ]
147 ; FVW2-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX1]], 16
148 ; FVW2-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
149 ; FVW2-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 16
150 ; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
151 ; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP1]]
152 ; FVW2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4
153 ; FVW2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 4
154 ; FVW2-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
155 ; FVW2-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1
156 ; FVW2-NEXT: [[TMP8:%.*]] = icmp sgt <2 x i32> [[TMP7]], zeroinitializer
157 ; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
158 ; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> [[TMP9]], i32 4, <2 x i1> [[TMP8]], <2 x float> poison)
159 ; FVW2-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
160 ; FVW2-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
161 ; FVW2-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
162 ; FVW2: pred.store.if:
163 ; FVW2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], i64 [[TMP0]]
164 ; FVW2-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP10]], i32 0
165 ; FVW2-NEXT: store float [[TMP13]], ptr [[TMP12]], align 4
166 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]]
167 ; FVW2: pred.store.continue:
168 ; FVW2-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
169 ; FVW2-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
170 ; FVW2: pred.store.if2:
171 ; FVW2-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[OUT]], i64 [[TMP1]]
172 ; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP10]], i32 1
173 ; FVW2-NEXT: store float [[TMP16]], ptr [[TMP15]], align 4
174 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE3]]
175 ; FVW2: pred.store.continue3:
176 ; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 2
177 ; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
178 ; FVW2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
179 ; FVW2-NEXT: br i1 [[TMP17]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
181 ; FVW2-NEXT: ret void
187 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
188 %arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv
189 %0 = load i32, ptr %arrayidx, align 4
190 %cmp1 = icmp sgt i32 %0, 0
191 br i1 %cmp1, label %if.then, label %for.inc
194 %b = getelementptr inbounds %struct.In, ptr %in, i64 %indvars.iv, i32 1
195 %1 = load float, ptr %b, align 4
196 %add = fadd float %1, 5.000000e-01
197 %arrayidx5 = getelementptr inbounds float, ptr %out, i64 %indvars.iv
198 store float %add, ptr %arrayidx5, align 4
202 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 16
203 %cmp = icmp ult i64 %indvars.iv, 4080
204 br i1 %cmp, label %for.body, label %for.end
215 ;void foo3 (In * __restrict__ in, Out * __restrict__ out, int * __restrict__ trigger) {
217 ; for (int i=0; i<SIZE; i += 16) {
218 ; if (trigger[i] > 0) {
219 ; out[i].b = in[i].b + (float) 0.5;
224 %struct.Out = type { float, float }
226 define void @foo3(ptr noalias %in, ptr noalias %out, ptr noalias %trigger) {
227 ; AVX512-LABEL: @foo3(
228 ; AVX512-NEXT: entry:
229 ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
230 ; AVX512: vector.body:
231 ; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
232 ; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
233 ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
234 ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> poison)
235 ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
236 ; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
237 ; AVX512-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> poison)
238 ; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER1]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
239 ; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], ptr [[OUT:%.*]], <16 x i64> [[VEC_IND]], i32 1
240 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[TMP3]], <16 x ptr> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
241 ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
242 ; AVX512-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256>
243 ; AVX512-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
244 ; AVX512-NEXT: br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
246 ; AVX512-NEXT: ret void
250 ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]]
252 ; FVW2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
253 ; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE2]] ]
254 ; FVW2-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16
255 ; FVW2-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
256 ; FVW2-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 16
257 ; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
258 ; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP1]]
259 ; FVW2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4
260 ; FVW2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 4
261 ; FVW2-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
262 ; FVW2-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1
263 ; FVW2-NEXT: [[TMP8:%.*]] = icmp sgt <2 x i32> [[TMP7]], zeroinitializer
264 ; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
265 ; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> [[TMP9]], i32 4, <2 x i1> [[TMP8]], <2 x float> poison)
266 ; FVW2-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
267 ; FVW2-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
268 ; FVW2-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
269 ; FVW2: pred.store.if:
270 ; FVW2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], ptr [[OUT:%.*]], i64 [[TMP0]], i32 1
271 ; FVW2-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP10]], i32 0
272 ; FVW2-NEXT: store float [[TMP13]], ptr [[TMP12]], align 4
273 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]]
274 ; FVW2: pred.store.continue:
275 ; FVW2-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
276 ; FVW2-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
277 ; FVW2: pred.store.if1:
278 ; FVW2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_OUT]], ptr [[OUT]], i64 [[TMP1]], i32 1
279 ; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP10]], i32 1
280 ; FVW2-NEXT: store float [[TMP16]], ptr [[TMP15]], align 4
281 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE2]]
282 ; FVW2: pred.store.continue2:
283 ; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
284 ; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
285 ; FVW2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
286 ; FVW2-NEXT: br i1 [[TMP17]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
288 ; FVW2-NEXT: ret void
294 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
295 %arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv
296 %0 = load i32, ptr %arrayidx, align 4
297 %cmp1 = icmp sgt i32 %0, 0
298 br i1 %cmp1, label %if.then, label %for.inc
301 %b = getelementptr inbounds %struct.In, ptr %in, i64 %indvars.iv, i32 1
302 %1 = load float, ptr %b, align 4
303 %add = fadd float %1, 5.000000e-01
304 %b6 = getelementptr inbounds %struct.Out, ptr %out, i64 %indvars.iv, i32 1
305 store float %add, ptr %b6, align 4
309 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 16
310 %cmp = icmp ult i64 %indvars.iv, 4080
311 br i1 %cmp, label %for.body, label %for.end
316 declare void @llvm.masked.scatter.v16f32.v16p0(<16 x float>, <16 x ptr>, i32, <16 x i1>)
; Same as @foo2, but the gather/scatter argument is a vector of pointers in address space 1.
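; A C-style sketch of the assumed source (hypothetical; address space 1 is
; written with a Clang-style attribute, otherwise this mirrors @foo2):
;
;void foo2_addrspace (In __attribute__((address_space(1))) * __restrict__ in,
;                     float __attribute__((address_space(1))) * __restrict__ out,
;                     int * __restrict__ trigger) {
;  for (int i=0; i<SIZE; i += 16) {
;    if (trigger[i] > 0) {
;      out[i] = in[i].b + (float) 0.5;
;    }
;  }
;}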
320 define void @foo2_addrspace(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out, ptr noalias %trigger, ptr noalias %index) #0 {
321 ; AVX512-LABEL: @foo2_addrspace(
322 ; AVX512-NEXT: entry:
323 ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
324 ; AVX512: vector.body:
325 ; AVX512-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
326 ; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
327 ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
328 ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> poison)
329 ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
330 ; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr addrspace(1) [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
331 ; AVX512-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1(<16 x ptr addrspace(1)> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> poison)
332 ; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
333 ; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[OUT:%.*]], <16 x i64> [[VEC_IND]]
334 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1(<16 x float> [[TMP3]], <16 x ptr addrspace(1)> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
335 ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16
336 ; AVX512-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256>
337 ; AVX512-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
338 ; AVX512-NEXT: br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
340 ; AVX512-NEXT: ret void
342 ; FVW2-LABEL: @foo2_addrspace(
344 ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]]
346 ; FVW2-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
347 ; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE3]] ]
348 ; FVW2-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX1]], 16
349 ; FVW2-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
350 ; FVW2-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 16
351 ; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
352 ; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP1]]
353 ; FVW2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4
354 ; FVW2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 4
355 ; FVW2-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
356 ; FVW2-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1
357 ; FVW2-NEXT: [[TMP8:%.*]] = icmp sgt <2 x i32> [[TMP7]], zeroinitializer
358 ; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr addrspace(1) [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
359 ; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1(<2 x ptr addrspace(1)> [[TMP9]], i32 4, <2 x i1> [[TMP8]], <2 x float> poison)
360 ; FVW2-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
361 ; FVW2-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
362 ; FVW2-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
363 ; FVW2: pred.store.if:
364 ; FVW2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[OUT:%.*]], i64 [[TMP0]]
365 ; FVW2-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP10]], i32 0
366 ; FVW2-NEXT: store float [[TMP13]], ptr addrspace(1) [[TMP12]], align 4
367 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]]
368 ; FVW2: pred.store.continue:
369 ; FVW2-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
370 ; FVW2-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
371 ; FVW2: pred.store.if2:
372 ; FVW2-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
373 ; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP10]], i32 1
374 ; FVW2-NEXT: store float [[TMP16]], ptr addrspace(1) [[TMP15]], align 4
375 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE3]]
376 ; FVW2: pred.store.continue3:
377 ; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 2
378 ; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
379 ; FVW2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
380 ; FVW2-NEXT: br i1 [[TMP17]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
382 ; FVW2-NEXT: ret void
388 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
389 %arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv
390 %0 = load i32, ptr %arrayidx, align 4
391 %cmp1 = icmp sgt i32 %0, 0
392 br i1 %cmp1, label %if.then, label %for.inc
395 %b = getelementptr inbounds %struct.In, ptr addrspace(1) %in, i64 %indvars.iv, i32 1
396 %1 = load float, ptr addrspace(1) %b, align 4
397 %add = fadd float %1, 5.000000e-01
398 %arrayidx5 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %indvars.iv
399 store float %add, ptr addrspace(1) %arrayidx5, align 4
403 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 16
404 %cmp = icmp ult i64 %indvars.iv, 4080
405 br i1 %cmp, label %for.body, label %for.end
; Same as @foo2_addrspace, but here only the input has a non-default address space.
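; A C-style sketch of the assumed source (hypothetical; only `in` carries the
; Clang-style address-space attribute):
;
;void foo2_addrspace2 (In __attribute__((address_space(1))) * __restrict__ in,
;                      float * __restrict__ out,
;                      int * __restrict__ trigger) {
;  for (int i=0; i<SIZE; i += 16) {
;    if (trigger[i] > 0) {
;      out[i] = in[i].b + (float) 0.5;
;    }
;  }
;}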
413 define void @foo2_addrspace2(ptr addrspace(1) noalias %in, ptr addrspace(0) noalias %out, ptr noalias %trigger, ptr noalias %index) {
414 ; AVX512-LABEL: @foo2_addrspace2(
415 ; AVX512-NEXT: entry:
416 ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
417 ; AVX512: vector.body:
418 ; AVX512-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
419 ; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
420 ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
421 ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> poison)
422 ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
423 ; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr addrspace(1) [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
424 ; AVX512-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1(<16 x ptr addrspace(1)> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> poison)
425 ; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
426 ; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], <16 x i64> [[VEC_IND]]
427 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[TMP3]], <16 x ptr> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
428 ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16
429 ; AVX512-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256>
430 ; AVX512-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
431 ; AVX512-NEXT: br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
433 ; AVX512-NEXT: ret void
435 ; FVW2-LABEL: @foo2_addrspace2(
437 ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]]
439 ; FVW2-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
440 ; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE3]] ]
441 ; FVW2-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX1]], 16
442 ; FVW2-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
443 ; FVW2-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 16
444 ; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
445 ; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP1]]
446 ; FVW2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4
447 ; FVW2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 4
448 ; FVW2-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
449 ; FVW2-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1
450 ; FVW2-NEXT: [[TMP8:%.*]] = icmp sgt <2 x i32> [[TMP7]], zeroinitializer
451 ; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr addrspace(1) [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
452 ; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1(<2 x ptr addrspace(1)> [[TMP9]], i32 4, <2 x i1> [[TMP8]], <2 x float> poison)
453 ; FVW2-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
454 ; FVW2-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
455 ; FVW2-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
456 ; FVW2: pred.store.if:
457 ; FVW2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], i64 [[TMP0]]
458 ; FVW2-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP10]], i32 0
459 ; FVW2-NEXT: store float [[TMP13]], ptr [[TMP12]], align 4
460 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]]
461 ; FVW2: pred.store.continue:
462 ; FVW2-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
463 ; FVW2-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
464 ; FVW2: pred.store.if2:
465 ; FVW2-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[OUT]], i64 [[TMP1]]
466 ; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP10]], i32 1
467 ; FVW2-NEXT: store float [[TMP16]], ptr [[TMP15]], align 4
468 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE3]]
469 ; FVW2: pred.store.continue3:
470 ; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 2
471 ; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
472 ; FVW2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
473 ; FVW2-NEXT: br i1 [[TMP17]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
475 ; FVW2-NEXT: ret void
481 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
482 %arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv
483 %0 = load i32, ptr %arrayidx, align 4
484 %cmp1 = icmp sgt i32 %0, 0
485 br i1 %cmp1, label %if.then, label %for.inc
488 %b = getelementptr inbounds %struct.In, ptr addrspace(1) %in, i64 %indvars.iv, i32 1
489 %1 = load float, ptr addrspace(1) %b, align 4
490 %add = fadd float %1, 5.000000e-01
491 %arrayidx5 = getelementptr inbounds float, ptr %out, i64 %indvars.iv
492 store float %add, ptr %arrayidx5, align 4
496 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 16
497 %cmp = icmp ult i64 %indvars.iv, 4080
498 br i1 %cmp, label %for.body, label %for.end
; Same as @foo2_addrspace, but here only the output has a non-default address space.
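; A C-style sketch of the assumed source (hypothetical; only `out` carries the
; Clang-style address-space attribute):
;
;void foo2_addrspace3 (In * __restrict__ in,
;                      float __attribute__((address_space(1))) * __restrict__ out,
;                      int * __restrict__ trigger) {
;  for (int i=0; i<SIZE; i += 16) {
;    if (trigger[i] > 0) {
;      out[i] = in[i].b + (float) 0.5;
;    }
;  }
;}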
506 define void @foo2_addrspace3(ptr addrspace(0) noalias %in, ptr addrspace(1) noalias %out, ptr noalias %trigger, ptr noalias %index) {
507 ; AVX512-LABEL: @foo2_addrspace3(
508 ; AVX512-NEXT: entry:
509 ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
510 ; AVX512: vector.body:
511 ; AVX512-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
512 ; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
513 ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
514 ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> poison)
515 ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
516 ; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
517 ; AVX512-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> poison)
518 ; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
519 ; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[OUT:%.*]], <16 x i64> [[VEC_IND]]
520 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1(<16 x float> [[TMP3]], <16 x ptr addrspace(1)> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
521 ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16
522 ; AVX512-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256>
523 ; AVX512-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
524 ; AVX512-NEXT: br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
526 ; AVX512-NEXT: ret void
528 ; FVW2-LABEL: @foo2_addrspace3(
530 ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]]
532 ; FVW2-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
533 ; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE3]] ]
534 ; FVW2-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX1]], 16
535 ; FVW2-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
536 ; FVW2-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 16
537 ; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
538 ; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP1]]
539 ; FVW2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4
540 ; FVW2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 4
541 ; FVW2-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
542 ; FVW2-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1
543 ; FVW2-NEXT: [[TMP8:%.*]] = icmp sgt <2 x i32> [[TMP7]], zeroinitializer
544 ; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
545 ; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> [[TMP9]], i32 4, <2 x i1> [[TMP8]], <2 x float> poison)
546 ; FVW2-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
547 ; FVW2-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
548 ; FVW2-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
549 ; FVW2: pred.store.if:
550 ; FVW2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[OUT:%.*]], i64 [[TMP0]]
551 ; FVW2-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP10]], i32 0
552 ; FVW2-NEXT: store float [[TMP13]], ptr addrspace(1) [[TMP12]], align 4
553 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]]
554 ; FVW2: pred.store.continue:
555 ; FVW2-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
556 ; FVW2-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
557 ; FVW2: pred.store.if2:
558 ; FVW2-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
559 ; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP10]], i32 1
560 ; FVW2-NEXT: store float [[TMP16]], ptr addrspace(1) [[TMP15]], align 4
561 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE3]]
562 ; FVW2: pred.store.continue3:
563 ; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 2
564 ; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
565 ; FVW2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
566 ; FVW2-NEXT: br i1 [[TMP17]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
568 ; FVW2-NEXT: ret void
574 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
575 %arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv
576 %0 = load i32, ptr %arrayidx, align 4
577 %cmp1 = icmp sgt i32 %0, 0
578 br i1 %cmp1, label %if.then, label %for.inc
581 %b = getelementptr inbounds %struct.In, ptr %in, i64 %indvars.iv, i32 1
582 %1 = load float, ptr %b, align 4
583 %add = fadd float %1, 5.000000e-01
584 %arrayidx5 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %indvars.iv
585 store float %add, ptr addrspace(1) %arrayidx5, align 4
589 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 16
590 %cmp = icmp ult i64 %indvars.iv, 4080
591 br i1 %cmp, label %for.body, label %for.end
; Using gathers is not profitable for this function; see PR48429.
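; A C sketch of the loop, reconstructed from the scalar IR below (hypothetical):
;
;void test_gather_not_profitable_pr48429(int d, const float * __restrict__ ptr,
;                                        float * __restrict__ dest) {
;  if (d == 0) return;
;  const float *end = ptr + d;
;  while (ptr != end) {
;    dest[0] = ptr[-d];
;    dest[1] = *ptr;
;    ++ptr;
;    dest += 16;
;  }
;}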
598 define void @test_gather_not_profitable_pr48429(i32 %d, ptr readonly noalias %ptr, ptr nocapture noalias %dest) {
599 ; AVX512-LABEL: @test_gather_not_profitable_pr48429(
600 ; AVX512-NEXT: entry:
601 ; AVX512-NEXT: [[IDX_EXT:%.*]] = sext i32 [[D:%.*]] to i64
602 ; AVX512-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, ptr [[PTR:%.*]], i64 [[IDX_EXT]]
603 ; AVX512-NEXT: [[CMP_NOT10:%.*]] = icmp eq i32 [[D]], 0
604 ; AVX512-NEXT: br i1 [[CMP_NOT10]], label [[FOR_END:%.*]], label [[ITER_CHECK:%.*]]
605 ; AVX512: iter.check:
606 ; AVX512-NEXT: [[MUL:%.*]] = sub nsw i32 0, [[D]]
607 ; AVX512-NEXT: [[IDXPROM:%.*]] = sext i32 [[MUL]] to i64
608 ; AVX512-NEXT: [[TMP0:%.*]] = shl nsw i64 [[IDX_EXT]], 2
609 ; AVX512-NEXT: [[TMP1:%.*]] = add nsw i64 [[TMP0]], -4
610 ; AVX512-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 2
611 ; AVX512-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
612 ; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 8
613 ; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
614 ; AVX512: vector.memcheck:
615 ; AVX512-NEXT: [[TMP4:%.*]] = shl nsw i64 [[IDX_EXT]], 2
616 ; AVX512-NEXT: [[TMP5:%.*]] = add nsw i64 [[TMP4]], -4
617 ; AVX512-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 2
618 ; AVX512-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 6
619 ; AVX512-NEXT: [[TMP8:%.*]] = add nuw nsw i64 [[TMP7]], 8
620 ; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DEST:%.*]], i64 [[TMP8]]
621 ; AVX512-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP6]], 2
622 ; AVX512-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 4
623 ; AVX512-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP10]]
624 ; AVX512-NEXT: [[TMP11:%.*]] = mul nsw i64 [[IDX_EXT]], -4
625 ; AVX512-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP11]]
626 ; AVX512-NEXT: [[TMP12:%.*]] = sub i64 [[TMP10]], [[TMP4]]
627 ; AVX512-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP12]]
628 ; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DEST]], [[SCEVGEP1]]
629 ; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[PTR]], [[SCEVGEP]]
630 ; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
631 ; AVX512-NEXT: [[BOUND04:%.*]] = icmp ult ptr [[DEST]], [[SCEVGEP3]]
632 ; AVX512-NEXT: [[BOUND15:%.*]] = icmp ult ptr [[SCEVGEP2]], [[SCEVGEP]]
633 ; AVX512-NEXT: [[FOUND_CONFLICT6:%.*]] = and i1 [[BOUND04]], [[BOUND15]]
634 ; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT6]]
635 ; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
636 ; AVX512: vector.main.loop.iter.check:
637 ; AVX512-NEXT: [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[TMP3]], 16
638 ; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK7]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
640 ; AVX512-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 16
641 ; AVX512-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
642 ; AVX512-NEXT: [[TMP13:%.*]] = mul i64 [[N_VEC]], 64
643 ; AVX512-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP13]]
644 ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
645 ; AVX512: vector.body:
646 ; AVX512-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[DEST]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
647 ; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
648 ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <16 x i64> <i64 0, i64 64, i64 128, i64 192, i64 256, i64 320, i64 384, i64 448, i64 512, i64 576, i64 640, i64 704, i64 768, i64 832, i64 896, i64 960>
649 ; AVX512-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
650 ; AVX512-NEXT: [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 0
651 ; AVX512-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP15]]
652 ; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM]]
653 ; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0
654 ; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x float>, ptr [[TMP18]], align 4, !alias.scope [[META8:![0-9]+]]
655 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[WIDE_LOAD]], <16 x ptr> [[TMP14]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]]
656 ; AVX512-NEXT: [[TMP19:%.*]] = getelementptr float, ptr [[TMP16]], i32 0
657 ; AVX512-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x float>, ptr [[TMP19]], align 4, !alias.scope [[META15:![0-9]+]]
658 ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, <16 x ptr> [[TMP14]], i64 1
659 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[WIDE_LOAD8]], <16 x ptr> [[TMP20]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope [[META11]], !noalias [[META13]]
660 ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
661 ; AVX512-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 1024
662 ; AVX512-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
663 ; AVX512-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
664 ; AVX512: middle.block:
665 ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
666 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
667 ; AVX512: vec.epilog.iter.check:
668 ; AVX512-NEXT: [[TMP22:%.*]] = mul i64 [[N_VEC]], 64
669 ; AVX512-NEXT: [[IND_END15:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP22]]
670 ; AVX512-NEXT: [[TMP23:%.*]] = mul i64 [[N_VEC]], 4
671 ; AVX512-NEXT: [[IND_END12:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP23]]
672 ; AVX512-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]]
673 ; AVX512-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
674 ; AVX512-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
675 ; AVX512: vec.epilog.ph:
676 ; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[DEST]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
677 ; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
678 ; AVX512-NEXT: [[N_MOD_VF9:%.*]] = urem i64 [[TMP3]], 8
679 ; AVX512-NEXT: [[N_VEC10:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF9]]
680 ; AVX512-NEXT: [[TMP24:%.*]] = mul i64 [[N_VEC10]], 4
681 ; AVX512-NEXT: [[IND_END11:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP24]]
682 ; AVX512-NEXT: [[TMP25:%.*]] = mul i64 [[N_VEC10]], 64
683 ; AVX512-NEXT: [[IND_END14:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP25]]
684 ; AVX512-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
685 ; AVX512: vec.epilog.vector.body:
686 ; AVX512-NEXT: [[POINTER_PHI19:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[PTR_IND20:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
687 ; AVX512-NEXT: [[INDEX18:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT24:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
688 ; AVX512-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[POINTER_PHI19]], <8 x i64> <i64 0, i64 64, i64 128, i64 192, i64 256, i64 320, i64 384, i64 448>
689 ; AVX512-NEXT: [[OFFSET_IDX21:%.*]] = mul i64 [[INDEX18]], 4
690 ; AVX512-NEXT: [[TMP27:%.*]] = add i64 [[OFFSET_IDX21]], 0
691 ; AVX512-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP27]]
692 ; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[IDXPROM]]
693 ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP29]], i32 0
694 ; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x float>, ptr [[TMP30]], align 4, !alias.scope [[META17:![0-9]+]]
695 ; AVX512-NEXT: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD22]], <8 x ptr> [[TMP26]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope [[META20:![0-9]+]], !noalias [[META22:![0-9]+]]
696 ; AVX512-NEXT: [[TMP31:%.*]] = getelementptr float, ptr [[TMP28]], i32 0
697 ; AVX512-NEXT: [[WIDE_LOAD23:%.*]] = load <8 x float>, ptr [[TMP31]], align 4, !alias.scope [[META24:![0-9]+]]
698 ; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, <8 x ptr> [[TMP26]], i64 1
699 ; AVX512-NEXT: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD23]], <8 x ptr> [[TMP32]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope [[META20]], !noalias [[META22]]
700 ; AVX512-NEXT: [[INDEX_NEXT24]] = add nuw i64 [[INDEX18]], 8
701 ; AVX512-NEXT: [[PTR_IND20]] = getelementptr i8, ptr [[POINTER_PHI19]], i64 512
702 ; AVX512-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT24]], [[N_VEC10]]
703 ; AVX512-NEXT: br i1 [[TMP33]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
704 ; AVX512: vec.epilog.middle.block:
705 ; AVX512-NEXT: [[CMP_N17:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC10]]
706 ; AVX512-NEXT: br i1 [[CMP_N17]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
707 ; AVX512: vec.epilog.scalar.ph:
708 ; AVX512-NEXT: [[BC_RESUME_VAL13:%.*]] = phi ptr [ [[IND_END11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END12]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PTR]], [[VECTOR_MEMCHECK]] ], [ [[PTR]], [[ITER_CHECK]] ]
709 ; AVX512-NEXT: [[BC_RESUME_VAL16:%.*]] = phi ptr [ [[IND_END14]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END15]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[DEST]], [[VECTOR_MEMCHECK]] ], [ [[DEST]], [[ITER_CHECK]] ]
710 ; AVX512-NEXT: br label [[FOR_BODY:%.*]]
712 ; AVX512-NEXT: [[PTR_ADDR_012:%.*]] = phi ptr [ [[BC_RESUME_VAL13]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
713 ; AVX512-NEXT: [[DEST_ADDR_011:%.*]] = phi ptr [ [[BC_RESUME_VAL16]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD_PTR6:%.*]], [[FOR_BODY]] ]
714 ; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[PTR_ADDR_012]], i64 [[IDXPROM]]
715 ; AVX512-NEXT: [[TMP34:%.*]] = load float, ptr [[ARRAYIDX]], align 4
716 ; AVX512-NEXT: store float [[TMP34]], ptr [[DEST_ADDR_011]], align 4
717 ; AVX512-NEXT: [[TMP35:%.*]] = load float, ptr [[PTR_ADDR_012]], align 4
718 ; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DEST_ADDR_011]], i64 1
719 ; AVX512-NEXT: store float [[TMP35]], ptr [[ARRAYIDX5]], align 4
720 ; AVX512-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, ptr [[PTR_ADDR_012]], i64 1
721 ; AVX512-NEXT: [[ADD_PTR6]] = getelementptr inbounds float, ptr [[DEST_ADDR_011]], i64 16
722 ; AVX512-NEXT: [[CMP_NOT:%.*]] = icmp eq ptr [[INCDEC_PTR]], [[ADD_PTR]]
723 ; AVX512-NEXT: br i1 [[CMP_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
725 ; AVX512-NEXT: ret void
727 ; FVW2-LABEL: @test_gather_not_profitable_pr48429(
729 ; FVW2-NEXT: [[IDX_EXT:%.*]] = sext i32 [[D:%.*]] to i64
730 ; FVW2-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, ptr [[PTR:%.*]], i64 [[IDX_EXT]]
731 ; FVW2-NEXT: [[CMP_NOT10:%.*]] = icmp eq i32 [[D]], 0
732 ; FVW2-NEXT: br i1 [[CMP_NOT10]], label [[FOR_END:%.*]], label [[FOR_BODY_LR_PH:%.*]]
733 ; FVW2: for.body.lr.ph:
734 ; FVW2-NEXT: [[MUL:%.*]] = sub nsw i32 0, [[D]]
735 ; FVW2-NEXT: [[IDXPROM:%.*]] = sext i32 [[MUL]] to i64
736 ; FVW2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[IDX_EXT]], 2
737 ; FVW2-NEXT: [[TMP1:%.*]] = add nsw i64 [[TMP0]], -4
738 ; FVW2-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 2
739 ; FVW2-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
740 ; FVW2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 2
741 ; FVW2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
742 ; FVW2: vector.memcheck:
743 ; FVW2-NEXT: [[TMP4:%.*]] = shl nsw i64 [[IDX_EXT]], 2
744 ; FVW2-NEXT: [[TMP5:%.*]] = add nsw i64 [[TMP4]], -4
745 ; FVW2-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 2
746 ; FVW2-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 6
747 ; FVW2-NEXT: [[TMP8:%.*]] = add nuw nsw i64 [[TMP7]], 8
748 ; FVW2-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DEST:%.*]], i64 [[TMP8]]
749 ; FVW2-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP6]], 2
750 ; FVW2-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 4
751 ; FVW2-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP10]]
752 ; FVW2-NEXT: [[TMP11:%.*]] = mul nsw i64 [[IDX_EXT]], -4
753 ; FVW2-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP11]]
754 ; FVW2-NEXT: [[TMP12:%.*]] = sub i64 [[TMP10]], [[TMP4]]
755 ; FVW2-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP12]]
756 ; FVW2-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DEST]], [[SCEVGEP1]]
757 ; FVW2-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[PTR]], [[SCEVGEP]]
758 ; FVW2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
759 ; FVW2-NEXT: [[BOUND04:%.*]] = icmp ult ptr [[DEST]], [[SCEVGEP3]]
760 ; FVW2-NEXT: [[BOUND15:%.*]] = icmp ult ptr [[SCEVGEP2]], [[SCEVGEP]]
761 ; FVW2-NEXT: [[FOUND_CONFLICT6:%.*]] = and i1 [[BOUND04]], [[BOUND15]]
762 ; FVW2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT6]]
763 ; FVW2-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
765 ; FVW2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 2
766 ; FVW2-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
767 ; FVW2-NEXT: [[TMP13:%.*]] = mul i64 [[N_VEC]], 4
768 ; FVW2-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP13]]
769 ; FVW2-NEXT: [[TMP14:%.*]] = mul i64 [[N_VEC]], 64
770 ; FVW2-NEXT: [[IND_END7:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP14]]
771 ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]]
773 ; FVW2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
774 ; FVW2-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
775 ; FVW2-NEXT: [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 0
776 ; FVW2-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP15]]
777 ; FVW2-NEXT: [[OFFSET_IDX9:%.*]] = mul i64 [[INDEX]], 64
778 ; FVW2-NEXT: [[TMP17:%.*]] = add i64 [[OFFSET_IDX9]], 0
779 ; FVW2-NEXT: [[TMP18:%.*]] = add i64 [[OFFSET_IDX9]], 64
780 ; FVW2-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP17]]
781 ; FVW2-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP18]]
782 ; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM]]
783 ; FVW2-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i32 0
784 ; FVW2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP22]], align 4, !alias.scope [[META8:![0-9]+]]
785 ; FVW2-NEXT: [[TMP23:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0
786 ; FVW2-NEXT: store float [[TMP23]], ptr [[TMP19]], align 4, !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]]
787 ; FVW2-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1
788 ; FVW2-NEXT: store float [[TMP24]], ptr [[TMP20]], align 4, !alias.scope [[META11]], !noalias [[META13]]
789 ; FVW2-NEXT: [[TMP25:%.*]] = getelementptr float, ptr [[TMP16]], i32 0
790 ; FVW2-NEXT: [[WIDE_LOAD10:%.*]] = load <2 x float>, ptr [[TMP25]], align 4, !alias.scope [[META15:![0-9]+]]
791 ; FVW2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 1
792 ; FVW2-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 1
793 ; FVW2-NEXT: [[TMP28:%.*]] = extractelement <2 x float> [[WIDE_LOAD10]], i32 0
794 ; FVW2-NEXT: store float [[TMP28]], ptr [[TMP26]], align 4, !alias.scope [[META11]], !noalias [[META13]]
795 ; FVW2-NEXT: [[TMP29:%.*]] = extractelement <2 x float> [[WIDE_LOAD10]], i32 1
796 ; FVW2-NEXT: store float [[TMP29]], ptr [[TMP27]], align 4, !alias.scope [[META11]], !noalias [[META13]]
797 ; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
798 ; FVW2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
799 ; FVW2-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
800 ; FVW2: middle.block:
801 ; FVW2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
802 ; FVW2-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[SCALAR_PH]]
804 ; FVW2-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[FOR_BODY_LR_PH]] ], [ [[PTR]], [[VECTOR_MEMCHECK]] ]
805 ; FVW2-NEXT: [[BC_RESUME_VAL8:%.*]] = phi ptr [ [[IND_END7]], [[MIDDLE_BLOCK]] ], [ [[DEST]], [[FOR_BODY_LR_PH]] ], [ [[DEST]], [[VECTOR_MEMCHECK]] ]
806 ; FVW2-NEXT: br label [[FOR_BODY:%.*]]
808 ; FVW2-NEXT: [[PTR_ADDR_012:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
809 ; FVW2-NEXT: [[DEST_ADDR_011:%.*]] = phi ptr [ [[BC_RESUME_VAL8]], [[SCALAR_PH]] ], [ [[ADD_PTR6:%.*]], [[FOR_BODY]] ]
810 ; FVW2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[PTR_ADDR_012]], i64 [[IDXPROM]]
811 ; FVW2-NEXT: [[TMP31:%.*]] = load float, ptr [[ARRAYIDX]], align 4
812 ; FVW2-NEXT: store float [[TMP31]], ptr [[DEST_ADDR_011]], align 4
813 ; FVW2-NEXT: [[TMP32:%.*]] = load float, ptr [[PTR_ADDR_012]], align 4
814 ; FVW2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DEST_ADDR_011]], i64 1
815 ; FVW2-NEXT: store float [[TMP32]], ptr [[ARRAYIDX5]], align 4
816 ; FVW2-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, ptr [[PTR_ADDR_012]], i64 1
817 ; FVW2-NEXT: [[ADD_PTR6]] = getelementptr inbounds float, ptr [[DEST_ADDR_011]], i64 16
818 ; FVW2-NEXT: [[CMP_NOT:%.*]] = icmp eq ptr [[INCDEC_PTR]], [[ADD_PTR]]
819 ; FVW2-NEXT: br i1 [[CMP_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
821 ; FVW2-NEXT: ret void
824 %idx.ext = sext i32 %d to i64
825 %add.ptr = getelementptr inbounds float, ptr %ptr, i64 %idx.ext
826 %cmp.not10 = icmp eq i32 %d, 0
827 br i1 %cmp.not10, label %for.end, label %for.body.lr.ph
830 %mul = sub nsw i32 0, %d
831 %idxprom = sext i32 %mul to i64
835 %ptr.addr.012 = phi ptr [ %ptr, %for.body.lr.ph ], [ %incdec.ptr, %for.body ]
836 %dest.addr.011 = phi ptr [ %dest, %for.body.lr.ph ], [ %add.ptr6, %for.body ]
837 %arrayidx = getelementptr inbounds float, ptr %ptr.addr.012, i64 %idxprom
838 %0 = load float, ptr %arrayidx, align 4
839 store float %0, ptr %dest.addr.011, align 4
840 %1 = load float, ptr %ptr.addr.012, align 4
841 %arrayidx5 = getelementptr inbounds float, ptr %dest.addr.011, i64 1
842 store float %1, ptr %arrayidx5, align 4
843 %incdec.ptr = getelementptr inbounds float, ptr %ptr.addr.012, i64 1
844 %add.ptr6 = getelementptr inbounds float, ptr %dest.addr.011, i64 16
845 %cmp.not = icmp eq ptr %incdec.ptr, %add.ptr
846 br i1 %cmp.not, label %for.end, label %for.body