1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt < %s -O3 -mcpu=knl -S | FileCheck %s -check-prefix=AVX512
3 ; RUN: opt < %s -O3 -mcpu=knl -force-vector-width=2 -S | FileCheck %s -check-prefix=FVW2
5 ; With a force-vector-width, it is sometimes more profitable to generate
6 ; scalarized and predicated stores instead of masked scatter.
8 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
9 target triple = "x86_64-pc_linux"
13 ;void foo1(float * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger, int * __restrict__ index) {
15 ; for (int i=0; i < SIZE; ++i) {
16 ; if (trigger[i] > 0) {
17 ; out[i] = in[index[i]] + (float) 0.5;
22 ; Function Attrs: nounwind uwtable
23 define void @foo1(float* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) {
24 ; AVX512-LABEL: @foo1(
26 ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
27 ; AVX512: vector.body:
28 ; AVX512-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ]
29 ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDEX6]]
30 ; AVX512-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>*
31 ; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 4
32 ; AVX512-NEXT: [[TMP2:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD]], zeroinitializer
33 ; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[INDEX:%.*]], i64 [[INDEX6]]
34 ; AVX512-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <16 x i32>*
35 ; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP4]], i32 4, <16 x i1> [[TMP2]], <16 x i32> undef)
36 ; AVX512-NEXT: [[TMP5:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD]] to <16 x i64>
37 ; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[IN:%.*]], <16 x i64> [[TMP5]]
38 ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP6]], i32 4, <16 x i1> [[TMP2]], <16 x float> undef)
39 ; AVX512-NEXT: [[TMP7:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
40 ; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[INDEX6]]
41 ; AVX512-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <16 x float>*
42 ; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP7]], <16 x float>* [[TMP9]], i32 4, <16 x i1> [[TMP2]])
43 ; AVX512-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX6]], 16
44 ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT]]
45 ; AVX512-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <16 x i32>*
46 ; AVX512-NEXT: [[WIDE_LOAD_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4
47 ; AVX512-NEXT: [[TMP12:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD_1]], zeroinitializer
48 ; AVX512-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT]]
49 ; AVX512-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <16 x i32>*
50 ; AVX512-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP14]], i32 4, <16 x i1> [[TMP12]], <16 x i32> undef)
51 ; AVX512-NEXT: [[TMP15:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD_1]] to <16 x i64>
52 ; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[IN]], <16 x i64> [[TMP15]]
53 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP16]], i32 4, <16 x i1> [[TMP12]], <16 x float> undef)
54 ; AVX512-NEXT: [[TMP17:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER_1]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
55 ; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT]]
56 ; AVX512-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <16 x float>*
57 ; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP17]], <16 x float>* [[TMP19]], i32 4, <16 x i1> [[TMP12]])
58 ; AVX512-NEXT: [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX6]], 32
59 ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT_1]]
60 ; AVX512-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <16 x i32>*
61 ; AVX512-NEXT: [[WIDE_LOAD_2:%.*]] = load <16 x i32>, <16 x i32>* [[TMP21]], align 4
62 ; AVX512-NEXT: [[TMP22:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD_2]], zeroinitializer
63 ; AVX512-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT_1]]
64 ; AVX512-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <16 x i32>*
65 ; AVX512-NEXT: [[WIDE_MASKED_LOAD_2:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP24]], i32 4, <16 x i1> [[TMP22]], <16 x i32> undef)
66 ; AVX512-NEXT: [[TMP25:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD_2]] to <16 x i64>
67 ; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[IN]], <16 x i64> [[TMP25]]
68 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP26]], i32 4, <16 x i1> [[TMP22]], <16 x float> undef)
69 ; AVX512-NEXT: [[TMP27:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER_2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
70 ; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT_1]]
71 ; AVX512-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <16 x float>*
72 ; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP27]], <16 x float>* [[TMP29]], i32 4, <16 x i1> [[TMP22]])
73 ; AVX512-NEXT: [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX6]], 48
74 ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT_2]]
75 ; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <16 x i32>*
76 ; AVX512-NEXT: [[WIDE_LOAD_3:%.*]] = load <16 x i32>, <16 x i32>* [[TMP31]], align 4
77 ; AVX512-NEXT: [[TMP32:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD_3]], zeroinitializer
78 ; AVX512-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT_2]]
79 ; AVX512-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <16 x i32>*
80 ; AVX512-NEXT: [[WIDE_MASKED_LOAD_3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP34]], i32 4, <16 x i1> [[TMP32]], <16 x i32> undef)
81 ; AVX512-NEXT: [[TMP35:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD_3]] to <16 x i64>
82 ; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[IN]], <16 x i64> [[TMP35]]
83 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP36]], i32 4, <16 x i1> [[TMP32]], <16 x float> undef)
84 ; AVX512-NEXT: [[TMP37:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER_3]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
85 ; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT_2]]
86 ; AVX512-NEXT: [[TMP39:%.*]] = bitcast float* [[TMP38]] to <16 x float>*
87 ; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP37]], <16 x float>* [[TMP39]], i32 4, <16 x i1> [[TMP32]])
88 ; AVX512-NEXT: [[INDEX_NEXT_3]] = add nuw nsw i64 [[INDEX6]], 64
89 ; AVX512-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT_3]], 4096
90 ; AVX512-NEXT: br i1 [[TMP40]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
92 ; AVX512-NEXT: ret void
96 ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]]
98 ; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ]
99 ; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDEX6]]
100 ; FVW2-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <2 x i32>*
101 ; FVW2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4
102 ; FVW2-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], zeroinitializer
103 ; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[INDEX:%.*]], i64 [[INDEX6]]
104 ; FVW2-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <2 x i32>*
105 ; FVW2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP4]], i32 4, <2 x i1> [[TMP2]], <2 x i32> undef)
106 ; FVW2-NEXT: [[TMP5:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD]] to <2 x i64>
107 ; FVW2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[IN:%.*]], <2 x i64> [[TMP5]]
108 ; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP6]], i32 4, <2 x i1> [[TMP2]], <2 x float> undef)
109 ; FVW2-NEXT: [[TMP7:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
110 ; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[INDEX6]]
111 ; FVW2-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <2 x float>*
112 ; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP7]], <2 x float>* [[TMP9]], i32 4, <2 x i1> [[TMP2]])
113 ; FVW2-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX6]], 2
114 ; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT]]
115 ; FVW2-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <2 x i32>*
116 ; FVW2-NEXT: [[WIDE_LOAD_1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP11]], align 4
117 ; FVW2-NEXT: [[TMP12:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD_1]], zeroinitializer
118 ; FVW2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT]]
119 ; FVW2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <2 x i32>*
120 ; FVW2-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* nonnull [[TMP14]], i32 4, <2 x i1> [[TMP12]], <2 x i32> undef)
121 ; FVW2-NEXT: [[TMP15:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD_1]] to <2 x i64>
122 ; FVW2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP15]]
123 ; FVW2-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP16]], i32 4, <2 x i1> [[TMP12]], <2 x float> undef)
124 ; FVW2-NEXT: [[TMP17:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER_1]], <float 5.000000e-01, float 5.000000e-01>
125 ; FVW2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT]]
126 ; FVW2-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <2 x float>*
127 ; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP17]], <2 x float>* [[TMP19]], i32 4, <2 x i1> [[TMP12]])
128 ; FVW2-NEXT: [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX6]], 4
129 ; FVW2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT_1]]
130 ; FVW2-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <2 x i32>*
131 ; FVW2-NEXT: [[WIDE_LOAD_2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP21]], align 4
132 ; FVW2-NEXT: [[TMP22:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD_2]], zeroinitializer
133 ; FVW2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT_1]]
134 ; FVW2-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <2 x i32>*
135 ; FVW2-NEXT: [[WIDE_MASKED_LOAD_2:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* nonnull [[TMP24]], i32 4, <2 x i1> [[TMP22]], <2 x i32> undef)
136 ; FVW2-NEXT: [[TMP25:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD_2]] to <2 x i64>
137 ; FVW2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP25]]
138 ; FVW2-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP26]], i32 4, <2 x i1> [[TMP22]], <2 x float> undef)
139 ; FVW2-NEXT: [[TMP27:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER_2]], <float 5.000000e-01, float 5.000000e-01>
140 ; FVW2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT_1]]
141 ; FVW2-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <2 x float>*
142 ; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP27]], <2 x float>* [[TMP29]], i32 4, <2 x i1> [[TMP22]])
143 ; FVW2-NEXT: [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX6]], 6
144 ; FVW2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT_2]]
145 ; FVW2-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <2 x i32>*
146 ; FVW2-NEXT: [[WIDE_LOAD_3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP31]], align 4
147 ; FVW2-NEXT: [[TMP32:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD_3]], zeroinitializer
148 ; FVW2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT_2]]
149 ; FVW2-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <2 x i32>*
150 ; FVW2-NEXT: [[WIDE_MASKED_LOAD_3:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* nonnull [[TMP34]], i32 4, <2 x i1> [[TMP32]], <2 x i32> undef)
151 ; FVW2-NEXT: [[TMP35:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD_3]] to <2 x i64>
152 ; FVW2-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP35]]
153 ; FVW2-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP36]], i32 4, <2 x i1> [[TMP32]], <2 x float> undef)
154 ; FVW2-NEXT: [[TMP37:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER_3]], <float 5.000000e-01, float 5.000000e-01>
155 ; FVW2-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT_2]]
156 ; FVW2-NEXT: [[TMP39:%.*]] = bitcast float* [[TMP38]] to <2 x float>*
157 ; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP37]], <2 x float>* [[TMP39]], i32 4, <2 x i1> [[TMP32]])
158 ; FVW2-NEXT: [[INDEX_NEXT_3]] = add nuw nsw i64 [[INDEX6]], 8
159 ; FVW2-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT_3]], 4096
160 ; FVW2-NEXT: br i1 [[TMP40]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
162 ; FVW2-NEXT: ret void
165 %in.addr = alloca float*, align 8
166 %out.addr = alloca float*, align 8
167 %trigger.addr = alloca i32*, align 8
168 %index.addr = alloca i32*, align 8
169 %i = alloca i32, align 4
170 store float* %in, float** %in.addr, align 8
171 store float* %out, float** %out.addr, align 8
172 store i32* %trigger, i32** %trigger.addr, align 8
173 store i32* %index, i32** %index.addr, align 8
174 store i32 0, i32* %i, align 4
177 for.cond: ; preds = %for.inc, %entry
178 %0 = load i32, i32* %i, align 4
179 %cmp = icmp slt i32 %0, 4096
180 br i1 %cmp, label %for.body, label %for.end
182 for.body: ; preds = %for.cond
183 %1 = load i32, i32* %i, align 4
184 %idxprom = sext i32 %1 to i64
185 %2 = load i32*, i32** %trigger.addr, align 8
186 %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
187 %3 = load i32, i32* %arrayidx, align 4
188 %cmp1 = icmp sgt i32 %3, 0
189 br i1 %cmp1, label %if.then, label %if.end
191 if.then: ; preds = %for.body
192 %4 = load i32, i32* %i, align 4
193 %idxprom2 = sext i32 %4 to i64
194 %5 = load i32*, i32** %index.addr, align 8
195 %arrayidx3 = getelementptr inbounds i32, i32* %5, i64 %idxprom2
196 %6 = load i32, i32* %arrayidx3, align 4
197 %idxprom4 = sext i32 %6 to i64
198 %7 = load float*, float** %in.addr, align 8
199 %arrayidx5 = getelementptr inbounds float, float* %7, i64 %idxprom4
200 %8 = load float, float* %arrayidx5, align 4
201 %add = fadd float %8, 5.000000e-01
202 %9 = load i32, i32* %i, align 4
203 %idxprom6 = sext i32 %9 to i64
204 %10 = load float*, float** %out.addr, align 8
205 %arrayidx7 = getelementptr inbounds float, float* %10, i64 %idxprom6
206 store float %add, float* %arrayidx7, align 4
209 if.end: ; preds = %if.then, %for.body
212 for.inc: ; preds = %if.end
213 %11 = load i32, i32* %i, align 4
214 %inc = add nsw i32 %11, 1
215 store i32 %inc, i32* %i, align 4
218 for.end: ; preds = %for.cond
223 ;void foo2 (In * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger) {
225 ; for (int i=0; i<SIZE; i += 16) {
226 ; if (trigger[i] > 0) {
227 ; out[i] = in[i].b + (float) 0.5;
232 %struct.In = type { float, float }
234 define void @foo2(%struct.In* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 {
235 ; AVX512-LABEL: @foo2(
236 ; AVX512-NEXT: entry:
237 ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
238 ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
239 ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
240 ; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, i32 1
241 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef)
242 ; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
243 ; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
244 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP3]], <16 x float*> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
245 ; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>
246 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
247 ; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer
248 ; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>, i32 1
249 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef)
250 ; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_1]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
251 ; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>
252 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP8]], <16 x float*> [[TMP9]], i32 4, <16 x i1> [[TMP6]])
253 ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>
254 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
255 ; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer
256 ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>, i32 1
257 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef)
258 ; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
259 ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>
260 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP13]], <16 x float*> [[TMP14]], i32 4, <16 x i1> [[TMP11]])
261 ; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>
262 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
263 ; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer
264 ; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>, i32 1
265 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef)
266 ; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_3]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
267 ; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>
268 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP18]], <16 x float*> [[TMP19]], i32 4, <16 x i1> [[TMP16]])
269 ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>
270 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
271 ; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer
272 ; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>, i32 1
273 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef)
274 ; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_4]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
275 ; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>
276 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP23]], <16 x float*> [[TMP24]], i32 4, <16 x i1> [[TMP21]])
277 ; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>
278 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
279 ; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer
280 ; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>, i32 1
281 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef)
282 ; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_5]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
283 ; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>
284 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP28]], <16 x float*> [[TMP29]], i32 4, <16 x i1> [[TMP26]])
285 ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>
286 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
287 ; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer
288 ; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>, i32 1
289 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef)
290 ; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_6]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
291 ; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>
292 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP33]], <16 x float*> [[TMP34]], i32 4, <16 x i1> [[TMP31]])
293 ; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>
294 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
295 ; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer
296 ; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>, i32 1
297 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef)
298 ; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_7]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
299 ; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>
300 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP38]], <16 x float*> [[TMP39]], i32 4, <16 x i1> [[TMP36]])
301 ; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>
302 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
303 ; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer
304 ; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>, i32 1
305 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef)
306 ; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_8]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
307 ; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>
308 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP43]], <16 x float*> [[TMP44]], i32 4, <16 x i1> [[TMP41]])
309 ; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>
310 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
311 ; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer
312 ; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>, i32 1
313 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef)
314 ; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_9]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
315 ; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>
316 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP48]], <16 x float*> [[TMP49]], i32 4, <16 x i1> [[TMP46]])
317 ; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>
318 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
319 ; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer
320 ; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>, i32 1
321 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef)
322 ; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_10]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
323 ; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>
324 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP53]], <16 x float*> [[TMP54]], i32 4, <16 x i1> [[TMP51]])
325 ; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>
326 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
327 ; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer
328 ; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>, i32 1
329 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef)
330 ; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_11]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
331 ; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>
332 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP58]], <16 x float*> [[TMP59]], i32 4, <16 x i1> [[TMP56]])
333 ; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>
334 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
335 ; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer
336 ; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>, i32 1
337 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef)
338 ; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_12]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
339 ; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>
340 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP63]], <16 x float*> [[TMP64]], i32 4, <16 x i1> [[TMP61]])
341 ; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>
342 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
343 ; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer
344 ; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>, i32 1
345 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef)
346 ; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_13]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
347 ; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>
348 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP68]], <16 x float*> [[TMP69]], i32 4, <16 x i1> [[TMP66]])
349 ; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>
350 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
351 ; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer
352 ; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>, i32 1
353 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef)
354 ; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_14]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
355 ; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>
356 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP73]], <16 x float*> [[TMP74]], i32 4, <16 x i1> [[TMP71]])
357 ; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>
358 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
359 ; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer
360 ; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>, i32 1
361 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef)
362 ; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_15]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
363 ; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>
364 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP78]], <16 x float*> [[TMP79]], i32 4, <16 x i1> [[TMP76]])
365 ; AVX512-NEXT: ret void
369 ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]]
371 ; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ]
372 ; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ]
373 ; FVW2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
374 ; FVW2-NEXT: [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], <i64 64, i64 64>
375 ; FVW2-NEXT: [[STEP_ADD8:%.*]] = add <2 x i64> [[VEC_IND]], <i64 96, i64 96>
376 ; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX6]], 4
377 ; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <2 x i64> [[VEC_IND]]
378 ; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD]]
379 ; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD7]]
380 ; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD8]]
381 ; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP0]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
382 ; FVW2-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP1]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
383 ; FVW2-NEXT: [[WIDE_MASKED_GATHER11:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP2]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
384 ; FVW2-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP3]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
385 ; FVW2-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
386 ; FVW2-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER10]], zeroinitializer
387 ; FVW2-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER11]], zeroinitializer
388 ; FVW2-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER12]], zeroinitializer
389 ; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
390 ; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD]], i32 1
391 ; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD7]], i32 1
392 ; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD8]], i32 1
393 ; FVW2-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP8]], i32 4, <2 x i1> [[TMP4]], <2 x float> undef)
394 ; FVW2-NEXT: [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP9]], i32 4, <2 x i1> [[TMP5]], <2 x float> undef)
395 ; FVW2-NEXT: [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP10]], i32 4, <2 x i1> [[TMP6]], <2 x float> undef)
396 ; FVW2-NEXT: [[WIDE_MASKED_GATHER16:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP11]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef)
397 ; FVW2-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER13]], <float 5.000000e-01, float 5.000000e-01>
398 ; FVW2-NEXT: [[TMP13:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], <float 5.000000e-01, float 5.000000e-01>
399 ; FVW2-NEXT: [[TMP14:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], <float 5.000000e-01, float 5.000000e-01>
400 ; FVW2-NEXT: [[TMP15:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], <float 5.000000e-01, float 5.000000e-01>
401 ; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
402 ; FVW2-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
403 ; FVW2: pred.store.if:
404 ; FVW2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[OFFSET_IDX]]
405 ; FVW2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP12]], i32 0
406 ; FVW2-NEXT: store float [[TMP18]], float* [[TMP17]], align 4
407 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]]
408 ; FVW2: pred.store.continue:
409 ; FVW2-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
410 ; FVW2-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]]
411 ; FVW2: pred.store.if17:
412 ; FVW2-NEXT: [[TMP20:%.*]] = or i64 [[OFFSET_IDX]], 16
413 ; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP20]]
414 ; FVW2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP12]], i32 1
415 ; FVW2-NEXT: store float [[TMP22]], float* [[TMP21]], align 4
416 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE18]]
417 ; FVW2: pred.store.continue18:
418 ; FVW2-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
419 ; FVW2-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
420 ; FVW2: pred.store.if19:
421 ; FVW2-NEXT: [[TMP24:%.*]] = or i64 [[OFFSET_IDX]], 32
422 ; FVW2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP24]]
423 ; FVW2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
424 ; FVW2-NEXT: store float [[TMP26]], float* [[TMP25]], align 4
425 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE20]]
426 ; FVW2: pred.store.continue20:
427 ; FVW2-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
428 ; FVW2-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
429 ; FVW2: pred.store.if21:
430 ; FVW2-NEXT: [[TMP28:%.*]] = or i64 [[OFFSET_IDX]], 48
431 ; FVW2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP28]]
432 ; FVW2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
433 ; FVW2-NEXT: store float [[TMP30]], float* [[TMP29]], align 4
434 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE22]]
435 ; FVW2: pred.store.continue22:
436 ; FVW2-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0
437 ; FVW2-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
438 ; FVW2: pred.store.if23:
439 ; FVW2-NEXT: [[TMP32:%.*]] = or i64 [[OFFSET_IDX]], 64
440 ; FVW2-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP32]]
441 ; FVW2-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[TMP14]], i32 0
442 ; FVW2-NEXT: store float [[TMP34]], float* [[TMP33]], align 4
443 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE24]]
444 ; FVW2: pred.store.continue24:
445 ; FVW2-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1
446 ; FVW2-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
447 ; FVW2: pred.store.if25:
448 ; FVW2-NEXT: [[TMP36:%.*]] = or i64 [[OFFSET_IDX]], 80
449 ; FVW2-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP36]]
450 ; FVW2-NEXT: [[TMP38:%.*]] = extractelement <2 x float> [[TMP14]], i32 1
451 ; FVW2-NEXT: store float [[TMP38]], float* [[TMP37]], align 4
452 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE26]]
453 ; FVW2: pred.store.continue26:
454 ; FVW2-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
455 ; FVW2-NEXT: br i1 [[TMP39]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]]
456 ; FVW2: pred.store.if27:
457 ; FVW2-NEXT: [[TMP40:%.*]] = or i64 [[OFFSET_IDX]], 96
458 ; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP40]]
459 ; FVW2-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
460 ; FVW2-NEXT: store float [[TMP42]], float* [[TMP41]], align 4
461 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE28]]
462 ; FVW2: pred.store.continue28:
463 ; FVW2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
464 ; FVW2-NEXT: br i1 [[TMP43]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30]]
465 ; FVW2: pred.store.if29:
466 ; FVW2-NEXT: [[TMP44:%.*]] = or i64 [[OFFSET_IDX]], 112
467 ; FVW2-NEXT: [[TMP45:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP44]]
468 ; FVW2-NEXT: [[TMP46:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
469 ; FVW2-NEXT: store float [[TMP46]], float* [[TMP45]], align 4
470 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE30]]
471 ; FVW2: pred.store.continue30:
472 ; FVW2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX6]], 8
473 ; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 128, i64 128>
474 ; FVW2-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
475 ; FVW2-NEXT: br i1 [[TMP47]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !2
477 ; FVW2-NEXT: ret void
480 %in.addr = alloca %struct.In*, align 8
481 %out.addr = alloca float*, align 8
482 %trigger.addr = alloca i32*, align 8
483 %index.addr = alloca i32*, align 8
484 %i = alloca i32, align 4
485 store %struct.In* %in, %struct.In** %in.addr, align 8
486 store float* %out, float** %out.addr, align 8
487 store i32* %trigger, i32** %trigger.addr, align 8
488 store i32* %index, i32** %index.addr, align 8
489 store i32 0, i32* %i, align 4
492 for.cond: ; preds = %for.inc, %entry
493 %0 = load i32, i32* %i, align 4
494 %cmp = icmp slt i32 %0, 4096
495 br i1 %cmp, label %for.body, label %for.end
497 for.body: ; preds = %for.cond
498 %1 = load i32, i32* %i, align 4
499 %idxprom = sext i32 %1 to i64
500 %2 = load i32*, i32** %trigger.addr, align 8
501 %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
502 %3 = load i32, i32* %arrayidx, align 4
503 %cmp1 = icmp sgt i32 %3, 0
504 br i1 %cmp1, label %if.then, label %if.end
506 if.then: ; preds = %for.body
507 %4 = load i32, i32* %i, align 4
508 %idxprom2 = sext i32 %4 to i64
509 %5 = load %struct.In*, %struct.In** %in.addr, align 8
510 %arrayidx3 = getelementptr inbounds %struct.In, %struct.In* %5, i64 %idxprom2
511 %b = getelementptr inbounds %struct.In, %struct.In* %arrayidx3, i32 0, i32 1
512 %6 = load float, float* %b, align 4
513 %add = fadd float %6, 5.000000e-01
514 %7 = load i32, i32* %i, align 4
515 %idxprom4 = sext i32 %7 to i64
516 %8 = load float*, float** %out.addr, align 8
517 %arrayidx5 = getelementptr inbounds float, float* %8, i64 %idxprom4
518 store float %add, float* %arrayidx5, align 4
521 if.end: ; preds = %if.then, %for.body
524 for.inc: ; preds = %if.end
525 %9 = load i32, i32* %i, align 4
526 %inc = add nsw i32 %9, 16
527 store i32 %inc, i32* %i, align 4
530 for.end: ; preds = %for.cond
539 ;void foo3 (In * __restrict__ in, Out * __restrict__ out, int * __restrict__ trigger) {
541 ; for (int i=0; i<SIZE; i += 16) {
542 ; if (trigger[i] > 0) {
543 ; out[i].b = in[i].b + (float) 0.5;
548 %struct.Out = type { float, float }
550 define void @foo3(%struct.In* noalias %in, %struct.Out* noalias %out, i32* noalias %trigger) {
551 ; AVX512-LABEL: @foo3(
552 ; AVX512-NEXT: entry:
553 ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
554 ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
555 ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
556 ; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, i32 1
557 ; AVX512-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef)
558 ; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
559 ; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], %struct.Out* [[OUT:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, i32 1
560 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP3]], <16 x float*> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
561 ; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>
562 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
563 ; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer
564 ; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>, i32 1
565 ; AVX512-NEXT: [[WIDE_MASKED_GATHER6_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef)
566 ; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_1]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
567 ; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>, i32 1
568 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP8]], <16 x float*> [[TMP9]], i32 4, <16 x i1> [[TMP6]])
569 ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>
570 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
571 ; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer
572 ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>, i32 1
573 ; AVX512-NEXT: [[WIDE_MASKED_GATHER6_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef)
574 ; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
575 ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>, i32 1
576 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP13]], <16 x float*> [[TMP14]], i32 4, <16 x i1> [[TMP11]])
577 ; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>
578 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
579 ; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer
580 ; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>, i32 1
581 ; AVX512-NEXT: [[WIDE_MASKED_GATHER6_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef)
582 ; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_3]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
583 ; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>, i32 1
584 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP18]], <16 x float*> [[TMP19]], i32 4, <16 x i1> [[TMP16]])
585 ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>
586 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
587 ; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer
588 ; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>, i32 1
589 ; AVX512-NEXT: [[WIDE_MASKED_GATHER6_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef)
590 ; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_4]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
591 ; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>, i32 1
592 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP23]], <16 x float*> [[TMP24]], i32 4, <16 x i1> [[TMP21]])
593 ; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>
594 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
595 ; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer
596 ; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>, i32 1
597 ; AVX512-NEXT: [[WIDE_MASKED_GATHER6_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef)
598 ; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_5]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
599 ; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>, i32 1
600 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP28]], <16 x float*> [[TMP29]], i32 4, <16 x i1> [[TMP26]])
601 ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>
602 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
603 ; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer
604 ; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>, i32 1
605 ; AVX512-NEXT: [[WIDE_MASKED_GATHER6_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef)
606 ; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_6]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
607 ; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>, i32 1
608 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP33]], <16 x float*> [[TMP34]], i32 4, <16 x i1> [[TMP31]])
609 ; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>
610 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
611 ; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer
612 ; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>, i32 1
613 ; AVX512-NEXT: [[WIDE_MASKED_GATHER6_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef)
614 ; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_7]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
615 ; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>, i32 1
616 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP38]], <16 x float*> [[TMP39]], i32 4, <16 x i1> [[TMP36]])
617 ; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>
618 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
619 ; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer
620 ; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>, i32 1
621 ; AVX512-NEXT: [[WIDE_MASKED_GATHER6_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef)
622 ; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_8]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
623 ; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>, i32 1
624 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP43]], <16 x float*> [[TMP44]], i32 4, <16 x i1> [[TMP41]])
625 ; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>
626 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
627 ; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer
628 ; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>, i32 1
629 ; AVX512-NEXT: [[WIDE_MASKED_GATHER6_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef)
630 ; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_9]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
631 ; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>, i32 1
632 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP48]], <16 x float*> [[TMP49]], i32 4, <16 x i1> [[TMP46]])
633 ; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>
634 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
635 ; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer
636 ; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>, i32 1
637 ; AVX512-NEXT: [[WIDE_MASKED_GATHER6_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef)
638 ; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_10]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
639 ; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>, i32 1
640 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP53]], <16 x float*> [[TMP54]], i32 4, <16 x i1> [[TMP51]])
641 ; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>
642 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
643 ; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer
644 ; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>, i32 1
645 ; AVX512-NEXT: [[WIDE_MASKED_GATHER6_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef)
646 ; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_11]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
647 ; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>, i32 1
648 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP58]], <16 x float*> [[TMP59]], i32 4, <16 x i1> [[TMP56]])
649 ; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>
650 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
651 ; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer
652 ; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>, i32 1
653 ; AVX512-NEXT: [[WIDE_MASKED_GATHER6_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef)
654 ; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_12]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
655 ; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>, i32 1
656 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP63]], <16 x float*> [[TMP64]], i32 4, <16 x i1> [[TMP61]])
657 ; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>
658 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
659 ; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer
660 ; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>, i32 1
661 ; AVX512-NEXT: [[WIDE_MASKED_GATHER6_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef)
662 ; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_13]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
663 ; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>, i32 1
664 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP68]], <16 x float*> [[TMP69]], i32 4, <16 x i1> [[TMP66]])
665 ; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>
666 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
667 ; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer
668 ; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>, i32 1
669 ; AVX512-NEXT: [[WIDE_MASKED_GATHER6_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef)
670 ; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_14]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
671 ; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>, i32 1
672 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP73]], <16 x float*> [[TMP74]], i32 4, <16 x i1> [[TMP71]])
673 ; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>
674 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
675 ; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer
676 ; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>, i32 1
677 ; AVX512-NEXT: [[WIDE_MASKED_GATHER6_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef)
678 ; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_15]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
679 ; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>, i32 1
680 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP78]], <16 x float*> [[TMP79]], i32 4, <16 x i1> [[TMP76]])
681 ; AVX512-NEXT: ret void
685 ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]]
687 ; FVW2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE29:%.*]] ]
688 ; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE29]] ]
689 ; FVW2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
690 ; FVW2-NEXT: [[STEP_ADD6:%.*]] = add <2 x i64> [[VEC_IND]], <i64 64, i64 64>
691 ; FVW2-NEXT: [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], <i64 96, i64 96>
692 ; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 4
693 ; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <2 x i64> [[VEC_IND]]
694 ; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD]]
695 ; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD6]]
696 ; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD7]]
697 ; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP0]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
698 ; FVW2-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP1]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
699 ; FVW2-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP2]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
700 ; FVW2-NEXT: [[WIDE_MASKED_GATHER11:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP3]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
701 ; FVW2-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
702 ; FVW2-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER9]], zeroinitializer
703 ; FVW2-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER10]], zeroinitializer
704 ; FVW2-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER11]], zeroinitializer
705 ; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
706 ; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD]], i32 1
707 ; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD6]], i32 1
708 ; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD7]], i32 1
709 ; FVW2-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP8]], i32 4, <2 x i1> [[TMP4]], <2 x float> undef)
710 ; FVW2-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP9]], i32 4, <2 x i1> [[TMP5]], <2 x float> undef)
711 ; FVW2-NEXT: [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP10]], i32 4, <2 x i1> [[TMP6]], <2 x float> undef)
712 ; FVW2-NEXT: [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP11]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef)
713 ; FVW2-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER12]], <float 5.000000e-01, float 5.000000e-01>
714 ; FVW2-NEXT: [[TMP13:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER13]], <float 5.000000e-01, float 5.000000e-01>
715 ; FVW2-NEXT: [[TMP14:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], <float 5.000000e-01, float 5.000000e-01>
716 ; FVW2-NEXT: [[TMP15:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], <float 5.000000e-01, float 5.000000e-01>
717 ; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
718 ; FVW2-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
719 ; FVW2: pred.store.if:
720 ; FVW2-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], %struct.Out* [[OUT:%.*]], i64 [[OFFSET_IDX]], i32 1
721 ; FVW2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP12]], i32 0
722 ; FVW2-NEXT: store float [[TMP18]], float* [[TMP17]], align 4
723 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]]
724 ; FVW2: pred.store.continue:
725 ; FVW2-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
726 ; FVW2-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17:%.*]]
727 ; FVW2: pred.store.if16:
728 ; FVW2-NEXT: [[TMP20:%.*]] = or i64 [[OFFSET_IDX]], 16
729 ; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP20]], i32 1
730 ; FVW2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP12]], i32 1
731 ; FVW2-NEXT: store float [[TMP22]], float* [[TMP21]], align 4
732 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE17]]
733 ; FVW2: pred.store.continue17:
734 ; FVW2-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
735 ; FVW2-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]]
736 ; FVW2: pred.store.if18:
737 ; FVW2-NEXT: [[TMP24:%.*]] = or i64 [[OFFSET_IDX]], 32
738 ; FVW2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP24]], i32 1
739 ; FVW2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
740 ; FVW2-NEXT: store float [[TMP26]], float* [[TMP25]], align 4
741 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE19]]
742 ; FVW2: pred.store.continue19:
743 ; FVW2-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
744 ; FVW2-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21:%.*]]
745 ; FVW2: pred.store.if20:
746 ; FVW2-NEXT: [[TMP28:%.*]] = or i64 [[OFFSET_IDX]], 48
747 ; FVW2-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP28]], i32 1
748 ; FVW2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
749 ; FVW2-NEXT: store float [[TMP30]], float* [[TMP29]], align 4
750 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE21]]
751 ; FVW2: pred.store.continue21:
752 ; FVW2-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0
753 ; FVW2-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]]
754 ; FVW2: pred.store.if22:
755 ; FVW2-NEXT: [[TMP32:%.*]] = or i64 [[OFFSET_IDX]], 64
756 ; FVW2-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP32]], i32 1
757 ; FVW2-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[TMP14]], i32 0
758 ; FVW2-NEXT: store float [[TMP34]], float* [[TMP33]], align 4
759 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE23]]
760 ; FVW2: pred.store.continue23:
761 ; FVW2-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1
762 ; FVW2-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]]
763 ; FVW2: pred.store.if24:
764 ; FVW2-NEXT: [[TMP36:%.*]] = or i64 [[OFFSET_IDX]], 80
765 ; FVW2-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP36]], i32 1
766 ; FVW2-NEXT: [[TMP38:%.*]] = extractelement <2 x float> [[TMP14]], i32 1
767 ; FVW2-NEXT: store float [[TMP38]], float* [[TMP37]], align 4
768 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE25]]
769 ; FVW2: pred.store.continue25:
770 ; FVW2-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
771 ; FVW2-NEXT: br i1 [[TMP39]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]]
772 ; FVW2: pred.store.if26:
773 ; FVW2-NEXT: [[TMP40:%.*]] = or i64 [[OFFSET_IDX]], 96
774 ; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP40]], i32 1
775 ; FVW2-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
776 ; FVW2-NEXT: store float [[TMP42]], float* [[TMP41]], align 4
777 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE27]]
778 ; FVW2: pred.store.continue27:
779 ; FVW2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
780 ; FVW2-NEXT: br i1 [[TMP43]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29]]
781 ; FVW2: pred.store.if28:
782 ; FVW2-NEXT: [[TMP44:%.*]] = or i64 [[OFFSET_IDX]], 112
783 ; FVW2-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP44]], i32 1
784 ; FVW2-NEXT: [[TMP46:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
785 ; FVW2-NEXT: store float [[TMP46]], float* [[TMP45]], align 4
786 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE29]]
787 ; FVW2: pred.store.continue29:
788 ; FVW2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
789 ; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 128, i64 128>
790 ; FVW2-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
791 ; FVW2-NEXT: br i1 [[TMP47]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !3
793 ; FVW2-NEXT: ret void
796 %in.addr = alloca %struct.In*, align 8
797 %out.addr = alloca %struct.Out*, align 8
798 %trigger.addr = alloca i32*, align 8
799 %i = alloca i32, align 4
800 store %struct.In* %in, %struct.In** %in.addr, align 8
801 store %struct.Out* %out, %struct.Out** %out.addr, align 8
802 store i32* %trigger, i32** %trigger.addr, align 8
803 store i32 0, i32* %i, align 4
806 for.cond: ; preds = %for.inc, %entry
807 %0 = load i32, i32* %i, align 4
808 %cmp = icmp slt i32 %0, 4096
809 br i1 %cmp, label %for.body, label %for.end
811 for.body: ; preds = %for.cond
812 %1 = load i32, i32* %i, align 4
813 %idxprom = sext i32 %1 to i64
814 %2 = load i32*, i32** %trigger.addr, align 8
815 %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
816 %3 = load i32, i32* %arrayidx, align 4
817 %cmp1 = icmp sgt i32 %3, 0
818 br i1 %cmp1, label %if.then, label %if.end
820 if.then: ; preds = %for.body
821 %4 = load i32, i32* %i, align 4
822 %idxprom2 = sext i32 %4 to i64
823 %5 = load %struct.In*, %struct.In** %in.addr, align 8
824 %arrayidx3 = getelementptr inbounds %struct.In, %struct.In* %5, i64 %idxprom2
825 %b = getelementptr inbounds %struct.In, %struct.In* %arrayidx3, i32 0, i32 1
826 %6 = load float, float* %b, align 4
827 %add = fadd float %6, 5.000000e-01
828 %7 = load i32, i32* %i, align 4
829 %idxprom4 = sext i32 %7 to i64
830 %8 = load %struct.Out*, %struct.Out** %out.addr, align 8
831 %arrayidx5 = getelementptr inbounds %struct.Out, %struct.Out* %8, i64 %idxprom4
832 %b6 = getelementptr inbounds %struct.Out, %struct.Out* %arrayidx5, i32 0, i32 1
833 store float %add, float* %b6, align 4
836 if.end: ; preds = %if.then, %for.body
839 for.inc: ; preds = %if.end
840 %9 = load i32, i32* %i, align 4
841 %inc = add nsw i32 %9, 16
842 store i32 %inc, i32* %i, align 4
845 for.end: ; preds = %for.cond
848 declare void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float>, <16 x float*>, i32, <16 x i1>)
850 ; The same as @foo2 but scatter/gather argument is a vecotr of ptrs with addresspace 1
852 define void @foo2_addrspace(%struct.In addrspace(1)* noalias %in, float addrspace(1)* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 {
853 ; AVX512-LABEL: @foo2_addrspace(
854 ; AVX512-NEXT: entry:
855 ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
856 ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
857 ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
858 ; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, i32 1
859 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef)
860 ; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
861 ; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
862 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP3]], <16 x float addrspace(1)*> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
863 ; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>
864 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
865 ; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer
866 ; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>, i32 1
867 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef)
868 ; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_1]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
869 ; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>
870 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP8]], <16 x float addrspace(1)*> [[TMP9]], i32 4, <16 x i1> [[TMP6]])
871 ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>
872 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
873 ; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer
874 ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>, i32 1
875 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef)
876 ; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
877 ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>
878 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP13]], <16 x float addrspace(1)*> [[TMP14]], i32 4, <16 x i1> [[TMP11]])
879 ; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>
880 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
881 ; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer
882 ; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>, i32 1
883 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef)
884 ; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_3]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
885 ; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>
886 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP18]], <16 x float addrspace(1)*> [[TMP19]], i32 4, <16 x i1> [[TMP16]])
887 ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>
888 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
889 ; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer
890 ; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>, i32 1
891 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef)
892 ; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_4]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
893 ; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>
894 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP23]], <16 x float addrspace(1)*> [[TMP24]], i32 4, <16 x i1> [[TMP21]])
895 ; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>
896 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
897 ; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer
898 ; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>, i32 1
899 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef)
900 ; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_5]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
901 ; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>
902 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP28]], <16 x float addrspace(1)*> [[TMP29]], i32 4, <16 x i1> [[TMP26]])
903 ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>
904 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
905 ; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer
906 ; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>, i32 1
907 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef)
908 ; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_6]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
909 ; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>
910 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP33]], <16 x float addrspace(1)*> [[TMP34]], i32 4, <16 x i1> [[TMP31]])
911 ; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>
912 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
913 ; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer
914 ; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>, i32 1
915 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef)
916 ; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_7]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
917 ; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>
918 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP38]], <16 x float addrspace(1)*> [[TMP39]], i32 4, <16 x i1> [[TMP36]])
919 ; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>
920 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
921 ; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer
922 ; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>, i32 1
923 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef)
924 ; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_8]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
925 ; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>
926 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP43]], <16 x float addrspace(1)*> [[TMP44]], i32 4, <16 x i1> [[TMP41]])
927 ; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>
928 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
929 ; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer
930 ; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>, i32 1
931 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef)
932 ; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_9]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
933 ; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>
934 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP48]], <16 x float addrspace(1)*> [[TMP49]], i32 4, <16 x i1> [[TMP46]])
935 ; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>
936 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
937 ; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer
938 ; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>, i32 1
939 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef)
940 ; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_10]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
941 ; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>
942 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP53]], <16 x float addrspace(1)*> [[TMP54]], i32 4, <16 x i1> [[TMP51]])
943 ; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>
944 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
945 ; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer
946 ; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>, i32 1
947 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef)
948 ; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_11]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
949 ; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>
950 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP58]], <16 x float addrspace(1)*> [[TMP59]], i32 4, <16 x i1> [[TMP56]])
951 ; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>
952 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
953 ; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer
954 ; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>, i32 1
955 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef)
956 ; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_12]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
957 ; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>
958 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP63]], <16 x float addrspace(1)*> [[TMP64]], i32 4, <16 x i1> [[TMP61]])
959 ; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>
960 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
961 ; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer
962 ; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>, i32 1
963 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef)
964 ; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_13]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
965 ; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>
966 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP68]], <16 x float addrspace(1)*> [[TMP69]], i32 4, <16 x i1> [[TMP66]])
967 ; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>
968 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
969 ; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer
970 ; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>, i32 1
971 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef)
972 ; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_14]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
973 ; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>
974 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP73]], <16 x float addrspace(1)*> [[TMP74]], i32 4, <16 x i1> [[TMP71]])
975 ; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>
976 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
977 ; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer
978 ; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>, i32 1
979 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef)
980 ; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_15]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
981 ; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>
982 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP78]], <16 x float addrspace(1)*> [[TMP79]], i32 4, <16 x i1> [[TMP76]])
983 ; AVX512-NEXT: ret void
985 ; FVW2-LABEL: @foo2_addrspace(
987 ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]]
989 ; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ]
990 ; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ]
991 ; FVW2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
992 ; FVW2-NEXT: [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], <i64 64, i64 64>
993 ; FVW2-NEXT: [[STEP_ADD8:%.*]] = add <2 x i64> [[VEC_IND]], <i64 96, i64 96>
994 ; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX6]], 4
995 ; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <2 x i64> [[VEC_IND]]
996 ; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD]]
997 ; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD7]]
998 ; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD8]]
999 ; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP0]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
1000 ; FVW2-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP1]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
1001 ; FVW2-NEXT: [[WIDE_MASKED_GATHER11:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP2]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
1002 ; FVW2-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP3]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
1003 ; FVW2-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
1004 ; FVW2-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER10]], zeroinitializer
1005 ; FVW2-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER11]], zeroinitializer
1006 ; FVW2-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER12]], zeroinitializer
1007 ; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
1008 ; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD]], i32 1
1009 ; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD7]], i32 1
1010 ; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD8]], i32 1
1011 ; FVW2-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP8]], i32 4, <2 x i1> [[TMP4]], <2 x float> undef)
1012 ; FVW2-NEXT: [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP9]], i32 4, <2 x i1> [[TMP5]], <2 x float> undef)
1013 ; FVW2-NEXT: [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP10]], i32 4, <2 x i1> [[TMP6]], <2 x float> undef)
1014 ; FVW2-NEXT: [[WIDE_MASKED_GATHER16:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP11]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef)
1015 ; FVW2-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER13]], <float 5.000000e-01, float 5.000000e-01>
1016 ; FVW2-NEXT: [[TMP13:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], <float 5.000000e-01, float 5.000000e-01>
1017 ; FVW2-NEXT: [[TMP14:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], <float 5.000000e-01, float 5.000000e-01>
1018 ; FVW2-NEXT: [[TMP15:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], <float 5.000000e-01, float 5.000000e-01>
1019 ; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
1020 ; FVW2-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
1021 ; FVW2: pred.store.if:
1022 ; FVW2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], i64 [[OFFSET_IDX]]
1023 ; FVW2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP12]], i32 0
1024 ; FVW2-NEXT: store float [[TMP18]], float addrspace(1)* [[TMP17]], align 4
1025 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]]
1026 ; FVW2: pred.store.continue:
1027 ; FVW2-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
1028 ; FVW2-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]]
1029 ; FVW2: pred.store.if17:
1030 ; FVW2-NEXT: [[TMP20:%.*]] = or i64 [[OFFSET_IDX]], 16
1031 ; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP20]]
1032 ; FVW2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP12]], i32 1
1033 ; FVW2-NEXT: store float [[TMP22]], float addrspace(1)* [[TMP21]], align 4
1034 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE18]]
1035 ; FVW2: pred.store.continue18:
1036 ; FVW2-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
1037 ; FVW2-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
1038 ; FVW2: pred.store.if19:
1039 ; FVW2-NEXT: [[TMP24:%.*]] = or i64 [[OFFSET_IDX]], 32
1040 ; FVW2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP24]]
1041 ; FVW2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
1042 ; FVW2-NEXT: store float [[TMP26]], float addrspace(1)* [[TMP25]], align 4
1043 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE20]]
1044 ; FVW2: pred.store.continue20:
1045 ; FVW2-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
1046 ; FVW2-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
1047 ; FVW2: pred.store.if21:
1048 ; FVW2-NEXT: [[TMP28:%.*]] = or i64 [[OFFSET_IDX]], 48
1049 ; FVW2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP28]]
1050 ; FVW2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
1051 ; FVW2-NEXT: store float [[TMP30]], float addrspace(1)* [[TMP29]], align 4
1052 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE22]]
1053 ; FVW2: pred.store.continue22:
1054 ; FVW2-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0
1055 ; FVW2-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
1056 ; FVW2: pred.store.if23:
1057 ; FVW2-NEXT: [[TMP32:%.*]] = or i64 [[OFFSET_IDX]], 64
1058 ; FVW2-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP32]]
1059 ; FVW2-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[TMP14]], i32 0
1060 ; FVW2-NEXT: store float [[TMP34]], float addrspace(1)* [[TMP33]], align 4
1061 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE24]]
1062 ; FVW2: pred.store.continue24:
1063 ; FVW2-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1
1064 ; FVW2-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
1065 ; FVW2: pred.store.if25:
1066 ; FVW2-NEXT: [[TMP36:%.*]] = or i64 [[OFFSET_IDX]], 80
1067 ; FVW2-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP36]]
1068 ; FVW2-NEXT: [[TMP38:%.*]] = extractelement <2 x float> [[TMP14]], i32 1
1069 ; FVW2-NEXT: store float [[TMP38]], float addrspace(1)* [[TMP37]], align 4
1070 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE26]]
1071 ; FVW2: pred.store.continue26:
1072 ; FVW2-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
1073 ; FVW2-NEXT: br i1 [[TMP39]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]]
1074 ; FVW2: pred.store.if27:
1075 ; FVW2-NEXT: [[TMP40:%.*]] = or i64 [[OFFSET_IDX]], 96
1076 ; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP40]]
1077 ; FVW2-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
1078 ; FVW2-NEXT: store float [[TMP42]], float addrspace(1)* [[TMP41]], align 4
1079 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE28]]
1080 ; FVW2: pred.store.continue28:
1081 ; FVW2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
1082 ; FVW2-NEXT: br i1 [[TMP43]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30]]
1083 ; FVW2: pred.store.if29:
1084 ; FVW2-NEXT: [[TMP44:%.*]] = or i64 [[OFFSET_IDX]], 112
1085 ; FVW2-NEXT: [[TMP45:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP44]]
1086 ; FVW2-NEXT: [[TMP46:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
1087 ; FVW2-NEXT: store float [[TMP46]], float addrspace(1)* [[TMP45]], align 4
1088 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE30]]
1089 ; FVW2: pred.store.continue30:
1090 ; FVW2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX6]], 8
1091 ; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 128, i64 128>
1092 ; FVW2-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
1093 ; FVW2-NEXT: br i1 [[TMP47]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
1095 ; FVW2-NEXT: ret void
1098 %in.addr = alloca %struct.In addrspace(1)*, align 8
1099 %out.addr = alloca float addrspace(1)*, align 8
1100 %trigger.addr = alloca i32*, align 8
1101 %index.addr = alloca i32*, align 8
1102 %i = alloca i32, align 4
1103 store %struct.In addrspace(1)* %in, %struct.In addrspace(1)** %in.addr, align 8
1104 store float addrspace(1)* %out, float addrspace(1)** %out.addr, align 8
1105 store i32* %trigger, i32** %trigger.addr, align 8
1106 store i32* %index, i32** %index.addr, align 8
1107 store i32 0, i32* %i, align 4
1110 for.cond: ; preds = %for.inc, %entry
1111 %0 = load i32, i32* %i, align 4
1112 %cmp = icmp slt i32 %0, 4096
1113 br i1 %cmp, label %for.body, label %for.end
1115 for.body: ; preds = %for.cond
1116 %1 = load i32, i32* %i, align 4
1117 %idxprom = sext i32 %1 to i64
1118 %2 = load i32*, i32** %trigger.addr, align 8
1119 %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
1120 %3 = load i32, i32* %arrayidx, align 4
1121 %cmp1 = icmp sgt i32 %3, 0
1122 br i1 %cmp1, label %if.then, label %if.end
1124 if.then: ; preds = %for.body
1125 %4 = load i32, i32* %i, align 4
1126 %idxprom2 = sext i32 %4 to i64
1127 %5 = load %struct.In addrspace(1)*, %struct.In addrspace(1)** %in.addr, align 8
1128 %arrayidx3 = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %5, i64 %idxprom2
1129 %b = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %arrayidx3, i32 0, i32 1
1130 %6 = load float, float addrspace(1)* %b, align 4
1131 %add = fadd float %6, 5.000000e-01
1132 %7 = load i32, i32* %i, align 4
1133 %idxprom4 = sext i32 %7 to i64
1134 %8 = load float addrspace(1)*, float addrspace(1)** %out.addr, align 8
1135 %arrayidx5 = getelementptr inbounds float, float addrspace(1)* %8, i64 %idxprom4
1136 store float %add, float addrspace(1)* %arrayidx5, align 4
1139 if.end: ; preds = %if.then, %for.body
1142 for.inc: ; preds = %if.end
1143 %9 = load i32, i32* %i, align 4
1144 %inc = add nsw i32 %9, 16
1145 store i32 %inc, i32* %i, align 4
1148 for.end: ; preds = %for.cond
1152 ; Same as foo2_addrspace but here only the input has the non-default address space.
1154 define void @foo2_addrspace2(%struct.In addrspace(1)* noalias %in, float addrspace(0)* noalias %out, i32* noalias %trigger, i32* noalias %index) {
1155 ; AVX512-LABEL: @foo2_addrspace2(
1156 ; AVX512-NEXT: entry:
1157 ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
1158 ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
1159 ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
1160 ; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, i32 1
1161 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef)
1162 ; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
1163 ; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
1164 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP3]], <16 x float*> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
1165 ; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>
1166 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
1167 ; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer
1168 ; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>, i32 1
1169 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef)
1170 ; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_1]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
1171 ; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>
1172 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP8]], <16 x float*> [[TMP9]], i32 4, <16 x i1> [[TMP6]])
1173 ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>
1174 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
1175 ; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer
1176 ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>, i32 1
1177 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef)
1178 ; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
1179 ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>
1180 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP13]], <16 x float*> [[TMP14]], i32 4, <16 x i1> [[TMP11]])
1181 ; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>
1182 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
1183 ; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer
1184 ; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>, i32 1
1185 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef)
1186 ; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_3]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
1187 ; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>
1188 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP18]], <16 x float*> [[TMP19]], i32 4, <16 x i1> [[TMP16]])
1189 ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>
1190 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
1191 ; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer
1192 ; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>, i32 1
1193 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef)
1194 ; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_4]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
1195 ; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>
1196 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP23]], <16 x float*> [[TMP24]], i32 4, <16 x i1> [[TMP21]])
1197 ; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>
1198 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
1199 ; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer
1200 ; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>, i32 1
1201 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef)
1202 ; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_5]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
1203 ; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>
1204 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP28]], <16 x float*> [[TMP29]], i32 4, <16 x i1> [[TMP26]])
1205 ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>
1206 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
1207 ; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer
1208 ; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>, i32 1
1209 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef)
1210 ; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_6]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
1211 ; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>
1212 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP33]], <16 x float*> [[TMP34]], i32 4, <16 x i1> [[TMP31]])
1213 ; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>
1214 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
1215 ; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer
1216 ; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>, i32 1
1217 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef)
1218 ; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_7]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
1219 ; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>
1220 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP38]], <16 x float*> [[TMP39]], i32 4, <16 x i1> [[TMP36]])
1221 ; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>
1222 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
1223 ; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer
1224 ; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>, i32 1
1225 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef)
1226 ; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_8]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
1227 ; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>
1228 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP43]], <16 x float*> [[TMP44]], i32 4, <16 x i1> [[TMP41]])
1229 ; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>
1230 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
1231 ; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer
1232 ; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>, i32 1
1233 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef)
1234 ; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_9]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
1235 ; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>
1236 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP48]], <16 x float*> [[TMP49]], i32 4, <16 x i1> [[TMP46]])
1237 ; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>
1238 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
1239 ; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer
1240 ; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>, i32 1
1241 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef)
1242 ; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_10]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
1243 ; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>
1244 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP53]], <16 x float*> [[TMP54]], i32 4, <16 x i1> [[TMP51]])
1245 ; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>
1246 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
1247 ; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer
1248 ; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>, i32 1
1249 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef)
1250 ; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_11]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
1251 ; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>
1252 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP58]], <16 x float*> [[TMP59]], i32 4, <16 x i1> [[TMP56]])
1253 ; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>
1254 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
1255 ; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer
1256 ; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>, i32 1
1257 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef)
1258 ; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_12]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
1259 ; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>
1260 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP63]], <16 x float*> [[TMP64]], i32 4, <16 x i1> [[TMP61]])
1261 ; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>
1262 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
1263 ; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer
1264 ; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>, i32 1
1265 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef)
1266 ; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_13]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
1267 ; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>
1268 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP68]], <16 x float*> [[TMP69]], i32 4, <16 x i1> [[TMP66]])
1269 ; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>
1270 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
1271 ; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer
1272 ; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>, i32 1
1273 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef)
1274 ; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_14]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
1275 ; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>
1276 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP73]], <16 x float*> [[TMP74]], i32 4, <16 x i1> [[TMP71]])
1277 ; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>
1278 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
1279 ; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer
1280 ; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>, i32 1
1281 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef)
1282 ; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_15]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
1283 ; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>
1284 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP78]], <16 x float*> [[TMP79]], i32 4, <16 x i1> [[TMP76]])
1285 ; AVX512-NEXT: ret void
1287 ; FVW2-LABEL: @foo2_addrspace2(
1289 ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]]
1290 ; FVW2: vector.body:
1291 ; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ]
1292 ; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ]
1293 ; FVW2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
1294 ; FVW2-NEXT: [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], <i64 64, i64 64>
1295 ; FVW2-NEXT: [[STEP_ADD8:%.*]] = add <2 x i64> [[VEC_IND]], <i64 96, i64 96>
1296 ; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX6]], 4
1297 ; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <2 x i64> [[VEC_IND]]
1298 ; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD]]
1299 ; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD7]]
1300 ; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD8]]
1301 ; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP0]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
1302 ; FVW2-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP1]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
1303 ; FVW2-NEXT: [[WIDE_MASKED_GATHER11:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP2]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
1304 ; FVW2-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP3]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
1305 ; FVW2-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
1306 ; FVW2-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER10]], zeroinitializer
1307 ; FVW2-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER11]], zeroinitializer
1308 ; FVW2-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER12]], zeroinitializer
1309 ; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
1310 ; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD]], i32 1
1311 ; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD7]], i32 1
1312 ; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD8]], i32 1
1313 ; FVW2-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP8]], i32 4, <2 x i1> [[TMP4]], <2 x float> undef)
1314 ; FVW2-NEXT: [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP9]], i32 4, <2 x i1> [[TMP5]], <2 x float> undef)
1315 ; FVW2-NEXT: [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP10]], i32 4, <2 x i1> [[TMP6]], <2 x float> undef)
1316 ; FVW2-NEXT: [[WIDE_MASKED_GATHER16:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP11]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef)
1317 ; FVW2-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER13]], <float 5.000000e-01, float 5.000000e-01>
1318 ; FVW2-NEXT: [[TMP13:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], <float 5.000000e-01, float 5.000000e-01>
1319 ; FVW2-NEXT: [[TMP14:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], <float 5.000000e-01, float 5.000000e-01>
1320 ; FVW2-NEXT: [[TMP15:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], <float 5.000000e-01, float 5.000000e-01>
1321 ; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
1322 ; FVW2-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
1323 ; FVW2: pred.store.if:
1324 ; FVW2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[OFFSET_IDX]]
1325 ; FVW2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP12]], i32 0
1326 ; FVW2-NEXT: store float [[TMP18]], float* [[TMP17]], align 4
1327 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]]
1328 ; FVW2: pred.store.continue:
1329 ; FVW2-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
1330 ; FVW2-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]]
1331 ; FVW2: pred.store.if17:
1332 ; FVW2-NEXT: [[TMP20:%.*]] = or i64 [[OFFSET_IDX]], 16
1333 ; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP20]]
1334 ; FVW2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP12]], i32 1
1335 ; FVW2-NEXT: store float [[TMP22]], float* [[TMP21]], align 4
1336 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE18]]
1337 ; FVW2: pred.store.continue18:
1338 ; FVW2-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
1339 ; FVW2-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
1340 ; FVW2: pred.store.if19:
1341 ; FVW2-NEXT: [[TMP24:%.*]] = or i64 [[OFFSET_IDX]], 32
1342 ; FVW2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP24]]
1343 ; FVW2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
1344 ; FVW2-NEXT: store float [[TMP26]], float* [[TMP25]], align 4
1345 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE20]]
1346 ; FVW2: pred.store.continue20:
1347 ; FVW2-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
1348 ; FVW2-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
1349 ; FVW2: pred.store.if21:
1350 ; FVW2-NEXT: [[TMP28:%.*]] = or i64 [[OFFSET_IDX]], 48
1351 ; FVW2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP28]]
1352 ; FVW2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
1353 ; FVW2-NEXT: store float [[TMP30]], float* [[TMP29]], align 4
1354 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE22]]
1355 ; FVW2: pred.store.continue22:
1356 ; FVW2-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0
1357 ; FVW2-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
1358 ; FVW2: pred.store.if23:
1359 ; FVW2-NEXT: [[TMP32:%.*]] = or i64 [[OFFSET_IDX]], 64
1360 ; FVW2-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP32]]
1361 ; FVW2-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[TMP14]], i32 0
1362 ; FVW2-NEXT: store float [[TMP34]], float* [[TMP33]], align 4
1363 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE24]]
1364 ; FVW2: pred.store.continue24:
1365 ; FVW2-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1
1366 ; FVW2-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
1367 ; FVW2: pred.store.if25:
1368 ; FVW2-NEXT: [[TMP36:%.*]] = or i64 [[OFFSET_IDX]], 80
1369 ; FVW2-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP36]]
1370 ; FVW2-NEXT: [[TMP38:%.*]] = extractelement <2 x float> [[TMP14]], i32 1
1371 ; FVW2-NEXT: store float [[TMP38]], float* [[TMP37]], align 4
1372 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE26]]
1373 ; FVW2: pred.store.continue26:
1374 ; FVW2-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
1375 ; FVW2-NEXT: br i1 [[TMP39]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]]
1376 ; FVW2: pred.store.if27:
1377 ; FVW2-NEXT: [[TMP40:%.*]] = or i64 [[OFFSET_IDX]], 96
1378 ; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP40]]
1379 ; FVW2-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
1380 ; FVW2-NEXT: store float [[TMP42]], float* [[TMP41]], align 4
1381 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE28]]
1382 ; FVW2: pred.store.continue28:
1383 ; FVW2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
1384 ; FVW2-NEXT: br i1 [[TMP43]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30]]
1385 ; FVW2: pred.store.if29:
1386 ; FVW2-NEXT: [[TMP44:%.*]] = or i64 [[OFFSET_IDX]], 112
1387 ; FVW2-NEXT: [[TMP45:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP44]]
1388 ; FVW2-NEXT: [[TMP46:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
1389 ; FVW2-NEXT: store float [[TMP46]], float* [[TMP45]], align 4
1390 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE30]]
1391 ; FVW2: pred.store.continue30:
1392 ; FVW2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX6]], 8
1393 ; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 128, i64 128>
1394 ; FVW2-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
1395 ; FVW2-NEXT: br i1 [[TMP47]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !5
1397 ; FVW2-NEXT: ret void
1400 %in.addr = alloca %struct.In addrspace(1)*, align 8
1401 %out.addr = alloca float addrspace(0)*, align 8
1402 %trigger.addr = alloca i32*, align 8
1403 %index.addr = alloca i32*, align 8
1404 %i = alloca i32, align 4
1405 store %struct.In addrspace(1)* %in, %struct.In addrspace(1)** %in.addr, align 8
1406 store float addrspace(0)* %out, float addrspace(0)** %out.addr, align 8
1407 store i32* %trigger, i32** %trigger.addr, align 8
1408 store i32* %index, i32** %index.addr, align 8
1409 store i32 0, i32* %i, align 4
1412 for.cond: ; preds = %for.inc, %entry
1413 %0 = load i32, i32* %i, align 4
1414 %cmp = icmp slt i32 %0, 4096
1415 br i1 %cmp, label %for.body, label %for.end
1417 for.body: ; preds = %for.cond
1418 %1 = load i32, i32* %i, align 4
1419 %idxprom = sext i32 %1 to i64
1420 %2 = load i32*, i32** %trigger.addr, align 8
1421 %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
1422 %3 = load i32, i32* %arrayidx, align 4
1423 %cmp1 = icmp sgt i32 %3, 0
1424 br i1 %cmp1, label %if.then, label %if.end
1426 if.then: ; preds = %for.body
1427 %4 = load i32, i32* %i, align 4
1428 %idxprom2 = sext i32 %4 to i64
1429 %5 = load %struct.In addrspace(1)*, %struct.In addrspace(1)** %in.addr, align 8
1430 %arrayidx3 = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %5, i64 %idxprom2
1431 %b = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %arrayidx3, i32 0, i32 1
1432 %6 = load float, float addrspace(1)* %b, align 4
1433 %add = fadd float %6, 5.000000e-01
1434 %7 = load i32, i32* %i, align 4
1435 %idxprom4 = sext i32 %7 to i64
1436 %8 = load float addrspace(0)*, float addrspace(0)** %out.addr, align 8
1437 %arrayidx5 = getelementptr inbounds float, float addrspace(0)* %8, i64 %idxprom4
1438 store float %add, float addrspace(0)* %arrayidx5, align 4
1441 if.end: ; preds = %if.then, %for.body
1444 for.inc: ; preds = %if.end
1445 %9 = load i32, i32* %i, align 4
1446 %inc = add nsw i32 %9, 16
1447 store i32 %inc, i32* %i, align 4
1450 for.end: ; preds = %for.cond
1454 ; Same as foo2_addrspace but here only the output has the non-default address space.
1456 define void @foo2_addrspace3(%struct.In addrspace(0)* noalias %in, float addrspace(1)* noalias %out, i32* noalias %trigger, i32* noalias %index) {
1457 ; AVX512-LABEL: @foo2_addrspace3(
1458 ; AVX512-NEXT: entry:
1459 ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
1460 ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
1461 ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
1462 ; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, i32 1
1463 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef)
1464 ; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
1465 ; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
1466 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP3]], <16 x float addrspace(1)*> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
1467 ; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>
1468 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
1469 ; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer
1470 ; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>, i32 1
1471 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef)
1472 ; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_1]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
1473 ; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>
1474 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP8]], <16 x float addrspace(1)*> [[TMP9]], i32 4, <16 x i1> [[TMP6]])
1475 ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>
1476 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
1477 ; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer
1478 ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>, i32 1
1479 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef)
1480 ; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
1481 ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>
1482 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP13]], <16 x float addrspace(1)*> [[TMP14]], i32 4, <16 x i1> [[TMP11]])
1483 ; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>
1484 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
1485 ; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer
1486 ; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>, i32 1
1487 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef)
1488 ; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_3]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
1489 ; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>
1490 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP18]], <16 x float addrspace(1)*> [[TMP19]], i32 4, <16 x i1> [[TMP16]])
1491 ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>
1492 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
1493 ; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer
1494 ; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>, i32 1
1495 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef)
1496 ; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_4]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
1497 ; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>
1498 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP23]], <16 x float addrspace(1)*> [[TMP24]], i32 4, <16 x i1> [[TMP21]])
1499 ; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>
1500 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
1501 ; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer
1502 ; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>, i32 1
1503 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef)
1504 ; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_5]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
1505 ; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>
1506 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP28]], <16 x float addrspace(1)*> [[TMP29]], i32 4, <16 x i1> [[TMP26]])
1507 ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>
1508 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
1509 ; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer
1510 ; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>, i32 1
1511 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef)
1512 ; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_6]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
1513 ; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>
1514 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP33]], <16 x float addrspace(1)*> [[TMP34]], i32 4, <16 x i1> [[TMP31]])
1515 ; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>
1516 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
1517 ; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer
1518 ; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>, i32 1
1519 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef)
1520 ; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_7]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
1521 ; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>
1522 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP38]], <16 x float addrspace(1)*> [[TMP39]], i32 4, <16 x i1> [[TMP36]])
1523 ; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>
1524 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
1525 ; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer
1526 ; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>, i32 1
1527 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef)
1528 ; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_8]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
1529 ; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>
1530 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP43]], <16 x float addrspace(1)*> [[TMP44]], i32 4, <16 x i1> [[TMP41]])
1531 ; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>
1532 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
1533 ; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer
1534 ; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>, i32 1
1535 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef)
1536 ; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_9]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
1537 ; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>
1538 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP48]], <16 x float addrspace(1)*> [[TMP49]], i32 4, <16 x i1> [[TMP46]])
1539 ; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>
1540 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
1541 ; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer
1542 ; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>, i32 1
1543 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef)
1544 ; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_10]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
1545 ; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>
1546 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP53]], <16 x float addrspace(1)*> [[TMP54]], i32 4, <16 x i1> [[TMP51]])
1547 ; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>
1548 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
1549 ; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer
1550 ; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>, i32 1
1551 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef)
1552 ; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_11]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
1553 ; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>
1554 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP58]], <16 x float addrspace(1)*> [[TMP59]], i32 4, <16 x i1> [[TMP56]])
1555 ; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>
1556 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
1557 ; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer
1558 ; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>, i32 1
1559 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef)
1560 ; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_12]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
1561 ; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>
1562 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP63]], <16 x float addrspace(1)*> [[TMP64]], i32 4, <16 x i1> [[TMP61]])
1563 ; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>
1564 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
1565 ; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer
1566 ; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>, i32 1
1567 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef)
1568 ; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_13]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
1569 ; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>
1570 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP68]], <16 x float addrspace(1)*> [[TMP69]], i32 4, <16 x i1> [[TMP66]])
1571 ; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>
1572 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
1573 ; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer
1574 ; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>, i32 1
1575 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef)
1576 ; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_14]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
1577 ; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>
1578 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP73]], <16 x float addrspace(1)*> [[TMP74]], i32 4, <16 x i1> [[TMP71]])
1579 ; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>
1580 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
1581 ; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer
1582 ; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>, i32 1
1583 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef)
1584 ; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_15]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
1585 ; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>
1586 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP78]], <16 x float addrspace(1)*> [[TMP79]], i32 4, <16 x i1> [[TMP76]])
1587 ; AVX512-NEXT: ret void
1589 ; FVW2-LABEL: @foo2_addrspace3(
1591 ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]]
1592 ; FVW2: vector.body:
1593 ; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ]
1594 ; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ]
1595 ; FVW2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
1596 ; FVW2-NEXT: [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], <i64 64, i64 64>
1597 ; FVW2-NEXT: [[STEP_ADD8:%.*]] = add <2 x i64> [[VEC_IND]], <i64 96, i64 96>
1598 ; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX6]], 4
1599 ; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <2 x i64> [[VEC_IND]]
1600 ; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD]]
1601 ; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD7]]
1602 ; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD8]]
1603 ; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP0]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
1604 ; FVW2-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP1]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
1605 ; FVW2-NEXT: [[WIDE_MASKED_GATHER11:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP2]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
1606 ; FVW2-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP3]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
1607 ; FVW2-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
1608 ; FVW2-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER10]], zeroinitializer
1609 ; FVW2-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER11]], zeroinitializer
1610 ; FVW2-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER12]], zeroinitializer
1611 ; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
1612 ; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD]], i32 1
1613 ; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD7]], i32 1
1614 ; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD8]], i32 1
1615 ; FVW2-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP8]], i32 4, <2 x i1> [[TMP4]], <2 x float> undef)
1616 ; FVW2-NEXT: [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP9]], i32 4, <2 x i1> [[TMP5]], <2 x float> undef)
1617 ; FVW2-NEXT: [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP10]], i32 4, <2 x i1> [[TMP6]], <2 x float> undef)
1618 ; FVW2-NEXT: [[WIDE_MASKED_GATHER16:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP11]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef)
1619 ; FVW2-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER13]], <float 5.000000e-01, float 5.000000e-01>
1620 ; FVW2-NEXT: [[TMP13:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], <float 5.000000e-01, float 5.000000e-01>
1621 ; FVW2-NEXT: [[TMP14:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], <float 5.000000e-01, float 5.000000e-01>
1622 ; FVW2-NEXT: [[TMP15:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], <float 5.000000e-01, float 5.000000e-01>
1623 ; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
1624 ; FVW2-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
1625 ; FVW2: pred.store.if:
1626 ; FVW2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], i64 [[OFFSET_IDX]]
1627 ; FVW2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP12]], i32 0
1628 ; FVW2-NEXT: store float [[TMP18]], float addrspace(1)* [[TMP17]], align 4
1629 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]]
1630 ; FVW2: pred.store.continue:
1631 ; FVW2-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
1632 ; FVW2-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]]
1633 ; FVW2: pred.store.if17:
1634 ; FVW2-NEXT: [[TMP20:%.*]] = or i64 [[OFFSET_IDX]], 16
1635 ; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP20]]
1636 ; FVW2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP12]], i32 1
1637 ; FVW2-NEXT: store float [[TMP22]], float addrspace(1)* [[TMP21]], align 4
1638 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE18]]
1639 ; FVW2: pred.store.continue18:
1640 ; FVW2-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
1641 ; FVW2-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
1642 ; FVW2: pred.store.if19:
1643 ; FVW2-NEXT: [[TMP24:%.*]] = or i64 [[OFFSET_IDX]], 32
1644 ; FVW2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP24]]
1645 ; FVW2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
1646 ; FVW2-NEXT: store float [[TMP26]], float addrspace(1)* [[TMP25]], align 4
1647 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE20]]
1648 ; FVW2: pred.store.continue20:
1649 ; FVW2-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
1650 ; FVW2-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
1651 ; FVW2: pred.store.if21:
1652 ; FVW2-NEXT: [[TMP28:%.*]] = or i64 [[OFFSET_IDX]], 48
1653 ; FVW2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP28]]
1654 ; FVW2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
1655 ; FVW2-NEXT: store float [[TMP30]], float addrspace(1)* [[TMP29]], align 4
1656 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE22]]
1657 ; FVW2: pred.store.continue22:
1658 ; FVW2-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0
1659 ; FVW2-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
1660 ; FVW2: pred.store.if23:
1661 ; FVW2-NEXT: [[TMP32:%.*]] = or i64 [[OFFSET_IDX]], 64
1662 ; FVW2-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP32]]
1663 ; FVW2-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[TMP14]], i32 0
1664 ; FVW2-NEXT: store float [[TMP34]], float addrspace(1)* [[TMP33]], align 4
1665 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE24]]
1666 ; FVW2: pred.store.continue24:
1667 ; FVW2-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1
1668 ; FVW2-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
1669 ; FVW2: pred.store.if25:
1670 ; FVW2-NEXT: [[TMP36:%.*]] = or i64 [[OFFSET_IDX]], 80
1671 ; FVW2-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP36]]
1672 ; FVW2-NEXT: [[TMP38:%.*]] = extractelement <2 x float> [[TMP14]], i32 1
1673 ; FVW2-NEXT: store float [[TMP38]], float addrspace(1)* [[TMP37]], align 4
1674 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE26]]
1675 ; FVW2: pred.store.continue26:
1676 ; FVW2-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
1677 ; FVW2-NEXT: br i1 [[TMP39]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]]
1678 ; FVW2: pred.store.if27:
1679 ; FVW2-NEXT: [[TMP40:%.*]] = or i64 [[OFFSET_IDX]], 96
1680 ; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP40]]
1681 ; FVW2-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
1682 ; FVW2-NEXT: store float [[TMP42]], float addrspace(1)* [[TMP41]], align 4
1683 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE28]]
1684 ; FVW2: pred.store.continue28:
1685 ; FVW2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
1686 ; FVW2-NEXT: br i1 [[TMP43]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30]]
1687 ; FVW2: pred.store.if29:
1688 ; FVW2-NEXT: [[TMP44:%.*]] = or i64 [[OFFSET_IDX]], 112
1689 ; FVW2-NEXT: [[TMP45:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP44]]
1690 ; FVW2-NEXT: [[TMP46:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
1691 ; FVW2-NEXT: store float [[TMP46]], float addrspace(1)* [[TMP45]], align 4
1692 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE30]]
1693 ; FVW2: pred.store.continue30:
1694 ; FVW2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX6]], 8
1695 ; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 128, i64 128>
1696 ; FVW2-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
1697 ; FVW2-NEXT: br i1 [[TMP47]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !6
1699 ; FVW2-NEXT: ret void
1702 %in.addr = alloca %struct.In addrspace(0)*, align 8
1703 %out.addr = alloca float addrspace(1)*, align 8
1704 %trigger.addr = alloca i32*, align 8
1705 %index.addr = alloca i32*, align 8
1706 %i = alloca i32, align 4
1707 store %struct.In addrspace(0)* %in, %struct.In addrspace(0)** %in.addr, align 8
1708 store float addrspace(1)* %out, float addrspace(1)** %out.addr, align 8
1709 store i32* %trigger, i32** %trigger.addr, align 8
1710 store i32* %index, i32** %index.addr, align 8
1711 store i32 0, i32* %i, align 4
1714 for.cond: ; preds = %for.inc, %entry
1715 %0 = load i32, i32* %i, align 4
1716 %cmp = icmp slt i32 %0, 4096
1717 br i1 %cmp, label %for.body, label %for.end
1719 for.body: ; preds = %for.cond
1720 %1 = load i32, i32* %i, align 4
1721 %idxprom = sext i32 %1 to i64
1722 %2 = load i32*, i32** %trigger.addr, align 8
1723 %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
1724 %3 = load i32, i32* %arrayidx, align 4
1725 %cmp1 = icmp sgt i32 %3, 0
1726 br i1 %cmp1, label %if.then, label %if.end
1728 if.then: ; preds = %for.body
1729 %4 = load i32, i32* %i, align 4
1730 %idxprom2 = sext i32 %4 to i64
1731 %5 = load %struct.In addrspace(0)*, %struct.In addrspace(0)** %in.addr, align 8
1732 %arrayidx3 = getelementptr inbounds %struct.In, %struct.In addrspace(0)* %5, i64 %idxprom2
1733 %b = getelementptr inbounds %struct.In, %struct.In addrspace(0)* %arrayidx3, i32 0, i32 1
1734 %6 = load float, float addrspace(0)* %b, align 4
1735 %add = fadd float %6, 5.000000e-01
1736 %7 = load i32, i32* %i, align 4
1737 %idxprom4 = sext i32 %7 to i64
1738 %8 = load float addrspace(1)*, float addrspace(1)** %out.addr, align 8
1739 %arrayidx5 = getelementptr inbounds float, float addrspace(1)* %8, i64 %idxprom4
1740 store float %add, float addrspace(1)* %arrayidx5, align 4
1743 if.end: ; preds = %if.then, %for.body
1746 for.inc: ; preds = %if.end
1747 %9 = load i32, i32* %i, align 4
1748 %inc = add nsw i32 %9, 16
1749 store i32 %inc, i32* %i, align 4
1752 for.end: ; preds = %for.cond