1 ; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC1 --check-prefix=CHECK
2 ; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC4 --check-prefix=CHECK
3 ; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=1 -S < %s | FileCheck %s --check-prefix=CHECK-VF1IC4 --check-prefix=CHECK
; Integer select/compare reduction with constant operands: the accumulator
; starts at 3 and the select replaces it with 7 whenever the element loaded
; from %v is not equal to 3; otherwise the previous value is carried forward.
; The directives below expect the vectorizer to track this with a boolean
; accumulator (OR of the negated compare) and to produce the i32 result with a
; single select between 7 and 3 in middle.block. Three configurations are
; verified: VF=4/IC=1, VF=4/IC=4 (four partial accumulators combined with OR
; before the vector reduce) and VF=1/IC=4 (scalar i1 accumulators combined
; with OR only).
5 define i32 @select_const_i32_from_icmp(ptr nocapture readonly %v, i64 %n) {
6 ; CHECK-LABEL: @select_const_i32_from_icmp
7 ; CHECK-VF4IC1: vector.body:
8 ; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
9 ; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <4 x i32>
10 ; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[VEC_LOAD]], splat (i32 3)
11 ; CHECK-VF4IC1-NEXT: [[NOT:%.*]] = xor <4 x i1> [[VEC_ICMP]], splat (i1 true)
12 ; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or <4 x i1> [[VEC_PHI]], [[NOT]]
13 ; CHECK-VF4IC1: middle.block:
14 ; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL]])
15 ; CHECK-VF4IC1-NEXT: [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
16 ; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[FR_OR_RDX]], i32 7, i32 3
18 ; CHECK-VF4IC4: vector.body:
19 ; CHECK-VF4IC4: [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ]
20 ; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ]
21 ; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ]
22 ; CHECK-VF4IC4-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ]
23 ; CHECK-VF4IC4: [[VEC_ICMP1:%.*]] = icmp eq <4 x i32> {{.*}}, splat (i32 3)
24 ; CHECK-VF4IC4-NEXT: [[VEC_ICMP2:%.*]] = icmp eq <4 x i32> {{.*}}, splat (i32 3)
25 ; CHECK-VF4IC4-NEXT: [[VEC_ICMP3:%.*]] = icmp eq <4 x i32> {{.*}}, splat (i32 3)
26 ; CHECK-VF4IC4-NEXT: [[VEC_ICMP4:%.*]] = icmp eq <4 x i32> {{.*}}, splat (i32 3)
27 ; CHECK-VF4IC4-NEXT: [[NOT1:%.*]] = xor <4 x i1> [[VEC_ICMP1]], splat (i1 true)
28 ; CHECK-VF4IC4-NEXT: [[NOT2:%.*]] = xor <4 x i1> [[VEC_ICMP2]], splat (i1 true)
29 ; CHECK-VF4IC4-NEXT: [[NOT3:%.*]] = xor <4 x i1> [[VEC_ICMP3]], splat (i1 true)
30 ; CHECK-VF4IC4-NEXT: [[NOT4:%.*]] = xor <4 x i1> [[VEC_ICMP4]], splat (i1 true)
31 ; CHECK-VF4IC4-NEXT: [[VEC_SEL1:%.*]] = or <4 x i1> [[VEC_PHI1]], [[NOT1]]
32 ; CHECK-VF4IC4-NEXT: [[VEC_SEL2:%.*]] = or <4 x i1> [[VEC_PHI2]], [[NOT2]]
33 ; CHECK-VF4IC4-NEXT: [[VEC_SEL3:%.*]] = or <4 x i1> [[VEC_PHI3]], [[NOT3]]
34 ; CHECK-VF4IC4-NEXT: [[VEC_SEL4:%.*]] = or <4 x i1> [[VEC_PHI4]], [[NOT4]]
35 ; CHECK-VF4IC4: middle.block:
36 ; CHECK-VF4IC4-NEXT: [[VEC_SEL5:%.*]] = or <4 x i1> [[VEC_SEL2]], [[VEC_SEL1]]
37 ; CHECK-VF4IC4-NEXT: [[VEC_SEL6:%.*]] = or <4 x i1> [[VEC_SEL3]], [[VEC_SEL5]]
38 ; CHECK-VF4IC4-NEXT: [[VEC_SEL7:%.*]] = or <4 x i1> [[VEC_SEL4]], [[VEC_SEL6]]
39 ; CHECK-VF4IC4-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL7]])
40 ; CHECK-VF4IC4-NEXT: [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
41 ; CHECK-VF4IC4-NEXT: {{.*}} = select i1 [[FR_OR_RDX]], i32 7, i32 3
44 ; CHECK-VF1IC4: vector.body:
45 ; CHECK-VF1IC4: [[VEC_PHI1:%.*]] = phi i1 [ false, %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ]
46 ; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i1 [ false, %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ]
47 ; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i1 [ false, %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ]
48 ; CHECK-VF1IC4-NEXT: [[VEC_PHI4:%.*]] = phi i1 [ false, %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ]
49 ; CHECK-VF1IC4: [[VEC_LOAD1:%.*]] = load i32
50 ; CHECK-VF1IC4-NEXT: [[VEC_LOAD2:%.*]] = load i32
51 ; CHECK-VF1IC4-NEXT: [[VEC_LOAD3:%.*]] = load i32
52 ; CHECK-VF1IC4-NEXT: [[VEC_LOAD4:%.*]] = load i32
53 ; CHECK-VF1IC4-NEXT: [[VEC_ICMP1:%.*]] = icmp eq i32 [[VEC_LOAD1]], 3
54 ; CHECK-VF1IC4-NEXT: [[VEC_ICMP2:%.*]] = icmp eq i32 [[VEC_LOAD2]], 3
55 ; CHECK-VF1IC4-NEXT: [[VEC_ICMP3:%.*]] = icmp eq i32 [[VEC_LOAD3]], 3
56 ; CHECK-VF1IC4-NEXT: [[VEC_ICMP4:%.*]] = icmp eq i32 [[VEC_LOAD4]], 3
57 ; CHECK-VF1IC4-NEXT: [[NOT1:%.*]] = xor i1 [[VEC_ICMP1]], true
58 ; CHECK-VF1IC4-NEXT: [[NOT2:%.*]] = xor i1 [[VEC_ICMP2]], true
59 ; CHECK-VF1IC4-NEXT: [[NOT3:%.*]] = xor i1 [[VEC_ICMP3]], true
60 ; CHECK-VF1IC4-NEXT: [[NOT4:%.*]] = xor i1 [[VEC_ICMP4]], true
61 ; CHECK-VF1IC4-NEXT: [[VEC_SEL1:%.*]] = or i1 [[VEC_PHI1]], [[NOT1]]
62 ; CHECK-VF1IC4-NEXT: [[VEC_SEL2:%.*]] = or i1 [[VEC_PHI2]], [[NOT2]]
63 ; CHECK-VF1IC4-NEXT: [[VEC_SEL3:%.*]] = or i1 [[VEC_PHI3]], [[NOT3]]
64 ; CHECK-VF1IC4-NEXT: [[VEC_SEL4:%.*]] = or i1 [[VEC_PHI4]], [[NOT4]]
65 ; CHECK-VF1IC4: middle.block:
66 ; CHECK-VF1IC4-NEXT: [[VEC_SEL5:%.*]] = or i1 [[VEC_SEL2]], [[VEC_SEL1]]
67 ; CHECK-VF1IC4-NEXT: [[VEC_SEL6:%.*]] = or i1 [[VEC_SEL3]], [[VEC_SEL5]]
68 ; CHECK-VF1IC4-NEXT: [[OR_RDX:%.*]] = or i1 [[VEC_SEL4]], [[VEC_SEL6]]
69 ; CHECK-VF1IC4-NEXT: [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
70 ; CHECK-VF1IC4-NEXT: {{.*}} = select i1 [[FR_OR_RDX]], i32 7, i32 3
75 for.body: ; preds = %entry, %for.body
; %0 is the induction variable (0, 1, ...); %1 is the reduction PHI, start 3.
76 %0 = phi i64 [ 0, %entry ], [ %6, %for.body ]
77 %1 = phi i32 [ 3, %entry ], [ %5, %for.body ]
78 %2 = getelementptr inbounds i32, ptr %v, i64 %0
79 %3 = load i32, ptr %2, align 4
80 %4 = icmp eq i32 %3, 3
; Keep the running value while the element equals 3, otherwise switch to 7.
81 %5 = select i1 %4, i32 %1, i32 7
82 %6 = add nuw nsw i64 %0, 1
; The loop exits once the incremented induction variable equals %n.
83 %7 = icmp eq i64 %6, %n
84 br i1 %7, label %exit, label %for.body
86 exit: ; preds = %for.body
; Same loop shape as @select_const_i32_from_icmp but with the select operands
; swapped: the accumulator (start 3) becomes 7 when the loaded element EQUALS
; 3 and is carried forward otherwise. Because the compare is the "change"
; condition directly, the expected vector body ORs the compare result into the
; boolean accumulator without a preceding xor. Only the VF=4/IC=1 output is
; verified by the directives visible here.
91 define i32 @select_const_i32_from_icmp2(ptr nocapture readonly %v, i64 %n) {
92 ; CHECK-LABEL: @select_const_i32_from_icmp2
93 ; CHECK-VF4IC1: vector.body:
94 ; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
95 ; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <4 x i32>
96 ; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[VEC_LOAD]], splat (i32 3)
97 ; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or <4 x i1> [[VEC_PHI]], [[VEC_ICMP]]
98 ; CHECK-VF4IC1: middle.block:
99 ; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL]])
100 ; CHECK-VF4IC1-NEXT: [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
101 ; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[FR_OR_RDX]], i32 7, i32 3
106 for.body: ; preds = %entry, %for.body
; %0 is the induction variable; %1 is the reduction PHI, start value 3.
107 %0 = phi i64 [ 0, %entry ], [ %6, %for.body ]
108 %1 = phi i32 [ 3, %entry ], [ %5, %for.body ]
109 %2 = getelementptr inbounds i32, ptr %v, i64 %0
110 %3 = load i32, ptr %2, align 4
111 %4 = icmp eq i32 %3, 3
; Reduction PHI is the FALSE operand here: a match on 3 selects the constant 7.
112 %5 = select i1 %4, i32 7, i32 %1
113 %6 = add nuw nsw i64 %0, 1
114 %7 = icmp eq i64 %6, %n
115 br i1 %7, label %exit, label %for.body
117 exit: ; preds = %for.body
; Select/compare reduction whose start value (%a) and replacement value (%b)
; are loop-invariant function arguments rather than constants. The accumulator
; keeps its previous value while the element loaded from %v equals 3 and
; becomes %b otherwise.
; Expected VF=4/IC=1 output: no <4 x i32> shufflevector between vector.ph and
; vector.body (i.e. neither argument is broadcast into a vector), a boolean
; accumulator updated with OR of the negated compare, and one final scalar
; select between %b and %a in middle.block.
122 define i32 @select_i32_from_icmp(ptr nocapture readonly %v, i32 %a, i32 %b, i64 %n) {
123 ; CHECK-LABEL: @select_i32_from_icmp
124 ; CHECK-VF4IC1: vector.ph:
; A single NOT directive rejects every <4 x i32> shufflevector in this region,
; so it covers a broadcast of either argument.
125 ; CHECK-VF4IC1-NOT: shufflevector <4 x i32>
127 ; CHECK-VF4IC1: vector.body:
128 ; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
129 ; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <4 x i32>
130 ; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[VEC_LOAD]], splat (i32 3)
131 ; CHECK-VF4IC1-NEXT: [[NOT:%.*]] = xor <4 x i1> [[VEC_ICMP]], splat (i1 true)
132 ; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or <4 x i1> [[VEC_PHI]], [[NOT]]
133 ; CHECK-VF4IC1: middle.block:
134 ; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL]])
135 ; CHECK-VF4IC1-NEXT: [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
136 ; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[FR_OR_RDX]], i32 %b, i32 %a
140 for.body: ; preds = %entry, %for.body
; %0 is the induction variable; %1 is the reduction PHI, start value %a.
141 %0 = phi i64 [ 0, %entry ], [ %6, %for.body ]
142 %1 = phi i32 [ %a, %entry ], [ %5, %for.body ]
143 %2 = getelementptr inbounds i32, ptr %v, i64 %0
144 %3 = load i32, ptr %2, align 4
145 %4 = icmp eq i32 %3, 3
; Keep the running value on a match, otherwise switch to the invariant %b.
146 %5 = select i1 %4, i32 %1, i32 %b
147 %6 = add nuw nsw i64 %0, 1
148 %7 = icmp eq i64 %6, %n
149 br i1 %7, label %exit, label %for.body
151 exit: ; preds = %for.body
; Select/compare reduction driven by a floating-point compare that carries the
; `fast` flag. The i32 accumulator starts at 2, keeps its value while the
; float loaded from %v compares `ueq` to 3.0, and becomes 1 otherwise.
; Expected VF=4/IC=1 output: the vector fcmp keeps both the `fast` flag and
; the `ueq` predicate, its negation is OR-ed into a <4 x i1> accumulator, and
; the result is a single select between 1 and 2 in middle.block.
156 define i32 @select_const_i32_from_fcmp_fast(ptr nocapture readonly %v, i64 %n) {
157 ; CHECK-LABEL: @select_const_i32_from_fcmp_fast
158 ; CHECK-VF4IC1: vector.body:
159 ; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
160 ; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <4 x float>
161 ; CHECK-VF4IC1-NEXT: [[VEC_FCMP:%.*]] = fcmp fast ueq <4 x float> [[VEC_LOAD]], splat (float 3.000000e+00)
162 ; CHECK-VF4IC1-NEXT: [[NOT:%.*]] = xor <4 x i1> [[VEC_FCMP]], splat (i1 true)
163 ; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or <4 x i1> [[VEC_PHI]], [[NOT]]
164 ; CHECK-VF4IC1: middle.block:
165 ; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL]])
166 ; CHECK-VF4IC1-NEXT: [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
167 ; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[FR_OR_RDX]], i32 1, i32 2
171 for.body: ; preds = %entry, %for.body
; %0 is the induction variable; %1 is the i32 reduction PHI, start value 2.
172 %0 = phi i64 [ 0, %entry ], [ %6, %for.body ]
173 %1 = phi i32 [ 2, %entry ], [ %5, %for.body ]
174 %2 = getelementptr inbounds float, ptr %v, i64 %0
175 %3 = load float, ptr %2, align 4
176 %4 = fcmp fast ueq float %3, 3.0
; Only the compare is floating point; the reduced value itself stays i32.
177 %5 = select i1 %4, i32 %1, i32 1
178 %6 = add nuw nsw i64 %0, 1
179 %7 = icmp eq i64 %6, %n
180 br i1 %7, label %exit, label %for.body
182 exit: ; preds = %for.body
; Variant of @select_const_i32_from_fcmp_fast whose compare has no
; fast-math flags: the i32 accumulator starts at 2, keeps its value while the
; loaded float compares `ueq` to 3.0, and becomes 1 otherwise.
; Expected VF=4/IC=1 output: a plain `fcmp ueq` on <4 x float>, its negation
; OR-ed into a <4 x i1> accumulator, and one final select between 1 and 2 in
; middle.block.
187 define i32 @select_const_i32_from_fcmp(ptr nocapture readonly %v, i64 %n) {
188 ; CHECK-LABEL: @select_const_i32_from_fcmp
189 ; CHECK-VF4IC1: vector.body:
190 ; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
191 ; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <4 x float>
192 ; CHECK-VF4IC1-NEXT: [[VEC_FCMP:%.*]] = fcmp ueq <4 x float> [[VEC_LOAD]], splat (float 3.000000e+00)
193 ; CHECK-VF4IC1-NEXT: [[NOT:%.*]] = xor <4 x i1> [[VEC_FCMP]], splat (i1 true)
194 ; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or <4 x i1> [[VEC_PHI]], [[NOT]]
195 ; CHECK-VF4IC1: middle.block:
196 ; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL]])
197 ; CHECK-VF4IC1-NEXT: [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
198 ; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[FR_OR_RDX]], i32 1, i32 2
202 for.body: ; preds = %entry, %for.body
; %0 is the induction variable; %1 is the i32 reduction PHI, start value 2.
203 %0 = phi i64 [ 0, %entry ], [ %6, %for.body ]
204 %1 = phi i32 [ 2, %entry ], [ %5, %for.body ]
205 %2 = getelementptr inbounds float, ptr %v, i64 %0
206 %3 = load float, ptr %2, align 4
207 %4 = fcmp ueq float %3, 3.0
; Keep the running value on a match, otherwise switch to the constant 1.
208 %5 = select i1 %4, i32 %1, i32 1
209 %6 = add nuw nsw i64 %0, 1
210 %7 = icmp eq i64 %6, %n
211 br i1 %7, label %exit, label %for.body
213 exit: ; preds = %for.body
; Select/compare reduction with no memory access: the compare tests the
; reduction PHI itself (start value %a) against 3, and the select either keeps
; the PHI or replaces it with the loop-invariant %b.
; Expected VF=4/IC=1 output: %a is broadcast and compared against 3 in
; vector.ph (outside the loop), no insertelement of %b appears there, the
; loop body only negates that compare and ORs it into a <4 x i1> accumulator,
; and middle.block ends with one select between %b and %a.
218 define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) {
219 ; CHECK-LABEL: @select_i32_from_icmp_same_inputs
220 ; CHECK-VF4IC1: vector.ph:
221 ; CHECK-VF4IC1: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0
222 ; CHECK-VF4IC1-NEXT: [[SPLAT_OF_A:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
223 ; CHECK-VF4IC1-NOT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 %b, i64 0
224 ; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[SPLAT_OF_A]], splat (i32 3)
225 ; CHECK-VF4IC1: vector.body:
226 ; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
227 ; CHECK-VF4IC1-NEXT: [[NOT:%.*]] = xor <4 x i1> [[VEC_ICMP]], splat (i1 true)
228 ; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or <4 x i1> [[VEC_PHI]], [[NOT]]
229 ; CHECK-VF4IC1: middle.block:
230 ; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL]])
231 ; CHECK-VF4IC1-NEXT: [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
232 ; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[FR_OR_RDX]], i32 %b, i32 %a
236 for.body: ; preds = %entry, %for.body
; %0 is the induction variable; %1 is the reduction PHI, start value %a.
237 %0 = phi i64 [ 0, %entry ], [ %4, %for.body ]
238 %1 = phi i32 [ %a, %entry ], [ %3, %for.body ]
; The compare input and the select's true operand are the same value (%1).
239 %2 = icmp eq i32 %1, 3
240 %3 = select i1 %2, i32 %1, i32 %b
241 %4 = add nuw nsw i64 %0, 1
242 %5 = icmp eq i64 %4, %n
243 br i1 %5, label %exit, label %for.body
245 exit: ; preds = %for.body
252 ; We don't support FP reduction variables at the moment.
; Negative test: the reduction PHI and the selected values are `float` (the
; PHI and the select both carry the `fast` flag), while the compare is an
; integer icmp on elements loaded from %v. The loop must stay scalar, so no
; vector.body block may appear in this function's output. The function name
; is matched as a label so that the NOT directive is confined to the output
; of this function.
253 define float @select_const_f32_from_icmp(ptr nocapture readonly %v, i64 %n) {
254 ; CHECK-LABEL: @select_const_f32_from_icmp
255 ; CHECK-NOT: vector.body
259 for.body: ; preds = %entry, %for.body
; %0 is the induction variable; %1 is the float reduction PHI, start 3.0.
260 %0 = phi i64 [ 0, %entry ], [ %6, %for.body ]
261 %1 = phi fast float [ 3.0, %entry ], [ %5, %for.body ]
262 %2 = getelementptr inbounds i32, ptr %v, i64 %0
263 %3 = load i32, ptr %2, align 4
264 %4 = icmp eq i32 %3, 3
; Keep the running float on a match, otherwise switch to the constant 7.0.
265 %5 = select fast i1 %4, float %1, float 7.0
266 %6 = add nuw nsw i64 %0, 1
267 %7 = icmp eq i64 %6, %n
268 br i1 %7, label %exit, label %for.body
270 exit: ; preds = %for.body
275 ; We don't support selecting loop-variant values.
; Negative test: the select's replacement value (%5) is loaded from %v2 on
; every iteration, so it is not loop-invariant. The compare is still made on
; the element loaded from %v1. The loop must stay scalar, so no vector.body
; block may appear in this function's output.
276 define i32 @select_variant_i32_from_icmp(ptr nocapture readonly %v1, ptr nocapture readonly %v2, i64 %n) {
277 ; CHECK-LABEL: @select_variant_i32_from_icmp
278 ; CHECK-NOT: vector.body
282 for.body: ; preds = %entry, %for.body
; %0 is the induction variable; %1 is the reduction PHI, start value 3.
283 %0 = phi i64 [ 0, %entry ], [ %8, %for.body ]
284 %1 = phi i32 [ 3, %entry ], [ %7, %for.body ]
285 %2 = getelementptr inbounds i32, ptr %v1, i64 %0
286 %3 = load i32, ptr %2, align 4
287 %4 = getelementptr inbounds i32, ptr %v2, i64 %0
288 %5 = load i32, ptr %4, align 4
289 %6 = icmp eq i32 %3, 3
; The false operand is the per-iteration load from %v2, not an invariant.
290 %7 = select i1 %6, i32 %1, i32 %5
291 %8 = add nuw nsw i64 %0, 1
292 %9 = icmp eq i64 %8, %n
293 br i1 %9, label %exit, label %for.body
295 exit: ; preds = %for.body
300 ; We only support selects where the input comes from the same PHI as the
301 ; reduction PHI. In the example below, the select's true operand is the
302 ; induction variable PHI (%0), while the icmp tests the reduction PHI (%1).
303 define i32 @select_i32_from_icmp_non_redux_phi(i32 %a, i32 %b, i32 %n) {
304 ; CHECK-LABEL: @select_i32_from_icmp_non_redux_phi
305 ; CHECK-NOT: vector.body
309 for.body: ; preds = %entry, %for.body
310 %0 = phi i32 [ 0, %entry ], [ %4, %for.body ]
311 %1 = phi i32 [ %a, %entry ], [ %3, %for.body ]
312 %2 = icmp eq i32 %1, 3
313 %3 = select i1 %2, i32 %0, i32 %b
314 %4 = add nuw nsw i32 %0, 1
315 %5 = icmp eq i32 %4, %n
316 br i1 %5, label %exit, label %for.body
318 exit: ; preds = %for.body