; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -passes=loop-vectorize -mtriple=x86_64-apple-darwin %s | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: opt -S -passes=loop-vectorize -mtriple=x86_64-apple-darwin -mattr=+avx %s | FileCheck %s --check-prefixes=CHECK,AVX

; Two mostly identical functions. The only difference is the presence of
; fast-math flags on the second. The loop is a pretty simple reduction:

; for (int i = 0; i < 32; ++i)
;   if (arr[i] != 42.0)
;     tot += arr[i];
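;
; Only the fast-math variant is expected to be vectorized: without reassociation
; flags the FP reduction has to stay scalar, while with 'fast' the branch is
; if-converted into a select feeding a vector accumulator, and the final sum is
; formed with llvm.vector.reduce.fadd. A rough sketch of the per-iteration
; pattern the checks below look for (the %mask/%sum/%acc names here are only
; illustrative, not the exact values produced by the pass):
;
;   %mask = fcmp fast une <2 x double> %wide.load, splat (double 4.200000e+01)
;   %sum  = fadd fast <2 x double> %vec.phi, %wide.load
;   %acc  = select <2 x i1> %mask, <2 x double> %sum, <2 x double> %vec.phi
;   ; ...and in the middle block:
;   %rdx  = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> %acc)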

define double @sumIfScalar(ptr nocapture readonly %arr) {
; CHECK-LABEL: @sumIfScalar(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[LOOP:%.*]]
; CHECK:       loop:
; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ]
; CHECK-NEXT:    [[TOT:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ]
; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr double, ptr [[ARR:%.*]], i32 [[I]]
; CHECK-NEXT:    [[NEXTVAL:%.*]] = load double, ptr [[ADDR]], align 8
; CHECK-NEXT:    [[TST:%.*]] = fcmp une double [[NEXTVAL]], 4.200000e+01
; CHECK-NEXT:    br i1 [[TST]], label [[DO_ADD:%.*]], label [[NO_ADD:%.*]]
; CHECK:       do.add:
; CHECK-NEXT:    [[TOT_NEW:%.*]] = fadd double [[TOT]], [[NEXTVAL]]
; CHECK-NEXT:    br label [[NEXT_ITER]]
; CHECK:       no.add:
; CHECK-NEXT:    br label [[NEXT_ITER]]
; CHECK:       next.iter:
; CHECK-NEXT:    [[TOT_NEXT]] = phi double [ [[TOT]], [[NO_ADD]] ], [ [[TOT_NEW]], [[DO_ADD]] ]
; CHECK-NEXT:    [[I_NEXT]] = add i32 [[I]], 1
; CHECK-NEXT:    [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32
; CHECK-NEXT:    br i1 [[AGAIN]], label [[LOOP]], label [[DONE:%.*]]
; CHECK:       done:
; CHECK-NEXT:    [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ]
; CHECK-NEXT:    ret double [[TOT_NEXT_LCSSA]]
;
entry:
  br label %loop

loop:
  %i = phi i32 [0, %entry], [%i.next, %next.iter]
  %tot = phi double [0.0, %entry], [%tot.next, %next.iter]

  %addr = getelementptr double, ptr %arr, i32 %i
  %nextval = load double, ptr %addr

  %tst = fcmp une double %nextval, 42.0
  br i1 %tst, label %do.add, label %no.add

do.add:
  %tot.new = fadd double %tot, %nextval
  br label %next.iter

no.add:
  br label %next.iter

next.iter:
  %tot.next = phi double [%tot, %no.add], [%tot.new, %do.add]
  %i.next = add i32 %i, 1
  %again = icmp ult i32 %i.next, 32
  br i1 %again, label %loop, label %done

done:
  ret double %tot.next
}

define double @sumIfVector(ptr nocapture readonly %arr) {
; SSE-LABEL: @sumIfVector(
; SSE-NEXT:  entry:
; SSE-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; SSE:       vector.ph:
; SSE-NEXT:    br label [[VECTOR_BODY:%.*]]
; SSE:       vector.body:
; SSE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; SSE-NEXT:    [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
; SSE-NEXT:    [[VEC_PHI1:%.*]] = phi <2 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI3:%.*]], [[VECTOR_BODY]] ]
; SSE-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
; SSE-NEXT:    [[TMP2:%.*]] = getelementptr double, ptr [[ARR:%.*]], i32 [[TMP0]]
; SSE-NEXT:    [[TMP4:%.*]] = getelementptr double, ptr [[TMP2]], i32 0
; SSE-NEXT:    [[TMP5:%.*]] = getelementptr double, ptr [[TMP2]], i32 2
; SSE-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP4]], align 8
; SSE-NEXT:    [[WIDE_LOAD2:%.*]] = load <2 x double>, ptr [[TMP5]], align 8
; SSE-NEXT:    [[TMP6:%.*]] = fcmp fast une <2 x double> [[WIDE_LOAD]], splat (double 4.200000e+01)
; SSE-NEXT:    [[TMP7:%.*]] = fcmp fast une <2 x double> [[WIDE_LOAD2]], splat (double 4.200000e+01)
; SSE-NEXT:    [[TMP8:%.*]] = fadd fast <2 x double> [[VEC_PHI]], [[WIDE_LOAD]]
; SSE-NEXT:    [[TMP9:%.*]] = fadd fast <2 x double> [[VEC_PHI1]], [[WIDE_LOAD2]]
; SSE-NEXT:    [[PREDPHI]] = select <2 x i1> [[TMP6]], <2 x double> [[TMP8]], <2 x double> [[VEC_PHI]]
; SSE-NEXT:    [[PREDPHI3]] = select <2 x i1> [[TMP7]], <2 x double> [[TMP9]], <2 x double> [[VEC_PHI1]]
; SSE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; SSE-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
; SSE-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; SSE:       middle.block:
; SSE-NEXT:    [[BIN_RDX:%.*]] = fadd fast <2 x double> [[PREDPHI3]], [[PREDPHI]]
; SSE-NEXT:    [[TMP11:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[BIN_RDX]])
; SSE-NEXT:    br i1 true, label [[DONE:%.*]], label [[SCALAR_PH]]
; SSE:       scalar.ph:
; SSE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; SSE-NEXT:    [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
; SSE-NEXT:    br label [[LOOP:%.*]]
; SSE:       loop:
; SSE-NEXT:    [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ]
; SSE-NEXT:    [[TOT:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ]
; SSE-NEXT:    [[ADDR:%.*]] = getelementptr double, ptr [[ARR]], i32 [[I]]
; SSE-NEXT:    [[NEXTVAL:%.*]] = load double, ptr [[ADDR]], align 8
; SSE-NEXT:    [[TST:%.*]] = fcmp fast une double [[NEXTVAL]], 4.200000e+01
; SSE-NEXT:    br i1 [[TST]], label [[DO_ADD:%.*]], label [[NO_ADD:%.*]]
; SSE:       do.add:
; SSE-NEXT:    [[TOT_NEW:%.*]] = fadd fast double [[TOT]], [[NEXTVAL]]
; SSE-NEXT:    br label [[NEXT_ITER]]
; SSE:       no.add:
; SSE-NEXT:    br label [[NEXT_ITER]]
; SSE:       next.iter:
; SSE-NEXT:    [[TOT_NEXT]] = phi double [ [[TOT]], [[NO_ADD]] ], [ [[TOT_NEW]], [[DO_ADD]] ]
; SSE-NEXT:    [[I_NEXT]] = add i32 [[I]], 1
; SSE-NEXT:    [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32
; SSE-NEXT:    br i1 [[AGAIN]], label [[LOOP]], label [[DONE]], !llvm.loop [[LOOP3:![0-9]+]]
; SSE:       done:
; SSE-NEXT:    [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
; SSE-NEXT:    ret double [[TOT_NEXT_LCSSA]]
;
; AVX-LABEL: @sumIfVector(
; AVX-NEXT:  entry:
; AVX-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; AVX:       vector.ph:
; AVX-NEXT:    br label [[VECTOR_BODY:%.*]]
; AVX:       vector.body:
; AVX-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX-NEXT:    [[VEC_PHI:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
; AVX-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI7:%.*]], [[VECTOR_BODY]] ]
; AVX-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI8:%.*]], [[VECTOR_BODY]] ]
; AVX-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI9:%.*]], [[VECTOR_BODY]] ]
; AVX-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
; AVX-NEXT:    [[TMP4:%.*]] = getelementptr double, ptr [[ARR:%.*]], i32 [[TMP0]]
; AVX-NEXT:    [[TMP8:%.*]] = getelementptr double, ptr [[TMP4]], i32 0
; AVX-NEXT:    [[TMP9:%.*]] = getelementptr double, ptr [[TMP4]], i32 4
; AVX-NEXT:    [[TMP10:%.*]] = getelementptr double, ptr [[TMP4]], i32 8
; AVX-NEXT:    [[TMP11:%.*]] = getelementptr double, ptr [[TMP4]], i32 12
; AVX-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP8]], align 8
; AVX-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x double>, ptr [[TMP9]], align 8
; AVX-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x double>, ptr [[TMP10]], align 8
; AVX-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x double>, ptr [[TMP11]], align 8
; AVX-NEXT:    [[TMP12:%.*]] = fcmp fast une <4 x double> [[WIDE_LOAD]], splat (double 4.200000e+01)
; AVX-NEXT:    [[TMP13:%.*]] = fcmp fast une <4 x double> [[WIDE_LOAD4]], splat (double 4.200000e+01)
; AVX-NEXT:    [[TMP14:%.*]] = fcmp fast une <4 x double> [[WIDE_LOAD5]], splat (double 4.200000e+01)
; AVX-NEXT:    [[TMP15:%.*]] = fcmp fast une <4 x double> [[WIDE_LOAD6]], splat (double 4.200000e+01)
; AVX-NEXT:    [[TMP16:%.*]] = fadd fast <4 x double> [[VEC_PHI]], [[WIDE_LOAD]]
; AVX-NEXT:    [[TMP17:%.*]] = fadd fast <4 x double> [[VEC_PHI1]], [[WIDE_LOAD4]]
; AVX-NEXT:    [[TMP18:%.*]] = fadd fast <4 x double> [[VEC_PHI2]], [[WIDE_LOAD5]]
; AVX-NEXT:    [[TMP19:%.*]] = fadd fast <4 x double> [[VEC_PHI3]], [[WIDE_LOAD6]]
; AVX-NEXT:    [[PREDPHI]] = select <4 x i1> [[TMP12]], <4 x double> [[TMP16]], <4 x double> [[VEC_PHI]]
; AVX-NEXT:    [[PREDPHI7]] = select <4 x i1> [[TMP13]], <4 x double> [[TMP17]], <4 x double> [[VEC_PHI1]]
; AVX-NEXT:    [[PREDPHI8]] = select <4 x i1> [[TMP14]], <4 x double> [[TMP18]], <4 x double> [[VEC_PHI2]]
; AVX-NEXT:    [[PREDPHI9]] = select <4 x i1> [[TMP15]], <4 x double> [[TMP19]], <4 x double> [[VEC_PHI3]]
; AVX-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
; AVX-NEXT:    [[TMP20:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
; AVX-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; AVX:       middle.block:
; AVX-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x double> [[PREDPHI7]], [[PREDPHI]]
; AVX-NEXT:    [[BIN_RDX10:%.*]] = fadd fast <4 x double> [[PREDPHI8]], [[BIN_RDX]]
; AVX-NEXT:    [[BIN_RDX11:%.*]] = fadd fast <4 x double> [[PREDPHI9]], [[BIN_RDX10]]
; AVX-NEXT:    [[TMP21:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[BIN_RDX11]])
; AVX-NEXT:    br i1 true, label [[DONE:%.*]], label [[SCALAR_PH]]
; AVX:       scalar.ph:
; AVX-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; AVX-NEXT:    [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
; AVX-NEXT:    br label [[LOOP:%.*]]
; AVX:       loop:
; AVX-NEXT:    [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ]
; AVX-NEXT:    [[TOT:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ]
; AVX-NEXT:    [[ADDR:%.*]] = getelementptr double, ptr [[ARR]], i32 [[I]]
; AVX-NEXT:    [[NEXTVAL:%.*]] = load double, ptr [[ADDR]], align 8
; AVX-NEXT:    [[TST:%.*]] = fcmp fast une double [[NEXTVAL]], 4.200000e+01
; AVX-NEXT:    br i1 [[TST]], label [[DO_ADD:%.*]], label [[NO_ADD:%.*]]
; AVX:       do.add:
; AVX-NEXT:    [[TOT_NEW:%.*]] = fadd fast double [[TOT]], [[NEXTVAL]]
; AVX-NEXT:    br label [[NEXT_ITER]]
; AVX:       no.add:
; AVX-NEXT:    br label [[NEXT_ITER]]
; AVX:       next.iter:
; AVX-NEXT:    [[TOT_NEXT]] = phi double [ [[TOT]], [[NO_ADD]] ], [ [[TOT_NEW]], [[DO_ADD]] ]
; AVX-NEXT:    [[I_NEXT]] = add i32 [[I]], 1
; AVX-NEXT:    [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32
; AVX-NEXT:    br i1 [[AGAIN]], label [[LOOP]], label [[DONE]], !llvm.loop [[LOOP3:![0-9]+]]
; AVX:       done:
; AVX-NEXT:    [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
; AVX-NEXT:    ret double [[TOT_NEXT_LCSSA]]
;
entry:
  br label %loop

loop:
  %i = phi i32 [0, %entry], [%i.next, %next.iter]
  %tot = phi double [0.0, %entry], [%tot.next, %next.iter]

  %addr = getelementptr double, ptr %arr, i32 %i
  %nextval = load double, ptr %addr

  %tst = fcmp fast une double %nextval, 42.0
  br i1 %tst, label %do.add, label %no.add

do.add:
  %tot.new = fadd fast double %tot, %nextval
  br label %next.iter

no.add:
  br label %next.iter

next.iter:
  %tot.next = phi double [%tot, %no.add], [%tot.new, %do.add]
  %i.next = add i32 %i, 1
  %again = icmp ult i32 %i.next, 32
  br i1 %again, label %loop, label %done

done:
  ret double %tot.next
}