1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt < %s -passes=slp-vectorizer,instcombine,dce -slp-threshold=-100 -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
3 ; RUN: opt < %s -passes=slp-vectorizer,instcombine,dce -slp-threshold=-100 -S -mtriple=i386-apple-macosx10.8.0 -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; Little-endian layout ("e") with 32-bit pointers ("p:32:32:32"), in line with
; the i386 triple passed on both RUN lines.
5 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
7 ; Make sure we order the operands of commutative operations so that we get
8 ; bigger vectorizable trees.
; Lane 0 computes (load + %v1) while lane 1 computes (%v2 + load), so the
; loaded values sit on opposite sides of the two scalar fadds. Both RUN lines
; expect the pair to become one <2 x double> load, one vector fadd against a
; vector built from %v1/%v2, and one vector store.
10 define void @shuffle_operands1(ptr noalias %from, ptr noalias %to, double %v1, double %v2) {
11 ; CHECK-LABEL: @shuffle_operands1(
12 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
13 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V1:%.*]], i64 0
14 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2:%.*]], i64 1
15 ; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
16 ; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
17 ; CHECK-NEXT: ret void
19 ; SSE2-LABEL: @shuffle_operands1(
20 ; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
21 ; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V1:%.*]], i64 0
22 ; SSE2-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2:%.*]], i64 1
23 ; SSE2-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
24 ; SSE2-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
; Scalar input: two adjacent loads from %from, two fadds with the load on
; different sides, two adjacent stores to %to.
27 %from_1 = getelementptr double, ptr %from, i64 1
28 %v0_1 = load double , ptr %from
29 %v0_2 = load double , ptr %from_1
30 %v1_1 = fadd double %v0_1, %v1
31 %v1_2 = fadd double %v2, %v0_2
32 %to_2 = getelementptr double, ptr %to, i64 1
33 store double %v1_1, ptr %to
34 store double %v1_2, ptr %to_2
; %v0_1 is the left operand of both scalar fadds, while %v0_1 and %v0_2 are
; adjacent loads. Both RUN lines expect the adjacent loads to be used as one
; <2 x double> load operand; the other operand is that loaded vector shuffled
; with mask <poison, 0> and %p inserted into lane 0.
38 define void @vecload_vs_broadcast(ptr noalias %from, ptr noalias %to, double %v1, double %v2, i1 %c) {
39 ; CHECK-LABEL: @vecload_vs_broadcast(
41 ; CHECK-NEXT: br label [[LP:%.*]]
43 ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
44 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
45 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
46 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
47 ; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP0]], [[TMP2]]
48 ; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
49 ; CHECK-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
51 ; CHECK-NEXT: ret void
53 ; SSE2-LABEL: @vecload_vs_broadcast(
55 ; SSE2-NEXT: br label [[LP:%.*]]
57 ; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
58 ; SSE2-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
59 ; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
60 ; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
61 ; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP0]], [[TMP2]]
62 ; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
63 ; SSE2-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
; %p is 1.0 when arriving from %lp and 0.0 when arriving from %entry.
71 %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
72 %from_1 = getelementptr double, ptr %from, i64 1
73 %v0_1 = load double , ptr %from
74 %v0_2 = load double , ptr %from_1
; Lane 0: (%v0_1, %p). Lane 1: (%v0_1, %v0_2).
75 %v1_1 = fadd double %v0_1, %p
76 %v1_2 = fadd double %v0_1, %v0_2
77 %to_2 = getelementptr double, ptr %to, i64 1
78 store double %v1_1, ptr %to
79 store double %v1_2, ptr %to_2
80 br i1 %c, label %lp, label %ext
; Variant of @vecload_vs_broadcast with the scalar fadd operands swapped in
; both lanes: %v0_1 is now the right operand of both fadds. Both RUN lines
; still expect one <2 x double> load, here as the right operand of the vector
; fadd, with the shuffled vector holding %p in lane 0 on the left.
86 define void @vecload_vs_broadcast2(ptr noalias %from, ptr noalias %to, double %v1, double %v2, i1 %c) {
87 ; CHECK-LABEL: @vecload_vs_broadcast2(
89 ; CHECK-NEXT: br label [[LP:%.*]]
91 ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
92 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
93 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
94 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
95 ; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]]
96 ; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
97 ; CHECK-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
99 ; CHECK-NEXT: ret void
101 ; SSE2-LABEL: @vecload_vs_broadcast2(
103 ; SSE2-NEXT: br label [[LP:%.*]]
105 ; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
106 ; SSE2-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
107 ; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
108 ; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
109 ; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]]
110 ; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
111 ; SSE2-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
113 ; SSE2-NEXT: ret void
; %p is 1.0 when arriving from %lp and 0.0 when arriving from %entry.
119 %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
120 %from_1 = getelementptr double, ptr %from, i64 1
121 %v0_1 = load double , ptr %from
122 %v0_2 = load double , ptr %from_1
; Lane 0: (%p, %v0_1). Lane 1: (%v0_2, %v0_1).
123 %v1_1 = fadd double %p, %v0_1
124 %v1_2 = fadd double %v0_2, %v0_1
125 %to_2 = getelementptr double, ptr %to, i64 1
126 store double %v1_1, ptr %to
127 store double %v1_2, ptr %to_2
128 br i1 %c, label %lp, label %ext
; Mixed-order variant: %v0_1 is the right operand in lane 0 and the left
; operand in lane 1. The expected output on both RUN lines is the same shape
; as for @vecload_vs_broadcast2: one <2 x double> load as the right operand of
; the vector fadd, and the shuffled vector with %p in lane 0 on the left.
134 define void @vecload_vs_broadcast3(ptr noalias %from, ptr noalias %to, double %v1, double %v2, i1 %c) {
135 ; CHECK-LABEL: @vecload_vs_broadcast3(
137 ; CHECK-NEXT: br label [[LP:%.*]]
139 ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
140 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
141 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
142 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
143 ; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]]
144 ; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
145 ; CHECK-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
147 ; CHECK-NEXT: ret void
149 ; SSE2-LABEL: @vecload_vs_broadcast3(
151 ; SSE2-NEXT: br label [[LP:%.*]]
153 ; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
154 ; SSE2-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
155 ; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
156 ; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
157 ; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]]
158 ; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
159 ; SSE2-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
161 ; SSE2-NEXT: ret void
; %p is 1.0 when arriving from %lp and 0.0 when arriving from %entry.
167 %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
168 %from_1 = getelementptr double, ptr %from, i64 1
169 %v0_1 = load double , ptr %from
170 %v0_2 = load double , ptr %from_1
; Lane 0: (%p, %v0_1). Lane 1: (%v0_1, %v0_2).
171 %v1_1 = fadd double %p, %v0_1
172 %v1_2 = fadd double %v0_1, %v0_2
173 %to_2 = getelementptr double, ptr %to, i64 1
174 store double %v1_1, ptr %to
175 store double %v1_2, ptr %to_2
176 br i1 %c, label %lp, label %ext
; %v0_1 is the right operand of both scalar fadds; the left operands are
; %v0_2 (lane 0) and %p (lane 1). The two RUN lines expect different output:
;  * -mcpu=corei7-avx keeps the two scalar loads, builds {%v0_2, %p} with
;    insertelement and splats %v0_1 with a zero-mask shufflevector;
;  * -mattr=+sse2 uses one <2 x double> load, a <1, 0> lane swap of it, and
;    the loaded vector with %p inserted into lane 1.
182 define void @shuffle_nodes_match1(ptr noalias %from, ptr noalias %to, double %v1, double %v2, i1 %c) {
183 ; CHECK-LABEL: @shuffle_nodes_match1(
185 ; CHECK-NEXT: br label [[LP:%.*]]
187 ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
188 ; CHECK-NEXT: [[FROM_1:%.*]] = getelementptr i8, ptr [[FROM:%.*]], i32 8
189 ; CHECK-NEXT: [[V0_1:%.*]] = load double, ptr [[FROM]], align 4
190 ; CHECK-NEXT: [[V0_2:%.*]] = load double, ptr [[FROM_1]], align 4
191 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_2]], i64 0
192 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
193 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i64 0
194 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
195 ; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]]
196 ; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
197 ; CHECK-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
199 ; CHECK-NEXT: ret void
201 ; SSE2-LABEL: @shuffle_nodes_match1(
203 ; SSE2-NEXT: br label [[LP:%.*]]
205 ; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
206 ; SSE2-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
207 ; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
208 ; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
209 ; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]]
210 ; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
211 ; SSE2-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
213 ; SSE2-NEXT: ret void
; %p is 1.0 when arriving from %lp and 0.0 when arriving from %entry.
219 %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
220 %from_1 = getelementptr double, ptr %from, i64 1
221 %v0_1 = load double , ptr %from
222 %v0_2 = load double , ptr %from_1
; Lane 0: (%v0_2, %v0_1). Lane 1: (%p, %v0_1).
223 %v1_1 = fadd double %v0_2, %v0_1
224 %v1_2 = fadd double %p, %v0_1
225 %to_2 = getelementptr double, ptr %to, i64 1
226 store double %v1_1, ptr %to
227 store double %v1_2, ptr %to_2
228 br i1 %c, label %lp, label %ext
; %v0_1 is the left operand in lane 0 and the right operand in lane 1. Both
; RUN lines expect one <2 x double> load that feeds both vector operands: the
; loaded vector with %p inserted into lane 1 on the left, and a <1, 0> lane
; swap of the loaded vector on the right.
234 define void @vecload_vs_broadcast4(ptr noalias %from, ptr noalias %to, double %v1, double %v2, i1 %c) {
235 ; CHECK-LABEL: @vecload_vs_broadcast4(
237 ; CHECK-NEXT: br label [[LP:%.*]]
239 ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
240 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
241 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
242 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
243 ; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
244 ; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
245 ; CHECK-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
247 ; CHECK-NEXT: ret void
249 ; SSE2-LABEL: @vecload_vs_broadcast4(
251 ; SSE2-NEXT: br label [[LP:%.*]]
253 ; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
254 ; SSE2-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
255 ; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
256 ; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
257 ; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]]
258 ; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
259 ; SSE2-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
261 ; SSE2-NEXT: ret void
; %p is 1.0 when arriving from %lp and 0.0 when arriving from %entry.
267 %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
268 %from_1 = getelementptr double, ptr %from, i64 1
269 %v0_1 = load double , ptr %from
270 %v0_2 = load double , ptr %from_1
; Lane 0: (%v0_1, %v0_2). Lane 1: (%p, %v0_1).
271 %v1_1 = fadd double %v0_1, %v0_2
272 %v1_2 = fadd double %p, %v0_1
273 %to_2 = getelementptr double, ptr %to, i64 1
274 store double %v1_1, ptr %to
275 store double %v1_2, ptr %to_2
276 br i1 %c, label %lp, label %ext
; %v0_1 is the left operand of both scalar fadds; the right operands are
; %v0_2 (lane 0) and %p (lane 1). The two RUN lines expect different output:
;  * -mcpu=corei7-avx keeps the two scalar loads, splats %v0_1 with a
;    zero-mask shufflevector and builds {%v0_2, %p} with insertelement;
;  * -mattr=+sse2 uses one <2 x double> load, a <1, 0> lane swap of it, and
;    the loaded vector with %p inserted into lane 1.
283 define void @shuffle_nodes_match2(ptr noalias %from, ptr noalias %to, double %v1, double %v2, i1 %c) {
284 ; CHECK-LABEL: @shuffle_nodes_match2(
286 ; CHECK-NEXT: br label [[LP:%.*]]
288 ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
289 ; CHECK-NEXT: [[FROM_1:%.*]] = getelementptr i8, ptr [[FROM:%.*]], i32 8
290 ; CHECK-NEXT: [[V0_1:%.*]] = load double, ptr [[FROM]], align 4
291 ; CHECK-NEXT: [[V0_2:%.*]] = load double, ptr [[FROM_1]], align 4
292 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i64 0
293 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer
294 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V0_2]], i64 0
295 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[P]], i64 1
296 ; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
297 ; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
298 ; CHECK-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
300 ; CHECK-NEXT: ret void
302 ; SSE2-LABEL: @shuffle_nodes_match2(
304 ; SSE2-NEXT: br label [[LP:%.*]]
306 ; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
307 ; SSE2-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
308 ; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
309 ; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
310 ; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
311 ; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
312 ; SSE2-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
314 ; SSE2-NEXT: ret void
; %p is 1.0 when arriving from %lp and 0.0 when arriving from %entry.
320 %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
321 %from_1 = getelementptr double, ptr %from, i64 1
322 %v0_1 = load double , ptr %from
323 %v0_2 = load double , ptr %from_1
; Lane 0: (%v0_1, %v0_2). Lane 1: (%v0_1, %p).
324 %v1_1 = fadd double %v0_1, %v0_2
325 %v1_2 = fadd double %v0_1, %p
326 %to_2 = getelementptr double, ptr %to, i64 1
327 store double %v1_1, ptr %to
328 store double %v1_2, ptr %to_2
329 br i1 %c, label %lp, label %ext
335 ; Make sure that reordering operands does not scramble them and thereby
336 ; destroy an already 'good' source order.
; Zero-initialized global array of 32000 floats read and written by
; @good_load_order.
338 @a = common global [32000 x float] zeroinitializer, align 16
; The scalar loop body performs five multiply-and-store steps per iteration.
; Each step multiplies the element loaded at index iv+k+1 by the previously
; loaded element and stores the product to index iv+k; the induction variable
; advances by 5. Both RUN lines expect the first four steps to become one
; <4 x float> load starting at index iv+1, a shuffle of that vector with the
; carried-in phi value inserted into lane 0, one vector fmul and one vector
; store to index iv. The fifth step stays scalar and takes lane 3 of the
; loaded vector through extractelement.
340 define void @good_load_order() {
341 ; CHECK-LABEL: @good_load_order(
343 ; CHECK-NEXT: br label [[FOR_COND1_PREHEADER:%.*]]
344 ; CHECK: for.cond1.preheader:
345 ; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr @a, align 16
346 ; CHECK-NEXT: br label [[FOR_BODY3:%.*]]
348 ; CHECK-NEXT: [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP12:%.*]], [[FOR_BODY3]] ]
349 ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ]
350 ; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32
351 ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 1
352 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP3]]
353 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[INDVARS_IV]] to i32
354 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP4]]
355 ; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[INDVARS_IV]] to i32
356 ; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 4
357 ; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP6]]
358 ; CHECK-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
359 ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 2>
360 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP1]], i64 0
361 ; CHECK-NEXT: [[TMP10:%.*]] = fmul <4 x float> [[TMP7]], [[TMP9]]
362 ; CHECK-NEXT: store <4 x float> [[TMP10]], ptr [[ARRAYIDX5]], align 4
363 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5
364 ; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
365 ; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP11]]
366 ; CHECK-NEXT: [[TMP12]] = load float, ptr [[ARRAYIDX41]], align 4
367 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP7]], i64 3
368 ; CHECK-NEXT: [[MUL45:%.*]] = fmul float [[TMP12]], [[TMP13]]
369 ; CHECK-NEXT: store float [[MUL45]], ptr [[ARRAYIDX31]], align 4
370 ; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
371 ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP14]], 31995
372 ; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_END:%.*]]
374 ; CHECK-NEXT: ret void
376 ; SSE2-LABEL: @good_load_order(
378 ; SSE2-NEXT: br label [[FOR_COND1_PREHEADER:%.*]]
379 ; SSE2: for.cond1.preheader:
380 ; SSE2-NEXT: [[TMP0:%.*]] = load float, ptr @a, align 16
381 ; SSE2-NEXT: br label [[FOR_BODY3:%.*]]
383 ; SSE2-NEXT: [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP12:%.*]], [[FOR_BODY3]] ]
384 ; SSE2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ]
385 ; SSE2-NEXT: [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32
386 ; SSE2-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 1
387 ; SSE2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP3]]
388 ; SSE2-NEXT: [[TMP4:%.*]] = trunc i64 [[INDVARS_IV]] to i32
389 ; SSE2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP4]]
390 ; SSE2-NEXT: [[TMP5:%.*]] = trunc i64 [[INDVARS_IV]] to i32
391 ; SSE2-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 4
392 ; SSE2-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP6]]
393 ; SSE2-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
394 ; SSE2-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 2>
395 ; SSE2-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP1]], i64 0
396 ; SSE2-NEXT: [[TMP10:%.*]] = fmul <4 x float> [[TMP7]], [[TMP9]]
397 ; SSE2-NEXT: store <4 x float> [[TMP10]], ptr [[ARRAYIDX5]], align 4
398 ; SSE2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5
399 ; SSE2-NEXT: [[TMP11:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
400 ; SSE2-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP11]]
401 ; SSE2-NEXT: [[TMP12]] = load float, ptr [[ARRAYIDX41]], align 4
402 ; SSE2-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP7]], i64 3
403 ; SSE2-NEXT: [[MUL45:%.*]] = fmul float [[TMP12]], [[TMP13]]
404 ; SSE2-NEXT: store float [[MUL45]], ptr [[ARRAYIDX31]], align 4
405 ; SSE2-NEXT: [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
406 ; SSE2-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP14]], 31995
407 ; SSE2-NEXT: br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_END:%.*]]
409 ; SSE2-NEXT: ret void
412 br label %for.cond1.preheader
; Initial carried value: the first element of @a.
415 %0 = load float, ptr @a, align 16
; %1 carries the last element loaded by the previous iteration (%10), or %0
; on the first entry from the preheader.
419 %1 = phi float [ %0, %for.cond1.preheader ], [ %10, %for.body3 ]
420 %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ]
; Step 1: a[iv] = a[iv+1] * carried value.
421 %2 = add nsw i64 %indvars.iv, 1
422 %arrayidx = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %2
423 %3 = load float, ptr %arrayidx, align 4
424 %arrayidx5 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv
425 %mul6 = fmul float %3, %1
426 store float %mul6, ptr %arrayidx5, align 4
; Step 2: a[iv+1] = a[iv+2] * a[iv+1] (value loaded before the store).
427 %4 = add nsw i64 %indvars.iv, 2
428 %arrayidx11 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %4
429 %5 = load float, ptr %arrayidx11, align 4
430 %mul15 = fmul float %5, %3
431 store float %mul15, ptr %arrayidx, align 4
; Step 3: a[iv+2] = a[iv+3] * a[iv+2].
432 %6 = add nsw i64 %indvars.iv, 3
433 %arrayidx21 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %6
434 %7 = load float, ptr %arrayidx21, align 4
435 %mul25 = fmul float %7, %5
436 store float %mul25, ptr %arrayidx11, align 4
; Step 4: a[iv+3] = a[iv+4] * a[iv+3].
437 %8 = add nsw i64 %indvars.iv, 4
438 %arrayidx31 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %8
439 %9 = load float, ptr %arrayidx31, align 4
440 %mul35 = fmul float %9, %7
441 store float %mul35, ptr %arrayidx21, align 4
; Step 5: a[iv+4] = a[iv+5] * a[iv+4]; %10 is carried into the next iteration.
442 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 5
443 %arrayidx41 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv.next
444 %10 = load float, ptr %arrayidx41, align 4
445 %mul45 = fmul float %10, %9
446 store float %mul45, ptr %arrayidx31, align 4
; Continue while the truncated next index is below 31995.
447 %11 = trunc i64 %indvars.iv.next to i32
448 %cmp2 = icmp slt i32 %11, 31995
449 br i1 %cmp2, label %for.body3, label %for.end
455 ; Check vectorization of the following code for the double data type:
457 ; c[1] = b[1]+a[1]; // swapped b[1] and a[1]
; Lane 0 adds (load from %a, load from %b); lane 1 adds (load from %b, load
; from %a), i.e. the operand order is swapped. Both RUN lines expect one
; <2 x double> load from each pointer, a single vector fadd and a single
; vector store to %c.
459 define void @load_reorder_double(ptr nocapture %c, ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b){
460 ; CHECK-LABEL: @load_reorder_double(
461 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B:%.*]], align 4
462 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[A:%.*]], align 4
463 ; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
464 ; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[C:%.*]], align 4
465 ; CHECK-NEXT: ret void
467 ; SSE2-LABEL: @load_reorder_double(
468 ; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B:%.*]], align 4
469 ; SSE2-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[A:%.*]], align 4
470 ; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
471 ; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[C:%.*]], align 4
472 ; SSE2-NEXT: ret void
; Element 0: the %a value is the left fadd operand.
474 %1 = load double, ptr %a
475 %2 = load double, ptr %b
476 %3 = fadd double %1, %2
477 store double %3, ptr %c
; Element 1: the %b value is the left fadd operand.
478 %4 = getelementptr inbounds double, ptr %b, i64 1
479 %5 = load double, ptr %4
480 %6 = getelementptr inbounds double, ptr %a, i64 1
481 %7 = load double, ptr %6
482 %8 = fadd double %5, %7
483 %9 = getelementptr inbounds double, ptr %c, i64 1
484 store double %8, ptr %9
488 ; Check vectorization of the following code for the float data type:
490 ; c[1] = b[1]+a[1]; // swapped b[1] and a[1]
; Four-lane float version: elements 0, 2 and 3 add (load from %a, load from
; %b), while element 1 adds them in the swapped order. Both RUN lines expect
; one <4 x float> load from each pointer, a single vector fadd and a single
; vector store to %c.
494 define void @load_reorder_float(ptr nocapture %c, ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b){
495 ; CHECK-LABEL: @load_reorder_float(
496 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4
497 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
498 ; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
499 ; CHECK-NEXT: store <4 x float> [[TMP3]], ptr [[C:%.*]], align 4
500 ; CHECK-NEXT: ret void
502 ; SSE2-LABEL: @load_reorder_float(
503 ; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4
504 ; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
505 ; SSE2-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
506 ; SSE2-NEXT: store <4 x float> [[TMP3]], ptr [[C:%.*]], align 4
507 ; SSE2-NEXT: ret void
; Element 0: the %a value is the left fadd operand.
509 %1 = load float, ptr %a
510 %2 = load float, ptr %b
511 %3 = fadd float %1, %2
512 store float %3, ptr %c
; Element 1: the %b value is the left fadd operand.
513 %4 = getelementptr inbounds float, ptr %b, i64 1
514 %5 = load float, ptr %4
515 %6 = getelementptr inbounds float, ptr %a, i64 1
516 %7 = load float, ptr %6
517 %8 = fadd float %5, %7
518 %9 = getelementptr inbounds float, ptr %c, i64 1
519 store float %8, ptr %9
; Element 2: the %a value is the left fadd operand.
520 %10 = getelementptr inbounds float, ptr %a, i64 2
521 %11 = load float, ptr %10
522 %12 = getelementptr inbounds float, ptr %b, i64 2
523 %13 = load float, ptr %12
524 %14 = fadd float %11, %13
525 %15 = getelementptr inbounds float, ptr %c, i64 2
526 store float %14, ptr %15
; Element 3: the %a value is the left fadd operand.
527 %16 = getelementptr inbounds float, ptr %a, i64 3
528 %17 = load float, ptr %16
529 %18 = getelementptr inbounds float, ptr %b, i64 3
530 %19 = load float, ptr %18
531 %20 = fadd float %17, %19
532 %21 = getelementptr inbounds float, ptr %c, i64 3
533 store float %20, ptr %21
537 ; Check that we properly reorder the code below so that it gets vectorized optimally:
538 ; a[0] = (b[0]+c[0])+d[0];
539 ; a[1] = d[1]+(b[1]+c[1]);
540 ; a[2] = (b[2]+c[2])+d[2];
541 ; a[3] = (b[3]+c[3])+d[3];
; Every element computes (b[i] + c[i]) combined with d[i]. In elements 0, 2
; and 3 the inner sum is the left operand of the outer fadd; in element 1 the
; d[1] load is on the left and the inner sum on the right. Both RUN lines
; expect three <4 x float> loads, two vector fadds and one vector store to %a.
543 define void @opcode_reorder(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, ptr noalias nocapture readonly %c,ptr noalias nocapture readonly %d) {
544 ; CHECK-LABEL: @opcode_reorder(
545 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
546 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[C:%.*]], align 4
547 ; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
548 ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[D:%.*]], align 4
549 ; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP4]], [[TMP3]]
550 ; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[A:%.*]], align 4
551 ; CHECK-NEXT: ret void
553 ; SSE2-LABEL: @opcode_reorder(
554 ; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
555 ; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[C:%.*]], align 4
556 ; SSE2-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
557 ; SSE2-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[D:%.*]], align 4
558 ; SSE2-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP4]], [[TMP3]]
559 ; SSE2-NEXT: store <4 x float> [[TMP5]], ptr [[A:%.*]], align 4
560 ; SSE2-NEXT: ret void
; Element 0: (b[0] + c[0]) + d[0].
562 %1 = load float, ptr %b
563 %2 = load float, ptr %c
564 %3 = fadd float %1, %2
565 %4 = load float, ptr %d
566 %5 = fadd float %3, %4
567 store float %5, ptr %a
; Element 1: d[1] + (b[1] + c[1]) -- outer operands swapped.
568 %6 = getelementptr inbounds float, ptr %d, i64 1
569 %7 = load float, ptr %6
570 %8 = getelementptr inbounds float, ptr %b, i64 1
571 %9 = load float, ptr %8
572 %10 = getelementptr inbounds float, ptr %c, i64 1
573 %11 = load float, ptr %10
574 %12 = fadd float %9, %11
575 %13 = fadd float %7, %12
576 %14 = getelementptr inbounds float, ptr %a, i64 1
577 store float %13, ptr %14
; Element 2: (b[2] + c[2]) + d[2].
578 %15 = getelementptr inbounds float, ptr %b, i64 2
579 %16 = load float, ptr %15
580 %17 = getelementptr inbounds float, ptr %c, i64 2
581 %18 = load float, ptr %17
582 %19 = fadd float %16, %18
583 %20 = getelementptr inbounds float, ptr %d, i64 2
584 %21 = load float, ptr %20
585 %22 = fadd float %19, %21
586 %23 = getelementptr inbounds float, ptr %a, i64 2
587 store float %22, ptr %23
; Element 3: (b[3] + c[3]) + d[3].
588 %24 = getelementptr inbounds float, ptr %b, i64 3
589 %25 = load float, ptr %24
590 %26 = getelementptr inbounds float, ptr %c, i64 3
591 %27 = load float, ptr %26
592 %28 = fadd float %25, %27
593 %29 = getelementptr inbounds float, ptr %d, i64 3
594 %30 = load float, ptr %29
595 %31 = fadd float %28, %30
596 %32 = getelementptr inbounds float, ptr %a, i64 3
597 store float %31, ptr %32