; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx | FileCheck %s --check-prefixes=CHECK,AVX

; This file tests the look-ahead operand reordering heuristic.
;
; This checks that operand reordering will reorder the operands of the adds
; by taking into consideration the instructions beyond the immediate
; predecessors.
;
; A[0] B[0] C[0] D[0] C[1] D[1] A[1] B[1]
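;
; Roughly the scalar computation being vectorized (an illustrative C-like
; sketch only; A, B, C and D stand for the loads from array[0..1],
; array[2..3], array[4..5] and array[6..7]):
;
;   array[0] = (array[0] - array[2]) + (array[4] - array[6]);
;   array[1] = (array[5] - array[7]) + (array[1] - array[3]);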
define void @lookahead_basic(ptr %array) {
; CHECK-LABEL: @lookahead_basic(
; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds double, ptr [[ARRAY:%.*]], i64 2
; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 4
; CHECK-NEXT:    [[IDX6:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 6
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY]], align 8
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[IDX2]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[IDX4]], align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[IDX6]], align 8
; CHECK-NEXT:    [[TMP4:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP5:%.*]] = fsub fast <2 x double> [[TMP2]], [[TMP3]]
; CHECK-NEXT:    [[TMP6:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP4]]
; CHECK-NEXT:    store <2 x double> [[TMP6]], ptr [[ARRAY]], align 8
; CHECK-NEXT:    ret void
;
  %idx1 = getelementptr inbounds double, ptr %array, i64 1
  %idx2 = getelementptr inbounds double, ptr %array, i64 2
  %idx3 = getelementptr inbounds double, ptr %array, i64 3
  %idx4 = getelementptr inbounds double, ptr %array, i64 4
  %idx5 = getelementptr inbounds double, ptr %array, i64 5
  %idx6 = getelementptr inbounds double, ptr %array, i64 6
  %idx7 = getelementptr inbounds double, ptr %array, i64 7

  %A_0 = load double, ptr %array, align 8
  %A_1 = load double, ptr %idx1, align 8
  %B_0 = load double, ptr %idx2, align 8
  %B_1 = load double, ptr %idx3, align 8
  %C_0 = load double, ptr %idx4, align 8
  %C_1 = load double, ptr %idx5, align 8
  %D_0 = load double, ptr %idx6, align 8
  %D_1 = load double, ptr %idx7, align 8

  %subAB_0 = fsub fast double %A_0, %B_0
  %subCD_0 = fsub fast double %C_0, %D_0

  %subAB_1 = fsub fast double %A_1, %B_1
  %subCD_1 = fsub fast double %C_1, %D_1

  %addABCD_0 = fadd fast double %subAB_0, %subCD_0
  %addCDAB_1 = fadd fast double %subCD_1, %subAB_1

  store double %addABCD_0, ptr %array, align 8
  store double %addCDAB_1, ptr %idx1, align 8
  ret void
}

; Check whether the look-ahead operand reordering heuristic will avoid
; bundling the alt opcodes. The vectorized code should have no shuffles.
;
; A[0] B[0] A[0] B[0] A[1] B[1] A[1] B[1]
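;
; Roughly the scalar computation (an illustrative C-like sketch; A and B
; stand for the loads from array[0..1] and array[2..3]):
;
;   array[0] = (array[0] + array[2]) + (array[0] - array[2]);
;   array[1] = (array[1] - array[3]) + (array[1] + array[3]);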
define void @lookahead_alt1(ptr %array) {
; CHECK-LABEL: @lookahead_alt1(
; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds double, ptr [[ARRAY:%.*]], i64 2
; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 4
; CHECK-NEXT:    [[IDX5:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 5
; CHECK-NEXT:    [[IDX6:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 6
; CHECK-NEXT:    [[IDX7:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 7
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY]], align 8
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[IDX2]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP3:%.*]] = fadd fast <2 x double> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP2]]
; CHECK-NEXT:    store <2 x double> [[TMP4]], ptr [[ARRAY]], align 8
; CHECK-NEXT:    ret void
;
  %idx1 = getelementptr inbounds double, ptr %array, i64 1
  %idx2 = getelementptr inbounds double, ptr %array, i64 2
  %idx3 = getelementptr inbounds double, ptr %array, i64 3
  %idx4 = getelementptr inbounds double, ptr %array, i64 4
  %idx5 = getelementptr inbounds double, ptr %array, i64 5
  %idx6 = getelementptr inbounds double, ptr %array, i64 6
  %idx7 = getelementptr inbounds double, ptr %array, i64 7

  %A_0 = load double, ptr %array, align 8
  %A_1 = load double, ptr %idx1, align 8
  %B_0 = load double, ptr %idx2, align 8
  %B_1 = load double, ptr %idx3, align 8

  %addAB_0_L = fadd fast double %A_0, %B_0
  %subAB_0_R = fsub fast double %A_0, %B_0

  %subAB_1_L = fsub fast double %A_1, %B_1
  %addAB_1_R = fadd fast double %A_1, %B_1

  %addABCD_0 = fadd fast double %addAB_0_L, %subAB_0_R
  %addCDAB_1 = fadd fast double %subAB_1_L, %addAB_1_R

  store double %addABCD_0, ptr %array, align 8
  store double %addCDAB_1, ptr %idx1, align 8
  ret void
}

; This code should get vectorized all the way to the loads with shuffles for
; the alt opcodes.
;
; A[0] B[0] C[0] D[0] C[1] D[1] A[1] B[1]
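;
; Roughly the scalar computation (an illustrative C-like sketch; A, B, C
; and D stand for the loads from array[0..1], array[2..3], array[4..5] and
; array[6..7]):
;
;   array[0] = (array[0] + array[2]) + (array[4] - array[6]);
;   array[1] = (array[5] + array[7]) + (array[1] - array[3]);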
define void @lookahead_alt2(ptr %array) {
; CHECK-LABEL: @lookahead_alt2(
; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds double, ptr [[ARRAY:%.*]], i64 2
; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 4
; CHECK-NEXT:    [[IDX6:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 6
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY]], align 8
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[IDX2]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[IDX4]], align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[IDX6]], align 8
; CHECK-NEXT:    [[TMP4:%.*]] = fsub fast <2 x double> [[TMP2]], [[TMP3]]
; CHECK-NEXT:    [[TMP5:%.*]] = fadd fast <2 x double> [[TMP2]], [[TMP3]]
; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP5]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <2 x double> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP8]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP9]]
; CHECK-NEXT:    store <2 x double> [[TMP10]], ptr [[ARRAY]], align 8
; CHECK-NEXT:    ret void
;
  %idx1 = getelementptr inbounds double, ptr %array, i64 1
  %idx2 = getelementptr inbounds double, ptr %array, i64 2
  %idx3 = getelementptr inbounds double, ptr %array, i64 3
  %idx4 = getelementptr inbounds double, ptr %array, i64 4
  %idx5 = getelementptr inbounds double, ptr %array, i64 5
  %idx6 = getelementptr inbounds double, ptr %array, i64 6
  %idx7 = getelementptr inbounds double, ptr %array, i64 7

  %A_0 = load double, ptr %array, align 8
  %A_1 = load double, ptr %idx1, align 8
  %B_0 = load double, ptr %idx2, align 8
  %B_1 = load double, ptr %idx3, align 8
  %C_0 = load double, ptr %idx4, align 8
  %C_1 = load double, ptr %idx5, align 8
  %D_0 = load double, ptr %idx6, align 8
  %D_1 = load double, ptr %idx7, align 8

  %addAB_0 = fadd fast double %A_0, %B_0
  %subCD_0 = fsub fast double %C_0, %D_0

  %addCD_1 = fadd fast double %C_1, %D_1
  %subAB_1 = fsub fast double %A_1, %B_1

  %addABCD_0 = fadd fast double %addAB_0, %subCD_0
  %addCDAB_1 = fadd fast double %addCD_1, %subAB_1

  store double %addABCD_0, ptr %array, align 8
  store double %addCDAB_1, ptr %idx1, align 8
  ret void
}

; A[0] B[0] C[0] D[0] A[1] B[2] A[2] B[1]
;
; SLP should reorder the operands of the RHS add, taking into consideration
; the cost of external uses. It is more profitable to reorder the operands
; of the RHS add, because A[1] has an external use.
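;
; Roughly the scalar computation (an illustrative C-like sketch over the
; pointer arguments):
;
;   S[0] = (A[0] - B[0]) + (C[0] - D[0]);
;   S[1] = (A[1] - B[2]) + (A[2] - B[1]);
;   *Ext1 = A[1];   /* the external use */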
define void @lookahead_external_uses(ptr %A, ptr %B, ptr %C, ptr %D, ptr %S, ptr %Ext1, ptr %Ext2) {
; CHECK-LABEL: @lookahead_external_uses(
; CHECK-NEXT:    [[IDXB2:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i64 2
; CHECK-NEXT:    [[IDXA2:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 2
; CHECK-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, ptr [[B]], i64 1
; CHECK-NEXT:    [[B0:%.*]] = load double, ptr [[B]], align 8
; CHECK-NEXT:    [[C0:%.*]] = load double, ptr [[C:%.*]], align 8
; CHECK-NEXT:    [[D0:%.*]] = load double, ptr [[D:%.*]], align 8
; CHECK-NEXT:    [[B2:%.*]] = load double, ptr [[IDXB2]], align 8
; CHECK-NEXT:    [[A2:%.*]] = load double, ptr [[IDXA2]], align 8
; CHECK-NEXT:    [[B1:%.*]] = load double, ptr [[IDXB1]], align 8
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B2]], i32 1
; CHECK-NEXT:    [[TMP3:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A2]], i32 1
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> poison, double [[D0]], i32 0
; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[B1]], i32 1
; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]]
; CHECK-NEXT:    [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP8]]
; CHECK-NEXT:    store <2 x double> [[TMP9]], ptr [[S:%.*]], align 8
; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP0]], i32 1
; CHECK-NEXT:    store double [[TMP10]], ptr [[EXT1:%.*]], align 8
; CHECK-NEXT:    ret void
;
  %IdxA1 = getelementptr inbounds double, ptr %A, i64 1
  %IdxB2 = getelementptr inbounds double, ptr %B, i64 2
  %IdxA2 = getelementptr inbounds double, ptr %A, i64 2
  %IdxB1 = getelementptr inbounds double, ptr %B, i64 1

  %A0 = load double, ptr %A, align 8
  %B0 = load double, ptr %B, align 8
  %C0 = load double, ptr %C, align 8
  %D0 = load double, ptr %D, align 8

  %A1 = load double, ptr %IdxA1, align 8
  %B2 = load double, ptr %IdxB2, align 8
  %A2 = load double, ptr %IdxA2, align 8
  %B1 = load double, ptr %IdxB1, align 8

  %subA0B0 = fsub fast double %A0, %B0
  %subC0D0 = fsub fast double %C0, %D0

  %subA1B2 = fsub fast double %A1, %B2
  %subA2B1 = fsub fast double %A2, %B1

  %add0 = fadd fast double %subA0B0, %subC0D0
  %add1 = fadd fast double %subA1B2, %subA2B1

  %IdxS1 = getelementptr inbounds double, ptr %S, i64 1

  store double %add0, ptr %S, align 8
  store double %add1, ptr %IdxS1, align 8

  store double %A1, ptr %Ext1, align 8
  ret void
}

; A[0] B[0] C[0] D[0] A[1] B[2] A[2] B[1]
; (dependence diagram elided: A[1] has three external users U1, U2 and U3,
;  while B[1] has two external users U4 and U5)
;
; If we limit the users budget for the look-ahead heuristic to 2, then the
; look-ahead heuristic has no way of choosing B[1] (with 2 external users)
; over A[1] (with 3 external users).
; The result is that the operands of the Add are not reordered and the loads
; from A get vectorized instead of the loads from B.
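;
; Roughly the scalar computation (an illustrative C-like sketch):
;
;   S[0] = (A[0] - B[0]) + (C[0] - D[0]);
;   S[1] = (A[1] - B[2]) + (A[2] - B[1]);
;   *Ext1 = *Ext2 = *Ext3 = A[1];   /* 3 external users */
;   *Ext4 = *Ext5 = B[1];           /* 2 external users */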
define void @lookahead_limit_users_budget(ptr %A, ptr %B, ptr %C, ptr %D, ptr %S, ptr %Ext1, ptr %Ext2, ptr %Ext3, ptr %Ext4, ptr %Ext5) {
; CHECK-LABEL: @lookahead_limit_users_budget(
; CHECK-NEXT:    [[IDXB2:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i64 2
; CHECK-NEXT:    [[IDXA2:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 2
; CHECK-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, ptr [[B]], i64 1
; CHECK-NEXT:    [[B0:%.*]] = load double, ptr [[B]], align 8
; CHECK-NEXT:    [[C0:%.*]] = load double, ptr [[C:%.*]], align 8
; CHECK-NEXT:    [[D0:%.*]] = load double, ptr [[D:%.*]], align 8
; CHECK-NEXT:    [[B2:%.*]] = load double, ptr [[IDXB2]], align 8
; CHECK-NEXT:    [[A2:%.*]] = load double, ptr [[IDXA2]], align 8
; CHECK-NEXT:    [[B1:%.*]] = load double, ptr [[IDXB1]], align 8
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B2]], i32 1
; CHECK-NEXT:    [[TMP3:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A2]], i32 1
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> poison, double [[D0]], i32 0
; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[B1]], i32 1
; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]]
; CHECK-NEXT:    [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP8]]
; CHECK-NEXT:    store <2 x double> [[TMP9]], ptr [[S:%.*]], align 8
; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP0]], i32 1
; CHECK-NEXT:    store double [[TMP10]], ptr [[EXT1:%.*]], align 8
; CHECK-NEXT:    store double [[TMP10]], ptr [[EXT2:%.*]], align 8
; CHECK-NEXT:    store double [[TMP10]], ptr [[EXT3:%.*]], align 8
; CHECK-NEXT:    store double [[B1]], ptr [[EXT4:%.*]], align 8
; CHECK-NEXT:    store double [[B1]], ptr [[EXT5:%.*]], align 8
; CHECK-NEXT:    ret void
;
  %IdxA1 = getelementptr inbounds double, ptr %A, i64 1
  %IdxB2 = getelementptr inbounds double, ptr %B, i64 2
  %IdxA2 = getelementptr inbounds double, ptr %A, i64 2
  %IdxB1 = getelementptr inbounds double, ptr %B, i64 1

  %A0 = load double, ptr %A, align 8
  %B0 = load double, ptr %B, align 8
  %C0 = load double, ptr %C, align 8
  %D0 = load double, ptr %D, align 8

  %A1 = load double, ptr %IdxA1, align 8
  %B2 = load double, ptr %IdxB2, align 8
  %A2 = load double, ptr %IdxA2, align 8
  %B1 = load double, ptr %IdxB1, align 8

  %subA0B0 = fsub fast double %A0, %B0
  %subC0D0 = fsub fast double %C0, %D0

  %subA1B2 = fsub fast double %A1, %B2
  %subA2B1 = fsub fast double %A2, %B1

  %add0 = fadd fast double %subA0B0, %subC0D0
  %add1 = fadd fast double %subA1B2, %subA2B1

  %IdxS1 = getelementptr inbounds double, ptr %S, i64 1

  store double %add0, ptr %S, align 8
  store double %add1, ptr %IdxS1, align 8

  ; External uses of A1
  store double %A1, ptr %Ext1, align 8
  store double %A1, ptr %Ext2, align 8
  store double %A1, ptr %Ext3, align 8

  ; External uses of B1
  store double %B1, ptr %Ext4, align 8
  store double %B1, ptr %Ext5, align 8
  ret void
}

; This checks that the look-ahead code does not crash when instructions with
; the same opcode have different numbers of operands (in this case the
; calls).

declare double @_ZN1i2ayEv(ptr)
declare double @_ZN1i2axEv()
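
; Roughly the scalar computation (an illustrative C-like sketch; "ay" and
; "ax" stand for the calls to @_ZN1i2ayEv and @_ZN1i2axEv, which take a
; different number of operands):
;
;   S[0] = A[0] + ay(Arg0);
;   S[1] = A[1] + ax();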
define void @lookahead_crash(ptr %A, ptr %S, ptr %Arg0) {
; CHECK-LABEL: @lookahead_crash(
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8
; CHECK-NEXT:    [[C0:%.*]] = call double @_ZN1i2ayEv(ptr [[ARG0:%.*]])
; CHECK-NEXT:    [[C1:%.*]] = call double @_ZN1i2axEv()
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[C1]], i32 1
; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT:    store <2 x double> [[TMP4]], ptr [[S:%.*]], align 8
; CHECK-NEXT:    ret void
;
  %IdxA1 = getelementptr inbounds double, ptr %A, i64 1

  %A0 = load double, ptr %A, align 8
  %A1 = load double, ptr %IdxA1, align 8

  %C0 = call double @_ZN1i2ayEv(ptr %Arg0)
  %C1 = call double @_ZN1i2axEv()

  %add0 = fadd fast double %A0, %C0
  %add1 = fadd fast double %A1, %C1

  %IdxS1 = getelementptr inbounds double, ptr %S, i64 1
  store double %add0, ptr %S, align 8
  store double %add1, ptr %IdxS1, align 8
  ret void
}

; This checks that we choose to group consecutive extracts from the same vectors.
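;
; Roughly the scalar computation (an illustrative C-like sketch; vec1 and
; vec2 stand for the <2 x double> values loaded from %vecPtr1 and %vecPtr2):
;
;   storeArray[0] = vec1[0] * array[0] + vec2[0] * array[1];
;   storeArray[1] = vec1[1] * array[0] + vec2[1] * array[1];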
define void @ChecksExtractScores(ptr %storeArray, ptr %array, ptr %vecPtr1, ptr %vecPtr2) {
; CHECK-LABEL: @ChecksExtractScores(
; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds double, ptr [[ARRAY:%.*]], i64 1
; CHECK-NEXT:    [[LOADA0:%.*]] = load double, ptr [[ARRAY]], align 4
; CHECK-NEXT:    [[LOADA1:%.*]] = load double, ptr [[IDX1]], align 4
; CHECK-NEXT:    [[LOADVEC:%.*]] = load <2 x double>, ptr [[VECPTR1:%.*]], align 4
; CHECK-NEXT:    [[LOADVEC2:%.*]] = load <2 x double>, ptr [[VECPTR2:%.*]], align 4
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[LOADA0]], i32 0
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[LOADVEC]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[LOADA1]], i32 0
; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP6:%.*]] = fmul <2 x double> [[LOADVEC2]], [[TMP5]]
; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP3]], [[TMP6]]
; CHECK-NEXT:    store <2 x double> [[TMP7]], ptr [[STOREARRAY:%.*]], align 8
; CHECK-NEXT:    ret void
;
  %idx1 = getelementptr inbounds double, ptr %array, i64 1
  %loadA0 = load double, ptr %array, align 4
  %loadA1 = load double, ptr %idx1, align 4

  %loadVec = load <2 x double>, ptr %vecPtr1, align 4
  %extrA0 = extractelement <2 x double> %loadVec, i32 0
  %extrA1 = extractelement <2 x double> %loadVec, i32 1
  %loadVec2 = load <2 x double>, ptr %vecPtr2, align 4
  %extrB0 = extractelement <2 x double> %loadVec2, i32 0
  %extrB1 = extractelement <2 x double> %loadVec2, i32 1

  %mul0 = fmul double %extrA0, %loadA0
  %mul1 = fmul double %extrA1, %loadA0
  %mul3 = fmul double %extrB0, %loadA1
  %mul4 = fmul double %extrB1, %loadA1
  %add0 = fadd double %mul0, %mul3
  %add1 = fadd double %mul1, %mul4

  %sidx1 = getelementptr inbounds double, ptr %storeArray, i64 1
  store double %add0, ptr %storeArray, align 8
  store double %add1, ptr %sidx1, align 8
  ret void
}

define i1 @ExtractIdxNotConstantInt1(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) {
; SSE-LABEL: @ExtractIdxNotConstantInt1(
; SSE-NEXT:    [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 undef
; SSE-NEXT:    [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
; SSE-NEXT:    [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]]
; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0
; SSE-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1
; SSE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[SUB14_I167]], i32 0
; SSE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_I276_I169]], i32 1
; SSE-NEXT:    [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
; SSE-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+01>, float [[B:%.*]], i32 0
; SSE-NEXT:    [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]]
; SSE-NEXT:    [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], <float 1.000000e+01, float 2.000000e+00>
; SSE-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
; SSE-NEXT:    [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
; SSE-NEXT:    [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]]
; SSE-NEXT:    [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
; SSE-NEXT:    ret i1 [[CMP_I185]]
;
; AVX-LABEL: @ExtractIdxNotConstantInt1(
; AVX-NEXT:    [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 undef
; AVX-NEXT:    [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
; AVX-NEXT:    [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]]
; AVX-NEXT:    [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]]
; AVX-NEXT:    [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]]
; AVX-NEXT:    [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01
; AVX-NEXT:    [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]]
; AVX-NEXT:    [[ADD78_I180:%.*]] = fsub float [[MUL72_I179]], 3.000000e+01
; AVX-NEXT:    [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]]
; AVX-NEXT:    [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]]
; AVX-NEXT:    [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
; AVX-NEXT:    ret i1 [[CMP_I185]]
;
  %vecext.i291.i166 = extractelement <4 x float> %vec, i64 undef
  %sub14.i167 = fsub float undef, %vecext.i291.i166
  %fm = fmul float %a, %sub14.i167
  %sub25.i168 = fsub float %fm, %b
  %vecext.i276.i169 = extractelement <4 x float> %vec, i64 %idx2
  %add36.i173 = fadd float %sub25.i168, 10.0
  %mul72.i179 = fmul float %c, %vecext.i276.i169
  %add78.i180 = fsub float %mul72.i179, 30.0
  %add79.i181 = fadd float 2.0, %add78.i180
  %mul123.i184 = fmul float %add36.i173, %add79.i181
  %cmp.i185 = fcmp ogt float %mul123.i184, 0.000000e+00
  ret i1 %cmp.i185
}

define i1 @ExtractIdxNotConstantInt2(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) {
; SSE-LABEL: @ExtractIdxNotConstantInt2(
; SSE-NEXT:    [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 1
; SSE-NEXT:    [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
; SSE-NEXT:    [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]]
; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0
; SSE-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1
; SSE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[SUB14_I167]], i32 0
; SSE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_I276_I169]], i32 1
; SSE-NEXT:    [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
; SSE-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+01>, float [[B:%.*]], i32 0
; SSE-NEXT:    [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]]
; SSE-NEXT:    [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], <float 1.000000e+01, float 2.000000e+00>
; SSE-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
; SSE-NEXT:    [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
; SSE-NEXT:    [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]]
; SSE-NEXT:    [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
; SSE-NEXT:    ret i1 [[CMP_I185]]
;
; AVX-LABEL: @ExtractIdxNotConstantInt2(
; AVX-NEXT:    [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 1
; AVX-NEXT:    [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
; AVX-NEXT:    [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]]
; AVX-NEXT:    [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]]
; AVX-NEXT:    [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]]
; AVX-NEXT:    [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01
; AVX-NEXT:    [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]]
; AVX-NEXT:    [[ADD78_I180:%.*]] = fsub float [[MUL72_I179]], 3.000000e+01
; AVX-NEXT:    [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]]
; AVX-NEXT:    [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]]
; AVX-NEXT:    [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
; AVX-NEXT:    ret i1 [[CMP_I185]]
;
  %vecext.i291.i166 = extractelement <4 x float> %vec, i64 1
  %sub14.i167 = fsub float undef, %vecext.i291.i166
  %fm = fmul float %a, %sub14.i167
  %sub25.i168 = fsub float %fm, %b
  %vecext.i276.i169 = extractelement <4 x float> %vec, i64 %idx2
  %add36.i173 = fadd float %sub25.i168, 10.0
  %mul72.i179 = fmul float %c, %vecext.i276.i169
  %add78.i180 = fsub float %mul72.i179, 30.0
  %add79.i181 = fadd float 2.0, %add78.i180
  %mul123.i184 = fmul float %add36.i173, %add79.i181
  %cmp.i185 = fcmp ogt float %mul123.i184, 0.000000e+00
  ret i1 %cmp.i185
}

define i1 @foo(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) {
; CHECK-LABEL: @foo(
; CHECK-NEXT:    [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0
; CHECK-NEXT:    [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1
; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[VEC]], <4 x float> poison, <2 x i32> <i32 poison, i32 1>
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[SUB14_I167]], i32 0
; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+01>, float [[B:%.*]], i32 0
; CHECK-NEXT:    [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]]
; CHECK-NEXT:    [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], <float 1.000000e+01, float 2.000000e+00>
; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
; CHECK-NEXT:    [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]]
; CHECK-NEXT:    [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
; CHECK-NEXT:    ret i1 [[CMP_I185]]
;
  %vecext.i291.i166 = extractelement <4 x float> %vec, i64 0
  %sub14.i167 = fsub float undef, %vecext.i291.i166
  %fm = fmul float %a, %sub14.i167
  %sub25.i168 = fsub float %fm, %b
  %vecext.i276.i169 = extractelement <4 x float> %vec, i64 1
  %add36.i173 = fadd float %sub25.i168, 10.0
  %mul72.i179 = fmul float %c, %vecext.i276.i169
  %add78.i180 = fsub float %mul72.i179, 30.0
  %add79.i181 = fadd float 2.0, %add78.i180
  %mul123.i184 = fmul float %add36.i173, %add79.i181
  %cmp.i185 = fcmp ogt float %mul123.i184, 0.000000e+00
  ret i1 %cmp.i185
}

; Same as @ChecksExtractScores, but the extractelement vector operands do not match.
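;
; Roughly the scalar computation (an illustrative C-like sketch; vec1..vec4
; stand for the <2 x double> values loaded from %vecPtr1..%vecPtr4, so each
; extract now comes from a different vector):
;
;   storeArray[0] = vec1[0] * array[0] + vec3[0] * array[1];
;   storeArray[1] = vec2[1] * array[0] + vec4[1] * array[1];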
define void @ChecksExtractScores_different_vectors(ptr %storeArray, ptr %array, ptr %vecPtr1, ptr %vecPtr2, ptr %vecPtr3, ptr %vecPtr4) {
; SSE-LABEL: @ChecksExtractScores_different_vectors(
; SSE-NEXT:    [[LOADVEC:%.*]] = load <2 x double>, ptr [[VECPTR1:%.*]], align 4
; SSE-NEXT:    [[LOADVEC2:%.*]] = load <2 x double>, ptr [[VECPTR2:%.*]], align 4
; SSE-NEXT:    [[LOADVEC3:%.*]] = load <2 x double>, ptr [[VECPTR3:%.*]], align 4
; SSE-NEXT:    [[LOADVEC4:%.*]] = load <2 x double>, ptr [[VECPTR4:%.*]], align 4
; SSE-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAY:%.*]], align 4
; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[LOADVEC2]], <2 x double> [[LOADVEC3]], <2 x i32> <i32 1, i32 2>
; SSE-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], [[TMP1]]
; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[LOADVEC]], <2 x double> [[LOADVEC4]], <2 x i32> <i32 0, i32 3>
; SSE-NEXT:    [[TMP6:%.*]] = fmul <2 x double> [[TMP5]], [[TMP1]]
; SSE-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP4]], [[TMP6]]
; SSE-NEXT:    store <2 x double> [[TMP7]], ptr [[STOREARRAY:%.*]], align 8
; SSE-NEXT:    ret void
;
; AVX-LABEL: @ChecksExtractScores_different_vectors(
; AVX-NEXT:    [[IDX1:%.*]] = getelementptr inbounds double, ptr [[ARRAY:%.*]], i64 1
; AVX-NEXT:    [[LOADA0:%.*]] = load double, ptr [[ARRAY]], align 4
; AVX-NEXT:    [[LOADA1:%.*]] = load double, ptr [[IDX1]], align 4
; AVX-NEXT:    [[LOADVEC:%.*]] = load <2 x double>, ptr [[VECPTR1:%.*]], align 4
; AVX-NEXT:    [[LOADVEC2:%.*]] = load <2 x double>, ptr [[VECPTR2:%.*]], align 4
; AVX-NEXT:    [[LOADVEC3:%.*]] = load <2 x double>, ptr [[VECPTR3:%.*]], align 4
; AVX-NEXT:    [[LOADVEC4:%.*]] = load <2 x double>, ptr [[VECPTR4:%.*]], align 4
; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[LOADVEC]], <2 x double> [[LOADVEC2]], <2 x i32> <i32 0, i32 3>
; AVX-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[LOADA0]], i32 0
; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
; AVX-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]]
; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[LOADVEC3]], <2 x double> [[LOADVEC4]], <2 x i32> <i32 0, i32 3>
; AVX-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> poison, double [[LOADA1]], i32 0
; AVX-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <2 x i32> zeroinitializer
; AVX-NEXT:    [[TMP8:%.*]] = fmul <2 x double> [[TMP5]], [[TMP7]]
; AVX-NEXT:    [[TMP9:%.*]] = fadd <2 x double> [[TMP4]], [[TMP8]]
; AVX-NEXT:    store <2 x double> [[TMP9]], ptr [[STOREARRAY:%.*]], align 8
; AVX-NEXT:    ret void
;
  %idx1 = getelementptr inbounds double, ptr %array, i64 1
  %loadA0 = load double, ptr %array, align 4
  %loadA1 = load double, ptr %idx1, align 4

  %loadVec = load <2 x double>, ptr %vecPtr1, align 4
  %loadVec2 = load <2 x double>, ptr %vecPtr2, align 4
  %extrA0 = extractelement <2 x double> %loadVec, i32 0
  %extrA1 = extractelement <2 x double> %loadVec2, i32 1
  %loadVec3 = load <2 x double>, ptr %vecPtr3, align 4
  %loadVec4 = load <2 x double>, ptr %vecPtr4, align 4
  %extrB0 = extractelement <2 x double> %loadVec3, i32 0
  %extrB1 = extractelement <2 x double> %loadVec4, i32 1

  %mul0 = fmul double %extrA0, %loadA0
  %mul1 = fmul double %extrA1, %loadA0
  %mul3 = fmul double %extrB0, %loadA1
  %mul4 = fmul double %extrB1, %loadA1
  %add0 = fadd double %mul0, %mul3
  %add1 = fadd double %mul1, %mul4

  %sidx1 = getelementptr inbounds double, ptr %storeArray, i64 1
  store double %add0, ptr %storeArray, align 8
  store double %add1, ptr %sidx1, align 8
  ret void
}

; This checks that we prefer splats rather than reverse load vectors + shuffles.
; 2-wide splat loads on x86 use a single instruction, so they are quite cheap.
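;
; Roughly the scalar computation (an illustrative C-like sketch; each
; element of array2 multiplies both elements of array1, which is what makes
; splatting the array2 elements attractive):
;
;   res = (array1[0] * array2[0] + array1[0] * array2[1])
;       + (array1[1] * array2[0] + array1[1] * array2[1]);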
define double @splat_loads(ptr %array1, ptr %array2, ptr %ptrA, ptr %ptrB) {
; SSE-LABEL: @splat_loads(
; SSE-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY1:%.*]], align 8
; SSE-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAY2:%.*]], align 8
; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
; SSE-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP0]], [[TMP2]]
; SSE-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP0]], [[TMP1]]
; SSE-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP3]], [[TMP4]]
; SSE-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
; SSE-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
; SSE-NEXT:    [[ADD3:%.*]] = fadd double [[TMP6]], [[TMP7]]
; SSE-NEXT:    ret double [[ADD3]]
;
; AVX-LABEL: @splat_loads(
; AVX-NEXT:    [[GEP_2_1:%.*]] = getelementptr inbounds double, ptr [[ARRAY2:%.*]], i64 1
; AVX-NEXT:    [[LD_2_0:%.*]] = load double, ptr [[ARRAY2]], align 8
; AVX-NEXT:    [[LD_2_1:%.*]] = load double, ptr [[GEP_2_1]], align 8
; AVX-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY1:%.*]], align 8
; AVX-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[LD_2_0]], i32 0
; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer
; AVX-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP0]], [[TMP2]]
; AVX-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[LD_2_1]], i32 0
; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
; AVX-NEXT:    [[TMP6:%.*]] = fmul <2 x double> [[TMP0]], [[TMP5]]
; AVX-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP3]], [[TMP6]]
; AVX-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
; AVX-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
; AVX-NEXT:    [[ADD3:%.*]] = fadd double [[TMP8]], [[TMP9]]
; AVX-NEXT:    ret double [[ADD3]]
;
  %gep_1_1 = getelementptr inbounds double, ptr %array1, i64 1
  %ld_1_0 = load double, ptr %array1, align 8
  %ld_1_1 = load double, ptr %gep_1_1, align 8

  %gep_2_1 = getelementptr inbounds double, ptr %array2, i64 1
  %ld_2_0 = load double, ptr %array2, align 8
  %ld_2_1 = load double, ptr %gep_2_1, align 8

  %mul1 = fmul double %ld_1_0, %ld_2_0
  %mul2 = fmul double %ld_1_1, %ld_2_0

  %mul3 = fmul double %ld_1_0, %ld_2_1
  %mul4 = fmul double %ld_1_1, %ld_2_1

  %add1 = fadd double %mul1, %mul3
  %add2 = fadd double %mul2, %mul4

  %add3 = fadd double %add1, %add2
  ret double %add3
}

; Same as splat_loads() but the splat load has internal uses in the SLP graph.
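;
; Roughly the scalar computation (an illustrative C-like sketch; the extra
; subtractions of array2[0] give the broadcast of %ld_2_0 an internal user
; in the SLP graph):
;
;   res = (array1[0] * array2[0] + array1[0] * array2[1] - array2[0])
;       + (array1[1] * array2[0] + array1[1] * array2[1] - array2[0]);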
define double @splat_loads_with_internal_uses(ptr %array1, ptr %array2, ptr %ptrA, ptr %ptrB) {
; SSE-LABEL: @splat_loads_with_internal_uses(
; SSE-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY1:%.*]], align 8
; SSE-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAY2:%.*]], align 8
; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
; SSE-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP0]], [[TMP2]]
; SSE-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP0]], [[TMP1]]
; SSE-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP3]], [[TMP4]]
; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer
; SSE-NEXT:    [[TMP7:%.*]] = fsub <2 x double> [[TMP5]], [[TMP6]]
; SSE-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
; SSE-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
; SSE-NEXT:    [[RES:%.*]] = fadd double [[TMP8]], [[TMP9]]
; SSE-NEXT:    ret double [[RES]]
;
; AVX-LABEL: @splat_loads_with_internal_uses(
; AVX-NEXT:    [[GEP_2_1:%.*]] = getelementptr inbounds double, ptr [[ARRAY2:%.*]], i64 1
; AVX-NEXT:    [[LD_2_0:%.*]] = load double, ptr [[ARRAY2]], align 8
; AVX-NEXT:    [[LD_2_1:%.*]] = load double, ptr [[GEP_2_1]], align 8
; AVX-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY1:%.*]], align 8
; AVX-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[LD_2_0]], i32 0
; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer
; AVX-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP0]], [[TMP2]]
; AVX-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[LD_2_1]], i32 0
; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
; AVX-NEXT:    [[TMP6:%.*]] = fmul <2 x double> [[TMP0]], [[TMP5]]
; AVX-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP3]], [[TMP6]]
; AVX-NEXT:    [[TMP8:%.*]] = fsub <2 x double> [[TMP7]], [[TMP2]]
; AVX-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
; AVX-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP8]], i32 1
; AVX-NEXT:    [[RES:%.*]] = fadd double [[TMP9]], [[TMP10]]
; AVX-NEXT:    ret double [[RES]]
;
  %gep_1_1 = getelementptr inbounds double, ptr %array1, i64 1
  %ld_1_0 = load double, ptr %array1, align 8
  %ld_1_1 = load double, ptr %gep_1_1, align 8

  %gep_2_1 = getelementptr inbounds double, ptr %array2, i64 1
  %ld_2_0 = load double, ptr %array2, align 8
  %ld_2_1 = load double, ptr %gep_2_1, align 8

  %mul1 = fmul double %ld_1_0, %ld_2_0
  %mul2 = fmul double %ld_1_1, %ld_2_0

  %mul3 = fmul double %ld_1_0, %ld_2_1
  %mul4 = fmul double %ld_1_1, %ld_2_1

  %add1 = fadd double %mul1, %mul3
  %add2 = fadd double %mul2, %mul4

  ; One more user for the broadcast of %ld_2_0
  %sub1 = fsub double %add1, %ld_2_0
  %sub2 = fsub double %add2, %ld_2_0

  %res = fadd double %sub1, %sub2
  ret double %res
}