; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx | FileCheck %s

; This file tests the look-ahead operand reordering heuristic.
;
; This checks that operand reordering will reorder the operands of the adds
; by taking into consideration the instructions beyond the immediate
; predecessors.
;
; A[0] B[0] C[0] D[0] C[1] D[1] A[1] B[1]
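;
; A sketch of the scalar computation under test (reading A = array[0..1],
; B = array[2..3], C = array[4..5], D = array[6..7]):
;   array[0] = (array[0] - array[2]) + (array[4] - array[6])
;   array[1] = (array[5] - array[7]) + (array[1] - array[3])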
define void @lookahead_basic(double* %array) {
; CHECK-LABEL: @lookahead_basic(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr inbounds double, double* [[ARRAY:%.*]], i64 0
; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 1
; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 2
; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 3
; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 4
; CHECK-NEXT:    [[IDX5:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 5
; CHECK-NEXT:    [[IDX6:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 6
; CHECK-NEXT:    [[IDX7:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 7
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[IDX2]] to <2 x double>*
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[IDX4]] to <2 x double>*
; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[IDX6]] to <2 x double>*
; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8
; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT:    [[TMP9:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]]
; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <2 x double> [[TMP8]], [[TMP9]]
; CHECK-NEXT:    [[TMP11:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
; CHECK-NEXT:    store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8
; CHECK-NEXT:    ret void
;
entry:
  %idx0 = getelementptr inbounds double, double* %array, i64 0
  %idx1 = getelementptr inbounds double, double* %array, i64 1
  %idx2 = getelementptr inbounds double, double* %array, i64 2
  %idx3 = getelementptr inbounds double, double* %array, i64 3
  %idx4 = getelementptr inbounds double, double* %array, i64 4
  %idx5 = getelementptr inbounds double, double* %array, i64 5
  %idx6 = getelementptr inbounds double, double* %array, i64 6
  %idx7 = getelementptr inbounds double, double* %array, i64 7

  %A_0 = load double, double *%idx0, align 8
  %A_1 = load double, double *%idx1, align 8
  %B_0 = load double, double *%idx2, align 8
  %B_1 = load double, double *%idx3, align 8
  %C_0 = load double, double *%idx4, align 8
  %C_1 = load double, double *%idx5, align 8
  %D_0 = load double, double *%idx6, align 8
  %D_1 = load double, double *%idx7, align 8

  %subAB_0 = fsub fast double %A_0, %B_0
  %subCD_0 = fsub fast double %C_0, %D_0

  %subAB_1 = fsub fast double %A_1, %B_1
  %subCD_1 = fsub fast double %C_1, %D_1

  %addABCD_0 = fadd fast double %subAB_0, %subCD_0
  %addCDAB_1 = fadd fast double %subCD_1, %subAB_1

  store double %addABCD_0, double *%idx0, align 8
  store double %addCDAB_1, double *%idx1, align 8
  ret void
}

; Check whether the look-ahead operand reordering heuristic will avoid
; bundling the alt opcodes. The vectorized code should have no shuffles.
;
; A[0] B[0] A[0] B[0] A[1] B[1] A[1] B[1]
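;
; A sketch of the scalar computation under test (reading A = array[0..1],
; B = array[2..3]):
;   array[0] = (array[0] + array[2]) + (array[0] - array[2])
;   array[1] = (array[1] - array[3]) + (array[1] + array[3])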
define void @lookahead_alt1(double* %array) {
; CHECK-LABEL: @lookahead_alt1(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr inbounds double, double* [[ARRAY:%.*]], i64 0
; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 1
; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 2
; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 3
; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 4
; CHECK-NEXT:    [[IDX5:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 5
; CHECK-NEXT:    [[IDX6:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 6
; CHECK-NEXT:    [[IDX7:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 7
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[IDX2]] to <2 x double>*
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
; CHECK-NEXT:    [[TMP4:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT:    [[TMP5:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT:    [[TMP6:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP4]]
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
; CHECK-NEXT:    store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 8
; CHECK-NEXT:    ret void
;
entry:
  %idx0 = getelementptr inbounds double, double* %array, i64 0
  %idx1 = getelementptr inbounds double, double* %array, i64 1
  %idx2 = getelementptr inbounds double, double* %array, i64 2
  %idx3 = getelementptr inbounds double, double* %array, i64 3
  %idx4 = getelementptr inbounds double, double* %array, i64 4
  %idx5 = getelementptr inbounds double, double* %array, i64 5
  %idx6 = getelementptr inbounds double, double* %array, i64 6
  %idx7 = getelementptr inbounds double, double* %array, i64 7

  %A_0 = load double, double *%idx0, align 8
  %A_1 = load double, double *%idx1, align 8
  %B_0 = load double, double *%idx2, align 8
  %B_1 = load double, double *%idx3, align 8

  %addAB_0_L = fadd fast double %A_0, %B_0
  %subAB_0_R = fsub fast double %A_0, %B_0

  %subAB_1_L = fsub fast double %A_1, %B_1
  %addAB_1_R = fadd fast double %A_1, %B_1

  %addABCD_0 = fadd fast double %addAB_0_L, %subAB_0_R
  %addCDAB_1 = fadd fast double %subAB_1_L, %addAB_1_R

  store double %addABCD_0, double *%idx0, align 8
  store double %addCDAB_1, double *%idx1, align 8
  ret void
}

; This code should get vectorized all the way to the loads with shuffles for
; the alt opcodes.
;
; A[0] B[0] C[0] D[0] C[1] D[1] A[1] B[1]
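;
; A sketch of the scalar computation under test (reading A = array[0..1],
; B = array[2..3], C = array[4..5], D = array[6..7]):
;   array[0] = (array[0] + array[2]) + (array[4] - array[6])
;   array[1] = (array[5] + array[7]) + (array[1] - array[3])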
define void @lookahead_alt2(double* %array) {
; CHECK-LABEL: @lookahead_alt2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr inbounds double, double* [[ARRAY:%.*]], i64 0
; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 1
; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 2
; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 3
; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 4
; CHECK-NEXT:    [[IDX5:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 5
; CHECK-NEXT:    [[IDX6:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 6
; CHECK-NEXT:    [[IDX7:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 7
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[IDX2]] to <2 x double>*
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[IDX4]] to <2 x double>*
; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[IDX6]] to <2 x double>*
; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8
; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]]
; CHECK-NEXT:    [[TMP9:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP7]]
; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP9]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT:    [[TMP11:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT:    [[TMP12:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT:    [[TMP14:%.*]] = fadd fast <2 x double> [[TMP13]], [[TMP10]]
; CHECK-NEXT:    [[TMP15:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
; CHECK-NEXT:    store <2 x double> [[TMP14]], <2 x double>* [[TMP15]], align 8
; CHECK-NEXT:    ret void
;
entry:
  %idx0 = getelementptr inbounds double, double* %array, i64 0
  %idx1 = getelementptr inbounds double, double* %array, i64 1
  %idx2 = getelementptr inbounds double, double* %array, i64 2
  %idx3 = getelementptr inbounds double, double* %array, i64 3
  %idx4 = getelementptr inbounds double, double* %array, i64 4
  %idx5 = getelementptr inbounds double, double* %array, i64 5
  %idx6 = getelementptr inbounds double, double* %array, i64 6
  %idx7 = getelementptr inbounds double, double* %array, i64 7

  %A_0 = load double, double *%idx0, align 8
  %A_1 = load double, double *%idx1, align 8
  %B_0 = load double, double *%idx2, align 8
  %B_1 = load double, double *%idx3, align 8
  %C_0 = load double, double *%idx4, align 8
  %C_1 = load double, double *%idx5, align 8
  %D_0 = load double, double *%idx6, align 8
  %D_1 = load double, double *%idx7, align 8

  %addAB_0 = fadd fast double %A_0, %B_0
  %subCD_0 = fsub fast double %C_0, %D_0

  %addCD_1 = fadd fast double %C_1, %D_1
  %subAB_1 = fsub fast double %A_1, %B_1

  %addABCD_0 = fadd fast double %addAB_0, %subCD_0
  %addCDAB_1 = fadd fast double %addCD_1, %subAB_1

  store double %addABCD_0, double *%idx0, align 8
  store double %addCDAB_1, double *%idx1, align 8
  ret void
}

; A[0] B[0] C[0] D[0] A[1] B[2] A[2] B[1]
;
; SLP should reorder the operands of the RHS add taking into consideration the cost of external uses.
; It is more profitable to reorder the operands of the RHS add, because A[1] has an external use.
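;
; A sketch of the scalar computation under test:
;   S[0] = (A[0] - B[0]) + (C[0] - D[0])
;   S[1] = (A[1] - B[2]) + (A[2] - B[1])
;   *Ext1 = A[1]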
define void @lookahead_external_uses(double* %A, double *%B, double *%C, double *%D, double *%S, double *%Ext1, double *%Ext2) {
; CHECK-LABEL: @lookahead_external_uses(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[IDXA0:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0
; CHECK-NEXT:    [[IDXB0:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 0
; CHECK-NEXT:    [[IDXC0:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 0
; CHECK-NEXT:    [[IDXD0:%.*]] = getelementptr inbounds double, double* [[D:%.*]], i64 0
; CHECK-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
; CHECK-NEXT:    [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2
; CHECK-NEXT:    [[IDXA2:%.*]] = getelementptr inbounds double, double* [[A]], i64 2
; CHECK-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
; CHECK-NEXT:    [[A0:%.*]] = load double, double* [[IDXA0]], align 8
; CHECK-NEXT:    [[C0:%.*]] = load double, double* [[IDXC0]], align 8
; CHECK-NEXT:    [[D0:%.*]] = load double, double* [[IDXD0]], align 8
; CHECK-NEXT:    [[A1:%.*]] = load double, double* [[IDXA1]], align 8
; CHECK-NEXT:    [[B2:%.*]] = load double, double* [[IDXB2]], align 8
; CHECK-NEXT:    [[A2:%.*]] = load double, double* [[IDXA2]], align 8
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDXB0]] to <2 x double>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[A1]], i32 1
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[D0]], i32 0
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[B2]], i32 1
; CHECK-NEXT:    [[TMP6:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP5]]
; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A2]], i32 1
; CHECK-NEXT:    [[TMP9:%.*]] = fsub fast <2 x double> [[TMP8]], [[TMP1]]
; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP6]]
; CHECK-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0
; CHECK-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1
; CHECK-NEXT:    [[TMP11:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
; CHECK-NEXT:    store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8
; CHECK-NEXT:    store double [[A1]], double* [[EXT1:%.*]], align 8
; CHECK-NEXT:    ret void
;
entry:
  %IdxA0 = getelementptr inbounds double, double* %A, i64 0
  %IdxB0 = getelementptr inbounds double, double* %B, i64 0
  %IdxC0 = getelementptr inbounds double, double* %C, i64 0
  %IdxD0 = getelementptr inbounds double, double* %D, i64 0

  %IdxA1 = getelementptr inbounds double, double* %A, i64 1
  %IdxB2 = getelementptr inbounds double, double* %B, i64 2
  %IdxA2 = getelementptr inbounds double, double* %A, i64 2
  %IdxB1 = getelementptr inbounds double, double* %B, i64 1

  %A0 = load double, double *%IdxA0, align 8
  %B0 = load double, double *%IdxB0, align 8
  %C0 = load double, double *%IdxC0, align 8
  %D0 = load double, double *%IdxD0, align 8

  %A1 = load double, double *%IdxA1, align 8
  %B2 = load double, double *%IdxB2, align 8
  %A2 = load double, double *%IdxA2, align 8
  %B1 = load double, double *%IdxB1, align 8

  %subA0B0 = fsub fast double %A0, %B0
  %subC0D0 = fsub fast double %C0, %D0

  %subA1B2 = fsub fast double %A1, %B2
  %subA2B1 = fsub fast double %A2, %B1

  %add0 = fadd fast double %subA0B0, %subC0D0
  %add1 = fadd fast double %subA1B2, %subA2B1

  %IdxS0 = getelementptr inbounds double, double* %S, i64 0
  %IdxS1 = getelementptr inbounds double, double* %S, i64 1

  store double %add0, double *%IdxS0, align 8
  store double %add1, double *%IdxS1, align 8

  ; External use of %A1
  store double %A1, double *%Ext1, align 8
  ret void
}

; A[0] B[0] C[0] D[0] A[1] B[2] A[2] B[1]
; (A[1] has the external users U1, U2 and U3; B[1] has the external users U4
; and U5.)
;
; If we limit the users budget for the look-ahead heuristic to 2, then the
; look-ahead heuristic has no way of choosing B[1] (with 2 external users)
; over A[1] (with 3 external users).
; The result is that the operands of the Add are not reordered and the loads
; from A get vectorized instead of the loads from B.
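;
; A sketch of the scalar computation under test:
;   S[0] = (A[0] - B[0]) + (C[0] - D[0])
;   S[1] = (A[1] - B[2]) + (A[2] - B[1])
;   *Ext1 = *Ext2 = *Ext3 = A[1]
;   *Ext4 = *Ext5 = B[1]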
define void @lookahead_limit_users_budget(double* %A, double *%B, double *%C, double *%D, double *%S, double *%Ext1, double *%Ext2, double *%Ext3, double *%Ext4, double *%Ext5) {
; CHECK-LABEL: @lookahead_limit_users_budget(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[IDXA0:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0
; CHECK-NEXT:    [[IDXB0:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 0
; CHECK-NEXT:    [[IDXC0:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 0
; CHECK-NEXT:    [[IDXD0:%.*]] = getelementptr inbounds double, double* [[D:%.*]], i64 0
; CHECK-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
; CHECK-NEXT:    [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2
; CHECK-NEXT:    [[IDXA2:%.*]] = getelementptr inbounds double, double* [[A]], i64 2
; CHECK-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
; CHECK-NEXT:    [[B0:%.*]] = load double, double* [[IDXB0]], align 8
; CHECK-NEXT:    [[C0:%.*]] = load double, double* [[IDXC0]], align 8
; CHECK-NEXT:    [[D0:%.*]] = load double, double* [[IDXD0]], align 8
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDXA0]] to <2 x double>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
; CHECK-NEXT:    [[B2:%.*]] = load double, double* [[IDXB2]], align 8
; CHECK-NEXT:    [[A2:%.*]] = load double, double* [[IDXA2]], align 8
; CHECK-NEXT:    [[B1:%.*]] = load double, double* [[IDXB1]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B2]], i32 1
; CHECK-NEXT:    [[TMP4:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[A2]], i32 1
; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> poison, double [[D0]], i32 0
; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[B1]], i32 1
; CHECK-NEXT:    [[TMP9:%.*]] = fsub fast <2 x double> [[TMP6]], [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP9]]
; CHECK-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0
; CHECK-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1
; CHECK-NEXT:    [[TMP11:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
; CHECK-NEXT:    store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8
; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
; CHECK-NEXT:    store double [[TMP12]], double* [[EXT1:%.*]], align 8
; CHECK-NEXT:    store double [[TMP12]], double* [[EXT2:%.*]], align 8
; CHECK-NEXT:    store double [[TMP12]], double* [[EXT3:%.*]], align 8
; CHECK-NEXT:    store double [[B1]], double* [[EXT4:%.*]], align 8
; CHECK-NEXT:    store double [[B1]], double* [[EXT5:%.*]], align 8
; CHECK-NEXT:    ret void
;
entry:
  %IdxA0 = getelementptr inbounds double, double* %A, i64 0
  %IdxB0 = getelementptr inbounds double, double* %B, i64 0
  %IdxC0 = getelementptr inbounds double, double* %C, i64 0
  %IdxD0 = getelementptr inbounds double, double* %D, i64 0

  %IdxA1 = getelementptr inbounds double, double* %A, i64 1
  %IdxB2 = getelementptr inbounds double, double* %B, i64 2
  %IdxA2 = getelementptr inbounds double, double* %A, i64 2
  %IdxB1 = getelementptr inbounds double, double* %B, i64 1

  %A0 = load double, double *%IdxA0, align 8
  %B0 = load double, double *%IdxB0, align 8
  %C0 = load double, double *%IdxC0, align 8
  %D0 = load double, double *%IdxD0, align 8

  %A1 = load double, double *%IdxA1, align 8
  %B2 = load double, double *%IdxB2, align 8
  %A2 = load double, double *%IdxA2, align 8
  %B1 = load double, double *%IdxB1, align 8

  %subA0B0 = fsub fast double %A0, %B0
  %subC0D0 = fsub fast double %C0, %D0

  %subA1B2 = fsub fast double %A1, %B2
  %subA2B1 = fsub fast double %A2, %B1

  %add0 = fadd fast double %subA0B0, %subC0D0
  %add1 = fadd fast double %subA1B2, %subA2B1

  %IdxS0 = getelementptr inbounds double, double* %S, i64 0
  %IdxS1 = getelementptr inbounds double, double* %S, i64 1

  store double %add0, double *%IdxS0, align 8
  store double %add1, double *%IdxS1, align 8

  ; External uses of A1
  store double %A1, double *%Ext1, align 8
  store double %A1, double *%Ext2, align 8
  store double %A1, double *%Ext3, align 8

  ; External uses of B1
  store double %B1, double *%Ext4, align 8
  store double %B1, double *%Ext5, align 8
  ret void
}

; This checks that the lookahead code does not crash when instructions with
; the same opcodes have different numbers of operands (in this case the
; calls).

%Class = type { i8 }
declare double @_ZN1i2ayEv(%Class*)
declare double @_ZN1i2axEv()
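
; A sketch of the scalar computation under test (note that the two callees
; take a different number of arguments):
;   S[0] = A[0] + _ZN1i2ayEv(Arg0)
;   S[1] = A[1] + _ZN1i2axEv()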
define void @lookahead_crash(double* %A, double *%S, %Class *%Arg0) {
; CHECK-LABEL: @lookahead_crash(
; CHECK-NEXT:    [[IDXA0:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0
; CHECK-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[IDXA0]] to <2 x double>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
; CHECK-NEXT:    [[C0:%.*]] = call double @_ZN1i2ayEv(%Class* [[ARG0:%.*]])
; CHECK-NEXT:    [[C1:%.*]] = call double @_ZN1i2axEv()
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[C1]], i32 1
; CHECK-NEXT:    [[TMP5:%.*]] = fadd fast <2 x double> [[TMP2]], [[TMP4]]
; CHECK-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0
; CHECK-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
; CHECK-NEXT:    store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8
; CHECK-NEXT:    ret void
;
  %IdxA0 = getelementptr inbounds double, double* %A, i64 0
  %IdxA1 = getelementptr inbounds double, double* %A, i64 1

  %A0 = load double, double *%IdxA0, align 8
  %A1 = load double, double *%IdxA1, align 8

  %C0 = call double @_ZN1i2ayEv(%Class *%Arg0)
  %C1 = call double @_ZN1i2axEv()

  %add0 = fadd fast double %A0, %C0
  %add1 = fadd fast double %A1, %C1

  %IdxS0 = getelementptr inbounds double, double* %S, i64 0
  %IdxS1 = getelementptr inbounds double, double* %S, i64 1
  store double %add0, double *%IdxS0, align 8
  store double %add1, double *%IdxS1, align 8
  ret void
}

; This checks that we choose to group consecutive extracts from the same vectors.
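;
; A sketch of the scalar computation under test (writing vec1 and vec2 for the
; vectors loaded from %vecPtr1 and %vecPtr2):
;   storeArray[0] = vec1[0] * array[0] + vec2[0] * array[1]
;   storeArray[1] = vec1[1] * array[0] + vec2[1] * array[1]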
define void @ChecksExtractScores(double* %storeArray, double* %array, <2 x double> *%vecPtr1, <2 x double>* %vecPtr2) {
; CHECK-LABEL: @ChecksExtractScores(
; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr inbounds double, double* [[ARRAY:%.*]], i64 0
; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 1
; CHECK-NEXT:    [[LOADA0:%.*]] = load double, double* [[IDX0]], align 4
; CHECK-NEXT:    [[LOADA1:%.*]] = load double, double* [[IDX1]], align 4
; CHECK-NEXT:    [[LOADVEC:%.*]] = load <2 x double>, <2 x double>* [[VECPTR1:%.*]], align 4
; CHECK-NEXT:    [[LOADVEC2:%.*]] = load <2 x double>, <2 x double>* [[VECPTR2:%.*]], align 4
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[LOADA0]], i32 0
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[LOADA0]], i32 1
; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[LOADVEC]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[LOADA1]], i32 0
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[LOADA1]], i32 1
; CHECK-NEXT:    [[TMP6:%.*]] = fmul <2 x double> [[LOADVEC2]], [[TMP5]]
; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP3]], [[TMP6]]
; CHECK-NEXT:    [[SIDX0:%.*]] = getelementptr inbounds double, double* [[STOREARRAY:%.*]], i64 0
; CHECK-NEXT:    [[SIDX1:%.*]] = getelementptr inbounds double, double* [[STOREARRAY]], i64 1
; CHECK-NEXT:    [[TMP8:%.*]] = bitcast double* [[SIDX0]] to <2 x double>*
; CHECK-NEXT:    store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8
; CHECK-NEXT:    ret void
;
  %idx0 = getelementptr inbounds double, double* %array, i64 0
  %idx1 = getelementptr inbounds double, double* %array, i64 1
  %loadA0 = load double, double* %idx0, align 4
  %loadA1 = load double, double* %idx1, align 4

  %loadVec = load <2 x double>, <2 x double>* %vecPtr1, align 4
  %extrA0 = extractelement <2 x double> %loadVec, i32 0
  %extrA1 = extractelement <2 x double> %loadVec, i32 1
  %loadVec2 = load <2 x double>, <2 x double>* %vecPtr2, align 4
  %extrB0 = extractelement <2 x double> %loadVec2, i32 0
  %extrB1 = extractelement <2 x double> %loadVec2, i32 1

  %mul0 = fmul double %extrA0, %loadA0
  %mul1 = fmul double %extrA1, %loadA0
  %mul3 = fmul double %extrB0, %loadA1
  %mul4 = fmul double %extrB1, %loadA1
  %add0 = fadd double %mul0, %mul3
  %add1 = fadd double %mul1, %mul4

  %sidx0 = getelementptr inbounds double, double* %storeArray, i64 0
  %sidx1 = getelementptr inbounds double, double* %storeArray, i64 1
  store double %add0, double *%sidx0, align 8
  store double %add1, double *%sidx1, align 8
  ret void
}

define i1 @ExtractIdxNotConstantInt1(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) {
; CHECK-LABEL: @ExtractIdxNotConstantInt1(
; CHECK-NEXT:    [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 undef
; CHECK-NEXT:    [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
; CHECK-NEXT:    [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]]
; CHECK-NEXT:    [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]]
; CHECK-NEXT:    [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]]
; CHECK-NEXT:    [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01
; CHECK-NEXT:    [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]]
; CHECK-NEXT:    [[ADD78_I180:%.*]] = fsub float [[MUL72_I179]], 3.000000e+01
; CHECK-NEXT:    [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]]
; CHECK-NEXT:    [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]]
; CHECK-NEXT:    [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
; CHECK-NEXT:    ret i1 [[CMP_I185]]
;
  %vecext.i291.i166 = extractelement <4 x float> %vec, i64 undef
  %sub14.i167 = fsub float undef, %vecext.i291.i166
  %fm = fmul float %a, %sub14.i167
  %sub25.i168 = fsub float %fm, %b
  %vecext.i276.i169 = extractelement <4 x float> %vec, i64 %idx2
  %add36.i173 = fadd float %sub25.i168, 10.0
  %mul72.i179 = fmul float %c, %vecext.i276.i169
  %add78.i180 = fsub float %mul72.i179, 30.0
  %add79.i181 = fadd float 2.0, %add78.i180
  %mul123.i184 = fmul float %add36.i173, %add79.i181
  %cmp.i185 = fcmp ogt float %mul123.i184, 0.000000e+00
  ret i1 %cmp.i185
}

define i1 @ExtractIdxNotConstantInt2(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) {
; CHECK-LABEL: @ExtractIdxNotConstantInt2(
; CHECK-NEXT:    [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 1
; CHECK-NEXT:    [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
; CHECK-NEXT:    [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]]
; CHECK-NEXT:    [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]]
; CHECK-NEXT:    [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]]
; CHECK-NEXT:    [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01
; CHECK-NEXT:    [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]]
; CHECK-NEXT:    [[ADD78_I180:%.*]] = fsub float [[MUL72_I179]], 3.000000e+01
; CHECK-NEXT:    [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]]
; CHECK-NEXT:    [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]]
; CHECK-NEXT:    [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
; CHECK-NEXT:    ret i1 [[CMP_I185]]
;
  %vecext.i291.i166 = extractelement <4 x float> %vec, i64 1
  %sub14.i167 = fsub float undef, %vecext.i291.i166
  %fm = fmul float %a, %sub14.i167
  %sub25.i168 = fsub float %fm, %b
  %vecext.i276.i169 = extractelement <4 x float> %vec, i64 %idx2
  %add36.i173 = fadd float %sub25.i168, 10.0
  %mul72.i179 = fmul float %c, %vecext.i276.i169
  %add78.i180 = fsub float %mul72.i179, 30.0
  %add79.i181 = fadd float 2.0, %add78.i180
  %mul123.i184 = fmul float %add36.i173, %add79.i181
  %cmp.i185 = fcmp ogt float %mul123.i184, 0.000000e+00
  ret i1 %cmp.i185
}

define i1 @foo(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) {
; CHECK-LABEL: @foo(
; CHECK-NEXT:    [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0
; CHECK-NEXT:    [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
; CHECK-NEXT:    [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]]
; CHECK-NEXT:    [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]]
; CHECK-NEXT:    [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 1
; CHECK-NEXT:    [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01
; CHECK-NEXT:    [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]]
; CHECK-NEXT:    [[ADD78_I180:%.*]] = fsub float [[MUL72_I179]], 3.000000e+01
; CHECK-NEXT:    [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]]
; CHECK-NEXT:    [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]]
; CHECK-NEXT:    [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
; CHECK-NEXT:    ret i1 [[CMP_I185]]
;
  %vecext.i291.i166 = extractelement <4 x float> %vec, i64 0
  %sub14.i167 = fsub float undef, %vecext.i291.i166
  %fm = fmul float %a, %sub14.i167
  %sub25.i168 = fsub float %fm, %b
  %vecext.i276.i169 = extractelement <4 x float> %vec, i64 1
  %add36.i173 = fadd float %sub25.i168, 10.0
  %mul72.i179 = fmul float %c, %vecext.i276.i169
  %add78.i180 = fsub float %mul72.i179, 30.0
  %add79.i181 = fadd float 2.0, %add78.i180
  %mul123.i184 = fmul float %add36.i173, %add79.i181
  %cmp.i185 = fcmp ogt float %mul123.i184, 0.000000e+00
  ret i1 %cmp.i185
}

; Same as @ChecksExtractScores, but the extractelement vector operands do not match.
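;
; A sketch of the scalar computation under test (writing vecN for the vector
; loaded from %vecPtrN):
;   storeArray[0] = vec1[0] * array[0] + vec3[0] * array[1]
;   storeArray[1] = vec2[1] * array[0] + vec4[1] * array[1]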
define void @ChecksExtractScores_different_vectors(double* %storeArray, double* %array, <2 x double> *%vecPtr1, <2 x double>* %vecPtr2, <2 x double>* %vecPtr3, <2 x double>* %vecPtr4) {
; CHECK-LABEL: @ChecksExtractScores_different_vectors(
; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr inbounds double, double* [[ARRAY:%.*]], i64 0
; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 1
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4
; CHECK-NEXT:    [[LOADVEC:%.*]] = load <2 x double>, <2 x double>* [[VECPTR1:%.*]], align 4
; CHECK-NEXT:    [[LOADVEC2:%.*]] = load <2 x double>, <2 x double>* [[VECPTR2:%.*]], align 4
; CHECK-NEXT:    [[EXTRA0:%.*]] = extractelement <2 x double> [[LOADVEC]], i32 0
; CHECK-NEXT:    [[EXTRA1:%.*]] = extractelement <2 x double> [[LOADVEC2]], i32 1
; CHECK-NEXT:    [[LOADVEC3:%.*]] = load <2 x double>, <2 x double>* [[VECPTR3:%.*]], align 4
; CHECK-NEXT:    [[LOADVEC4:%.*]] = load <2 x double>, <2 x double>* [[VECPTR4:%.*]], align 4
; CHECK-NEXT:    [[EXTRB0:%.*]] = extractelement <2 x double> [[LOADVEC3]], i32 0
; CHECK-NEXT:    [[EXTRB1:%.*]] = extractelement <2 x double> [[LOADVEC4]], i32 1
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> poison, double [[EXTRB0]], i32 0
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[EXTRA1]], i32 1
; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0
; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP6]], double [[TMP7]], i32 1
; CHECK-NEXT:    [[TMP9:%.*]] = fmul <2 x double> [[TMP4]], [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x double> poison, double [[EXTRA0]], i32 0
; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x double> [[TMP10]], double [[EXTRB1]], i32 1
; CHECK-NEXT:    [[TMP12:%.*]] = fmul <2 x double> [[TMP11]], [[TMP2]]
; CHECK-NEXT:    [[TMP13:%.*]] = fadd <2 x double> [[TMP12]], [[TMP9]]
; CHECK-NEXT:    [[SIDX0:%.*]] = getelementptr inbounds double, double* [[STOREARRAY:%.*]], i64 0
; CHECK-NEXT:    [[SIDX1:%.*]] = getelementptr inbounds double, double* [[STOREARRAY]], i64 1
; CHECK-NEXT:    [[TMP14:%.*]] = bitcast double* [[SIDX0]] to <2 x double>*
; CHECK-NEXT:    store <2 x double> [[TMP13]], <2 x double>* [[TMP14]], align 8
; CHECK-NEXT:    ret void
;
  %idx0 = getelementptr inbounds double, double* %array, i64 0
  %idx1 = getelementptr inbounds double, double* %array, i64 1
  %loadA0 = load double, double* %idx0, align 4
  %loadA1 = load double, double* %idx1, align 4

  %loadVec = load <2 x double>, <2 x double>* %vecPtr1, align 4
  %loadVec2 = load <2 x double>, <2 x double>* %vecPtr2, align 4
  %extrA0 = extractelement <2 x double> %loadVec, i32 0
  %extrA1 = extractelement <2 x double> %loadVec2, i32 1
  %loadVec3 = load <2 x double>, <2 x double>* %vecPtr3, align 4
  %loadVec4 = load <2 x double>, <2 x double>* %vecPtr4, align 4
  %extrB0 = extractelement <2 x double> %loadVec3, i32 0
  %extrB1 = extractelement <2 x double> %loadVec4, i32 1

  %mul0 = fmul double %extrA0, %loadA0
  %mul1 = fmul double %extrA1, %loadA0
  %mul3 = fmul double %extrB0, %loadA1
  %mul4 = fmul double %extrB1, %loadA1
  %add0 = fadd double %mul0, %mul3
  %add1 = fadd double %mul1, %mul4

  %sidx0 = getelementptr inbounds double, double* %storeArray, i64 0
  %sidx1 = getelementptr inbounds double, double* %storeArray, i64 1
  store double %add0, double *%sidx0, align 8
  store double %add1, double *%sidx1, align 8
  ret void
}