1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt -passes=slp-vectorizer,verify -S < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx | FileCheck %s -check-prefix=ENABLED
4 ; Without supernode operand reordering, this does not get fully vectorized.
5 ; S[0] = (A[0] + B[0]) + C[0]
6 ; S[1] = (B[1] + C[1]) + A[1]
define void @test_supernode_add(ptr %Aarray, ptr %Barray, ptr %Carray, ptr %Sarray) {
; ENABLED-LABEL: @test_supernode_add(
; ENABLED-NEXT:  entry:
; ENABLED-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[AARRAY:%.*]], align 8
; ENABLED-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8
; ENABLED-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[CARRAY:%.*]], align 8
; ENABLED-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> [[TMP2]], <2 x i32> <i32 0, i32 3>
; ENABLED-NEXT:    [[TMP4:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]]
; ENABLED-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP0]], <2 x i32> <i32 0, i32 3>
; ENABLED-NEXT:    [[TMP6:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP5]]
; ENABLED-NEXT:    store <2 x double> [[TMP6]], ptr [[SARRAY:%.*]], align 8
; ENABLED-NEXT:    ret void
;
entry:
  ; Element 1 of each array; element 0 is reached through the base pointer.
  %idxA1 = getelementptr inbounds double, ptr %Aarray, i64 1
  %idxB1 = getelementptr inbounds double, ptr %Barray, i64 1
  %idxC1 = getelementptr inbounds double, ptr %Carray, i64 1
  %idxS1 = getelementptr inbounds double, ptr %Sarray, i64 1

  %A0 = load double, ptr %Aarray, align 8
  %A1 = load double, ptr %idxA1, align 8

  %B0 = load double, ptr %Barray, align 8
  %B1 = load double, ptr %idxB1, align 8

  %C0 = load double, ptr %Carray, align 8
  %C1 = load double, ptr %idxC1, align 8

  ; Both lanes sum one element of A, B and C, but in a different order:
  ;   lane 0: (A0 + B0) + C0
  ;   lane 1: (B1 + C1) + A1
  ; The expected output pairs them up by blending A and C with two shuffles
  ; ([A0, C1] and [C0, A1]) so that each of the two adds becomes 2-wide.
  %addA0B0 = fadd fast double %A0, %B0
  %addB1C1 = fadd fast double %B1, %C1
  %add0 = fadd fast double %addA0B0, %C0
  %add1 = fadd fast double %addB1C1, %A1
  store double %add0, ptr %Sarray, align 8
  store double %add1, ptr %idxS1, align 8
  ret void
}
45 ; Without supernode operand reordering, this does not get fully vectorized.
46 ; S[0] = (A[0] - B[0]) + C[0]
47 ; S[1] = (C[1] - B[1]) + A[1]
define void @test_supernode_addsub(ptr %Aarray, ptr %Barray, ptr %Carray, ptr %Sarray) {
; ENABLED-LABEL: @test_supernode_addsub(
; ENABLED-NEXT:  entry:
; ENABLED-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[AARRAY:%.*]], align 8
; ENABLED-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8
; ENABLED-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[CARRAY:%.*]], align 8
; ENABLED-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> [[TMP2]], <2 x i32> <i32 0, i32 3>
; ENABLED-NEXT:    [[TMP4:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP1]]
; ENABLED-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP0]], <2 x i32> <i32 0, i32 3>
; ENABLED-NEXT:    [[TMP6:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP5]]
; ENABLED-NEXT:    store <2 x double> [[TMP6]], ptr [[SARRAY:%.*]], align 8
; ENABLED-NEXT:    ret void
;
entry:
  ; Element 1 of each array; element 0 is reached through the base pointer.
  %idxA1 = getelementptr inbounds double, ptr %Aarray, i64 1
  %idxB1 = getelementptr inbounds double, ptr %Barray, i64 1
  %idxC1 = getelementptr inbounds double, ptr %Carray, i64 1
  %idxS1 = getelementptr inbounds double, ptr %Sarray, i64 1

  %A0 = load double, ptr %Aarray, align 8
  %A1 = load double, ptr %idxA1, align 8

  %B0 = load double, ptr %Barray, align 8
  %B1 = load double, ptr %idxB1, align 8

  %C0 = load double, ptr %Carray, align 8
  %C1 = load double, ptr %idxC1, align 8

  ; B is the subtrahend in both lanes, while A and C swap roles:
  ;   lane 0: (A0 - B0) + C0
  ;   lane 1: (C1 - B1) + A1
  ; The expected output subtracts B from the blend [A0, C1] and then adds the
  ; blend [C0, A1], giving one 2-wide fsub followed by one 2-wide fadd.
  %subA0B0 = fsub fast double %A0, %B0
  %subC1B1 = fsub fast double %C1, %B1
  %add0 = fadd fast double %subA0B0, %C0
  %add1 = fadd fast double %subC1B1, %A1
  store double %add0, ptr %Sarray, align 8
  store double %add1, ptr %idxS1, align 8
  ret void
}
85 ; Without supernode operand reordering, this does not get fully vectorized.
86 ; This checks that the super-node works with alternate sequences.
88 ; S[0] = (A[0] - B[0]) - C[0]
89 ; S[1] = (B[1] + C[1]) + A[1]
define void @test_supernode_addsub_alt(ptr %Aarray, ptr %Barray, ptr %Carray, ptr %Sarray) {
; ENABLED-LABEL: @test_supernode_addsub_alt(
; ENABLED-NEXT:  entry:
; ENABLED-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[AARRAY:%.*]], align 8
; ENABLED-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8
; ENABLED-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[CARRAY:%.*]], align 8
; ENABLED-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> [[TMP2]], <2 x i32> <i32 0, i32 3>
; ENABLED-NEXT:    [[TMP4:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP1]]
; ENABLED-NEXT:    [[TMP5:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP1]]
; ENABLED-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP5]], <2 x i32> <i32 0, i32 3>
; ENABLED-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP0]], <2 x i32> <i32 0, i32 3>
; ENABLED-NEXT:    [[TMP8:%.*]] = fsub fast <2 x double> [[TMP6]], [[TMP7]]
; ENABLED-NEXT:    [[TMP9:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP7]]
; ENABLED-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP9]], <2 x i32> <i32 0, i32 3>
; ENABLED-NEXT:    store <2 x double> [[TMP10]], ptr [[SARRAY:%.*]], align 8
; ENABLED-NEXT:    ret void
;
entry:
  ; Element 1 of each array; element 0 is reached through the base pointer.
  %idxA1 = getelementptr inbounds double, ptr %Aarray, i64 1
  %idxB1 = getelementptr inbounds double, ptr %Barray, i64 1
  %idxC1 = getelementptr inbounds double, ptr %Carray, i64 1
  %idxS1 = getelementptr inbounds double, ptr %Sarray, i64 1

  %A0 = load double, ptr %Aarray, align 8
  %A1 = load double, ptr %idxA1, align 8

  %B0 = load double, ptr %Barray, align 8
  %B1 = load double, ptr %idxB1, align 8

  %C0 = load double, ptr %Carray, align 8
  %C1 = load double, ptr %idxC1, align 8

  ; The lanes use different opcodes at both levels of the expression:
  ;   lane 0: (A0 - B0) - C0
  ;   lane 1: (B1 + C1) + A1
  ; The expected output handles each level as an alternating pair: it emits a
  ; 2-wide fsub and a 2-wide fadd on the same operands and then picks lane 0
  ; from the fsub and lane 1 from the fadd with a shuffle.
  %subA0B0 = fsub fast double %A0, %B0
  %addB1C1 = fadd fast double %B1, %C1
  %sub0 = fsub fast double %subA0B0, %C0
  %add1 = fadd fast double %addB1C1, %A1
  store double %sub0, ptr %Sarray, align 8
  store double %add1, ptr %idxS1, align 8
  ret void
}
131 ; This checks that vectorizeTree() works correctly with the supernode
132 ; and does not generate uses before defs.
133 ; If all of the operands of the supernode are vectorizable, then the scheduler
134 ; will fix their position in the program. If not, then the scheduler may not
135 ; touch them, leading to uses before defs.
;  A0  C   A1  B1              A0  C   A1  D             A0:1  C,D
;   \ /      \ /    Reorder      \ /     \ /    Bundles      \   /
; t1 + B0  t3 + D   ------->   t1 + B0  t3 + B1  ------>  t1:3 + B0:1
;    |/       |/                  |/       |/                  |/
;   t2 +     t4 +                t2 +     t4 +              t2:4 +
;
; After reordering, 'D' conceptually becomes an operand of t3:
; t3 = A1 + D
; But D is defined *after* its use.
;
define void @supernode_scheduling(ptr %Aarray, ptr %Barray, ptr %Carray, ptr %Darray, ptr %Sarray) {
; ENABLED-LABEL: @supernode_scheduling(
; ENABLED-NEXT:  entry:
; ENABLED-NEXT:    [[C:%.*]] = load double, ptr [[CARRAY:%.*]], align 8
; ENABLED-NEXT:    [[D:%.*]] = load double, ptr [[DARRAY:%.*]], align 8
; ENABLED-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[AARRAY:%.*]], align 8
; ENABLED-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8
; ENABLED-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[C]], i32 0
; ENABLED-NEXT:    [[TMP3:%.*]] = fadd fast <2 x double> [[TMP0]], [[TMP2]]
; ENABLED-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP1]], double [[D]], i32 1
; ENABLED-NEXT:    [[TMP5:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP4]]
; ENABLED-NEXT:    store <2 x double> [[TMP5]], ptr [[SARRAY:%.*]], align 8
; ENABLED-NEXT:    ret void
;
entry:
  ; Only A, B and S are accessed at both element 0 and element 1; C and D are
  ; each read once, through their base pointer.
  %idxA1 = getelementptr inbounds double, ptr %Aarray, i64 1
  %idxB1 = getelementptr inbounds double, ptr %Barray, i64 1
  %idxS1 = getelementptr inbounds double, ptr %Sarray, i64 1

  ; Lane 0: t2 = (A0 + C) + B0.
  %A0 = load double, ptr %Aarray, align 8
  %C = load double, ptr %Carray, align 8
  %t1 = fadd fast double %A0, %C
  %B0 = load double, ptr %Barray, align 8
  %t2 = fadd fast double %t1, %B0
  ; Lane 1: t4 = (A1 + B1) + D.
  ; The instruction order below is part of the test: %D is loaded only after
  ; %t3 has been computed, so it must not be treated as an operand of %t3
  ; without also being moved. The expected output has the load of D ahead of
  ; every vector instruction.
  %A1 = load double, ptr %idxA1, align 8
  %B1 = load double, ptr %idxB1, align 8
  %t3 = fadd fast double %A1, %B1
  %D = load double, ptr %Darray, align 8
  %t4 = fadd fast double %t3, %D

  store double %t2, ptr %Sarray, align 8
  store double %t4, ptr %idxS1, align 8
  ret void
}
196 ; The SLP scheduler has trouble moving instructions across blocks.
197 ; Even though we can build a SuperNode for this example, we should not because the scheduler
198 ; cannot handle the cross-block instruction motion that is required once the operands of the
199 ; SuperNode are reordered.
define void @supernode_scheduling_cross_block(ptr %Aarray, ptr %Barray, ptr %Sarray) {
; ENABLED-LABEL: @supernode_scheduling_cross_block(
; ENABLED-NEXT:  entry:
; ENABLED-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, ptr [[AARRAY:%.*]], i64 1
; ENABLED-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, ptr [[BARRAY:%.*]], i64 1
; ENABLED-NEXT:    [[A0:%.*]] = load double, ptr [[AARRAY]], align 8
; ENABLED-NEXT:    [[B1:%.*]] = load double, ptr [[IDXB1]], align 8
; ENABLED-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
; ENABLED-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[B1]], i32 1
; ENABLED-NEXT:    [[TMP2:%.*]] = fadd fast <2 x double> [[TMP1]], splat (double 2.000000e+00)
; ENABLED-NEXT:    br label [[BB:%.*]]
; ENABLED:       bb:
; ENABLED-NEXT:    [[A1:%.*]] = load double, ptr [[IDXA1]], align 8
; ENABLED-NEXT:    [[B0:%.*]] = load double, ptr [[BARRAY]], align 8
; ENABLED-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0
; ENABLED-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[A1]], i32 1
; ENABLED-NEXT:    [[TMP5:%.*]] = fadd fast <2 x double> [[TMP2]], [[TMP4]]
; ENABLED-NEXT:    store <2 x double> [[TMP5]], ptr [[SARRAY:%.*]], align 8
; ENABLED-NEXT:    ret void
;
entry:
  %idxA1 = getelementptr inbounds double, ptr %Aarray, i64 1
  %idxB1 = getelementptr inbounds double, ptr %Barray, i64 1
  %idxS1 = getelementptr inbounds double, ptr %Sarray, i64 1

  ; First block: one operand per lane is loaded and has 2.0 added to it.
  ; Lane 0 starts from A0, lane 1 starts from B1.
  %A0 = load double, ptr %Aarray, align 8
  %B1 = load double, ptr %idxB1, align 8
  %Tmp0 = fadd fast double %A0, 2.0
  %Tmp1 = fadd fast double %B1, 2.0
  br label %bb

bb:
  ; Second block: the remaining operands are loaded here, after the branch.
  ; The expected output keeps these two loads in this block and gathers them
  ; with insertelement rather than moving them up into the entry block.
  %A1 = load double, ptr %idxA1, align 8
  %B0 = load double, ptr %Barray, align 8

  ; Lane 0: (A0 + 2.0) + B0, lane 1: (B1 + 2.0) + A1.
  %Sum0 = fadd fast double %Tmp0, %B0
  %Sum1 = fadd fast double %Tmp1, %A1

  store double %Sum0, ptr %Sarray, align 8
  store double %Sum1, ptr %idxS1, align 8
  ret void
}