1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake-avx512 | FileCheck %s
; Layout shared by the tests in this file: three consecutive i32 arrays of
; 8, 8 and 16 elements. The functions below index field 0 as the store
; destination; @_Z4testP1S additionally loads from fields 1 and 2.
4 %struct.S = type { [8 x i32], [8 x i32], [16 x i32] }
; Eight scalar "load + load -> add nsw -> store" lanes over one %struct.S.
; For lane k (0..7):
;   - the first operand is field 1, element k (consecutive);
;   - the second operand is field 2, at elements 15, 7, 6, 4, 12, 13, 14, 5
;     (not consecutive as a whole, but containing the adjacent pairs
;     {7, 6} and {13, 14});
;   - the result is stored to field 0, element k (lane 0 stores through %p
;     directly, which is the address of field 0, element 0).
; The autogenerated assertions expect a single <8 x i32> load of field 1,
; two <2 x i32> loads of field 2 (starting at elements 6 and 13), four scalar
; loads (elements 15, 4, 12, 5) merged with insertelement/shufflevector, one
; vector add nsw, and one <8 x i32> store to %p.
6 define dso_local void @_Z4testP1S(ptr %p) local_unnamed_addr {
7 ; CHECK-LABEL: @_Z4testP1S(
9 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[P:%.*]], i64 0, i32 1, i64 0
10 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 15
11 ; CHECK-NEXT: [[I1:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
12 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 6
13 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX13]], align 4
14 ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 4
15 ; CHECK-NEXT: [[I7:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4
16 ; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 12
17 ; CHECK-NEXT: [[I9:%.*]] = load i32, ptr [[ARRAYIDX27]], align 4
18 ; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 13
19 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX34]], align 4
20 ; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 5
21 ; CHECK-NEXT: [[I15:%.*]] = load i32, ptr [[ARRAYIDX48]], align 4
22 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr [[ARRAYIDX]], align 4
23 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[I1]], i32 0
24 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> <i32 1, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
25 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
26 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[I7]], i32 3
27 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[I9]], i32 4
28 ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
29 ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 9, i32 poison>
30 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[I15]], i32 7
31 ; CHECK-NEXT: [[TMP11:%.*]] = add nsw <8 x i32> [[TMP10]], [[TMP2]]
32 ; CHECK-NEXT: store <8 x i32> [[TMP11]], ptr [[P]], align 4
33 ; CHECK-NEXT: ret void
; Lane 0: field 1 [0] + field 2 [15] -> %p (field 0 [0]).
36 %arrayidx = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 1, i64 0
37 %i = load i32, ptr %arrayidx, align 4
38 %arrayidx1 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 2, i64 15
39 %i1 = load i32, ptr %arrayidx1, align 4
40 %add = add nsw i32 %i1, %i
41 store i32 %add, ptr %p, align 4
; Lane 1: field 1 [1] + field 2 [7] -> field 0 [1].
42 %arrayidx4 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 1, i64 1
43 %i2 = load i32, ptr %arrayidx4, align 4
44 %arrayidx6 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 2, i64 7
45 %i3 = load i32, ptr %arrayidx6, align 4
46 %add7 = add nsw i32 %i3, %i2
47 %arrayidx9 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 1
48 store i32 %add7, ptr %arrayidx9, align 4
; Lane 2: field 1 [2] + field 2 [6] -> field 0 [2].
; Together with lane 1 this forms the reversed adjacent pair {7, 6}.
49 %arrayidx11 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 1, i64 2
50 %i4 = load i32, ptr %arrayidx11, align 4
51 %arrayidx13 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 2, i64 6
52 %i5 = load i32, ptr %arrayidx13, align 4
53 %add14 = add nsw i32 %i5, %i4
54 %arrayidx16 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 2
55 store i32 %add14, ptr %arrayidx16, align 4
; Lane 3: field 1 [3] + field 2 [4] -> field 0 [3].
56 %arrayidx18 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 1, i64 3
57 %i6 = load i32, ptr %arrayidx18, align 4
58 %arrayidx20 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 2, i64 4
59 %i7 = load i32, ptr %arrayidx20, align 4
60 %add21 = add nsw i32 %i7, %i6
61 %arrayidx23 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 3
62 store i32 %add21, ptr %arrayidx23, align 4
; Lane 4: field 1 [4] + field 2 [12] -> field 0 [4].
63 %arrayidx25 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 1, i64 4
64 %i8 = load i32, ptr %arrayidx25, align 4
65 %arrayidx27 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 2, i64 12
66 %i9 = load i32, ptr %arrayidx27, align 4
67 %add28 = add nsw i32 %i9, %i8
68 %arrayidx30 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 4
69 store i32 %add28, ptr %arrayidx30, align 4
; Lane 5: field 1 [5] + field 2 [13] -> field 0 [5].
70 %arrayidx32 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 1, i64 5
71 %i10 = load i32, ptr %arrayidx32, align 4
72 %arrayidx34 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 2, i64 13
73 %i11 = load i32, ptr %arrayidx34, align 4
74 %add35 = add nsw i32 %i11, %i10
75 %arrayidx37 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 5
76 store i32 %add35, ptr %arrayidx37, align 4
; Lane 6: field 1 [6] + field 2 [14] -> field 0 [6].
; Together with lane 5 this forms the in-order adjacent pair {13, 14}.
77 %arrayidx39 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 1, i64 6
78 %i12 = load i32, ptr %arrayidx39, align 4
79 %arrayidx41 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 2, i64 14
80 %i13 = load i32, ptr %arrayidx41, align 4
81 %add42 = add nsw i32 %i13, %i12
82 %arrayidx44 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 6
83 store i32 %add42, ptr %arrayidx44, align 4
; Lane 7: field 1 [7] + field 2 [5] -> field 0 [7].
84 %arrayidx46 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 1, i64 7
85 %i14 = load i32, ptr %arrayidx46, align 4
86 %arrayidx48 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 2, i64 5
87 %i15 = load i32, ptr %arrayidx48, align 4
88 %add49 = add nsw i32 %i15, %i14
89 %arrayidx51 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 7
90 store i32 %add49, ptr %arrayidx51, align 4
94 ; Test for 2 load groups of 4 elements each, loaded from different base pointers.
95 ; Neither group is loaded in order, so there are a few specific points to check:
96 ; (1) these groups are detected, (2) reordering shuffles are generated, and
97 ; (3) these loads are vectorized as part of a tree that is seeded by stores
100 define dso_local void @test_unordered_splits(ptr nocapture %p) local_unnamed_addr {
101 ; CHECK-LABEL: @test_unordered_splits(
103 ; CHECK-NEXT: [[P1:%.*]] = alloca [16 x i32], align 16
104 ; CHECK-NEXT: [[P2:%.*]] = alloca [16 x i32], align 16
105 ; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds [16 x i32], ptr [[P1]], i32 0, i64 4
106 ; CHECK-NEXT: [[G20:%.*]] = getelementptr inbounds [16 x i32], ptr [[P2]], i32 0, i64 12
107 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[G10]], align 4
108 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[G20]], align 4
109 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
110 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
111 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
112 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 7, i32 5, i32 6, i32 4>
113 ; CHECK-NEXT: store <8 x i32> [[TMP5]], ptr [[P:%.*]], align 4
114 ; CHECK-NEXT: ret void
; Input pattern: eight scalar loads copied into eight consecutive i32 slots
; starting at %p. Lanes 0-3 read %p1 elements 5, 4, 6, 7 and lanes 4-7 read
; %p2 elements 15, 13, 14, 12, so each group of four is contiguous in memory
; but permuted relative to the store order.
; The assertions above expect two <4 x i32> loads (from %g10 and %g20),
; concatenated and then permuted by the mask <1, 0, 2, 3, 7, 5, 6, 4>,
; followed by a single <8 x i32> store.
; No stores to %p1 or %p2 appear in this function; only the addressing
; pattern of the loads is relevant here.
117 %p1 = alloca [16 x i32], align 16
118 %p2 = alloca [16 x i32], align 16
; First group: four adjacent elements (4..7) of %p1.
119 %g10 = getelementptr inbounds [16 x i32], ptr %p1, i32 0, i64 4
120 %g11 = getelementptr inbounds [16 x i32], ptr %p1, i32 0, i64 5
121 %g12 = getelementptr inbounds [16 x i32], ptr %p1, i32 0, i64 6
122 %g13 = getelementptr inbounds [16 x i32], ptr %p1, i32 0, i64 7
; Second group: four adjacent elements (12..15) of %p2.
123 %g20 = getelementptr inbounds [16 x i32], ptr %p2, i32 0, i64 12
124 %g21 = getelementptr inbounds [16 x i32], ptr %p2, i32 0, i64 13
125 %g22 = getelementptr inbounds [16 x i32], ptr %p2, i32 0, i64 14
126 %g23 = getelementptr inbounds [16 x i32], ptr %p2, i32 0, i64 15
; Lanes 0-3: %p1 elements in the order 5, 4, 6, 7.
127 %i1 = load i32, ptr %g11, align 4
128 store i32 %i1, ptr %p, align 4
129 %i3 = load i32, ptr %g10, align 4
130 %arrayidx9 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 1
131 store i32 %i3, ptr %arrayidx9, align 4
132 %i5 = load i32, ptr %g12, align 4
133 %arrayidx16 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 2
134 store i32 %i5, ptr %arrayidx16, align 4
135 %i7 = load i32, ptr %g13, align 4
136 %arrayidx23 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 3
137 store i32 %i7, ptr %arrayidx23, align 4
; Lanes 4-7: %p2 elements in the order 15, 13, 14, 12.
138 %i9 = load i32, ptr %g23, align 4
139 %arrayidx30 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 4
140 store i32 %i9, ptr %arrayidx30, align 4
141 %i11 = load i32, ptr %g21, align 4
142 %arrayidx37 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 5
143 store i32 %i11, ptr %arrayidx37, align 4
144 %i13 = load i32, ptr %g22, align 4
145 %arrayidx44 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 6
146 store i32 %i13, ptr %arrayidx44, align 4
147 %i15 = load i32, ptr %g20, align 4
148 %arrayidx51 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 7
149 store i32 %i15, ptr %arrayidx51, align 4
; Eight scalar loads copied into eight consecutive i32 slots starting at %p,
; where the loads come in four in-order adjacent pairs, each pair from a
; different alloca: %p1 [4, 5], %p2 [6, 7], %p3 [12, 13], %p4 [14, 15].
; The autogenerated assertions expect four <2 x i32> loads that are widened
; and merged through shufflevector into one <8 x i32> value, followed by a
; single <8 x i32> store to %p.
153 define dso_local void @test_cost_splits(ptr nocapture %p) local_unnamed_addr {
154 ; CHECK-LABEL: @test_cost_splits(
156 ; CHECK-NEXT: [[P1:%.*]] = alloca [16 x i32], align 16
157 ; CHECK-NEXT: [[P2:%.*]] = alloca [16 x i32], align 16
158 ; CHECK-NEXT: [[P3:%.*]] = alloca [16 x i32], align 16
159 ; CHECK-NEXT: [[P4:%.*]] = alloca [16 x i32], align 16
160 ; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds [16 x i32], ptr [[P1]], i32 0, i64 4
161 ; CHECK-NEXT: [[G12:%.*]] = getelementptr inbounds [16 x i32], ptr [[P2]], i32 0, i64 6
162 ; CHECK-NEXT: [[G20:%.*]] = getelementptr inbounds [16 x i32], ptr [[P3]], i32 0, i64 12
163 ; CHECK-NEXT: [[G22:%.*]] = getelementptr inbounds [16 x i32], ptr [[P4]], i32 0, i64 14
164 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[G10]], align 4
165 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[G12]], align 4
166 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[G20]], align 4
167 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[G22]], align 4
168 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
169 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
170 ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
171 ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
172 ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
173 ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
174 ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
175 ; CHECK-NEXT: store <8 x i32> [[TMP10]], ptr [[P:%.*]], align 4
176 ; CHECK-NEXT: ret void
; Four separate stack arrays, each the base of one adjacent load pair.
179 %p1 = alloca [16 x i32], align 16
180 %p2 = alloca [16 x i32], align 16
181 %p3 = alloca [16 x i32], align 16
182 %p4 = alloca [16 x i32], align 16
; Pair addresses: (%p1: 4, 5), (%p2: 6, 7), (%p3: 12, 13), (%p4: 14, 15).
183 %g10 = getelementptr inbounds [16 x i32], ptr %p1, i32 0, i64 4
184 %g11 = getelementptr inbounds [16 x i32], ptr %p1, i32 0, i64 5
185 %g12 = getelementptr inbounds [16 x i32], ptr %p2, i32 0, i64 6
186 %g13 = getelementptr inbounds [16 x i32], ptr %p2, i32 0, i64 7
187 %g20 = getelementptr inbounds [16 x i32], ptr %p3, i32 0, i64 12
188 %g21 = getelementptr inbounds [16 x i32], ptr %p3, i32 0, i64 13
189 %g22 = getelementptr inbounds [16 x i32], ptr %p4, i32 0, i64 14
190 %g23 = getelementptr inbounds [16 x i32], ptr %p4, i32 0, i64 15
; Lanes 0-1: the %p1 pair, in memory order.
191 %i1 = load i32, ptr %g10, align 4
192 store i32 %i1, ptr %p, align 4
193 %i3 = load i32, ptr %g11, align 4
194 %arrayidx9 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 1
195 store i32 %i3, ptr %arrayidx9, align 4
; Lanes 2-3: the %p2 pair, in memory order.
196 %i5 = load i32, ptr %g12, align 4
197 %arrayidx16 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 2
198 store i32 %i5, ptr %arrayidx16, align 4
199 %i7 = load i32, ptr %g13, align 4
200 %arrayidx23 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 3
201 store i32 %i7, ptr %arrayidx23, align 4
; Lanes 4-5: the %p3 pair, in memory order.
202 %i9 = load i32, ptr %g20, align 4
203 %arrayidx30 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 4
204 store i32 %i9, ptr %arrayidx30, align 4
205 %i11 = load i32, ptr %g21, align 4
206 %arrayidx37 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 5
207 store i32 %i11, ptr %arrayidx37, align 4
; Lanes 6-7: the %p4 pair, in memory order.
208 %i13 = load i32, ptr %g22, align 4
209 %arrayidx44 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 6
210 store i32 %i13, ptr %arrayidx44, align 4
211 %i15 = load i32, ptr %g23, align 4
212 %arrayidx51 = getelementptr inbounds %struct.S, ptr %p, i64 0, i32 0, i64 7
213 store i32 %i15, ptr %arrayidx51, align 4