1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s
; Two-element double case. The scalar IR computes %2[i] * %3[i] + %1[i] for
; i = 0 and 1, packs both sums into a [2 x double] with an insertvalue chain,
; and stores that aggregate through the sret pointer %0.
; The assertions expect one <2 x double> load per input pointer, a single
; vector fmul and fadd, and one extractelement feeding each insertvalue.
4 define void @julia_2xdouble(ptr sret([2 x double]), ptr, ptr, ptr) {
5 ; CHECK-LABEL: @julia_2xdouble(
7 ; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, ptr [[TMP2:%.*]], align 4
8 ; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, ptr [[TMP3:%.*]], align 4
9 ; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP5]], [[TMP7]]
10 ; CHECK-NEXT: [[TMP10:%.*]] = load <2 x double>, ptr [[TMP1:%.*]], align 4
11 ; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP8]], [[TMP10]]
12 ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP11]], i32 0
13 ; CHECK-NEXT: [[I0:%.*]] = insertvalue [2 x double] undef, double [[TMP12]], 0
14 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x double> [[TMP11]], i32 1
15 ; CHECK-NEXT: [[I1:%.*]] = insertvalue [2 x double] [[I0]], double [[TMP13]], 1
16 ; CHECK-NEXT: store [2 x double] [[I1]], ptr [[TMP0:%.*]], align 4
17 ; CHECK-NEXT: ret void
; Scalar input: element 0 is loaded directly from each pointer, element 1
; through a GEP with indices (0, 1).
20 %x0 = load double, ptr %2, align 4
21 %y0 = load double, ptr %3, align 4
22 %m0 = fmul double %x0, %y0
23 %px1 = getelementptr inbounds [2 x double], ptr %2, i64 0, i64 1
24 %x1 = load double, ptr %px1, align 4
25 %py1 = getelementptr inbounds [2 x double], ptr %3, i64 0, i64 1
26 %y1 = load double, ptr %py1, align 4
27 %m1 = fmul double %x1, %y1
; The addend loads from %1 are interleaved with the insertvalue chain that
; builds the result aggregate.
28 %z0 = load double, ptr %1, align 4
29 %a0 = fadd double %m0, %z0
30 %i0 = insertvalue [2 x double] undef, double %a0, 0
31 %pz1 = getelementptr inbounds [2 x double], ptr %1, i64 0, i64 1
32 %z1 = load double, ptr %pz1, align 4
33 %a1 = fadd double %m1, %z1
34 %i1 = insertvalue [2 x double] %i0, double %a1, 1
35 store [2 x double] %i1, ptr %0, align 4
; Four-element float case. The scalar IR computes %2[i] * %3[i] + %1[i] for
; i = 0..3, packs the four sums into a [4 x float] with an insertvalue chain,
; and stores that aggregate through the sret pointer %0.
; The assertions expect one <4 x float> load per input pointer, a single
; vector fmul and fadd, and one extractelement feeding each insertvalue.
39 define void @julia_4xfloat(ptr sret([4 x float]), ptr, ptr, ptr) {
40 ; CHECK-LABEL: @julia_4xfloat(
42 ; CHECK-NEXT: [[TMP5:%.*]] = load <4 x float>, ptr [[TMP2:%.*]], align 4
43 ; CHECK-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[TMP3:%.*]], align 4
44 ; CHECK-NEXT: [[TMP8:%.*]] = fmul <4 x float> [[TMP5]], [[TMP7]]
45 ; CHECK-NEXT: [[TMP10:%.*]] = load <4 x float>, ptr [[TMP1:%.*]], align 4
46 ; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x float> [[TMP8]], [[TMP10]]
47 ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP11]], i32 0
48 ; CHECK-NEXT: [[I0:%.*]] = insertvalue [4 x float] undef, float [[TMP12]], 0
49 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP11]], i32 1
50 ; CHECK-NEXT: [[I1:%.*]] = insertvalue [4 x float] [[I0]], float [[TMP13]], 1
51 ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP11]], i32 2
52 ; CHECK-NEXT: [[I2:%.*]] = insertvalue [4 x float] [[I1]], float [[TMP14]], 2
53 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP11]], i32 3
54 ; CHECK-NEXT: [[I3:%.*]] = insertvalue [4 x float] [[I2]], float [[TMP15]], 3
55 ; CHECK-NEXT: store [4 x float] [[I3]], ptr [[TMP0:%.*]], align 4
56 ; CHECK-NEXT: ret void
; Scalar input: the four products are computed first, element by element,
; from consecutive elements of %2 and %3.
59 %x0 = load float, ptr %2, align 4
60 %y0 = load float, ptr %3, align 4
61 %m0 = fmul float %x0, %y0
62 %px1 = getelementptr inbounds [4 x float], ptr %2, i64 0, i64 1
63 %x1 = load float, ptr %px1, align 4
64 %py1 = getelementptr inbounds [4 x float], ptr %3, i64 0, i64 1
65 %y1 = load float, ptr %py1, align 4
66 %m1 = fmul float %x1, %y1
67 %px2 = getelementptr inbounds [4 x float], ptr %2, i64 0, i64 2
68 %x2 = load float, ptr %px2, align 4
69 %py2 = getelementptr inbounds [4 x float], ptr %3, i64 0, i64 2
70 %y2 = load float, ptr %py2, align 4
71 %m2 = fmul float %x2, %y2
72 %px3 = getelementptr inbounds [4 x float], ptr %2, i64 0, i64 3
73 %x3 = load float, ptr %px3, align 4
74 %py3 = getelementptr inbounds [4 x float], ptr %3, i64 0, i64 3
75 %y3 = load float, ptr %py3, align 4
76 %m3 = fmul float %x3, %y3
; The addend loads from %1 and the fadds are interleaved with the
; insertvalue chain that builds the result aggregate.
77 %z0 = load float, ptr %1, align 4
78 %a0 = fadd float %m0, %z0
79 %i0 = insertvalue [4 x float] undef, float %a0, 0
80 %pz1 = getelementptr inbounds [4 x float], ptr %1, i64 0, i64 1
81 %z1 = load float, ptr %pz1, align 4
82 %a1 = fadd float %m1, %z1
83 %i1 = insertvalue [4 x float] %i0, float %a1, 1
84 %pz2 = getelementptr inbounds [4 x float], ptr %1, i64 0, i64 2
85 %z2 = load float, ptr %pz2, align 4
86 %a2 = fadd float %m2, %z2
87 %i2 = insertvalue [4 x float] %i1, float %a2, 2
88 %pz3 = getelementptr inbounds [4 x float], ptr %1, i64 0, i64 3
89 %z3 = load float, ptr %pz3, align 4
90 %a3 = fadd float %m3, %z3
91 %i3 = insertvalue [4 x float] %i2, float %a3, 3
92 store [4 x float] %i3, ptr %0, align 4
; Whole-aggregate load case for [4 x float]. The scalar IR loads %a and %b as
; [4 x float] values, takes them apart with extractvalue, subtracts
; element-wise (a[i] - b[i]), rebuilds a [4 x float] with insertvalue, and
; stores it to %c.
; The assertions expect each aggregate load to become a <4 x float> load and
; the four scalar fsubs to become one vector fsub, followed by one
; extractelement per insertvalue.
96 define void @julia_load_array_of_float(ptr %a, ptr %b, ptr %c) {
97 ; CHECK-LABEL: @julia_load_array_of_float(
99 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4
100 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
101 ; CHECK-NEXT: [[TMP4:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]]
102 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP4]], i32 0
103 ; CHECK-NEXT: [[C_ARR0:%.*]] = insertvalue [4 x float] undef, float [[TMP5]], 0
104 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
105 ; CHECK-NEXT: [[C_ARR1:%.*]] = insertvalue [4 x float] [[C_ARR0]], float [[TMP6]], 1
106 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
107 ; CHECK-NEXT: [[C_ARR2:%.*]] = insertvalue [4 x float] [[C_ARR1]], float [[TMP7]], 2
108 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP4]], i32 3
109 ; CHECK-NEXT: [[C_ARR3:%.*]] = insertvalue [4 x float] [[C_ARR2]], float [[TMP8]], 3
110 ; CHECK-NEXT: store [4 x float] [[C_ARR3]], ptr [[C:%.*]], align 4
111 ; CHECK-NEXT: ret void
; Scalar input: the extractvalues and fsubs appear out of element order
; (indices 0, 2, 1, then 3 later), and the last fsub sits inside the
; insertvalue chain.
114 %a_arr = load [4 x float], ptr %a, align 4
115 %a0 = extractvalue [4 x float] %a_arr, 0
116 %a2 = extractvalue [4 x float] %a_arr, 2
117 %a1 = extractvalue [4 x float] %a_arr, 1
118 %b_arr = load [4 x float], ptr %b, align 4
119 %b0 = extractvalue [4 x float] %b_arr, 0
120 %b2 = extractvalue [4 x float] %b_arr, 2
121 %b1 = extractvalue [4 x float] %b_arr, 1
122 %a3 = extractvalue [4 x float] %a_arr, 3
123 %c1 = fsub float %a1, %b1
124 %b3 = extractvalue [4 x float] %b_arr, 3
125 %c0 = fsub float %a0, %b0
126 %c2 = fsub float %a2, %b2
127 %c_arr0 = insertvalue [4 x float] undef, float %c0, 0
128 %c_arr1 = insertvalue [4 x float] %c_arr0, float %c1, 1
129 %c3 = fsub float %a3, %b3
130 %c_arr2 = insertvalue [4 x float] %c_arr1, float %c2, 2
131 %c_arr3 = insertvalue [4 x float] %c_arr2, float %c3, 3
132 store [4 x float] %c_arr3, ptr %c, align 4
; Whole-aggregate load case for [4 x i32]. The scalar IR loads %a and %b as
; [4 x i32] values, takes them apart with extractvalue, subtracts
; element-wise (a[i] - b[i]), rebuilds a [4 x i32] with insertvalue, and
; stores it to %c.
; The assertions expect each aggregate load to become a <4 x i32> load and
; the four scalar subs to become one vector sub, followed by one
; extractelement per insertvalue.
136 define void @julia_load_array_of_i32(ptr %a, ptr %b, ptr %c) {
137 ; CHECK-LABEL: @julia_load_array_of_i32(
139 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[A:%.*]], align 4
140 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4
141 ; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]]
142 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
143 ; CHECK-NEXT: [[C_ARR0:%.*]] = insertvalue [4 x i32] undef, i32 [[TMP5]], 0
144 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
145 ; CHECK-NEXT: [[C_ARR1:%.*]] = insertvalue [4 x i32] [[C_ARR0]], i32 [[TMP6]], 1
146 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
147 ; CHECK-NEXT: [[C_ARR2:%.*]] = insertvalue [4 x i32] [[C_ARR1]], i32 [[TMP7]], 2
148 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
149 ; CHECK-NEXT: [[C_ARR3:%.*]] = insertvalue [4 x i32] [[C_ARR2]], i32 [[TMP8]], 3
150 ; CHECK-NEXT: store [4 x i32] [[C_ARR3]], ptr [[C:%.*]], align 4
151 ; CHECK-NEXT: ret void
; Scalar input: the extractvalues and subs appear out of element order
; (indices 0, 2, 1, then 3 later), and the last sub sits inside the
; insertvalue chain.
154 %a_arr = load [4 x i32], ptr %a, align 4
155 %a0 = extractvalue [4 x i32] %a_arr, 0
156 %a2 = extractvalue [4 x i32] %a_arr, 2
157 %a1 = extractvalue [4 x i32] %a_arr, 1
158 %b_arr = load [4 x i32], ptr %b, align 4
159 %b0 = extractvalue [4 x i32] %b_arr, 0
160 %b2 = extractvalue [4 x i32] %b_arr, 2
161 %b1 = extractvalue [4 x i32] %b_arr, 1
162 %a3 = extractvalue [4 x i32] %a_arr, 3
163 %c1 = sub i32 %a1, %b1
164 %b3 = extractvalue [4 x i32] %b_arr, 3
165 %c0 = sub i32 %a0, %b0
166 %c2 = sub i32 %a2, %b2
167 %c_arr0 = insertvalue [4 x i32] undef, i32 %c0, 0
168 %c_arr1 = insertvalue [4 x i32] %c_arr0, i32 %c1, 1
169 %c3 = sub i32 %a3, %b3
170 %c_arr2 = insertvalue [4 x i32] %c_arr1, i32 %c2, 2
171 %c_arr3 = insertvalue [4 x i32] %c_arr2, i32 %c3, 3
172 store [4 x i32] %c_arr3, ptr %c, align 4
176 ; Almost identical to the previous test, but for an element type (i16) that should NOT be vectorized.
; Negative test for [4 x i16]. The scalar IR has the same shape as the
; [4 x i32] test (aggregate loads, extractvalue, element-wise sub,
; insertvalue chain, aggregate store), but the assertions expect the output
; to match the input instruction for instruction, with no vector ops.
; NOTE(review): presumably rejected because a [4 x i16] aggregate is only
; 64 bits wide; confirm against the SLP vectorizer's aggregate-load handling.
178 define void @julia_load_array_of_i16(ptr %a, ptr %b, ptr %c) {
179 ; CHECK-LABEL: @julia_load_array_of_i16(
181 ; CHECK-NEXT: [[A_ARR:%.*]] = load [4 x i16], ptr [[A:%.*]], align 4
182 ; CHECK-NEXT: [[A0:%.*]] = extractvalue [4 x i16] [[A_ARR]], 0
183 ; CHECK-NEXT: [[A2:%.*]] = extractvalue [4 x i16] [[A_ARR]], 2
184 ; CHECK-NEXT: [[A1:%.*]] = extractvalue [4 x i16] [[A_ARR]], 1
185 ; CHECK-NEXT: [[B_ARR:%.*]] = load [4 x i16], ptr [[B:%.*]], align 4
186 ; CHECK-NEXT: [[B0:%.*]] = extractvalue [4 x i16] [[B_ARR]], 0
187 ; CHECK-NEXT: [[B2:%.*]] = extractvalue [4 x i16] [[B_ARR]], 2
188 ; CHECK-NEXT: [[B1:%.*]] = extractvalue [4 x i16] [[B_ARR]], 1
189 ; CHECK-NEXT: [[A3:%.*]] = extractvalue [4 x i16] [[A_ARR]], 3
190 ; CHECK-NEXT: [[C1:%.*]] = sub i16 [[A1]], [[B1]]
191 ; CHECK-NEXT: [[B3:%.*]] = extractvalue [4 x i16] [[B_ARR]], 3
192 ; CHECK-NEXT: [[C0:%.*]] = sub i16 [[A0]], [[B0]]
193 ; CHECK-NEXT: [[C2:%.*]] = sub i16 [[A2]], [[B2]]
194 ; CHECK-NEXT: [[C_ARR0:%.*]] = insertvalue [4 x i16] undef, i16 [[C0]], 0
195 ; CHECK-NEXT: [[C_ARR1:%.*]] = insertvalue [4 x i16] [[C_ARR0]], i16 [[C1]], 1
196 ; CHECK-NEXT: [[C3:%.*]] = sub i16 [[A3]], [[B3]]
197 ; CHECK-NEXT: [[C_ARR2:%.*]] = insertvalue [4 x i16] [[C_ARR1]], i16 [[C2]], 2
198 ; CHECK-NEXT: [[C_ARR3:%.*]] = insertvalue [4 x i16] [[C_ARR2]], i16 [[C3]], 3
199 ; CHECK-NEXT: store [4 x i16] [[C_ARR3]], ptr [[C:%.*]], align 4
200 ; CHECK-NEXT: ret void
; Scalar input: same out-of-order instruction layout as the i32 variant, with
; i16 elements.
203 %a_arr = load [4 x i16], ptr %a, align 4
204 %a0 = extractvalue [4 x i16] %a_arr, 0
205 %a2 = extractvalue [4 x i16] %a_arr, 2
206 %a1 = extractvalue [4 x i16] %a_arr, 1
207 %b_arr = load [4 x i16], ptr %b, align 4
208 %b0 = extractvalue [4 x i16] %b_arr, 0
209 %b2 = extractvalue [4 x i16] %b_arr, 2
210 %b1 = extractvalue [4 x i16] %b_arr, 1
211 %a3 = extractvalue [4 x i16] %a_arr, 3
212 %c1 = sub i16 %a1, %b1
213 %b3 = extractvalue [4 x i16] %b_arr, 3
214 %c0 = sub i16 %a0, %b0
215 %c2 = sub i16 %a2, %b2
216 %c_arr0 = insertvalue [4 x i16] undef, i16 %c0, 0
217 %c_arr1 = insertvalue [4 x i16] %c_arr0, i16 %c1, 1
218 %c3 = sub i16 %a3, %b3
219 %c_arr2 = insertvalue [4 x i16] %c_arr1, i16 %c2, 2
220 %c_arr3 = insertvalue [4 x i16] %c_arr2, i16 %c3, 3
221 store [4 x i16] %c_arr3, ptr %c, align 4
; Named struct of four float fields; loaded, taken apart and rebuilt by
; @julia_load_struct_of_float.
225 %pseudovec = type { float, float, float, float }
; Whole-aggregate load case for the %pseudovec struct (four float fields).
; The scalar IR loads %a and %b as %pseudovec values, takes them apart with
; extractvalue, subtracts field-wise (a.i - b.i), rebuilds a %pseudovec with
; insertvalue, and stores it to %c.
; The assertions expect each struct load to become a <4 x float> load and the
; four scalar fsubs to become one vector fsub, followed by one extractelement
; per insertvalue.
227 define void @julia_load_struct_of_float(ptr %a, ptr %b, ptr %c) {
228 ; CHECK-LABEL: @julia_load_struct_of_float(
230 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4
231 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
232 ; CHECK-NEXT: [[TMP4:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]]
233 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP4]], i32 0
234 ; CHECK-NEXT: [[C_STRUCT0:%.*]] = insertvalue [[PSEUDOVEC:%.*]] undef, float [[TMP5]], 0
235 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
236 ; CHECK-NEXT: [[C_STRUCT1:%.*]] = insertvalue [[PSEUDOVEC]] [[C_STRUCT0]], float [[TMP6]], 1
237 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
238 ; CHECK-NEXT: [[C_STRUCT2:%.*]] = insertvalue [[PSEUDOVEC]] [[C_STRUCT1]], float [[TMP7]], 2
239 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP4]], i32 3
240 ; CHECK-NEXT: [[C_STRUCT3:%.*]] = insertvalue [[PSEUDOVEC]] [[C_STRUCT2]], float [[TMP8]], 3
241 ; CHECK-NEXT: store [[PSEUDOVEC]] [[C_STRUCT3]], ptr [[C:%.*]], align 4
242 ; CHECK-NEXT: ret void
; Scalar input: extractvalues, fsubs and insertvalues are interleaved, and
; %c3 is computed before %c2 although it is inserted last.
245 %a_struct = load %pseudovec, ptr %a, align 4
246 %a0 = extractvalue %pseudovec %a_struct, 0
247 %a1 = extractvalue %pseudovec %a_struct, 1
248 %b_struct = load %pseudovec, ptr %b, align 4
249 %a2 = extractvalue %pseudovec %a_struct, 2
250 %b0 = extractvalue %pseudovec %b_struct, 0
251 %a3 = extractvalue %pseudovec %a_struct, 3
252 %c0 = fsub float %a0, %b0
253 %b1 = extractvalue %pseudovec %b_struct, 1
254 %b2 = extractvalue %pseudovec %b_struct, 2
255 %c1 = fsub float %a1, %b1
256 %c_struct0 = insertvalue %pseudovec undef, float %c0, 0
257 %b3 = extractvalue %pseudovec %b_struct, 3
258 %c3 = fsub float %a3, %b3
259 %c_struct1 = insertvalue %pseudovec %c_struct0, float %c1, 1
260 %c2 = fsub float %a2, %b2
261 %c_struct2 = insertvalue %pseudovec %c_struct1, float %c2, 2
262 %c_struct3 = insertvalue %pseudovec %c_struct2, float %c3, 3
263 store %pseudovec %c_struct3, ptr %c, align 4