1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -S | FileCheck %s
3 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -basic-aa -slp-vectorizer -S | FileCheck %s
4 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s
5 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s
8 ; dot4(float *x, float *y) - ((x[0]*y[0])+(x[1]*y[1])+(x[2]*y[2])+(x[3]*y[3]))
11 define double @dot4f64(double* dereferenceable(32) %ptrx, double* dereferenceable(32) %ptry) {
12 ; CHECK-LABEL: @dot4f64(
13 ; CHECK-NEXT: [[PTRX1:%.*]] = getelementptr inbounds double, double* [[PTRX:%.*]], i64 1
14 ; CHECK-NEXT: [[PTRY1:%.*]] = getelementptr inbounds double, double* [[PTRY:%.*]], i64 1
15 ; CHECK-NEXT: [[PTRX2:%.*]] = getelementptr inbounds double, double* [[PTRX]], i64 2
16 ; CHECK-NEXT: [[PTRY2:%.*]] = getelementptr inbounds double, double* [[PTRY]], i64 2
17 ; CHECK-NEXT: [[PTRX3:%.*]] = getelementptr inbounds double, double* [[PTRX]], i64 3
18 ; CHECK-NEXT: [[PTRY3:%.*]] = getelementptr inbounds double, double* [[PTRY]], i64 3
19 ; CHECK-NEXT: [[X0:%.*]] = load double, double* [[PTRX]], align 4
20 ; CHECK-NEXT: [[Y0:%.*]] = load double, double* [[PTRY]], align 4
21 ; CHECK-NEXT: [[X1:%.*]] = load double, double* [[PTRX1]], align 4
22 ; CHECK-NEXT: [[Y1:%.*]] = load double, double* [[PTRY1]], align 4
23 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[PTRX2]] to <2 x double>*
24 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4
25 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[PTRY2]] to <2 x double>*
26 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 4
27 ; CHECK-NEXT: [[MUL0:%.*]] = fmul double [[X0]], [[Y0]]
28 ; CHECK-NEXT: [[MUL1:%.*]] = fmul double [[X1]], [[Y1]]
29 ; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]]
30 ; CHECK-NEXT: [[DOT01:%.*]] = fadd double [[MUL0]], [[MUL1]]
31 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
32 ; CHECK-NEXT: [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP6]]
33 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
34 ; CHECK-NEXT: [[DOT0123:%.*]] = fadd double [[DOT012]], [[TMP7]]
35 ; CHECK-NEXT: ret double [[DOT0123]]
37 %ptrx1 = getelementptr inbounds double, double* %ptrx, i64 1
38 %ptry1 = getelementptr inbounds double, double* %ptry, i64 1
39 %ptrx2 = getelementptr inbounds double, double* %ptrx, i64 2
40 %ptry2 = getelementptr inbounds double, double* %ptry, i64 2
41 %ptrx3 = getelementptr inbounds double, double* %ptrx, i64 3
42 %ptry3 = getelementptr inbounds double, double* %ptry, i64 3
43 %x0 = load double, double* %ptrx, align 4
44 %y0 = load double, double* %ptry, align 4
45 %x1 = load double, double* %ptrx1, align 4
46 %y1 = load double, double* %ptry1, align 4
47 %x2 = load double, double* %ptrx2, align 4
48 %y2 = load double, double* %ptry2, align 4
49 %x3 = load double, double* %ptrx3, align 4
50 %y3 = load double, double* %ptry3, align 4
51 %mul0 = fmul double %x0, %y0
52 %mul1 = fmul double %x1, %y1
53 %mul2 = fmul double %x2, %y2
54 %mul3 = fmul double %x3, %y3
55 %dot01 = fadd double %mul0, %mul1
56 %dot012 = fadd double %dot01, %mul2
57 %dot0123 = fadd double %dot012, %mul3
58 ret double %dot0123
59 }
61 define float @dot4f32(float* dereferenceable(16) %ptrx, float* dereferenceable(16) %ptry) {
62 ; CHECK-LABEL: @dot4f32(
63 ; CHECK-NEXT: [[PTRX1:%.*]] = getelementptr inbounds float, float* [[PTRX:%.*]], i64 1
64 ; CHECK-NEXT: [[PTRY1:%.*]] = getelementptr inbounds float, float* [[PTRY:%.*]], i64 1
65 ; CHECK-NEXT: [[PTRX2:%.*]] = getelementptr inbounds float, float* [[PTRX]], i64 2
66 ; CHECK-NEXT: [[PTRY2:%.*]] = getelementptr inbounds float, float* [[PTRY]], i64 2
67 ; CHECK-NEXT: [[PTRX3:%.*]] = getelementptr inbounds float, float* [[PTRX]], i64 3
68 ; CHECK-NEXT: [[PTRY3:%.*]] = getelementptr inbounds float, float* [[PTRY]], i64 3
69 ; CHECK-NEXT: [[X0:%.*]] = load float, float* [[PTRX]], align 4
70 ; CHECK-NEXT: [[Y0:%.*]] = load float, float* [[PTRY]], align 4
71 ; CHECK-NEXT: [[X1:%.*]] = load float, float* [[PTRX1]], align 4
72 ; CHECK-NEXT: [[Y1:%.*]] = load float, float* [[PTRY1]], align 4
73 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[PTRX2]] to <2 x float>*
74 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
75 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[PTRY2]] to <2 x float>*
76 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4
77 ; CHECK-NEXT: [[MUL0:%.*]] = fmul float [[X0]], [[Y0]]
78 ; CHECK-NEXT: [[MUL1:%.*]] = fmul float [[X1]], [[Y1]]
79 ; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
80 ; CHECK-NEXT: [[DOT01:%.*]] = fadd float [[MUL0]], [[MUL1]]
81 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0
82 ; CHECK-NEXT: [[DOT012:%.*]] = fadd float [[DOT01]], [[TMP6]]
83 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP5]], i32 1
84 ; CHECK-NEXT: [[DOT0123:%.*]] = fadd float [[DOT012]], [[TMP7]]
85 ; CHECK-NEXT: ret float [[DOT0123]]
87 %ptrx1 = getelementptr inbounds float, float* %ptrx, i64 1
88 %ptry1 = getelementptr inbounds float, float* %ptry, i64 1
89 %ptrx2 = getelementptr inbounds float, float* %ptrx, i64 2
90 %ptry2 = getelementptr inbounds float, float* %ptry, i64 2
91 %ptrx3 = getelementptr inbounds float, float* %ptrx, i64 3
92 %ptry3 = getelementptr inbounds float, float* %ptry, i64 3
93 %x0 = load float, float* %ptrx, align 4
94 %y0 = load float, float* %ptry, align 4
95 %x1 = load float, float* %ptrx1, align 4
96 %y1 = load float, float* %ptry1, align 4
97 %x2 = load float, float* %ptrx2, align 4
98 %y2 = load float, float* %ptry2, align 4
99 %x3 = load float, float* %ptrx3, align 4
100 %y3 = load float, float* %ptry3, align 4
101 %mul0 = fmul float %x0, %y0
102 %mul1 = fmul float %x1, %y1
103 %mul2 = fmul float %x2, %y2
104 %mul3 = fmul float %x3, %y3
105 %dot01 = fadd float %mul0, %mul1
106 %dot012 = fadd float %dot01, %mul2
107 %dot0123 = fadd float %dot012, %mul3
108 ret float %dot0123
109 }
111 define double @dot4f64_fast(double* dereferenceable(32) %ptrx, double* dereferenceable(32) %ptry) {
112 ; CHECK-LABEL: @dot4f64_fast(
113 ; CHECK-NEXT: [[PTRX1:%.*]] = getelementptr inbounds double, double* [[PTRX:%.*]], i64 1
114 ; CHECK-NEXT: [[PTRY1:%.*]] = getelementptr inbounds double, double* [[PTRY:%.*]], i64 1
115 ; CHECK-NEXT: [[PTRX2:%.*]] = getelementptr inbounds double, double* [[PTRX]], i64 2
116 ; CHECK-NEXT: [[PTRY2:%.*]] = getelementptr inbounds double, double* [[PTRY]], i64 2
117 ; CHECK-NEXT: [[PTRX3:%.*]] = getelementptr inbounds double, double* [[PTRX]], i64 3
118 ; CHECK-NEXT: [[PTRY3:%.*]] = getelementptr inbounds double, double* [[PTRY]], i64 3
119 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[PTRX]] to <4 x double>*
120 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* [[TMP1]], align 4
121 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[PTRY]] to <4 x double>*
122 ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x double>, <4 x double>* [[TMP3]], align 4
123 ; CHECK-NEXT: [[TMP5:%.*]] = fmul <4 x double> [[TMP2]], [[TMP4]]
124 ; CHECK-NEXT: [[TMP6:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[TMP5]])
125 ; CHECK-NEXT: ret double [[TMP6]]
127 %ptrx1 = getelementptr inbounds double, double* %ptrx, i64 1
128 %ptry1 = getelementptr inbounds double, double* %ptry, i64 1
129 %ptrx2 = getelementptr inbounds double, double* %ptrx, i64 2
130 %ptry2 = getelementptr inbounds double, double* %ptry, i64 2
131 %ptrx3 = getelementptr inbounds double, double* %ptrx, i64 3
132 %ptry3 = getelementptr inbounds double, double* %ptry, i64 3
133 %x0 = load double, double* %ptrx, align 4
134 %y0 = load double, double* %ptry, align 4
135 %x1 = load double, double* %ptrx1, align 4
136 %y1 = load double, double* %ptry1, align 4
137 %x2 = load double, double* %ptrx2, align 4
138 %y2 = load double, double* %ptry2, align 4
139 %x3 = load double, double* %ptrx3, align 4
140 %y3 = load double, double* %ptry3, align 4
141 %mul0 = fmul double %x0, %y0
142 %mul1 = fmul double %x1, %y1
143 %mul2 = fmul double %x2, %y2
144 %mul3 = fmul double %x3, %y3
145 %dot01 = fadd fast double %mul0, %mul1
146 %dot012 = fadd fast double %dot01, %mul2
147 %dot0123 = fadd fast double %dot012, %mul3
148 ret double %dot0123
149 }
151 define float @dot4f32_fast(float* dereferenceable(16) %ptrx, float* dereferenceable(16) %ptry) {
152 ; CHECK-LABEL: @dot4f32_fast(
153 ; CHECK-NEXT: [[PTRX1:%.*]] = getelementptr inbounds float, float* [[PTRX:%.*]], i64 1
154 ; CHECK-NEXT: [[PTRY1:%.*]] = getelementptr inbounds float, float* [[PTRY:%.*]], i64 1
155 ; CHECK-NEXT: [[PTRX2:%.*]] = getelementptr inbounds float, float* [[PTRX]], i64 2
156 ; CHECK-NEXT: [[PTRY2:%.*]] = getelementptr inbounds float, float* [[PTRY]], i64 2
157 ; CHECK-NEXT: [[PTRX3:%.*]] = getelementptr inbounds float, float* [[PTRX]], i64 3
158 ; CHECK-NEXT: [[PTRY3:%.*]] = getelementptr inbounds float, float* [[PTRY]], i64 3
159 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[PTRX]] to <4 x float>*
160 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
161 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[PTRY]] to <4 x float>*
162 ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
163 ; CHECK-NEXT: [[TMP5:%.*]] = fmul <4 x float> [[TMP2]], [[TMP4]]
164 ; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
165 ; CHECK-NEXT: ret float [[TMP6]]
167 %ptrx1 = getelementptr inbounds float, float* %ptrx, i64 1
168 %ptry1 = getelementptr inbounds float, float* %ptry, i64 1
169 %ptrx2 = getelementptr inbounds float, float* %ptrx, i64 2
170 %ptry2 = getelementptr inbounds float, float* %ptry, i64 2
171 %ptrx3 = getelementptr inbounds float, float* %ptrx, i64 3
172 %ptry3 = getelementptr inbounds float, float* %ptry, i64 3
173 %x0 = load float, float* %ptrx, align 4
174 %y0 = load float, float* %ptry, align 4
175 %x1 = load float, float* %ptrx1, align 4
176 %y1 = load float, float* %ptry1, align 4
177 %x2 = load float, float* %ptrx2, align 4
178 %y2 = load float, float* %ptry2, align 4
179 %x3 = load float, float* %ptrx3, align 4
180 %y3 = load float, float* %ptry3, align 4
181 %mul0 = fmul float %x0, %y0
182 %mul1 = fmul float %x1, %y1
183 %mul2 = fmul float %x2, %y2
184 %mul3 = fmul float %x3, %y3
185 %dot01 = fadd fast float %mul0, %mul1
186 %dot012 = fadd fast float %dot01, %mul2
187 %dot0123 = fadd fast float %dot012, %mul3
188 ret float %dot0123
189 }
192 ; dot3(float *x, float *y) - ((x[0]*y[0])+(x[1]*y[1])+(x[2]*y[2]))
195 define double @dot3f64(double* dereferenceable(32) %ptrx, double* dereferenceable(32) %ptry) {
196 ; CHECK-LABEL: @dot3f64(
197 ; CHECK-NEXT: [[PTRX1:%.*]] = getelementptr inbounds double, double* [[PTRX:%.*]], i64 1
198 ; CHECK-NEXT: [[PTRY1:%.*]] = getelementptr inbounds double, double* [[PTRY:%.*]], i64 1
199 ; CHECK-NEXT: [[PTRX2:%.*]] = getelementptr inbounds double, double* [[PTRX]], i64 2
200 ; CHECK-NEXT: [[PTRY2:%.*]] = getelementptr inbounds double, double* [[PTRY]], i64 2
201 ; CHECK-NEXT: [[X0:%.*]] = load double, double* [[PTRX]], align 4
202 ; CHECK-NEXT: [[Y0:%.*]] = load double, double* [[PTRY]], align 4
203 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[PTRX1]] to <2 x double>*
204 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4
205 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[PTRY1]] to <2 x double>*
206 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 4
207 ; CHECK-NEXT: [[MUL0:%.*]] = fmul double [[X0]], [[Y0]]
208 ; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]]
209 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
210 ; CHECK-NEXT: [[DOT01:%.*]] = fadd double [[MUL0]], [[TMP6]]
211 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
212 ; CHECK-NEXT: [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP7]]
213 ; CHECK-NEXT: ret double [[DOT012]]
215 %ptrx1 = getelementptr inbounds double, double* %ptrx, i64 1
216 %ptry1 = getelementptr inbounds double, double* %ptry, i64 1
217 %ptrx2 = getelementptr inbounds double, double* %ptrx, i64 2
218 %ptry2 = getelementptr inbounds double, double* %ptry, i64 2
219 %x0 = load double, double* %ptrx, align 4
220 %y0 = load double, double* %ptry, align 4
221 %x1 = load double, double* %ptrx1, align 4
222 %y1 = load double, double* %ptry1, align 4
223 %x2 = load double, double* %ptrx2, align 4
224 %y2 = load double, double* %ptry2, align 4
225 %mul0 = fmul double %x0, %y0
226 %mul1 = fmul double %x1, %y1
227 %mul2 = fmul double %x2, %y2
228 %dot01 = fadd double %mul0, %mul1
229 %dot012 = fadd double %dot01, %mul2
230 ret double %dot012
231 }
233 define float @dot3f32(float* dereferenceable(16) %ptrx, float* dereferenceable(16) %ptry) {
234 ; CHECK-LABEL: @dot3f32(
235 ; CHECK-NEXT: [[PTRX1:%.*]] = getelementptr inbounds float, float* [[PTRX:%.*]], i64 1
236 ; CHECK-NEXT: [[PTRY1:%.*]] = getelementptr inbounds float, float* [[PTRY:%.*]], i64 1
237 ; CHECK-NEXT: [[PTRX2:%.*]] = getelementptr inbounds float, float* [[PTRX]], i64 2
238 ; CHECK-NEXT: [[PTRY2:%.*]] = getelementptr inbounds float, float* [[PTRY]], i64 2
239 ; CHECK-NEXT: [[X0:%.*]] = load float, float* [[PTRX]], align 4
240 ; CHECK-NEXT: [[Y0:%.*]] = load float, float* [[PTRY]], align 4
241 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[PTRX1]] to <2 x float>*
242 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
243 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[PTRY1]] to <2 x float>*
244 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4
245 ; CHECK-NEXT: [[MUL0:%.*]] = fmul float [[X0]], [[Y0]]
246 ; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
247 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0
248 ; CHECK-NEXT: [[DOT01:%.*]] = fadd float [[MUL0]], [[TMP6]]
249 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP5]], i32 1
250 ; CHECK-NEXT: [[DOT012:%.*]] = fadd float [[DOT01]], [[TMP7]]
251 ; CHECK-NEXT: ret float [[DOT012]]
253 %ptrx1 = getelementptr inbounds float, float* %ptrx, i64 1
254 %ptry1 = getelementptr inbounds float, float* %ptry, i64 1
255 %ptrx2 = getelementptr inbounds float, float* %ptrx, i64 2
256 %ptry2 = getelementptr inbounds float, float* %ptry, i64 2
257 %x0 = load float, float* %ptrx, align 4
258 %y0 = load float, float* %ptry, align 4
259 %x1 = load float, float* %ptrx1, align 4
260 %y1 = load float, float* %ptry1, align 4
261 %x2 = load float, float* %ptrx2, align 4
262 %y2 = load float, float* %ptry2, align 4
263 %mul0 = fmul float %x0, %y0
264 %mul1 = fmul float %x1, %y1
265 %mul2 = fmul float %x2, %y2
266 %dot01 = fadd float %mul0, %mul1
267 %dot012 = fadd float %dot01, %mul2
268 ret float %dot012
269 }
271 define double @dot3f64_fast(double* dereferenceable(32) %ptrx, double* dereferenceable(32) %ptry) {
272 ; CHECK-LABEL: @dot3f64_fast(
273 ; CHECK-NEXT: [[PTRX1:%.*]] = getelementptr inbounds double, double* [[PTRX:%.*]], i64 1
274 ; CHECK-NEXT: [[PTRY1:%.*]] = getelementptr inbounds double, double* [[PTRY:%.*]], i64 1
275 ; CHECK-NEXT: [[PTRX2:%.*]] = getelementptr inbounds double, double* [[PTRX]], i64 2
276 ; CHECK-NEXT: [[PTRY2:%.*]] = getelementptr inbounds double, double* [[PTRY]], i64 2
277 ; CHECK-NEXT: [[X0:%.*]] = load double, double* [[PTRX]], align 4
278 ; CHECK-NEXT: [[Y0:%.*]] = load double, double* [[PTRY]], align 4
279 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[PTRX1]] to <2 x double>*
280 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4
281 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[PTRY1]] to <2 x double>*
282 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 4
283 ; CHECK-NEXT: [[MUL0:%.*]] = fmul double [[X0]], [[Y0]]
284 ; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]]
285 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
286 ; CHECK-NEXT: [[DOT01:%.*]] = fadd fast double [[MUL0]], [[TMP6]]
287 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
288 ; CHECK-NEXT: [[DOT012:%.*]] = fadd fast double [[DOT01]], [[TMP7]]
289 ; CHECK-NEXT: ret double [[DOT012]]
291 %ptrx1 = getelementptr inbounds double, double* %ptrx, i64 1
292 %ptry1 = getelementptr inbounds double, double* %ptry, i64 1
293 %ptrx2 = getelementptr inbounds double, double* %ptrx, i64 2
294 %ptry2 = getelementptr inbounds double, double* %ptry, i64 2
295 %x0 = load double, double* %ptrx, align 4
296 %y0 = load double, double* %ptry, align 4
297 %x1 = load double, double* %ptrx1, align 4
298 %y1 = load double, double* %ptry1, align 4
299 %x2 = load double, double* %ptrx2, align 4
300 %y2 = load double, double* %ptry2, align 4
301 %mul0 = fmul double %x0, %y0
302 %mul1 = fmul double %x1, %y1
303 %mul2 = fmul double %x2, %y2
304 %dot01 = fadd fast double %mul0, %mul1
305 %dot012 = fadd fast double %dot01, %mul2
306 ret double %dot012
307 }
309 define float @dot3f32_fast(float* dereferenceable(16) %ptrx, float* dereferenceable(16) %ptry) {
310 ; CHECK-LABEL: @dot3f32_fast(
311 ; CHECK-NEXT: [[PTRX1:%.*]] = getelementptr inbounds float, float* [[PTRX:%.*]], i64 1
312 ; CHECK-NEXT: [[PTRY1:%.*]] = getelementptr inbounds float, float* [[PTRY:%.*]], i64 1
313 ; CHECK-NEXT: [[PTRX2:%.*]] = getelementptr inbounds float, float* [[PTRX]], i64 2
314 ; CHECK-NEXT: [[PTRY2:%.*]] = getelementptr inbounds float, float* [[PTRY]], i64 2
315 ; CHECK-NEXT: [[X0:%.*]] = load float, float* [[PTRX]], align 4
316 ; CHECK-NEXT: [[Y0:%.*]] = load float, float* [[PTRY]], align 4
317 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[PTRX1]] to <2 x float>*
318 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
319 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[PTRY1]] to <2 x float>*
320 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4
321 ; CHECK-NEXT: [[MUL0:%.*]] = fmul float [[X0]], [[Y0]]
322 ; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
323 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0
324 ; CHECK-NEXT: [[DOT01:%.*]] = fadd fast float [[MUL0]], [[TMP6]]
325 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP5]], i32 1
326 ; CHECK-NEXT: [[DOT012:%.*]] = fadd fast float [[DOT01]], [[TMP7]]
327 ; CHECK-NEXT: ret float [[DOT012]]
329 %ptrx1 = getelementptr inbounds float, float* %ptrx, i64 1
330 %ptry1 = getelementptr inbounds float, float* %ptry, i64 1
331 %ptrx2 = getelementptr inbounds float, float* %ptrx, i64 2
332 %ptry2 = getelementptr inbounds float, float* %ptry, i64 2
333 %x0 = load float, float* %ptrx, align 4
334 %y0 = load float, float* %ptry, align 4
335 %x1 = load float, float* %ptrx1, align 4
336 %y1 = load float, float* %ptry1, align 4
337 %x2 = load float, float* %ptrx2, align 4
338 %y2 = load float, float* %ptry2, align 4
339 %mul0 = fmul float %x0, %y0
340 %mul1 = fmul float %x1, %y1
341 %mul2 = fmul float %x2, %y2
342 %dot01 = fadd fast float %mul0, %mul1
343 %dot012 = fadd fast float %dot01, %mul2
344 ret float %dot012
345 }
348 ; dot2(float *x, float *y) - ((x[0]*y[0])+(x[1]*y[1]))
351 define double @dot2f64(double* dereferenceable(16) %ptrx, double* dereferenceable(16) %ptry) {
352 ; CHECK-LABEL: @dot2f64(
353 ; CHECK-NEXT: [[PTRX1:%.*]] = getelementptr inbounds double, double* [[PTRX:%.*]], i64 1
354 ; CHECK-NEXT: [[PTRY1:%.*]] = getelementptr inbounds double, double* [[PTRY:%.*]], i64 1
355 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[PTRX]] to <2 x double>*
356 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4
357 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[PTRY]] to <2 x double>*
358 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 4
359 ; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]]
360 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
361 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
362 ; CHECK-NEXT: [[DOT01:%.*]] = fadd double [[TMP6]], [[TMP7]]
363 ; CHECK-NEXT: ret double [[DOT01]]
365 %ptrx1 = getelementptr inbounds double, double* %ptrx, i64 1
366 %ptry1 = getelementptr inbounds double, double* %ptry, i64 1
367 %x0 = load double, double* %ptrx, align 4
368 %y0 = load double, double* %ptry, align 4
369 %x1 = load double, double* %ptrx1, align 4
370 %y1 = load double, double* %ptry1, align 4
371 %mul0 = fmul double %x0, %y0
372 %mul1 = fmul double %x1, %y1
373 %dot01 = fadd double %mul0, %mul1
374 ret double %dot01
375 }
377 define float @dot2f32(float* dereferenceable(16) %ptrx, float* dereferenceable(16) %ptry) {
378 ; CHECK-LABEL: @dot2f32(
379 ; CHECK-NEXT: [[PTRX1:%.*]] = getelementptr inbounds float, float* [[PTRX:%.*]], i64 1
380 ; CHECK-NEXT: [[PTRY1:%.*]] = getelementptr inbounds float, float* [[PTRY:%.*]], i64 1
381 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[PTRX]] to <2 x float>*
382 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
383 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[PTRY]] to <2 x float>*
384 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4
385 ; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
386 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0
387 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP5]], i32 1
388 ; CHECK-NEXT: [[DOT01:%.*]] = fadd float [[TMP6]], [[TMP7]]
389 ; CHECK-NEXT: ret float [[DOT01]]
391 %ptrx1 = getelementptr inbounds float, float* %ptrx, i64 1
392 %ptry1 = getelementptr inbounds float, float* %ptry, i64 1
393 %x0 = load float, float* %ptrx, align 4
394 %y0 = load float, float* %ptry, align 4
395 %x1 = load float, float* %ptrx1, align 4
396 %y1 = load float, float* %ptry1, align 4
397 %mul0 = fmul float %x0, %y0
398 %mul1 = fmul float %x1, %y1
399 %dot01 = fadd float %mul0, %mul1
400 ret float %dot01
401 }
403 define double @dot2f64_fast(double* dereferenceable(16) %ptrx, double* dereferenceable(16) %ptry) {
404 ; CHECK-LABEL: @dot2f64_fast(
405 ; CHECK-NEXT: [[PTRX1:%.*]] = getelementptr inbounds double, double* [[PTRX:%.*]], i64 1
406 ; CHECK-NEXT: [[PTRY1:%.*]] = getelementptr inbounds double, double* [[PTRY:%.*]], i64 1
407 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[PTRX]] to <2 x double>*
408 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4
409 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[PTRY]] to <2 x double>*
410 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 4
411 ; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]]
412 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
413 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
414 ; CHECK-NEXT: [[DOT01:%.*]] = fadd fast double [[TMP6]], [[TMP7]]
415 ; CHECK-NEXT: ret double [[DOT01]]
417 %ptrx1 = getelementptr inbounds double, double* %ptrx, i64 1
418 %ptry1 = getelementptr inbounds double, double* %ptry, i64 1
419 %x0 = load double, double* %ptrx, align 4
420 %y0 = load double, double* %ptry, align 4
421 %x1 = load double, double* %ptrx1, align 4
422 %y1 = load double, double* %ptry1, align 4
423 %mul0 = fmul double %x0, %y0
424 %mul1 = fmul double %x1, %y1
425 %dot01 = fadd fast double %mul0, %mul1
426 ret double %dot01
427 }
429 define float @dot2f32_fast(float* dereferenceable(16) %ptrx, float* dereferenceable(16) %ptry) {
430 ; CHECK-LABEL: @dot2f32_fast(
431 ; CHECK-NEXT: [[PTRX1:%.*]] = getelementptr inbounds float, float* [[PTRX:%.*]], i64 1
432 ; CHECK-NEXT: [[PTRY1:%.*]] = getelementptr inbounds float, float* [[PTRY:%.*]], i64 1
433 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[PTRX]] to <2 x float>*
434 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
435 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[PTRY]] to <2 x float>*
436 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4
437 ; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
438 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0
439 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP5]], i32 1
440 ; CHECK-NEXT: [[DOT01:%.*]] = fadd fast float [[TMP6]], [[TMP7]]
441 ; CHECK-NEXT: ret float [[DOT01]]
443 %ptrx1 = getelementptr inbounds float, float* %ptrx, i64 1
444 %ptry1 = getelementptr inbounds float, float* %ptry, i64 1
445 %x0 = load float, float* %ptrx, align 4
446 %y0 = load float, float* %ptry, align 4
447 %x1 = load float, float* %ptrx1, align 4
448 %y1 = load float, float* %ptry1, align 4
449 %mul0 = fmul float %x0, %y0
450 %mul1 = fmul float %x1, %y1
451 %dot01 = fadd fast float %mul0, %mul1
452 ret float %dot01
453 }