1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s
3 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s
4 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s
5 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s
8 ; dot4(ptr x, ptr y) - ((x[0]*y[0])+(x[1]*y[1])+(x[2]*y[2])+(x[3]*y[3]))
; 4-element double dot product whose fadd chain carries no fast-math flags.
; Both pointers are marked dereferenceable(32); every scalar load uses align 4.
; The autogenerated assertions expect two <2 x double> load/fmul pairs, with the
; additions left as scalar fadds fed by extractelement (no reduction intrinsic),
; and the final sum returned.
11 define double @dot4f64(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %ptry) {
12 ; CHECK-LABEL: @dot4f64(
13 ; CHECK-NEXT: [[PTRX2:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 2
14 ; CHECK-NEXT: [[PTRY2:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 2
15 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX]], align 4
16 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY]], align 4
17 ; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]]
18 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr [[PTRX2]], align 4
19 ; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, ptr [[PTRY2]], align 4
20 ; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP4]], [[TMP5]]
21 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
22 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
23 ; CHECK-NEXT: [[DOT01:%.*]] = fadd double [[TMP7]], [[TMP8]]
24 ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
25 ; CHECK-NEXT: [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP9]]
26 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
27 ; CHECK-NEXT: [[DOT0123:%.*]] = fadd double [[DOT012]], [[TMP10]]
28 ; CHECK-NEXT: ret double [[DOT0123]]
; Addresses of elements 1..3 of each input (element 0 is the base pointer).
30 %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1
31 %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1
32 %ptrx2 = getelementptr inbounds double, ptr %ptrx, i64 2
33 %ptry2 = getelementptr inbounds double, ptr %ptry, i64 2
34 %ptrx3 = getelementptr inbounds double, ptr %ptrx, i64 3
35 %ptry3 = getelementptr inbounds double, ptr %ptry, i64 3
; Scalar loads of x[i] and y[i], interleaved per element.
36 %x0 = load double, ptr %ptrx, align 4
37 %y0 = load double, ptr %ptry, align 4
38 %x1 = load double, ptr %ptrx1, align 4
39 %y1 = load double, ptr %ptry1, align 4
40 %x2 = load double, ptr %ptrx2, align 4
41 %y2 = load double, ptr %ptry2, align 4
42 %x3 = load double, ptr %ptrx3, align 4
43 %y3 = load double, ptr %ptry3, align 4
; Element-wise products x[i] * y[i].
44 %mul0 = fmul double %x0, %y0
45 %mul1 = fmul double %x1, %y1
46 %mul2 = fmul double %x2, %y2
47 %mul3 = fmul double %x3, %y3
; Left-to-right accumulation; these fadds have no fast-math flags.
48 %dot01 = fadd double %mul0, %mul1
49 %dot012 = fadd double %dot01, %mul2
50 %dot0123 = fadd double %dot012, %mul3
; 4-element float dot product whose fadd chain carries no fast-math flags.
; Both pointers are marked dereferenceable(16); every scalar load uses align 4.
; The autogenerated assertions expect two <2 x float> load/fmul pairs, with the
; additions left as scalar fadds fed by extractelement (no reduction intrinsic),
; and the final sum returned.
54 define float @dot4f32(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) {
55 ; CHECK-LABEL: @dot4f32(
56 ; CHECK-NEXT: [[PTRX2:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 2
57 ; CHECK-NEXT: [[PTRY2:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 2
58 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX]], align 4
59 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY]], align 4
60 ; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]]
61 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[PTRX2]], align 4
62 ; CHECK-NEXT: [[TMP5:%.*]] = load <2 x float>, ptr [[PTRY2]], align 4
63 ; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x float> [[TMP4]], [[TMP5]]
64 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
65 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
66 ; CHECK-NEXT: [[DOT01:%.*]] = fadd float [[TMP7]], [[TMP8]]
67 ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
68 ; CHECK-NEXT: [[DOT012:%.*]] = fadd float [[DOT01]], [[TMP9]]
69 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
70 ; CHECK-NEXT: [[DOT0123:%.*]] = fadd float [[DOT012]], [[TMP10]]
71 ; CHECK-NEXT: ret float [[DOT0123]]
; Addresses of elements 1..3 of each input (element 0 is the base pointer).
73 %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1
74 %ptry1 = getelementptr inbounds float, ptr %ptry, i64 1
75 %ptrx2 = getelementptr inbounds float, ptr %ptrx, i64 2
76 %ptry2 = getelementptr inbounds float, ptr %ptry, i64 2
77 %ptrx3 = getelementptr inbounds float, ptr %ptrx, i64 3
78 %ptry3 = getelementptr inbounds float, ptr %ptry, i64 3
; Scalar loads of x[i] and y[i], interleaved per element.
79 %x0 = load float, ptr %ptrx, align 4
80 %y0 = load float, ptr %ptry, align 4
81 %x1 = load float, ptr %ptrx1, align 4
82 %y1 = load float, ptr %ptry1, align 4
83 %x2 = load float, ptr %ptrx2, align 4
84 %y2 = load float, ptr %ptry2, align 4
85 %x3 = load float, ptr %ptrx3, align 4
86 %y3 = load float, ptr %ptry3, align 4
; Element-wise products x[i] * y[i].
87 %mul0 = fmul float %x0, %y0
88 %mul1 = fmul float %x1, %y1
89 %mul2 = fmul float %x2, %y2
90 %mul3 = fmul float %x3, %y3
; Left-to-right accumulation; these fadds have no fast-math flags.
91 %dot01 = fadd float %mul0, %mul1
92 %dot012 = fadd float %dot01, %mul2
93 %dot0123 = fadd float %dot012, %mul3
; 4-element double dot product whose fadds are marked `fast` (the fmuls are not).
; Both pointers are marked dereferenceable(32); every scalar load uses align 4.
; The autogenerated assertions expect a single <4 x double> load per input, one
; <4 x double> fmul, and a `fast` llvm.vector.reduce.fadd.v4f64 call with a 0.0
; start value whose result is returned.
97 define double @dot4f64_fast(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %ptry) {
98 ; CHECK-LABEL: @dot4f64_fast(
99 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4
100 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4
101 ; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x double> [[TMP1]], [[TMP2]]
102 ; CHECK-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]])
103 ; CHECK-NEXT: ret double [[TMP4]]
; Addresses of elements 1..3 of each input (element 0 is the base pointer).
105 %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1
106 %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1
107 %ptrx2 = getelementptr inbounds double, ptr %ptrx, i64 2
108 %ptry2 = getelementptr inbounds double, ptr %ptry, i64 2
109 %ptrx3 = getelementptr inbounds double, ptr %ptrx, i64 3
110 %ptry3 = getelementptr inbounds double, ptr %ptry, i64 3
; Scalar loads of x[i] and y[i], interleaved per element.
111 %x0 = load double, ptr %ptrx, align 4
112 %y0 = load double, ptr %ptry, align 4
113 %x1 = load double, ptr %ptrx1, align 4
114 %y1 = load double, ptr %ptry1, align 4
115 %x2 = load double, ptr %ptrx2, align 4
116 %y2 = load double, ptr %ptry2, align 4
117 %x3 = load double, ptr %ptrx3, align 4
118 %y3 = load double, ptr %ptry3, align 4
; Element-wise products x[i] * y[i]; note these carry no fast-math flags.
119 %mul0 = fmul double %x0, %y0
120 %mul1 = fmul double %x1, %y1
121 %mul2 = fmul double %x2, %y2
122 %mul3 = fmul double %x3, %y3
; Accumulation with `fast` on every fadd.
123 %dot01 = fadd fast double %mul0, %mul1
124 %dot012 = fadd fast double %dot01, %mul2
125 %dot0123 = fadd fast double %dot012, %mul3
; 4-element float dot product whose fadds are marked `fast` (the fmuls are not).
; Both pointers are marked dereferenceable(16); every scalar load uses align 4.
; The autogenerated assertions expect a single <4 x float> load per input, one
; <4 x float> fmul, and a `fast` llvm.vector.reduce.fadd.v4f32 call with a 0.0
; start value whose result is returned.
129 define float @dot4f32_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) {
130 ; CHECK-LABEL: @dot4f32_fast(
131 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[PTRX:%.*]], align 4
132 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[PTRY:%.*]], align 4
133 ; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]]
134 ; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
135 ; CHECK-NEXT: ret float [[TMP4]]
; Addresses of elements 1..3 of each input (element 0 is the base pointer).
137 %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1
138 %ptry1 = getelementptr inbounds float, ptr %ptry, i64 1
139 %ptrx2 = getelementptr inbounds float, ptr %ptrx, i64 2
140 %ptry2 = getelementptr inbounds float, ptr %ptry, i64 2
141 %ptrx3 = getelementptr inbounds float, ptr %ptrx, i64 3
142 %ptry3 = getelementptr inbounds float, ptr %ptry, i64 3
; Scalar loads of x[i] and y[i], interleaved per element.
143 %x0 = load float, ptr %ptrx, align 4
144 %y0 = load float, ptr %ptry, align 4
145 %x1 = load float, ptr %ptrx1, align 4
146 %y1 = load float, ptr %ptry1, align 4
147 %x2 = load float, ptr %ptrx2, align 4
148 %y2 = load float, ptr %ptry2, align 4
149 %x3 = load float, ptr %ptrx3, align 4
150 %y3 = load float, ptr %ptry3, align 4
; Element-wise products x[i] * y[i]; note these carry no fast-math flags.
151 %mul0 = fmul float %x0, %y0
152 %mul1 = fmul float %x1, %y1
153 %mul2 = fmul float %x2, %y2
154 %mul3 = fmul float %x3, %y3
; Accumulation with `fast` on every fadd.
155 %dot01 = fadd fast float %mul0, %mul1
156 %dot012 = fadd fast float %dot01, %mul2
157 %dot0123 = fadd fast float %dot012, %mul3
162 ; dot3(ptr x, ptr y) - ((x[0]*y[0])+(x[1]*y[1])+(x[2]*y[2]))
; 3-element double dot product whose fadd chain carries no fast-math flags.
; Both pointers are marked dereferenceable(32); every scalar load uses align 4.
; The autogenerated assertions expect element 0 to stay scalar (load, load,
; fmul) while elements 1 and 2 become a <2 x double> load/fmul pair; the
; additions remain scalar fadds fed by extractelement, and the sum is returned.
165 define double @dot3f64(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %ptry) {
166 ; CHECK-LABEL: @dot3f64(
167 ; CHECK-NEXT: [[PTRX1:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 1
168 ; CHECK-NEXT: [[PTRY1:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 1
169 ; CHECK-NEXT: [[X0:%.*]] = load double, ptr [[PTRX]], align 4
170 ; CHECK-NEXT: [[Y0:%.*]] = load double, ptr [[PTRY]], align 4
171 ; CHECK-NEXT: [[MUL0:%.*]] = fmul double [[X0]], [[Y0]]
172 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX1]], align 4
173 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY1]], align 4
174 ; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]]
175 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
176 ; CHECK-NEXT: [[DOT01:%.*]] = fadd double [[MUL0]], [[TMP4]]
177 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
178 ; CHECK-NEXT: [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP5]]
179 ; CHECK-NEXT: ret double [[DOT012]]
; Addresses of elements 1 and 2 of each input (element 0 is the base pointer).
181 %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1
182 %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1
183 %ptrx2 = getelementptr inbounds double, ptr %ptrx, i64 2
184 %ptry2 = getelementptr inbounds double, ptr %ptry, i64 2
; Scalar loads of x[i] and y[i], interleaved per element.
185 %x0 = load double, ptr %ptrx, align 4
186 %y0 = load double, ptr %ptry, align 4
187 %x1 = load double, ptr %ptrx1, align 4
188 %y1 = load double, ptr %ptry1, align 4
189 %x2 = load double, ptr %ptrx2, align 4
190 %y2 = load double, ptr %ptry2, align 4
; Element-wise products x[i] * y[i].
191 %mul0 = fmul double %x0, %y0
192 %mul1 = fmul double %x1, %y1
193 %mul2 = fmul double %x2, %y2
; Left-to-right accumulation; these fadds have no fast-math flags.
194 %dot01 = fadd double %mul0, %mul1
195 %dot012 = fadd double %dot01, %mul2
; 3-element float dot product whose fadd chain carries no fast-math flags.
; Both pointers are marked dereferenceable(16); every scalar load uses align 4.
; The autogenerated assertions expect element 0 to stay scalar (load, load,
; fmul) while elements 1 and 2 become a <2 x float> load/fmul pair; the
; additions remain scalar fadds fed by extractelement, and the sum is returned.
199 define float @dot3f32(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) {
200 ; CHECK-LABEL: @dot3f32(
201 ; CHECK-NEXT: [[PTRX1:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 1
202 ; CHECK-NEXT: [[PTRY1:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 1
203 ; CHECK-NEXT: [[X0:%.*]] = load float, ptr [[PTRX]], align 4
204 ; CHECK-NEXT: [[Y0:%.*]] = load float, ptr [[PTRY]], align 4
205 ; CHECK-NEXT: [[MUL0:%.*]] = fmul float [[X0]], [[Y0]]
206 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX1]], align 4
207 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY1]], align 4
208 ; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]]
209 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
210 ; CHECK-NEXT: [[DOT01:%.*]] = fadd float [[MUL0]], [[TMP4]]
211 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
212 ; CHECK-NEXT: [[DOT012:%.*]] = fadd float [[DOT01]], [[TMP5]]
213 ; CHECK-NEXT: ret float [[DOT012]]
; Addresses of elements 1 and 2 of each input (element 0 is the base pointer).
215 %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1
216 %ptry1 = getelementptr inbounds float, ptr %ptry, i64 1
217 %ptrx2 = getelementptr inbounds float, ptr %ptrx, i64 2
218 %ptry2 = getelementptr inbounds float, ptr %ptry, i64 2
; Scalar loads of x[i] and y[i], interleaved per element.
219 %x0 = load float, ptr %ptrx, align 4
220 %y0 = load float, ptr %ptry, align 4
221 %x1 = load float, ptr %ptrx1, align 4
222 %y1 = load float, ptr %ptry1, align 4
223 %x2 = load float, ptr %ptrx2, align 4
224 %y2 = load float, ptr %ptry2, align 4
; Element-wise products x[i] * y[i].
225 %mul0 = fmul float %x0, %y0
226 %mul1 = fmul float %x1, %y1
227 %mul2 = fmul float %x2, %y2
; Left-to-right accumulation; these fadds have no fast-math flags.
228 %dot01 = fadd float %mul0, %mul1
229 %dot012 = fadd float %dot01, %mul2
; 3-element double dot product whose fadds are marked `fast` (the fmuls are not).
; Both pointers are marked dereferenceable(32); every scalar load uses align 4.
; The autogenerated assertions expect the same shape as the non-fast dot3f64
; case: element 0 stays scalar, elements 1 and 2 become a <2 x double>
; load/fmul pair, and the additions remain scalar `fadd fast` instructions
; (no reduction intrinsic), with the sum returned.
233 define double @dot3f64_fast(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %ptry) {
234 ; CHECK-LABEL: @dot3f64_fast(
235 ; CHECK-NEXT: [[PTRX1:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 1
236 ; CHECK-NEXT: [[PTRY1:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 1
237 ; CHECK-NEXT: [[X0:%.*]] = load double, ptr [[PTRX]], align 4
238 ; CHECK-NEXT: [[Y0:%.*]] = load double, ptr [[PTRY]], align 4
239 ; CHECK-NEXT: [[MUL0:%.*]] = fmul double [[X0]], [[Y0]]
240 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX1]], align 4
241 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY1]], align 4
242 ; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]]
243 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
244 ; CHECK-NEXT: [[DOT01:%.*]] = fadd fast double [[MUL0]], [[TMP4]]
245 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
246 ; CHECK-NEXT: [[DOT012:%.*]] = fadd fast double [[DOT01]], [[TMP5]]
247 ; CHECK-NEXT: ret double [[DOT012]]
; Addresses of elements 1 and 2 of each input (element 0 is the base pointer).
249 %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1
250 %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1
251 %ptrx2 = getelementptr inbounds double, ptr %ptrx, i64 2
252 %ptry2 = getelementptr inbounds double, ptr %ptry, i64 2
; Scalar loads of x[i] and y[i], interleaved per element.
253 %x0 = load double, ptr %ptrx, align 4
254 %y0 = load double, ptr %ptry, align 4
255 %x1 = load double, ptr %ptrx1, align 4
256 %y1 = load double, ptr %ptry1, align 4
257 %x2 = load double, ptr %ptrx2, align 4
258 %y2 = load double, ptr %ptry2, align 4
; Element-wise products x[i] * y[i]; note these carry no fast-math flags.
259 %mul0 = fmul double %x0, %y0
260 %mul1 = fmul double %x1, %y1
261 %mul2 = fmul double %x2, %y2
; Accumulation with `fast` on every fadd.
262 %dot01 = fadd fast double %mul0, %mul1
263 %dot012 = fadd fast double %dot01, %mul2
; 3-element float dot product whose fadds are marked `fast` (the fmuls are not).
; Both pointers are marked dereferenceable(16); every scalar load uses align 4.
; The autogenerated assertions expect the same shape as the non-fast dot3f32
; case: element 0 stays scalar, elements 1 and 2 become a <2 x float>
; load/fmul pair, and the additions remain scalar `fadd fast` instructions
; (no reduction intrinsic), with the sum returned.
267 define float @dot3f32_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) {
268 ; CHECK-LABEL: @dot3f32_fast(
269 ; CHECK-NEXT: [[PTRX1:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 1
270 ; CHECK-NEXT: [[PTRY1:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 1
271 ; CHECK-NEXT: [[X0:%.*]] = load float, ptr [[PTRX]], align 4
272 ; CHECK-NEXT: [[Y0:%.*]] = load float, ptr [[PTRY]], align 4
273 ; CHECK-NEXT: [[MUL0:%.*]] = fmul float [[X0]], [[Y0]]
274 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX1]], align 4
275 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY1]], align 4
276 ; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]]
277 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
278 ; CHECK-NEXT: [[DOT01:%.*]] = fadd fast float [[MUL0]], [[TMP4]]
279 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
280 ; CHECK-NEXT: [[DOT012:%.*]] = fadd fast float [[DOT01]], [[TMP5]]
281 ; CHECK-NEXT: ret float [[DOT012]]
; Addresses of elements 1 and 2 of each input (element 0 is the base pointer).
283 %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1
284 %ptry1 = getelementptr inbounds float, ptr %ptry, i64 1
285 %ptrx2 = getelementptr inbounds float, ptr %ptrx, i64 2
286 %ptry2 = getelementptr inbounds float, ptr %ptry, i64 2
; Scalar loads of x[i] and y[i], interleaved per element.
287 %x0 = load float, ptr %ptrx, align 4
288 %y0 = load float, ptr %ptry, align 4
289 %x1 = load float, ptr %ptrx1, align 4
290 %y1 = load float, ptr %ptry1, align 4
291 %x2 = load float, ptr %ptrx2, align 4
292 %y2 = load float, ptr %ptry2, align 4
; Element-wise products x[i] * y[i]; note these carry no fast-math flags.
293 %mul0 = fmul float %x0, %y0
294 %mul1 = fmul float %x1, %y1
295 %mul2 = fmul float %x2, %y2
; Accumulation with `fast` on every fadd.
296 %dot01 = fadd fast float %mul0, %mul1
297 %dot012 = fadd fast float %dot01, %mul2
302 ; dot2(ptr x, ptr y) - ((x[0]*y[0])+(x[1]*y[1]))
; 2-element double dot product with a single fadd that has no fast-math flags.
; Both pointers are marked dereferenceable(16); every scalar load uses align 4.
; The autogenerated assertions expect one <2 x double> load per input and one
; <2 x double> fmul, followed by two extractelements and a scalar fadd whose
; result is returned.
305 define double @dot2f64(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) {
306 ; CHECK-LABEL: @dot2f64(
307 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX:%.*]], align 4
308 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY:%.*]], align 4
309 ; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]]
310 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
311 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
312 ; CHECK-NEXT: [[DOT01:%.*]] = fadd double [[TMP4]], [[TMP5]]
313 ; CHECK-NEXT: ret double [[DOT01]]
; Address of element 1 of each input (element 0 is the base pointer).
315 %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1
316 %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1
; Scalar loads of x[i] and y[i], interleaved per element.
317 %x0 = load double, ptr %ptrx, align 4
318 %y0 = load double, ptr %ptry, align 4
319 %x1 = load double, ptr %ptrx1, align 4
320 %y1 = load double, ptr %ptry1, align 4
; Element-wise products, then their sum (no fast-math flags on the fadd).
321 %mul0 = fmul double %x0, %y0
322 %mul1 = fmul double %x1, %y1
323 %dot01 = fadd double %mul0, %mul1
; 2-element float dot product with a single fadd that has no fast-math flags.
; Both pointers are marked dereferenceable(16); every scalar load uses align 4.
; The autogenerated assertions expect one <2 x float> load per input and one
; <2 x float> fmul, followed by two extractelements and a scalar fadd whose
; result is returned.
327 define float @dot2f32(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) {
328 ; CHECK-LABEL: @dot2f32(
329 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX:%.*]], align 4
330 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY:%.*]], align 4
331 ; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]]
332 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
333 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
334 ; CHECK-NEXT: [[DOT01:%.*]] = fadd float [[TMP4]], [[TMP5]]
335 ; CHECK-NEXT: ret float [[DOT01]]
; Address of element 1 of each input (element 0 is the base pointer).
337 %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1
338 %ptry1 = getelementptr inbounds float, ptr %ptry, i64 1
; Scalar loads of x[i] and y[i], interleaved per element.
339 %x0 = load float, ptr %ptrx, align 4
340 %y0 = load float, ptr %ptry, align 4
341 %x1 = load float, ptr %ptrx1, align 4
342 %y1 = load float, ptr %ptry1, align 4
; Element-wise products, then their sum (no fast-math flags on the fadd).
343 %mul0 = fmul float %x0, %y0
344 %mul1 = fmul float %x1, %y1
345 %dot01 = fadd float %mul0, %mul1
; 2-element double dot product whose single fadd is marked `fast` (the fmuls
; are not). Both pointers are marked dereferenceable(16); every scalar load
; uses align 4.
; The autogenerated assertions expect one <2 x double> load per input and one
; <2 x double> fmul, followed by two extractelements and a scalar `fadd fast`
; (no reduction intrinsic) whose result is returned.
349 define double @dot2f64_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) {
350 ; CHECK-LABEL: @dot2f64_fast(
351 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX:%.*]], align 4
352 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY:%.*]], align 4
353 ; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]]
354 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
355 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
356 ; CHECK-NEXT: [[DOT01:%.*]] = fadd fast double [[TMP4]], [[TMP5]]
357 ; CHECK-NEXT: ret double [[DOT01]]
; Address of element 1 of each input (element 0 is the base pointer).
359 %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1
360 %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1
; Scalar loads of x[i] and y[i], interleaved per element.
361 %x0 = load double, ptr %ptrx, align 4
362 %y0 = load double, ptr %ptry, align 4
363 %x1 = load double, ptr %ptrx1, align 4
364 %y1 = load double, ptr %ptry1, align 4
; Element-wise products (no fast-math flags), then their `fast` sum.
365 %mul0 = fmul double %x0, %y0
366 %mul1 = fmul double %x1, %y1
367 %dot01 = fadd fast double %mul0, %mul1
; 2-element float dot product whose single fadd is marked `fast` (the fmuls
; are not). Both pointers are marked dereferenceable(16); every scalar load
; uses align 4.
; The autogenerated assertions expect one <2 x float> load per input and one
; <2 x float> fmul, followed by two extractelements and a scalar `fadd fast`
; (no reduction intrinsic) whose result is returned.
371 define float @dot2f32_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) {
372 ; CHECK-LABEL: @dot2f32_fast(
373 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX:%.*]], align 4
374 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY:%.*]], align 4
375 ; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]]
376 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
377 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
378 ; CHECK-NEXT: [[DOT01:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
379 ; CHECK-NEXT: ret float [[DOT01]]
; Address of element 1 of each input (element 0 is the base pointer).
381 %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1
382 %ptry1 = getelementptr inbounds float, ptr %ptry, i64 1
; Scalar loads of x[i] and y[i], interleaved per element.
383 %x0 = load float, ptr %ptrx, align 4
384 %y0 = load float, ptr %ptry, align 4
385 %x1 = load float, ptr %ptrx1, align 4
386 %y1 = load float, ptr %ptry1, align 4
; Element-wise products (no fast-math flags), then their `fast` sum.
387 %mul0 = fmul float %x0, %y0
388 %mul1 = fmul float %x1, %y1
389 %dot01 = fadd fast float %mul0, %mul1