1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX
8 ; Partial load dot product patterns based off PR51075
11 ; dot3(ptr x, ptr y) - ((x[0]*y[0])+(x[1]*y[1])+(x[2]*y[2]))
; Both inputs loaded as full <4 x float>; only lanes 0..2 of the product feed the sum.
14 define float @dot3_float4(ptr dereferenceable(16) %a0, ptr dereferenceable(16) %a1) {
15 ; SSE2-LABEL: dot3_float4:
17 ; SSE2-NEXT: movups (%rdi), %xmm0
18 ; SSE2-NEXT: movups (%rsi), %xmm1
19 ; SSE2-NEXT: mulps %xmm0, %xmm1
20 ; SSE2-NEXT: movaps %xmm1, %xmm0
21 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
22 ; SSE2-NEXT: addss %xmm1, %xmm0
23 ; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
24 ; SSE2-NEXT: addss %xmm1, %xmm0
27 ; SSSE3-LABEL: dot3_float4:
29 ; SSSE3-NEXT: movups (%rdi), %xmm0
30 ; SSSE3-NEXT: movups (%rsi), %xmm1
31 ; SSSE3-NEXT: mulps %xmm0, %xmm1
32 ; SSSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
33 ; SSSE3-NEXT: addss %xmm1, %xmm0
34 ; SSSE3-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
35 ; SSSE3-NEXT: addss %xmm1, %xmm0
38 ; SSE41-LABEL: dot3_float4:
40 ; SSE41-NEXT: movups (%rdi), %xmm0
41 ; SSE41-NEXT: movups (%rsi), %xmm1
42 ; SSE41-NEXT: mulps %xmm0, %xmm1
43 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
44 ; SSE41-NEXT: addss %xmm1, %xmm0
45 ; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
46 ; SSE41-NEXT: addss %xmm1, %xmm0
49 ; AVX-LABEL: dot3_float4:
51 ; AVX-NEXT: vmovups (%rdi), %xmm0
52 ; AVX-NEXT: vmulps (%rsi), %xmm0, %xmm0
53 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
54 ; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
55 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
56 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
58 %x0123 = load <4 x float>, ptr %a0, align 4
59 %y0123 = load <4 x float>, ptr %a1, align 4
60 %mul0123 = fmul <4 x float> %x0123, %y0123
61 %mul0 = extractelement <4 x float> %mul0123, i32 0
62 %mul1 = extractelement <4 x float> %mul0123, i32 1
63 %mul2 = extractelement <4 x float> %mul0123, i32 2
64 %dot01 = fadd float %mul0, %mul1
65 %dot012 = fadd float %dot01, %mul2
; As dot3_float4, but both <4 x float> loads are shuffled down to <3 x float>
; before the multiply, so the fmul itself is the illegal 3-wide type.
69 define float @dot3_float4_as_float3(ptr dereferenceable(16) %a0, ptr dereferenceable(16) %a1) {
70 ; SSE2-LABEL: dot3_float4_as_float3:
72 ; SSE2-NEXT: movups (%rdi), %xmm0
73 ; SSE2-NEXT: movups (%rsi), %xmm1
74 ; SSE2-NEXT: mulps %xmm0, %xmm1
75 ; SSE2-NEXT: movaps %xmm1, %xmm0
76 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
77 ; SSE2-NEXT: addss %xmm1, %xmm0
78 ; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
79 ; SSE2-NEXT: addss %xmm1, %xmm0
82 ; SSSE3-LABEL: dot3_float4_as_float3:
84 ; SSSE3-NEXT: movups (%rdi), %xmm0
85 ; SSSE3-NEXT: movups (%rsi), %xmm1
86 ; SSSE3-NEXT: mulps %xmm0, %xmm1
87 ; SSSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
88 ; SSSE3-NEXT: addss %xmm1, %xmm0
89 ; SSSE3-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
90 ; SSSE3-NEXT: addss %xmm1, %xmm0
93 ; SSE41-LABEL: dot3_float4_as_float3:
95 ; SSE41-NEXT: movups (%rdi), %xmm0
96 ; SSE41-NEXT: movups (%rsi), %xmm1
97 ; SSE41-NEXT: mulps %xmm0, %xmm1
98 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
99 ; SSE41-NEXT: addss %xmm1, %xmm0
100 ; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
101 ; SSE41-NEXT: addss %xmm1, %xmm0
104 ; AVX-LABEL: dot3_float4_as_float3:
106 ; AVX-NEXT: vmovups (%rdi), %xmm0
107 ; AVX-NEXT: vmulps (%rsi), %xmm0, %xmm0
108 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
109 ; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
110 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
111 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
113 %x0123 = load <4 x float>, ptr %a0, align 4
114 %y0123 = load <4 x float>, ptr %a1, align 4
115 %x012 = shufflevector <4 x float> %x0123, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
116 %y012 = shufflevector <4 x float> %y0123, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
117 %mul012 = fmul <3 x float> %x012, %y012
118 %mul0 = extractelement <3 x float> %mul012, i32 0
119 %mul1 = extractelement <3 x float> %mul012, i32 1
120 %mul2 = extractelement <3 x float> %mul012, i32 2
121 %dot01 = fadd float %mul0, %mul1
122 %dot012 = fadd float %dot01, %mul2
; Inputs loaded directly as <3 x float> (codegen splits each into a 64-bit
; low-pair load plus a scalar element-2 load, then recombines).
126 define float @dot3_float3(ptr dereferenceable(16) %a0, ptr dereferenceable(16) %a1) {
127 ; SSE2-LABEL: dot3_float3:
129 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
130 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
131 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
132 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
133 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
134 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
135 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
136 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
137 ; SSE2-NEXT: mulps %xmm0, %xmm1
138 ; SSE2-NEXT: movaps %xmm1, %xmm0
139 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
140 ; SSE2-NEXT: addss %xmm1, %xmm0
141 ; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
142 ; SSE2-NEXT: addss %xmm1, %xmm0
145 ; SSSE3-LABEL: dot3_float3:
147 ; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
148 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
149 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
150 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
151 ; SSSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
152 ; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
153 ; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
154 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
155 ; SSSE3-NEXT: mulps %xmm0, %xmm1
156 ; SSSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
157 ; SSSE3-NEXT: addss %xmm1, %xmm0
158 ; SSSE3-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
159 ; SSSE3-NEXT: addss %xmm1, %xmm0
162 ; SSE41-LABEL: dot3_float3:
164 ; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
165 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
166 ; SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
167 ; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
168 ; SSE41-NEXT: mulps %xmm0, %xmm1
169 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
170 ; SSE41-NEXT: addss %xmm1, %xmm0
171 ; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
172 ; SSE41-NEXT: addss %xmm1, %xmm0
175 ; AVX-LABEL: dot3_float3:
177 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
178 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
179 ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
180 ; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
181 ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
182 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
183 ; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
184 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
185 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
187 %x012 = load <3 x float>, ptr %a0, align 4
188 %y012 = load <3 x float>, ptr %a1, align 4
189 %mul012 = fmul <3 x float> %x012, %y012
190 %mul0 = extractelement <3 x float> %mul012, i32 0
191 %mul1 = extractelement <3 x float> %mul012, i32 1
192 %mul2 = extractelement <3 x float> %mul012, i32 2
193 %dot01 = fadd float %mul0, %mul1
194 %dot012 = fadd float %dot01, %mul2
; Dot3 built from a <2 x float> load of elements 0-1 plus separate scalar
; loads of element 2 (via GEP offset 2) from each input.
198 define float @dot3_float2_float(ptr dereferenceable(16) %a0, ptr dereferenceable(16) %a1) {
199 ; SSE2-LABEL: dot3_float2_float:
201 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
202 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
203 ; SSE2-NEXT: mulps %xmm0, %xmm1
204 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
205 ; SSE2-NEXT: mulss 8(%rsi), %xmm2
206 ; SSE2-NEXT: movaps %xmm1, %xmm0
207 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
208 ; SSE2-NEXT: addss %xmm1, %xmm0
209 ; SSE2-NEXT: addss %xmm2, %xmm0
212 ; SSSE3-LABEL: dot3_float2_float:
214 ; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
215 ; SSSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
216 ; SSSE3-NEXT: mulps %xmm0, %xmm1
217 ; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
218 ; SSSE3-NEXT: mulss 8(%rsi), %xmm2
219 ; SSSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
220 ; SSSE3-NEXT: addss %xmm1, %xmm0
221 ; SSSE3-NEXT: addss %xmm2, %xmm0
224 ; SSE41-LABEL: dot3_float2_float:
226 ; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
227 ; SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
228 ; SSE41-NEXT: mulps %xmm0, %xmm1
229 ; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
230 ; SSE41-NEXT: mulss 8(%rsi), %xmm2
231 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
232 ; SSE41-NEXT: addss %xmm1, %xmm0
233 ; SSE41-NEXT: addss %xmm2, %xmm0
236 ; AVX-LABEL: dot3_float2_float:
238 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
239 ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
240 ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
241 ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
242 ; AVX-NEXT: vmulss 8(%rsi), %xmm1, %xmm1
243 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
244 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
245 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
247 %x01 = load <2 x float>, ptr %a0, align 4
248 %y01 = load <2 x float>, ptr %a1, align 4
249 %ptrx2 = getelementptr inbounds float, ptr %a0, i64 2
250 %ptry2 = getelementptr inbounds float, ptr %a1, i64 2
251 %x2 = load float, ptr %ptrx2, align 4
252 %y2 = load float, ptr %ptry2, align 4
253 %mul01 = fmul <2 x float> %x01, %y01
254 %mul2 = fmul float %x2, %y2
255 %mul0 = extractelement <2 x float> %mul01, i32 0
256 %mul1 = extractelement <2 x float> %mul01, i32 1
257 %dot01 = fadd float %mul0, %mul1
258 %dot012 = fadd float %dot01, %mul2
; Dot3 built the other way round: scalar loads of element 0 plus <2 x float>
; loads of elements 1-2 (via GEP offset 1) from each input.
262 define float @dot3_float_float2(ptr dereferenceable(16) %a0, ptr dereferenceable(16) %a1) {
263 ; SSE2-LABEL: dot3_float_float2:
265 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
266 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
267 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
268 ; SSE2-NEXT: mulps %xmm2, %xmm0
269 ; SSE2-NEXT: mulss (%rsi), %xmm1
270 ; SSE2-NEXT: addss %xmm0, %xmm1
271 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
272 ; SSE2-NEXT: addss %xmm1, %xmm0
275 ; SSSE3-LABEL: dot3_float_float2:
277 ; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
278 ; SSSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
279 ; SSSE3-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
280 ; SSSE3-NEXT: mulps %xmm1, %xmm2
281 ; SSSE3-NEXT: mulss (%rsi), %xmm0
282 ; SSSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
283 ; SSSE3-NEXT: addss %xmm2, %xmm0
284 ; SSSE3-NEXT: addss %xmm1, %xmm0
287 ; SSE41-LABEL: dot3_float_float2:
289 ; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
290 ; SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
291 ; SSE41-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
292 ; SSE41-NEXT: mulps %xmm1, %xmm2
293 ; SSE41-NEXT: mulss (%rsi), %xmm0
294 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
295 ; SSE41-NEXT: addss %xmm2, %xmm0
296 ; SSE41-NEXT: addss %xmm1, %xmm0
299 ; AVX-LABEL: dot3_float_float2:
301 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
302 ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
303 ; AVX-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
304 ; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1
305 ; AVX-NEXT: vmulss (%rsi), %xmm0, %xmm0
306 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
307 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
308 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
310 %x0 = load float, ptr %a0, align 4
311 %y0 = load float, ptr %a1, align 4
312 %ptrx12 = getelementptr inbounds float, ptr %a0, i64 1
313 %ptry12 = getelementptr inbounds float, ptr %a1, i64 1
314 %x12 = load <2 x float>, ptr %ptrx12, align 4
315 %y12 = load <2 x float>, ptr %ptry12, align 4
316 %mul0 = fmul float %x0, %y0
317 %mul12 = fmul <2 x float> %x12, %y12
318 %mul1 = extractelement <2 x float> %mul12, i32 0
319 %mul2 = extractelement <2 x float> %mul12, i32 1
320 %dot01 = fadd float %mul0, %mul1
321 %dot012 = fadd float %dot01, %mul2
326 ; dot2(ptr x, ptr y) - ((x[0]*y[0])+(x[1]*y[1]))
; Both inputs loaded as full <4 x float>; only lanes 0..1 of the product feed the sum.
329 define float @dot2_float4(ptr dereferenceable(16) %a0, ptr dereferenceable(16) %a1) {
330 ; SSE2-LABEL: dot2_float4:
332 ; SSE2-NEXT: movups (%rdi), %xmm0
333 ; SSE2-NEXT: movups (%rsi), %xmm1
334 ; SSE2-NEXT: mulps %xmm0, %xmm1
335 ; SSE2-NEXT: movaps %xmm1, %xmm0
336 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
337 ; SSE2-NEXT: addss %xmm1, %xmm0
340 ; SSSE3-LABEL: dot2_float4:
342 ; SSSE3-NEXT: movups (%rdi), %xmm0
343 ; SSSE3-NEXT: movups (%rsi), %xmm1
344 ; SSSE3-NEXT: mulps %xmm0, %xmm1
345 ; SSSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
346 ; SSSE3-NEXT: addss %xmm1, %xmm0
349 ; SSE41-LABEL: dot2_float4:
351 ; SSE41-NEXT: movups (%rdi), %xmm0
352 ; SSE41-NEXT: movups (%rsi), %xmm1
353 ; SSE41-NEXT: mulps %xmm0, %xmm1
354 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
355 ; SSE41-NEXT: addss %xmm1, %xmm0
358 ; AVX-LABEL: dot2_float4:
360 ; AVX-NEXT: vmovups (%rdi), %xmm0
361 ; AVX-NEXT: vmulps (%rsi), %xmm0, %xmm0
362 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
363 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
365 %x0123 = load <4 x float>, ptr %a0, align 4
366 %y0123 = load <4 x float>, ptr %a1, align 4
367 %mul0123 = fmul <4 x float> %x0123, %y0123
368 %mul0 = extractelement <4 x float> %mul0123, i32 0
369 %mul1 = extractelement <4 x float> %mul0123, i32 1
370 %dot01 = fadd float %mul0, %mul1
; Dot2 of two <2 x float> loads: multiply, then add the two product lanes.
374 define float @dot2_float2(ptr dereferenceable(16) %a0, ptr dereferenceable(16) %a1) {
375 ; SSE2-LABEL: dot2_float2:
377 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
378 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
379 ; SSE2-NEXT: mulps %xmm0, %xmm1
380 ; SSE2-NEXT: movaps %xmm1, %xmm0
381 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
382 ; SSE2-NEXT: addss %xmm1, %xmm0
385 ; SSSE3-LABEL: dot2_float2:
387 ; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
388 ; SSSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
389 ; SSSE3-NEXT: mulps %xmm0, %xmm1
390 ; SSSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
391 ; SSSE3-NEXT: addss %xmm1, %xmm0
394 ; SSE41-LABEL: dot2_float2:
396 ; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
397 ; SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
398 ; SSE41-NEXT: mulps %xmm0, %xmm1
399 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
400 ; SSE41-NEXT: addss %xmm1, %xmm0
403 ; AVX-LABEL: dot2_float2:
405 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
406 ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
407 ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
408 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
409 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
411 %x01 = load <2 x float>, ptr %a0, align 4
412 %y01 = load <2 x float>, ptr %a1, align 4
413 %mul01 = fmul <2 x float> %x01, %y01
414 %mul0 = extractelement <2 x float> %mul01, i32 0
415 %mul1 = extractelement <2 x float> %mul01, i32 1
416 %dot01 = fadd float %mul0, %mul1