1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX
8 ; Partial load dot product patterns based on PR51075
11 ; dot3(float *x, float *y) - ((x[0]*y[0])+(x[1]*y[1])+(x[2]*y[2]))
14 define float @dot3_float4(float* dereferenceable(16) %a0, float* dereferenceable(16) %a1) {
15 ; SSE2-LABEL: dot3_float4:
17 ; SSE2-NEXT: movups (%rdi), %xmm0
18 ; SSE2-NEXT: movups (%rsi), %xmm1
19 ; SSE2-NEXT: mulps %xmm0, %xmm1
20 ; SSE2-NEXT: movaps %xmm1, %xmm0
21 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
22 ; SSE2-NEXT: addss %xmm1, %xmm0
23 ; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
24 ; SSE2-NEXT: addss %xmm1, %xmm0
27 ; SSSE3-LABEL: dot3_float4:
29 ; SSSE3-NEXT: movups (%rdi), %xmm0
30 ; SSSE3-NEXT: movups (%rsi), %xmm1
31 ; SSSE3-NEXT: mulps %xmm0, %xmm1
32 ; SSSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
33 ; SSSE3-NEXT: addss %xmm1, %xmm0
34 ; SSSE3-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
35 ; SSSE3-NEXT: addss %xmm1, %xmm0
38 ; SSE41-LABEL: dot3_float4:
40 ; SSE41-NEXT: movups (%rdi), %xmm0
41 ; SSE41-NEXT: movups (%rsi), %xmm1
42 ; SSE41-NEXT: mulps %xmm0, %xmm1
43 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
44 ; SSE41-NEXT: addss %xmm1, %xmm0
45 ; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
46 ; SSE41-NEXT: addss %xmm1, %xmm0
49 ; AVX-LABEL: dot3_float4:
51 ; AVX-NEXT: vmovups (%rdi), %xmm0
52 ; AVX-NEXT: vmulps (%rsi), %xmm0, %xmm0
53 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
54 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
55 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
56 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
58 %bcx0123 = bitcast float* %a0 to <4 x float>*
59 %bcy0123 = bitcast float* %a1 to <4 x float>*
60 %x0123 = load <4 x float>, <4 x float>* %bcx0123, align 4
61 %y0123 = load <4 x float>, <4 x float>* %bcy0123, align 4
62 %mul0123 = fmul <4 x float> %x0123, %y0123
63 %mul0 = extractelement <4 x float> %mul0123, i32 0
64 %mul1 = extractelement <4 x float> %mul0123, i32 1
65 %mul2 = extractelement <4 x float> %mul0123, i32 2
66 %dot01 = fadd float %mul0, %mul1
67 %dot012 = fadd float %dot01, %mul2
; 3-element dot product where <4 x float> loads are narrowed to <3 x float>.
; Both operands are loaded as 4-element vectors (align 4), then a shufflevector
; with mask <0, 1, 2> keeps the low three lanes of each before the lane-wise
; multiply. The three product lanes are summed as (mul0 + mul1) + mul2.
; Both pointer arguments carry dereferenceable(16).
71 define float @dot3_float4_as_float3(float* dereferenceable(16) %a0, float* dereferenceable(16) %a1) {
72 ; SSE2-LABEL: dot3_float4_as_float3:
74 ; SSE2-NEXT: movups (%rdi), %xmm0
75 ; SSE2-NEXT: movups (%rsi), %xmm1
76 ; SSE2-NEXT: mulps %xmm0, %xmm1
77 ; SSE2-NEXT: movaps %xmm1, %xmm0
78 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
79 ; SSE2-NEXT: addss %xmm1, %xmm0
80 ; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
81 ; SSE2-NEXT: addss %xmm1, %xmm0
84 ; SSSE3-LABEL: dot3_float4_as_float3:
86 ; SSSE3-NEXT: movups (%rdi), %xmm0
87 ; SSSE3-NEXT: movups (%rsi), %xmm1
88 ; SSSE3-NEXT: mulps %xmm0, %xmm1
89 ; SSSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
90 ; SSSE3-NEXT: addss %xmm1, %xmm0
91 ; SSSE3-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
92 ; SSSE3-NEXT: addss %xmm1, %xmm0
95 ; SSE41-LABEL: dot3_float4_as_float3:
97 ; SSE41-NEXT: movups (%rdi), %xmm0
98 ; SSE41-NEXT: movups (%rsi), %xmm1
99 ; SSE41-NEXT: mulps %xmm0, %xmm1
100 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
101 ; SSE41-NEXT: addss %xmm1, %xmm0
102 ; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
103 ; SSE41-NEXT: addss %xmm1, %xmm0
106 ; AVX-LABEL: dot3_float4_as_float3:
108 ; AVX-NEXT: vmovups (%rdi), %xmm0
109 ; AVX-NEXT: vmulps (%rsi), %xmm0, %xmm0
110 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
111 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
112 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
113 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
; Reinterpret both float pointers as <4 x float> pointers and load the vectors.
115 %bcx0123 = bitcast float* %a0 to <4 x float>*
116 %bcy0123 = bitcast float* %a1 to <4 x float>*
117 %x0123 = load <4 x float>, <4 x float>* %bcx0123, align 4
118 %y0123 = load <4 x float>, <4 x float>* %bcy0123, align 4
; Drop lane 3 of each operand: mask <0, 1, 2> yields a <3 x float>.
119 %x012 = shufflevector <4 x float> %x0123, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
120 %y012 = shufflevector <4 x float> %y0123, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
121 %mul012 = fmul <3 x float> %x012, %y012
122 %mul0 = extractelement <3 x float> %mul012, i32 0
123 %mul1 = extractelement <3 x float> %mul012, i32 1
124 %mul2 = extractelement <3 x float> %mul012, i32 2
; Scalar reduction, accumulated left to right.
125 %dot01 = fadd float %mul0, %mul1
126 %dot012 = fadd float %dot01, %mul2
; 3-element dot product computed from <3 x float> loads.
; Both float pointers are bitcast to <3 x float> pointers and loaded directly
; (align 4), multiplied lane-wise, and the three product lanes are summed as
; (mul0 + mul1) + mul2.
; Both pointer arguments carry dereferenceable(16).
130 define float @dot3_float3(float* dereferenceable(16) %a0, float* dereferenceable(16) %a1) {
131 ; SSE2-LABEL: dot3_float3:
133 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
134 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
135 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
136 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
137 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
138 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
139 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
140 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
141 ; SSE2-NEXT: mulps %xmm0, %xmm1
142 ; SSE2-NEXT: movaps %xmm1, %xmm0
143 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
144 ; SSE2-NEXT: addss %xmm1, %xmm0
145 ; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
146 ; SSE2-NEXT: addss %xmm1, %xmm0
149 ; SSSE3-LABEL: dot3_float3:
151 ; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
152 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
153 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
154 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
155 ; SSSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
156 ; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
157 ; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
158 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
159 ; SSSE3-NEXT: mulps %xmm0, %xmm1
160 ; SSSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
161 ; SSSE3-NEXT: addss %xmm1, %xmm0
162 ; SSSE3-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
163 ; SSSE3-NEXT: addss %xmm1, %xmm0
166 ; SSE41-LABEL: dot3_float3:
168 ; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
169 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
170 ; SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
171 ; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
172 ; SSE41-NEXT: mulps %xmm0, %xmm1
173 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
174 ; SSE41-NEXT: addss %xmm1, %xmm0
175 ; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
176 ; SSE41-NEXT: addss %xmm1, %xmm0
179 ; AVX-LABEL: dot3_float3:
181 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
182 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
183 ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
184 ; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
185 ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
186 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
187 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
188 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
189 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
; Reinterpret both float pointers as <3 x float> pointers and load the vectors.
191 %bcx012 = bitcast float* %a0 to <3 x float>*
192 %bcy012 = bitcast float* %a1 to <3 x float>*
193 %x012 = load <3 x float>, <3 x float>* %bcx012, align 4
194 %y012 = load <3 x float>, <3 x float>* %bcy012, align 4
195 %mul012 = fmul <3 x float> %x012, %y012
196 %mul0 = extractelement <3 x float> %mul012, i32 0
197 %mul1 = extractelement <3 x float> %mul012, i32 1
198 %mul2 = extractelement <3 x float> %mul012, i32 2
; Scalar reduction, accumulated left to right.
199 %dot01 = fadd float %mul0, %mul1
200 %dot012 = fadd float %dot01, %mul2
; 3-element dot product split into a <2 x float> part and a scalar part.
; Elements 0 and 1 of each operand are loaded as a <2 x float> from the base
; pointer; element 2 is loaded as a scalar float from index 2 (all align 4).
; The vector pair and the scalar pair are multiplied separately and the result
; is summed as (mul0 + mul1) + mul2.
; Both pointer arguments carry dereferenceable(16).
204 define float @dot3_float2_float(float* dereferenceable(16) %a0, float* dereferenceable(16) %a1) {
205 ; SSE2-LABEL: dot3_float2_float:
207 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
208 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
209 ; SSE2-NEXT: mulps %xmm0, %xmm1
210 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
211 ; SSE2-NEXT: mulss 8(%rsi), %xmm2
212 ; SSE2-NEXT: movaps %xmm1, %xmm0
213 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
214 ; SSE2-NEXT: addss %xmm1, %xmm0
215 ; SSE2-NEXT: addss %xmm2, %xmm0
218 ; SSSE3-LABEL: dot3_float2_float:
220 ; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
221 ; SSSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
222 ; SSSE3-NEXT: mulps %xmm0, %xmm1
223 ; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
224 ; SSSE3-NEXT: mulss 8(%rsi), %xmm2
225 ; SSSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
226 ; SSSE3-NEXT: addss %xmm1, %xmm0
227 ; SSSE3-NEXT: addss %xmm2, %xmm0
230 ; SSE41-LABEL: dot3_float2_float:
232 ; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
233 ; SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
234 ; SSE41-NEXT: mulps %xmm0, %xmm1
235 ; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
236 ; SSE41-NEXT: mulss 8(%rsi), %xmm2
237 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
238 ; SSE41-NEXT: addss %xmm1, %xmm0
239 ; SSE41-NEXT: addss %xmm2, %xmm0
242 ; AVX-LABEL: dot3_float2_float:
244 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
245 ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
246 ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
247 ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
248 ; AVX-NEXT: vmulss 8(%rsi), %xmm1, %xmm1
249 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
250 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
251 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; Elements 0..1: load as <2 x float> from the base pointers.
253 %bcx01 = bitcast float* %a0 to <2 x float>*
254 %bcy01 = bitcast float* %a1 to <2 x float>*
255 %x01 = load <2 x float>, <2 x float>* %bcx01, align 4
256 %y01 = load <2 x float>, <2 x float>* %bcy01, align 4
; Element 2: scalar load from float index 2 of each operand.
257 %ptrx2 = getelementptr inbounds float, float* %a0, i64 2
258 %ptry2 = getelementptr inbounds float, float* %a1, i64 2
259 %x2 = load float, float* %ptrx2, align 4
260 %y2 = load float, float* %ptry2, align 4
261 %mul01 = fmul <2 x float> %x01, %y01
262 %mul2 = fmul float %x2, %y2
263 %mul0 = extractelement <2 x float> %mul01, i32 0
264 %mul1 = extractelement <2 x float> %mul01, i32 1
; Scalar reduction: the two vector lanes first, then the scalar product.
265 %dot01 = fadd float %mul0, %mul1
266 %dot012 = fadd float %dot01, %mul2
; 3-element dot product split into a scalar part and a <2 x float> part.
; Element 0 of each operand is loaded as a scalar float from the base pointer;
; elements 1 and 2 are loaded as a <2 x float> starting at float index 1
; (all align 4). The scalar pair and the vector pair are multiplied separately
; and the result is summed as (mul0 + mul1) + mul2, so the first add mixes the
; scalar product with lane 0 of the vector product.
; Both pointer arguments carry dereferenceable(16).
270 define float @dot3_float_float2(float* dereferenceable(16) %a0, float* dereferenceable(16) %a1) {
271 ; SSE2-LABEL: dot3_float_float2:
273 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
274 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
275 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
276 ; SSE2-NEXT: mulps %xmm2, %xmm0
277 ; SSE2-NEXT: mulss (%rsi), %xmm1
278 ; SSE2-NEXT: addss %xmm0, %xmm1
279 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
280 ; SSE2-NEXT: addss %xmm1, %xmm0
283 ; SSSE3-LABEL: dot3_float_float2:
285 ; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
286 ; SSSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
287 ; SSSE3-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
288 ; SSSE3-NEXT: mulps %xmm1, %xmm2
289 ; SSSE3-NEXT: mulss (%rsi), %xmm0
290 ; SSSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
291 ; SSSE3-NEXT: addss %xmm2, %xmm0
292 ; SSSE3-NEXT: addss %xmm1, %xmm0
295 ; SSE41-LABEL: dot3_float_float2:
297 ; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
298 ; SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
299 ; SSE41-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
300 ; SSE41-NEXT: mulps %xmm1, %xmm2
301 ; SSE41-NEXT: mulss (%rsi), %xmm0
302 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
303 ; SSE41-NEXT: addss %xmm2, %xmm0
304 ; SSE41-NEXT: addss %xmm1, %xmm0
307 ; AVX-LABEL: dot3_float_float2:
309 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
310 ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
311 ; AVX-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
312 ; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1
313 ; AVX-NEXT: vmulss (%rsi), %xmm0, %xmm0
314 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
315 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
316 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
; Element 0: scalar load from the base pointers.
318 %x0 = load float, float* %a0, align 4
319 %y0 = load float, float* %a1, align 4
; Elements 1..2: load as <2 x float> starting at float index 1.
320 %ptrx12 = getelementptr inbounds float, float* %a0, i64 1
321 %ptry12 = getelementptr inbounds float, float* %a1, i64 1
322 %bcx12 = bitcast float* %ptrx12 to <2 x float>*
323 %bcy12 = bitcast float* %ptry12 to <2 x float>*
324 %x12 = load <2 x float>, <2 x float>* %bcx12, align 4
325 %y12 = load <2 x float>, <2 x float>* %bcy12, align 4
326 %mul0 = fmul float %x0, %y0
327 %mul12 = fmul <2 x float> %x12, %y12
; Vector lane 0 holds the product for element 1, lane 1 for element 2.
328 %mul1 = extractelement <2 x float> %mul12, i32 0
329 %mul2 = extractelement <2 x float> %mul12, i32 1
; Scalar reduction, accumulated left to right.
330 %dot01 = fadd float %mul0, %mul1
331 %dot012 = fadd float %dot01, %mul2
336 ; dot2(float *x, float *y) - ((x[0]*y[0])+(x[1]*y[1]))
; 2-element dot product computed from full <4 x float> loads.
; Both operands are loaded as 4-element vectors (align 4) and multiplied
; lane-wise; only lanes 0 and 1 of the product are extracted and added.
; Lanes 2 and 3 of the product are never read.
; Both pointer arguments carry dereferenceable(16).
339 define float @dot2_float4(float* dereferenceable(16) %a0, float* dereferenceable(16) %a1) {
340 ; SSE2-LABEL: dot2_float4:
342 ; SSE2-NEXT: movups (%rdi), %xmm0
343 ; SSE2-NEXT: movups (%rsi), %xmm1
344 ; SSE2-NEXT: mulps %xmm0, %xmm1
345 ; SSE2-NEXT: movaps %xmm1, %xmm0
346 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
347 ; SSE2-NEXT: addss %xmm1, %xmm0
350 ; SSSE3-LABEL: dot2_float4:
352 ; SSSE3-NEXT: movups (%rdi), %xmm0
353 ; SSSE3-NEXT: movups (%rsi), %xmm1
354 ; SSSE3-NEXT: mulps %xmm0, %xmm1
355 ; SSSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
356 ; SSSE3-NEXT: addss %xmm1, %xmm0
359 ; SSE41-LABEL: dot2_float4:
361 ; SSE41-NEXT: movups (%rdi), %xmm0
362 ; SSE41-NEXT: movups (%rsi), %xmm1
363 ; SSE41-NEXT: mulps %xmm0, %xmm1
364 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
365 ; SSE41-NEXT: addss %xmm1, %xmm0
368 ; AVX-LABEL: dot2_float4:
370 ; AVX-NEXT: vmovups (%rdi), %xmm0
371 ; AVX-NEXT: vmulps (%rsi), %xmm0, %xmm0
372 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
373 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; Reinterpret both float pointers as <4 x float> pointers and load the vectors.
375 %bcx0123 = bitcast float* %a0 to <4 x float>*
376 %bcy0123 = bitcast float* %a1 to <4 x float>*
377 %x0123 = load <4 x float>, <4 x float>* %bcx0123, align 4
378 %y0123 = load <4 x float>, <4 x float>* %bcy0123, align 4
; Lane-wise product of all four lanes; only lanes 0..1 are extracted below.
379 %mul0123 = fmul <4 x float> %x0123, %y0123
380 %mul0 = extractelement <4 x float> %mul0123, i32 0
381 %mul1 = extractelement <4 x float> %mul0123, i32 1
382 %dot01 = fadd float %mul0, %mul1
; 2-element dot product computed from <2 x float> loads.
; Both float pointers are bitcast to <2 x float> pointers and loaded directly
; (align 4), multiplied lane-wise, and the two product lanes are added.
; Both pointer arguments carry dereferenceable(16).
386 define float @dot2_float2(float* dereferenceable(16) %a0, float* dereferenceable(16) %a1) {
387 ; SSE2-LABEL: dot2_float2:
389 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
390 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
391 ; SSE2-NEXT: mulps %xmm0, %xmm1
392 ; SSE2-NEXT: movaps %xmm1, %xmm0
393 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
394 ; SSE2-NEXT: addss %xmm1, %xmm0
397 ; SSSE3-LABEL: dot2_float2:
399 ; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
400 ; SSSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
401 ; SSSE3-NEXT: mulps %xmm0, %xmm1
402 ; SSSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
403 ; SSSE3-NEXT: addss %xmm1, %xmm0
406 ; SSE41-LABEL: dot2_float2:
408 ; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
409 ; SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
410 ; SSE41-NEXT: mulps %xmm0, %xmm1
411 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
412 ; SSE41-NEXT: addss %xmm1, %xmm0
415 ; AVX-LABEL: dot2_float2:
417 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
418 ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
419 ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
420 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
421 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; Reinterpret both float pointers as <2 x float> pointers and load the vectors.
423 %bcx01 = bitcast float* %a0 to <2 x float>*
424 %bcy01 = bitcast float* %a1 to <2 x float>*
425 %x01 = load <2 x float>, <2 x float>* %bcx01, align 4
426 %y01 = load <2 x float>, <2 x float>* %bcy01, align 4
427 %mul01 = fmul <2 x float> %x01, %y01
428 %mul0 = extractelement <2 x float> %mul01, i32 0
429 %mul1 = extractelement <2 x float> %mul01, i32 1
430 %dot01 = fadd float %mul0, %mul1