llvm/test/CodeGen/X86/load-partial-dot-product.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
   3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3
   4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41
   5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX
   6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX
   7
   8 ; Partial load dot product patterns based on PR51075
   9
  10 ;
  11 ; dot3(float *x, float *y) - ((x[0]*y[0])+(x[1]*y[1])+(x[2]*y[2]))
  12 ;
  13
  14 define float @dot3_float4(float* dereferenceable(16) %a0, float* dereferenceable(16) %a1) {
  15 ; SSE2-LABEL: dot3_float4:
  16 ; SSE2:       # %bb.0:
  17 ; SSE2-NEXT:    movups (%rdi), %xmm0
  18 ; SSE2-NEXT:    movups (%rsi), %xmm1
  19 ; SSE2-NEXT:    mulps %xmm0, %xmm1
  20 ; SSE2-NEXT:    movaps %xmm1, %xmm0
  21 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
  22 ; SSE2-NEXT:    addss %xmm1, %xmm0
  23 ; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
  24 ; SSE2-NEXT:    addss %xmm1, %xmm0
  25 ; SSE2-NEXT:    retq
  26 ;
  27 ; SSSE3-LABEL: dot3_float4:
  28 ; SSSE3:       # %bb.0:
  29 ; SSSE3-NEXT:    movups (%rdi), %xmm0
  30 ; SSSE3-NEXT:    movups (%rsi), %xmm1
  31 ; SSSE3-NEXT:    mulps %xmm0, %xmm1
  32 ; SSSE3-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
  33 ; SSSE3-NEXT:    addss %xmm1, %xmm0
  34 ; SSSE3-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
  35 ; SSSE3-NEXT:    addss %xmm1, %xmm0
  36 ; SSSE3-NEXT:    retq
  37 ;
  38 ; SSE41-LABEL: dot3_float4:
  39 ; SSE41:       # %bb.0:
  40 ; SSE41-NEXT:    movups (%rdi), %xmm0
  41 ; SSE41-NEXT:    movups (%rsi), %xmm1
  42 ; SSE41-NEXT:    mulps %xmm0, %xmm1
  43 ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
  44 ; SSE41-NEXT:    addss %xmm1, %xmm0
  45 ; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
  46 ; SSE41-NEXT:    addss %xmm1, %xmm0
  47 ; SSE41-NEXT:    retq
  48 ;
  49 ; AVX-LABEL: dot3_float4:
  50 ; AVX:       # %bb.0:
  51 ; AVX-NEXT:    vmovups (%rdi), %xmm0
  52 ; AVX-NEXT:    vmulps (%rsi), %xmm0, %xmm0
  53 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
  54 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
  55 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
  56 ; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
  57 ; AVX-NEXT:    retq
       ; 3-element dot product where both operands are loaded as whole
       ; <4 x float> vectors (4-byte aligned) and multiplied lane-wise.
  58   %bcx0123 = bitcast float* %a0 to <4 x float>*
  59   %bcy0123 = bitcast float* %a1 to <4 x float>*
  60   %x0123 = load <4 x float>, <4 x float>* %bcx0123, align 4
  61   %y0123 = load <4 x float>, <4 x float>* %bcy0123, align 4
  62   %mul0123 = fmul <4 x float> %x0123, %y0123
       ; Only lanes 0, 1 and 2 of the product are extracted; lane 3 is unused.
  63   %mul0 = extractelement <4 x float> %mul0123, i32 0
  64   %mul1 = extractelement <4 x float> %mul0123, i32 1
  65   %mul2 = extractelement <4 x float> %mul0123, i32 2
       ; Scalar reduction in a fixed order: (mul0 + mul1) + mul2.
  66   %dot01 = fadd float %mul0, %mul1
  67   %dot012 = fadd float %dot01, %mul2
  68   ret float %dot012
  69 }
  70
  71 define float @dot3_float4_as_float3(float* dereferenceable(16) %a0, float* dereferenceable(16) %a1) {
  72 ; SSE2-LABEL: dot3_float4_as_float3:
  73 ; SSE2:       # %bb.0:
  74 ; SSE2-NEXT:    movups (%rdi), %xmm0
  75 ; SSE2-NEXT:    movups (%rsi), %xmm1
  76 ; SSE2-NEXT:    mulps %xmm0, %xmm1
  77 ; SSE2-NEXT:    movaps %xmm1, %xmm0
  78 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
  79 ; SSE2-NEXT:    addss %xmm1, %xmm0
  80 ; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
  81 ; SSE2-NEXT:    addss %xmm1, %xmm0
  82 ; SSE2-NEXT:    retq
  83 ;
  84 ; SSSE3-LABEL: dot3_float4_as_float3:
  85 ; SSSE3:       # %bb.0:
  86 ; SSSE3-NEXT:    movups (%rdi), %xmm0
  87 ; SSSE3-NEXT:    movups (%rsi), %xmm1
  88 ; SSSE3-NEXT:    mulps %xmm0, %xmm1
  89 ; SSSE3-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
  90 ; SSSE3-NEXT:    addss %xmm1, %xmm0
  91 ; SSSE3-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
  92 ; SSSE3-NEXT:    addss %xmm1, %xmm0
  93 ; SSSE3-NEXT:    retq
  94 ;
  95 ; SSE41-LABEL: dot3_float4_as_float3:
  96 ; SSE41:       # %bb.0:
  97 ; SSE41-NEXT:    movups (%rdi), %xmm0
  98 ; SSE41-NEXT:    movups (%rsi), %xmm1
  99 ; SSE41-NEXT:    mulps %xmm0, %xmm1
 100 ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
 101 ; SSE41-NEXT:    addss %xmm1, %xmm0
 102 ; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
 103 ; SSE41-NEXT:    addss %xmm1, %xmm0
 104 ; SSE41-NEXT:    retq
 105 ;
 106 ; AVX-LABEL: dot3_float4_as_float3:
 107 ; AVX:       # %bb.0:
 108 ; AVX-NEXT:    vmovups (%rdi), %xmm0
 109 ; AVX-NEXT:    vmulps (%rsi), %xmm0, %xmm0
 110 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 111 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
 112 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 113 ; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
 114 ; AVX-NEXT:    retq
       ; 3-element dot product where the operands are loaded as whole
       ; <4 x float> vectors (4-byte aligned) and then narrowed to <3 x float>.
 115   %bcx0123 = bitcast float* %a0 to <4 x float>*
 116   %bcy0123 = bitcast float* %a1 to <4 x float>*
 117   %x0123 = load <4 x float>, <4 x float>* %bcx0123, align 4
 118   %y0123 = load <4 x float>, <4 x float>* %bcy0123, align 4
       ; Keep lanes 0..2 of each loaded vector; lane 3 is dropped before the
       ; multiply, so the fmul itself operates on <3 x float>.
 119   %x012 = shufflevector <4 x float> %x0123, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
 120   %y012 = shufflevector <4 x float> %y0123, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
 121   %mul012 = fmul <3 x float> %x012, %y012
 122   %mul0 = extractelement <3 x float> %mul012, i32 0
 123   %mul1 = extractelement <3 x float> %mul012, i32 1
 124   %mul2 = extractelement <3 x float> %mul012, i32 2
       ; Scalar reduction in a fixed order: (mul0 + mul1) + mul2.
 125   %dot01 = fadd float %mul0, %mul1
 126   %dot012 = fadd float %dot01, %mul2
 127   ret float %dot012
 128 }
 129
 130 define float @dot3_float3(float* dereferenceable(16) %a0, float* dereferenceable(16) %a1) {
 131 ; SSE2-LABEL: dot3_float3:
 132 ; SSE2:       # %bb.0:
 133 ; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 134 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 135 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
 136 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
 137 ; SSE2-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
 138 ; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 139 ; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
 140 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
 141 ; SSE2-NEXT:    mulps %xmm0, %xmm1
 142 ; SSE2-NEXT:    movaps %xmm1, %xmm0
 143 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
 144 ; SSE2-NEXT:    addss %xmm1, %xmm0
 145 ; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
 146 ; SSE2-NEXT:    addss %xmm1, %xmm0
 147 ; SSE2-NEXT:    retq
 148 ;
 149 ; SSSE3-LABEL: dot3_float3:
 150 ; SSSE3:       # %bb.0:
 151 ; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 152 ; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 153 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
 154 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
 155 ; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
 156 ; SSSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 157 ; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
 158 ; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
 159 ; SSSE3-NEXT:    mulps %xmm0, %xmm1
 160 ; SSSE3-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
 161 ; SSSE3-NEXT:    addss %xmm1, %xmm0
 162 ; SSSE3-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
 163 ; SSSE3-NEXT:    addss %xmm1, %xmm0
 164 ; SSSE3-NEXT:    retq
 165 ;
 166 ; SSE41-LABEL: dot3_float3:
 167 ; SSE41:       # %bb.0:
 168 ; SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 169 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 170 ; SSE41-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
 171 ; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
 172 ; SSE41-NEXT:    mulps %xmm0, %xmm1
 173 ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
 174 ; SSE41-NEXT:    addss %xmm1, %xmm0
 175 ; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
 176 ; SSE41-NEXT:    addss %xmm1, %xmm0
 177 ; SSE41-NEXT:    retq
 178 ;
 179 ; AVX-LABEL: dot3_float3:
 180 ; AVX:       # %bb.0:
 181 ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 182 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 183 ; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 184 ; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
 185 ; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 186 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 187 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
 188 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 189 ; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
 190 ; AVX-NEXT:    retq
       ; 3-element dot product where each operand is loaded directly as a
       ; <3 x float> (4-byte aligned). Both pointers are marked
       ; dereferenceable(16), but the IR only reads three floats from each.
 191   %bcx012 = bitcast float* %a0 to <3 x float>*
 192   %bcy012 = bitcast float* %a1 to <3 x float>*
 193   %x012 = load <3 x float>, <3 x float>* %bcx012, align 4
 194   %y012 = load <3 x float>, <3 x float>* %bcy012, align 4
 195   %mul012 = fmul <3 x float> %x012, %y012
 196   %mul0 = extractelement <3 x float> %mul012, i32 0
 197   %mul1 = extractelement <3 x float> %mul012, i32 1
 198   %mul2 = extractelement <3 x float> %mul012, i32 2
       ; Scalar reduction in a fixed order: (mul0 + mul1) + mul2.
 199   %dot01 = fadd float %mul0, %mul1
 200   %dot012 = fadd float %dot01, %mul2
 201   ret float %dot012
 202 }
 203
 204 define float @dot3_float2_float(float* dereferenceable(16) %a0, float* dereferenceable(16) %a1) {
 205 ; SSE2-LABEL: dot3_float2_float:
 206 ; SSE2:       # %bb.0:
 207 ; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 208 ; SSE2-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
 209 ; SSE2-NEXT:    mulps %xmm0, %xmm1
 210 ; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 211 ; SSE2-NEXT:    mulss 8(%rsi), %xmm2
 212 ; SSE2-NEXT:    movaps %xmm1, %xmm0
 213 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
 214 ; SSE2-NEXT:    addss %xmm1, %xmm0
 215 ; SSE2-NEXT:    addss %xmm2, %xmm0
 216 ; SSE2-NEXT:    retq
 217 ;
 218 ; SSSE3-LABEL: dot3_float2_float:
 219 ; SSSE3:       # %bb.0:
 220 ; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 221 ; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
 222 ; SSSE3-NEXT:    mulps %xmm0, %xmm1
 223 ; SSSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 224 ; SSSE3-NEXT:    mulss 8(%rsi), %xmm2
 225 ; SSSE3-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
 226 ; SSSE3-NEXT:    addss %xmm1, %xmm0
 227 ; SSSE3-NEXT:    addss %xmm2, %xmm0
 228 ; SSSE3-NEXT:    retq
 229 ;
 230 ; SSE41-LABEL: dot3_float2_float:
 231 ; SSE41:       # %bb.0:
 232 ; SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 233 ; SSE41-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
 234 ; SSE41-NEXT:    mulps %xmm0, %xmm1
 235 ; SSE41-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 236 ; SSE41-NEXT:    mulss 8(%rsi), %xmm2
 237 ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
 238 ; SSE41-NEXT:    addss %xmm1, %xmm0
 239 ; SSE41-NEXT:    addss %xmm2, %xmm0
 240 ; SSE41-NEXT:    retq
 241 ;
 242 ; AVX-LABEL: dot3_float2_float:
 243 ; AVX:       # %bb.0:
 244 ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 245 ; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 246 ; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 247 ; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 248 ; AVX-NEXT:    vmulss 8(%rsi), %xmm1, %xmm1
 249 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
 250 ; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
 251 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 252 ; AVX-NEXT:    retq
       ; 3-element dot product split as a <2 x float> part (elements 0 and 1)
       ; followed by a scalar part (element 2). All loads are 4-byte aligned.
 253   %bcx01 = bitcast float* %a0 to <2 x float>*
 254   %bcy01 = bitcast float* %a1 to <2 x float>*
 255   %x01 = load <2 x float>, <2 x float>* %bcx01, align 4
 256   %y01 = load <2 x float>, <2 x float>* %bcy01, align 4
       ; Element 2 is loaded separately as a scalar from float index 2.
 257   %ptrx2 = getelementptr inbounds float, float* %a0, i64 2
 258   %ptry2 = getelementptr inbounds float, float* %a1, i64 2
 259   %x2 = load float, float* %ptrx2, align 4
 260   %y2 = load float, float* %ptry2, align 4
 261   %mul01 = fmul <2 x float> %x01, %y01
 262   %mul2 = fmul float %x2, %y2
 263   %mul0 = extractelement <2 x float> %mul01, i32 0
 264   %mul1 = extractelement <2 x float> %mul01, i32 1
       ; Scalar reduction in a fixed order: (mul0 + mul1) + mul2.
 265   %dot01 = fadd float %mul0, %mul1
 266   %dot012 = fadd float %dot01, %mul2
 267   ret float %dot012
 268 }
 269
 270 define float @dot3_float_float2(float* dereferenceable(16) %a0, float* dereferenceable(16) %a1) {
 271 ; SSE2-LABEL: dot3_float_float2:
 272 ; SSE2:       # %bb.0:
 273 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 274 ; SSE2-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
 275 ; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 276 ; SSE2-NEXT:    mulps %xmm2, %xmm0
 277 ; SSE2-NEXT:    mulss (%rsi), %xmm1
 278 ; SSE2-NEXT:    addss %xmm0, %xmm1
 279 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
 280 ; SSE2-NEXT:    addss %xmm1, %xmm0
 281 ; SSE2-NEXT:    retq
 282 ;
 283 ; SSSE3-LABEL: dot3_float_float2:
 284 ; SSSE3:       # %bb.0:
 285 ; SSSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 286 ; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
 287 ; SSSE3-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
 288 ; SSSE3-NEXT:    mulps %xmm1, %xmm2
 289 ; SSSE3-NEXT:    mulss (%rsi), %xmm0
 290 ; SSSE3-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
 291 ; SSSE3-NEXT:    addss %xmm2, %xmm0
 292 ; SSSE3-NEXT:    addss %xmm1, %xmm0
 293 ; SSSE3-NEXT:    retq
 294 ;
 295 ; SSE41-LABEL: dot3_float_float2:
 296 ; SSE41:       # %bb.0:
 297 ; SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 298 ; SSE41-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
 299 ; SSE41-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
 300 ; SSE41-NEXT:    mulps %xmm1, %xmm2
 301 ; SSE41-NEXT:    mulss (%rsi), %xmm0
 302 ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
 303 ; SSE41-NEXT:    addss %xmm2, %xmm0
 304 ; SSE41-NEXT:    addss %xmm1, %xmm0
 305 ; SSE41-NEXT:    retq
 306 ;
 307 ; AVX-LABEL: dot3_float_float2:
 308 ; AVX:       # %bb.0:
 309 ; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 310 ; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 311 ; AVX-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
 312 ; AVX-NEXT:    vmulps %xmm2, %xmm1, %xmm1
 313 ; AVX-NEXT:    vmulss (%rsi), %xmm0, %xmm0
 314 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
 315 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 316 ; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
 317 ; AVX-NEXT:    retq
       ; 3-element dot product split as a scalar part (element 0) followed by
       ; a <2 x float> part (elements 1 and 2). All loads are 4-byte aligned.
 318   %x0 = load float, float* %a0, align 4
 319   %y0 = load float, float* %a1, align 4
       ; Elements 1 and 2 are loaded together as a <2 x float> starting at
       ; float index 1.
 320   %ptrx12 = getelementptr inbounds float, float* %a0, i64 1
 321   %ptry12 = getelementptr inbounds float, float* %a1, i64 1
 322   %bcx12 = bitcast float* %ptrx12 to <2 x float>*
 323   %bcy12 = bitcast float* %ptry12 to <2 x float>*
 324   %x12 = load <2 x float>, <2 x float>* %bcx12, align 4
 325   %y12 = load <2 x float>, <2 x float>* %bcy12, align 4
 326   %mul0 = fmul float %x0, %y0
 327   %mul12 = fmul <2 x float> %x12, %y12
       ; Lane 0 of the vector product is element 1 of the dot product and
       ; lane 1 is element 2.
 328   %mul1 = extractelement <2 x float> %mul12, i32 0
 329   %mul2 = extractelement <2 x float> %mul12, i32 1
       ; Scalar reduction in a fixed order: (mul0 + mul1) + mul2.
 330   %dot01 = fadd float %mul0, %mul1
 331   %dot012 = fadd float %dot01, %mul2
 332   ret float %dot012
 333 }
 334
 335 ;
 336 ; dot2(float *x, float *y) - ((x[0]*y[0])+(x[1]*y[1]))
 337 ;
 338
 339 define float @dot2_float4(float* dereferenceable(16) %a0, float* dereferenceable(16) %a1) {
 340 ; SSE2-LABEL: dot2_float4:
 341 ; SSE2:       # %bb.0:
 342 ; SSE2-NEXT:    movups (%rdi), %xmm0
 343 ; SSE2-NEXT:    movups (%rsi), %xmm1
 344 ; SSE2-NEXT:    mulps %xmm0, %xmm1
 345 ; SSE2-NEXT:    movaps %xmm1, %xmm0
 346 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
 347 ; SSE2-NEXT:    addss %xmm1, %xmm0
 348 ; SSE2-NEXT:    retq
 349 ;
 350 ; SSSE3-LABEL: dot2_float4:
 351 ; SSSE3:       # %bb.0:
 352 ; SSSE3-NEXT:    movups (%rdi), %xmm0
 353 ; SSSE3-NEXT:    movups (%rsi), %xmm1
 354 ; SSSE3-NEXT:    mulps %xmm0, %xmm1
 355 ; SSSE3-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
 356 ; SSSE3-NEXT:    addss %xmm1, %xmm0
 357 ; SSSE3-NEXT:    retq
 358 ;
 359 ; SSE41-LABEL: dot2_float4:
 360 ; SSE41:       # %bb.0:
 361 ; SSE41-NEXT:    movups (%rdi), %xmm0
 362 ; SSE41-NEXT:    movups (%rsi), %xmm1
 363 ; SSE41-NEXT:    mulps %xmm0, %xmm1
 364 ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
 365 ; SSE41-NEXT:    addss %xmm1, %xmm0
 366 ; SSE41-NEXT:    retq
 367 ;
 368 ; AVX-LABEL: dot2_float4:
 369 ; AVX:       # %bb.0:
 370 ; AVX-NEXT:    vmovups (%rdi), %xmm0
 371 ; AVX-NEXT:    vmulps (%rsi), %xmm0, %xmm0
 372 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 373 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 374 ; AVX-NEXT:    retq
       ; 2-element dot product where both operands are loaded as whole
       ; <4 x float> vectors (4-byte aligned) and multiplied lane-wise.
 375   %bcx0123 = bitcast float* %a0 to <4 x float>*
 376   %bcy0123 = bitcast float* %a1 to <4 x float>*
 377   %x0123 = load <4 x float>, <4 x float>* %bcx0123, align 4
 378   %y0123 = load <4 x float>, <4 x float>* %bcy0123, align 4
 379   %mul0123 = fmul <4 x float> %x0123, %y0123
       ; Only lanes 0 and 1 of the product are extracted; lanes 2 and 3 are
       ; unused.
 380   %mul0 = extractelement <4 x float> %mul0123, i32 0
 381   %mul1 = extractelement <4 x float> %mul0123, i32 1
 382   %dot01 = fadd float %mul0, %mul1
 383   ret float %dot01
 384 }
 385
 386 define float @dot2_float2(float* dereferenceable(16) %a0, float* dereferenceable(16) %a1) {
 387 ; SSE2-LABEL: dot2_float2:
 388 ; SSE2:       # %bb.0:
 389 ; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 390 ; SSE2-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
 391 ; SSE2-NEXT:    mulps %xmm0, %xmm1
 392 ; SSE2-NEXT:    movaps %xmm1, %xmm0
 393 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
 394 ; SSE2-NEXT:    addss %xmm1, %xmm0
 395 ; SSE2-NEXT:    retq
 396 ;
 397 ; SSSE3-LABEL: dot2_float2:
 398 ; SSSE3:       # %bb.0:
 399 ; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 400 ; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
 401 ; SSSE3-NEXT:    mulps %xmm0, %xmm1
 402 ; SSSE3-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
 403 ; SSSE3-NEXT:    addss %xmm1, %xmm0
 404 ; SSSE3-NEXT:    retq
 405 ;
 406 ; SSE41-LABEL: dot2_float2:
 407 ; SSE41:       # %bb.0:
 408 ; SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 409 ; SSE41-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
 410 ; SSE41-NEXT:    mulps %xmm0, %xmm1
 411 ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
 412 ; SSE41-NEXT:    addss %xmm1, %xmm0
 413 ; SSE41-NEXT:    retq
 414 ;
 415 ; AVX-LABEL: dot2_float2:
 416 ; AVX:       # %bb.0:
 417 ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 418 ; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 419 ; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 420 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 421 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 422 ; AVX-NEXT:    retq
       ; 2-element dot product where each operand is loaded as a <2 x float>
       ; (4-byte aligned); both lanes of the product are summed.
 423   %bcx01 = bitcast float* %a0 to <2 x float>*
 424   %bcy01 = bitcast float* %a1 to <2 x float>*
 425   %x01 = load <2 x float>, <2 x float>* %bcx01, align 4
 426   %y01 = load <2 x float>, <2 x float>* %bcy01, align 4
 427   %mul01 = fmul <2 x float> %x01, %y01
 428   %mul0 = extractelement <2 x float> %mul01, i32 0
 429   %mul1 = extractelement <2 x float> %mul01, i32 1
 430   %dot01 = fadd float %mul0, %mul1
 431   ret float %dot01
 432 }