llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
   2 ; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s
   3 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s
   4 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s
   5 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s
   6
   7 ;
   8 ; dot4(ptr x, ptr y) - ((x[0]*y[0])+(x[1]*y[1])+(x[2]*y[2])+(x[3]*y[3]))
   9 ;
  10
  11 define double @dot4f64(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %ptry) {
     ; 4-element f64 dot product whose fadd chain carries no fast-math flags.
     ; The checks expect the lanes to be paired into two <2 x double>
     ; load/fmul groups (elements 0-1 and elements 2-3), with every product
     ; lane then extracted and summed by the original scalar fadd chain.
     ; No llvm.vector.reduce.fadd call is expected for this function.
  12 ; CHECK-LABEL: @dot4f64(
  13 ; CHECK-NEXT:    [[PTRX2:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 2
  14 ; CHECK-NEXT:    [[PTRY2:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 2
  15 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX]], align 4
  16 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY]], align 4
  17 ; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]]
  18 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[PTRX2]], align 4
  19 ; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr [[PTRY2]], align 4
  20 ; CHECK-NEXT:    [[TMP6:%.*]] = fmul <2 x double> [[TMP4]], [[TMP5]]
  21 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
  22 ; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
  23 ; CHECK-NEXT:    [[DOT01:%.*]] = fadd double [[TMP7]], [[TMP8]]
  24 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
  25 ; CHECK-NEXT:    [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP9]]
  26 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
  27 ; CHECK-NEXT:    [[DOT0123:%.*]] = fadd double [[DOT012]], [[TMP10]]
  28 ; CHECK-NEXT:    ret double [[DOT0123]]
  29 ;
  30   %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1
  31   %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1
  32   %ptrx2 = getelementptr inbounds double, ptr %ptrx, i64 2
  33   %ptry2 = getelementptr inbounds double, ptr %ptry, i64 2
  34   %ptrx3 = getelementptr inbounds double, ptr %ptrx, i64 3
  35   %ptry3 = getelementptr inbounds double, ptr %ptry, i64 3
       ; All scalar loads use align 4; the expected vector loads keep align 4.
  36   %x0 = load double, ptr %ptrx, align 4
  37   %y0 = load double, ptr %ptry, align 4
  38   %x1 = load double, ptr %ptrx1, align 4
  39   %y1 = load double, ptr %ptry1, align 4
  40   %x2 = load double, ptr %ptrx2, align 4
  41   %y2 = load double, ptr %ptry2, align 4
  42   %x3 = load double, ptr %ptrx3, align 4
  43   %y3 = load double, ptr %ptry3, align 4
  44   %mul0 = fmul double %x0, %y0
  45   %mul1 = fmul double %x1, %y1
  46   %mul2 = fmul double %x2, %y2
  47   %mul3 = fmul double %x3, %y3
       ; Ordered accumulation: ((mul0 + mul1) + mul2) + mul3.
  48   %dot01 = fadd double %mul0, %mul1
  49   %dot012 = fadd double %dot01, %mul2
  50   %dot0123 = fadd double %dot012, %mul3
  51   ret double %dot0123
  52 }
  53
  54 define float @dot4f32(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) {
     ; 4-element f32 dot product whose fadd chain carries no fast-math flags.
     ; The checks expect two <2 x float> load/fmul groups (elements 0-1 and
     ; elements 2-3) rather than a single <4 x float> group; each product
     ; lane is extracted and fed to the original scalar fadd chain.
     ; No llvm.vector.reduce.fadd call is expected for this function.
  55 ; CHECK-LABEL: @dot4f32(
  56 ; CHECK-NEXT:    [[PTRX2:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 2
  57 ; CHECK-NEXT:    [[PTRY2:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 2
  58 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX]], align 4
  59 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY]], align 4
  60 ; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]]
  61 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x float>, ptr [[PTRX2]], align 4
  62 ; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x float>, ptr [[PTRY2]], align 4
  63 ; CHECK-NEXT:    [[TMP6:%.*]] = fmul <2 x float> [[TMP4]], [[TMP5]]
  64 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
  65 ; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
  66 ; CHECK-NEXT:    [[DOT01:%.*]] = fadd float [[TMP7]], [[TMP8]]
  67 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
  68 ; CHECK-NEXT:    [[DOT012:%.*]] = fadd float [[DOT01]], [[TMP9]]
  69 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
  70 ; CHECK-NEXT:    [[DOT0123:%.*]] = fadd float [[DOT012]], [[TMP10]]
  71 ; CHECK-NEXT:    ret float [[DOT0123]]
  72 ;
  73   %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1
  74   %ptry1 = getelementptr inbounds float, ptr %ptry, i64 1
  75   %ptrx2 = getelementptr inbounds float, ptr %ptrx, i64 2
  76   %ptry2 = getelementptr inbounds float, ptr %ptry, i64 2
  77   %ptrx3 = getelementptr inbounds float, ptr %ptrx, i64 3
  78   %ptry3 = getelementptr inbounds float, ptr %ptry, i64 3
  79   %x0 = load float, ptr %ptrx, align 4
  80   %y0 = load float, ptr %ptry, align 4
  81   %x1 = load float, ptr %ptrx1, align 4
  82   %y1 = load float, ptr %ptry1, align 4
  83   %x2 = load float, ptr %ptrx2, align 4
  84   %y2 = load float, ptr %ptry2, align 4
  85   %x3 = load float, ptr %ptrx3, align 4
  86   %y3 = load float, ptr %ptry3, align 4
  87   %mul0 = fmul float %x0, %y0
  88   %mul1 = fmul float %x1, %y1
  89   %mul2 = fmul float %x2, %y2
  90   %mul3 = fmul float %x3, %y3
       ; Ordered accumulation: ((mul0 + mul1) + mul2) + mul3.
  91   %dot01 = fadd float %mul0, %mul1
  92   %dot012 = fadd float %dot01, %mul2
  93   %dot0123 = fadd float %dot012, %mul3
  94   ret float %dot0123
  95 }
  96
  97 define double @dot4f64_fast(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %ptry) {
     ; 4-element f64 dot product where every fadd in the chain is `fast`.
     ; The checks expect full vectorization: one <4 x double> load per
     ; operand, a single <4 x double> fmul, and a `fast` call to
     ; llvm.vector.reduce.fadd.v4f64 with a 0.0 start value.
  98 ; CHECK-LABEL: @dot4f64_fast(
  99 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4
 100 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4
 101 ; CHECK-NEXT:    [[TMP3:%.*]] = fmul <4 x double> [[TMP1]], [[TMP2]]
 102 ; CHECK-NEXT:    [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]])
 103 ; CHECK-NEXT:    ret double [[TMP4]]
 104 ;
 105   %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1
 106   %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1
 107   %ptrx2 = getelementptr inbounds double, ptr %ptrx, i64 2
 108   %ptry2 = getelementptr inbounds double, ptr %ptry, i64 2
 109   %ptrx3 = getelementptr inbounds double, ptr %ptrx, i64 3
 110   %ptry3 = getelementptr inbounds double, ptr %ptry, i64 3
 111   %x0 = load double, ptr %ptrx, align 4
 112   %y0 = load double, ptr %ptry, align 4
 113   %x1 = load double, ptr %ptrx1, align 4
 114   %y1 = load double, ptr %ptry1, align 4
 115   %x2 = load double, ptr %ptrx2, align 4
 116   %y2 = load double, ptr %ptry2, align 4
 117   %x3 = load double, ptr %ptrx3, align 4
 118   %y3 = load double, ptr %ptry3, align 4
       ; The fmuls carry no fast-math flags; only the fadds below are `fast`.
 119   %mul0 = fmul double %x0, %y0
 120   %mul1 = fmul double %x1, %y1
 121   %mul2 = fmul double %x2, %y2
 122   %mul3 = fmul double %x3, %y3
 123   %dot01 = fadd fast double %mul0, %mul1
 124   %dot012 = fadd fast double %dot01, %mul2
 125   %dot0123 = fadd fast double %dot012, %mul3
 126   ret double %dot0123
 127 }
 128
 129 define float @dot4f32_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) {
     ; 4-element f32 dot product where every fadd in the chain is `fast`.
     ; The checks expect full vectorization: one <4 x float> load per
     ; operand, a single <4 x float> fmul, and a `fast` call to
     ; llvm.vector.reduce.fadd.v4f32 with a 0.0 start value.
 130 ; CHECK-LABEL: @dot4f32_fast(
 131 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[PTRX:%.*]], align 4
 132 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[PTRY:%.*]], align 4
 133 ; CHECK-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]]
 134 ; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
 135 ; CHECK-NEXT:    ret float [[TMP4]]
 136 ;
 137   %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1
 138   %ptry1 = getelementptr inbounds float, ptr %ptry, i64 1
 139   %ptrx2 = getelementptr inbounds float, ptr %ptrx, i64 2
 140   %ptry2 = getelementptr inbounds float, ptr %ptry, i64 2
 141   %ptrx3 = getelementptr inbounds float, ptr %ptrx, i64 3
 142   %ptry3 = getelementptr inbounds float, ptr %ptry, i64 3
 143   %x0 = load float, ptr %ptrx, align 4
 144   %y0 = load float, ptr %ptry, align 4
 145   %x1 = load float, ptr %ptrx1, align 4
 146   %y1 = load float, ptr %ptry1, align 4
 147   %x2 = load float, ptr %ptrx2, align 4
 148   %y2 = load float, ptr %ptry2, align 4
 149   %x3 = load float, ptr %ptrx3, align 4
 150   %y3 = load float, ptr %ptry3, align 4
       ; The fmuls carry no fast-math flags; only the fadds below are `fast`.
 151   %mul0 = fmul float %x0, %y0
 152   %mul1 = fmul float %x1, %y1
 153   %mul2 = fmul float %x2, %y2
 154   %mul3 = fmul float %x3, %y3
 155   %dot01 = fadd fast float %mul0, %mul1
 156   %dot012 = fadd fast float %dot01, %mul2
 157   %dot0123 = fadd fast float %dot012, %mul3
 158   ret float %dot0123
 159 }
 160
 161 ;
 162 ; dot3(ptr x, ptr y) - ((x[0]*y[0])+(x[1]*y[1])+(x[2]*y[2]))
 163 ;
 164
 165 define double @dot3f64(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %ptry) {
     ; 3-element f64 dot product whose fadd chain carries no fast-math flags.
     ; The checks expect element 0 to stay scalar (load, load, fmul) while
     ; elements 1-2 are combined into one <2 x double> load/fmul group; both
     ; product lanes are extracted and summed by the original scalar fadds.
     ; Only elements 0..2 are loaded even though 32 bytes are declared
     ; dereferenceable on each pointer.
 166 ; CHECK-LABEL: @dot3f64(
 167 ; CHECK-NEXT:    [[PTRX1:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 1
 168 ; CHECK-NEXT:    [[PTRY1:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 1
 169 ; CHECK-NEXT:    [[X0:%.*]] = load double, ptr [[PTRX]], align 4
 170 ; CHECK-NEXT:    [[Y0:%.*]] = load double, ptr [[PTRY]], align 4
 171 ; CHECK-NEXT:    [[MUL0:%.*]] = fmul double [[X0]], [[Y0]]
 172 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX1]], align 4
 173 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY1]], align 4
 174 ; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]]
 175 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
 176 ; CHECK-NEXT:    [[DOT01:%.*]] = fadd double [[MUL0]], [[TMP4]]
 177 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
 178 ; CHECK-NEXT:    [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP5]]
 179 ; CHECK-NEXT:    ret double [[DOT012]]
 180 ;
 181   %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1
 182   %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1
 183   %ptrx2 = getelementptr inbounds double, ptr %ptrx, i64 2
 184   %ptry2 = getelementptr inbounds double, ptr %ptry, i64 2
 185   %x0 = load double, ptr %ptrx, align 4
 186   %y0 = load double, ptr %ptry, align 4
 187   %x1 = load double, ptr %ptrx1, align 4
 188   %y1 = load double, ptr %ptry1, align 4
 189   %x2 = load double, ptr %ptrx2, align 4
 190   %y2 = load double, ptr %ptry2, align 4
 191   %mul0 = fmul double %x0, %y0
 192   %mul1 = fmul double %x1, %y1
 193   %mul2 = fmul double %x2, %y2
       ; Ordered accumulation: (mul0 + mul1) + mul2.
 194   %dot01 = fadd double %mul0, %mul1
 195   %dot012 = fadd double %dot01, %mul2
 196   ret double %dot012
 197 }
 198
 199 define float @dot3f32(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) {
     ; 3-element f32 dot product whose fadd chain carries no fast-math flags.
     ; The checks expect element 0 to stay scalar (load, load, fmul) while
     ; elements 1-2 are combined into one <2 x float> load/fmul group; both
     ; product lanes are extracted and summed by the original scalar fadds.
 200 ; CHECK-LABEL: @dot3f32(
 201 ; CHECK-NEXT:    [[PTRX1:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 1
 202 ; CHECK-NEXT:    [[PTRY1:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 1
 203 ; CHECK-NEXT:    [[X0:%.*]] = load float, ptr [[PTRX]], align 4
 204 ; CHECK-NEXT:    [[Y0:%.*]] = load float, ptr [[PTRY]], align 4
 205 ; CHECK-NEXT:    [[MUL0:%.*]] = fmul float [[X0]], [[Y0]]
 206 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX1]], align 4
 207 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY1]], align 4
 208 ; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]]
 209 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
 210 ; CHECK-NEXT:    [[DOT01:%.*]] = fadd float [[MUL0]], [[TMP4]]
 211 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
 212 ; CHECK-NEXT:    [[DOT012:%.*]] = fadd float [[DOT01]], [[TMP5]]
 213 ; CHECK-NEXT:    ret float [[DOT012]]
 214 ;
 215   %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1
 216   %ptry1 = getelementptr inbounds float, ptr %ptry, i64 1
 217   %ptrx2 = getelementptr inbounds float, ptr %ptrx, i64 2
 218   %ptry2 = getelementptr inbounds float, ptr %ptry, i64 2
 219   %x0 = load float, ptr %ptrx, align 4
 220   %y0 = load float, ptr %ptry, align 4
 221   %x1 = load float, ptr %ptrx1, align 4
 222   %y1 = load float, ptr %ptry1, align 4
 223   %x2 = load float, ptr %ptrx2, align 4
 224   %y2 = load float, ptr %ptry2, align 4
 225   %mul0 = fmul float %x0, %y0
 226   %mul1 = fmul float %x1, %y1
 227   %mul2 = fmul float %x2, %y2
       ; Ordered accumulation: (mul0 + mul1) + mul2.
 228   %dot01 = fadd float %mul0, %mul1
 229   %dot012 = fadd float %dot01, %mul2
 230   ret float %dot012
 231 }
 232
 233 define double @dot3f64_fast(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %ptry) {
     ; 3-element f64 dot product where both fadds are `fast`.
     ; Even with `fast`, the checks expect no reduction intrinsic: element 0
     ; stays scalar, elements 1-2 become one <2 x double> load/fmul group,
     ; and the two fadds remain scalar with their `fast` flag preserved.
 234 ; CHECK-LABEL: @dot3f64_fast(
 235 ; CHECK-NEXT:    [[PTRX1:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 1
 236 ; CHECK-NEXT:    [[PTRY1:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 1
 237 ; CHECK-NEXT:    [[X0:%.*]] = load double, ptr [[PTRX]], align 4
 238 ; CHECK-NEXT:    [[Y0:%.*]] = load double, ptr [[PTRY]], align 4
 239 ; CHECK-NEXT:    [[MUL0:%.*]] = fmul double [[X0]], [[Y0]]
 240 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX1]], align 4
 241 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY1]], align 4
 242 ; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]]
 243 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
 244 ; CHECK-NEXT:    [[DOT01:%.*]] = fadd fast double [[MUL0]], [[TMP4]]
 245 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
 246 ; CHECK-NEXT:    [[DOT012:%.*]] = fadd fast double [[DOT01]], [[TMP5]]
 247 ; CHECK-NEXT:    ret double [[DOT012]]
 248 ;
 249   %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1
 250   %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1
 251   %ptrx2 = getelementptr inbounds double, ptr %ptrx, i64 2
 252   %ptry2 = getelementptr inbounds double, ptr %ptry, i64 2
 253   %x0 = load double, ptr %ptrx, align 4
 254   %y0 = load double, ptr %ptry, align 4
 255   %x1 = load double, ptr %ptrx1, align 4
 256   %y1 = load double, ptr %ptry1, align 4
 257   %x2 = load double, ptr %ptrx2, align 4
 258   %y2 = load double, ptr %ptry2, align 4
       ; The fmuls carry no fast-math flags; only the fadds below are `fast`.
 259   %mul0 = fmul double %x0, %y0
 260   %mul1 = fmul double %x1, %y1
 261   %mul2 = fmul double %x2, %y2
 262   %dot01 = fadd fast double %mul0, %mul1
 263   %dot012 = fadd fast double %dot01, %mul2
 264   ret double %dot012
 265 }
 266
 267 define float @dot3f32_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) {
     ; 3-element f32 dot product where both fadds are `fast`.
     ; Even with `fast`, the checks expect no reduction intrinsic: element 0
     ; stays scalar, elements 1-2 become one <2 x float> load/fmul group,
     ; and the two fadds remain scalar with their `fast` flag preserved.
 268 ; CHECK-LABEL: @dot3f32_fast(
 269 ; CHECK-NEXT:    [[PTRX1:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 1
 270 ; CHECK-NEXT:    [[PTRY1:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 1
 271 ; CHECK-NEXT:    [[X0:%.*]] = load float, ptr [[PTRX]], align 4
 272 ; CHECK-NEXT:    [[Y0:%.*]] = load float, ptr [[PTRY]], align 4
 273 ; CHECK-NEXT:    [[MUL0:%.*]] = fmul float [[X0]], [[Y0]]
 274 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX1]], align 4
 275 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY1]], align 4
 276 ; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]]
 277 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
 278 ; CHECK-NEXT:    [[DOT01:%.*]] = fadd fast float [[MUL0]], [[TMP4]]
 279 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
 280 ; CHECK-NEXT:    [[DOT012:%.*]] = fadd fast float [[DOT01]], [[TMP5]]
 281 ; CHECK-NEXT:    ret float [[DOT012]]
 282 ;
 283   %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1
 284   %ptry1 = getelementptr inbounds float, ptr %ptry, i64 1
 285   %ptrx2 = getelementptr inbounds float, ptr %ptrx, i64 2
 286   %ptry2 = getelementptr inbounds float, ptr %ptry, i64 2
 287   %x0 = load float, ptr %ptrx, align 4
 288   %y0 = load float, ptr %ptry, align 4
 289   %x1 = load float, ptr %ptrx1, align 4
 290   %y1 = load float, ptr %ptry1, align 4
 291   %x2 = load float, ptr %ptrx2, align 4
 292   %y2 = load float, ptr %ptry2, align 4
       ; The fmuls carry no fast-math flags; only the fadds below are `fast`.
 293   %mul0 = fmul float %x0, %y0
 294   %mul1 = fmul float %x1, %y1
 295   %mul2 = fmul float %x2, %y2
 296   %dot01 = fadd fast float %mul0, %mul1
 297   %dot012 = fadd fast float %dot01, %mul2
 298   ret float %dot012
 299 }
 300
 301 ;
 302 ; dot2(ptr x, ptr y) - ((x[0]*y[0])+(x[1]*y[1]))
 303 ;
 304
 305 define double @dot2f64(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) {
     ; 2-element f64 dot product with a single fadd that has no fast-math
     ; flags. The checks expect one <2 x double> load per operand and one
     ; <2 x double> fmul; both product lanes are then extracted and added by
     ; the original scalar fadd.
 306 ; CHECK-LABEL: @dot2f64(
 307 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX:%.*]], align 4
 308 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY:%.*]], align 4
 309 ; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]]
 310 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
 311 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
 312 ; CHECK-NEXT:    [[DOT01:%.*]] = fadd double [[TMP4]], [[TMP5]]
 313 ; CHECK-NEXT:    ret double [[DOT01]]
 314 ;
 315   %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1
 316   %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1
 317   %x0 = load double, ptr %ptrx, align 4
 318   %y0 = load double, ptr %ptry, align 4
 319   %x1 = load double, ptr %ptrx1, align 4
 320   %y1 = load double, ptr %ptry1, align 4
 321   %mul0 = fmul double %x0, %y0
 322   %mul1 = fmul double %x1, %y1
 323   %dot01 = fadd double %mul0, %mul1
 324   ret double %dot01
 325 }
 326
 327 define float @dot2f32(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) {
     ; 2-element f32 dot product with a single fadd that has no fast-math
     ; flags. The checks expect one <2 x float> load per operand and one
     ; <2 x float> fmul; both product lanes are then extracted and added by
     ; the original scalar fadd.
 328 ; CHECK-LABEL: @dot2f32(
 329 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX:%.*]], align 4
 330 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY:%.*]], align 4
 331 ; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]]
 332 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
 333 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
 334 ; CHECK-NEXT:    [[DOT01:%.*]] = fadd float [[TMP4]], [[TMP5]]
 335 ; CHECK-NEXT:    ret float [[DOT01]]
 336 ;
 337   %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1
 338   %ptry1 = getelementptr inbounds float, ptr %ptry, i64 1
 339   %x0 = load float, ptr %ptrx, align 4
 340   %y0 = load float, ptr %ptry, align 4
 341   %x1 = load float, ptr %ptrx1, align 4
 342   %y1 = load float, ptr %ptry1, align 4
 343   %mul0 = fmul float %x0, %y0
 344   %mul1 = fmul float %x1, %y1
 345   %dot01 = fadd float %mul0, %mul1
 346   ret float %dot01
 347 }
 348
 349 define double @dot2f64_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) {
     ; 2-element f64 dot product whose single fadd is `fast`.
     ; The checks expect one <2 x double> load per operand and one
     ; <2 x double> fmul; the final add stays a scalar `fadd fast` on the two
     ; extracted lanes, with no reduction intrinsic.
 350 ; CHECK-LABEL: @dot2f64_fast(
 351 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX:%.*]], align 4
 352 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY:%.*]], align 4
 353 ; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]]
 354 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
 355 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
 356 ; CHECK-NEXT:    [[DOT01:%.*]] = fadd fast double [[TMP4]], [[TMP5]]
 357 ; CHECK-NEXT:    ret double [[DOT01]]
 358 ;
 359   %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1
 360   %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1
 361   %x0 = load double, ptr %ptrx, align 4
 362   %y0 = load double, ptr %ptry, align 4
 363   %x1 = load double, ptr %ptrx1, align 4
 364   %y1 = load double, ptr %ptry1, align 4
       ; The fmuls carry no fast-math flags; only the fadd below is `fast`.
 365   %mul0 = fmul double %x0, %y0
 366   %mul1 = fmul double %x1, %y1
 367   %dot01 = fadd fast double %mul0, %mul1
 368   ret double %dot01
 369 }
 370
 371 define float @dot2f32_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) {
     ; 2-element f32 dot product whose single fadd is `fast`.
     ; The checks expect one <2 x float> load per operand and one
     ; <2 x float> fmul; the final add stays a scalar `fadd fast` on the two
     ; extracted lanes, with no reduction intrinsic.
 372 ; CHECK-LABEL: @dot2f32_fast(
 373 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX:%.*]], align 4
 374 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY:%.*]], align 4
 375 ; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]]
 376 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
 377 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
 378 ; CHECK-NEXT:    [[DOT01:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
 379 ; CHECK-NEXT:    ret float [[DOT01]]
 380 ;
 381   %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1
 382   %ptry1 = getelementptr inbounds float, ptr %ptry, i64 1
 383   %x0 = load float, ptr %ptrx, align 4
 384   %y0 = load float, ptr %ptry, align 4
 385   %x1 = load float, ptr %ptrx1, align 4
 386   %y1 = load float, ptr %ptry1, align 4
       ; The fmuls carry no fast-math flags; only the fadd below is `fast`.
 387   %mul0 = fmul float %x0, %y0
 388   %mul1 = fmul float %x1, %y1
 389   %dot01 = fadd fast float %mul0, %mul1
 390   ret float %dot01
 391 }