; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx | FileCheck %s --check-prefixes=CHECK,AVX

; This file tests the look-ahead operand reordering heuristic.
;
; This checks that operand reordering will reorder the operands of the adds
; by taking into consideration the instructions beyond the immediate
; predecessors.
;
; A[0] B[0] C[0] D[0] C[1] D[1] A[1] B[1]
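;
; Roughly the scalar computation being vectorized (an illustrative C-like
; sketch only; A, B, C and D stand for the loads from array[0..1],
; array[2..3], array[4..5] and array[6..7]):
;
;   array[0] = (array[0] - array[2]) + (array[4] - array[6]);
;   array[1] = (array[5] - array[7]) + (array[1] - array[3]);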
define void @lookahead_basic(ptr %array) {
; CHECK-LABEL: @lookahead_basic(
; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds double, ptr [[ARRAY:%.*]], i64 2
; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 4
; CHECK-NEXT:    [[IDX6:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 6
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY]], align 8
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[IDX2]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[IDX4]], align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[IDX6]], align 8
; CHECK-NEXT:    [[TMP4:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP5:%.*]] = fsub fast <2 x double> [[TMP2]], [[TMP3]]
; CHECK-NEXT:    [[TMP6:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP4]]
; CHECK-NEXT:    store <2 x double> [[TMP6]], ptr [[ARRAY]], align 8
; CHECK-NEXT:    ret void
;
  %idx1 = getelementptr inbounds double, ptr %array, i64 1
  %idx2 = getelementptr inbounds double, ptr %array, i64 2
  %idx3 = getelementptr inbounds double, ptr %array, i64 3
  %idx4 = getelementptr inbounds double, ptr %array, i64 4
  %idx5 = getelementptr inbounds double, ptr %array, i64 5
  %idx6 = getelementptr inbounds double, ptr %array, i64 6
  %idx7 = getelementptr inbounds double, ptr %array, i64 7

  %A_0 = load double, ptr %array, align 8
  %A_1 = load double, ptr %idx1, align 8
  %B_0 = load double, ptr %idx2, align 8
  %B_1 = load double, ptr %idx3, align 8
  %C_0 = load double, ptr %idx4, align 8
  %C_1 = load double, ptr %idx5, align 8
  %D_0 = load double, ptr %idx6, align 8
  %D_1 = load double, ptr %idx7, align 8

  %subAB_0 = fsub fast double %A_0, %B_0
  %subCD_0 = fsub fast double %C_0, %D_0

  %subAB_1 = fsub fast double %A_1, %B_1
  %subCD_1 = fsub fast double %C_1, %D_1

  %addABCD_0 = fadd fast double %subAB_0, %subCD_0
  %addCDAB_1 = fadd fast double %subCD_1, %subAB_1

  store double %addABCD_0, ptr %array, align 8
  store double %addCDAB_1, ptr %idx1, align 8
  ret void
}

; Check whether the look-ahead operand reordering heuristic will avoid
; bundling the alt opcodes. The vectorized code should have no shuffles.
;
; A[0] B[0] A[0] B[0] A[1] B[1] A[1] B[1]
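;
; Roughly the scalar computation (an illustrative C-like sketch; A and B
; stand for the loads from array[0..1] and array[2..3]):
;
;   array[0] = (array[0] + array[2]) + (array[0] - array[2]);
;   array[1] = (array[1] - array[3]) + (array[1] + array[3]);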
define void @lookahead_alt1(ptr %array) {
; CHECK-LABEL: @lookahead_alt1(
; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds double, ptr [[ARRAY:%.*]], i64 2
; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 4
; CHECK-NEXT:    [[IDX5:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 5
; CHECK-NEXT:    [[IDX6:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 6
; CHECK-NEXT:    [[IDX7:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 7
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY]], align 8
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[IDX2]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP3:%.*]] = fadd fast <2 x double> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP2]]
; CHECK-NEXT:    store <2 x double> [[TMP4]], ptr [[ARRAY]], align 8
; CHECK-NEXT:    ret void
;
  %idx1 = getelementptr inbounds double, ptr %array, i64 1
  %idx2 = getelementptr inbounds double, ptr %array, i64 2
  %idx3 = getelementptr inbounds double, ptr %array, i64 3
  %idx4 = getelementptr inbounds double, ptr %array, i64 4
  %idx5 = getelementptr inbounds double, ptr %array, i64 5
  %idx6 = getelementptr inbounds double, ptr %array, i64 6
  %idx7 = getelementptr inbounds double, ptr %array, i64 7

  %A_0 = load double, ptr %array, align 8
  %A_1 = load double, ptr %idx1, align 8
  %B_0 = load double, ptr %idx2, align 8
  %B_1 = load double, ptr %idx3, align 8

  %addAB_0_L = fadd fast double %A_0, %B_0
  %subAB_0_R = fsub fast double %A_0, %B_0

  %subAB_1_L = fsub fast double %A_1, %B_1
  %addAB_1_R = fadd fast double %A_1, %B_1

  %addABCD_0 = fadd fast double %addAB_0_L, %subAB_0_R
  %addCDAB_1 = fadd fast double %subAB_1_L, %addAB_1_R

  store double %addABCD_0, ptr %array, align 8
  store double %addCDAB_1, ptr %idx1, align 8
  ret void
}

; This code should get vectorized all the way to the loads with shuffles for
; the alt opcodes.
;
; A[0] B[0] C[0] D[0] C[1] D[1] A[1] B[1]
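;
; Roughly the scalar computation (an illustrative C-like sketch; A, B, C
; and D stand for the loads from array[0..1], array[2..3], array[4..5] and
; array[6..7]):
;
;   array[0] = (array[0] + array[2]) + (array[4] - array[6]);
;   array[1] = (array[5] + array[7]) + (array[1] - array[3]);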
define void @lookahead_alt2(ptr %array) {
; CHECK-LABEL: @lookahead_alt2(
; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds double, ptr [[ARRAY:%.*]], i64 2
; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 4
; CHECK-NEXT:    [[IDX6:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 6
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY]], align 8
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[IDX2]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[IDX4]], align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[IDX6]], align 8
; CHECK-NEXT:    [[TMP4:%.*]] = fsub fast <2 x double> [[TMP2]], [[TMP3]]
; CHECK-NEXT:    [[TMP5:%.*]] = fadd fast <2 x double> [[TMP2]], [[TMP3]]
; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP5]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <2 x double> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP8]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP9]]
; CHECK-NEXT:    store <2 x double> [[TMP10]], ptr [[ARRAY]], align 8
; CHECK-NEXT:    ret void
;
  %idx1 = getelementptr inbounds double, ptr %array, i64 1
  %idx2 = getelementptr inbounds double, ptr %array, i64 2
  %idx3 = getelementptr inbounds double, ptr %array, i64 3
  %idx4 = getelementptr inbounds double, ptr %array, i64 4
  %idx5 = getelementptr inbounds double, ptr %array, i64 5
  %idx6 = getelementptr inbounds double, ptr %array, i64 6
  %idx7 = getelementptr inbounds double, ptr %array, i64 7

  %A_0 = load double, ptr %array, align 8
  %A_1 = load double, ptr %idx1, align 8
  %B_0 = load double, ptr %idx2, align 8
  %B_1 = load double, ptr %idx3, align 8
  %C_0 = load double, ptr %idx4, align 8
  %C_1 = load double, ptr %idx5, align 8
  %D_0 = load double, ptr %idx6, align 8
  %D_1 = load double, ptr %idx7, align 8

  %addAB_0 = fadd fast double %A_0, %B_0
  %subCD_0 = fsub fast double %C_0, %D_0

  %addCD_1 = fadd fast double %C_1, %D_1
  %subAB_1 = fsub fast double %A_1, %B_1

  %addABCD_0 = fadd fast double %addAB_0, %subCD_0
  %addCDAB_1 = fadd fast double %addCD_1, %subAB_1

  store double %addABCD_0, ptr %array, align 8
  store double %addCDAB_1, ptr %idx1, align 8
  ret void
}

; A[0] B[0] C[0] D[0] A[1] B[2] A[2] B[1]
;
; SLP should reorder the operands of the RHS add, taking into consideration
; the cost of external uses. It is more profitable to reorder the operands
; of the RHS add, because A[1] has an external use.
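;
; Roughly the scalar computation (an illustrative C-like sketch over the
; pointer arguments):
;
;   S[0] = (A[0] - B[0]) + (C[0] - D[0]);
;   S[1] = (A[1] - B[2]) + (A[2] - B[1]);
;   *Ext1 = A[1];   /* the external use */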
define void @lookahead_external_uses(ptr %A, ptr %B, ptr %C, ptr %D, ptr %S, ptr %Ext1, ptr %Ext2) {
; CHECK-LABEL: @lookahead_external_uses(
; CHECK-NEXT:    [[IDXB2:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i64 2
; CHECK-NEXT:    [[IDXA2:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 2
; CHECK-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, ptr [[B]], i64 1
; CHECK-NEXT:    [[B0:%.*]] = load double, ptr [[B]], align 8
; CHECK-NEXT:    [[C0:%.*]] = load double, ptr [[C:%.*]], align 8
; CHECK-NEXT:    [[D0:%.*]] = load double, ptr [[D:%.*]], align 8
; CHECK-NEXT:    [[B2:%.*]] = load double, ptr [[IDXB2]], align 8
; CHECK-NEXT:    [[A2:%.*]] = load double, ptr [[IDXA2]], align 8
; CHECK-NEXT:    [[B1:%.*]] = load double, ptr [[IDXB1]], align 8
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B2]], i32 1
; CHECK-NEXT:    [[TMP3:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A2]], i32 1
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> poison, double [[D0]], i32 0
; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[B1]], i32 1
; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]]
; CHECK-NEXT:    [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP8]]
; CHECK-NEXT:    store <2 x double> [[TMP9]], ptr [[S:%.*]], align 8
; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP0]], i32 1
; CHECK-NEXT:    store double [[TMP10]], ptr [[EXT1:%.*]], align 8
; CHECK-NEXT:    ret void
;
  %IdxA1 = getelementptr inbounds double, ptr %A, i64 1
  %IdxB2 = getelementptr inbounds double, ptr %B, i64 2
  %IdxA2 = getelementptr inbounds double, ptr %A, i64 2
  %IdxB1 = getelementptr inbounds double, ptr %B, i64 1

  %A0 = load double, ptr %A, align 8
  %B0 = load double, ptr %B, align 8
  %C0 = load double, ptr %C, align 8
  %D0 = load double, ptr %D, align 8

  %A1 = load double, ptr %IdxA1, align 8
  %B2 = load double, ptr %IdxB2, align 8
  %A2 = load double, ptr %IdxA2, align 8
  %B1 = load double, ptr %IdxB1, align 8

  %subA0B0 = fsub fast double %A0, %B0
  %subC0D0 = fsub fast double %C0, %D0

  %subA1B2 = fsub fast double %A1, %B2
  %subA2B1 = fsub fast double %A2, %B1

  %add0 = fadd fast double %subA0B0, %subC0D0
  %add1 = fadd fast double %subA1B2, %subA2B1

  %IdxS1 = getelementptr inbounds double, ptr %S, i64 1

  store double %add0, ptr %S, align 8
  store double %add1, ptr %IdxS1, align 8

  store double %A1, ptr %Ext1, align 8
  ret void
}

; A[0] B[0] C[0] D[0] A[1] B[2] A[2] B[1]
; (dependence diagram elided: A[1] has three external users U1, U2 and U3,
;  while B[1] has two external users U4 and U5)
;
; If we limit the users budget for the look-ahead heuristic to 2, then the
; look-ahead heuristic has no way of choosing B[1] (with 2 external users)
; over A[1] (with 3 external users).
; The result is that the operands of the Add are not reordered and the loads
; from A get vectorized instead of the loads from B.
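;
; Roughly the scalar computation (an illustrative C-like sketch):
;
;   S[0] = (A[0] - B[0]) + (C[0] - D[0]);
;   S[1] = (A[1] - B[2]) + (A[2] - B[1]);
;   *Ext1 = *Ext2 = *Ext3 = A[1];   /* 3 external users */
;   *Ext4 = *Ext5 = B[1];           /* 2 external users */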
define void @lookahead_limit_users_budget(ptr %A, ptr %B, ptr %C, ptr %D, ptr %S, ptr %Ext1, ptr %Ext2, ptr %Ext3, ptr %Ext4, ptr %Ext5) {
; CHECK-LABEL: @lookahead_limit_users_budget(
; CHECK-NEXT:    [[IDXB2:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i64 2
; CHECK-NEXT:    [[IDXA2:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 2
; CHECK-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, ptr [[B]], i64 1
; CHECK-NEXT:    [[B0:%.*]] = load double, ptr [[B]], align 8
; CHECK-NEXT:    [[C0:%.*]] = load double, ptr [[C:%.*]], align 8
; CHECK-NEXT:    [[D0:%.*]] = load double, ptr [[D:%.*]], align 8
; CHECK-NEXT:    [[B2:%.*]] = load double, ptr [[IDXB2]], align 8
; CHECK-NEXT:    [[A2:%.*]] = load double, ptr [[IDXA2]], align 8
; CHECK-NEXT:    [[B1:%.*]] = load double, ptr [[IDXB1]], align 8
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B2]], i32 1
; CHECK-NEXT:    [[TMP3:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A2]], i32 1
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> poison, double [[D0]], i32 0
; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[B1]], i32 1
; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]]
; CHECK-NEXT:    [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP8]]
; CHECK-NEXT:    store <2 x double> [[TMP9]], ptr [[S:%.*]], align 8
; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP0]], i32 1
; CHECK-NEXT:    store double [[TMP10]], ptr [[EXT1:%.*]], align 8
; CHECK-NEXT:    store double [[TMP10]], ptr [[EXT2:%.*]], align 8
; CHECK-NEXT:    store double [[TMP10]], ptr [[EXT3:%.*]], align 8
; CHECK-NEXT:    store double [[B1]], ptr [[EXT4:%.*]], align 8
; CHECK-NEXT:    store double [[B1]], ptr [[EXT5:%.*]], align 8
; CHECK-NEXT:    ret void
;
  %IdxA1 = getelementptr inbounds double, ptr %A, i64 1
  %IdxB2 = getelementptr inbounds double, ptr %B, i64 2
  %IdxA2 = getelementptr inbounds double, ptr %A, i64 2
  %IdxB1 = getelementptr inbounds double, ptr %B, i64 1

  %A0 = load double, ptr %A, align 8
  %B0 = load double, ptr %B, align 8
  %C0 = load double, ptr %C, align 8
  %D0 = load double, ptr %D, align 8

  %A1 = load double, ptr %IdxA1, align 8
  %B2 = load double, ptr %IdxB2, align 8
  %A2 = load double, ptr %IdxA2, align 8
  %B1 = load double, ptr %IdxB1, align 8

  %subA0B0 = fsub fast double %A0, %B0
  %subC0D0 = fsub fast double %C0, %D0

  %subA1B2 = fsub fast double %A1, %B2
  %subA2B1 = fsub fast double %A2, %B1

  %add0 = fadd fast double %subA0B0, %subC0D0
  %add1 = fadd fast double %subA1B2, %subA2B1

  %IdxS1 = getelementptr inbounds double, ptr %S, i64 1

  store double %add0, ptr %S, align 8
  store double %add1, ptr %IdxS1, align 8

  ; External uses of A1
  store double %A1, ptr %Ext1, align 8
  store double %A1, ptr %Ext2, align 8
  store double %A1, ptr %Ext3, align 8

  ; External uses of B1
  store double %B1, ptr %Ext4, align 8
  store double %B1, ptr %Ext5, align 8
  ret void
}

; This checks that the look-ahead code does not crash when instructions with
; the same opcode have different numbers of operands (in this case the
; calls).

declare double @_ZN1i2ayEv(ptr)
declare double @_ZN1i2axEv()
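
; Roughly the scalar computation (an illustrative C-like sketch; "ay" and
; "ax" stand for the calls to @_ZN1i2ayEv and @_ZN1i2axEv, which take a
; different number of operands):
;
;   S[0] = A[0] + ay(Arg0);
;   S[1] = A[1] + ax();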
define void @lookahead_crash(ptr %A, ptr %S, ptr %Arg0) {
; CHECK-LABEL: @lookahead_crash(
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8
; CHECK-NEXT:    [[C0:%.*]] = call double @_ZN1i2ayEv(ptr [[ARG0:%.*]])
; CHECK-NEXT:    [[C1:%.*]] = call double @_ZN1i2axEv()
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[C1]], i32 1
; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT:    store <2 x double> [[TMP4]], ptr [[S:%.*]], align 8
; CHECK-NEXT:    ret void
;
  %IdxA1 = getelementptr inbounds double, ptr %A, i64 1

  %A0 = load double, ptr %A, align 8
  %A1 = load double, ptr %IdxA1, align 8

  %C0 = call double @_ZN1i2ayEv(ptr %Arg0)
  %C1 = call double @_ZN1i2axEv()

  %add0 = fadd fast double %A0, %C0
  %add1 = fadd fast double %A1, %C1

  %IdxS1 = getelementptr inbounds double, ptr %S, i64 1
  store double %add0, ptr %S, align 8
  store double %add1, ptr %IdxS1, align 8
  ret void
}

; This checks that we choose to group consecutive extracts from the same vectors.
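;
; Roughly the scalar computation (an illustrative C-like sketch; vec1 and
; vec2 stand for the <2 x double> values loaded from %vecPtr1 and %vecPtr2):
;
;   storeArray[0] = vec1[0] * array[0] + vec2[0] * array[1];
;   storeArray[1] = vec1[1] * array[0] + vec2[1] * array[1];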
define void @ChecksExtractScores(ptr %storeArray, ptr %array, ptr %vecPtr1, ptr %vecPtr2) {
; CHECK-LABEL: @ChecksExtractScores(
; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds double, ptr [[ARRAY:%.*]], i64 1
; CHECK-NEXT:    [[LOADA0:%.*]] = load double, ptr [[ARRAY]], align 4
; CHECK-NEXT:    [[LOADA1:%.*]] = load double, ptr [[IDX1]], align 4
; CHECK-NEXT:    [[LOADVEC:%.*]] = load <2 x double>, ptr [[VECPTR1:%.*]], align 4
; CHECK-NEXT:    [[LOADVEC2:%.*]] = load <2 x double>, ptr [[VECPTR2:%.*]], align 4
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[LOADA0]], i32 0
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[LOADVEC]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[LOADA1]], i32 0
; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP6:%.*]] = fmul <2 x double> [[LOADVEC2]], [[TMP5]]
; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP3]], [[TMP6]]
; CHECK-NEXT:    store <2 x double> [[TMP7]], ptr [[STOREARRAY:%.*]], align 8
; CHECK-NEXT:    ret void
;
  %idx1 = getelementptr inbounds double, ptr %array, i64 1
  %loadA0 = load double, ptr %array, align 4
  %loadA1 = load double, ptr %idx1, align 4

  %loadVec = load <2 x double>, ptr %vecPtr1, align 4
  %extrA0 = extractelement <2 x double> %loadVec, i32 0
  %extrA1 = extractelement <2 x double> %loadVec, i32 1
  %loadVec2 = load <2 x double>, ptr %vecPtr2, align 4
  %extrB0 = extractelement <2 x double> %loadVec2, i32 0
  %extrB1 = extractelement <2 x double> %loadVec2, i32 1

  %mul0 = fmul double %extrA0, %loadA0
  %mul1 = fmul double %extrA1, %loadA0
  %mul3 = fmul double %extrB0, %loadA1
  %mul4 = fmul double %extrB1, %loadA1
  %add0 = fadd double %mul0, %mul3
  %add1 = fadd double %mul1, %mul4

  %sidx1 = getelementptr inbounds double, ptr %storeArray, i64 1
  store double %add0, ptr %storeArray, align 8
  store double %add1, ptr %sidx1, align 8
  ret void
}

define i1 @ExtractIdxNotConstantInt1(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) {
; SSE-LABEL: @ExtractIdxNotConstantInt1(
; SSE-NEXT:    [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 undef
; SSE-NEXT:    [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
; SSE-NEXT:    [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]]
; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0
; SSE-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1
; SSE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[SUB14_I167]], i32 0
; SSE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_I276_I169]], i32 1
; SSE-NEXT:    [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
; SSE-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+01>, float [[B:%.*]], i32 0
; SSE-NEXT:    [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]]
; SSE-NEXT:    [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], <float 1.000000e+01, float 2.000000e+00>
; SSE-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
; SSE-NEXT:    [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
; SSE-NEXT:    [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]]
; SSE-NEXT:    [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
; SSE-NEXT:    ret i1 [[CMP_I185]]
;
; AVX-LABEL: @ExtractIdxNotConstantInt1(
; AVX-NEXT:    [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 undef
; AVX-NEXT:    [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
; AVX-NEXT:    [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]]
; AVX-NEXT:    [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]]
; AVX-NEXT:    [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]]
; AVX-NEXT:    [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01
; AVX-NEXT:    [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]]
; AVX-NEXT:    [[ADD78_I180:%.*]] = fsub float [[MUL72_I179]], 3.000000e+01
; AVX-NEXT:    [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]]
; AVX-NEXT:    [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]]
; AVX-NEXT:    [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
; AVX-NEXT:    ret i1 [[CMP_I185]]
;
  %vecext.i291.i166 = extractelement <4 x float> %vec, i64 undef
  %sub14.i167 = fsub float undef, %vecext.i291.i166
  %fm = fmul float %a, %sub14.i167
  %sub25.i168 = fsub float %fm, %b
  %vecext.i276.i169 = extractelement <4 x float> %vec, i64 %idx2
  %add36.i173 = fadd float %sub25.i168, 10.0
  %mul72.i179 = fmul float %c, %vecext.i276.i169
  %add78.i180 = fsub float %mul72.i179, 30.0
  %add79.i181 = fadd float 2.0, %add78.i180
  %mul123.i184 = fmul float %add36.i173, %add79.i181
  %cmp.i185 = fcmp ogt float %mul123.i184, 0.000000e+00
  ret i1 %cmp.i185
}

define i1 @ExtractIdxNotConstantInt2(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) {
; SSE-LABEL: @ExtractIdxNotConstantInt2(
; SSE-NEXT:    [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 1
; SSE-NEXT:    [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
; SSE-NEXT:    [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]]
; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0
; SSE-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1
; SSE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[SUB14_I167]], i32 0
; SSE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_I276_I169]], i32 1
; SSE-NEXT:    [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
; SSE-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+01>, float [[B:%.*]], i32 0
; SSE-NEXT:    [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]]
; SSE-NEXT:    [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], <float 1.000000e+01, float 2.000000e+00>
; SSE-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
; SSE-NEXT:    [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
; SSE-NEXT:    [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]]
; SSE-NEXT:    [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
; SSE-NEXT:    ret i1 [[CMP_I185]]
;
; AVX-LABEL: @ExtractIdxNotConstantInt2(
; AVX-NEXT:    [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 1
; AVX-NEXT:    [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
; AVX-NEXT:    [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]]
; AVX-NEXT:    [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]]
; AVX-NEXT:    [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]]
; AVX-NEXT:    [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01
; AVX-NEXT:    [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]]
; AVX-NEXT:    [[ADD78_I180:%.*]] = fsub float [[MUL72_I179]], 3.000000e+01
; AVX-NEXT:    [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]]
; AVX-NEXT:    [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]]
; AVX-NEXT:    [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
; AVX-NEXT:    ret i1 [[CMP_I185]]
;
  %vecext.i291.i166 = extractelement <4 x float> %vec, i64 1
  %sub14.i167 = fsub float undef, %vecext.i291.i166
  %fm = fmul float %a, %sub14.i167
  %sub25.i168 = fsub float %fm, %b
  %vecext.i276.i169 = extractelement <4 x float> %vec, i64 %idx2
  %add36.i173 = fadd float %sub25.i168, 10.0
  %mul72.i179 = fmul float %c, %vecext.i276.i169
  %add78.i180 = fsub float %mul72.i179, 30.0
  %add79.i181 = fadd float 2.0, %add78.i180
  %mul123.i184 = fmul float %add36.i173, %add79.i181
  %cmp.i185 = fcmp ogt float %mul123.i184, 0.000000e+00
  ret i1 %cmp.i185
}

define i1 @foo(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) {
; CHECK-LABEL: @foo(
; CHECK-NEXT:    [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0
; CHECK-NEXT:    [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1
; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[VEC]], <4 x float> poison, <2 x i32> <i32 poison, i32 1>
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[SUB14_I167]], i32 0
; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+01>, float [[B:%.*]], i32 0
; CHECK-NEXT:    [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]]
; CHECK-NEXT:    [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], <float 1.000000e+01, float 2.000000e+00>
; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
; CHECK-NEXT:    [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]]
; CHECK-NEXT:    [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
; CHECK-NEXT:    ret i1 [[CMP_I185]]
;
  %vecext.i291.i166 = extractelement <4 x float> %vec, i64 0
  %sub14.i167 = fsub float undef, %vecext.i291.i166
  %fm = fmul float %a, %sub14.i167
  %sub25.i168 = fsub float %fm, %b
  %vecext.i276.i169 = extractelement <4 x float> %vec, i64 1
  %add36.i173 = fadd float %sub25.i168, 10.0
  %mul72.i179 = fmul float %c, %vecext.i276.i169
  %add78.i180 = fsub float %mul72.i179, 30.0
  %add79.i181 = fadd float 2.0, %add78.i180
  %mul123.i184 = fmul float %add36.i173, %add79.i181
  %cmp.i185 = fcmp ogt float %mul123.i184, 0.000000e+00
  ret i1 %cmp.i185
}

; Same as @ChecksExtractScores, but the extractelement vector operands do not match.
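;
; Roughly the scalar computation (an illustrative C-like sketch; vec1..vec4
; stand for the <2 x double> values loaded from %vecPtr1..%vecPtr4, so each
; extract now comes from a different vector):
;
;   storeArray[0] = vec1[0] * array[0] + vec3[0] * array[1];
;   storeArray[1] = vec2[1] * array[0] + vec4[1] * array[1];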
define void @ChecksExtractScores_different_vectors(ptr %storeArray, ptr %array, ptr %vecPtr1, ptr %vecPtr2, ptr %vecPtr3, ptr %vecPtr4) {
; SSE-LABEL: @ChecksExtractScores_different_vectors(
; SSE-NEXT:    [[LOADVEC:%.*]] = load <2 x double>, ptr [[VECPTR1:%.*]], align 4
; SSE-NEXT:    [[LOADVEC2:%.*]] = load <2 x double>, ptr [[VECPTR2:%.*]], align 4
; SSE-NEXT:    [[LOADVEC3:%.*]] = load <2 x double>, ptr [[VECPTR3:%.*]], align 4
; SSE-NEXT:    [[LOADVEC4:%.*]] = load <2 x double>, ptr [[VECPTR4:%.*]], align 4
; SSE-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAY:%.*]], align 4
; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[LOADVEC2]], <2 x double> [[LOADVEC3]], <2 x i32> <i32 1, i32 2>
; SSE-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], [[TMP1]]
; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[LOADVEC]], <2 x double> [[LOADVEC4]], <2 x i32> <i32 0, i32 3>
; SSE-NEXT:    [[TMP6:%.*]] = fmul <2 x double> [[TMP5]], [[TMP1]]
; SSE-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP4]], [[TMP6]]
; SSE-NEXT:    store <2 x double> [[TMP7]], ptr [[STOREARRAY:%.*]], align 8
; SSE-NEXT:    ret void
;
; AVX-LABEL: @ChecksExtractScores_different_vectors(
; AVX-NEXT:    [[IDX1:%.*]] = getelementptr inbounds double, ptr [[ARRAY:%.*]], i64 1
; AVX-NEXT:    [[LOADA0:%.*]] = load double, ptr [[ARRAY]], align 4
; AVX-NEXT:    [[LOADA1:%.*]] = load double, ptr [[IDX1]], align 4
; AVX-NEXT:    [[LOADVEC:%.*]] = load <2 x double>, ptr [[VECPTR1:%.*]], align 4
; AVX-NEXT:    [[LOADVEC2:%.*]] = load <2 x double>, ptr [[VECPTR2:%.*]], align 4
; AVX-NEXT:    [[LOADVEC3:%.*]] = load <2 x double>, ptr [[VECPTR3:%.*]], align 4
; AVX-NEXT:    [[LOADVEC4:%.*]] = load <2 x double>, ptr [[VECPTR4:%.*]], align 4
; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[LOADVEC]], <2 x double> [[LOADVEC2]], <2 x i32> <i32 0, i32 3>
; AVX-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[LOADA0]], i32 0
; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
; AVX-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]]
; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[LOADVEC3]], <2 x double> [[LOADVEC4]], <2 x i32> <i32 0, i32 3>
; AVX-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> poison, double [[LOADA1]], i32 0
; AVX-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <2 x i32> zeroinitializer
; AVX-NEXT:    [[TMP8:%.*]] = fmul <2 x double> [[TMP5]], [[TMP7]]
; AVX-NEXT:    [[TMP9:%.*]] = fadd <2 x double> [[TMP4]], [[TMP8]]
; AVX-NEXT:    store <2 x double> [[TMP9]], ptr [[STOREARRAY:%.*]], align 8
; AVX-NEXT:    ret void
;
  %idx1 = getelementptr inbounds double, ptr %array, i64 1
  %loadA0 = load double, ptr %array, align 4
  %loadA1 = load double, ptr %idx1, align 4

  %loadVec = load <2 x double>, ptr %vecPtr1, align 4
  %loadVec2 = load <2 x double>, ptr %vecPtr2, align 4
  %extrA0 = extractelement <2 x double> %loadVec, i32 0
  %extrA1 = extractelement <2 x double> %loadVec2, i32 1
  %loadVec3 = load <2 x double>, ptr %vecPtr3, align 4
  %loadVec4 = load <2 x double>, ptr %vecPtr4, align 4
  %extrB0 = extractelement <2 x double> %loadVec3, i32 0
  %extrB1 = extractelement <2 x double> %loadVec4, i32 1

  %mul0 = fmul double %extrA0, %loadA0
  %mul1 = fmul double %extrA1, %loadA0
  %mul3 = fmul double %extrB0, %loadA1
  %mul4 = fmul double %extrB1, %loadA1
  %add0 = fadd double %mul0, %mul3
  %add1 = fadd double %mul1, %mul4

  %sidx1 = getelementptr inbounds double, ptr %storeArray, i64 1
  store double %add0, ptr %storeArray, align 8
  store double %add1, ptr %sidx1, align 8
  ret void
}

; This checks that we prefer splats rather than reverse load vectors + shuffles.
; 2-wide splat loads on x86 use a single instruction, so they are quite cheap.
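;
; Roughly the scalar computation (an illustrative C-like sketch; each
; element of array2 multiplies both elements of array1, which is what makes
; splatting the array2 elements attractive):
;
;   res = (array1[0] * array2[0] + array1[0] * array2[1])
;       + (array1[1] * array2[0] + array1[1] * array2[1]);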
define double @splat_loads(ptr %array1, ptr %array2, ptr %ptrA, ptr %ptrB) {
; SSE-LABEL: @splat_loads(
; SSE-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY1:%.*]], align 8
; SSE-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAY2:%.*]], align 8
; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
; SSE-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP0]], [[TMP2]]
; SSE-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP0]], [[TMP1]]
; SSE-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP3]], [[TMP4]]
; SSE-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
; SSE-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
; SSE-NEXT:    [[ADD3:%.*]] = fadd double [[TMP6]], [[TMP7]]
; SSE-NEXT:    ret double [[ADD3]]
;
; AVX-LABEL: @splat_loads(
; AVX-NEXT:    [[GEP_2_1:%.*]] = getelementptr inbounds double, ptr [[ARRAY2:%.*]], i64 1
; AVX-NEXT:    [[LD_2_0:%.*]] = load double, ptr [[ARRAY2]], align 8
; AVX-NEXT:    [[LD_2_1:%.*]] = load double, ptr [[GEP_2_1]], align 8
; AVX-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY1:%.*]], align 8
; AVX-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[LD_2_0]], i32 0
; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer
; AVX-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP0]], [[TMP2]]
; AVX-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[LD_2_1]], i32 0
; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
; AVX-NEXT:    [[TMP6:%.*]] = fmul <2 x double> [[TMP0]], [[TMP5]]
; AVX-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP3]], [[TMP6]]
; AVX-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
; AVX-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
; AVX-NEXT:    [[ADD3:%.*]] = fadd double [[TMP8]], [[TMP9]]
; AVX-NEXT:    ret double [[ADD3]]
;
  %gep_1_1 = getelementptr inbounds double, ptr %array1, i64 1
  %ld_1_0 = load double, ptr %array1, align 8
  %ld_1_1 = load double, ptr %gep_1_1, align 8

  %gep_2_1 = getelementptr inbounds double, ptr %array2, i64 1
  %ld_2_0 = load double, ptr %array2, align 8
  %ld_2_1 = load double, ptr %gep_2_1, align 8

  %mul1 = fmul double %ld_1_0, %ld_2_0
  %mul2 = fmul double %ld_1_1, %ld_2_0

  %mul3 = fmul double %ld_1_0, %ld_2_1
  %mul4 = fmul double %ld_1_1, %ld_2_1

  %add1 = fadd double %mul1, %mul3
  %add2 = fadd double %mul2, %mul4

  %add3 = fadd double %add1, %add2
  ret double %add3
}

; Same as splat_loads() but the splat load has internal uses in the SLP graph.
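;
; Roughly the scalar computation (an illustrative C-like sketch; the extra
; subtractions of array2[0] give the broadcast of %ld_2_0 an internal user
; in the SLP graph):
;
;   res = (array1[0] * array2[0] + array1[0] * array2[1] - array2[0])
;       + (array1[1] * array2[0] + array1[1] * array2[1] - array2[0]);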
define double @splat_loads_with_internal_uses(ptr %array1, ptr %array2, ptr %ptrA, ptr %ptrB) {
; SSE-LABEL: @splat_loads_with_internal_uses(
; SSE-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY1:%.*]], align 8
; SSE-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAY2:%.*]], align 8
; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
; SSE-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP0]], [[TMP2]]
; SSE-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP0]], [[TMP1]]
; SSE-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP3]], [[TMP4]]
; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer
; SSE-NEXT:    [[TMP7:%.*]] = fsub <2 x double> [[TMP5]], [[TMP6]]
; SSE-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
; SSE-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
; SSE-NEXT:    [[RES:%.*]] = fadd double [[TMP8]], [[TMP9]]
; SSE-NEXT:    ret double [[RES]]
;
; AVX-LABEL: @splat_loads_with_internal_uses(
; AVX-NEXT:    [[GEP_2_1:%.*]] = getelementptr inbounds double, ptr [[ARRAY2:%.*]], i64 1
; AVX-NEXT:    [[LD_2_0:%.*]] = load double, ptr [[ARRAY2]], align 8
; AVX-NEXT:    [[LD_2_1:%.*]] = load double, ptr [[GEP_2_1]], align 8
; AVX-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY1:%.*]], align 8
; AVX-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[LD_2_0]], i32 0
; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer
; AVX-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP0]], [[TMP2]]
; AVX-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[LD_2_1]], i32 0
; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
; AVX-NEXT:    [[TMP6:%.*]] = fmul <2 x double> [[TMP0]], [[TMP5]]
; AVX-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP3]], [[TMP6]]
; AVX-NEXT:    [[TMP8:%.*]] = fsub <2 x double> [[TMP7]], [[TMP2]]
; AVX-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
; AVX-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP8]], i32 1
; AVX-NEXT:    [[RES:%.*]] = fadd double [[TMP9]], [[TMP10]]
; AVX-NEXT:    ret double [[RES]]
;
  %gep_1_1 = getelementptr inbounds double, ptr %array1, i64 1
  %ld_1_0 = load double, ptr %array1, align 8
  %ld_1_1 = load double, ptr %gep_1_1, align 8

  %gep_2_1 = getelementptr inbounds double, ptr %array2, i64 1
  %ld_2_0 = load double, ptr %array2, align 8
  %ld_2_1 = load double, ptr %gep_2_1, align 8

  %mul1 = fmul double %ld_1_0, %ld_2_0
  %mul2 = fmul double %ld_1_1, %ld_2_0

  %mul3 = fmul double %ld_1_0, %ld_2_1
  %mul4 = fmul double %ld_1_1, %ld_2_1

  %add1 = fadd double %mul1, %mul3
  %add2 = fadd double %mul2, %mul4

  ; One more user for the broadcast of %ld_2_0
  %sub1 = fsub double %add1, %ld_2_0
  %sub2 = fsub double %add2, %ld_2_0

  %res = fadd double %sub1, %sub2
  ret double %res
}