llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
   2 ; RUN: opt < %s -passes=slp-vectorizer,instcombine,dce -slp-threshold=-100 -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
   3 ; RUN: opt < %s -passes=slp-vectorizer,instcombine,dce -slp-threshold=-100 -S -mtriple=i386-apple-macosx10.8.0 -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
   4
   5 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128" ; little-endian, 32-bit pointers; f64:32:64 gives double a 4-byte ABI alignment, which is presumably why the unannotated double accesses below show up as `align 4` in the expected output
   6
   7 ; Make sure we order the operands of commutative operations so that we get
   8 ; bigger vectorizable trees.
   9
  10 define void @shuffle_operands1(ptr noalias %from, ptr noalias %to, double %v1, double %v2) { ; Two adjacent double adds whose loaded operand sits on opposite sides in lane 0 and lane 1; the expected output is one <2 x double> fadd.
  11 ; CHECK-LABEL: @shuffle_operands1(
  12 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
  13 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V1:%.*]], i64 0
  14 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2:%.*]], i64 1
  15 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
  16 ; CHECK-NEXT:    store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
  17 ; CHECK-NEXT:    ret void
  18 ;
  19 ; SSE2-LABEL: @shuffle_operands1(
  20 ; SSE2-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
  21 ; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V1:%.*]], i64 0
  22 ; SSE2-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2:%.*]], i64 1
  23 ; SSE2-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
  24 ; SSE2-NEXT:    store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
  25 ; SSE2-NEXT:    ret void
  26 ;
  27   %from_1 = getelementptr double, ptr %from, i64 1 ; &from[1]
  28   %v0_1 = load double , ptr %from ; from[0]
  29   %v0_2 = load double , ptr %from_1 ; from[1]
  30   %v1_1 = fadd double %v0_1, %v1 ; lane 0: from[0] + v1 (load is the left operand)
  31   %v1_2 = fadd double %v2, %v0_2 ; lane 1: v2 + from[1] (load is the right operand)
  32   %to_2 = getelementptr double, ptr %to, i64 1 ; &to[1]
  33   store double %v1_1, ptr %to ; to[0] = lane 0
  34   store double %v1_2, ptr %to_2 ; to[1] = lane 1
  35   ret void
  36 }
  37
  38 define void @vecload_vs_broadcast(ptr noalias %from, ptr noalias %to, double %v1, double %v2, i1 %c) { ; from[0] is the left fadd operand in both lanes; the right operands are %p (lane 0) and from[1] (lane 1). %v1 and %v2 are unused.
  39 ; CHECK-LABEL: @vecload_vs_broadcast(
  40 ; CHECK-NEXT:  entry:
  41 ; CHECK-NEXT:    br label [[LP:%.*]]
  42 ; CHECK:       lp:
  43 ; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
  44 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
  45 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
  46 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
  47 ; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP0]], [[TMP2]]
  48 ; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
  49 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
  50 ; CHECK:       ext:
  51 ; CHECK-NEXT:    ret void
  52 ;
  53 ; SSE2-LABEL: @vecload_vs_broadcast(
  54 ; SSE2-NEXT:  entry:
  55 ; SSE2-NEXT:    br label [[LP:%.*]]
  56 ; SSE2:       lp:
  57 ; SSE2-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
  58 ; SSE2-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
  59 ; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
  60 ; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
  61 ; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP0]], [[TMP2]]
  62 ; SSE2-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
  63 ; SSE2-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
  64 ; SSE2:       ext:
  65 ; SSE2-NEXT:    ret void
  66 ;
  67 entry:
  68 br label %lp
  69
  70 lp:
  71   %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ] ; 0.0 when entered from %entry, 1.0 on the back edge
  72   %from_1 = getelementptr double, ptr %from, i64 1 ; &from[1]
  73   %v0_1 = load double , ptr %from ; from[0]
  74   %v0_2 = load double , ptr %from_1 ; from[1]
  75   %v1_1 = fadd double %v0_1, %p ; lane 0: from[0] + p
  76   %v1_2 = fadd double %v0_1, %v0_2 ; lane 1: from[0] + from[1]
  77   %to_2 = getelementptr double, ptr %to, i64 1 ; &to[1]
  78   store double %v1_1, ptr %to ; to[0] = lane 0
  79   store double %v1_2, ptr %to_2 ; to[1] = lane 1
  80   br i1 %c, label %lp, label %ext ; repeat while %c is true
  81
  82 ext:
  83   ret void
  84 }
  85
  86 define void @vecload_vs_broadcast2(ptr noalias %from, ptr noalias %to, double %v1, double %v2, i1 %c) { ; from[0] is the right fadd operand in both lanes; the left operands are %p (lane 0) and from[1] (lane 1). %v1 and %v2 are unused.
  87 ; CHECK-LABEL: @vecload_vs_broadcast2(
  88 ; CHECK-NEXT:  entry:
  89 ; CHECK-NEXT:    br label [[LP:%.*]]
  90 ; CHECK:       lp:
  91 ; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
  92 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
  93 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
  94 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
  95 ; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]]
  96 ; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
  97 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
  98 ; CHECK:       ext:
  99 ; CHECK-NEXT:    ret void
 100 ;
 101 ; SSE2-LABEL: @vecload_vs_broadcast2(
 102 ; SSE2-NEXT:  entry:
 103 ; SSE2-NEXT:    br label [[LP:%.*]]
 104 ; SSE2:       lp:
 105 ; SSE2-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
 106 ; SSE2-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
 107 ; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
 108 ; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
 109 ; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]]
 110 ; SSE2-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 111 ; SSE2-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
 112 ; SSE2:       ext:
 113 ; SSE2-NEXT:    ret void
 114 ;
 115 entry:
 116 br label %lp
 117
 118 lp:
 119   %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ] ; 0.0 when entered from %entry, 1.0 on the back edge
 120   %from_1 = getelementptr double, ptr %from, i64 1 ; &from[1]
 121   %v0_1 = load double , ptr %from ; from[0]
 122   %v0_2 = load double , ptr %from_1 ; from[1]
 123   %v1_1 = fadd double %p, %v0_1 ; lane 0: p + from[0]
 124   %v1_2 = fadd double %v0_2, %v0_1 ; lane 1: from[1] + from[0]
 125   %to_2 = getelementptr double, ptr %to, i64 1 ; &to[1]
 126   store double %v1_1, ptr %to ; to[0] = lane 0
 127   store double %v1_2, ptr %to_2 ; to[1] = lane 1
 128   br i1 %c, label %lp, label %ext ; repeat while %c is true
 129
 130 ext:
 131   ret void
 132 }
 133
 134 define void @vecload_vs_broadcast3(ptr noalias %from, ptr noalias %to, double %v1, double %v2, i1 %c) { ; from[0] is the right fadd operand in lane 0 but the left operand in lane 1; the expected output matches @vecload_vs_broadcast2. %v1 and %v2 are unused.
 135 ; CHECK-LABEL: @vecload_vs_broadcast3(
 136 ; CHECK-NEXT:  entry:
 137 ; CHECK-NEXT:    br label [[LP:%.*]]
 138 ; CHECK:       lp:
 139 ; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
 140 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
 141 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
 142 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
 143 ; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]]
 144 ; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 145 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
 146 ; CHECK:       ext:
 147 ; CHECK-NEXT:    ret void
 148 ;
 149 ; SSE2-LABEL: @vecload_vs_broadcast3(
 150 ; SSE2-NEXT:  entry:
 151 ; SSE2-NEXT:    br label [[LP:%.*]]
 152 ; SSE2:       lp:
 153 ; SSE2-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
 154 ; SSE2-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
 155 ; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
 156 ; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
 157 ; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]]
 158 ; SSE2-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 159 ; SSE2-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
 160 ; SSE2:       ext:
 161 ; SSE2-NEXT:    ret void
 162 ;
 163 entry:
 164 br label %lp
 165
 166 lp:
 167   %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ] ; 0.0 when entered from %entry, 1.0 on the back edge
 168   %from_1 = getelementptr double, ptr %from, i64 1 ; &from[1]
 169   %v0_1 = load double , ptr %from ; from[0]
 170   %v0_2 = load double , ptr %from_1 ; from[1]
 171   %v1_1 = fadd double %p, %v0_1 ; lane 0: p + from[0]
 172   %v1_2 = fadd double %v0_1, %v0_2 ; lane 1: from[0] + from[1]
 173   %to_2 = getelementptr double, ptr %to, i64 1 ; &to[1]
 174   store double %v1_1, ptr %to ; to[0] = lane 0
 175   store double %v1_2, ptr %to_2 ; to[1] = lane 1
 176   br i1 %c, label %lp, label %ext ; repeat while %c is true
 177
 178 ext:
 179   ret void
 180 }
 181
 182 define void @shuffle_nodes_match1(ptr noalias %from, ptr noalias %to, double %v1, double %v2, i1 %c) { ; from[0] is the right fadd operand in both lanes; the left operands are from[1] (lane 0) and %p (lane 1). The two RUN configurations expect different output for this function. %v1 and %v2 are unused.
 183 ; CHECK-LABEL: @shuffle_nodes_match1(
 184 ; CHECK-NEXT:  entry:
 185 ; CHECK-NEXT:    br label [[LP:%.*]]
 186 ; CHECK:       lp:
 187 ; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
 188 ; CHECK-NEXT:    [[FROM_1:%.*]] = getelementptr i8, ptr [[FROM:%.*]], i32 8
 189 ; CHECK-NEXT:    [[V0_1:%.*]] = load double, ptr [[FROM]], align 4
 190 ; CHECK-NEXT:    [[V0_2:%.*]] = load double, ptr [[FROM_1]], align 4
 191 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_2]], i64 0
 192 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
 193 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i64 0
 194 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
 195 ; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]]
 196 ; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 197 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
 198 ; CHECK:       ext:
 199 ; CHECK-NEXT:    ret void
 200 ;
 201 ; SSE2-LABEL: @shuffle_nodes_match1(
 202 ; SSE2-NEXT:  entry:
 203 ; SSE2-NEXT:    br label [[LP:%.*]]
 204 ; SSE2:       lp:
 205 ; SSE2-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
 206 ; SSE2-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
 207 ; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
 208 ; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
 209 ; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]]
 210 ; SSE2-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 211 ; SSE2-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
 212 ; SSE2:       ext:
 213 ; SSE2-NEXT:    ret void
 214 ;
 215 entry:
 216 br label %lp
 217
 218 lp:
 219   %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ] ; 0.0 when entered from %entry, 1.0 on the back edge
 220   %from_1 = getelementptr double, ptr %from, i64 1 ; &from[1]
 221   %v0_1 = load double , ptr %from ; from[0]
 222   %v0_2 = load double , ptr %from_1 ; from[1]
 223   %v1_1 = fadd double %v0_2, %v0_1 ; lane 0: from[1] + from[0]
 224   %v1_2 = fadd double %p, %v0_1 ; lane 1: p + from[0]
 225   %to_2 = getelementptr double, ptr %to, i64 1 ; &to[1]
 226   store double %v1_1, ptr %to ; to[0] = lane 0
 227   store double %v1_2, ptr %to_2 ; to[1] = lane 1
 228   br i1 %c, label %lp, label %ext ; repeat while %c is true
 229
 230 ext:
 231   ret void
 232 }
 233
 234 define void @vecload_vs_broadcast4(ptr noalias %from, ptr noalias %to, double %v1, double %v2, i1 %c) { ; from[0] is the left fadd operand in lane 0 and the right operand in lane 1; the other operands are from[1] (lane 0) and %p (lane 1). %v1 and %v2 are unused.
 235 ; CHECK-LABEL: @vecload_vs_broadcast4(
 236 ; CHECK-NEXT:  entry:
 237 ; CHECK-NEXT:    br label [[LP:%.*]]
 238 ; CHECK:       lp:
 239 ; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
 240 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
 241 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
 242 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
 243 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
 244 ; CHECK-NEXT:    store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
 245 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
 246 ; CHECK:       ext:
 247 ; CHECK-NEXT:    ret void
 248 ;
 249 ; SSE2-LABEL: @vecload_vs_broadcast4(
 250 ; SSE2-NEXT:  entry:
 251 ; SSE2-NEXT:    br label [[LP:%.*]]
 252 ; SSE2:       lp:
 253 ; SSE2-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
 254 ; SSE2-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
 255 ; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
 256 ; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
 257 ; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]]
 258 ; SSE2-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 259 ; SSE2-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
 260 ; SSE2:       ext:
 261 ; SSE2-NEXT:    ret void
 262 ;
 263 entry:
 264 br label %lp
 265
 266 lp:
 267   %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ] ; 0.0 when entered from %entry, 1.0 on the back edge
 268   %from_1 = getelementptr double, ptr %from, i64 1 ; &from[1]
 269   %v0_1 = load double , ptr %from ; from[0]
 270   %v0_2 = load double , ptr %from_1 ; from[1]
 271   %v1_1 = fadd double %v0_1, %v0_2 ; lane 0: from[0] + from[1]
 272   %v1_2 = fadd double %p, %v0_1 ; lane 1: p + from[0]
 273   %to_2 = getelementptr double, ptr %to, i64 1 ; &to[1]
 274   store double %v1_1, ptr %to ; to[0] = lane 0
 275   store double %v1_2, ptr %to_2 ; to[1] = lane 1
 276   br i1 %c, label %lp, label %ext ; repeat while %c is true
 277
 278 ext:
 279   ret void
 280 }
 281
 282
 283 define void @shuffle_nodes_match2(ptr noalias %from, ptr noalias %to, double %v1, double %v2, i1 %c) { ; from[0] is the left fadd operand in both lanes; the right operands are from[1] (lane 0) and %p (lane 1). The two RUN configurations expect different output for this function. %v1 and %v2 are unused.
 284 ; CHECK-LABEL: @shuffle_nodes_match2(
 285 ; CHECK-NEXT:  entry:
 286 ; CHECK-NEXT:    br label [[LP:%.*]]
 287 ; CHECK:       lp:
 288 ; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
 289 ; CHECK-NEXT:    [[FROM_1:%.*]] = getelementptr i8, ptr [[FROM:%.*]], i32 8
 290 ; CHECK-NEXT:    [[V0_1:%.*]] = load double, ptr [[FROM]], align 4
 291 ; CHECK-NEXT:    [[V0_2:%.*]] = load double, ptr [[FROM_1]], align 4
 292 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i64 0
 293 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer
 294 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V0_2]], i64 0
 295 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[P]], i64 1
 296 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
 297 ; CHECK-NEXT:    store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
 298 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
 299 ; CHECK:       ext:
 300 ; CHECK-NEXT:    ret void
 301 ;
 302 ; SSE2-LABEL: @shuffle_nodes_match2(
 303 ; SSE2-NEXT:  entry:
 304 ; SSE2-NEXT:    br label [[LP:%.*]]
 305 ; SSE2:       lp:
 306 ; SSE2-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
 307 ; SSE2-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
 308 ; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
 309 ; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
 310 ; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
 311 ; SSE2-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 312 ; SSE2-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
 313 ; SSE2:       ext:
 314 ; SSE2-NEXT:    ret void
 315 ;
 316 entry:
 317 br label %lp
 318
 319 lp:
 320   %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ] ; 0.0 when entered from %entry, 1.0 on the back edge
 321   %from_1 = getelementptr double, ptr %from, i64 1 ; &from[1]
 322   %v0_1 = load double , ptr %from ; from[0]
 323   %v0_2 = load double , ptr %from_1 ; from[1]
 324   %v1_1 = fadd double %v0_1, %v0_2 ; lane 0: from[0] + from[1]
 325   %v1_2 = fadd double %v0_1, %p ; lane 1: from[0] + p
 326   %to_2 = getelementptr double, ptr %to, i64 1 ; &to[1]
 327   store double %v1_1, ptr %to ; to[0] = lane 0
 328   store double %v1_2, ptr %to_2 ; to[1] = lane 1
 329   br i1 %c, label %lp, label %ext ; repeat while %c is true
 330
 331 ext:
 332   ret void
 333 }
 334
 335 ; Make sure we don't scramble operands when we reorder them and destroy
 336 ; 'good' source order.
 337
 338 @a = common global [32000 x float] zeroinitializer, align 16 ; array read and updated in place by @good_load_order
 339
 340 define void @good_load_order() { ; Loop over @a in steps of 5: each iteration stores a[i+k] = a[i+k+1] * (previously loaded element) for k = 0..4, with every fmul written as (newer load) * (older load).
 341 ; CHECK-LABEL: @good_load_order(
 342 ; CHECK-NEXT:  entry:
 343 ; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER:%.*]]
 344 ; CHECK:       for.cond1.preheader:
 345 ; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr @a, align 16
 346 ; CHECK-NEXT:    br label [[FOR_BODY3:%.*]]
 347 ; CHECK:       for.body3:
 348 ; CHECK-NEXT:    [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP12:%.*]], [[FOR_BODY3]] ]
 349 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ]
 350 ; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32
 351 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], 1
 352 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP3]]
 353 ; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[INDVARS_IV]] to i32
 354 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP4]]
 355 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[INDVARS_IV]] to i32
 356 ; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], 4
 357 ; CHECK-NEXT:    [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP6]]
 358 ; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
 359 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 2>
 360 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP1]], i64 0
 361 ; CHECK-NEXT:    [[TMP10:%.*]] = fmul <4 x float> [[TMP7]], [[TMP9]]
 362 ; CHECK-NEXT:    store <4 x float> [[TMP10]], ptr [[ARRAYIDX5]], align 4
 363 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5
 364 ; CHECK-NEXT:    [[TMP11:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 365 ; CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP11]]
 366 ; CHECK-NEXT:    [[TMP12]] = load float, ptr [[ARRAYIDX41]], align 4
 367 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x float> [[TMP7]], i64 3
 368 ; CHECK-NEXT:    [[MUL45:%.*]] = fmul float [[TMP12]], [[TMP13]]
 369 ; CHECK-NEXT:    store float [[MUL45]], ptr [[ARRAYIDX31]], align 4
 370 ; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 371 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[TMP14]], 31995
 372 ; CHECK-NEXT:    br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_END:%.*]]
 373 ; CHECK:       for.end:
 374 ; CHECK-NEXT:    ret void
 375 ;
 376 ; SSE2-LABEL: @good_load_order(
 377 ; SSE2-NEXT:  entry:
 378 ; SSE2-NEXT:    br label [[FOR_COND1_PREHEADER:%.*]]
 379 ; SSE2:       for.cond1.preheader:
 380 ; SSE2-NEXT:    [[TMP0:%.*]] = load float, ptr @a, align 16
 381 ; SSE2-NEXT:    br label [[FOR_BODY3:%.*]]
 382 ; SSE2:       for.body3:
 383 ; SSE2-NEXT:    [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP12:%.*]], [[FOR_BODY3]] ]
 384 ; SSE2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ]
 385 ; SSE2-NEXT:    [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32
 386 ; SSE2-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], 1
 387 ; SSE2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP3]]
 388 ; SSE2-NEXT:    [[TMP4:%.*]] = trunc i64 [[INDVARS_IV]] to i32
 389 ; SSE2-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP4]]
 390 ; SSE2-NEXT:    [[TMP5:%.*]] = trunc i64 [[INDVARS_IV]] to i32
 391 ; SSE2-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], 4
 392 ; SSE2-NEXT:    [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP6]]
 393 ; SSE2-NEXT:    [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
 394 ; SSE2-NEXT:    [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 2>
 395 ; SSE2-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP1]], i64 0
 396 ; SSE2-NEXT:    [[TMP10:%.*]] = fmul <4 x float> [[TMP7]], [[TMP9]]
 397 ; SSE2-NEXT:    store <4 x float> [[TMP10]], ptr [[ARRAYIDX5]], align 4
 398 ; SSE2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5
 399 ; SSE2-NEXT:    [[TMP11:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 400 ; SSE2-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP11]]
 401 ; SSE2-NEXT:    [[TMP12]] = load float, ptr [[ARRAYIDX41]], align 4
 402 ; SSE2-NEXT:    [[TMP13:%.*]] = extractelement <4 x float> [[TMP7]], i64 3
 403 ; SSE2-NEXT:    [[MUL45:%.*]] = fmul float [[TMP12]], [[TMP13]]
 404 ; SSE2-NEXT:    store float [[MUL45]], ptr [[ARRAYIDX31]], align 4
 405 ; SSE2-NEXT:    [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 406 ; SSE2-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[TMP14]], 31995
 407 ; SSE2-NEXT:    br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_END:%.*]]
 408 ; SSE2:       for.end:
 409 ; SSE2-NEXT:    ret void
 410 ;
 411 entry:
 412   br label %for.cond1.preheader
 413
 414 for.cond1.preheader:
 415   %0 = load float, ptr @a, align 16 ; a[0], the initial value of the carried phi %1
 416   br label %for.body3
 417
 418 for.body3:
 419   %1 = phi float [ %0, %for.cond1.preheader ], [ %10, %for.body3 ] ; a[0] on the first iteration, afterwards %10 (a[i+5]) loaded by the previous iteration
 420   %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] ; i: starts at 0, advances by 5 per iteration
 421   %2 = add nsw i64 %indvars.iv, 1 ; i + 1
 422   %arrayidx = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %2 ; &a[i+1]
 423   %3 = load float, ptr %arrayidx, align 4 ; a[i+1]
 424   %arrayidx5 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv ; &a[i]
 425   %mul6 = fmul float %3, %1 ; a[i+1] * carried value
 426   store float %mul6, ptr %arrayidx5, align 4 ; a[i] = %mul6
 427   %4 = add nsw i64 %indvars.iv, 2 ; i + 2
 428   %arrayidx11 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %4 ; &a[i+2]
 429   %5 = load float, ptr %arrayidx11, align 4 ; a[i+2]
 430   %mul15 = fmul float %5, %3 ; a[i+2] * a[i+1]
 431   store float %mul15, ptr %arrayidx, align 4 ; a[i+1] = %mul15
 432   %6 = add nsw i64 %indvars.iv, 3 ; i + 3
 433   %arrayidx21 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %6 ; &a[i+3]
 434   %7 = load float, ptr %arrayidx21, align 4 ; a[i+3]
 435   %mul25 = fmul float %7, %5 ; a[i+3] * a[i+2]
 436   store float %mul25, ptr %arrayidx11, align 4 ; a[i+2] = %mul25
 437   %8 = add nsw i64 %indvars.iv, 4 ; i + 4
 438   %arrayidx31 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %8 ; &a[i+4]
 439   %9 = load float, ptr %arrayidx31, align 4 ; a[i+4]
 440   %mul35 = fmul float %9, %7 ; a[i+4] * a[i+3]
 441   store float %mul35, ptr %arrayidx21, align 4 ; a[i+3] = %mul35
 442   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 5 ; i + 5, the next iteration's i
 443   %arrayidx41 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv.next ; &a[i+5]
 444   %10 = load float, ptr %arrayidx41, align 4 ; a[i+5]; becomes %1 on the next iteration
 445   %mul45 = fmul float %10, %9 ; a[i+5] * a[i+4]
 446   store float %mul45, ptr %arrayidx31, align 4 ; a[i+4] = %mul45
 447   %11 = trunc i64 %indvars.iv.next to i32
 448   %cmp2 = icmp slt i32 %11, 31995 ; continue while (i32)(i + 5) < 31995
 449   br i1 %cmp2, label %for.body3, label %for.end
 450
 451 for.end:
 452   ret void
 453 }
 454
 455 ; Check vectorization of the following code for the double data type:
 456 ;  c[0] = a[0]+b[0];
 457 ;  c[1] = b[1]+a[1]; // swapped b[1] and a[1]
 458
 459 define void @load_reorder_double(ptr nocapture %c, ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b){ ; c[0] = a[0] + b[0]; c[1] = b[1] + a[1]: lane 1 has its fadd operands in the opposite order from lane 0.
 460 ; CHECK-LABEL: @load_reorder_double(
 461 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B:%.*]], align 4
 462 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[A:%.*]], align 4
 463 ; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
 464 ; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[C:%.*]], align 4
 465 ; CHECK-NEXT:    ret void
 466 ;
 467 ; SSE2-LABEL: @load_reorder_double(
 468 ; SSE2-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B:%.*]], align 4
 469 ; SSE2-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[A:%.*]], align 4
 470 ; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
 471 ; SSE2-NEXT:    store <2 x double> [[TMP3]], ptr [[C:%.*]], align 4
 472 ; SSE2-NEXT:    ret void
 473 ;
 474   %1 = load double, ptr %a ; a[0]
 475   %2 = load double, ptr %b ; b[0]
 476   %3 = fadd double %1, %2 ; a[0] + b[0]
 477   store double %3, ptr %c ; c[0]
 478   %4 = getelementptr inbounds double, ptr %b, i64 1 ; &b[1]
 479   %5 = load double, ptr %4 ; b[1]
 480   %6 = getelementptr inbounds double, ptr %a, i64 1 ; &a[1]
 481   %7 = load double, ptr %6 ; a[1]
 482   %8 = fadd double %5, %7 ; b[1] + a[1] (operands swapped relative to lane 0)
 483   %9 = getelementptr inbounds double, ptr %c, i64 1 ; &c[1]
 484   store double %8, ptr %9 ; c[1]
 485   ret void
 486 }
 487
 488 ; Check vectorization of the following code for the float data type:
 489 ;  c[0] = a[0]+b[0];
 490 ;  c[1] = b[1]+a[1]; // swapped b[1] and a[1]
 491 ;  c[2] = a[2]+b[2];
 492 ;  c[3] = a[3]+b[3];
 493
 494 define void @load_reorder_float(ptr nocapture %c, ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b){ ; c[k] = a[k] + b[k] for k = 0..3, except that lane 1 is written as b[1] + a[1].
 495 ; CHECK-LABEL: @load_reorder_float(
 496 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4
 497 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
 498 ; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
 499 ; CHECK-NEXT:    store <4 x float> [[TMP3]], ptr [[C:%.*]], align 4
 500 ; CHECK-NEXT:    ret void
 501 ;
 502 ; SSE2-LABEL: @load_reorder_float(
 503 ; SSE2-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4
 504 ; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
 505 ; SSE2-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
 506 ; SSE2-NEXT:    store <4 x float> [[TMP3]], ptr [[C:%.*]], align 4
 507 ; SSE2-NEXT:    ret void
 508 ;
 509   %1 = load float, ptr %a ; a[0]
 510   %2 = load float, ptr %b ; b[0]
 511   %3 = fadd float %1, %2 ; a[0] + b[0]
 512   store float %3, ptr %c ; c[0]
 513   %4 = getelementptr inbounds float, ptr %b, i64 1 ; &b[1]
 514   %5 = load float, ptr %4 ; b[1]
 515   %6 = getelementptr inbounds float, ptr %a, i64 1 ; &a[1]
 516   %7 = load float, ptr %6 ; a[1]
 517   %8 = fadd float %5, %7 ; b[1] + a[1] (the only lane with swapped operands)
 518   %9 = getelementptr inbounds float, ptr %c, i64 1 ; &c[1]
 519   store float %8, ptr %9 ; c[1]
 520   %10 = getelementptr inbounds float, ptr %a, i64 2 ; &a[2]
 521   %11 = load float, ptr %10 ; a[2]
 522   %12 = getelementptr inbounds float, ptr %b, i64 2 ; &b[2]
 523   %13 = load float, ptr %12 ; b[2]
 524   %14 = fadd float %11, %13 ; a[2] + b[2]
 525   %15 = getelementptr inbounds float, ptr %c, i64 2 ; &c[2]
 526   store float %14, ptr %15 ; c[2]
 527   %16 = getelementptr inbounds float, ptr %a, i64 3 ; &a[3]
 528   %17 = load float, ptr %16 ; a[3]
 529   %18 = getelementptr inbounds float, ptr %b, i64 3 ; &b[3]
 530   %19 = load float, ptr %18 ; b[3]
 531   %20 = fadd float %17, %19 ; a[3] + b[3]
 532   %21 = getelementptr inbounds float, ptr %c, i64 3 ; &c[3]
 533   store float %20, ptr %21 ; c[3]
 534   ret void
 535 }
 536
 537 ; Check that we properly reorder the code below so that it gets vectorized optimally:
 538 ; a[0] = (b[0]+c[0])+d[0];
 539 ; a[1] = d[1]+(b[1]+c[1]);
 540 ; a[2] = (b[2]+c[2])+d[2];
 541 ; a[3] = (b[3]+c[3])+d[3];
 542
 543 define void @opcode_reorder(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, ptr noalias nocapture readonly %c,ptr noalias nocapture readonly %d) { ; a[k] = (b[k] + c[k]) + d[k] for k = 0..3, except that lane 1 puts the inner sum on the right: d[1] + (b[1] + c[1]).
 544 ; CHECK-LABEL: @opcode_reorder(
 545 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
 546 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[C:%.*]], align 4
 547 ; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
 548 ; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x float>, ptr [[D:%.*]], align 4
 549 ; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP4]], [[TMP3]]
 550 ; CHECK-NEXT:    store <4 x float> [[TMP5]], ptr [[A:%.*]], align 4
 551 ; CHECK-NEXT:    ret void
 552 ;
 553 ; SSE2-LABEL: @opcode_reorder(
 554 ; SSE2-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
 555 ; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[C:%.*]], align 4
 556 ; SSE2-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
 557 ; SSE2-NEXT:    [[TMP4:%.*]] = load <4 x float>, ptr [[D:%.*]], align 4
 558 ; SSE2-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP4]], [[TMP3]]
 559 ; SSE2-NEXT:    store <4 x float> [[TMP5]], ptr [[A:%.*]], align 4
 560 ; SSE2-NEXT:    ret void
 561 ;
 562   %1 = load float, ptr %b ; b[0]
 563   %2 = load float, ptr %c ; c[0]
 564   %3 = fadd float %1, %2 ; b[0] + c[0]
 565   %4 = load float, ptr %d ; d[0]
 566   %5 = fadd float %3, %4 ; (b[0] + c[0]) + d[0]
 567   store float %5, ptr %a ; a[0]
 568   %6 = getelementptr inbounds float, ptr %d, i64 1 ; &d[1]
 569   %7 = load float, ptr %6 ; d[1]
 570   %8 = getelementptr inbounds float, ptr %b, i64 1 ; &b[1]
 571   %9 = load float, ptr %8 ; b[1]
 572   %10 = getelementptr inbounds float, ptr %c, i64 1 ; &c[1]
 573   %11 = load float, ptr %10 ; c[1]
 574   %12 = fadd float %9, %11 ; b[1] + c[1]
 575   %13 = fadd float %7, %12 ; d[1] + (b[1] + c[1]): inner fadd is the right operand in this lane only
 576   %14 = getelementptr inbounds float, ptr %a, i64 1 ; &a[1]
 577   store float %13, ptr %14 ; a[1]
 578   %15 = getelementptr inbounds float, ptr %b, i64 2 ; &b[2]
 579   %16 = load float, ptr %15 ; b[2]
 580   %17 = getelementptr inbounds float, ptr %c, i64 2 ; &c[2]
 581   %18 = load float, ptr %17 ; c[2]
 582   %19 = fadd float %16, %18 ; b[2] + c[2]
 583   %20 = getelementptr inbounds float, ptr %d, i64 2 ; &d[2]
 584   %21 = load float, ptr %20 ; d[2]
 585   %22 = fadd float %19, %21 ; (b[2] + c[2]) + d[2]
 586   %23 = getelementptr inbounds float, ptr %a, i64 2 ; &a[2]
 587   store float %22, ptr %23 ; a[2]
 588   %24 = getelementptr inbounds float, ptr %b, i64 3 ; &b[3]
 589   %25 = load float, ptr %24 ; b[3]
 590   %26 = getelementptr inbounds float, ptr %c, i64 3 ; &c[3]
 591   %27 = load float, ptr %26 ; c[3]
 592   %28 = fadd float %25, %27 ; b[3] + c[3]
 593   %29 = getelementptr inbounds float, ptr %d, i64 3 ; &d[3]
 594   %30 = load float, ptr %29 ; d[3]
 595   %31 = fadd float %28, %30 ; (b[3] + c[3]) + d[3]
 596   %32 = getelementptr inbounds float, ptr %a, i64 3 ; &a[3]
 597   store float %31, ptr %32 ; a[3]
 598   ret void
 599 }