llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
   2 ; RUN: opt -O2                   -S -mattr=avx < %s | FileCheck %s
   3 ; RUN: opt -passes='default<O2>' -S -mattr=avx < %s | FileCheck %s
   4
   5 target triple = "x86_64--"
     ; The triple above names only the architecture (vendor and OS fields are
     ; empty). The data layout below starts with "e" (little-endian), lists
     ; 8/16/32/64 as native integer widths ("n8:16:32:64") and gives a 128-bit
     ; natural stack alignment ("S128").
   6 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
   7
   8 define i32 @ext_ext_or_reduction_v4i32(<4 x i32> %x, <4 x i32> %y) {
     ; Input: all four lanes of (%x & %y) are extracted one at a time and
     ; combined with a chain of scalar 'or' instructions.
     ; Expected: the whole chain becomes a single
     ; @llvm.vector.reduce.or.v4i32 call on the vector 'and' result.
   9 ; CHECK-LABEL: @ext_ext_or_reduction_v4i32(
  10 ; CHECK-NEXT:    [[Z:%.*]] = and <4 x i32> [[Y:%.*]], [[X:%.*]]
  11 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[Z]])
  12 ; CHECK-NEXT:    ret i32 [[TMP1]]
  13 ;
  14   %z = and <4 x i32> %x, %y
  15   %z0 = extractelement <4 x i32> %z, i32 0
  16   %z1 = extractelement <4 x i32> %z, i32 1
  17   %z01 = or i32 %z0, %z1
  18   %z2 = extractelement <4 x i32> %z, i32 2
  19   %z012 = or i32 %z01, %z2
  20   %z3 = extractelement <4 x i32> %z, i32 3
     ; The final 'or' takes the freshly extracted lane as its first operand,
     ; unlike the two 'or's above where the running value comes first.
  21   %z0123 = or i32 %z3, %z012
  22   ret i32 %z0123
  23 }
  24
  25 define i32 @ext_ext_partial_add_reduction_v4i32(<4 x i32> %x) {
     ; Input: only lanes 0, 1 and 2 of %x are extracted and summed; lane 3 is
     ; never read, so this is not a full-width reduction.
     ; Expected: no reduce intrinsic. Lanes 1 and 2 are each moved into lane 0
     ; with a shufflevector, added as vectors, and lane 0 of the final sum is
     ; extracted.
  26 ; CHECK-LABEL: @ext_ext_partial_add_reduction_v4i32(
  27 ; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  28 ; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[SHIFT]], [[X]]
  29 ; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
  30 ; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[SHIFT1]]
  31 ; CHECK-NEXT:    [[X210:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
  32 ; CHECK-NEXT:    ret i32 [[X210]]
  33 ;
  34   %x0 = extractelement <4 x i32> %x, i32 0
  35   %x1 = extractelement <4 x i32> %x, i32 1
     ; Each add puts the newly extracted lane first and the running sum second.
  36   %x10 = add i32 %x1, %x0
  37   %x2 = extractelement <4 x i32> %x, i32 2
  38   %x210 = add i32 %x2, %x10
  39   ret i32 %x210
  40 }
  41
  42 define i32 @ext_ext_partial_add_reduction_and_extra_add_v4i32(<4 x i32> %x, <4 x i32> %y) {
     ; Input: lanes 0, 1 and 2 of %y are summed (a partial reduction), then
     ; lane 2 of the unrelated vector %x is added on top.
     ; Expected: three shufflevector + vector add pairs that accumulate
     ; x[2] + y[0] + y[1] + y[2] in lane 0, followed by one extract of lane 0.
     ; The extract index is expected as i64, the same form the sibling test
     ; @ext_ext_partial_add_reduction_v4i32 expects for its final extract.
  43 ; CHECK-LABEL: @ext_ext_partial_add_reduction_and_extra_add_v4i32(
  44 ; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
  45 ; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[SHIFT]], [[Y:%.*]]
  46 ; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  47 ; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[SHIFT1]]
  48 ; CHECK-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
  49 ; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[SHIFT2]]
  50 ; CHECK-NEXT:    [[X2Y210:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
  51 ; CHECK-NEXT:    ret i32 [[X2Y210]]
  52 ;
     ; Partial sum over %y: lanes 0..2 only, lane 3 is never read.
  53   %y0 = extractelement <4 x i32> %y, i32 0
  54   %y1 = extractelement <4 x i32> %y, i32 1
  55   %y10 = add i32 %y1, %y0
  56   %y2 = extractelement <4 x i32> %y, i32 2
  57   %y210 = add i32 %y2, %y10
     ; Extra operand taken from a different source vector.
  58   %x2 = extractelement <4 x i32> %x, i32 2
  59   %x2y210 = add i32 %x2, %y210
  60   ret i32 %x2y210
  61 }
  62
  63 ; PR43953 - https://bugs.llvm.org/show_bug.cgi?id=43953
  64 ; We want to end up with a single reduction on the next 4 tests.
  65
  66 define i32 @TestVectorsEqual(i32* noalias %Vec0, i32* noalias %Vec1, i32 %Tolerance) {
     ; Input: an unoptimized 4-iteration loop that sums |Vec0[i] - Vec1[i]|,
     ; with the absolute value written as a compare-and-branch diamond, and
     ; returns 1 if the sum is signed-less-or-equal to %Tolerance, else 0.
     ; Expected: straight-line code in the entry block: two <4 x i32> loads, a
     ; vector 'sub nsw', @llvm.abs.v4i32, one @llvm.vector.reduce.add.v4i32,
     ; then the compare and a zext of its result.
  67 ; CHECK-LABEL: @TestVectorsEqual(
  68 ; CHECK-NEXT:  entry:
  69 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[VEC0:%.*]] to <4 x i32>*
  70 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
  71 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[VEC1:%.*]] to <4 x i32>*
  72 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
  73 ; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw <4 x i32> [[TMP1]], [[TMP3]]
  74 ; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP4]], i1 true)
  75 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
  76 ; CHECK-NEXT:    [[CMP5_NOT:%.*]] = icmp sle i32 [[TMP6]], [[TOLERANCE:%.*]]
  77 ; CHECK-NEXT:    [[COND6:%.*]] = zext i1 [[CMP5_NOT]] to i32
  78 ; CHECK-NEXT:    ret i32 [[COND6]]
  79 ;
  80 entry:
  81   br label %for.cond
  82
  83 for.cond:
     ; %sum.0 is the running sum; %Component.0 is the loop counter (0..3).
  84   %sum.0 = phi i32 [ 0, %entry ], [ %add, %for.inc ]
  85   %Component.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  86   %cmp = icmp slt i32 %Component.0, 4
  87   br i1 %cmp, label %for.body, label %for.cond.cleanup
  88
  89 for.cond.cleanup:
  90   br label %for.end
  91
  92 for.body:
     ; Load the current element of each array and take the difference.
  93   %idxprom = sext i32 %Component.0 to i64
  94   %arrayidx = getelementptr inbounds i32, i32* %Vec0, i64 %idxprom
  95   %0 = load i32, i32* %arrayidx, align 4
  96   %idxprom1 = sext i32 %Component.0 to i64
  97   %arrayidx2 = getelementptr inbounds i32, i32* %Vec1, i64 %idxprom1
  98   %1 = load i32, i32* %arrayidx2, align 4
  99   %sub = sub nsw i32 %0, %1
     ; Branchy absolute value: keep %sub when it is >= 0, otherwise negate it.
 100   %cmp3 = icmp sge i32 %sub, 0
 101   br i1 %cmp3, label %cond.true, label %cond.false
 102
 103 cond.true:
 104   br label %cond.end
 105
 106 cond.false:
 107   %sub4 = sub nsw i32 0, %sub
 108   br label %cond.end
 109
 110 cond.end:
 111   %cond = phi i32 [ %sub, %cond.true ], [ %sub4, %cond.false ]
 112   %add = add nsw i32 %sum.0, %cond
 113   br label %for.inc
 114
 115 for.inc:
 116   %inc = add nsw i32 %Component.0, 1
 117   br label %for.cond
 118
 119 for.end:
 120   %cmp5 = icmp sle i32 %sum.0, %Tolerance
     ; %2 has no users in this function; the returned value comes from the
     ; select below.
 121   %2 = zext i1 %cmp5 to i64
 122   %cond6 = select i1 %cmp5, i32 1, i32 0
 123   ret i32 %cond6
 124 }
 125
 126 define i32 @TestVectorsEqual_alt(i32* noalias %Vec0, i32* noalias %Vec1, i32 %Tolerance) {
     ; Input: a 4-iteration loop summing Vec0[i] - Vec1[i] with no absolute
     ; value and no nsw flags on the sub/add, then an unsigned 'ule' compare of
     ; the sum against %Tolerance, returned as 0 or 1.
     ; Expected: two <4 x i32> loads, a vector 'sub', one
     ; @llvm.vector.reduce.add.v4i32, the 'icmp ule' and a zext.
 127 ; CHECK-LABEL: @TestVectorsEqual_alt(
 128 ; CHECK-NEXT:  entry:
 129 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[VEC0:%.*]] to <4 x i32>*
 130 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
 131 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[VEC1:%.*]] to <4 x i32>*
 132 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
 133 ; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]]
 134 ; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
 135 ; CHECK-NEXT:    [[CMP3_NOT:%.*]] = icmp ule i32 [[TMP5]], [[TOLERANCE:%.*]]
 136 ; CHECK-NEXT:    [[COND:%.*]] = zext i1 [[CMP3_NOT]] to i32
 137 ; CHECK-NEXT:    ret i32 [[COND]]
 138 ;
 139 entry:
 140   br label %for.cond
 141
 142 for.cond:
     ; %sum.0 is the running sum; %Component.0 is the loop counter (0..3).
 143   %sum.0 = phi i32 [ 0, %entry ], [ %add, %for.inc ]
 144   %Component.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
 145   %cmp = icmp slt i32 %Component.0, 4
 146   br i1 %cmp, label %for.body, label %for.cond.cleanup
 147
 148 for.cond.cleanup:
 149   br label %for.end
 150
 151 for.body:
 152   %idxprom = sext i32 %Component.0 to i64
 153   %arrayidx = getelementptr inbounds i32, i32* %Vec0, i64 %idxprom
 154   %0 = load i32, i32* %arrayidx, align 4
 155   %idxprom1 = sext i32 %Component.0 to i64
 156   %arrayidx2 = getelementptr inbounds i32, i32* %Vec1, i64 %idxprom1
 157   %1 = load i32, i32* %arrayidx2, align 4
     ; Plain wrapping arithmetic: neither instruction carries nsw/nuw.
 158   %sub = sub i32 %0, %1
 159   %add = add i32 %sum.0, %sub
 160   br label %for.inc
 161
 162 for.inc:
 163   %inc = add nsw i32 %Component.0, 1
 164   br label %for.cond
 165
 166 for.end:
 167   %cmp3 = icmp ule i32 %sum.0, %Tolerance
     ; %2 has no users in this function; the returned value comes from the
     ; select below.
 168   %2 = zext i1 %cmp3 to i64
 169   %cond = select i1 %cmp3, i32 1, i32 0
 170   ret i32 %cond
 171 }
 172
 173 define i32 @TestVectorsEqualFP(float* noalias %Vec0, float* noalias %Vec1, float %Tolerance) {
     ; Input: the float counterpart of @TestVectorsEqual. A 4-iteration loop
     ; sums |Vec0[i] - Vec1[i]| using 'fast' FP operations, with the absolute
     ; value written as a compare-and-branch diamond around an fneg, and
     ; returns 1 if the sum is 'ole' %Tolerance, else 0.
     ; Expected: two <4 x float> loads, a vector 'fsub fast', @llvm.fabs.v4f32,
     ; one @llvm.vector.reduce.fadd.v4f32 with a -0.0 start value, the fcmp
     ; and a zext.
 174 ; CHECK-LABEL: @TestVectorsEqualFP(
 175 ; CHECK-NEXT:  entry:
 176 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[VEC0:%.*]] to <4 x float>*
 177 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
 178 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[VEC1:%.*]] to <4 x float>*
 179 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
 180 ; CHECK-NEXT:    [[TMP4:%.*]] = fsub fast <4 x float> [[TMP1]], [[TMP3]]
 181 ; CHECK-NEXT:    [[TMP5:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP4]])
 182 ; CHECK-NEXT:    [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
 183 ; CHECK-NEXT:    [[CMP4:%.*]] = fcmp fast ole float [[TMP6]], [[TOLERANCE:%.*]]
 184 ; CHECK-NEXT:    [[COND5:%.*]] = zext i1 [[CMP4]] to i32
 185 ; CHECK-NEXT:    ret i32 [[COND5]]
 186 ;
 187 entry:
 188   br label %for.cond
 189
 190 for.cond:
     ; %sum.0 is the running sum; %Component.0 is the loop counter (0..3).
 191   %sum.0 = phi float [ 0.000000e+00, %entry ], [ %add, %for.inc ]
 192   %Component.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
 193   %cmp = icmp slt i32 %Component.0, 4
 194   br i1 %cmp, label %for.body, label %for.cond.cleanup
 195
 196 for.cond.cleanup:
 197   br label %for.end
 198
 199 for.body:
 200   %idxprom = sext i32 %Component.0 to i64
 201   %arrayidx = getelementptr inbounds float, float* %Vec0, i64 %idxprom
 202   %0 = load float, float* %arrayidx, align 4
 203   %idxprom1 = sext i32 %Component.0 to i64
 204   %arrayidx2 = getelementptr inbounds float, float* %Vec1, i64 %idxprom1
 205   %1 = load float, float* %arrayidx2, align 4
 206   %sub = fsub fast float %0, %1
     ; Branchy absolute value: keep %sub when it is >= 0.0, otherwise negate it.
 207   %cmp3 = fcmp fast oge float %sub, 0.000000e+00
 208   br i1 %cmp3, label %cond.true, label %cond.false
 209
 210 cond.true:
 211   br label %cond.end
 212
 213 cond.false:
 214   %fneg = fneg fast float %sub
 215   br label %cond.end
 216
 217 cond.end:
     ; The phi itself carries the 'fast' flags.
 218   %cond = phi fast float [ %sub, %cond.true ], [ %fneg, %cond.false ]
 219   %add = fadd fast float %sum.0, %cond
 220   br label %for.inc
 221
 222 for.inc:
 223   %inc = add nsw i32 %Component.0, 1
 224   br label %for.cond
 225
 226 for.end:
 227   %cmp4 = fcmp fast ole float %sum.0, %Tolerance
     ; %2 has no users in this function; the returned value comes from the
     ; select below.
 228   %2 = zext i1 %cmp4 to i64
 229   %cond5 = select i1 %cmp4, i32 1, i32 0
 230   ret i32 %cond5
 231 }
 232
 233 define i32 @TestVectorsEqualFP_alt(float* noalias %Vec0, float* noalias %Vec1, float %Tolerance) {
     ; Input: a 4-iteration loop summing Vec0[i] - Vec1[i] with 'fast' FP
     ; operations and no absolute value, then an 'ole' compare of the sum
     ; against %Tolerance, returned as 0 or 1.
     ; Expected: two <4 x float> loads, a vector 'fsub fast', one
     ; @llvm.vector.reduce.fadd.v4f32 with a -0.0 start value, the fcmp and a
     ; zext.
 234 ; CHECK-LABEL: @TestVectorsEqualFP_alt(
 235 ; CHECK-NEXT:  entry:
 236 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[VEC0:%.*]] to <4 x float>*
 237 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
 238 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[VEC1:%.*]] to <4 x float>*
 239 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
 240 ; CHECK-NEXT:    [[TMP4:%.*]] = fsub fast <4 x float> [[TMP1]], [[TMP3]]
 241 ; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP4]])
 242 ; CHECK-NEXT:    [[CMP3:%.*]] = fcmp fast ole float [[TMP5]], [[TOLERANCE:%.*]]
 243 ; CHECK-NEXT:    [[COND:%.*]] = zext i1 [[CMP3]] to i32
 244 ; CHECK-NEXT:    ret i32 [[COND]]
 245 ;
 246 entry:
 247   br label %for.cond
 248
 249 for.cond:
     ; %sum.0 is the running sum; %Component.0 is the loop counter (0..3).
 250   %sum.0 = phi float [ 0.000000e+00, %entry ], [ %add, %for.inc ]
 251   %Component.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
 252   %cmp = icmp slt i32 %Component.0, 4
 253   br i1 %cmp, label %for.body, label %for.cond.cleanup
 254
 255 for.cond.cleanup:
 256   br label %for.end
 257
 258 for.body:
 259   %idxprom = sext i32 %Component.0 to i64
 260   %arrayidx = getelementptr inbounds float, float* %Vec0, i64 %idxprom
 261   %0 = load float, float* %arrayidx, align 4
 262   %idxprom1 = sext i32 %Component.0 to i64
 263   %arrayidx2 = getelementptr inbounds float, float* %Vec1, i64 %idxprom1
 264   %1 = load float, float* %arrayidx2, align 4
     ; The signed difference is accumulated directly.
 265   %sub = fsub fast float %0, %1
 266   %add = fadd fast float %sum.0, %sub
 267   br label %for.inc
 268
 269 for.inc:
 270   %inc = add nsw i32 %Component.0, 1
 271   br label %for.cond
 272
 273 for.end:
 274   %cmp3 = fcmp fast ole float %sum.0, %Tolerance
     ; %2 has no users in this function; the returned value comes from the
     ; select below.
 275   %2 = zext i1 %cmp3 to i64
 276   %cond = select i1 %cmp3, i32 1, i32 0
 277   ret i32 %cond
 278 }
 279
 280 ; PR43745 - https://bugs.llvm.org/show_bug.cgi?id=43745
 281
 282 ; FIXME: this should be vectorized
 283 define i1 @cmp_lt_gt(double %a, double %b, double %c) {
     ; Input: computes two quotients, (-b + c) / (2 * a) and (-b - c) / (2 * a),
     ; and returns false when both are 'olt' the hex threshold constant or
     ; both are 'ogt' 1.0; every other path returns true.
     ; Expected (current behavior, still scalar): the duplicated fneg and fmul
     ; are shared, the two 'olt' tests are joined by a select that guards a
     ; single branch, and the 'ogt' tests appear inverted as 'ule' joined by
     ; another select feeding the return phi.
 284 ; CHECK-LABEL: @cmp_lt_gt(
 285 ; CHECK-NEXT:  entry:
 286 ; CHECK-NEXT:    [[FNEG:%.*]] = fneg double [[B:%.*]]
 287 ; CHECK-NEXT:    [[ADD:%.*]] = fsub double [[C:%.*]], [[B]]
 288 ; CHECK-NEXT:    [[MUL:%.*]] = fmul double [[A:%.*]], 2.000000e+00
 289 ; CHECK-NEXT:    [[DIV:%.*]] = fdiv double [[ADD]], [[MUL]]
 290 ; CHECK-NEXT:    [[SUB:%.*]] = fsub double [[FNEG]], [[C]]
 291 ; CHECK-NEXT:    [[DIV3:%.*]] = fdiv double [[SUB]], [[MUL]]
 292 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp olt double [[DIV]], 0x3EB0C6F7A0B5ED8D
 293 ; CHECK-NEXT:    [[CMP4:%.*]] = fcmp olt double [[DIV3]], 0x3EB0C6F7A0B5ED8D
 294 ; CHECK-NEXT:    [[OR_COND:%.*]] = select i1 [[CMP]], i1 [[CMP4]], i1 false
 295 ; CHECK-NEXT:    br i1 [[OR_COND]], label [[CLEANUP:%.*]], label [[LOR_LHS_FALSE:%.*]]
 296 ; CHECK:       lor.lhs.false:
 297 ; CHECK-NEXT:    [[CMP5:%.*]] = fcmp ule double [[DIV]], 1.000000e+00
 298 ; CHECK-NEXT:    [[CMP7:%.*]] = fcmp ule double [[DIV3]], 1.000000e+00
 299 ; CHECK-NEXT:    [[OR_COND1:%.*]] = select i1 [[CMP5]], i1 true, i1 [[CMP7]]
 300 ; CHECK-NEXT:    br label [[CLEANUP]]
 301 ; CHECK:       cleanup:
 302 ; CHECK-NEXT:    [[RETVAL_0:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[OR_COND1]], [[LOR_LHS_FALSE]] ]
 303 ; CHECK-NEXT:    ret i1 [[RETVAL_0]]
 304 ;
 305 entry:
     ; First quotient: %div = (-b + c) / (2 * a).
 306   %fneg = fneg double %b
 307   %add = fadd double %fneg, %c
 308   %mul = fmul double 2.0, %a
 309   %div = fdiv double %add, %mul
     ; Second quotient: %div3 = (-b - c) / (2 * a). %fneg1 and %mul2 repeat
     ; %fneg and %mul.
 310   %fneg1 = fneg double %b
 311   %sub = fsub double %fneg1, %c
 312   %mul2 = fmul double 2.0, %a
 313   %div3 = fdiv double %sub, %mul2
 314   %cmp = fcmp olt double %div, 0x3EB0C6F7A0B5ED8D
 315   br i1 %cmp, label %land.lhs.true, label %lor.lhs.false
 316
 317 land.lhs.true:
     ; Reached only when %div is below the threshold; test %div3 the same way.
 318   %cmp4 = fcmp olt double %div3, 0x3EB0C6F7A0B5ED8D
 319   br i1 %cmp4, label %if.then, label %lor.lhs.false
 320
 321 lor.lhs.false:
     ; Reached when the "both below threshold" test failed.
 322   %cmp5 = fcmp ogt double %div, 1.0
 323   br i1 %cmp5, label %land.lhs.true6, label %if.end
 324
 325 land.lhs.true6:
 326   %cmp7 = fcmp ogt double %div3, 1.0
 327   br i1 %cmp7, label %if.then, label %if.end
 328
 329 if.then:
 330   br label %cleanup
 331
 332 if.end:
 333   br label %cleanup
 334
 335 cleanup:
     ; false when arriving from %if.then, true when arriving from %if.end.
 336   %retval.0 = phi i1 [ false, %if.then ], [ true, %if.end ]
 337   ret i1 %retval.0
 338 }