llvm/test/Transforms/LoopVectorize/trunc-reductions.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
   2 ; RUN: opt -loop-vectorize -dce -instcombine -force-vector-interleave=1 -force-vector-width=8 -S < %s | FileCheck %s
   3
   4 define i8 @reduction_and_trunc(i8* noalias nocapture %ptr) { ; AND-reduces 256 i8 loads through an i32 accumulator and returns the result truncated to i8
   5 ; CHECK-LABEL: @reduction_and_trunc(
   6 ; CHECK-NEXT:  entry:
   7 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
   8 ; CHECK:       vector.ph:
   9 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
  10 ; CHECK:       vector.body:
  11 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
  12 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <8 x i8> [ <i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
  13 ; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[INDEX]] to i64
  14 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP0]]
  15 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i8>*
  16 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[TMP2]], align 1
  17 ; CHECK-NEXT:    [[TMP3]] = and <8 x i8> [[VEC_PHI]], [[WIDE_LOAD]]
  18 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
  19 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
  20 ; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
  21 ; CHECK:       middle.block:
  22 ; CHECK-NEXT:    [[TMP5:%.*]] = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> [[TMP3]])
  23 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
  24 ; CHECK:       scalar.ph:
  25 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
  26 ; CHECK:       for.body:
  27 ; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
  28 ; CHECK:       for.end:
  29 ; CHECK-NEXT:    [[AND_LCSSA_OFF0:%.*]] = phi i8 [ undef, [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
  30 ; CHECK-NEXT:    ret i8 [[AND_LCSSA_OFF0]]
  31 ;
  32 entry: ; the expectations above require the reduction to be carried out on <8 x i8> vectors
  33   br label %for.body
  34
  35 for.body:
  36   %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
  37   %sum.02p = phi i32 [ %and, %for.body ], [ 0, %entry ] ; i32 accumulator, seeded with 0
  38   %sum.02 = and i32 %sum.02p, 255 ; keep only the low 8 bits of the accumulator
  39   %gep = getelementptr inbounds i8, i8* %ptr, i32 %iv
  40   %load = load i8, i8* %gep
  41   %ext = zext i8 %load to i32 ; widen the loaded byte to the accumulator width
  42   %and = and i32 %sum.02, %ext ; reduction step
  43   %iv.next = add i32 %iv, 1
  44   %exitcond = icmp eq i32 %iv.next, 256 ; leave the loop after 256 iterations
  45   br i1 %exitcond, label %for.end, label %for.body
  46
  47 for.end:
  48   %ret = trunc i32 %and to i8 ; only the low 8 bits of the reduction are returned
  49   ret i8 %ret
  50 }
  51
  52 define i16 @reduction_or_trunc(i16* noalias nocapture %ptr) { ; OR-reduces 256 i16 loads through an i32 accumulator and returns the result truncated to i16
  53 ; CHECK-LABEL: @reduction_or_trunc(
  54 ; CHECK-NEXT:  entry:
  55 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
  56 ; CHECK:       vector.ph:
  57 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
  58 ; CHECK:       vector.body:
  59 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
  60 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <8 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
  61 ; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[INDEX]] to i64
  62 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, i16* [[PTR:%.*]], i64 [[TMP0]]
  63 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[TMP1]] to <8 x i16>*
  64 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[TMP2]], align 2
  65 ; CHECK-NEXT:    [[TMP3]] = or <8 x i16> [[VEC_PHI]], [[WIDE_LOAD]]
  66 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
  67 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
  68 ; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
  69 ; CHECK:       middle.block:
  70 ; CHECK-NEXT:    [[TMP5:%.*]] = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> [[TMP3]])
  71 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
  72 ; CHECK:       scalar.ph:
  73 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
  74 ; CHECK:       for.body:
  75 ; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
  76 ; CHECK:       for.end:
  77 ; CHECK-NEXT:    [[OR_LCSSA_OFF0:%.*]] = phi i16 [ undef, [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
  78 ; CHECK-NEXT:    ret i16 [[OR_LCSSA_OFF0]]
  79 ;
  80 entry: ; the expectations above require the reduction to be carried out on <8 x i16> vectors
  81   br label %for.body
  82
  83 for.body:
  84   %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
  85   %sum.02p = phi i32 [ %or, %for.body ], [ 0, %entry ] ; i32 accumulator, seeded with 0
  86   %sum.02 = and i32 %sum.02p, 65535 ; keep only the low 16 bits of the accumulator
  87   %gep = getelementptr inbounds i16, i16* %ptr, i32 %iv
  88   %load = load i16, i16* %gep
  89   %ext = zext i16 %load to i32 ; widen the loaded halfword to the accumulator width
  90   %or = or i32 %sum.02, %ext ; reduction step
  91   %iv.next = add i32 %iv, 1
  92   %exitcond = icmp eq i32 %iv.next, 256 ; leave the loop after 256 iterations
  93   br i1 %exitcond, label %for.end, label %for.body
  94
  95 for.end:
  96   %ret = trunc i32 %or to i16 ; only the low 16 bits of the reduction are returned
  97   ret i16 %ret
  98 }
  99
 100 define i16 @reduction_xor_trunc(i16* noalias nocapture %ptr) { ; XOR-reduces 256 i16 loads through an i32 accumulator and returns the result truncated to i16
 101 ; CHECK-LABEL: @reduction_xor_trunc(
 102 ; CHECK-NEXT:  entry:
 103 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 104 ; CHECK:       vector.ph:
 105 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 106 ; CHECK:       vector.body:
 107 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 108 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <8 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 109 ; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[INDEX]] to i64
 110 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, i16* [[PTR:%.*]], i64 [[TMP0]]
 111 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[TMP1]] to <8 x i16>*
 112 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[TMP2]], align 2
 113 ; CHECK-NEXT:    [[TMP3]] = xor <8 x i16> [[VEC_PHI]], [[WIDE_LOAD]]
 114 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 115 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
 116 ; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 117 ; CHECK:       middle.block:
 118 ; CHECK-NEXT:    [[TMP5:%.*]] = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> [[TMP3]])
 119 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 120 ; CHECK:       scalar.ph:
 121 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 122 ; CHECK:       for.body:
 123 ; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 124 ; CHECK:       for.end:
 125 ; CHECK-NEXT:    [[XOR_LCSSA_OFF0:%.*]] = phi i16 [ undef, [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
 126 ; CHECK-NEXT:    ret i16 [[XOR_LCSSA_OFF0]]
 127 ;
 128 entry: ; the expectations above require the reduction to be carried out on <8 x i16> vectors
 129   br label %for.body
 130
 131 for.body:
 132   %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
 133   %sum.02p = phi i32 [ %xor, %for.body ], [ 0, %entry ] ; i32 accumulator, seeded with 0
 134   %sum.02 = and i32 %sum.02p, 65535 ; keep only the low 16 bits of the accumulator
 135   %gep = getelementptr inbounds i16, i16* %ptr, i32 %iv
 136   %load = load i16, i16* %gep
 137   %ext = zext i16 %load to i32 ; widen the loaded halfword to the accumulator width
 138   %xor = xor i32 %sum.02, %ext ; reduction step
 139   %iv.next = add i32 %iv, 1
 140   %exitcond = icmp eq i32 %iv.next, 256 ; leave the loop after 256 iterations
 141   br i1 %exitcond, label %for.end, label %for.body
 142
 143 for.end:
 144   %ret = trunc i32 %xor to i16 ; only the low 16 bits of the reduction are returned
 145   ret i16 %ret
 146 }
 147
 148 define i8 @reduction_smin_trunc(i8* noalias nocapture %ptr) { ; signed-min recurrence over 256 i8 loads, carried in i32 and truncated to i8 on exit
 149 ; CHECK-LABEL: @reduction_smin_trunc(
 150 ; CHECK-NOT: vector.body
 151 ; CHECK-NOT: <8 x
 152 ; CHECK: ret
 153 entry: ; the expectations above require that no vector.body block and no <8 x ...> type appear before the ret
 154   br label %for.body
 155
 156 for.body:
 157   %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
 158   %sum.02p = phi i32 [ %min, %for.body ], [ 256, %entry ] ; seed 256 has no bits inside the 255 mask below, so the first masked value is 0
 159   %sum.02 = and i32 %sum.02p, 255 ; keep only the low 8 bits of the running value
 160   %gep = getelementptr inbounds i8, i8* %ptr, i32 %iv
 161   %load = load i8, i8* %gep
 162   %ext = sext i8 %load to i32 ; sign-extend so the signed compare sees the byte's signed value
 163   %icmp = icmp slt i32 %sum.02, %ext
 164   %min = select i1 %icmp, i32 %sum.02, i32 %ext ; signed minimum of the masked running value and the new element
 165   %iv.next = add i32 %iv, 1
 166   %exitcond = icmp eq i32 %iv.next, 256 ; leave the loop after 256 iterations
 167   br i1 %exitcond, label %for.end, label %for.body
 168
 169 for.end:
 170   %ret = trunc i32 %min to i8 ; only the low 8 bits of the result are returned
 171   ret i8 %ret
 172 }
 173
 174 define i8 @reduction_umin_trunc(i8* noalias nocapture %ptr) { ; unsigned-min recurrence over 256 i8 loads, carried in i32 and truncated to i8 on exit
 175 ; CHECK-LABEL: @reduction_umin_trunc(
 176 ; CHECK-NOT: vector.body
 177 ; CHECK-NOT: <8 x
 178 ; CHECK: ret
 179 entry: ; the expectations above require that no vector.body block and no <8 x ...> type appear before the ret
 180   br label %for.body
 181
 182 for.body:
 183   %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
 184   %sum.02p = phi i32 [ %min, %for.body ], [ 0, %entry ] ; i32 running value, seeded with 0
 185   %sum.02 = and i32 %sum.02p, 255 ; keep only the low 8 bits of the running value
 186   %gep = getelementptr inbounds i8, i8* %ptr, i32 %iv
 187   %load = load i8, i8* %gep
 188   %ext = zext i8 %load to i32 ; widen the loaded byte to the accumulator width
 189   %icmp = icmp ult i32 %sum.02, %ext
 190   %min = select i1 %icmp, i32 %sum.02, i32 %ext ; unsigned minimum of the masked running value and the new element
 191   %iv.next = add i32 %iv, 1
 192   %exitcond = icmp eq i32 %iv.next, 256 ; leave the loop after 256 iterations
 193   br i1 %exitcond, label %for.end, label %for.body
 194
 195 for.end:
 196   %ret = trunc i32 %min to i8 ; only the low 8 bits of the result are returned
 197   ret i8 %ret
 198 }
 199
 200 define i16 @reduction_smax_trunc(i16* noalias nocapture %ptr) { ; signed-max recurrence over 256 i16 loads, carried in i32 and truncated to i16 on exit
 201 ; CHECK-LABEL: @reduction_smax_trunc(
 202 ; CHECK-NOT: vector.body
 203 ; CHECK-NOT: <8 x
 204 ; CHECK: ret
 205 entry: ; the expectations above require that no vector.body block and no <8 x ...> type appear before the ret
 206   br label %for.body
 207
 208 for.body:
 209   %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
 210   %sum.02p = phi i32 [ %max, %for.body ], [ 0, %entry ] ; i32 running value, seeded with 0
 211   %sum.02 = and i32 %sum.02p, 65535 ; keep only the low 16 bits of the running value
 212   %gep = getelementptr inbounds i16, i16* %ptr, i32 %iv
 213   %load = load i16, i16* %gep
 214   %ext = sext i16 %load to i32 ; sign-extend so the signed compare sees the halfword's signed value
 215   %icmp = icmp sgt i32 %sum.02, %ext
 216   %max = select i1 %icmp, i32 %sum.02, i32 %ext ; signed maximum of the masked running value and the new element
 217   %iv.next = add i32 %iv, 1
 218   %exitcond = icmp eq i32 %iv.next, 256 ; leave the loop after 256 iterations
 219   br i1 %exitcond, label %for.end, label %for.body
 220
 221 for.end:
 222   %ret = trunc i32 %max to i16 ; only the low 16 bits of the result are returned
 223   ret i16 %ret
 224 }
 225
 226 define i16 @reduction_umax_trunc(i16* noalias nocapture %ptr) { ; unsigned-max recurrence over 256 i16 loads, carried in i32 and truncated to i16 on exit
 227 ; CHECK-LABEL: @reduction_umax_trunc(
 228 ; CHECK-NOT: vector.body
 229 ; CHECK-NOT: <8 x
 230 ; CHECK: ret
 231 entry: ; the expectations above require that no vector.body block and no <8 x ...> type appear before the ret
 232   br label %for.body
 233
 234 for.body:
 235   %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
 236   %sum.02p = phi i32 [ %max, %for.body ], [ 0, %entry ] ; i32 running value, seeded with 0
 237   %sum.02 = and i32 %sum.02p, 65535 ; keep only the low 16 bits of the running value
 238   %gep = getelementptr inbounds i16, i16* %ptr, i32 %iv
 239   %load = load i16, i16* %gep
 240   %ext = zext i16 %load to i32 ; widen the loaded halfword to the accumulator width
 241   %icmp = icmp ugt i32 %sum.02, %ext
 242   %max = select i1 %icmp, i32 %sum.02, i32 %ext ; unsigned maximum of the masked running value and the new element
 243   %iv.next = add i32 %iv, 1
 244   %exitcond = icmp eq i32 %iv.next, 256 ; leave the loop after 256 iterations
 245   br i1 %exitcond, label %for.end, label %for.body
 246
 247 for.end:
 248   %ret = trunc i32 %max to i16 ; only the low 16 bits of the result are returned
 249   ret i16 %ret
 250 }