; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -slp-vectorizer -slp-vectorize-hor -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -debug < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX
; RUN: opt -slp-vectorizer -slp-vectorize-hor -S -mtriple=x86_64-unknown-linux-gnu -mcpu=core2 -debug < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE
; REQUIRES: asserts

; int test_add(unsigned int *p) {
;   int result = 0;
;   for (int i = 0; i < 8; i++)
;     result += p[i];
;   return result;
; }
;
; Vector cost is 5, Scalar cost is 7
; AVX: Adding cost -2 for reduction that starts with %7 = load i32, i32* %arrayidx.7, align 4 (It is a splitting reduction)
; Vector cost is 6, Scalar cost is 7
; SSE: Adding cost -1 for reduction that starts with %7 = load i32, i32* %arrayidx.7, align 4 (It is a splitting reduction)
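;
; On both targets the eight scalar loads and the add chain collapse into a
; single <8 x i32> load feeding @llvm.vector.reduce.add; the AVX/SSE lines
; above match the cost-model debug output that justifies this splitting
; reduction (hence -debug and the asserts requirement).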
define i32 @test_add(i32* nocapture readonly %p) {
; CHECK-LABEL: @test_add(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP1]])
; CHECK-NEXT:    ret i32 [[TMP2]]
;
entry:
  %0 = load i32, i32* %p, align 4
  %arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1
  %1 = load i32, i32* %arrayidx.1, align 4
  %mul.18 = add i32 %1, %0
  %arrayidx.2 = getelementptr inbounds i32, i32* %p, i64 2
  %2 = load i32, i32* %arrayidx.2, align 4
  %mul.29 = add i32 %2, %mul.18
  %arrayidx.3 = getelementptr inbounds i32, i32* %p, i64 3
  %3 = load i32, i32* %arrayidx.3, align 4
  %mul.310 = add i32 %3, %mul.29
  %arrayidx.4 = getelementptr inbounds i32, i32* %p, i64 4
  %4 = load i32, i32* %arrayidx.4, align 4
  %mul.411 = add i32 %4, %mul.310
  %arrayidx.5 = getelementptr inbounds i32, i32* %p, i64 5
  %5 = load i32, i32* %arrayidx.5, align 4
  %mul.512 = add i32 %5, %mul.411
  %arrayidx.6 = getelementptr inbounds i32, i32* %p, i64 6
  %6 = load i32, i32* %arrayidx.6, align 4
  %mul.613 = add i32 %6, %mul.512
  %arrayidx.7 = getelementptr inbounds i32, i32* %p, i64 7
  %7 = load i32, i32* %arrayidx.7, align 4
  %mul.714 = add i32 %7, %mul.613
  ret i32 %mul.714
}

; int test_mul(unsigned int *p) {
;   int result = 1;
;   for (int i = 0; i < 8; i++)
;     result *= p[i];
;   return result;
; }
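;
; Only the AVX run vectorizes the multiply chain: core2 predates SSE4.1's
; pmulld, so a <8 x i32> multiply is expensive there and the cost model
; keeps the scalar chain (no reduce.mul call in the SSE output below).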
define i32 @test_mul(i32* nocapture readonly %p) {
; AVX-LABEL: @test_mul(
; AVX-NEXT:  entry:
; AVX-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
; AVX-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
; AVX-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
; AVX-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
; AVX-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
; AVX-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
; AVX-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
; AVX-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
; AVX-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> [[TMP1]])
; AVX-NEXT:    ret i32 [[TMP2]]
;
; SSE-LABEL: @test_mul(
; SSE-NEXT:  entry:
; SSE-NEXT:    [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4
; SSE-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1
; SSE-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
; SSE-NEXT:    [[MUL_18:%.*]] = mul i32 [[TMP1]], [[TMP0]]
; SSE-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
; SSE-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
; SSE-NEXT:    [[MUL_29:%.*]] = mul i32 [[TMP2]], [[MUL_18]]
; SSE-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
; SSE-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
; SSE-NEXT:    [[MUL_310:%.*]] = mul i32 [[TMP3]], [[MUL_29]]
; SSE-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
; SSE-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4
; SSE-NEXT:    [[MUL_411:%.*]] = mul i32 [[TMP4]], [[MUL_310]]
; SSE-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
; SSE-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4
; SSE-NEXT:    [[MUL_512:%.*]] = mul i32 [[TMP5]], [[MUL_411]]
; SSE-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
; SSE-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4
; SSE-NEXT:    [[MUL_613:%.*]] = mul i32 [[TMP6]], [[MUL_512]]
; SSE-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
; SSE-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4
; SSE-NEXT:    [[MUL_714:%.*]] = mul i32 [[TMP7]], [[MUL_613]]
; SSE-NEXT:    ret i32 [[MUL_714]]
;
entry:
  %0 = load i32, i32* %p, align 4
  %arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1
  %1 = load i32, i32* %arrayidx.1, align 4
  %mul.18 = mul i32 %1, %0
  %arrayidx.2 = getelementptr inbounds i32, i32* %p, i64 2
  %2 = load i32, i32* %arrayidx.2, align 4
  %mul.29 = mul i32 %2, %mul.18
  %arrayidx.3 = getelementptr inbounds i32, i32* %p, i64 3
  %3 = load i32, i32* %arrayidx.3, align 4
  %mul.310 = mul i32 %3, %mul.29
  %arrayidx.4 = getelementptr inbounds i32, i32* %p, i64 4
  %4 = load i32, i32* %arrayidx.4, align 4
  %mul.411 = mul i32 %4, %mul.310
  %arrayidx.5 = getelementptr inbounds i32, i32* %p, i64 5
  %5 = load i32, i32* %arrayidx.5, align 4
  %mul.512 = mul i32 %5, %mul.411
  %arrayidx.6 = getelementptr inbounds i32, i32* %p, i64 6
  %6 = load i32, i32* %arrayidx.6, align 4
  %mul.613 = mul i32 %6, %mul.512
  %arrayidx.7 = getelementptr inbounds i32, i32* %p, i64 7
  %7 = load i32, i32* %arrayidx.7, align 4
  %mul.714 = mul i32 %7, %mul.613
  ret i32 %mul.714
}

; int test_and(unsigned int *p) {
;   int result = ~0;
;   for (int i = 0; i < 8; i++)
;     result &= p[i];
;   return result;
; }
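;
; test_and, test_or and test_xor all follow the test_add pattern: the bitwise
; chains are profitable on both targets and become a single <8 x i32> load
; plus the matching @llvm.vector.reduce.{and,or,xor} call.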
define i32 @test_and(i32* nocapture readonly %p) {
; CHECK-LABEL: @test_and(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
; CHECK-NEXT:    ret i32 [[TMP2]]
;
entry:
  %0 = load i32, i32* %p, align 4
  %arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1
  %1 = load i32, i32* %arrayidx.1, align 4
  %mul.18 = and i32 %1, %0
  %arrayidx.2 = getelementptr inbounds i32, i32* %p, i64 2
  %2 = load i32, i32* %arrayidx.2, align 4
  %mul.29 = and i32 %2, %mul.18
  %arrayidx.3 = getelementptr inbounds i32, i32* %p, i64 3
  %3 = load i32, i32* %arrayidx.3, align 4
  %mul.310 = and i32 %3, %mul.29
  %arrayidx.4 = getelementptr inbounds i32, i32* %p, i64 4
  %4 = load i32, i32* %arrayidx.4, align 4
  %mul.411 = and i32 %4, %mul.310
  %arrayidx.5 = getelementptr inbounds i32, i32* %p, i64 5
  %5 = load i32, i32* %arrayidx.5, align 4
  %mul.512 = and i32 %5, %mul.411
  %arrayidx.6 = getelementptr inbounds i32, i32* %p, i64 6
  %6 = load i32, i32* %arrayidx.6, align 4
  %mul.613 = and i32 %6, %mul.512
  %arrayidx.7 = getelementptr inbounds i32, i32* %p, i64 7
  %7 = load i32, i32* %arrayidx.7, align 4
  %mul.714 = and i32 %7, %mul.613
  ret i32 %mul.714
}

; int test_or(unsigned int *p) {
;   int result = 0;
;   for (int i = 0; i < 8; i++)
;     result |= p[i];
;   return result;
; }
define i32 @test_or(i32* nocapture readonly %p) {
; CHECK-LABEL: @test_or(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TMP1]])
; CHECK-NEXT:    ret i32 [[TMP2]]
;
entry:
  %0 = load i32, i32* %p, align 4
  %arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1
  %1 = load i32, i32* %arrayidx.1, align 4
  %mul.18 = or i32 %1, %0
  %arrayidx.2 = getelementptr inbounds i32, i32* %p, i64 2
  %2 = load i32, i32* %arrayidx.2, align 4
  %mul.29 = or i32 %2, %mul.18
  %arrayidx.3 = getelementptr inbounds i32, i32* %p, i64 3
  %3 = load i32, i32* %arrayidx.3, align 4
  %mul.310 = or i32 %3, %mul.29
  %arrayidx.4 = getelementptr inbounds i32, i32* %p, i64 4
  %4 = load i32, i32* %arrayidx.4, align 4
  %mul.411 = or i32 %4, %mul.310
  %arrayidx.5 = getelementptr inbounds i32, i32* %p, i64 5
  %5 = load i32, i32* %arrayidx.5, align 4
  %mul.512 = or i32 %5, %mul.411
  %arrayidx.6 = getelementptr inbounds i32, i32* %p, i64 6
  %6 = load i32, i32* %arrayidx.6, align 4
  %mul.613 = or i32 %6, %mul.512
  %arrayidx.7 = getelementptr inbounds i32, i32* %p, i64 7
  %7 = load i32, i32* %arrayidx.7, align 4
  %mul.714 = or i32 %7, %mul.613
  ret i32 %mul.714
}

; int test_xor(unsigned int *p) {
;   int result = 0;
;   for (int i = 0; i < 8; i++)
;     result ^= p[i];
;   return result;
; }
define i32 @test_xor(i32* nocapture readonly %p) {
; CHECK-LABEL: @test_xor(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> [[TMP1]])
; CHECK-NEXT:    ret i32 [[TMP2]]
;
entry:
  %0 = load i32, i32* %p, align 4
  %arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1
  %1 = load i32, i32* %arrayidx.1, align 4
  %mul.18 = xor i32 %1, %0
  %arrayidx.2 = getelementptr inbounds i32, i32* %p, i64 2
  %2 = load i32, i32* %arrayidx.2, align 4
  %mul.29 = xor i32 %2, %mul.18
  %arrayidx.3 = getelementptr inbounds i32, i32* %p, i64 3
  %3 = load i32, i32* %arrayidx.3, align 4
  %mul.310 = xor i32 %3, %mul.29
  %arrayidx.4 = getelementptr inbounds i32, i32* %p, i64 4
  %4 = load i32, i32* %arrayidx.4, align 4
  %mul.411 = xor i32 %4, %mul.310
  %arrayidx.5 = getelementptr inbounds i32, i32* %p, i64 5
  %5 = load i32, i32* %arrayidx.5, align 4
  %mul.512 = xor i32 %5, %mul.411
  %arrayidx.6 = getelementptr inbounds i32, i32* %p, i64 6
  %6 = load i32, i32* %arrayidx.6, align 4
  %mul.613 = xor i32 %6, %mul.512
  %arrayidx.7 = getelementptr inbounds i32, i32* %p, i64 7
  %7 = load i32, i32* %arrayidx.7, align 4
  %mul.714 = xor i32 %7, %mul.613
  ret i32 %mul.714
}

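; PR37731: here the input is already a <4 x i32>; the four extractelements
; xor'ed together at the end of the function are recognized as a horizontal
; reduction and replaced with a single @llvm.vector.reduce.xor call.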
define i32 @PR37731(<4 x i32>* noalias nocapture dereferenceable(16) %self) unnamed_addr #0 {
; CHECK-LABEL: @PR37731(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[SELF:%.*]], align 16
; CHECK-NEXT:    [[TMP1:%.*]] = shl <4 x i32> [[TMP0]], <i32 6, i32 2, i32 13, i32 3>
; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], [[TMP0]]
; CHECK-NEXT:    [[TMP3:%.*]] = lshr <4 x i32> [[TMP2]], <i32 13, i32 27, i32 21, i32 12>
; CHECK-NEXT:    [[TMP4:%.*]] = and <4 x i32> [[TMP0]], <i32 -2, i32 -8, i32 -16, i32 -128>
; CHECK-NEXT:    [[TMP5:%.*]] = shl <4 x i32> [[TMP4]], <i32 18, i32 2, i32 7, i32 13>
; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i32> [[TMP3]], [[TMP5]]
; CHECK-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* [[SELF]], align 16
; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP6]])
; CHECK-NEXT:    ret i32 [[TMP7]]
;
entry:
  %0 = load <4 x i32>, <4 x i32>* %self, align 16
  %1 = shl <4 x i32> %0, <i32 6, i32 2, i32 13, i32 3>
  %2 = xor <4 x i32> %1, %0
  %3 = lshr <4 x i32> %2, <i32 13, i32 27, i32 21, i32 12>
  %4 = and <4 x i32> %0, <i32 -2, i32 -8, i32 -16, i32 -128>
  %5 = shl <4 x i32> %4, <i32 18, i32 2, i32 7, i32 13>
  %6 = xor <4 x i32> %3, %5
  store <4 x i32> %6, <4 x i32>* %self, align 16
  %7 = extractelement <4 x i32> %6, i32 0
  %8 = extractelement <4 x i32> %6, i32 1
  %9 = xor i32 %7, %8
  %10 = extractelement <4 x i32> %6, i32 2
  %11 = xor i32 %9, %10
  %12 = extractelement <4 x i32> %6, i32 3
  %13 = xor i32 %11, %12
  ret i32 %13
}