llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
   2 ; RUN: opt < %s -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.10.0 -mattr=+sse4.2 | FileCheck %s
   3
   4 ; PR28474
   5
   6 ;void foo();
   7 ;
    8 ;int test(unsigned int *p) {
   9 ;  int sum = 0;
  10 ;  #pragma nounroll
  11 ;  for (int y = 0; y < 2; y++) {
  12 ;    // Inner loop gets unrolled
  13 ;    for (int x = 0; x < 8; x++) {
  14 ;      sum += p[x] * 42;
  15 ;    }
  16 ;    // Dummy call to keep outer loop alive
  17 ;    foo();
  18 ;  }
  19 ;  return sum;
  20 ;}
  21
   ; @test: the loop body is a fully unrolled chain of eight scalar adds,
   ; %sum + p[0]*42 + p[1]*42 + ... + p[7]*42. The addresses of elements 1-7
   ; are computed once in the entry block, outside the loop.
   ; The autogenerated assertions below expect the loop body to be rewritten
   ; as one <8 x i32> load from %p, one vector mul by a splat of 42, a call to
   ; llvm.vector.reduce.add.v8i32, and a single scalar add with the %sum phi.
   22 define i32 @test(i32* nocapture readonly %p) {
   23 ; CHECK-LABEL: @test(
   24 ; CHECK-NEXT:  entry:
   25 ; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
   26 ; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
   27 ; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
   28 ; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
   29 ; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
   30 ; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
   31 ; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
   32 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
   33 ; CHECK:       for.body:
   34 ; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ]
   35 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
   36 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
   37 ; CHECK-NEXT:    [[TMP2:%.*]] = mul <8 x i32> [[TMP1]], <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>
   38 ; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
   39 ; CHECK-NEXT:    [[OP_EXTRA]] = add i32 [[TMP3]], [[SUM]]
   40 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]]
   41 ; CHECK:       for.end:
   42 ; CHECK-NEXT:    ret i32 [[OP_EXTRA]]
   43 ;
   44 entry:
        ; Addresses of p[1]..p[7]; p[0] is accessed through %p itself.
   45   %arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1
   46   %arrayidx.2 = getelementptr inbounds i32, i32* %p, i64 2
   47   %arrayidx.3 = getelementptr inbounds i32, i32* %p, i64 3
   48   %arrayidx.4 = getelementptr inbounds i32, i32* %p, i64 4
   49   %arrayidx.5 = getelementptr inbounds i32, i32* %p, i64 5
   50   %arrayidx.6 = getelementptr inbounds i32, i32* %p, i64 6
   51   %arrayidx.7 = getelementptr inbounds i32, i32* %p, i64 7
   52   br label %for.body
   53
   54 for.body:
        ; %sum is the running total: 0 on entry, %add.7 when arriving from the
        ; back edge. It seeds the add chain below.
   55   %sum = phi i32 [ 0, %entry ], [ %add.7, %for.body ]
        ; Eight repetitions of load / mul by 42 / add onto the previous partial
        ; sum, one per consecutive element of %p.
   56   %tmp = load i32, i32* %p, align 4
   57   %mul = mul i32 %tmp, 42
   58   %add = add i32 %mul, %sum
   59   %tmp5 = load i32, i32* %arrayidx.1, align 4
   60   %mul.1 = mul i32 %tmp5, 42
   61   %add.1 = add i32 %mul.1, %add
   62   %tmp6 = load i32, i32* %arrayidx.2, align 4
   63   %mul.2 = mul i32 %tmp6, 42
   64   %add.2 = add i32 %mul.2, %add.1
   65   %tmp7 = load i32, i32* %arrayidx.3, align 4
   66   %mul.3 = mul i32 %tmp7, 42
   67   %add.3 = add i32 %mul.3, %add.2
   68   %tmp8 = load i32, i32* %arrayidx.4, align 4
   69   %mul.4 = mul i32 %tmp8, 42
   70   %add.4 = add i32 %mul.4, %add.3
   71   %tmp9 = load i32, i32* %arrayidx.5, align 4
   72   %mul.5 = mul i32 %tmp9, 42
   73   %add.5 = add i32 %mul.5, %add.4
   74   %tmp10 = load i32, i32* %arrayidx.6, align 4
   75   %mul.6 = mul i32 %tmp10, 42
   76   %add.6 = add i32 %mul.6, %add.5
   77   %tmp11 = load i32, i32* %arrayidx.7, align 4
   78   %mul.7 = mul i32 %tmp11, 42
   79   %add.7 = add i32 %mul.7, %add.6
        ; The condition is the constant true, so the taken target is %for.end;
        ; %for.body remains a listed successor, which keeps the %sum phi's
        ; back-edge operand in place.
   80   br i1 true, label %for.end, label %for.body
   81
   82 for.end:
   83   ret i32 %add.7
   84 }
  85
  86 ;void foo();
  87 ;
  88 ;int test2(unsigned int *p, unsigned int *q) {
  89 ;  int sum = 0;
  90 ;  #pragma nounroll
  91 ;  for (int y = 0; y < 2; y++) {
  92 ;    // Inner loop gets unrolled
  93 ;    for (int x = 0; x < 8; x++) {
  94 ;      sum += p[x] * q[x];
  95 ;    }
  96 ;    // Dummy call to keep outer loop alive
  97 ;    foo();
  98 ;  }
  99 ;  return sum;
 100 ;}
 101
  ; @test2: same add-chain shape as a dot product of two arrays: the loop body
  ; accumulates %sum + p[0]*q[0] + p[1]*q[1] + ... + p[7]*q[7], with both
  ; arrays read at matching, ascending indices.
  ; The autogenerated assertions below expect one <8 x i32> load from %p, one
  ; <8 x i32> load from %q, a vector mul of the two, a call to
  ; llvm.vector.reduce.add.v8i32, and a single scalar add with the %sum phi.
  102 define i32 @test2(i32* nocapture readonly %p, i32* nocapture readonly %q) {
  103 ; CHECK-LABEL: @test2(
  104 ; CHECK-NEXT:  entry:
  105 ; CHECK-NEXT:    [[ARRAYIDX_P_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
  106 ; CHECK-NEXT:    [[ARRAYIDX_P_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
  107 ; CHECK-NEXT:    [[ARRAYIDX_P_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
  108 ; CHECK-NEXT:    [[ARRAYIDX_P_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
  109 ; CHECK-NEXT:    [[ARRAYIDX_P_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
  110 ; CHECK-NEXT:    [[ARRAYIDX_P_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
  111 ; CHECK-NEXT:    [[ARRAYIDX_P_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
  112 ; CHECK-NEXT:    [[ARRAYIDX_Q_1:%.*]] = getelementptr inbounds i32, i32* [[Q:%.*]], i64 1
  113 ; CHECK-NEXT:    [[ARRAYIDX_Q_2:%.*]] = getelementptr inbounds i32, i32* [[Q]], i64 2
  114 ; CHECK-NEXT:    [[ARRAYIDX_Q_3:%.*]] = getelementptr inbounds i32, i32* [[Q]], i64 3
  115 ; CHECK-NEXT:    [[ARRAYIDX_Q_4:%.*]] = getelementptr inbounds i32, i32* [[Q]], i64 4
  116 ; CHECK-NEXT:    [[ARRAYIDX_Q_5:%.*]] = getelementptr inbounds i32, i32* [[Q]], i64 5
  117 ; CHECK-NEXT:    [[ARRAYIDX_Q_6:%.*]] = getelementptr inbounds i32, i32* [[Q]], i64 6
  118 ; CHECK-NEXT:    [[ARRAYIDX_Q_7:%.*]] = getelementptr inbounds i32, i32* [[Q]], i64 7
  119 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
  120 ; CHECK:       for.body:
  121 ; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ]
  122 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
  123 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
  124 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[Q]] to <8 x i32>*
  125 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4
  126 ; CHECK-NEXT:    [[TMP4:%.*]] = mul <8 x i32> [[TMP1]], [[TMP3]]
  127 ; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]])
  128 ; CHECK-NEXT:    [[OP_EXTRA]] = add i32 [[TMP5]], [[SUM]]
  129 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]]
  130 ; CHECK:       for.end:
  131 ; CHECK-NEXT:    ret i32 [[OP_EXTRA]]
  132 ;
  133 entry:
        ; Addresses of p[1]..p[7]; p[0] is accessed through %p itself.
  134   %arrayidx.p.1 = getelementptr inbounds i32, i32* %p, i64 1
  135   %arrayidx.p.2 = getelementptr inbounds i32, i32* %p, i64 2
  136   %arrayidx.p.3 = getelementptr inbounds i32, i32* %p, i64 3
  137   %arrayidx.p.4 = getelementptr inbounds i32, i32* %p, i64 4
  138   %arrayidx.p.5 = getelementptr inbounds i32, i32* %p, i64 5
  139   %arrayidx.p.6 = getelementptr inbounds i32, i32* %p, i64 6
  140   %arrayidx.p.7 = getelementptr inbounds i32, i32* %p, i64 7
  141
        ; Addresses of q[1]..q[7]; q[0] is accessed through %q itself.
  142   %arrayidx.q.1 = getelementptr inbounds i32, i32* %q, i64 1
  143   %arrayidx.q.2 = getelementptr inbounds i32, i32* %q, i64 2
  144   %arrayidx.q.3 = getelementptr inbounds i32, i32* %q, i64 3
  145   %arrayidx.q.4 = getelementptr inbounds i32, i32* %q, i64 4
  146   %arrayidx.q.5 = getelementptr inbounds i32, i32* %q, i64 5
  147   %arrayidx.q.6 = getelementptr inbounds i32, i32* %q, i64 6
  148   %arrayidx.q.7 = getelementptr inbounds i32, i32* %q, i64 7
  149   br label %for.body
  150
  151 for.body:
        ; %sum is the running total: 0 on entry, %add.7 when arriving from the
        ; back edge. It seeds the add chain below.
  152   %sum = phi i32 [ 0, %entry ], [ %add.7, %for.body ]
        ; Eight repetitions of: load p[i], load q[i], multiply, add onto the
        ; previous partial sum. Both arrays are walked in ascending order.
  153   %tmpp = load i32, i32* %p, align 4
  154   %tmpq = load i32, i32* %q, align 4
  155   %mul = mul i32 %tmpp, %tmpq
  156   %add = add i32 %mul, %sum
  157   %tmp5p = load i32, i32* %arrayidx.p.1, align 4
  158   %tmp5q = load i32, i32* %arrayidx.q.1, align 4
  159   %mul.1 = mul i32 %tmp5p, %tmp5q
  160   %add.1 = add i32 %mul.1, %add
  161   %tmp6p = load i32, i32* %arrayidx.p.2, align 4
  162   %tmp6q = load i32, i32* %arrayidx.q.2, align 4
  163   %mul.2 = mul i32 %tmp6p, %tmp6q
  164   %add.2 = add i32 %mul.2, %add.1
  165   %tmp7p = load i32, i32* %arrayidx.p.3, align 4
  166   %tmp7q = load i32, i32* %arrayidx.q.3, align 4
  167   %mul.3 = mul i32 %tmp7p, %tmp7q
  168   %add.3 = add i32 %mul.3, %add.2
  169   %tmp8p = load i32, i32* %arrayidx.p.4, align 4
  170   %tmp8q = load i32, i32* %arrayidx.q.4, align 4
  171   %mul.4 = mul i32 %tmp8p, %tmp8q
  172   %add.4 = add i32 %mul.4, %add.3
  173   %tmp9p = load i32, i32* %arrayidx.p.5, align 4
  174   %tmp9q = load i32, i32* %arrayidx.q.5, align 4
  175   %mul.5 = mul i32 %tmp9p, %tmp9q
  176   %add.5 = add i32 %mul.5, %add.4
  177   %tmp10p = load i32, i32* %arrayidx.p.6, align 4
  178   %tmp10q = load i32, i32* %arrayidx.q.6, align 4
  179   %mul.6 = mul i32 %tmp10p, %tmp10q
  180   %add.6 = add i32 %mul.6, %add.5
  181   %tmp11p = load i32, i32* %arrayidx.p.7, align 4
  182   %tmp11q = load i32, i32* %arrayidx.q.7, align 4
  183   %mul.7 = mul i32 %tmp11p, %tmp11q
  184   %add.7 = add i32 %mul.7, %add.6
        ; The condition is the constant true, so the taken target is %for.end;
        ; %for.body remains a listed successor, which keeps the %sum phi's
        ; back-edge operand in place.
  185   br i1 true, label %for.end, label %for.body
  186
  187 for.end:
  188   ret i32 %add.7
  189 }
 190
 191 ;void foo();
 192 ;
 193 ;int test3(unsigned int *p, unsigned int *q) {
 194 ;  int sum = 0;
 195 ;  #pragma nounroll
 196 ;  for (int y = 0; y < 2; y++) {
 197 ;    // Inner loop gets unrolled
 198 ;    for (int x = 0; x < 8; x++) {
 199 ;      sum += p[x] * q[7-x];
 200 ;    }
 201 ;    // Dummy call to keep outer loop alive
 202 ;    foo();
 203 ;  }
 204 ;  return sum;
 205 ;}
 206
  ; @test3: like @test2, but %q is read in descending order while %p is read
  ; in ascending order: the loop body accumulates
  ; %sum + p[0]*q[7] + p[1]*q[6] + ... + p[7]*q[0].
  ; The autogenerated assertions below expect one <8 x i32> load from each of
  ; %p and %q, a shufflevector with the lane-reversing mask <7,6,5,4,3,2,1,0>
  ; applied to the vector loaded from %p, a vector mul of the shuffled value
  ; with the %q vector, a call to llvm.vector.reduce.add.v8i32, and a single
  ; scalar add with the %sum phi.
  207 define i32 @test3(i32* nocapture readonly %p, i32* nocapture readonly %q) {
  208 ; CHECK-LABEL: @test3(
  209 ; CHECK-NEXT:  entry:
  210 ; CHECK-NEXT:    [[ARRAYIDX_P_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
  211 ; CHECK-NEXT:    [[ARRAYIDX_P_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
  212 ; CHECK-NEXT:    [[ARRAYIDX_P_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
  213 ; CHECK-NEXT:    [[ARRAYIDX_P_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
  214 ; CHECK-NEXT:    [[ARRAYIDX_P_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
  215 ; CHECK-NEXT:    [[ARRAYIDX_P_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
  216 ; CHECK-NEXT:    [[ARRAYIDX_P_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
  217 ; CHECK-NEXT:    [[ARRAYIDX_Q_1:%.*]] = getelementptr inbounds i32, i32* [[Q:%.*]], i64 1
  218 ; CHECK-NEXT:    [[ARRAYIDX_Q_2:%.*]] = getelementptr inbounds i32, i32* [[Q]], i64 2
  219 ; CHECK-NEXT:    [[ARRAYIDX_Q_3:%.*]] = getelementptr inbounds i32, i32* [[Q]], i64 3
  220 ; CHECK-NEXT:    [[ARRAYIDX_Q_4:%.*]] = getelementptr inbounds i32, i32* [[Q]], i64 4
  221 ; CHECK-NEXT:    [[ARRAYIDX_Q_5:%.*]] = getelementptr inbounds i32, i32* [[Q]], i64 5
  222 ; CHECK-NEXT:    [[ARRAYIDX_Q_6:%.*]] = getelementptr inbounds i32, i32* [[Q]], i64 6
  223 ; CHECK-NEXT:    [[ARRAYIDX_Q_7:%.*]] = getelementptr inbounds i32, i32* [[Q]], i64 7
  224 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
  225 ; CHECK:       for.body:
  226 ; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ]
  227 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
  228 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
  229 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  230 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[Q]] to <8 x i32>*
  231 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4
  232 ; CHECK-NEXT:    [[TMP4:%.*]] = mul <8 x i32> [[SHUFFLE]], [[TMP3]]
  233 ; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]])
  234 ; CHECK-NEXT:    [[OP_EXTRA]] = add i32 [[TMP5]], [[SUM]]
  235 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]]
  236 ; CHECK:       for.end:
  237 ; CHECK-NEXT:    ret i32 [[OP_EXTRA]]
  238 ;
  239 entry:
        ; Addresses of p[1]..p[7]; p[0] is accessed through %p itself.
  240   %arrayidx.p.1 = getelementptr inbounds i32, i32* %p, i64 1
  241   %arrayidx.p.2 = getelementptr inbounds i32, i32* %p, i64 2
  242   %arrayidx.p.3 = getelementptr inbounds i32, i32* %p, i64 3
  243   %arrayidx.p.4 = getelementptr inbounds i32, i32* %p, i64 4
  244   %arrayidx.p.5 = getelementptr inbounds i32, i32* %p, i64 5
  245   %arrayidx.p.6 = getelementptr inbounds i32, i32* %p, i64 6
  246   %arrayidx.p.7 = getelementptr inbounds i32, i32* %p, i64 7
  247
        ; Addresses of q[1]..q[7]; q[0] is accessed through %q itself.
  248   %arrayidx.q.1 = getelementptr inbounds i32, i32* %q, i64 1
  249   %arrayidx.q.2 = getelementptr inbounds i32, i32* %q, i64 2
  250   %arrayidx.q.3 = getelementptr inbounds i32, i32* %q, i64 3
  251   %arrayidx.q.4 = getelementptr inbounds i32, i32* %q, i64 4
  252   %arrayidx.q.5 = getelementptr inbounds i32, i32* %q, i64 5
  253   %arrayidx.q.6 = getelementptr inbounds i32, i32* %q, i64 6
  254   %arrayidx.q.7 = getelementptr inbounds i32, i32* %q, i64 7
  255   br label %for.body
  256
  257 for.body:
        ; %sum is the running total: 0 on entry, %add.7 when arriving from the
        ; back edge. It seeds the add chain below.
  258   %sum = phi i32 [ 0, %entry ], [ %add.7, %for.body ]
        ; Eight repetitions of: load p[i], load q[7-i], multiply, add onto the
        ; previous partial sum. The %q operand starts at %arrayidx.q.7 and
        ; steps down to %q, while the %p operand steps up from %p.
  259   %tmpp = load i32, i32* %p, align 4
  260   %tmpq = load i32, i32* %arrayidx.q.7, align 4
  261   %mul = mul i32 %tmpp, %tmpq
  262   %add = add i32 %mul, %sum
  263   %tmp5p = load i32, i32* %arrayidx.p.1, align 4
  264   %tmp5q = load i32, i32* %arrayidx.q.6, align 4
  265   %mul.1 = mul i32 %tmp5p, %tmp5q
  266   %add.1 = add i32 %mul.1, %add
  267   %tmp6p = load i32, i32* %arrayidx.p.2, align 4
  268   %tmp6q = load i32, i32* %arrayidx.q.5, align 4
  269   %mul.2 = mul i32 %tmp6p, %tmp6q
  270   %add.2 = add i32 %mul.2, %add.1
  271   %tmp7p = load i32, i32* %arrayidx.p.3, align 4
  272   %tmp7q = load i32, i32* %arrayidx.q.4, align 4
  273   %mul.3 = mul i32 %tmp7p, %tmp7q
  274   %add.3 = add i32 %mul.3, %add.2
  275   %tmp8p = load i32, i32* %arrayidx.p.4, align 4
  276   %tmp8q = load i32, i32* %arrayidx.q.3, align 4
  277   %mul.4 = mul i32 %tmp8p, %tmp8q
  278   %add.4 = add i32 %mul.4, %add.3
  279   %tmp9p = load i32, i32* %arrayidx.p.5, align 4
  280   %tmp9q = load i32, i32* %arrayidx.q.2, align 4
  281   %mul.5 = mul i32 %tmp9p, %tmp9q
  282   %add.5 = add i32 %mul.5, %add.4
  283   %tmp10p = load i32, i32* %arrayidx.p.6, align 4
  284   %tmp10q = load i32, i32* %arrayidx.q.1, align 4
  285   %mul.6 = mul i32 %tmp10p, %tmp10q
  286   %add.6 = add i32 %mul.6, %add.5
  287   %tmp11p = load i32, i32* %arrayidx.p.7, align 4
  288   %tmp11q = load i32, i32* %q, align 4
  289   %mul.7 = mul i32 %tmp11p, %tmp11q
  290   %add.7 = add i32 %mul.7, %add.6
        ; The condition is the constant true, so the taken target is %for.end;
        ; %for.body remains a listed successor, which keeps the %sum phi's
        ; back-edge operand in place.
  291   br i1 true, label %for.end, label %for.body
  292
  293 for.end:
  294   ret i32 %add.7
  295 }