llvm/test/CodeGen/ARM/loop-indexing.ll

   1 ; RUN: llc --mtriple=thumbv7em -mattr=+fp-armv8 -O3 %s -o - | \
   2 ; RUN:    FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2
   3
   4 ; RUN: llc --mtriple=thumbv7em -mattr=+fp-armv8 -O3 -lsr-preferred-addressing-mode=none %s -o - | \
   5 ; RUN:    FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
   6
   7 ; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -O3 %s -o - | \
   8 ; RUN:    FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2
   9
  10 ; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-preferred-addressing-mode=postindexed %s -o - | \
  11 ; RUN:    FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
  12
  13 ; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-preferred-addressing-mode=preindexed %s -o - | \
  14 ; RUN:    FileCheck %s --check-prefixes=CHECK,CHECK-T2
  15
  16 ; RUN: llc -mtriple=thumbv8m.base %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
  17 ; RUN: llc -mtriple=thumbv8 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
  18
  19 ; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -O3 -lsr-complexity-limit=2147483647 %s -o - | \
  20 ; RUN:    FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-COMPLEX --check-prefix=CHECK-T2
  21
   22 ; Tests to check that pre-indexed (base-register writeback, `]!`) addressing
   23 ; modes are used instead of updating base pointers with add instructions.
  24
  25 ; TODO: I think we should be able to use post inc addressing with VLDM
  26 ; instructions.
  27 ; CHECK-LABEL: test_fma
  28 ; CHECK: @ %loop
  29
  30 ; CHECK-DEFAULT: vldr s{{.*}}, #8]
  31 ; CHECK-DEFAULT: vldr s{{.*}}, #8]
  32 ; CHECK-DEFAULT: vldr s{{.*}}, #12]
  33 ; CHECK-DEFAULT: vldr s{{.*}}, #12]
  34
  35 ; CHECK-COMPLEX: vldr s{{.*}}, #8]
  36 ; CHECK-COMPLEX: vldr s{{.*}}, #8]
  37 ; CHECK-COMPLEX: vldr s{{.*}}, #12]
  38 ; CHECK-COMPLEX: vldr s{{.*}}, #12]
  39
   40 define float @test_fma(float* %a, float* %b, i32 %N) {
      ; Accumulates a[idx]*b[idx] + a[idx|1]*b[idx|1] into %res, handling two
      ; element pairs per iteration, and returns the last accumulated value.
      ; Each product is a separate fmul followed by an fadd in the IR.
   41 entry:
   42   br label %loop
   43
   44 loop:
   45   %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
   46   %idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
   47   %res = phi float [ 0.0, %entry ], [ %fma.2, %loop ]
        ; First pair: a[idx.1] * b[idx.1] + running sum.
   48   %gep.a.1 = getelementptr inbounds float, float* %a, i32 %idx.1
   49   %a.1 = load float, float* %gep.a.1
   50   %gep.b.1 = getelementptr inbounds float, float* %b, i32 %idx.1
   51   %b.1 = load float, float* %gep.b.1
   52   %fmul.1 = fmul float %a.1, %b.1
   53   %fma.1 = fadd float %fmul.1, %res
        ; Second pair: %idx.1 starts at 0 and steps by 2, so 'or 1' selects the
        ; next element (idx.1 + 1).
   54   %idx.2 = or i32 %idx.1, 1
   55   %gep.a.2 = getelementptr inbounds float, float* %a, i32 %idx.2
   56   %a.2 = load float, float* %gep.a.2
   57   %gep.b.2 = getelementptr inbounds float, float* %b, i32 %idx.2
   58   %b.2 = load float, float* %gep.b.2
   59   %fmul.2 = fmul float %a.2, %b.2
   60   %fma.2 = fadd float %fmul.2, %fma.1
        ; NOTE(review): %i steps by -2 from 0 while flagged nuw; a second step
        ; looks like it would wrap unsigned -- confirm this is intentional for
        ; the test before changing it (the CHECK lines depend on this IR).
   61   %i.next = add nsw nuw i32 %i, -2
   62   %idx.next = add nsw nuw i32 %idx.1, 2
        ; Loop continues while %i.next is unsigned-less-than %N.
   63   %cmp = icmp ult i32 %i.next, %N
   64   br i1 %cmp, label %loop, label %exit
   65
   66 exit:
   67   ret float %fma.2
   68 }
  69
  70 ; CHECK-LABEL: convolve_16bit
  71 ; TODO: Both arrays should use indexing
  72 ; CHECK-DEFAULT: ldr{{.*}}, #8]!
  73 ; CHECK-DEFAULT-NOT: ldr{{.*}}]!
  74
  75 ; CHECK-COMPLEX: ldr{{.*}}, #8]!
  76 ; CHECK-COMPLEX-NOT: ldr{{.*}}]!
  77
  78 ; DISABLED-NOT: ldr{{.*}}]!
  79 ; DISABLED-NOT: str{{.*}}]!
  80
   81 define void @convolve_16bit(i16** nocapture readonly %input_image, i16** nocapture readonly %filter,
   82                             i32 %filter_dim, i32 %out_width, i32 %out_height,
   83                             i32** nocapture readonly %convolved) {
      ; Four nested loops over (res_y, res_x, filter_y, filter_x). The innermost
      ; loop is unrolled by four and accumulates
      ;   sext(filter[filter_y][filter_x + k]) *
      ;   sext(input_image[filter_y + res_y][filter_x + k + res_x])
      ; for k = 0..3; the accumulated sum is stored to convolved[res_y][res_x].
      ; Returns immediately when %out_height is 0.
   84 entry:
   85   %cmp92 = icmp eq i32 %out_height, 0
   86   br i1 %cmp92, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph
   87
   88 for.cond1.preheader.lr.ph:                        ; preds = %entry
        ; %xtraiter is only used to derive %unroll_iter; no remainder loop is
        ; present in this function.
   89   %xtraiter = and i32 %filter_dim, 3
   90   %unroll_iter = sub i32 %filter_dim, %xtraiter
   91   br label %for.cond1.preheader
   92
   93 for.cond1.preheader:                              ; preds = %for.cond.cleanup3, %for.cond1.preheader.lr.ph
        ; Outer loop over output rows: fetch the output row pointer convolved[res_y].
   94   %res_y.093 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %add28, %for.cond.cleanup3 ]
   95   %arrayidx22 = getelementptr inbounds i32*, i32** %convolved, i32 %res_y.093
   96   %tmp3 = load i32*, i32** %arrayidx22, align 4
   97   br label %for.cond9.preheader.us.us.preheader
   98
   99 for.cond9.preheader.us.us.preheader:              ; preds = %for.cond5.for.cond.cleanup7_crit_edge.us, %for.cond1.preheader
        ; Loop over output columns (res_x).
  100   %res_x.060.us = phi i32 [ %add25.us, %for.cond5.for.cond.cleanup7_crit_edge.us ], [ 0, %for.cond1.preheader ]
  101   br label %for.cond9.preheader.us.us
  102
  103 for.cond9.preheader.us.us:                        ; preds = %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa, %for.cond9.preheader.us.us.preheader
        ; Loop over filter rows: load the filter row pointer and the matching
        ; input row pointer (row index filter_y + res_y).
  104   %filter_y.056.us.us = phi i32 [ %inc20.us.us, %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa ], [ 0, %for.cond9.preheader.us.us.preheader ]
  105   %result_element.055.us.us = phi i32 [ %add18.us.us.3, %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa ], [ 0, %for.cond9.preheader.us.us.preheader ]
  106   %add.us.us = add i32 %filter_y.056.us.us, %res_y.093
  107   %arrayidx.us.us = getelementptr inbounds i16*, i16** %filter, i32 %filter_y.056.us.us
  108   %tmp5 = load i16*, i16** %arrayidx.us.us, align 4
  109   %arrayidx15.us.us = getelementptr inbounds i16*, i16** %input_image, i32 %add.us.us
  110   %tmp6 = load i16*, i16** %arrayidx15.us.us, align 4
  111   br label %for.body12.us.us
  112
  113 for.body12.us.us:                                 ; preds = %for.body12.us.us, %for.cond9.preheader.us.us
        ; Innermost loop, unrolled x4. %niter counts down from %unroll_iter in
        ; steps of 4; %filter_x counts up from 0 in steps of 4.
  114   %filter_x.053.us.us = phi i32 [ %inc.us.us.3, %for.body12.us.us ], [ 0, %for.cond9.preheader.us.us ]
  115   %result_element.152.us.us = phi i32 [ %add18.us.us.3, %for.body12.us.us ], [ %result_element.055.us.us, %for.cond9.preheader.us.us ]
  116   %niter = phi i32 [ %niter.nsub.3, %for.body12.us.us ], [ %unroll_iter, %for.cond9.preheader.us.us ]
        ; Unrolled copy 0: column filter_x.
  117   %add13.us.us = add i32 %filter_x.053.us.us, %res_x.060.us
  118   %arrayidx14.us.us = getelementptr inbounds i16, i16* %tmp5, i32 %filter_x.053.us.us
  119   %tmp9 = load i16, i16* %arrayidx14.us.us, align 2
  120   %conv.us.us = sext i16 %tmp9 to i32
  121   %arrayidx16.us.us = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us
  122   %tmp10 = load i16, i16* %arrayidx16.us.us, align 2
  123   %conv17.us.us = sext i16 %tmp10 to i32
  124   %mul.us.us = mul nsw i32 %conv17.us.us, %conv.us.us
  125   %add18.us.us = add nsw i32 %mul.us.us, %result_element.152.us.us
        ; Unrolled copy 1: filter_x is a multiple of 4 here, so 'or 1' is filter_x + 1.
  126   %inc.us.us = or i32 %filter_x.053.us.us, 1
  127   %add13.us.us.1 = add i32 %inc.us.us, %res_x.060.us
  128   %arrayidx14.us.us.1 = getelementptr inbounds i16, i16* %tmp5, i32 %inc.us.us
  129   %tmp11 = load i16, i16* %arrayidx14.us.us.1, align 2
  130   %conv.us.us.1 = sext i16 %tmp11 to i32
  131   %arrayidx16.us.us.1 = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us.1
  132   %tmp12 = load i16, i16* %arrayidx16.us.us.1, align 2
  133   %conv17.us.us.1 = sext i16 %tmp12 to i32
  134   %mul.us.us.1 = mul nsw i32 %conv17.us.us.1, %conv.us.us.1
  135   %add18.us.us.1 = add nsw i32 %mul.us.us.1, %add18.us.us
        ; Unrolled copy 2: column filter_x + 2.
  136   %inc.us.us.1 = or i32 %filter_x.053.us.us, 2
  137   %add13.us.us.2 = add i32 %inc.us.us.1, %res_x.060.us
  138   %arrayidx14.us.us.2 = getelementptr inbounds i16, i16* %tmp5, i32 %inc.us.us.1
  139   %tmp13 = load i16, i16* %arrayidx14.us.us.2, align 2
  140   %conv.us.us.2 = sext i16 %tmp13 to i32
  141   %arrayidx16.us.us.2 = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us.2
  142   %tmp14 = load i16, i16* %arrayidx16.us.us.2, align 2
  143   %conv17.us.us.2 = sext i16 %tmp14 to i32
  144   %mul.us.us.2 = mul nsw i32 %conv17.us.us.2, %conv.us.us.2
  145   %add18.us.us.2 = add nsw i32 %mul.us.us.2, %add18.us.us.1
        ; Unrolled copy 3: column filter_x + 3.
  146   %inc.us.us.2 = or i32 %filter_x.053.us.us, 3
  147   %add13.us.us.3 = add i32 %inc.us.us.2, %res_x.060.us
  148   %arrayidx14.us.us.3 = getelementptr inbounds i16, i16* %tmp5, i32 %inc.us.us.2
  149   %tmp15 = load i16, i16* %arrayidx14.us.us.3, align 2
  150   %conv.us.us.3 = sext i16 %tmp15 to i32
  151   %arrayidx16.us.us.3 = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us.3
  152   %tmp16 = load i16, i16* %arrayidx16.us.us.3, align 2
  153   %conv17.us.us.3 = sext i16 %tmp16 to i32
  154   %mul.us.us.3 = mul nsw i32 %conv17.us.us.3, %conv.us.us.3
  155   %add18.us.us.3 = add nsw i32 %mul.us.us.3, %add18.us.us.2
  156   %inc.us.us.3 = add i32 %filter_x.053.us.us, 4
  157   %niter.nsub.3 = add i32 %niter, -4
  158   %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  159   br i1 %niter.ncmp.3, label %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa, label %for.body12.us.us
  160
  161 for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa: ; preds = %for.body12.us.us
        ; Advance to the next filter row until filter_y reaches %filter_dim.
  162   %inc20.us.us = add nuw i32 %filter_y.056.us.us, 1
  163   %exitcond98 = icmp eq i32 %inc20.us.us, %filter_dim
  164   br i1 %exitcond98, label %for.cond5.for.cond.cleanup7_crit_edge.us, label %for.cond9.preheader.us.us
  165
  166 for.cond5.for.cond.cleanup7_crit_edge.us:         ; preds = %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa
        ; Write the accumulated sum to the output row at column res_x.
  167   %arrayidx23.us = getelementptr inbounds i32, i32* %tmp3, i32 %res_x.060.us
  168   store i32 %add18.us.us.3, i32* %arrayidx23.us, align 4
  169   %add25.us = add nuw i32 %res_x.060.us, 1
  170   %exitcond99 = icmp eq i32 %add25.us, %out_width
  171   br i1 %exitcond99, label %for.cond.cleanup3, label %for.cond9.preheader.us.us.preheader
  172
  173 for.cond.cleanup3:                                ; preds = %for.cond5.for.cond.cleanup7_crit_edge.us
  174   %add28 = add nuw i32 %res_y.093, 1
  175   %exitcond100 = icmp eq i32 %add28, %out_height
  176   br i1 %exitcond100, label %for.cond.cleanup, label %for.cond1.preheader
  177
  178 for.cond.cleanup:                                 ; preds = %for.cond.cleanup3, %entry
  179   ret void
  180 }
 181
 182 ; CHECK-LABEL: mul_8x8
 183 ; CHECK: @ %for.body
 184
 185 ; CHECK-DEFAULT: str{{.*}}, #16]!
 186 ; CHECK-DEFAULT: ldrb{{.*}}, #4]!
 187 ; CHECK-DEFAULT: ldrb{{.*}}, #4]!
 188
 189 ; CHECK-COMPLEX: str{{.*}}, #16]!
 190 ; CHECK-COMPLEX: ldrb{{.*}}, #4]!
 191 ; CHECK-COMPLEX: ldrb{{.*}}, #4]!
 192
 193 ; DISABLED-NOT: ldr{{.*}}]!
 194 ; DISABLED-NOT: str{{.*}}]!
 195
 196 ; CHECK-T2: @ %for.body.epil
 197 ; CHECK-T2: ldrb{{.*}}, #1]!
 198 ; CHECK-T2: ldrb{{.*}}, #1]!
 199 ; CHECK-T2: str{{.*}}, #4]!
 200
  201 define void @mul_8x8(i8* nocapture readonly %A, i8* nocapture readonly %B, i32* nocapture %C, i32 %N) {
      ; Stores C[i] = zext(A[i]) * zext(B[i]) for i in [0, N).
      ; %for.body is the main loop unrolled by four; %for.body.epil handles the
      ; remaining (N & 3) elements one at a time. Returns immediately if N == 0.
  202 entry:
  203   %cmp9 = icmp eq i32 %N, 0
  204   br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader
  205
  206 for.body.preheader:                               ; preds = %entry
        ; Skip the unrolled loop when N - 1 < 3 (unsigned), i.e. fewer than four elements.
  207   %tmp = add i32 %N, -1
  208   %xtraiter = and i32 %N, 3
  209   %tmp1 = icmp ult i32 %tmp, 3
  210   br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
  211
  212 for.body.preheader.new:                           ; preds = %for.body.preheader
        ; Element count processed by the unrolled loop: N with its low two bits removed.
  213   %unroll_iter = sub i32 %N, %xtraiter
  214   br label %for.body
  215
  216 for.cond.cleanup.loopexit.unr-lcssa:              ; preds = %for.body, %for.body.preheader
        ; %i.010.unr is the first index not yet processed; run the epilogue only
        ; if there is a remainder.
  217   %i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
  218   %lcmp.mod = icmp eq i32 %xtraiter, 0
  219   br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
  220
  221 for.body.epil:                                    ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa
        ; Remainder loop: one element per iteration, %epil.iter counts down from %xtraiter.
  222   %i.010.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
  223   %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
  224   %arrayidx.epil = getelementptr inbounds i8, i8* %A, i32 %i.010.epil
  225   %tmp2 = load i8, i8* %arrayidx.epil, align 1
  226   %conv.epil = zext i8 %tmp2 to i32
  227   %arrayidx1.epil = getelementptr inbounds i8, i8* %B, i32 %i.010.epil
  228   %tmp3 = load i8, i8* %arrayidx1.epil, align 1
  229   %conv2.epil = zext i8 %tmp3 to i32
  230   %mul.epil = mul nuw nsw i32 %conv2.epil, %conv.epil
  231   %arrayidx3.epil = getelementptr inbounds i32, i32* %C, i32 %i.010.epil
  232   store i32 %mul.epil, i32* %arrayidx3.epil, align 4
  233   %inc.epil = add nuw i32 %i.010.epil, 1
  234   %epil.iter.sub = add i32 %epil.iter, -1
  235   %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
  236   br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
  237
  238 for.cond.cleanup:                                 ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
  239   ret void
  240
  241 for.body:                                         ; preds = %for.body, %for.body.preheader.new
        ; Main loop, unrolled x4. %i.010 starts at 0 and steps by 4, so the
        ; 'or 1/2/3' values below are i + 1, i + 2 and i + 3.
  242   %i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
  243   %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
        ; Unrolled copy 0: element i.
  244   %arrayidx = getelementptr inbounds i8, i8* %A, i32 %i.010
  245   %tmp4 = load i8, i8* %arrayidx, align 1
  246   %conv = zext i8 %tmp4 to i32
  247   %arrayidx1 = getelementptr inbounds i8, i8* %B, i32 %i.010
  248   %tmp5 = load i8, i8* %arrayidx1, align 1
  249   %conv2 = zext i8 %tmp5 to i32
  250   %mul = mul nuw nsw i32 %conv2, %conv
  251   %arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %i.010
  252   store i32 %mul, i32* %arrayidx3, align 4
        ; Unrolled copy 1: element i + 1.
  253   %inc = or i32 %i.010, 1
  254   %arrayidx.1 = getelementptr inbounds i8, i8* %A, i32 %inc
  255   %tmp6 = load i8, i8* %arrayidx.1, align 1
  256   %conv.1 = zext i8 %tmp6 to i32
  257   %arrayidx1.1 = getelementptr inbounds i8, i8* %B, i32 %inc
  258   %tmp7 = load i8, i8* %arrayidx1.1, align 1
  259   %conv2.1 = zext i8 %tmp7 to i32
  260   %mul.1 = mul nuw nsw i32 %conv2.1, %conv.1
  261   %arrayidx3.1 = getelementptr inbounds i32, i32* %C, i32 %inc
  262   store i32 %mul.1, i32* %arrayidx3.1, align 4
        ; Unrolled copy 2: element i + 2.
  263   %inc.1 = or i32 %i.010, 2
  264   %arrayidx.2 = getelementptr inbounds i8, i8* %A, i32 %inc.1
  265   %tmp8 = load i8, i8* %arrayidx.2, align 1
  266   %conv.2 = zext i8 %tmp8 to i32
  267   %arrayidx1.2 = getelementptr inbounds i8, i8* %B, i32 %inc.1
  268   %tmp9 = load i8, i8* %arrayidx1.2, align 1
  269   %conv2.2 = zext i8 %tmp9 to i32
  270   %mul.2 = mul nuw nsw i32 %conv2.2, %conv.2
  271   %arrayidx3.2 = getelementptr inbounds i32, i32* %C, i32 %inc.1
  272   store i32 %mul.2, i32* %arrayidx3.2, align 4
        ; Unrolled copy 3: element i + 3.
  273   %inc.2 = or i32 %i.010, 3
  274   %arrayidx.3 = getelementptr inbounds i8, i8* %A, i32 %inc.2
  275   %tmp10 = load i8, i8* %arrayidx.3, align 1
  276   %conv.3 = zext i8 %tmp10 to i32
  277   %arrayidx1.3 = getelementptr inbounds i8, i8* %B, i32 %inc.2
  278   %tmp11 = load i8, i8* %arrayidx1.3, align 1
  279   %conv2.3 = zext i8 %tmp11 to i32
  280   %mul.3 = mul nuw nsw i32 %conv2.3, %conv.3
  281   %arrayidx3.3 = getelementptr inbounds i32, i32* %C, i32 %inc.2
  282   store i32 %mul.3, i32* %arrayidx3.3, align 4
        ; Advance the index by 4 and count %niter down by 4 until it reaches 0.
  283   %inc.3 = add i32 %i.010, 4
  284   %niter.nsub.3 = add i32 %niter, -4
  285   %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  286   br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
  287 }
 288
 289 ; CHECK-LABEL: mul_16x8
 290 ; CHECK: @ %for.body
 291
 292 ; CHECK-DEFAULT: str{{.*}}, #16]!
 293 ; CHECK-DEFAULT: ldrsh{{.*}}, #8]!
 294
 295 ; CHECK-COMPLEX: ldrsh{{.*}}, #8]!
 296 ; CHECK-COMPLEX: str{{.*}}, #16]!
 297 ; CHECK-COMPLEX: ldrb{{.*}}, #4]!
 298
 299 ; DISABLED-NOT: ldr{{.*}}]!
 300 ; DISABLED-NOT: str{{.*}}]!
 301
 302 ; CHECK-T2: @ %for.body.epil
 303 ; CHECK-T2: ldrsh{{.*}}, #2]!
 304 ; CHECK-T2: ldrb{{.*}}, #1]!
 305 ; CHECK-T2: str{{.*}}, #4]!
 306
  307 define void @mul_16x8(i16* nocapture readonly %A, i8* nocapture readonly %B, i32* nocapture %C, i32 %N) {
      ; Stores C[i] = sext(A[i]) * zext(B[i]) for i in [0, N), with A holding
      ; i16 elements and B holding i8 elements.
      ; %for.body is the main loop unrolled by four; %for.body.epil handles the
      ; remaining (N & 3) elements one at a time. Returns immediately if N == 0.
  308 entry:
  309   %cmp9 = icmp eq i32 %N, 0
  310   br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader
  311
  312 for.body.preheader:                               ; preds = %entry
        ; Skip the unrolled loop when N - 1 < 3 (unsigned), i.e. fewer than four elements.
  313   %tmp = add i32 %N, -1
  314   %xtraiter = and i32 %N, 3
  315   %tmp1 = icmp ult i32 %tmp, 3
  316   br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
  317
  318 for.body.preheader.new:                           ; preds = %for.body.preheader
        ; Element count processed by the unrolled loop: N with its low two bits removed.
  319   %unroll_iter = sub i32 %N, %xtraiter
  320   br label %for.body
  321
  322 for.cond.cleanup.loopexit.unr-lcssa:              ; preds = %for.body, %for.body.preheader
        ; %i.010.unr is the first index not yet processed; run the epilogue only
        ; if there is a remainder.
  323   %i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
  324   %lcmp.mod = icmp eq i32 %xtraiter, 0
  325   br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
  326
  327 for.body.epil:                                    ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa
        ; Remainder loop: one element per iteration, %epil.iter counts down from %xtraiter.
  328   %i.010.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
  329   %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
  330   %arrayidx.epil = getelementptr inbounds i16, i16* %A, i32 %i.010.epil
  331   %tmp2 = load i16, i16* %arrayidx.epil, align 2
  332   %conv.epil = sext i16 %tmp2 to i32
  333   %arrayidx1.epil = getelementptr inbounds i8, i8* %B, i32 %i.010.epil
  334   %tmp3 = load i8, i8* %arrayidx1.epil, align 1
  335   %conv2.epil = zext i8 %tmp3 to i32
  336   %mul.epil = mul nsw i32 %conv2.epil, %conv.epil
  337   %arrayidx3.epil = getelementptr inbounds i32, i32* %C, i32 %i.010.epil
  338   store i32 %mul.epil, i32* %arrayidx3.epil, align 4
  339   %inc.epil = add nuw i32 %i.010.epil, 1
  340   %epil.iter.sub = add i32 %epil.iter, -1
  341   %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
  342   br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
  343
  344 for.cond.cleanup:                                 ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
  345   ret void
  346
  347 for.body:                                         ; preds = %for.body, %for.body.preheader.new
        ; Main loop, unrolled x4. %i.010 starts at 0 and steps by 4, so the
        ; 'or 1/2/3' values below are i + 1, i + 2 and i + 3.
  348   %i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
  349   %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
        ; Unrolled copy 0: element i.
  350   %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.010
  351   %tmp4 = load i16, i16* %arrayidx, align 2
  352   %conv = sext i16 %tmp4 to i32
  353   %arrayidx1 = getelementptr inbounds i8, i8* %B, i32 %i.010
  354   %tmp5 = load i8, i8* %arrayidx1, align 1
  355   %conv2 = zext i8 %tmp5 to i32
  356   %mul = mul nsw i32 %conv2, %conv
  357   %arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %i.010
  358   store i32 %mul, i32* %arrayidx3, align 4
        ; Unrolled copy 1: element i + 1.
  359   %inc = or i32 %i.010, 1
  360   %arrayidx.1 = getelementptr inbounds i16, i16* %A, i32 %inc
  361   %tmp6 = load i16, i16* %arrayidx.1, align 2
  362   %conv.1 = sext i16 %tmp6 to i32
  363   %arrayidx1.1 = getelementptr inbounds i8, i8* %B, i32 %inc
  364   %tmp7 = load i8, i8* %arrayidx1.1, align 1
  365   %conv2.1 = zext i8 %tmp7 to i32
  366   %mul.1 = mul nsw i32 %conv2.1, %conv.1
  367   %arrayidx3.1 = getelementptr inbounds i32, i32* %C, i32 %inc
  368   store i32 %mul.1, i32* %arrayidx3.1, align 4
        ; Unrolled copy 2: element i + 2.
  369   %inc.1 = or i32 %i.010, 2
  370   %arrayidx.2 = getelementptr inbounds i16, i16* %A, i32 %inc.1
  371   %tmp8 = load i16, i16* %arrayidx.2, align 2
  372   %conv.2 = sext i16 %tmp8 to i32
  373   %arrayidx1.2 = getelementptr inbounds i8, i8* %B, i32 %inc.1
  374   %tmp9 = load i8, i8* %arrayidx1.2, align 1
  375   %conv2.2 = zext i8 %tmp9 to i32
  376   %mul.2 = mul nsw i32 %conv2.2, %conv.2
  377   %arrayidx3.2 = getelementptr inbounds i32, i32* %C, i32 %inc.1
  378   store i32 %mul.2, i32* %arrayidx3.2, align 4
        ; Unrolled copy 3: element i + 3.
  379   %inc.2 = or i32 %i.010, 3
  380   %arrayidx.3 = getelementptr inbounds i16, i16* %A, i32 %inc.2
  381   %tmp10 = load i16, i16* %arrayidx.3, align 2
  382   %conv.3 = sext i16 %tmp10 to i32
  383   %arrayidx1.3 = getelementptr inbounds i8, i8* %B, i32 %inc.2
  384   %tmp11 = load i8, i8* %arrayidx1.3, align 1
  385   %conv2.3 = zext i8 %tmp11 to i32
  386   %mul.3 = mul nsw i32 %conv2.3, %conv.3
  387   %arrayidx3.3 = getelementptr inbounds i32, i32* %C, i32 %inc.2
  388   store i32 %mul.3, i32* %arrayidx3.3, align 4
        ; Advance the index by 4 and count %niter down by 4 until it reaches 0.
  389   %inc.3 = add i32 %i.010, 4
  390   %niter.nsub.3 = add i32 %niter, -4
  391   %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  392   br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
  393 }
 394
 395 ; CHECK-LABEL: mul_16x16
 396 ; CHECK: @ %for.body
 397
 398 ; TODO: pre-indexed loads
 399 ; CHECK-DEFAULT-NOT: ldrsh{{.*}}]!
 400 ; CHECK-DEFAULT: str{{.*}}, #16]!
 401 ; CHECK-DEFAULT-NOT: ldrsh{{.*}}]!
 402
 403 ; CHECK-COMPLEX: ldrsh{{.*}}]!
 404 ; CHECK-COMPLEX: ldrsh{{.*}}]!
 405 ; CHECK-COMPLEX: str{{.*}}]!
 406
 407 ; DISABLED-NOT: ldr{{.*}}]!
 408 ; DISABLED-NOT: str{{.*}}]!
 409
 410 ; CHECK-T2: @ %for.body.epil
 411 ; CHECK-T2: ldrsh{{.*}}, #2]!
 412 ; CHECK-T2: ldrsh{{.*}}, #2]!
 413 ; CHECK-T2: str{{.*}}, #4]!
 414
  415 define void @mul_16x16(i16* nocapture readonly %A, i16* nocapture readonly %B, i32* nocapture %C, i32 %N) {
      ; Stores C[i] = sext(A[i]) * sext(B[i]) for i in [0, N), with both A and B
      ; holding i16 elements.
      ; %for.body is the main loop unrolled by four; %for.body.epil handles the
      ; remaining (N & 3) elements one at a time. Returns immediately if N == 0.
  416 entry:
  417   %cmp9 = icmp eq i32 %N, 0
  418   br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader
  419
  420 for.body.preheader:                               ; preds = %entry
        ; Skip the unrolled loop when N - 1 < 3 (unsigned), i.e. fewer than four elements.
  421   %tmp = add i32 %N, -1
  422   %xtraiter = and i32 %N, 3
  423   %tmp1 = icmp ult i32 %tmp, 3
  424   br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
  425
  426 for.body.preheader.new:                           ; preds = %for.body.preheader
        ; Element count processed by the unrolled loop: N with its low two bits removed.
  427   %unroll_iter = sub i32 %N, %xtraiter
  428   br label %for.body
  429
  430 for.cond.cleanup.loopexit.unr-lcssa:              ; preds = %for.body, %for.body.preheader
        ; %i.010.unr is the first index not yet processed; run the epilogue only
        ; if there is a remainder.
  431   %i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
  432   %lcmp.mod = icmp eq i32 %xtraiter, 0
  433   br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
  434
  435 for.body.epil:                                    ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa
        ; Remainder loop: one element per iteration, %epil.iter counts down from %xtraiter.
  436   %i.010.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
  437   %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
  438   %arrayidx.epil = getelementptr inbounds i16, i16* %A, i32 %i.010.epil
  439   %tmp2 = load i16, i16* %arrayidx.epil, align 2
  440   %conv.epil = sext i16 %tmp2 to i32
  441   %arrayidx1.epil = getelementptr inbounds i16, i16* %B, i32 %i.010.epil
  442   %tmp3 = load i16, i16* %arrayidx1.epil, align 2
  443   %conv2.epil = sext i16 %tmp3 to i32
  444   %mul.epil = mul nsw i32 %conv2.epil, %conv.epil
  445   %arrayidx3.epil = getelementptr inbounds i32, i32* %C, i32 %i.010.epil
  446   store i32 %mul.epil, i32* %arrayidx3.epil, align 4
  447   %inc.epil = add nuw i32 %i.010.epil, 1
  448   %epil.iter.sub = add i32 %epil.iter, -1
  449   %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
  450   br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
  451
  452 for.cond.cleanup:                                 ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
  453   ret void
  454
  455 for.body:                                         ; preds = %for.body, %for.body.preheader.new
        ; Main loop, unrolled x4. %i.010 starts at 0 and steps by 4, so the
        ; 'or 1/2/3' values below are i + 1, i + 2 and i + 3.
  456   %i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
  457   %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
        ; Unrolled copy 0: element i.
  458   %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.010
  459   %tmp4 = load i16, i16* %arrayidx, align 2
  460   %conv = sext i16 %tmp4 to i32
  461   %arrayidx1 = getelementptr inbounds i16, i16* %B, i32 %i.010
  462   %tmp5 = load i16, i16* %arrayidx1, align 2
  463   %conv2 = sext i16 %tmp5 to i32
  464   %mul = mul nsw i32 %conv2, %conv
  465   %arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %i.010
  466   store i32 %mul, i32* %arrayidx3, align 4
        ; Unrolled copy 1: element i + 1.
  467   %inc = or i32 %i.010, 1
  468   %arrayidx.1 = getelementptr inbounds i16, i16* %A, i32 %inc
  469   %tmp6 = load i16, i16* %arrayidx.1, align 2
  470   %conv.1 = sext i16 %tmp6 to i32
  471   %arrayidx1.1 = getelementptr inbounds i16, i16* %B, i32 %inc
  472   %tmp7 = load i16, i16* %arrayidx1.1, align 2
  473   %conv2.1 = sext i16 %tmp7 to i32
  474   %mul.1 = mul nsw i32 %conv2.1, %conv.1
  475   %arrayidx3.1 = getelementptr inbounds i32, i32* %C, i32 %inc
  476   store i32 %mul.1, i32* %arrayidx3.1, align 4
        ; Unrolled copy 2: element i + 2.
  477   %inc.1 = or i32 %i.010, 2
  478   %arrayidx.2 = getelementptr inbounds i16, i16* %A, i32 %inc.1
  479   %tmp8 = load i16, i16* %arrayidx.2, align 2
  480   %conv.2 = sext i16 %tmp8 to i32
  481   %arrayidx1.2 = getelementptr inbounds i16, i16* %B, i32 %inc.1
  482   %tmp9 = load i16, i16* %arrayidx1.2, align 2
  483   %conv2.2 = sext i16 %tmp9 to i32
  484   %mul.2 = mul nsw i32 %conv2.2, %conv.2
  485   %arrayidx3.2 = getelementptr inbounds i32, i32* %C, i32 %inc.1
  486   store i32 %mul.2, i32* %arrayidx3.2, align 4
        ; Unrolled copy 3: element i + 3.
  487   %inc.2 = or i32 %i.010, 3
  488   %arrayidx.3 = getelementptr inbounds i16, i16* %A, i32 %inc.2
  489   %tmp10 = load i16, i16* %arrayidx.3, align 2
  490   %conv.3 = sext i16 %tmp10 to i32
  491   %arrayidx1.3 = getelementptr inbounds i16, i16* %B, i32 %inc.2
  492   %tmp11 = load i16, i16* %arrayidx1.3, align 2
  493   %conv2.3 = sext i16 %tmp11 to i32
  494   %mul.3 = mul nsw i32 %conv2.3, %conv.3
  495   %arrayidx3.3 = getelementptr inbounds i32, i32* %C, i32 %inc.2
  496   store i32 %mul.3, i32* %arrayidx3.3, align 4
        ; Advance the index by 4 and count %niter down by 4 until it reaches 0.
  497   %inc.3 = add i32 %i.010, 4
  498   %niter.nsub.3 = add i32 %niter, -4
  499   %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  500   br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
  501 }
 502
 503 ; CHECK-LABEL: mul_8x8_2d
 504 ; CHECK: @ %for.body4.us
 505
 506 ; CHECK-DEFAULT: ldr{{.*}}, #16]!
 507 ; CHECK-DEFAULT: ldrb{{.*}}, #4]!
 508
 509 ; DISABLED-NOT: ldr{{.*}}]!
 510 ; DISABLED-NOT: str{{.*}}]!
 511
 512 ; CHECK-T2: @ %for.body4.us.epil
 513 ; CHECK-T2: ldrb{{.*}}, #1]!
 514 ; CHECK-T2: ldr{{.*}}, #4]!
 515
  516 define void @mul_8x8_2d(i8* nocapture readonly %A, i8** nocapture readonly %B, i32** nocapture readonly %C, i32 %N, i32 %M) {
      ; For i in [0, N) and j in [0, M): C[i][j] += zext(A[i]) * zext(B[i][j]),
      ; where B[i] and C[i] are row pointers loaded from %B and %C.
      ; The inner j loop (%for.body4.us) is unrolled by four, with
      ; %for.body4.us.epil handling the remaining (M & 3) elements.
      ; Returns immediately if N == 0 or M == 0.
  517 entry:
  518   %cmp24 = icmp eq i32 %N, 0
  519   %cmp222 = icmp eq i32 %M, 0
  520   %or.cond = or i1 %cmp24, %cmp222
  521   br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader
  522
  523 for.cond1.preheader.us.preheader:                 ; preds = %entry
        ; Values computed once for every row:
        ;   %tmp1        - true when M - 1 < 3 (unsigned): skip the unrolled loop
        ;   %unroll_iter - element count handled by the unrolled loop
        ;   %lcmp.mod    - true when there is no remainder to process
  524   %tmp = add i32 %M, -1
  525   %xtraiter = and i32 %M, 3
  526   %tmp1 = icmp ult i32 %tmp, 3
  527   %unroll_iter = sub i32 %M, %xtraiter
  528   %lcmp.mod = icmp eq i32 %xtraiter, 0
  529   br label %for.cond1.preheader.us
  530
  531 for.cond1.preheader.us:                           ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
        ; Outer loop over rows: compute &A[i] and load the row pointers B[i] and C[i].
  532   %i.025.us = phi i32 [ %inc11.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
  533   %arrayidx.us = getelementptr inbounds i8, i8* %A, i32 %i.025.us
  534   %arrayidx5.us = getelementptr inbounds i8*, i8** %B, i32 %i.025.us
  535   %arrayidx8.us = getelementptr inbounds i32*, i32** %C, i32 %i.025.us
  536   %.pre = load i8*, i8** %arrayidx5.us, align 4
  537   %.pre30 = load i32*, i32** %arrayidx8.us, align 4
  538   br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
  539
  540 for.body4.us:                                     ; preds = %for.body4.us, %for.cond1.preheader.us
        ; Inner loop, unrolled x4. %j.023.us starts at 0 and steps by 4, so the
        ; 'or 1/2/3' values below are j + 1, j + 2 and j + 3. A[i] is reloaded
        ; from %arrayidx.us in every unrolled copy.
  541   %j.023.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ]
  542   %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ]
        ; Unrolled copy 0: column j.
  543   %tmp2 = load i8, i8* %arrayidx.us, align 1
  544   %conv.us = zext i8 %tmp2 to i32
  545   %arrayidx6.us = getelementptr inbounds i8, i8* %.pre, i32 %j.023.us
  546   %tmp3 = load i8, i8* %arrayidx6.us, align 1
  547   %conv7.us = zext i8 %tmp3 to i32
  548   %mul.us = mul nuw nsw i32 %conv7.us, %conv.us
  549   %arrayidx9.us = getelementptr inbounds i32, i32* %.pre30, i32 %j.023.us
  550   %tmp4 = load i32, i32* %arrayidx9.us, align 4
  551   %add.us = add nsw i32 %tmp4, %mul.us
  552   store i32 %add.us, i32* %arrayidx9.us, align 4
        ; Unrolled copy 1: column j + 1.
  553   %inc.us = or i32 %j.023.us, 1
  554   %tmp5 = load i8, i8* %arrayidx.us, align 1
  555   %conv.us.1 = zext i8 %tmp5 to i32
  556   %arrayidx6.us.1 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us
  557   %tmp6 = load i8, i8* %arrayidx6.us.1, align 1
  558   %conv7.us.1 = zext i8 %tmp6 to i32
  559   %mul.us.1 = mul nuw nsw i32 %conv7.us.1, %conv.us.1
  560   %arrayidx9.us.1 = getelementptr inbounds i32, i32* %.pre30, i32 %inc.us
  561   %tmp7 = load i32, i32* %arrayidx9.us.1, align 4
  562   %add.us.1 = add nsw i32 %tmp7, %mul.us.1
  563   store i32 %add.us.1, i32* %arrayidx9.us.1, align 4
        ; Unrolled copy 2: column j + 2.
  564   %inc.us.1 = or i32 %j.023.us, 2
  565   %tmp8 = load i8, i8* %arrayidx.us, align 1
  566   %conv.us.2 = zext i8 %tmp8 to i32
  567   %arrayidx6.us.2 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us.1
  568   %tmp9 = load i8, i8* %arrayidx6.us.2, align 1
  569   %conv7.us.2 = zext i8 %tmp9 to i32
  570   %mul.us.2 = mul nuw nsw i32 %conv7.us.2, %conv.us.2
  571   %arrayidx9.us.2 = getelementptr inbounds i32, i32* %.pre30, i32 %inc.us.1
  572   %tmp10 = load i32, i32* %arrayidx9.us.2, align 4
  573   %add.us.2 = add nsw i32 %tmp10, %mul.us.2
  574   store i32 %add.us.2, i32* %arrayidx9.us.2, align 4
        ; Unrolled copy 3: column j + 3.
  575   %inc.us.2 = or i32 %j.023.us, 3
  576   %tmp11 = load i8, i8* %arrayidx.us, align 1
  577   %conv.us.3 = zext i8 %tmp11 to i32
  578   %arrayidx6.us.3 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us.2
  579   %tmp12 = load i8, i8* %arrayidx6.us.3, align 1
  580   %conv7.us.3 = zext i8 %tmp12 to i32
  581   %mul.us.3 = mul nuw nsw i32 %conv7.us.3, %conv.us.3
  582   %arrayidx9.us.3 = getelementptr inbounds i32, i32* %.pre30, i32 %inc.us.2
  583   %tmp13 = load i32, i32* %arrayidx9.us.3, align 4
  584   %add.us.3 = add nsw i32 %tmp13, %mul.us.3
  585   store i32 %add.us.3, i32* %arrayidx9.us.3, align 4
        ; Advance j by 4 and count %niter down by 4 until it reaches 0.
  586   %inc.us.3 = add i32 %j.023.us, 4
  587   %niter.nsub.3 = add i32 %niter, -4
  588   %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  589   br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
  590
  591 for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us
        ; %j.023.us.unr is the first column not yet processed in this row.
  592   %j.023.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ]
  593   br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
  594
  595 for.body4.us.epil:                                ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
        ; Remainder loop: one column per iteration, %epil.iter counts down from %xtraiter.
  596   %j.023.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.023.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
  597   %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
  598   %tmp14 = load i8, i8* %arrayidx.us, align 1
  599   %conv.us.epil = zext i8 %tmp14 to i32
  600   %arrayidx6.us.epil = getelementptr inbounds i8, i8* %.pre, i32 %j.023.us.epil
  601   %tmp15 = load i8, i8* %arrayidx6.us.epil, align 1
  602   %conv7.us.epil = zext i8 %tmp15 to i32
  603   %mul.us.epil = mul nuw nsw i32 %conv7.us.epil, %conv.us.epil
  604   %arrayidx9.us.epil = getelementptr inbounds i32, i32* %.pre30, i32 %j.023.us.epil
  605   %tmp16 = load i32, i32* %arrayidx9.us.epil, align 4
  606   %add.us.epil = add nsw i32 %tmp16, %mul.us.epil
  607   store i32 %add.us.epil, i32* %arrayidx9.us.epil, align 4
  608   %inc.us.epil = add nuw i32 %j.023.us.epil, 1
  609   %epil.iter.sub = add i32 %epil.iter, -1
  610   %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
  611   br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
  612
  613 for.cond1.for.cond.cleanup3_crit_edge.us:         ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
        ; Advance to the next row until i reaches %N.
  614   %inc11.us = add nuw i32 %i.025.us, 1
  615   %exitcond28 = icmp eq i32 %inc11.us, %N
  616   br i1 %exitcond28, label %for.cond.cleanup, label %for.cond1.preheader.us
  617
  618 for.cond.cleanup:                                 ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
  619   ret void
  620 }
 621
 622 ; CHECK-LABEL: mul_16x16_2d
 623 ; CHECK: @ %for.body4.us
 624
 625 ; CHECK-DEFAULT: ldr{{.*}}, #16]!
 626 ; CHECK-DEFAULT: ldrsh{{.*}}, #8]!
 627
 628 ; DISABLED-NOT: ldr{{.*}}]!
 629 ; DISABLED-NOT: str{{.*}}]!
 630
 631 ; CHECK-T2: @ %for.body4.us.epil
 632 ; CHECK-T2: ldrsh{{.*}}, #2]!
 633 ; CHECK-T2: ldr{{.*}}, #4]!
 634
 635 define void @mul_16x16_2d(i16* nocapture readonly %A, i16** nocapture readonly %B, i32** nocapture readonly %C, i32 %N, i32 %M) { ; for i < N, j < M: C[i][j] += sext(A[i]) * sext(B[i][j]); inner loop unrolled by 4 plus a remainder loop
 636 entry:
 637   %cmp24 = icmp eq i32 %N, 0
 638   %cmp222 = icmp eq i32 %M, 0
 639   %or.cond = or i1 %cmp24, %cmp222 ; nothing to do when N == 0 or M == 0
 640   br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader
 641
 642 for.cond1.preheader.us.preheader:                 ; preds = %entry
 643   %tmp = add i32 %M, -1
 644   %xtraiter = and i32 %M, 3 ; M % 4: trip count of the remainder (.epil) loop
 645   %tmp1 = icmp ult i32 %tmp, 3 ; M - 1 < 3, i.e. M < 4 (M != 0 here): skip the unrolled loop
 646   %unroll_iter = sub i32 %M, %xtraiter ; M rounded down to a multiple of 4
 647   %lcmp.mod = icmp eq i32 %xtraiter, 0 ; true when no remainder iterations are needed
 648   br label %for.cond1.preheader.us
 649
 650 for.cond1.preheader.us:                           ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
 651   %i.025.us = phi i32 [ %inc11.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ] ; outer index i
 652   %arrayidx.us = getelementptr inbounds i16, i16* %A, i32 %i.025.us
 653   %tmp2 = load i16, i16* %arrayidx.us, align 2 ; A[i], loaded once per outer iteration
 654   %conv.us = sext i16 %tmp2 to i32
 655   %arrayidx5.us = getelementptr inbounds i16*, i16** %B, i32 %i.025.us
 656   %tmp3 = load i16*, i16** %arrayidx5.us, align 4 ; row pointer B[i]
 657   %arrayidx8.us = getelementptr inbounds i32*, i32** %C, i32 %i.025.us
 658   %tmp4 = load i32*, i32** %arrayidx8.us, align 4 ; row pointer C[i]
 659   br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
 660
 661 for.body4.us:                                     ; preds = %for.body4.us, %for.cond1.preheader.us
 662   %j.023.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ] ; inner index j, advances from 0 in steps of 4
 663   %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ] ; remaining unrolled iterations, counts down by 4
 664   %arrayidx6.us = getelementptr inbounds i16, i16* %tmp3, i32 %j.023.us
 665   %tmp5 = load i16, i16* %arrayidx6.us, align 2
 666   %conv7.us = sext i16 %tmp5 to i32
 667   %mul.us = mul nsw i32 %conv7.us, %conv.us
 668   %arrayidx9.us = getelementptr inbounds i32, i32* %tmp4, i32 %j.023.us
 669   %tmp6 = load i32, i32* %arrayidx9.us, align 4
 670   %add.us = add nsw i32 %tmp6, %mul.us
 671   store i32 %add.us, i32* %arrayidx9.us, align 4 ; C[i][j] += A[i] * B[i][j]
 672   %inc.us = or i32 %j.023.us, 1 ; j + 1: the low two bits of j are clear, so 'or' acts as 'add'
 673   %arrayidx6.us.1 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us
 674   %tmp7 = load i16, i16* %arrayidx6.us.1, align 2
 675   %conv7.us.1 = sext i16 %tmp7 to i32
 676   %mul.us.1 = mul nsw i32 %conv7.us.1, %conv.us
 677   %arrayidx9.us.1 = getelementptr inbounds i32, i32* %tmp4, i32 %inc.us
 678   %tmp8 = load i32, i32* %arrayidx9.us.1, align 4
 679   %add.us.1 = add nsw i32 %tmp8, %mul.us.1
 680   store i32 %add.us.1, i32* %arrayidx9.us.1, align 4 ; element j + 1
 681   %inc.us.1 = or i32 %j.023.us, 2 ; j + 2
 682   %arrayidx6.us.2 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us.1
 683   %tmp9 = load i16, i16* %arrayidx6.us.2, align 2
 684   %conv7.us.2 = sext i16 %tmp9 to i32
 685   %mul.us.2 = mul nsw i32 %conv7.us.2, %conv.us
 686   %arrayidx9.us.2 = getelementptr inbounds i32, i32* %tmp4, i32 %inc.us.1
 687   %tmp10 = load i32, i32* %arrayidx9.us.2, align 4
 688   %add.us.2 = add nsw i32 %tmp10, %mul.us.2
 689   store i32 %add.us.2, i32* %arrayidx9.us.2, align 4 ; element j + 2
 690   %inc.us.2 = or i32 %j.023.us, 3 ; j + 3
 691   %arrayidx6.us.3 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us.2
 692   %tmp11 = load i16, i16* %arrayidx6.us.3, align 2
 693   %conv7.us.3 = sext i16 %tmp11 to i32
 694   %mul.us.3 = mul nsw i32 %conv7.us.3, %conv.us
 695   %arrayidx9.us.3 = getelementptr inbounds i32, i32* %tmp4, i32 %inc.us.2
 696   %tmp12 = load i32, i32* %arrayidx9.us.3, align 4
 697   %add.us.3 = add nsw i32 %tmp12, %mul.us.3
 698   store i32 %add.us.3, i32* %arrayidx9.us.3, align 4 ; element j + 3
 699   %inc.us.3 = add i32 %j.023.us, 4
 700   %niter.nsub.3 = add i32 %niter, -4
 701   %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
 702   br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
 703
 704 for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us
 705   %j.023.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ] ; first j not handled by the unrolled loop
 706   br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
 707
 708 for.body4.us.epil:                                ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
 709   %j.023.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.023.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
 710   %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ] ; remaining remainder iterations
 711   %arrayidx6.us.epil = getelementptr inbounds i16, i16* %tmp3, i32 %j.023.us.epil
 712   %tmp13 = load i16, i16* %arrayidx6.us.epil, align 2
 713   %conv7.us.epil = sext i16 %tmp13 to i32
 714   %mul.us.epil = mul nsw i32 %conv7.us.epil, %conv.us
 715   %arrayidx9.us.epil = getelementptr inbounds i32, i32* %tmp4, i32 %j.023.us.epil
 716   %tmp14 = load i32, i32* %arrayidx9.us.epil, align 4
 717   %add.us.epil = add nsw i32 %tmp14, %mul.us.epil
 718   store i32 %add.us.epil, i32* %arrayidx9.us.epil, align 4 ; one element per remainder iteration
 719   %inc.us.epil = add nuw i32 %j.023.us.epil, 1
 720   %epil.iter.sub = add i32 %epil.iter, -1
 721   %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
 722   br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
 723
 724 for.cond1.for.cond.cleanup3_crit_edge.us:         ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
 725   %inc11.us = add nuw i32 %i.025.us, 1
 726   %exitcond28 = icmp eq i32 %inc11.us, %N ; outer loop ends when i reaches N
 727   br i1 %exitcond28, label %for.cond.cleanup, label %for.cond1.preheader.us
 728
 729 for.cond.cleanup:                                 ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
 730   ret void
 731 }
 732
 733 ; CHECK-LABEL: mac_8x8_2d
 734 ; CHECK: @ %for.body4.us
 735
 736 ; TODO: Both input arrays could use pre-indexed loads.
 737 ; TODO: pre-indexed stores.
 738 ; CHECK-DEFAULT: ldrb{{.*}}, #4]!
 739 ; CHECK-DEFAULT-NOT: ldr{{.*}}]!
 740 ; CHECK-DEFAULT-NOT: str{{.*}}]!
 741
 742 ; TODO: Increased complexity shouldn't prevent indexed accesses.
 743 ; CHECK-COMPLEX-NOT: ldr{{.*}}]!
 744 ; CHECK-COMPLEX-NOT: str{{.*}}]!
 745
 746 ; DISABLED-NOT: ldr{{.*}}]!
 747 ; DISABLED-NOT: str{{.*}}]!
 748
 749 ; CHECK-T2: @ %for.body4.us.epil
 750 ; CHECK-T2: ldrb{{.*}}, #1]!
 751
 752 define void @mac_8x8_2d(i8* nocapture readonly %A, i8** nocapture readonly %B, i32* nocapture %C, i32 %N, i32 %M) { ; for i < N, j < M: C[i] += zext(A[i]) * zext(B[i][j]); inner loop unrolled by 4 plus a remainder loop
 753 entry:
 754   %cmp22 = icmp eq i32 %N, 0
 755   %cmp220 = icmp eq i32 %M, 0
 756   %or.cond = or i1 %cmp22, %cmp220 ; nothing to do when N == 0 or M == 0
 757   br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader
 758
 759 for.cond1.preheader.us.preheader:                 ; preds = %entry
 760   %tmp = add i32 %M, -1
 761   %xtraiter = and i32 %M, 3 ; M % 4: trip count of the remainder (.epil) loop
 762   %tmp1 = icmp ult i32 %tmp, 3 ; M - 1 < 3, i.e. M < 4 (M != 0 here): skip the unrolled loop
 763   %unroll_iter = sub i32 %M, %xtraiter ; M rounded down to a multiple of 4
 764   %lcmp.mod = icmp eq i32 %xtraiter, 0 ; true when no remainder iterations are needed
 765   br label %for.cond1.preheader.us
 766
 767 for.cond1.preheader.us:                           ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
 768   %i.023.us = phi i32 [ %inc10.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ] ; outer index i
 769   %arrayidx.us = getelementptr inbounds i8, i8* %A, i32 %i.023.us ; &A[i]
 770   %arrayidx5.us = getelementptr inbounds i8*, i8** %B, i32 %i.023.us
 771   %arrayidx8.us = getelementptr inbounds i32, i32* %C, i32 %i.023.us ; &C[i]
 772   %.pre = load i8*, i8** %arrayidx5.us, align 4 ; row pointer B[i]
 773   %.pre28 = load i32, i32* %arrayidx8.us, align 4 ; initial value of C[i]
 774   br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
 775
 776 for.body4.us:                                     ; preds = %for.body4.us, %for.cond1.preheader.us
 777   %tmp2 = phi i32 [ %add.us.3, %for.body4.us ], [ %.pre28, %for.cond1.preheader.us ] ; running sum carried between iterations
 778   %j.021.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ] ; inner index j, advances from 0 in steps of 4
 779   %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ] ; remaining unrolled iterations, counts down by 4
 780   %tmp3 = load i8, i8* %arrayidx.us, align 1 ; A[i] is reloaded before every multiply in this loop
 781   %conv.us = zext i8 %tmp3 to i32
 782   %arrayidx6.us = getelementptr inbounds i8, i8* %.pre, i32 %j.021.us
 783   %tmp4 = load i8, i8* %arrayidx6.us, align 1
 784   %conv7.us = zext i8 %tmp4 to i32
 785   %mul.us = mul nuw nsw i32 %conv7.us, %conv.us
 786   %add.us = add nsw i32 %mul.us, %tmp2
 787   store i32 %add.us, i32* %arrayidx8.us, align 4 ; the sum is written back to C[i] after every step
 788   %inc.us = or i32 %j.021.us, 1 ; j + 1: the low two bits of j are clear, so 'or' acts as 'add'
 789   %tmp5 = load i8, i8* %arrayidx.us, align 1
 790   %conv.us.1 = zext i8 %tmp5 to i32
 791   %arrayidx6.us.1 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us
 792   %tmp6 = load i8, i8* %arrayidx6.us.1, align 1
 793   %conv7.us.1 = zext i8 %tmp6 to i32
 794   %mul.us.1 = mul nuw nsw i32 %conv7.us.1, %conv.us.1
 795   %add.us.1 = add nsw i32 %mul.us.1, %add.us
 796   store i32 %add.us.1, i32* %arrayidx8.us, align 4
 797   %inc.us.1 = or i32 %j.021.us, 2 ; j + 2
 798   %tmp7 = load i8, i8* %arrayidx.us, align 1
 799   %conv.us.2 = zext i8 %tmp7 to i32
 800   %arrayidx6.us.2 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us.1
 801   %tmp8 = load i8, i8* %arrayidx6.us.2, align 1
 802   %conv7.us.2 = zext i8 %tmp8 to i32
 803   %mul.us.2 = mul nuw nsw i32 %conv7.us.2, %conv.us.2
 804   %add.us.2 = add nsw i32 %mul.us.2, %add.us.1
 805   store i32 %add.us.2, i32* %arrayidx8.us, align 4
 806   %inc.us.2 = or i32 %j.021.us, 3 ; j + 3
 807   %tmp9 = load i8, i8* %arrayidx.us, align 1
 808   %conv.us.3 = zext i8 %tmp9 to i32
 809   %arrayidx6.us.3 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us.2
 810   %tmp10 = load i8, i8* %arrayidx6.us.3, align 1
 811   %conv7.us.3 = zext i8 %tmp10 to i32
 812   %mul.us.3 = mul nuw nsw i32 %conv7.us.3, %conv.us.3
 813   %add.us.3 = add nsw i32 %mul.us.3, %add.us.2
 814   store i32 %add.us.3, i32* %arrayidx8.us, align 4
 815   %inc.us.3 = add i32 %j.021.us, 4
 816   %niter.nsub.3 = add i32 %niter, -4
 817   %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
 818   br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
 819
 820 for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us
 821   %.unr = phi i32 [ %.pre28, %for.cond1.preheader.us ], [ %add.us.3, %for.body4.us ] ; sum so far, handed to the remainder loop
 822   %j.021.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ] ; first j not handled by the unrolled loop
 823   br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
 824
 825 for.body4.us.epil:                                ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
 826   %tmp11 = phi i32 [ %add.us.epil, %for.body4.us.epil ], [ %.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
 827   %j.021.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.021.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
 828   %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ] ; remaining remainder iterations
 829   %tmp12 = load i8, i8* %arrayidx.us, align 1
 830   %conv.us.epil = zext i8 %tmp12 to i32
 831   %arrayidx6.us.epil = getelementptr inbounds i8, i8* %.pre, i32 %j.021.us.epil
 832   %tmp13 = load i8, i8* %arrayidx6.us.epil, align 1
 833   %conv7.us.epil = zext i8 %tmp13 to i32
 834   %mul.us.epil = mul nuw nsw i32 %conv7.us.epil, %conv.us.epil
 835   %add.us.epil = add nsw i32 %mul.us.epil, %tmp11
 836   store i32 %add.us.epil, i32* %arrayidx8.us, align 4 ; one multiply-accumulate per remainder iteration
 837   %inc.us.epil = add nuw i32 %j.021.us.epil, 1
 838   %epil.iter.sub = add i32 %epil.iter, -1
 839   %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
 840   br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
 841
 842 for.cond1.for.cond.cleanup3_crit_edge.us:         ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
 843   %inc10.us = add nuw i32 %i.023.us, 1
 844   %exitcond26 = icmp eq i32 %inc10.us, %N ; outer loop ends when i reaches N
 845   br i1 %exitcond26, label %for.cond.cleanup, label %for.cond1.preheader.us
 846
 847 for.cond.cleanup:                                 ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
 848   ret void
 849 }
 850
 851 ; CHECK-LABEL: mac_16x16_2d
 852 ; CHECK: @ %for.body4.us
 853
 854 ; TODO: pre-indexed loads for both input arrays.
 855 ; CHECK-DEFAULT: ldrsh{{.*}}, #8]!
 856 ; CHECK-DEFAULT-NOT: ldr{{.*}}]!
 857
 858 ; TODO: increased complexity should lead to better codegen.
 859 ; CHECK-COMPLEX-NOT: ldr{{.*}}]!
 860
 861 ; DISABLED-NOT: ldr{{.*}}]!
 862
 863 ; CHECK-T2: @ %for.body4.us.epil
 864 ; CHECK-T2: ldrsh{{.*}}, #2]!
 865
 866 define void @mac_16x16_2d(i16* nocapture readonly %A, i16** nocapture readonly %B, i32* nocapture %C, i32 %N, i32 %M) { ; for i < N, j < M: C[i] += sext(A[i]) * sext(B[i][j]); the sum stays in an SSA value and is stored once per outer iteration
 867 entry:
 868   %cmp23 = icmp eq i32 %N, 0
 869   %cmp220 = icmp eq i32 %M, 0
 870   %or.cond = or i1 %cmp23, %cmp220 ; nothing to do when N == 0 or M == 0
 871   br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader
 872
 873 for.cond1.preheader.us.preheader:                 ; preds = %entry
 874   %tmp = add i32 %M, -1
 875   %xtraiter = and i32 %M, 3 ; M % 4: trip count of the remainder (.epil) loop
 876   %tmp1 = icmp ult i32 %tmp, 3 ; M - 1 < 3, i.e. M < 4 (M != 0 here): skip the unrolled loop
 877   %unroll_iter = sub i32 %M, %xtraiter ; M rounded down to a multiple of 4
 878   %lcmp.mod = icmp eq i32 %xtraiter, 0 ; true when no remainder iterations are needed
 879   br label %for.cond1.preheader.us
 880
 881 for.cond1.preheader.us:                           ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
 882   %i.024.us = phi i32 [ %inc10.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ] ; outer index i
 883   %arrayidx.us = getelementptr inbounds i16, i16* %A, i32 %i.024.us
 884   %tmp2 = load i16, i16* %arrayidx.us, align 2 ; A[i], loaded once per outer iteration
 885   %conv.us = sext i16 %tmp2 to i32
 886   %arrayidx5.us = getelementptr inbounds i16*, i16** %B, i32 %i.024.us
 887   %tmp3 = load i16*, i16** %arrayidx5.us, align 4 ; row pointer B[i]
 888   %arrayidx8.us = getelementptr inbounds i32, i32* %C, i32 %i.024.us ; &C[i]
 889   %arrayidx8.promoted.us = load i32, i32* %arrayidx8.us, align 4 ; initial value of C[i]
 890   br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
 891
 892 for.body4.us:                                     ; preds = %for.body4.us, %for.cond1.preheader.us
 893   %add22.us = phi i32 [ %add.us.3, %for.body4.us ], [ %arrayidx8.promoted.us, %for.cond1.preheader.us ] ; running sum; there is no store inside this loop
 894   %j.021.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ] ; inner index j, advances from 0 in steps of 4
 895   %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ] ; remaining unrolled iterations, counts down by 4
 896   %arrayidx6.us = getelementptr inbounds i16, i16* %tmp3, i32 %j.021.us
 897   %tmp4 = load i16, i16* %arrayidx6.us, align 2
 898   %conv7.us = sext i16 %tmp4 to i32
 899   %mul.us = mul nsw i32 %conv7.us, %conv.us
 900   %add.us = add nsw i32 %mul.us, %add22.us
 901   %inc.us = or i32 %j.021.us, 1 ; j + 1: the low two bits of j are clear, so 'or' acts as 'add'
 902   %arrayidx6.us.1 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us
 903   %tmp5 = load i16, i16* %arrayidx6.us.1, align 2
 904   %conv7.us.1 = sext i16 %tmp5 to i32
 905   %mul.us.1 = mul nsw i32 %conv7.us.1, %conv.us
 906   %add.us.1 = add nsw i32 %mul.us.1, %add.us
 907   %inc.us.1 = or i32 %j.021.us, 2 ; j + 2
 908   %arrayidx6.us.2 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us.1
 909   %tmp6 = load i16, i16* %arrayidx6.us.2, align 2
 910   %conv7.us.2 = sext i16 %tmp6 to i32
 911   %mul.us.2 = mul nsw i32 %conv7.us.2, %conv.us
 912   %add.us.2 = add nsw i32 %mul.us.2, %add.us.1
 913   %inc.us.2 = or i32 %j.021.us, 3 ; j + 3
 914   %arrayidx6.us.3 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us.2
 915   %tmp7 = load i16, i16* %arrayidx6.us.3, align 2
 916   %conv7.us.3 = sext i16 %tmp7 to i32
 917   %mul.us.3 = mul nsw i32 %conv7.us.3, %conv.us
 918   %add.us.3 = add nsw i32 %mul.us.3, %add.us.2
 919   %inc.us.3 = add i32 %j.021.us, 4
 920   %niter.nsub.3 = add i32 %niter, -4
 921   %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
 922   br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
 923
 924 for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us
 925   %add.us.lcssa.ph = phi i32 [ undef, %for.cond1.preheader.us ], [ %add.us.3, %for.body4.us ] ; undef only when the unrolled loop was skipped (1 <= M <= 3); then M % 4 != 0, so the remainder loop runs and supplies the stored value
 926   %add22.us.unr = phi i32 [ %arrayidx8.promoted.us, %for.cond1.preheader.us ], [ %add.us.3, %for.body4.us ] ; sum so far, handed to the remainder loop
 927   %j.021.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ] ; first j not handled by the unrolled loop
 928   br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
 929
 930 for.body4.us.epil:                                ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
 931   %add22.us.epil = phi i32 [ %add.us.epil, %for.body4.us.epil ], [ %add22.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
 932   %j.021.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.021.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
 933   %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ] ; remaining remainder iterations
 934   %arrayidx6.us.epil = getelementptr inbounds i16, i16* %tmp3, i32 %j.021.us.epil
 935   %tmp8 = load i16, i16* %arrayidx6.us.epil, align 2
 936   %conv7.us.epil = sext i16 %tmp8 to i32
 937   %mul.us.epil = mul nsw i32 %conv7.us.epil, %conv.us
 938   %add.us.epil = add nsw i32 %mul.us.epil, %add22.us.epil
 939   %inc.us.epil = add nuw i32 %j.021.us.epil, 1
 940   %epil.iter.sub = add i32 %epil.iter, -1
 941   %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
 942   br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
 943
 944 for.cond1.for.cond.cleanup3_crit_edge.us:         ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
 945   %add.us.lcssa = phi i32 [ %add.us.lcssa.ph, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ], [ %add.us.epil, %for.body4.us.epil ] ; final sum from whichever loop ran last
 946   store i32 %add.us.lcssa, i32* %arrayidx8.us, align 4 ; single write-back of C[i] per outer iteration
 947   %inc10.us = add nuw i32 %i.024.us, 1
 948   %exitcond27 = icmp eq i32 %inc10.us, %N ; outer loop ends when i reaches N
 949   br i1 %exitcond27, label %for.cond.cleanup, label %for.cond1.preheader.us
 950
 951 for.cond.cleanup:                                 ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
 952   ret void
 953 }
 954
 955 ; CHECK-LABEL: mul32x32_backwards
 956 ; CHECK: @ %for.body
 957
 958 ; TODO: post increments for decreasing addresses
 959 ; CHECK-DEFAULT-NOT: ldr{{.*}}]!
 960 ; CHECK-DEFAULT-NOT: str{{.*}}]!
 961
 962 ; CHECK-COMPLEX-NOT: ldr{{.*}}]!
 963 ; CHECK-COMPLEX-NOT: str{{.*}}]!
 964
 965 define void @mul32x32_backwards(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) { ; for i = N-1 down to 0: a[i] = b[i] * c[i]; N % 4 iterations are peeled into a prologue loop, the rest run unrolled by 4
 966 entry:
 967   %i.08 = add i32 %N, -1 ; starting index N - 1
 968   %cmp9 = icmp sgt i32 %i.08, -1 ; work only when N - 1 >= 0 (signed compare)
 969   br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
 970
 971 for.body.preheader:                               ; preds = %entry
 972   %xtraiter = and i32 %N, 3 ; N % 4: trip count of the prologue (.prol) loop
 973   %lcmp.mod = icmp eq i32 %xtraiter, 0 ; true when no prologue iterations are needed
 974   br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol
 975
 976 for.body.prol:                                    ; preds = %for.body.prol, %for.body.preheader
 977   %i.010.prol = phi i32 [ %i.0.prol, %for.body.prol ], [ %i.08, %for.body.preheader ] ; index i, counting down from N - 1
 978   %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader ] ; remaining prologue iterations
 979   %arrayidx.prol = getelementptr inbounds i32, i32* %b, i32 %i.010.prol
 980   %tmp = load i32, i32* %arrayidx.prol, align 4
 981   %arrayidx1.prol = getelementptr inbounds i32, i32* %c, i32 %i.010.prol
 982   %tmp1 = load i32, i32* %arrayidx1.prol, align 4
 983   %mul.prol = mul nsw i32 %tmp1, %tmp
 984   %arrayidx2.prol = getelementptr inbounds i32, i32* %a, i32 %i.010.prol
 985   store i32 %mul.prol, i32* %arrayidx2.prol, align 4 ; a[i] = b[i] * c[i]
 986   %i.0.prol = add i32 %i.010.prol, -1
 987   %prol.iter.sub = add i32 %prol.iter, -1
 988   %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
 989   br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol
 990
 991 for.body.prol.loopexit:                           ; preds = %for.body.prol, %for.body.preheader
 992   %i.010.unr = phi i32 [ %i.08, %for.body.preheader ], [ %i.0.prol, %for.body.prol ] ; index at which the unrolled loop starts
 993   %tmp2 = icmp ult i32 %i.08, 3 ; N - 1 < 3, i.e. N < 4: the prologue loop already covered every element
 994   br i1 %tmp2, label %for.cond.cleanup, label %for.body
 995
 996 for.cond.cleanup:                                 ; preds = %for.body, %for.body.prol.loopexit, %entry
 997   ret void
 998
 999 for.body:                                         ; preds = %for.body, %for.body.prol.loopexit
1000   %i.010 = phi i32 [ %i.0.3, %for.body ], [ %i.010.unr, %for.body.prol.loopexit ] ; index i, decreases by 4 per iteration
1001   %arrayidx = getelementptr inbounds i32, i32* %b, i32 %i.010
1002   %tmp3 = load i32, i32* %arrayidx, align 4
1003   %arrayidx1 = getelementptr inbounds i32, i32* %c, i32 %i.010
1004   %tmp4 = load i32, i32* %arrayidx1, align 4
1005   %mul = mul nsw i32 %tmp4, %tmp3
1006   %arrayidx2 = getelementptr inbounds i32, i32* %a, i32 %i.010
1007   store i32 %mul, i32* %arrayidx2, align 4 ; a[i] = b[i] * c[i]
1008   %i.0 = add i32 %i.010, -1 ; i - 1
1009   %arrayidx.1 = getelementptr inbounds i32, i32* %b, i32 %i.0
1010   %tmp5 = load i32, i32* %arrayidx.1, align 4
1011   %arrayidx1.1 = getelementptr inbounds i32, i32* %c, i32 %i.0
1012   %tmp6 = load i32, i32* %arrayidx1.1, align 4
1013   %mul.1 = mul nsw i32 %tmp6, %tmp5
1014   %arrayidx2.1 = getelementptr inbounds i32, i32* %a, i32 %i.0
1015   store i32 %mul.1, i32* %arrayidx2.1, align 4 ; element i - 1
1016   %i.0.1 = add i32 %i.010, -2 ; i - 2
1017   %arrayidx.2 = getelementptr inbounds i32, i32* %b, i32 %i.0.1
1018   %tmp7 = load i32, i32* %arrayidx.2, align 4
1019   %arrayidx1.2 = getelementptr inbounds i32, i32* %c, i32 %i.0.1
1020   %tmp8 = load i32, i32* %arrayidx1.2, align 4
1021   %mul.2 = mul nsw i32 %tmp8, %tmp7
1022   %arrayidx2.2 = getelementptr inbounds i32, i32* %a, i32 %i.0.1
1023   store i32 %mul.2, i32* %arrayidx2.2, align 4 ; element i - 2
1024   %i.0.2 = add i32 %i.010, -3 ; i - 3
1025   %arrayidx.3 = getelementptr inbounds i32, i32* %b, i32 %i.0.2
1026   %tmp9 = load i32, i32* %arrayidx.3, align 4
1027   %arrayidx1.3 = getelementptr inbounds i32, i32* %c, i32 %i.0.2
1028   %tmp10 = load i32, i32* %arrayidx1.3, align 4
1029   %mul.3 = mul nsw i32 %tmp10, %tmp9
1030   %arrayidx2.3 = getelementptr inbounds i32, i32* %a, i32 %i.0.2
1031   store i32 %mul.3, i32* %arrayidx2.3, align 4 ; element i - 3
1032   %i.0.3 = add i32 %i.010, -4
1033   %cmp.3 = icmp sgt i32 %i.0.3, -1 ; keep looping while the next index is still >= 0
1034   br i1 %cmp.3, label %for.body, label %for.cond.cleanup
1035 }
1036
1037 ; CHECK-LABEL: mul32x32_forwards
1038 ; CHECK: @ %for.body
1039
1040 ; TODO: It would be good if the complexity limit didn't have to be increased to
1041 ; enable the pre-indexed accesses.
1042
1043 ; CHECK-DEFAULT-NOT: ldr{{.*}}]!
1044 ; CHECK-DEFAULT-NOT: str{{.*}}]!
1045
1046 ; CHECK-COMPLEX: ldr{{.*}}, #16]!
1047 ; CHECK-COMPLEX: ldr{{.*}}, #16]!
1048 ; CHECK-COMPLEX: str{{.*}}, #16]!
1049
1050 ; CHECK-T2: @ %for.body.epil
1051 ; CHECK-T2: ldr{{.*}}, #4]!
1052 ; CHECK-T2: ldr{{.*}}, #4]!
1053 ; CHECK-T2: str{{.*}}, #4]!
1054
1055 define void @mul32x32_forwards(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) { ; for i < N: a[i] = b[i] * c[i]; main loop unrolled by 4 plus a remainder loop
1056 entry:
1057   %cmp8 = icmp eq i32 %N, 0 ; nothing to do when N == 0
1058   br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
1059
1060 for.body.preheader:                               ; preds = %entry
1061   %tmp = add i32 %N, -1
1062   %xtraiter = and i32 %N, 3 ; N % 4: trip count of the remainder (.epil) loop
1063   %tmp1 = icmp ult i32 %tmp, 3 ; N - 1 < 3, i.e. N < 4 (N != 0 here): skip the unrolled loop
1064   br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
1065
1066 for.body.preheader.new:                           ; preds = %for.body.preheader
1067   %unroll_iter = sub i32 %N, %xtraiter ; N rounded down to a multiple of 4
1068   br label %for.body
1069
1070 for.cond.cleanup.loopexit.unr-lcssa:              ; preds = %for.body, %for.body.preheader
1071   %i.09.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ] ; first i not handled by the unrolled loop
1072   %lcmp.mod = icmp eq i32 %xtraiter, 0 ; true when no remainder iterations are needed
1073   br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
1074
1075 for.body.epil:                                    ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa
1076   %i.09.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.09.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1077   %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ] ; remaining remainder iterations
1078   %arrayidx.epil = getelementptr inbounds i32, i32* %b, i32 %i.09.epil
1079   %tmp2 = load i32, i32* %arrayidx.epil, align 4
1080   %arrayidx1.epil = getelementptr inbounds i32, i32* %c, i32 %i.09.epil
1081   %tmp3 = load i32, i32* %arrayidx1.epil, align 4
1082   %mul.epil = mul nsw i32 %tmp3, %tmp2
1083   %arrayidx2.epil = getelementptr inbounds i32, i32* %a, i32 %i.09.epil
1084   store i32 %mul.epil, i32* %arrayidx2.epil, align 4 ; a[i] = b[i] * c[i], one element per remainder iteration
1085   %inc.epil = add nuw nsw i32 %i.09.epil, 1
1086   %epil.iter.sub = add i32 %epil.iter, -1
1087   %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
1088   br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
1089
1090 for.cond.cleanup:                                 ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
1091   ret void
1092
1093 for.body:                                         ; preds = %for.body, %for.body.preheader.new
1094   %i.09 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ] ; index i, advances from 0 in steps of 4
1095   %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ] ; remaining unrolled iterations, counts down by 4
1096   %arrayidx = getelementptr inbounds i32, i32* %b, i32 %i.09
1097   %tmp4 = load i32, i32* %arrayidx, align 4
1098   %arrayidx1 = getelementptr inbounds i32, i32* %c, i32 %i.09
1099   %tmp5 = load i32, i32* %arrayidx1, align 4
1100   %mul = mul nsw i32 %tmp5, %tmp4
1101   %arrayidx2 = getelementptr inbounds i32, i32* %a, i32 %i.09
1102   store i32 %mul, i32* %arrayidx2, align 4 ; a[i] = b[i] * c[i]
1103   %inc = or i32 %i.09, 1 ; i + 1: the low two bits of i are clear, so 'or' acts as 'add'
1104   %arrayidx.1 = getelementptr inbounds i32, i32* %b, i32 %inc
1105   %tmp6 = load i32, i32* %arrayidx.1, align 4
1106   %arrayidx1.1 = getelementptr inbounds i32, i32* %c, i32 %inc
1107   %tmp7 = load i32, i32* %arrayidx1.1, align 4
1108   %mul.1 = mul nsw i32 %tmp7, %tmp6
1109   %arrayidx2.1 = getelementptr inbounds i32, i32* %a, i32 %inc
1110   store i32 %mul.1, i32* %arrayidx2.1, align 4 ; element i + 1
1111   %inc.1 = or i32 %i.09, 2 ; i + 2
1112   %arrayidx.2 = getelementptr inbounds i32, i32* %b, i32 %inc.1
1113   %tmp8 = load i32, i32* %arrayidx.2, align 4
1114   %arrayidx1.2 = getelementptr inbounds i32, i32* %c, i32 %inc.1
1115   %tmp9 = load i32, i32* %arrayidx1.2, align 4
1116   %mul.2 = mul nsw i32 %tmp9, %tmp8
1117   %arrayidx2.2 = getelementptr inbounds i32, i32* %a, i32 %inc.1
1118   store i32 %mul.2, i32* %arrayidx2.2, align 4 ; element i + 2
1119   %inc.2 = or i32 %i.09, 3 ; i + 3
1120   %arrayidx.3 = getelementptr inbounds i32, i32* %b, i32 %inc.2
1121   %tmp10 = load i32, i32* %arrayidx.3, align 4
1122   %arrayidx1.3 = getelementptr inbounds i32, i32* %c, i32 %inc.2
1123   %tmp11 = load i32, i32* %arrayidx1.3, align 4
1124   %mul.3 = mul nsw i32 %tmp11, %tmp10
1125   %arrayidx2.3 = getelementptr inbounds i32, i32* %a, i32 %inc.2
1126   store i32 %mul.3, i32* %arrayidx2.3, align 4 ; element i + 3
1127   %inc.3 = add nuw nsw i32 %i.09, 4
1128   %niter.nsub.3 = add i32 %niter, -4
1129   %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
1130   br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
1131 }