test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll

   1 ; RUN: opt -mtriple armv7-linux-gnueabihf -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
   2 ; RUN: opt -mtriple armv8-linux-gnu -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
   3 ; RUN: opt -mtriple armv7-unknown-darwin -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=DARWIN
   4 ; REQUIRES: asserts
   5
   6 ; Tests that the loop vectorizer can tell whether SIMD is safe with respect to
   7 ; the IEEE 754 standard.
   8 ; On Linux, we only want the vectorizer to work when the -ffast-math flag is set,
   9 ; because NEON is not IEEE compliant.
  10 ; Darwin, on the other hand, doesn't support subnormals, so all optimizations
  11 ; are allowed, even without -ffast-math.
  12
  13 ; Integer loops are always vectorizable
  14 ; CHECK: Checking a loop in "sumi"
  15 ; CHECK: We can vectorize this loop!
  16 define void @sumi(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
  17 entry:
  18   %is.empty = icmp eq i32 %N, 0                   ; nothing to do when N == 0
  19   br i1 %is.empty, label %exit, label %loop.ph
  20
  21 loop.ph:                                          ; preds = %entry
  22   br label %loop
  23
  24 loop:                                             ; preds = %loop.ph, %loop
  25   %idx = phi i32 [ %idx.next, %loop ], [ 0, %loop.ph ]
  26   %a.addr = getelementptr inbounds i32, i32* %A, i32 %idx
  27   %a.val = load i32, i32* %a.addr, align 4
  28   %b.addr = getelementptr inbounds i32, i32* %B, i32 %idx
  29   %b.val = load i32, i32* %b.addr, align 4
  30   %prod = mul nsw i32 %b.val, %a.val              ; integer multiply: B[idx] * A[idx]
  31   %c.addr = getelementptr inbounds i32, i32* %C, i32 %idx
  32   store i32 %prod, i32* %c.addr, align 4          ; C[idx] = product
  33   %idx.next = add nuw nsw i32 %idx, 1
  34   %done = icmp eq i32 %idx.next, %N
  35   br i1 %done, label %loop.exit, label %loop
  36
  37 loop.exit:                                        ; preds = %loop
  38   br label %exit
  39
  40 exit:                                             ; preds = %loop.exit, %entry
  41   ret void
  42 }
  43
  44 ; On Linux, floating-point loops need fast-math to be vectorizable; on Darwin they are vectorized without it
  45 ; LINUX: Checking a loop in "sumf"
  46 ; LINUX: Potentially unsafe FP op prevents vectorization
  47 ; DARWIN: Checking a loop in "sumf"
  48 ; DARWIN: We can vectorize this loop!
  49 define void @sumf(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
  50 entry:
  51   %is.empty = icmp eq i32 %N, 0                   ; nothing to do when N == 0
  52   br i1 %is.empty, label %exit, label %loop.ph
  53
  54 loop.ph:                                          ; preds = %entry
  55   br label %loop
  56
  57 loop:                                             ; preds = %loop.ph, %loop
  58   %idx = phi i32 [ %idx.next, %loop ], [ 0, %loop.ph ]
  59   %a.addr = getelementptr inbounds float, float* %A, i32 %idx
  60   %a.val = load float, float* %a.addr, align 4
  61   %b.addr = getelementptr inbounds float, float* %B, i32 %idx
  62   %b.val = load float, float* %b.addr, align 4
  63   %prod = fmul float %a.val, %b.val               ; FP multiply carrying no fast-math flags
  64   %c.addr = getelementptr inbounds float, float* %C, i32 %idx
  65   store float %prod, float* %c.addr, align 4      ; C[idx] = product
  66   %idx.next = add nuw nsw i32 %idx, 1
  67   %done = icmp eq i32 %idx.next, %N
  68   br i1 %done, label %loop.exit, label %loop
  69
  70 loop.exit:                                        ; preds = %loop
  71   br label %exit
  72
  73 exit:                                             ; preds = %loop.exit, %entry
  74   ret void
  75 }
  76
  77 ; Integer loops are always vectorizable
  78 ; CHECK: Checking a loop in "redi"
  79 ; CHECK: We can vectorize this loop!
  80 define i32 @redi(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
  81 entry:
  82   %is.empty = icmp eq i32 %N, 0                   ; nothing to do when N == 0
  83   br i1 %is.empty, label %exit, label %loop.ph
  84
  85 loop.ph:                                          ; preds = %entry
  86   br label %loop
  87
  88 loop:                                             ; preds = %loop.ph, %loop
  89   %idx = phi i32 [ %idx.next, %loop ], [ 0, %loop.ph ]
  90   %acc = phi i32 [ %acc.next, %loop ], [ undef, %loop.ph ]
  91   %a.addr = getelementptr inbounds i32, i32* %a, i32 %idx
  92   %a.val = load i32, i32* %a.addr, align 4
  93   %b.addr = getelementptr inbounds i32, i32* %b, i32 %idx
  94   %b.val = load i32, i32* %b.addr, align 4
  95   %prod = mul nsw i32 %b.val, %a.val              ; integer multiply: b[idx] * a[idx]
  96   %acc.next = add nsw i32 %prod, %acc             ; integer add reduction into the accumulator
  97   %idx.next = add nuw nsw i32 %idx, 1
  98   %done = icmp eq i32 %idx.next, %N
  99   br i1 %done, label %loop.exit, label %loop
 100
 101 loop.exit:                                        ; preds = %loop
 102   %acc.lcssa = phi i32 [ %acc.next, %loop ]
 103   br label %exit
 104
 105 exit:                                             ; preds = %loop.exit, %entry
 106   %result = phi i32 [ undef, %entry ], [ %acc.lcssa, %loop.exit ]
 107   ret i32 %result
 108 }
 109
 110 ; On Linux, floating-point loops need fast-math to be vectorizable; on Darwin they are vectorized without it
 111 ; LINUX: Checking a loop in "redf"
 112 ; LINUX: Potentially unsafe FP op prevents vectorization
 113 ; DARWIN: Checking a loop in "redf"
 114 ; DARWIN: We can vectorize this loop!
 115 define float @redf(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
 116 entry:
 117   %is.empty = icmp eq i32 %N, 0                   ; nothing to do when N == 0
 118   br i1 %is.empty, label %exit, label %loop.ph
 119
 120 loop.ph:                                          ; preds = %entry
 121   br label %loop
 122
 123 loop:                                             ; preds = %loop.ph, %loop
 124   %idx = phi i32 [ %idx.next, %loop ], [ 0, %loop.ph ]
 125   %acc = phi float [ %acc.next, %loop ], [ undef, %loop.ph ]
 126   %a.addr = getelementptr inbounds float, float* %a, i32 %idx
 127   %a.val = load float, float* %a.addr, align 4
 128   %b.addr = getelementptr inbounds float, float* %b, i32 %idx
 129   %b.val = load float, float* %b.addr, align 4
 130   %prod = fmul float %a.val, %b.val               ; FP multiply carrying no fast-math flags
 131   %acc.next = fadd float %acc, %prod              ; FP add reduction carrying no fast-math flags
 132   %idx.next = add nuw nsw i32 %idx, 1
 133   %done = icmp eq i32 %idx.next, %N
 134   br i1 %done, label %loop.exit, label %loop
 135
 136 loop.exit:                                        ; preds = %loop
 137   %acc.lcssa = phi float [ %acc.next, %loop ]
 138   br label %exit
 139
 140 exit:                                             ; preds = %loop.exit, %entry
 141   %result = phi float [ undef, %entry ], [ %acc.lcssa, %loop.exit ]
 142   ret float %result
 143 }
 144
 145 ; Make sure calls that turn into builtins are also covered
 146 ; LINUX: Checking a loop in "fabs"
 147 ; LINUX: Potentially unsafe FP op prevents vectorization
 148 ; DARWIN: Checking a loop in "fabs"
 149 ; DARWIN: We can vectorize this loop!
 150 define void @fabs(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
 151 entry:
 152   %is.empty = icmp eq i32 %N, 0                   ; nothing to do when N == 0
 153   br i1 %is.empty, label %exit, label %loop
 154
 155 loop:                                             ; preds = %entry, %loop
 156   %idx = phi i32 [ %idx.next, %loop ], [ 0, %entry ]
 157   %a.addr = getelementptr inbounds float, float* %A, i32 %idx
 158   %a.val = load float, float* %a.addr, align 4
 159   %b.addr = getelementptr inbounds float, float* %B, i32 %idx
 160   %b.val = load float, float* %b.addr, align 4
 161   %b.abs = tail call float @fabsf(float %b.val) #1 ; library call carrying no fast-math flags
 162   %prod = fmul float %a.val, %b.abs               ; FP multiply carrying no fast-math flags
 163   %c.addr = getelementptr inbounds float, float* %C, i32 %idx
 164   store float %prod, float* %c.addr, align 4      ; C[idx] = A[idx] * fabsf(B[idx])
 165   %idx.next = add nuw nsw i32 %idx, 1
 166   %done = icmp eq i32 %idx.next, %N
 167   br i1 %done, label %exit, label %loop
 168
 169 exit:                                             ; preds = %loop, %entry
 170   ret void
 171 }
 172
 173 ; Integer loops are always vectorizable
 174 ; CHECK: Checking a loop in "sumi_fast"
 175 ; CHECK: We can vectorize this loop!
 176 define void @sumi_fast(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
 177 entry:
 178   %is.empty = icmp eq i32 %N, 0                   ; nothing to do when N == 0
 179   br i1 %is.empty, label %exit, label %loop.ph
 180
 181 loop.ph:                                          ; preds = %entry
 182   br label %loop
 183
 184 loop:                                             ; preds = %loop.ph, %loop
 185   %idx = phi i32 [ %idx.next, %loop ], [ 0, %loop.ph ]
 186   %a.addr = getelementptr inbounds i32, i32* %A, i32 %idx
 187   %a.val = load i32, i32* %a.addr, align 4
 188   %b.addr = getelementptr inbounds i32, i32* %B, i32 %idx
 189   %b.val = load i32, i32* %b.addr, align 4
 190   %prod = mul nsw i32 %b.val, %a.val              ; integer multiply: B[idx] * A[idx]
 191   %c.addr = getelementptr inbounds i32, i32* %C, i32 %idx
 192   store i32 %prod, i32* %c.addr, align 4          ; C[idx] = product
 193   %idx.next = add nuw nsw i32 %idx, 1
 194   %done = icmp eq i32 %idx.next, %N
 195   br i1 %done, label %loop.exit, label %loop
 196
 197 loop.exit:                                        ; preds = %loop
 198   br label %exit
 199
 200 exit:                                             ; preds = %loop.exit, %entry
 201   ret void
 202 }
 203
 204 ; Floating-point loops are vectorizable with fast-math
 205 ; CHECK: Checking a loop in "sumf_fast"
 206 ; CHECK: We can vectorize this loop!
 207 define void @sumf_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
 208 entry:
 209   %is.empty = icmp eq i32 %N, 0                   ; nothing to do when N == 0
 210   br i1 %is.empty, label %exit, label %loop.ph
 211
 212 loop.ph:                                          ; preds = %entry
 213   br label %loop
 214
 215 loop:                                             ; preds = %loop.ph, %loop
 216   %idx = phi i32 [ %idx.next, %loop ], [ 0, %loop.ph ]
 217   %a.addr = getelementptr inbounds float, float* %A, i32 %idx
 218   %a.val = load float, float* %a.addr, align 4
 219   %b.addr = getelementptr inbounds float, float* %B, i32 %idx
 220   %b.val = load float, float* %b.addr, align 4
 221   %prod = fmul fast float %b.val, %a.val          ; FP multiply marked 'fast'
 222   %c.addr = getelementptr inbounds float, float* %C, i32 %idx
 223   store float %prod, float* %c.addr, align 4      ; C[idx] = product
 224   %idx.next = add nuw nsw i32 %idx, 1
 225   %done = icmp eq i32 %idx.next, %N
 226   br i1 %done, label %loop.exit, label %loop
 227
 228 loop.exit:                                        ; preds = %loop
 229   br label %exit
 230
 231 exit:                                             ; preds = %loop.exit, %entry
 232   ret void
 233 }
 234
 235 ; Integer loops are always vectorizable
 236 ; CHECK: Checking a loop in "redi_fast"
 237 ; CHECK: We can vectorize this loop!
 238 define i32 @redi_fast(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
 239 entry:
 240   %is.empty = icmp eq i32 %N, 0                   ; nothing to do when N == 0
 241   br i1 %is.empty, label %exit, label %loop.ph
 242
 243 loop.ph:                                          ; preds = %entry
 244   br label %loop
 245
 246 loop:                                             ; preds = %loop.ph, %loop
 247   %idx = phi i32 [ %idx.next, %loop ], [ 0, %loop.ph ]
 248   %acc = phi i32 [ %acc.next, %loop ], [ undef, %loop.ph ]
 249   %a.addr = getelementptr inbounds i32, i32* %a, i32 %idx
 250   %a.val = load i32, i32* %a.addr, align 4
 251   %b.addr = getelementptr inbounds i32, i32* %b, i32 %idx
 252   %b.val = load i32, i32* %b.addr, align 4
 253   %prod = mul nsw i32 %b.val, %a.val              ; integer multiply: b[idx] * a[idx]
 254   %acc.next = add nsw i32 %prod, %acc             ; integer add reduction into the accumulator
 255   %idx.next = add nuw nsw i32 %idx, 1
 256   %done = icmp eq i32 %idx.next, %N
 257   br i1 %done, label %loop.exit, label %loop
 258
 259 loop.exit:                                        ; preds = %loop
 260   %acc.lcssa = phi i32 [ %acc.next, %loop ]
 261   br label %exit
 262
 263 exit:                                             ; preds = %loop.exit, %entry
 264   %result = phi i32 [ undef, %entry ], [ %acc.lcssa, %loop.exit ]
 265   ret i32 %result
 266 }
 267
 268 ; Floating-point loops are vectorizable with fast-math
 269 ; CHECK: Checking a loop in "redf_fast"
 270 ; CHECK: We can vectorize this loop!
 271 define float @redf_fast(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
 272 entry:
 273   %is.empty = icmp eq i32 %N, 0                   ; nothing to do when N == 0
 274   br i1 %is.empty, label %exit, label %loop.ph
 275
 276 loop.ph:                                          ; preds = %entry
 277   br label %loop
 278
 279 loop:                                             ; preds = %loop.ph, %loop
 280   %idx = phi i32 [ %idx.next, %loop ], [ 0, %loop.ph ]
 281   %acc = phi float [ %acc.next, %loop ], [ undef, %loop.ph ]
 282   %a.addr = getelementptr inbounds float, float* %a, i32 %idx
 283   %a.val = load float, float* %a.addr, align 4
 284   %b.addr = getelementptr inbounds float, float* %b, i32 %idx
 285   %b.val = load float, float* %b.addr, align 4
 286   %prod = fmul fast float %b.val, %a.val          ; FP multiply marked 'fast'
 287   %acc.next = fadd fast float %prod, %acc         ; FP add reduction marked 'fast'
 288   %idx.next = add nuw nsw i32 %idx, 1
 289   %done = icmp eq i32 %idx.next, %N
 290   br i1 %done, label %loop.exit, label %loop
 291
 292 loop.exit:                                        ; preds = %loop
 293   %acc.lcssa = phi float [ %acc.next, %loop ]
 294   br label %exit
 295
 296 exit:                                             ; preds = %loop.exit, %entry
 297   %result = phi float [ undef, %entry ], [ %acc.lcssa, %loop.exit ]
 298   ret float %result
 299 }
 300
 301 ; Make sure calls that turn into builtins are also covered
 302 ; CHECK: Checking a loop in "fabs_fast"
 303 ; CHECK: We can vectorize this loop!
 304 define void @fabs_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
 305 entry:
 306   %is.empty = icmp eq i32 %N, 0                   ; nothing to do when N == 0
 307   br i1 %is.empty, label %exit, label %loop
 308
 309 loop:                                             ; preds = %entry, %loop
 310   %idx = phi i32 [ %idx.next, %loop ], [ 0, %entry ]
 311   %a.addr = getelementptr inbounds float, float* %A, i32 %idx
 312   %a.val = load float, float* %a.addr, align 4
 313   %b.addr = getelementptr inbounds float, float* %B, i32 %idx
 314   %b.val = load float, float* %b.addr, align 4
 315   %b.abs = tail call fast float @fabsf(float %b.val) #2 ; library call marked 'fast'
 316   %prod = fmul fast float %b.abs, %a.val          ; FP multiply marked 'fast'
 317   %c.addr = getelementptr inbounds float, float* %C, i32 %idx
 318   store float %prod, float* %c.addr, align 4      ; C[idx] = fabsf(B[idx]) * A[idx]
 319   %idx.next = add nuw nsw i32 %idx, 1
 320   %done = icmp eq i32 %idx.next, %N
 321   br i1 %done, label %exit, label %loop
 322
 323 exit:                                             ; preds = %loop, %entry
 324   ret void
 325 }
 326
 327 declare float @fabsf(float) ; external declaration; called from the loop bodies of @fabs and @fabs_fast
 328
 329 attributes #1 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" } ; used at the @fabsf call site in @fabs: all *-fp-math relaxations are "false"
 330 attributes #2 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="true" "use-soft-float"="false" } ; used at the @fabsf call site in @fabs_fast: no-infs, no-nans and unsafe-fp-math are "true"