test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll

   1 ; RUN: opt -mtriple armv7-linux-gnueabihf -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
   2 ; RUN: opt -mtriple armv8-linux-gnu -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
   3 ; RUN: opt -mtriple armv8.1.m-none-eabi -mattr=+mve.fp -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=MVE
   4 ; RUN: opt -mtriple armv7-unknown-darwin -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=DARWIN
   5 ; REQUIRES: asserts
   6
   7 ; Tests that the loop vectorizer can tell whether SIMD is safe with respect to
   8 ; the IEEE 754 standard.
   9 ; On Linux, we only want the vectorizer to work when the -ffast-math flag is
  10 ; set, because NEON is not IEEE compliant. Darwin, on the other hand, doesn't
  11 ; support subnormals, so all optimizations are allowed even without
  12 ; -ffast-math. The MVE run also expects the non-fast sumf and redf loops to vectorize.
  13
  14 ; Integer loops are always vectorizable
  15 ; CHECK: Checking a loop in "sumi"
  16 ; CHECK: We can vectorize this loop!
  17 define void @sumi(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) { ; integer element-wise product: C[i] = A[i] * B[i] for i in [0, N)
  18 entry:
  19   %cmp5 = icmp eq i32 %N, 0 ; guard: is the trip count zero?
  20   br i1 %cmp5, label %for.end, label %for.body.preheader ; skip the loop entirely when N == 0
  21
  22 for.body.preheader:                               ; preds = %entry
  23   br label %for.body
  24
  25 for.body:                                         ; preds = %for.body.preheader, %for.body
  26   %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] ; induction variable i, starts at 0
  27   %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06
  28   %0 = load i32, i32* %arrayidx, align 4 ; %0 = A[i]
  29   %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06
  30   %1 = load i32, i32* %arrayidx1, align 4 ; %1 = B[i]
  31   %mul = mul nsw i32 %1, %0 ; integer multiply, no FP semantics involved
  32   %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06
  33   store i32 %mul, i32* %arrayidx2, align 4 ; C[i] = B[i] * A[i]
  34   %inc = add nuw nsw i32 %i.06, 1 ; i + 1
  35   %exitcond = icmp eq i32 %inc, %N
  36   br i1 %exitcond, label %for.end.loopexit, label %for.body ; leave the loop once i + 1 == N
  37
  38 for.end.loopexit:                                 ; preds = %for.body
  39   br label %for.end
  40
  41 for.end:                                          ; preds = %for.end.loopexit, %entry
  42   ret void
  43 }
  44
  45 ; Floating-point loops need fast-math to be vectorizable on Linux, but not on MVE or Darwin
  46 ; LINUX: Checking a loop in "sumf"
  47 ; LINUX: Potentially unsafe FP op prevents vectorization
  48 ; MVE: Checking a loop in "sumf"
  49 ; MVE: We can vectorize this loop!
  50 ; DARWIN: Checking a loop in "sumf"
  51 ; DARWIN: We can vectorize this loop!
  52 define void @sumf(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) { ; float element-wise product without fast-math flags: C[i] = A[i] * B[i] for i in [0, N)
  53 entry:
  54   %cmp5 = icmp eq i32 %N, 0 ; guard: is the trip count zero?
  55   br i1 %cmp5, label %for.end, label %for.body.preheader ; skip the loop entirely when N == 0
  56
  57 for.body.preheader:                               ; preds = %entry
  58   br label %for.body
  59
  60 for.body:                                         ; preds = %for.body.preheader, %for.body
  61   %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] ; induction variable i, starts at 0
  62   %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06
  63   %0 = load float, float* %arrayidx, align 4 ; %0 = A[i]
  64   %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06
  65   %1 = load float, float* %arrayidx1, align 4 ; %1 = B[i]
  66   %mul = fmul float %0, %1 ; plain fmul with no fast-math flags; this is the FP op the checks above are about
  67   %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06
  68   store float %mul, float* %arrayidx2, align 4 ; C[i] = A[i] * B[i]
  69   %inc = add nuw nsw i32 %i.06, 1 ; i + 1
  70   %exitcond = icmp eq i32 %inc, %N
  71   br i1 %exitcond, label %for.end.loopexit, label %for.body ; leave the loop once i + 1 == N
  72
  73 for.end.loopexit:                                 ; preds = %for.body
  74   br label %for.end
  75
  76 for.end:                                          ; preds = %for.end.loopexit, %entry
  77   ret void
  78 }
  79
  80 ; Integer loops are always vectorizable
  81 ; CHECK: Checking a loop in "redi"
  82 ; CHECK: We can vectorize this loop!
  83 define i32 @redi(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) { ; integer reduction: accumulates a[i] * b[i] over i in [0, N) and returns the sum
  84 entry:
  85   %cmp5 = icmp eq i32 %N, 0 ; guard: is the trip count zero?
  86   br i1 %cmp5, label %for.end, label %for.body.preheader ; skip the loop entirely when N == 0
  87
  88 for.body.preheader:                               ; preds = %entry
  89   br label %for.body
  90
  91 for.body:                                         ; preds = %for.body.preheader, %for.body
  92   %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] ; induction variable i, starts at 0
  93   %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ] ; running sum; its start value is undef in this test
  94   %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07
  95   %0 = load i32, i32* %arrayidx, align 4 ; %0 = a[i]
  96   %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07
  97   %1 = load i32, i32* %arrayidx1, align 4 ; %1 = b[i]
  98   %mul = mul nsw i32 %1, %0 ; b[i] * a[i]
  99   %add = add nsw i32 %mul, %Red.06 ; integer reduction step: sum += b[i] * a[i]
 100   %inc = add nuw nsw i32 %i.07, 1 ; i + 1
 101   %exitcond = icmp eq i32 %inc, %N
 102   br i1 %exitcond, label %for.end.loopexit, label %for.body ; leave the loop once i + 1 == N
 103
 104 for.end.loopexit:                                 ; preds = %for.body
 105   %add.lcssa = phi i32 [ %add, %for.body ] ; single-entry phi carrying the final sum out of the loop
 106   br label %for.end
 107
 108 for.end:                                          ; preds = %for.end.loopexit, %entry
 109   %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ] ; undef when the loop was skipped, otherwise the final sum
 110   ret i32 %Red.0.lcssa
 111 }
 112
 113 ; Floating-point loops need fast-math to be vectorizable on Linux, but not on MVE or Darwin
 114 ; LINUX: Checking a loop in "redf"
 115 ; LINUX: Potentially unsafe FP op prevents vectorization
 116 ; MVE: Checking a loop in "redf"
 117 ; MVE: We can vectorize this loop!
 118 ; DARWIN: Checking a loop in "redf"
 119 ; DARWIN: We can vectorize this loop!
 120 define float @redf(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) { ; float reduction without fast-math flags: accumulates a[i] * b[i] over i in [0, N) and returns the sum
 121 entry:
 122   %cmp5 = icmp eq i32 %N, 0 ; guard: is the trip count zero?
 123   br i1 %cmp5, label %for.end, label %for.body.preheader ; skip the loop entirely when N == 0
 124
 125 for.body.preheader:                               ; preds = %entry
 126   br label %for.body
 127
 128 for.body:                                         ; preds = %for.body.preheader, %for.body
 129   %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] ; induction variable i, starts at 0
 130   %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ] ; running sum; its start value is undef in this test
 131   %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07
 132   %0 = load float, float* %arrayidx, align 4 ; %0 = a[i]
 133   %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07
 134   %1 = load float, float* %arrayidx1, align 4 ; %1 = b[i]
 135   %mul = fmul float %0, %1 ; plain fmul, no fast-math flags
 136   %add = fadd float %Red.06, %mul ; FP reduction step, also without fast-math flags: sum += a[i] * b[i]
 137   %inc = add nuw nsw i32 %i.07, 1 ; i + 1
 138   %exitcond = icmp eq i32 %inc, %N
 139   br i1 %exitcond, label %for.end.loopexit, label %for.body ; leave the loop once i + 1 == N
 140
 141 for.end.loopexit:                                 ; preds = %for.body
 142   %add.lcssa = phi float [ %add, %for.body ] ; single-entry phi carrying the final sum out of the loop
 143   br label %for.end
 144
 145 for.end:                                          ; preds = %for.end.loopexit, %entry
 146   %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ] ; undef when the loop was skipped, otherwise the final sum
 147   ret float %Red.0.lcssa
 148 }
 149
 150 ; Make sure calls that turn into builtins are also covered
 151 ; LINUX: Checking a loop in "fabs"
 152 ; LINUX: Potentially unsafe FP op prevents vectorization
 153 ; DARWIN: Checking a loop in "fabs"
 154 ; DARWIN: We can vectorize this loop!
 155 define void @fabs(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) { ; C[i] = A[i] * fabsf(B[i]) for i in [0, N), with no fast-math flags on the call or the multiply
 156 entry:
 157   %cmp10 = icmp eq i32 %N, 0 ; guard: is the trip count zero?
 158   br i1 %cmp10, label %for.end, label %for.body ; skip the loop entirely when N == 0 (no separate preheader block here)
 159
 160 for.body:                                         ; preds = %entry, %for.body
 161   %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ] ; induction variable i, starts at 0
 162   %arrayidx = getelementptr inbounds float, float* %A, i32 %i.011
 163   %0 = load float, float* %arrayidx, align 4 ; %0 = A[i]
 164   %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.011
 165   %1 = load float, float* %arrayidx1, align 4 ; %1 = B[i]
 166   %fabsf = tail call float @fabsf(float %1) #1 ; call to the declared @fabsf, no fast-math flags, attribute group #1
 167   %conv3 = fmul float %0, %fabsf ; plain fmul: A[i] * fabsf(B[i])
 168   %arrayidx4 = getelementptr inbounds float, float* %C, i32 %i.011
 169   store float %conv3, float* %arrayidx4, align 4 ; C[i] = A[i] * fabsf(B[i])
 170   %inc = add nuw nsw i32 %i.011, 1 ; i + 1
 171   %exitcond = icmp eq i32 %inc, %N
 172   br i1 %exitcond, label %for.end, label %for.body ; leave the loop once i + 1 == N
 173
 174 for.end:                                          ; preds = %for.body, %entry
 175   ret void
 176 }
 177
 178 ; Integer loops are always vectorizable
 179 ; CHECK: Checking a loop in "sumi_fast"
 180 ; CHECK: We can vectorize this loop!
 181 define void @sumi_fast(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) { ; integer element-wise product, same instruction sequence as @sumi: C[i] = A[i] * B[i] for i in [0, N)
 182 entry:
 183   %cmp5 = icmp eq i32 %N, 0 ; guard: is the trip count zero?
 184   br i1 %cmp5, label %for.end, label %for.body.preheader ; skip the loop entirely when N == 0
 185
 186 for.body.preheader:                               ; preds = %entry
 187   br label %for.body
 188
 189 for.body:                                         ; preds = %for.body.preheader, %for.body
 190   %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] ; induction variable i, starts at 0
 191   %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06
 192   %0 = load i32, i32* %arrayidx, align 4 ; %0 = A[i]
 193   %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06
 194   %1 = load i32, i32* %arrayidx1, align 4 ; %1 = B[i]
 195   %mul = mul nsw i32 %1, %0 ; integer multiply; fast-math flags do not apply to integer ops
 196   %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06
 197   store i32 %mul, i32* %arrayidx2, align 4 ; C[i] = B[i] * A[i]
 198   %inc = add nuw nsw i32 %i.06, 1 ; i + 1
 199   %exitcond = icmp eq i32 %inc, %N
 200   br i1 %exitcond, label %for.end.loopexit, label %for.body ; leave the loop once i + 1 == N
 201
 202 for.end.loopexit:                                 ; preds = %for.body
 203   br label %for.end
 204
 205 for.end:                                          ; preds = %for.end.loopexit, %entry
 206   ret void
 207 }
 208
 209 ; Floating-point loops can be vectorized with fast-math
 210 ; CHECK: Checking a loop in "sumf_fast"
 211 ; CHECK: We can vectorize this loop!
 212 define void @sumf_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) { ; float element-wise product with the fast flag on the multiply: C[i] = B[i] * A[i] for i in [0, N)
 213 entry:
 214   %cmp5 = icmp eq i32 %N, 0 ; guard: is the trip count zero?
 215   br i1 %cmp5, label %for.end, label %for.body.preheader ; skip the loop entirely when N == 0
 216
 217 for.body.preheader:                               ; preds = %entry
 218   br label %for.body
 219
 220 for.body:                                         ; preds = %for.body.preheader, %for.body
 221   %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] ; induction variable i, starts at 0
 222   %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06
 223   %0 = load float, float* %arrayidx, align 4 ; %0 = A[i]
 224   %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06
 225   %1 = load float, float* %arrayidx1, align 4 ; %1 = B[i]
 226   %mul = fmul fast float %1, %0 ; the fast flag is the only FP difference from the fmul in @sumf
 227   %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06
 228   store float %mul, float* %arrayidx2, align 4 ; C[i] = B[i] * A[i]
 229   %inc = add nuw nsw i32 %i.06, 1 ; i + 1
 230   %exitcond = icmp eq i32 %inc, %N
 231   br i1 %exitcond, label %for.end.loopexit, label %for.body ; leave the loop once i + 1 == N
 232
 233 for.end.loopexit:                                 ; preds = %for.body
 234   br label %for.end
 235
 236 for.end:                                          ; preds = %for.end.loopexit, %entry
 237   ret void
 238 }
 239
 240 ; Integer loops are always vectorizable
 241 ; CHECK: Checking a loop in "redi_fast"
 242 ; CHECK: We can vectorize this loop!
 243 define i32 @redi_fast(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) { ; integer reduction, same instruction sequence as @redi: accumulates a[i] * b[i] over i in [0, N)
 244 entry:
 245   %cmp5 = icmp eq i32 %N, 0 ; guard: is the trip count zero?
 246   br i1 %cmp5, label %for.end, label %for.body.preheader ; skip the loop entirely when N == 0
 247
 248 for.body.preheader:                               ; preds = %entry
 249   br label %for.body
 250
 251 for.body:                                         ; preds = %for.body.preheader, %for.body
 252   %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] ; induction variable i, starts at 0
 253   %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ] ; running sum; its start value is undef in this test
 254   %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07
 255   %0 = load i32, i32* %arrayidx, align 4 ; %0 = a[i]
 256   %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07
 257   %1 = load i32, i32* %arrayidx1, align 4 ; %1 = b[i]
 258   %mul = mul nsw i32 %1, %0 ; b[i] * a[i]
 259   %add = add nsw i32 %mul, %Red.06 ; integer reduction step: sum += b[i] * a[i]
 260   %inc = add nuw nsw i32 %i.07, 1 ; i + 1
 261   %exitcond = icmp eq i32 %inc, %N
 262   br i1 %exitcond, label %for.end.loopexit, label %for.body ; leave the loop once i + 1 == N
 263
 264 for.end.loopexit:                                 ; preds = %for.body
 265   %add.lcssa = phi i32 [ %add, %for.body ] ; single-entry phi carrying the final sum out of the loop
 266   br label %for.end
 267
 268 for.end:                                          ; preds = %for.end.loopexit, %entry
 269   %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ] ; undef when the loop was skipped, otherwise the final sum
 270   ret i32 %Red.0.lcssa
 271 }
 272
 273 ; Floating-point loops can be vectorized with fast-math
 274 ; CHECK: Checking a loop in "redf_fast"
 275 ; CHECK: We can vectorize this loop!
 276 define float @redf_fast(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) { ; float reduction with the fast flag on both FP ops: accumulates b[i] * a[i] over i in [0, N) and returns the sum
 277 entry:
 278   %cmp5 = icmp eq i32 %N, 0 ; guard: is the trip count zero?
 279   br i1 %cmp5, label %for.end, label %for.body.preheader ; skip the loop entirely when N == 0
 280
 281 for.body.preheader:                               ; preds = %entry
 282   br label %for.body
 283
 284 for.body:                                         ; preds = %for.body.preheader, %for.body
 285   %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] ; induction variable i, starts at 0
 286   %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ] ; running sum; its start value is undef in this test
 287   %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07
 288   %0 = load float, float* %arrayidx, align 4 ; %0 = a[i]
 289   %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07
 290   %1 = load float, float* %arrayidx1, align 4 ; %1 = b[i]
 291   %mul = fmul fast float %1, %0 ; fast-flagged multiply
 292   %add = fadd fast float %mul, %Red.06 ; fast-flagged FP reduction step: sum += b[i] * a[i]
 293   %inc = add nuw nsw i32 %i.07, 1 ; i + 1
 294   %exitcond = icmp eq i32 %inc, %N
 295   br i1 %exitcond, label %for.end.loopexit, label %for.body ; leave the loop once i + 1 == N
 296
 297 for.end.loopexit:                                 ; preds = %for.body
 298   %add.lcssa = phi float [ %add, %for.body ] ; single-entry phi carrying the final sum out of the loop
 299   br label %for.end
 300
 301 for.end:                                          ; preds = %for.end.loopexit, %entry
 302   %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ] ; undef when the loop was skipped, otherwise the final sum
 303   ret float %Red.0.lcssa
 304 }
 305
 306 ; Make sure calls that turn into builtins are also covered
 307 ; CHECK: Checking a loop in "fabs_fast"
 308 ; CHECK: We can vectorize this loop!
 309 define void @fabs_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) { ; C[i] = fabsf(B[i]) * A[i] for i in [0, N), with the fast flag on both the call and the multiply
 310 entry:
 311   %cmp10 = icmp eq i32 %N, 0 ; guard: is the trip count zero?
 312   br i1 %cmp10, label %for.end, label %for.body ; skip the loop entirely when N == 0 (no separate preheader block here)
 313
 314 for.body:                                         ; preds = %entry, %for.body
 315   %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ] ; induction variable i, starts at 0
 316   %arrayidx = getelementptr inbounds float, float* %A, i32 %i.011
 317   %0 = load float, float* %arrayidx, align 4 ; %0 = A[i]
 318   %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.011
 319   %1 = load float, float* %arrayidx1, align 4 ; %1 = B[i]
 320   %fabsf = tail call fast float @fabsf(float %1) #2 ; fast-flagged call to the declared @fabsf, attribute group #2
 321   %conv3 = fmul fast float %fabsf, %0 ; fast-flagged multiply: fabsf(B[i]) * A[i]
 322   %arrayidx4 = getelementptr inbounds float, float* %C, i32 %i.011
 323   store float %conv3, float* %arrayidx4, align 4 ; C[i] = fabsf(B[i]) * A[i]
 324   %inc = add nuw nsw i32 %i.011, 1 ; i + 1
 325   %exitcond = icmp eq i32 %inc, %N
 326   br i1 %exitcond, label %for.end, label %for.body ; leave the loop once i + 1 == N
 327
 328 for.end:                                          ; preds = %for.body, %entry
 329   ret void
 330 }
 331
 332 declare float @fabsf(float) ; external declaration; called from @fabs and @fabs_fast
 333
 334 attributes #1 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" } ; attached to the fabsf call in @fabs; the no-infs, no-nans and unsafe-fp-math strings are all "false"
 335 attributes #2 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="true" "use-soft-float"="false" } ; attached to the fabsf call in @fabs_fast; the no-infs, no-nans and unsafe-fp-math strings are all "true"