test/Transforms/LoopVectorize/AArch64/predication_costs.ll

   1 ; REQUIRES: asserts
   2 ; RUN: opt < %s -force-vector-width=2 -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s
   3
   4 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
   5 target triple = "aarch64--linux-gnu"
   6
   7 ; Check predication-related cost calculations, including scalarization overhead
   8 ; and block probability scaling. Note that the functionality being tested is
   9 ; not specific to AArch64. We specify a target to get actual values for the
  10 ; instruction costs. Expected costs are truncated integers (e.g. 11 / 2 = 5).
  11
  12 ; CHECK-LABEL: predicated_udiv
  13 ;
  14 ; This test checks that we correctly compute the cost of the predicated udiv
  15 ; instruction. If we assume the block probability is 50%, we compute the cost
  16 ; as:
  17 ;
  18 ; Cost of udiv:
  19 ;   (udiv(2) + extractelement(6) + insertelement(3)) / 2 = 5
  20 ;
  21 ; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
  22 ; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
  23 ;
  24 define i32 @predicated_udiv(i32* %a, i32* %b, i1 %c, i64 %n) { ; sums a[i] udiv b[i] (or b[i] when %c is false) over the loop
  25 entry:
  26   br label %for.body
  27
  28 for.body:
  29   %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ] ; induction variable, starts at 0
  30   %r = phi i32 [ 0, %entry ], [ %tmp6, %for.inc ] ; running sum carried around the loop
  31   %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i ; &a[i]
  32   %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i ; &b[i]
  33   %tmp2 = load i32, i32* %tmp0, align 4 ; a[i], the dividend
  34   %tmp3 = load i32, i32* %tmp1, align 4 ; b[i], the divisor
  35   br i1 %c, label %if.then, label %for.inc ; %c is a function argument, so the guard is loop-invariant
  36
  37 if.then:
  38   %tmp4 = udiv i32 %tmp2, %tmp3 ; the instruction under test; both operands are defined in for.body
  39   br label %for.inc
  40
  41 for.inc:
  42   %tmp5 = phi i32 [ %tmp3, %for.body ], [ %tmp4, %if.then] ; b[i] when the udiv was skipped, else the quotient
  43   %tmp6 = add i32 %r, %tmp5 ; accumulate into the running sum
  44   %i.next = add nuw nsw i64 %i, 1
  45   %cond = icmp slt i64 %i.next, %n ; signed compare, tested after the body has run
  46   br i1 %cond, label %for.body, label %for.end
  47
  48 for.end:
  49   %tmp7 = phi i32 [ %tmp6, %for.inc ] ; single-entry phi carrying the final sum out of the loop
  50   ret i32 %tmp7
  51 }
  52
  53 ; CHECK-LABEL: predicated_store
  54 ;
  55 ; This test checks that we correctly compute the cost of the predicated store
  56 ; instruction. If we assume the block probability is 50%, we compute the cost
  57 ; as:
  58 ;
  59 ; Cost of store:
  60 ;   (store(4) + extractelement(3)) / 2 = 3
  61 ;
  62 ; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4
  63 ; CHECK: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4
  64 ;
  65 define void @predicated_store(i32* %a, i1 %c, i32 %x, i64 %n) { ; writes a[i] + x back to a[i] when %c is true
  66 entry:
  67   br label %for.body
  68
  69 for.body:
  70   %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ] ; induction variable, starts at 0
  71   %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i ; &a[i], used by both the load and the store
  72   %tmp1 = load i32, i32* %tmp0, align 4 ; a[i]
  73   %tmp2 = add nsw i32 %tmp1, %x ; value to store; computed here, outside the conditional block
  74   br i1 %c, label %if.then, label %for.inc ; %c is a function argument, so the guard is loop-invariant
  75
  76 if.then:
  77   store i32 %tmp2, i32* %tmp0, align 4 ; the instruction under test; its value operand comes from for.body
  78   br label %for.inc
  79
  80 for.inc:
  81   %i.next = add nuw nsw i64 %i, 1
  82   %cond = icmp slt i64 %i.next, %n ; signed compare, tested after the body has run
  83   br i1 %cond, label %for.body, label %for.end
  84
  85 for.end:
  86   ret void
  87 }
  88
  89 ; CHECK-LABEL: predicated_udiv_scalarized_operand
  90 ;
  91 ; This test checks that we correctly compute the cost of the predicated udiv
  92 ; instruction and the add instruction it uses. The add is scalarized and sunk
  93 ; inside the predicated block.  If we assume the block probability is 50%, we
  94 ; compute the cost as:
  95 ;
  96 ; Cost of add:
  97 ;   (add(2) + extractelement(3)) / 2 = 2
  98 ; Cost of udiv:
  99 ;   (udiv(2) + extractelement(3) + insertelement(3)) / 2 = 4
 100 ;
 101 ; CHECK: Scalarizing: %tmp3 = add nsw i32 %tmp2, %x
 102 ; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
 103 ; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp3 = add nsw i32 %tmp2, %x
 104 ; CHECK: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
 105 ;
 106 define i32 @predicated_udiv_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) { ; sums a[i] udiv (a[i] + x), or a[i] when %c is false
 107 entry:
 108   br label %for.body
 109
 110 for.body:
 111   %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ] ; induction variable, starts at 0
 112   %r = phi i32 [ 0, %entry ], [ %tmp6, %for.inc ] ; running sum carried around the loop
 113   %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i ; &a[i]
 114   %tmp2 = load i32, i32* %tmp0, align 4 ; a[i]; the only udiv input defined outside if.then
 115   br i1 %c, label %if.then, label %for.inc ; %c is a function argument, so the guard is loop-invariant
 116
 117 if.then:
 118   %tmp3 = add nsw i32 %tmp2, %x ; divisor; its only user is the udiv on the next line
 119   %tmp4 = udiv i32 %tmp2, %tmp3 ; the predicated instruction under test
 120   br label %for.inc
 121
 122 for.inc:
 123   %tmp5 = phi i32 [ %tmp2, %for.body ], [ %tmp4, %if.then] ; a[i] when the udiv was skipped, else the quotient
 124   %tmp6 = add i32 %r, %tmp5 ; accumulate into the running sum
 125   %i.next = add nuw nsw i64 %i, 1
 126   %cond = icmp slt i64 %i.next, %n ; signed compare, tested after the body has run
 127   br i1 %cond, label %for.body, label %for.end
 128
 129 for.end:
 130   %tmp7 = phi i32 [ %tmp6, %for.inc ] ; single-entry phi carrying the final sum out of the loop
 131   ret i32 %tmp7
 132 }
 133
 134 ; CHECK-LABEL: predicated_store_scalarized_operand
 135 ;
 136 ; This test checks that we correctly compute the cost of the predicated store
 137 ; instruction and the add instruction it uses. The add is scalarized and sunk
 138 ; inside the predicated block.  If we assume the block probability is 50%, we
 139 ; compute the cost as:
 140 ;
 141 ; Cost of add:
 142 ;   (add(2) + extractelement(3)) / 2 = 2
 143 ; Cost of store:
 144 ;   store(4) / 2 = 2
 145 ;
 146 ; CHECK: Scalarizing: %tmp2 = add nsw i32 %tmp1, %x
 147 ; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4
 148 ; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = add nsw i32 %tmp1, %x
 149 ; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4
 150 ;
 151 define void @predicated_store_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) { ; writes a[i] + x back to a[i] when %c is true
 152 entry:
 153   br label %for.body
 154
 155 for.body:
 156   %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ] ; induction variable, starts at 0
 157   %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i ; &a[i], used by both the load and the store
 158   %tmp1 = load i32, i32* %tmp0, align 4 ; a[i]
 159   br i1 %c, label %if.then, label %for.inc ; %c is a function argument, so the guard is loop-invariant
 160
 161 if.then:
 162   %tmp2 = add nsw i32 %tmp1, %x ; value to store; computed inside the conditional block, only user is the store
 163   store i32 %tmp2, i32* %tmp0, align 4 ; the predicated instruction under test
 164   br label %for.inc
 165
 166 for.inc:
 167   %i.next = add nuw nsw i64 %i, 1
 168   %cond = icmp slt i64 %i.next, %n ; signed compare, tested after the body has run
 169   br i1 %cond, label %for.body, label %for.end
 170
 171 for.end:
 172   ret void
 173 }
 174
 175 ; CHECK-LABEL: predication_multi_context
 176 ;
 177 ; This test checks that we correctly compute the cost of multiple predicated
 178 ; instructions in the same block. The sdiv, udiv, and store must be scalarized
 179 ; and predicated. The sub feeding the store is scalarized and sunk inside the
 180 ; store's predicated block. However, the add feeding the sdiv and udiv cannot
 181 ; be sunk and is not scalarized. If we assume the block probability is 50%, we
 182 ; compute the cost as:
 183 ;
 184 ; Cost of add:
 185 ;   add(1) = 1
 186 ; Cost of sdiv:
 187 ;   (sdiv(2) + extractelement(6) + insertelement(3)) / 2 = 5
 188 ; Cost of udiv:
 189 ;   (udiv(2) + extractelement(6) + insertelement(3)) / 2 = 5
 190 ; Cost of sub:
 191 ;   (sub(2) + extractelement(3)) / 2 = 2
 192 ; Cost of store:
 193 ;   store(4) / 2 = 2
 194 ;
 195 ; CHECK-NOT: Scalarizing: %tmp2 = add i32 %tmp1, %x
 196 ; CHECK:     Scalarizing and predicating: %tmp3 = sdiv i32 %tmp1, %tmp2
 197 ; CHECK:     Scalarizing and predicating: %tmp4 = udiv i32 %tmp3, %tmp2
 198 ; CHECK:     Scalarizing: %tmp5 = sub i32 %tmp4, %x
 199 ; CHECK:     Scalarizing and predicating: store i32 %tmp5, i32* %tmp0, align 4
 200 ; CHECK:     Found an estimated cost of 1 for VF 2 For instruction: %tmp2 = add i32 %tmp1, %x
 201 ; CHECK:     Found an estimated cost of 5 for VF 2 For instruction: %tmp3 = sdiv i32 %tmp1, %tmp2
 202 ; CHECK:     Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp3, %tmp2
 203 ; CHECK:     Found an estimated cost of 2 for VF 2 For instruction: %tmp5 = sub i32 %tmp4, %x
 204 ; CHECK:     Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp5, i32* %tmp0, align 4
 205 ;
 206 define void @predication_multi_context(i32* %a, i1 %c, i32 %x, i64 %n) { ; chains add, sdiv, udiv, sub and store in one conditional block
 207 entry:
 208   br label %for.body
 209
 210 for.body:
 211   %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ] ; induction variable, starts at 0
 212   %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i ; &a[i], used by both the load and the store
 213   %tmp1 = load i32, i32* %tmp0, align 4 ; a[i]
 214   br i1 %c, label %if.then, label %for.inc ; %c is a function argument, so the guard is loop-invariant
 215
 216 if.then:
 217   %tmp2 = add i32 %tmp1, %x ; has two users: the sdiv and the udiv below
 218   %tmp3 = sdiv i32 %tmp1, %tmp2 ; its result is the dividend of the udiv
 219   %tmp4 = udiv i32 %tmp3, %tmp2 ; its result feeds the sub
 220   %tmp5 = sub i32 %tmp4, %x ; its only user is the store
 221   store i32 %tmp5, i32* %tmp0, align 4 ; writes the final value back to a[i]
 222   br label %for.inc
 223
 224 for.inc:
 225   %i.next = add nuw nsw i64 %i, 1
 226   %cond = icmp slt i64 %i.next, %n ; signed compare, tested after the body has run
 227   br i1 %cond, label %for.body, label %for.end
 228
 229 for.end:
 230   ret void
 231 }