llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll

   1 ; REQUIRES: asserts
   2 ; RUN: opt < %s -force-vector-width=2 -passes=loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s
   3
   4 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
   5 target triple = "aarch64--linux-gnu"
   6
   7 ; Check predication-related cost calculations, including scalarization overhead
   8 ; and block probability scaling. Note that the functionality being tested is
   9 ; not specific to AArch64. We specify a target to get actual values for the
  10 ; instruction costs.
  11
  12 ; CHECK-LABEL: predicated_udiv
  13 ;
  14 ; This test checks that we correctly compute the cost of the predicated udiv
  15 ; instruction. If we assume the block probability is 50%, we compute the cost
  16 ; as:
  17 ;
  18 ; Cost of udiv:
  19 ;   (udiv(2) + extractelement(8) + insertelement(4)) / 2 = 7
  20 ;
  21 ; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
  22 ; CHECK: Found an estimated cost of 7 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
  23 ;
  24 define i32 @predicated_udiv(ptr %a, ptr %b, i1 %c, i64 %n) {
  25 entry:
  26   br label %for.body
  27
  28 for.body:
  29   %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
  30   %r = phi i32 [ 0, %entry ], [ %tmp6, %for.inc ]
  31   %tmp0 = getelementptr inbounds i32, ptr %a, i64 %i
  32   %tmp1 = getelementptr inbounds i32, ptr %b, i64 %i
  33   %tmp2 = load i32, ptr %tmp0, align 4
  34   %tmp3 = load i32, ptr %tmp1, align 4
  35   br i1 %c, label %if.then, label %for.inc
  36
  37 if.then:
  38   %tmp4 = udiv i32 %tmp2, %tmp3
  39   br label %for.inc
  40
  41 for.inc:
  42   %tmp5 = phi i32 [ %tmp3, %for.body ], [ %tmp4, %if.then]
  43   %tmp6 = add i32 %r, %tmp5
  44   %i.next = add nuw nsw i64 %i, 1
  45   %cond = icmp slt i64 %i.next, %n
  46   br i1 %cond, label %for.body, label %for.end
  47
  48 for.end:
  49   %tmp7 = phi i32 [ %tmp6, %for.inc ]
  50   ret i32 %tmp7
  51 }
  52
  53 ; CHECK-LABEL: predicated_store
  54 ;
  55 ; This test checks that we correctly compute the cost of the predicated store
  56 ; instruction. If we assume the block probability is 50%, we compute the cost
  57 ; as:
  58 ;
  59 ; Cost of store:
  60 ;   (store(4) + extractelement(4)) / 2 = 4
  61 ;
  62 ; CHECK: Scalarizing and predicating: store i32 %tmp2, ptr %tmp0, align 4
  63 ; CHECK: Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4
  64 ;
  65 define void @predicated_store(ptr %a, i1 %c, i32 %x, i64 %n) {
  66 entry:
  67   br label %for.body
  68
  69 for.body:
  70   %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
  71   %tmp0 = getelementptr inbounds i32, ptr %a, i64 %i
  72   %tmp1 = load i32, ptr %tmp0, align 4
  73   %tmp2 = add nsw i32 %tmp1, %x
  74   br i1 %c, label %if.then, label %for.inc
  75
  76 if.then:
  77   store i32 %tmp2, ptr %tmp0, align 4
  78   br label %for.inc
  79
  80 for.inc:
  81   %i.next = add nuw nsw i64 %i, 1
  82   %cond = icmp slt i64 %i.next, %n
  83   br i1 %cond, label %for.body, label %for.end
  84
  85 for.end:
  86   ret void
  87 }
  88
  89 ; CHECK-LABEL: predicated_store_phi
  90 ;
  91 ; Same as predicate_store except we use a pointer PHI to maintain the address
  92 ;
  93 ; CHECK: Found scalar instruction:   %addr = phi ptr [ %a, %entry ], [ %addr.next, %for.inc ]
  94 ; CHECK: Found scalar instruction:   %addr.next = getelementptr inbounds i32, ptr %addr, i64 1
  95 ; CHECK: Scalarizing and predicating: store i32 %tmp2, ptr %addr, align 4
  96 ; CHECK: Found an estimated cost of 0 for VF 2 For instruction:   %addr = phi ptr [ %a, %entry ], [ %addr.next, %for.inc ]
  97 ; CHECK: Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp2, ptr %addr, align 4
  98 ;
  99 define void @predicated_store_phi(ptr %a, i1 %c, i32 %x, i64 %n) {
 100 entry:
 101   br label %for.body
 102
 103 for.body:
 104   %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
 105   %addr = phi ptr [ %a, %entry ], [ %addr.next, %for.inc ]
 106   %tmp1 = load i32, ptr %addr, align 4
 107   %tmp2 = add nsw i32 %tmp1, %x
 108   br i1 %c, label %if.then, label %for.inc
 109
 110 if.then:
 111   store i32 %tmp2, ptr %addr, align 4
 112   br label %for.inc
 113
 114 for.inc:
 115   %i.next = add nuw nsw i64 %i, 1
 116   %cond = icmp slt i64 %i.next, %n
 117   %addr.next = getelementptr inbounds i32, ptr %addr, i64 1
 118   br i1 %cond, label %for.body, label %for.end
 119
 120 for.end:
 121   ret void
 122 }
 123
 124 ; CHECK-LABEL: predicated_udiv_scalarized_operand
 125 ;
 126 ; This test checks that we correctly compute the cost of the predicated udiv
 127 ; instruction and the add instruction it uses. The add is scalarized and sunk
 128 ; inside the predicated block.  If we assume the block probability is 50%, we
 129 ; compute the cost as:
 130 ;
 131 ; Cost of add:
 132 ;   (add(2) + extractelement(4)) / 2 = 3
 133 ; Cost of udiv:
 134 ;   (udiv(2) + extractelement(4) + insertelement(4)) / 2 = 5
 135 ;
 136 ; CHECK: Scalarizing: %tmp3 = add nsw i32 %tmp2, %x
 137 ; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
 138 ; CHECK: Found an estimated cost of 3 for VF 2 For instruction: %tmp3 = add nsw i32 %tmp2, %x
 139 ; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
 140 ;
 141 define i32 @predicated_udiv_scalarized_operand(ptr %a, i1 %c, i32 %x, i64 %n) {
 142 entry:
 143   br label %for.body
 144
 145 for.body:
 146   %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
 147   %r = phi i32 [ 0, %entry ], [ %tmp6, %for.inc ]
 148   %tmp0 = getelementptr inbounds i32, ptr %a, i64 %i
 149   %tmp2 = load i32, ptr %tmp0, align 4
 150   br i1 %c, label %if.then, label %for.inc
 151
 152 if.then:
 153   %tmp3 = add nsw i32 %tmp2, %x
 154   %tmp4 = udiv i32 %tmp2, %tmp3
 155   br label %for.inc
 156
 157 for.inc:
 158   %tmp5 = phi i32 [ %tmp2, %for.body ], [ %tmp4, %if.then]
 159   %tmp6 = add i32 %r, %tmp5
 160   %i.next = add nuw nsw i64 %i, 1
 161   %cond = icmp slt i64 %i.next, %n
 162   br i1 %cond, label %for.body, label %for.end
 163
 164 for.end:
 165   %tmp7 = phi i32 [ %tmp6, %for.inc ]
 166   ret i32 %tmp7
 167 }
 168
 169 ; CHECK-LABEL: predicated_store_scalarized_operand
 170 ;
 171 ; This test checks that we correctly compute the cost of the predicated store
 172 ; instruction and the add instruction it uses. The add is scalarized and sunk
 173 ; inside the predicated block.  If we assume the block probability is 50%, we
 174 ; compute the cost as:
 175 ;
 176 ; Cost of add:
 177 ;   (add(2) + extractelement(4)) / 2 = 3
 178 ; Cost of store:
 179 ;   store(4) / 2 = 2
 180 ;
 181 ; CHECK: Scalarizing: %tmp2 = add nsw i32 %tmp1, %x
 182 ; CHECK: Scalarizing and predicating: store i32 %tmp2, ptr %tmp0, align 4
 183 ; CHECK: Found an estimated cost of 3 for VF 2 For instruction: %tmp2 = add nsw i32 %tmp1, %x
 184 ; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4
 185 ;
 186 define void @predicated_store_scalarized_operand(ptr %a, i1 %c, i32 %x, i64 %n) {
 187 entry:
 188   br label %for.body
 189
 190 for.body:
 191   %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
 192   %tmp0 = getelementptr inbounds i32, ptr %a, i64 %i
 193   %tmp1 = load i32, ptr %tmp0, align 4
 194   br i1 %c, label %if.then, label %for.inc
 195
 196 if.then:
 197   %tmp2 = add nsw i32 %tmp1, %x
 198   store i32 %tmp2, ptr %tmp0, align 4
 199   br label %for.inc
 200
 201 for.inc:
 202   %i.next = add nuw nsw i64 %i, 1
 203   %cond = icmp slt i64 %i.next, %n
 204   br i1 %cond, label %for.body, label %for.end
 205
 206 for.end:
 207   ret void
 208 }
 209
 210 ; CHECK-LABEL: predication_multi_context
 211 ;
 212 ; This test checks that we correctly compute the cost of multiple predicated
 213 ; instructions in the same block. The sdiv, udiv, and store must be scalarized
 214 ; and predicated. The sub feeding the store is scalarized and sunk inside the
 215 ; store's predicated block. However, the add feeding the sdiv and udiv cannot
 216 ; be sunk and is not scalarized. If we assume the block probability is 50%, we
 217 ; compute the cost as:
 218 ;
 219 ; Cost of add:
 220 ;   add(1) = 1
 221 ; Cost of sdiv:
 222 ;   (sdiv(2) + extractelement(8) + insertelement(4)) / 2 = 7
 223 ; Cost of udiv:
 224 ;   (udiv(2) + extractelement(8) + insertelement(4)) / 2 = 7
 225 ; Cost of sub:
 226 ;   (sub(2) + extractelement(4)) / 2 = 3
 227 ; Cost of store:
 228 ;   store(4) / 2 = 2
 229 ;
 230 ; CHECK-NOT: Scalarizing: %tmp2 = add i32 %tmp1, %x
 231 ; CHECK:     Scalarizing and predicating: %tmp3 = sdiv i32 %tmp1, %tmp2
 232 ; CHECK:     Scalarizing and predicating: %tmp4 = udiv i32 %tmp3, %tmp2
 233 ; CHECK:     Scalarizing: %tmp5 = sub i32 %tmp4, %x
 234 ; CHECK:     Scalarizing and predicating: store i32 %tmp5, ptr %tmp0, align 4
 235 ; CHECK:     Found an estimated cost of 1 for VF 2 For instruction: %tmp2 = add i32 %tmp1, %x
 236 ; CHECK:     Found an estimated cost of 7 for VF 2 For instruction: %tmp3 = sdiv i32 %tmp1, %tmp2
 237 ; CHECK:     Found an estimated cost of 7 for VF 2 For instruction: %tmp4 = udiv i32 %tmp3, %tmp2
 238 ; CHECK:     Found an estimated cost of 3 for VF 2 For instruction: %tmp5 = sub i32 %tmp4, %x
 239 ; CHECK:     Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp5, ptr %tmp0, align 4
 240 ;
 241 define void @predication_multi_context(ptr %a, i1 %c, i32 %x, i64 %n) {
 242 entry:
 243   br label %for.body
 244
 245 for.body:
 246   %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
 247   %tmp0 = getelementptr inbounds i32, ptr %a, i64 %i
 248   %tmp1 = load i32, ptr %tmp0, align 4
 249   br i1 %c, label %if.then, label %for.inc
 250
 251 if.then:
 252   %tmp2 = add i32 %tmp1, %x
 253   %tmp3 = sdiv i32 %tmp1, %tmp2
 254   %tmp4 = udiv i32 %tmp3, %tmp2
 255   %tmp5 = sub i32 %tmp4, %x
 256   store i32 %tmp5, ptr %tmp0, align 4
 257   br label %for.inc
 258
 259 for.inc:
 260   %i.next = add nuw nsw i64 %i, 1
 261   %cond = icmp slt i64 %i.next, %n
 262   br i1 %cond, label %for.body, label %for.end
 263
 264 for.end:
 265   ret void
 266 }