llvm/test/Transforms/VectorCombine/X86/scalarize-cmp.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
   2 ; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s
   3 ; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s
   4
   5 declare void @use(<4 x i32>)
   6 declare void @usef(<4 x float>)
   7
   8 ; Eliminating an insert is profitable.
   9
   10 define <16 x i1> @ins0_ins0_i8(i8 %x, i8 %y) {
   11 ; CHECK-LABEL: @ins0_ins0_i8(
   12 ; CHECK-NEXT:    [[R_SCALAR:%.*]] = icmp eq i8 [[X:%.*]], [[Y:%.*]]
   13 ; CHECK-NEXT:    [[R:%.*]] = insertelement <16 x i1> undef, i1 [[R_SCALAR]], i64 0
   14 ; CHECK-NEXT:    ret <16 x i1> [[R]]
   15 ;
   16   %i0 = insertelement <16 x i8> undef, i8 %x, i32 0  ; %x placed in lane 0 of an undef vector
   17   %i1 = insertelement <16 x i8> undef, i8 %y, i32 0  ; %y placed in the same lane
   18   %r = icmp eq <16 x i8> %i0, %i1  ; expected output: scalar icmp on %x/%y, result inserted at lane 0
   19   ret <16 x i1> %r
   20 }
  21
   22 ; Eliminating an insert is still profitable. Mismatched index types are ok.
  23
   24 define <8 x i1> @ins5_ins5_i16(i16 %x, i16 %y) {
   25 ; CHECK-LABEL: @ins5_ins5_i16(
   26 ; CHECK-NEXT:    [[R_SCALAR:%.*]] = icmp sgt i16 [[X:%.*]], [[Y:%.*]]
   27 ; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i1> undef, i1 [[R_SCALAR]], i64 5
   28 ; CHECK-NEXT:    ret <8 x i1> [[R]]
   29 ;
   30   %i0 = insertelement <8 x i16> undef, i16 %x, i8 5  ; lane index 5 given as i8
   31   %i1 = insertelement <8 x i16> undef, i16 %y, i32 5  ; same lane 5, but index typed i32
   32   %r = icmp sgt <8 x i16> %i0, %i1  ; expected output: scalar icmp sgt, result inserted at lane 5
   33   ret <8 x i1> %r
   34 }
  35
  36 ; The new vector constant is calculated by constant folding.
  37
   38 define <2 x i1> @ins1_ins1_i64(i64 %x, i64 %y) {
   39 ; CHECK-LABEL: @ins1_ins1_i64(
   40 ; CHECK-NEXT:    [[R_SCALAR:%.*]] = icmp sle i64 [[X:%.*]], [[Y:%.*]]
   41 ; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i1> <i1 true, i1 false>, i1 [[R_SCALAR]], i64 1
   42 ; CHECK-NEXT:    ret <2 x i1> [[R]]
   43 ;
   44   %i0 = insertelement <2 x i64> zeroinitializer, i64 %x, i64 1  ; base is all-zero, %x in lane 1
   45   %i1 = insertelement <2 x i64> <i64 1, i64 -1>, i64 %y, i32 1  ; base is <1, -1>, %y in lane 1
   46   %r = icmp sle <2 x i64> %i0, %i1  ; base lanes compare as 0 sle 1 (true) and 0 sle -1 (false)
   47   ret <2 x i1> %r
   48 }
  49
  50 ; The inserts are free, but it's still better to scalarize.
  51
   52 define <2 x i1> @ins0_ins0_f64(double %x, double %y) {
   53 ; CHECK-LABEL: @ins0_ins0_f64(
   54 ; CHECK-NEXT:    [[R_SCALAR:%.*]] = fcmp nnan ninf uge double [[X:%.*]], [[Y:%.*]]
   55 ; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i1> splat (i1 true), i1 [[R_SCALAR]], i64 0
   56 ; CHECK-NEXT:    ret <2 x i1> [[R]]
   57 ;
   58   %i0 = insertelement <2 x double> undef, double %x, i32 0  ; %x in lane 0 of an undef vector
   59   %i1 = insertelement <2 x double> undef, double %y, i32 0  ; %y in lane 0 of an undef vector
   60   %r = fcmp nnan ninf uge <2 x double> %i0, %i1  ; expected output keeps the nnan/ninf flags on the scalar fcmp
   61   ret <2 x i1> %r
   62 }
  63
  64 ; Negative test - mismatched indexes (but could fold this).
  65
   66 define <16 x i1> @ins1_ins0_i8(i8 %x, i8 %y) {
   67 ; CHECK-LABEL: @ins1_ins0_i8(
   68 ; CHECK-NEXT:    [[I0:%.*]] = insertelement <16 x i8> undef, i8 [[X:%.*]], i32 1
   69 ; CHECK-NEXT:    [[I1:%.*]] = insertelement <16 x i8> undef, i8 [[Y:%.*]], i32 0
   70 ; CHECK-NEXT:    [[R:%.*]] = icmp sle <16 x i8> [[I0]], [[I1]]
   71 ; CHECK-NEXT:    ret <16 x i1> [[R]]
   72 ;
   73   %i0 = insertelement <16 x i8> undef, i8 %x, i32 1  ; %x goes into lane 1
   74   %i1 = insertelement <16 x i8> undef, i8 %y, i32 0  ; %y goes into lane 0 - lanes differ
   75   %r = icmp sle <16 x i8> %i0, %i1  ; expected output leaves this vector compare unchanged
   76   ret <16 x i1> %r
   77 }
  78
  79 ; Base vector does not have to be undef.
  80
   81 define <4 x i1> @ins0_ins0_i32(i32 %x, i32 %y) {
   82 ; CHECK-LABEL: @ins0_ins0_i32(
   83 ; CHECK-NEXT:    [[R_SCALAR:%.*]] = icmp ne i32 [[X:%.*]], [[Y:%.*]]
   84 ; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i1> undef, i1 [[R_SCALAR]], i64 0
   85 ; CHECK-NEXT:    ret <4 x i1> [[R]]
   86 ;
   87   %i0 = insertelement <4 x i32> zeroinitializer, i32 %x, i32 0  ; non-undef (all-zero) base vector
   88   %i1 = insertelement <4 x i32> undef, i32 %y, i32 0  ; undef base vector
   89   %r = icmp ne <4 x i32> %i0, %i1  ; expected output: scalar icmp ne inserted into an undef <4 x i1>
   90   ret <4 x i1> %r
   91 }
  92
  93 ; Extra use is accounted for in cost calculation.
  94
   95 define <4 x i1> @ins0_ins0_i32_use(i32 %x, i32 %y) {
   96 ; CHECK-LABEL: @ins0_ins0_i32_use(
   97 ; CHECK-NEXT:    [[I0:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
   98 ; CHECK-NEXT:    call void @use(<4 x i32> [[I0]])
   99 ; CHECK-NEXT:    [[R_SCALAR:%.*]] = icmp ugt i32 [[X]], [[Y:%.*]]
  100 ; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i1> undef, i1 [[R_SCALAR]], i64 0
  101 ; CHECK-NEXT:    ret <4 x i1> [[R]]
  102 ;
  103   %i0 = insertelement <4 x i32> undef, i32 %x, i32 0
  104   call void @use(<4 x i32> %i0)  ; extra use: the %i0 insert must remain in the output
  105   %i1 = insertelement <4 x i32> undef, i32 %y, i32 0  ; only used by the compare
  106   %r = icmp ugt <4 x i32> %i0, %i1  ; expected output still scalarizes the compare
  107   ret <4 x i1> %r
  108 }
 109
 110 ; Extra use is accounted for in cost calculation.
 111
  112 define <4 x i1> @ins1_ins1_f32_use(float %x, float %y) {
  113 ; CHECK-LABEL: @ins1_ins1_f32_use(
  114 ; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x float> undef, float [[Y:%.*]], i32 1
  115 ; CHECK-NEXT:    call void @usef(<4 x float> [[I1]])
  116 ; CHECK-NEXT:    [[R_SCALAR:%.*]] = fcmp ogt float [[X:%.*]], [[Y]]
  117 ; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i1> zeroinitializer, i1 [[R_SCALAR]], i64 1
  118 ; CHECK-NEXT:    ret <4 x i1> [[R]]
  119 ;
  120   %i0 = insertelement <4 x float> undef, float %x, i32 1  ; only used by the compare
  121   %i1 = insertelement <4 x float> undef, float %y, i32 1
  122   call void @usef(<4 x float> %i1)  ; extra use: the %i1 insert must remain in the output
  123   %r = fcmp ogt <4 x float> %i0, %i1  ; expected output still scalarizes; result base is all-false
  124   ret <4 x i1> %r
  125 }
 126
 127 ; If the scalar cmp is not cheaper than the vector cmp, extra uses can prevent the transform.
 128
  129 define <4 x i1> @ins2_ins2_f32_uses(float %x, float %y) {
  130 ; CHECK-LABEL: @ins2_ins2_f32_uses(
  131 ; CHECK-NEXT:    [[I0:%.*]] = insertelement <4 x float> undef, float [[X:%.*]], i32 2
  132 ; CHECK-NEXT:    call void @usef(<4 x float> [[I0]])
  133 ; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x float> undef, float [[Y:%.*]], i32 2
  134 ; CHECK-NEXT:    call void @usef(<4 x float> [[I1]])
  135 ; CHECK-NEXT:    [[R:%.*]] = fcmp oeq <4 x float> [[I0]], [[I1]]
  136 ; CHECK-NEXT:    ret <4 x i1> [[R]]
  137 ;
  138   %i0 = insertelement <4 x float> undef, float %x, i32 2
  139   call void @usef(<4 x float> %i0)  ; extra use of the first insert
  140   %i1 = insertelement <4 x float> undef, float %y, i32 2
  141   call void @usef(<4 x float> %i1)  ; extra use of the second insert as well
  142   %r = fcmp oeq <4 x float> %i0, %i1  ; expected output leaves this vector compare unchanged
  143   ret <4 x i1> %r
  144 }
 145
  146 define <2 x i1> @constant_op1_i64(i64 %x) {
  147 ; CHECK-LABEL: @constant_op1_i64(
  148 ; CHECK-NEXT:    [[R_SCALAR:%.*]] = icmp ne i64 [[X:%.*]], 42
  149 ; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i1> undef, i1 [[R_SCALAR]], i64 0
  150 ; CHECK-NEXT:    ret <2 x i1> [[R]]
  151 ;
  152   %ins = insertelement <2 x i64> undef, i64 %x, i32 0  ; %x in lane 0 of an undef vector
  153   %r = icmp ne <2 x i64> %ins, <i64 42, i64 undef>  ; constant operand 1: lane 0 is 42, the other lane is undef
  154   ret <2 x i1> %r
  155 }
 156
  157 define <2 x i1> @constant_op1_i64_not_undef_lane(i64 %x) {
  158 ; CHECK-LABEL: @constant_op1_i64_not_undef_lane(
  159 ; CHECK-NEXT:    [[R_SCALAR:%.*]] = icmp sge i64 [[X:%.*]], 42
  160 ; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i1> splat (i1 true), i1 [[R_SCALAR]], i64 0
  161 ; CHECK-NEXT:    ret <2 x i1> [[R]]
  162 ;
  163   %ins = insertelement <2 x i64> undef, i64 %x, i32 0  ; %x in lane 0 of an undef vector
  164   %r = icmp sge <2 x i64> %ins, <i64 42, i64 -42>  ; constant operand 1 with a defined value (-42) in the unused lane
  165   ret <2 x i1> %r
  166 }
 167
 168 ; negative test - load prevents the transform
 169
  170 define <2 x i1> @constant_op1_i64_load(ptr %p) {
  171 ; CHECK-LABEL: @constant_op1_i64_load(
  172 ; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[P:%.*]], align 8
  173 ; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[LD]], i32 0
  174 ; CHECK-NEXT:    [[R:%.*]] = icmp eq <2 x i64> [[INS]], <i64 42, i64 -42>
  175 ; CHECK-NEXT:    ret <2 x i1> [[R]]
  176 ;
  177   %ld = load i64, ptr %p  ; the inserted scalar comes from memory rather than an argument
  178   %ins = insertelement <2 x i64> undef, i64 %ld, i32 0  ; loaded value in lane 0
  179   %r = icmp eq <2 x i64> %ins, <i64 42, i64 -42>  ; expected output leaves this vector compare unchanged
  180   ret <2 x i1> %r
  181 }
 182
  183 define <4 x i1> @constant_op0_i32(i32 %x) {
  184 ; CHECK-LABEL: @constant_op0_i32(
  185 ; CHECK-NEXT:    [[R_SCALAR:%.*]] = icmp ult i32 -42, [[X:%.*]]
  186 ; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i1> zeroinitializer, i1 [[R_SCALAR]], i64 1
  187 ; CHECK-NEXT:    ret <4 x i1> [[R]]
  188 ;
  189   %ins = insertelement <4 x i32> undef, i32 %x, i32 1  ; %x in lane 1 of an undef vector
  190   %r = icmp ult <4 x i32> <i32 undef, i32 -42, i32 undef, i32 undef>, %ins  ; constant is operand 0; only lane 1 (-42) is defined
  191   ret <4 x i1> %r
  192 }
 193
  194 define <4 x i1> @constant_op0_i32_not_undef_lane(i32 %x) {
  195 ; CHECK-LABEL: @constant_op0_i32_not_undef_lane(
  196 ; CHECK-NEXT:    [[R_SCALAR:%.*]] = icmp ule i32 42, [[X:%.*]]
  197 ; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i1> splat (i1 true), i1 [[R_SCALAR]], i64 1
  198 ; CHECK-NEXT:    ret <4 x i1> [[R]]
  199 ;
  200   %ins = insertelement <4 x i32> undef, i32 %x, i32 1  ; %x in lane 1 of an undef vector
  201   %r = icmp ule <4 x i32> <i32 1, i32 42, i32 42, i32 -42>, %ins  ; constant is operand 0, all lanes defined; lane 1 (42) feeds the scalar compare
  202   ret <4 x i1> %r
  203 }
 204
  205 define <2 x i1> @constant_op0_f64(double %x) {
  206 ; CHECK-LABEL: @constant_op0_f64(
  207 ; CHECK-NEXT:    [[R_SCALAR:%.*]] = fcmp fast olt double 4.200000e+01, [[X:%.*]]
  208 ; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i1> zeroinitializer, i1 [[R_SCALAR]], i64 0
  209 ; CHECK-NEXT:    ret <2 x i1> [[R]]
  210 ;
  211   %ins = insertelement <2 x double> undef, double %x, i32 0  ; %x in lane 0 of an undef vector
  212   %r = fcmp fast olt <2 x double> <double 42.0, double undef>, %ins  ; constant is operand 0; expected output keeps the fast flag on the scalar fcmp
  213   ret <2 x i1> %r
  214 }
 215
  216 define <2 x i1> @constant_op0_f64_not_undef_lane(double %x) {
  217 ; CHECK-LABEL: @constant_op0_f64_not_undef_lane(
  218 ; CHECK-NEXT:    [[R_SCALAR:%.*]] = fcmp nnan ueq double -4.200000e+01, [[X:%.*]]
  219 ; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i1> splat (i1 true), i1 [[R_SCALAR]], i64 1
  220 ; CHECK-NEXT:    ret <2 x i1> [[R]]
  221 ;
  222   %ins = insertelement <2 x double> undef, double %x, i32 1  ; %x in lane 1 of an undef vector
  223   %r = fcmp nnan ueq <2 x double> <double 42.0, double -42.0>, %ins  ; constant is operand 0; lane 1 (-42.0) feeds the scalar compare
  224   ret <2 x i1> %r
  225 }
 226
  227 define <2 x i1> @constant_op1_f64(double %x) {
  228 ; CHECK-LABEL: @constant_op1_f64(
  229 ; CHECK-NEXT:    [[R_SCALAR:%.*]] = fcmp one double [[X:%.*]], 4.200000e+01
  230 ; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i1> zeroinitializer, i1 [[R_SCALAR]], i64 1
  231 ; CHECK-NEXT:    ret <2 x i1> [[R]]
  232 ;
  233   %ins = insertelement <2 x double> undef, double %x, i32 1  ; %x in lane 1 of an undef vector
  234   %r = fcmp one <2 x double> %ins, <double undef, double 42.0>  ; constant is operand 1; lane 1 (42.0) feeds the scalar compare
  235   ret <2 x i1> %r
  236 }
 237
  238 define <4 x i1> @constant_op1_f32_not_undef_lane(float %x) {
  239 ; CHECK-LABEL: @constant_op1_f32_not_undef_lane(
  240 ; CHECK-NEXT:    [[R_SCALAR:%.*]] = fcmp uge float [[X:%.*]], 4.200000e+01
  241 ; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i1> splat (i1 true), i1 [[R_SCALAR]], i64 0
  242 ; CHECK-NEXT:    ret <4 x i1> [[R]]
  243 ;
  244   %ins = insertelement <4 x float> undef, float %x, i32 0  ; %x in lane 0 of an undef vector
  245   %r = fcmp uge <4 x float> %ins, <float 42.0, float -42.0, float 0.0, float 1.0>  ; constant is operand 1, all lanes defined; lane 0 (42.0) feeds the scalar compare
  246   ret <4 x i1> %r
  247 }
 248
 249 ; negative test - select prevents the transform
 250
  251 define <4 x float> @vec_select_use1(<4 x float> %x, <4 x float> %y, i32 %a, i32 %b) {
  252 ; CHECK-LABEL: @vec_select_use1(
  253 ; CHECK-NEXT:    [[VECA:%.*]] = insertelement <4 x i32> undef, i32 [[A:%.*]], i8 0
  254 ; CHECK-NEXT:    [[VECB:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i8 0
  255 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq <4 x i32> [[VECA]], [[VECB]]
  256 ; CHECK-NEXT:    [[R:%.*]] = select <4 x i1> [[COND]], <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]]
  257 ; CHECK-NEXT:    ret <4 x float> [[R]]
  258 ;
  259   %veca = insertelement <4 x i32> undef, i32 %a, i8 0  ; %a in lane 0
  260   %vecb = insertelement <4 x i32> undef, i32 %b, i8 0  ; %b in lane 0
  261   %cond = icmp eq <4 x i32> %veca, %vecb  ; compare of two inserted scalars, as in the positive tests
  262   %r = select <4 x i1> %cond, <4 x float> %x, <4 x float> %y  ; the mask feeds a vector select; expected output is unchanged
  263   ret <4 x float> %r
  264 }
 265
 266 ; negative test - select prevents the transform
 267
  268 define <4 x float> @vec_select_use2(<4 x float> %x, <4 x float> %y, float %a) {
  269 ; CHECK-LABEL: @vec_select_use2(
  270 ; CHECK-NEXT:    [[VECA:%.*]] = insertelement <4 x float> undef, float [[A:%.*]], i8 0
  271 ; CHECK-NEXT:    [[COND:%.*]] = fcmp oeq <4 x float> [[VECA]], zeroinitializer
  272 ; CHECK-NEXT:    [[R:%.*]] = select <4 x i1> [[COND]], <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]]
  273 ; CHECK-NEXT:    ret <4 x float> [[R]]
  274 ;
  275   %veca = insertelement <4 x float> undef, float %a, i8 0  ; %a in lane 0
  276   %cond = fcmp oeq <4 x float> %veca, zeroinitializer  ; compare of an inserted scalar against a constant vector
  277   %r = select <4 x i1> %cond, <4 x float> %x, <4 x float> %y  ; the mask feeds a vector select; expected output is unchanged
  278   ret <4 x float> %r
  279 }
 280
  281 define <4 x i1> @vector_of_pointers(ptr %t1) {
  282 ; CHECK-LABEL: @vector_of_pointers(
  283 ; CHECK-NEXT:    [[T6_SCALAR:%.*]] = icmp ne ptr [[T1:%.*]], null
  284 ; CHECK-NEXT:    [[T6:%.*]] = insertelement <4 x i1> undef, i1 [[T6_SCALAR]], i64 0
  285 ; CHECK-NEXT:    ret <4 x i1> [[T6]]
  286 ;
  287   %t5 = insertelement <4 x ptr> undef, ptr %t1, i32 0  ; pointer element type; %t1 in lane 0
  288   %t6 = icmp ne <4 x ptr> %t5, zeroinitializer  ; expected output: scalar pointer compare against null
  289   ret <4 x i1> %t6
  290 }