; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx512fp16 -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -machine-combiner-verify-pattern-order=true < %s | FileCheck %s

; Incremental updates of the instruction depths should be enough for this test
; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx512fp16 -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -machine-combiner-inc-threshold=0 < %s | FileCheck %s

; Verify that the first two adds are independent regardless of how the inputs are
; commuted. The destination registers are used as source registers for the third add.

define half @reassociate_adds1(half %x0, half %x1, half %x2, half %x3) {
; CHECK-LABEL: reassociate_adds1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddsh %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddsh %xmm3, %xmm2, %xmm1
; CHECK-NEXT:    vaddsh %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %t0 = fadd reassoc nsz half %x0, %x1
  %t1 = fadd reassoc nsz half %t0, %x2
  %t2 = fadd reassoc nsz half %t1, %x3
  ret half %t2
}
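
; An illustrative sketch of the rewrite being checked (the pattern the machine
; combiner actually picks is cost-model driven, so this is a sketch rather than
; a specification):
;
;   t2 = ((x0 + x1) + x2) + x3      ; serial chain, critical path of 3 adds
;     -> (x0 + x1) + (x2 + x3)      ; balanced tree, critical path of 2 adds
;
; The two leaf adds are independent, which is what the first two vaddsh
; instructions above demonstrate.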

define half @reassociate_adds2(half %x0, half %x1, half %x2, half %x3) {
; CHECK-LABEL: reassociate_adds2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddsh %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddsh %xmm3, %xmm2, %xmm1
; CHECK-NEXT:    vaddsh %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %t0 = fadd reassoc nsz half %x0, %x1
  %t1 = fadd reassoc nsz half %x2, %t0
  %t2 = fadd reassoc nsz half %t1, %x3
  ret half %t2
}

define half @reassociate_adds3(half %x0, half %x1, half %x2, half %x3) {
; CHECK-LABEL: reassociate_adds3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddsh %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddsh %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vaddsh %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %t0 = fadd reassoc nsz half %x0, %x1
  %t1 = fadd reassoc nsz half %t0, %x2
  %t2 = fadd reassoc nsz half %x3, %t1
  ret half %t2
}

define half @reassociate_adds4(half %x0, half %x1, half %x2, half %x3) {
; CHECK-LABEL: reassociate_adds4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddsh %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddsh %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vaddsh %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %t0 = fadd reassoc nsz half %x0, %x1
  %t1 = fadd reassoc nsz half %x2, %t0
  %t2 = fadd reassoc nsz half %x3, %t1
  ret half %t2
}
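
; Taken together, reassociate_adds1 through reassociate_adds4 cover the four
; ways the accumulated value can be commuted into the second and third adds,
; so the combiner's operand matching is exercised for every input order.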

; Verify that we reassociate some of these ops. The optimal balanced tree of adds is not
; produced because that would cost more compile time.

define half @reassociate_adds5(half %x0, half %x1, half %x2, half %x3, half %x4, half %x5, half %x6, half %x7) {
; CHECK-LABEL: reassociate_adds5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddsh %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddsh %xmm3, %xmm2, %xmm1
; CHECK-NEXT:    vaddsh %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddsh %xmm5, %xmm4, %xmm1
; CHECK-NEXT:    vaddsh %xmm6, %xmm1, %xmm1
; CHECK-NEXT:    vaddsh %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddsh %xmm7, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %t0 = fadd reassoc nsz half %x0, %x1
  %t1 = fadd reassoc nsz half %t0, %x2
  %t2 = fadd reassoc nsz half %t1, %x3
  %t3 = fadd reassoc nsz half %t2, %x4
  %t4 = fadd reassoc nsz half %t3, %x5
  %t5 = fadd reassoc nsz half %t4, %x6
  %t6 = fadd reassoc nsz half %t5, %x7
  ret half %t6
}
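
; Per the CHECK lines above, the 7-add chain is rewritten to
;
;   (((x0 + x1) + (x2 + x3)) + ((x4 + x5) + x6)) + x7
;
; a tree of depth 4. A fully balanced tree over the eight operands would have
; depth 3, but the combiner only applies local rewrites; that is the
; compile-time trade-off referred to in the comment above.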

; Verify that we only need two associative operations to reassociate the operands.
; Also, we should reassociate such that the result of the high latency division
; is used by the final 'add' rather than reassociating the %x3 operand with the
; division. The latter reassociation would not improve anything.

define half @reassociate_adds6(half %x0, half %x1, half %x2, half %x3) {
; CHECK-LABEL: reassociate_adds6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdivsh %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddsh %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vaddsh %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %t0 = fdiv reassoc nsz half %x0, %x1
  %t1 = fadd reassoc nsz half %x2, %t0
  %t2 = fadd reassoc nsz half %x3, %t1
  ret half %t2
}
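
; A sketch of why this shape wins: in the source order both adds wait on the
; slow division, while the reassociated form overlaps the independent add with
; it, so only the final vaddsh depends on the fdiv result.
;
;   ((x0 / x1) + x2) + x3
;     -> (x3 + x2) + (x0 / x1)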

; Verify that AVX512FP16 scalar half-precision multiplies are reassociated.

define half @reassociate_muls1(half %x0, half %x1, half %x2, half %x3) {
; CHECK-LABEL: reassociate_muls1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdivsh %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmulsh %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vmulsh %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %t0 = fdiv reassoc nsz half %x0, %x1
  %t1 = fmul reassoc nsz half %x2, %t0
  %t2 = fmul reassoc nsz half %x3, %t1
  ret half %t2
}

; Verify that AVX512FP16 128-bit vector half-precision adds are reassociated.

define <8 x half> @reassociate_adds_v8f16(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, <8 x half> %x3) {
; CHECK-LABEL: reassociate_adds_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdivph %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddph %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vaddph %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %t0 = fdiv reassoc nsz <8 x half> %x0, %x1
  %t1 = fadd reassoc nsz <8 x half> %x2, %t0
  %t2 = fadd reassoc nsz <8 x half> %x3, %t1
  ret <8 x half> %t2
}

; Verify that AVX512FP16 128-bit vector half-precision multiplies are reassociated.

define <8 x half> @reassociate_muls_v8f16(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, <8 x half> %x3) {
; CHECK-LABEL: reassociate_muls_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddph %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmulph %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vmulph %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %t0 = fadd reassoc nsz <8 x half> %x0, %x1
  %t1 = fmul reassoc nsz <8 x half> %x2, %t0
  %t2 = fmul reassoc nsz <8 x half> %x3, %t1
  ret <8 x half> %t2
}

; Verify that AVX512FP16 256-bit vector half-precision adds are reassociated.

define <16 x half> @reassociate_adds_v16f16(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, <16 x half> %x3) {
; CHECK-LABEL: reassociate_adds_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdivph %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vaddph %ymm2, %ymm3, %ymm1
; CHECK-NEXT:    vaddph %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %t0 = fdiv reassoc nsz <16 x half> %x0, %x1
  %t1 = fadd reassoc nsz <16 x half> %x2, %t0
  %t2 = fadd reassoc nsz <16 x half> %x3, %t1
  ret <16 x half> %t2
}

; Verify that AVX512FP16 256-bit vector half-precision multiplies are reassociated.

define <16 x half> @reassociate_muls_v16f16(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, <16 x half> %x3) {
; CHECK-LABEL: reassociate_muls_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddph %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vmulph %ymm2, %ymm3, %ymm1
; CHECK-NEXT:    vmulph %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %t0 = fadd reassoc nsz <16 x half> %x0, %x1
  %t1 = fmul reassoc nsz <16 x half> %x2, %t0
  %t2 = fmul reassoc nsz <16 x half> %x3, %t1
  ret <16 x half> %t2
}

; Verify that AVX512 512-bit vector half-precision adds are reassociated.

define <32 x half> @reassociate_adds_v32f16(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, <32 x half> %x3) {
; CHECK-LABEL: reassociate_adds_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdivph %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vaddph %zmm2, %zmm3, %zmm1
; CHECK-NEXT:    vaddph %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %t0 = fdiv reassoc nsz <32 x half> %x0, %x1
  %t1 = fadd reassoc nsz <32 x half> %x2, %t0
  %t2 = fadd reassoc nsz <32 x half> %x3, %t1
  ret <32 x half> %t2
}

; Verify that AVX512 512-bit vector half-precision multiplies are reassociated.

define <32 x half> @reassociate_muls_v32f16(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, <32 x half> %x3) {
; CHECK-LABEL: reassociate_muls_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddph %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vmulph %zmm2, %zmm3, %zmm1
; CHECK-NEXT:    vmulph %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %t0 = fadd reassoc nsz <32 x half> %x0, %x1
  %t1 = fmul reassoc nsz <32 x half> %x2, %t0
  %t2 = fmul reassoc nsz <32 x half> %x3, %t1
  ret <32 x half> %t2
}

; Verify that AVX512FP16 scalar half-precision minimum ops are reassociated.

define half @reassociate_mins_half(half %x0, half %x1, half %x2, half %x3) {
; CHECK-LABEL: reassociate_mins_half:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdivsh %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vminsh %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vminsh %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %t0 = fdiv half %x0, %x1
  %cmp1 = fcmp olt half %x2, %t0
  %sel1 = select i1 %cmp1, half %x2, half %t0
  %cmp2 = fcmp olt half %x3, %sel1
  %sel2 = select i1 %cmp2, half %x3, half %sel1
  ret half %sel2
}
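
; Note that the min/max tests carry no per-instruction fast-math flags. The
; fcmp+select idiom is lowered to vminsh/vmaxsh, and the reassociation here
; appears to rely on the -enable-no-nans-fp-math and
; -enable-no-signed-zeros-fp-math options on the RUN lines above rather than
; on IR-level 'reassoc' flags.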

; Verify that AVX512FP16 scalar half-precision maximum ops are reassociated.

define half @reassociate_maxs_half(half %x0, half %x1, half %x2, half %x3) {
; CHECK-LABEL: reassociate_maxs_half:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdivsh %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmaxsh %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vmaxsh %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %t0 = fdiv half %x0, %x1
  %cmp1 = fcmp ogt half %x2, %t0
  %sel1 = select i1 %cmp1, half %x2, half %t0
  %cmp2 = fcmp ogt half %x3, %sel1
  %sel2 = select i1 %cmp2, half %x3, half %sel1
  ret half %sel2
}

; Verify that AVX512FP16 128-bit vector half-precision minimum ops are reassociated.

define <8 x half> @reassociate_mins_v8f16(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, <8 x half> %x3) {
; CHECK-LABEL: reassociate_mins_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddph %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vminph %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vminph %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %t0 = fadd <8 x half> %x0, %x1
  %cmp1 = fcmp olt <8 x half> %x2, %t0
  %sel1 = select <8 x i1> %cmp1, <8 x half> %x2, <8 x half> %t0
  %cmp2 = fcmp olt <8 x half> %x3, %sel1
  %sel2 = select <8 x i1> %cmp2, <8 x half> %x3, <8 x half> %sel1
  ret <8 x half> %sel2
}

; Verify that AVX512FP16 128-bit vector half-precision maximum ops are reassociated.

define <8 x half> @reassociate_maxs_v8f16(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, <8 x half> %x3) {
; CHECK-LABEL: reassociate_maxs_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddph %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmaxph %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vmaxph %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %t0 = fadd <8 x half> %x0, %x1
  %cmp1 = fcmp ogt <8 x half> %x2, %t0
  %sel1 = select <8 x i1> %cmp1, <8 x half> %x2, <8 x half> %t0
  %cmp2 = fcmp ogt <8 x half> %x3, %sel1
  %sel2 = select <8 x i1> %cmp2, <8 x half> %x3, <8 x half> %sel1
  ret <8 x half> %sel2
}

; Verify that AVX512FP16 256-bit vector half-precision minimum ops are reassociated.

define <16 x half> @reassociate_mins_v16f16(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, <16 x half> %x3) {
; CHECK-LABEL: reassociate_mins_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddph %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vminph %ymm2, %ymm3, %ymm1
; CHECK-NEXT:    vminph %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %t0 = fadd <16 x half> %x0, %x1
  %cmp1 = fcmp olt <16 x half> %x2, %t0
  %sel1 = select <16 x i1> %cmp1, <16 x half> %x2, <16 x half> %t0
  %cmp2 = fcmp olt <16 x half> %x3, %sel1
  %sel2 = select <16 x i1> %cmp2, <16 x half> %x3, <16 x half> %sel1
  ret <16 x half> %sel2
}

; Verify that AVX512FP16 256-bit vector half-precision maximum ops are reassociated.

define <16 x half> @reassociate_maxs_v16f16(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, <16 x half> %x3) {
; CHECK-LABEL: reassociate_maxs_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddph %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vmaxph %ymm2, %ymm3, %ymm1
; CHECK-NEXT:    vmaxph %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %t0 = fadd <16 x half> %x0, %x1
  %cmp1 = fcmp ogt <16 x half> %x2, %t0
  %sel1 = select <16 x i1> %cmp1, <16 x half> %x2, <16 x half> %t0
  %cmp2 = fcmp ogt <16 x half> %x3, %sel1
  %sel2 = select <16 x i1> %cmp2, <16 x half> %x3, <16 x half> %sel1
  ret <16 x half> %sel2
}

; Verify that AVX512 512-bit vector half-precision minimum ops are reassociated.

define <32 x half> @reassociate_mins_v32f16(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, <32 x half> %x3) {
; CHECK-LABEL: reassociate_mins_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddph %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vminph %zmm2, %zmm3, %zmm1
; CHECK-NEXT:    vminph %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %t0 = fadd <32 x half> %x0, %x1
  %cmp1 = fcmp olt <32 x half> %x2, %t0
  %sel1 = select <32 x i1> %cmp1, <32 x half> %x2, <32 x half> %t0
  %cmp2 = fcmp olt <32 x half> %x3, %sel1
  %sel2 = select <32 x i1> %cmp2, <32 x half> %x3, <32 x half> %sel1
  ret <32 x half> %sel2
}

; Verify that AVX512 512-bit vector half-precision maximum ops are reassociated.

define <32 x half> @reassociate_maxs_v32f16(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, <32 x half> %x3) {
; CHECK-LABEL: reassociate_maxs_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddph %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vmaxph %zmm2, %zmm3, %zmm1
; CHECK-NEXT:    vmaxph %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %t0 = fadd <32 x half> %x0, %x1
  %cmp1 = fcmp ogt <32 x half> %x2, %t0
  %sel1 = select <32 x i1> %cmp1, <32 x half> %x2, <32 x half> %t0
  %cmp2 = fcmp ogt <32 x half> %x3, %sel1
  %sel2 = select <32 x i1> %cmp2, <32 x half> %x3, <32 x half> %sel1
  ret <32 x half> %sel2
}