llvm/test/CodeGen/X86/avx512fp16-combine-vfmac-fadd.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown --fp-contract=fast --enable-no-signed-zeros-fp-math -mattr=avx512fp16 | FileCheck %s --check-prefixes=CHECK,NO-SZ
   3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown --fp-contract=fast -mattr=avx512fp16 | FileCheck %s --check-prefixes=CHECK,HAS-SZ
   4
   5 ; FADD(acc, FMA(a, b, +0.0)) can be combined to FMA(a, b, acc) if the nsz flag set.
   6 define dso_local <32 x half> @test1(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {
   7 ; NO-SZ-LABEL: test1:
   8 ; NO-SZ:       # %bb.0: # %entry
   9 ; NO-SZ-NEXT:    vfcmaddcph %zmm2, %zmm1, %zmm0
  10 ; NO-SZ-NEXT:    retq
  11 ;
  12 ; HAS-SZ-LABEL: test1:
  13 ; HAS-SZ:       # %bb.0: # %entry
  14 ; HAS-SZ-NEXT:    vxorps %xmm3, %xmm3, %xmm3
  15 ; HAS-SZ-NEXT:    vfcmaddcph %zmm2, %zmm1, %zmm3
  16 ; HAS-SZ-NEXT:    vaddph %zmm0, %zmm3, %zmm0
  17 ; HAS-SZ-NEXT:    retq
  18 entry:
  19   %0 = bitcast <32 x half> %a to <16 x float>
  20   %1 = bitcast <32 x half> %b to <16 x float>
  21   %2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4)
  22   %3 = bitcast <16 x float> %2 to <32 x half>
  23   %add.i = fadd <32 x half> %3, %acc
  24   ret <32 x half> %add.i
  25 }
  26
  27 define dso_local <32 x half> @test2(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {
  28 ; NO-SZ-LABEL: test2:
  29 ; NO-SZ:       # %bb.0: # %entry
  30 ; NO-SZ-NEXT:    vfmaddcph %zmm2, %zmm1, %zmm0
  31 ; NO-SZ-NEXT:    retq
  32 ;
  33 ; HAS-SZ-LABEL: test2:
  34 ; HAS-SZ:       # %bb.0: # %entry
  35 ; HAS-SZ-NEXT:    vxorps %xmm3, %xmm3, %xmm3
  36 ; HAS-SZ-NEXT:    vfmaddcph %zmm2, %zmm1, %zmm3
  37 ; HAS-SZ-NEXT:    vaddph %zmm0, %zmm3, %zmm0
  38 ; HAS-SZ-NEXT:    retq
  39 entry:
  40   %0 = bitcast <32 x half> %a to <16 x float>
  41   %1 = bitcast <32 x half> %b to <16 x float>
  42   %2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4)
  43   %3 = bitcast <16 x float> %2 to <32 x half>
  44   %add.i = fadd <32 x half> %3, %acc
  45   ret <32 x half> %add.i
  46 }
  47
  48 define dso_local <16 x half> @test3(<16 x half> %acc, <16 x half> %a, <16 x half> %b) {
  49 ; NO-SZ-LABEL: test3:
  50 ; NO-SZ:       # %bb.0: # %entry
  51 ; NO-SZ-NEXT:    vfcmaddcph %ymm2, %ymm1, %ymm0
  52 ; NO-SZ-NEXT:    retq
  53 ;
  54 ; HAS-SZ-LABEL: test3:
  55 ; HAS-SZ:       # %bb.0: # %entry
  56 ; HAS-SZ-NEXT:    vxorps %xmm3, %xmm3, %xmm3
  57 ; HAS-SZ-NEXT:    vfcmaddcph %ymm2, %ymm1, %ymm3
  58 ; HAS-SZ-NEXT:    vaddph %ymm0, %ymm3, %ymm0
  59 ; HAS-SZ-NEXT:    retq
  60 entry:
  61   %0 = bitcast <16 x half> %a to <8 x float>
  62   %1 = bitcast <16 x half> %b to <8 x float>
  63   %2 = tail call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> zeroinitializer, i8 -1)
  64   %3 = bitcast <8 x float> %2 to <16 x half>
  65   %add.i = fadd <16 x half> %3, %acc
  66   ret <16 x half> %add.i
  67 }
  68
  69 define dso_local <16 x half> @test4(<16 x half> %acc, <16 x half> %a, <16 x half> %b) {
  70 ; NO-SZ-LABEL: test4:
  71 ; NO-SZ:       # %bb.0: # %entry
  72 ; NO-SZ-NEXT:    vfmaddcph %ymm2, %ymm1, %ymm0
  73 ; NO-SZ-NEXT:    retq
  74 ;
  75 ; HAS-SZ-LABEL: test4:
  76 ; HAS-SZ:       # %bb.0: # %entry
  77 ; HAS-SZ-NEXT:    vxorps %xmm3, %xmm3, %xmm3
  78 ; HAS-SZ-NEXT:    vfmaddcph %ymm2, %ymm1, %ymm3
  79 ; HAS-SZ-NEXT:    vaddph %ymm0, %ymm3, %ymm0
  80 ; HAS-SZ-NEXT:    retq
  81 entry:
  82   %0 = bitcast <16 x half> %a to <8 x float>
  83   %1 = bitcast <16 x half> %b to <8 x float>
  84   %2 = tail call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> zeroinitializer, i8 -1)
  85   %3 = bitcast <8 x float> %2 to <16 x half>
  86   %add.i = fadd <16 x half> %3, %acc
  87   ret <16 x half> %add.i
  88 }
  89
  90 define dso_local <8 x half> @test5(<8 x half> %acc, <8 x half> %a, <8 x half> %b) {
  91 ; NO-SZ-LABEL: test5:
  92 ; NO-SZ:       # %bb.0: # %entry
  93 ; NO-SZ-NEXT:    vfcmaddcph %xmm2, %xmm1, %xmm0
  94 ; NO-SZ-NEXT:    retq
  95 ;
  96 ; HAS-SZ-LABEL: test5:
  97 ; HAS-SZ:       # %bb.0: # %entry
  98 ; HAS-SZ-NEXT:    vxorps %xmm3, %xmm3, %xmm3
  99 ; HAS-SZ-NEXT:    vfcmaddcph %xmm2, %xmm1, %xmm3
 100 ; HAS-SZ-NEXT:    vaddph %xmm0, %xmm3, %xmm0
 101 ; HAS-SZ-NEXT:    retq
 102 entry:
 103   %0 = bitcast <8 x half> %a to <4 x float>
 104   %1 = bitcast <8 x half> %b to <4 x float>
 105   %2 = tail call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> zeroinitializer, i8 -1)
 106   %3 = bitcast <4 x float> %2 to <8 x half>
 107   %add.i = fadd <8 x half> %3, %acc
 108   ret <8 x half> %add.i
 109 }
 110
 111 define dso_local <8 x half> @test6(<8 x half> %acc, <8 x half> %a, <8 x half> %b) {
 112 ; NO-SZ-LABEL: test6:
 113 ; NO-SZ:       # %bb.0: # %entry
 114 ; NO-SZ-NEXT:    vfmaddcph %xmm2, %xmm1, %xmm0
 115 ; NO-SZ-NEXT:    retq
 116 ;
 117 ; HAS-SZ-LABEL: test6:
 118 ; HAS-SZ:       # %bb.0: # %entry
 119 ; HAS-SZ-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 120 ; HAS-SZ-NEXT:    vfmaddcph %xmm2, %xmm1, %xmm3
 121 ; HAS-SZ-NEXT:    vaddph %xmm0, %xmm3, %xmm0
 122 ; HAS-SZ-NEXT:    retq
 123 entry:
 124   %0 = bitcast <8 x half> %a to <4 x float>
 125   %1 = bitcast <8 x half> %b to <4 x float>
 126   %2 = tail call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> zeroinitializer, i8 -1)
 127   %3 = bitcast <4 x float> %2 to <8 x half>
 128   %add.i = fadd <8 x half> %3, %acc
 129   ret <8 x half> %add.i
 130 }
 131
 132 ; FADD(acc, FMA(a, b, -0.0)) can be combined to FMA(a, b, acc) no matter if the nsz flag set.
 133 define dso_local <32 x half> @test13(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {
 134 ; CHECK-LABEL: test13:
 135 ; CHECK:       # %bb.0: # %entry
 136 ; CHECK-NEXT:    vfcmaddcph %zmm2, %zmm1, %zmm0
 137 ; CHECK-NEXT:    retq
 138 entry:
 139   %0 = bitcast <32 x half> %a to <16 x float>
 140   %1 = bitcast <32 x half> %b to <16 x float>
 141   %2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i16 -1, i32 4)
 142   %3 = bitcast <16 x float> %2 to <32 x half>
 143   %add.i = fadd <32 x half> %3, %acc
 144   ret <32 x half> %add.i
 145 }
 146
 147 define dso_local <32 x half> @test14(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {
 148 ; CHECK-LABEL: test14:
 149 ; CHECK:       # %bb.0: # %entry
 150 ; CHECK-NEXT:    vfmaddcph %zmm2, %zmm1, %zmm0
 151 ; CHECK-NEXT:    retq
 152 entry:
 153   %0 = bitcast <32 x half> %a to <16 x float>
 154   %1 = bitcast <32 x half> %b to <16 x float>
 155   %2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i16 -1, i32 4)
 156   %3 = bitcast <16 x float> %2 to <32 x half>
 157   %add.i = fadd <32 x half> %3, %acc
 158   ret <32 x half> %add.i
 159 }
 160
 161 define dso_local <16 x half> @test15(<16 x half> %acc, <16 x half> %a, <16 x half> %b) {
 162 ; CHECK-LABEL: test15:
 163 ; CHECK:       # %bb.0: # %entry
 164 ; CHECK-NEXT:    vfcmaddcph %ymm2, %ymm1, %ymm0
 165 ; CHECK-NEXT:    retq
 166 entry:
 167   %0 = bitcast <16 x half> %a to <8 x float>
 168   %1 = bitcast <16 x half> %b to <8 x float>
 169   %2 = tail call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i8 -1)
 170   %3 = bitcast <8 x float> %2 to <16 x half>
 171   %add.i = fadd <16 x half> %3, %acc
 172   ret <16 x half> %add.i
 173 }
 174
 175 define dso_local <16 x half> @test16(<16 x half> %acc, <16 x half> %a, <16 x half> %b) {
 176 ; CHECK-LABEL: test16:
 177 ; CHECK:       # %bb.0: # %entry
 178 ; CHECK-NEXT:    vfmaddcph %ymm2, %ymm1, %ymm0
 179 ; CHECK-NEXT:    retq
 180 entry:
 181   %0 = bitcast <16 x half> %a to <8 x float>
 182   %1 = bitcast <16 x half> %b to <8 x float>
 183   %2 = tail call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i8 -1)
 184   %3 = bitcast <8 x float> %2 to <16 x half>
 185   %add.i = fadd <16 x half> %3, %acc
 186   ret <16 x half> %add.i
 187 }
 188
 189 define dso_local <8 x half> @test17(<8 x half> %acc, <8 x half> %a, <8 x half> %b) {
 190 ; CHECK-LABEL: test17:
 191 ; CHECK:       # %bb.0: # %entry
 192 ; CHECK-NEXT:    vfcmaddcph %xmm2, %xmm1, %xmm0
 193 ; CHECK-NEXT:    retq
 194 entry:
 195   %0 = bitcast <8 x half> %a to <4 x float>
 196   %1 = bitcast <8 x half> %b to <4 x float>
 197   %2 = tail call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i8 -1)
 198   %3 = bitcast <4 x float> %2 to <8 x half>
 199   %add.i = fadd <8 x half> %3, %acc
 200   ret <8 x half> %add.i
 201 }
 202
 203 define dso_local <8 x half> @test18(<8 x half> %acc, <8 x half> %a, <8 x half> %b) {
 204 ; CHECK-LABEL: test18:
 205 ; CHECK:       # %bb.0: # %entry
 206 ; CHECK-NEXT:    vfmaddcph %xmm2, %xmm1, %xmm0
 207 ; CHECK-NEXT:    retq
 208 entry:
 209   %0 = bitcast <8 x half> %a to <4 x float>
 210   %1 = bitcast <8 x half> %b to <4 x float>
 211   %2 = tail call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i8 -1)
 212   %3 = bitcast <4 x float> %2 to <8 x half>
 213   %add.i = fadd <8 x half> %3, %acc
 214   ret <8 x half> %add.i
 215 }
 216
 217 declare <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32 immarg)
 218 declare <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32 immarg)
 219 declare <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
 220 declare <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
 221 declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
 222 declare <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)