llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16 --fp-contract=fast --enable-unsafe-fp-math | FileCheck %s
   3
   4 define dso_local <32 x half> @test1(<32 x half> %acc.coerce, <32 x half> %lhs.coerce.conj, <32 x half> %rhs.coerce) local_unnamed_addr #0 {
     ; 512-bit case: the xor'ed %lhs.coerce.conj is the first multiplicand of
     ; vfmul.cph.512 and the product is the first operand of the fast fadd.
     ; The assertions below expect a single vfcmaddcph and no standalone xor.
     ; NOTE(review): the parameter name suggests the xor encodes a complex
     ; conjugate; that intent is not stated anywhere in this file.
   5 ; CHECK-LABEL: test1:
   6 ; CHECK:       # %bb.0: # %entry
   7 ; CHECK-NEXT:    vfcmaddcph %zmm1, %zmm2, %zmm0
   8 ; CHECK-NEXT:    retq
   9 entry:
  10   %lhs.bits = bitcast <32 x half> %lhs.coerce.conj to <16 x i32>
       ; -2147483648 is 0x80000000, so only bit 31 of each i32 lane is flipped.
  11   %lhs.flipped = xor <16 x i32> %lhs.bits, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
  12   %lhs.cph = bitcast <16 x i32> %lhs.flipped to <16 x float>
  13   %rhs.cph = bitcast <32 x half> %rhs.coerce to <16 x float>
  14   %prod.cph = tail call fast <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %lhs.cph, <16 x float> %rhs.cph, <16 x float> zeroinitializer, i16 -1, i32 4) #2
  15   %prod = bitcast <16 x float> %prod.cph to <32 x half>
  16   %sum = fadd fast <32 x half> %prod, %acc.coerce
  17   ret <32 x half> %sum
  18 }
  19
  20 define dso_local <32 x half> @test2(<32 x half> %acc.coerce, <32 x half> %lhs.coerce.conj, <32 x half> %rhs.coerce) local_unnamed_addr #0 {
     ; 512-bit case with the multiplicands swapped: the xor'ed value is passed
     ; as the second operand of vfmul.cph.512 and %rhs.coerce as the first.
     ; The assertions below still expect a single vfcmaddcph.
  21 ; CHECK-LABEL: test2:
  22 ; CHECK:       # %bb.0: # %entry
  23 ; CHECK-NEXT:    vfcmaddcph %zmm1, %zmm2, %zmm0
  24 ; CHECK-NEXT:    retq
  25 entry:
  26   %lhs.bits = bitcast <32 x half> %lhs.coerce.conj to <16 x i32>
       ; -2147483648 is 0x80000000, so only bit 31 of each i32 lane is flipped.
  27   %lhs.flipped = xor <16 x i32> %lhs.bits, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
  28   %lhs.cph = bitcast <16 x i32> %lhs.flipped to <16 x float>
  29   %rhs.cph = bitcast <32 x half> %rhs.coerce to <16 x float>
       ; Operand order differs from the other 512-bit test: rhs first, xor'ed lhs second.
  30   %prod.cph = tail call fast <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %rhs.cph, <16 x float> %lhs.cph, <16 x float> zeroinitializer, i16 -1, i32 4) #2
  31   %prod = bitcast <16 x float> %prod.cph to <32 x half>
  32   %sum = fadd fast <32 x half> %prod, %acc.coerce
  33   ret <32 x half> %sum
  34 }
  35
  36 define dso_local <16 x half> @test3(<16 x half> %acc.coerce, <16 x half> %lhs.coerce.conj, <16 x half> %rhs.coerce) local_unnamed_addr #0 {
     ; 256-bit case: same xor / vfmul.cph / fast fadd chain on <16 x half>,
     ; using the .256 intrinsic (i8 mask operand, no trailing i32 operand).
     ; The assertions below expect a single vfcmaddcph on ymm registers.
  37 ; CHECK-LABEL: test3:
  38 ; CHECK:       # %bb.0: # %entry
  39 ; CHECK-NEXT:    vfcmaddcph %ymm1, %ymm2, %ymm0
  40 ; CHECK-NEXT:    retq
  41 entry:
  42   %lhs.bits = bitcast <16 x half> %lhs.coerce.conj to <8 x i32>
       ; -2147483648 is 0x80000000, so only bit 31 of each i32 lane is flipped.
  43   %lhs.flipped = xor <8 x i32> %lhs.bits, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
  44   %lhs.cph = bitcast <8 x i32> %lhs.flipped to <8 x float>
  45   %rhs.cph = bitcast <16 x half> %rhs.coerce to <8 x float>
  46   %prod.cph = tail call fast <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float> %lhs.cph, <8 x float> %rhs.cph, <8 x float> zeroinitializer, i8 -1) #2
  47   %prod = bitcast <8 x float> %prod.cph to <16 x half>
  48   %sum = fadd fast <16 x half> %prod, %acc.coerce
  49   ret <16 x half> %sum
  50 }
  51
  52 define dso_local <8 x half> @test4(<8 x half> %acc.coerce, <8 x half> %lhs.coerce.conj, <8 x half> %rhs.coerce) local_unnamed_addr #0 {
     ; 128-bit case: same xor / vfmul.cph / fast fadd chain on <8 x half>,
     ; using the .128 intrinsic. The xor constant is the second xor operand.
     ; The assertions below expect a single vfcmaddcph on xmm registers.
  53 ; CHECK-LABEL: test4:
  54 ; CHECK:       # %bb.0: # %entry
  55 ; CHECK-NEXT:    vfcmaddcph %xmm1, %xmm2, %xmm0
  56 ; CHECK-NEXT:    retq
  57 entry:
  58   %lhs.bits = bitcast <8 x half> %lhs.coerce.conj to <4 x i32>
       ; -2147483648 is 0x80000000, so only bit 31 of each i32 lane is flipped.
  59   %lhs.flipped = xor <4 x i32> %lhs.bits, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
  60   %lhs.cph = bitcast <4 x i32> %lhs.flipped to <4 x float>
  61   %rhs.cph = bitcast <8 x half> %rhs.coerce to <4 x float>
  62   %prod.cph = tail call fast <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %lhs.cph, <4 x float> %rhs.cph, <4 x float> zeroinitializer, i8 -1) #2
  63   %prod = bitcast <4 x float> %prod.cph to <8 x half>
  64   %sum = fadd fast <8 x half> %prod, %acc.coerce
  65   ret <8 x half> %sum
  66 }
  67
  68 define dso_local <8 x half> @test5(<8 x half> %acc.coerce, <8 x half> %lhs.coerce.conj, <8 x half> %rhs.coerce) local_unnamed_addr #0 {
     ; 128-bit case with the xor written constant-first (constant vector is the
     ; first xor operand, the bitcast input the second).
     ; The assertions below still expect a single vfcmaddcph.
  69 ; CHECK-LABEL: test5:
  70 ; CHECK:       # %bb.0: # %entry
  71 ; CHECK-NEXT:    vfcmaddcph %xmm1, %xmm2, %xmm0
  72 ; CHECK-NEXT:    retq
  73 entry:
  74   %lhs.bits = bitcast <8 x half> %lhs.coerce.conj to <4 x i32>
       ; -2147483648 is 0x80000000, so only bit 31 of each i32 lane is flipped.
  75   %lhs.flipped = xor <4 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, %lhs.bits
  76   %lhs.cph = bitcast <4 x i32> %lhs.flipped to <4 x float>
  77   %rhs.cph = bitcast <8 x half> %rhs.coerce to <4 x float>
  78   %prod.cph = tail call fast <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %lhs.cph, <4 x float> %rhs.cph, <4 x float> zeroinitializer, i8 -1) #2
  79   %prod = bitcast <4 x float> %prod.cph to <8 x half>
  80   %sum = fadd fast <8 x half> %prod, %acc.coerce
  81   ret <8 x half> %sum
  82 }
  83
  84 define dso_local <8 x half> @test6(<8 x half> %acc.coerce, <8 x half> %lhs.coerce.conj, <8 x half> %rhs.coerce) local_unnamed_addr #0 {
     ; Negative case: the xor constant is 1 (bit 0 of each i32 lane), not
     ; 0x80000000. The assertions below expect the xor to remain as a vxorps
     ; against a broadcast constant-pool value, followed by vfmaddcph rather
     ; than vfcmaddcph.
  85 ; CHECK-LABEL: test6:
  86 ; CHECK:       # %bb.0: # %entry
  87 ; CHECK-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
  88 ; CHECK-NEXT:    vfmaddcph %xmm2, %xmm1, %xmm0
  89 ; CHECK-NEXT:    retq
  90 entry:
  91   %lhs.bits = bitcast <8 x half> %lhs.coerce.conj to <4 x i32>
       ; Flips bit 0 of each lane; bit 31 is left untouched.
  92   %lhs.flipped = xor <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %lhs.bits
  93   %lhs.cph = bitcast <4 x i32> %lhs.flipped to <4 x float>
  94   %rhs.cph = bitcast <8 x half> %rhs.coerce to <4 x float>
  95   %prod.cph = tail call fast <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %lhs.cph, <4 x float> %rhs.cph, <4 x float> zeroinitializer, i8 -1) #2
  96   %prod = bitcast <4 x float> %prod.cph to <8 x half>
  97   %sum = fadd fast <8 x half> %prod, %acc.coerce
  98   ret <8 x half> %sum
  99 }
 100
 101 define dso_local <8 x half> @test7(<8 x half> %acc.coerce, <8 x half> %lhs.coerce.conj, <8 x half> %rhs.coerce) local_unnamed_addr #0 {
     ; 128-bit case with the fadd operands swapped: %acc.coerce is the first
     ; fadd operand and the product the second. The xor is constant-first.
     ; The assertions below still expect a single vfcmaddcph.
 102 ; CHECK-LABEL: test7:
 103 ; CHECK:       # %bb.0: # %entry
 104 ; CHECK-NEXT:    vfcmaddcph %xmm1, %xmm2, %xmm0
 105 ; CHECK-NEXT:    retq
 106 entry:
 107   %lhs.bits = bitcast <8 x half> %lhs.coerce.conj to <4 x i32>
       ; -2147483648 is 0x80000000, so only bit 31 of each i32 lane is flipped.
 108   %lhs.flipped = xor <4 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, %lhs.bits
 109   %lhs.cph = bitcast <4 x i32> %lhs.flipped to <4 x float>
 110   %rhs.cph = bitcast <8 x half> %rhs.coerce to <4 x float>
 111   %prod.cph = tail call fast <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %lhs.cph, <4 x float> %rhs.cph, <4 x float> zeroinitializer, i8 -1) #2
 112   %prod = bitcast <4 x float> %prod.cph to <8 x half>
       ; Accumulator on the left-hand side of the add.
 113   %sum = fadd fast <8 x half> %acc.coerce, %prod
 114   ret <8 x half> %sum
 115 }
 116
 117 define dso_local <8 x half> @test8(<8 x half> %acc.coerce, <4 x float> %lhs.coerce.conj, <8 x half> %rhs.coerce) local_unnamed_addr #0 {
     ; 128-bit case where %lhs.coerce.conj is passed as <4 x float> instead of
     ; <8 x half>, so the input bitcast is float -> i32 with no change in lane
     ; count. The assertions below still expect a single vfcmaddcph.
 118 ; CHECK-LABEL: test8:
 119 ; CHECK:       # %bb.0: # %entry
 120 ; CHECK-NEXT:    vfcmaddcph %xmm1, %xmm2, %xmm0
 121 ; CHECK-NEXT:    retq
 122 entry:
 123   %lhs.bits = bitcast <4 x float> %lhs.coerce.conj to <4 x i32>
       ; -2147483648 is 0x80000000, so only bit 31 of each i32 lane is flipped.
 124   %lhs.flipped = xor <4 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, %lhs.bits
 125   %lhs.cph = bitcast <4 x i32> %lhs.flipped to <4 x float>
 126   %rhs.cph = bitcast <8 x half> %rhs.coerce to <4 x float>
 127   %prod.cph = tail call fast <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %lhs.cph, <4 x float> %rhs.cph, <4 x float> zeroinitializer, i8 -1) #2
 128   %prod = bitcast <4 x float> %prod.cph to <8 x half>
 129   %sum = fadd fast <8 x half> %acc.coerce, %prod
 130   ret <8 x half> %sum
 131 }
 132
 133 define dso_local <32 x half> @test9(<32 x half> %acc.coerce, <8 x i64> %lhs.coerce.conj, <32 x half> %rhs.coerce) local_unnamed_addr #0 {
     ; 512-bit case where %lhs.coerce.conj is passed as <8 x i64> and the xor
     ; is done on 64-bit lanes, with no bitcast before it.
     ; The assertions below still expect a single vfcmaddcph.
 134 ; CHECK-LABEL: test9:
 135 ; CHECK:       # %bb.0: # %entry
 136 ; CHECK-NEXT:    vfcmaddcph %zmm1, %zmm2, %zmm0
 137 ; CHECK-NEXT:    retq
 138 entry:
       ; -9223372034707292160 is 0x8000000080000000: bits 31 and 63 of each i64
       ; lane, i.e. bit 31 of both 32-bit halves.
 139   %lhs.flipped = xor <8 x i64> %lhs.coerce.conj, <i64 -9223372034707292160, i64 -9223372034707292160, i64 -9223372034707292160, i64 -9223372034707292160, i64 -9223372034707292160, i64 -9223372034707292160, i64 -9223372034707292160, i64 -9223372034707292160>
 140   %lhs.cph = bitcast <8 x i64> %lhs.flipped to <16 x float>
 141   %rhs.cph = bitcast <32 x half> %rhs.coerce to <16 x float>
 142   %prod.cph = tail call fast <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %lhs.cph, <16 x float> %rhs.cph, <16 x float> zeroinitializer, i16 -1, i32 4) #2
 143   %prod = bitcast <16 x float> %prod.cph to <32 x half>
 144   %sum = fadd fast <32 x half> %prod, %acc.coerce
 145   ret <32 x half> %sum
 146 }
 147
 148 declare <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32 immarg) ; 512-bit form, called by test1, test2 and test9; the only width here with a trailing i32 immarg operand and an i16 fourth operand
 149 declare <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float>, <8 x float>, <8 x float>, i8) ; 256-bit form, called by test3; fourth operand is i8
 150 declare <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float>, <4 x float>, <4 x float>, i8) ; 128-bit form, called by test4 through test8; fourth operand is i8