From a5693c1203c3a26443af13182a8082c2e9152f6c Mon Sep 17 00:00:00 2001
From: Mark Wielaard
Date: Sat, 13 Apr 2024 14:33:19 +0200
Subject: [PATCH] amd64: Implement VFMADD213 for Iop_MAddF32 and Iop_MAddF64
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Speed up F32 and F64 FMA on amd64. Add priv/host_amd64_maddf.c
implementing h_amd64_calc_MAddF32_fma4 and h_amd64_calc_MAddF64_fma4
to be used instead of the generic variants h_generic_calc_MAddF32 and
h_generic_calc_MAddF64 when the host has VEX_HWCAPS_AMD64_FMA4. Add
fma3 and fma4 detection to m_machine.c (machine_get_hwcaps).

This patch also fixes the memcheck/tests/vcpu_fnfns and
none/tests/amd64/fma testcases when run on an x86-64-v3 system.

Patch contributed by Grazvydas Ignotas and Bruno Lathuilière

https://bugs.kde.org/show_bug.cgi?id=481127
https://bugs.kde.org/show_bug.cgi?id=463463
https://bugs.kde.org/show_bug.cgi?id=463458
---
 Makefile.vex.am             |   2 +
 NEWS                        |   3 ++
 VEX/priv/host_amd64_defs.c  | 108 ++++++++++++++++++++++++++++++++++++++++++++
 VEX/priv/host_amd64_defs.h  |  18 ++++++++
 VEX/priv/host_amd64_isel.c  |  60 ++++++++++++++++++++----
 VEX/priv/host_amd64_maddf.c |  35 ++++++++++++++
 VEX/priv/host_amd64_maddf.h |  32 +++++++++++++
 VEX/priv/main_main.c        |   2 +
 VEX/pub/libvex.h            |   2 +
 coregrind/m_machine.c       |  20 +++++---
 10 files changed, 266 insertions(+), 16 deletions(-)
 create mode 100644 VEX/priv/host_amd64_maddf.c
 create mode 100644 VEX/priv/host_amd64_maddf.h

diff --git a/Makefile.vex.am b/Makefile.vex.am
index 98d848359..c1244a69d 100644
--- a/Makefile.vex.am
+++ b/Makefile.vex.am
@@ -54,6 +54,7 @@ noinst_HEADERS = \
 	priv/host_generic_simd128.h \
 	priv/host_generic_simd256.h \
 	priv/host_generic_maddf.h \
+	priv/host_amd64_maddf.h \
 	priv/host_x86_defs.h \
 	priv/host_amd64_defs.h \
 	priv/host_ppc_defs.h \
@@ -156,6 +157,7 @@ LIBVEX_SOURCES_COMMON = \
 	priv/host_generic_simd128.c \
 	priv/host_generic_simd256.c \
 	priv/host_generic_maddf.c \
+	priv/host_amd64_maddf.c \
 	priv/host_generic_reg_alloc2.c \
 	priv/host_generic_reg_alloc3.c \
 	priv/host_x86_defs.c \
diff --git a/NEWS b/NEWS
index 1c3f9dce5..ea444f6ff 100644
--- a/NEWS
+++ b/NEWS
@@ -37,6 +37,8 @@ are not entered into bugzilla tend to get forgotten about or ignored.
 437790  valgrind reports "Conditional jump or move depends on uninitialised
         value" in memchr of macOS 10.12-10.15
 460616  disInstr(arm64): unhandled instruction 0x4E819402 (dotprod/ASIMDDP)
+463458  memcheck/tests/vcpu_fnfns fails when glibc is built for x86-64-v3
+463463  none/tests/amd64/fma fails when executed on a x86-64-v3 system
 466762  Add redirs for C23 free_sized() and free_aligned_sized()
 466884  Missing writev uninit padding suppression for _XSend
 471036  disInstr_AMD64: disInstr miscalculated next %rip on RORX imm8, m32/64, r32/6
@@ -70,6 +72,7 @@ are not entered into bugzilla tend to get forgotten about or ignored.
         Assertion '!sr_isError(sr)' failed."
 480488  Add support for FreeBSD 13.3
 480706  Unhandled syscall 325 (mlock2)
+481127  amd64: Implement VFMADD213 for Iop_MAddF32
 481131  [PATCH] x86 regtest: fix clobber lists in generated asm statements
 483786  Incorrect parameter indexing in FreeBSD clock_nanosleep syscall wrapper
 484002  Add suppression for invalid read in glibc's __wcpncpy_avx2() via wcsxfrm()
diff --git a/VEX/priv/host_amd64_defs.c b/VEX/priv/host_amd64_defs.c
index 69afab739..253ed6515 100644
--- a/VEX/priv/host_amd64_defs.c
+++ b/VEX/priv/host_amd64_defs.c
@@ -590,6 +590,7 @@ const HChar* showAMD64SseOp ( AMD64SseOp op ) {
       case Asse_PMADDUBSW: return "pmaddubsw";
       case Asse_F32toF16:  return "vcvtps2ph(rm_field=$0x4).";
       case Asse_F16toF32:  return "vcvtph2ps.";
+      case Asse_VFMADD213: return "vfmadd213";
       default: vpanic("showAMD64SseOp");
    }
 }
@@ -1056,6 +1057,28 @@ AMD64Instr* AMD64Instr_SseMOVQ ( HReg gpr, HReg xmm, Bool toXMM ) {
 //uu    i->Ain.AvxReRg.dst = rg;
 //uu    return i;
 //uu }
+AMD64Instr* AMD64Instr_Avx32FLo ( AMD64SseOp op, HReg src1, HReg src2, HReg dst ) {
+   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
+   i->tag = Ain_Avx32FLo;
+   i->Ain.Avx32FLo.op = op;
+   i->Ain.Avx32FLo.src1 = src1;
+   i->Ain.Avx32FLo.src2 = src2;
+   i->Ain.Avx32FLo.dst = dst;
+   vassert(op != Asse_MOV);
+   return i;
+}
+
+AMD64Instr* AMD64Instr_Avx64FLo ( AMD64SseOp op, HReg src1, HReg src2, HReg dst ) {
+   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
+   i->tag = Ain_Avx64FLo;
+   i->Ain.Avx64FLo.op = op;
+   i->Ain.Avx64FLo.src1 = src1;
+   i->Ain.Avx64FLo.src2 = src2;
+   i->Ain.Avx64FLo.dst = dst;
+   vassert(op != Asse_MOV);
+   return i;
+}
+
 AMD64Instr* AMD64Instr_EvCheck ( AMD64AMode* amCounter,
                                  AMD64AMode* amFailAddr ) {
    AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
@@ -1434,6 +1457,22 @@ void ppAMD64Instr ( const AMD64Instr* i, Bool mode64 )
 //uu          vex_printf(",");
 //uu          ppHRegAMD64(i->Ain.AvxReRg.dst);
 //uu          return;
+      case Ain_Avx32FLo:
+         vex_printf("%sss ", showAMD64SseOp(i->Ain.Avx32FLo.op));
+         ppHRegAMD64(i->Ain.Avx32FLo.src2);
+         vex_printf(",");
+         ppHRegAMD64(i->Ain.Avx32FLo.src1);
+         vex_printf(",");
+         ppHRegAMD64(i->Ain.Avx32FLo.dst);
+         return;
+      case Ain_Avx64FLo:
+         vex_printf("%ssd ", showAMD64SseOp(i->Ain.Avx64FLo.op));
+         ppHRegAMD64(i->Ain.Avx64FLo.src2);
+         vex_printf(",");
+         ppHRegAMD64(i->Ain.Avx64FLo.src1);
+         vex_printf(",");
+         ppHRegAMD64(i->Ain.Avx64FLo.dst);
+         return;
       case Ain_EvCheck:
          vex_printf("(evCheck) decl ");
          ppAMD64AMode(i->Ain.EvCheck.amCounter);
@@ -1790,6 +1829,18 @@ void getRegUsage_AMD64Instr ( HRegUsage* u, const AMD64Instr* i, Bool mode64 )
 //uu          }
 //uu       }
 //uu       return;
+      case Ain_Avx32FLo:
+         vassert(i->Ain.Avx32FLo.op != Asse_MOV);
+         addHRegUse(u, HRmRead, i->Ain.Avx32FLo.src1);
+         addHRegUse(u, HRmRead, i->Ain.Avx32FLo.src2);
+         addHRegUse(u, HRmModify, i->Ain.Avx32FLo.dst);
+         return;
+      case Ain_Avx64FLo:
+         vassert(i->Ain.Avx64FLo.op != Asse_MOV);
+         addHRegUse(u, HRmRead, i->Ain.Avx64FLo.src1);
+         addHRegUse(u, HRmRead, i->Ain.Avx64FLo.src2);
+         addHRegUse(u, HRmModify, i->Ain.Avx64FLo.dst);
+         return;
       case Ain_EvCheck:
          /* We expect both amodes only to mention %rbp, so this is in
            fact pointless, since %rbp isn't allocatable, but anyway.. */
@@ -1999,6 +2050,16 @@ void mapRegs_AMD64Instr ( HRegRemap* m, AMD64Instr* i, Bool mode64 )
 //uu       mapReg(m, &i->Ain.AvxReRg.src);
 //uu       mapReg(m, &i->Ain.AvxReRg.dst);
 //uu       return;
+      case Ain_Avx32FLo:
+         mapReg(m, &i->Ain.Avx32FLo.src1);
+         mapReg(m, &i->Ain.Avx32FLo.src2);
+         mapReg(m, &i->Ain.Avx32FLo.dst);
+         return;
+      case Ain_Avx64FLo:
+         mapReg(m, &i->Ain.Avx64FLo.src1);
+         mapReg(m, &i->Ain.Avx64FLo.src2);
+         mapReg(m, &i->Ain.Avx64FLo.dst);
+         return;
       case Ain_EvCheck:
          /* We expect both amodes only to mention %rbp, so this is in
            fact pointless, since %rbp isn't allocatable, but anyway.. */
@@ -4061,6 +4122,53 @@ Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc,
 //uu       goto done;
 //uu    }
 
+   case Ain_Avx32FLo: {
+      UInt d = vregEnc3210(i->Ain.Avx32FLo.dst);
+      UInt v = vregEnc3210(i->Ain.Avx32FLo.src1);
+      UInt s = vregEnc3210(i->Ain.Avx32FLo.src2);
+      UInt m = 2, pp = 1;
+      UInt opcode;
+      switch (i->Ain.Avx32FLo.op) {
+         case Asse_VFMADD213:
+            // VFMADD213SS %xmmS2, %xmmS1, %xmmD (xmm regs range 0 .. 15)
+            opcode = 0xa9;
+            break;
+         default:
+            goto bad;
+      }
+      // 0xC4 : ~d3 1 ~s3 o4 o3 o2 o1 o0 : 0 ~v3 ~v2 ~v1 ~v0 0 p1 p0 : opcode_byte
+      //      : 1 1 d2 d1 d0 s2 s1 s0
+      *p++ = 0xC4; // 3-byte VEX
+      *p++ = ((((~d)>>3)&1)<<7) | (1<<6) | ((((~s)>>3)&1)<<5) | m;
+      *p++ = ((~v&0x0f) << 3) | pp;
+      *p++ = opcode;
+      *p++ = (1<<7) | (1<<6) | ((d&7) << 3) | ((s&7) << 0);
+      goto done;
+   }
+   case Ain_Avx64FLo: {
+      UInt d = vregEnc3210(i->Ain.Avx64FLo.dst);
+      UInt v = vregEnc3210(i->Ain.Avx64FLo.src1);
+      UInt s = vregEnc3210(i->Ain.Avx64FLo.src2);
+      UInt m = 2, pp = 1;
+      UInt opcode;
+      switch (i->Ain.Avx64FLo.op) {
+         case Asse_VFMADD213:
+            // VFMADD213SD %xmmS2, %xmmS1, %xmmD (xmm regs range 0 .. 15)
+            opcode = 0xa9;
+            break;
+         default:
+            goto bad;
+      }
+      // 0xC4 : ~d3 1 ~s3 o4 o3 o2 o1 o0 : 1 ~v3 ~v2 ~v1 ~v0 0 p1 p0 : opcode_byte
+      //      : 1 1 d2 d1 d0 s2 s1 s0
+      *p++ = 0xC4; // 3-byte VEX
+      *p++ = ((((~d)>>3)&1)<<7) | (1<<6) | ((((~s)>>3)&1)<<5) | m;
+      *p++ = (1<<7)|((~v&0x0f) << 3) | pp;
+      *p++ = opcode;
+      *p++ = (1<<7) | (1<<6) | ((d&7) << 3) | ((s&7) << 0);
+      goto done;
+   }
+
    case Ain_EvCheck: {
       /* We generate:
             (3 bytes)  decl 8(%rbp)    8 == offsetof(host_EvC_COUNTER)
diff --git a/VEX/priv/host_amd64_defs.h b/VEX/priv/host_amd64_defs.h
index e2ed2613b..eae878e31 100644
--- a/VEX/priv/host_amd64_defs.h
+++ b/VEX/priv/host_amd64_defs.h
@@ -347,6 +347,8 @@ typedef
       // Only for F16C capable hosts:
       Asse_F32toF16, // F32 to F16 conversion, aka vcvtps2ph
       Asse_F16toF32, // F16 to F32 conversion, aka vcvtph2ps
+      // Only for FMA (FMA3) capable hosts:
+      Asse_VFMADD213, // Fused Multiply-Add, aka vfmadd213ss
    }
    AMD64SseOp;
 
@@ -412,6 +414,8 @@ typedef
 //uu       Ain_AvxLdSt,     /* AVX load/store 256 bits,
 //uu                          no alignment constraints */
 //uu       Ain_AvxReRg,     /* AVX binary general reg-reg, Re, Rg */
+      Ain_Avx32FLo,    /* AVX binary 3 operand, 32F in lowest lane only */
+      Ain_Avx64FLo,    /* AVX binary 3 operand, 64F in lowest lane only */
       Ain_EvCheck,     /* Event check */
       Ain_ProfInc      /* 64-bit profile counter increment */
    }
@@ -730,6 +734,18 @@ typedef
 //uu          HReg     dst;
 //uu       } AvxReRg;
          struct {
+            AMD64SseOp op;
+            HReg src1;
+            HReg src2;
+            HReg dst;
+         } Avx32FLo;
+         struct {
+            AMD64SseOp op;
+            HReg src1;
+            HReg src2;
+            HReg dst;
+         } Avx64FLo;
+         struct {
            AMD64AMode* amCounter;
            AMD64AMode* amFailAddr;
          } EvCheck;
@@ -803,6 +819,8 @@ extern AMD64Instr* AMD64Instr_SseShiftN ( AMD64SseOp,
 extern AMD64Instr* AMD64Instr_SseMOVQ ( HReg gpr, HReg xmm, Bool toXMM );
 //uu extern AMD64Instr* AMD64Instr_AvxLdSt ( Bool isLoad, HReg, AMD64AMode* );
 //uu extern AMD64Instr* AMD64Instr_AvxReRg ( AMD64SseOp, HReg, HReg );
+extern AMD64Instr* AMD64Instr_Avx32FLo ( AMD64SseOp, HReg, HReg, HReg );
+extern AMD64Instr* AMD64Instr_Avx64FLo ( AMD64SseOp, HReg, HReg, HReg );
 extern AMD64Instr* AMD64Instr_EvCheck ( AMD64AMode* amCounter,
                                         AMD64AMode* amFailAddr );
 extern AMD64Instr* AMD64Instr_ProfInc ( void );
diff --git a/VEX/priv/host_amd64_isel.c b/VEX/priv/host_amd64_isel.c
index e15e1e60f..21d20c77f 100644
--- a/VEX/priv/host_amd64_isel.c
+++ b/VEX/priv/host_amd64_isel.c
@@ -42,6 +42,7 @@
 #include "host_generic_simd64.h"
 #include "host_generic_simd128.h"
 #include "host_generic_simd256.h"
+#include "host_amd64_maddf.h"
 #include "host_generic_maddf.h"
 #include "host_amd64_defs.h"
 
@@ -2832,6 +2833,13 @@ static HReg iselFltExpr_wrk ( ISelEnv* env, const IRExpr* e )
          HReg argX = iselFltExpr(env, qop->arg2);
          HReg argY = iselFltExpr(env, qop->arg3);
          HReg argZ = iselFltExpr(env, qop->arg4);
+         if (env->hwcaps & VEX_HWCAPS_AMD64_FMA3) {
+            vassert(dst.u32 != argY.u32 && dst.u32 != argZ.u32);
+            if (dst.u32 != argX.u32)
+               addInstr(env, AMD64Instr_SseReRg(Asse_MOV, argX, dst));
+            addInstr(env, AMD64Instr_Avx32FLo(Asse_VFMADD213, argY, argZ, dst));
+            return dst;
+         }
          /* XXXROUNDINGFIXME */
          /* set roundingmode here */
          /* subq $16, %rsp -- make a space*/
@@ -2861,10 +2869,22 @@ static HReg iselFltExpr_wrk ( ISelEnv* env, const IRExpr* e )
                                           AMD64AMode_IR(0, hregAMD64_RDX())));
          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argZ,
                                           AMD64AMode_IR(0, hregAMD64_RCX())));
-         /* call the helper */
-         addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
-                                        (ULong)(HWord)h_generic_calc_MAddF32,
-                                        4, mk_RetLoc_simple(RLPri_None) ));
+
+         /* Call the helper, preferring the fma4 variant and falling back to
+            the generic one.  The fma3 case was handled above without a helper. */
+#if defined(VGA_amd64)
+         if (env->hwcaps & VEX_HWCAPS_AMD64_FMA4) {
+            addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
+                                           (ULong)(HWord)h_amd64_calc_MAddF32_fma4,
+                                           4, mk_RetLoc_simple(RLPri_None) ));
+         } else
+#endif
+         {
+            addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
+                                           (ULong)(HWord)h_generic_calc_MAddF32,
+                                           4, mk_RetLoc_simple(RLPri_None) ));
+         }
+
          /* fetch the result from memory, using %r_argp, which the
            register allocator will keep alive across the call. */
         addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 4, dst,
@@ -3024,6 +3044,14 @@ static HReg iselDblExpr_wrk ( ISelEnv* env, const IRExpr* e )
          HReg argX = iselDblExpr(env, qop->arg2);
          HReg argY = iselDblExpr(env, qop->arg3);
          HReg argZ = iselDblExpr(env, qop->arg4);
+         if (env->hwcaps & VEX_HWCAPS_AMD64_FMA3) {
+            vassert(dst.u32 != argY.u32 && dst.u32 != argZ.u32);
+            if (dst.u32 != argX.u32)
+               addInstr(env, AMD64Instr_SseReRg(Asse_MOV, argX, dst));
+            addInstr(env, AMD64Instr_Avx64FLo(Asse_VFMADD213, argY, argZ, dst));
+            return dst;
+         }
+
          /* XXXROUNDINGFIXME */
         /* set roundingmode here */
         /* subq $32, %rsp -- make a space*/
@@ -3053,10 +3081,22 @@ static HReg iselDblExpr_wrk ( ISelEnv* env, const IRExpr* e )
                                           AMD64AMode_IR(0, hregAMD64_RDX())));
          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argZ,
                                           AMD64AMode_IR(0, hregAMD64_RCX())));
-         /* call the helper */
-         addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
-                                        (ULong)(HWord)h_generic_calc_MAddF64,
-                                        4, mk_RetLoc_simple(RLPri_None) ));
+
+         /* Call the helper, preferring the fma4 variant and falling back to
+            the generic one.  The fma3 case was handled above without a helper. */
+#if defined(VGA_amd64)
+         if (env->hwcaps & VEX_HWCAPS_AMD64_FMA4) {
+            addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
+                                           (ULong)(HWord)h_amd64_calc_MAddF64_fma4,
+                                           4, mk_RetLoc_simple(RLPri_None) ));
+         } else
+#endif
+         {
+            addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
+                                           (ULong)(HWord)h_generic_calc_MAddF64,
+                                           4, mk_RetLoc_simple(RLPri_None) ));
+         }
+
          /* fetch the result from memory, using %r_argp, which the
            register allocator will keep alive across the call. */
         addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 8, dst,
@@ -5372,7 +5412,9 @@ HInstrArray* iselSB_AMD64 ( const IRSB* bb,
                        | VEX_HWCAPS_AMD64_AVX2
                        | VEX_HWCAPS_AMD64_F16C
                        | VEX_HWCAPS_AMD64_RDRAND
-                       | VEX_HWCAPS_AMD64_RDSEED)));
+                       | VEX_HWCAPS_AMD64_RDSEED
+                       | VEX_HWCAPS_AMD64_FMA3
+                       | VEX_HWCAPS_AMD64_FMA4)));
 
    /* Check that the host's endianness is as expected. */
    vassert(archinfo_host->endness == VexEndnessLE);
diff --git a/VEX/priv/host_amd64_maddf.c b/VEX/priv/host_amd64_maddf.c
new file mode 100644
index 000000000..579abb438
--- /dev/null
+++ b/VEX/priv/host_amd64_maddf.c
@@ -0,0 +1,35 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin host_amd64_maddf.c ---*/
+/*---------------------------------------------------------------*/
+
+/*
+   Compute x * y + z as ternary operation with intrinsics.
+*/
+
+
+#include "libvex_basictypes.h"
+#include "host_amd64_maddf.h"
+
+#if defined(VGA_amd64)
+void VEX_REGPARM(3)
+     h_amd64_calc_MAddF32_fma4 ( /*OUT*/Float* res,
+                                 Float* argX, Float* argY, Float* argZ )
+{
+   __asm__ ("vfmaddss %3,%2,%1,%0;" :
+            "=x"(*res): "x"(*argX),"x"(*argY), "x"(*argZ));
+   return ;
+}
+
+void VEX_REGPARM(3)
+     h_amd64_calc_MAddF64_fma4 ( /*OUT*/Double* res,
+                                 Double* argX, Double* argY, Double* argZ )
+{
+   __asm__ ("vfmaddsd %3,%2,%1,%0;" :
+            "=x"(*res): "x"(*argX),"x"(*argY), "x"(*argZ));
+   return;
+}
+#endif
+/*---------------------------------------------------------------*/
+/*--- end host_amd64_maddf.c --*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/host_amd64_maddf.h b/VEX/priv/host_amd64_maddf.h
new file mode 100644
index 000000000..b592a44e1
--- /dev/null
+++ b/VEX/priv/host_amd64_maddf.h
@@ -0,0 +1,32 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin host_amd64_maddf.h ---*/
+/*---------------------------------------------------------------*/
+
+/*
+   Compute x * y + z as ternary operation with intrinsics
+*/
+
+/* Generic helper functions for doing FMA, i.e. compute x * y + z
+   as ternary operation.
+   These are purely back-end entities and cannot be seen/referenced
+   from IR. */
+
+#ifndef __VEX_HOST_AMD64_MADDF_H
+#define __VEX_HOST_AMD64_MADDF_H
+
+#include "libvex_basictypes.h"
+
+#if defined(VGA_amd64)
+extern VEX_REGPARM(3)
+       void h_amd64_calc_MAddF32_fma4 ( /*OUT*/Float*, Float*, Float*, Float* );
+
+extern VEX_REGPARM(3)
+       void h_amd64_calc_MAddF64_fma4 ( /*OUT*/Double*, Double*, Double*,
+                                        Double* );
+#endif
+#endif /* ndef __VEX_HOST_AMD64_MADDF_H */
+
+/*---------------------------------------------------------------*/
+/*--- end host_amd64_maddf.h --*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/main_main.c b/VEX/priv/main_main.c
index 482047c7a..eda2fe6ee 100644
--- a/VEX/priv/main_main.c
+++ b/VEX/priv/main_main.c
@@ -1650,6 +1650,8 @@ static const HChar* show_hwcaps_amd64 ( UInt hwcaps )
      { VEX_HWCAPS_AMD64_F16C, "f16c" },
      { VEX_HWCAPS_AMD64_RDRAND, "rdrand" },
      { VEX_HWCAPS_AMD64_RDSEED, "rdseed" },
+     { VEX_HWCAPS_AMD64_FMA3, "fma" }, /* fma to keep the same naming as /proc/cpuinfo */
+     { VEX_HWCAPS_AMD64_FMA4, "fma4" },
   };
   /* Allocate a large enough buffer */
   static HChar buf[sizeof prefix +
diff --git a/VEX/pub/libvex.h b/VEX/pub/libvex.h
index 42c013c1e..15e2d39de 100644
--- a/VEX/pub/libvex.h
+++ b/VEX/pub/libvex.h
@@ -101,6 +101,8 @@ typedef
 #define VEX_HWCAPS_AMD64_RDRAND (1<<13) /* RDRAND instructions */
 #define VEX_HWCAPS_AMD64_F16C   (1<<14) /* F16C instructions */
 #define VEX_HWCAPS_AMD64_RDSEED (1<<15) /* RDSEED instructions */
+#define VEX_HWCAPS_AMD64_FMA3   (1<<16) /* FMA3 instructions */
+#define VEX_HWCAPS_AMD64_FMA4   (1<<17) /* FMA4 instructions */
 
 /* ppc32: baseline capability is integer only */
 #define VEX_HWCAPS_PPC32_F      (1<<8)  /* basic (non-optional) FP */
diff --git a/coregrind/m_machine.c b/coregrind/m_machine.c
index 079383651..81fb81064 100644
--- a/coregrind/m_machine.c
+++ b/coregrind/m_machine.c
@@ -984,6 +984,7 @@ Bool VG_(machine_get_hwcaps)( void )
 #elif defined(VGA_amd64)
    { Bool have_sse3, have_ssse3, have_cx8, have_cx16;
      Bool have_lzcnt, have_avx, have_bmi, have_avx2;
+     Bool have_fma3, have_fma4;
      Bool have_rdtscp, have_rdrand, have_f16c, have_rdseed;
      UInt eax, ebx, ecx, edx, max_basic, max_extended;
      ULong xgetbv_0 = 0;
@@ -992,7 +993,8 @@ Bool VG_(machine_get_hwcaps)( void )
 
      have_sse3 = have_ssse3 = have_cx8 = have_cx16
         = have_lzcnt = have_avx = have_bmi = have_avx2
-        = have_rdtscp = have_rdrand = have_f16c = have_rdseed = False;
+        = have_rdtscp = have_rdrand = have_f16c = have_rdseed
+        = have_fma3 = have_fma4 = False;
 
      eax = ebx = ecx = edx = max_basic = max_extended = 0;
 
@@ -1022,7 +1024,7 @@ Bool VG_(machine_get_hwcaps)( void )
      // we assume that SSE1 and SSE2 are available by default
      have_sse3 = (ecx & (1<<0)) != 0;  /* True => have sse3 insns */
      have_ssse3 = (ecx & (1<<9)) != 0; /* True => have Sup SSE3 insns */
-     // fma is ecx:12
+     have_fma3 = (ecx & (1<<12))!= 0;  /* True => have fma3 insns */
      // sse41 is ecx:19
      // sse42 is ecx:20
      // xsave is ecx:26
@@ -1032,7 +1034,7 @@ Bool VG_(machine_get_hwcaps)( void )
      have_rdrand = (ecx & (1<<30)) != 0; /* True => have RDRAND insns */
 
      have_avx = False;
-     /* have_fma = False; */
+
      if ( (ecx & ((1<<28)|(1<<27)|(1<<26))) == ((1<<28)|(1<<27)|(1<<26)) ) {
         /* Processor supports AVX instructions and XGETBV is enabled
            by OS and AVX instructions are enabled by the OS. */
@@ -1059,9 +1061,6 @@ Bool VG_(machine_get_hwcaps)( void )
            if (ebx2 == 576 && eax2 == 256) {
               have_avx = True;
            }
-           /* have_fma = (ecx & (1<<12)) != 0; */
-           /* have_fma: Probably correct, but gcc complains due to
-              unusedness. */
         }
      }
 
@@ -1089,6 +1088,11 @@ Bool VG_(machine_get_hwcaps)( void )
         have_rdtscp = (edx & (1<<27)) != 0; /* True => have RDTSVCP */
      }
 
+     if (max_extended >= 0x80000001) {
+        VG_(cpuid)(0x80000001, 0, &eax, &ebx, &ecx, &edx);
+        have_fma4= (ecx & (1<<16)) != 0; /* True => have fma4 */
+     }
+
      /* Check for BMI1 and AVX2. If we have AVX1 (plus OS support). */
      have_bmi = False;
      have_avx2 = False;
@@ -1120,7 +1124,9 @@ Bool VG_(machine_get_hwcaps)( void )
                | (have_rdtscp ? VEX_HWCAPS_AMD64_RDTSCP : 0)
                | (have_f16c ? VEX_HWCAPS_AMD64_F16C : 0)
                | (have_rdrand ? VEX_HWCAPS_AMD64_RDRAND : 0)
-               | (have_rdseed ? VEX_HWCAPS_AMD64_RDSEED : 0);
+               | (have_rdseed ? VEX_HWCAPS_AMD64_RDSEED : 0)
+               | (have_fma3 ? VEX_HWCAPS_AMD64_FMA3 : 0)
+               | (have_fma4 ? VEX_HWCAPS_AMD64_FMA4 : 0);
 
      VG_(machine_get_cache_info)(&vai);
 
-- 
2.11.4.GIT
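
The FMA3 path added to iselFltExpr_wrk/iselDblExpr_wrk first copies argX into
dst and then emits vfmadd213ss/vfmadd213sd with src1 = argY and src2 = argZ,
so the instruction computes dst = src1 * dst + src2, i.e. argX * argY + argZ.
The user-space program below is only a sketch of that operand convention (it
is not part of the patch, assumes gcc on an FMA-capable amd64 host, and the
madd_fma3 helper name is made up for the example); the FMA4 helper in
host_amd64_maddf.c computes the same value with the 4-operand vfmaddss form.

   #include <stdio.h>
   #include <math.h>

   /* Mirror the lowering: d starts as x, vfmadd213ss then does d = y*d + z. */
   static float madd_fma3 ( float x, float y, float z )
   {
      float d = x;                        /* dst := argX (the SseReRg move) */
      __asm__ ("vfmadd213ss %2, %1, %0"   /* dst = src1 * dst + src2        */
               : "+x"(d) : "x"(y), "x"(z));
      return d;
   }

   int main ( void )
   {
      float x = 1.25f, y = 3.0f, z = -0.5f;
      /* Both values should agree (3.25). */
      printf("fma3: %g  libm: %g\n", madd_fma3(x, y, z), fmaf(x, y, z));
      return 0;
   }

Built with something like "gcc -O2 fma3_demo.c -lm"; running it on a CPU
without FMA raises SIGILL, which is exactly the situation the
VEX_HWCAPS_AMD64_FMA3 check guards against.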
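
For the new emit_AMD64Instr cases, one way to sanity-check the 3-byte VEX
encoding is to replay the same bit-twiddling outside of VEX. For
vfmadd213ss %xmm2,%xmm1,%xmm0 (d = 0, v = 1, s = 2, map m = 2 for 0F38,
pp = 1 for the 66 prefix, W = 0) the bytes come out as C4 E2 71 A9 C2. The
snippet below is illustration only, not code from the patch:

   #include <stdio.h>

   int main ( void )
   {
      unsigned d = 0, v = 1, s = 2, m = 2, pp = 1, opcode = 0xa9;
      unsigned char b[5];
      b[0] = 0xC4;                                         /* 3-byte VEX escape   */
      b[1] = ((((~d)>>3)&1)<<7) | (1<<6) | ((((~s)>>3)&1)<<5) | m;
      b[2] = ((~v & 0x0f) << 3) | pp;                      /* W=0 for the SS form */
      b[3] = opcode;                                       /* VFMADD213 = 0xa9    */
      b[4] = (1<<7) | (1<<6) | ((d&7) << 3) | (s&7);       /* modrm: 11 reg rm    */
      for (int i = 0; i < 5; i++)
         printf("%02X ", b[i]);                            /* C4 E2 71 A9 C2      */
      printf("\n");
      return 0;
   }

The Ain_Avx64FLo case differs only in setting the VEX.W bit (the extra 1<<7 in
the second payload byte), which selects the 64-bit vfmadd213sd form.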
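
The hwcap probing added to VG_(machine_get_hwcaps) reads the standard CPUID
bits: FMA (FMA3) is ECX bit 12 of leaf 1, and FMA4 is ECX bit 16 of extended
leaf 0x80000001, which is also why the new capability is printed as "fma" to
match /proc/cpuinfo. A minimal standalone version of the same checks, using
GCC's <cpuid.h> (again just a sketch, not part of the patch):

   #include <stdio.h>
   #include <cpuid.h>

   int main ( void )
   {
      unsigned eax, ebx, ecx, edx;
      int have_fma3 = 0, have_fma4 = 0;

      /* Leaf 1, ECX bit 12: FMA3 -- the bit m_machine.c now tests. */
      if (__get_cpuid(1, &eax, &ebx, &ecx, &edx))
         have_fma3 = (ecx & (1u << 12)) != 0;

      /* Extended leaf 0x80000001, ECX bit 16: FMA4 (an AMD extension). */
      if (__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx))
         have_fma4 = (ecx & (1u << 16)) != 0;

      printf("fma: %d  fma4: %d\n", have_fma3, have_fma4);
      return 0;
   }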