From a5693c1203c3a26443af13182a8082c2e9152f6c Mon Sep 17 00:00:00 2001
From: Mark Wielaard
Date: Sat, 13 Apr 2024 14:33:19 +0200
Subject: [PATCH] amd64: Implement VFMADD213 for Iop_MAddF32 and Iop_MAddF64
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Speed up F32 and F64 FMA on amd64. Add priv/host_amd64_maddf.c
implementing h_amd64_calc_MAddF32_fma4 and h_amd64_calc_MAddF64_fma4
to be used instead of the generic variants h_generic_calc_MAddF32 and
h_generic_calc_MAddF64 when the host has VEX_HWCAPS_AMD64_FMA4. Add
fma3 and fma4 detection to m_machine.c (machine_get_hwcaps).

This patch also fixes the memcheck/tests/vcpu_fnfns and
none/tests/amd64/fma testcases when run on an x86-64-v3 system.

Patch contributed by Grazvydas Ignotas and Bruno Lathuilière

https://bugs.kde.org/show_bug.cgi?id=481127
https://bugs.kde.org/show_bug.cgi?id=463463
https://bugs.kde.org/show_bug.cgi?id=463458
---
 Makefile.vex.am             |   2 +
 NEWS                        |   3 ++
 VEX/priv/host_amd64_defs.c  | 108 ++++++++++++++++++++++++++++++++++++++++++++
 VEX/priv/host_amd64_defs.h  |  18 ++++++++
 VEX/priv/host_amd64_isel.c  |  60 ++++++++++++++++++++----
 VEX/priv/host_amd64_maddf.c |  35 ++++++++++++++
 VEX/priv/host_amd64_maddf.h |  32 +++++++++++++
 VEX/priv/main_main.c        |   2 +
 VEX/pub/libvex.h            |   2 +
 coregrind/m_machine.c       |  20 +++++---
 10 files changed, 266 insertions(+), 16 deletions(-)
 create mode 100644 VEX/priv/host_amd64_maddf.c
 create mode 100644 VEX/priv/host_amd64_maddf.h

diff --git a/Makefile.vex.am b/Makefile.vex.am
index 98d848359..c1244a69d 100644
--- a/Makefile.vex.am
+++ b/Makefile.vex.am
@@ -54,6 +54,7 @@ noinst_HEADERS = \
 	priv/host_generic_simd128.h \
 	priv/host_generic_simd256.h \
 	priv/host_generic_maddf.h \
+	priv/host_amd64_maddf.h \
 	priv/host_x86_defs.h \
 	priv/host_amd64_defs.h \
 	priv/host_ppc_defs.h \
@@ -156,6 +157,7 @@ LIBVEX_SOURCES_COMMON = \
 	priv/host_generic_simd128.c \
 	priv/host_generic_simd256.c \
 	priv/host_generic_maddf.c \
+	priv/host_amd64_maddf.c \
 	priv/host_generic_reg_alloc2.c \
 	priv/host_generic_reg_alloc3.c \
 	priv/host_x86_defs.c \
diff --git a/NEWS b/NEWS
index 1c3f9dce5..ea444f6ff 100644
--- a/NEWS
+++ b/NEWS
@@ -37,6 +37,8 @@ are not entered into bugzilla tend to get forgotten about or ignored.
 437790  valgrind reports "Conditional jump or move depends on uninitialised
         value" in memchr of macOS 10.12-10.15
 460616  disInstr(arm64): unhandled instruction 0x4E819402 (dotprod/ASIMDDP)
+463458  memcheck/tests/vcpu_fnfns fails when glibc is built for x86-64-v3
+463463  none/tests/amd64/fma fails when executed on a x86-64-v3 system
 466762  Add redirs for C23 free_sized() and free_aligned_sized()
 466884  Missing writev uninit padding suppression for _XSend
 471036  disInstr_AMD64: disInstr miscalculated next %rip on RORX imm8, m32/64, r32/6
@@ -70,6 +72,7 @@ are not entered into bugzilla tend to get forgotten about or ignored.
         Assertion '!sr_isError(sr)' failed."
 480488  Add support for FreeBSD 13.3
 480706  Unhandled syscall 325 (mlock2)
+481127  amd64: Implement VFMADD213 for Iop_MAddF32
 481131  [PATCH] x86 regtest: fix clobber lists in generated asm statements
 483786  Incorrect parameter indexing in FreeBSD clock_nanosleep syscall wrapper
 484002  Add suppression for invalid read in glibc's __wcpncpy_avx2() via wcsxfrm()
diff --git a/VEX/priv/host_amd64_defs.c b/VEX/priv/host_amd64_defs.c
index 69afab739..253ed6515 100644
--- a/VEX/priv/host_amd64_defs.c
+++ b/VEX/priv/host_amd64_defs.c
@@ -590,6 +590,7 @@ const HChar* showAMD64SseOp ( AMD64SseOp op ) {
       case Asse_PMADDUBSW: return "pmaddubsw";
       case Asse_F32toF16:  return "vcvtps2ph(rm_field=$0x4).";
       case Asse_F16toF32:  return "vcvtph2ps.";
+      case Asse_VFMADD213: return "vfmadd213";
       default: vpanic("showAMD64SseOp");
    }
 }
@@ -1056,6 +1057,28 @@ AMD64Instr* AMD64Instr_SseMOVQ ( HReg gpr, HReg xmm, Bool toXMM ) {
 //uu    i->Ain.AvxReRg.dst = rg;
 //uu    return i;
 //uu }
+AMD64Instr* AMD64Instr_Avx32FLo ( AMD64SseOp op, HReg src1, HReg src2, HReg dst ) {
+   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
+   i->tag = Ain_Avx32FLo;
+   i->Ain.Avx32FLo.op = op;
+   i->Ain.Avx32FLo.src1 = src1;
+   i->Ain.Avx32FLo.src2 = src2;
+   i->Ain.Avx32FLo.dst = dst;
+   vassert(op != Asse_MOV);
+   return i;
+}
+
+AMD64Instr* AMD64Instr_Avx64FLo ( AMD64SseOp op, HReg src1, HReg src2, HReg dst ) {
+   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
+   i->tag = Ain_Avx64FLo;
+   i->Ain.Avx64FLo.op = op;
+   i->Ain.Avx64FLo.src1 = src1;
+   i->Ain.Avx64FLo.src2 = src2;
+   i->Ain.Avx64FLo.dst = dst;
+   vassert(op != Asse_MOV);
+   return i;
+}
+
 AMD64Instr* AMD64Instr_EvCheck ( AMD64AMode* amCounter,
                                  AMD64AMode* amFailAddr ) {
    AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
@@ -1434,6 +1457,22 @@ void ppAMD64Instr ( const AMD64Instr* i, Bool mode64 )
 //uu          vex_printf(",");
 //uu          ppHRegAMD64(i->Ain.AvxReRg.dst);
 //uu          return;
+      case Ain_Avx32FLo:
+         vex_printf("%sss ", showAMD64SseOp(i->Ain.Avx32FLo.op));
+         ppHRegAMD64(i->Ain.Avx32FLo.src2);
+         vex_printf(",");
+         ppHRegAMD64(i->Ain.Avx32FLo.src1);
+         vex_printf(",");
+         ppHRegAMD64(i->Ain.Avx32FLo.dst);
+         return;
+      case Ain_Avx64FLo:
+         vex_printf("%ssd ", showAMD64SseOp(i->Ain.Avx64FLo.op));
+         ppHRegAMD64(i->Ain.Avx64FLo.src2);
+         vex_printf(",");
+         ppHRegAMD64(i->Ain.Avx64FLo.src1);
+         vex_printf(",");
+         ppHRegAMD64(i->Ain.Avx64FLo.dst);
+         return;
       case Ain_EvCheck:
          vex_printf("(evCheck) decl ");
          ppAMD64AMode(i->Ain.EvCheck.amCounter);
@@ -1790,6 +1829,18 @@ void getRegUsage_AMD64Instr ( HRegUsage* u, const AMD64Instr* i, Bool mode64 )
 //uu          }
 //uu       }
 //uu       return;
+      case Ain_Avx32FLo:
+         vassert(i->Ain.Avx32FLo.op != Asse_MOV);
+         addHRegUse(u, HRmRead, i->Ain.Avx32FLo.src1);
+         addHRegUse(u, HRmRead, i->Ain.Avx32FLo.src2);
+         addHRegUse(u, HRmModify, i->Ain.Avx32FLo.dst);
+         return;
+      case Ain_Avx64FLo:
+         vassert(i->Ain.Avx64FLo.op != Asse_MOV);
+         addHRegUse(u, HRmRead, i->Ain.Avx64FLo.src1);
+         addHRegUse(u, HRmRead, i->Ain.Avx64FLo.src2);
+         addHRegUse(u, HRmModify, i->Ain.Avx64FLo.dst);
+         return;
       case Ain_EvCheck:
          /* We expect both amodes only to mention %rbp, so this is in
            fact pointless, since %rbp isn't allocatable, but anyway.. */
@@ -1999,6 +2050,16 @@ void mapRegs_AMD64Instr ( HRegRemap* m, AMD64Instr* i, Bool mode64 )
 //uu       mapReg(m, &i->Ain.AvxReRg.src);
 //uu       mapReg(m, &i->Ain.AvxReRg.dst);
 //uu       return;
+      case Ain_Avx32FLo:
+         mapReg(m, &i->Ain.Avx32FLo.src1);
+         mapReg(m, &i->Ain.Avx32FLo.src2);
+         mapReg(m, &i->Ain.Avx32FLo.dst);
+         return;
+      case Ain_Avx64FLo:
+         mapReg(m, &i->Ain.Avx64FLo.src1);
+         mapReg(m, &i->Ain.Avx64FLo.src2);
+         mapReg(m, &i->Ain.Avx64FLo.dst);
+         return;
       case Ain_EvCheck:
          /* We expect both amodes only to mention %rbp, so this is in
            fact pointless, since %rbp isn't allocatable, but anyway.. */
@@ -4061,6 +4122,53 @@ Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc,
 //uu       goto done;
 //uu    }
 
+   case Ain_Avx32FLo: {
+      UInt d = vregEnc3210(i->Ain.Avx32FLo.dst);
+      UInt v = vregEnc3210(i->Ain.Avx32FLo.src1);
+      UInt s = vregEnc3210(i->Ain.Avx32FLo.src2);
+      UInt m = 2, pp = 1;
+      UInt opcode;
+      switch (i->Ain.Avx32FLo.op) {
+         case Asse_VFMADD213:
+            // VFMADD213SS %xmmS2, %xmmS1, %xmmD (xmm regs range 0 .. 15)
+            opcode = 0xa9;
+            break;
+         default:
+            goto bad;
+      }
+      // 0xC4 : ~d3 1 ~s3 o4 o3 o2 o1 o0 : 0 ~v3 ~v2 ~v1 ~v0 0 p1 p0 : opcode_byte
+      //      : 1 1 d2 d1 d0 s2 s1 s0
+      *p++ = 0xC4; // 3-byte VEX
+      *p++ = ((((~d)>>3)&1)<<7) | (1<<6) | ((((~s)>>3)&1)<<5) | m;
+      *p++ = ((~v&0x0f) << 3) | pp;
+      *p++ = opcode;
+      *p++ = (1<<7) | (1<<6) | ((d&7) << 3) | ((s&7) << 0);
+      goto done;
+   }
+   case Ain_Avx64FLo: {
+      UInt d = vregEnc3210(i->Ain.Avx64FLo.dst);
+      UInt v = vregEnc3210(i->Ain.Avx64FLo.src1);
+      UInt s = vregEnc3210(i->Ain.Avx64FLo.src2);
+      UInt m = 2, pp = 1;
+      UInt opcode;
+      switch (i->Ain.Avx64FLo.op) {
+         case Asse_VFMADD213:
+            // VFMADD213SD %xmmS2, %xmmS1, %xmmD (xmm regs range 0 .. 15)
+            opcode = 0xa9;
+            break;
+         default:
+            goto bad;
+      }
+      // 0xC4 : ~d3 1 ~s3 o4 o3 o2 o1 o0 : 1 ~v3 ~v2 ~v1 ~v0 0 p1 p0 : opcode_byte
+      //      : 1 1 d2 d1 d0 s2 s1 s0
+      *p++ = 0xC4; // 3-byte VEX
+      *p++ = ((((~d)>>3)&1)<<7) | (1<<6) | ((((~s)>>3)&1)<<5) | m;
+      *p++ = (1<<7)|((~v&0x0f) << 3) | pp;
+      *p++ = opcode;
+      *p++ = (1<<7) | (1<<6) | ((d&7) << 3) | ((s&7) << 0);
+      goto done;
+   }
+
    case Ain_EvCheck: {
       /* We generate:
             (3 bytes)  decl 8(%rbp)    8 == offsetof(host_EvC_COUNTER)
diff --git a/VEX/priv/host_amd64_defs.h b/VEX/priv/host_amd64_defs.h
index e2ed2613b..eae878e31 100644
--- a/VEX/priv/host_amd64_defs.h
+++ b/VEX/priv/host_amd64_defs.h
@@ -347,6 +347,8 @@ typedef
       // Only for F16C capable hosts:
       Asse_F32toF16, // F32 to F16 conversion, aka vcvtps2ph
       Asse_F16toF32, // F16 to F32 conversion, aka vcvtph2ps
+      // Only for FMA (FMA3) capable hosts:
+      Asse_VFMADD213, // Fused Multiply-Add, aka vfmadd213ss
    }
    AMD64SseOp;
 
@@ -412,6 +414,8 @@ typedef
 //uu       Ain_AvxLdSt,     /* AVX load/store 256 bits,
 //uu                          no alignment constraints */
 //uu       Ain_AvxReRg,     /* AVX binary general reg-reg, Re, Rg */
+      Ain_Avx32FLo,    /* AVX binary 3 operand, 32F in lowest lane only */
+      Ain_Avx64FLo,    /* AVX binary 3 operand, 64F in lowest lane only */
       Ain_EvCheck,     /* Event check */
       Ain_ProfInc      /* 64-bit profile counter increment */
    }
@@ -730,6 +734,18 @@ typedef
 //uu          HReg     dst;
 //uu       } AvxReRg;
          struct {
+            AMD64SseOp op;
+            HReg src1;
+            HReg src2;
+            HReg dst;
+         } Avx32FLo;
+         struct {
+            AMD64SseOp op;
+            HReg src1;
+            HReg src2;
+            HReg dst;
+         } Avx64FLo;
+         struct {
            AMD64AMode* amCounter;
            AMD64AMode* amFailAddr;
          } EvCheck;
@@ -803,6 +819,8 @@ extern AMD64Instr* AMD64Instr_SseShiftN ( AMD64SseOp,
 extern AMD64Instr* AMD64Instr_SseMOVQ ( HReg gpr, HReg xmm, Bool toXMM );
 //uu extern AMD64Instr* AMD64Instr_AvxLdSt ( Bool isLoad, HReg, AMD64AMode* );
 //uu extern AMD64Instr* AMD64Instr_AvxReRg ( AMD64SseOp, HReg, HReg );
+extern AMD64Instr* AMD64Instr_Avx32FLo ( AMD64SseOp, HReg, HReg, HReg );
+extern AMD64Instr* AMD64Instr_Avx64FLo ( AMD64SseOp, HReg, HReg, HReg );
 extern AMD64Instr* AMD64Instr_EvCheck ( AMD64AMode* amCounter,
                                         AMD64AMode* amFailAddr );
 extern AMD64Instr* AMD64Instr_ProfInc ( void );
diff --git a/VEX/priv/host_amd64_isel.c b/VEX/priv/host_amd64_isel.c
index e15e1e60f..21d20c77f 100644
--- a/VEX/priv/host_amd64_isel.c
+++ b/VEX/priv/host_amd64_isel.c
@@ -42,6 +42,7 @@
 #include "host_generic_simd64.h"
 #include "host_generic_simd128.h"
 #include "host_generic_simd256.h"
+#include "host_amd64_maddf.h"
 #include "host_generic_maddf.h"
 #include "host_amd64_defs.h"
 
@@ -2832,6 +2833,13 @@ static HReg iselFltExpr_wrk ( ISelEnv* env, const IRExpr* e )
          HReg argX = iselFltExpr(env, qop->arg2);
          HReg argY = iselFltExpr(env, qop->arg3);
          HReg argZ = iselFltExpr(env, qop->arg4);
+         if (env->hwcaps & VEX_HWCAPS_AMD64_FMA3) {
+            vassert(dst.u32 != argY.u32 && dst.u32 != argZ.u32);
+            if (dst.u32 != argX.u32)
+               addInstr(env, AMD64Instr_SseReRg(Asse_MOV, argX, dst));
+            addInstr(env, AMD64Instr_Avx32FLo(Asse_VFMADD213, argY, argZ, dst));
+            return dst;
+         }
          /* XXXROUNDINGFIXME */
          /* set roundingmode here */
          /* subq $16, %rsp -- make a space*/
@@ -2861,10 +2869,22 @@ static HReg iselFltExpr_wrk ( ISelEnv* env, const IRExpr* e )
                                           AMD64AMode_IR(0, hregAMD64_RDX())));
          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argZ,
                                           AMD64AMode_IR(0, hregAMD64_RCX())));
-         /* call the helper */
-         addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
-                                        (ULong)(HWord)h_generic_calc_MAddF32,
-                                        4, mk_RetLoc_simple(RLPri_None) ));
+
+         /* Call the helper, preferring the fma4 variant and falling back to
+            the generic one.  The fma3 case was handled above without a helper. */
+#if defined(VGA_amd64)
+         if (env->hwcaps & VEX_HWCAPS_AMD64_FMA4) {
+            addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
+                                           (ULong)(HWord)h_amd64_calc_MAddF32_fma4,
+                                           4, mk_RetLoc_simple(RLPri_None) ));
+         } else
+#endif
+         {
+            addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
+                                           (ULong)(HWord)h_generic_calc_MAddF32,
+                                           4, mk_RetLoc_simple(RLPri_None) ));
+         }
+
          /* fetch the result from memory, using %r_argp, which the
            register allocator will keep alive across the call. */
         addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 4, dst,
@@ -3024,6 +3044,14 @@ static HReg iselDblExpr_wrk ( ISelEnv* env, const IRExpr* e )
          HReg argX = iselDblExpr(env, qop->arg2);
          HReg argY = iselDblExpr(env, qop->arg3);
          HReg argZ = iselDblExpr(env, qop->arg4);
+         if (env->hwcaps & VEX_HWCAPS_AMD64_FMA3) {
+            vassert(dst.u32 != argY.u32 && dst.u32 != argZ.u32);
+            if (dst.u32 != argX.u32)
+               addInstr(env, AMD64Instr_SseReRg(Asse_MOV, argX, dst));
+            addInstr(env, AMD64Instr_Avx64FLo(Asse_VFMADD213, argY, argZ, dst));
+            return dst;
+         }
+
          /* XXXROUNDINGFIXME */
         /* set roundingmode here */
         /* subq $32, %rsp -- make a space*/
@@ -3053,10 +3081,22 @@ static HReg iselDblExpr_wrk ( ISelEnv* env, const IRExpr* e )
                                           AMD64AMode_IR(0, hregAMD64_RDX())));
          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argZ,
                                           AMD64AMode_IR(0, hregAMD64_RCX())));
-         /* call the helper */
-         addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
-                                        (ULong)(HWord)h_generic_calc_MAddF64,
-                                        4, mk_RetLoc_simple(RLPri_None) ));
+
+         /* Call the helper, preferring the fma4 variant and falling back to
+            the generic one.  The fma3 case was handled above without a helper. */
+#if defined(VGA_amd64)
+         if (env->hwcaps & VEX_HWCAPS_AMD64_FMA4) {
+            addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
+                                           (ULong)(HWord)h_amd64_calc_MAddF64_fma4,
+                                           4, mk_RetLoc_simple(RLPri_None) ));
+         } else
+#endif
+         {
+            addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
+                                           (ULong)(HWord)h_generic_calc_MAddF64,
+                                           4, mk_RetLoc_simple(RLPri_None) ));
+         }
+
          /* fetch the result from memory, using %r_argp, which the
            register allocator will keep alive across the call. */
         addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 8, dst,
@@ -5372,7 +5412,9 @@ HInstrArray* iselSB_AMD64 ( const IRSB* bb,
                        | VEX_HWCAPS_AMD64_AVX2
                        | VEX_HWCAPS_AMD64_F16C
                        | VEX_HWCAPS_AMD64_RDRAND
-                       | VEX_HWCAPS_AMD64_RDSEED)));
+                       | VEX_HWCAPS_AMD64_RDSEED
+                       | VEX_HWCAPS_AMD64_FMA3
+                       | VEX_HWCAPS_AMD64_FMA4)));
 
    /* Check that the host's endianness is as expected. */
    vassert(archinfo_host->endness == VexEndnessLE);
diff --git a/VEX/priv/host_amd64_maddf.c b/VEX/priv/host_amd64_maddf.c
new file mode 100644
index 000000000..579abb438
--- /dev/null
+++ b/VEX/priv/host_amd64_maddf.c
@@ -0,0 +1,35 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin host_amd64_maddf.c ---*/
+/*---------------------------------------------------------------*/
+
+/*
+   Compute x * y + z as ternary operation with intrinsics.
+*/
+
+
+#include "libvex_basictypes.h"
+#include "host_amd64_maddf.h"
+
+#if defined(VGA_amd64)
+void VEX_REGPARM(3)
+     h_amd64_calc_MAddF32_fma4 ( /*OUT*/Float* res,
+                                 Float* argX, Float* argY, Float* argZ )
+{
+   __asm__ ("vfmaddss %3,%2,%1,%0;" :
+            "=x"(*res): "x"(*argX),"x"(*argY), "x"(*argZ));
+   return ;
+}
+
+void VEX_REGPARM(3)
+     h_amd64_calc_MAddF64_fma4 ( /*OUT*/Double* res,
+                                 Double* argX, Double* argY, Double* argZ )
+{
+   __asm__ ("vfmaddsd %3,%2,%1,%0;" :
+            "=x"(*res): "x"(*argX),"x"(*argY), "x"(*argZ));
+   return;
+}
+#endif
+/*---------------------------------------------------------------*/
+/*--- end host_amd64_maddf.c --*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/host_amd64_maddf.h b/VEX/priv/host_amd64_maddf.h
new file mode 100644
index 000000000..b592a44e1
--- /dev/null
+++ b/VEX/priv/host_amd64_maddf.h
@@ -0,0 +1,32 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin host_amd64_maddf.h ---*/
+/*---------------------------------------------------------------*/
+
+/*
+   Compute x * y + z as ternary operation with intrinsics
+*/
+
+/* Generic helper functions for doing FMA, i.e. compute x * y + z
+   as ternary operation.
+   These are purely back-end entities and cannot be seen/referenced
+   from IR. */
+
+#ifndef __VEX_HOST_AMD64_MADDF_H
+#define __VEX_HOST_AMD64_MADDF_H
+
+#include "libvex_basictypes.h"
+
+#if defined(VGA_amd64)
+extern VEX_REGPARM(3)
+       void h_amd64_calc_MAddF32_fma4 ( /*OUT*/Float*, Float*, Float*, Float* );
+
+extern VEX_REGPARM(3)
+       void h_amd64_calc_MAddF64_fma4 ( /*OUT*/Double*, Double*, Double*,
+                                        Double* );
+#endif
+#endif /* ndef __VEX_HOST_AMD64_MADDF_H */
+
+/*---------------------------------------------------------------*/
+/*--- end host_amd64_maddf.h --*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/main_main.c b/VEX/priv/main_main.c
index 482047c7a..eda2fe6ee 100644
--- a/VEX/priv/main_main.c
+++ b/VEX/priv/main_main.c
@@ -1650,6 +1650,8 @@ static const HChar* show_hwcaps_amd64 ( UInt hwcaps )
      { VEX_HWCAPS_AMD64_F16C, "f16c" },
      { VEX_HWCAPS_AMD64_RDRAND, "rdrand" },
      { VEX_HWCAPS_AMD64_RDSEED, "rdseed" },
+     { VEX_HWCAPS_AMD64_FMA3, "fma" }, /* fma to keep the same naming as /proc/cpuinfo */
+     { VEX_HWCAPS_AMD64_FMA4, "fma4" },
   };
   /* Allocate a large enough buffer */
   static HChar buf[sizeof prefix +
diff --git a/VEX/pub/libvex.h b/VEX/pub/libvex.h
index 42c013c1e..15e2d39de 100644
--- a/VEX/pub/libvex.h
+++ b/VEX/pub/libvex.h
@@ -101,6 +101,8 @@ typedef
 #define VEX_HWCAPS_AMD64_RDRAND (1<<13) /* RDRAND instructions */
 #define VEX_HWCAPS_AMD64_F16C   (1<<14) /* F16C instructions */
 #define VEX_HWCAPS_AMD64_RDSEED (1<<15) /* RDSEED instructions */
+#define VEX_HWCAPS_AMD64_FMA3   (1<<16) /* FMA3 instructions */
+#define VEX_HWCAPS_AMD64_FMA4   (1<<17) /* FMA4 instructions */
 
 /* ppc32: baseline capability is integer only */
 #define VEX_HWCAPS_PPC32_F      (1<<8)  /* basic (non-optional) FP */
diff --git a/coregrind/m_machine.c b/coregrind/m_machine.c
index 079383651..81fb81064 100644
--- a/coregrind/m_machine.c
+++ b/coregrind/m_machine.c
@@ -984,6 +984,7 @@ Bool VG_(machine_get_hwcaps)( void )
 #elif defined(VGA_amd64)
    { Bool have_sse3, have_ssse3, have_cx8, have_cx16;
      Bool have_lzcnt, have_avx, have_bmi, have_avx2;
+     Bool have_fma3, have_fma4;
      Bool have_rdtscp, have_rdrand, have_f16c, have_rdseed;
      UInt eax, ebx, ecx, edx, max_basic, max_extended;
      ULong xgetbv_0 = 0;
@@ -992,7 +993,8 @@ Bool VG_(machine_get_hwcaps)( void )
 
      have_sse3 = have_ssse3 = have_cx8 = have_cx16
         = have_lzcnt = have_avx = have_bmi = have_avx2
-        = have_rdtscp = have_rdrand = have_f16c = have_rdseed = False;
+        = have_rdtscp = have_rdrand = have_f16c = have_rdseed
+        = have_fma3 = have_fma4 = False;
 
      eax = ebx = ecx = edx = max_basic = max_extended = 0;
 
@@ -1022,7 +1024,7 @@ Bool VG_(machine_get_hwcaps)( void )
      // we assume that SSE1 and SSE2 are available by default
      have_sse3 = (ecx & (1<<0)) != 0;  /* True => have sse3 insns */
      have_ssse3 = (ecx & (1<<9)) != 0; /* True => have Sup SSE3 insns */
-     // fma is ecx:12
+     have_fma3 = (ecx & (1<<12))!= 0;  /* True => have fma3 insns */
      // sse41 is ecx:19
      // sse42 is ecx:20
      // xsave is ecx:26
@@ -1032,7 +1034,7 @@ Bool VG_(machine_get_hwcaps)( void )
      have_rdrand = (ecx & (1<<30)) != 0; /* True => have RDRAND insns */
 
      have_avx = False;
-     /* have_fma = False; */
+
      if ( (ecx & ((1<<28)|(1<<27)|(1<<26))) == ((1<<28)|(1<<27)|(1<<26)) ) {
         /* Processor supports AVX instructions and XGETBV is enabled
            by OS and AVX instructions are enabled by the OS. */
@@ -1059,9 +1061,6 @@ Bool VG_(machine_get_hwcaps)( void )
            if (ebx2 == 576 && eax2 == 256) {
               have_avx = True;
            }
-           /* have_fma = (ecx & (1<<12)) != 0; */
-           /* have_fma: Probably correct, but gcc complains due to
-              unusedness. */
         }
      }
 
@@ -1089,6 +1088,11 @@ Bool VG_(machine_get_hwcaps)( void )
         have_rdtscp = (edx & (1<<27)) != 0; /* True => have RDTSVCP */
      }
 
+     if (max_extended >= 0x80000001) {
+        VG_(cpuid)(0x80000001, 0, &eax, &ebx, &ecx, &edx);
+        have_fma4= (ecx & (1<<16)) != 0; /* True => have fma4 */
+     }
+
      /* Check for BMI1 and AVX2. If we have AVX1 (plus OS support). */
      have_bmi = False;
      have_avx2 = False;
@@ -1120,7 +1124,9 @@ Bool VG_(machine_get_hwcaps)( void )
                | (have_rdtscp ? VEX_HWCAPS_AMD64_RDTSCP : 0)
                | (have_f16c ? VEX_HWCAPS_AMD64_F16C : 0)
                | (have_rdrand ? VEX_HWCAPS_AMD64_RDRAND : 0)
-               | (have_rdseed ? VEX_HWCAPS_AMD64_RDSEED : 0);
+               | (have_rdseed ? VEX_HWCAPS_AMD64_RDSEED : 0)
+               | (have_fma3 ? VEX_HWCAPS_AMD64_FMA3 : 0)
+               | (have_fma4 ? VEX_HWCAPS_AMD64_FMA4 : 0);
 
      VG_(machine_get_cache_info)(&vai);
 
-- 
2.11.4.GIT
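
The FMA3 path added to iselFltExpr_wrk/iselDblExpr_wrk first copies argX into
dst and then emits vfmadd213ss/vfmadd213sd with src1 = argY and src2 = argZ,
so the instruction computes dst = src1 * dst + src2, i.e. argX * argY + argZ.
The user-space program below is only a sketch of that operand convention (it
is not part of the patch, assumes gcc on an FMA-capable amd64 host, and the
madd_fma3 helper name is made up for the example); the FMA4 helper in
host_amd64_maddf.c computes the same value with the 4-operand vfmaddss form.

   #include <stdio.h>
   #include <math.h>

   /* Mirror the lowering: d starts as x, vfmadd213ss then does d = y*d + z. */
   static float madd_fma3 ( float x, float y, float z )
   {
      float d = x;                        /* dst := argX (the SseReRg move) */
      __asm__ ("vfmadd213ss %2, %1, %0"   /* dst = src1 * dst + src2        */
               : "+x"(d) : "x"(y), "x"(z));
      return d;
   }

   int main ( void )
   {
      float x = 1.25f, y = 3.0f, z = -0.5f;
      /* Both values should agree (3.25). */
      printf("fma3: %g  libm: %g\n", madd_fma3(x, y, z), fmaf(x, y, z));
      return 0;
   }

Built with something like "gcc -O2 fma3_demo.c -lm"; running it on a CPU
without FMA raises SIGILL, which is exactly the situation the
VEX_HWCAPS_AMD64_FMA3 check guards against.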
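
For the new emit_AMD64Instr cases, one way to sanity-check the 3-byte VEX
encoding is to replay the same bit-twiddling outside of VEX. For
vfmadd213ss %xmm2,%xmm1,%xmm0 (d = 0, v = 1, s = 2, map m = 2 for 0F38,
pp = 1 for the 66 prefix, W = 0) the bytes come out as C4 E2 71 A9 C2. The
snippet below is illustration only, not code from the patch:

   #include <stdio.h>

   int main ( void )
   {
      unsigned d = 0, v = 1, s = 2, m = 2, pp = 1, opcode = 0xa9;
      unsigned char b[5];
      b[0] = 0xC4;                                         /* 3-byte VEX escape   */
      b[1] = ((((~d)>>3)&1)<<7) | (1<<6) | ((((~s)>>3)&1)<<5) | m;
      b[2] = ((~v & 0x0f) << 3) | pp;                      /* W=0 for the SS form */
      b[3] = opcode;                                       /* VFMADD213 = 0xa9    */
      b[4] = (1<<7) | (1<<6) | ((d&7) << 3) | (s&7);       /* modrm: 11 reg rm    */
      for (int i = 0; i < 5; i++)
         printf("%02X ", b[i]);                            /* C4 E2 71 A9 C2      */
      printf("\n");
      return 0;
   }

The Ain_Avx64FLo case differs only in setting the VEX.W bit (the extra 1<<7 in
the second payload byte), which selects the 64-bit vfmadd213sd form.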
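
The hwcap probing added to VG_(machine_get_hwcaps) reads the standard CPUID
bits: FMA (FMA3) is ECX bit 12 of leaf 1, and FMA4 is ECX bit 16 of extended
leaf 0x80000001, which is also why the new capability is printed as "fma" to
match /proc/cpuinfo. A minimal standalone version of the same checks, using
GCC's <cpuid.h> (again just a sketch, not part of the patch):

   #include <stdio.h>
   #include <cpuid.h>

   int main ( void )
   {
      unsigned eax, ebx, ecx, edx;
      int have_fma3 = 0, have_fma4 = 0;

      /* Leaf 1, ECX bit 12: FMA3 -- the bit m_machine.c now tests. */
      if (__get_cpuid(1, &eax, &ebx, &ecx, &edx))
         have_fma3 = (ecx & (1u << 12)) != 0;

      /* Extended leaf 0x80000001, ECX bit 16: FMA4 (an AMD extension). */
      if (__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx))
         have_fma4 = (ecx & (1u << 16)) != 0;

      printf("fma: %d  fma4: %d\n", have_fma3, have_fma4);
      return 0;
   }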