From 699414a49357812bd428e9ccd37af398ca299c90 Mon Sep 17 00:00:00 2001
From: Nemanja Ivanovic
Date: Fri, 26 Oct 2018 03:19:13 +0000
Subject: [PATCH] [PowerPC] Keep vector int to fp conversions in vector domain

At present a v2i16 -> v2f64 convert is implemented by extracts to scalar,
scalar converts, and merge back into a vector. Use vector converts instead,
with the int data permuted into the proper position and extended if necessary.

Patch by RolandF.

Differential revision: https://reviews.llvm.org/D53346

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345361 91177308-0d34-0410-b5e6-96231b3b80d8
---
Restoration note (not part of the commit message): this copy was rebuilt from
a flattened listing in which line breaks and angle-bracketed spans were lost.
The template arguments on the two SmallVector declarations were re-inserted;
the element types follow from how the vectors are used, but the inline
capacity (16) is assumed -- TODO confirm against trunk r345361. Leading
whitespace on context lines is likewise reconstructed.

 lib/Target/PowerPC/PPCISelLowering.cpp |  68 ++++++++++++
 lib/Target/PowerPC/PPCISelLowering.h   |   3 +
 test/CodeGen/PowerPC/vec-itofp.ll      | 192 +++++++++++++++++++++++++++++++++
 3 files changed, 263 insertions(+)
 create mode 100644 test/CodeGen/PowerPC/vec-itofp.ll

diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index ca60f318278..860181c57bd 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -792,6 +792,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
       setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
       setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
 
+      setOperationAction(ISD::UINT_TO_FP, MVT::v2i16, Custom);
+      setOperationAction(ISD::SINT_TO_FP, MVT::v2i16, Custom);
+
       setOperationAction(ISD::FNEG, MVT::v4f32, Legal);
       setOperationAction(ISD::FNEG, MVT::v2f64, Legal);
       setOperationAction(ISD::FABS, MVT::v4f32, Legal);
@@ -7265,10 +7268,75 @@ SDValue PPCTargetLowering::LowerINT_TO_FPDirectMove(SDValue Op,
   return FP;
 }
 
+static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl) {
+
+  EVT VecVT = Vec.getValueType();
+  assert(VecVT.isVector() && "Expected a vector type.");
+  assert(VecVT.getSizeInBits() < 128 && "Vector is already full width.");
+
+  EVT EltVT = VecVT.getVectorElementType();
+  unsigned WideNumElts = 128 / EltVT.getSizeInBits();
+  EVT WideVT = EVT::getVectorVT(*DAG.getContext(), EltVT, WideNumElts);
+
+  unsigned NumConcat = WideNumElts / VecVT.getVectorNumElements();
+  SmallVector<SDValue, 16> Ops(NumConcat);
+  Ops[0] = Vec;
+  SDValue UndefVec = DAG.getUNDEF(VecVT);
+  for (unsigned i = 1; i < NumConcat; ++i)
+    Ops[i] = UndefVec;
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Ops);
+}
+
+SDValue PPCTargetLowering::LowerINT_TO_FPVector(SDValue Op,
+                                                SelectionDAG &DAG,
+                                                const SDLoc &dl) const {
+
+  unsigned Opc = Op.getOpcode();
+  assert((Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP) &&
+         "Unexpected conversion type");
+  assert(Op.getValueType() == MVT::v2f64 && "Supports v2f64 only.");
+
+  // CPU's prior to P9 don't have a way to sign-extend in vectors.
+  bool SignedConv = Opc == ISD::SINT_TO_FP;
+  if (SignedConv && !Subtarget.hasP9Altivec())
+    return SDValue();
+
+  SDValue Wide = widenVec(DAG, Op.getOperand(0), dl);
+  EVT WideVT = Wide.getValueType();
+  unsigned WideNumElts = WideVT.getVectorNumElements();
+
+  SmallVector<int, 16> ShuffV;
+  for (unsigned i = 0; i < WideNumElts; ++i)
+    ShuffV.push_back(i + WideNumElts);
+
+  if (Subtarget.isLittleEndian()) {
+    ShuffV[0] = 0;
+    ShuffV[WideNumElts / 2] = 1;
+  }
+  else {
+    ShuffV[WideNumElts / 2 - 1] = 0;
+    ShuffV[WideNumElts - 1] = 1;
+  }
+
+  SDValue ShuffleSrc2 = SignedConv ? DAG.getUNDEF(WideVT) :
+                                     DAG.getConstant(0, dl, WideVT);
+  SDValue Arrange = DAG.getVectorShuffle(WideVT, dl, Wide, ShuffleSrc2, ShuffV);
+  unsigned ExtendOp = SignedConv ? (unsigned) PPCISD::SExtVElems :
+                                   (unsigned) ISD::BITCAST;
+  SDValue Extend = DAG.getNode(ExtendOp, dl, MVT::v2i64, Arrange);
+
+  return DAG.getNode(Opc, dl, Op.getValueType(), Extend);
+}
+
 SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
   SDLoc dl(Op);
 
+  if (Op.getValueType() == MVT::v2f64 &&
+      Op.getOperand(0).getValueType() == MVT::v2i16)
+    return LowerINT_TO_FPVector(Op, DAG, dl);
+
   // Conversions to f128 are legal.
   if (EnableQuadPrecision && (Op.getValueType() == MVT::f128))
     return Op;
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 959831cb1c0..081e7a92bf2 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -927,6 +927,9 @@ namespace llvm {
     SDValue LowerINT_TO_FPDirectMove(SDValue Op, SelectionDAG &DAG,
                                      const SDLoc &dl) const;
 
+    SDValue LowerINT_TO_FPVector(SDValue Op, SelectionDAG &DAG,
+                                 const SDLoc &dl) const;
+
     SDValue getFramePointerFrameIndex(SelectionDAG & DAG) const;
     SDValue getReturnAddrFrameIndex(SelectionDAG & DAG) const;
 
diff --git a/test/CodeGen/PowerPC/vec-itofp.ll b/test/CodeGen/PowerPC/vec-itofp.ll
new file mode 100644
index 00000000000..852b7c822ad
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec-itofp.ll
@@ -0,0 +1,192 @@
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:   -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P8
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:   -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P9
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:   -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-BE
+
+define void @test8(<8 x double>* nocapture %Sink, <8 x i16>* nocapture readonly %SrcPtr) {
+entry:
+  %0 = load <8 x i16>, <8 x i16>* %SrcPtr, align 16
+  %1 = uitofp <8 x i16> %0 to <8 x double>
+  store <8 x double> %1, <8 x double>* %Sink, align 16
+  ret void
+; CHECK-P9-LABEL: @test8
+; CHECK-P9: vperm
+; CHECK-P9: vperm
+; CHECK-P9: vperm
+; CHECK-P9: vperm
+; CHECK-P9: xvcvuxddp
+; CHECK-P9: xvcvuxddp
+; CHECK-P9: xvcvuxddp
+; CHECK-P9: xvcvuxddp
+; CHECK-P8-LABEL: @test8
+; CHECK-P8: vperm
+; CHECK-P8: vperm
+; CHECK-P8: vperm
+; CHECK-P8: vperm
+; CHECK-P8: xvcvuxddp
+; CHECK-P8: xvcvuxddp
+; CHECK-P8: xvcvuxddp
+; CHECK-P8: xvcvuxddp
+}
+
+define void @test4(<4 x double>* nocapture %Sink, <4 x i16>* nocapture readonly %SrcPtr) {
+entry:
+  %0 = load <4 x i16>, <4 x i16>* %SrcPtr, align 16
+  %1 = uitofp <4 x i16> %0 to <4 x double>
+  store <4 x double> %1, <4 x double>* %Sink, align 16
+  ret void
+; CHECK-P9-LABEL: @test4
+; CHECK-P9: vperm
+; CHECK-P9: vperm
+; CHECK-P9: xvcvuxddp
+; CHECK-P9: xvcvuxddp
+; CHECK-P8-LABEL: @test4
+; CHECK-P8: vperm
+; CHECK-P8: vperm
+; CHECK-P8: xvcvuxddp
+; CHECK-P8: xvcvuxddp
+}
+
+define void @test2(<2 x double>* nocapture %Sink, <2 x i16>* nocapture readonly %SrcPtr) {
+entry:
+  %0 = load <2 x i16>, <2 x i16>* %SrcPtr, align 16
+  %1 = uitofp <2 x i16> %0 to <2 x double>
+  store <2 x double> %1, <2 x double>* %Sink, align 16
+  ret void
+; CHECK-P9-LABEL: .LCPI2_0:
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 30
+; CHECK-P9-NEXT: .byte 13
+; CHECK-P9-NEXT: .byte 12
+; CHECK-P9-NEXT: .byte 11
+; CHECK-P9-NEXT: .byte 10
+; CHECK-P9-NEXT: .byte 9
+; CHECK-P9-NEXT: .byte 8
+; CHECK-P9-NEXT: .byte 29
+; CHECK-P9-NEXT: .byte 28
+; CHECK-P9-NEXT: .byte 5
+; CHECK-P9-NEXT: .byte 4
+; CHECK-P9-NEXT: .byte 3
+; CHECK-P9-NEXT: .byte 2
+; CHECK-P9-NEXT: .byte 1
+; CHECK-P9-NEXT: .byte 0
+; CHECK-P9: addi [[REG1:r[0-9]+]], {{r[0-9]+}}, .LCPI2_0@toc@l
+; CHECK-P9: lxvx [[REG2:v[0-9]+]], 0, [[REG1]]
+; CHECK-P9: vperm [[REG3:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, [[REG2]]
+; CHECK-P9: xvcvuxddp {{vs[0-9]+}}, [[REG3]]
+; CHECK-P8-LABEL: @test2
+; CHECK-P8: vperm [[REG1:v[0-9]+]]
+; CHECK-P8: xvcvuxddp {{vs[0-9]+}}, [[REG1]]
+; CHECK-BE-LABEL: .LCPI2_0:
+; CHECK-BE-NEXT: .byte 16
+; CHECK-BE-NEXT: .byte 17
+; CHECK-BE-NEXT: .byte 18
+; CHECK-BE-NEXT: .byte 19
+; CHECK-BE-NEXT: .byte 20
+; CHECK-BE-NEXT: .byte 21
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 1
+; CHECK-BE-NEXT: .byte 24
+; CHECK-BE-NEXT: .byte 25
+; CHECK-BE-NEXT: .byte 26
+; CHECK-BE-NEXT: .byte 27
+; CHECK-BE-NEXT: .byte 28
+; CHECK-BE-NEXT: .byte 29
+; CHECK-BE-NEXT: .byte 2
+; CHECK-BE-NEXT: .byte 3
+; CHECK-BE: addi [[REG1:r[0-9]+]], {{r[0-9]+}}, .LCPI2_0@toc@l
+; CHECK-BE: lxvx [[REG2:v[0-9]+]], 0, [[REG1]]
+; CHECK-BE: vperm [[REG3:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, [[REG2]]
+; CHECK-BE: xvcvuxddp {{vs[0-9]+}}, [[REG3]]
+}
+
+define void @stest8(<8 x double>* nocapture %Sink, <8 x i16>* nocapture readonly %SrcPtr) {
+entry:
+  %0 = load <8 x i16>, <8 x i16>* %SrcPtr, align 16
+  %1 = sitofp <8 x i16> %0 to <8 x double>
+  store <8 x double> %1, <8 x double>* %Sink, align 16
+  ret void
+; CHECK-P9-LABEL: @stest8
+; CHECK-P9: vperm
+; CHECK-P9: vperm
+; CHECK-P9: vperm
+; CHECK-P9: vperm
+; CHECK-P9: vextsh2d
+; CHECK-P9: vextsh2d
+; CHECK-P9: vextsh2d
+; CHECK-P9: vextsh2d
+; CHECK-P9: xvcvsxddp
+; CHECK-P9: xvcvsxddp
+; CHECK-P9: xvcvsxddp
+; CHECK-P9: xvcvsxddp
+}
+
+define void @stest4(<4 x double>* nocapture %Sink, <4 x i16>* nocapture readonly %SrcPtr) {
+entry:
+  %0 = load <4 x i16>, <4 x i16>* %SrcPtr, align 16
+  %1 = sitofp <4 x i16> %0 to <4 x double>
+  store <4 x double> %1, <4 x double>* %Sink, align 16
+  ret void
+; CHECK-P9-LABEL: @stest4
+; CHECK-P9: vperm
+; CHECK-P9: vperm
+; CHECK-P9: vextsh2d
+; CHECK-P9: vextsh2d
+; CHECK-P9: xvcvsxddp
+; CHECK-P9: xvcvsxddp
+}
+
+define void @stest2(<2 x double>* nocapture %Sink, <2 x i16>* nocapture readonly %SrcPtr) {
+entry:
+  %0 = load <2 x i16>, <2 x i16>* %SrcPtr, align 16
+  %1 = sitofp <2 x i16> %0 to <2 x double>
+  store <2 x double> %1, <2 x double>* %Sink, align 16
+  ret void
+; CHECK-P9-LABEL: .LCPI5_0:
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 30
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 29
+; CHECK-P9-NEXT: .byte 28
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9: vperm [[REG1:v[0-9]+]]
+; CHECK-P9: vextsh2d [[REG2:v[0-9]+]], [[REG1]]
+; CHECK-P9: xvcvsxddp {{vs[0-9]+}}, [[REG2]]
+; CHECK-BE-LABEL: .LCPI5_0:
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 1
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 2
+; CHECK-BE-NEXT: .byte 3
+; CHECK-BE: addi [[REG1:r[0-9]+]], {{r[0-9]+}}, .LCPI5_0@toc@l
+; CHECK-BE: lxvx [[REG2:v[0-9]+]], 0, [[REG1]]
+; CHECK-BE: vperm [[REG3:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, [[REG2]]
+; CHECK-BE: vextsh2d [[REG4:v[0-9]+]], [[REG3]]
+; CHECK-BE: xvcvsxddp {{vs[0-9]+}}, [[REG4]]
+}
-- 
2.11.4.GIT