Index: llvm/lib/Target/PowerPC/PPCISelLowering.h
===================================================================
--- llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -404,6 +404,9 @@
     /// representation.
     QBFLT,
 
+    /// Custom extend v4f32 to v2f64.
+    FP_EXTEND_LHW,
+
     /// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a
     /// byte-swapping store instruction.  It byte-swaps the low "Type" bits of
     /// the GPRC input, then stores it through Ptr.  Type can be either i16 or
@@ -445,6 +448,10 @@
     /// an xxswapd.
     LXVD2X,
 
+    /// VSRC, CHAIN = LXVLHW CHAIN, Ptr - This is a floating-point load of a
+    /// v2f32 value into the lower half of a VSR register.
+    LXVLHW,
+
     /// CHAIN = STXVD2X CHAIN, VSRC, Ptr - Occurs only for little endian.
     /// Maps directly to an stxvd2x instruction that will be preceded by
     /// an xxswapd.
@@ -1018,6 +1025,7 @@
     SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerABS(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
 
     SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const;
Index: llvm/lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -865,6 +865,7 @@
       setOperationAction(ISD::FPOWI, MVT::f128, Expand);
       setOperationAction(ISD::FREM, MVT::f128, Expand);
     }
+    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
   }
 
@@ -1365,6 +1366,8 @@
   case PPCISD::QVLFSb:          return "PPCISD::QVLFSb";
   case PPCISD::BUILD_FP128:     return "PPCISD::BUILD_FP128";
   case PPCISD::EXTSWSLI:        return "PPCISD::EXTSWSLI";
+  case PPCISD::LXVLHW:          return "PPCISD::LXVLHW";
+  case PPCISD::FP_EXTEND_LHW:   return "PPCISD::FP_EXTEND_LHW";
   }
   return nullptr;
 }
@@ -9512,6 +9515,59 @@
   return BuildIntrinsicOp(BifID, X, Y, DAG, dl, VT);
 }
 
+// Custom lowering for fpext v2f32 to v2f64.  Rewrites the fpext (and, where
+// profitable, a feeding binary FP op on loaded values) in terms of LXVLHW /
+// FP_EXTEND_LHW so the whole computation stays in vector registers.
+SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
+
+  assert(Op.getOpcode() == ISD::FP_EXTEND &&
+         "Should only be called for ISD::FP_EXTEND");
+
+  // Only combine when the result is MVT::v2f64 and the operand is v2f32.
+  if (Op.getValueType() != MVT::v2f64 ||
+      Op.getOperand(0).getValueType() != MVT::v2f32)
+    return SDValue();
+
+  SDLoc dl(Op);
+  SDValue Op0 = Op.getOperand(0);
+
+  switch (Op0.getOpcode()) {
+  default:
+    return SDValue();
+  case ISD::FADD:
+  case ISD::FMUL:
+  case ISD::FSUB: {
+    SDValue NewLoad[2];
+    for (unsigned i = 0, ie = Op0.getNumOperands(); i != ie; ++i) {
+      // Ensure both inputs are loads.
+      SDValue LdOp = Op0.getOperand(i);
+      if (LdOp.getOpcode() != ISD::LOAD)
+        return SDValue();
+      // Generate a new load of the low half of a VSR.
+      LoadSDNode *LD = cast<LoadSDNode>(LdOp);
+      SDValue LoadOps[] = { LD->getChain(), LD->getBasePtr() };
+      NewLoad[i] =
+        DAG.getMemIntrinsicNode(PPCISD::LXVLHW, dl,
+                                DAG.getVTList(MVT::v4f32, MVT::Other),
+                                LoadOps, LD->getMemoryVT(),
+                                LD->getMemOperand());
+    }
+    // Re-create the binary op on v4f32 and extend only the low half.
+    SDValue NewOp = DAG.getNode(Op0.getOpcode(), SDLoc(Op0), MVT::v4f32,
+                                NewLoad[0], NewLoad[1],
+                                Op0.getNode()->getFlags());
+    return DAG.getNode(PPCISD::FP_EXTEND_LHW, dl, MVT::v2f64, NewOp);
+  }
+  case ISD::LOAD: {
+    LoadSDNode *LD = cast<LoadSDNode>(Op0);
+    SDValue LoadOps[] = { LD->getChain(), LD->getBasePtr() };
+    SDValue NewLd =
+      DAG.getMemIntrinsicNode(PPCISD::LXVLHW, dl,
+                              DAG.getVTList(MVT::v4f32, MVT::Other),
+                              LoadOps, LD->getMemoryVT(), LD->getMemOperand());
+    return DAG.getNode(PPCISD::FP_EXTEND_LHW, dl, MVT::v2f64, NewLd);
+  }
+  }
+  llvm_unreachable("Should never reach here!");
+}
+
 /// LowerOperation - Provide custom lowering hooks for some operations.
/// SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { @@ -9565,6 +9621,7 @@ case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); case ISD::ABS: return LowerABS(Op, DAG); + case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); // For counter-based loop handling. case ISD::INTRINSIC_W_CHAIN: return SDValue(); Index: llvm/lib/Target/PowerPC/PPCInstrVSX.td =================================================================== --- llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -53,6 +53,15 @@ def spilltovsrrc : RegisterOperand { let ParserMatchClass = PPCRegSPILLTOVSRRCAsmOperand; } + +def SDT_PPClxvlhw : SDTypeProfile<1, 1, [ + SDTCisVT<0, v4f32>, SDTCisPtrTy<1> +]>; + +def SDT_PPCfpextlhw : SDTypeProfile<1, 1, [ + SDTCisVT<0, v2f64>, SDTCisVT<1, v4f32> +]>; + // Little-endian-specific nodes. def SDT_PPClxvd2x : SDTypeProfile<1, 1, [ SDTCisVT<0, v2f64>, SDTCisPtrTy<1> @@ -84,6 +93,10 @@ def PPCswapNoChain : SDNode<"PPCISD::SWAP_NO_CHAIN", SDT_PPCxxswapd>; def PPCvabsd : SDNode<"PPCISD::VABSD", SDTVabsd, []>; +def PPCfpextlhw : SDNode<"PPCISD::FP_EXTEND_LHW", SDT_PPCfpextlhw, []>; +def PPClxvlhw : SDNode<"PPCISD::LXVLHW", SDT_PPClxvlhw, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + multiclass XX3Form_Rcr opcode, bits<7> xo, string asmbase, string asmstr, InstrItinClass itin, Intrinsic Int, ValueType OutTy, ValueType InTy> { @@ -1062,6 +1075,8 @@ def : Pat<(v2f64 (PPCuvec2fp v4i32:$C, 1)), (v2f64 (XVCVUXWDP (v2i64 (XXMRGLW $C, $C))))>; +def : Pat<(v2f64 (PPCfpextlhw v4f32:$C)), (XVCVSPDP (XXMRGHW $C, $C))>; + // Loads. 
let Predicates = [HasVSX, HasOnlySwappingMemOps] in { def : Pat<(v2f64 (PPClxvd2x xoaddr:$src)), (LXVD2X xoaddr:$src)>; @@ -3288,6 +3303,10 @@ def : Pat<(f32 (fpround (f64 (extloadf32 ixaddr:$src)))), (f32 (DFLOADf32 ixaddr:$src))>; + def : Pat<(v4f32 (PPClxvlhw xaddr:$src)), + (COPY_TO_REGCLASS (XFLOADf64 xaddr:$src), VRRC)>; + def : Pat<(v4f32 (PPClxvlhw ixaddr:$src)), + (COPY_TO_REGCLASS (DFLOADf64 ixaddr:$src), VRRC)>; let AddedComplexity = 400 in { // The following pseudoinstructions are used to ensure the utilization Index: llvm/test/CodeGen/PowerPC/reduce_scalarization.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/PowerPC/reduce_scalarization.ll @@ -0,0 +1,80 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-unknown \ +; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-unknown \ +; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s + +; Function Attrs: norecurse nounwind readonly +define dso_local <2 x double> @test1(<2 x float>* nocapture readonly %Ptr) { +; CHECK-LABEL: test1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lfd f0, 0(r3) +; CHECK-NEXT: xxmrghw vs0, vs0, vs0 +; CHECK-NEXT: xvcvspdp v2, vs0 +; CHECK-NEXT: blr +entry: + %0 = load <2 x float>, <2 x float>* %Ptr, align 8 + %1 = fpext <2 x float> %0 to <2 x double> + ret <2 x double> %1 +} + +; Function Attrs: norecurse nounwind readonly +define dso_local <2 x double> @test2(<2 x float>* nocapture readonly %a, <2 x float>* nocapture readonly %b) { +; CHECK-LABEL: test2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lfd f0, 0(r4) +; CHECK-NEXT: xxlor v2, vs0, vs0 +; CHECK-NEXT: lfd f0, 0(r3) +; CHECK-NEXT: xvsubsp vs0, vs0, v2 +; CHECK-NEXT: xxmrghw vs0, vs0, vs0 +; CHECK-NEXT: xvcvspdp v2, vs0 +; CHECK-NEXT: blr 
+entry:
+  %0 = load <2 x float>, <2 x float>* %a, align 8
+  %1 = load <2 x float>, <2 x float>* %b, align 8
+  %sub = fsub <2 x float> %0, %1
+  %2 = fpext <2 x float> %sub to <2 x double>
+  ret <2 x double> %2
+}
+
+; Function Attrs: norecurse nounwind readonly
+; Function Attrs: norecurse nounwind readonly
+define dso_local <2 x double> @test3(<2 x float>* nocapture readonly %a, <2 x float>* nocapture readonly %b) {
+; CHECK-LABEL: test3:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lfd f0, 0(r4)
+; CHECK-NEXT:    xxlor v2, vs0, vs0
+; CHECK-NEXT:    lfd f0, 0(r3)
+; CHECK-NEXT:    xvaddsp vs0, vs0, v2
+; CHECK-NEXT:    xxmrghw vs0, vs0, vs0
+; CHECK-NEXT:    xvcvspdp v2, vs0
+; CHECK-NEXT:    blr
+entry:
+  %0 = load <2 x float>, <2 x float>* %a, align 8
+  %1 = load <2 x float>, <2 x float>* %b, align 8
+  %sub = fadd <2 x float> %0, %1
+  %2 = fpext <2 x float> %sub to <2 x double>
+  ret <2 x double> %2
+}
+
+; Function Attrs: norecurse nounwind readonly
+; Function Attrs: norecurse nounwind readonly
+define dso_local <2 x double> @test4(<2 x float>* nocapture readonly %a, <2 x float>* nocapture readonly %b) {
+; CHECK-LABEL: test4:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lfd f0, 0(r4)
+; CHECK-NEXT:    xxlor v2, vs0, vs0
+; CHECK-NEXT:    lfd f0, 0(r3)
+; CHECK-NEXT:    xvmulsp vs0, vs0, v2
+; CHECK-NEXT:    xxmrghw vs0, vs0, vs0
+; CHECK-NEXT:    xvcvspdp v2, vs0
+; CHECK-NEXT:    blr
+entry:
+  %0 = load <2 x float>, <2 x float>* %a, align 8
+  %1 = load <2 x float>, <2 x float>* %b, align 8
+  %sub = fmul <2 x float> %0, %1
+  %2 = fpext <2 x float> %sub to <2 x double>
+  ret <2 x double> %2
+}