Index: llvm/lib/Target/PowerPC/PPCISelLowering.h
===================================================================
--- llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -404,8 +404,9 @@
     /// representation.
     QBFLT,
 
-    /// Custom extend v4f32 to v2f64.
-    FP_EXTEND_LH,
+    /// FP_EXTEND_HALF(VECTOR, IDX) - Custom extend upper (IDX=0) half or
+    /// lower (IDX=1) half of v4f32 to v2f64.
+    FP_EXTEND_HALF,
 
     /// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a
     /// byte-swapping store instruction. It byte-swaps the low "Type" bits of
Index: llvm/lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1380,7 +1380,7 @@
   case PPCISD::BUILD_FP128:     return "PPCISD::BUILD_FP128";
   case PPCISD::EXTSWSLI:        return "PPCISD::EXTSWSLI";
   case PPCISD::LD_VSX_LH:       return "PPCISD::LD_VSX_LH";
-  case PPCISD::FP_EXTEND_LH:    return "PPCISD::FP_EXTEND_LH";
+  case PPCISD::FP_EXTEND_HALF:  return "PPCISD::FP_EXTEND_HALF";
   }
   return nullptr;
 }
@@ -9628,6 +9628,21 @@
   switch (Op0.getOpcode()) {
   default:
     return SDValue();
+  case ISD::EXTRACT_SUBVECTOR: {
+    assert((Op0.getNumOperands() == 2 && isa<ConstantSDNode>(Op0->getOperand(1)))
+           && "Node should have 2 operands with second one being a constant!");
+
+    // Custom lowering is only done for the high or low half.
+    int Idx = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
+    if (Idx % 2 != 0) return SDValue();
+    int Word = Idx ? 1 : 0;
+
+    // High and low word positions are different on little endian.
+    if (Subtarget.isLittleEndian()) Word = !Word;
+
+    return DAG.getNode(PPCISD::FP_EXTEND_HALF, dl, MVT::v2f64,
+                       Op0.getOperand(0), DAG.getConstant(Word, dl, MVT::i32));
+  }
   case ISD::FADD:
   case ISD::FMUL:
   case ISD::FSUB: {
@@ -9649,7 +9664,8 @@
     SDValue NewOp = DAG.getNode(Op0.getOpcode(), SDLoc(Op0), MVT::v4f32,
                                 NewLoad[0], NewLoad[1],
                                 Op0.getNode()->getFlags());
-    return DAG.getNode(PPCISD::FP_EXTEND_LH, dl, MVT::v2f64, NewOp);
+    return DAG.getNode(PPCISD::FP_EXTEND_HALF, dl, MVT::v2f64, NewOp,
+                       DAG.getConstant(0, dl, MVT::i32));
   }
   case ISD::LOAD: {
     LoadSDNode *LD = cast<LoadSDNode>(Op0);
@@ -9658,7 +9674,8 @@
         DAG.getMemIntrinsicNode(PPCISD::LD_VSX_LH, dl,
                                 DAG.getVTList(MVT::v4f32, MVT::Other), LoadOps,
                                 LD->getMemoryVT(), LD->getMemOperand());
-    return DAG.getNode(PPCISD::FP_EXTEND_LH, dl, MVT::v2f64, NewLd);
+    return DAG.getNode(PPCISD::FP_EXTEND_HALF, dl, MVT::v2f64, NewLd,
+                       DAG.getConstant(0, dl, MVT::i32));
   }
   }
   llvm_unreachable("ERROR:Should return for all cases within swtich.");
Index: llvm/lib/Target/PowerPC/PPCInstrVSX.td
===================================================================
--- llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -58,8 +58,8 @@
   SDTCisVT<0, v4f32>, SDTCisPtrTy<1>
 ]>;
 
-def SDT_PPCfpextlh : SDTypeProfile<1, 1, [
-  SDTCisVT<0, v2f64>, SDTCisVT<1, v4f32>
+def SDT_PPCfpexth : SDTypeProfile<1, 2, [
+  SDTCisVT<0, v2f64>, SDTCisVT<1, v4f32>, SDTCisPtrTy<2>
 ]>;
 
 // Little-endian-specific nodes.
@@ -93,7 +93,7 @@
 def PPCswapNoChain : SDNode<"PPCISD::SWAP_NO_CHAIN", SDT_PPCxxswapd>;
 def PPCvabsd : SDNode<"PPCISD::VABSD", SDTVabsd, []>;
 
-def PPCfpextlh : SDNode<"PPCISD::FP_EXTEND_LH", SDT_PPCfpextlh, []>;
+def PPCfpexth : SDNode<"PPCISD::FP_EXTEND_HALF", SDT_PPCfpexth, []>;
 def PPCldvsxlh : SDNode<"PPCISD::LD_VSX_LH", SDT_PPCldvsxlh,
                         [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
 
@@ -1073,7 +1073,8 @@
 def : Pat<(v2f64 (PPCuvec2fp v4i32:$C, 1)),
           (v2f64 (XVCVUXWDP (v2i64 (XXMRGLW $C, $C))))>;
 
-def : Pat<(v2f64 (PPCfpextlh v4f32:$C)), (XVCVSPDP (XXMRGHW $C, $C))>;
+def : Pat<(v2f64 (PPCfpexth v4f32:$C, 0)), (XVCVSPDP (XXMRGHW $C, $C))>;
+def : Pat<(v2f64 (PPCfpexth v4f32:$C, 1)), (XVCVSPDP (XXMRGLW $C, $C))>;
 
 // Loads.
 let Predicates = [HasVSX, HasOnlySwappingMemOps] in {
Index: llvm/test/CodeGen/PowerPC/reduce_scalarization02.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/PowerPC/reduce_scalarization02.ll
@@ -0,0 +1,42 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-unknown \
+; RUN:   -mcpu=pwr9 -ppc-asm-full-reg-names \
+; RUN:   -ppc-vsr-nums-as-vr < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-unknown \
+; RUN:   -mcpu=pwr9 -ppc-asm-full-reg-names \
+; RUN:   -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE
+
+; Test reduce scalarization in fpext v2f32 to v2f64 from the extract_subvector v4f32 node.
+
+define dso_local void @test(<4 x float>* nocapture readonly %a, <2 x double>* nocapture %b, <2 x double>* nocapture %c) {
+; CHECK-LABEL: test:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lxv vs0, 0(r3)
+; CHECK-NEXT:    xxmrglw vs1, vs0, vs0
+; CHECK-NEXT:    xxmrghw vs0, vs0, vs0
+; CHECK-NEXT:    xvcvspdp vs1, vs1
+; CHECK-NEXT:    xvcvspdp vs0, vs0
+; CHECK-NEXT:    stxv vs1, 0(r4)
+; CHECK-NEXT:    stxv vs0, 0(r5)
+; CHECK-NEXT:    blr
+;
+; CHECK-BE-LABEL: test:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv vs0, 0(r3)
+; CHECK-BE-NEXT:    xxmrghw vs1, vs0, vs0
+; CHECK-BE-NEXT:    xxmrglw vs0, vs0, vs0
+; CHECK-BE-NEXT:    xvcvspdp vs1, vs1
+; CHECK-BE-NEXT:    xvcvspdp vs0, vs0
+; CHECK-BE-NEXT:    stxv vs1, 0(r4)
+; CHECK-BE-NEXT:    stxv vs0, 0(r5)
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = load <4 x float>, <4 x float>* %a, align 16
+  %shuffle = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  %shuffle1 = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+  %vecinit4 = fpext <2 x float> %shuffle to <2 x double>
+  %vecinit11 = fpext <2 x float> %shuffle1 to <2 x double>
+  store <2 x double> %vecinit4, <2 x double>* %b, align 16
+  store <2 x double> %vecinit11, <2 x double>* %c, align 16
+  ret void
+}