diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -6717,6 +6717,114 @@
   CurDAG->RemoveDeadNodes();
 }
 
+/// Return true if \p N is a selected machine node that acts as a VSX swap:
+/// either XXPERMDIs with immediate 2, or XXPERMDI/XXSLDWI with identical
+/// source operands and immediate 2.
+static bool isVSXSwap(SDValue N) {
+  if (!N->isMachineOpcode())
+    return false;
+  unsigned Opc = N->getMachineOpcode();
+
+  // Single-input form: operand 1 is the immediate.
+  if (Opc == PPC::XXPERMDIs) {
+    return isa<ConstantSDNode>(N->getOperand(1)) &&
+           N->getConstantOperandVal(1) == 2;
+  } else if (Opc == PPC::XXPERMDI || Opc == PPC::XXSLDWI) {
+    // Two-input form: only a swap when both inputs are the same value.
+    return N->getOperand(0) == N->getOperand(1) &&
+           isa<ConstantSDNode>(N->getOperand(2)) &&
+           N->getConstantOperandVal(2) == 2;
+  }
+
+  return false;
+}
+
+/// Return true if \p N is a machine node whose opcode is in the list of vector
+/// operations below that are treated as lane-insensitive. Nodes that are not
+/// machine nodes, and opcodes not listed, yield false.
+// TODO: Make this complete and replace with a table-gen bit.
+static bool isLaneInsensitive(SDValue N) {
+  if (!N->isMachineOpcode())
+    return false;
+  unsigned Opc = N->getMachineOpcode();
+
+  switch (Opc) {
+  default:
+    return false;
+  case PPC::VAVGSB:
+  case PPC::VAVGUB:
+  case PPC::VAVGSH:
+  case PPC::VAVGUH:
+  case PPC::VAVGSW:
+  case PPC::VAVGUW:
+  case PPC::VMAXFP:
+  case PPC::VMAXSB:
+  case PPC::VMAXUB:
+  case PPC::VMAXSH:
+  case PPC::VMAXUH:
+  case PPC::VMAXSW:
+  case PPC::VMAXUW:
+  case PPC::VMINFP:
+  case PPC::VMINSB:
+  case PPC::VMINUB:
+  case PPC::VMINSH:
+  case PPC::VMINUH:
+  case PPC::VMINSW:
+  case PPC::VMINUW:
+  case PPC::VADDFP:
+  case PPC::VADDUBM:
+  case PPC::VADDUHM:
+  case PPC::VADDUWM:
+  case PPC::VSUBFP:
+  case PPC::VSUBUBM:
+  case PPC::VSUBUHM:
+  case PPC::VSUBUWM:
+  case PPC::VAND:
+  case PPC::VANDC:
+  case PPC::VOR:
+  case PPC::VORC:
+  case PPC::VXOR:
+  case PPC::VNOR:
+  case PPC::VMULUWM:
+    return true;
+  }
+}
+
+// Try to simplify (xxswap (vec-op (xxswap) (xxswap))) where vec-op is
+// lane-insensitive.
+//
+// \p N is the outer xxswap. Every value between the outer swap and the two
+// inner swaps must have exactly one use; otherwise dropping the swaps would
+// change the value observed by the other users.
+static void reduceVSXSwap(SDNode *N, SelectionDAG *DAG) {
+  // Our desired xxswap might be source of COPY_TO_REGCLASS. Returns an empty
+  // SDValue if any value on the way (including the result) has other users.
+  auto SkipRCCopy = [](SDValue V) {
+    while (V->isMachineOpcode() &&
+           V->getMachineOpcode() == TargetOpcode::COPY_TO_REGCLASS) {
+      if (!V.hasOneUse())
+        return SDValue();
+      V = V->getOperand(0);
+    }
+    return V.hasOneUse() ? V : SDValue();
+  };
+
+  SDValue VecOp = SkipRCCopy(N->getOperand(0));
+  if (!VecOp || !isLaneInsensitive(VecOp))
+    return;
+
+  SDValue LHS = SkipRCCopy(VecOp.getOperand(0)),
+          RHS = SkipRCCopy(VecOp.getOperand(1));
+  if (!LHS || !RHS || !isVSXSwap(LHS) || !isVSXSwap(RHS))
+    return;
+
+  // These swaps may still have chain-uses here, count on dead code elimination
+  // in following passes to remove them.
+  DAG->ReplaceAllUsesOfValueWith(LHS, LHS.getOperand(0));
+  DAG->ReplaceAllUsesOfValueWith(RHS, RHS.getOperand(0));
+  DAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), N->getOperand(0));
+}
+
 void PPCDAGToDAGISel::PeepholePPC64() {
   SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
 
@@ -6726,6 +6834,9 @@
     if (N->use_empty() || !N->isMachineOpcode())
       continue;
 
+    if (isVSXSwap(SDValue(N, 0)))
+      reduceVSXSwap(N, CurDAG);
+
     unsigned FirstOp;
     unsigned StorageOpcode = N->getMachineOpcode();
     bool RequiresMod4Offset = false;
diff --git a/llvm/test/CodeGen/PowerPC/swap-reduction.ll b/llvm/test/CodeGen/PowerPC/swap-reduction.ll
--- a/llvm/test/CodeGen/PowerPC/swap-reduction.ll
+++ b/llvm/test/CodeGen/PowerPC/swap-reduction.ll
@@ -7,14 +7,11 @@
 ; CHECK-NEXT: mr 5, 3
 ; CHECK-NEXT: ld 3, 0(3)
 ; CHECK-NEXT: ld 4, 0(4)
-; CHECK-NEXT: mtfprd 0, 3
+; CHECK-NEXT: mtvsrd 34, 3
 ; CHECK-NEXT: add 3, 3, 4
-; CHECK-NEXT: mtfprd 1, 4
-; CHECK-NEXT: xxswapd 34, 0
-; CHECK-NEXT: xxswapd 35, 1
+; CHECK-NEXT: mtvsrd 35, 4
 ; CHECK-NEXT: vavgsb 2, 2, 3
-; CHECK-NEXT: xxswapd 0, 34
-; CHECK-NEXT: stfdx 0, 0, 5
+; CHECK-NEXT: stxsdx 34, 0, 5
 ; CHECK-NEXT: blr
 entry:
   %lhs = load i64, i64* %a, align 8
@@ -37,14 +34,11 @@
 ; CHECK-NEXT: mr 5, 3
 ; CHECK-NEXT: ld 3, 0(3)
 ; CHECK-NEXT: ld 4, 0(4)
-; CHECK-NEXT: mtfprd 0, 3
+; CHECK-NEXT: mtvsrd 34, 3
 ; CHECK-NEXT: add 3, 3, 4
-; CHECK-NEXT: mtfprd 1, 4
-; CHECK-NEXT: xxswapd 34, 0
-; CHECK-NEXT: xxswapd 35, 1
+; CHECK-NEXT: mtvsrd 35, 4
 ; CHECK-NEXT: vadduhm 2, 2, 3
-; CHECK-NEXT: xxswapd 0, 34
-; CHECK-NEXT: stfdx 0, 0, 5
+; CHECK-NEXT: stxsdx 34, 0, 5
 ; CHECK-NEXT: blr
 entry:
   %lhs = load i64, i64* %a, align 8