diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -6717,6 +6717,102 @@
   CurDAG->RemoveDeadNodes();
 }
 
+static bool isVSXSwap(SDValue N) {
+  if (!N->isMachineOpcode())
+    return false;
+  unsigned Opc = N->getMachineOpcode();
+
+  // Single-operand XXPERMDI or the regular XXPERMDI/XXSLDWI where the immediate
+  // operand is 2.
+  if (Opc == PPC::XXPERMDIs) {
+    return isa<ConstantSDNode>(N->getOperand(1)) &&
+           N->getConstantOperandVal(1) == 2;
+  } else if (Opc == PPC::XXPERMDI || Opc == PPC::XXSLDWI) {
+    return N->getOperand(0) == N->getOperand(1) &&
+           isa<ConstantSDNode>(N->getOperand(2)) &&
+           N->getConstantOperandVal(2) == 2;
+  }
+
+  return false;
+}
+
+// TODO: Make this complete and replace with a table-gen bit.
+static bool isLaneInsensitive(SDValue N) {
+  if (!N->isMachineOpcode())
+    return false;
+  unsigned Opc = N->getMachineOpcode();
+
+  switch (Opc) {
+  default:
+    return false;
+  case PPC::VAVGSB:
+  case PPC::VAVGUB:
+  case PPC::VAVGSH:
+  case PPC::VAVGUH:
+  case PPC::VAVGSW:
+  case PPC::VAVGUW:
+  case PPC::VMAXFP:
+  case PPC::VMAXSB:
+  case PPC::VMAXUB:
+  case PPC::VMAXSH:
+  case PPC::VMAXUH:
+  case PPC::VMAXSW:
+  case PPC::VMAXUW:
+  case PPC::VMINFP:
+  case PPC::VMINSB:
+  case PPC::VMINUB:
+  case PPC::VMINSH:
+  case PPC::VMINUH:
+  case PPC::VMINSW:
+  case PPC::VMINUW:
+  case PPC::VADDFP:
+  case PPC::VADDUBM:
+  case PPC::VADDUHM:
+  case PPC::VADDUWM:
+  case PPC::VSUBFP:
+  case PPC::VSUBUBM:
+  case PPC::VSUBUHM:
+  case PPC::VSUBUWM:
+  case PPC::VAND:
+  case PPC::VANDC:
+  case PPC::VOR:
+  case PPC::VORC:
+  case PPC::VXOR:
+  case PPC::VNOR:
+  case PPC::VMULUWM:
+    return true;
+  }
+}
+
+// Try to simplify (xxswap (vec-op (xxswap) (xxswap))) where vec-op is
+// lane-insensitive.
+static void reduceVSXSwap(SDNode *N, SelectionDAG *DAG) {
+  // Our desired xxswap might be the source of a COPY_TO_REGCLASS.
+  // TODO: Can we make this a common method for DAG?
+  auto SkipRCCopy = [](SDValue V) {
+    while (V->isMachineOpcode() &&
+           V->getMachineOpcode() == TargetOpcode::COPY_TO_REGCLASS)
+      V = V->getOperand(0);
+    return V;
+  };
+
+  SDValue VecOp = SkipRCCopy(N->getOperand(0));
+  if (!isLaneInsensitive(VecOp))
+    return;
+
+  SDValue LHS = SkipRCCopy(VecOp.getOperand(0)),
+          RHS = SkipRCCopy(VecOp.getOperand(1));
+  if (!LHS.hasOneUse() || !RHS.hasOneUse() || !isVSXSwap(LHS) ||
+      !isVSXSwap(RHS))
+    return;
+
+  // These swaps may still have chain uses here; rely on dead code elimination
+  // in following passes to remove them.
+  DAG->ReplaceAllUsesOfValueWith(LHS, LHS.getOperand(0));
+  DAG->ReplaceAllUsesOfValueWith(RHS, RHS.getOperand(0));
+  DAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), N->getOperand(0));
+}
+
 void PPCDAGToDAGISel::PeepholePPC64() {
   SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
 
@@ -6726,6 +6822,9 @@
     if (N->use_empty() || !N->isMachineOpcode())
       continue;
 
+    if (isVSXSwap(SDValue(N, 0)))
+      reduceVSXSwap(N, CurDAG);
+
     unsigned FirstOp;
     unsigned StorageOpcode = N->getMachineOpcode();
     bool RequiresMod4Offset = false;
diff --git a/llvm/test/CodeGen/PowerPC/swap-reduction.ll b/llvm/test/CodeGen/PowerPC/swap-reduction.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/swap-reduction.ll
@@ -0,0 +1,58 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le < %s | FileCheck %s
+
+define i64 @test1(i64* %a, i64* %b) {
+; CHECK-LABEL: test1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    mr 5, 3
+; CHECK-NEXT:    ld 3, 0(3)
+; CHECK-NEXT:    ld 4, 0(4)
+; CHECK-NEXT:    mtvsrd 34, 3
+; CHECK-NEXT:    add 3, 3, 4
+; CHECK-NEXT:    mtvsrd 35, 4
+; CHECK-NEXT:    vavgsb 2, 2, 3
+; CHECK-NEXT:    stxsdx 34, 0, 5
+; CHECK-NEXT:    blr
+entry:
+  %lhs = load i64, i64* %a, align 8
+  %rhs = load i64, i64* %b, align 8
+  %sum = add i64 %lhs, %rhs
+  %lv = insertelement <2 x i64> undef, i64 %lhs, i32 0
+  %rv = insertelement <2 x i64> undef, i64 %rhs, i32 0
+  %lhc = bitcast <2 x i64> %lv to <16 x i8>
+  %rhc = bitcast <2 x i64> %rv to <16 x i8>
+  %add = call <16 x i8> @llvm.ppc.altivec.vavgsb(<16 x i8> %lhc, <16 x i8> %rhc)
+  %cb = bitcast <16 x i8> %add to <2 x i64>
+  %fv = extractelement <2 x i64> %cb, i32 0
+  store i64 %fv, i64* %a, align 8
+  ret i64 %sum
+}
+
+define i64 @test2(i64* %a, i64* %b) {
+; CHECK-LABEL: test2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    mr 5, 3
+; CHECK-NEXT:    ld 3, 0(3)
+; CHECK-NEXT:    ld 4, 0(4)
+; CHECK-NEXT:    mtvsrd 34, 3
+; CHECK-NEXT:    add 3, 3, 4
+; CHECK-NEXT:    mtvsrd 35, 4
+; CHECK-NEXT:    vadduhm 2, 2, 3
+; CHECK-NEXT:    stxsdx 34, 0, 5
+; CHECK-NEXT:    blr
+entry:
+  %lhs = load i64, i64* %a, align 8
+  %rhs = load i64, i64* %b, align 8
+  %sum = add i64 %lhs, %rhs
+  %lv = insertelement <2 x i64> undef, i64 %lhs, i32 0
+  %rv = insertelement <2 x i64> undef, i64 %rhs, i32 0
+  %lhc = bitcast <2 x i64> %lv to <8 x i16>
+  %rhc = bitcast <2 x i64> %rv to <8 x i16>
+  %add = add <8 x i16> %lhc, %rhc
+  %cb = bitcast <8 x i16> %add to <2 x i64>
+  %fv = extractelement <2 x i64> %cb, i32 0
+  store i64 %fv, i64* %a, align 8
+  ret i64 %sum
+}
+
+declare <16 x i8> @llvm.ppc.altivec.vavgsb(<16 x i8>, <16 x i8>)