Index: llvm/lib/Target/AArch64/AArch64MacroFusion.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64MacroFusion.cpp +++ llvm/lib/Target/AArch64/AArch64MacroFusion.cpp @@ -29,26 +29,30 @@ namespace { -/// \brief Verify that the instruction pair, First and Second, -/// should be scheduled back to back. Given an anchor instruction, if the other -/// instruction is unspecified, then verify that the anchor instruction may be -/// part of a pair at all. -static bool shouldScheduleAdjacent(const AArch64InstrInfo &TII, - const AArch64Subtarget &ST, - const MachineInstr *First, - const MachineInstr *Second) { - assert((First || Second) && "At least one instr must be specified"); - unsigned FirstOpcode = - First ? First->getOpcode() - : static_cast(AArch64::INSTRUCTION_LIST_END); - unsigned SecondOpcode = - Second ? Second->getOpcode() - : static_cast(AArch64::INSTRUCTION_LIST_END); +/// \brief Verify that the instr pair, LeftMI and RightMI, should be fused +/// together. Given an anchor instr, if the other instr is unspecified, then +/// check if the anchor instr may be part of a fused pair at all. +static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, + const TargetSubtargetInfo &TSI, + const MachineInstr *LeftMI, + const MachineInstr *RightMI) { + assert((LeftMI || RightMI) && "At least one instr must be specified"); + + const AArch64InstrInfo &II = static_cast(TII); + const AArch64Subtarget &ST = static_cast(TSI); + + // Assume wildcards for unspecified instrs. + unsigned LeftOpcode = + LeftMI ? LeftMI->getOpcode() + : static_cast(AArch64::INSTRUCTION_LIST_END); + unsigned RightOpcode = + RightMI ? RightMI->getOpcode() + : static_cast(AArch64::INSTRUCTION_LIST_END); if (ST.hasArithmeticBccFusion()) // Fuse CMN, CMP, TST followed by Bcc. 
- if (SecondOpcode == AArch64::Bcc) - switch (FirstOpcode) { + if (RightOpcode == AArch64::Bcc) + switch (LeftOpcode) { default: return false; case AArch64::ADDSWri: @@ -75,16 +79,16 @@ case AArch64::BICSWrs: case AArch64::BICSXrs: // Shift value can be 0 making these behave like the "rr" variant... - return !TII.hasShiftedReg(*First); + return !II.hasShiftedReg(*LeftMI); case AArch64::INSTRUCTION_LIST_END: return true; } if (ST.hasArithmeticCbzFusion()) // Fuse ALU operations followed by CBZ/CBNZ. - if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX || - SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) - switch (FirstOpcode) { + if (RightOpcode == AArch64::CBNZW || RightOpcode == AArch64::CBNZX || + RightOpcode == AArch64::CBZW || RightOpcode == AArch64::CBZX) + switch (LeftOpcode) { default: return false; case AArch64::ADDWri: @@ -117,120 +121,127 @@ case AArch64::BICWrs: case AArch64::BICXrs: // Shift value can be 0 making these behave like the "rr" variant... - return !TII.hasShiftedReg(*First); + return !II.hasShiftedReg(*LeftMI); case AArch64::INSTRUCTION_LIST_END: return true; } if (ST.hasFuseAES()) // Fuse AES crypto operations. - switch(FirstOpcode) { + switch(LeftOpcode) { // AES encode. case AArch64::AESErr: - return SecondOpcode == AArch64::AESMCrr || - SecondOpcode == AArch64::INSTRUCTION_LIST_END; + return RightOpcode == AArch64::AESMCrr || + RightOpcode == AArch64::INSTRUCTION_LIST_END; // AES decode. case AArch64::AESDrr: - return SecondOpcode == AArch64::AESIMCrr || - SecondOpcode == AArch64::INSTRUCTION_LIST_END; + return RightOpcode == AArch64::AESIMCrr || + RightOpcode == AArch64::INSTRUCTION_LIST_END; } if (ST.hasFuseLiterals()) // Fuse literal generation operations. - switch (FirstOpcode) { + switch (LeftOpcode) { // PC relative address. 
case AArch64::ADRP: - return SecondOpcode == AArch64::ADDXri || - SecondOpcode == AArch64::INSTRUCTION_LIST_END; + return RightOpcode == AArch64::ADDXri || + RightOpcode == AArch64::INSTRUCTION_LIST_END; // 32 bit immediate. case AArch64::MOVZWi: - return (SecondOpcode == AArch64::MOVKWi && - Second->getOperand(3).getImm() == 16) || - SecondOpcode == AArch64::INSTRUCTION_LIST_END; + return (RightOpcode == AArch64::MOVKWi && + RightMI->getOperand(3).getImm() == 16) || + RightOpcode == AArch64::INSTRUCTION_LIST_END; // Lower half of 64 bit immediate. case AArch64::MOVZXi: - return (SecondOpcode == AArch64::MOVKXi && - Second->getOperand(3).getImm() == 16) || - SecondOpcode == AArch64::INSTRUCTION_LIST_END; + return (RightOpcode == AArch64::MOVKXi && + RightMI->getOperand(3).getImm() == 16) || + RightOpcode == AArch64::INSTRUCTION_LIST_END; // Upper half of 64 bit immediate. case AArch64::MOVKXi: - return First->getOperand(3).getImm() == 32 && - ((SecondOpcode == AArch64::MOVKXi && - Second->getOperand(3).getImm() == 48) || - SecondOpcode == AArch64::INSTRUCTION_LIST_END); + return LeftMI->getOperand(3).getImm() == 32 && + ((RightOpcode == AArch64::MOVKXi && + RightMI->getOperand(3).getImm() == 48) || + RightOpcode == AArch64::INSTRUCTION_LIST_END); } return false; } -/// \brief Implement the fusion of instruction pairs in the scheduling -/// DAG, anchored at the instruction in ASU. Preds -/// indicates if its dependencies in \param APreds are predecessors instead of -/// successors. 
-static bool scheduleAdjacentImpl(ScheduleDAGMI *DAG, SUnit *ASU, - SmallVectorImpl &APreds, bool Preds) { - const AArch64InstrInfo *TII = static_cast(DAG->TII); - const AArch64Subtarget &ST = DAG->MF.getSubtarget(); - - const MachineInstr *AMI = ASU->getInstr(); - if (!AMI || AMI->isPseudo() || AMI->isTransient() || - (Preds && !shouldScheduleAdjacent(*TII, ST, nullptr, AMI)) || - (!Preds && !shouldScheduleAdjacent(*TII, ST, AMI, nullptr))) +/// \brief Implement the fusion of instr pairs in the scheduling DAG, +/// anchored at the instr in AnchorSU. If AnchorSU is the ExitSU, its +/// predecessors are considered for fusion; otherwise, its successors are. +static bool scheduleAdjacentImpl(ScheduleDAGMI *DAG, SUnit &AnchorSU) { + const MachineInstr *AnchorMI = AnchorSU.getInstr(); + if (!AnchorMI || AnchorMI->isPseudo() || AnchorMI->isTransient()) return false; - for (SDep &BDep : APreds) { - if (BDep.isWeak()) + // If the anchor instr is the ExitSU, then consider its predecessors; + // otherwise, its successors. + bool Preds = (&AnchorSU == &DAG->ExitSU); + SmallVectorImpl &AnchorDeps = Preds ? AnchorSU.Preds : AnchorSU.Succs; + + const MachineInstr *LeftMI = Preds ? nullptr : AnchorMI; + const MachineInstr *RightMI = Preds ? AnchorMI : nullptr; + + // Check if the anchor instr may be fused. + if (!shouldScheduleAdjacent(*DAG->TII, DAG->MF.getSubtarget(), + LeftMI, RightMI)) + return false; + + // Explore for fusion candidates among the dependencies of the anchor instr. + for (SDep &Dep : AnchorDeps) { + // Ignore dependencies that don't enforce ordering. + if (Dep.isWeak()) continue; - SUnit *BSU = BDep.getSUnit(); - const MachineInstr *BMI = BSU->getInstr(); - if (!BMI || BMI->isPseudo() || BMI->isTransient() || - (Preds && !shouldScheduleAdjacent(*TII, ST, BMI, AMI)) || - (!Preds && !shouldScheduleAdjacent(*TII, ST, AMI, BMI))) + SUnit &DepSU = *Dep.getSUnit(); + // Ignore the ExitSU if the dependents are successors. 
+ if (!Preds && &DepSU == &DAG->ExitSU) continue; - // Create a single weak edge between the adjacent instrs. The only - // effect is to cause bottom-up scheduling to heavily prioritize the - // clustered instrs. - if (Preds) - DAG->addEdge(ASU, SDep(BSU, SDep::Cluster)); - else - DAG->addEdge(BSU, SDep(ASU, SDep::Cluster)); - - // Adjust the latency between the 1st instr and its predecessors/successors. - for (SDep &Dep : APreds) - if (Dep.getSUnit() == BSU) - Dep.setLatency(0); - - // Adjust the latency between the 2nd instr and its successors/predecessors. - auto &BSuccs = Preds ? BSU->Succs : BSU->Preds; - for (SDep &Dep : BSuccs) - if (Dep.getSUnit() == ASU) - Dep.setLatency(0); + const MachineInstr *DepMI = DepSU.getInstr(); + if (!DepMI || DepMI->isPseudo() || DepMI->isTransient()) + continue; - ++NumFused; - DEBUG({ SUnit *LSU = Preds ? BSU : ASU; - SUnit *RSU = Preds ? ASU : BSU; - const MachineInstr *LMI = Preds ? BMI : AMI; - const MachineInstr *RMI = Preds ? AMI : BMI; - - dbgs() << DAG->MF.getName() << "(): Macro fuse "; - LSU->print(dbgs(), DAG); - dbgs() << " - "; - RSU->print(dbgs(), DAG); - dbgs() << " / " << - TII->getName(LMI->getOpcode()) << " - " << - TII->getName(RMI->getOpcode()) << '\n'; - }); + LeftMI = Preds ? DepMI : AnchorMI; + RightMI = Preds ? AnchorMI : DepMI; + if (!shouldScheduleAdjacent(*DAG->TII, DAG->MF.getSubtarget(), + LeftMI, RightMI)) + continue; + // Create a single weak edge between the adjacent instrs. The only effect is + // to cause bottom-up scheduling to heavily prioritize the clustered instrs. + SUnit &LeftSU = Preds ? DepSU : AnchorSU; + SUnit &RightSU = Preds ? AnchorSU : DepSU; + DAG->addEdge(&RightSU, SDep(&LeftSU, SDep::Cluster)); + + // Adjust the latency between the anchor instr and its + // predecessors/successors. + for (SDep &IDep : AnchorDeps) + if (IDep.getSUnit() == &DepSU) + IDep.setLatency(0); + + // Adjust the latency between the dependent instr and its + // successors/predecessors. 
+ for (SDep &IDep : Preds ? DepSU.Succs : DepSU.Preds) + if (IDep.getSUnit() == &AnchorSU) + IDep.setLatency(0); + + DEBUG(dbgs() << DAG->MF.getName() << "(): Macro fuse "; + LeftSU.print(dbgs(), DAG); dbgs() << " - "; + RightSU.print(dbgs(), DAG); dbgs() << " / "; + dbgs() << DAG->TII->getName(LeftMI->getOpcode()) << " - " << + DAG->TII->getName(RightMI->getOpcode()) << '\n'; ); + + ++NumFused; return true; } return false; } -/// \brief Post-process the DAG to create cluster edges between instructions -/// that may be fused by the processor into a single operation. +/// \brief Post-process the DAG to create cluster edges between instrs that may +/// be fused by the processor into a single operation. class AArch64MacroFusion : public ScheduleDAGMutation { public: AArch64MacroFusion() {} @@ -241,13 +252,13 @@ void AArch64MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) { ScheduleDAGMI *DAG = static_cast(DAGInstrs); - // For each of the SUnits in the scheduling block, try to fuse the instruction - // in it with one in its successors. - for (SUnit &ASU : DAG->SUnits) - scheduleAdjacentImpl(DAG, &ASU, ASU.Succs, false); + // For each of the SUnits in the scheduling block, try to fuse the instr in it + // with one in its successors. + for (SUnit &ISU : DAG->SUnits) + scheduleAdjacentImpl(DAG, ISU); - // Try to fuse the instruction in the ExitSU with one in its predecessors. - scheduleAdjacentImpl(DAG, &DAG->ExitSU, DAG->ExitSU.Preds, true); + // Try to fuse the instr in the ExitSU with one in its predecessors. + scheduleAdjacentImpl(DAG, DAG->ExitSU); } } // end namespace