diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -25,6 +25,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineOutliner.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/CodeGen/VirtRegMap.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/Support/BranchProbability.h"
@@ -1059,9 +1060,23 @@
   /// faster sequence.
   /// \param Root - Instruction that could be combined with one of its operands
   /// \param Patterns - Vector of possible combination patterns
-  virtual bool getMachineCombinerPatterns(
-      MachineInstr &Root,
-      SmallVectorImpl<MachineCombinerPattern> &Patterns) const;
+  /// \param DoRegPressureReduce - Whether the caller asks for patterns that
+  /// reduce register pressure instead of latency
+  virtual bool
+  getMachineCombinerPatterns(MachineInstr &Root,
+                             SmallVectorImpl<MachineCombinerPattern> &Patterns,
+                             bool DoRegPressureReduce) const;
+
+  /// Return true if the target supports reassociating instructions in the
+  /// machine combiner pass to reduce register pressure for a given BB.
+  virtual bool
+  shouldReduceRegisterPressure(MachineBasicBlock *MBB, LiveIntervals *LIS,
+                               RegisterClassInfo *RegClassInfo) const {
+    return false;
+  }
+
+  /// Fix up any placeholders we may have added in genAlternativeCodeSequence().
+  virtual void
+  finalizeInsInstrs(MachineInstr &Root, MachineCombinerPattern &P,
+                    SmallVectorImpl<MachineInstr *> &InsInstrs) const {}

   /// Return true when a code sequence can improve throughput. It
   /// should be called only for instructions in loops.
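
For a target that opts in, overriding the two new hooks might look like the
sketch below. FooInstrInfo and the block-size heuristic are hypothetical,
invented for illustration; only the hook signatures come from this patch.

    // Illustrative only: run pressure-aware combining where spilling is
    // plausible, e.g. in large basic blocks.
    bool FooInstrInfo::shouldReduceRegisterPressure(
        MachineBasicBlock *MBB, LiveIntervals *LIS,
        RegisterClassInfo *RegClassInfo) const {
      return MBB->size() > 100; // made-up threshold
    }

    // Illustrative only: materialize whatever placeholders this target's
    // genAlternativeCodeSequence() left abstract (e.g. constant pool loads),
    // now that the combiner has committed to InsInstrs.
    void FooInstrInfo::finalizeInsInstrs(
        MachineInstr &Root, MachineCombinerPattern &P,
        SmallVectorImpl<MachineInstr *> &InsInstrs) const {
      // Target-specific fixups of InsInstrs go here.
    }
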
diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp
--- a/llvm/lib/CodeGen/MachineCombiner.cpp
+++ b/llvm/lib/CodeGen/MachineCombiner.cpp
@@ -14,6 +14,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -22,6 +23,7 @@
 #include "llvm/CodeGen/MachineSizeOpts.h"
 #include "llvm/CodeGen/MachineTraceMetrics.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/TargetSchedule.h"
@@ -72,6 +74,8 @@
   MachineTraceMetrics::Ensemble *MinInstr;
   MachineBlockFrequencyInfo *MBFI;
   ProfileSummaryInfo *PSI;
+  RegisterClassInfo RegClassInfo;
+  LiveIntervals *LIS;

   TargetSchedModel TSchedModel;
@@ -103,6 +107,10 @@
                       SmallVectorImpl<MachineInstr *> &DelInstrs,
                       DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                       MachineCombinerPattern Pattern, bool SlackIsAccurate);
+  bool reduceRegisterPressure(MachineInstr &Root, MachineBasicBlock *MBB,
+                              SmallVectorImpl<MachineInstr *> &InsInstrs,
+                              SmallVectorImpl<MachineInstr *> &DelInstrs,
+                              MachineCombinerPattern Pattern);
   bool preservesResourceLen(MachineBasicBlock *MBB,
                             MachineTraceMetrics::Trace BlockTrace,
                             SmallVectorImpl<MachineInstr *> &InsInstrs,
@@ -132,6 +140,7 @@
 void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesCFG();
+  AU.addRequired<LiveIntervals>();
   AU.addPreserved<MachineDominatorTree>();
   AU.addRequired<MachineLoopInfo>();
   AU.addPreserved<MachineLoopInfo>();
@@ -257,8 +266,9 @@
 /// The combiner's goal may differ based on which pattern it is attempting
 /// to optimize.
 enum class CombinerObjective {
-  MustReduceDepth, // The data dependency chain must be improved.
-  Default          // The critical path must not be lengthened.
+  MustReduceDepth,            // The data dependency chain must be improved.
+  MustReduceRegisterPressure, // The register pressure must be reduced.
+  Default                     // The critical path must not be lengthened.
 };

 static CombinerObjective getCombinerObjective(MachineCombinerPattern P) {
@@ -300,6 +310,18 @@
   return {NewRootLatency, RootLatency};
 }

+bool MachineCombiner::reduceRegisterPressure(
+    MachineInstr &Root, MachineBasicBlock *MBB,
+    SmallVectorImpl<MachineInstr *> &InsInstrs,
+    SmallVectorImpl<MachineInstr *> &DelInstrs,
+    MachineCombinerPattern Pattern) {
+  // FIXME: For now we do no checking at all for the register pressure
+  // patterns and treat them as always profitable. We could do better if the
+  // RegPressureTracker class were aware of tied operands; then we could
+  // accurately compare the register pressure of DelInstrs against InsInstrs.
+  return true;
+}
+
 /// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
 /// The new code sequence ends in MI NewRoot. A necessary condition for the new
 /// sequence to replace the old sequence is that it cannot lengthen the critical
@@ -438,6 +460,8 @@
 /// \param DelInstrs instruction to delete from \p MBB
 /// \param MinInstr is a pointer to the machine trace information
 /// \param RegUnits set of live registers, needed to compute instruction depths
+/// \param TII is the target instruction info, used to call a target hook
+/// \param Pattern is used to call the target hook finalizeInsInstrs
 /// \param IncrementalUpdate if true, compute instruction depths incrementally,
 ///        otherwise invalidate the trace
 static void insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI,
@@ -445,7 +469,18 @@
                                      SmallVector<MachineInstr *, 16> DelInstrs,
                                      MachineTraceMetrics::Ensemble *MinInstr,
                                      SparseSet<LiveRegUnit> &RegUnits,
+                                     const TargetInstrInfo *TII,
+                                     MachineCombinerPattern Pattern,
                                      bool IncrementalUpdate) {
+  // If the target wants to fix up placeholders it added in
+  // genAlternativeCodeSequence(), do it now. The fixup must happen here
+  // because genAlternativeCodeSequence() runs before the combiner has decided
+  // between InsInstrs and DelInstrs, so it must not cause side effects in the
+  // function. For example, constant pool entry creation is delayed until this
+  // point, after InsInstrs has been selected as the better sequence;
+  // otherwise a constant pool entry created for InsInstrs would never be
+  // deleted when InsInstrs turns out not to be the better sequence.
+  TII->finalizeInsInstrs(MI, Pattern, InsInstrs);
+
   for (auto *InstrPtr : InsInstrs)
     MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr);
@@ -522,6 +557,9 @@
   bool OptForSize = OptSize || llvm::shouldOptimizeForSize(MBB, PSI, MBFI);

+  bool DoRegPressureReduce =
+      TII->shouldReduceRegisterPressure(MBB, LIS, &RegClassInfo);
+
   while (BlockIter != MBB->end()) {
     auto &MI = *BlockIter++;
     SmallVector<MachineCombinerPattern, 16> Patterns;
@@ -552,7 +590,7 @@
     // machine-combiner-verify-pattern-order is enabled, all patterns are
     // checked to ensure later patterns do not provide better latency savings.

-    if (!TII->getMachineCombinerPatterns(MI, Patterns))
+    if (!TII->getMachineCombinerPatterns(MI, Patterns, DoRegPressureReduce))
       continue;

     if (VerifyPatternOrder)
@@ -588,12 +626,33 @@
       if (ML && TII->isThroughputPattern(P))
         SubstituteAlways = true;

-      if (IncrementalUpdate) {
+      if (IncrementalUpdate && LastUpdate != BlockIter) {
         // Update depths since the last incremental update.
         MinInstr->updateDepths(LastUpdate, BlockIter, RegUnits);
         LastUpdate = BlockIter;
       }

+      if (DoRegPressureReduce &&
+          getCombinerObjective(P) ==
+              CombinerObjective::MustReduceRegisterPressure) {
+        if (MBB->size() > inc_threshold) {
+          // Use incremental depth updates for basic blocks above the
+          // threshold.
+          IncrementalUpdate = true;
+          LastUpdate = BlockIter;
+        }
+        if (reduceRegisterPressure(MI, MBB, InsInstrs, DelInstrs, P)) {
+          // Replace DelInstrs with InsInstrs.
+          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
+                                   RegUnits, TII, P, IncrementalUpdate);
+          Changed |= true;
+
+          // Step back to the previous instruction, as it may expose an ILP
+          // reassociation opportunity.
+          BlockIter--;
+          break;
+        }
+      }
+
       // Substitute when we optimize for codesize and the new sequence has
       // fewer instructions OR
       // the new sequence neither lengthens the critical path nor increases
@@ -601,7 +660,7 @@
       if (SubstituteAlways ||
           doSubstitute(NewInstCount, OldInstCount, OptForSize)) {
         insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
-                                 RegUnits, IncrementalUpdate);
+                                 RegUnits, TII, P, IncrementalUpdate);
         // Eagerly stop after the first pattern fires.
         Changed = true;
         break;
@@ -624,7 +683,7 @@
         }

         insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
-                                 RegUnits, IncrementalUpdate);
+                                 RegUnits, TII, P, IncrementalUpdate);

         // Eagerly stop after the first pattern fires.
         Changed = true;
@@ -660,6 +719,8 @@
       nullptr;
   MinInstr = nullptr;
   OptSize = MF.getFunction().hasOptSize();
+  RegClassInfo.runOnMachineFunction(MF);
+  LIS = &getAnalysis<LiveIntervals>();

   LLVM_DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
   if (!TII->useMachineCombiner()) {
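
Note that this patch maps no pattern to the new objective yet. A target
follow-up that introduces a pressure-reducing pattern would also have to
extend getCombinerObjective(), along the lines of the sketch below;
REASSOC_XY_BCA is a hypothetical pattern name, not one this patch defines.

    // Illustrative only: route a (hypothetical) pressure-reducing pattern
    // to the new combiner objective.
    static CombinerObjective getCombinerObjective(MachineCombinerPattern P) {
      switch (P) {
      case MachineCombinerPattern::REASSOC_AX_BY: // existing cases elided
        return CombinerObjective::MustReduceDepth;
      case MachineCombinerPattern::REASSOC_XY_BCA: // hypothetical
        return CombinerObjective::MustReduceRegisterPressure;
      default:
        return CombinerObjective::Default;
      }
    }
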
diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp
--- a/llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -778,8 +778,8 @@
 // instruction is known to not increase the critical path, then don't match
 // that pattern.
 bool TargetInstrInfo::getMachineCombinerPatterns(
-    MachineInstr &Root,
-    SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
+    MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns,
+    bool DoRegPressureReduce) const {
   bool Commute;
   if (isReassociationCandidate(Root, Commute)) {
     // We found a sequence of instructions that may be suitable for a
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -235,9 +235,10 @@
   /// Return true when there is potentially a faster code sequence
   /// for an instruction chain ending in ``Root``. All potential patterns are
   /// listed in the ``Patterns`` array.
-  bool getMachineCombinerPatterns(
-      MachineInstr &Root,
-      SmallVectorImpl<MachineCombinerPattern> &Patterns) const override;
+  bool
+  getMachineCombinerPatterns(MachineInstr &Root,
+                             SmallVectorImpl<MachineCombinerPattern> &Patterns,
+                             bool DoRegPressureReduce) const override;
   /// Return true when Inst is associative and commutative so that it can be
   /// reassociated.
   bool isAssociativeAndCommutative(const MachineInstr &Inst) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -4508,8 +4508,8 @@
 /// pattern evaluator stops checking as soon as it finds a faster sequence.
 bool AArch64InstrInfo::getMachineCombinerPatterns(
-    MachineInstr &Root,
-    SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
+    MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns,
+    bool DoRegPressureReduce) const {
   // Integer patterns
   if (getMaddPatterns(Root, Patterns))
     return true;
@@ -4517,7 +4517,8 @@
   if (getFMAPatterns(Root, Patterns))
     return true;

-  return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
+  return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
+                                                     DoRegPressureReduce);
 }

 enum class FMAInstKind { Default, Indexed, Accumulator };
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
@@ -348,9 +348,10 @@
   /// Return true when there is potentially a faster code sequence
   /// for an instruction chain ending in <Root>. All potential patterns are
   /// output in the <P> array.
-  bool getMachineCombinerPatterns(
-      MachineInstr &Root,
-      SmallVectorImpl<MachineCombinerPattern> &P) const override;
+  bool
+  getMachineCombinerPatterns(MachineInstr &Root,
+                             SmallVectorImpl<MachineCombinerPattern> &P,
+                             bool DoRegPressureReduce) const override;

   bool isAssociativeAndCommutative(const MachineInstr &Inst) const override;
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -421,8 +421,8 @@
 }

 bool PPCInstrInfo::getMachineCombinerPatterns(
-    MachineInstr &Root,
-    SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
+    MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns,
+    bool DoRegPressureReduce) const {
   // Using the machine combiner in this way is potentially expensive, so
   // restrict to when aggressive optimizations are desired.
   if (Subtarget.getTargetMachine().getOptLevel() != CodeGenOpt::Aggressive)
@@ -431,7 +431,8 @@
   if (getFMAPatterns(Root, Patterns))
     return true;

-  return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
+  return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
+                                                     DoRegPressureReduce);
 }

 void PPCInstrInfo::genAlternativeCodeSequence(
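
Both backend overrides currently just thread DoRegPressureReduce through to
the base implementation. A target that actually consumes the flag would gate
its pressure-reducing patterns on it, roughly as in this sketch;
getFMARegPressurePatterns and the pattern name are hypothetical helpers for
illustration, not part of this patch.

    // Illustrative only: propose pressure-reducing reassociations only when
    // the combiner is running in register-pressure mode for this block.
    bool PPCInstrInfo::getFMARegPressurePatterns(
        MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns,
        bool DoRegPressureReduce) const {
      if (!DoRegPressureReduce)
        return false;
      // Push a pattern that getCombinerObjective() maps to
      // CombinerObjective::MustReduceRegisterPressure.
      Patterns.push_back(MachineCombinerPattern::REASSOC_XY_BCA); // hypothetical
      return true;
    }
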
diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
--- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -113,6 +113,9 @@
 ; CHECK-NEXT:       Machine Natural Loop Construction
 ; CHECK-NEXT:       Machine Trace Metrics
 ; CHECK-NEXT:       AArch64 Conditional Compares
+; CHECK-NEXT:       Slot index numbering
+; CHECK-NEXT:       Live Interval Analysis
+; CHECK-NEXT:       Machine Trace Metrics
 ; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       Machine InstCombiner
 ; CHECK-NEXT:       AArch64 Conditional Branch Tuning
diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir b/llvm/test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir
--- a/llvm/test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir
+++ b/llvm/test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir
@@ -24,7 +24,7 @@
 ...
 # UNPROFITABLE-LABEL: name: f1_2s
 # UNPROFITABLE: %3:fpr64 = FMULv2f32 %0, %1
-# UNPROFITABLE-NEXT: FSUBv2f32 killed %3, %2
+# UNPROFITABLE-NEXT: FSUBv2f32 %3, %2
 #
 # PROFITABLE-LABEL: name: f1_2s
 # PROFITABLE: [[R1:%[0-9]+]]:fpr64 = FNEGv2f32 %2
@@ -50,7 +50,7 @@
 ...
 # UNPROFITABLE-LABEL: name: f1_4s
 # UNPROFITABLE: %3:fpr128 = FMULv4f32 %0, %1
-# UNPROFITABLE-NEXT: FSUBv4f32 killed %3, %2
+# UNPROFITABLE-NEXT: FSUBv4f32 %3, %2
 #
 # PROFITABLE-LABEL: name: f1_4s
 # PROFITABLE: [[R1:%[0-9]+]]:fpr128 = FNEGv4f32 %2
@@ -76,7 +76,7 @@
 ...
 # UNPROFITABLE-LABEL: name: f1_2d
 # UNPROFITABLE: %3:fpr128 = FMULv2f64 %0, %1
-# UNPROFITABLE-NEXT: FSUBv2f64 killed %3, %2
+# UNPROFITABLE-NEXT: FSUBv2f64 %3, %2
 #
 # PROFITABLE-LABEL: name: f1_2d
 # PROFITABLE: [[R1:%[0-9]+]]:fpr128 = FNEGv2f64 %2
@@ -106,7 +106,7 @@
 ...
 # ALL-LABEL: name: f1_both_fmul_2s
 # ALL: %4:fpr64 = FMULv2f32 %0, %1
-# ALL-NEXT: FMLSv2f32 killed %4, %2, %3
+# ALL-NEXT: FMLSv2f32 %4, %2, %3
 ---
 name: f1_both_fmul_4s
 registers:
@@ -132,7 +132,7 @@
 ...
 # ALL-LABEL: name: f1_both_fmul_4s
 # ALL: %4:fpr128 = FMULv4f32 %0, %1
-# ALL-NEXT: FMLSv4f32 killed %4, %2, %3
+# ALL-NEXT: FMLSv4f32 %4, %2, %3
 ---
 name: f1_both_fmul_2d
 registers:
@@ -158,5 +158,5 @@
 ...
 # ALL-LABEL: name: f1_both_fmul_2d
 # ALL: %4:fpr128 = FMULv2f64 %0, %1
-# ALL-NEXT: FMLSv2f64 killed %4, %2, %3
+# ALL-NEXT: FMLSv2f64 %4, %2, %3
diff --git a/llvm/test/CodeGen/AArch64/machine-combiner-instr-fmf.mir b/llvm/test/CodeGen/AArch64/machine-combiner-instr-fmf.mir
--- a/llvm/test/CodeGen/AArch64/machine-combiner-instr-fmf.mir
+++ b/llvm/test/CodeGen/AArch64/machine-combiner-instr-fmf.mir
@@ -86,7 +86,7 @@
 # CHECK-NEXT: [[B:%.*]]:fpr32 = COPY $s1
 # CHECK-NEXT: [[A:%.*]]:fpr32 = COPY $s0
 # CHECK-NEXT: [[MUL:%.*]]:fpr32 = contract FMULSrr [[B]], [[A]]
-# CHECK-NEXT: fpr32 = FADDSrr killed [[MUL]], [[C]]
+# CHECK-NEXT: fpr32 = FADDSrr [[MUL]], [[C]]
 ---
 name: scalar_fmadd_contract_op0
 alignment: 4
@@ -166,7 +166,7 @@
 # CHECK-NEXT: [[B:%.*]]:fpr32 = COPY $s1
 # CHECK-NEXT: [[A:%.*]]:fpr32 = COPY $s0
 # CHECK-NEXT: [[MUL:%.*]]:fpr32 = nsz FMULSrr [[B]], [[A]]
-# CHECK-NEXT: fpr32 = nsz FADDSrr killed [[MUL]], [[C]]
+# CHECK-NEXT: fpr32 = nsz FADDSrr [[MUL]], [[C]]
 ---
 name: scalar_fmadd_nsz
@@ -285,7 +285,7 @@
 # CHECK-NEXT: [[B:%.*]]:fpr128 = COPY $q1
 # CHECK-NEXT: [[A:%.*]]:fpr128 = COPY $q0
 # CHECK-NEXT: [[MUL:%.*]]:fpr128 = contract FMULv2f64 [[B]], [[A]]
-# CHECK-NEXT: fpr128 = FADDv2f64 killed [[MUL]], [[C]]
+# CHECK-NEXT: fpr128 = FADDv2f64 [[MUL]], [[C]]
 ---
 name: vector_fmadd_contract_op0
 alignment: 4
@@ -365,7 +365,7 @@
 # CHECK-NEXT: [[B:%.*]]:fpr128 = COPY $q1
 # CHECK-NEXT: [[A:%.*]]:fpr128 = COPY $q0
 # CHECK-NEXT: [[MUL:%.*]]:fpr128 = nsz FMULv2f64 [[B]], [[A]]
-# CHECK-NEXT: fpr128 = nsz FADDv2f64 killed [[MUL]], [[C]]
+# CHECK-NEXT: fpr128 = nsz FADDv2f64 [[MUL]], [[C]]
 ---
 name: vector_fmadd_nsz
 alignment: 4
diff --git a/llvm/test/CodeGen/PowerPC/opt-cmp-inst-cr0-live.ll b/llvm/test/CodeGen/PowerPC/opt-cmp-inst-cr0-live.ll
--- a/llvm/test/CodeGen/PowerPC/opt-cmp-inst-cr0-live.ll
+++ b/llvm/test/CodeGen/PowerPC/opt-cmp-inst-cr0-live.ll
@@ -7,7 +7,7 @@
   %2 = zext i32 %1 to i64
   %3 = shl i64 %2, 48
   %4 = ashr exact i64 %3, 48
-; CHECK: RLWINM8 killed {{[^,]+}}, 0, 16, 27
+; CHECK: RLWINM8 {{[^,]+}}, 0, 16, 27
 ; CHECK: CMPLDI
 ; CHECK: BCC
@@ -28,7 +28,7 @@
 define signext i32 @fn2(i64 %a, i64 %b) {
 ; CHECK: OR8_rec {{[^, ]+}}, {{[^, ]+}}, implicit-def $cr0
 ; CHECK: [[CREG:[^, ]+]]:crrc = COPY killed $cr0
-; CHECK: BCC 12, killed [[CREG]]
+; CHECK: BCC 12, [[CREG]]
   %1 = or i64 %b, %a
   %2 = icmp sgt i64 %1, -1
   br i1 %2, label %foo, label %bar
@@ -42,9 +42,9 @@

 ; CHECK-LABEL: fn3
 define signext i32 @fn3(i32 %a) {
-; CHECK: ANDI_rec killed {{[%0-9]+}}{{[^,]*}}, 10, implicit-def $cr0
+; CHECK: ANDI_rec {{[%0-9]+}}{{[^,]*}}, 10, implicit-def $cr0
 ; CHECK: [[CREG:[^, ]+]]:crrc = COPY $cr0
-; CHECK: BCC 76, killed [[CREG]]
+; CHECK: BCC 76, [[CREG]]
   %1 = and i32 %a, 10
   %2 = icmp ne i32 %1, 0
   br i1 %2, label %foo, label %bar
diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -94,6 +94,9 @@
 ; CHECK-NEXT:       Machine Natural Loop Construction
 ; CHECK-NEXT:       Machine Trace Metrics
 ; CHECK-NEXT:       Early If-Conversion
+; CHECK-NEXT:       Slot index numbering
+; CHECK-NEXT:       Live Interval Analysis
+; CHECK-NEXT:       Machine Trace Metrics
 ; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       Machine InstCombiner
 ; CHECK-NEXT:       X86 cmov Conversion
diff --git a/llvm/test/DebugInfo/X86/machinecse-wrongdebug-hoist.ll b/llvm/test/DebugInfo/X86/machinecse-wrongdebug-hoist.ll
--- a/llvm/test/DebugInfo/X86/machinecse-wrongdebug-hoist.ll
+++ b/llvm/test/DebugInfo/X86/machinecse-wrongdebug-hoist.ll
@@ -1,6 +1,6 @@
 ; RUN: llc %s -o - -print-after=machine-cse -mtriple=x86_64-- 2>&1 | FileCheck %s --match-full-lines

-; CHECK: %5:gr32 = SUB32ri8 %0:gr32(tied-def 0), 1, implicit-def $eflags, debug-location !24; a.c:3:13
+; CHECK: dead %5:gr32 = SUB32ri8 %0:gr32(tied-def 0), 1, implicit-def $eflags, debug-location !24; a.c:3:13
 ; CHECK-NEXT: %10:gr32 = MOVSX32rr8 %4:gr8
 ; CHECK-NEXT: JCC_1 %bb.2, 15, implicit $eflags, debug-location !25; a.c:3:18