Index: llvm/trunk/lib/CodeGen/RegisterCoalescer.cpp =================================================================== --- llvm/trunk/lib/CodeGen/RegisterCoalescer.cpp +++ llvm/trunk/lib/CodeGen/RegisterCoalescer.cpp @@ -16,6 +16,7 @@ #include "RegisterCoalescer.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" @@ -69,6 +70,7 @@ STATISTIC(NumInflated , "Number of register classes inflated"); STATISTIC(NumLaneConflicts, "Number of dead lane conflicts tested"); STATISTIC(NumLaneResolves, "Number of dead lane conflicts resolved"); +STATISTIC(NumShrinkToUses, "Number of shrinkToUses called"); static cl::opt EnableJoining("join-liveintervals", cl::desc("Coalesce copies (default=true)"), @@ -94,6 +96,15 @@ cl::desc("Verify machine instrs before and after register coalescing"), cl::Hidden); +static cl::opt LateRematUpdateThreshold( + "late-remat-update-threshold", cl::Hidden, + cl::desc("During rematerialization for a copy, if the def instruction has " + "many other copy uses to be rematerialized, delay the multiple " + "separate live interval update work and do them all at once after " + "all those rematerialization are done. It will save a lot of " + "repeated work. "), + cl::init(100)); + namespace { class RegisterCoalescer : public MachineFunctionPass, @@ -137,6 +148,11 @@ /// Virtual registers to be considered for register class inflation. SmallVector InflateRegs; + /// The collection of live intervals which should have been updated + /// immediately after rematerialization but delayed until + /// lateLiveIntervalUpdate is called. + DenseSet ToBeUpdated; + /// Recursively eliminate dead defs in DeadDefs. void eliminateDeadDefs(); @@ -157,6 +173,13 @@ /// was made. 
bool copyCoalesceWorkList(MutableArrayRef CurrList); + /// If one def has many copy like uses, and those copy uses are all + /// rematerialized, the live interval update needed for those + /// rematerializations will be delayed and done all at once instead + /// of being done multiple times. This is to save compile cost because + /// live interval update is costly. + void lateLiveIntervalUpdate(); + /// Attempt to join intervals corresponding to SrcReg/DstReg, which are the /// src/dst of the copy instruction CopyMI. This returns true if the copy /// was successfully coalesced away. If it is not currently possible to @@ -258,6 +281,7 @@ /// mentioned method returns true. void shrinkToUses(LiveInterval *LI, SmallVectorImpl *Dead = nullptr) { + NumShrinkToUses++; if (LIS->shrinkToUses(LI, Dead)) { /// Check whether or not \p LI is composed by multiple connected /// components and if that is the case, fix that. @@ -1365,11 +1389,9 @@ LLVM_DEBUG(dbgs() << "Remat: " << NewMI); ++NumReMats; - // The source interval can become smaller because we removed a use. - shrinkToUses(&SrcInt, &DeadDefs); - if (!DeadDefs.empty()) { - // If the virtual SrcReg is completely eliminated, update all DBG_VALUEs - // to describe DstReg instead. + // If the virtual SrcReg is completely eliminated, update all DBG_VALUEs + // to describe DstReg instead. + if (MRI->use_nodbg_empty(SrcReg)) { for (MachineOperand &UseMO : MRI->use_operands(SrcReg)) { MachineInstr *UseMI = UseMO.getParent(); if (UseMI->isDebugValue()) { @@ -1380,9 +1402,24 @@ LLVM_DEBUG(dbgs() << "\t\tupdated: " << *UseMI); } } - eliminateDeadDefs(); } + if (ToBeUpdated.count(SrcReg)) + return true; + + long NumCopyUses = 0; + for (MachineOperand &UseMO : MRI->use_nodbg_operands(SrcReg)) { + if (UseMO.getParent()->isCopyLike()) + NumCopyUses++; + } + if (NumCopyUses < LateRematUpdateThreshold) { + // The source interval can become smaller because we removed a use. 
+ shrinkToUses(&SrcInt, &DeadDefs); + if (!DeadDefs.empty()) + eliminateDeadDefs(); + } else { + ToBeUpdated.insert(SrcReg); + } return true; } @@ -3290,6 +3327,18 @@ || LIS->intervalIsInOneMBB(LIS->getInterval(DstReg)); } +void RegisterCoalescer::lateLiveIntervalUpdate() { + for (unsigned reg : ToBeUpdated) { + if (!LIS->hasInterval(reg)) + continue; + LiveInterval &LI = LIS->getInterval(reg); + shrinkToUses(&LI, &DeadDefs); + if (!DeadDefs.empty()) + eliminateDeadDefs(); + } + ToBeUpdated.clear(); +} + bool RegisterCoalescer:: copyCoalesceWorkList(MutableArrayRef CurrList) { bool Progress = false; @@ -3459,12 +3508,14 @@ } copyCoalesceInMBB(MBBs[i].MBB); } + lateLiveIntervalUpdate(); coalesceLocals(); // Joining intervals can allow other intervals to be joined. Iteratively join // until we make no progress. while (copyCoalesceWorkList(WorkList)) /* empty */ ; + lateLiveIntervalUpdate(); } void RegisterCoalescer::releaseMemory() { Index: llvm/trunk/test/CodeGen/X86/late-remat-update.mir =================================================================== --- llvm/trunk/test/CodeGen/X86/late-remat-update.mir +++ llvm/trunk/test/CodeGen/X86/late-remat-update.mir @@ -0,0 +1,118 @@ +# REQUIRES: asserts +# RUN: llc -mtriple=x86_64-- -run-pass=simple-register-coalescing -late-remat-update-threshold=1 -stats %s -o /dev/null 2>&1 | FileCheck %s +# Check the test will rematerialize for three copies, but will call shrinkToUses +# only once to update live range because of late rematerialization update. 
+# CHECK: 3 regalloc - Number of instructions re-materialized +# CHECK: 1 regalloc - Number of shrinkToUses called +--- | + target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + target triple = "x86_64-unknown-linux-gnu" + + ; Function Attrs: noreturn uwtable + define void @_Z3fooi(i32 %value) local_unnamed_addr #0 { + entry: + br label %do.body + + do.body: ; preds = %do.body, %sw.bb2, %entry + tail call void asm sideeffect "", "~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{dirflag},~{fpsr},~{flags}"() #2, !srcloc !3 + switch i32 %value, label %do.body [ + i32 0, label %sw.bb + i32 1, label %sw.bb1 + i32 2, label %sw.bb2 + ] + + sw.bb: ; preds = %do.body + tail call void @_Z3gooi(i32 2122) + br label %sw.bb1 + + sw.bb1: ; preds = %sw.bb, %do.body + tail call void @_Z3gooi(i32 2122) + br label %sw.bb2 + + sw.bb2: ; preds = %sw.bb1, %do.body + tail call void @_Z3gooi(i32 2122) + br label %do.body + } + + declare void @_Z3gooi(i32) local_unnamed_addr #1 + + ; Function Attrs: nounwind + declare void @llvm.stackprotector(i8*, i8**) #2 + + attributes #0 = { noreturn uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #2 = { nounwind } + + 
!llvm.module.flags = !{!0, !1} + !llvm.ident = !{!2} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{i32 7, !"PIC Level", i32 2} + !2 = !{!"clang version 7.0.0 (trunk 335057)"} + !3 = !{i32 82} + +... +--- +name: _Z3fooi +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: gr32 } + - { id: 1, class: gr32 } + - { id: 2, class: gr32 } + - { id: 3, class: gr32 } + - { id: 4, class: gr32 } + - { id: 5, class: gr32 } +liveins: + - { reg: '$edi', virtual-reg: '%0' } +frameInfo: + hasCalls: true +body: | + bb.0.entry: + liveins: $edi + + %0:gr32 = COPY killed $edi + %5:gr32 = MOV32ri 2122 + + bb.1.do.body: + successors: %bb.6(0x15555555), %bb.2(0x6aaaaaab) + + INLINEASM &"", 1, 12, implicit-def dead early-clobber $r10, 12, implicit-def dead early-clobber $r11, 12, implicit-def dead early-clobber $r12, 12, implicit-def dead early-clobber $r13, 12, implicit-def dead early-clobber $r14, 12, implicit-def dead early-clobber $r15, 12, implicit-def dead early-clobber $eflags, !3 + CMP32ri8 %0, 2, implicit-def $eflags + JE_1 %bb.6, implicit killed $eflags + JMP_1 %bb.2 + + bb.2.do.body: + successors: %bb.5(0x19999999), %bb.3(0x66666667) + + CMP32ri8 %0, 1, implicit-def $eflags + JE_1 %bb.5, implicit killed $eflags + JMP_1 %bb.3 + + bb.3.do.body: + successors: %bb.4(0x20000000), %bb.1(0x60000000) + + TEST32rr %0, %0, implicit-def $eflags + JNE_1 %bb.1, implicit killed $eflags + JMP_1 %bb.4 + + bb.4.sw.bb: + ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + $edi = COPY %5 + CALL64pcrel32 @_Z3gooi, csr_64, implicit $rsp, implicit $ssp, implicit killed $edi, implicit-def $rsp, implicit-def $ssp + ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + + bb.5.sw.bb1: + ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + $edi = COPY 
%5 + CALL64pcrel32 @_Z3gooi, csr_64, implicit $rsp, implicit $ssp, implicit killed $edi, implicit-def $rsp, implicit-def $ssp + ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + + bb.6.sw.bb2: + ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + $edi = COPY %5 + CALL64pcrel32 @_Z3gooi, csr_64, implicit $rsp, implicit $ssp, implicit killed $edi, implicit-def $rsp, implicit-def $ssp + ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + JMP_1 %bb.1 + +...