diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
--- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -584,6 +584,11 @@
   /// multiple uses.
   bool hasOneNonDBGUser(Register RegNo) const;
 
+
+  /// hasAtMostUserInstrs - Return true if the given register has at most
+  /// \p MaxUsers non-debug user instructions.
+  bool hasAtMostUserInstrs(Register Reg, unsigned MaxUsers) const;
+
   /// replaceRegWith - Replace all instances of FromReg with ToReg in the
   /// machine function. This is like llvm-level X->replaceAllUsesWith(Y),
   /// except that it also changes any definitions of the register as well.
diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
--- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -420,6 +420,18 @@
   return hasSingleElement(use_nodbg_instructions(RegNo));
 }
 
+bool MachineRegisterInfo::hasAtMostUserInstrs(Register Reg,
+                                              unsigned MaxUsers) const {
+  unsigned NumUsers = 0;
+  auto UI = use_instr_nodbg_begin(Reg), UE = use_instr_nodbg_end();
+  // Walk the non-debug users but stop as soon as the limit is reached, so we
+  // never traverse more of the use list than is needed to answer the query.
+  for (; UI != UE && NumUsers < MaxUsers; ++UI)
+    NumUsers++;
+  // If we haven't reached the end yet then there are more than MaxUsers users.
+  return UI == UE;
+}
+
 /// clearKillFlags - Iterate over all the uses of the given register and
 /// clear the kill flag from the MachineOperand. This function is used by
 /// optimization passes which extend register lifetimes and need only
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -2335,18 +2335,6 @@
     llvm_unreachable("Unexpected remat cost");
   };
 
-  // Helper to walk through uses and terminate if we've reached a limit. Saves
-  // us spending time traversing uses if all we want to know is if it's >= min.
-  auto isUsesAtMost = [&](unsigned Reg, unsigned MaxUses) {
-    unsigned NumUses = 0;
-    auto UI = MRI.use_instr_nodbg_begin(Reg), UE = MRI.use_instr_nodbg_end();
-    for (; UI != UE && NumUses < MaxUses; ++UI) {
-      NumUses++;
-    }
-    // If we haven't reached the end yet then there are more than MaxUses users.
-    return UI == UE;
-  };
-
   switch (MI.getOpcode()) {
   default:
     return false;
@@ -2363,8 +2351,7 @@
     unsigned MaxUses = maxUses(RematCost);
     if (MaxUses == UINT_MAX)
       return true; // Remats are "free" so always localize.
-    bool B = isUsesAtMost(Reg, MaxUses);
-    return B;
+    return MRI.hasAtMostUserInstrs(Reg, MaxUses);
   }
   }
 }
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -31,6 +31,7 @@
 #include "llvm/ADT/Twine.h"
 #include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/Analysis/ObjCARCUtil.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/CallingConvLower.h"
@@ -75,6 +76,7 @@
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/InstructionCost.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/MachineValueType.h"
 #include "llvm/Support/MathExtras.h"
@@ -20789,6 +20791,21 @@
 
 bool AArch64TargetLowering::shouldLocalize(
     const MachineInstr &MI, const TargetTransformInfo *TTI) const {
+  auto &MF = *MI.getMF();
+  auto &MRI = MF.getRegInfo();
+  auto maxUses = [](unsigned RematCost) {
+    // A cost of 1 means remats are basically free.
+    if (RematCost == 1)
+      return UINT_MAX;
+    if (RematCost == 2)
+      return 2U;
+
+    // Remat is too expensive, only sink if there's one user.
+    if (RematCost > 2)
+      return 1U;
+    llvm_unreachable("Unexpected remat cost");
+  };
+
   switch (MI.getOpcode()) {
   case TargetOpcode::G_GLOBAL_VALUE: {
     // On Darwin, TLS global vars get selected into function calls, which
@@ -20799,6 +20816,21 @@
       return false;
     break;
   }
+  case TargetOpcode::G_CONSTANT: {
+    // Treat the target's code-size cost for this immediate as its remat cost.
+    auto *CI = MI.getOperand(1).getCImm();
+    const APInt &Imm = CI->getValue();
+    InstructionCost Cost = TTI->getIntImmCost(
+        Imm, CI->getType(), TargetTransformInfo::TCK_CodeSize);
+    assert(Cost.isValid() && "Expected a valid imm cost");
+
+    unsigned RematCost = *Cost.getValue();
+    Register Reg = MI.getOperand(0).getReg();
+    unsigned MaxUses = maxUses(RematCost);
+    if (MaxUses == UINT_MAX)
+      return true; // Remats are "free" so always localize.
+    return MRI.hasAtMostUserInstrs(Reg, MaxUses);
+  }
   // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
   // localizable.
   case AArch64::ADRP:
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/localizer-arm64-tti.ll b/llvm/test/CodeGen/AArch64/GlobalISel/localizer-arm64-tti.ll
--- a/llvm/test/CodeGen/AArch64/GlobalISel/localizer-arm64-tti.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/localizer-arm64-tti.ll
@@ -114,3 +114,119 @@
   ret i32 0
 }
 
+define i32 @imm_cost_too_large_cost_of_2() {
+  ; CHECK-LABEL: name: imm_cost_too_large_cost_of_2
+  ; CHECK: bb.1.entry:
+  ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 -2228259
+  ; CHECK-NEXT: [[GV:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var2
+  ; CHECK-NEXT: [[GV1:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var3
+  ; CHECK-NEXT: [[C1:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 0
+  ; CHECK-NEXT: [[GV2:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var1
+  ; CHECK-NEXT: [[LOAD:%[0-9]+]]:gpr(s32) = G_LOAD [[GV2]](p0) :: (dereferenceable load (s32) from @var1)
+  ; CHECK-NEXT: [[C2:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1
+  ; CHECK-NEXT: [[ICMP:%[0-9]+]]:gpr(s32) = G_ICMP intpred(ne), [[LOAD]](s32), [[C2]]
+  ; CHECK-NEXT: [[AND:%[0-9]+]]:gpr(s32) = G_AND [[ICMP]], [[C2]]
+  ; CHECK-NEXT: G_BRCOND [[AND]](s32), %bb.4
+  ; CHECK-NEXT: G_BR %bb.2
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.2.if.then:
+  ; CHECK-NEXT: successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: [[GV3:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var2
+  ; CHECK-NEXT: G_STORE [[C]](s32), [[GV3]](p0) :: (store (s32) into @var2)
+  ; CHECK-NEXT: G_BR %bb.3
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.3.if.then2:
+  ; CHECK-NEXT: successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: [[GV4:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var1
+  ; CHECK-NEXT: G_STORE [[C]](s32), [[GV4]](p0) :: (store (s32) into @var1)
+  ; CHECK-NEXT: G_BR %bb.4
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.4.if.end:
+  ; CHECK-NEXT: [[GV5:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var3
+  ; CHECK-NEXT: G_STORE [[C]](s32), [[GV5]](p0) :: (store (s32) into @var3)
+  ; CHECK-NEXT: [[C3:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 0
+  ; CHECK-NEXT: $w0 = COPY [[C3]](s32)
+  ; CHECK-NEXT: RET_ReallyLR implicit $w0
+entry:
+  %0 = load i32, i32* @var1, align 4
+  %cst1 = bitcast i32 -2228259 to i32
+  %cmp = icmp eq i32 %0, 1
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  store i32 %cst1, i32* @var2
+  br label %if.then2
+
+if.then2:
+  store i32 %cst1, i32* @var1
+  br label %if.end
+
+if.end:
+  store i32 %cst1, i32* @var3
+  ret i32 0
+}
+
+define i64 @imm_cost_too_large_cost_of_4() {
+  ; CHECK-LABEL: name: imm_cost_too_large_cost_of_4
+  ; CHECK: bb.1.entry:
+  ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: [[C:%[0-9]+]]:gpr(s64) = G_CONSTANT i64 -2228259
+  ; CHECK-NEXT: [[GV:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var2_64
+  ; CHECK-NEXT: [[GV1:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var3_64
+  ; CHECK-NEXT: [[C1:%[0-9]+]]:gpr(s64) = G_CONSTANT i64 0
+  ; CHECK-NEXT: [[GV2:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var1_64
+  ; CHECK-NEXT: [[LOAD:%[0-9]+]]:gpr(s64) = G_LOAD [[GV2]](p0) :: (dereferenceable load (s64) from @var1_64, align 4)
+  ; CHECK-NEXT: [[C2:%[0-9]+]]:gpr(s64) = G_CONSTANT i64 1
+  ; CHECK-NEXT: [[ICMP:%[0-9]+]]:gpr(s32) = G_ICMP intpred(ne), [[LOAD]](s64), [[C2]]
+  ; CHECK-NEXT: [[C3:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1
+  ; CHECK-NEXT: [[AND:%[0-9]+]]:gpr(s32) = G_AND [[ICMP]], [[C3]]
+  ; CHECK-NEXT: G_BRCOND [[AND]](s32), %bb.4
+  ; CHECK-NEXT: G_BR %bb.2
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.2.if.then:
+  ; CHECK-NEXT: successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: [[GV3:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var2_64
+  ; CHECK-NEXT: G_STORE [[C]](s64), [[GV3]](p0) :: (store (s64) into @var2_64)
+  ; CHECK-NEXT: G_BR %bb.3
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.3.if.then2:
+  ; CHECK-NEXT: successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: [[GV4:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var1_64
+  ; CHECK-NEXT: G_STORE [[C]](s64), [[GV4]](p0) :: (store (s64) into @var1_64)
+  ; CHECK-NEXT: G_BR %bb.4
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.4.if.end:
+  ; CHECK-NEXT: [[GV5:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var3_64
+  ; CHECK-NEXT: G_STORE [[C]](s64), [[GV5]](p0) :: (store (s64) into @var3_64)
+  ; CHECK-NEXT: [[C4:%[0-9]+]]:gpr(s64) = G_CONSTANT i64 0
+  ; CHECK-NEXT: $x0 = COPY [[C4]](s64)
+  ; CHECK-NEXT: RET_ReallyLR implicit $x0
+entry:
+  %0 = load i64, i64* @var1_64, align 4
+  %cst1 = bitcast i64 -2228259 to i64
+  %cmp = icmp eq i64 %0, 1
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  store i64 %cst1, i64* @var2_64
+  br label %if.then2
+
+if.then2:
+  store i64 %cst1, i64* @var1_64
+  br label %if.end
+
+if.end:
+  store i64 %cst1, i64* @var3_64
+  ret i64 0
+}
+
+@var1_64 = common global i64 0, align 4
+@var2_64 = common global i64 0, align 4
+@var3_64 = common global i64 0, align 4