Index: lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- lib/CodeGen/CodeGenPrepare.cpp
+++ lib/CodeGen/CodeGenPrepare.cpp
@@ -6189,6 +6189,170 @@
   return true;
 }
 
+// Return true if the GEP has two operands, the first operand is of a sequential
+// type, and the second operand is a constant.
+static bool GEPSequentialConstIndexed(GetElementPtrInst *GEP) {
+  gep_type_iterator I = gep_type_begin(*GEP);
+  return GEP->getNumOperands() == 2 &&
+         I.isSequential() &&
+         isa<ConstantInt>(GEP->getOperand(1));
+}
+
+// Try unmerging GEPs to reduce liveness interference (register pressure) across
+// IndirectBr edges. Since IndirectBr edges tend to touch many blocks, reducing
+// liveness interference across those edges benefits global register
+// allocation. Currently handles only certain cases.
+//
+// For example, unmerge %GEPI and %UGEPI as below.
+//
+// ---------- BEFORE ----------
+// SrcBlock:
+//   ...
+//   %GEPIOp = ...
+//   ...
+//   %GEPI = gep %GEPIOp, Idx
+//   ...
+//   indirectbr ... [ label %DstB0, label %DstB1, ... label %DstBi ... ]
+//   (* %GEPI is alive on the indirectbr edges due to other uses ahead)
+//   (* %GEPIOp is alive on the indirectbr edges only because it's used by
+//      %UGEPI)
+//
+// DstB0: ... (there may be a gep similar to %UGEPI to be unmerged)
+// DstB1: ... (there may be a gep similar to %UGEPI to be unmerged)
+// ...
+//
+// DstBi:
+//   ...
+//   %UGEPI = gep %GEPIOp, UIdx
+//   ...
+// ---------------------------
+//
+// ---------- AFTER ----------
+// SrcBlock:
+//   ... (same as above)
+//   (* %GEPI is still alive on the indirectbr edges)
+//   (* %GEPIOp is no longer alive on the indirectbr edges as a result of the
+//      unmerging)
+//   ...
+//
+// DstBi:
+//   ...
+//   %UGEPI = gep %GEPI, (UIdx-Idx)
+//   ...
+// ---------------------------
+//
+// The register pressure on the IndirectBr edges is reduced because %GEPIOp is
+// no longer alive on them.
+//
+// We try to unmerge GEPs here in CodeGenPrepare, as opposed to limiting merging
+// of GEPs in the first place in InstCombiner::visitGetElementPtrInst(), so as
+// not to disable further simplifications and optimizations as a result of GEP
+// merging.
+//
+// Note this unmerging may increase the length of the data-flow critical path
+// (the path from %GEPIOp to %UGEPI would go through %GEPI), which is a tradeoff
+// between the register pressure and the length of the data-flow critical
+// path. Restricting this to the uncommon IndirectBr case minimizes the impact
+// of a potentially longer critical path, if any, and the impact on compile
+// time.
+static bool tryUnmergingGEPsAcrossIndirectBr(GetElementPtrInst *GEPI,
+                                             const TargetTransformInfo *TTI) {
+  BasicBlock *SrcBlock = GEPI->getParent();
+  // Check that SrcBlock ends with an IndirectBr. If not, give up. The common
+  // (non-IndirectBr) cases exit early here.
+  if (!isa<IndirectBrInst>(SrcBlock->getTerminator()))
+    return false;
+  // Check that GEPI is a simple gep with a single constant index.
+  if (!GEPSequentialConstIndexed(GEPI))
+    return false;
+  ConstantInt *GEPIIdx = cast<ConstantInt>(GEPI->getOperand(1));
+  // Check that GEPI is a cheap one.
+  if (TTI->getIntImmCost(GEPIIdx->getValue(), GEPIIdx->getType())
+      > TargetTransformInfo::TCC_Basic)
+    return false;
+  Value *GEPIOp = GEPI->getOperand(0);
+  // Check that GEPIOp is an instruction that's also defined in SrcBlock.
+  if (!isa<Instruction>(GEPIOp))
+    return false;
+  auto *GEPIOpI = cast<Instruction>(GEPIOp);
+  if (GEPIOpI->getParent() != SrcBlock)
+    return false;
+  // Check that GEPI is used outside the block, meaning it's alive on the
+  // IndirectBr edge(s).
+  if (find_if(GEPI->users(), [&](User *Usr) {
+        if (auto *I = dyn_cast<Instruction>(Usr)) {
+          if (I->getParent() != SrcBlock) {
+            return true;
+          }
+        }
+        return false;
+      }) == GEPI->users().end())
+    return false;
+  // The second elements of the GEP chains to be unmerged.
+  std::vector<GetElementPtrInst *> UGEPIs;
+  // Check each user of GEPIOp to see whether unmerging would make GEPIOp not
+  // alive on the IndirectBr edges.
+  for (User *Usr : GEPIOp->users()) {
+    if (Usr == GEPI) continue;
+    // Check if Usr is an Instruction. If not, give up.
+    if (!isa<Instruction>(Usr))
+      return false;
+    auto *UI = cast<Instruction>(Usr);
+    // Check if Usr is in the same block as GEPIOp; if so, that's fine, skip it.
+    if (UI->getParent() == SrcBlock)
+      continue;
+    // Check if Usr is a GEP. If not, give up.
+    if (!isa<GetElementPtrInst>(Usr))
+      return false;
+    auto *UGEPI = cast<GetElementPtrInst>(Usr);
+    // Check if UGEPI is a simple gep with a single constant index and GEPIOp is
+    // the pointer operand to it. If so, record it in the vector. If not, give
+    // up.
+    if (!GEPSequentialConstIndexed(UGEPI))
+      return false;
+    if (UGEPI->getOperand(0) != GEPIOp)
+      return false;
+    if (GEPIIdx->getType() !=
+        cast<ConstantInt>(UGEPI->getOperand(1))->getType())
+      return false;
+    ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1));
+    if (TTI->getIntImmCost(UGEPIIdx->getValue(), UGEPIIdx->getType())
+        > TargetTransformInfo::TCC_Basic)
+      return false;
+    UGEPIs.push_back(UGEPI);
+  }
+  if (UGEPIs.size() == 0)
+    return false;
+  // Check the materializing cost of (UIdx-Idx).
+  for (GetElementPtrInst *UGEPI : UGEPIs) {
+    ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1));
+    APInt NewIdx = UGEPIIdx->getValue() - GEPIIdx->getValue();
+    unsigned ImmCost = TTI->getIntImmCost(NewIdx, GEPIIdx->getType());
+    if (ImmCost > TargetTransformInfo::TCC_Basic)
+      return false;
+  }
+  // Now unmerge between GEPI and UGEPIs.
+  for (GetElementPtrInst *UGEPI : UGEPIs) {
+    UGEPI->setOperand(0, GEPI);
+    ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1));
+    Constant *NewUGEPIIdx =
+        ConstantInt::get(GEPIIdx->getType(),
+                         UGEPIIdx->getValue() - GEPIIdx->getValue());
+    UGEPI->setOperand(1, NewUGEPIIdx);
+    // If GEPI is not inbounds but UGEPI is inbounds, change UGEPI to not
+    // inbounds to avoid UB.
+    if (!GEPI->isInBounds()) {
+      UGEPI->setIsInBounds(false);
+    }
+  }
+  // After unmerging, verify that GEPIOp is actually only used in SrcBlock (not
+  // alive on IndirectBr edges).
+  assert(find_if(GEPIOp->users(), [&](User *Usr) {
+           return cast<Instruction>(Usr)->getParent() != SrcBlock;
+         }) == GEPIOp->users().end() && "GEPIOp is used outside SrcBlock");
+  return true;
+}
+
 bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) {
   // Bail out if we inserted the instruction to prevent optimizations from
   // stepping on each other's toes.
@@ -6302,6 +6466,9 @@
       optimizeInst(NC, ModifiedDT);
       return true;
     }
+    if (tryUnmergingGEPsAcrossIndirectBr(GEPI, TTI)) {
+      return true;
+    }
     return false;
   }
Index: test/Transforms/CodeGenPrepare/gep-unmerging.ll
===================================================================
--- /dev/null
+++ test/Transforms/CodeGenPrepare/gep-unmerging.ll
@@ -0,0 +1,60 @@
+; RUN: opt -codegenprepare -S < %s | FileCheck %s
+
+@exit_addr = constant i8* blockaddress(@gep_unmerging, %exit)
+@op1_addr = constant i8* blockaddress(@gep_unmerging, %op1)
+@op2_addr = constant i8* blockaddress(@gep_unmerging, %op2)
+@op3_addr = constant i8* blockaddress(@gep_unmerging, %op3)
+@dummy = global i8 0
+
+define void @gep_unmerging(i1 %pred, i8* %p0) {
+entry:
+  %table = alloca [256 x i8*]
+  %table_0 = getelementptr [256 x i8*], [256 x i8*]* %table, i64 0, i64 0
+  %table_1 = getelementptr [256 x i8*], [256 x i8*]* %table, i64 0, i64 1
+  %table_2 = getelementptr [256 x i8*], [256 x i8*]* %table, i64 0, i64 2
+  %table_3 = getelementptr [256 x i8*], [256 x i8*]* %table, i64 0, i64 3
+  %exit_a = load i8*, i8** @exit_addr
+  %op1_a = load i8*, i8** @op1_addr
+  %op2_a = load i8*, i8** @op2_addr
+  %op3_a = load i8*, i8** @op3_addr
+  store i8* %exit_a, i8** %table_0
+  store i8* %op1_a, i8** %table_1
+  store i8* %op2_a, i8** %table_2
+  store i8* %op3_a, i8** %table_3
+  br label %indirectbr
+
+op1:
+; CHECK-LABEL: op1:
+; CHECK-NEXT: %p1_inc2 = getelementptr i8, i8* %p_postinc, i64 2
+; CHECK-NEXT: %p1_inc1 = getelementptr i8, i8* %p_postinc, i64 1
+  %p1_inc2 = getelementptr i8, i8* %p_preinc, i64 3
+  %p1_inc1 = getelementptr i8, i8* %p_preinc, i64 2
+  %a10 = load i8, i8* %p_postinc
+  %a11 = load i8, i8* %p1_inc1
+  %a12 = add i8 %a10, %a11
+  store i8 %a12, i8* @dummy
+  br i1 %pred, label %indirectbr, label %exit
+
+op2:
+; CHECK-LABEL: op2:
+; CHECK-NEXT: %p2_inc = getelementptr i8, i8* %p_postinc, i64 1
+  %p2_inc = getelementptr i8, i8* %p_preinc, i64 2
+  %a2 = load i8, i8* %p_postinc
+  store i8 %a2, i8* @dummy
+  br i1 %pred, label %indirectbr, label %exit
+
+op3:
+  br i1 %pred, label %indirectbr, label %exit
+
+indirectbr:
+  %p_preinc = phi i8* [%p0, %entry], [%p1_inc2, %op1], [%p2_inc, %op2], [%p_postinc, %op3]
+  %p_postinc = getelementptr i8, i8* %p_preinc, i64 1
+  %next_op = load i8, i8* %p_preinc
+  %p_zext = zext i8 %next_op to i64
+  %slot = getelementptr [256 x i8*], [256 x i8*]* %table, i64 0, i64 %p_zext
+  %target = load i8*, i8** %slot
+  indirectbr i8* %target, [label %exit, label %op1, label %op2]
+
+exit:
+  ret void
+}
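
Background on the rewrite itself: the unmerging replaces `%UGEPI = gep %GEPIOp, UIdx` with `%UGEPI = gep %GEPI, (UIdx - Idx)`, where `%GEPI = gep %GEPIOp, Idx`, so both forms compute the same address. The standalone C++ sketch below is illustrative only (not part of the patch, no LLVM APIs, hypothetical index values) and shows the address identity for a byte-element GEP; for wider element types the GEP scales by the element size, but the identity holds the same way.

#include <cassert>
#include <cstdint>

int main() {
  uint8_t Buf[16] = {};
  uint8_t *GEPIOp = Buf;       // shared pointer operand of both GEPs
  int64_t Idx = 1, UIdx = 3;   // hypothetical constant indices

  uint8_t *GEPI = GEPIOp + Idx;                 // %GEPI = gep %GEPIOp, Idx
  uint8_t *MergedUGEPI = GEPIOp + UIdx;         // %UGEPI before unmerging
  uint8_t *UnmergedUGEPI = GEPI + (UIdx - Idx); // %UGEPI after unmerging

  assert(MergedUGEPI == UnmergedUGEPI);         // same address either way
  return 0;
}

After the rewrite, only %GEPI (not %GEPIOp) needs to stay live across the IndirectBr edges, which is the register-pressure benefit the patch is after.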