diff --git a/llvm/include/llvm/Analysis/MemoryDependenceAnalysis.h b/llvm/include/llvm/Analysis/MemoryDependenceAnalysis.h --- a/llvm/include/llvm/Analysis/MemoryDependenceAnalysis.h +++ b/llvm/include/llvm/Analysis/MemoryDependenceAnalysis.h @@ -19,6 +19,7 @@ #include "llvm/ADT/PointerSumType.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/MemoryLocation.h" +#include "llvm/Analysis/PHITransAddr.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/PredIteratorCache.h" #include "llvm/IR/ValueHandle.h" @@ -31,7 +32,6 @@ class AssumptionCache; class BatchAAResults; class DominatorTree; -class PHITransAddr; /// A memory dependence query can return one of three different answers. class MemDepResult { @@ -230,10 +230,11 @@ /// (potentially phi translated) address that was live in the block. class NonLocalDepResult { NonLocalDepEntry Entry; - Value *Address; + SelectAddr Address; public: - NonLocalDepResult(BasicBlock *bb, MemDepResult result, Value *address) + NonLocalDepResult(BasicBlock *bb, MemDepResult result, + const SelectAddr &address) : Entry(bb, result), Address(address) {} // BB is the sort key, it can't be changed. @@ -254,7 +255,7 @@ /// a cached result and that address was deleted. /// /// The address is always null for a non-local 'call' dependence. - Value *getAddress() const { return Address; } + SelectAddr getAddress() const { return Address; } }; /// Provides a lazy, caching interface for making common memory aliasing diff --git a/llvm/include/llvm/Analysis/PHITransAddr.h b/llvm/include/llvm/Analysis/PHITransAddr.h --- a/llvm/include/llvm/Analysis/PHITransAddr.h +++ b/llvm/include/llvm/Analysis/PHITransAddr.h @@ -15,12 +15,35 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include namespace llvm { - class AssumptionCache; - class DominatorTree; - class DataLayout; - class TargetLibraryInfo; +class AssumptionCache; +class DominatorTree; +class DataLayout; +class TargetLibraryInfo; + +// SelectAddr - storage of normal Value address or pair of addresses for true +// and false variant of select dependency. +class SelectAddr { +public: + using SelectAddrs = std::pair; + SelectAddr(Value *V) : Addr(V){}; + SelectAddr(const SelectAddrs &Addrs) : Addr(Addrs){}; + Value *getAddr() const { return std::get(Addr); } + SelectAddrs getSelectAddrs() const { + if (std::holds_alternative(Addr)) + return std::get(Addr); + Value *V = std::get(Addr); + assert(isa(V)); + auto *SI = cast(V); + return {SI->getTrueValue(), SI->getFalseValue()}; + } + +private: + std::variant Addr; +}; /// PHITransAddr - An address value which tracks and handles phi translation. /// As we walk "up" the CFG through predecessors, we need to ensure that the @@ -58,6 +81,15 @@ Value *getAddr() const { return Addr; } + // If translated address has Select input in BB, return it (otherwise null). + SelectInst *getSelectInputInBB(BasicBlock *BB) const { + for (auto *I : InstInputs) + if (I->getParent() == BB) + if (auto *SI = dyn_cast(I)) + return SI; + return nullptr; + } + /// NeedsPHITranslationFromBlock - Return true if moving from the specified /// BasicBlock to its predecessors requires PHI translation. bool NeedsPHITranslationFromBlock(BasicBlock *BB) const { @@ -81,6 +113,14 @@ bool PHITranslateValue(BasicBlock *CurBB, BasicBlock *PredBB, const DominatorTree *DT, bool MustDominate); + // SelTranslateValue - PHI translate the current address from CurBB to PredBB, + // and if PredBB has Sel dependency instruction, translate both cases of this + // select. + SelectAddr::SelectAddrs SelTranslateValue(BasicBlock *CurBB, + BasicBlock *PredBB, + const DominatorTree *DT, + SelectInst *Sel); + /// PHITranslateWithInsertion - PHI translate this value into the specified /// predecessor block, inserting a computation of the value if it is /// unavailable. @@ -101,7 +141,8 @@ private: Value *PHITranslateSubExpr(Value *V, BasicBlock *CurBB, BasicBlock *PredBB, - const DominatorTree *DT); + const DominatorTree *DT, SelectInst *Sel = nullptr, + bool Cond = false); /// InsertPHITranslatedSubExpr - Insert a computation of the PHI translated /// version of 'V' for the edge PredBB->CurBB into the end of the PredBB diff --git a/llvm/include/llvm/Transforms/Scalar/GVN.h b/llvm/include/llvm/Transforms/Scalar/GVN.h --- a/llvm/include/llvm/Transforms/Scalar/GVN.h +++ b/llvm/include/llvm/Transforms/Scalar/GVN.h @@ -52,6 +52,7 @@ class NonLocalDepResult; class OptimizationRemarkEmitter; class PHINode; +class SelectAddr; class TargetLibraryInfo; class Value; /// A private "module" namespace for types and utilities used by GVN. These @@ -320,7 +321,8 @@ /// Given a local dependency (Def or Clobber) determine if a value is /// available for the load. std::optional - AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo, Value *Address); + AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo, + const SelectAddr &Addr); /// Given a list of non-local dependencies, determine if a value is /// available for the load in each specified block. If it is, add it to diff --git a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp --- a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -23,7 +23,6 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryLocation.h" -#include "llvm/Analysis/PHITransAddr.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" @@ -1346,8 +1345,31 @@ // predecessor, then we have to assume that the pointer is clobbered in // that predecessor. We can still do PRE of the load, which would insert // a computation of the pointer in this predecessor. - if (!PredPtrVal) + if (!PredPtrVal) { + // If we have translation failure, but there is a select input in + // address, try to translate both sides of it. + if (auto *SI = PredPointer.getSelectInputInBB(Pred)) { + auto [TrueAddr, FalseAddr] = + PHITransAddr(Pointer).SelTranslateValue(BB, Pred, &DT, SI); + if (TrueAddr && FalseAddr) { + // Check that there are no modifying memory operations between + // select and Pred terminator. + if (!AA.canInstructionRangeModRef(*SI, Pred->back(), + Loc.getWithNewPtr(TrueAddr), + ModRefInfo::Mod) && + !AA.canInstructionRangeModRef(*SI, Pred->back(), + Loc.getWithNewPtr(FalseAddr), + ModRefInfo::Mod)) { + // Save Def dependency from SI instruction. + Result.push_back(NonLocalDepResult( + Pred, MemDepResult::getDef(SI), + SelectAddr::SelectAddrs{TrueAddr, FalseAddr})); + continue; + } + } + } CanTranslate = false; + } // FIXME: it is entirely possible that PHI translating will end up with // the same value. Consider PHI translating something like: diff --git a/llvm/lib/Analysis/PHITransAddr.cpp b/llvm/lib/Analysis/PHITransAddr.cpp --- a/llvm/lib/Analysis/PHITransAddr.cpp +++ b/llvm/lib/Analysis/PHITransAddr.cpp @@ -16,7 +16,6 @@ #include "llvm/Config/llvm-config.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Dominators.h" -#include "llvm/IR/Instructions.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -142,7 +141,8 @@ Value *PHITransAddr::PHITranslateSubExpr(Value *V, BasicBlock *CurBB, BasicBlock *PredBB, - const DominatorTree *DT) { + const DominatorTree *DT, + SelectInst *Sel, bool Cond) { // If this is a non-instruction value, it can't require PHI translation. Instruction *Inst = dyn_cast(V); if (!Inst) return V; @@ -165,8 +165,12 @@ InstInputs.erase(find(InstInputs, Inst)); // If this is a PHI, go ahead and translate it. - if (PHINode *PN = dyn_cast(Inst)) - return AddAsInput(PN->getIncomingValueForBlock(PredBB)); + if (PHINode *PN = dyn_cast(Inst)) { + auto *V = PN->getIncomingValueForBlock(PredBB); + if (Sel && V == Sel) + return AddAsInput(Cond ? Sel->getTrueValue() : Sel->getFalseValue()); + return AddAsInput(V); + } // If this is a non-phi value, and it is analyzable, we can incorporate it // into the expression by making all instruction operands be inputs. @@ -186,7 +190,8 @@ if (CastInst *Cast = dyn_cast(Inst)) { if (!isSafeToSpeculativelyExecute(Cast)) return nullptr; - Value *PHIIn = PHITranslateSubExpr(Cast->getOperand(0), CurBB, PredBB, DT); + Value *PHIIn = + PHITranslateSubExpr(Cast->getOperand(0), CurBB, PredBB, DT, Sel, Cond); if (!PHIIn) return nullptr; if (PHIIn == Cast->getOperand(0)) return Cast; @@ -215,7 +220,8 @@ SmallVector GEPOps; bool AnyChanged = false; for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i) { - Value *GEPOp = PHITranslateSubExpr(GEP->getOperand(i), CurBB, PredBB, DT); + Value *GEPOp = + PHITranslateSubExpr(GEP->getOperand(i), CurBB, PredBB, DT, Sel, Cond); if (!GEPOp) return nullptr; AnyChanged |= GEPOp != GEP->getOperand(i); @@ -259,7 +265,8 @@ bool isNSW = cast(Inst)->hasNoSignedWrap(); bool isNUW = cast(Inst)->hasNoUnsignedWrap(); - Value *LHS = PHITranslateSubExpr(Inst->getOperand(0), CurBB, PredBB, DT); + Value *LHS = + PHITranslateSubExpr(Inst->getOperand(0), CurBB, PredBB, DT, Sel, Cond); if (!LHS) return nullptr; // If the PHI translated LHS is an add of a constant, fold the immediates. @@ -306,7 +313,6 @@ return nullptr; } - /// PHITranslateValue - PHI translate the current address up the CFG from /// CurBB to Pred, updating our state to reflect any needed changes. If /// 'MustDominate' is true, the translated value must dominate @@ -331,6 +337,17 @@ return Addr == nullptr; } +SelectAddr::SelectAddrs PHITransAddr::SelTranslateValue(BasicBlock *CurBB, + BasicBlock *PredBB, + const DominatorTree *DT, + SelectInst *Sel) { + Value *TrueAddr = PHITransAddr(*this).PHITranslateSubExpr(Addr, CurBB, PredBB, + DT, Sel, true); + Value *FalseAddr = PHITransAddr(*this).PHITranslateSubExpr( + Addr, CurBB, PredBB, DT, Sel, false); + return {TrueAddr, FalseAddr}; +} + /// PHITranslateWithInsertion - PHI translate this value into the specified /// predecessor block, inserting a computation of the value if it is /// unavailable. diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -1156,7 +1156,7 @@ std::optional GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo, - Value *Address) { + const SelectAddr &Addr) { assert(Load->isUnordered() && "rules below are incorrect for ordered access"); assert(DepInfo.isLocal() && "expected a local dependence"); @@ -1164,6 +1164,7 @@ const DataLayout &DL = Load->getModule()->getDataLayout(); if (DepInfo.isClobber()) { + Value *Address = Addr.getAddr(); // If the dependence is to a store that writes to a superset of the bits // read by the load, we can extract the bits we need for the load from the // stored value. @@ -1272,16 +1273,18 @@ // between load values. There must be no instructions between the found // loads and DepInst that may clobber the loads. if (auto *Sel = dyn_cast(DepInst)) { - assert(Sel->getType() == Load->getPointerOperandType()); + auto [TrueAddr, FalseAddr] = Addr.getSelectAddrs(); + assert(TrueAddr && TrueAddr->getType() == Load->getPointerOperandType() && + FalseAddr && FalseAddr->getType() == Load->getPointerOperandType()); auto Loc = MemoryLocation::get(Load); Value *V1 = - findDominatingValue(Loc.getWithNewPtr(Sel->getTrueValue()), - Load->getType(), DepInst, getAliasAnalysis()); + findDominatingValue(Loc.getWithNewPtr(TrueAddr), Load->getType(), + DepInst, getAliasAnalysis()); if (!V1) return std::nullopt; Value *V2 = - findDominatingValue(Loc.getWithNewPtr(Sel->getFalseValue()), - Load->getType(), DepInst, getAliasAnalysis()); + findDominatingValue(Loc.getWithNewPtr(FalseAddr), Load->getType(), + DepInst, getAliasAnalysis()); if (!V2) return std::nullopt; return AvailableValue::getSelect(Sel, V1, V2); diff --git a/llvm/test/Transforms/GVN/PRE/pre-load-through-select.ll b/llvm/test/Transforms/GVN/PRE/pre-load-through-select.ll --- a/llvm/test/Transforms/GVN/PRE/pre-load-through-select.ll +++ b/llvm/test/Transforms/GVN/PRE/pre-load-through-select.ll @@ -835,24 +835,31 @@ ; CHECK-LABEL: @test_phi_select_index_non_local( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I:%.*]], [[N:%.*]] -; CHECK-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[IF_END:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[ENTRY_IF_END_CRIT_EDGE:%.*]] +; CHECK: entry.if.end_crit_edge: +; CHECK-NEXT: [[IDXPROM5_PHI_TRANS_INSERT:%.*]] = sext i32 [[I]] to i64 +; CHECK-NEXT: [[ARRAYIDX6_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM5_PHI_TRANS_INSERT]] +; CHECK-NEXT: [[DOTPRE:%.*]] = load i32, ptr [[ARRAYIDX6_PHI_TRANS_INSERT]], align 4 +; CHECK-NEXT: br label [[IF_END:%.*]] ; CHECK: land.lhs.true: ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDXPROM]] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[I]], 1 ; CHECK-NEXT: [[IDXPROM1:%.*]] = sext i32 [[ADD]] to i64 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDXPROM1]] ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[CMP3:%.*]] = icmp slt i32 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[CMP3]], i32 [[TMP1]], i32 [[TMP0]] ; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP3]], i32 [[ADD]], i32 [[I]] +; CHECK-NEXT: [[DOTPRE1:%.*]] = sext i32 [[SPEC_SELECT]] to i64 ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: -; CHECK-NEXT: [[I_ADDR_0:%.*]] = phi i32 [ [[I]], [[ENTRY:%.*]] ], [ [[SPEC_SELECT]], [[LAND_LHS_TRUE]] ] -; CHECK-NEXT: [[IDXPROM5:%.*]] = sext i32 [[I_ADDR_0]] to i64 -; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4 -; CHECK-NEXT: ret i32 [[TMP2]] +; CHECK-NEXT: [[IDXPROM5_PRE_PHI:%.*]] = phi i64 [ [[IDXPROM5_PHI_TRANS_INSERT]], [[ENTRY_IF_END_CRIT_EDGE]] ], [ [[DOTPRE1]], [[LAND_LHS_TRUE]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[DOTPRE]], [[ENTRY_IF_END_CRIT_EDGE]] ], [ [[TMP2]], [[LAND_LHS_TRUE]] ] +; CHECK-NEXT: [[I_ADDR_0:%.*]] = phi i32 [ [[I]], [[ENTRY_IF_END_CRIT_EDGE]] ], [ [[SPEC_SELECT]], [[LAND_LHS_TRUE]] ] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDXPROM5_PRE_PHI]] +; CHECK-NEXT: ret i32 [[TMP3]] ; entry: %cmp = icmp slt i32 %i, %N @@ -884,17 +891,19 @@ ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[N:%.*]], 1 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: for.body.preheader: +; CHECK-NEXT: [[DOTPRE:%.*]] = load i32, ptr [[A:%.*]], align 4 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[TMP2:%.*]], [[FOR_BODY]] ], [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ] ; CHECK-NEXT: [[IDX:%.*]] = phi i32 [ [[IDX_NEXT:%.*]], [[FOR_BODY]] ], [ 1, [[FOR_BODY_PREHEADER]] ] ; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[IDXPROM1:%.*]] = sext i32 [[RES]] to i64 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDXPROM1]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[TMP2]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[TMP0]] ; CHECK-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[IDX]], i32 [[RES]] ; CHECK-NEXT: [[IDX_NEXT]] = add nsw i32 [[IDX]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IDX_NEXT]], [[N]]