Index: llvm/include/llvm/Transforms/Scalar/GVN.h =================================================================== --- llvm/include/llvm/Transforms/Scalar/GVN.h +++ llvm/include/llvm/Transforms/Scalar/GVN.h @@ -370,7 +370,7 @@ void replaceInstruction(Instruction *I, Instruction *Repl); std::pair hoistPair(BasicBlock *DestBB, BasicBlock *ThenBB, BasicBlock *ElseBB, Instruction *ThenI, - Instruction *ElseI); + Instruction *ElseI, uint32_t DepLength); Value *findLeader(const BasicBlock *BB, uint32_t num); void cleanupGlobalSets(); void verifyRemoved(const Instruction *I) const; Index: llvm/lib/Transforms/Scalar/GVN.cpp =================================================================== --- llvm/lib/Transforms/Scalar/GVN.cpp +++ llvm/lib/Transforms/Scalar/GVN.cpp @@ -128,6 +128,13 @@ static cl::opt GVNEnableSimpleGVNHoist("enable-simple-gvn-hoist", cl::init(true)); +static cl::opt MaxBlockDepth( + "simple-gvn-hoist-max-block-depth", cl::Hidden, cl::init(100), + cl::desc("Limit hoist candidates to the first N instructions in a block")); + +static cl::opt MaxChainLength( + "simple-gvn-hoist-max-chain-length", cl::Hidden, cl::init(10), + cl::desc("Limit the length of dependency chains")); struct llvm::GVN::Expression { uint32_t opcode; @@ -3120,8 +3127,11 @@ } void GVN::collectHoistCandidates(BasicBlock *BB) { + uint32_t Depth = 0; bool HasHoistBarrier = false; for (Instruction &I : *BB) { + if (++Depth > MaxBlockDepth) + break; if (I.isTerminator()) break; if (isa(I)) @@ -3137,8 +3147,11 @@ } void GVN::matchHoistCandidates(BasicBlock *BB) { + uint32_t Depth = 0; bool HasHoistBarrier = false; for (Instruction &I : *BB) { + if (++Depth > MaxBlockDepth) + break; if (I.isTerminator()) break; if (isa(I)) @@ -3198,11 +3211,15 @@ // block. std::pair GVN::hoistPair(BasicBlock *DestBB, BasicBlock *ThenBB, BasicBlock *ElseBB, Instruction *ThenI, - Instruction *ElseI) { + Instruction *ElseI, uint32_t DepLength) { // If the instruction is moved out of the "then" block there's nothing to do. if (ThenI->getParent() != ThenBB) return {false, false}; + // Check if we have exceeded dependency chain length limit. + if (DepLength > MaxChainLength) + return {false, true}; + // Instruction must have already been selected for hoisting and matched with // another instruction. auto It = HoistPairs.find(VN.lookupOrAdd(ThenI)); @@ -3228,7 +3245,7 @@ continue; bool OpChange, CantHoist; std::tie(OpChange, CantHoist) = - hoistPair(DestBB, ThenBB, ElseBB, Op, nullptr); + hoistPair(DestBB, ThenBB, ElseBB, Op, nullptr, DepLength + 1); Change |= OpChange; if (CantHoist) return {Change, true}; @@ -3250,9 +3267,9 @@ MemoryAccess *ThenMA = Walker->getClobberingMemoryAccess(ThenI); while (!MSSA->isLiveOnEntryDef(ThenMA) && ThenMA->getBlock() == ThenBB) { bool DepChange, CantHoist; - std::tie(DepChange, CantHoist) = - hoistPair(DestBB, ThenBB, ElseBB, - cast(ThenMA)->getMemoryInst(), nullptr); + std::tie(DepChange, CantHoist) = hoistPair( + DestBB, ThenBB, ElseBB, cast(ThenMA)->getMemoryInst(), + nullptr, DepLength + 1); Change |= DepChange; if (CantHoist) return {Change, true}; @@ -3283,7 +3300,7 @@ for (Instruction *I : MemoryUses) { bool DepChange, CantHoist; std::tie(DepChange, CantHoist) = - hoistPair(DestBB, ThenBB, ElseBB, I, nullptr); + hoistPair(DestBB, ThenBB, ElseBB, I, nullptr, DepLength + 1); Change |= DepChange; if (CantHoist) return {Change, true}; @@ -3351,8 +3368,12 @@ // Hoist matched pairs. for (const auto &P : HoistPairs) { bool LocalChange, _; - std::tie(LocalChange, _) = - hoistPair(BB, Then, Else, P.second.first, P.second.second); + Instruction *ThenI = P.second.first, *ElseI = P.second.second; + // Don't initiate hoisting from GEPs (they still can be hoisted by their + // users). + if (isa(ThenI)) + continue; + std::tie(LocalChange, _) = hoistPair(BB, Then, Else, ThenI, ElseI, 0); Change |= LocalChange; } } Index: llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll =================================================================== --- llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll +++ llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll @@ -163,16 +163,16 @@ ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !alias.scope !8 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>* -; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4, !alias.scope !11 -; CHECK-NEXT: [[TMP8:%.*]] = fmul <4 x float> [[WIDE_LOAD14]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP5]] to <4 x float>* +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4, !alias.scope !11 +; CHECK-NEXT: [[TMP7:%.*]] = fmul <4 x float> [[WIDE_LOAD14]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>* ; CHECK-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x float>, <4 x float>* [[TMP9]], align 4, !alias.scope !13, !noalias !15 -; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x float> [[TMP8]], [[WIDE_LOAD15]] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x float> [[TMP8]], <4 x float> [[TMP10]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP5]] to <4 x float>* +; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x float> [[TMP7]], [[WIDE_LOAD15]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x float> [[TMP7]], <4 x float> [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP8]] to <4 x float>* ; CHECK-NEXT: store <4 x float> [[PREDPHI]], <4 x float>* [[TMP11]], align 4, !alias.scope !13, !noalias !15 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 @@ -182,18 +182,18 @@ ; CHECK-NEXT: [[C_GEP:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[IV1]] ; CHECK-NEXT: [[C_LV:%.*]] = load i32, i32* [[C_GEP]], align 4 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[C_LV]], 20 -; CHECK-NEXT: [[B_GEP_0:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV1]] ; CHECK-NEXT: [[A_GEP_0:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV1]] ; CHECK-NEXT: [[A_LV_0:%.*]] = load float, float* [[A_GEP_0]], align 4 ; CHECK-NEXT: [[MUL2_I81_I:%.*]] = fmul float [[A_LV_0]], [[X]] +; CHECK-NEXT: [[B_GEP_0:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV1]] ; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_LATCH]], label [[ELSE:%.*]] ; CHECK: else: ; CHECK-NEXT: [[B_LV:%.*]] = load float, float* [[B_GEP_0]], align 4 ; CHECK-NEXT: [[ADD:%.*]] = fadd float [[MUL2_I81_I]], [[B_LV]] ; CHECK-NEXT: br label [[LOOP_LATCH]] ; CHECK: loop.latch: -; CHECK-NEXT: [[STOREMERGE:%.*]] = phi float [ [[ADD]], [[ELSE]] ], [ [[MUL2_I81_I]], [[LOOP_BODY]] ] -; CHECK-NEXT: store float [[STOREMERGE]], float* [[B_GEP_0]], align 4 +; CHECK-NEXT: [[ADD_SINK:%.*]] = phi float [ [[ADD]], [[ELSE]] ], [ [[MUL2_I81_I]], [[LOOP_BODY]] ] +; CHECK-NEXT: store float [[ADD_SINK]], float* [[B_GEP_0]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[CMP_0:%.*]] = icmp ult i64 [[IV1]], 9999 ; CHECK-NEXT: br i1 [[CMP_0]], label [[LOOP_BODY]], label [[EXIT]], !llvm.loop [[LOOP17:![0-9]+]]