Index: llvm/lib/Transforms/Scalar/LICM.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/LICM.cpp
+++ llvm/lib/Transforms/Scalar/LICM.cpp
@@ -96,6 +96,10 @@
     DisablePromotion("disable-licm-promotion", cl::Hidden, cl::init(false),
                      cl::desc("Disable memory promotion in LICM pass"));
 
+static cl::opt<unsigned> MaxNumPromotions(
+    "licm-max-num-promotions", cl::Hidden, cl::init(0),
+    cl::desc("The maximum number of memory instructions to be hoisted"));
+
 static cl::opt<bool> ControlFlowHoisting(
     "licm-control-flow-hoisting", cl::Hidden, cl::init(false),
     cl::desc("Enable control flow (and PHI) hoisting in LICM"));
@@ -328,6 +332,20 @@
   }
 }
 
+static void SetMaxNumPromotions(TargetTransformInfo *TTI) {
+  // TODO: if the maximum number of promotions isn't user specified, we
+  // set it to the number of available scalar registers. This is a very
+  // first step to not let the register pressure explode for some cases.
+  // For now, some instructions are not considered for hoisting when the
+  // maximum is reached, but better would be to come up with a register
+  // pressure estimator.
+  if (!MaxNumPromotions)
+    MaxNumPromotions =
+        TTI->getNumberOfRegisters(TTI->getRegisterClassForType(false));
+  LLVM_DEBUG(dbgs() << "LICM: Using promotion maximum: " << MaxNumPromotions
+                    << "\n");
+}
+
 /// Hoist expressions out of the specified loop. Note, alias info for inner
 /// loop is not preserved so it is not a good idea to run LICM multiple
 /// times on one loop.
@@ -345,6 +363,8 @@
     return false;
   }
 
+  SetMaxNumPromotions(TTI);
+
   std::unique_ptr<AliasSetTracker> CurAST;
   std::unique_ptr<MemorySSAUpdater> MSSAU;
   std::unique_ptr<SinkAndHoistLICMFlags> Flags;
@@ -425,6 +445,7 @@
       if (!CurAST.get())
         CurAST = collectAliasInfoForLoopWithMSSA(L, AA, MSSAU.get());
 
+      unsigned NumPromoted = 0;
       // Loop over all of the alias sets in the tracker object.
       for (AliasSet &AS : *CurAST) {
         // We can promote this alias set if it has a store, if it is a "Must"
@@ -442,9 +463,17 @@
         for (const auto &ASI : AS)
           PointerMustAliases.insert(ASI.getValue());
 
-        Promoted |= promoteLoopAccessesToScalars(
+        if (NumPromoted == MaxNumPromotions) {
+          LLVM_DEBUG(dbgs() << "LICM: promotion maximum reached, not hoisting\n");
+          break;
+        }
+
+        if (promoteLoopAccessesToScalars(
             PointerMustAliases, ExitBlocks, InsertPts, MSSAInsertPts, PIC, LI,
-            DT, TLI, L, CurAST.get(), MSSAU.get(), &SafetyInfo, ORE);
+            DT, TLI, L, CurAST.get(), MSSAU.get(), &SafetyInfo, ORE)) {
+          Promoted = true;
+          NumPromoted++;
+        }
       }
 
       // Once we have promoted values across the loop body we have to
@@ -2158,6 +2187,7 @@
            << "Moving accesses to memory location out of the loop";
       });
   ++NumPromoted;
+  ++NumHoisted;
 
   // Look at all the loop uses, and try to merge their locations.
   std::vector<const DILocation *> LoopUsesLocs;
Index: llvm/test/Transforms/LICM/max-num-hoist.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LICM/max-num-hoist.ll
@@ -0,0 +1,96 @@
+; RUN: opt -S -licm < %s | FileCheck %s --check-prefix=DEFAULT8
+; RUN: opt -S -licm -licm-max-num-promotions=2 < %s | FileCheck %s --check-prefix=MAX2
+
+target datalayout = "E-m:e-p:32:32-i8:8:8-i16:16:16-i64:32:32-f64:32:32-v64:32:32-v128:32:32-a0:0:32-n32"
+
+@test.F = internal global [8 x [8 x i64]] zeroinitializer, align 8
+@test.f = internal global [8 x [8 x i8]] zeroinitializer, align 1
+
+; MAX2: @test
+; MAX2: entry:
+; MAX2-NEXT: %A = alloca [8 x [8 x i64]], align 8
+; MAX2-NEXT: %i = bitcast [8 x [8 x i64]]* %A to i8*
+; MAX2-NEXT: %.promoted = load i64, i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @test.F, i64 0, i64 0, i64 0), align 8
+; MAX2-NEXT: %.promoted2 = load i64, i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @test.F, i64 0, i64 0, i64 1), align 8
+; MAX2-NEXT: br label %for.cond16.preheader
+
+; DEFAULT8: @test
+; DEFAULT8: entry:
+; DEFAULT8-NEXT: %A = alloca [8 x [8 x i64]], align 8
+; DEFAULT8-NEXT: %i = bitcast [8 x [8 x i64]]* %A to i8*
+; DEFAULT8-NEXT: %.promoted = load i64, i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @test.F, i64 0, i64 0, i64 0), align 8
+; DEFAULT8-NEXT: %.promoted2 = load i64, i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @test.F, i64 0, i64 0, i64 1), align 8
+; DEFAULT8-NEXT: %.promoted4 = load i64, i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @test.F, i64 0, i64 0, i64 2), align 8
+; DEFAULT8-NEXT: %.promoted6 = load i64, i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @test.F, i64 0, i64 0, i64 3), align 8
+; DEFAULT8-NEXT: %.promoted8 = load i64, i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @test.F, i64 0, i64 0, i64 4), align 8
+; DEFAULT8-NEXT: %.promoted10 = load i64, i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @test.F, i64 0, i64 0, i64 5), align 8
+; DEFAULT8-NEXT: %.promoted12 = load i64, i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @test.F, i64 0, i64 0, i64 6), align 8
+; DEFAULT8-NEXT: %.promoted14 = load i64, i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @test.F, i64 0, i64 0, i64 7), align 8
+; DEFAULT8-NEXT: br label %for.cond16.preheader
+
+define void @test() {
+entry:
+  %A = alloca [8 x [8 x i64]], align 8
+  %i = bitcast [8 x [8 x i64]]* %A to i8*
+  br label %for.cond16.preheader
+
+for.cond16.preheader:                             ; preds = %for.cond16.preheader, %entry
+  %storemerge3844 = phi i64 [ 0, %entry ], [ %inc35, %for.cond16.preheader ]
+  %arrayidx25 = getelementptr inbounds [8 x [8 x i64]], [8 x [8 x i64]]* %A, i64 0, i64 %storemerge3844, i64 0
+  %arrayidx23 = getelementptr inbounds [8 x [8 x i8]], [8 x [8 x i8]]* @test.f, i64 0, i64 0, i64 %storemerge3844
+  %i7 = load i8, i8* %arrayidx23, align 1
+  %conv = sext i8 %i7 to i64
+  %i8 = load i64, i64* %arrayidx25, align 8
+  %mul = mul nsw i64 %i8, %conv
+  %i9 = load i64, i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @test.F, i64 0, i64 0, i64 0), align 8
+  %add = add nsw i64 %i9, %mul
+  store i64 %add, i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @test.F, i64 0, i64 0, i64 0), align 8
+  %arrayidx25.1 = getelementptr inbounds [8 x [8 x i64]], [8 x [8 x i64]]* %A, i64 0, i64 %storemerge3844, i64 1
+  %i10 = load i64, i64* %arrayidx25.1, align 8
+  %mul.1 = mul nsw i64 %i10, %conv
+  %i11 = load i64, i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @test.F, i64 0, i64 0, i64 1), align 8
+  %add.1 = add nsw i64 %i11, %mul.1
+  store i64 %add.1, i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @test.F, i64 0, i64 0, i64 1), align 8
+  %arrayidx25.2 = getelementptr inbounds [8 x [8 x i64]], [8 x [8 x i64]]* %A, i64 0, i64 %storemerge3844, i64 2
+  %i12 = load i64, i64* %arrayidx25.2, align 8
+  %mul.2 = mul nsw i64 %i12, %conv
+  %i13 = load i64, i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @test.F, i64 0, i64 0, i64 2), align 8
+  %add.2 = add nsw i64 %i13, %mul.2
+  store i64 %add.2, i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @test.F, i64 0, i64 0, i64 2), align 8
+  %arrayidx25.3 = getelementptr inbounds [8 x [8 x i64]], [8 x [8 x i64]]* %A, i64 0, i64 %storemerge3844, i64 3
+  %i14 = load i64, i64* %arrayidx25.3, align 8
+  %mul.3 = mul nsw i64 %i14, %conv
+  %i15 = load i64, i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @test.F, i64 0, i64 0, i64 3), align 8
+  %add.3 = add nsw i64 %i15, %mul.3
+  store i64 %add.3, i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @test.F, i64 0, i64 0, i64 3), align 8
+  %arrayidx25.4 = getelementptr inbounds [8 x [8 x i64]], [8 x [8 x i64]]* %A, i64 0, i64 %storemerge3844, i64 4
+  %i16 = load i64, i64* %arrayidx25.4, align 8
+  %mul.4 = mul nsw i64 %i16, %conv
+  %i17 = load i64, i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @test.F, i64 0, i64 0, i64 4), align 8
+  %add.4 = add nsw i64 %i17, %mul.4
+  store i64 %add.4, i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @test.F, i64 0, i64 0, i64 4), align 8
+  %arrayidx25.5 = getelementptr inbounds [8 x [8 x i64]], [8 x [8 x i64]]* %A, i64 0, i64 %storemerge3844, i64 5
+  %i18 = load i64, i64* %arrayidx25.5, align 8
+  %mul.5 = mul nsw i64 %i18, %conv
+  %i19 = load i64, i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @test.F, i64 0, i64 0, i64 5), align 8
+  %add.5 = add nsw i64 %i19, %mul.5
+  store i64 %add.5, i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @test.F, i64 0, i64 0, i64 5), align 8
+  %arrayidx25.6 = getelementptr inbounds [8 x [8 x i64]], [8 x [8 x i64]]* %A, i64 0, i64 %storemerge3844, i64 6
+  %i20 = load i64, i64* %arrayidx25.6, align 8
+  %mul.6 = mul nsw i64 %i20, %conv
+  %i21 = load i64, i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @test.F, i64 0, i64 0, i64 6), align 8
+  %add.6 = add nsw i64 %i21, %mul.6
+  store i64 %add.6, i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @test.F, i64 0, i64 0, i64 6), align 8
+  %arrayidx25.7 = getelementptr inbounds [8 x [8 x i64]], [8 x [8 x i64]]* %A, i64 0, i64 %storemerge3844, i64 7
+  %i22 = load i64, i64* %arrayidx25.7, align 8
+  %mul.7 = mul nsw i64 %i22, %conv
+  %i23 = load i64, i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @test.F, i64 0, i64 0, i64 7), align 8
+  %add.7 = add nsw i64 %i23, %mul.7
+  store i64 %add.7, i64* getelementptr inbounds ([8 x [8 x i64]], [8 x [8 x i64]]* @test.F, i64 0, i64 0, i64 7), align 8
+  %inc35 = add nuw nsw i64 %storemerge3844, 1
+  %exitcond47.not = icmp eq i64 %inc35, 8
+  br i1 %exitcond47.not, label %for.end36, label %for.cond16.preheader
+
+for.end36:
+  ret void
+}
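
Note (not part of the patch): for readers skimming the diff, here is a minimal
standalone sketch of the default-cap heuristic that SetMaxNumPromotions applies
when -licm-max-num-promotions is left at its default of 0. The helper name
computePromotionCap is illustrative only; the TargetTransformInfo calls are the
same ones the patch itself uses.

  #include "llvm/Analysis/TargetTransformInfo.h"

  using namespace llvm;

  // Hypothetical helper mirroring SetMaxNumPromotions: with no user-supplied
  // cap (UserMax == 0), fall back to the target's scalar register count, a
  // crude stand-in for a real register-pressure estimate.
  static unsigned computePromotionCap(const TargetTransformInfo &TTI,
                                      unsigned UserMax) {
    if (UserMax)
      return UserMax;
    // getRegisterClassForType(false) selects the scalar register class;
    // getNumberOfRegisters reports how many registers that class contains.
    return TTI.getNumberOfRegisters(TTI.getRegisterClassForType(false));
  }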