diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -789,6 +789,41 @@
 };
 } // namespace
+
+/// Return true if we know how to rewrite all uses of the given alloca after
+/// hoisting it out of the loop.  The main concerns are a) potential captures
+/// and b) invariant.start markers which don't capture, but are no longer
+/// valid without a corresponding invariant.end.
+static bool canRewriteUsesOfAlloca(AllocaInst &AI) {
+  // TODO: This looks a lot like capture tracking, but we need to remove any
+  // invariant starts if we extend the lifetime of the alloca by hoisting it.
+  // We should probably refactor capture tracking into a form which allows us
+  // to reuse the relevant bits and remove the duplicated logic here.
+
+  SmallVector<Use *, 16> Worklist;
+  for (Use &U : AI.uses())
+    Worklist.push_back(&U);
+
+  unsigned NumUsesExplored = 0;
+  while (!Worklist.empty()) {
+    Use *U = Worklist.pop_back_val();
+    Instruction *I = cast<Instruction>(U->getUser());
+    NumUsesExplored++;
+    if (NumUsesExplored > DefaultMaxUsesToExplore)
+      return false;
+    // Non-capturing, terminating uses: loads, and stores *to* the alloca.
+    if (isa<LoadInst>(I) ||
+        (isa<StoreInst>(I) && U->getOperandNo() == 1))
+      continue;
+    // Non-capturing, non-terminating uses: look through bitcasts and GEPs.
+    if (!isa<BitCastInst>(I) && !isa<GetElementPtrInst>(I))
+      return false;
+    for (Use &U : I->uses())
+      Worklist.push_back(&U);
+  }
+  return true;
+}
+
 /// Walk the specified region of the CFG (defined by all blocks dominated by
 /// the specified block, and that are in the current loop) in depth first
 /// order w.r.t the DominatorTree.  This allows us to visit definitions before
@@ -909,6 +944,16 @@
         continue;
       }
 
+      if (isa<AllocaInst>(&I) &&
+          SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop) &&
+          canRewriteUsesOfAlloca(cast<AllocaInst>(I))) {
+        hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo,
+              MSSAU, SE, ORE);
+        HoistedInstructions.push_back(&I);
+        Changed = true;
+        continue;
+      }
+
       if (PHINode *PN = dyn_cast<PHINode>(&I)) {
         if (CFH.canHoistPHI(PN)) {
           // Redirect incoming blocks first to ensure that we create hoisted
diff --git a/llvm/test/Transforms/LICM/hoist-alloca.ll b/llvm/test/Transforms/LICM/hoist-alloca.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LICM/hoist-alloca.ll
@@ -0,0 +1,168 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -licm < %s | FileCheck %s
+
+@G = external global i64
+
+define void @test(i64 %n) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca i64
+; CHECK-NEXT:    [[VAL:%.*]] = load i64, i64* [[A]]
+; CHECK-NEXT:    store i64 [[VAL]], i64* @G
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ult i64 [[IV]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %a = alloca i64
+  %val = load i64, i64* %a
+  store i64 %val, i64* @G
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp ult i64 %iv, %n
+  br i1 %exitcond, label %for.body, label %exit
+exit:
+  ret void
+}
+
+define void @test2(i64 %n) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ult i64 [[IV]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], [[FOR_BODY]] ]
+; CHECK-NEXT:    store i64 [[IV_LCSSA]], i64* [[A]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %a = alloca i64
+  store i64 %iv, i64* %a
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp ult i64 %iv, %n
+  br i1 %exitcond, label %for.body, label %exit
+exit:
+  ret void
+}
+
+
+define void @test3(i64 %n) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca i64
+; CHECK-NEXT:    [[A_I8:%.*]] = bitcast i64* [[A]] to i8*
+; CHECK-NEXT:    [[A_OFFSET:%.*]] = getelementptr i8, i8* [[A_I8]], i64 4
+; CHECK-NEXT:    store i8 0, i8* [[A_OFFSET]]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ult i64 [[IV]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %a = alloca i64
+  %a.i8 = bitcast i64* %a to i8*
+  %a.offset = getelementptr i8, i8* %a.i8, i64 4
+  store i8 0, i8* %a.offset
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp ult i64 %iv, %n
+  br i1 %exitcond, label %for.body, label %exit
+exit:
+  ret void
+}
+
+; This example is subtle.  Because a dynamic alloca isn't reclaimed until the
+; end of the function, a pointer captured inside the loop can legally point to
+; the stack region allocated by a previous iteration, so the per-iteration
+; allocas can't simply be merged by hoisting.
+define void @test4(i64 %n) {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[A:%.*]] = alloca i64
+; CHECK-NEXT:    store i64 [[IV]], i64* [[A]]
+; CHECK-NEXT:    call void @capture(i64* [[A]])
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ult i64 [[IV]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %a = alloca i64
+  store i64 %iv, i64* %a
+  %a.i8 = bitcast i64* %a to i8*
+  call void @capture(i64* %a)
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp ult i64 %iv, %n
+  br i1 %exitcond, label %for.body, label %exit
+exit:
+  ret void
+}
+declare void @capture(i64* %a)
+
+
+; TODO: not yet handled - hoisting would require removing the invariant.start
+; marker (see canRewriteUsesOfAlloca), which we don't do yet.
+define void @test5(i64 %n) {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[A:%.*]] = alloca i64
+; CHECK-NEXT:    store i64 [[IV]], i64* [[A]]
+; CHECK-NEXT:    [[A_I8:%.*]] = bitcast i64* [[A]] to i8*
+; CHECK-NEXT:    [[TMP0:%.*]] = call {}* @llvm.invariant.start.p0i8(i64 8, i8* [[A_I8]])
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ult i64 [[IV]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %a = alloca i64
+  store i64 %iv, i64* %a
+  %a.i8 = bitcast i64* %a to i8*
+  call {}* @llvm.invariant.start.p0i8(i64 8, i8* %a.i8)
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp ult i64 %iv, %n
+  br i1 %exitcond, label %for.body, label %exit
+exit:
+  ret void
+}
+
+declare {}* @llvm.invariant.start.p0i8(i64, i8* nocapture) nounwind readonly
+
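For context only, not part of the patch above: the TODO in canRewriteUsesOfAlloca suggests that invariant.start users could eventually be handled by deleting the markers once hoisting extends the alloca's lifetime, which would let test5 be hoisted as well. Below is a rough sketch of what such a helper could look like. The name dropInvariantStartsOnAlloca and the traversal are assumptions made for illustration and do not exist in LICM; the LLVM APIs used (IntrinsicInst, Intrinsic::invariant_start, eraseFromParent) are real.

// Illustrative sketch only (not part of the patch above): strip
// llvm.invariant.start markers reachable from AI through the same
// bitcast/GEP chains that canRewriteUsesOfAlloca walks.
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
using namespace llvm;

// Hypothetical helper, to be called before hoisting the alloca.
static void dropInvariantStartsOnAlloca(AllocaInst &AI) {
  SmallVector<Use *, 16> Worklist;
  SmallVector<Instruction *, 4> ToErase;
  for (Use &U : AI.uses())
    Worklist.push_back(&U);
  while (!Worklist.empty()) {
    Instruction *I = cast<Instruction>(Worklist.pop_back_val()->getUser());
    if (auto *II = dyn_cast<IntrinsicInst>(I))
      if (II->getIntrinsicID() == Intrinsic::invariant_start &&
          II->use_empty()) // a paired invariant.end would need more care
        ToErase.push_back(II);
    // Follow the same address-computation chain as canRewriteUsesOfAlloca.
    if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I))
      for (Use &U : I->uses())
        Worklist.push_back(&U);
  }
  for (Instruction *I : ToErase)
    I->eraseFromParent();
}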