diff --git a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
--- a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
+++ b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
@@ -255,6 +255,34 @@
 ///   %t2 = phi i32 [ %t0, %else_bb ], [ %t1, %then_bb ]
 ///   br %normal_dst
 ///
+/// An indirect musttail call is processed slightly differently in that:
+/// 1. No merge block is needed for the original and the cloned callsite, since
+///    either one ends the flow. No phi node is needed either.
+/// 2. The return statement following the original call site is duplicated too
+///    and placed immediately after the cloned call site per the IR convention.
+///
+/// For example, the musttail call instruction below:
+///
+/// orig_bb:
+///   %t0 = musttail call i32 %ptr()
+///   ...
+///
+/// Is replaced by the following:
+///
+/// cond_bb:
+///   %cond = icmp eq i32 ()* %ptr, @func
+///   br i1 %cond, %then_bb, %orig_bb
+///
+/// then_bb:
+///   ; The clone of the original call instruction is placed in the "then"
+///   ; block. It is not yet promoted.
+///   %t1 = musttail call i32 %ptr()
+///   ret %t1
+///
+/// orig_bb:
+///   ; The original call instruction stays in its original block.
+///   %t0 = musttail call i32 %ptr()
+///   ret %t0
 static CallBase &versionCallSite(CallBase &CB, Value *Callee,
                                  MDNode *BranchWeights) {
 
@@ -268,6 +296,44 @@
     Callee = Builder.CreateBitCast(Callee, CB.getCalledOperand()->getType());
   auto *Cond = Builder.CreateICmpEQ(CB.getCalledOperand(), Callee);
 
+  if (OrigInst->isMustTailCall()) {
+    // Create an if-then structure. The original instruction stays in its block,
+    // and a clone of the original instruction is placed in the "then" block.
+    Instruction *ThenTerm =
+        SplitBlockAndInsertIfThen(Cond, &CB, false, BranchWeights);
+    BasicBlock *ThenBlock = ThenTerm->getParent();
+    ThenBlock->setName("if.true.direct_targ");
+    CallBase *NewInst = cast<CallBase>(OrigInst->clone());
+    NewInst->insertBefore(ThenTerm);
+
+    // Place a clone of the optional bitcast after the new call site.
+    Value *NewRetVal = NewInst;
+    auto Next = OrigInst->getNextNode();
+    if (auto *BitCast = dyn_cast_or_null<BitCastInst>(Next)) {
+      assert(BitCast->getOperand(0) == OrigInst &&
+             "bitcast following musttail call must use the call");
+      auto NewBitCast = BitCast->clone();
+      NewBitCast->replaceUsesOfWith(OrigInst, NewInst);
+      NewBitCast->insertBefore(ThenTerm);
+      NewRetVal = NewBitCast;
+      Next = BitCast->getNextNode();
+    }
+
+    // Place a clone of the return instruction after the new call site.
+    ReturnInst *Ret = dyn_cast_or_null<ReturnInst>(Next);
+    assert(Ret && "musttail call must precede a ret with an optional bitcast");
+    auto NewRet = Ret->clone();
+    if (Ret->getReturnValue())
+      NewRet->replaceUsesOfWith(Ret->getReturnValue(), NewRetVal);
+    NewRet->insertBefore(ThenTerm);
+
+    // A return instruction is a terminator, so we don't need the terminator
+    // instruction just created.
+    ThenTerm->eraseFromParent();
+
+    return *NewInst;
+  }
+
   // Create an if-then-else structure. The original instruction is moved into
   // the "else" block, and a clone of the original instruction is placed in the
   // "then" block.
diff --git a/llvm/test/Transforms/PGOProfile/indirect_call_promotion_musttail.ll b/llvm/test/Transforms/PGOProfile/indirect_call_promotion_musttail.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/indirect_call_promotion_musttail.ll
@@ -0,0 +1,68 @@
+; RUN: opt < %s -pgo-icall-prom -S | FileCheck %s --check-prefix=ICALL-PROM
+; RUN: opt < %s -passes=pgo-icall-prom -S | FileCheck %s --check-prefix=ICALL-PROM
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@foo = common global i32* ()* null, align 8
+
+declare i32* @func1()
+
+declare i32* @func2()
+
+declare i32* @func3()
+
+declare i32* @func4()
+
+define i32* @bar() {
+entry:
+  %tmp = load i32* ()*, i32* ()** @foo, align 8
+; ICALL-PROM: [[CMP1:%[0-9]+]] = icmp eq i32* ()* %tmp, @func4
+; ICALL-PROM: br i1 [[CMP1]], label %if.true.direct_targ, label %[[L1:[0-9]+]], !prof [[BRANCH_WEIGHT1:![0-9]+]]
+; ICALL-PROM: if.true.direct_targ:
+; ICALL-PROM: [[DIRCALL_RET1:%[0-9]+]] = musttail call i32* @func4()
+; ICALL-PROM: ret i32* [[DIRCALL_RET1]]
+; ICALL-PROM: [[L1]]:
+; ICALL-PROM: [[CMP2:%[0-9]+]] = icmp eq i32* ()* %tmp, @func2
+; ICALL-PROM: br i1 [[CMP2]], label %if.true.direct_targ1, label %[[L2:[0-9]+]], !prof [[BRANCH_WEIGHT2:![0-9]+]]
+; ICALL-PROM: if.true.direct_targ1:
+; ICALL-PROM: [[DIRCALL_RET2:%[0-9]+]] = musttail call i32* @func2()
+; ICALL-PROM: ret i32* [[DIRCALL_RET2]]
+; ICALL-PROM: [[L2]]:
+; ICALL-PROM: [[CMP3:%[0-9]+]] = icmp eq i32* ()* %tmp, @func3
+; ICALL-PROM: br i1 [[CMP3]], label %if.true.direct_targ2, label %[[L3:[0-9]+]], !prof [[BRANCH_WEIGHT3:![0-9]+]]
+; ICALL-PROM: if.true.direct_targ2:
+; ICALL-PROM: [[DIRCALL_RET3:%[0-9]+]] = musttail call i32* @func3()
+; ICALL-PROM: ret i32* [[DIRCALL_RET3]]
+; ICALL-PROM: [[L3]]:
+; ICALL-PROM: %call = musttail call i32* %tmp()
+; ICALL-PROM: ret i32* %call
+  %call = musttail call i32* %tmp(), !prof !1
+  ret i32* %call
+}
+
+define i64* @bar2() {
+entry:
+  %tmp = load i32* ()*, i32* ()** @foo, align 8
+; ICALL-PROM: [[CMP1:%[0-9]+]] = icmp eq i32* ()* %tmp, @func4
+; ICALL-PROM: br i1 [[CMP1]], label %if.true.direct_targ, label %[[L4:[0-9]+]], !prof [[BRANCH_WEIGHT4:![0-9]+]]
+; ICALL-PROM: if.true.direct_targ:
+; ICALL-PROM: [[DIRCALL_RET1:%[0-9]+]] = musttail call i32* @func4()
+; ICALL-PROM: [[DIRCALL_RET2:%[0-9]+]] = bitcast i32* [[DIRCALL_RET1]] to i64*
+; ICALL-PROM: ret i64* [[DIRCALL_RET2]]
+; ICALL-PROM: [[L4]]:
+; ICALL-PROM: %call = musttail call i32* %tmp()
+; ICALL-PROM: %rv = bitcast i32* %call to i64*
+; ICALL-PROM: ret i64* %rv
+  %call = musttail call i32* %tmp(), !prof !2
+  %rv = bitcast i32* %call to i64*
+  ret i64* %rv
+}
+
+!1 = !{!"VP", i32 0, i64 1600, i64 7651369219802541373, i64 1030, i64 -4377547752858689819, i64 410, i64 -6929281286627296573, i64 150, i64 -2545542355363006406, i64 10}
+!2 = !{!"VP", i32 0, i64 100, i64 7651369219802541373, i64 100}
+
+; ICALL-PROM: [[BRANCH_WEIGHT1]] = !{!"branch_weights", i32 1030, i32 570}
+; ICALL-PROM: [[BRANCH_WEIGHT2]] = !{!"branch_weights", i32 410, i32 160}
+; ICALL-PROM: [[BRANCH_WEIGHT3]] = !{!"branch_weights", i32 150, i32 10}
+; ICALL-PROM: [[BRANCH_WEIGHT4]] = !{!"branch_weights", i32 100, i32 0}