diff --git a/clang/test/CodeGenCoroutines/pr56329.cpp b/clang/test/CodeGenCoroutines/pr56329.cpp --- a/clang/test/CodeGenCoroutines/pr56329.cpp +++ b/clang/test/CodeGenCoroutines/pr56329.cpp @@ -1,6 +1,8 @@ // Test for PR56919. Tests the we won't contain the resumption of final suspend point. // // RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 %s -O3 -S -emit-llvm -o - | FileCheck %s +// This test is expected to fail on PowerPC. +// XFAIL: powerpc #include "Inputs/coroutine.h" diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -765,6 +765,9 @@ /// If the target supports tail calls. bool supportsTailCalls() const; + /// If the target supports a tail call for the call site \p CB. + bool supportsTailCallFor(const CallBase *CB) const; + /// Don't restrict interleaved unrolling to small loops. bool enableAggressiveInterleaving(bool LoopHasReductions) const; @@ -1635,6 +1638,7 @@ ArrayRef Tys) = 0; virtual bool supportsEfficientVectorElementLoadStore() = 0; virtual bool supportsTailCalls() = 0; + virtual bool supportsTailCallFor(const CallBase *CB) = 0; virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0; virtual MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const = 0; @@ -2109,6 +2113,9 @@ } bool supportsTailCalls() override { return Impl.supportsTailCalls(); } + bool supportsTailCallFor(const CallBase *CB) override { + return Impl.supportsTailCallFor(CB); + } bool enableAggressiveInterleaving(bool LoopHasReductions) override { return Impl.enableAggressiveInterleaving(LoopHasReductions); diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -343,6 +343,10 @@ bool 
supportsTailCalls() const { return true; } + bool supportsTailCallFor(const CallBase *CB) const { + return supportsTailCalls(); + } + bool enableAggressiveInterleaving(bool LoopHasReductions) const { return false; } diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -528,6 +528,10 @@ return TTIImpl->supportsTailCalls(); } +bool TargetTransformInfo::supportsTailCallFor(const CallBase *CB) const { + return TTIImpl->supportsTailCallFor(CB); +} + bool TargetTransformInfo::enableAggressiveInterleaving( bool LoopHasReductions) const { return TTIImpl->enableAggressiveInterleaving(LoopHasReductions); diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -148,6 +148,7 @@ unsigned AddressSpace, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); + bool supportsTailCallFor(const CallBase *CB) const; private: // The following constant is used for estimating costs on power9. diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -1461,3 +1461,19 @@ // evl but no mask, on Power 9/10. Otherwise, we must scalarize. return getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind); } + +bool PPCTTIImpl::supportsTailCallFor(const CallBase *CB) const { + // Subtargets using PC-relative calls always support it. + if (ST->isUsingPCRelativeCalls()) + return true; + + const Function *Callee = CB->getCalledFunction(); + // Indirect calls and variadic callees are not supported. 
+ if (!Callee || Callee->isVarArg()) + return false; + + const Function *Caller = CB->getCaller(); + // Otherwise supported only if caller and callee can share the TOC base. + return ST->getTargetMachine().shouldAssumeDSOLocal(*Caller->getParent(), + Callee); +} diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -1362,7 +1362,7 @@ // for symmetrical coroutine control transfer (C++ Coroutines TS extension). // This transformation is done only in the resume part of the coroutine that has // identical signature and calling convention as the coro.resume call. -static void addMustTailToCoroResumes(Function &F) { +static void addMustTailToCoroResumes(Function &F, TargetTransformInfo &TTI) { bool changed = false; // Collect potential resume instructions. @@ -1374,7 +1374,9 @@ // Set musttail on those that are followed by a ret instruction. for (CallInst *Call : Resumes) - if (simplifyTerminatorLeadingToRet(Call->getNextNode())) { + // Skip calls the target cannot tail-call in this specific case. + if (TTI.supportsTailCallFor(Call) && + simplifyTerminatorLeadingToRet(Call->getNextNode())) { Call->setTailCallKind(CallInst::TCK_MustTail); changed = true; } @@ -1610,7 +1612,7 @@ // FIXME: Could we support symmetric transfer effectively without musttail // call? if (TTI.supportsTailCalls()) - addMustTailToCoroResumes(*ResumeClone); + addMustTailToCoroResumes(*ResumeClone, TTI); // Store addresses resume/destroy/cleanup functions in the coroutine frame. updateCoroFrame(Shape, ResumeClone, DestroyClone, CleanupClone); diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail-ppc64le.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail-ppc64le.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/Coroutines/coro-split-musttail-ppc64le.ll @@ -0,0 +1,74 @@ +; Tests that some targets (e.g. ppc) support tail calls only under certain conditions. 
+; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S \ +; RUN: -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 | FileCheck %s +; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S \ +; RUN: -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr10 --code-model=medium \ +; RUN: | FileCheck %s --check-prefix=CHECK-PCREL + +define void @f() #0 { +entry: + %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) + %alloc = call i8* @malloc(i64 16) #3 + %vFrame = call noalias nonnull i8* @llvm.coro.begin(token %id, i8* %alloc) + + %save = call token @llvm.coro.save(i8* null) + %addr1 = call i8* @llvm.coro.subfn.addr(i8* null, i8 0) + %pv1 = bitcast i8* %addr1 to void (i8*)* + call fastcc void %pv1(i8* null) + + %suspend = call i8 @llvm.coro.suspend(token %save, i1 false) + switch i8 %suspend, label %exit [ + i8 0, label %await.ready + i8 1, label %exit + ] +await.ready: + %save2 = call token @llvm.coro.save(i8* null) + %addr2 = call i8* @llvm.coro.subfn.addr(i8* null, i8 0) + %pv2 = bitcast i8* %addr2 to void (i8*)* + call fastcc void %pv2(i8* null) + + %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false) + switch i8 %suspend2, label %exit [ + i8 0, label %exit + i8 1, label %exit + ] +exit: + call i1 @llvm.coro.end(i8* null, i1 false) + ret void +} + +; Verify that in the initial function resume is not marked with musttail. +; CHECK-LABEL: @f( +; CHECK: %[[addr1:.+]] = call i8* @llvm.coro.subfn.addr(i8* null, i8 0) +; CHECK-NEXT: %[[pv1:.+]] = bitcast i8* %[[addr1]] to void (i8*)* +; CHECK-NOT: musttail call fastcc void %[[pv1]](i8* null) + +; Verify that ppc target not using PC-Relative addressing in the resume part resume call is not marked with musttail. 
+; CHECK-LABEL: @f.resume( +; CHECK: %[[addr2:.+]] = call i8* @llvm.coro.subfn.addr(i8* null, i8 0) +; CHECK-NEXT: %[[pv2:.+]] = bitcast i8* %[[addr2]] to void (i8*)* +; CHECK-NEXT: call fastcc void %[[pv2]](i8* null) + +; Verify that ppc target using PC-Relative addressing in the resume part resume call is marked with musttail. +; CHECK-PCREL-LABEL: @f.resume( +; CHECK-PCREL: %[[addr2:.+]] = call i8* @llvm.coro.subfn.addr(i8* null, i8 0) +; CHECK-PCREL-NEXT: %[[pv2:.+]] = bitcast i8* %[[addr2]] to void (i8*)* +; CHECK-PCREL-NEXT: musttail call fastcc void %[[pv2]](i8* null) +; CHECK-PCREL-NEXT: ret void + +declare token @llvm.coro.id(i32, i8* readnone, i8* nocapture readonly, i8*) #1 +declare i1 @llvm.coro.alloc(token) #2 +declare i64 @llvm.coro.size.i64() #3 +declare i8* @llvm.coro.begin(token, i8* writeonly) #2 +declare token @llvm.coro.save(i8*) #2 +declare i8* @llvm.coro.frame() #3 +declare i8 @llvm.coro.suspend(token, i1) #2 +declare i8* @llvm.coro.free(token, i8* nocapture readonly) #1 +declare i1 @llvm.coro.end(i8*, i1) #2 +declare i8* @llvm.coro.subfn.addr(i8* nocapture readonly, i8) #1 +declare i8* @malloc(i64) + +attributes #0 = { presplitcoroutine } +attributes #1 = { argmemonly nounwind readonly } +attributes #2 = { nounwind } +attributes #3 = { nounwind readnone }