diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -2686,7 +2686,7 @@ .. code-block:: c - size_t __builtin_coro_size() + size_t __builtin_coro_size(bool alloc) void *__builtin_coro_frame() void *__builtin_coro_free(void *coro_frame) diff --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def --- a/clang/include/clang/Basic/Builtins.def +++ b/clang/include/clang/Basic/Builtins.def @@ -1573,7 +1573,7 @@ BUILTIN(__builtin_coro_done, "bv*", "n") BUILTIN(__builtin_coro_promise, "v*v*IiIb", "n") -BUILTIN(__builtin_coro_size, "z", "n") +BUILTIN(__builtin_coro_size, "zb", "n") BUILTIN(__builtin_coro_frame, "v*", "n") BUILTIN(__builtin_coro_noop, "v*", "n") BUILTIN(__builtin_coro_free, "v*v*", "n") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -4430,11 +4430,12 @@ return RValue::get(EmitMSVCBuiltinExpr(MSVCIntrin::__fastfail, E)); case Builtin::BI__builtin_coro_size: { + Value *Arg0 = EmitScalarExpr(E->getArg(0)); auto & Context = getContext(); auto SizeTy = Context.getSizeType(); auto T = Builder.getIntNTy(Context.getTypeSize(SizeTy)); Function *F = CGM.getIntrinsic(Intrinsic::coro_size, T); - return RValue::get(Builder.CreateCall(F)); + return RValue::get(Builder.CreateCall(F, {Arg0})); } case Builtin::BI__builtin_coro_id: diff --git a/clang/lib/Sema/SemaCoroutine.cpp b/clang/lib/Sema/SemaCoroutine.cpp --- a/clang/lib/Sema/SemaCoroutine.cpp +++ b/clang/lib/Sema/SemaCoroutine.cpp @@ -1375,8 +1375,12 @@ Expr *FramePtr = buildBuiltinCall(S, Loc, Builtin::BI__builtin_coro_frame, {}); + Expr *IsAlloc = IntegerLiteral::Create( + S.Context, llvm::APInt(32, 1), + S.Context.getIntTypeForBitwidth(32, /*Signed=*/0), Loc); + Expr *FrameSize = - buildBuiltinCall(S, Loc, Builtin::BI__builtin_coro_size, {}); + buildBuiltinCall(S, 
Loc, Builtin::BI__builtin_coro_size, {IsAlloc}); // Make new call. diff --git a/llvm/docs/Coroutines.rst b/llvm/docs/Coroutines.rst --- a/llvm/docs/Coroutines.rst +++ b/llvm/docs/Coroutines.rst @@ -927,8 +927,8 @@ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :: - declare i32 @llvm.coro.size.i32() - declare i64 @llvm.coro.size.i64() + declare i32 @llvm.coro.size.i32(i1 ) + declare i64 @llvm.coro.size.i64(i1 ) Overview: """"""""" @@ -940,7 +940,10 @@ Arguments: """""""""" -None +The first argument is a boolean indicating that the returned value is used by +alloc/dealloc functions. If it is true, over-allocation may be performed to handle +over-alignment. All alloc/dealloc functions should use ``llvm.coro.size`` with +the same ``alloc`` value. Semantics: """""""""" @@ -948,6 +951,32 @@ The `coro.size` intrinsic is lowered to a constant representing the size of the coroutine frame. +.. _coro.align: + +'llvm.coro.align' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +:: + + declare i32 @llvm.coro.align.i32() + declare i64 @llvm.coro.align.i64() + +Overview: +""""""""" + +The '``llvm.coro.align``' intrinsic returns the alignment of the coroutine frame +in bytes. This is only supported for switched-resume coroutines. + +Arguments: +"""""""""" + +None + +Semantics: +"""""""""" + +The `coro.align` intrinsic is lowered to a constant representing the alignment +of the coroutine frame. + .. 
_coro.begin: 'llvm.coro.begin' Intrinsic diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1236,7 +1236,8 @@ def int_coro_frame : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>; def int_coro_noop : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>; -def int_coro_size : Intrinsic<[llvm_anyint_ty], [], [IntrNoMem]>; +def int_coro_size : Intrinsic<[llvm_anyint_ty], [llvm_i1_ty], [IntrNoMem]>; +def int_coro_align : Intrinsic<[llvm_anyint_ty], [], [IntrNoMem]>; def int_coro_save : Intrinsic<[llvm_token_ty], [llvm_ptr_ty], []>; def int_coro_suspend : Intrinsic<[llvm_i8_ty], [llvm_token_ty, llvm_i1_ty], []>; diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -14,17 +14,21 @@ // the value into the coroutine frame. //===----------------------------------------------------------------------===// +#include "CoroInstr.h" #include "CoroInternal.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallString.h" #include "llvm/Analysis/PtrUseVisitor.h" #include "llvm/Analysis/StackLifetime.h" #include "llvm/Config/llvm-config.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DIBuilder.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instruction.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" @@ -486,6 +490,8 @@ return StructAlign; } + SmallVector &getFields() { return Fields; } + FieldIDType getLayoutFieldIndex(FieldIDType Id) const { assert(IsFinished && "not yet finished!"); return Fields[Id].LayoutFieldIndex; @@ -710,6 +716,77 @@ IsFinished = true; } +// Adapted from CodeGenFunction::EmitBuiltinAlignTo. 
+static Value *emitAlignUpTo(IRBuilder<> &Builder, Value *Src, uint64_t Align) { + const DataLayout &DL = Builder.GetInsertBlock()->getModule()->getDataLayout(); + + auto *SrcType = cast(Src->getType()); + IntegerType *IntType = IntegerType::get(Builder.getContext(), + DL.getIndexTypeSizeInBits(SrcType)); + Value *Alignment = ConstantInt::get(IntType, Align); + auto *One = ConstantInt::get(IntType, 1); + Value *Mask = Builder.CreateSub(Alignment, One, "mask"); + Value *SrcAddr = Builder.CreatePtrToInt(Src, IntType, "intptr"); + + // When aligning up we have to first add the mask to ensure we go over the + // next alignment value and then align down to the next valid multiple. + // By adding the mask, we ensure that align_up on an already aligned + // value will not change the value. + Value *SrcForMask = Builder.CreateAdd(SrcAddr, Mask, "over_boundary"); + + // Invert the mask to only clear the lower bits. + Value *InvertedMask = Builder.CreateNot(Mask, "inverted_mask"); + Value *Result = Builder.CreateAnd(SrcForMask, InvertedMask, "aligned_result"); + + Result->setName("aligned_intptr"); + Value *Difference = Builder.CreateSub(Result, SrcAddr, "diff"); + // The result must point to the same underlying allocation. This means we + // can use an inbounds GEP to enable better optimization. + + PointerType *DestType = Builder.getInt8PtrTy(); + if (unsigned AddrSpace = SrcType->getAddressSpace()) + DestType = Type::getInt8PtrTy(Builder.getContext(), AddrSpace); + + Value *Base = Src; + if (SrcType != DestType) + Base = Builder.CreateBitCast(Src, DestType); + + // Out-of-bound case could not happen. 
+ Result = Builder.CreateGEP(Base, Difference, "aligned_result"); + Result = Builder.CreatePointerCast(Result, SrcType); + + Type *IntPtrTy = Builder.getIntPtrTy(DL); + if (Alignment->getType() != IntPtrTy) + Alignment = + Builder.CreateIntCast(Alignment, IntPtrTy, false, "casted.align"); + (void)Builder.CreateAlignmentAssumption(DL, Result, Alignment); + assert(Result->getType() == SrcType); + return Result; +} + +static void replaceCoroFreeMemArg(CoroIdInst *CoroId, + AllocaInst *FramePtrAddr) { + SmallVector CoroFrees; + for (User *U : CoroId->users()) + if (auto *CF = dyn_cast(U)) + CoroFrees.push_back(CF); + + if (CoroFrees.empty()) + return; + + for (CoroFreeInst *CF : CoroFrees) { + // Assume CallInst users of `coro.free` are dealloc functions. + + // CF->setArgOperand(CoroFreeInst::FrameArg, FrameAddr); + for (Use &U : CF->uses()) + if (CallBase *CB = dyn_cast(U.getUser())) { + LoadInst *FrameAddr = new LoadInst(FramePtrAddr->getAllocatedType(), + FramePtrAddr, "raw.frame.ptr", CB); + CB->replaceUsesOfWith(CF, FrameAddr); + } + } +} + // Build a struct that will keep state for an active coroutine. // struct f.frame { // ResumeFnTy ResumeFnAddr; @@ -779,6 +856,40 @@ FrameData.setFieldIndex(S.first, Id); } + Align FrameAlign = + std::max_element( + B.getFields().begin(), B.getFields().end(), + [](auto &F1, auto &F2) { return F1.Alignment < F2.Alignment; }) + ->Alignment; + + // Check for over-alignment. 
+ if (Shape.HandleOverAlign && + FrameAlign.value() > Shape.getSwitchCoroId()->getAlignment()) { + BasicBlock &Entry = F.getEntryBlock(); + IRBuilder<> Builder(&Entry, Entry.getFirstInsertionPt()); + + // Save to raw frame pointer to alloca + Value *Mem = Shape.CoroBegin->getMem(); + AllocaInst *FramePtrAddr = + Builder.CreateAlloca(Mem->getType(), nullptr, "alloc.frame.ptr"); + Builder.SetInsertPoint(Shape.CoroBegin); + Value *MockMem = Builder.CreatePointerCast(FramePtrAddr, Mem->getType()); + Builder.CreateStore(MockMem, FramePtrAddr); + + // Ajust frame pointer value. + Value *NewMem = emitAlignUpTo(Builder, MockMem, FrameAlign.value()); + Mem->replaceAllUsesWith(NewMem); + MockMem->replaceAllUsesWith(Mem); + cast(MockMem)->eraseFromParent(); + + // Replace all corofree second arg with raw frame pointer loaded from + // alloca. + replaceCoroFreeMemArg(Shape.getSwitchCoroId(), FramePtrAddr); + + // Add alloca to frame. + (void)B.addFieldForAlloca(FramePtrAddr); + } + B.finish(FrameTy); FrameData.updateLayoutIndex(B); Shape.FrameAlign = B.getStructAlign(); diff --git a/llvm/lib/Transforms/Coroutines/CoroInstr.h b/llvm/lib/Transforms/Coroutines/CoroInstr.h --- a/llvm/lib/Transforms/Coroutines/CoroInstr.h +++ b/llvm/lib/Transforms/Coroutines/CoroInstr.h @@ -121,6 +121,10 @@ : cast(Arg->stripPointerCasts()); } + unsigned getAlignment() const { + return cast(getArgOperand(AlignArg))->getZExtValue(); + } + void clearPromise() { Value *Arg = getArgOperand(PromiseArg); setArgOperand(PromiseArg, @@ -402,9 +406,9 @@ /// This represents the llvm.coro.free instruction. class LLVM_LIBRARY_VISIBILITY CoroFreeInst : public IntrinsicInst { +public: enum { IdArg, FrameArg }; -public: Value *getFrame() const { return getArgOperand(FrameArg); } // Methods to support type inquiry through isa, cast, and dyn_cast: @@ -589,7 +593,12 @@ /// This represents the llvm.coro.size instruction. 
class LLVM_LIBRARY_VISIBILITY CoroSizeInst : public IntrinsicInst { + enum { AllocArg }; + public: + bool isAlloc() const { + return cast(getArgOperand(0))->isOneValue(); + } // Methods to support type inquiry through isa, cast, and dyn_cast: static bool classof(const IntrinsicInst *I) { return I->getIntrinsicID() == Intrinsic::coro_size; @@ -599,6 +608,18 @@ } }; +/// This represents the llvm.coro.align instruction. +class LLVM_LIBRARY_VISIBILITY CoroAlignInst : public IntrinsicInst { +public: + // Methods to support type inquiry through isa, cast, and dyn_cast: + static bool classof(const IntrinsicInst *I) { + return I->getIntrinsicID() == Intrinsic::coro_align; + } + static bool classof(const Value *V) { + return isa(V) && classof(cast(V)); + } +}; + class LLVM_LIBRARY_VISIBILITY AnyCoroEndInst : public IntrinsicInst { enum { FrameArg, UnwindArg }; diff --git a/llvm/lib/Transforms/Coroutines/CoroInternal.h b/llvm/lib/Transforms/Coroutines/CoroInternal.h --- a/llvm/lib/Transforms/Coroutines/CoroInternal.h +++ b/llvm/lib/Transforms/Coroutines/CoroInternal.h @@ -99,6 +99,7 @@ CoroBeginInst *CoroBegin; SmallVector CoroEnds; SmallVector CoroSizes; + SmallVector CoroAligns; SmallVector CoroSuspends; SmallVector SwiftErrorOps; @@ -124,8 +125,8 @@ uint64_t FrameSize; Instruction *FramePtr; BasicBlock *AllocaSpillBlock; - - bool ReuseFrameSlot; + bool HandleOverAlign = false; + bool ReuseFrameSlot = false; struct SwitchLoweringStorage { SwitchInst *ResumeSwitch; @@ -269,7 +270,7 @@ void emitDealloc(IRBuilder<> &Builder, Value *Ptr, CallGraph *CG) const; Shape() = default; - explicit Shape(Function &F, bool ReuseFrameSlot = false) + explicit Shape(Function &F, bool ReuseFrameSlot) : ReuseFrameSlot(ReuseFrameSlot) { buildFrom(F); } diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -69,6 +69,7 @@ #include #include 
#include +#include using namespace llvm; @@ -997,23 +998,39 @@ Shape.AsyncLowering.AsyncFuncPointer->setInitializer(NewFuncPtrStruct); } -static void replaceFrameSize(coro::Shape &Shape) { +static void replaceFrameSizeAndAlign(coro::Shape &Shape) { if (Shape.ABI == coro::ABI::Async) updateAsyncFuncPointerContextSize(Shape); - if (Shape.CoroSizes.empty()) - return; + if (!Shape.CoroSizes.empty()) { + // In the same function all coro.sizes should have the same result type. + auto *SizeIntrin = Shape.CoroSizes.back(); + Module *M = SizeIntrin->getModule(); + const DataLayout &DL = M->getDataLayout(); + auto Size = DL.getTypeAllocSize(Shape.FrameTy); - // In the same function all coro.sizes should have the same result type. - auto *SizeIntrin = Shape.CoroSizes.back(); - Module *M = SizeIntrin->getModule(); - const DataLayout &DL = M->getDataLayout(); - auto Size = DL.getTypeAllocSize(Shape.FrameTy); - auto *SizeConstant = ConstantInt::get(SizeIntrin->getType(), Size); + uint64_t FrameAlign = Shape.FrameAlign.value(); + uint64_t NewAlign = Shape.getSwitchCoroId()->getAlignment(); + uint64_t Extra = FrameAlign > NewAlign ? FrameAlign - NewAlign : 0; - for (CoroSizeInst *CS : Shape.CoroSizes) { - CS->replaceAllUsesWith(SizeConstant); - CS->eraseFromParent(); + for (CoroSizeInst *CS : Shape.CoroSizes) { + // Over allocate if needed. + uint64_t FrameSize = Size + (Shape.HandleOverAlign ? 
Extra : 0); + auto *SizeConstant = ConstantInt::get(SizeIntrin->getType(), FrameSize); + CS->replaceAllUsesWith(SizeConstant); + CS->eraseFromParent(); + } + } + + if (!Shape.CoroAligns.empty()) { + auto *Intrin = Shape.CoroAligns.back(); + auto *AlignConstant = + ConstantInt::get(Intrin->getType(), Shape.FrameAlign.value()); + + for (CoroAlignInst *CS : Shape.CoroAligns) { + CS->replaceAllUsesWith(AlignConstant); + CS->eraseFromParent(); + } } } @@ -1748,7 +1765,7 @@ simplifySuspendPoints(Shape); buildCoroutineFrame(F, Shape); - replaceFrameSize(Shape); + replaceFrameSizeAndAlign(Shape); // If there are no suspend points, no split required, just remove // the allocation and deallocation blocks, they are not needed. diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp --- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp +++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp @@ -234,6 +234,7 @@ Shape.CoroBegin = nullptr; Shape.CoroEnds.clear(); Shape.CoroSizes.clear(); + Shape.CoroAligns.clear(); Shape.CoroSuspends.clear(); Shape.FrameTy = nullptr; @@ -267,6 +268,10 @@ continue; case Intrinsic::coro_size: CoroSizes.push_back(cast(II)); + HandleOverAlign = HandleOverAlign || CoroSizes.back()->isAlloc(); + break; + case Intrinsic::coro_align: + CoroAligns.push_back(cast(II)); break; case Intrinsic::coro_frame: CoroFrames.push_back(cast(II)); diff --git a/llvm/test/Transforms/Coroutines/ArgAddr.ll b/llvm/test/Transforms/Coroutines/ArgAddr.ll --- a/llvm/test/Transforms/Coroutines/ArgAddr.ll +++ b/llvm/test/Transforms/Coroutines/ArgAddr.ll @@ -21,10 +21,10 @@ ; CHECK-NEXT: store i32 [[TMP2]], i32* [[TMP1]], align 4 ; entry: - %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null); + %id = call token @llvm.coro.id(i32 16, i8* null, i8* null, i8* null); %n.addr = alloca i32 store i32 %n, i32* %n.addr ; this needs to go after coro.begin - %0 = tail call i32 @llvm.coro.size.i32() + %0 = tail call i32 
@llvm.coro.size.i32(i1 true) %call = tail call i8* @malloc(i32 %0) %1 = tail call noalias nonnull i8* @llvm.coro.begin(token %id, i8* %call) %2 = bitcast i32* %n.addr to i8* @@ -69,7 +69,7 @@ declare void @ctor(i8* nocapture readonly) declare token @llvm.coro.id(i32, i8*, i8*, i8*) -declare i32 @llvm.coro.size.i32() +declare i32 @llvm.coro.size.i32(i1) declare i8* @llvm.coro.begin(token, i8*) declare i8 @llvm.coro.suspend(token, i1) declare i8* @llvm.coro.free(token, i8*) diff --git a/llvm/test/Transforms/Coroutines/coro-alloc-with-param-O0.ll b/llvm/test/Transforms/Coroutines/coro-alloc-with-param-O0.ll --- a/llvm/test/Transforms/Coroutines/coro-alloc-with-param-O0.ll +++ b/llvm/test/Transforms/Coroutines/coro-alloc-with-param-O0.ll @@ -9,8 +9,8 @@ %this.addr = alloca i64 store i64 %this_arg, i64* %this.addr %this = load i64, i64* %this.addr - %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) - %size = call i32 @llvm.coro.size.i32() + %id = call token @llvm.coro.id(i32 16, i8* null, i8* null, i8* null) + %size = call i32 @llvm.coro.size.i32(i1 true) %alloc = call i8* @myAlloc(i64 %this, i32 %size) %hdl = call i8* @llvm.coro.begin(token %id, i8* %alloc) %0 = call i8 @llvm.coro.suspend(token none, i1 false) @@ -45,7 +45,7 @@ ; CHECK: ret void declare i8* @llvm.coro.free(token, i8*) -declare i32 @llvm.coro.size.i32() +declare i32 @llvm.coro.size.i32(i1) declare i8 @llvm.coro.suspend(token, i1) declare void @llvm.coro.resume(i8*) declare void @llvm.coro.destroy(i8*) diff --git a/llvm/test/Transforms/Coroutines/coro-alloca-01.ll b/llvm/test/Transforms/Coroutines/coro-alloca-01.ll --- a/llvm/test/Transforms/Coroutines/coro-alloca-01.ll +++ b/llvm/test/Transforms/Coroutines/coro-alloca-01.ll @@ -8,7 +8,7 @@ %x = alloca i64 %y = alloca i64 %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) - %size = call i32 @llvm.coro.size.i32() + %size = call i32 @llvm.coro.size.i32(i1 false) %alloc = call i8* @malloc(i32 %size) %hdl = call i8* 
@llvm.coro.begin(token %id, i8* %alloc) br i1 %n, label %flag_true, label %flag_false