diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -304,11 +304,13 @@ AllocaInst *Alloca; DenseMap> Aliases; bool MayWriteBeforeCoroBegin; + bool NeedsDynamicAlignment; AllocaInfo(AllocaInst *Alloca, DenseMap> Aliases, - bool MayWriteBeforeCoroBegin) + bool MayWriteBeforeCoroBegin, bool NeedsDynamicAlignment) : Alloca(Alloca), Aliases(std::move(Aliases)), - MayWriteBeforeCoroBegin(MayWriteBeforeCoroBegin) {} + MayWriteBeforeCoroBegin(MayWriteBeforeCoroBegin), + NeedsDynamicAlignment(NeedsDynamicAlignment) {} }; struct FrameDataInfo { // All the values (that are not allocas) that needs to be spilled to the @@ -416,6 +418,7 @@ FieldIDType LayoutFieldIndex; Align Alignment; Align TyAlignment; + uint64_t DynamicAlignBuffer; }; const DataLayout &DL; @@ -510,6 +513,18 @@ FieldAlignment = TyAlignment; } + // The field alignment could be bigger than the max frame alignment; in that + // case we request additional storage to be able to dynamically align the + // pointer. + uint64_t DynamicAlignBuffer = 0; + if (MaxFrameAlignment && + (FieldAlignment.valueOrOne() > *MaxFrameAlignment)) { + DynamicAlignBuffer = + offsetToAlignment((*MaxFrameAlignment).value(), *FieldAlignment); + FieldAlignment = *MaxFrameAlignment; + FieldSize = FieldSize + DynamicAlignBuffer; + } + // Lay out header fields immediately. 
uint64_t Offset; if (IsHeader) { @@ -521,7 +536,8 @@ Offset = OptimizedStructLayoutField::FlexibleOffset; } - Fields.push_back({FieldSize, Offset, Ty, 0, *FieldAlignment, TyAlignment}); + Fields.push_back({FieldSize, Offset, Ty, 0, *FieldAlignment, TyAlignment, + DynamicAlignBuffer}); return Fields.size() - 1; } @@ -748,6 +764,10 @@ F.LayoutFieldIndex = FieldTypes.size(); FieldTypes.push_back(F.Ty); + if (F.DynamicAlignBuffer) { + FieldTypes.push_back( + ArrayType::get(Type::getInt8Ty(Context), F.DynamicAlignBuffer)); + } LastOffset = Offset + F.Size; } @@ -1141,7 +1161,8 @@ // We assume that the promise alloca won't be modified before // CoroBegin and no alias will be create before CoroBegin. FrameData.Allocas.emplace_back( - PromiseAlloca, DenseMap>{}, false); + PromiseAlloca, DenseMap>{}, false, + false); // Create an entry for every spilled value. for (auto &S : FrameData.Spills) { Type *FieldType = S.first->getType(); @@ -1523,7 +1544,8 @@ // Create a GEP with the given index into the coroutine frame for the original // value Orig. Appends an extra 0 index for array-allocas, preserving the // original type. 
- auto GetFramePointer = [&](Value *Orig) -> Value * { + auto GetFramePointer = [&](Value *Orig, + bool NeedsDynamicAlignment) -> Value * { FieldIDType Index = FrameData.getFieldIndex(Orig); SmallVector Indices = { ConstantInt::get(Type::getInt32Ty(C), 0), @@ -1543,7 +1565,18 @@ auto GEP = cast( Builder.CreateInBoundsGEP(FrameTy, FramePtr, Indices)); - if (isa(Orig)) { + if (auto *AI = dyn_cast(Orig)) { + if (NeedsDynamicAlignment) { + auto *M = AI->getModule(); + auto *IntPtrTy = M->getDataLayout().getIntPtrType(AI->getType()); + auto *PtrValue = Builder.CreatePtrToInt(GEP, IntPtrTy); + auto *AlignMask = Builder.CreateSub( + ConstantInt::get(IntPtrTy, AI->getAlign().value()), + ConstantInt::get(IntPtrTy, 1)); + PtrValue = Builder.CreateAdd(PtrValue, AlignMask); + PtrValue = Builder.CreateAnd(PtrValue, Builder.CreateNot(AlignMask)); + return Builder.CreateIntToPtr(PtrValue, AI->getType()); + } // If the type of GEP is not equal to the type of AllocaInst, it implies // that the AllocaInst may be reused in the Frame slot of other // AllocaInst. 
So We cast GEP to the AllocaInst here to re-use @@ -1555,6 +1588,9 @@ return Builder.CreateBitCast(GEP, Orig->getType(), Orig->getName() + Twine(".cast")); } + assert((!NeedsDynamicAlignment || + isa(Orig)) && + "Only expect dynamic realignment for allocas"); return GEP; }; @@ -1630,7 +1666,7 @@ CurrentBlock = U->getParent(); Builder.SetInsertPoint(&*CurrentBlock->getFirstInsertionPt()); - auto *GEP = GetFramePointer(E.first); + auto *GEP = GetFramePointer(E.first, false /*NeedsDynamicAlignment*/); GEP->setName(E.first->getName() + Twine(".reload.addr")); if (ByValTy) CurrentReload = GEP; @@ -1693,7 +1729,7 @@ Builder.SetInsertPoint(&SpillBlock->front()); for (const auto &P : FrameData.Allocas) { AllocaInst *Alloca = P.Alloca; - auto *G = GetFramePointer(Alloca); + auto *G = GetFramePointer(Alloca, P.NeedsDynamicAlignment); // We are not using ReplaceInstWithInst(P.first, cast(G)) // here, as we are changing location of the instruction. @@ -1721,7 +1757,7 @@ } if (UsersToUpdate.empty()) continue; - auto *G = GetFramePointer(Alloca); + auto *G = GetFramePointer(Alloca, false /*NeedsDynamicAlignment*/); G->setName(Alloca->getName() + Twine(".reload.addr")); SmallVector DIs; @@ -1741,7 +1777,7 @@ report_fatal_error( "Coroutines cannot handle copying of array allocas yet"); - auto *G = GetFramePointer(Alloca); + auto *G = GetFramePointer(Alloca, false /*NeedsDynamicAlignment*/); auto *Value = Builder.CreateLoad(Alloca->getAllocatedType(), Alloca); Builder.CreateStore(Value, G); } @@ -1749,7 +1785,7 @@ // CoroBegin, we recreate them after CoroBegin by appplying the offset // to the pointer in the frame. 
for (const auto &Alias : A.Aliases) { - auto *FramePtr = GetFramePointer(Alloca); + auto *FramePtr = GetFramePointer(Alloca, false /*NeedsDynamicAlignment*/); auto *FramePtrRaw = Builder.CreateBitCast(FramePtr, Type::getInt8PtrTy(C)); auto &Value = Alias.second.getValue(); @@ -2500,8 +2536,16 @@ Visitor.visitPtr(*AI); if (!Visitor.getShouldLiveOnFrame()) continue; + + bool NeedsDynamicAlignment = false; + if (Shape.ABI == coro::ABI::Async) { + NeedsDynamicAlignment = + AI->getAlign() > Shape.AsyncLowering.getContextAlignment(); + } + Allocas.emplace_back(AI, Visitor.getAliasesCopy(), - Visitor.getMayWriteBeforeCoroBegin()); + Visitor.getMayWriteBeforeCoroBegin(), + NeedsDynamicAlignment); } } diff --git a/llvm/test/Transforms/Coroutines/coro-async-dyn-align.ll b/llvm/test/Transforms/Coroutines/coro-async-dyn-align.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/Coroutines/coro-async-dyn-align.ll @@ -0,0 +1,102 @@ +; RUN: opt < %s -O0 -S | FileCheck %s + +target datalayout = "p:64:64:64" + +%async.task = type { i64 } +%async.actor = type { i64 } +%async.fp = type <{ i32, i32 }> + +%async.ctxt = type { i8*, void (i8*)* } + +@my_other_async_function_fp = external global <{ i32, i32 }> +declare void @my_other_async_function(i8* %async.ctxt) + +@my_async_function_fp = constant <{ i32, i32 }> + <{ i32 trunc ( + i64 sub ( + i64 ptrtoint (void (i8*)* @my_async_function to i64), + i64 ptrtoint (i32* getelementptr inbounds (<{ i32, i32 }>, <{ i32, i32 }>* @my_async_function_fp, i32 0, i32 1) to i64) + ) + to i32), + i32 32 +}> + +declare void @opaque(i64*) +declare i8* @llvm.coro.async.context.alloc(i8*, i8*) +declare void @llvm.coro.async.context.dealloc(i8*) +declare i8* @llvm.coro.async.resume() +declare token @llvm.coro.id.async(i32, i32, i32, i8*) +declare i8* @llvm.coro.begin(token, i8*) +declare i1 @llvm.coro.end.async(i8*, i1, ...) 
+declare i1 @llvm.coro.end(i8*, i1) +declare swiftcc void @asyncReturn(i8*) +declare swiftcc void @asyncSuspend(i8*) +declare {i8*} @llvm.coro.suspend.async(i32, i8*, i8*, ...) + +define swiftcc void @my_async_function.my_other_async_function_fp.apply(i8* %fnPtr, i8* %async.ctxt) { + %callee = bitcast i8* %fnPtr to void(i8*)* + tail call swiftcc void %callee(i8* %async.ctxt) + ret void +} + +define i8* @__swift_async_resume_project_context(i8* %ctxt) { +entry: + %resume_ctxt_addr = bitcast i8* %ctxt to i8** + %resume_ctxt = load i8*, i8** %resume_ctxt_addr, align 8 + ret i8* %resume_ctxt +} + + +; CHECK: %my_async_function.Frame = type { i64, [48 x i8], i64, i64, [16 x i8], i8*, i64, i8* } +; CHECK: define swiftcc void @my_async_function +; CHECK: [[T0:%.*]] = getelementptr inbounds %my_async_function.Frame, %my_async_function.Frame* %FramePtr, i32 0, i32 3 +; CHECK: [[T1:%.*]] = ptrtoint i64* [[T0]] to i64 +; CHECK: [[T2:%.*]] = add i64 [[T1]], 31 +; CHECK: [[T3:%.*]] = and i64 [[T2]], -32 +; CHECK: [[T4:%.*]] = inttoptr i64 [[T3]] to i64* +; CHECK: [[T5:%.*]] = getelementptr inbounds %my_async_function.Frame, %my_async_function.Frame* %FramePtr, i32 0, i32 0 +; CHECK: [[T6:%.*]] = ptrtoint i64* [[T5]] to i64 +; CHECK: [[T7:%.*]] = add i64 [[T6]], 63 +; CHECK: [[T8:%.*]] = and i64 [[T7]], -64 +; CHECK: [[T9:%.*]] = inttoptr i64 [[T8]] to i64* +; CHECK: store i64 2, i64* [[T4]] +; CHECK: store i64 3, i64* [[T9]] + +define swiftcc void @my_async_function(i8* swiftasync %async.ctxt) "coroutine.presplit"="1" { +entry: + %tmp = alloca i64, align 8 + %tmp2 = alloca i64, align 16 + %tmp3 = alloca i64, align 32 + %tmp4 = alloca i64, align 64 + + %id = call token @llvm.coro.id.async(i32 32, i32 16, i32 0, + i8* bitcast (<{i32, i32}>* @my_async_function_fp to i8*)) + %hdl = call i8* @llvm.coro.begin(token %id, i8* null) + store i64 0, i64* %tmp + store i64 1, i64* %tmp2 + store i64 2, i64* %tmp3 + store i64 3, i64* %tmp4 + + %callee_context = call i8* 
@llvm.coro.async.context.alloc(i8* null, i8* null) + %callee_context.0 = bitcast i8* %callee_context to %async.ctxt* + %callee_context.return_to_caller.addr = getelementptr inbounds %async.ctxt, %async.ctxt* %callee_context.0, i32 0, i32 1 + %return_to_caller.addr = bitcast void(i8*)** %callee_context.return_to_caller.addr to i8** + %resume.func_ptr = call i8* @llvm.coro.async.resume() + store i8* %resume.func_ptr, i8** %return_to_caller.addr + + %callee = bitcast void(i8*)* @asyncSuspend to i8* + %resume_proj_fun = bitcast i8*(i8*)* @__swift_async_resume_project_context to i8* + %res = call {i8*} (i32, i8*, i8*, ...) @llvm.coro.suspend.async(i32 0, + i8* %resume.func_ptr, + i8* %resume_proj_fun, + void (i8*, i8*)* @my_async_function.my_other_async_function_fp.apply, + i8* %callee, i8* %callee_context) + call void @opaque(i64* %tmp) + call void @opaque(i64* %tmp2) + call void @opaque(i64* %tmp3) + call void @opaque(i64* %tmp4) + call void @llvm.coro.async.context.dealloc(i8* %callee_context) + tail call swiftcc void @asyncReturn(i8* %async.ctxt) + call i1 (i8*, i1, ...) @llvm.coro.end.async(i8* %hdl, i1 0) + unreachable +}