diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -1280,7 +1280,7 @@
   CGOpenMPOutlinedRegionInfo CGInfo(*CS, ThreadIDVar, CodeGen, InnermostKind,
                                     HasCancel, OutlinedHelperName);
   CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo);
-  return CGF.GenerateOpenMPCapturedStmtFunction(*CS, D.getBeginLoc());
+  return CGF.GenerateOpenMPCapturedStmtFunctionAggregate(*CS, D.getBeginLoc());
 }
 
 llvm::Function *CGOpenMPRuntime::emitParallelOutlinedFunction(
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -2106,12 +2106,14 @@
     // TODO: Is that needed?
     CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
 
+    // Store addresses of global arguments to pass to the parallel call.
     Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
         llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
         "captured_vars_addrs");
-    // There's something to share.
+
+    // Track globalized values so they can be popped after the parallel call.
+    SmallVector<llvm::Value *, 4> GlobalValues;
     if (!CapturedVars.empty()) {
-      // Prepare for parallel region. Indicate the outlined function.
       ASTContext &Ctx = CGF.getContext();
       unsigned Idx = 0;
       for (llvm::Value *V : CapturedVars) {
@@ -2119,8 +2121,38 @@
         llvm::Value *PtrV;
         if (V->getType()->isIntegerTy())
           PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
-        else
-          PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
+        else {
+          assert(V->getType()->isPointerTy() &&
+                 "Expected Pointer Type to globalize.");
+          // Copy the pointee to globalized storage and pass the copy instead.
+          llvm::Type *PtrElemTy = V->getType()->getPointerElementType();
+          auto &DL = CGM.getDataLayout();
+          uint64_t GlobalSize = DL.getTypeAllocSize(PtrElemTy);
+
+          // Use shared memory to store globalized pointer values, for now this
+          // should be the outlined args aggregate struct.
+          llvm::Value *GlobalSizeArg[] = {
+              llvm::ConstantInt::get(CGM.SizeTy, GlobalSize),
+              CGF.Builder.getInt16(/*UseSharedMemory*/ 1)};
+          llvm::Value *GlobalValue = CGF.EmitRuntimeCall(
+              OMPBuilder.getOrCreateRuntimeFunction(
+                  CGM.getModule(),
+                  IsInTTDRegion
+                      ? OMPRTL___kmpc_data_sharing_push_stack
+                      : OMPRTL___kmpc_data_sharing_coalesced_push_stack),
+              GlobalSizeArg);
+          GlobalValues.push_back(GlobalValue);
+
+          llvm::Value *CapturedVarVal = Bld.CreateAlignedLoad(
+              PtrElemTy, V, DL.getABITypeAlign(PtrElemTy));
+          llvm::Value *GlobalValueCast =
+              Bld.CreatePointerBitCastOrAddrSpaceCast(
+                  GlobalValue, PtrElemTy->getPointerTo());
+          Bld.CreateDefaultAlignedStore(CapturedVarVal, GlobalValueCast);
+
+          PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(GlobalValue,
+                                                         CGF.VoidPtrTy);
+        }
         CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
                               Ctx.getPointerType(Ctx.VoidPtrTy));
         ++Idx;
@@ -2133,8 +2165,9 @@
                                     /* isSigned */ false);
     else
       IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);
-
     assert(IfCondVal && "Expected a value");
+
+    // Create the parallel call.
     llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
     llvm::Value *Args[] = {
         RTLoc,
@@ -2150,6 +2183,14 @@
     CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                             CGM.getModule(), OMPRTL___kmpc_parallel_51),
                         Args);
+
+    // Pop the globalized values in reverse (LIFO) order of their pushes.
+    for (llvm::Value *V : llvm::reverse(GlobalValues)) {
+      CGF.EmitRuntimeCall(
+          OMPBuilder.getOrCreateRuntimeFunction(
+              CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack),
+          V);
+    }
   };
 
   RegionCodeGenTy RCG(ParallelGen);
@@ -4062,7 +4103,6 @@
                     D.getBeginLoc(), D.getBeginLoc());
 
   const auto *RD = CS.getCapturedRecordDecl();
-  auto CurField = RD->field_begin();
 
   Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
                                                       /*Name=*/".zero.addr");
@@ -4074,7 +4114,6 @@
   Args.emplace_back(ZeroAddr.getPointer());
 
   CGBuilderTy &Bld = CGF.Builder;
-  auto CI = CS.capture_begin();
 
   // Use global memory for data sharing.
   // Handle passing of global args to workers.
@@ -4121,23 +4160,27 @@
     ++Idx;
   }
   if (CS.capture_size() > 0) {
+    auto CI = CS.capture_begin();
+    // Load the outlined arg aggregate struct.
     ASTContext &CGFContext = CGF.getContext();
-    for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
-      QualType ElemTy = CurField->getType();
-      Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
-      Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
-          Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
-      llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
-                                              /*Volatile=*/false,
-                                              CGFContext.getPointerType(ElemTy),
-                                              CI->getLocation());
-      if (CI->capturesVariableByCopy() &&
-          !CI->getCapturedVar()->getType()->isAnyPointerType()) {
-        Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
-                              CI->getLocation());
-      }
-      Args.emplace_back(Arg);
-    }
+    QualType RecordPointerTy =
+        CGFContext.getPointerType(CGFContext.getRecordType(RD));
+    Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
+    Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
+        Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(RecordPointerTy)));
+    llvm::Value *Arg = CGF.EmitLoadOfScalar(
+        TypedAddress,
+        /*Volatile=*/false, CGFContext.getPointerType(RecordPointerTy),
+        CI->getLocation());
+    Args.emplace_back(Arg);
+  } else {
+    // If there are no captured arguments, use nullptr.
+    ASTContext &CGFContext = CGF.getContext();
+    QualType RecordPointerTy =
+        CGFContext.getPointerType(CGFContext.getRecordType(RD));
+    llvm::Value *Arg =
+        llvm::Constant::getNullValue(CGF.ConvertTypeForMem(RecordPointerTy));
+    Args.emplace_back(Arg);
   }
 
   emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -318,6 +318,33 @@
   return CGM.getSize(SizeInChars);
 }
 
+void CodeGenFunction::GenerateOpenMPCapturedVarsAggregate(
+    const CapturedStmt &S, SmallVectorImpl<llvm::Value *> &CapturedVars) {
+  const RecordDecl *RD = S.getCapturedRecordDecl();
+  QualType RecordTy = getContext().getRecordType(RD);
+  // Create the aggregate argument struct for the outlined function.
+  LValue AggLV = MakeAddrLValue(
+      CreateMemTemp(RecordTy, "omp.outlined.arg.agg."), RecordTy);
+
+  // Initialize the aggregate with captured values.
+  auto CurField = RD->field_begin();
+  for (CapturedStmt::const_capture_init_iterator I = S.capture_init_begin(),
+                                                 E = S.capture_init_end();
+       I != E; ++I, ++CurField) {
+    LValue LV = EmitLValueForFieldInitialization(AggLV, *CurField);
+    // Initialize for VLA.
+    if (CurField->hasCapturedVLAType()) {
+      EmitLambdaVLACapture(CurField->getCapturedVLAType(), LV);
+    } else {
+      // Initialize for capturesThis, capturesVariableByCopy,
+      // capturesVariable
+      EmitInitializerForField(*CurField, LV, *I);
+    }
+  }
+
+  CapturedVars.push_back(AggLV.getPointer(*this));
+}
+
 void CodeGenFunction::GenerateOpenMPCapturedVars(
     const CapturedStmt &S, SmallVectorImpl<llvm::Value *> &CapturedVars) {
   const RecordDecl *RD = S.getCapturedRecordDecl();
@@ -418,6 +445,110 @@
 };
 } // namespace
 
+/// Emits the prologue of an outlined OpenMP function whose captures arrive
+/// packed in the record addressed by the captured declaration's context
+/// parameter.
+///
+/// Creates the internal LLVM function \p FunctionName, starts emitting it in
+/// \p CGF and walks the fields of the captured record, filling \p LocalAddrs
+/// (field -> captured variable and its address), \p VLASizes (field -> VLA
+/// size expression and its loaded value) and \p CXXThisValue (the captured
+/// 'this', or nullptr when 'this' is not captured).
+static llvm::Function *emitOutlinedFunctionPrologueAggregate(
+    CodeGenFunction &CGF, FunctionArgList &Args,
+    llvm::MapVector<const Decl *, std::pair<const VarDecl *, Address>>
+        &LocalAddrs,
+    llvm::DenseMap<const Decl *, std::pair<const Expr *, llvm::Value *>>
+        &VLASizes,
+    llvm::Value *&CXXThisValue, const CapturedStmt &CS, SourceLocation Loc,
+    StringRef FunctionName) {
+  const CapturedDecl *CD = CS.getCapturedDecl();
+  const RecordDecl *RD = CS.getCapturedRecordDecl();
+  assert(CD->hasBody() && "missing CapturedDecl body");
+
+  CXXThisValue = nullptr;
+  // Build the argument list.
+  CodeGenModule &CGM = CGF.CGM;
+  ASTContext &Ctx = CGM.getContext();
+  Args.append(CD->param_begin(), CD->param_end());
+
+  // Create the function declaration.
+  const CGFunctionInfo &FuncInfo =
+      CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, Args);
+  llvm::FunctionType *FuncLLVMTy = CGM.getTypes().GetFunctionType(FuncInfo);
+
+  auto *F =
+      llvm::Function::Create(FuncLLVMTy, llvm::GlobalValue::InternalLinkage,
+                             FunctionName, &CGM.getModule());
+  CGM.SetInternalFunctionAttributes(CD, F, FuncInfo);
+  if (CD->isNothrow())
+    F->setDoesNotThrow();
+  F->setDoesNotRecurse();
+
+  // Generate the function.
+  CGF.StartFunction(CD, Ctx.VoidTy, F, FuncInfo, Args, Loc, Loc);
+  Address ContextAddr = CGF.GetAddrOfLocalVar(CD->getContextParam());
+  llvm::Value *ContextV = CGF.Builder.CreateLoad(ContextAddr);
+  LValue ContextLV =
+      CGF.MakeNaturalAlignAddrLValue(ContextV, Ctx.getTagDeclType(RD));
+  auto I = CS.captures().begin();
+  for (const FieldDecl *FD : RD->fields()) {
+    LValue FieldLV = CGF.EmitLValueForFieldInitialization(ContextLV, FD);
+    // Every capture is read from its own field of the context record.
+    Address LocalAddr = FieldLV.getAddress(CGF);
+    // If we are capturing a pointer by copy we don't need to do anything, just
+    // use the value that we get from the arguments.
+    if (I->capturesVariableByCopy() && FD->getType()->isAnyPointerType()) {
+      const VarDecl *CurVD = I->getCapturedVar();
+      LocalAddrs.insert({FD, {CurVD, LocalAddr}});
+      ++I;
+      continue;
+    }
+
+    LValue ArgLVal =
+        CGF.MakeAddrLValue(LocalAddr, FD->getType(), AlignmentSource::Decl);
+    if (FD->hasCapturedVLAType()) {
+      llvm::Value *ExprArg = CGF.EmitLoadOfScalar(ArgLVal, I->getLocation());
+      const VariableArrayType *VAT = FD->getCapturedVLAType();
+      VLASizes.try_emplace(FD, VAT->getSizeExpr(), ExprArg);
+    } else if (I->capturesVariable()) {
+      const VarDecl *Var = I->getCapturedVar();
+      QualType VarTy = Var->getType();
+      Address ArgAddr = ArgLVal.getAddress(CGF);
+      if (ArgLVal.getType()->isLValueReferenceType()) {
+        ArgAddr = CGF.EmitLoadOfReference(ArgLVal);
+      } else if (!VarTy->isVariablyModifiedType() || !VarTy->isPointerType()) {
+        assert(ArgLVal.getType()->isPointerType());
+        ArgAddr = CGF.EmitLoadOfPointer(
+            ArgAddr, ArgLVal.getType()->castAs<PointerType>());
+      }
+      LocalAddrs.insert(
+          {FD, {Var, Address(ArgAddr.getPointer(), Ctx.getDeclAlign(Var))}});
+    } else if (I->capturesVariableByCopy()) {
+      assert(!FD->getType()->isAnyPointerType() &&
+             "Not expecting a captured pointer.");
+      const VarDecl *Var = I->getCapturedVar();
+      Address CopyAddr = CGF.CreateMemTemp(FD->getType(), Ctx.getDeclAlign(FD),
+                                           Var->getName());
+      LValue CopyLVal =
+          CGF.MakeAddrLValue(CopyAddr, FD->getType(), AlignmentSource::Decl);
+
+      RValue ArgRVal = CGF.EmitLoadOfLValue(ArgLVal, I->getLocation());
+      CGF.EmitStoreThroughLValue(ArgRVal, CopyLVal);
+
+      LocalAddrs.insert({FD, {Var, CopyAddr}});
+    } else {
+      // If 'this' is captured, load it into CXXThisValue.
+      assert(I->capturesThis());
+      CXXThisValue = CGF.EmitLoadOfScalar(ArgLVal, I->getLocation());
+      LocalAddrs.insert({FD, {nullptr, ArgLVal.getAddress(CGF)}});
+    }
+    ++I;
+  }
+
+  return F;
+}
+
 static llvm::Function *emitOutlinedFunctionPrologue(
     CodeGenFunction &CGF, FunctionArgList &Args,
     llvm::MapVector<const Decl *, std::pair<const VarDecl *, Address>>
@@ -593,6 +724,37 @@
   return F;
 }
 
+llvm::Function *CodeGenFunction::GenerateOpenMPCapturedStmtFunctionAggregate(
+    const CapturedStmt &S, SourceLocation Loc) {
+  assert(
+      CapturedStmtInfo &&
+      "CapturedStmtInfo should be set when generating the captured function");
+  const CapturedDecl *CD = S.getCapturedDecl();
+  // Build the argument list.
+  FunctionArgList Args;
+  llvm::MapVector<const Decl *, std::pair<const VarDecl *, Address>> LocalAddrs;
+  llvm::DenseMap<const Decl *, std::pair<const Expr *, llvm::Value *>> VLASizes;
+  StringRef FunctionName = CapturedStmtInfo->getHelperName();
+  llvm::Function *F = emitOutlinedFunctionPrologueAggregate(
+      *this, Args, LocalAddrs, VLASizes, CXXThisValue, S, Loc, FunctionName);
+  CodeGenFunction::OMPPrivateScope LocalScope(*this);
+  for (const auto &LocalAddrPair : LocalAddrs) {
+    if (LocalAddrPair.second.first) {
+      LocalScope.addPrivate(LocalAddrPair.second.first, [&LocalAddrPair]() {
+        return LocalAddrPair.second.second;
+      });
+    }
+  }
+  (void)LocalScope.Privatize();
+  for (const auto &VLASizePair : VLASizes)
+    VLASizeMap[VLASizePair.second.first] = VLASizePair.second.second;
+  PGO.assignRegionCounters(GlobalDecl(CD), F);
+  CapturedStmtInfo->EmitBody(*this, CD->getBody());
+  (void)LocalScope.ForceCleanup();
+  FinishFunction(CD->getBodyRBrace());
+  return F;
+}
+
 llvm::Function *
 CodeGenFunction::GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S,
                                                     SourceLocation Loc) {
@@ -1581,7 +1743,7 @@
     // The following lambda takes care of appending the lower and upper bound
     // parameters when necessary
     CodeGenBoundParameters(CGF, S, CapturedVars);
-    CGF.GenerateOpenMPCapturedVars(*CS, CapturedVars);
+    CGF.GenerateOpenMPCapturedVarsAggregate(*CS, CapturedVars);
     CGF.CGM.getOpenMPRuntime().emitParallelCall(CGF, S.getBeginLoc(), OutlinedFn,
                                                 CapturedVars, IfCond);
   }
@@ -5298,7 +5460,7 @@
     const CapturedStmt *CS = S.getInnermostCapturedStmt();
     if (C) {
       llvm::SmallVector<llvm::Value *, 16> CapturedVars;
-      CGF.GenerateOpenMPCapturedVars(*CS, CapturedVars);
+      CGF.GenerateOpenMPCapturedVarsAggregate(*CS, CapturedVars);
       llvm::Function *OutlinedFn =
           emitOutlinedOrderedFunction(CGM, CS, S.getBeginLoc());
       CGM.getOpenMPRuntime().emitOutlinedFunctionCall(CGF, S.getBeginLoc(),
@@ -6029,7 +6191,7 @@
 
   OMPTeamsScope Scope(CGF, S);
   llvm::SmallVector<llvm::Value *, 16> CapturedVars;
-  CGF.GenerateOpenMPCapturedVars(*CS, CapturedVars);
+  CGF.GenerateOpenMPCapturedVarsAggregate(*CS, CapturedVars);
   CGF.CGM.getOpenMPRuntime().emitTeamsCall(CGF, S, S.getBeginLoc(), OutlinedFn,
                                            CapturedVars);
 }
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -3276,8 +3276,17 @@
   llvm::Function *EmitCapturedStmt(const CapturedStmt &S, CapturedRegionKind K);
   llvm::Function *GenerateCapturedStmtFunction(const CapturedStmt &S);
   Address GenerateCapturedStmtArgument(const CapturedStmt &S);
+  /// Generate the outlined function for \p S in aggregate form: captures are
+  /// read from the record passed through the context parameter.
+  llvm::Function *
+  GenerateOpenMPCapturedStmtFunctionAggregate(const CapturedStmt &S,
+                                              SourceLocation Loc);
   llvm::Function *GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S,
                                                      SourceLocation Loc);
+  /// Pack the captures of \p S into one temporary of the captured record type
+  /// and append its address to \p CapturedVars.
+  void GenerateOpenMPCapturedVarsAggregate(
+      const CapturedStmt &S, SmallVectorImpl<llvm::Value *> &CapturedVars);
   void GenerateOpenMPCapturedVars(const CapturedStmt &S,
                                   SmallVectorImpl<llvm::Value *> &CapturedVars);
   void emitOMPSimpleStore(LValue LVal, RValue RVal, QualType RValTy,