diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -10617,18 +10617,18 @@ } Linkage = CGM.getLLVMLinkageVarDefinition(VD, /*IsConstant=*/false); // Temp solution to prevent optimizations of the internal variables. - if (CGM.getLangOpts().OpenMPIsDevice && !VD->isExternallyVisible()) { - std::string RefName = getName({VarName, "ref"}); - if (!CGM.GetGlobalValue(RefName)) { - llvm::Constant *AddrRef = - getOrCreateInternalVariable(Addr->getType(), RefName); - auto *GVAddrRef = cast(AddrRef); - GVAddrRef->setConstant(/*Val=*/true); - GVAddrRef->setLinkage(llvm::GlobalValue::InternalLinkage); - GVAddrRef->setInitializer(Addr); - CGM.addCompilerUsedGlobal(GVAddrRef); - } - } + // if (CGM.getLangOpts().OpenMPIsDevice && !VD->isExternallyVisible()) { + // std::string RefName = getName({VarName, "ref"}); + // if (!CGM.GetGlobalValue(RefName)) { + // llvm::Constant *AddrRef = + // getOrCreateInternalVariable(Addr->getType(), RefName); + // auto *GVAddrRef = cast(AddrRef); + // GVAddrRef->setConstant([>Val=<]true); + // GVAddrRef->setLinkage(llvm::GlobalValue::InternalLinkage); + // GVAddrRef->setInitializer(Addr); + // CGM.addCompilerUsedGlobal(GVAddrRef); + //} + //} } else { assert(((*Res == OMPDeclareTargetDeclAttr::MT_Link) || (*Res == OMPDeclareTargetDeclAttr::MT_To && diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h @@ -440,15 +440,14 @@ /// The data for the single globalized variable. struct MappedVarData { /// Corresponding field in the global record. - const FieldDecl *FD = nullptr; + llvm::Value *GlobalizedVal = nullptr; /// Corresponding address. Address PrivateAddr = Address::invalid(); /// true, if only one element is required (for latprivates in SPMD mode), /// false, if need to create based on the warp-size. bool IsOnePerTeam = false; MappedVarData() = delete; - MappedVarData(const FieldDecl *FD, bool IsOnePerTeam = false) - : FD(FD), IsOnePerTeam(IsOnePerTeam) {} + MappedVarData(bool IsOnePerTeam = false) : IsOnePerTeam(IsOnePerTeam) {} }; /// The map of local variables to their addresses in the global memory. using DeclToAddrMapTy = llvm::MapVector; @@ -460,29 +459,12 @@ EscapedParamsTy EscapedParameters; llvm::SmallVector EscapedVariableLengthDecls; llvm::SmallVector EscapedVariableLengthDeclsAddrs; - const RecordDecl *GlobalRecord = nullptr; - llvm::Optional SecondaryGlobalRecord = llvm::None; - llvm::Value *GlobalRecordAddr = nullptr; llvm::Value *IsInSPMDModeFlag = nullptr; std::unique_ptr MappedParams; }; /// Maps the function to the list of the globalized variables with their /// addresses. llvm::SmallDenseMap FunctionGlobalizedDecls; - /// List of records for the globalized variables in target/teams/distribute - /// contexts. Inner records are going to be joined into the single record, - /// while those resulting records are going to be joined into the single - /// union. This resulting union (one per CU) is the entry point for the static - /// memory management runtime functions. 
- struct GlobalPtrSizeRecsTy { - llvm::GlobalVariable *UseSharedMemory = nullptr; - llvm::GlobalVariable *RecSize = nullptr; - llvm::GlobalVariable *Buffer = nullptr; - SourceLocation Loc; - llvm::SmallVector Records; - unsigned RegionCounter = 0; - }; - llvm::SmallVector GlobalizedRecords; llvm::GlobalVariable *KernelTeamsReductionPtr = nullptr; /// List of the records with the list of fields for the reductions across the /// teams. Used to build the intermediate buffer for the fast teams diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -1096,17 +1096,6 @@ } Action(EST, WST); CodeGen.setAction(Action); IsInTTDRegion = true; - // Reserve place for the globalized memory. - GlobalizedRecords.emplace_back(); - if (!KernelStaticGlobalized) { - KernelStaticGlobalized = new llvm::GlobalVariable( - CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false, - llvm::GlobalValue::InternalLinkage, - llvm::UndefValue::get(CGM.VoidPtrTy), - "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr, - llvm::GlobalValue::NotThreadLocal, - CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared)); - } emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, CodeGen); IsInTTDRegion = false; @@ -1156,10 +1145,6 @@ CGM.getModule(), OMPRTL___kmpc_kernel_init), Args); - // For data sharing, we need to initialize the stack. - CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_data_sharing_init_stack)); - emitGenericVarsProlog(CGF, WST.Loc); } @@ -1228,17 +1213,6 @@ } Action(*this, EST, D); CodeGen.setAction(Action); IsInTTDRegion = true; - // Reserve place for the globalized memory. - GlobalizedRecords.emplace_back(); - if (!KernelStaticGlobalized) { - KernelStaticGlobalized = new llvm::GlobalVariable( - CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false, - llvm::GlobalValue::InternalLinkage, - llvm::UndefValue::get(CGM.VoidPtrTy), - "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr, - llvm::GlobalValue::NotThreadLocal, - CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared)); - } emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, CodeGen); IsInTTDRegion = false; @@ -1260,12 +1234,6 @@ CGM.getModule(), OMPRTL___kmpc_spmd_kernel_init), Args); - if (RequiresFullRuntime) { - // For data sharing, we need to initialize the stack. 
- CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_data_sharing_init_stack_spmd)); - } - CGF.EmitBranch(ExecuteBB); CGF.EmitBlock(ExecuteBB); @@ -1671,7 +1639,6 @@ static_cast(CGF.CGM.getOpenMPRuntime()); if (GlobalizedRD) { auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first; - I->getSecond().GlobalRecord = GlobalizedRD; I->getSecond().MappedParams = std::make_unique(); DeclToAddrMapTy &Data = I->getSecond().LocalVarData; @@ -1679,8 +1646,7 @@ assert(Pair.getFirst()->isCanonicalDecl() && "Expected canonical declaration"); Data.insert(std::make_pair(Pair.getFirst(), - MappedVarData(Pair.getSecond(), - /*IsOnePerTeam=*/true))); + MappedVarData(/*IsOnePerTeam=*/true))); } } Rt.emitGenericVarsProlog(CGF, Loc); @@ -1709,282 +1675,67 @@ const auto I = FunctionGlobalizedDecls.find(CGF.CurFn); if (I == FunctionGlobalizedDecls.end()) return; - if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) { - QualType GlobalRecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord); - QualType SecGlobalRecTy; - // Recover pointer to this function's global record. The runtime will - // handle the specifics of the allocation of the memory. - // Use actual memory size of the record including the padding - // for alignment purposes. - unsigned Alignment = - CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity(); - unsigned GlobalRecordSize = - CGM.getContext().getTypeSizeInChars(GlobalRecTy).getQuantity(); - GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment); + for (auto &Rec : I->getSecond().LocalVarData) { + const auto *VD = cast(Rec.first); + bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first); + QualType VarTy = VD->getType(); - llvm::PointerType *GlobalRecPtrTy = - CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo(); - llvm::Value *GlobalRecCastAddr; - llvm::Value *IsTTD = nullptr; - if (!IsInTTDRegion && - (WithSPMDCheck || - getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) { - llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit"); - llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd"); - llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd"); - if (I->getSecond().SecondaryGlobalRecord.hasValue()) { - llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); - llvm::Value *ThreadID = getThreadID(CGF, Loc); - llvm::Value *PL = CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), - OMPRTL___kmpc_parallel_level), - {RTLoc, ThreadID}); - IsTTD = Bld.CreateIsNull(PL); - } - llvm::Value *IsSPMD = Bld.CreateIsNotNull( - CGF.EmitNounwindRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_is_spmd_exec_mode))); - Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB); - // There is no need to emit line number for unconditional branch. - (void)ApplyDebugLocation::CreateEmpty(CGF); - CGF.EmitBlock(SPMDBB); - Address RecPtr = Address(llvm::ConstantPointerNull::get(GlobalRecPtrTy), - CharUnits::fromQuantity(Alignment)); - CGF.EmitBranch(ExitBB); - // There is no need to emit line number for unconditional branch. - (void)ApplyDebugLocation::CreateEmpty(CGF); - CGF.EmitBlock(NonSPMDBB); - llvm::Value *Size = llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize); - if (const RecordDecl *SecGlobalizedVarsRecord = - I->getSecond().SecondaryGlobalRecord.getValueOr(nullptr)) { - SecGlobalRecTy = - CGM.getContext().getRecordType(SecGlobalizedVarsRecord); - - // Recover pointer to this function's global record. 
The runtime will - // handle the specifics of the allocation of the memory. - // Use actual memory size of the record including the padding - // for alignment purposes. - unsigned Alignment = - CGM.getContext().getTypeAlignInChars(SecGlobalRecTy).getQuantity(); - unsigned GlobalRecordSize = - CGM.getContext().getTypeSizeInChars(SecGlobalRecTy).getQuantity(); - GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment); - Size = Bld.CreateSelect( - IsTTD, llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), Size); - } - // TODO: allow the usage of shared memory to be controlled by - // the user, for now, default to global. - llvm::Value *GlobalRecordSizeArg[] = { - Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)}; - llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_data_sharing_coalesced_push_stack), - GlobalRecordSizeArg); - GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( - GlobalRecValue, GlobalRecPtrTy); - CGF.EmitBlock(ExitBB); - auto *Phi = Bld.CreatePHI(GlobalRecPtrTy, - /*NumReservedValues=*/2, "_select_stack"); - Phi->addIncoming(RecPtr.getPointer(), SPMDBB); - Phi->addIncoming(GlobalRecCastAddr, NonSPMDBB); - GlobalRecCastAddr = Phi; - I->getSecond().GlobalRecordAddr = Phi; - I->getSecond().IsInSPMDModeFlag = IsSPMD; - } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) { - assert(GlobalizedRecords.back().Records.size() < 2 && - "Expected less than 2 globalized records: one for target and one " - "for teams."); - unsigned Offset = 0; - for (const RecordDecl *RD : GlobalizedRecords.back().Records) { - QualType RDTy = CGM.getContext().getRecordType(RD); - unsigned Alignment = - CGM.getContext().getTypeAlignInChars(RDTy).getQuantity(); - unsigned Size = CGM.getContext().getTypeSizeInChars(RDTy).getQuantity(); - Offset = - llvm::alignTo(llvm::alignTo(Offset, Alignment) + Size, Alignment); - } - unsigned Alignment = - CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity(); - Offset = llvm::alignTo(Offset, Alignment); - GlobalizedRecords.back().Records.push_back(GlobalizedVarsRecord); - ++GlobalizedRecords.back().RegionCounter; - if (GlobalizedRecords.back().Records.size() == 1) { - assert(KernelStaticGlobalized && - "Kernel static pointer must be initialized already."); - auto *UseSharedMemory = new llvm::GlobalVariable( - CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true, - llvm::GlobalValue::InternalLinkage, nullptr, - "_openmp_static_kernel$is_shared"); - UseSharedMemory->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global); - QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth( - /*DestWidth=*/16, /*Signed=*/0); - llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar( - Address(UseSharedMemory, - CGM.getContext().getTypeAlignInChars(Int16Ty)), - /*Volatile=*/false, Int16Ty, Loc); - auto *StaticGlobalized = new llvm::GlobalVariable( - CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false, - llvm::GlobalValue::CommonLinkage, nullptr); - auto *RecSize = new llvm::GlobalVariable( - CGM.getModule(), CGM.SizeTy, /*isConstant=*/true, - llvm::GlobalValue::InternalLinkage, nullptr, - "_openmp_static_kernel$size"); - RecSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global); - llvm::Value *Ld = CGF.EmitLoadOfScalar( - Address(RecSize, CGM.getSizeAlign()), /*Volatile=*/false, - CGM.getContext().getSizeType(), Loc); - llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( - KernelStaticGlobalized, CGM.VoidPtrPtrTy); - llvm::Value *GlobalRecordSizeArg[] = 
{ - llvm::ConstantInt::get( - CGM.Int16Ty, - getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD ? 1 : 0), - StaticGlobalized, Ld, IsInSharedMemory, ResAddr}; - CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_get_team_static_memory), - GlobalRecordSizeArg); - GlobalizedRecords.back().Buffer = StaticGlobalized; - GlobalizedRecords.back().RecSize = RecSize; - GlobalizedRecords.back().UseSharedMemory = UseSharedMemory; - GlobalizedRecords.back().Loc = Loc; - } - assert(KernelStaticGlobalized && "Global address must be set already."); - Address FrameAddr = CGF.EmitLoadOfPointer( - Address(KernelStaticGlobalized, CGM.getPointerAlign()), - CGM.getContext() - .getPointerType(CGM.getContext().VoidPtrTy) - .castAs()); - llvm::Value *GlobalRecValue = - Bld.CreateConstInBoundsGEP(FrameAddr, Offset).getPointer(); - I->getSecond().GlobalRecordAddr = GlobalRecValue; - I->getSecond().IsInSPMDModeFlag = nullptr; - GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( - GlobalRecValue, CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo()); - } else { - // TODO: allow the usage of shared memory to be controlled by - // the user, for now, default to global. - bool UseSharedMemory = - IsInTTDRegion && GlobalRecordSize <= SharedMemorySize; - llvm::Value *GlobalRecordSizeArg[] = { - llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), - CGF.Builder.getInt16(UseSharedMemory ? 1 : 0)}; - llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), - IsInTTDRegion ? OMPRTL___kmpc_data_sharing_push_stack - : OMPRTL___kmpc_data_sharing_coalesced_push_stack), - GlobalRecordSizeArg); - GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( - GlobalRecValue, GlobalRecPtrTy); - I->getSecond().GlobalRecordAddr = GlobalRecValue; - I->getSecond().IsInSPMDModeFlag = nullptr; + // Get the local allocation of a firstprivate variable before sharing + llvm::Value *ParValue; + if (EscapedParam) { + LValue ParLVal = + CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType()); + ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc); } - LValue Base = - CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, GlobalRecTy); + // Get the size needed in the stack. Logic of how much to allocate + // and which part to give to wich thread is inside the runtime function. + llvm::Value *Size = CGF.getTypeSize(VD->getType()); + llvm::Value *VoidPtr = + CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_alloc_shared), + {Size}); - // Emit the "global alloca" which is a GEP from the global declaration - // record using the pointer returned by the runtime. 
- LValue SecBase; - decltype(I->getSecond().LocalVarData)::const_iterator SecIt; - if (IsTTD) { - SecIt = I->getSecond().SecondaryLocalVarData->begin(); - llvm::PointerType *SecGlobalRecPtrTy = - CGF.ConvertTypeForMem(SecGlobalRecTy)->getPointerTo(); - SecBase = CGF.MakeNaturalAlignPointeeAddrLValue( - Bld.CreatePointerBitCastOrAddrSpaceCast( - I->getSecond().GlobalRecordAddr, SecGlobalRecPtrTy), - SecGlobalRecTy); - } - for (auto &Rec : I->getSecond().LocalVarData) { - bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first); - llvm::Value *ParValue; - if (EscapedParam) { - const auto *VD = cast(Rec.first); - LValue ParLVal = - CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType()); - ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc); - } - LValue VarAddr = CGF.EmitLValueForField(Base, Rec.second.FD); - // Emit VarAddr basing on lane-id if required. - QualType VarTy; - if (Rec.second.IsOnePerTeam) { - VarTy = Rec.second.FD->getType(); - } else { - Address Addr = VarAddr.getAddress(CGF); - llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP( - Addr.getElementType(), Addr.getPointer(), - {Bld.getInt32(0), getNVPTXLaneID(CGF)}); - VarTy = - Rec.second.FD->getType()->castAsArrayTypeUnsafe()->getElementType(); - VarAddr = CGF.MakeAddrLValue( - Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)), VarTy, - AlignmentSource::Decl); - } - Rec.second.PrivateAddr = VarAddr.getAddress(CGF); - if (!IsInTTDRegion && - (WithSPMDCheck || - getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) { - assert(I->getSecond().IsInSPMDModeFlag && - "Expected unknown execution mode or required SPMD check."); - if (IsTTD) { - assert(SecIt->second.IsOnePerTeam && - "Secondary glob data must be one per team."); - LValue SecVarAddr = CGF.EmitLValueForField(SecBase, SecIt->second.FD); - VarAddr.setAddress( - Address(Bld.CreateSelect(IsTTD, SecVarAddr.getPointer(CGF), - VarAddr.getPointer(CGF)), - VarAddr.getAlignment())); - Rec.second.PrivateAddr = VarAddr.getAddress(CGF); - } - Address GlobalPtr = Rec.second.PrivateAddr; - Address LocalAddr = CGF.CreateMemTemp(VarTy, Rec.second.FD->getName()); - Rec.second.PrivateAddr = Address( - Bld.CreateSelect(I->getSecond().IsInSPMDModeFlag, - LocalAddr.getPointer(), GlobalPtr.getPointer()), - LocalAddr.getAlignment()); - } - if (EscapedParam) { - const auto *VD = cast(Rec.first); - CGF.EmitStoreOfScalar(ParValue, VarAddr); - I->getSecond().MappedParams->setVarAddr(CGF, VD, - VarAddr.getAddress(CGF)); - } - if (IsTTD) - ++SecIt; + Rec.second.GlobalizedVal = VoidPtr; + + // Cast the void pointer and get the address of the globalized variable. + llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo(); + llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( + VoidPtr, VarPtrTy, VD->getName() + "_on_stack"); + LValue VarAddr = CGF.MakeNaturalAlignAddrLValue(CastedVoidPtr, VarTy); + Rec.second.PrivateAddr = VarAddr.getAddress(CGF); + + // Assign the local allocation to the newly globalized location. + if (EscapedParam) { + CGF.EmitStoreOfScalar(ParValue, VarAddr); + I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress(CGF)); } } - for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) { - // Recover pointer to this function's global record. The runtime will - // handle the specifics of the allocation of the memory. 
- // Use actual memory size of the record including the padding + for (const auto *VD : I->getSecond().EscapedVariableLengthDecls) { + // Use actual memory size of the VLA object including the padding // for alignment purposes. - CGBuilderTy &Bld = CGF.Builder; llvm::Value *Size = CGF.getTypeSize(VD->getType()); CharUnits Align = CGM.getContext().getDeclAlign(VD); Size = Bld.CreateNUWAdd( Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1)); llvm::Value *AlignVal = llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity()); + Size = Bld.CreateUDiv(Size, AlignVal); Size = Bld.CreateNUWMul(Size, AlignVal); - // TODO: allow the usage of shared memory to be controlled by - // the user, for now, default to global. - llvm::Value *GlobalRecordSizeArg[] = { - Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)}; - llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_data_sharing_coalesced_push_stack), - GlobalRecordSizeArg); - llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( - GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo()); - LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(), + + // Allocate space for this VLA object to be globalized + llvm::Value *VoidPtr = + CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_alloc_shared), + {Size}); + + I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(VoidPtr); + LValue Base = CGF.MakeAddrLValue(VoidPtr, VD->getType(), CGM.getContext().getDeclAlign(VD), AlignmentSource::Decl); I->getSecond().MappedParams->setVarAddr(CGF, cast(VD), Base.getAddress(CGF)); - I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue); } I->getSecond().MappedParams->apply(CGF); } @@ -1997,60 +1748,20 @@ const auto I = FunctionGlobalizedDecls.find(CGF.CurFn); if (I != FunctionGlobalizedDecls.end()) { - I->getSecond().MappedParams->restore(CGF); - if (!CGF.HaveInsertPoint()) - return; + // Deallocate the memory for each globalized VLA object for (llvm::Value *Addr : llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) { - CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack), - Addr); + CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_free_shared), + Addr); } - if (I->getSecond().GlobalRecordAddr) { - if (!IsInTTDRegion && - (WithSPMDCheck || - getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) { - CGBuilderTy &Bld = CGF.Builder; - llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit"); - llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd"); - Bld.CreateCondBr(I->getSecond().IsInSPMDModeFlag, ExitBB, NonSPMDBB); - // There is no need to emit line number for unconditional branch. - (void)ApplyDebugLocation::CreateEmpty(CGF); - CGF.EmitBlock(NonSPMDBB); - CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack), - CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr)); - CGF.EmitBlock(ExitBB); - } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) { - assert(GlobalizedRecords.back().RegionCounter > 0 && - "region counter must be > 0."); - --GlobalizedRecords.back().RegionCounter; - // Emit the restore function only in the target region. 
- if (GlobalizedRecords.back().RegionCounter == 0) { - QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth( - /*DestWidth=*/16, /*Signed=*/0); - llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar( - Address(GlobalizedRecords.back().UseSharedMemory, - CGM.getContext().getTypeAlignInChars(Int16Ty)), - /*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc); - llvm::Value *Args[] = { - llvm::ConstantInt::get( - CGM.Int16Ty, - getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD ? 1 : 0), - IsInSharedMemory}; - CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_restore_team_static_memory), - Args); - } - } else { - CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack), - I->getSecond().GlobalRecordAddr); - } + // Deallocate the memory for each globalized value + for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) { + I->getSecond().MappedParams->restore(CGF); + + CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_free_shared), + {Rec.second.GlobalizedVal}); } } } @@ -2093,6 +1804,14 @@ // Force inline this outlined function at its call site. Fn->setLinkage(llvm::GlobalValue::InternalLinkage); + // Ensure we do not inline the function. This is trivially true for the ones + // passed to __kmpc_fork_call but the ones calles in serialized regions + // could be inlined. This is not a perfect but it is closer to the invariant + // we want, namely, every data environment starts with a new function. + // TODO: We should pass the if condition to the runtime function and do the + // handling there. Much cleaner code. + cast(OutlinedFn)->addFnAttr(llvm::Attribute::NoInline); + Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, /*Name=*/".zero.addr"); CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0)); @@ -4208,15 +3927,6 @@ auto *Fn = llvm::Function::Create( CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule()); - - // Ensure we do not inline the function. This is trivially true for the ones - // passed to __kmpc_fork_call but the ones calles in serialized regions - // could be inlined. This is not a perfect but it is closer to the invariant - // we want, namely, every data environment starts with a new function. - // TODO: We should pass the if condition to the runtime function and do the - // handling there. Much cleaner code. 
- Fn->addFnAttr(llvm::Attribute::NoInline); - CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); Fn->setLinkage(llvm::GlobalValue::InternalLinkage); Fn->setDoesNotRecurse(); @@ -4334,6 +4044,7 @@ } if (!Body) return; + CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second); VarChecker.Visit(Body); const RecordDecl *GlobalizedVarsRecord = @@ -4347,7 +4058,6 @@ auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first; I->getSecond().MappedParams = std::make_unique(); - I->getSecond().GlobalRecord = GlobalizedVarsRecord; I->getSecond().EscapedParameters.insert( VarChecker.getEscapedParameters().begin(), VarChecker.getEscapedParameters().end()); @@ -4356,21 +4066,16 @@ DeclToAddrMapTy &Data = I->getSecond().LocalVarData; for (const ValueDecl *VD : VarChecker.getEscapedDecls()) { assert(VD->isCanonicalDecl() && "Expected canonical declaration"); - const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD); - Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion))); + Data.insert(std::make_pair(VD, MappedVarData(IsInTTDRegion))); } if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) { CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None); VarChecker.Visit(Body); - I->getSecond().SecondaryGlobalRecord = - VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true); I->getSecond().SecondaryLocalVarData.emplace(); DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue(); for (const ValueDecl *VD : VarChecker.getEscapedDecls()) { assert(VD->isCanonicalDecl() && "Expected canonical declaration"); - const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD); - Data.insert( - std::make_pair(VD, MappedVarData(FD, /*IsInTTDRegion=*/true))); + Data.insert(std::make_pair(VD, MappedVarData(/*IsInTTDRegion=*/true))); } } if (!NeedToDelayGlobalization) { @@ -4661,185 +4366,8 @@ CGOpenMPRuntime::processRequiresDirective(D); } -/// Get number of SMs and number of blocks per SM. 
-static std::pair getSMsBlocksPerSM(CodeGenModule &CGM) { - std::pair Data; - if (CGM.getLangOpts().OpenMPCUDANumSMs) - Data.first = CGM.getLangOpts().OpenMPCUDANumSMs; - if (CGM.getLangOpts().OpenMPCUDABlocksPerSM) - Data.second = CGM.getLangOpts().OpenMPCUDABlocksPerSM; - if (Data.first && Data.second) - return Data; - switch (getCudaArch(CGM)) { - case CudaArch::SM_20: - case CudaArch::SM_21: - case CudaArch::SM_30: - case CudaArch::SM_32: - case CudaArch::SM_35: - case CudaArch::SM_37: - case CudaArch::SM_50: - case CudaArch::SM_52: - case CudaArch::SM_53: - return {16, 16}; - case CudaArch::SM_60: - case CudaArch::SM_61: - case CudaArch::SM_62: - return {56, 32}; - case CudaArch::SM_70: - case CudaArch::SM_72: - case CudaArch::SM_75: - case CudaArch::SM_80: - case CudaArch::SM_86: - return {84, 32}; - case CudaArch::GFX600: - case CudaArch::GFX601: - case CudaArch::GFX602: - case CudaArch::GFX700: - case CudaArch::GFX701: - case CudaArch::GFX702: - case CudaArch::GFX703: - case CudaArch::GFX704: - case CudaArch::GFX705: - case CudaArch::GFX801: - case CudaArch::GFX802: - case CudaArch::GFX803: - case CudaArch::GFX805: - case CudaArch::GFX810: - case CudaArch::GFX900: - case CudaArch::GFX902: - case CudaArch::GFX904: - case CudaArch::GFX906: - case CudaArch::GFX908: - case CudaArch::GFX909: - case CudaArch::GFX90a: - case CudaArch::GFX90c: - case CudaArch::GFX1010: - case CudaArch::GFX1011: - case CudaArch::GFX1012: - case CudaArch::GFX1030: - case CudaArch::GFX1031: - case CudaArch::GFX1032: - case CudaArch::GFX1033: - case CudaArch::UNUSED: - case CudaArch::UNKNOWN: - break; - case CudaArch::LAST: - llvm_unreachable("Unexpected Cuda arch."); - } - llvm_unreachable("Unexpected NVPTX target without ptx feature."); -} - void CGOpenMPRuntimeGPU::clear() { - if (!GlobalizedRecords.empty() && - !CGM.getLangOpts().OpenMPCUDATargetParallel) { - ASTContext &C = CGM.getContext(); - llvm::SmallVector GlobalRecs; - llvm::SmallVector SharedRecs; - RecordDecl *StaticRD = C.buildImplicitRecord( - "_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union); - StaticRD->startDefinition(); - RecordDecl *SharedStaticRD = C.buildImplicitRecord( - "_shared_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union); - SharedStaticRD->startDefinition(); - for (const GlobalPtrSizeRecsTy &Records : GlobalizedRecords) { - if (Records.Records.empty()) - continue; - unsigned Size = 0; - unsigned RecAlignment = 0; - for (const RecordDecl *RD : Records.Records) { - QualType RDTy = C.getRecordType(RD); - unsigned Alignment = C.getTypeAlignInChars(RDTy).getQuantity(); - RecAlignment = std::max(RecAlignment, Alignment); - unsigned RecSize = C.getTypeSizeInChars(RDTy).getQuantity(); - Size = - llvm::alignTo(llvm::alignTo(Size, Alignment) + RecSize, Alignment); - } - Size = llvm::alignTo(Size, RecAlignment); - llvm::APInt ArySize(/*numBits=*/64, Size); - QualType SubTy = C.getConstantArrayType( - C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0); - const bool UseSharedMemory = Size <= SharedMemorySize; - auto *Field = - FieldDecl::Create(C, UseSharedMemory ? 
SharedStaticRD : StaticRD, - SourceLocation(), SourceLocation(), nullptr, SubTy, - C.getTrivialTypeSourceInfo(SubTy, SourceLocation()), - /*BW=*/nullptr, /*Mutable=*/false, - /*InitStyle=*/ICIS_NoInit); - Field->setAccess(AS_public); - if (UseSharedMemory) { - SharedStaticRD->addDecl(Field); - SharedRecs.push_back(&Records); - } else { - StaticRD->addDecl(Field); - GlobalRecs.push_back(&Records); - } - Records.RecSize->setInitializer(llvm::ConstantInt::get(CGM.SizeTy, Size)); - Records.UseSharedMemory->setInitializer( - llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0)); - } - // Allocate SharedMemorySize buffer for the shared memory. - // FIXME: nvlink does not handle weak linkage correctly (object with the - // different size are reported as erroneous). - // Restore this code as sson as nvlink is fixed. - if (!SharedStaticRD->field_empty()) { - llvm::APInt ArySize(/*numBits=*/64, SharedMemorySize); - QualType SubTy = C.getConstantArrayType( - C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0); - auto *Field = FieldDecl::Create( - C, SharedStaticRD, SourceLocation(), SourceLocation(), nullptr, SubTy, - C.getTrivialTypeSourceInfo(SubTy, SourceLocation()), - /*BW=*/nullptr, /*Mutable=*/false, - /*InitStyle=*/ICIS_NoInit); - Field->setAccess(AS_public); - SharedStaticRD->addDecl(Field); - } - SharedStaticRD->completeDefinition(); - if (!SharedStaticRD->field_empty()) { - QualType StaticTy = C.getRecordType(SharedStaticRD); - llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy); - auto *GV = new llvm::GlobalVariable( - CGM.getModule(), LLVMStaticTy, - /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage, - llvm::UndefValue::get(LLVMStaticTy), - "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr, - llvm::GlobalValue::NotThreadLocal, - C.getTargetAddressSpace(LangAS::cuda_shared)); - auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( - GV, CGM.VoidPtrTy); - for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) { - Rec->Buffer->replaceAllUsesWith(Replacement); - Rec->Buffer->eraseFromParent(); - } - } - StaticRD->completeDefinition(); - if (!StaticRD->field_empty()) { - QualType StaticTy = C.getRecordType(StaticRD); - std::pair SMsBlockPerSM = getSMsBlocksPerSM(CGM); - llvm::APInt Size1(32, SMsBlockPerSM.second); - QualType Arr1Ty = - C.getConstantArrayType(StaticTy, Size1, nullptr, ArrayType::Normal, - /*IndexTypeQuals=*/0); - llvm::APInt Size2(32, SMsBlockPerSM.first); - QualType Arr2Ty = - C.getConstantArrayType(Arr1Ty, Size2, nullptr, ArrayType::Normal, - /*IndexTypeQuals=*/0); - llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty); - // FIXME: nvlink does not handle weak linkage correctly (object with the - // different size are reported as erroneous). - // Restore CommonLinkage as soon as nvlink is fixed. 
- auto *GV = new llvm::GlobalVariable( - CGM.getModule(), LLVMArr2Ty, - /*isConstant=*/false, llvm::GlobalValue::InternalLinkage, - llvm::Constant::getNullValue(LLVMArr2Ty), - "_openmp_static_glob_rd_$_"); - auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( - GV, CGM.VoidPtrTy); - for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) { - Rec->Buffer->replaceAllUsesWith(Replacement); - Rec->Buffer->eraseFromParent(); - } - } - } + if (!TeamsReductions.empty()) { ASTContext &C = CGM.getContext(); RecordDecl *StaticRD = C.buildImplicitRecord( diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -12604,6 +12604,10 @@ return; } + // If we there is no initializer requested we don't check for one. + if (Var->hasAttr()) + return; + if (!Var->isInvalidDecl() && RealDecl->hasAttr()) { if (Var->getStorageClass() == SC_Extern) { Diag(Var->getLocation(), diag::err_loader_uninitialized_extern_decl) diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.def b/llvm/include/llvm/Analysis/TargetLibraryInfo.def --- a/llvm/include/llvm/Analysis/TargetLibraryInfo.def +++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.def @@ -329,6 +329,12 @@ /// int __isoc99_sscanf(const char *s, const char *format, ...) TLI_DEFINE_ENUM_INTERNAL(dunder_isoc99_sscanf) TLI_DEFINE_STRING_INTERNAL("__isoc99_sscanf") +/// void __kmpc_alloc_shared(size_t nbyte); +TLI_DEFINE_ENUM_INTERNAL(__kmpc_alloc_shared) +TLI_DEFINE_STRING_INTERNAL("__kmpc_alloc_shared") +/// void __kmpc_free_shared(void *ptr); +TLI_DEFINE_ENUM_INTERNAL(__kmpc_free_shared) +TLI_DEFINE_STRING_INTERNAL("__kmpc_free_shared") /// double __log10_finite(double x); TLI_DEFINE_ENUM_INTERNAL(log10_finite) TLI_DEFINE_STRING_INTERNAL("__log10_finite") diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -425,12 +425,9 @@ GlobalListPtr, GlobalListPtr, GlobalListPtr, GlobalListPtr) __OMP_RTL(__kmpc_shuffle_int64, false, Int64, Int64, Int16, Int16) -__OMP_RTL(__kmpc_data_sharing_init_stack, false, Void, ) -__OMP_RTL(__kmpc_data_sharing_init_stack_spmd, false, Void, ) -__OMP_RTL(__kmpc_data_sharing_coalesced_push_stack, false, VoidPtr, SizeTy, Int16) -__OMP_RTL(__kmpc_data_sharing_push_stack, false, VoidPtr, SizeTy, Int16) -__OMP_RTL(__kmpc_data_sharing_pop_stack, false, Void, VoidPtr) +__OMP_RTL(__kmpc_alloc_shared, false, VoidPtr, SizeTy) +__OMP_RTL(__kmpc_free_shared, false, Void, VoidPtr) __OMP_RTL(__kmpc_begin_sharing_variables, false, Void, VoidPtrPtrPtr, SizeTy) __OMP_RTL(__kmpc_end_sharing_variables, false, Void, ) __OMP_RTL(__kmpc_get_shared_variables, false, Void, VoidPtrPtrPtr) @@ -526,6 +523,11 @@ EnumAttr(NoCapture)) : AttributeSet()) +__OMP_ATTRS_SET(NoCaptureAttrs, + OptimisticAttributes + ? 
AttributeSet(EnumAttr(NoCapture)) + : AttributeSet(EnumAttr(NoCapture))) + #if 0 __OMP_ATTRS_SET(WriteOnlyPtrAttrs, OptimisticAttributes @@ -841,38 +843,26 @@ __OMP_RTL_ATTRS(__kmpc_alloc, DefaultAttrs, ReturnPtrAttrs, ParamAttrs()) __OMP_RTL_ATTRS(__kmpc_free, AllocAttrs, AttributeSet(), ParamAttrs()) -__OMP_RTL_ATTRS(__kmpc_init_allocator, DefaultAttrs, ReturnPtrAttrs, - ParamAttrs()) -__OMP_RTL_ATTRS(__kmpc_destroy_allocator, AllocAttrs, AttributeSet(), - ParamAttrs()) +__OMP_RTL_ATTRS(__kmpc_init_allocator, DefaultAttrs, ReturnPtrAttrs, ParamAttrs()) +__OMP_RTL_ATTRS(__kmpc_destroy_allocator, AllocAttrs, AttributeSet(), ParamAttrs()) -__OMP_RTL_ATTRS(__kmpc_push_target_tripcount_mapper, SetterAttrs, - AttributeSet(), ParamAttrs()) +__OMP_RTL_ATTRS(__kmpc_push_target_tripcount_mapper, SetterAttrs, AttributeSet(), ParamAttrs()) __OMP_RTL_ATTRS(__tgt_target_mapper, ForkAttrs, AttributeSet(), ParamAttrs()) -__OMP_RTL_ATTRS(__tgt_target_nowait_mapper, ForkAttrs, AttributeSet(), - ParamAttrs()) -__OMP_RTL_ATTRS(__tgt_target_teams_mapper, ForkAttrs, AttributeSet(), - ParamAttrs()) -__OMP_RTL_ATTRS(__tgt_target_teams_nowait_mapper, ForkAttrs, AttributeSet(), - ParamAttrs()) -__OMP_RTL_ATTRS(__tgt_register_requires, ForkAttrs, AttributeSet(), - ParamAttrs()) -__OMP_RTL_ATTRS(__tgt_target_data_begin_mapper, ForkAttrs, AttributeSet(), - ParamAttrs()) +__OMP_RTL_ATTRS(__tgt_target_nowait_mapper, ForkAttrs, AttributeSet(), ParamAttrs()) +__OMP_RTL_ATTRS(__tgt_target_teams_mapper, ForkAttrs, AttributeSet(), ParamAttrs()) +__OMP_RTL_ATTRS(__tgt_target_teams_nowait_mapper, ForkAttrs, AttributeSet(), ParamAttrs()) +__OMP_RTL_ATTRS(__tgt_register_requires, ForkAttrs, AttributeSet(), ParamAttrs()) +__OMP_RTL_ATTRS(__tgt_target_data_begin_mapper, ForkAttrs, AttributeSet(), ParamAttrs()) __OMP_RTL_ATTRS(__tgt_target_data_begin_nowait_mapper, ForkAttrs, - AttributeSet(), ParamAttrs()) -__OMP_RTL_ATTRS(__tgt_target_data_end_mapper, ForkAttrs, AttributeSet(), - ParamAttrs()) + AttributeSet(), ParamAttrs()) +__OMP_RTL_ATTRS(__tgt_target_data_end_mapper, ForkAttrs, AttributeSet(), ParamAttrs()) __OMP_RTL_ATTRS(__tgt_target_data_end_nowait_mapper, ForkAttrs, - AttributeSet(), ParamAttrs()) -__OMP_RTL_ATTRS(__tgt_target_data_update_mapper, ForkAttrs, AttributeSet(), - ParamAttrs()) + AttributeSet(), ParamAttrs()) +__OMP_RTL_ATTRS(__tgt_target_data_update_mapper, ForkAttrs, AttributeSet(), ParamAttrs()) __OMP_RTL_ATTRS(__tgt_target_data_update_nowait_mapper, ForkAttrs, - AttributeSet(), ParamAttrs()) -__OMP_RTL_ATTRS(__tgt_mapper_num_components, ForkAttrs, AttributeSet(), - ParamAttrs()) -__OMP_RTL_ATTRS(__tgt_push_mapper_component, ForkAttrs, AttributeSet(), - ParamAttrs()) + AttributeSet(), ParamAttrs()) +__OMP_RTL_ATTRS(__tgt_mapper_num_components, ForkAttrs, AttributeSet(), ParamAttrs()) +__OMP_RTL_ATTRS(__tgt_push_mapper_component, ForkAttrs, AttributeSet(), ParamAttrs()) __OMP_RTL_ATTRS(__kmpc_task_allow_completion_event, DefaultAttrs, ReturnPtrAttrs, ParamAttrs(ReadOnlyPtrAttrs)) @@ -967,7 +957,6 @@ __OMP_PROC_BIND_KIND(master, 2) __OMP_PROC_BIND_KIND(close, 3) __OMP_PROC_BIND_KIND(spread, 4) -__OMP_PROC_BIND_KIND(primary, 5) __OMP_PROC_BIND_KIND(default, 6) __OMP_PROC_BIND_KIND(unknown, 7) @@ -1079,7 +1068,6 @@ __OMP_TRAIT_PROPERTY(user, condition, false) __OMP_TRAIT_PROPERTY(user, condition, unknown) -__OMP_TRAIT_SELECTOR_AND_PROPERTY(construct, dispatch) // Note that we put isa last so that the other conditions are checked first. // This allows us to issue warnings wrt. isa only if we match otherwise. 
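[Reviewer note, not part of the patch] The two hunks that follow (MemoryBuiltins.cpp and TargetLibraryInfo.cpp) teach the middle end to treat __kmpc_alloc_shared as a malloc-like allocation function and __kmpc_free_shared as its matching single-argument deallocation function. A minimal host-side sketch of the contract being assumed — illustrative stand-ins only, not the device runtime implementation, which is not shown in this diff:

  #include <cstddef>
  #include <cstdlib>

  // Malloc-like: returns a fresh allocation of NBytes bytes; matches the
  // OMPKinds.def prototype __kmpc_alloc_shared(SizeTy) returning VoidPtr.
  extern "C" void *__kmpc_alloc_shared(std::size_t NBytes) {
    return std::malloc(NBytes); // the real RTL hands out team/shared storage
  }

  // Free-like: releases exactly one prior __kmpc_alloc_shared allocation.
  extern "C" void __kmpc_free_shared(void *Ptr) { std::free(Ptr); }

With that registration in place, allocation-aware analyses (isAllocationFn/isLibFreeFunction and the Attributor-based optimizations later in this patch) can reason about these calls the same way they reason about malloc/free pairs.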
diff --git a/llvm/lib/Analysis/MemoryBuiltins.cpp b/llvm/lib/Analysis/MemoryBuiltins.cpp --- a/llvm/lib/Analysis/MemoryBuiltins.cpp +++ b/llvm/lib/Analysis/MemoryBuiltins.cpp @@ -71,46 +71,65 @@ // FIXME: certain users need more information. E.g., SimplifyLibCalls needs to // know which functions are nounwind, noalias, nocapture parameters, etc. static const std::pair AllocationFnData[] = { - {LibFunc_malloc, {MallocLike, 1, 0, -1}}, - {LibFunc_vec_malloc, {MallocLike, 1, 0, -1}}, - {LibFunc_valloc, {MallocLike, 1, 0, -1}}, - {LibFunc_Znwj, {OpNewLike, 1, 0, -1}}, // new(unsigned int) - {LibFunc_ZnwjRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new(unsigned int, nothrow) - {LibFunc_ZnwjSt11align_val_t, {OpNewLike, 2, 0, -1}}, // new(unsigned int, align_val_t) - {LibFunc_ZnwjSt11align_val_tRKSt9nothrow_t, // new(unsigned int, align_val_t, nothrow) - {MallocLike, 3, 0, -1}}, - {LibFunc_Znwm, {OpNewLike, 1, 0, -1}}, // new(unsigned long) - {LibFunc_ZnwmRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new(unsigned long, nothrow) - {LibFunc_ZnwmSt11align_val_t, {OpNewLike, 2, 0, -1}}, // new(unsigned long, align_val_t) - {LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t, // new(unsigned long, align_val_t, nothrow) - {MallocLike, 3, 0, -1}}, - {LibFunc_Znaj, {OpNewLike, 1, 0, -1}}, // new[](unsigned int) - {LibFunc_ZnajRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new[](unsigned int, nothrow) - {LibFunc_ZnajSt11align_val_t, {OpNewLike, 2, 0, -1}}, // new[](unsigned int, align_val_t) - {LibFunc_ZnajSt11align_val_tRKSt9nothrow_t, // new[](unsigned int, align_val_t, nothrow) - {MallocLike, 3, 0, -1}}, - {LibFunc_Znam, {OpNewLike, 1, 0, -1}}, // new[](unsigned long) - {LibFunc_ZnamRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new[](unsigned long, nothrow) - {LibFunc_ZnamSt11align_val_t, {OpNewLike, 2, 0, -1}}, // new[](unsigned long, align_val_t) - {LibFunc_ZnamSt11align_val_tRKSt9nothrow_t, // new[](unsigned long, align_val_t, nothrow) - {MallocLike, 3, 0, -1}}, - {LibFunc_msvc_new_int, {OpNewLike, 1, 0, -1}}, // new(unsigned int) - {LibFunc_msvc_new_int_nothrow, {MallocLike, 2, 0, -1}}, // new(unsigned int, nothrow) - {LibFunc_msvc_new_longlong, {OpNewLike, 1, 0, -1}}, // new(unsigned long long) - {LibFunc_msvc_new_longlong_nothrow, {MallocLike, 2, 0, -1}}, // new(unsigned long long, nothrow) - {LibFunc_msvc_new_array_int, {OpNewLike, 1, 0, -1}}, // new[](unsigned int) - {LibFunc_msvc_new_array_int_nothrow, {MallocLike, 2, 0, -1}}, // new[](unsigned int, nothrow) - {LibFunc_msvc_new_array_longlong, {OpNewLike, 1, 0, -1}}, // new[](unsigned long long) - {LibFunc_msvc_new_array_longlong_nothrow, {MallocLike, 2, 0, -1}}, // new[](unsigned long long, nothrow) - {LibFunc_aligned_alloc, {AlignedAllocLike, 2, 1, -1}}, - {LibFunc_calloc, {CallocLike, 2, 0, 1}}, - {LibFunc_vec_calloc, {CallocLike, 2, 0, 1}}, - {LibFunc_realloc, {ReallocLike, 2, 1, -1}}, - {LibFunc_vec_realloc, {ReallocLike, 2, 1, -1}}, - {LibFunc_reallocf, {ReallocLike, 2, 1, -1}}, - {LibFunc_strdup, {StrDupLike, 1, -1, -1}}, - {LibFunc_strndup, {StrDupLike, 2, 1, -1}} - // TODO: Handle "int posix_memalign(void **, size_t, size_t)" + {LibFunc_malloc, {MallocLike, 1, 0, -1}}, + {LibFunc_vec_malloc, {MallocLike, 1, 0, -1}}, + {LibFunc_valloc, {MallocLike, 1, 0, -1}}, + {LibFunc_Znwj, {OpNewLike, 1, 0, -1}}, // new(unsigned int) + {LibFunc_ZnwjRKSt9nothrow_t, + {MallocLike, 2, 0, -1}}, // new(unsigned int, nothrow) + {LibFunc_ZnwjSt11align_val_t, + {OpNewLike, 2, 0, -1}}, // new(unsigned int, align_val_t) + {LibFunc_ZnwjSt11align_val_tRKSt9nothrow_t, // 
new(unsigned int, + // align_val_t, nothrow) + {MallocLike, 3, 0, -1}}, + {LibFunc_Znwm, {OpNewLike, 1, 0, -1}}, // new(unsigned long) + {LibFunc_ZnwmRKSt9nothrow_t, + {MallocLike, 2, 0, -1}}, // new(unsigned long, nothrow) + {LibFunc_ZnwmSt11align_val_t, + {OpNewLike, 2, 0, -1}}, // new(unsigned long, align_val_t) + {LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t, // new(unsigned long, + // align_val_t, nothrow) + {MallocLike, 3, 0, -1}}, + {LibFunc_Znaj, {OpNewLike, 1, 0, -1}}, // new[](unsigned int) + {LibFunc_ZnajRKSt9nothrow_t, + {MallocLike, 2, 0, -1}}, // new[](unsigned int, nothrow) + {LibFunc_ZnajSt11align_val_t, + {OpNewLike, 2, 0, -1}}, // new[](unsigned int, align_val_t) + {LibFunc_ZnajSt11align_val_tRKSt9nothrow_t, // new[](unsigned int, + // align_val_t, nothrow) + {MallocLike, 3, 0, -1}}, + {LibFunc_Znam, {OpNewLike, 1, 0, -1}}, // new[](unsigned long) + {LibFunc_ZnamRKSt9nothrow_t, + {MallocLike, 2, 0, -1}}, // new[](unsigned long, nothrow) + {LibFunc_ZnamSt11align_val_t, + {OpNewLike, 2, 0, -1}}, // new[](unsigned long, align_val_t) + {LibFunc_ZnamSt11align_val_tRKSt9nothrow_t, // new[](unsigned long, + // align_val_t, nothrow) + {MallocLike, 3, 0, -1}}, + {LibFunc_msvc_new_int, {OpNewLike, 1, 0, -1}}, // new(unsigned int) + {LibFunc_msvc_new_int_nothrow, + {MallocLike, 2, 0, -1}}, // new(unsigned int, nothrow) + {LibFunc_msvc_new_longlong, + {OpNewLike, 1, 0, -1}}, // new(unsigned long long) + {LibFunc_msvc_new_longlong_nothrow, + {MallocLike, 2, 0, -1}}, // new(unsigned long long, nothrow) + {LibFunc_msvc_new_array_int, {OpNewLike, 1, 0, -1}}, // new[](unsigned int) + {LibFunc_msvc_new_array_int_nothrow, + {MallocLike, 2, 0, -1}}, // new[](unsigned int, nothrow) + {LibFunc_msvc_new_array_longlong, + {OpNewLike, 1, 0, -1}}, // new[](unsigned long long) + {LibFunc_msvc_new_array_longlong_nothrow, + {MallocLike, 2, 0, -1}}, // new[](unsigned long long, nothrow) + {LibFunc_aligned_alloc, {AlignedAllocLike, 2, 1, -1}}, + {LibFunc_calloc, {CallocLike, 2, 0, 1}}, + {LibFunc_vec_calloc, {CallocLike, 2, 0, 1}}, + {LibFunc_realloc, {ReallocLike, 2, 1, -1}}, + {LibFunc_vec_realloc, {ReallocLike, 2, 1, -1}}, + {LibFunc_reallocf, {ReallocLike, 2, 1, -1}}, + {LibFunc_strdup, {StrDupLike, 1, -1, -1}}, + {LibFunc_strndup, {StrDupLike, 2, 1, -1}}, + // TODO: Handle "int posix_memalign(void **, size_t, size_t)" + {LibFunc___kmpc_alloc_shared, {MallocLike, 1, 0, -1}}, }; static const Function *getCalledFunction(const Value *V, bool LookThroughBitCast, @@ -433,12 +452,13 @@ bool llvm::isLibFreeFunction(const Function *F, const LibFunc TLIFn) { unsigned ExpectedNumParams; if (TLIFn == LibFunc_free || - TLIFn == LibFunc_ZdlPv || // operator delete(void*) - TLIFn == LibFunc_ZdaPv || // operator delete[](void*) - TLIFn == LibFunc_msvc_delete_ptr32 || // operator delete(void*) - TLIFn == LibFunc_msvc_delete_ptr64 || // operator delete(void*) + TLIFn == LibFunc_ZdlPv || // operator delete(void*) + TLIFn == LibFunc_ZdaPv || // operator delete[](void*) + TLIFn == LibFunc_msvc_delete_ptr32 || // operator delete(void*) + TLIFn == LibFunc_msvc_delete_ptr64 || // operator delete(void*) TLIFn == LibFunc_msvc_delete_array_ptr32 || // operator delete[](void*) - TLIFn == LibFunc_msvc_delete_array_ptr64) // operator delete[](void*) + TLIFn == LibFunc_msvc_delete_array_ptr64 || // operator delete[](void*) + TLIFn == LibFunc___kmpc_free_shared) ExpectedNumParams = 1; else if (TLIFn == LibFunc_ZdlPvj || // delete(void*, uint) TLIFn == LibFunc_ZdlPvm || // delete(void*, ulong) diff --git 
a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp --- a/llvm/lib/Analysis/TargetLibraryInfo.cpp +++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp @@ -158,6 +158,8 @@ TLI.setUnavailable(LibFunc_memcpy); TLI.setUnavailable(LibFunc_memset); TLI.setUnavailable(LibFunc_memset_pattern16); + TLI.setAvailable(llvm::LibFunc___kmpc_alloc_shared); + TLI.setAvailable(llvm::LibFunc___kmpc_free_shared); return; } @@ -601,6 +603,8 @@ // TLI.setAvailable(llvm::LibFunc_memcpy); // TLI.setAvailable(llvm::LibFunc_memset); + TLI.setAvailable(llvm::LibFunc___kmpc_alloc_shared); + TLI.setAvailable(llvm::LibFunc___kmpc_free_shared); } else { TLI.setUnavailable(LibFunc_nvvm_reflect); } @@ -891,6 +895,8 @@ FTy.getParamType(2)->isPointerTy()); case LibFunc_system: return (NumParams == 1 && FTy.getParamType(0)->isPointerTy()); + case LibFunc___kmpc_alloc_shared: + case LibFunc___kmpc_free_shared: case LibFunc_malloc: case LibFunc_vec_malloc: return (NumParams == 1 && FTy.getReturnType()->isPointerTy()); diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -155,22 +155,27 @@ /// is set to false and the instruction is volatile, return nullptr. static const Value *getPointerOperand(const Instruction *I, bool AllowVolatile) { - if (!AllowVolatile && I->isVolatile()) - return nullptr; - if (auto *LI = dyn_cast(I)) { + if (!AllowVolatile && LI->isVolatile()) + return nullptr; return LI->getPointerOperand(); } if (auto *SI = dyn_cast(I)) { + if (!AllowVolatile && SI->isVolatile()) + return nullptr; return SI->getPointerOperand(); } if (auto *CXI = dyn_cast(I)) { + if (!AllowVolatile && CXI->isVolatile()) + return nullptr; return CXI->getPointerOperand(); } if (auto *RMWI = dyn_cast(I)) { + if (!AllowVolatile && RMWI->isVolatile()) + return nullptr; return RMWI->getPointerOperand(); } @@ -1282,7 +1287,11 @@ /// or monotonic ordering static bool isNonRelaxedAtomic(Instruction *I); - /// Helper function specific for intrinsics which are potentially volatile + /// Helper function used to determine whether an instruction is volatile. + static bool isVolatile(Instruction *I); + + /// Helper function uset to check if intrinsic is volatile (memcpy, memmove, + /// memset). static bool isNoSyncIntrinsic(Instruction *I); }; @@ -1290,15 +1299,6 @@ if (!I->isAtomic()) return false; - if (auto *FI = dyn_cast(I)) - // All legal orderings for fence are stronger than monotonic. - return FI->getSyncScopeID() != SyncScope::SingleThread; - else if (auto *AI = dyn_cast(I)) { - // Unordered is not a legal ordering for cmpxchg. - return (AI->getSuccessOrdering() != AtomicOrdering::Monotonic || - AI->getFailureOrdering() != AtomicOrdering::Monotonic); - } - AtomicOrdering Ordering; switch (I->getOpcode()) { case Instruction::AtomicRMW: @@ -1310,42 +1310,98 @@ case Instruction::Load: Ordering = cast(I)->getOrdering(); break; + case Instruction::Fence: { + auto *FI = cast(I); + if (FI->getSyncScopeID() == SyncScope::SingleThread) + return false; + Ordering = FI->getOrdering(); + break; + } + case Instruction::AtomicCmpXchg: { + AtomicOrdering Success = cast(I)->getSuccessOrdering(); + AtomicOrdering Failure = cast(I)->getFailureOrdering(); + // Only if both are relaxed, than it can be treated as relaxed. + // Otherwise it is non-relaxed. 
+ if (Success != AtomicOrdering::Unordered && + Success != AtomicOrdering::Monotonic) + return true; + if (Failure != AtomicOrdering::Unordered && + Failure != AtomicOrdering::Monotonic) + return true; + return false; + } default: llvm_unreachable( "New atomic operations need to be known in the attributor."); } - return (Ordering != AtomicOrdering::Unordered && - Ordering != AtomicOrdering::Monotonic); + // Relaxed. + if (Ordering == AtomicOrdering::Unordered || + Ordering == AtomicOrdering::Monotonic) + return false; + return true; } -/// Return true if this intrinsic is nosync. This is only used for intrinsics -/// which would be nosync except that they have a volatile flag. All other -/// intrinsics are simply annotated with the nosync attribute in Intrinsics.td. +/// Checks if an intrinsic is nosync. Currently only checks mem* intrinsics. +/// FIXME: We should ipmrove the handling of intrinsics. bool AANoSyncImpl::isNoSyncIntrinsic(Instruction *I) { - if (auto *MI = dyn_cast(I)) - return !MI->isVolatile(); + if (auto *II = dyn_cast(I)) { + switch (II->getIntrinsicID()) { + /// Element wise atomic memory intrinsics are can only be unordered, + /// therefore nosync. + case Intrinsic::memset_element_unordered_atomic: + case Intrinsic::memmove_element_unordered_atomic: + case Intrinsic::memcpy_element_unordered_atomic: + return true; + case Intrinsic::memset: + case Intrinsic::memmove: + case Intrinsic::memcpy: + if (!cast(II)->isVolatile()) + return true; + return false; + default: + return false; + } + } return false; } +bool AANoSyncImpl::isVolatile(Instruction *I) { + assert(!isa(I) && "Calls should not be checked here"); + + switch (I->getOpcode()) { + case Instruction::AtomicRMW: + return cast(I)->isVolatile(); + case Instruction::Store: + return cast(I)->isVolatile(); + case Instruction::Load: + return cast(I)->isVolatile(); + case Instruction::AtomicCmpXchg: + return cast(I)->isVolatile(); + default: + return false; + } +} + ChangeStatus AANoSyncImpl::updateImpl(Attributor &A) { auto CheckRWInstForNoSync = [&](Instruction &I) { /// We are looking for volatile instructions or Non-Relaxed atomics. + /// FIXME: We should improve the handling of intrinsics. + + if (isa(&I) && isNoSyncIntrinsic(&I)) + return true; if (const auto *CB = dyn_cast(&I)) { if (CB->hasFnAttr(Attribute::NoSync)) return true; - if (isNoSyncIntrinsic(&I)) - return true; - const auto &NoSyncAA = A.getAAFor( *this, IRPosition::callsite_function(*CB), DepClassTy::REQUIRED); return NoSyncAA.isAssumedNoSync(); } - if (!I.isVolatile() && !isNonRelaxedAtomic(&I)) + if (!isVolatile(&I) && !isNonRelaxedAtomic(&I)) return true; return false; @@ -1694,7 +1750,7 @@ AANonNull::initialize(A); - bool CanBeNull, CanBeFreed; + bool CanBeNull = true, CanBeFreed; if (V.getPointerDereferenceableBytes(A.getDataLayout(), CanBeNull, CanBeFreed)) { if (!CanBeNull) { @@ -3608,7 +3664,7 @@ // TODO: track globally. 
bool CanBeNull, CanBeFreed; DerefBytes = - Base->getPointerDereferenceableBytes(DL, CanBeNull, CanBeFreed); + Base->getPointerDereferenceableBytes(DL, CanBeNull, CanBeFreed); T.GlobalState.indicatePessimisticFixpoint(); } else { const DerefState &DS = AA.getState(); @@ -5109,6 +5165,7 @@ A.getInfoCache().getMustBeExecutedContextExplorer(); auto FreeCheck = [&](Instruction &I) { + return false; const auto &Frees = FreesForMalloc.lookup(&I); if (Frees.size() != 1) return false; diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -50,6 +50,17 @@ static cl::opt PrintOpenMPKernels("openmp-print-gpu-kernels", cl::init(false), cl::Hidden); +static cl::opt SharedMemoryLimit( + "openmp-shared-limit", cl::Optional, + cl::desc("Limits the total amount of shared memory used for optimizations"), + cl::init(-1)); + +static cl::opt + SharedMemoryThreshold("openmp-shared-threshold", cl::Optional, + cl::desc("Only place variables sizes larger than the " + "threshold in shared memory"), + cl::init(0)); + static cl::opt HideMemoryTransferLatency( "openmp-hide-memory-transfer-latency", cl::desc("[WIP] Tries to hide the latency of host to device memory" @@ -72,6 +83,9 @@ STATISTIC(NumOpenMPParallelRegionsMerged, "Number of OpenMP parallel regions merged"); +STATISTIC(NumBytesMovedToSharedMemory, + "Amount of memory pushed to shared memory"); + #if !defined(NDEBUG) static constexpr auto TAG = "[" DEBUG_TYPE "]"; #endif @@ -517,6 +531,8 @@ // Recollect uses, in case Attributor deleted any. OMPInfoCache.recollectUses(); + Changed |= removeGlobalization(); + Changed |= replaceGlobalization(); Changed |= deleteParallelRegions(); if (HideMemoryTransferLatency) Changed |= hideMemTransfersLatency(); @@ -982,6 +998,131 @@ return Changed; } + bool removeGlobalization() { + return false; + auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared]; + + auto RemoveAllocCalls = [&](Use &U, Function &F) { + auto &FreeCall = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared]; + CallBase *CB = OpenMPOpt::getCallIfRegularCall(U, &RFI); + if (!CB) + return false; + + IRPosition AllocPos = IRPosition::callsite_returned(*CB); + if (!A.lookupAAFor(AllocPos)->isKnownNoCapture()) + return false; + + LLVM_DEBUG(dbgs() << TAG << "Remove globalization call in " + << CB->getCaller()->getName() << "\n"); + + Value *AllocSize = CB->getArgOperand(0); + + for (auto *U : CB->users()) { + CallBase *FC = dyn_cast(U); + if (FC && FC->getCalledFunction() == FreeCall.Declaration) { + FC->eraseFromParent(); + break; + } + } + + const DataLayout &DL = M.getDataLayout(); + Type *Int8Ty = Type::getInt8Ty(M.getContext()); + AllocaInst *NewAlloca = + new AllocaInst(Int8Ty, DL.getAllocaAddrSpace(), AllocSize, + CB->getName(), &F.front().front()); + NewAlloca->setDebugLoc(CB->getDebugLoc()); + CB->replaceAllUsesWith(NewAlloca); + CB->eraseFromParent(); + + return false; + }; + RFI.foreachUse(SCC, RemoveAllocCalls); + + return false; + } + + /// Replace globalization calls in the device with shared memory. Variables + /// will not be placed in shared memory if their size is below the threshold, + /// or if it would exceed the limit. 
+ bool replaceGlobalization() { + auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared]; + bool Changed = false; + + auto ReplaceAllocCalls = [&](Use &U, Function &F) { + if (!isKernel(F)) + return false; + auto &FreeCall = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared]; + CallBase *CB = OpenMPOpt::getCallIfRegularCall(U, &RFI); + if (!CB) + return false; + + ConstantInt *AllocSize = dyn_cast(CB->getArgOperand(0)); + if (!AllocSize) + return false; + + if (AllocSize->getZExtValue() <= SharedMemoryThreshold) + return false; + + if (NumBytesMovedToSharedMemory + AllocSize->getZExtValue() > + SharedMemoryLimit) + return false; + + LLVM_DEBUG(dbgs() << TAG << "Replace globalization call in " + << CB->getCaller()->getName() << " with " + << AllocSize->getZExtValue() + << " bytes of shared memory\n"); + + // Remove the free call + for (auto *U : CB->users()) { + CallBase *FC = dyn_cast(U); + if (FC && FC->getCalledFunction() == FreeCall.Declaration) { + FC->eraseFromParent(); + break; + } + } + + // Create a new shared memory buffer of the same size as the allocation + // and replace all the uses of the original allocation with it. + Type *Int8Ty = Type::getInt8Ty(M.getContext()); + Type *Int8ArrTy = ArrayType::get(Int8Ty, AllocSize->getZExtValue()); + auto *SharedMem = new GlobalVariable( + M, Int8ArrTy, /* IsConstant */ false, GlobalValue::InternalLinkage, + UndefValue::get(Int8ArrTy), CB->getName(), nullptr, + GlobalValue::NotThreadLocal, 3); + SharedMem->setAlignment(Align(32)); + + auto *NullInt = Constant::getNullValue(Type::getInt64Ty(M.getContext())); + auto *GEPExpr = ConstantExpr::getGetElementPtr( + Int8ArrTy, SharedMem, SmallVector({NullInt, NullInt})); + + auto *NewBuffer = new AddrSpaceCastInst(GEPExpr, Int8Ty->getPointerTo(), + CB->getName() + "_shared", CB); + + NewBuffer->setDebugLoc(CB->getDebugLoc()); + CB->replaceAllUsesWith(NewBuffer); + CB->eraseFromParent(); + + auto Remark = [&](OptimizationRemark OR) { + return OR << "Replaced globalized variable with " + << ore::NV("SharedMemory", AllocSize->getZExtValue()) + << ((AllocSize->getZExtValue() != 1) ? " bytes " : " byte ") + << " of shared memory"; + }; + emitRemark(NewBuffer, "OpenMPReplaceGlobalization", + Remark); + + NumBytesMovedToSharedMemory += AllocSize->getZExtValue(); + Changed = true; + + return true; + }; + RFI.foreachUse(SCC, ReplaceAllocCalls); + + OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_free_shared); + + return Changed; + } + /// Try to delete parallel regions if possible. 
bool deleteParallelRegions() { const unsigned CallbackCalleeOperand = 2; @@ -1112,9 +1253,8 @@ } void analysisGlobalization() { - RuntimeFunction GlobalizationRuntimeIDs[] = { - OMPRTL___kmpc_data_sharing_coalesced_push_stack, - OMPRTL___kmpc_data_sharing_push_stack}; + RuntimeFunction GlobalizationRuntimeIDs[] = {OMPRTL___kmpc_alloc_shared, + OMPRTL___kmpc_free_shared}; for (const auto GlobalizationCallID : GlobalizationRuntimeIDs) { auto &RFI = OMPInfoCache.RFIs[GlobalizationCallID]; @@ -1602,6 +1742,18 @@ GetterRFI.foreachUse(SCC, CreateAA); } + + auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared]; + auto CreateAA = [&](Use &U, Function &Decl) { + auto *CB = OpenMPOpt::getCallIfRegularCall(U, &RFI); + if (!CB) + return false; + + IRPosition CBPos = IRPosition::function(*CB->getFunction()); + A.getOrCreateAAFor(CBPos); + return false; + }; + RFI.foreachUse(SCC, CreateAA); } }; diff --git a/openmp/libomptarget/CMakeLists.txt b/openmp/libomptarget/CMakeLists.txt --- a/openmp/libomptarget/CMakeLists.txt +++ b/openmp/libomptarget/CMakeLists.txt @@ -76,7 +76,8 @@ # Build offloading plugins and device RTLs if they are available. add_subdirectory(plugins) -add_subdirectory(deviceRTLs) +# add_subdirectory(deviceRTLs) +add_subdirectory(DeviceRTL) # Add tests. add_subdirectory(test) diff --git a/openmp/libomptarget/DeviceRTL/CMakeLists.txt b/openmp/libomptarget/DeviceRTL/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/CMakeLists.txt @@ -0,0 +1,195 @@ +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build the Device RTL for all toolchains that are available +# +##===----------------------------------------------------------------------===## + +# TODO: copied from NVPTX, need to be generalized. + +# By default we will not build NVPTX deviceRTL on a CUDA free system +set(LIBOMPTARGET_BUILD_NVPTX_BCLIB FALSE CACHE BOOL + "Whether build NVPTX deviceRTL on CUDA free system.") + +if (NOT (LIBOMPTARGET_DEP_CUDA_FOUND OR LIBOMPTARGET_BUILD_NVPTX_BCLIB)) + libomptarget_say("Not building NVPTX deviceRTL by default on CUDA free system.") + return() +endif() + +# Check if we can create an LLVM bitcode implementation of the runtime library +# that could be inlined in the user application. For that we need to find +# a Clang compiler capable of compiling our CUDA files to LLVM bitcode and +# an LLVM linker. +set(LIBOMPTARGET_NVPTX_CUDA_COMPILER "" CACHE STRING + "Location of a CUDA compiler capable of emitting LLVM bitcode.") +set(LIBOMPTARGET_NVPTX_BC_LINKER "" CACHE STRING + "Location of a linker capable of linking LLVM bitcode objects.") + +if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER STREQUAL "") + set(cuda_compiler ${LIBOMPTARGET_NVPTX_CUDA_COMPILER}) +elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang") + set(cuda_compiler ${CMAKE_C_COMPILER}) +else() + libomptarget_say("Not building NVPTX deviceRTL: clang not found") + return() +endif() + +# Get compiler directory to try to locate a suitable linker. 
+get_filename_component(compiler_dir ${cuda_compiler} DIRECTORY) +set(llvm_link "${compiler_dir}/llvm-link") + +if (NOT LIBOMPTARGET_NVPTX_BC_LINKER STREQUAL "") + set(bc_linker ${LIBOMPTARGET_NVPTX_BC_LINKER}) +elseif (EXISTS ${llvm_link}) + set(bc_linker ${llvm_link}) +else() + libomptarget_say("Not building NVPTX deviceRTL: llvm-link not found") + return() +endif() + +# TODO: This part needs to be refined when libomptarget is going to support +# Windows! +# TODO: This part can also be removed if we can change the clang driver to make +# it support device only compilation. +if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64") + set(aux_triple x86_64-unknown-linux-gnu) +elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "ppc64le") + set(aux_triple powerpc64le-unknown-linux-gnu) +elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") + set(aux_triple aarch64-unknown-linux-gnu) +else() + libomptarget_say("Not building CUDA offloading device RTL: unknown host arch: ${CMAKE_HOST_SYSTEM_PROCESSOR}") + return() +endif() + +set(devicertl_base_directory ${CMAKE_CURRENT_SOURCE_DIR}) +set(include_directory ${devicertl_base_directory}/include) +set(source_directory ${devicertl_base_directory}/src) + +set(all_capabilities 35 37 50 52 53 60 61 62 70 72 75 80) + +set(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES ${all_capabilities} CACHE STRING + "List of CUDA Compute Capabilities to be used to compile the NVPTX device RTL.") +string(TOLOWER ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES} LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES) + +if (LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES STREQUAL "all") + set(nvptx_sm_list ${all_capabilities}) +elseif(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES STREQUAL "auto") + if (NOT LIBOMPTARGET_DEP_CUDA_FOUND) + libomptarget_error_say("[NVPTX] Cannot auto detect compute capability as CUDA not found.") + endif() + set(nvptx_sm_list ${LIBOMPTARGET_DEP_CUDA_ARCH}) +else() + string(REPLACE "," ";" nvptx_sm_list "${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES}") +endif() + +# If user set LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES to empty, we disable the +# build. +if (NOT nvptx_sm_list) + libomptarget_say("Not building CUDA offloading device RTL: empty compute capability list") + return() +endif() + +# Check all SM values +foreach(sm ${nvptx_sm_list}) + if (NOT ${sm} IN_LIST all_capabilities) + libomptarget_warning_say("[NVPTX] Compute capability ${sm} is not supported. Make sure clang can work with it.") + endif() +endforeach() + +# Override default MAX_SM in src/target_impl.h if requested +if (DEFINED LIBOMPTARGET_NVPTX_MAX_SM) + set(MAX_SM_DEFINITION "-DMAX_SM=${LIBOMPTARGET_NVPTX_MAX_SM}") +endif() + +# Activate RTL message dumps if requested by the user. +set(LIBOMPTARGET_DEVICE_DEBUG FALSE CACHE BOOL + "Activate NVPTX device RTL debug messages.") + +libomptarget_say("Building CUDA LLVM bitcode offloading device RTL.") + +set(src_files + ${source_directory}/Configuration.cpp + ${source_directory}/Debug.cpp + ${source_directory}/Kernel.cpp + ${source_directory}/Mapping.cpp + ${source_directory}/Misc.cpp + ${source_directory}/Parallelism.cpp + ${source_directory}/Reduction.cpp + ${source_directory}/State.cpp + ${source_directory}/Synchronization.cpp + ${source_directory}/Tasking.cpp + ${source_directory}/Utils.cpp + ${source_directory}/Workshare.cpp +) + +# Set flags for LLVM Bitcode compilation. 
+set(bc_flags -S -x c++ -O1 -std=c++17 + -target nvptx64 + -Xclang -emit-llvm-bc + -Xclang -aux-triple -Xclang ${aux_triple} + -fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device + -Xclang -target-feature -Xclang +ptx61 + -I${include_directory} +) + +if(${LIBOMPTARGET_DEVICE_DEBUG}) + list(APPEND bc_flags -DOMPTARGET_DEBUG=-1) +else() + list(APPEND bc_flags -DOMPTARGET_DEBUG=0) +endif() + +# Create target to build all Bitcode libraries. +add_custom_target(omptarget-nvptx-bc) + +# Generate a Bitcode library for all the compute capabilities the user requested +foreach(sm ${nvptx_sm_list}) + set(cuda_flags -Xclang -target-cpu -Xclang sm_${sm} "-D__CUDA_ARCH__=${sm}0") + set(bc_files "") + foreach(src ${src_files}) + get_filename_component(infile ${src} ABSOLUTE) + get_filename_component(outfile ${src} NAME) + set(outfile "${outfile}-sm_${sm}.bc") + + add_custom_command(OUTPUT ${outfile} + COMMAND ${cuda_compiler} ${bc_flags} + ${cuda_flags} ${MAX_SM_DEFINITION} ${infile} -o ${outfile} + DEPENDS ${infile} + IMPLICIT_DEPENDS CXX ${infile} + COMMENT "Building LLVM bitcode ${outfile}" + VERBATIM + ) + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}) + + list(APPEND bc_files ${outfile}) + endforeach() + + set(bclib_name "libomptarget-nvptx-sm_${sm}.bc") + + # Link to a bitcode library. + add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} + COMMAND ${bc_linker} + -o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} ${bc_files} + DEPENDS ${bc_files} + COMMENT "Linking LLVM bitcode ${bclib_name}" + ) + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name}) + + set(bclib_target_name "omptarget-nvptx-sm_${sm}-bc") + + add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}) + add_dependencies(omptarget-nvptx-bc ${bclib_target_name}) + + # Copy library to destination. + add_custom_command(TARGET ${bclib_target_name} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} + ${LIBOMPTARGET_LIBRARY_DIR}) + + # Install bitcode library under the lib destination folder. + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}") +endforeach() diff --git a/openmp/libomptarget/DeviceRTL/include/Configuration.h b/openmp/libomptarget/DeviceRTL/include/Configuration.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/Configuration.h @@ -0,0 +1,39 @@ +//===--- Configuration.h - OpenMP device configuration interface -- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// API to query the global (constant) device environment. +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_CONFIGURATION_H +#define OMPTARGET_CONFIGURATION_H + +#include "Types.h" + +namespace _OMP { +namespace config { + +enum DebugLevel : int32_t { Assertion }; + +/// Return the number of devices in the system, same number as returned on the +/// host by omp_get_num_devices. +int32_t getNumDevices(); + +/// Return the user choosen debug level. +int32_t getDebugLevel(); + +/// Return the amonut of shared memory scratchpad allocated to the main thread +/// in generic mode. Unused in SPMD-mode. 
+uint32_t getGenericModeMainThreadSharedMemoryStorage(); + +bool isDebugMode(DebugLevel Level); + +} // namespace config +} // namespace _OMP + +#endif diff --git a/openmp/libomptarget/DeviceRTL/include/Debug.h b/openmp/libomptarget/DeviceRTL/include/Debug.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/Debug.h @@ -0,0 +1,27 @@ +//===-------- Debug.h ---- Debug utilities ------------------------ C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_DEVICERTL_DEBUG_H +#define OMPTARGET_DEVICERTL_DEBUG_H + +/// Assertion +/// +/// { +extern "C" { +void __assert_assume(bool cond, const char *exp, const char *file, int line); +} + +#define ASSERT(e) __assert_assume(e, #e, __FILE__, __LINE__) +///} + +// TODO: Print + +#endif diff --git a/openmp/libomptarget/DeviceRTL/include/Interface.h b/openmp/libomptarget/DeviceRTL/include/Interface.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/Interface.h @@ -0,0 +1,350 @@ +//===-------- Interface.h - OpenMP interface ---------------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_DEVICERTL_INTERFACE_H +#define OMPTARGET_DEVICERTL_INTERFACE_H + +#include "Types.h" + +/// External API +/// +///{ + +extern "C" { + +/// ICV: dyn-var, constant 0 +/// +/// setter: ignored. +/// getter: returns 0. +/// +///{ +void omp_set_dynamic(int); +int omp_get_dynamic(void); +///} + +/// ICV: nthreads-var, integer +/// +/// scope: data environment +/// +/// setter: ignored. +/// getter: returns false. +/// +/// implementation notes: +/// +/// +///{ +void omp_set_num_threads(int); +int omp_get_max_threads(void); +///} + +/// ICV: thread-limit-var, computed +/// +/// getter: returns thread limited defined during launch. +/// +///{ +int omp_get_thread_limit(void); +///} + +/// ICV: max-active-level-var, constant 1 +/// +/// setter: ignored. +/// getter: returns 1. +/// +///{ +void omp_set_max_active_levels(int); +int omp_get_max_active_levels(void); +///} + +/// ICV: places-partition-var +/// +/// +///{ +///} + +/// ICV: active-level-var, 0 or 1 +/// +/// getter: returns 0 or 1. +/// +///{ +int omp_get_active_level(void); +///} + +/// ICV: level-var +/// +/// getter: returns parallel region nesting +/// +///{ +int omp_get_level(void); +///} + +/// ICV: run-sched-var +/// +/// +///{ +void omp_set_schedule(omp_sched_t, int); +void omp_get_schedule(omp_sched_t *, int *); +///} + +/// TODO this is incomplete. 
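+
+/// For illustration, the constant ICVs documented above behave as follows on
+/// the device (a sketch of the documented semantics, not additional API):
+///
+///   omp_set_dynamic(1);                    // ignored
+///   int Dyn = omp_get_dynamic();           // always 0
+///   omp_set_max_active_levels(4);          // ignored
+///   int Max = omp_get_max_active_levels(); // always 1
+///
+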
+int omp_get_num_threads(void); +int omp_get_thread_num(void); +void omp_set_nested(int); + +int omp_get_nested(void); + +void omp_set_max_active_levels(int Level); + +int omp_get_max_active_levels(void); + +omp_proc_bind_t omp_get_proc_bind(void); + +int omp_get_num_places(void); + +int omp_get_place_num_procs(int place_num); + +void omp_get_place_proc_ids(int place_num, int *ids); + +int omp_get_place_num(void); + +int omp_get_partition_num_places(void); + +void omp_get_partition_place_nums(int *place_nums); + +int omp_get_cancellation(void); + +void omp_set_default_device(int deviceId); + +int omp_get_default_device(void); + +int omp_get_num_devices(void); + +int omp_get_num_teams(void); + +int omp_get_team_num(); + +int omp_get_initial_device(void); + +/// Synchronization +/// +///{ +void omp_init_lock(omp_lock_t *Lock); + +void omp_destroy_lock(omp_lock_t *Lock); + +void omp_set_lock(omp_lock_t *Lock); + +void omp_unset_lock(omp_lock_t *Lock); + +int omp_test_lock(omp_lock_t *Lock); +///} + +/// Tasking +/// +///{ +int omp_in_final(void); + +int omp_get_max_task_priority(void); +///} + +/// Misc +/// +///{ +double omp_get_wtick(void); + +double omp_get_wtime(void); +///} +} + +extern "C" { +/// Allocate \p Bytes in "shareable" memory and return the address. Needs to be +/// called balanced with __kmpc_free_shared like a stack (push/pop). Can be +/// called by any thread, allocation happens *per thread*. +void *__kmpc_alloc_shared(uint64_t Bytes); + +/// Deallocate \p Ptr. Needs to be called balanced with __kmpc_alloc_shared like +/// a stack (push/pop). Can be called by any thread. \p Ptr has to be the +/// allocated by __kmpc_alloc_shared by the same thread. +void __kmpc_free_shared(void *Ptr); + +/// Allocate sufficient space for \p NumArgs sequential `void*` and store the +/// allocation address in \p GlobalArgs. +/// +/// Called by the main thread prior to a parallel region. +/// +/// We also remember it in GlobalArgsPtr to ensure the worker threads and +/// deallocation function know the allocation address too. +void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t NumArgs); + +/// Deallocate the memory allocated by __kmpc_begin_sharing_variables. +/// +/// Called by the main thread after a parallel region. +/// +/// TODO: This should really take the address, or number of bytes at least. +void __kmpc_end_sharing_variables(); + +/// Store the allocation address obtained via __kmpc_begin_sharing_variables in +/// \p GlobalArgs. +/// +/// Called by the worker threads in the parallel region (function). 
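+///
+/// A minimal sketch of the protocol described above (illustrative only; X and
+/// Y are placeholder shared variables):
+///
+///   // Main thread, before entering the parallel region:
+///   void **Args;
+///   __kmpc_begin_sharing_variables(&Args, /* NumArgs */ 2);
+///   Args[0] = &X;
+///   Args[1] = &Y;
+///
+///   // Worker threads, inside the outlined parallel function:
+///   void **GlobalArgs;
+///   __kmpc_get_shared_variables(&GlobalArgs);
+///   int *XPtr = reinterpret_cast<int *>(GlobalArgs[0]);
+///
+///   // Main thread, after the parallel region:
+///   __kmpc_end_sharing_variables();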
+void __kmpc_get_shared_variables(void ***GlobalArgs); + +/// Kernel +/// +///{ + +void __kmpc_kernel_init(int, int16_t); + +void __kmpc_kernel_deinit(int16_t); + +void __kmpc_spmd_kernel_init(int, int16_t); + +void __kmpc_spmd_kernel_deinit_v2(int16_t); + +int8_t __kmpc_is_spmd_exec_mode(); +///} + +/// Reduction +/// +///{ +void __kmpc_nvptx_end_reduce(int32_t TId); + +void __kmpc_nvptx_end_reduce_nowait(int32_t TId); + +extern "C" int32_t __kmpc_nvptx_parallel_reduce_nowait_v2( + IdentTy *Loc, int32_t TId, int32_t num_vars, uint64_t reduce_size, + void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct); + +int32_t __kmpc_nvptx_teams_reduce_nowait_v2( + IdentTy *Loc, int32_t TId, void *GlobalBuffer, uint32_t num_of_records, + void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct, + ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct, + ListGlobalFnTy glredFct); +///} + +/// Synchronization +/// +///{ +void __kmpc_ordered(IdentTy *Loc, int32_t TId); + +void __kmpc_end_ordered(IdentTy *Loc, int32_t TId); + +int32_t __kmpc_cancel_barrier(IdentTy *Loc_ref, int32_t TId); + +void __kmpc_barrier(IdentTy *Loc_ref, int32_t TId); + +void __kmpc_barrier_simple_spmd(IdentTy *Loc_ref, int32_t TId); + +int32_t __kmpc_master(IdentTy *Loc, int32_t TId); + +void __kmpc_end_master(IdentTy *Loc, int32_t TId); + +int32_t __kmpc_single(IdentTy *Loc, int32_t TId); + +void __kmpc_end_single(IdentTy *Loc, int32_t TId); + +void __kmpc_flush(IdentTy *Loc); + +__kmpc_impl_lanemask_t __kmpc_warp_active_thread_mask(); + +void __kmpc_syncwarp(__kmpc_impl_lanemask_t Mask); + +void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name); + +void __kmpc_end_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name); +///} + +/// Parallelism +/// +///{ +/// TODO +void __kmpc_kernel_prepare_parallel(ParallelRegionFnTy WorkFn); + +/// TODO +bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn); + +/// TODO +void __kmpc_kernel_end_parallel(); + +/// TODO +void __kmpc_serialized_parallel(IdentTy *Loc, uint32_t); + +/// TODO +void __kmpc_end_serialized_parallel(IdentTy *Loc, uint32_t); + +/// TODO +void __kmpc_push_proc_bind(IdentTy *Loc, uint32_t TId, int ProcBind); + +/// TODO +void __kmpc_push_num_teams(IdentTy *Loc, int32_t TId, int32_t NumTeams, + int32_t ThreadLimit); + +/// TODO +uint16_t __kmpc_parallel_level(IdentTy *Loc, uint32_t); + +/// TODO +void __kmpc_push_num_threads(IdentTy *Loc, int32_t, int32_t NumThreads); +///} + +/// Tasking +/// +///{ +TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, uint32_t, int32_t, + uint32_t TaskSizeInclPrivateValues, + uint32_t SharedValuesSize, + TaskFnTy TaskFn); + +int32_t __kmpc_omp_task(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor); + +int32_t __kmpc_omp_task_with_deps(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor, int32_t, + void *, int32_t, void *); + +void __kmpc_omp_task_begin_if0(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor); + +void __kmpc_omp_task_complete_if0(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor); + +void __kmpc_omp_wait_deps(IdentTy *Loc, uint32_t TId, int32_t, void *, int32_t, + void *); + +void __kmpc_taskgroup(IdentTy *Loc, uint32_t TId); + +void __kmpc_end_taskgroup(IdentTy *Loc, uint32_t TId); + +int32_t __kmpc_omp_taskyield(IdentTy *Loc, uint32_t TId, int); + +int32_t __kmpc_omp_taskwait(IdentTy *Loc, uint32_t TId); + +void __kmpc_taskloop(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor, int, + uint64_t 
*LowerBound, uint64_t *UpperBound, int64_t, int, + int32_t, uint64_t, void *); +///} + +/// Misc +/// +///{ +int32_t __kmpc_cancellationpoint(IdentTy *Loc, int32_t TId, int32_t CancelVal); + +int32_t __kmpc_cancel(IdentTy *Loc, int32_t TId, int32_t CancelVal); +///} + +/// Shuffle +/// +///{ +int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size); +int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size); +///} +} + +#endif diff --git a/openmp/libomptarget/DeviceRTL/include/Mapping.h b/openmp/libomptarget/DeviceRTL/include/Mapping.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/Mapping.h @@ -0,0 +1,86 @@ +//===--------- Mapping.h - OpenMP device runtime mapping helpers -- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_MAPPING_H +#define OMPTARGET_MAPPING_H + +#include "Types.h" + +namespace _OMP { + +namespace mapping { + +#pragma omp declare target + +inline constexpr uint32_t MaxThreadsPerTeam = 1024; + +#pragma omp end declare target + +/// Initialize the mapping machinery. +void init(bool IsSPMD); + +/// Return true if the kernel is executed in SPMD mode. +bool isSPMDMode(); + +/// Return true if the kernel is executed in generic mode. +bool isGenericMode(); + +/// Return true if the executing thread is the main thread in generic mode. +bool isMainThreadInGenericMode(); + +/// Return true if the executing thread has the lowest Id of the active threads +/// in the warp. +bool isLeaderInWarp(); + +/// Return a mask describing all active threads in the warp. +LaneMaskTy activemask(); + +/// Return a mask describing all threads with a smaller Id in the warp. +LaneMaskTy lanemaskLT(); + +/// Return a mask describing all threads with a larget Id in the warp. +LaneMaskTy lanemaskGT(); + +/// Return the thread Id in the warp, in [0, getWarpSize()). +uint32_t getThreadIdInWarp(); + +/// Return the thread Id in the block, in [0, getBlockSize()). +uint32_t getThreadIdInBlock(); + +/// Return the warp id in the block. +uint32_t getWarpId(); + +/// Return the warp size, thus number of threads in the warp. +uint32_t getWarpSize(); + +/// Return the number of warps in the block. +uint32_t getNumberOfWarpsInBlock(); + +/// Return the block Id in the kernel, in [0, getKernelSize()). +uint32_t getBlockId(); + +/// Return the block size, thus number of threads in the block. +uint32_t getBlockSize(); + +/// Return the number of blocks in the kernel. +uint32_t getNumberOfBlocks(); + +/// Return the kernel size, thus number of threads in the kernel. +uint32_t getKernelSize(); + +/// Return the number of processing elements on the device. +uint32_t getNumberOfProcessorElements(); + +} // namespace mapping + +} // namespace _OMP + +#endif diff --git a/openmp/libomptarget/DeviceRTL/include/State.h b/openmp/libomptarget/DeviceRTL/include/State.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/State.h @@ -0,0 +1,173 @@ +//===-------- State.h - OpenMP State & ICV interface ------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_STATE_H +#define OMPTARGET_STATE_H + +#include "Types.h" + +namespace _OMP { + +namespace state { + +#pragma omp declare target + +// TODO: Expose this via CMAKE. +inline constexpr uint32_t SharedScratchpadSize = 2048; + +/// Initialize the state machinery. Must be called by all threads. +void init(bool IsSPMD); + +/// TODO +enum ValueKind { + VK_NThreads, + VK_Level, + VK_ActiveLevel, + VK_MaxActiveLevels, + VK_RunSched, + // --- + VK_RunSchedChunk, + VK_ParallelRegionFn, + VK_ParallelTeamSize, +}; + +/// TODO +void enterDataEnvironment(); + +/// TODO +void exitDataEnvironment(); + +/// TODO +struct DateEnvironmentRAII { + DateEnvironmentRAII() { enterDataEnvironment(); } + ~DateEnvironmentRAII() { exitDataEnvironment(); } +}; + +/// TODO +void resetStateForThread(); + +uint32_t &lookup32(ValueKind VK, bool IsReadonly); +void *&lookupPtr(ValueKind VK, bool IsReadonly); + +/// A mookup class without actual state used to provide +/// a nice interface to lookup and update ICV values +/// we can declare in global scope. +template struct Value { + operator Ty() { return lookup(/* IsReadonly */ true); } + + Value &operator=(const Ty &Other) { + set(Other); + return *this; + } + + Value &operator++() { + inc(1); + return *this; + } + + Value &operator--() { + inc(-1); + return *this; + } + +private: + Ty &lookup(bool IsReadonly) { return lookup32(Kind, IsReadonly); } + + Ty &inc(int UpdateVal) { + return (lookup(/* IsReadonly */ false) += UpdateVal); + } + + Ty &set(Ty UpdateVal) { return (lookup(/* IsReadonly */ false) = UpdateVal); } +}; + +/// A mookup class without actual state used to provide +/// a nice interface to lookup and update ICV values +/// we can declare in global scope. +template struct PtrValue { + operator Ty() { return lookup(/* IsReadonly */ true); } + + PtrValue &operator=(const Ty Other) { + set(Other); + return *this; + } + +private: + Ty &lookup(bool IsReadonly) { return lookupPtr(Kind, IsReadonly); } + + Ty &set(Ty UpdateVal) { return (lookup(/* IsReadonly */ false) = UpdateVal); } +}; + +/// TODO +inline state::Value RunSchedChunk; + +/// TODO +inline state::Value ParallelTeamSize; + +/// TODO +inline state::PtrValue + ParallelRegionFn; +#pragma omp end declare target + +void runAndCheckState(void(Func(void))); + +void assumeInitialState(bool IsSPMD); + +} // namespace state + +namespace icv { + +#pragma omp declare target + +/// TODO +inline state::Value NThreads; + +/// TODO +inline state::Value Level; + +/// The `active-level` describes which of the parallel level counted with the +/// `level-var` is active. There can only be one. +/// +/// active-level-var is 1, if ActiveLevelVar is not 0, otherweise it is 0. +inline state::Value ActiveLevel; + +/// TODO +inline state::Value MaxActiveLevels; + +/// TODO +inline state::Value RunSched; + +#pragma omp end declare target + +} // namespace icv + +namespace memory { + +/// Alloca \p Size bytes in shared memory, if possible, for \p Reason. +/// +/// Note: See the restrictions on __kmpc_alloc_shared for proper usage. +void *allocShared(uint64_t Size, const char *Reason); + +/// Free \p Ptr, alloated via allocShared, for \p Reason. +/// +/// Note: See the restrictions on __kmpc_free_shared for proper usage. 
+void freeShared(void *Ptr, const char *Reason); + +/// Alloca \p Size bytes in global memory, if possible, for \p Reason. +void *allocGlobal(uint64_t Size, const char *Reason); + +/// Free \p Ptr, alloated via allocGlobal, for \p Reason. +void freeGlobal(void *Ptr, const char *Reason); + +} // namespace memory + +} // namespace _OMP + +#endif diff --git a/openmp/libomptarget/DeviceRTL/include/Synchronization.h b/openmp/libomptarget/DeviceRTL/include/Synchronization.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/Synchronization.h @@ -0,0 +1,69 @@ +//===- Synchronization.h - OpenMP synchronization utilities ------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_DEVICERTL_SYNCHRONIZATION_H +#define OMPTARGET_DEVICERTL_SYNCHRONIZATION_H + +#include "Types.h" + +namespace _OMP { + +namespace synchronize { + +/// Initialize the synchronization machinery. Must be called by all threads. +void init(bool IsSPMD); + +/// Synchronize all threads in a warp identified by \p Mask. +void warp(LaneMaskTy Mask); + +/// Synchronize all threads in a block. +void threads(); + +} // namespace synchronize + +namespace fence { + +/// Memory fence with \p Ordering semantics for the team. +void team(int Ordering); + +/// Memory fence with \p Ordering semantics for the contention group. +void kernel(int Ordering); + +/// Memory fence with \p Ordering semantics for the system. +void system(int Ordering); + +} // namespace fence + +namespace atomic { + +/// Atomically read \p Addr with \p Ordering semantics. +uint32_t read(uint32_t *Addr, int Ordering); + +/// Atomically store \p V to \p Addr with \p Ordering semantics. +uint32_t store(uint32_t *Addr, uint32_t V, int Ordering); + +/// Atomically store \p V to \p Addr with \p Ordering semantics. +uint64_t store(uint64_t *Addr, uint64_t V, int Ordering); + +/// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics. +uint32_t inc(uint32_t *Addr, uint32_t V, int Ordering); + +/// Atomically add \p V to \p *Addr with \p Ordering semantics. +uint32_t add(uint32_t *Addr, uint32_t V, int Ordering); + +/// Atomically add \p V to \p *Addr with \p Ordering semantics. +uint64_t add(uint64_t *Addr, uint64_t V, int Ordering); + +} // namespace atomic + +} // namespace _OMP + +#endif diff --git a/openmp/libomptarget/DeviceRTL/include/Types.h b/openmp/libomptarget/DeviceRTL/include/Types.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/Types.h @@ -0,0 +1,200 @@ +//===---------- Types.h - OpenMP types ---------------------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_TYPES_H +#define OMPTARGET_TYPES_H + +/// Base type declarations for freestanding mode +/// +///{ +using int8_t = char; +using uint8_t = unsigned char; +using int16_t = short; +using uint16_t = unsigned short; +using int32_t = int; +using uint32_t = unsigned int; +using int64_t = long; +using uint64_t = unsigned long; + +static_assert(sizeof(int8_t) == 1, "type size mismatch"); +static_assert(sizeof(uint8_t) == 1, "type size mismatch"); +static_assert(sizeof(int16_t) == 2, "type size mismatch"); +static_assert(sizeof(uint16_t) == 2, "type size mismatch"); +static_assert(sizeof(int32_t) == 4, "type size mismatch"); +static_assert(sizeof(uint32_t) == 4, "type size mismatch"); +static_assert(sizeof(int64_t) == 8, "type size mismatch"); +static_assert(sizeof(uint64_t) == 8, "type size mismatch"); +///} + +enum omp_proc_bind_t { + omp_proc_bind_false = 0, + omp_proc_bind_true = 1, + omp_proc_bind_master = 2, + omp_proc_bind_close = 3, + omp_proc_bind_spread = 4 +}; + +enum omp_sched_t { + omp_sched_static = 1, /* chunkSize >0 */ + omp_sched_dynamic = 2, /* chunkSize >0 */ + omp_sched_guided = 3, /* chunkSize >0 */ + omp_sched_auto = 4, /* no chunkSize */ +}; + +enum kmp_sched_t { + kmp_sched_static_chunk = 33, + kmp_sched_static_nochunk = 34, + kmp_sched_dynamic = 35, + kmp_sched_guided = 36, + kmp_sched_runtime = 37, + kmp_sched_auto = 38, + + kmp_sched_static_balanced_chunk = 45, + + kmp_sched_static_ordered = 65, + kmp_sched_static_nochunk_ordered = 66, + kmp_sched_dynamic_ordered = 67, + kmp_sched_guided_ordered = 68, + kmp_sched_runtime_ordered = 69, + kmp_sched_auto_ordered = 70, + + kmp_sched_distr_static_chunk = 91, + kmp_sched_distr_static_nochunk = 92, + kmp_sched_distr_static_chunk_sched_static_chunkone = 93, + + kmp_sched_default = kmp_sched_static_nochunk, + kmp_sched_unordered_first = kmp_sched_static_chunk, + kmp_sched_unordered_last = kmp_sched_auto, + kmp_sched_ordered_first = kmp_sched_static_ordered, + kmp_sched_ordered_last = kmp_sched_auto_ordered, + kmp_sched_distribute_first = kmp_sched_distr_static_chunk, + kmp_sched_distribute_last = + kmp_sched_distr_static_chunk_sched_static_chunkone, + + /* Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers. + * Since we need to distinguish the three possible cases (no modifier, + * monotonic modifier, nonmonotonic modifier), we need separate bits for + * each modifier. The absence of monotonic does not imply nonmonotonic, + * especially since 4.5 says that the behaviour of the "no modifier" case + * is implementation defined in 4.5, but will become "nonmonotonic" in 5.0. + * + * Since we're passing a full 32 bit value, we can use a couple of high + * bits for these flags; out of paranoia we avoid the sign bit. + * + * These modifiers can be or-ed into non-static schedules by the compiler + * to pass the additional information. They will be stripped early in the + * processing in __kmp_dispatch_init when setting up schedules, so + * most of the code won't ever see schedules with these bits set. 
+ */ + kmp_sched_modifier_monotonic = (1 << 29), + /**< Set if the monotonic schedule modifier was present */ + kmp_sched_modifier_nonmonotonic = (1 << 30), +/**< Set if the nonmonotonic schedule modifier was present */ + +#define SCHEDULE_WITHOUT_MODIFIERS(s) \ + (enum kmp_sched_t)( \ + (s) & ~(kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) +#define SCHEDULE_HAS_MONOTONIC(s) (((s)&kmp_sched_modifier_monotonic) != 0) +#define SCHEDULE_HAS_NONMONOTONIC(s) \ + (((s)&kmp_sched_modifier_nonmonotonic) != 0) +#define SCHEDULE_HAS_NO_MODIFIERS(s) \ + (((s) & (kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) == \ + 0) + +}; + +struct TaskDescriptorTy; +using TaskFnTy = int32_t (*)(int32_t global_tid, TaskDescriptorTy *taskDescr); +struct TaskDescriptorTy { + void *Payload; + TaskFnTy TaskFn; +}; + +#pragma omp begin declare variant match(device = {arch(amdgcn)}) +using LaneMaskTy = uint64_t; +#pragma omp end declare variant + +#pragma omp begin declare variant match( \ + device = {arch(amdgcn)}, implementation = {extension(match_none)}) +using LaneMaskTy = uint64_t; +#pragma omp end declare variant + +namespace lanes { +enum : LaneMaskTy { All = ~(LaneMaskTy)0 }; +} // namespace lanes + +/// The ident structure that describes a source location. The struct is +/// identical to the one in the kmp.h file. We maintain the same data structure +/// for compatibility. +struct IdentTy { + int32_t reserved_1; /**< might be used in Fortran; see above */ + int32_t flags; /**< also f.flags; KMP_IDENT_xxx flags; KMP_IDENT_KMPC + identifies this union member */ + int32_t reserved_2; /**< not really used in Fortran any more; see above */ + int32_t reserved_3; /**< source[4] in Fortran, do not use for C++ */ + char const *psource; /**< String describing the source location. + The string is composed of semi-colon separated fields + which describe the source file, the function and a pair + of line numbers that delimit the construct. */ +}; + +using __kmpc_impl_lanemask_t = LaneMaskTy; + +using ParallelRegionFnTy = void *; + +using CriticalNameTy = int32_t[8]; + +struct omp_lock_t { + void *Lock; +}; + +using InterWarpCopyFnTy = void (*)(void *src, int32_t warp_num); +using ShuffleReductFnTy = void (*)(void *rhsData, int16_t lane_id, + int16_t lane_offset, int16_t shortCircuit); +using ListGlobalFnTy = void (*)(void *buffer, int idx, void *reduce_data); + +/// Macros for allocating variables in different address spaces. +///{ + +// Follows the pattern in interface.h +typedef enum omp_allocator_handle_t { + omp_null_allocator = 0, + omp_default_mem_alloc = 1, + omp_large_cap_mem_alloc = 2, + omp_const_mem_alloc = 3, + omp_high_bw_mem_alloc = 4, + omp_low_lat_mem_alloc = 5, + omp_cgroup_mem_alloc = 6, + omp_pteam_mem_alloc = 7, + omp_thread_mem_alloc = 8, + KMP_ALLOCATOR_MAX_HANDLE = ~(0U) +} omp_allocator_handle_t; + +#define __PRAGMA(STR) _Pragma(#STR) +#define OMP_PRAGMA(STR) __PRAGMA(omp STR) + +#define SHARED(NAME) \ + NAME [[clang::loader_uninitialized]]; \ + OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc)) + +// TODO: clang should use address space 5 for omp_thread_mem_alloc, but right +// now that's not the case. +#define THREAD_LOCAL(NAME) \ + NAME [[clang::loader_uninitialized, clang::address_space(5)]] + +// TODO: clang should use address space 4 for omp_const_mem_alloc, maybe it +// does? 
+#define CONSTANT(NAME) \ + NAME [[clang::loader_uninitialized, clang::address_space(4)]] + +///} + +#endif diff --git a/openmp/libomptarget/DeviceRTL/include/Utils.h b/openmp/libomptarget/DeviceRTL/include/Utils.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/Utils.h @@ -0,0 +1,57 @@ +//===--------- Utils.h - OpenMP device runtime utility functions -- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_DEVICERTL_UTILS_H +#define OMPTARGET_DEVICERTL_UTILS_H + +#include "Types.h" + +namespace _OMP { +namespace utils { + +/// Return the value \p Var from thread Id \p SrcLane in the warp if the thread +/// is identified by \p Mask. +int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane); + +int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width); + +/// Return \p LowBits and \p HighBits packed into a single 64 bit value. +uint64_t pack(uint32_t LowBits, uint32_t HighBits); + +/// Unpack \p Val into \p LowBits and \p HighBits. +void unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits); + +/// Round up \p V to a \p Boundary. +template inline Ty roundUp(Ty V, Ty Boundary) { + return (V + Boundary - 1) / Boundary * Boundary; +} + +/// Advance \p Ptr by \p Bytes bytes. +template inline Ty1 *advance(Ty1 Ptr, Ty2 Bytes) { + return reinterpret_cast(reinterpret_cast(Ptr) + Bytes); +} + +/// Return the first bit set in \p V. +inline uint32_t ffs(uint32_t V) { return __builtin_ffs(V); } + +/// Return the first bit set in \p V. +inline uint32_t ffs(uint64_t V) { return __builtin_ffsl(V); } + +/// Return the number of bits set in \p V. +inline uint32_t popc(uint32_t V) { return __builtin_popcount(V); } + +/// Return the number of bits set in \p V. +inline uint32_t popc(uint64_t V) { return __builtin_popcountl(V); } + +} // namespace utils +} // namespace _OMP + +#endif diff --git a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp @@ -0,0 +1,50 @@ +//===- Configuration.cpp - OpenMP device configuration interface -- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the data object of the constant device environment and the +// query API. +// +//===----------------------------------------------------------------------===// + +#include "Configuration.h" +#include "State.h" +#include "Types.h" + +using namespace _OMP; + +struct DeviceEnvironmentTy { + int32_t DebugLevel; + // uint32_t GenericModeMainThreadSharedMemoryStorage; + // uint32_t NumDevices; +}; + +#pragma omp declare target + +// TOOD: We want to change the name as soon as the old runtime is gone. 
+DeviceEnvironmentTy CONSTANT(omptarget_device_environment) + __attribute__((used)); + +int32_t config::getDebugLevel() { + return omptarget_device_environment.DebugLevel; +} + +int32_t config::getNumDevices() { + // return omptarget_device_environment.NumDevices; + return 0; +} + +uint32_t config::getGenericModeMainThreadSharedMemoryStorage() { + // return DeviceEnvironment.GenericModeMainThreadSharedMemoryStorage; + return state::SharedScratchpadSize / 4; +} + +bool config::isDebugMode(config::DebugLevel Level) { + return config::getDebugLevel() >= Level; +} + +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Debug.cpp b/openmp/libomptarget/DeviceRTL/src/Debug.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Debug.cpp @@ -0,0 +1,31 @@ +//===--- Debug.cpp -------- Debug utilities ----------------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains debug utilities +// +//===----------------------------------------------------------------------===// + +#include "Configuration.h" + +#include + +#pragma omp declare target + +extern "C" { +void __assert_assume(bool cond, const char *exp, const char *file, int line) { + // TODO: Replace 0 with right mode + if (_OMP::config::isDebugMode(_OMP::config::DebugLevel::Assertion) && !cond) { + printf("Assertion failed: %s at %s, line %d\n", exp, file, line); + __builtin_trap(); + } + + __builtin_assume(cond); +} +} + +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp @@ -0,0 +1,60 @@ +//===--- Kernel.cpp - OpenMP device kernel interface -------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the kernel entry points for the device. +// +//===----------------------------------------------------------------------===// + +#include "Debug.h" +#include "Mapping.h" +#include "State.h" +#include "Synchronization.h" +#include "Types.h" + +using namespace _OMP; + +namespace { +void inititializeRuntime(bool IsSPMD) { + synchronize::init(IsSPMD); + mapping::init(IsSPMD); + state::init(IsSPMD); +} +} // namespace + +#pragma omp declare target + +extern "C" { +void __kmpc_kernel_init(int, int16_t) { + inititializeRuntime(/* IsSPMD */ false); + // No need to wait since only the main threads will execute user + // code and workers will run into a barrier right away. + + ASSERT(!mapping::isSPMDMode()); +} + +void __kmpc_kernel_deinit(int16_t) { + // Signal the workers to exit the state machine and exit the kernel. 
+  state::ParallelRegionFn = nullptr;
+}
+
+void __kmpc_spmd_kernel_init(int, int16_t) {
+  inititializeRuntime(/* IsSPMD */ true);
+
+  state::runAndCheckState(synchronize::threads);
+
+  ASSERT(mapping::isSPMDMode());
+}
+
+void __kmpc_spmd_kernel_deinit_v2(int16_t) {
+  state::assumeInitialState(/* IsSPMD */ true);
+}
+
+int8_t __kmpc_is_spmd_exec_mode() { return mapping::isSPMDMode(); }
+}
+
+#pragma omp end declare target
diff --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
@@ -0,0 +1,216 @@
+//===------- Mapping.cpp - OpenMP device runtime mapping helpers -- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mapping.h"
+
+#include "Utils.h"
+
+using namespace _OMP;
+
+namespace _OMP {
+namespace impl {
+
+#pragma omp declare target
+
+/// AMDGCN Implementation
+///
+///{
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+uint32_t getGridDim(uint32_t n, uint16_t d) {
+  uint32_t q = n / d;
+  return q + (n > q * d);
+}
+
+uint32_t getWorkgroupDim(uint32_t group_id, uint32_t grid_size,
+                         uint16_t group_size) {
+  uint32_t r = grid_size - group_id * group_size;
+  return (r < group_size) ? r : group_size;
+}
+
+LaneMaskTy activemask() { return __builtin_amdgcn_read_exec(); }
+
+LaneMaskTy lanemaskLT() {
+  uint32_t Lane = mapping::getThreadIdInWarp();
+  int64_t Ballot = mapping::activemask();
+  uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1;
+  return Mask & Ballot;
+}
+
+LaneMaskTy lanemaskGT() {
+  uint32_t Lane = mapping::getThreadIdInWarp();
+  if (Lane == (mapping::getWarpSize() - 1))
+    return 0;
+  int64_t Ballot = mapping::activemask();
+  uint64_t Mask = (~((uint64_t)0)) << (Lane + 1);
+  return Mask & Ballot;
+}
+
+uint32_t getThreadIdInWarp() {
+  return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
+}
+
+uint32_t getThreadIdInBlock() { return __builtin_amdgcn_workitem_id_x(); }
+
+uint32_t getBlockSize() {
+  return getWorkgroupDim(__builtin_amdgcn_workgroup_id_x(),
+                         __builtin_amdgcn_grid_size_x(),
+                         __builtin_amdgcn_workgroup_size_x());
+}
+
+uint32_t getKernelSize() { return __builtin_amdgcn_grid_size_x(); }
+
+uint32_t getBlockId() { return __builtin_amdgcn_workgroup_id_x(); }
+
+uint32_t getNumberOfBlocks() {
+  return getGridDim(__builtin_amdgcn_grid_size_x(),
+                    __builtin_amdgcn_workgroup_size_x());
+}
+
+uint32_t getNumberOfProcessorElements() {
+  // TODO
+  return mapping::getBlockSize();
+}
+
+uint32_t getWarpId() {
+  return mapping::getThreadIdInBlock() / mapping::getWarpSize();
+}
+
+uint32_t getWarpSize() { return 64; }
+
+uint32_t getNumberOfWarpsInBlock() {
+  return mapping::getBlockSize() / mapping::getWarpSize();
+}
+
+#pragma omp end declare variant
+///}
+
+/// NVPTX Implementation
+///
+///{
+#pragma omp begin declare variant match(                                      \
+    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
+
+LaneMaskTy activemask() {
+  unsigned int Mask;
+  asm("activemask.b32 %0;" : "=r"(Mask));
+  return Mask;
+}
+
+LaneMaskTy lanemaskLT() {
+  __kmpc_impl_lanemask_t Res;
+  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(Res));
+  return Res;
+}
+
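+// Illustrative note (sketch): these masks are what mapping::isLeaderInWarp()
+// further down in this file builds on; a lane is the warp leader iff no
+// active lane with a smaller id exists:
+//
+//   bool IsLeader = utils::popc(activemask() & lanemaskLT()) == 0;
+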
+LaneMaskTy lanemaskGT() { + __kmpc_impl_lanemask_t Res; + asm("mov.u32 %0, %%lanemask_gt;" : "=r"(Res)); + return Res; +} + +uint32_t getThreadIdInWarp() { + return mapping::getThreadIdInBlock() & (mapping::getWarpSize() - 1); +} + +uint32_t getThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); } + +uint32_t getBlockSize() { return __nvvm_read_ptx_sreg_ntid_x(); } + +uint32_t getKernelSize() { return __nvvm_read_ptx_sreg_nctaid_x(); } + +uint32_t getBlockId() { return __nvvm_read_ptx_sreg_ctaid_x(); } + +uint32_t getNumberOfBlocks() { return __nvvm_read_ptx_sreg_nctaid_x(); } + +uint32_t getNumberOfProcessorElements() { + // TODO + return mapping::getBlockSize(); +} + +uint32_t getWarpId() { + return mapping::getThreadIdInBlock() / mapping::getWarpSize(); +} + +uint32_t getWarpSize() { return 32; } + +uint32_t getNumberOfWarpsInBlock() { + return (mapping::getBlockSize() + mapping::getWarpSize() - 1) / + mapping::getWarpSize(); +} + +#pragma omp end declare variant +///} + +#pragma omp end declare target + +} // namespace impl +} // namespace _OMP + +#pragma omp declare target + +bool mapping::isMainThreadInGenericMode() { + if (mapping::isSPMDMode()) + return false; + + // Check if this is the last warp in the block. + return mapping::getWarpId() + 1 == mapping::getNumberOfWarpsInBlock(); +} + +bool mapping::isLeaderInWarp() { + __kmpc_impl_lanemask_t Active = mapping::activemask(); + __kmpc_impl_lanemask_t LaneMaskLT = mapping::lanemaskLT(); + return utils::popc(Active & LaneMaskLT) == 0; +} + +LaneMaskTy mapping::activemask() { return impl::activemask(); } + +LaneMaskTy mapping::lanemaskLT() { return impl::lanemaskLT(); } + +LaneMaskTy mapping::lanemaskGT() { return impl::lanemaskGT(); } + +uint32_t mapping::getThreadIdInWarp() { return impl::getThreadIdInWarp(); } + +uint32_t mapping::getThreadIdInBlock() { return impl::getThreadIdInBlock(); } + +uint32_t mapping::getBlockSize() { return impl::getBlockSize(); } + +uint32_t mapping::getKernelSize() { return impl::getKernelSize(); } + +uint32_t mapping::getBlockId() { return impl::getBlockId(); } + +uint32_t mapping::getNumberOfBlocks() { return impl::getNumberOfBlocks(); } + +uint32_t mapping::getNumberOfProcessorElements() { + return impl::getNumberOfProcessorElements(); +} + +uint32_t mapping::getWarpId() { return impl::getWarpId(); } + +uint32_t mapping::getWarpSize() { return impl::getWarpSize(); } + +uint32_t mapping::getNumberOfWarpsInBlock() { + return impl::getNumberOfWarpsInBlock(); +} + +/// Execution mode +/// +///{ +static int SHARED(IsSPMDMode); + +void mapping::init(bool IsSPMD) { IsSPMDMode = IsSPMD; } + +bool mapping::isSPMDMode() { return IsSPMDMode; } + +bool mapping::isGenericMode() { return !isSPMDMode(); } +///} + +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Misc.cpp b/openmp/libomptarget/DeviceRTL/src/Misc.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Misc.cpp @@ -0,0 +1,74 @@ +//===--------- Misc.cpp - OpenMP device misc interfaces ----------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#include "Types.h" + +namespace _OMP { +namespace impl { +#pragma omp declare target + +/// AMDGCN Implementation +/// +///{ +#pragma omp begin declare variant match(device = {arch(amdgcn)}) + +double getWTick() { return ((double)1E-9); } + +double getWTime() { + // The intrinsics for measuring time have undocumented frequency + // This will probably need to be found by measurement on a number of + // architectures. Until then, return 0, which is very inaccurate as a + // timer but resolves the undefined symbol at link time. + return 0; +} + +#pragma omp end declare variant + +/// NVPTX Implementation +/// +///{ +#pragma omp begin declare variant match( \ + device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)}) + +double getWTick() { + // Timer precision is 1ns + return ((double)1E-9); +} + +double getWTime() { + unsigned long long nsecs; + asm("mov.u64 %0, %%globaltimer;" : "=l"(nsecs)); + return (double)nsecs * getWTick(); +} + +#pragma omp end declare variant + +#pragma omp end declare target +} // namespace impl +} // namespace _OMP + +/// Interfaces +/// +///{ +#pragma omp declare target + +extern "C" { +int32_t __kmpc_cancellationpoint(IdentTy *, int32_t, int32_t) { return 0; } + +int32_t __kmpc_cancel(IdentTy *, int32_t, int32_t) { return 0; } + +double omp_get_wtick(void) { return _OMP::impl::getWTick(); } + +double omp_get_wtime(void) { return _OMP::impl::getWTime(); } +} + +#pragma omp end declare target +///} diff --git a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp @@ -0,0 +1,142 @@ +//===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Parallel implementation in the GPU. Here is the pattern: +// +// while (not finished) { +// +// if (master) { +// sequential code, decide which par loop to do, or if finished +// __kmpc_kernel_prepare_parallel() // exec by master only +// } +// syncthreads // A +// __kmpc_kernel_parallel() // exec by all +// if (this thread is included in the parallel) { +// switch () for all parallel loops +// __kmpc_kernel_end_parallel() // exec only by threads in parallel +// } +// +// +// The reason we don't exec end_parallel for the threads not included +// in the parallel loop is that for each barrier in the parallel +// region, these non-included threads will cycle through the +// syncthread A. Thus they must preserve their current threadId that +// is larger than thread in team. +// +// To make a long story short... 
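+//
+// For illustration, the worker side of this pattern is roughly (a sketch of
+// what the compiler emits, not verbatim):
+//
+//   ParallelRegionFnTy WorkFn = nullptr;
+//   for (;;) {
+//     __kmpc_barrier_simple_spmd(nullptr, 0);   // syncthreads A
+//     bool IsActive = __kmpc_kernel_parallel(&WorkFn);
+//     if (!WorkFn)                               // termination signal
+//       return;
+//     if (IsActive) {                            // included in the parallel
+//       ((void (*)())WorkFn)();                  // run the outlined region
+//       __kmpc_kernel_end_parallel();
+//     }
+//     __kmpc_barrier_simple_spmd(nullptr, 0);
+//   }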
+//
+//===----------------------------------------------------------------------===//
+
+#include "Interface.h"
+#include "Mapping.h"
+#include "State.h"
+#include "Types.h"
+
+using namespace _OMP;
+
+#pragma omp declare target
+
+namespace {
+
+uint32_t determineNumberOfThreads() {
+  uint32_t NThreadsICV = icv::NThreads;
+  uint32_t NumThreads = mapping::getBlockSize();
+
+  // In non-SPMD mode, we need to subtract the warp for the master thread.
+  if (!mapping::isSPMDMode())
+    NumThreads -= mapping::getWarpSize();
+
+  if (NThreadsICV != 0 && NThreadsICV < NumThreads)
+    NumThreads = NThreadsICV;
+
+  // Round down to a multiple of WARPSIZE since it is legal to do so in OpenMP.
+  if (NumThreads < mapping::getWarpSize())
+    NumThreads = 1;
+  else
+    NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));
+
+  return NumThreads;
+}
+
+} // namespace
+
+extern "C" {
+
+void __kmpc_kernel_prepare_parallel(ParallelRegionFnTy WorkFn) {
+
+  uint32_t NumThreads = determineNumberOfThreads();
+  state::ParallelTeamSize = NumThreads;
+  state::ParallelRegionFn = WorkFn;
+
+  // We do *not* create a new data environment because all threads in the team
+  // that are active are now running this parallel region. They share the
+  // TeamState, which has an increased level-var and potentially active-level
+  // set, but they do not have individual ThreadStates yet. If they ever
+  // modify the ICVs beyond this point a ThreadState will be allocated.
+  int NewLevel = ++icv::Level;
+  bool IsActiveParallelRegion = NumThreads > 1;
+  if (IsActiveParallelRegion)
+    icv::ActiveLevel = NewLevel;
+}
+
+bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
+  // Work function and arguments for L1 parallel region.
+  *WorkFn = state::ParallelRegionFn;
+
+  // If this is the termination signal from the master, quit early.
+  if (!*WorkFn) {
+    return false;
+  }
+
+  // Set to true for workers participating in the parallel region.
+  uint32_t TId = mapping::getThreadIdInBlock();
+  bool ThreadIsActive = TId < state::ParallelTeamSize;
+  return ThreadIsActive;
+}
+
+void __kmpc_kernel_end_parallel() {
+  // We did *not* create a new data environment because all threads in the team
+  // that were active were running the parallel region. We used the TeamState,
+  // which needs adjustment now.
+  // --icv::Level;
+  // bool IsActiveParallelRegion = state::ParallelTeamSize;
+  // if (IsActiveParallelRegion)
+  //   icv::ActiveLevel = 0;
+
+  // state::ParallelTeamSize = 1;
+
+  // In case we modified an ICV for this thread before a ThreadState was
+  // created, drop it now so it does not contaminate the next parallel region.
+ state::resetStateForThread(); +} + +void __kmpc_serialized_parallel(IdentTy *, uint32_t TId) { + state::enterDataEnvironment(); + ++icv::Level; +} + +void __kmpc_end_serialized_parallel(IdentTy *, uint32_t TId) { + state::exitDataEnvironment(); + --icv::Level; +} + +uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) { return omp_get_level(); } + +int32_t __kmpc_global_thread_num(IdentTy *) { return omp_get_thread_num(); } + +void __kmpc_push_num_threads(IdentTy *, int32_t, int32_t NumThreads) { + icv::NThreads = NumThreads; +} + +void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams, + int32_t thread_limit) {} + +void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {} +} + +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp @@ -0,0 +1,319 @@ +//===---- Reduction.cpp - OpenMP device reduction implementation - C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of reduction with KMPC interface. +// +//===----------------------------------------------------------------------===// + +#include "Debug.h" +#include "Interface.h" +#include "Mapping.h" +#include "State.h" +#include "Synchronization.h" +#include "Types.h" +#include "Utils.h" + +using namespace _OMP; + +namespace { + +#pragma omp declare target + +void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) { + for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) { + shflFct(reduce_data, /*LaneId - not used= */ 0, + /*Offset = */ mask, /*AlgoVersion=*/0); + } +} + +void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct, + uint32_t size, uint32_t tid) { + uint32_t curr_size; + uint32_t mask; + curr_size = size; + mask = curr_size / 2; + while (mask > 0) { + shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1); + curr_size = (curr_size + 1) / 2; + mask = curr_size / 2; + } +} + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 +static uint32_t gpu_irregular_simd_reduce(void *reduce_data, + ShuffleReductFnTy shflFct) { + uint32_t size, remote_id, physical_lane_id; + physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize(); + __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT(); + __kmpc_impl_lanemask_t Liveness = mapping::activemask(); + uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2; + __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT(); + do { + Liveness = mapping::activemask(); + remote_id = utils::ffs(Liveness & lanemask_gt); + size = utils::popc(Liveness); + logical_lane_id /= 2; + shflFct(reduce_data, /*LaneId =*/logical_lane_id, + /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2); + } while (logical_lane_id % 2 == 0 && size > 1); + return (logical_lane_id == 0); +} +#endif + +static int32_t nvptx_parallel_reduce_nowait(int32_t TId, int32_t num_vars, + uint64_t reduce_size, + void *reduce_data, + ShuffleReductFnTy shflFct, + InterWarpCopyFnTy cpyFct, + bool isSPMDExecutionMode, bool) { + uint32_t BlockThreadId = mapping::getThreadIdInBlock(); + uint32_t NumThreads = 
omp_get_num_threads(); + if (NumThreads == 1) + return 1; + /* + * This reduce function handles reduction within a team. It handles + * parallel regions in both L1 and L2 parallelism levels. It also + * supports Generic, SPMD, and NoOMP modes. + * + * 1. Reduce within a warp. + * 2. Warp master copies value to warp 0 via shared memory. + * 3. Warp 0 reduces to a single value. + * 4. The reduced value is available in the thread that returns 1. + */ + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + uint32_t WarpsNeeded = + (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize(); + uint32_t WarpId = mapping::getWarpId(); + + // Volta execution model: + // For the Generic execution mode a parallel region either has 1 thread and + // beyond that, always a multiple of 32. For the SPMD execution mode we may + // have any number of threads. + if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1)) + gpu_regular_warp_reduce(reduce_data, shflFct); + else if (NumThreads > 1) // Only SPMD execution mode comes thru this case. + gpu_irregular_warp_reduce(reduce_data, shflFct, + /*LaneCount=*/NumThreads % mapping::getWarpSize(), + /*LaneId=*/mapping::getThreadIdInBlock() % + mapping::getWarpSize()); + + // When we have more than [mapping::getWarpSize()] number of threads + // a block reduction is performed here. + // + // Only L1 parallel region can enter this if condition. + if (NumThreads > mapping::getWarpSize()) { + // Gather all the reduced values from each warp + // to the first warp. + cpyFct(reduce_data, WarpsNeeded); + + if (WarpId == 0) + gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, + BlockThreadId); + } + return BlockThreadId == 0; +#else + __kmpc_impl_lanemask_t Liveness = mapping::activemask(); + if (Liveness == lanes::All) // Full warp + gpu_regular_warp_reduce(reduce_data, shflFct); + else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes + gpu_irregular_warp_reduce(reduce_data, shflFct, + /*LaneCount=*/utils::popc(Liveness), + /*LaneId=*/mapping::getThreadIdInBlock() % + mapping::getWarpSize()); + else { // Dispersed lanes. Only threads in L2 + // parallel region may enter here; return + // early. + return gpu_irregular_simd_reduce(reduce_data, shflFct); + } + + // When we have more than [mapping::getWarpSize()] number of threads + // a block reduction is performed here. + // + // Only L1 parallel region can enter this if condition. + if (NumThreads > mapping::getWarpSize()) { + uint32_t WarpsNeeded = + (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize(); + // Gather all the reduced values from each warp + // to the first warp. + cpyFct(reduce_data, WarpsNeeded); + + uint32_t WarpId = BlockThreadId / mapping::getWarpSize(); + if (WarpId == 0) + gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, + BlockThreadId); + + return BlockThreadId == 0; + } + + // Get the OMP thread Id. This is different from BlockThreadId in the case of + // an L2 parallel region. + return TId == 0; +#endif // __CUDA_ARCH__ >= 700 +} + +uint32_t roundToWarpsize(uint32_t s) { + if (s < mapping::getWarpSize()) + return 1; + return (s & ~(unsigned)(mapping::getWarpSize() - 1)); +} + +uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? 
x : y; } + +static volatile uint32_t IterCnt = 0; +static volatile uint32_t Cnt = 0; + +#pragma omp end declare target + +} // namespace + +#pragma omp declare target + +extern "C" { +int32_t __kmpc_nvptx_parallel_reduce_nowait_v2( + IdentTy *Loc, int32_t TId, int32_t num_vars, uint64_t reduce_size, + void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct) { + return nvptx_parallel_reduce_nowait(TId, num_vars, reduce_size, reduce_data, + shflFct, cpyFct, mapping::isSPMDMode(), + false); +} + +int32_t __kmpc_nvptx_teams_reduce_nowait_v2( + IdentTy *Loc, int32_t TId, void *GlobalBuffer, uint32_t num_of_records, + void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct, + ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct, + ListGlobalFnTy glredFct) { + + // Terminate all threads in non-SPMD mode except for the master thread. + if (mapping::isGenericMode() && !mapping::isMainThreadInGenericMode()) + return 0; + + uint32_t ThreadId = omp_get_thread_num(); + + // In non-generic mode all workers participate in the teams reduction. + // In generic mode only the team master participates in the teams + // reduction because the workers are waiting for parallel work. + uint32_t NumThreads = omp_get_num_threads(); + uint32_t TeamId = omp_get_team_num(); + uint32_t NumTeams = omp_get_num_teams(); + static unsigned SHARED(Bound); + static unsigned SHARED(ChunkTeamCount); + + // Block progress for teams greater than the current upper + // limit. We always only allow a number of teams less or equal + // to the number of slots in the buffer. + // bool IsMaster = __kmpc_master(nullptr, TId); + bool IsMaster = (ThreadId == 0); + while (IsMaster) { + Bound = atomic::read((uint32_t *)&IterCnt, __ATOMIC_SEQ_CST); + if (TeamId < Bound + num_of_records) + break; + } + + if (IsMaster) { + int ModBockId = TeamId % num_of_records; + if (TeamId < num_of_records) { + lgcpyFct(GlobalBuffer, ModBockId, reduce_data); + } else + lgredFct(GlobalBuffer, ModBockId, reduce_data); + + fence::system(__ATOMIC_SEQ_CST); + + // Increment team counter. + // This counter is incremented by all teams in the current + // BUFFER_SIZE chunk. + ChunkTeamCount = + atomic::inc((uint32_t *)&Cnt, num_of_records - 1u, __ATOMIC_SEQ_CST); + } + // Synchronize + if (mapping::isSPMDMode()) + __kmpc_barrier(Loc, TId); + + // reduce_data is global or shared so before being reduced within the + // warp we need to bring it in local memory: + // local_reduce_data = reduce_data[i] + // + // Example for 3 reduction variables a, b, c (of potentially different + // types): + // + // buffer layout (struct of arrays): + // a, a, ..., a, b, b, ... b, c, c, ... c + // |__________| + // num_of_records + // + // local_data_reduce layout (struct): + // a, b, c + // + // Each thread will have a local struct containing the values to be + // reduced: + // 1. do reduction within each warp. + // 2. do reduction across warps. + // 3. write the final result to the main reduction variable + // by returning 1 in the thread holding the reduction result. + + // Check if this is the very last team. + unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records)); + if (ChunkTeamCount == NumTeams - Bound - 1) { + // + // Last team processing. + // + if (ThreadId >= NumRecs) + return 0; + NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs)); + if (ThreadId >= NumThreads) + return 0; + + // Load from buffer and reduce. 
+ glcpyFct(GlobalBuffer, ThreadId, reduce_data); + for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads) + glredFct(GlobalBuffer, i, reduce_data); + + // Reduce across warps to the warp master. + if (NumThreads > 1) { + gpu_regular_warp_reduce(reduce_data, shflFct); + + // When we have more than [mapping::getWarpSize()] number of threads + // a block reduction is performed here. + uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads); + if (ActiveThreads > mapping::getWarpSize()) { + uint32_t WarpsNeeded = (ActiveThreads + mapping::getWarpSize() - 1) / + mapping::getWarpSize(); + // Gather all the reduced values from each warp + // to the first warp. + cpyFct(reduce_data, WarpsNeeded); + + uint32_t WarpId = ThreadId / mapping::getWarpSize(); + if (WarpId == 0) + gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, + ThreadId); + } + } + + if (IsMaster) { + Cnt = 0; + IterCnt = 0; + return 1; + } + return 0; + } + if (IsMaster && ChunkTeamCount == num_of_records - 1) { + // Allow SIZE number of teams to proceed writing their + // intermediate results to the global buffer. + atomic::add((uint32_t *)&IterCnt, uint32_t(num_of_records), + __ATOMIC_SEQ_CST); + } + + return 0; +} + +void __kmpc_nvptx_end_reduce(int32_t TId) {} + +void __kmpc_nvptx_end_reduce_nowait(int32_t TId) {} +} + +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/State.cpp @@ -0,0 +1,714 @@ +//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "State.h" +#include "Configuration.h" +#include "Debug.h" +#include "Interface.h" +#include "Mapping.h" +#include "Synchronization.h" +#include "Types.h" +#include "Utils.h" + +using namespace _OMP; + +/// Memory implementation +/// +///{ + +namespace { + +#pragma omp declare target + +/// Fallback implementations are missing to trigger a link time error. +/// Implementations for new devices, including the host, should go into a +/// dedicated begin/end declare variant. +/// +///{ + +extern "C" { +void *malloc(uint64_t Size); +void free(void *Ptr); +} + +///} + +/// AMDGCN implementations of the shuffle sync idiom. +/// +///{ +#pragma omp begin declare variant match(device = {arch(amdgcn)}) + +extern "C" { +void *malloc(uint64_t Size) { + // TODO: Use some preallocated space for dynamic malloc. + return 0; +} + +void free(void *Ptr) {} +} + +#pragma omp end declare variant +///} + +/// Add worst-case padding so that future allocations are properly aligned. +constexpr const uint32_t Alignment = 8; + +/// We do expose a malloc/free interface but instead should take the number of +/// bytes also in the pop call. Until clang is updated we hide the allocation +/// size "under" the allocated pointer. +constexpr const uint32_t StorageTrackingBytes = 8; + +static_assert((StorageTrackingBytes % Alignment) == 0, + "Storage tracker should preserve alignment."); + +/// A "smart" stack in shared memory. +/// +/// The stack exposes a malloc/free interface but works like a stack internally. 
+/// In fact, it is a separate stack *per warp*. That means, each warp must push +/// and pop symmetrically or this breaks, badly. The implementation will (aim +/// to) detect non-lock-step warps and fallback to malloc/free. The same will +/// happen if a warp runs out of memory. The master warp in generic memory is +/// special and is given more memory than the rest. +/// +struct SharedMemorySmartStackTy { + /// The amount of memory reserved for the main thread in generic mode. This + /// is done because the main thread in generic mode has to share values with + /// the workers which requires usually more space than used by workers for + /// sharing among each other. + uint32_t GenericModeMainThreadStorage; + + /// The amount of memory for each warp. + uint32_t WarpStorageTotal; + + /// Initialize the stack. Must be called by all threads. + void init(bool IsSPMD); + + /// Allocate \p Bytes on the stack for the encountering thread. Each thread + /// can call this function. + void *push(uint64_t Bytes); + + /// Deallocate the last allocation made by the encountering thread and pointed + /// to by \p Ptr from the stack. Each thread can call this function. + void pop(void *Ptr); + +private: + /// Compute the size of the storage space reserved for the warp of the + /// encountering thread. + uint32_t computeWarpStorageTotal(); + + /// Return the bottom address of the warp data stack, that is the first + /// address this warp allocated memory at, or will. + char *getWarpDataBottom(); + + /// Same as \p getWarpDataBottom() but it can access other warp's bottom. No + /// lock is used so it's user's responsibility to make sure the correctness. + char *getWarpDataBottom(uint32_t WarpId); + + /// Return the top address of the warp data stack, that is the first address + /// this warp will allocate memory at next. + char *getWarpDataTop(); + + /// Return the location of the usage tracker which keeps track of the amount + /// of memory used by this warp. + /// TODO: We could use the next warp bottom to avoid this tracker completely. + uint32_t *getWarpStorageTracker(); + + /// Same as \p getWarpStorageTracker() but it can access other warp's bottom. + /// No lock is used so it's user's responsibility to make sure the + /// correctness. + uint32_t *getWarpStorageTracker(uint32_t WarpId); + + /// The actual storage, shared among all warps. + char Data[state::SharedScratchpadSize] __attribute__((aligned(Alignment))); +}; + +/// The allocation of a single shared memory scratchpad. +static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack); + +void SharedMemorySmartStackTy::init(bool IsSPMD) { + // Only leaders are needed to initialize the per-warp areas. + if (!mapping::isLeaderInWarp()) + return; + + GenericModeMainThreadStorage = + IsSPMD ? 0 : config::getGenericModeMainThreadSharedMemoryStorage(); + + WarpStorageTotal = computeWarpStorageTotal(); + + // Initialize the tracker to the bytes used by the storage tracker. + uint32_t *WarpStorageTracker = getWarpStorageTracker(); + *WarpStorageTracker = StorageTrackingBytes; + + static_assert( + StorageTrackingBytes >= (sizeof(*WarpStorageTracker)), + "Storage tracker bytes should cover the size of the storage tracker."); + + // Initialize the tracker for all warps if in non-spmd mode because this + // function will be only called by master thread. 
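+  // For illustration: with StorageTrackingBytes == 8, every warp region
+  // starts with an 8-byte counter that initially holds 8, i.e., only the
+  // tracker itself is accounted for; the first push() places its size header
+  // and data right behind it.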
+  if (!IsSPMD) {
+    for (unsigned I = 0; I < mapping::getNumberOfWarpsInBlock(); ++I) {
+      uint32_t *WarpStorageTracker = getWarpStorageTracker(I);
+      *WarpStorageTracker = StorageTrackingBytes;
+    }
+  }
+}
+
+void *SharedMemorySmartStackTy::push(uint64_t BytesPerLane) {
+  // First align the number of requested bytes.
+  BytesPerLane = (BytesPerLane + (Alignment - 1)) / Alignment * Alignment;
+
+  // The pointer we eventually return.
+  void *Ptr = nullptr;
+
+  LaneMaskTy Active = mapping::activemask();
+  uint32_t NumActive = utils::popc(Active);
+
+  // Only the leader allocates, the rest, if any, waits for the result at the
+  // shfl_sync below.
+  if (mapping::isLeaderInWarp()) {
+    uint32_t BytesTotal = BytesPerLane * NumActive;
+
+    // Ensure the warp is complete or it is the sole main thread in generic
+    // mode. In any other situation we might not be able to preserve balanced
+    // push/pop accesses even if the code looked that way because the hardware
+    // could independently schedule the warp parts. If an incomplete warp
+    // arrives here we fallback to the slow path, namely malloc.
+    if (NumActive < mapping::getWarpSize() &&
+        !mapping::isMainThreadInGenericMode()) {
+      Ptr = memory::allocGlobal(
+          BytesTotal, "Slow path shared memory allocation, incomplete warp!");
+    } else {
+      // If warp is "complete" determine if we have sufficient space.
+      uint32_t *WarpStorageTracker = getWarpStorageTracker();
+      uint32_t BytesTotalAdjusted = BytesTotal + StorageTrackingBytes;
+      uint32_t BytesTotalAdjustedAligned =
+          (BytesTotalAdjusted + (Alignment - 1)) / Alignment * Alignment;
+
+      uint32_t BytesInUse = *WarpStorageTracker;
+      if (BytesInUse + BytesTotalAdjustedAligned > WarpStorageTotal) {
+        Ptr = memory::allocGlobal(
+            BytesTotal,
+            "Slow path shared memory allocation, insufficient memory!");
+      } else {
+        // We have enough memory, put the new allocation on the top of the
+        // stack preceded by the size of the allocation.
+        Ptr = getWarpDataTop();
+        *WarpStorageTracker += BytesTotalAdjustedAligned;
+        *((uint64_t *)Ptr) = BytesTotalAdjustedAligned;
+        Ptr = ((char *)Ptr) + StorageTrackingBytes;
+      }
+    }
+  }
+
+  // Skip the shfl_sync if the thread is alone.
+  if (NumActive == 1)
+    return Ptr;
+
+  // Get the address of the allocation from the leader.
+  uint32_t Leader = utils::ffs(Active) - 1;
+  int *FP = reinterpret_cast<int *>(&Ptr);
+  FP[0] = utils::shuffle(Active, FP[0], Leader);
+  if (sizeof(Ptr) == 8)
+    FP[1] = utils::shuffle(Active, FP[1], Leader);
+
+  // Compute the thread position into the allocation, which we did for the
+  // entire warp.
+  LaneMaskTy LaneMaskLT = mapping::lanemaskLT();
+  uint32_t WarpPosition = utils::popc(Active & LaneMaskLT);
+  return reinterpret_cast<char *>(Ptr) + (BytesPerLane * WarpPosition);
+}
+
+void SharedMemorySmartStackTy::pop(void *Ptr) {
+  // Only the leader deallocates, the rest, if any, waits at the synchwarp
+  // below.
+  if (mapping::isLeaderInWarp()) {
+    // memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
+    // Check if the pointer is from a malloc or from within the stack.
+    if (Ptr < &Data[0] || Ptr > &Data[state::SharedScratchpadSize]) {
+      memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
+    } else {
+      // Lookup the allocation size "below" the allocation (=Ptr).
+      Ptr = reinterpret_cast<char *>(Ptr) - StorageTrackingBytes;
+      uint64_t BytesTotalAdjustedAligned = *reinterpret_cast<uint64_t *>(Ptr);
+
+      // Free the memory by adjusting the storage tracker accordingly.
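+      // For illustration (assuming a 32-lane warp and Alignment == 8): a
+      // push() of 16 bytes per lane consumed 16 * 32 + 8 (size header) = 520
+      // bytes of the warp region, so 520 is read back from the header here
+      // and subtracted from the tracker.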
+ uint32_t *WarpStorageTracker = getWarpStorageTracker(); + *WarpStorageTracker -= BytesTotalAdjustedAligned; + } + } + // Ensure the entire warp waits until the pop is done. + synchronize::warp(mapping::activemask()); +} + +uint32_t SharedMemorySmartStackTy::computeWarpStorageTotal() { + if (mapping::isMainThreadInGenericMode()) + return GenericModeMainThreadStorage; + + // In generic mode we reserve parts of the storage for the main thread. + uint32_t StorageTotal = state::SharedScratchpadSize; + if (mapping::isGenericMode()) + StorageTotal -= GenericModeMainThreadStorage; + + uint32_t NumWarps = mapping::getNumberOfWarpsInBlock(); + uint32_t WarpStorageTotal = StorageTotal / NumWarps; + + // Align the size + WarpStorageTotal = WarpStorageTotal / Alignment * Alignment; + + return WarpStorageTotal; +} + +char *SharedMemorySmartStackTy::getWarpDataBottom(uint32_t WarpId) { + if (mapping::isMainThreadInGenericMode()) + return &Data[0]; + + uint32_t PriorWarpStorageTotal = 0; + if (mapping::isGenericMode()) + PriorWarpStorageTotal += GenericModeMainThreadStorage; + + PriorWarpStorageTotal += WarpStorageTotal * WarpId; + + return &Data[PriorWarpStorageTotal]; +} + +char *SharedMemorySmartStackTy::getWarpDataBottom() { + return getWarpDataBottom(mapping::getWarpId()); +} + +char *SharedMemorySmartStackTy::getWarpDataTop() { + uint32_t *WarpStorageTracker = getWarpStorageTracker(); + return getWarpDataBottom() + (*WarpStorageTracker); +} + +uint32_t *SharedMemorySmartStackTy::getWarpStorageTracker(uint32_t WarpId) { + return ((uint32_t *)getWarpDataBottom(WarpId)); +} + +uint32_t *SharedMemorySmartStackTy::getWarpStorageTracker() { + return getWarpStorageTracker(mapping::getWarpId()); +} + +#pragma omp end declare target +} // namespace + +// TODO: Clang should accept namespaces inside the declare target range. +#pragma omp declare target + +void *memory::allocShared(uint64_t Bytes, const char *Reason) { + return SharedMemorySmartStack.push(Bytes); +} + +void memory::freeShared(void *Ptr, const char *Reason) { + SharedMemorySmartStack.pop(Ptr); +} + +void *memory::allocGlobal(uint64_t Bytes, const char *Reason) { + return malloc(Bytes); +} + +void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); } + +#pragma omp end declare target + +///} + +namespace { + +#pragma omp declare target + +struct ICVStateTy { + uint32_t NThreadsVar; + uint32_t LevelVar; + uint32_t ActiveLevelVar; + uint32_t MaxActiveLevelsVar; + uint32_t RunSchedVar; + uint32_t RunSchedChunkVar; + + bool operator==(const ICVStateTy &Other) const; + + void assertEqual(const ICVStateTy &Other) const; +}; + +bool ICVStateTy::operator==(const ICVStateTy &Other) const { + return (NThreadsVar == Other.NThreadsVar) && (LevelVar == Other.LevelVar) && + (ActiveLevelVar == Other.ActiveLevelVar) && + (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) && + (RunSchedVar == Other.RunSchedVar) && + (RunSchedChunkVar == Other.RunSchedChunkVar); +} + +void ICVStateTy::assertEqual(const ICVStateTy &Other) const { + ASSERT(*this == Other); +} + +struct TeamStateTy { + /// TODO: provide a proper init function. + void init(bool IsSPMD); + + bool operator==(const TeamStateTy &) const; + + void assertEqual(TeamStateTy &Other) const; + + /// ICVs + /// + /// Preallocated storage for ICV values that are used if the threads have not + /// set a custom default. The latter is supported but unlikely and slow(er). 
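+  /// For illustration: a worker that calls omp_set_num_threads(4) gets a
+  /// lazily allocated ThreadStateTy holding NThreadsVar = 4, while threads
+  /// that never modify an ICV keep reading the defaults stored here.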
+ /// + ///{ + ICVStateTy ICVState; + ///} + + uint32_t ParallelTeamSize; + ParallelRegionFnTy ParallelRegionFnVar; +}; + +TeamStateTy SHARED(TeamState); + +void TeamStateTy::init(bool IsSPMD) { + if (IsSPMD) { + ICVState.NThreadsVar = 1; + ICVState.LevelVar = 1; + ICVState.ActiveLevelVar = 1; + ICVState.MaxActiveLevelsVar = 1; + ICVState.RunSchedVar = omp_sched_static; + ICVState.RunSchedChunkVar = 1; + ParallelTeamSize = mapping::getBlockSize(); + } else { + // In non-SPMD mode, the last warp only contains one thread, the master + // thread. + ICVState.NThreadsVar = + (mapping::getNumberOfWarpsInBlock() - 1) * mapping::getWarpSize(); + ICVState.LevelVar = 0; + ICVState.ActiveLevelVar = 0; + ICVState.MaxActiveLevelsVar = 1; + ICVState.RunSchedVar = omp_sched_static; + ICVState.RunSchedChunkVar = 1; + ParallelTeamSize = 1; + } +} + +bool TeamStateTy::operator==(const TeamStateTy &Other) const { + return ICVState == Other.ICVState && + ParallelTeamSize == Other.ParallelTeamSize; +} + +void TeamStateTy::assertEqual(TeamStateTy &Other) const { + ASSERT(*this == Other); +} + +struct ThreadStateTy { + + /// ICVs have preallocated storage in the TeamStateTy which is used if a + /// thread has not set a custom value. The latter is supported but unlikely. + /// When it happens we will allocate dynamic memory to hold the values of all + /// ICVs. Thus, the first time an ICV is set by a thread we will allocate an + /// ICV struct to hold them all. This is slower than alternatives but allows + /// users to pay only for what they use. + /// + ICVStateTy ICVState; + + ThreadStateTy *PreviousThreadState; + + void init() { + ICVState = TeamState.ICVState; + PreviousThreadState = nullptr; + } + + void init(ThreadStateTy &PreviousTS) { + ICVState = PreviousTS.ICVState; + PreviousThreadState = &PreviousTS; + } +}; + +__attribute__((loader_uninitialized)) +ThreadStateTy *ThreadStates[mapping::MaxThreadsPerTeam]; +#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc) + +uint32_t &lookupForModify32Impl(uint32_t ICVStateTy::*Var) { + if (mapping::isMainThreadInGenericMode()) + return TeamState.ICVState.*Var; + uint32_t TId = mapping::getThreadIdInBlock(); + if (!ThreadStates[TId]) { + ThreadStates[TId] = reinterpret_cast(memory::allocGlobal( + sizeof(ThreadStateTy), "ICV modification outside data environment")); + ThreadStates[TId]->init(); + } + return ThreadStates[TId]->ICVState.*Var; +} + +uint32_t &lookup32Impl(uint32_t ICVStateTy::*Var) { + if (mapping::isMainThreadInGenericMode()) + return TeamState.ICVState.*Var; + uint32_t TId = mapping::getThreadIdInBlock(); + if (ThreadStates[TId]) + return ThreadStates[TId]->ICVState.*Var; + return TeamState.ICVState.*Var; +} +uint64_t &lookup64Impl(uint64_t ICVStateTy::*Var) { + if (mapping::isMainThreadInGenericMode()) + return TeamState.ICVState.*Var; + uint64_t TId = mapping::getThreadIdInBlock(); + if (ThreadStates[TId]) + return ThreadStates[TId]->ICVState.*Var; + return TeamState.ICVState.*Var; +} + +int returnValIfLevelIsActive(int Level, int Val, int DefaultVal, + int OutOfBoundsVal = -1) { + if (Level == 0) + return DefaultVal; + int LevelVar = omp_get_level(); + if (Level < 0 || Level > LevelVar) + return OutOfBoundsVal; + int ActiveLevel = icv::ActiveLevel; + if (Level != ActiveLevel) + return DefaultVal; + return Val; +} + +#pragma omp end declare target + +} // namespace + +#pragma omp declare target + +uint32_t &state::lookup32(ValueKind Kind, bool IsReadonly) { + switch (Kind) { + case state::VK_NThreads: + if (IsReadonly) + return 
lookup32Impl(&ICVStateTy::NThreadsVar); + return lookupForModify32Impl(&ICVStateTy::NThreadsVar); + case state::VK_Level: + if (IsReadonly) + return lookup32Impl(&ICVStateTy::LevelVar); + return lookupForModify32Impl(&ICVStateTy::LevelVar); + case state::VK_ActiveLevel: + if (IsReadonly) + return lookup32Impl(&ICVStateTy::ActiveLevelVar); + return lookupForModify32Impl(&ICVStateTy::ActiveLevelVar); + case state::VK_MaxActiveLevels: + if (IsReadonly) + return lookup32Impl(&ICVStateTy::MaxActiveLevelsVar); + return lookupForModify32Impl(&ICVStateTy::MaxActiveLevelsVar); + case state::VK_RunSched: + if (IsReadonly) + return lookup32Impl(&ICVStateTy::RunSchedVar); + return lookupForModify32Impl(&ICVStateTy::RunSchedVar); + case state::VK_RunSchedChunk: + if (IsReadonly) + return lookup32Impl(&ICVStateTy::RunSchedChunkVar); + return lookupForModify32Impl(&ICVStateTy::RunSchedChunkVar); + case state::VK_ParallelTeamSize: + return TeamState.ParallelTeamSize; + default: + break; + } + __builtin_unreachable(); +} + +void *&state::lookupPtr(ValueKind Kind, bool IsReadonly) { + switch (Kind) { + case state::VK_ParallelRegionFn: + return TeamState.ParallelRegionFnVar; + default: + break; + } + __builtin_unreachable(); +} + +void state::init(bool IsSPMD) { + SharedMemorySmartStack.init(IsSPMD); + TeamState.init(IsSPMD); + + if (IsSPMD) { + ThreadStates[mapping::getThreadIdInBlock()] = nullptr; + } else { + for (uint32_t i = 0; i < mapping::getBlockSize(); ++i) + ThreadStates[i] = nullptr; + } +} + +void state::enterDataEnvironment() { + unsigned TId = mapping::getThreadIdInBlock(); + + ThreadStateTy *NewThreadState = + static_cast(__kmpc_alloc_shared(sizeof(ThreadStateTy))); + NewThreadState->init(*ThreadStates[TId]); + ThreadStates[TId] = NewThreadState; +} + +void state::exitDataEnvironment() { + // assert(ThreadStates[TId] && "exptected thread state"); + resetStateForThread(); +} + +void state::resetStateForThread() { + unsigned TId = mapping::getThreadIdInBlock(); + if (!ThreadStates[TId]) + return; + + ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState; + __kmpc_free_shared(ThreadStates[TId]); + ThreadStates[TId] = PreviousThreadState; +} + +void state::runAndCheckState(void(Func(void))) { + TeamStateTy OldTeamState = TeamState; + + Func(); + + ASSERT(OldTeamState == TeamState); +} + +void state::assumeInitialState(bool IsSPMD) { + TeamStateTy InitialTeamState; + InitialTeamState.init(IsSPMD); + ASSERT(InitialTeamState == TeamState); + + if (IsSPMD) + ASSERT(ThreadStates[mapping::getThreadIdInBlock()] == nullptr); + else + for (unsigned I = 0; I < mapping::getBlockSize(); ++I) + ASSERT(ThreadStates[I] == nullptr); +} + +extern "C" { +void omp_set_dynamic(int V) {} + +int omp_get_dynamic(void) { return 0; } + +void omp_set_num_threads(int V) { icv::NThreads = V; } + +int omp_get_max_threads(void) { return icv::NThreads; } + +int omp_get_level(void) { + int LevelVar = icv::Level; + __builtin_assume(LevelVar >= 0); + return LevelVar; +} + +int omp_get_active_level(void) { return !!icv::ActiveLevel; } + +int omp_in_parallel(void) { return !!icv::ActiveLevel; } + +void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) { + *ScheduleKind = static_cast((int)icv::RunSched); + *ChunkSize = state::RunSchedChunk; +} + +void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) { + icv::RunSched = (int)ScheduleKind; + state::RunSchedChunk = ChunkSize; +} + +int omp_get_ancestor_thread_num(int Level) { + return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 
0); +} + +int omp_get_thread_num(void) { + return omp_get_ancestor_thread_num(omp_get_level()); +} + +int omp_get_team_size(int Level) { + return returnValIfLevelIsActive(Level, state::ParallelTeamSize, 1); +} + +int omp_get_num_threads(void) { return state::ParallelTeamSize; } + +int omp_get_thread_limit(void) { return mapping::getKernelSize(); } + +int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); } + +void omp_set_nested(int) {} + +int omp_get_nested(void) { return false; } + +void omp_set_max_active_levels(int Levels) { + icv::MaxActiveLevels = Levels > 0 ? 1 : 0; +} + +int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; } + +omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; } + +int omp_get_num_places(void) { return 0; } + +int omp_get_place_num_procs(int) { return omp_get_num_procs(); } + +void omp_get_place_proc_ids(int, int *) { + // TODO +} + +int omp_get_place_num(void) { return 0; } + +int omp_get_partition_num_places(void) { return 0; } + +void omp_get_partition_place_nums(int *) { + // TODO +} + +int omp_get_cancellation(void) { return 0; } + +void omp_set_default_device(int) {} + +int omp_get_default_device(void) { return -1; } + +int omp_get_num_devices(void) { return config::getNumDevices(); } + +int omp_get_num_teams(void) { return mapping::getNumberOfBlocks(); } + +int omp_get_team_num() { return mapping::getBlockId(); } + +int omp_get_initial_device(void) { return -1; } +} + +extern "C" { +// TODO: The noinline is a workaround until we run OpenMP opt before the +// inliner. +__attribute__((noinline)) void *__kmpc_alloc_shared(uint64_t Bytes) { + return memory::allocShared(Bytes, "Frontend alloc shared"); +} + +// TODO: The noinline is a workaround until we run OpenMP opt before the +// inliner. +__attribute__((noinline)) void __kmpc_free_shared(void *Ptr) { + memory::freeShared(Ptr, "Frontend free shared"); +} + +/// The shared variable used by the main thread to communicate with the workers. +/// It will contain the location of the memory allocated for the actually shared +/// values. +/// +/// Workaround until the interface is changed. +static void **SHARED(GlobalArgsPtr); + +void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t NumArgs) { + // TODO: To mimic the old behavior we allocate in `sizeof(void*)` chunks. We + // should pass the required bytes instead. + *GlobalArgs = GlobalArgsPtr = static_cast( + __kmpc_alloc_shared(NumArgs * sizeof(GlobalArgsPtr[0]))); +} + +void __kmpc_end_sharing_variables() { + --icv::Level; + bool IsActiveParallelRegion = state::ParallelTeamSize > 1; + if (IsActiveParallelRegion) + icv::ActiveLevel = 0; + + state::ParallelTeamSize = 1; + + __kmpc_free_shared(GlobalArgsPtr); +} + +void __kmpc_get_shared_variables(void ***GlobalArgs) { + *GlobalArgs = GlobalArgsPtr; +} +} +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp @@ -0,0 +1,315 @@ +//===- Synchronization.cpp - OpenMP Device synchronization API ---- c++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Include all synchronization. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "Synchronization.h"
+
+#include "Debug.h"
+#include "Interface.h"
+#include "Mapping.h"
+#include "State.h"
+#include "Types.h"
+#include "Utils.h"
+
+using namespace _OMP;
+
+namespace impl {
+#pragma omp declare target
+
+/// Atomics
+///
+///{
+/// NOTE: This function needs to be implemented by every target.
+uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering);
+
+uint32_t atomicRead(uint32_t *Address, int Ordering) {
+  return __atomic_fetch_add(Address, 0U, __ATOMIC_SEQ_CST);
+}
+
+uint32_t atomicAdd(uint32_t *Address, uint32_t Val, int Ordering) {
+  return __atomic_fetch_add(Address, Val, Ordering);
+}
+uint32_t atomicMax(uint32_t *Address, uint32_t Val, int Ordering) {
+  return __atomic_fetch_max(Address, Val, Ordering);
+}
+
+uint32_t atomicExchange(uint32_t *Address, uint32_t Val, int Ordering) {
+  uint32_t R;
+  __atomic_exchange(Address, &Val, &R, Ordering);
+  return R;
+}
+uint32_t atomicCAS(uint32_t *Address, uint32_t Compare, uint32_t Val,
+                   int Ordering) {
+  (void)__atomic_compare_exchange(Address, &Compare, &Val, false, Ordering,
+                                  Ordering);
+  return Compare;
+}
+
+uint64_t atomicAdd(uint64_t *Address, uint64_t Val, int Ordering) {
+  return __atomic_fetch_add(Address, Val, Ordering);
+}
+///}
+
+/// AMDGCN Implementation
+///
+///{
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering) {
+  return __builtin_amdgcn_atomic_inc32(Address, Val, Ordering, "");
+}
+
+uint32_t SHARED(namedBarrierTracker);
+
+void namedBarrierInit() {
+  // Don't have global ctors, and shared memory is not zero init
+  atomic::store(&namedBarrierTracker, 0u, __ATOMIC_RELEASE);
+}
+
+void namedBarrier() {
+  uint32_t NumThreads = omp_get_num_threads();
+  // assert(NumThreads % 32 == 0);
+
+  uint32_t WarpSize = mapping::getWarpSize();
+  uint32_t NumWaves = NumThreads / WarpSize;
+
+  fence::team(__ATOMIC_ACQUIRE);
+
+  // named barrier implementation for amdgcn.
+  // Uses two 16 bit unsigned counters. One for the number of waves to have
+  // reached the barrier, and one to count how many times the barrier has been
+  // passed. These are packed in a single atomically accessed 32 bit integer.
+  // Low bits for the number of waves, assumed zero before this call.
+  // High bits to count the number of times the barrier has been passed.
+
+  // precondition: NumWaves != 0;
+  // invariant: NumWaves * WarpSize == NumThreads;
+  // precondition: NumWaves < 0xffffu;
+
+  // Increment the low 16 bits once, using the lowest active thread.
+  if (mapping::isLeaderInWarp()) {
+    uint32_t load = atomic::add(&namedBarrierTracker, 1,
+                                __ATOMIC_RELAXED); // commutative
+
+    // Record the number of times the barrier has been passed
+    uint32_t generation = load & 0xffff0000u;
+
+    if ((load & 0x0000ffffu) == (NumWaves - 1)) {
+      // Reached NumWaves in low bits so this is the last wave.
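+      // For illustration, with NumWaves == 4: the first three leader lanes
+      // read back 0, 1 and 2 and spin below; the fourth reads 3 == NumWaves-1
+      // and takes this branch to bump the generation (high bits) and clear
+      // the wave count (low bits).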
+      // Set low bits to zero and increment high bits
+      load += 0x00010000u; // wrap is safe
+      load &= 0xffff0000u; // because bits zeroed second
+
+      // Reset the wave counter and release the waiting waves
+      atomic::store(&namedBarrierTracker, load, __ATOMIC_RELAXED);
+    } else {
+      // more waves still to go, spin until generation counter changes
+      do {
+        __builtin_amdgcn_s_sleep(0);
+        load = atomic::load(&namedBarrierTracker, __ATOMIC_RELAXED);
+      } while ((load & 0xffff0000u) == generation);
+    }
+  }
+  fence::team(__ATOMIC_RELEASE);
+}
+
+void syncWarp(__kmpc_impl_lanemask_t) {
+  // AMDGCN doesn't need to sync threads in a warp
+}
+
+void syncThreads() { __builtin_amdgcn_s_barrier(); }
+
+void fenceTeam(int Ordering) { __builtin_amdgcn_fence(Ordering, "workgroup"); }
+
+void fenceKernel(int Ordering) { __builtin_amdgcn_fence(Ordering, "agent"); }
+
+void fenceSystem(int Ordering) { __builtin_amdgcn_fence(Ordering, ""); }
+
+#pragma omp end declare variant
+///}
+
+/// NVPTX Implementation
+///
+///{
+#pragma omp begin declare variant match(                                      \
+    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
+
+uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering) {
+  return __nvvm_atom_inc_gen_ui(Address, Val);
+}
+
+void namedBarrierInit() {}
+
+void namedBarrier() {
+  uint32_t NumThreads = omp_get_num_threads();
+  ASSERT(NumThreads % 32 == 0);
+
+  // The named barrier for active parallel threads of a team in an L1 parallel
+  // region to synchronize with each other.
+  int BarrierNo = 7;
+  asm volatile("bar.sync %0, %1;"
+               :
+               : "r"(BarrierNo), "r"(NumThreads)
+               : "memory");
+}
+
+void fenceTeam(int) { __nvvm_membar_cta(); }
+
+void fenceKernel(int) { __nvvm_membar_gl(); }
+
+void fenceSystem(int) { __nvvm_membar_sys(); }
+
+void syncWarp(__kmpc_impl_lanemask_t Mask) { __nvvm_bar_warp_sync(Mask); }
+
+void syncThreads() { __syncthreads(); }
+
+constexpr uint32_t OMP_SPIN = 1000;
+constexpr uint32_t UNSET = 0;
+constexpr uint32_t SET = 1;
+
+// TODO: This seems to hide a bug in the declare variant handling. If it is
+// called before it is defined here the overload won't happen. Investigate
+// later!
+void unsetLock(omp_lock_t *Lock) {
+  (void)atomicExchange((uint32_t *)Lock, UNSET, __ATOMIC_SEQ_CST);
+}
+
+int testLock(omp_lock_t *Lock) {
+  return atomicAdd((uint32_t *)Lock, 0u, __ATOMIC_SEQ_CST);
+}
+
+void initLock(omp_lock_t *Lock) { unsetLock(Lock); }
+
+void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); }
+
+void setLock(omp_lock_t *Lock) {
+  // TODO: not sure spinning is a good idea here..
+  while (atomicCAS((uint32_t *)Lock, UNSET, SET, __ATOMIC_SEQ_CST) != UNSET) {
+    int32_t start = __nvvm_read_ptx_sreg_clock();
+    int32_t now;
+    for (;;) {
+      now = __nvvm_read_ptx_sreg_clock();
+      int32_t cycles = now > start ? now - start : now + (0xffffffff - start);
+      if (cycles >= OMP_SPIN * mapping::getBlockId()) {
+        break;
+      }
+    }
+  } // wait for 0 to be the read value
+}
+
+#pragma omp end declare variant
+///}
+
+#pragma omp end declare target
+} // namespace impl
+
+#pragma omp declare target
+
+void synchronize::init(bool IsSPMD) {
+  if (!IsSPMD)
+    impl::namedBarrierInit();
+}
+
+void synchronize::warp(LaneMaskTy Mask) { impl::syncWarp(Mask); }
+
+void synchronize::threads() { impl::syncThreads(); }
+
+void fence::team(int Ordering) { impl::fenceTeam(Ordering); }
+
+void fence::kernel(int Ordering) { impl::fenceKernel(Ordering); }
+
+void fence::system(int Ordering) { impl::fenceSystem(Ordering); }
+
+uint32_t atomic::read(uint32_t *Addr, int Ordering) {
+  return impl::atomicRead(Addr, Ordering);
+}
+
+uint32_t atomic::inc(uint32_t *Addr, uint32_t V, int Ordering) {
+  return impl::atomicInc(Addr, V, Ordering);
+}
+
+uint32_t atomic::add(uint32_t *Addr, uint32_t V, int Ordering) {
+  return impl::atomicAdd(Addr, V, Ordering);
+}
+
+uint64_t atomic::add(uint64_t *Addr, uint64_t V, int Ordering) {
+  return impl::atomicAdd(Addr, V, Ordering);
+}
+
+extern "C" {
+void __kmpc_ordered(IdentTy *Loc, int32_t TId) {}
+
+void __kmpc_end_ordered(IdentTy *Loc, int32_t TId) {}
+
+int32_t __kmpc_cancel_barrier(IdentTy *Loc, int32_t TId) {
+  __kmpc_barrier(Loc, TId);
+  return 0;
+}
+
+void __kmpc_barrier(IdentTy *Loc, int32_t TId) {
+  if (mapping::isMainThreadInGenericMode())
+    return __kmpc_flush(Loc);
+
+  if (mapping::isSPMDMode())
+    return __kmpc_barrier_simple_spmd(Loc, TId);
+
+  impl::namedBarrier();
+}
+
+void __kmpc_barrier_simple_spmd(IdentTy *Loc, int32_t TId) {
+  synchronize::threads();
+}
+
+int32_t __kmpc_master(IdentTy *Loc, int32_t TId) {
+  int WarpSize = mapping::getWarpSize();
+  int BlockSize = mapping::getBlockSize();
+  int MasterId = ((WarpSize - 1) ^ (~0U)) & (BlockSize - 1);
+  return mapping::getThreadIdInBlock() == MasterId;
+}
+
+void __kmpc_end_master(IdentTy *Loc, int32_t TId) {}
+
+int32_t __kmpc_single(IdentTy *Loc, int32_t TId) {
+  return __kmpc_master(Loc, TId);
+}
+
+void __kmpc_end_single(IdentTy *Loc, int32_t TId) {
+  // The barrier is explicitly called.
+}
+
+void __kmpc_flush(IdentTy *Loc) { fence::kernel(__ATOMIC_SEQ_CST); }
+
+__kmpc_impl_lanemask_t __kmpc_warp_active_thread_mask() {
+  return mapping::activemask();
+}
+
+void __kmpc_syncwarp(__kmpc_impl_lanemask_t Mask) { synchronize::warp(Mask); }
+
+void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) {
+  omp_set_lock(reinterpret_cast<omp_lock_t *>(Name));
+}
+
+void __kmpc_end_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) {
+  omp_unset_lock(reinterpret_cast<omp_lock_t *>(Name));
+}
+
+void omp_init_lock(omp_lock_t *Lock) { impl::initLock(Lock); }
+
+void omp_destroy_lock(omp_lock_t *Lock) { impl::destroyLock(Lock); }
+
+void omp_set_lock(omp_lock_t *Lock) { impl::setLock(Lock); }
+
+void omp_unset_lock(omp_lock_t *Lock) { impl::unsetLock(Lock); }
+
+int omp_test_lock(omp_lock_t *Lock) { return impl::testLock(Lock); }
+} // extern "C"
+
+#pragma omp end declare target
diff --git a/openmp/libomptarget/DeviceRTL/src/Tasking.cpp b/openmp/libomptarget/DeviceRTL/src/Tasking.cpp
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/DeviceRTL/src/Tasking.cpp
@@ -0,0 +1,104 @@
+//===-------- Tasking.cpp - NVPTX OpenMP tasks support ------------ C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Task implementation support. +// +// TODO: We should not allocate and execute the task in two steps. A new API is +// needed for that though. +// +//===----------------------------------------------------------------------===// + +#include "Interface.h" +#include "State.h" +#include "Types.h" +#include "Utils.h" + +using namespace _OMP; + +#pragma omp declare target + +TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, uint32_t, int32_t, + uint64_t TaskSizeInclPrivateValues, + uint64_t SharedValuesSize, + TaskFnTy TaskFn) { + auto TaskSizeInclPrivateValuesPadded = + utils::roundUp(TaskSizeInclPrivateValues, uint64_t(sizeof(void *))); + auto TaskSizeTotal = TaskSizeInclPrivateValuesPadded + SharedValuesSize; + TaskDescriptorTy *TaskDescriptor = (TaskDescriptorTy *)memory::allocGlobal( + TaskSizeTotal, "explicit task descriptor"); + TaskDescriptor->Payload = + utils::advance(TaskDescriptor, TaskSizeInclPrivateValuesPadded); + TaskDescriptor->TaskFn = TaskFn; + + return TaskDescriptor; +} + +int32_t __kmpc_omp_task(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor) { + return __kmpc_omp_task_with_deps(Loc, TId, TaskDescriptor, 0, 0, 0, 0); +} + +int32_t __kmpc_omp_task_with_deps(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor, int32_t, + void *, int32_t, void *) { + state::DateEnvironmentRAII DERAII; + + TaskDescriptor->TaskFn(0, TaskDescriptor); + + memory::freeGlobal(TaskDescriptor, "explicit task descriptor"); + return 0; +} + +void __kmpc_omp_task_begin_if0(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor) { + state::enterDataEnvironment(); +} + +void __kmpc_omp_task_complete_if0(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor) { + state::exitDataEnvironment(); + + memory::freeGlobal(TaskDescriptor, "explicit task descriptor"); +} + +void __kmpc_omp_wait_deps(IdentTy *Loc, uint32_t TId, int32_t, void *, int32_t, + void *) {} + +void __kmpc_taskgroup(IdentTy *Loc, uint32_t TId) {} + +void __kmpc_end_taskgroup(IdentTy *Loc, uint32_t TId) {} + +int32_t __kmpc_omp_taskyield(IdentTy *Loc, uint32_t TId, int) { return 0; } + +int32_t __kmpc_omp_taskwait(IdentTy *Loc, uint32_t TId) { return 0; } + +void __kmpc_taskloop(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor, int, + uint64_t *LowerBound, uint64_t *UpperBound, int64_t, int, + int32_t, uint64_t, void *) { + // Skip task entirely if empty iteration space. + if (*LowerBound > *UpperBound) + return; + + // The compiler has already stored lb and ub in the TaskDescriptorTy structure + // as we are using a single task to execute the entire loop, we can leave + // the initial task_t untouched + __kmpc_omp_task_with_deps(Loc, TId, TaskDescriptor, 0, 0, 0, 0); +} + +int omp_in_final(void) { + // treat all tasks as final... Specs may expect runtime to keep + // track more precisely if a task was actively set by users... This + // is not explicitly specified; will treat as if runtime can + // actively decide to put a non-final task into a final one. 
+ return 1; +} + +int omp_get_max_task_priority(void) { return 0; } + +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Utils.cpp b/openmp/libomptarget/DeviceRTL/src/Utils.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Utils.cpp @@ -0,0 +1,133 @@ +//===------- Utils.cpp - OpenMP device runtime utility functions -- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#include "Utils.h" + +#include "Mapping.h" + +using namespace _OMP; + +namespace impl { + +/// AMDGCN Implementation +/// +///{ +#pragma omp begin declare variant match(device = {arch(amdgcn)}) + +void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) { + *LowBits = (uint32_t)(Val & UINT64_C(0x00000000FFFFFFFF)); + *HighBits = (uint32_t)((Val & UINT64_C(0xFFFFFFFF00000000)) >> 32); +} + +uint64_t Pack(uint32_t LowBits, uint32_t HighBits) { + return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits; +} + +#pragma omp end declare variant + +/// NVPTX Implementation +/// +///{ +#pragma omp begin declare variant match( \ + device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)}) + +void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) { + uint32_t LowBitsLocal, HighBitsLocal; + asm("mov.b64 {%0,%1}, %2;" + : "=r"(LowBitsLocal), "=r"(HighBitsLocal) + : "l"(Val)); + *LowBits = LowBitsLocal; + *HighBits = HighBitsLocal; +} + +uint64_t Pack(uint32_t LowBits, uint32_t HighBits) { + uint64_t Val; + asm("mov.b64 %0, {%1,%2};" : "=l"(Val) : "r"(LowBits), "r"(HighBits)); + return Val; +} + +#pragma omp end declare variant + +/// AMDGCN Implementation +/// +///{ +#pragma omp begin declare variant match(device = {arch(amdgcn)}) + +int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) { + int Width = GetWarpSize(); + int Self = GetLaneId(); + int Index = SrcLane + (Self & ~(Width - 1)); + return __builtin_amdgcn_ds_bpermute(Index << 2, Var); +} + +int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta, + int32_t Width) { + int Self = GetLaneId(); + int Index = Self + LaneDelta; + Index = (int)(LaneDelta + (Self & (Width - 1))) >= Width ? 
+          Self : Index;
+  return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
+}
+
+#pragma omp end declare variant
+///}
+
+/// NVPTX Implementation
+///
+///{
+#pragma omp begin declare variant match(                                      \
+    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
+
+int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
+  return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f);
+}
+
+int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) {
+  int32_t T = ((mapping::getWarpSize() - Width) << 8) | 0x1f;
+  return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
+}
+
+#pragma omp end declare variant
+} // namespace impl
+
+#pragma omp declare target
+
+uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) {
+  return impl::Pack(LowBits, HighBits);
+}
+
+void utils::unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits) {
+  impl::Unpack(Val, &LowBits, &HighBits);
+}
+
+int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
+  return impl::shuffle(Mask, Var, SrcLane);
+}
+
+int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta,
+                           int32_t Width) {
+  return impl::shuffleDown(Mask, Var, Delta, Width);
+}
+
+extern "C" {
+int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) {
+  return impl::shuffleDown(lanes::All, Val, Delta, SrcLane);
+}
+
+int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) {
+  uint32_t lo, hi;
+  utils::unpack(Val, lo, hi);
+  hi = impl::shuffleDown(lanes::All, hi, Delta, Width);
+  lo = impl::shuffleDown(lanes::All, lo, Delta, Width);
+  return utils::pack(lo, hi);
+}
+}
+
+#pragma omp end declare target
diff --git a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
@@ -0,0 +1,769 @@
+//===----- Workshare.cpp - OpenMP workshare implementation ------ C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the KMPC interface
+// for the loop construct plus other worksharing constructs that use the same
+// interface as loops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Debug.h"
+#include "Interface.h"
+#include "Mapping.h"
+#include "State.h"
+#include "Synchronization.h"
+#include "Types.h"
+#include "Utils.h"
+
+using namespace _OMP;
+
+// TODO:
+struct DynamicScheduleTracker {
+  int64_t Chunk;
+  int64_t LoopUpperBound;
+  int64_t NextLowerBound;
+  int64_t Stride;
+  kmp_sched_t ScheduleType;
+  DynamicScheduleTracker *NextDST;
+};
+
+#define PRINT(...)
+#define PRINT0(...)
+#define ASSERT0(...)
+
+// used by the library for the interface with the app
+#define DISPATCH_FINISHED 0
+#define DISPATCH_NOTFINISHED 1
+
+// used by dynamic scheduling
+#define FINISHED 0
+#define NOT_FINISHED 1
+#define LAST_CHUNK 2
+
+#pragma omp declare target
+
+// TODO: This variable is a hack inherited from the old runtime.
+uint64_t SHARED(Cnt);
+
+template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
+  ////////////////////////////////////////////////////////////////////////////
+  // Loop with static scheduling with chunk
+
+  // Generic implementation of OMP loop scheduling with static policy
+  /*!
\brief Calculate initial bounds for static loop and stride + * @param[in] loc location in code of the call (not used here) + * @param[in] global_tid global thread id + * @param[in] schetype type of scheduling (see omptarget-nvptx.h) + * @param[in] plastiter pointer to last iteration + * @param[in,out] pointer to loop lower bound. it will contain value of + * lower bound of first chunk + * @param[in,out] pointer to loop upper bound. It will contain value of + * upper bound of first chunk + * @param[in,out] pointer to loop stride. It will contain value of stride + * between two successive chunks executed by the same thread + * @param[in] loop increment bump + * @param[in] chunk size + */ + + // helper function for static chunk + static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, ST chunk, + T entityId, T numberOfEntities) { + // each thread executes multiple chunks all of the same size, except + // the last one + // distance between two successive chunks + stride = numberOfEntities * chunk; + lb = lb + entityId * chunk; + T inputUb = ub; + ub = lb + chunk - 1; // Clang uses i <= ub + // Say ub' is the begining of the last chunk. Then who ever has a + // lower bound plus a multiple of the increment equal to ub' is + // the last one. + T beginingLastChunk = inputUb - (inputUb % chunk); + last = ((beginingLastChunk - lb) % stride) == 0; + } + + //////////////////////////////////////////////////////////////////////////////// + // Loop with static scheduling without chunk + + // helper function for static no chunk + static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, ST &chunk, + T entityId, T numberOfEntities) { + // No chunk size specified. Each thread or warp gets at most one + // chunk; chunks are all almost of equal size + T loopSize = ub - lb + 1; + + chunk = loopSize / numberOfEntities; + T leftOver = loopSize - chunk * numberOfEntities; + + if (entityId < leftOver) { + chunk++; + lb = lb + entityId * chunk; + } else { + lb = lb + entityId * chunk + leftOver; + } + + T inputUb = ub; + ub = lb + chunk - 1; // Clang uses i <= ub + last = lb <= inputUb && inputUb <= ub; + stride = loopSize; // make sure we only do 1 chunk per warp + } + + //////////////////////////////////////////////////////////////////////////////// + // Support for Static Init + + static void for_static_init(int32_t gtid, int32_t schedtype, + int32_t *plastiter, T *plower, T *pupper, + ST *pstride, ST chunk, bool IsSPMDExecutionMode) { + // When IsRuntimeUninitialized is true, we assume that the caller is + // in an L0 parallel region and that all worker threads participate. 
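+    // For illustration of the helper above (ForStaticChunk): a loop with
+    // lb = 0, ub = 99, chunk = 10 and 4 threads gives thread 1 the bounds
+    // lb = 10, ub = 19 with stride = 40 (it later also runs the chunks
+    // starting at 50 and 90), and last = 1 because it owns the final chunk
+    // starting at 90.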
+ + // Assume we are in teams region or that we use a single block + // per target region + int numberOfActiveOMPThreads = omp_get_num_threads(); + + // All warps that are in excess of the maximum requested, do + // not execute the loop + PRINT(LD_LOOP, + "OMP Thread %d: schedule type %d, chunk size = %lld, mytid " + "%d, num tids %d\n", + (int)gtid, (int)schedtype, (long long)chunk, (int)gtid, + (int)numberOfActiveOMPThreads); + ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads, + "current thread is not needed here; error"); + + // copy + int lastiter = 0; + T lb = *plower; + T ub = *pupper; + ST stride = *pstride; + + // init + switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) { + case kmp_sched_static_chunk: { + if (chunk > 0) { + ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, + numberOfActiveOMPThreads); + break; + } + } // note: if chunk <=0, use nochunk + case kmp_sched_static_balanced_chunk: { + if (chunk > 0) { + // round up to make sure the chunk is enough to cover all iterations + T tripCount = ub - lb + 1; // +1 because ub is inclusive + T span = (tripCount + numberOfActiveOMPThreads - 1) / + numberOfActiveOMPThreads; + // perform chunk adjustment + chunk = (span + chunk - 1) & ~(chunk - 1); + + ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb."); + T oldUb = ub; + ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, + numberOfActiveOMPThreads); + if (ub > oldUb) + ub = oldUb; + break; + } + } // note: if chunk <=0, use nochunk + case kmp_sched_static_nochunk: { + ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid, + numberOfActiveOMPThreads); + break; + } + case kmp_sched_distr_static_chunk: { + if (chunk > 0) { + ForStaticChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(), + omp_get_num_teams()); + break; + } // note: if chunk <=0, use nochunk + } + case kmp_sched_distr_static_nochunk: { + ForStaticNoChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(), + omp_get_num_teams()); + break; + } + case kmp_sched_distr_static_chunk_sched_static_chunkone: { + ForStaticChunk(lastiter, lb, ub, stride, chunk, + numberOfActiveOMPThreads * omp_get_team_num() + gtid, + omp_get_num_teams() * numberOfActiveOMPThreads); + break; + } + default: { + // ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype); + PRINT(LD_LOOP, "unknown schedtype %d, revert back to static chunk\n", + (int)schedtype); + ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, + numberOfActiveOMPThreads); + break; + } + } + // copy back + *plastiter = lastiter; + *plower = lb; + *pupper = ub; + *pstride = stride; + PRINT(LD_LOOP, + "Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld, last " + "%d\n", + (int)numberOfActiveOMPThreads, (int)GetNumberOfWorkersInTeam(), + (long long)(*plower), (long long)(*pupper), (long long)(*pstride), + (int)lastiter); + } + + //////////////////////////////////////////////////////////////////////////////// + // Support for dispatch Init + + static int OrderedSchedule(kmp_sched_t schedule) { + return schedule >= kmp_sched_ordered_first && + schedule <= kmp_sched_ordered_last; + } + + static void dispatch_init(IdentTy *loc, int32_t threadId, + kmp_sched_t schedule, T lb, T ub, ST st, ST chunk, + DynamicScheduleTracker *DST) { + int tid = mapping::getThreadIdInBlock(); + T tnum = omp_get_num_threads(); + T tripCount = ub - lb + 1; // +1 because ub is inclusive + ASSERT0(LT_FUSSY, threadId < tnum, + "current thread is not needed here; error"); + + /* Currently just ignore the monotonic and non-monotonic modifiers + * (the compiler isn't producing them * yet 
anyway). + * When it is we'll want to look at them somewhere here and use that + * information to add to our schedule choice. We shouldn't need to pass + * them on, they merely affect which schedule we can legally choose for + * various dynamic cases. (In particular, whether or not a stealing scheme + * is legal). + */ + schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); + + // Process schedule. + if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) { + if (OrderedSchedule(schedule)) + __kmpc_barrier(loc, threadId); + PRINT(LD_LOOP, + "go sequential as tnum=%ld, trip count %lld, ordered sched=%d\n", + (long)tnum, (long long)tripCount, (int)schedule); + schedule = kmp_sched_static_chunk; + chunk = tripCount; // one thread gets the whole loop + } else if (schedule == kmp_sched_runtime) { + // process runtime + omp_sched_t rtSched; + int ChunkInt; + omp_get_schedule(&rtSched, &ChunkInt); + chunk = ChunkInt; + switch (rtSched) { + case omp_sched_static: { + if (chunk > 0) + schedule = kmp_sched_static_chunk; + else + schedule = kmp_sched_static_nochunk; + break; + } + case omp_sched_auto: { + schedule = kmp_sched_static_chunk; + chunk = 1; + break; + } + case omp_sched_dynamic: + case omp_sched_guided: { + schedule = kmp_sched_dynamic; + break; + } + } + PRINT(LD_LOOP, "Runtime sched is %d with chunk %lld\n", (int)schedule, + (long long)chunk); + } else if (schedule == kmp_sched_auto) { + schedule = kmp_sched_static_chunk; + chunk = 1; + PRINT(LD_LOOP, "Auto sched is %d with chunk %lld\n", (int)schedule, + (long long)chunk); + } else { + PRINT(LD_LOOP, "Dyn sched is %d with chunk %lld\n", (int)schedule, + (long long)chunk); + // ASSERT(LT_FUSSY, + // schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, + // "unknown schedule %d & chunk %lld\n", (int)schedule, + // (long long)chunk); + } + + // init schedules + if (schedule == kmp_sched_static_chunk) { + ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); + // save sched state + DST->ScheduleType = schedule; + // save ub + DST->LoopUpperBound = ub; + // compute static chunk + ST stride; + int lastiter = 0; + ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); + // save computed params + DST->Chunk = chunk; + DST->NextLowerBound = lb; + DST->Stride = stride; + PRINT(LD_LOOP, + "dispatch init (static chunk) : num threads = %d, ub = %" PRId64 + ", next lower bound = %llu, stride = %llu\n", + (int)tnum, DST->LoopUpperBound, + (unsigned long long)DST->NextLowerBound, + (unsigned long long)DST->Stride); + } else if (schedule == kmp_sched_static_balanced_chunk) { + ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); + // save sched state + DST->ScheduleType = schedule; + // save ub + DST->LoopUpperBound = ub; + // compute static chunk + ST stride; + int lastiter = 0; + // round up to make sure the chunk is enough to cover all iterations + T span = (tripCount + tnum - 1) / tnum; + // perform chunk adjustment + chunk = (span + chunk - 1) & ~(chunk - 1); + + T oldUb = ub; + ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); + ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb."); + if (ub > oldUb) + ub = oldUb; + // save computed params + DST->Chunk = chunk; + DST->NextLowerBound = lb; + DST->Stride = stride; + PRINT(LD_LOOP, + "dispatch init (static chunk) : num threads = %d, ub = %" PRId64 + ", next lower bound = %llu, stride = %llu\n", + (int)tnum, DST->LoopUpperBound, + (unsigned long long)DST->NextLowerBound, + (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride( + tid)); + } else if (schedule == 
+      ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
+      // save sched state
+      DST->ScheduleType = schedule;
+      // save ub
+      DST->LoopUpperBound = ub;
+      // compute static chunk
+      ST stride;
+      int lastiter = 0;
+      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
+      // save computed params
+      DST->Chunk = chunk;
+      DST->NextLowerBound = lb;
+      DST->Stride = stride;
+      PRINT(LD_LOOP,
+            "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64
+            ", next lower bound = %llu, stride = %llu\n",
+            (int)tnum, DST->LoopUpperBound,
+            (unsigned long long)DST->NextLowerBound,
+            (unsigned long long)DST->Stride);
+    } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
+      // save data
+      DST->ScheduleType = schedule;
+      if (chunk < 1)
+        chunk = 1;
+      DST->Chunk = chunk;
+      DST->LoopUpperBound = ub;
+      DST->NextLowerBound = lb;
+      __kmpc_barrier(loc, threadId);
+      if (tid == 0) {
+        Cnt = 0;
+        fence::team(__ATOMIC_SEQ_CST);
+      }
+      __kmpc_barrier(loc, threadId);
+      PRINT(LD_LOOP,
+            "dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64
+            ", chunk %" PRIu64 "\n",
+            (int)tnum, (unsigned long long)DST->NextLowerBound,
+            DST->LoopUpperBound, DST->Chunk);
+    }
+  }
+
+  //////////////////////////////////////////////////////////////////////////////
+  // Support for dispatch next
+
+  // Warp-aggregated increment of the shared iteration counter Cnt: the first
+  // active lane performs a single atomic add of the number of active lanes,
+  // and every lane derives its own iteration index from the leader's result
+  // plus its rank among the active lanes.
+  static uint64_t NextIter() {
+    __kmpc_impl_lanemask_t active = mapping::activemask();
+    uint32_t leader = utils::ffs(active) - 1;
+    uint32_t change = utils::popc(active);
+    __kmpc_impl_lanemask_t lane_mask_lt = mapping::lanemaskLT();
+    unsigned int rank = utils::popc(active & lane_mask_lt);
+    uint64_t warp_res;
+    if (rank == 0) {
+      warp_res = atomic::add(&Cnt, change, __ATOMIC_SEQ_CST);
+    }
+    warp_res = utils::shuffle(active, warp_res, leader);
+    return warp_res + rank;
+  }
+
+  static int DynamicNextChunk(T &lb, T &ub, T chunkSize, T loopLowerBound,
+                              T loopUpperBound) {
+    T N = NextIter();
+    lb = loopLowerBound + N * chunkSize;
+    ub = lb + chunkSize - 1; // Clang uses i <= ub
+
+    // 3 result cases:
+    //  a. lb and ub < loopUpperBound --> NOT_FINISHED
+    //  b. lb < loopUpperBound and ub >= loopUpperBound: last chunk -->
+    //     LAST_CHUNK
+    //  c. lb and ub >= loopUpperBound: empty chunk --> FINISHED
+    // a.
+    if (lb <= loopUpperBound && ub < loopUpperBound) {
+      PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n",
+            (long long)lb, (long long)ub, (long long)loopUpperBound);
+      return NOT_FINISHED;
+    }
+    // b.
+    if (lb <= loopUpperBound) {
+      PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; clip to loop ub\n",
+            (long long)lb, (long long)ub, (long long)loopUpperBound);
+      ub = loopUpperBound;
+      return LAST_CHUNK;
+    }
+    // c. if we are here, we are in case 'c': return an empty chunk
+    lb = loopUpperBound + 2;
+    ub = loopUpperBound + 1;
+    PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", (long long)lb,
+          (long long)ub, (long long)loopUpperBound);
+    return FINISHED;
+  }
+
+  static int dispatch_next(IdentTy *loc, int32_t gtid, int32_t *plast,
+                           T *plower, T *pupper, ST *pstride,
+                           DynamicScheduleTracker *DST) {
+    ASSERT0(LT_FUSSY, gtid < omp_get_num_threads(),
+            "current thread is not needed here; error");
+    // retrieve schedule
+    kmp_sched_t schedule = DST->ScheduleType;
+
+    // xxx reduce to one
+    if (schedule == kmp_sched_static_chunk ||
+        schedule == kmp_sched_static_nochunk) {
+      T myLb = DST->NextLowerBound;
+      T ub = DST->LoopUpperBound;
+      // finished?
+      if (myLb > ub) {
+        PRINT(LD_LOOP, "static loop finished with myLb %lld, ub %lld\n",
+              (long long)myLb, (long long)ub);
+        return DISPATCH_FINISHED;
+      }
+      // not finished, save current bounds
+      ST chunk = DST->Chunk;
+      *plower = myLb;
+      T myUb = myLb + chunk - 1; // Clang uses i <= ub
+      if (myUb > ub)
+        myUb = ub;
+      *pupper = myUb;
+      *plast = (int32_t)(myUb == ub);
+
+      // increment next lower bound by the stride
+      ST stride = DST->Stride;
+      DST->NextLowerBound = myLb + stride;
+      PRINT(LD_LOOP, "static loop continues with myLb %lld, myUb %lld\n",
+            (long long)*plower, (long long)*pupper);
+      return DISPATCH_NOTFINISHED;
+    }
+    ASSERT0(LT_FUSSY,
+            schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
+            "bad sched");
+    T myLb, myUb;
+    int finished = DynamicNextChunk(myLb, myUb, DST->Chunk, DST->NextLowerBound,
+                                    DST->LoopUpperBound);
+
+    if (finished == FINISHED)
+      return DISPATCH_FINISHED;
+
+    // not finished (either not finished or last chunk)
+    *plast = (int32_t)(finished == LAST_CHUNK);
+    *plower = myLb;
+    *pupper = myUb;
+    *pstride = 1;
+
+    PRINT(LD_LOOP,
+          "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, "
+          "last %d\n",
+          (int)omp_get_num_threads(), (int)GetNumberOfWorkersInTeam(),
+          (long long)*plower, (long long)*pupper, (long long)*pstride,
+          (int)*plast);
+    return DISPATCH_NOTFINISHED;
+  }
+
+  static void dispatch_fini() {
+    // nothing
+  }
+
+  //////////////////////////////////////////////////////////////////////////////
+  // end of template class that encapsulates all the helper functions
+  //////////////////////////////////////////////////////////////////////////////
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// KMP interface implementation (dyn loops)
+////////////////////////////////////////////////////////////////////////////////
+
+// TODO: This is a stopgap. We probably want to expand the dispatch API to take
+// a DST pointer which can then be allocated properly without malloc.
+DynamicScheduleTracker *THREAD_LOCAL(ThreadDSTPtr);
+
+// Create a new DST, link the current one, and define the new as current.
+static DynamicScheduleTracker *pushDST() {
+  DynamicScheduleTracker *NewDST = static_cast<DynamicScheduleTracker *>(
+      memory::allocGlobal(sizeof(DynamicScheduleTracker), "new DST"));
+  *NewDST = DynamicScheduleTracker({0});
+  NewDST->NextDST = ThreadDSTPtr;
+  ThreadDSTPtr = NewDST;
+  return ThreadDSTPtr;
+}
+
+// Return the current DST.
+static DynamicScheduleTracker *peekDST() { return ThreadDSTPtr; }
+
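+// Note (descriptive only): ThreadDSTPtr heads a per-thread LIFO of
+// DynamicScheduleTracker objects linked through NextDST. A nested
+// worksharing loop that reaches __kmpc_dispatch_init_* pushes a fresh
+// tracker with pushDST(); the matching __kmpc_dispatch_fini_* (when the
+// compiler emits it) pops it with popDST(), making the enclosing loop's
+// tracker current again.
+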
+// Pop the current DST and restore the last one.
+static void popDST() {
+  DynamicScheduleTracker *OldDST = ThreadDSTPtr->NextDST;
+  memory::freeGlobal(ThreadDSTPtr, "remove DST");
+  ThreadDSTPtr = OldDST;
+}
+
+extern "C" {
+
+// init
+void __kmpc_dispatch_init_4(IdentTy *loc, int32_t tid, int32_t schedule,
+                            int32_t lb, int32_t ub, int32_t st, int32_t chunk) {
+  PRINT0(LD_IO, "call kmpc_dispatch_init_4\n");
+  DynamicScheduleTracker *DST = pushDST();
+  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
+}
+
+void __kmpc_dispatch_init_4u(IdentTy *loc, int32_t tid, int32_t schedule,
+                             uint32_t lb, uint32_t ub, int32_t st,
+                             int32_t chunk) {
+  PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n");
+  DynamicScheduleTracker *DST = pushDST();
+  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
+}
+
+void __kmpc_dispatch_init_8(IdentTy *loc, int32_t tid, int32_t schedule,
+                            int64_t lb, int64_t ub, int64_t st, int64_t chunk) {
+  PRINT0(LD_IO, "call kmpc_dispatch_init_8\n");
+  DynamicScheduleTracker *DST = pushDST();
+  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
+}
+
+void __kmpc_dispatch_init_8u(IdentTy *loc, int32_t tid, int32_t schedule,
+                             uint64_t lb, uint64_t ub, int64_t st,
+                             int64_t chunk) {
+  PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n");
+  DynamicScheduleTracker *DST = pushDST();
+  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
+}
+
+// next
+int __kmpc_dispatch_next_4(IdentTy *loc, int32_t tid, int32_t *p_last,
+                           int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
+  PRINT0(LD_IO, "call kmpc_dispatch_next_4\n");
+  DynamicScheduleTracker *DST = peekDST();
+  return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
+      loc, tid, p_last, p_lb, p_ub, p_st, DST);
+}
+
+int __kmpc_dispatch_next_4u(IdentTy *loc, int32_t tid, int32_t *p_last,
+                            uint32_t *p_lb, uint32_t *p_ub, int32_t *p_st) {
+  PRINT0(LD_IO, "call kmpc_dispatch_next_4u\n");
+  DynamicScheduleTracker *DST = peekDST();
+  return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
+      loc, tid, p_last, p_lb, p_ub, p_st, DST);
+}
+
+int __kmpc_dispatch_next_8(IdentTy *loc, int32_t tid, int32_t *p_last,
+                           int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
+  PRINT0(LD_IO, "call kmpc_dispatch_next_8\n");
+  DynamicScheduleTracker *DST = peekDST();
+  return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
+      loc, tid, p_last, p_lb, p_ub, p_st, DST);
+}
+
+int __kmpc_dispatch_next_8u(IdentTy *loc, int32_t tid, int32_t *p_last,
+                            uint64_t *p_lb, uint64_t *p_ub, int64_t *p_st) {
+  PRINT0(LD_IO, "call kmpc_dispatch_next_8u\n");
+  DynamicScheduleTracker *DST = peekDST();
+  return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
+      loc, tid, p_last, p_lb, p_ub, p_st, DST);
+}
+
+// fini
+void __kmpc_dispatch_fini_4(IdentTy *loc, int32_t tid) {
+  PRINT0(LD_IO, "call kmpc_dispatch_fini_4\n");
+  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
+  popDST();
+}
+
+void __kmpc_dispatch_fini_4u(IdentTy *loc, int32_t tid) {
+  PRINT0(LD_IO, "call kmpc_dispatch_fini_4u\n");
+  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
+  popDST();
+}
+
+void __kmpc_dispatch_fini_8(IdentTy *loc, int32_t tid) {
+  PRINT0(LD_IO, "call kmpc_dispatch_fini_8\n");
+  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
+  popDST();
+}
+
+void __kmpc_dispatch_fini_8u(IdentTy *loc, int32_t tid) {
+  PRINT0(LD_IO, "call kmpc_dispatch_fini_8u\n");
+  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
+  popDST();
+}
+
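+// Illustrative only, not part of this patch: compiler-generated code is
+// expected to drive the dynamic dispatch entry points above roughly as
+// follows (local variable names are made up for the example, and whether a
+// fini call is emitted depends on the construct):
+//
+//   int32_t last, lb = 0, ub = N - 1, st = 1;
+//   __kmpc_dispatch_init_4(loc, tid, /*schedule=*/kmp_sched_dynamic, lb, ub,
+//                          st, /*chunk=*/4);
+//   while (__kmpc_dispatch_next_4(loc, tid, &last, &lb, &ub, &st))
+//     for (int32_t i = lb; i <= ub; ++i) // bounds are inclusive
+//       body(i);
+//   __kmpc_dispatch_fini_4(loc, tid);
+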
+////////////////////////////////////////////////////////////////////////////////
+// KMP interface implementation (static loops)
+////////////////////////////////////////////////////////////////////////////////
+
+void __kmpc_for_static_init_4(IdentTy *loc, int32_t global_tid,
+                              int32_t schedtype, int32_t *plastiter,
+                              int32_t *plower, int32_t *pupper,
+                              int32_t *pstride, int32_t incr, int32_t chunk) {
+  PRINT0(LD_IO, "call kmpc_for_static_init_4\n");
+  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
+      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+      mapping::isSPMDMode());
+}
+
+void __kmpc_for_static_init_4u(IdentTy *loc, int32_t global_tid,
+                               int32_t schedtype, int32_t *plastiter,
+                               uint32_t *plower, uint32_t *pupper,
+                               int32_t *pstride, int32_t incr, int32_t chunk) {
+  PRINT0(LD_IO, "call kmpc_for_static_init_4u\n");
+  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
+      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+      mapping::isSPMDMode());
+}
+
+void __kmpc_for_static_init_8(IdentTy *loc, int32_t global_tid,
+                              int32_t schedtype, int32_t *plastiter,
+                              int64_t *plower, int64_t *pupper,
+                              int64_t *pstride, int64_t incr, int64_t chunk) {
+  PRINT0(LD_IO, "call kmpc_for_static_init_8\n");
+  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
+      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+      mapping::isSPMDMode());
+}
+
+void __kmpc_for_static_init_8u(IdentTy *loc, int32_t global_tid,
+                               int32_t schedtype, int32_t *plastiter,
+                               uint64_t *plower, uint64_t *pupper,
+                               int64_t *pstride, int64_t incr, int64_t chunk) {
+  PRINT0(LD_IO, "call kmpc_for_static_init_8u\n");
+  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
+      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+      mapping::isSPMDMode());
+}
+
+void __kmpc_for_static_init_4_simple_spmd(IdentTy *loc, int32_t global_tid,
+                                          int32_t schedtype, int32_t *plastiter,
+                                          int32_t *plower, int32_t *pupper,
+                                          int32_t *pstride, int32_t incr,
+                                          int32_t chunk) {
+  PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_spmd\n");
+  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
+      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+      /*IsSPMDExecutionMode=*/true);
+}
+
+void __kmpc_for_static_init_4u_simple_spmd(IdentTy *loc, int32_t global_tid,
+                                           int32_t schedtype,
+                                           int32_t *plastiter, uint32_t *plower,
+                                           uint32_t *pupper, int32_t *pstride,
+                                           int32_t incr, int32_t chunk) {
+  PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_spmd\n");
+  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
+      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+      /*IsSPMDExecutionMode=*/true);
+}
+
+void __kmpc_for_static_init_8_simple_spmd(IdentTy *loc, int32_t global_tid,
+                                          int32_t schedtype, int32_t *plastiter,
+                                          int64_t *plower, int64_t *pupper,
+                                          int64_t *pstride, int64_t incr,
+                                          int64_t chunk) {
+  PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_spmd\n");
+  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
+      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+      /*IsSPMDExecutionMode=*/true);
+}
+
+void __kmpc_for_static_init_8u_simple_spmd(IdentTy *loc, int32_t global_tid,
+                                           int32_t schedtype,
+                                           int32_t *plastiter, uint64_t *plower,
+                                           uint64_t *pupper, int64_t *pstride,
+                                           int64_t incr, int64_t chunk) {
+  PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_spmd\n");
+  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
+      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+      /*IsSPMDExecutionMode=*/true);
+}
+
+void __kmpc_for_static_init_4_simple_generic(IdentTy *loc, int32_t global_tid,
+                                             int32_t schedtype,
+                                             int32_t *plastiter,
+                                             int32_t *plower, int32_t *pupper,
+                                             int32_t *pstride, int32_t incr,
+                                             int32_t chunk) {
+  PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_generic\n");
+  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
+      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+      /*IsSPMDExecutionMode=*/false);
+}
+
+void __kmpc_for_static_init_4u_simple_generic(
+    IdentTy *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
+    uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr,
+    int32_t chunk) {
+  PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_generic\n");
+  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
+      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+      /*IsSPMDExecutionMode=*/false);
+}
+
+void __kmpc_for_static_init_8_simple_generic(IdentTy *loc, int32_t global_tid,
+                                             int32_t schedtype,
+                                             int32_t *plastiter,
+                                             int64_t *plower, int64_t *pupper,
+                                             int64_t *pstride, int64_t incr,
+                                             int64_t chunk) {
+  PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_generic\n");
+  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
+      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+      /*IsSPMDExecutionMode=*/false);
+}
+
+void __kmpc_for_static_init_8u_simple_generic(
+    IdentTy *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
+    uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr,
+    int64_t chunk) {
+  PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_generic\n");
+  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
+      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+      /*IsSPMDExecutionMode=*/false);
+}
+
+void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {
+  PRINT0(LD_IO, "call kmpc_for_static_fini\n");
+}
+}
+
+#pragma omp end declare target
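+
+// Illustrative only, not part of this patch: a statically scheduled
+// worksharing loop such as
+//
+//   #pragma omp for schedule(static, 64)
+//   for (int i = 0; i < N; ++i)
+//     body(i);
+//
+// is expected to be lowered so that each thread calls
+// __kmpc_for_static_init_4 once, which rewrites *plower/*pupper to the
+// thread's first chunk and sets *pstride to the distance between its
+// consecutive chunks (chunk * number of threads), then iterates over its
+// chunks, and finally calls __kmpc_for_static_fini.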