diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
@@ -439,16 +439,11 @@
   /// The data for the single globalized variable.
   struct MappedVarData {
+    MappedVarData() = default;
     /// Corresponding field in the global record.
-    const FieldDecl *FD = nullptr;
+    llvm::Value *GlobalizedVal = nullptr;
     /// Corresponding address.
     Address PrivateAddr = Address::invalid();
-    /// true, if only one element is required (for latprivates in SPMD mode),
-    /// false, if need to create based on the warp-size.
-    bool IsOnePerTeam = false;
-    MappedVarData() = delete;
-    MappedVarData(const FieldDecl *FD, bool IsOnePerTeam = false)
-        : FD(FD), IsOnePerTeam(IsOnePerTeam) {}
   };
   /// The map of local variables to their addresses in the global memory.
   using DeclToAddrMapTy = llvm::MapVector<const Decl *, MappedVarData>;
@@ -460,29 +455,12 @@
     EscapedParamsTy EscapedParameters;
     llvm::SmallVector<const ValueDecl *, 4> EscapedVariableLengthDecls;
     llvm::SmallVector<llvm::Value *, 4> EscapedVariableLengthDeclsAddrs;
-    const RecordDecl *GlobalRecord = nullptr;
-    llvm::Optional<const RecordDecl *> SecondaryGlobalRecord = llvm::None;
-    llvm::Value *GlobalRecordAddr = nullptr;
     llvm::Value *IsInSPMDModeFlag = nullptr;
     std::unique_ptr<CodeGenFunction::OMPMapVars> MappedParams;
   };
   /// Maps the function to the list of the globalized variables with their
   /// addresses.
   llvm::SmallDenseMap<llvm::Function *, FunctionData> FunctionGlobalizedDecls;
-  /// List of records for the globalized variables in target/teams/distribute
-  /// contexts. Inner records are going to be joined into the single record,
-  /// while those resulting records are going to be joined into the single
-  /// union. This resulting union (one per CU) is the entry point for the static
-  /// memory management runtime functions.
-  struct GlobalPtrSizeRecsTy {
-    llvm::GlobalVariable *UseSharedMemory = nullptr;
-    llvm::GlobalVariable *RecSize = nullptr;
-    llvm::GlobalVariable *Buffer = nullptr;
-    SourceLocation Loc;
-    llvm::SmallVector<const RecordDecl *, 2> Records;
-    unsigned RegionCounter = 0;
-  };
-  llvm::SmallVector<GlobalPtrSizeRecsTy, 8> GlobalizedRecords;
  llvm::GlobalVariable *KernelTeamsReductionPtr = nullptr;
   /// List of the records with the list of fields for the reductions across the
   /// teams. Used to build the intermediate buffer for the fast teams
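The header change collapses the record-based bookkeeping into a per-variable pair: the raw pointer returned by the allocation call (GlobalizedVal) and the typed address the region body uses (PrivateAddr). A minimal host-side sketch of the contract the codegen changes below rely on — the __kmpc_alloc_shared/__kmpc_free_shared names match the runtime entry points this patch emits calls to, while the malloc-backed bodies are hypothetical stand-ins so the sketch compiles and runs anywhere:

#include <cstddef>
#include <cstdlib>

// Hypothetical host stand-ins; the real implementations live in the OpenMP
// device runtime, not in this patch.
extern "C" void *__kmpc_alloc_shared(std::size_t Bytes) {
  return std::malloc(Bytes);
}
extern "C" void __kmpc_free_shared(void *Ptr) { std::free(Ptr); }

// Mirrors the simplified MappedVarData: one allocation per escaped variable.
struct MappedVarDataSketch {
  void *GlobalizedVal = nullptr; // pointer returned by __kmpc_alloc_shared
  int *PrivateAddr = nullptr;    // typed address handed to the region body
};

int main() {
  MappedVarDataSketch Rec;                                 // prolog:
  Rec.GlobalizedVal = __kmpc_alloc_shared(sizeof(int));    //   allocate,
  Rec.PrivateAddr = static_cast<int *>(Rec.GlobalizedVal); //   then cast
  *Rec.PrivateAddr = 42;                 // region body works on the shared copy
  __kmpc_free_shared(Rec.GlobalizedVal); // epilog: matching free
  return 0;
}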
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -1096,17 +1096,6 @@
   } Action(EST, WST);
   CodeGen.setAction(Action);
   IsInTTDRegion = true;
-  // Reserve place for the globalized memory.
-  GlobalizedRecords.emplace_back();
-  if (!KernelStaticGlobalized) {
-    KernelStaticGlobalized = new llvm::GlobalVariable(
-        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
-        llvm::GlobalValue::InternalLinkage,
-        llvm::UndefValue::get(CGM.VoidPtrTy),
-        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
-        llvm::GlobalValue::NotThreadLocal,
-        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
-  }
   emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                    IsOffloadEntry, CodeGen);
   IsInTTDRegion = false;
@@ -1156,10 +1145,6 @@
                           CGM.getModule(), OMPRTL___kmpc_kernel_init),
                       Args);
-  // For data sharing, we need to initialize the stack.
-  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
-      CGM.getModule(), OMPRTL___kmpc_data_sharing_init_stack));
-
   emitGenericVarsProlog(CGF, WST.Loc);
 }
@@ -1228,17 +1213,6 @@
   } Action(*this, EST, D);
   CodeGen.setAction(Action);
   IsInTTDRegion = true;
-  // Reserve place for the globalized memory.
-  GlobalizedRecords.emplace_back();
-  if (!KernelStaticGlobalized) {
-    KernelStaticGlobalized = new llvm::GlobalVariable(
-        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
-        llvm::GlobalValue::InternalLinkage,
-        llvm::UndefValue::get(CGM.VoidPtrTy),
-        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
-        llvm::GlobalValue::NotThreadLocal,
-        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
-  }
   emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                    IsOffloadEntry, CodeGen);
   IsInTTDRegion = false;
@@ -1260,12 +1234,6 @@
                           CGM.getModule(), OMPRTL___kmpc_spmd_kernel_init),
                       Args);
-  if (RequiresFullRuntime) {
-    // For data sharing, we need to initialize the stack.
-    CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
-        CGM.getModule(), OMPRTL___kmpc_data_sharing_init_stack_spmd));
-  }
-
   CGF.EmitBranch(ExecuteBB);

   CGF.EmitBlock(ExecuteBB);
@@ -1671,7 +1639,6 @@
       static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
   if (GlobalizedRD) {
     auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
-    I->getSecond().GlobalRecord = GlobalizedRD;
     I->getSecond().MappedParams =
         std::make_unique<CodeGenFunction::OMPMapVars>();
     DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
@@ -1679,8 +1646,7 @@
       assert(Pair.getFirst()->isCanonicalDecl() &&
              "Expected canonical declaration");
       Data.insert(std::make_pair(Pair.getFirst(),
-                                 MappedVarData(Pair.getSecond(),
-                                               /*IsOnePerTeam=*/true)));
+                                 MappedVarData()));
     }
   }
   Rt.emitGenericVarsProlog(CGF, Loc);
@@ -1709,282 +1675,69 @@
   const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
   if (I == FunctionGlobalizedDecls.end())
     return;
-  if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) {
-    QualType GlobalRecTy =
-        CGM.getContext().getRecordType(GlobalizedVarsRecord);
-    QualType SecGlobalRecTy;
-    // Recover pointer to this function's global record. The runtime will
-    // handle the specifics of the allocation of the memory.
-    // Use actual memory size of the record including the padding
-    // for alignment purposes.
-    unsigned Alignment =
-        CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
-    unsigned GlobalRecordSize =
-        CGM.getContext().getTypeSizeInChars(GlobalRecTy).getQuantity();
-    GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
-
-    llvm::PointerType *GlobalRecPtrTy =
-        CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo();
-    llvm::Value *GlobalRecCastAddr;
-    llvm::Value *IsTTD = nullptr;
-    if (!IsInTTDRegion &&
-        (WithSPMDCheck ||
-         getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
-      llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
-      llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd");
-      llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
-      if (I->getSecond().SecondaryGlobalRecord.hasValue()) {
-        llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
-        llvm::Value *ThreadID = getThreadID(CGF, Loc);
-        llvm::Value *PL = CGF.EmitRuntimeCall(
-            OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
-                                                  OMPRTL___kmpc_parallel_level),
-            {RTLoc, ThreadID});
-        IsTTD = Bld.CreateIsNull(PL);
-      }
-      llvm::Value *IsSPMD = Bld.CreateIsNotNull(
-          CGF.EmitNounwindRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
-              CGM.getModule(), OMPRTL___kmpc_is_spmd_exec_mode)));
-      Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB);
-      // There is no need to emit line number for unconditional branch.
-      (void)ApplyDebugLocation::CreateEmpty(CGF);
-      CGF.EmitBlock(SPMDBB);
-      Address RecPtr = Address(llvm::ConstantPointerNull::get(GlobalRecPtrTy),
-                               CharUnits::fromQuantity(Alignment));
-      CGF.EmitBranch(ExitBB);
-      // There is no need to emit line number for unconditional branch.
-      (void)ApplyDebugLocation::CreateEmpty(CGF);
-      CGF.EmitBlock(NonSPMDBB);
-      llvm::Value *Size = llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize);
-      if (const RecordDecl *SecGlobalizedVarsRecord =
-              I->getSecond().SecondaryGlobalRecord.getValueOr(nullptr)) {
-        SecGlobalRecTy =
-            CGM.getContext().getRecordType(SecGlobalizedVarsRecord);
-
-        // Recover pointer to this function's global record. The runtime will
-        // handle the specifics of the allocation of the memory.
-        // Use actual memory size of the record including the padding
-        // for alignment purposes.
-        unsigned Alignment =
-            CGM.getContext().getTypeAlignInChars(SecGlobalRecTy).getQuantity();
-        unsigned GlobalRecordSize =
-            CGM.getContext().getTypeSizeInChars(SecGlobalRecTy).getQuantity();
-        GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
-        Size = Bld.CreateSelect(
-            IsTTD, llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), Size);
-      }
-      // TODO: allow the usage of shared memory to be controlled by
-      // the user, for now, default to global.
-      llvm::Value *GlobalRecordSizeArg[] = {
-          Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
-      llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
-          OMPBuilder.getOrCreateRuntimeFunction(
-              CGM.getModule(), OMPRTL___kmpc_data_sharing_coalesced_push_stack),
-          GlobalRecordSizeArg);
-      GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
-          GlobalRecValue, GlobalRecPtrTy);
-      CGF.EmitBlock(ExitBB);
-      auto *Phi = Bld.CreatePHI(GlobalRecPtrTy,
-                                /*NumReservedValues=*/2, "_select_stack");
-      Phi->addIncoming(RecPtr.getPointer(), SPMDBB);
-      Phi->addIncoming(GlobalRecCastAddr, NonSPMDBB);
-      GlobalRecCastAddr = Phi;
-      I->getSecond().GlobalRecordAddr = Phi;
-      I->getSecond().IsInSPMDModeFlag = IsSPMD;
-    } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) {
-      assert(GlobalizedRecords.back().Records.size() < 2 &&
-             "Expected less than 2 globalized records: one for target and one "
-             "for teams.");
-      unsigned Offset = 0;
-      for (const RecordDecl *RD : GlobalizedRecords.back().Records) {
-        QualType RDTy = CGM.getContext().getRecordType(RD);
-        unsigned Alignment =
-            CGM.getContext().getTypeAlignInChars(RDTy).getQuantity();
-        unsigned Size = CGM.getContext().getTypeSizeInChars(RDTy).getQuantity();
-        Offset =
-            llvm::alignTo(llvm::alignTo(Offset, Alignment) + Size, Alignment);
-      }
-      unsigned Alignment =
-          CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
-      Offset = llvm::alignTo(Offset, Alignment);
-      GlobalizedRecords.back().Records.push_back(GlobalizedVarsRecord);
-      ++GlobalizedRecords.back().RegionCounter;
-      if (GlobalizedRecords.back().Records.size() == 1) {
-        assert(KernelStaticGlobalized &&
-               "Kernel static pointer must be initialized already.");
-        auto *UseSharedMemory = new llvm::GlobalVariable(
-            CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true,
-            llvm::GlobalValue::InternalLinkage, nullptr,
-            "_openmp_static_kernel$is_shared");
-        UseSharedMemory->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
-        QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
-            /*DestWidth=*/16, /*Signed=*/0);
-        llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
-            Address(UseSharedMemory,
-                    CGM.getContext().getTypeAlignInChars(Int16Ty)),
-            /*Volatile=*/false, Int16Ty, Loc);
-        auto *StaticGlobalized = new llvm::GlobalVariable(
-            CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false,
-            llvm::GlobalValue::CommonLinkage, nullptr);
-        auto *RecSize = new llvm::GlobalVariable(
-            CGM.getModule(), CGM.SizeTy, /*isConstant=*/true,
-            llvm::GlobalValue::InternalLinkage, nullptr,
-            "_openmp_static_kernel$size");
-        RecSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
-        llvm::Value *Ld = CGF.EmitLoadOfScalar(
-            Address(RecSize, CGM.getSizeAlign()), /*Volatile=*/false,
-            CGM.getContext().getSizeType(), Loc);
-        llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
-            KernelStaticGlobalized, CGM.VoidPtrPtrTy);
-        llvm::Value *GlobalRecordSizeArg[] = {
-            llvm::ConstantInt::get(
-                CGM.Int16Ty,
-                getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD ? 1 : 0),
-            StaticGlobalized, Ld, IsInSharedMemory, ResAddr};
-        CGF.EmitRuntimeCall(
-            OMPBuilder.getOrCreateRuntimeFunction(
-                CGM.getModule(), OMPRTL___kmpc_get_team_static_memory),
-            GlobalRecordSizeArg);
-        GlobalizedRecords.back().Buffer = StaticGlobalized;
-        GlobalizedRecords.back().RecSize = RecSize;
-        GlobalizedRecords.back().UseSharedMemory = UseSharedMemory;
-        GlobalizedRecords.back().Loc = Loc;
-      }
-      assert(KernelStaticGlobalized && "Global address must be set already.");
-      Address FrameAddr = CGF.EmitLoadOfPointer(
-          Address(KernelStaticGlobalized, CGM.getPointerAlign()),
-          CGM.getContext()
-              .getPointerType(CGM.getContext().VoidPtrTy)
-              .castAs<PointerType>());
-      llvm::Value *GlobalRecValue =
-          Bld.CreateConstInBoundsGEP(FrameAddr, Offset).getPointer();
-      I->getSecond().GlobalRecordAddr = GlobalRecValue;
-      I->getSecond().IsInSPMDModeFlag = nullptr;
-      GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
-          GlobalRecValue, CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo());
-    } else {
-      // TODO: allow the usage of shared memory to be controlled by
-      // the user, for now, default to global.
-      bool UseSharedMemory =
-          IsInTTDRegion && GlobalRecordSize <= SharedMemorySize;
-      llvm::Value *GlobalRecordSizeArg[] = {
-          llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
-          CGF.Builder.getInt16(UseSharedMemory ? 1 : 0)};
-      llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
-          OMPBuilder.getOrCreateRuntimeFunction(
-              CGM.getModule(),
-              IsInTTDRegion ? OMPRTL___kmpc_data_sharing_push_stack
-                            : OMPRTL___kmpc_data_sharing_coalesced_push_stack),
-          GlobalRecordSizeArg);
-      GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
-          GlobalRecValue, GlobalRecPtrTy);
-      I->getSecond().GlobalRecordAddr = GlobalRecValue;
-      I->getSecond().IsInSPMDModeFlag = nullptr;
-    }
-    LValue Base =
-        CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, GlobalRecTy);
-
-    // Emit the "global alloca" which is a GEP from the global declaration
-    // record using the pointer returned by the runtime.
-    LValue SecBase;
-    decltype(I->getSecond().LocalVarData)::const_iterator SecIt;
-    if (IsTTD) {
-      SecIt = I->getSecond().SecondaryLocalVarData->begin();
-      llvm::PointerType *SecGlobalRecPtrTy =
-          CGF.ConvertTypeForMem(SecGlobalRecTy)->getPointerTo();
-      SecBase = CGF.MakeNaturalAlignPointeeAddrLValue(
-          Bld.CreatePointerBitCastOrAddrSpaceCast(
-              I->getSecond().GlobalRecordAddr, SecGlobalRecPtrTy),
-          SecGlobalRecTy);
+  for (auto &Rec : I->getSecond().LocalVarData) {
+    const auto *VD = cast<VarDecl>(Rec.first);
+    bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
+    QualType VarTy = VD->getType();
+
+    // Get the local allocation of a firstprivate variable before sharing
+    llvm::Value *ParValue;
+    if (EscapedParam) {
+      LValue ParLVal =
+          CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
+      ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
    }
-    for (auto &Rec : I->getSecond().LocalVarData) {
-      bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
-      llvm::Value *ParValue;
-      if (EscapedParam) {
-        const auto *VD = cast<VarDecl>(Rec.first);
-        LValue ParLVal =
-            CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
-        ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
-      }
-      LValue VarAddr = CGF.EmitLValueForField(Base, Rec.second.FD);
-      // Emit VarAddr basing on lane-id if required.
-      QualType VarTy;
-      if (Rec.second.IsOnePerTeam) {
-        VarTy = Rec.second.FD->getType();
-      } else {
-        Address Addr = VarAddr.getAddress(CGF);
-        llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP(
-            Addr.getElementType(), Addr.getPointer(),
-            {Bld.getInt32(0), getNVPTXLaneID(CGF)});
-        VarTy =
-            Rec.second.FD->getType()->castAsArrayTypeUnsafe()->getElementType();
-        VarAddr = CGF.MakeAddrLValue(
-            Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)), VarTy,
-            AlignmentSource::Decl);
-      }
-      Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
-      if (!IsInTTDRegion &&
-          (WithSPMDCheck ||
-           getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
-        assert(I->getSecond().IsInSPMDModeFlag &&
-               "Expected unknown execution mode or required SPMD check.");
-        if (IsTTD) {
-          assert(SecIt->second.IsOnePerTeam &&
-                 "Secondary glob data must be one per team.");
-          LValue SecVarAddr = CGF.EmitLValueForField(SecBase, SecIt->second.FD);
-          VarAddr.setAddress(
-              Address(Bld.CreateSelect(IsTTD, SecVarAddr.getPointer(CGF),
-                                       VarAddr.getPointer(CGF)),
-                      VarAddr.getAlignment()));
-          Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
-        }
-        Address GlobalPtr = Rec.second.PrivateAddr;
-        Address LocalAddr = CGF.CreateMemTemp(VarTy, Rec.second.FD->getName());
-        Rec.second.PrivateAddr = Address(
-            Bld.CreateSelect(I->getSecond().IsInSPMDModeFlag,
-                             LocalAddr.getPointer(), GlobalPtr.getPointer()),
-            LocalAddr.getAlignment());
-      }
-      if (EscapedParam) {
-        const auto *VD = cast<VarDecl>(Rec.first);
-        CGF.EmitStoreOfScalar(ParValue, VarAddr);
-        I->getSecond().MappedParams->setVarAddr(CGF, VD,
-                                                VarAddr.getAddress(CGF));
-      }
-      if (IsTTD)
-        ++SecIt;
+
+    // Allocate space for the variable to be globalized
+    llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
+    llvm::Instruction *VoidPtr =
+        CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                                CGM.getModule(), OMPRTL___kmpc_alloc_shared),
+                            AllocArgs, VD->getName());
+
+    // Cast the void pointer and get the address of the globalized variable.
+    llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo();
+    llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
+        VoidPtr, VarPtrTy, VD->getName() + "_on_stack");
+    LValue VarAddr = CGF.MakeNaturalAlignAddrLValue(CastedVoidPtr, VarTy);
+    Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
+    Rec.second.GlobalizedVal = VoidPtr;
+
+    // Assign the local allocation to the newly globalized location.
+    if (EscapedParam) {
+      CGF.EmitStoreOfScalar(ParValue, VarAddr);
+      I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress(CGF));
    }
+    if (auto *DI = CGF.getDebugInfo())
+      VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
  }
-  for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) {
-    // Recover pointer to this function's global record. The runtime will
-    // handle the specifics of the allocation of the memory.
-    // Use actual memory size of the record including the padding
+  for (const auto *VD : I->getSecond().EscapedVariableLengthDecls) {
+    // Use actual memory size of the VLA object including the padding
     // for alignment purposes.
-    CGBuilderTy &Bld = CGF.Builder;
     llvm::Value *Size = CGF.getTypeSize(VD->getType());
     CharUnits Align = CGM.getContext().getDeclAlign(VD);
     Size = Bld.CreateNUWAdd(
         Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
     llvm::Value *AlignVal =
         llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
+    Size = Bld.CreateUDiv(Size, AlignVal);
     Size = Bld.CreateNUWMul(Size, AlignVal);
-    // TODO: allow the usage of shared memory to be controlled by
-    // the user, for now, default to global.
-    llvm::Value *GlobalRecordSizeArg[] = {
-        Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
-    llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
-        OMPBuilder.getOrCreateRuntimeFunction(
-            CGM.getModule(), OMPRTL___kmpc_data_sharing_coalesced_push_stack),
-        GlobalRecordSizeArg);
-    llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
-        GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo());
-    LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(),
+
+    // Allocate space for this VLA object to be globalized.
+    llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
+    llvm::Instruction *VoidPtr =
+        CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                                CGM.getModule(), OMPRTL___kmpc_alloc_shared),
+                            AllocArgs, VD->getName());
+
+    I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(VoidPtr);
+    LValue Base = CGF.MakeAddrLValue(VoidPtr, VD->getType(),
                                      CGM.getContext().getDeclAlign(VD),
                                      AlignmentSource::Decl);
     I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
                                             Base.getAddress(CGF));
-    I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue);
   }
   I->getSecond().MappedParams->apply(CGF);
 }
@@ -1997,60 +1750,20 @@
   const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
   if (I != FunctionGlobalizedDecls.end()) {
-    I->getSecond().MappedParams->restore(CGF);
-    if (!CGF.HaveInsertPoint())
-      return;
+    // Deallocate the memory for each globalized VLA object
     for (llvm::Value *Addr :
          llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
-      CGF.EmitRuntimeCall(
-          OMPBuilder.getOrCreateRuntimeFunction(
-              CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack),
-          Addr);
+      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                              CGM.getModule(), OMPRTL___kmpc_free_shared),
+                          Addr);
     }
-    if (I->getSecond().GlobalRecordAddr) {
-      if (!IsInTTDRegion &&
-          (WithSPMDCheck ||
-           getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
-        CGBuilderTy &Bld = CGF.Builder;
-        llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
-        llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
-        Bld.CreateCondBr(I->getSecond().IsInSPMDModeFlag, ExitBB, NonSPMDBB);
-        // There is no need to emit line number for unconditional branch.
-        (void)ApplyDebugLocation::CreateEmpty(CGF);
-        CGF.EmitBlock(NonSPMDBB);
-        CGF.EmitRuntimeCall(
-            OMPBuilder.getOrCreateRuntimeFunction(
-                CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack),
-            CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr));
-        CGF.EmitBlock(ExitBB);
-      } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) {
-        assert(GlobalizedRecords.back().RegionCounter > 0 &&
-               "region counter must be > 0.");
-        --GlobalizedRecords.back().RegionCounter;
-        // Emit the restore function only in the target region.
-        if (GlobalizedRecords.back().RegionCounter == 0) {
-          QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
-              /*DestWidth=*/16, /*Signed=*/0);
-          llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
-              Address(GlobalizedRecords.back().UseSharedMemory,
-                      CGM.getContext().getTypeAlignInChars(Int16Ty)),
-              /*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc);
-          llvm::Value *Args[] = {
-              llvm::ConstantInt::get(
-                  CGM.Int16Ty,
-                  getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD ? 1 : 0),
-              IsInSharedMemory};
-          CGF.EmitRuntimeCall(
-              OMPBuilder.getOrCreateRuntimeFunction(
-                  CGM.getModule(), OMPRTL___kmpc_restore_team_static_memory),
-              Args);
-        }
-      } else {
-        CGF.EmitRuntimeCall(
-            OMPBuilder.getOrCreateRuntimeFunction(
-                CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack),
-            I->getSecond().GlobalRecordAddr);
-      }
+    // Deallocate the memory for each globalized value
+    for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
+      I->getSecond().MappedParams->restore(CGF);
+
+      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                              CGM.getModule(), OMPRTL___kmpc_free_shared),
+                          {Rec.second.GlobalizedVal});
     }
   }
 }
@@ -4333,6 +4046,7 @@
   }
   if (!Body)
     return;
+
   CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
   VarChecker.Visit(Body);
   const RecordDecl *GlobalizedVarsRecord =
@@ -4346,7 +4060,6 @@
   auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
   I->getSecond().MappedParams =
       std::make_unique<CodeGenFunction::OMPMapVars>();
-  I->getSecond().GlobalRecord = GlobalizedVarsRecord;
   I->getSecond().EscapedParameters.insert(
       VarChecker.getEscapedParameters().begin(),
       VarChecker.getEscapedParameters().end());
@@ -4355,21 +4068,16 @@
   DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
   for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
     assert(VD->isCanonicalDecl() && "Expected canonical declaration");
-    const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
-    Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion)));
+    Data.insert(std::make_pair(VD, MappedVarData()));
   }
   if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
     CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None);
     VarChecker.Visit(Body);
-    I->getSecond().SecondaryGlobalRecord =
-        VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true);
     I->getSecond().SecondaryLocalVarData.emplace();
     DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue();
     for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
       assert(VD->isCanonicalDecl() && "Expected canonical declaration");
-      const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
-      Data.insert(
-          std::make_pair(VD, MappedVarData(FD, /*IsInTTDRegion=*/true)));
+      Data.insert(std::make_pair(VD, MappedVarData()));
    }
  }
  if (!NeedToDelayGlobalization) {
@@ -4660,185 +4368,8 @@
   CGOpenMPRuntime::processRequiresDirective(D);
 }

-/// Get number of SMs and number of blocks per SM.
-static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) {
-  std::pair<unsigned, unsigned> Data;
-  if (CGM.getLangOpts().OpenMPCUDANumSMs)
-    Data.first = CGM.getLangOpts().OpenMPCUDANumSMs;
-  if (CGM.getLangOpts().OpenMPCUDABlocksPerSM)
-    Data.second = CGM.getLangOpts().OpenMPCUDABlocksPerSM;
-  if (Data.first && Data.second)
-    return Data;
-  switch (getCudaArch(CGM)) {
-  case CudaArch::SM_20:
-  case CudaArch::SM_21:
-  case CudaArch::SM_30:
-  case CudaArch::SM_32:
-  case CudaArch::SM_35:
-  case CudaArch::SM_37:
-  case CudaArch::SM_50:
-  case CudaArch::SM_52:
-  case CudaArch::SM_53:
-    return {16, 16};
-  case CudaArch::SM_60:
-  case CudaArch::SM_61:
-  case CudaArch::SM_62:
-    return {56, 32};
-  case CudaArch::SM_70:
-  case CudaArch::SM_72:
-  case CudaArch::SM_75:
-  case CudaArch::SM_80:
-  case CudaArch::SM_86:
-    return {84, 32};
-  case CudaArch::GFX600:
-  case CudaArch::GFX601:
-  case CudaArch::GFX602:
-  case CudaArch::GFX700:
-  case CudaArch::GFX701:
-  case CudaArch::GFX702:
-  case CudaArch::GFX703:
-  case CudaArch::GFX704:
-  case CudaArch::GFX705:
-  case CudaArch::GFX801:
-  case CudaArch::GFX802:
-  case CudaArch::GFX803:
-  case CudaArch::GFX805:
-  case CudaArch::GFX810:
-  case CudaArch::GFX900:
-  case CudaArch::GFX902:
-  case CudaArch::GFX904:
-  case CudaArch::GFX906:
-  case CudaArch::GFX908:
-  case CudaArch::GFX909:
-  case CudaArch::GFX90a:
-  case CudaArch::GFX90c:
-  case CudaArch::GFX1010:
-  case CudaArch::GFX1011:
-  case CudaArch::GFX1012:
-  case CudaArch::GFX1030:
-  case CudaArch::GFX1031:
-  case CudaArch::GFX1032:
-  case CudaArch::GFX1033:
-  case CudaArch::UNUSED:
-  case CudaArch::UNKNOWN:
-    break;
-  case CudaArch::LAST:
-    llvm_unreachable("Unexpected Cuda arch.");
-  }
-  llvm_unreachable("Unexpected NVPTX target without ptx feature.");
-}
-
 void CGOpenMPRuntimeGPU::clear() {
-  if (!GlobalizedRecords.empty() &&
-      !CGM.getLangOpts().OpenMPCUDATargetParallel) {
-    ASTContext &C = CGM.getContext();
-    llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> GlobalRecs;
-    llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> SharedRecs;
-    RecordDecl *StaticRD = C.buildImplicitRecord(
-        "_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
-    StaticRD->startDefinition();
-    RecordDecl *SharedStaticRD = C.buildImplicitRecord(
-        "_shared_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
-    SharedStaticRD->startDefinition();
-    for (const GlobalPtrSizeRecsTy &Records : GlobalizedRecords) {
-      if (Records.Records.empty())
-        continue;
-      unsigned Size = 0;
-      unsigned RecAlignment = 0;
-      for (const RecordDecl *RD : Records.Records) {
-        QualType RDTy = C.getRecordType(RD);
-        unsigned Alignment = C.getTypeAlignInChars(RDTy).getQuantity();
-        RecAlignment = std::max(RecAlignment, Alignment);
-        unsigned RecSize = C.getTypeSizeInChars(RDTy).getQuantity();
-        Size =
-            llvm::alignTo(llvm::alignTo(Size, Alignment) + RecSize, Alignment);
-      }
-      Size = llvm::alignTo(Size, RecAlignment);
-      llvm::APInt ArySize(/*numBits=*/64, Size);
-      QualType SubTy = C.getConstantArrayType(
-          C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0);
-      const bool UseSharedMemory = Size <= SharedMemorySize;
-      auto *Field =
-          FieldDecl::Create(C, UseSharedMemory ? SharedStaticRD : StaticRD,
-                            SourceLocation(), SourceLocation(), nullptr, SubTy,
-                            C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
-                            /*BW=*/nullptr, /*Mutable=*/false,
-                            /*InitStyle=*/ICIS_NoInit);
-      Field->setAccess(AS_public);
-      if (UseSharedMemory) {
-        SharedStaticRD->addDecl(Field);
-        SharedRecs.push_back(&Records);
-      } else {
-        StaticRD->addDecl(Field);
-        GlobalRecs.push_back(&Records);
-      }
-      Records.RecSize->setInitializer(llvm::ConstantInt::get(CGM.SizeTy, Size));
-      Records.UseSharedMemory->setInitializer(
-          llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0));
-    }
-    // Allocate SharedMemorySize buffer for the shared memory.
-    // FIXME: nvlink does not handle weak linkage correctly (object with the
-    // different size are reported as erroneous).
-    // Restore this code as sson as nvlink is fixed.
-    if (!SharedStaticRD->field_empty()) {
-      llvm::APInt ArySize(/*numBits=*/64, SharedMemorySize);
-      QualType SubTy = C.getConstantArrayType(
-          C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0);
-      auto *Field = FieldDecl::Create(
-          C, SharedStaticRD, SourceLocation(), SourceLocation(), nullptr, SubTy,
-          C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
-          /*BW=*/nullptr, /*Mutable=*/false,
-          /*InitStyle=*/ICIS_NoInit);
-      Field->setAccess(AS_public);
-      SharedStaticRD->addDecl(Field);
-    }
-    SharedStaticRD->completeDefinition();
-    if (!SharedStaticRD->field_empty()) {
-      QualType StaticTy = C.getRecordType(SharedStaticRD);
-      llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy);
-      auto *GV = new llvm::GlobalVariable(
-          CGM.getModule(), LLVMStaticTy,
-          /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage,
-          llvm::UndefValue::get(LLVMStaticTy),
-          "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
-          llvm::GlobalValue::NotThreadLocal,
-          C.getTargetAddressSpace(LangAS::cuda_shared));
-      auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
-          GV, CGM.VoidPtrTy);
-      for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) {
-        Rec->Buffer->replaceAllUsesWith(Replacement);
-        Rec->Buffer->eraseFromParent();
-      }
-    }
-    StaticRD->completeDefinition();
-    if (!StaticRD->field_empty()) {
-      QualType StaticTy = C.getRecordType(StaticRD);
-      std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM);
-      llvm::APInt Size1(32, SMsBlockPerSM.second);
-      QualType Arr1Ty =
-          C.getConstantArrayType(StaticTy, Size1, nullptr, ArrayType::Normal,
-                                 /*IndexTypeQuals=*/0);
-      llvm::APInt Size2(32, SMsBlockPerSM.first);
-      QualType Arr2Ty =
-          C.getConstantArrayType(Arr1Ty, Size2, nullptr, ArrayType::Normal,
-                                 /*IndexTypeQuals=*/0);
-      llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
-      // FIXME: nvlink does not handle weak linkage correctly (object with the
-      // different size are reported as erroneous).
-      // Restore CommonLinkage as soon as nvlink is fixed.
-      auto *GV = new llvm::GlobalVariable(
-          CGM.getModule(), LLVMArr2Ty,
-          /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
-          llvm::Constant::getNullValue(LLVMArr2Ty),
-          "_openmp_static_glob_rd_$_");
-      auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
-          GV, CGM.VoidPtrTy);
-      for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) {
-        Rec->Buffer->replaceAllUsesWith(Replacement);
-        Rec->Buffer->eraseFromParent();
-      }
-    }
-  }
+
   if (!TeamsReductions.empty()) {
     ASTContext &C = CGM.getContext();
     RecordDecl *StaticRD = C.buildImplicitRecord(
diff --git a/clang/test/OpenMP/assumes_include_nvptx.cpp b/clang/test/OpenMP/assumes_include_nvptx.cpp
--- a/clang/test/OpenMP/assumes_include_nvptx.cpp
+++ b/clang/test/OpenMP/assumes_include_nvptx.cpp
@@ -19,8 +19,6 @@
 // CHECK-DAG: declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() [[attr1]]
 // CHECK: declare void @__kmpc_kernel_init(i32, i16)
 // CHECK-NOT: #
-// CHECK: declare void @__kmpc_data_sharing_init_stack()
-// CHECK-NOT: #
 // CHECK: declare float @_Z3sinf(float) [[attr2:#[0-9]*]]
 // CHECK: declare void @__kmpc_kernel_deinit(i16)
 // CHECK-NOT: #
diff --git a/clang/test/OpenMP/declare_target_codegen_globalization.cpp b/clang/test/OpenMP/declare_target_codegen_globalization.cpp
--- a/clang/test/OpenMP/declare_target_codegen_globalization.cpp
+++ b/clang/test/OpenMP/declare_target_codegen_globalization.cpp
@@ -1,3 +1,4 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-function-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+_(.*)_l[0-9]+"
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s
 // expected-no-diagnostics
@@ -9,8 +10,6 @@
   return foo(a);
 }

-// CHECK: define weak void @__omp_offloading_{{.*}}maini1{{.*}}_l[[@LINE+5]](i32* nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %{{.*}})
-// CHECK-NOT: @__kmpc_data_sharing_coalesced_push_stack
 int maini1() {
   int a;
@@ -23,36 +22,67 @@
 }

 // parallel region
-// CHECK: define {{.*}}void @{{.*}}(i32* noalias {{.*}}, i32* noalias {{.*}}, i32* nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %{{.*}})
-// CHECK-NOT: call i8* @__kmpc_data_sharing_coalesced_push_stack(
-// CHECK: [[B_ADDR:%.+]] = alloca i32,
-// CHECK: call {{.*}}[[FOO:@.*foo.*]](i32* nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) [[B_ADDR]])
-// CHECK: call {{.*}}[[BAR:@.*bar.*]]()
-// CHECK-NOT: call void @__kmpc_data_sharing_pop_stack(
-// CHECK: ret void
-
-// CHECK: define {{.*}}[[FOO]](i32* nonnull align {{[0-9]+}} dereferenceable{{.*}})
-// CHECK-NOT: @__kmpc_data_sharing_coalesced_push_stack
-
-// CHECK: define {{.*}}[[BAR]]()
-// CHECK: alloca i32,
-// CHECK: [[A_LOCAL_ADDR:%.+]] = alloca i32,
-// CHECK: [[RES:%.+]] = call i8 @__kmpc_is_spmd_exec_mode()
-// CHECK: [[IS_SPMD:%.+]] = icmp ne i8 [[RES]], 0
-// CHECK: br i1 [[IS_SPMD]], label
-// CHECK: br label
-// CHECK: [[RES:%.+]] = call i8* @__kmpc_data_sharing_coalesced_push_stack(i64 128, i16 0)
-// CHECK: [[GLOBALS:%.+]] = bitcast i8* [[RES]] to [[GLOBAL_ST:%.+]]*
-// CHECK: br label
-// CHECK: [[ITEMS:%.+]] = phi [[GLOBAL_ST]]* [ null, {{.+}} ], [ [[GLOBALS]], {{.+}} ]
-// CHECK: [[A_ADDR:%.+]] = getelementptr inbounds [[GLOBAL_ST]], [[GLOBAL_ST]]* [[ITEMS]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
-// CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK: [[LID:%.+]] = and i32 [[TID]], 31
-// CHECK: [[A_GLOBAL_ADDR:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[A_ADDR]], i32 0, i32 [[LID]]
-// CHECK: [[A_ADDR:%.+]] = select i1 [[IS_SPMD]], i32* [[A_LOCAL_ADDR]], i32* [[A_GLOBAL_ADDR]]
-// CHECK: call {{.*}}[[FOO]](i32* nonnull align {{[0-9]+}} dereferenceable{{.*}} [[A_ADDR]])
-// CHECK: br i1 [[IS_SPMD]], label
-// CHECK: [[BC:%.+]] = bitcast [[GLOBAL_ST]]* [[ITEMS]] to i8*
-// CHECK: call void @__kmpc_data_sharing_pop_stack(i8* [[BC]])
-// CHECK: br label
-// CHECK: ret i32
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z6maini1v_l[0-9]+}}
+// CHECK-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTBOUND_ZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32 0, i32* [[DOTBOUND_ZERO_ADDR]], align 4
+// CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
+// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
+// CHECK-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK: .execute:
+// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
+// CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTBOUND_ZERO_ADDR]], i32* [[TMP0]]) #[[ATTR3:[0-9]+]]
+// CHECK-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK: .omp.deinit:
+// CHECK-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
+// CHECK-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK: .exit:
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[B:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
+// CHECK-NEXT: [[CALL:%.*]] = call i32 @_Z3fooRi(i32* nonnull align 4 dereferenceable(4) [[B]]) #[[ATTR4:[0-9]+]]
+// CHECK-NEXT: [[CALL1:%.*]] = call i32 @_Z3barv() #[[ATTR4]]
+// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[CALL]], [[CALL1]]
+// CHECK-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@_Z3fooRi
+// CHECK-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-NEXT: ret i32 [[TMP1]]
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@_Z3barv
+// CHECK-SAME: () #[[ATTR2]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
+// CHECK-NEXT: [[A_ON_STACK:%.*]] = bitcast i8* [[A]] to i32*
+// CHECK-NEXT: [[CALL:%.*]] = call i32 @_Z3fooRi(i32* nonnull align 4 dereferenceable(4) [[A_ON_STACK]]) #[[ATTR4]]
+// CHECK-NEXT: call void @__kmpc_free_shared(i8* [[A]])
+// CHECK-NEXT: ret i32 [[CALL]]
+//
diff --git a/clang/test/OpenMP/nvptx_data_sharing.cpp b/clang/test/OpenMP/nvptx_data_sharing.cpp
--- a/clang/test/OpenMP/nvptx_data_sharing.cpp
+++ b/clang/test/OpenMP/nvptx_data_sharing.cpp
@@ -1,9 +1,9 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-function-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+_(.*)_l[0-9]+"
 // Test device global memory data sharing codegen.
 ///==========================================================================///

 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CK1 --check-prefix SEQ
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CK1 --check-prefix PAR
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s
 // expected-no-diagnostics
@@ -27,93 +27,75 @@
 }
 }
 }
-// SEQ: [[MEM_TY:%.+]] = type { [128 x i8] }
-// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] undef
-// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* undef
-// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i64 8
-// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
-
-/// ========= In the worker function ========= ///
-// CK1: {{.*}}define internal void @__omp_offloading{{.*}}test_ds{{.*}}_worker()
-// CK1: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CK1-NOT: call void @__kmpc_data_sharing_init_stack
-
-/// ========= In the kernel function ========= ///
-
-// CK1: {{.*}}define weak void @__omp_offloading{{.*}}test_ds{{.*}}()
-// CK1: [[SHAREDARGS1:%.+]] = alloca i8**
-// CK1: [[SHAREDARGS2:%.+]] = alloca i8**
-// CK1: call void @__kmpc_kernel_init
-// CK1: call void @__kmpc_data_sharing_init_stack
-// SEQ: [[SHARED_MEM_FLAG:%.+]] = load i16, i16* [[KERNEL_SHARED]],
-// SEQ: [[SIZE:%.+]] = load i64, i64* [[KERNEL_SIZE]],
-// SEQ: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i64 [[SIZE]], i16 [[SHARED_MEM_FLAG]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
-// SEQ: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
-// SEQ: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i64 0
-// PAR: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 8, i16 1)
-// CK1: [[GLOBALSTACK2:%.+]] = bitcast i8* [[GLOBALSTACK]] to %struct._globalized_locals_ty*
-// CK1: [[A:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[GLOBALSTACK2]], i32 0, i32 0
-// CK1: [[B:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[GLOBALSTACK2]], i32 0, i32 1
-// CK1: store i32 10, i32* [[A]]
-// CK1: call void @__kmpc_kernel_prepare_parallel({{.*}})
-// CK1: call void @__kmpc_begin_sharing_variables(i8*** [[SHAREDARGS1]], i64 1)
-// CK1: [[SHARGSTMP1:%.+]] = load i8**, i8*** [[SHAREDARGS1]]
-// CK1: [[SHARGSTMP2:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP1]], i64 0
-// CK1: [[SHAREDVAR:%.+]] = bitcast i32* [[A]] to i8*
-// CK1: store i8* [[SHAREDVAR]], i8** [[SHARGSTMP2]]
-// CK1: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CK1: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CK1: call void @__kmpc_end_sharing_variables()
-// CK1: store i32 100, i32* [[B]]
-// CK1: call void @__kmpc_kernel_prepare_parallel({{.*}})
-// CK1: call void @__kmpc_begin_sharing_variables(i8*** [[SHAREDARGS2]], i64 2)
-// CK1: [[SHARGSTMP3:%.+]] = load i8**, i8*** [[SHAREDARGS2]]
-// CK1: [[SHARGSTMP4:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP3]], i64 0
-// CK1: [[SHAREDVAR1:%.+]] = bitcast i32* [[B]] to i8*
-// CK1: store i8* [[SHAREDVAR1]], i8** [[SHARGSTMP4]]
-// CK1: [[SHARGSTMP12:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP3]], i64 1
-// CK1: [[SHAREDVAR2:%.+]] = bitcast i32* [[A]] to i8*
-// CK1: store i8* [[SHAREDVAR2]], i8** [[SHARGSTMP12]]
-// CK1: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CK1: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CK1: call void @__kmpc_end_sharing_variables()
-// SEQ: [[SHARED_MEM_FLAG:%.+]] = load i16, i16* [[KERNEL_SHARED]],
-// SEQ: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[SHARED_MEM_FLAG]])
-// PAR: call void @__kmpc_data_sharing_pop_stack(i8* [[GLOBALSTACK]])
-// CK1: call void @__kmpc_kernel_deinit(i16 1)
-
-/// ========= In the data sharing wrapper function ========= ///
-
-// CK1: {{.*}}define internal void @__omp_outlined{{.*}}wrapper({{.*}})
-// CK1: [[SHAREDARGS4:%.+]] = alloca i8**
-// CK1: call void @__kmpc_get_shared_variables(i8*** [[SHAREDARGS4]])
-// CK1: [[SHARGSTMP13:%.+]] = load i8**, i8*** [[SHAREDARGS4]]
-// CK1: [[SHARGSTMP14:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP13]], i64 0
-// CK1: [[SHARGSTMP15:%.+]] = bitcast i8** [[SHARGSTMP14]] to i32**
-// CK1: [[SHARGSTMP16:%.+]] = load i32*, i32** [[SHARGSTMP15]]
-// CK1: call void @__omp_outlined__{{.*}}({{.*}}, i32* [[SHARGSTMP16]])
-
-/// outlined function for the second parallel region ///
-
-// CK1: define internal void @{{.+}}(i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* nonnull align {{[0-9]+}} dereferenceable{{.+}}, i32* nonnull align {{[0-9]+}} dereferenceable{{.+}})
-// CK1-NOT: call i8* @__kmpc_data_sharing_push_stack(
-// CK1: [[C_ADDR:%.+]] = alloca i32,
-// CK1: store i32* [[C_ADDR]], i32** %
-// CK1i-NOT: call void @__kmpc_data_sharing_pop_stack(
-
-/// ========= In the data sharing wrapper function ========= ///
-
-// CK1: {{.*}}define internal void @__omp_outlined{{.*}}wrapper({{.*}})
-// CK1: [[SHAREDARGS3:%.+]] = alloca i8**
-// CK1: call void @__kmpc_get_shared_variables(i8*** [[SHAREDARGS3]])
-// CK1: [[SHARGSTMP5:%.+]] = load i8**, i8*** [[SHAREDARGS3]]
-// CK1: [[SHARGSTMP6:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP5]], i64 0
-// CK1: [[SHARGSTMP7:%.+]] = bitcast i8** [[SHARGSTMP6]] to i32**
-// CK1: [[SHARGSTMP8:%.+]] = load i32*, i32** [[SHARGSTMP7]]
-// CK1: [[SHARGSTMP9:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP5]], i64 1
-// CK1: [[SHARGSTMP10:%.+]] = bitcast i8** [[SHARGSTMP9]] to i32**
-// CK1: [[SHARGSTMP11:%.+]] = load i32*, i32** [[SHARGSTMP10]]
-// CK1: call void @__omp_outlined__{{.*}}({{.*}}, i32* [[SHARGSTMP8]], i32* [[SHARGSTMP11]])
+// CHECK: entry:
+// CHECK: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK: [[SHARED_ARG_REFS:%.*]] = alloca i8**, align 8
+// CHECK: [[C:%.*]] = alloca i32, align 4
+// CHECK: [[DOTZERO_ADDR7:%.*]] = alloca i32, align 4
+// CHECK: [[SHARED_ARG_REFS8:%.*]] = alloca i8**, align 8
+// CHECK: store i32 0, i32* [[DOTZERO_ADDR7]], align 4
+// CHECK: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK: .worker:
+// CHECK: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z7test_dsv_l[0-9]+}}_worker() #[[ATTR4:[0-9]+]]
+// CHECK: br label [[DOTEXIT:%.*]]
+// CHECK: .mastercheck:
+// CHECK: [[NVPTX_NUM_THREADS1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK: [[NVPTX_WARP_SIZE2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE2]], 1
+// CHECK: [[TMP2:%.*]] = xor i32 [[TMP1]], -1
+// CHECK: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS1]], 1
+// CHECK: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP2]]
+// CHECK: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]]
+// CHECK: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK: .master:
+// CHECK: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE4]]
+// CHECK: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
+// CHECK: [[A:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
+// CHECK: [[A_ON_STACK:%.*]] = bitcast i8* [[A]] to i32*
+// CHECK: [[B:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
+// CHECK: [[B_ON_STACK:%.*]] = bitcast i8* [[B]] to i32*
+// CHECK: store i32 10, i32* [[A_ON_STACK]], align 4
+// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*))
+// CHECK: call void @__kmpc_begin_sharing_variables(i8*** [[SHARED_ARG_REFS]], i64 1)
+// CHECK: [[TMP5:%.*]] = load i8**, i8*** [[SHARED_ARG_REFS]], align 8
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds i8*, i8** [[TMP5]], i64 0
+// CHECK: [[TMP7:%.*]] = bitcast i32* [[A_ON_STACK]] to i8*
+// CHECK: store i8* [[TMP7]], i8** [[TMP6]], align 8
+// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK: call void @__kmpc_end_sharing_variables()
+// CHECK: store i32 100, i32* [[B_ON_STACK]], align 4
+// CHECK: store i32 1000, i32* [[C]], align 4
+// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*))
+// CHECK: call void @__kmpc_begin_sharing_variables(i8*** [[SHARED_ARG_REFS8]], i64 2)
+// CHECK: [[TMP8:%.*]] = load i8**, i8*** [[SHARED_ARG_REFS8]], align 8
+// CHECK: [[TMP9:%.*]] = getelementptr inbounds i8*, i8** [[TMP8]], i64 0
+// CHECK: [[TMP10:%.*]] = bitcast i32* [[B_ON_STACK]] to i8*
+// CHECK: store i8* [[TMP10]], i8** [[TMP9]], align 8
+// CHECK: [[TMP11:%.*]] = getelementptr inbounds i8*, i8** [[TMP8]], i64 1
+// CHECK: [[TMP12:%.*]] = bitcast i32* [[A_ON_STACK]] to i8*
+// CHECK: store i8* [[TMP12]], i8** [[TMP11]], align 8
+// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK: call void @__kmpc_end_sharing_variables()
+// CHECK: call void @__kmpc_free_shared(i8* [[B]])
+// CHECK: call void @__kmpc_free_shared(i8* [[A]])
+// CHECK: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK: .termination.notifier:
+// CHECK: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK: br label [[DOTEXIT]]
+// CHECK: .exit:
+// CHECK: ret void

 #endif
-
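Taken together, the updated checks pin down the intended lowering: one __kmpc_alloc_shared per escaped variable in the target prolog, uses through the *_on_stack bitcast, and matching __kmpc_free_shared calls in reverse order in the epilog. A hypothetical standalone reduction of the pattern this test exercises (modeled on the test, not the test source itself):

void test_ds_sketch() {
#pragma omp target
  {
    int a = 10;
#pragma omp parallel
    a += 1;       // 'a' escapes: backed by @__kmpc_alloc_shared(i64 4)
    int b = 100;
    int c = 1000; // 'c' never escapes: remains an ordinary alloca
#pragma omp parallel
    b += a;       // 'b' escapes too: a second @__kmpc_alloc_shared(i64 4)
    c += b;
  } // epilog frees 'b' first, then 'a' (reverse allocation order)
}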
diff --git a/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp b/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
--- a/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
@@ -1,21 +1,15 @@
 // Test target codegen - host bc file has to be created first.
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ
-// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR
-// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK
+// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK
 // expected-no-diagnostics
 #ifndef HEADER
@@ -34,36 +28,17 @@
   return 0;
 }

-// SEQ: [[MEM_TY:%.+]] = type { [128 x i8] }
-// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] undef
-// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* undef
-// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 40
-// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
-// CHECK-DAG: @__omp_offloading_{{.*}}_main_[[LINE:l.+]]_exec_mode = weak constant i8 0
+// CHECK: @__omp_offloading_{{.*}}_main_[[LINE:l.+]]_exec_mode = weak constant i8 0

 // CHECK: define weak void @__omp_offloading_{{.*}}_main_[[LINE]]([10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}}, [10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, i{{64|32}} %{{.+}}, [10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}})
-// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
-// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]],
-// SEQ: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
-// SEQ: [[PTR:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
-// SEQ: [[GEP:%.+]] = getelementptr inbounds i8, i8* [[PTR]], i{{64|32}} 0
-// PAR: [[GEP:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 40, i16 1)
-// CHECK: [[STACK:%.+]] = bitcast i8* [[GEP]] to %struct._globalized_locals_ty*
-// CHECK: getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK-NOT: getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]],
+// CHECK: [[CVOIDPTR:%.+]] = call i8* @__kmpc_alloc_shared(i{{32|64}} 40)
+// CHECK: [[CSTACK:%.+]] = bitcast i8* [[CVOIDPTR]] to [10 x i{{32|64}}]*
 // CHECK: call void @__kmpc_for_static_init_4(
 // CHECK: call void [[PARALLEL:@.+]](
 // CHECK: call void @__kmpc_for_static_fini(%struct.ident_t* @
-// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
-// SEQ: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[SHARED]])
-// PAR: call void @__kmpc_data_sharing_pop_stack(i8* [[GEP]])
-
-// CHECK: define internal void [[PARALLEL]](
-// CHECK-NOT: call i8* @__kmpc_data_sharing_push_stack(
-
-// CHECK-NOT: call void @__kmpc_data_sharing_pop_stack(
+// CHECK: call void @__kmpc_free_shared(i8* [[CVOIDPTR]])

 #endif
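The 40-byte allocation checked above comes straight from getTypeSize on the [10 x i32] array; for true variable-length objects the prolog first rounds the dynamic size up to the declared alignment (the CreateNUWAdd/CreateUDiv/CreateNUWMul sequence in the CGOpenMPRuntimeGPU.cpp hunk above). A worked sketch of that arithmetic with illustrative values:

#include <cassert>
#include <cstdint>

// Same shape as the emitted IR: add Align-1, divide, re-multiply.
std::uint64_t alignUpSketch(std::uint64_t Size, std::uint64_t Align) {
  Size += Align - 1;   // Bld.CreateNUWAdd(Size, Align - 1)
  Size /= Align;       // Bld.CreateUDiv(Size, AlignVal)
  return Size * Align; // Bld.CreateNUWMul(Size, AlignVal)
}

int main() {
  assert(alignUpSketch(40, 8) == 40); // already aligned: unchanged
  assert(alignUpSketch(10, 8) == 16); // rounded up to the next multiple of 8
  return 0;
}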
// CLASS: [[L:%.+]] = load [[CAP1]]*, [[CAP1]]** [[L_ADDR:%.+]], @@ -72,15 +71,14 @@ } } s; -// FUN: define internal void @__omp_offloading_{{.+}}_main_l124_worker() -// FUN: define weak void @__omp_offloading_{{.+}}_main_l124(i64 %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, i32* %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, [[CAP2]]* nonnull align 8 dereferenceable(40) %{{.+}}) +// FUN: define internal void @__omp_offloading_{{.+}}_main_{{.*}}_worker() +// FUN: define weak void @__omp_offloading_{{.+}}_main_{{.*}}(i64 %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, i32* %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, [[CAP2]]* nonnull align 8 dereferenceable(40) %{{.+}}) // FUN-NOT: getelementptr // FUN: br i1 % -// FUN: call void @__omp_offloading_{{.*}}_{{.*}}main{{.*}}_l124_worker() +// FUN: call void @__omp_offloading_{{.*}}_{{.*}}main{{.*}}_{{.*}}_worker() // FUN: br label % // FUN: br i1 % // FUN: call void @__kmpc_kernel_init( -// FUN: call void @__kmpc_data_sharing_init_stack() // FUN: call void @llvm.memcpy. // FUN: [[L:%.+]] = load [[CAP2]]*, [[CAP2]]** [[L_ADDR:%.+]], // FUN: [[ARGC_CAP:%.+]] = getelementptr inbounds [[CAP2]], [[CAP2]]* [[L]], i32 0, i32 0 @@ -97,7 +95,7 @@ // FUN: call i64 [[LAMBDA2:@.+main.+]]([[CAP2]]* {{[^,]*}} [[L]]) // FUN: ret void -// FUN: define weak void @__omp_offloading_{{.+}}_main_l126(i32* nonnull align 4 dereferenceable(4) %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}} i32* nonnull align 4 dereferenceable(4) %{{.+}}, i32* %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, [[CAP2]]* nonnull align 8 dereferenceable(40) %{{.+}}) +// FUN: define weak void @__omp_offloading_{{.+}}_main_{{.*}}(i32* nonnull align 4 dereferenceable(4) %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}} i32* nonnull align 4 dereferenceable(4) %{{.+}}, i32* %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, [[CAP2]]* nonnull align 8 dereferenceable(40) %{{.+}}) // FUN-NOT: getelementptr // FUN: call void @llvm.memcpy. // FUN: [[L:%.+]] = load [[CAP2]]*, [[CAP2]]** [[L_ADDR:%.+]], diff --git a/clang/test/OpenMP/nvptx_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_parallel_codegen.cpp --- a/clang/test/OpenMP/nvptx_parallel_codegen.cpp +++ b/clang/test/OpenMP/nvptx_parallel_codegen.cpp @@ -1,12 +1,9 @@ // Test target codegen - host bc file has to be created first. 
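Every device-side hunk in this patch performs the same substitution: the record-based globalization (a runtime-provided buffer bitcast to %struct._globalized_locals_ty plus one getelementptr per escaped variable) becomes one paired allocation per escaped variable. A C-like sketch of the sequence the new CHECK lines match, with the runtime prototypes transcribed from the call sites (illustrative only; the tests themselves match the emitted LLVM IR):

extern "C" void *__kmpc_alloc_shared(unsigned long Size);
extern "C" void __kmpc_free_shared(void *Ptr);

// One escaped 4-byte local, in the order the updated checks expect:
void globalize(int a) {
  void *ptr = __kmpc_alloc_shared(4);  // replaces __kmpc_get_team_static_memory
                                       // and __kmpc_data_sharing_push_stack
  int *a_on_stack = (int *)ptr;        // plain bitcast; the per-variable GEP into
                                       // %struct._globalized_locals_ty is gone
  *a_on_stack = a;                     // seed with the incoming value
  /* ... region body reads and writes a_on_stack ... */
  __kmpc_free_shared(ptr);             // replaces __kmpc_restore_team_static_memory
                                       // and __kmpc_data_sharing_pop_stack
}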
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -75,15 +72,9 @@ return a; } -// SEQ: [[MEM_TY:%.+]] = type { [128 x i8] } -// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] undef -// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* undef -// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4 -// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 +// CHECK-NOT: define {{.*}}void 
{{@__omp_offloading_.+template.+l17}}_worker() -// CHECK-NOT: define {{.*}}void {{@__omp_offloading_.+template.+l20}}_worker() - -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l29}}_worker() +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l26}}_worker() // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, // CHECK: store i8* null, i8** [[OMP_WORK_FN]], @@ -136,7 +127,7 @@ // CHECK: [[EXIT]] // CHECK: ret void -// CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l29]](i[[SZ:32|64]] +// CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l26]](i[[SZ:32|64]] // Create local storage for each capture. // CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]], // CHECK-DAG: store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]] @@ -202,7 +193,7 @@ // CHECK: store i[[SZ]] 44, i[[SZ]]* %a, // CHECK: ret void -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l46}}_worker() +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l43}}_worker() // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, // CHECK: store i8* null, i8** [[OMP_WORK_FN]], @@ -246,7 +237,7 @@ // CHECK: [[EXIT]] // CHECK: ret void -// CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l46]](i[[SZ:32|64]] +// CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l43]](i[[SZ:32|64]] // Create local storage for each capture. // CHECK: [[LOCAL_N:%.+]] = alloca i[[SZ]], // CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]], @@ -326,25 +317,17 @@ // CHECK: declare void @__kmpc_barrier(%struct.ident_t*, i32) #[[#CONVERGENT:]] -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l58}}_worker() -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l58}}( +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l55}}_worker() +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l55}}( // CHECK-32: [[A_ADDR:%.+]] = alloca i32, // CHECK-64: [[A_ADDR:%.+]] = alloca i64, // CHECK-64: [[CONV:%.+]] = bitcast i64* [[A_ADDR]] to i32* -// SEQ: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]], -// SEQ: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) -// SEQ: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// SEQ: [[STACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0 -// PAR: [[STACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 4, i16 1) -// CHECK: [[BC:%.+]] = bitcast i8* [[STACK]] to %struct._globalized_locals_ty* // CHECK-32: [[A:%.+]] = load i32, i32* [[A_ADDR]], // CHECK-64: [[A:%.+]] = load i32, i32* [[CONV]], -// CHECK: [[GLOBAL_A_ADDR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[BC]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 -// CHECK: store i32 [[A]], i32* [[GLOBAL_A_ADDR]], -// SEQ: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// SEQ: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[IS_SHARED]]) -// PAR: call void @__kmpc_data_sharing_pop_stack(i8* [[STACK]]) +// CHECK: [[STACK:%.+]] = call i8* @__kmpc_alloc_shared(i{{32|64}} 4) +// CHECK: [[BC:%.+]] = bitcast i8* [[STACK]] to i32* +// CHECK: store i32 
[[A]], i32* [[BC]], +// CHECK: call void @__kmpc_free_shared(i8* [[STACK]]) // CHECK-LABEL: define internal void @{{.+}}(i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* nonnull align {{[0-9]+}} dereferenceable{{.*}}) // CHECK: [[CC:%.+]] = alloca i32, diff --git a/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp b/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp --- a/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp +++ b/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp @@ -1,7 +1,6 @@ // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -31,35 +30,21 @@ return a; } -// SEQ: [[MEM_TY:%.+]] = type { [128 x i8] } -// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] undef -// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* undef -// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4 -// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 - -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l13}}_worker() +// CHECK-LABEL: define {{.*}} void @__omp_offloading_{{.*}}_worker() // CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) // CHECK: call i1 @__kmpc_kernel_parallel( // CHECK: call void @__omp_outlined___wrapper( -// CHECK: define weak void @__omp_offloading_{{.*}}l13( -// CHECK: call void @__omp_offloading_{{.*}}l13_worker() +// CHECK: define weak void @__omp_offloading_{{.*}}( +// CHECK: call void @__omp_offloading_{{.*}}_worker() // CHECK: call void @__kmpc_kernel_init( -// CHECK: call void @__kmpc_data_sharing_init_stack() -// SEQ: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]], -// SEQ: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i64 %7, i16 %6, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) -// SEQ: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// SEQ: [[STACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0 -// PAR: [[STACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 4, i16 1) +// CHECK: [[STACK:%.+]] = call i8* @__kmpc_alloc_shared(i{{32|64}} 4) // CHECK: call void @__kmpc_kernel_prepare_parallel( // CHECK: call void @__kmpc_begin_sharing_variables({{.*}}, i64 2) // CHECK: call void 
@__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) // CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) // CHECK: call void @__kmpc_end_sharing_variables() -// SEQ: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// SEQ: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[IS_SHARED]]) -// PAR: call void @__kmpc_data_sharing_pop_stack(i8* [[STACK]]) +// CHECK: call void @__kmpc_free_shared(i8* [[STACK]]) // CHECK: call void @__kmpc_kernel_deinit(i16 1) // CHECK: define internal void @__omp_outlined__( diff --git a/clang/test/OpenMP/nvptx_target_codegen.cpp b/clang/test/OpenMP/nvptx_target_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_codegen.cpp @@ -36,7 +36,7 @@ // CHECK: store i32** [[PTR2_REF]], i32*** [[PTR2_REF_PTR:%.+]], // CHECK: [[PTR2_REF:%.+]] = load i32**, i32*** [[PTR2_REF_PTR]], // CHECK: call void @__kmpc_spmd_kernel_init( -// CHECK: call void @__kmpc_data_sharing_init_stack_spmd() +// // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @{{.+}}) // CHECK: store i32 [[GTID]], i32* [[THREADID:%.+]], // CHECK: call void @{{.+}}(i32* [[THREADID]], i32* %{{.+}}, i32** [[PTR1_ADDR]], i32** [[PTR2_REF]]) @@ -571,66 +571,38 @@ // CHECK: ret void // CHECK: define{{ hidden | }}i32 [[BAZ]](i32 [[F:%.*]], double* nonnull align {{[0-9]+}} dereferenceable{{.*}}) -// CHECK: alloca i32, -// CHECK: [[LOCAL_F_PTR:%.+]] = alloca i32, -// CHECK: [[ZERO_ADDR:%.+]] = alloca i32, -// CHECK: [[BND_ZERO_ADDR:%.+]] = alloca i32, -// CHECK: store i32 0, i32* [[BND_ZERO_ADDR]] -// CHECK: store i32 0, i32* [[ZERO_ADDR]] -// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[UNKNOWN]] -// CHECK: [[PAR_LEVEL:%.+]] = call i16 @__kmpc_parallel_level(%struct.ident_t* [[UNKNOWN]], i32 [[GTID]]) -// CHECK: [[IS_TTD:%.+]] = icmp eq i16 %1, 0 -// CHECK: [[RES:%.+]] = call i8 @__kmpc_is_spmd_exec_mode() -// CHECK: [[IS_SPMD:%.+]] = icmp ne i8 [[RES]], 0 -// CHECK: br i1 [[IS_SPMD]], label -// CHECK: br label -// CHECK: [[SIZE:%.+]] = select i1 [[IS_TTD]], i{{64|32}} 4, i{{64|32}} 128 -// CHECK: [[PTR:%.+]] = call i8* @__kmpc_data_sharing_coalesced_push_stack(i{{64|32}} [[SIZE]], i16 0) -// CHECK: [[REC_ADDR:%.+]] = bitcast i8* [[PTR]] to [[GLOBAL_ST:%.+]]* -// CHECK: br label -// CHECK: [[ITEMS:%.+]] = phi [[GLOBAL_ST]]* [ null, {{.+}} ], [ [[REC_ADDR]], {{.+}} ] -// CHECK: [[TTD_ITEMS:%.+]] = bitcast [[GLOBAL_ST]]* [[ITEMS]] to [[SEC_GLOBAL_ST:%.+]]* -// CHECK: [[F_PTR_ARR:%.+]] = getelementptr inbounds [[GLOBAL_ST]], [[GLOBAL_ST]]* [[ITEMS]], i32 0, i32 0 -// CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK: [[LID:%.+]] = and i32 [[TID]], 31 -// CHECK: [[GLOBAL_F_PTR_PAR:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[F_PTR_ARR]], i32 0, i32 [[LID]] -// CHECK: [[GLOBAL_F_PTR_TTD:%.+]] = getelementptr inbounds [[SEC_GLOBAL_ST]], [[SEC_GLOBAL_ST]]* [[TTD_ITEMS]], i32 0, i32 0 -// CHECK: [[GLOBAL_F_PTR:%.+]] = select i1 [[IS_TTD]], i32* [[GLOBAL_F_PTR_TTD]], i32* [[GLOBAL_F_PTR_PAR]] -// CHECK: [[F_PTR:%.+]] = select i1 [[IS_SPMD]], i32* [[LOCAL_F_PTR]], i32* [[GLOBAL_F_PTR]] -// CHECK: store i32 %{{.+}}, i32* [[F_PTR]], - -// CHECK: [[RES:%.+]] = call i8 @__kmpc_is_spmd_exec_mode() -// CHECK: icmp ne i8 [[RES]], 0 -// CHECK: br i1 - -// CHECK: [[RES:%.+]] = call i16 @__kmpc_parallel_level(%struct.ident_t* [[UNKNOWN]], i32 [[GTID]]) -// CHECK: icmp ne i16 [[RES]], 0 -// CHECK: br i1 - -// CHECK: call void 
@__kmpc_serialized_parallel(%struct.ident_t* [[UNKNOWN]], i32 [[GTID]]) -// CHECK: call void [[OUTLINED:@.+]](i32* [[ZERO_ADDR]], i32* [[BND_ZERO_ADDR]], i32* [[F_PTR]], double* %{{.+}}) -// CHECK: call void @__kmpc_end_serialized_parallel(%struct.ident_t* [[UNKNOWN]], i32 [[GTID]]) -// CHECK: br label - -// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* @{{.+}} to i8*)) -// CHECK: call void @__kmpc_begin_sharing_variables(i8*** [[SHARED_PTR:%.+]], i{{64|32}} 2) -// CHECK: [[SHARED:%.+]] = load i8**, i8*** [[SHARED_PTR]], -// CHECK: [[REF:%.+]] = getelementptr inbounds i8*, i8** [[SHARED]], i{{64|32}} 0 -// CHECK: [[F_REF:%.+]] = bitcast i32* [[F_PTR]] to i8* -// CHECK: store i8* [[F_REF]], i8** [[REF]], -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: call void @__kmpc_end_sharing_variables() -// CHECK: br label - -// CHECK: [[RES:%.+]] = load i32, i32* [[F_PTR]], -// CHECK: store i32 [[RES]], i32* [[RET:%.+]], -// CHECK: br i1 [[IS_SPMD]], label -// CHECK: [[BC:%.+]] = bitcast [[GLOBAL_ST]]* [[ITEMS]] to i8* -// CHECK: call void @__kmpc_data_sharing_pop_stack(i8* [[BC]]) -// CHECK: br label -// CHECK: [[RES:%.+]] = load i32, i32* [[RET]], -// CHECK: ret i32 [[RES]] +// CHECK: [[ZERO_ADDR:%.+]] = alloca i32 +// CHECK: [[BND_ZERO_ADDR:%.+]] = alloca i32 +// CHECK: [[SHARED_ARG:%.+]] = alloca i8** +// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num({{.*}}) +// CHECK: [[F_PTR:%.+]] = call i8* @__kmpc_alloc_shared(i{{32|64}} 4) +// CHECK: [[F_ON_STACK:%.+]] = bitcast i8* [[F_PTR]] to i32* +// CHECK: store i32 [[F]], i32* [[F_ON_STACK]] +// CHECK: [[SPMD_EXEC:%.+]] = call i8 @__kmpc_is_spmd_exec_mode() +// CHECK: br i1 %{{.+}} label [[SEQ:%.+]], label [[PARCHECK:%.+]] +// +// CHECK: [[PAR_LEVEL:%.+]] = call i16 @__kmpc_parallel_level({{.*}}) +// CHECK: br i1 %{{.+}} label [[SEQ:%.+]], label [[MASTER:%.+]] +// +// CHECK: call void @__kmpc_serialized_parallel({{.*}}) +// CHECK: call void @__omp_outlined__1(i32* [[ZERO_ADDR]], i32* [[BND_ZERO_ADDR]], i32* [[F_ON_STACK]], double* %{{.+}}) +// CHECK: call void @__kmpc_end_serialized_parallel({{.*}}) +// CHECK: br label +// +// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* @{{.+}} to i8*)) +// CHECK: call void @__kmpc_begin_sharing_variables(i8*** [[SHARED_ARG]], i{{32|64}} 2) +// CHECK: [[SHARED:%.+]] = load i8**, i8*** %shared_arg_refs +// CHECK: [[REF:%.+]] = getelementptr inbounds i8*, i8** [[SHARED]], i{{32|64}} 0 +// CHECK: [[F_REF:%.+]] = bitcast i32* %f_on_stack to i8* +// CHECK: store i8* [[F_REF]], i8** [[REF]] +// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK: call void @__kmpc_end_sharing_variables() +// CHECK: br label +// +// CHECK: [[RET:%.+]] = load i32, i32* [[F_ON_STACK]] +// CHECK: call void @__kmpc_free_shared(i8* [[F_PTR]]) +// CHECK: ret i32 [[RET]] // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l331}}_worker() // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, diff --git a/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp @@ -57,12 +57,11 @@ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l33}}( // CHECK: 
[[AA_ADDR:%.+]] = alloca i16*, align -// CHECK-NOT: call i8* @__kmpc_data_sharing_push_stack +// CHECK-NOT: call i8* @__kmpc_alloc_shared // CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1) -// CHECK: call void @__kmpc_data_sharing_init_stack_spmd // CHECK: br label {{%?}}[[EXEC:.+]] // // CHECK: [[EXEC]] @@ -100,7 +99,6 @@ // CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1) -// CHECK: call void @__kmpc_data_sharing_init_stack_spmd // CHECK: br label {{%?}}[[EXEC:.+]] // // CHECK: [[EXEC]] diff --git a/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp @@ -54,7 +54,6 @@ // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1) -// CHECK: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @{{.+}}) // CHECK: store i32 [[GTID]], i32* [[THREADID:%.+]], // CHECK: call void [[OUTLINED:@.+]](i32* [[THREADID]], i32* %{{.+}}, i16* [[AA]]) @@ -85,7 +84,6 @@ // CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1) -// CHECK: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @{{.+}}) // CHECK: store i32 [[GTID]], i32* [[THREADID:%.+]], // CHECK: call void [[OUTLINED:@.+]](i32* [[THREADID]], i32* %{{.+}}, i32* [[A]], i16* [[AA]], [10 x i32]* [[B]]) diff --git a/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp @@ -55,7 +55,6 @@ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l29}}( // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1) -// CHECK: call void @__kmpc_data_sharing_init_stack_spmd // CHECK: br label {{%?}}[[EXEC:.+]] // // CHECK: [[EXEC]] @@ -73,7 +72,6 @@ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l33}}( // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1) -// CHECK: call void @__kmpc_data_sharing_init_stack_spmd // CHECK: br label {{%?}}[[EXEC:.+]] // // CHECK: [[EXEC]] @@ -91,7 +89,6 @@ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l38}}( // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1) -// CHECK: call void @__kmpc_data_sharing_init_stack_spmd // CHECK: br label {{%?}}[[EXEC:.+]] // // CHECK: [[EXEC]] diff --git a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp +++ 
b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp @@ -55,7 +55,6 @@ // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l27}}( // // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1) -// CHECK: call void @__kmpc_data_sharing_init_stack_spmd // CHECK: br label {{%?}}[[EXECUTE:.+]] // // CHECK: [[EXECUTE]] @@ -239,7 +238,6 @@ // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l32}}( // // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1) -// CHECK: call void @__kmpc_data_sharing_init_stack_spmd // CHECK: br label {{%?}}[[EXECUTE:.+]] // // CHECK: [[EXECUTE]] @@ -501,7 +499,6 @@ // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l38}}( // // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1) -// CHECK: call void @__kmpc_data_sharing_init_stack_spmd // CHECK: br label {{%?}}[[EXECUTE:.+]] // // CHECK: [[EXECUTE]] diff --git a/clang/test/OpenMP/nvptx_target_teams_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_teams_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_codegen.cpp @@ -233,12 +233,11 @@ // CHECK: define weak void @__omp_offloading_{{.*}}ftemplate{{.*}}_l37( // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1) -// CHECK: call void @__kmpc_data_sharing_init_stack_spmd -// CHECK-NOT: call i8* @__kmpc_data_sharing_push_stack( +// CHECK-NOT: call i8* @__kmpc_alloc_shared( // CHECK-NOT: call void @__kmpc_serialized_parallel( // CHECK: call void [[L0:@.+]](i32* %{{.+}}, i32* %{{.+}}, i[[SZ]] %{{.+}}) // CHECK-NOT: call void @__kmpc_end_serialized_parallel( -// CHECK-NOT: call void @__kmpc_data_sharing_pop_stack( +// CHECK-NOT: call void @__kmpc_free_shared( // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) // CHECK: ret diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp @@ -1,21 +1,13 @@ // Test target codegen - host bc file has to be created first. 
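Note how the RUN lines in each updated test collapse: previously every device compile ran twice, once in the default configuration checked under the SEQ prefix (team static memory) and once with -fopenmp-cuda-parallel-target-regions checked under PAR (the data-sharing stack). The new allocator has a single code path, so one RUN invocation per triple suffices and both prefixes disappear. For reference, the entry points involved, written as C declarations transcribed from the deleted and added call sites (parameter names invented for readability; the device runtime's own header is authoritative):

// Removed by this patch:
extern "C" void  __kmpc_data_sharing_init_stack();      // generic-mode prolog
extern "C" void  __kmpc_data_sharing_init_stack_spmd(); // SPMD-mode prolog
extern "C" void *__kmpc_data_sharing_push_stack(unsigned long Size, short UseSharedMem);
extern "C" void  __kmpc_data_sharing_pop_stack(void *Ptr);
extern "C" void  __kmpc_get_team_static_memory(short IsSPMD, void *Buf,
                                               unsigned long Size, short IsShared,
                                               void **ResultPtr);
extern "C" void  __kmpc_restore_team_static_memory(short IsSPMD, short IsShared);

// Their single replacement pair:
extern "C" void *__kmpc_alloc_shared(unsigned long Size);
extern "C" void  __kmpc_free_shared(void *Ptr);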
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 // expected-no-diagnostics #ifndef HEADER #define HEADER -// SEQ: [[MEM_TY:%.+]] = type { [128 x i8] } -// SEQ-DAG: {{@__omp_offloading_.+}}_l23_exec_mode = weak constant i8 1 -// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4 -// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 - template tx ftemplate(int n) { int i; @@ -38,10 +30,10 @@ return a; } - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l23}}_worker() + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+}}_worker() // CHECK: ret void - // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l23}}() + // CHECK: define {{.*}}void 
{{@__omp_offloading_.+template.+}}() // CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() // CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() @@ -51,7 +43,7 @@ // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]] // // CHECK: [[WORKER]] - // CHECK: {{call|invoke}} void {{@__omp_offloading_.+template.+l23}}_worker() + // CHECK: {{call|invoke}} void {{@__omp_offloading_.+template.+}}_worker() // CHECK: br label {{%?}}[[EXIT:.+]] // // CHECK: [[CHECK_MASTER]] @@ -78,21 +70,15 @@ // CHECK: ret void // CHECK: define internal void [[PARALLEL]](i32* noalias %{{.+}}, i32* noalias %{{.+}}) - // SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], - // SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]], - // SEQ: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* @{{.+}}, i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[SHARED]], i8** addrspacecast (i8* addrspace(3)* [[BUF:@.+]] to i8**)) - // SEQ: [[PTR:%.+]] = load i8*, i8* addrspace(3)* [[BUF]], - // SEQ: [[ADDR:%.+]] = getelementptr inbounds i8, i8* [[PTR]], i{{64|32}} 0 - // PAR: [[ADDR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 4, i16 1) - // CHECK: [[RD:%.+]] = bitcast i8* [[ADDR]] to [[GLOB_TY:%.+]]* - // CHECK: [[I_ADDR:%.+]] = getelementptr inbounds [[GLOB_TY]], [[GLOB_TY]]* [[RD]], i32 0, i32 0 + // CHECK: [[ADDR:%.+]] = call i8* @__kmpc_alloc_shared(i{{32|64}} 4) + // CHECK: [[RD:%.+]] = bitcast i8* [[ADDR]] to i32* // // CHECK: call void @__kmpc_for_static_init_4( // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* @{{.+}} to i8*)) // CHECK: call void @__kmpc_begin_sharing_variables(i8*** [[SHARED_VARS_PTR:%.+]], i{{64|32}} 1) // CHECK: [[SHARED_VARS_BUF:%.+]] = load i8**, i8*** [[SHARED_VARS_PTR]], // CHECK: [[VARS_BUF:%.+]] = getelementptr inbounds i8*, i8** [[SHARED_VARS_BUF]], i{{64|32}} 0 - // CHECK: [[I_ADDR_BC:%.+]] = bitcast i32* [[I_ADDR]] to i8* + // CHECK: [[I_ADDR_BC:%.+]] = bitcast i32* [[RD]] to i8* // CHECK: store i8* [[I_ADDR_BC]], i8** [[VARS_BUF]], // CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) // CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp @@ -1,37 +1,29 @@ // Test target codegen - host bc file has to be created first. 
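A second mechanical change runs through these hunks: offload kernel names embed the source line of their target directive (for example __omp_offloading_..._main_l124), and deleting RUN lines and prolog code shifted those line numbers. Rather than re-pinning each one, the tests widen the patterns to wildcards, as in the nvptx_lambda_capturing.cpp hunks above (check prefix generalized here); the _exec_mode globals in the hunks that follow get the same treatment:

// Before: breaks whenever anything above the target directive moves.
// CHECK: define weak void @__omp_offloading_{{.+}}_main_l124(
// After: accept any line suffix; the surrounding checks still pin the kernel.
// CHECK: define weak void @__omp_offloading_{{.+}}_main_{{.*}}(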
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix CHECK-DIV64 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix CHECK-DIV64 --check-prefix PAR -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-DIV32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-DIV32 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix CHECK-DIV64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-DIV32 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ 
-triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix CHECK-DIV64 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix CHECK-DIV64 --check-prefix PAR -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-DIV32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-DIV32 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix CHECK-DIV64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-DIV32 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple 
nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 // expected-no-diagnostics #ifndef HEADER #define HEADER // Check that the execution mode of all 5 target regions on the gpu is set to SPMD Mode. -// CHECK-DAG: {{@__omp_offloading_.+l50}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l56}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l61}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l66}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l74}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l81}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+}}_exec_mode = weak constant i8 0 #define N 1000 #define M 10 @@ -92,33 +84,19 @@ return a; } -// SEQ-DAG: [[MEM_TY:%.+]] = type { [128 x i8] } -// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] undef -// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* undef -// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4 -// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 - -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l50( +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}( // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0) // CHECK: call void [[PARALLEL:@.+]]( // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) // CHECK: define internal void [[PARALLEL]]( -// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]], -// SEQ: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr 
inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) -// SEQ: [[TEAM_ALLOC:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// SEQ: [[ADDR:%.+]] = getelementptr inbounds i8, i8* [[TEAM_ALLOC]], i{{64|32}} 0 -// PAR: [[ADDR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 4, i16 1) -// CHECK: [[BC:%.+]] = bitcast i8* [[ADDR]] to [[REC:%.+]]* -// CHECK: getelementptr inbounds [[REC]], [[REC]]* [[BC]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 +// CHECK: [[ADDR:%.+]] = call i8* @__kmpc_alloc_shared(i{{32|64}} 4) +// CHECK: [[L_ON_STACK:%.+]] = bitcast i8* [[ADDR]] to i32* // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91, // CHECK: {{call|invoke}} void [[OUTL1:@.+]]( // CHECK: call void @__kmpc_for_static_fini( -// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// SEQ: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[SHARED]]) -// PAR: call void @__kmpc_data_sharing_pop_stack(i8* [[ADDR]]) +// CHECK: call void @__kmpc_free_shared(i8* [[ADDR]]) // CHECK: ret void // CHECK: define internal void [[OUTL1]]( @@ -251,13 +229,13 @@ // CHECK: call void @__kmpc_for_static_fini( // CHECK: ret void -// CHECK: define weak void @__omp_offloading_{{.*}}_l74(i[[SZ:64|32]] %{{[^,]+}}, [10 x [10 x i32]]* nonnull align {{[0-9]+}} dereferenceable{{.*}}) +// CHECK: define weak void @__omp_offloading_{{.*}}(i[[SZ:64|32]] %{{[^,]+}}, [10 x [10 x i32]]* nonnull align {{[0-9]+}} dereferenceable{{.*}}) // CHECK: call void [[OUTLINED:@__omp_outlined.*]](i32* %{{.+}}, i32* %{{.+}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, [10 x [10 x i32]]* %{{.*}}) // CHECK: define internal void [[OUTLINED]](i32* noalias %{{.*}}, i32* noalias %{{.*}} i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, [10 x [10 x i32]]* nonnull align {{[0-9]+}} dereferenceable{{.*}}) // CHECK-DIV64: div i64 // CHECK-DIV32-NO: div i64 -// CHECK: define weak void @__omp_offloading_{{.*}}_l81(i[[SZ:64|32]] %{{[^,]+}}, [1000 x i32]* nonnull align {{[0-9]+}} dereferenceable{{.*}}, i32* %{{[^)]+}}) +// CHECK: define weak void @__omp_offloading_{{.*}}(i[[SZ:64|32]] %{{[^,]+}}, [1000 x i32]* nonnull align {{[0-9]+}} dereferenceable{{.*}}, i32* %{{[^)]+}}) // CHECK: call void [[OUTLINED:@__omp_outlined.*]](i32* %{{.+}}, i32* %{{.+}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, [1000 x i32]* %{{.*}}, i32* %{{.*}}) // CHECK: define internal void [[OUTLINED]](i32* noalias %{{.*}}, i32* noalias %{{.*}} i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, [1000 x i32]* nonnull align {{[0-9]+}} dereferenceable{{.*}}, i32* %{{.*}}) diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp @@ -1,31 +1,25 @@ // Test target codegen - host bc file has to be created first. 
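The 4-byte allocations these SPMD tests look for come from scalars that must outlive a single thread, such as a lastprivate loop variable that every team writes and the last iteration publishes. A hypothetical source of that shape (identifiers invented; the actual test bodies sit outside these hunks):

int kernel(int n) {
  int l = 0;
#pragma omp target teams distribute parallel for lastprivate(l)
  for (int i = 0; i < n; ++i)
    l = i; // 'l' escapes its defining thread, so the outlined parallel
           // function reaches it through a __kmpc_alloc_shared(4) block
  return l;
}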
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple 
nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 // expected-no-diagnostics #ifndef HEADER #define HEADER // Check that the execution mode of all 4 target regions on the gpu is set to SPMD Mode. 
-// CHECK-DAG: {{@__omp_offloading_.+l43}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l49}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l54}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l59}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+}}_exec_mode = weak constant i8 0 #define N 1000 #define M 10 @@ -75,31 +69,17 @@ return a; } -// SEQ-DAG: [[MEM_TY:%.+]] = type { [128 x i8] } -// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] undef -// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* undef -// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4 -// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 - -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l43( +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}( // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0) // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]], -// SEQ: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) -// SEQ: [[TEAM_ALLOC:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// SEQ: [[PTR:%.+]] = getelementptr inbounds i8, i8* [[TEAM_ALLOC]], i{{64|32}} 0 -// PAR: [[PTR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 4, i16 1) -// CHECK: [[BC:%.+]] = bitcast i8* [[PTR]] to [[REC:%.+]]* -// CHECK: getelementptr inbounds [[REC]], [[REC]]* [[BC]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 +// CHECK: [[PTR:%.+]] = call i8* @__kmpc_alloc_shared(i{{32|64}} 4) +// CHECK: [[L_ON_STACK:%.+]] = bitcast i8* [[PTR]] to i32* // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91, // CHECK: {{call|invoke}} void [[OUTL1:@.+]]( // CHECK: call void @__kmpc_for_static_fini( -// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// SEQ: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[SHARED]]) -// PAR: call void @__kmpc_data_sharing_pop_stack(i8* [[PTR]]) +// CHECK: call void @__kmpc_free_shared(i8* [[PTR]]) // CHECK: ret void // CHECK: define internal void [[OUTL1]]( diff --git a/clang/test/OpenMP/nvptx_teams_codegen.cpp b/clang/test/OpenMP/nvptx_teams_codegen.cpp --- a/clang/test/OpenMP/nvptx_teams_codegen.cpp +++ b/clang/test/OpenMP/nvptx_teams_codegen.cpp @@ -1,10 +1,8 @@ // Test target codegen - host bc file has to be created first. 
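The nvptx_teams_codegen.cpp hunks below apply the rewrite to kernel arguments captured into a teams region: the incoming value is loaded from its parameter alloca, stored into the fresh allocation, and the outlined teams function receives the globalized address. A C-like sketch of the CK1 call sequence (not literal compiler output; the epilog free falls outside the excerpted checks):

extern "C" void *__kmpc_alloc_shared(unsigned long Size);
extern "C" void __kmpc_free_shared(void *Ptr);

void kernel(int argc) {
  void *stack = __kmpc_alloc_shared(4);   // was get_team_static_memory/push_stack
  int *argc_on_stack = (int *)stack;
  *argc_on_stack = argc;                  // seed the globalized copy
  // ... outlined teams region is called with argc_on_stack ...
  __kmpc_free_shared(stack);              // paired release at kernel epilog
}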
// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64 --check-prefix SEQ -// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CK1 --check-prefix CK1-64 --check-prefix PAR +// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64 // RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32 --check-prefix SEQ -// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CK1 --check-prefix CK1-32 --check-prefix PAR +// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32 // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -29,31 +27,18 @@ return tmain(argv); } -// SEQ: [[MEM_TY:%.+]] = type { [128 x i8] } -// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] undef -// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* undef -// SEQ-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} 4 -// SEQ-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} {{8|4}} -// SEQ-DAG: [[KERNEL_SHARED1:@.+]] = internal unnamed_addr constant i16 1 -// SEQ-DAG: [[KERNEL_SHARED2:@.+]] = internal unnamed_addr constant i16 1 - // only nvptx side: do not outline teams region and do not call fork_teams // CK1: define {{.*}}void @{{[^,]+}}(i{{[0-9]+}} [[ARGC:%.+]]) // CK1: [[ARGCADDR:%.+]] = alloca i{{[0-9]+}}, // CK1: store {{.+}} 0, {{.+}}, -// CK1: store i{{[0-9]+}} [[ARGC]], i{{[0-9]+}}* [[ARGCADDR]], -// CK1-64: [[CONV:%.+]] = bitcast i{{[0-9]+}}* [[ARGCADDR]] to i{{[0-9]+}}* -// SEQ: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED1]], -// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE1]], -// SEQ: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) -// SEQ: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// SEQ: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], 
i{{64|32}} 0 -// PAR: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 4, i16 1) -// CK1-64: [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[CONV]] -// CK1-32: [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[ARGCADDR]] -// CK1: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0 -// CK1: store i{{[0-9]+}} [[ARG]], i{{[0-9]+}}* [[ARGCADDR]], -// CK1: call void [[OUTLINED:@.+]](i32* %{{.+}}, i32* %{{.+}}, i32* [[ARGCADDR]]) +// CK1: store i{{32|64}} [[ARGC]], i{{32|64}}* [[ARGCADDR]], +// CK1-64: [[CONV:%.+]] = bitcast i{{32|64}}* [[ARGCADDR]] to i{{32|64}}* +// CK1-64: [[ARG:%.+]] = load i{{32|64}}, i{{32|64}}* [[CONV]] +// CK1-32: [[ARG:%.+]] = load i{{32|64}}, i{{32|64}}* [[ARGCADDR]] +// CK1: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_alloc_shared(i{{32|64}} 4) +// CK1: [[ARGC_ON_STACK:%.+]] = bitcast i8* [[GLOBALSTACK]] to i{{32|64}}* +// CK1: store i{{32|64}} [[ARG]], i{{32|64}}* [[ARGC_ON_STACK]], +// CK1: call void [[OUTLINED:@.+]](i32* %{{.+}}, i32* %{{.+}}, i32* [[ARGC_ON_STACK]]) // CK1: ret void // CK1-NEXT: } @@ -65,16 +50,11 @@ // CK1: define {{.*}}void @{{[^,]+}}(i{{.+}}** [[ARGC:%.+]]) // CK1: [[ARGCADDR:%.+]] = alloca i{{.+}}**, // CK1: store i{{.+}}** [[ARGC]], i{{.+}}*** [[ARGCADDR]] -// SEQ: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED2]], -// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE2]], -// SEQ: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) -// SEQ: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// SEQ: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0 -// PAR: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} {{4|8}}, i16 1) // CK1: [[ARG:%.+]] = load i{{[0-9]+}}**, i{{[0-9]+}}*** [[ARGCADDR]] -// CK1: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0 -// CK1: store i{{[0-9]+}}** [[ARG]], i{{[0-9]+}}*** [[ARGCADDR]], -// CK1: call void [[OUTLINED:@.+]](i32* %{{.+}}, i32* %{{.+}}, i8*** [[ARGCADDR]]) +// CK1: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_alloc_shared(i{{32|64}} {{4|8}}) +// CK1: [[ARGC_ON_STACK:%.+]] = bitcast i8* [[GLOBALSTACK]] to i{{[0-9]+}}* +// CK1: store i{{[0-9]+}}** [[ARG]], i{{[0-9]+}}*** [[ARGC_ON_STACK]], +// CK1: call void [[OUTLINED:@.+]](i32* %{{.+}}, i32* %{{.+}}, i8*** [[ARGC_ON_STACK]]) // CK1: ret void // CK1-NEXT: } @@ -87,11 +67,9 @@ // Test target codegen - host bc file has to be created first. 
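Before the CK2 variant, a plausible reduced form of what the CK1 checks are following (hypothetical source, not the test file itself): a scalar referenced inside a teams region escapes the initial thread's private stack, so device codegen routes it through __kmpc_alloc_shared, which is the ARGC_ON_STACK pointer matched above.

int globalized_argc(int argc) {
#pragma omp target
#pragma omp teams
  // 'argc' is used across the teams region, so on the device it lives in
  // memory obtained from __kmpc_alloc_shared instead of the thread stack.
  argc = 0;
  return argc;
}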
// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64 --check-prefix SEQ2 -// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CK2 --check-prefix CK2-64 --check-prefix PAR2 +// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64 // RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-32 --check-prefix SEQ2 -// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CK2 --check-prefix CK2-32 --check-prefix PAR2 +// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-32 // expected-no-diagnostics #ifdef CK2 @@ -118,14 +96,6 @@ return tmain(argv); } -// SEQ2: [[MEM_TY:%.+]] = type { [128 x i8] } -// SEQ2-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] undef -// SEQ2-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* undef -// SEQ2-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} 4 -// SEQ2-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} {{8|4}} -// SEQ2-DAG: [[KERNEL_SHARED1:@.+]] = internal unnamed_addr constant i16 1 -// SEQ2-DAG: [[KERNEL_SHARED2:@.+]] = internal unnamed_addr constant i16 1 - // CK2: define {{.*}}void @{{[^,]+}}(i{{[0-9]+}} [[A_IN:%.+]], i{{[0-9]+}} [[B_IN:%.+]], i{{[0-9]+}} [[ARGC_IN:.+]]) // CK2: [[AADDR:%.+]] = alloca i{{[0-9]+}}, // CK2: [[BADDR:%.+]] = alloca i{{[0-9]+}}, @@ -136,18 +106,13 @@ // CK2-64: [[ACONV:%.+]] = bitcast i64* [[AADDR]] to i32* // CK2-64: [[BCONV:%.+]] = bitcast i64* [[BADDR]] to i32* // CK2-64: [[CONV:%.+]] = bitcast i64* [[ARGCADDR]] to i32* -// SEQ2: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED1]], -// SEQ2: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE1]], -// SEQ2: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) -// SEQ2: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// SEQ2: [[GLOBALSTACK:%.+]] = getelementptr 
inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0 -// PAR2: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 4, i16 1) // CK2-64: [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[CONV]] // CK2-32: [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[ARGCADDR]] -// CK2: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0 -// CK2: store i{{[0-9]+}} [[ARG]], i{{[0-9]+}}* [[ARGCADDR]], +// CK2: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_alloc_shared(i{{32|64}} 4) +// CK2: [[ARGC_ON_STACK:%.+]] = bitcast i8* [[GLOBALSTACK]] to i{{32|64}}* +// CK2: store i{{[0-9]+}} [[ARG]], i{{[0-9]+}}* [[ARGC_ON_STACK]], // CK2: {{%.+}} = call i32 @__kmpc_global_thread_num( -// CK2: call void [[OUTLINED:@.+]](i32* %{{.+}}, i32* %{{.+}}, i32* [[ARGCADDR]]) +// CK2: call void [[OUTLINED:@.+]](i32* %{{.+}}, i32* %{{.+}}, i32* [[ARGC_ON_STACK]]) // CK2: ret // CK2: define internal void [[OUTLINED]]( @@ -162,17 +127,12 @@ // CK2: store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[AADDR]], // CK2: store i{{[0-9]+}} [[B_IN]], i{{[0-9]+}}* [[BADDR]], // CK2: store i{{[0-9]+}}** [[ARGC]], i{{[0-9]+}}*** [[ARGCADDR]], -// SEQ2: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED2]], -// SEQ2: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE2]], -// SEQ2: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) -// SEQ2: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// SEQ2: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0 -// PAR2: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} {{4|8}}, i16 1) // CK2: [[ARG:%.+]] = load i{{[0-9]+}}**, i{{[0-9]+}}*** [[ARGCADDR]] -// CK2: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0 -// CK2: store i{{[0-9]+}}** [[ARG]], i{{[0-9]+}}*** [[ARGCADDR]], +// CK2: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_alloc_shared(i{{32|64}} {{4|8}}) +// CK2: [[ARGC_ON_STACK:%.+]] = bitcast i8* [[GLOBALSTACK]] to i{{[0-9]+}}* +// CK2: store i{{[0-9]+}}** [[ARG]], i{{[0-9]+}}*** [[ARGC_ON_STACK]], // CK2: {{%.+}} = call i32 @__kmpc_global_thread_num( -// CK2: call void [[OUTLINED:@.+]](i32* %{{.+}}, i32* %{{.+}}, i8*** [[ARGCADDR]]) +// CK2: call void [[OUTLINED:@.+]](i32* %{{.+}}, i32* %{{.+}}, i8*** [[ARGC_ON_STACK]]) // CK2: ret void // CK2: define internal void [[OUTLINED]]( diff --git a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp --- a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp +++ b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp @@ -1,12 +1,9 @@ // Test target codegen - host bc file has to be created first. 
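The size operands in the checks above are simply the store sizes of the globalized objects, which is why the int cases pin a fixed 4 while the pointer cases accept {{4|8}} across the nvptx and nvptx64 RUN lines. A compile-time restatement of that arithmetic (a sketch; the names are illustrative):

#include <cstddef>

// i32 payloads are 4 bytes on both device triples.
static_assert(sizeof(int) == 4, "i32 globalization size is fixed");

// Pointer payloads follow the triple: 4 bytes on nvptx, 8 on nvptx64,
// which is the {{4|8}} passed to __kmpc_alloc_shared above.
constexpr std::size_t PointerPayload = sizeof(char **);
static_assert(PointerPayload == 4 || PointerPayload == 8,
              "expected a 32- or 64-bit device pointer");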
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -fopenmp-cuda-teams-reduction-recs-num=2048 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -fopenmp-cuda-teams-reduction-recs-num=2048 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -fopenmp-cuda-teams-reduction-recs-num=2048 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -15,21 +12,14 @@ // CHECK-DAG: [[TEAM2_REDUCE_TY:%.+]] = type { [{{1024|2048}} x i8], [{{1024|2048}} x float] } // CHECK-DAG: [[TEAM3_REDUCE_TY:%.+]] = type { [{{1024|2048}} x i32], [{{1024|2048}} x i16] } // CHECK-DAG: [[TEAMS_REDUCE_UNION_TY:%.+]] = type { [[TEAM1_REDUCE_TY]] } -// SEQ-DAG: [[MAP_TY:%.+]] = type { [128 x i8] } - -// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* undef -// SEQ-DAG: [[KERNEL_SHARED1:@.+]] = internal unnamed_addr constant i16 1 -// SEQ-DAG: 
[[KERNEL_SHARED2:@.+]] = internal unnamed_addr constant i16 1 -// SEQ-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} {{16|8}} -// SEQ-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} 16 // Check for the data transfer medium in shared memory to transfer the reduction list to the first warp. // CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = weak addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i32] // Check that the execution mode of 2 target regions is set to Non-SPMD and the 3rd is in SPMD. -// CHECK-DAG: {{@__omp_offloading_.+l44}}_exec_mode = weak constant i8 1 -// CHECK-DAG: {{@__omp_offloading_.+l50}}_exec_mode = weak constant i8 1 -// CHECK-DAG: {{@__omp_offloading_.+l57}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l34}}_exec_mode = weak constant i8 1 +// CHECK-DAG: {{@__omp_offloading_.+l40}}_exec_mode = weak constant i8 1 +// CHECK-DAG: {{@__omp_offloading_.+l47}}_exec_mode = weak constant i8 0 // CHECK-DAG: [[TEAMS_RED_BUFFER:@.+]] = internal global [[TEAMS_REDUCE_UNION_TY]] zeroinitializer @@ -73,9 +63,9 @@ return a; } - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l44}}_worker() + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l34}}_worker() - // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+template.+l44]]( + // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+template.+l34]]( // // CHECK: {{call|invoke}} void [[T1]]_worker() // @@ -340,9 +330,9 @@ // CHECK: call void [[REDUCTION_FUNC]](i8* [[RL_BC]], i8* [[LOCAL_RL_BC]]) // CHECK: ret void - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l50}}_worker() + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l40}}_worker() - // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+template.+l50]]( + // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+template.+l40]]( // // CHECK: {{call|invoke}} void [[T2]]_worker() @@ -707,10 +697,9 @@ // CHECK: call void [[REDUCTION_FUNC]](i8* [[RL_BC]], i8* [[LOCAL_RL_BC]]) // CHECK: ret void - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l57}}( + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l47}}( // // CHECK: call void @__kmpc_spmd_kernel_init( - // CHECK: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) // CHECK-NOT: call void @{{__kmpc_get_team_static_memory|__kmpc_data_sharing_push_stack}} diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -425,12 +425,9 @@ GlobalListPtr, GlobalListPtr, GlobalListPtr, GlobalListPtr) __OMP_RTL(__kmpc_shuffle_int64, false, Int64, Int64, Int16, Int16) -__OMP_RTL(__kmpc_data_sharing_init_stack, false, Void, ) -__OMP_RTL(__kmpc_data_sharing_init_stack_spmd, false, Void, ) -__OMP_RTL(__kmpc_data_sharing_coalesced_push_stack, false, VoidPtr, SizeTy, Int16) -__OMP_RTL(__kmpc_data_sharing_push_stack, false, VoidPtr, SizeTy, Int16) -__OMP_RTL(__kmpc_data_sharing_pop_stack, false, Void, VoidPtr) +__OMP_RTL(__kmpc_alloc_shared, false, VoidPtr, SizeTy) +__OMP_RTL(__kmpc_free_shared, false, Void, VoidPtr) __OMP_RTL(__kmpc_begin_sharing_variables, false, Void, VoidPtrPtrPtr, SizeTy) __OMP_RTL(__kmpc_end_sharing_variables, false, Void, ) __OMP_RTL(__kmpc_get_shared_variables, false, Void, VoidPtrPtrPtr) diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp 
b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -1112,9 +1112,8 @@ } void analysisGlobalization() { - RuntimeFunction GlobalizationRuntimeIDs[] = { - OMPRTL___kmpc_data_sharing_coalesced_push_stack, - OMPRTL___kmpc_data_sharing_push_stack}; + RuntimeFunction GlobalizationRuntimeIDs[] = {OMPRTL___kmpc_alloc_shared, + OMPRTL___kmpc_free_shared}; for (const auto GlobalizationCallID : GlobalizationRuntimeIDs) { auto &RFI = OMPInfoCache.RFIs[GlobalizationCallID]; diff --git a/llvm/test/Transforms/OpenMP/globalization_remarks.ll b/llvm/test/Transforms/OpenMP/globalization_remarks.ll --- a/llvm/test/Transforms/OpenMP/globalization_remarks.ll +++ b/llvm/test/Transforms/OpenMP/globalization_remarks.ll @@ -1,145 +1,41 @@ +; RUN: opt -openmpopt -pass-remarks-analysis=openmp-opt -disable-output < %s 2>&1 | FileCheck %s ; RUN: opt -passes=openmpopt -pass-remarks-analysis=openmp-opt -disable-output < %s 2>&1 | FileCheck %s -; ModuleID = 'declare_target_codegen_globalization.cpp' -source_filename = "declare_target_codegen_globalization.cpp" target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" -target triple = "nvptx64-nvidia-cuda" +target triple = "nvptx64" -%struct.ident_t = type { i32, i32, i32, i32, i8* } -%struct._globalized_locals_ty = type { [32 x i32] } +; CHECK: remark: globalization_remarks.c:5:7: Found thread data sharing on the GPU. Expect degraded performance due to data globalization. -@0 = private unnamed_addr constant [56 x i8] c";declare_target_codegen_globalization.cpp;maini1;17;1;;\00", align 1 -@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 1, i32 0, i8* getelementptr inbounds ([56 x i8], [56 x i8]* @0, i32 0, i32 0) }, align 8 -@__omp_offloading_801_3022563__Z6maini1v_l17_exec_mode = weak constant i8 0 -@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_801_3022563__Z6maini1v_l17_exec_mode], section "llvm.metadata" +@S = external local_unnamed_addr global i8* -; CHECK: remark: declare_target_codegen_globalization.cpp:17:1: Found thread data sharing on the GPU. Expect degraded performance due to data globalization. -; CHECK: remark: declare_target_codegen_globalization.cpp:10:1: Found thread data sharing on the GPU. Expect degraded performance due to data globalization. 
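The rewritten remark test pins the diagnostic to globalization_remarks.c:5:7. That source file is not part of the patch; the following hypothetical reconstruction, laid out so the escaping variable sits at line 5, column 7, shows the kind of code that produces the IR below:

int *sink;                       // stand-in for the test's @S global
void share(int *p) { sink = p; } // taking the address lets 'x' escape

void foo() {
  int x;                         // line 5, column 7: the globalized variable
  share(&x);                     // the escape the remark reports
}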
- -; Function Attrs: norecurse nounwind -define weak void @__omp_offloading_801_3022563__Z6maini1v_l17(i32* nonnull align 4 dereferenceable(4) %a) local_unnamed_addr #0 !dbg !10 { +define void @foo() !dbg !6 { entry: - %nvptx_num_threads = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !dbg !12, !range !13 - tail call void @__kmpc_spmd_kernel_init(i32 %nvptx_num_threads, i16 1, i16 0) #4, !dbg !12 - tail call void @__kmpc_data_sharing_init_stack_spmd() #4, !dbg !12 - %0 = tail call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1) - %1 = tail call i8 @__kmpc_is_spmd_exec_mode() #4 - %.not.i.i = icmp eq i8 %1, 0 - br i1 %.not.i.i, label %.non-spmd2.i.i, label %__omp_outlined__.exit - -.non-spmd2.i.i: ; preds = %entry - %2 = tail call i8* @__kmpc_data_sharing_coalesced_push_stack(i64 128, i16 0) #4, !dbg !12 - tail call void @__kmpc_data_sharing_pop_stack(i8* %2) #4, !dbg !14 - br label %__omp_outlined__.exit, !dbg !14 - -__omp_outlined__.exit: ; preds = %entry, %.non-spmd2.i.i - tail call void @__kmpc_spmd_kernel_deinit_v2(i16 1) #4, !dbg !19 - ret void, !dbg !20 + %0 = call i8* @__kmpc_alloc_shared(i64 4), !dbg !8 + %x_on_stack = bitcast i8* %0 to i32* + %1 = bitcast i32* %x_on_stack to i8* + call void @share(i8* %1) + call void @__kmpc_free_shared(i8* %0) + ret void } -; Function Attrs: nounwind readnone -declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #1 - -declare void @__kmpc_spmd_kernel_init(i32, i16, i16) local_unnamed_addr - -declare void @__kmpc_data_sharing_init_stack_spmd() local_unnamed_addr - -; Function Attrs: norecurse nounwind readonly -define hidden i32 @_Z3fooRi(i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) local_unnamed_addr #2 !dbg !21 { +define void @share(i8* %x) { entry: - %0 = load i32, i32* %a, align 4, !dbg !22, !tbaa !23 - ret i32 %0, !dbg !27 + store i8* %x, i8** @S + ret void } -; Function Attrs: nounwind -define hidden i32 @_Z3barv() local_unnamed_addr #3 !dbg !15 { -entry: - %a1 = alloca i32, align 4 - %0 = tail call i8 @__kmpc_is_spmd_exec_mode() #4 - %.not = icmp eq i8 %0, 0 - br i1 %.not, label %.non-spmd, label %.exit - -.non-spmd: ; preds = %entry - %1 = tail call i8* @__kmpc_data_sharing_push_stack(i64 128, i16 0) #4, !dbg !31 - %2 = bitcast i8* %1 to %struct._globalized_locals_ty* - br label %.exit +declare i8* @__kmpc_alloc_shared(i64) -.exit: ; preds = %entry, %.non-spmd - %_select_stack = phi %struct._globalized_locals_ty* [ %2, %.non-spmd ], [ null, %entry ] - %nvptx_tid = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !28 - %nvptx_lane_id = and i32 %nvptx_tid, 31 - %3 = zext i32 %nvptx_lane_id to i64 - %4 = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* %_select_stack, i64 0, i32 0, i64 %3 - %5 = select i1 %.not, i32* %4, i32* %a1 - %6 = load i32, i32* %5, align 4, !dbg !29, !tbaa !23 - br i1 %.not, label %.non-spmd2, label %.exit3, !dbg !31 +declare void @__kmpc_free_shared(i8*) -.non-spmd2: ; preds = %.exit - %7 = bitcast %struct._globalized_locals_ty* %_select_stack to i8*, !dbg !31 - tail call void @__kmpc_data_sharing_pop_stack(i8* %7) #4, !dbg !31 - br label %.exit3, !dbg !31 - -.exit3: ; preds = %.non-spmd2, %.exit - ret i32 %6, !dbg !31 -} - -declare i8 @__kmpc_is_spmd_exec_mode() local_unnamed_addr - -declare i8* @__kmpc_data_sharing_coalesced_push_stack(i64, i16) local_unnamed_addr - -declare i8* @__kmpc_data_sharing_push_stack(i64, i16) local_unnamed_addr - -; Function Attrs: nounwind readnone -declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 - -declare void 
@__kmpc_data_sharing_pop_stack(i8*) local_unnamed_addr - -; Function Attrs: nounwind -declare i32 @__kmpc_global_thread_num(%struct.ident_t*) local_unnamed_addr #4 - -declare void @__kmpc_spmd_kernel_deinit_v2(i16) local_unnamed_addr - -attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone } -attributes #2 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #4 = { nounwind } !llvm.dbg.cu = !{!0} -!omp_offload.info = !{!3} -!nvvm.annotations = !{!4} -!llvm.module.flags = !{!5, !6, !7, !8} -!llvm.ident = !{!9} +!llvm.module.flags = !{!3, !4} -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 12.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: DebugDirectivesOnly, enums: !2, splitDebugInlining: false, nameTableKind: None) -!1 = !DIFile(filename: "declare_target_codegen_globalization.cpp", directory: "/home/jhuber/Documents/llvm-project/clang/test/OpenMP") +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 12.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "globalization_remarks.c", directory: "/tmp/globalization_remarks.c") !2 = !{} -!3 = !{i32 0, i32 2049, i32 50472291, !"_Z6maini1v", i32 17, i32 0} -!4 = !{void (i32*)* @__omp_offloading_801_3022563__Z6maini1v_l17, !"kernel", i32 1} -!5 = !{i32 7, !"Dwarf Version", i32 2} -!6 = !{i32 2, !"Debug Info Version", i32 3} -!7 = !{i32 1, !"wchar_size", i32 4} -!8 = !{i32 7, !"PIC Level", i32 2} -!9 = !{!"clang version 12.0.0"} -!10 = distinct !DISubprogram(name: "__omp_offloading_801_3022563__Z6maini1v_l17", scope: !1, file: !1, line: 17, type: !11, scopeLine: 17, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) -!11 = !DISubroutineType(types: !2) -!12 = !DILocation(line: 17, column: 1, scope: !10) -!13 = !{i32 1, i32 1025} -!14 = !DILocation(line: 10, column: 1, scope: !15, inlinedAt: !16) -!15 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 7, type: !11, scopeLine: 7, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, 
unit: !0, retainedNodes: !2) -!16 = distinct !DILocation(line: 20, column: 18, scope: !17, inlinedAt: !18) -!17 = distinct !DISubprogram(name: "__omp_outlined__", scope: !1, file: !1, line: 17, type: !11, scopeLine: 17, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) -!18 = distinct !DILocation(line: 17, column: 1, scope: !10) -!19 = !DILocation(line: 17, column: 40, scope: !10) -!20 = !DILocation(line: 21, column: 3, scope: !10) -!21 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 5, type: !11, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) -!22 = !DILocation(line: 5, column: 26, scope: !21) -!23 = !{!24, !24, i64 0} -!24 = !{!"int", !25, i64 0} -!25 = !{!"omnipotent char", !26, i64 0} -!26 = !{!"Simple C++ TBAA"} -!27 = !DILocation(line: 5, column: 19, scope: !21) -!28 = !{i32 0, i32 1024} -!29 = !DILocation(line: 5, column: 26, scope: !21, inlinedAt: !30) -!30 = distinct !DILocation(line: 9, column: 10, scope: !15) -!31 = !DILocation(line: 10, column: 1, scope: !15) - +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!6 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !7, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) +!7 = !DISubroutineType(types: !2) +!8 = !DILocation(line: 5, column: 7, scope: !6) diff --git a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu --- a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu @@ -24,187 +24,16 @@ // Runtime functions for trunk data sharing scheme. //////////////////////////////////////////////////////////////////////////////// -INLINE static void data_sharing_init_stack_common() { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); - omptarget_nvptx_TeamDescr *teamDescr = - &omptarget_nvptx_threadPrivateContext->TeamContext(); - - for (int WID = 0; WID < DS_Max_Warp_Number; WID++) { - __kmpc_data_sharing_slot *RootS = teamDescr->GetPreallocatedSlotAddr(WID); - DataSharingState.SlotPtr[WID] = RootS; - DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0]; - } +// Allocate memory that can be shared between the threads. +// TODO: Add a small buffer of shared memory to allocate memory from +// TODO: Add an INFO message to communicate with the user +EXTERN void *__kmpc_alloc_shared(size_t DataSize) { + return (void *)SafeMalloc(DataSize, "Alloc Shared"); } -// Initialize data sharing data structure. This function needs to be called -// once at the beginning of a data sharing context (coincides with the kernel -// initialization). This function is called only by the MASTER thread of each -// team in non-SPMD mode. -EXTERN void __kmpc_data_sharing_init_stack() { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); - // This function initializes the stack pointer with the pointer to the - // statically allocated shared memory slots. The size of a shared memory - // slot is pre-determined to be 256 bytes. - data_sharing_init_stack_common(); - omptarget_nvptx_globalArgs.Init(); -} - -// Initialize data sharing data structure. This function needs to be called -// once at the beginning of a data sharing context (coincides with the kernel -// initialization). 
This function is called in SPMD mode only. -EXTERN void __kmpc_data_sharing_init_stack_spmd() { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); - // This function initializes the stack pointer with the pointer to the - // statically allocated shared memory slots. The size of a shared memory - // slot is pre-determined to be 256 bytes. - if (GetThreadIdInBlock() == 0) - data_sharing_init_stack_common(); - - __kmpc_impl_threadfence_block(); -} - -INLINE static void *data_sharing_push_stack_common(size_t PushSize) { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); - - // Only warp active master threads manage the stack. - bool IsWarpMaster = (GetThreadIdInBlock() % WARPSIZE) == 0; - - // Add worst-case padding to DataSize so that future stack allocations are - // correctly aligned. - const size_t Alignment = 8; - PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment; - - // Frame pointer must be visible to all workers in the same warp. - const unsigned WID = GetWarpId(); - void *FrameP = 0; - __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask(); - - if (IsWarpMaster) { - // SlotP will point to either the shared memory slot or an existing - // global memory slot. - __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; - void *&StackP = DataSharingState.StackPtr[WID]; - - // Check if we have room for the data in the current slot. - const uintptr_t StartAddress = (uintptr_t)StackP; - const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd; - const uintptr_t RequestedEndAddress = StartAddress + (uintptr_t)PushSize; - - // If we requested more data than there is room for in the rest - // of the slot then we need to either re-use the next slot, if one exists, - // or create a new slot. - if (EndAddress < RequestedEndAddress) { - __kmpc_data_sharing_slot *NewSlot = 0; - size_t NewSize = PushSize; - - // Allocate at least the default size for each type of slot. - // Master is a special case and even though there is only one thread, - // it can share more things with the workers. For uniformity, it uses - // the full size of a worker warp slot. - size_t DefaultSlotSize = DS_Worker_Warp_Slot_Size; - if (DefaultSlotSize > NewSize) - NewSize = DefaultSlotSize; - NewSlot = (__kmpc_data_sharing_slot *)SafeMalloc( - sizeof(__kmpc_data_sharing_slot) + NewSize, - "Global memory slot allocation."); - - NewSlot->Next = 0; - NewSlot->Prev = SlotP; - NewSlot->PrevSlotStackPtr = StackP; - NewSlot->DataEnd = &NewSlot->Data[0] + NewSize; - - // Make previous slot point to the newly allocated slot. - SlotP->Next = NewSlot; - // The current slot becomes the new slot. - SlotP = NewSlot; - // The stack pointer always points to the next free stack frame. - StackP = &NewSlot->Data[0] + PushSize; - // The frame pointer always points to the beginning of the frame. - FrameP = DataSharingState.FramePtr[WID] = &NewSlot->Data[0]; - } else { - // Add the data chunk to the current slot. The frame pointer is set to - // point to the start of the new frame held in StackP. - FrameP = DataSharingState.FramePtr[WID] = StackP; - // Reset stack pointer to the requested address. - StackP = (void *)RequestedEndAddress; - } - } - // Get address from lane 0. 
- int *FP = (int *)&FrameP; - FP[0] = __kmpc_impl_shfl_sync(CurActive, FP[0], 0); - if (sizeof(FrameP) == 8) - FP[1] = __kmpc_impl_shfl_sync(CurActive, FP[1], 0); - - return FrameP; -} - -EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t DataSize, - int16_t UseSharedMemory) { - return data_sharing_push_stack_common(DataSize); -} - -// Called at the time of the kernel initialization. This is used to initilize -// the list of references to shared variables and to pre-allocate global storage -// for holding the globalized variables. -// -// By default the globalized variables are stored in global memory. If the -// UseSharedMemory is set to true, the runtime will attempt to use shared memory -// as long as the size requested fits the pre-allocated size. -EXTERN void *__kmpc_data_sharing_push_stack(size_t DataSize, - int16_t UseSharedMemory) { - // Compute the total memory footprint of the requested data. - // The master thread requires a stack only for itself. A worker - // thread (which at this point is a warp master) will require - // space for the variables of each thread in the warp, - // i.e. one DataSize chunk per warp lane. - // TODO: change WARPSIZE to the number of active threads in the warp. - size_t PushSize = (isRuntimeUninitialized() || IsMasterThread(isSPMDMode())) - ? DataSize - : WARPSIZE * DataSize; - - // Compute the start address of the frame of each thread in the warp. - uintptr_t FrameStartAddress = - (uintptr_t)data_sharing_push_stack_common(PushSize); - FrameStartAddress += (uintptr_t)(GetLaneId() * DataSize); - return (void *)FrameStartAddress; -} - -// Pop the stack and free any memory which can be reclaimed. -// -// When the pop operation removes the last global memory slot, -// reclaim all outstanding global memory slots since it is -// likely we have reached the end of the kernel. -EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); - - __kmpc_impl_threadfence_block(); - - if (GetThreadIdInBlock() % WARPSIZE == 0) { - unsigned WID = GetWarpId(); - - // Current slot - __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; - - // Pointer to next available stack. - void *&StackP = DataSharingState.StackPtr[WID]; - - // Pop the frame. - StackP = FrameStart; - - // If the current slot is empty, we need to free the slot after the - // pop. - bool SlotEmpty = (StackP == &SlotP->Data[0]); - - if (SlotEmpty && SlotP->Prev) { - // Before removing the slot we need to reset StackP. - StackP = SlotP->PrevSlotStackPtr; - - // Remove the slot. - SlotP = SlotP->Prev; - SafeFree(SlotP->Next, "Free slot."); - SlotP->Next = 0; - } - } +// Free the allocated memory. +EXTERN void __kmpc_free_shared(void *FrameStart) { + SafeFree(FrameStart, "Free Shared"); } // Begin a data sharing context. 
Maintain a list of references to shared @@ -278,4 +107,21 @@ omptarget_nvptx_simpleMemoryManager.Release(); } +// Deprecated globalization code +EXTERN void __kmpc_data_sharing_init_stack() {} +EXTERN void __kmpc_data_sharing_init_stack_spmd() {} + +EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t DataSize, + int16_t) { + return (void *)SafeMalloc(DataSize, "Alloc Deprecated"); +} + +EXTERN void *__kmpc_data_sharing_push_stack(size_t DataSize, int16_t) { + return (void *)SafeMalloc(DataSize, "Alloc Deprecated"); +} + +EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) { + SafeFree(FrameStart, "Free Shared"); +} + #pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/interface.h b/openmp/libomptarget/deviceRTLs/interface.h --- a/openmp/libomptarget/deviceRTLs/interface.h +++ b/openmp/libomptarget/deviceRTLs/interface.h @@ -424,13 +424,8 @@ EXTERN bool __kmpc_kernel_parallel(void **WorkFn); EXTERN void __kmpc_kernel_end_parallel(); -EXTERN void __kmpc_data_sharing_init_stack(); -EXTERN void __kmpc_data_sharing_init_stack_spmd(); -EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size, - int16_t UseSharedMemory); -EXTERN void *__kmpc_data_sharing_push_stack(size_t size, - int16_t UseSharedMemory); -EXTERN void __kmpc_data_sharing_pop_stack(void *a); +EXTERN void *__kmpc_alloc_shared(size_t Size); +EXTERN void __kmpc_free_shared(void *Data); EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs); EXTERN void __kmpc_end_sharing_variables(); EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs); @@ -445,4 +440,11 @@ EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode, int16_t is_shared); +// Deprecated globalization interface +EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size, int16_t s); +EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t s); +EXTERN void __kmpc_data_sharing_pop_stack(void *a); +EXTERN void __kmpc_data_sharing_init_stack(); +EXTERN void __kmpc_data_sharing_init_stack_spmd(); + #endif
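Net effect for runtime users, as a usage sketch: the data sharing stack collapses to a plain allocate/free pair, and the deprecated entry points kept above degrade to the same allocator so already-compiled device code still links. The definitions below are host-side stand-ins for illustration; the real ones are the SafeMalloc/SafeFree shims in data_sharing.cu.

#include <cstddef>
#include <cstdlib>
#include <cstring>

// Host-side stand-ins for the device entry points (illustrative only).
extern "C" void *__kmpc_alloc_shared(size_t Size) { return std::malloc(Size); }
extern "C" void __kmpc_free_shared(void *Ptr) { std::free(Ptr); }

int main() {
  // A region globalizing a 16-byte buffer: allocate, use, free, always paired.
  void *Buf = __kmpc_alloc_shared(16);
  std::memset(Buf, 0, 16);
  __kmpc_free_shared(Buf);
  return 0;
}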