Index: cfe/trunk/lib/CodeGen/CGDecl.cpp
===================================================================
--- cfe/trunk/lib/CodeGen/CGDecl.cpp
+++ cfe/trunk/lib/CodeGen/CGDecl.cpp
@@ -1068,9 +1068,17 @@
   }
 
   // A normal fixed sized variable becomes an alloca in the entry block,
-  // unless it's an NRVO variable.
-
-  if (NRVO) {
+  // unless:
+  // - it's an NRVO variable.
+  // - we are compiling OpenMP and it's an OpenMP local variable.
+
+  Address OpenMPLocalAddr =
+      getLangOpts().OpenMP
+          ? CGM.getOpenMPRuntime().getAddressOfLocalVariable(*this, &D)
+          : Address::invalid();
+  if (getLangOpts().OpenMP && OpenMPLocalAddr.isValid()) {
+    address = OpenMPLocalAddr;
+  } else if (NRVO) {
     // The named return value optimization: allocate this variable in the
     // return slot, so that we can elide the copy when returning this
     // variable (C++0x [class.copy]p34).
@@ -1896,9 +1904,18 @@
       }
     }
   } else {
-    // Otherwise, create a temporary to hold the value.
-    DeclPtr = CreateMemTemp(Ty, getContext().getDeclAlign(&D),
-                            D.getName() + ".addr");
+    // Check if the parameter address is controlled by OpenMP runtime.
+    Address OpenMPLocalAddr =
+        getLangOpts().OpenMP
+            ? CGM.getOpenMPRuntime().getAddressOfLocalVariable(*this, &D)
+            : Address::invalid();
+    if (getLangOpts().OpenMP && OpenMPLocalAddr.isValid()) {
+      DeclPtr = OpenMPLocalAddr;
+    } else {
+      // Otherwise, create a temporary to hold the value.
+      DeclPtr = CreateMemTemp(Ty, getContext().getDeclAlign(&D),
+                              D.getName() + ".addr");
+    }
     DoStore = true;
   }
Index: cfe/trunk/lib/CodeGen/CGOpenMPRuntime.h
===================================================================
--- cfe/trunk/lib/CodeGen/CGOpenMPRuntime.h
+++ cfe/trunk/lib/CodeGen/CGOpenMPRuntime.h
@@ -676,7 +676,7 @@
   /// \brief Cleans up references to the objects in finished function.
   ///
-  void functionFinished(CodeGenFunction &CGF);
+  virtual void functionFinished(CodeGenFunction &CGF);
 
   /// \brief Emits code for parallel or serial call of the \a OutlinedFn with
   /// variables captured in a record whose address is stored in \a
@@ -1362,6 +1362,14 @@
   emitOutlinedFunctionCall(CodeGenFunction &CGF, SourceLocation Loc,
                            llvm::Value *OutlinedFn,
                            ArrayRef<llvm::Value *> Args = llvm::None) const;
+
+  /// Emits OpenMP-specific function prolog.
+  /// Required for device constructs.
+  virtual void emitFunctionProlog(CodeGenFunction &CGF, const Decl *D) {}
+
+  /// Gets the OpenMP-specific address of the local variable.
+  virtual Address getAddressOfLocalVariable(CodeGenFunction &CGF,
+                                            const VarDecl *VD);
 };
 
 /// Class supports emission of SIMD-only code.
Index: cfe/trunk/lib/CodeGen/CGOpenMPRuntime.cpp
===================================================================
--- cfe/trunk/lib/CodeGen/CGOpenMPRuntime.cpp
+++ cfe/trunk/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -8100,6 +8100,11 @@
   return CGF.GetAddrOfLocalVar(NativeParam);
 }
 
+Address CGOpenMPRuntime::getAddressOfLocalVariable(CodeGenFunction &CGF,
+                                                   const VarDecl *VD) {
+  return Address::invalid();
+}
+
 llvm::Value *CGOpenMPSIMDRuntime::emitParallelOutlinedFunction(
     const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
     OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
Index: cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
===================================================================
--- cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -289,6 +289,14 @@
       CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
       ArrayRef<llvm::Value *> Args = llvm::None) const override;
 
+  /// Emits OpenMP-specific function prolog.
+  /// Required for device constructs.
+  void emitFunctionProlog(CodeGenFunction &CGF, const Decl *D) override;
+
+  /// Gets the OpenMP-specific address of the local variable.
+  Address getAddressOfLocalVariable(CodeGenFunction &CGF,
+                                    const VarDecl *VD) override;
+
   /// Target codegen is specialized based on two programming models: the
   /// 'generic' fork-join model of OpenMP, and a more GPU efficient 'spmd'
   /// model for constructs like 'target parallel' that support it.
@@ -300,12 +308,37 @@
     Unknown,
   };
 
+  /// Cleans up references to the objects in finished function.
+  ///
+  void functionFinished(CodeGenFunction &CGF) override;
+
 private:
   // Track the execution mode when codegening directives within a target
   // region. The appropriate mode (generic/spmd) is set on entry to the
   // target region and used by containing directives such as 'parallel'
   // to emit optimized code.
   ExecutionMode CurrentExecutionMode;
+
+  /// Map between an outlined function and its wrapper.
+  llvm::DenseMap<llvm::Function *, llvm::Function *> WrapperFunctionsMap;
+
+  /// Emit function which wraps the outlined parallel region
+  /// and controls the parameters which are passed to this function.
+  /// The wrapper ensures that the outlined function is called
+  /// with the correct arguments when data is shared.
+  llvm::Function *createParallelDataSharingWrapper(
+      llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D);
+
+  /// The map of local variables to their addresses in the global memory.
+  using DeclToAddrMapTy =
+      llvm::MapVector<const ValueDecl *,
+                      std::pair<const FieldDecl *, Address>>;
+  /// Maps the function to the list of the globalized variables with their
+  /// addresses.
+  llvm::DenseMap<llvm::Function *,
+                 std::pair<const RecordDecl *, DeclToAddrMapTy>>
+      FunctionGlobalizedDecls;
+  /// Map from function to global record pointer.
+  llvm::DenseMap<llvm::Function *, llvm::Value *> FunctionToGlobalRecPtr;
 };
 
 } // CodeGen namespace.
Index: cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
===================================================================
--- cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -13,9 +13,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "CGOpenMPRuntimeNVPTX.h"
-#include "clang/AST/DeclOpenMP.h"
 #include "CodeGenFunction.h"
+#include "clang/AST/DeclOpenMP.h"
 #include "clang/AST/StmtOpenMP.h"
+#include "clang/AST/StmtVisitor.h"
+#include "llvm/ADT/SmallPtrSet.h"
 
 using namespace clang;
 using namespace CodeGen;
@@ -70,7 +72,21 @@
   /// index, int32_t width, int32_t reduce))
   OMPRTL_NVPTX__kmpc_teams_reduce_nowait,
   /// \brief Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
-  OMPRTL_NVPTX__kmpc_end_reduce_nowait
+  OMPRTL_NVPTX__kmpc_end_reduce_nowait,
+  /// \brief Call to void __kmpc_data_sharing_init_stack();
+  OMPRTL_NVPTX__kmpc_data_sharing_init_stack,
+  /// \brief Call to void* __kmpc_data_sharing_push_stack(size_t size,
+  /// int16_t UseSharedMemory);
+  OMPRTL_NVPTX__kmpc_data_sharing_push_stack,
+  /// \brief Call to void __kmpc_data_sharing_pop_stack(void *a);
+  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack,
+  /// \brief Call to void __kmpc_begin_sharing_variables(void ***args,
+  /// size_t n_args);
+  OMPRTL_NVPTX__kmpc_begin_sharing_variables,
+  /// \brief Call to void __kmpc_end_sharing_variables();
+  OMPRTL_NVPTX__kmpc_end_sharing_variables,
+  /// \brief Call to void __kmpc_get_shared_variables(void ***GlobalArgs)
+  OMPRTL_NVPTX__kmpc_get_shared_variables,
 };
 
 /// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
@@ -149,6 +165,245 @@
   /// barrier.
   NB_Parallel = 1,
 };
+
+/// Get the list of variables that can escape their declaration context.
+class CheckVarsEscapingDeclContext final
+    : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
+  CodeGenFunction &CGF;
+  llvm::SetVector<const ValueDecl *> EscapedDecls;
+  llvm::SmallPtrSet<const ValueDecl *, 4> IgnoredDecls;
+  bool AllEscaped = false;
+  RecordDecl *GlobalizedRD = nullptr;
+  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
+
+  void markAsEscaped(const ValueDecl *VD) {
+    if (IgnoredDecls.count(VD) ||
+        (CGF.CapturedStmtInfo &&
+         CGF.CapturedStmtInfo->lookup(cast<VarDecl>(VD))))
+      return;
+    EscapedDecls.insert(VD);
+  }
+
+  void VisitValueDecl(const ValueDecl *VD) {
+    if (VD->getType()->isLValueReferenceType()) {
+      markAsEscaped(VD);
+      if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
+        if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
+          const bool SavedAllEscaped = AllEscaped;
+          AllEscaped = true;
+          Visit(VarD->getInit());
+          AllEscaped = SavedAllEscaped;
+        }
+      }
+    }
+  }
+  void VisitOpenMPCapturedStmt(const CapturedStmt *S) {
+    if (!S)
+      return;
+    for (const auto &C : S->captures()) {
+      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
+        const ValueDecl *VD = C.getCapturedVar();
+        markAsEscaped(VD);
+        if (isa<OMPCapturedExprDecl>(VD))
+          VisitValueDecl(VD);
+      }
+    }
+  }
+
+  typedef std::pair<CharUnits /*Align*/, const ValueDecl *> VarsDataTy;
+  static bool stable_sort_comparator(const VarsDataTy P1,
+                                     const VarsDataTy P2) {
+    return P1.first > P2.first;
+  }
+
+  void buildRecordForGlobalizedVars() {
+    assert(!GlobalizedRD &&
+           "Record for globalized variables is built already.");
+    if (EscapedDecls.empty())
+      return;
+    ASTContext &C = CGF.getContext();
+    SmallVector<VarsDataTy, 4> GlobalizedVars;
+    for (const auto *D : EscapedDecls)
+      GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
+    std::stable_sort(GlobalizedVars.begin(), GlobalizedVars.end(),
+                     stable_sort_comparator);
+    // Build struct _globalized_locals_ty {
+    //         /* globalized vars */
+    //       };
+    GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
+    GlobalizedRD->startDefinition();
+    for (const auto &Pair : GlobalizedVars) {
+      const ValueDecl *VD = Pair.second;
+      QualType Type = VD->getType();
+      if (Type->isLValueReferenceType())
+        Type = C.getPointerType(Type.getNonReferenceType());
+      else
+        Type = Type.getNonReferenceType();
+      SourceLocation Loc = VD->getLocation();
+      auto *Field = FieldDecl::Create(
+          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
+          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
+          /*BW=*/nullptr, /*Mutable=*/false,
+          /*InitStyle=*/ICIS_NoInit);
+      Field->setAccess(AS_public);
+      GlobalizedRD->addDecl(Field);
+      if (VD->hasAttrs()) {
+        for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
+             E(VD->getAttrs().end());
+             I != E; ++I)
+          Field->addAttr(*I);
+      }
+      MappedDeclsFields.try_emplace(VD, Field);
+    }
+    GlobalizedRD->completeDefinition();
+  }
+
+public:
+  CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
+                               ArrayRef<const ValueDecl *> IgnoredDecls)
+      : CGF(CGF), IgnoredDecls(IgnoredDecls.begin(), IgnoredDecls.end()) {}
+  virtual ~CheckVarsEscapingDeclContext() = default;
+  void VisitDeclStmt(const DeclStmt *S) {
+    if (!S)
+      return;
+    for (const auto *D : S->decls())
+      if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
+        VisitValueDecl(VD);
+  }
+  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
+    if (!D)
+      return;
+    if (D->hasAssociatedStmt()) {
+      if (const auto *S =
+              dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt()))
+        VisitOpenMPCapturedStmt(S);
+    }
+  }
+  void VisitCapturedStmt(const CapturedStmt *S) {
+    if (!S)
+      return;
+    for (const auto &C : S->captures()) {
+      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
+        const ValueDecl *VD = C.getCapturedVar();
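+        // A variable captured by reference may outlive its declaration
+        // context on the device, so it has to be globalized.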
+        markAsEscaped(VD);
+        if (isa<OMPCapturedExprDecl>(VD))
+          VisitValueDecl(VD);
+      }
+    }
+  }
+  void VisitLambdaExpr(const LambdaExpr *E) {
+    if (!E)
+      return;
+    for (const auto &C : E->captures()) {
+      if (C.capturesVariable()) {
+        if (C.getCaptureKind() == LCK_ByRef) {
+          const ValueDecl *VD = C.getCapturedVar();
+          markAsEscaped(VD);
+          if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
+            VisitValueDecl(VD);
+        }
+      }
+    }
+  }
+  void VisitBlockExpr(const BlockExpr *E) {
+    if (!E)
+      return;
+    for (const auto &C : E->getBlockDecl()->captures()) {
+      if (C.isByRef()) {
+        const VarDecl *VD = C.getVariable();
+        markAsEscaped(VD);
+        if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
+          VisitValueDecl(VD);
+      }
+    }
+  }
+  void VisitCallExpr(const CallExpr *E) {
+    if (!E)
+      return;
+    for (const Expr *Arg : E->arguments()) {
+      if (!Arg)
+        continue;
+      if (Arg->isLValue()) {
+        const bool SavedAllEscaped = AllEscaped;
+        AllEscaped = true;
+        Visit(Arg);
+        AllEscaped = SavedAllEscaped;
+      } else
+        Visit(Arg);
+    }
+    Visit(E->getCallee());
+  }
+  void VisitDeclRefExpr(const DeclRefExpr *E) {
+    if (!E)
+      return;
+    const ValueDecl *VD = E->getDecl();
+    if (AllEscaped)
+      markAsEscaped(VD);
+    if (isa<OMPCapturedExprDecl>(VD))
+      VisitValueDecl(VD);
+    else if (const auto *VarD = dyn_cast<VarDecl>(VD))
+      if (VarD->isInitCapture())
+        VisitValueDecl(VD);
+  }
+  void VisitUnaryOperator(const UnaryOperator *E) {
+    if (!E)
+      return;
+    if (E->getOpcode() == UO_AddrOf) {
+      const bool SavedAllEscaped = AllEscaped;
+      AllEscaped = true;
+      Visit(E->getSubExpr());
+      AllEscaped = SavedAllEscaped;
+    } else
+      Visit(E->getSubExpr());
+  }
+  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
+    if (!E)
+      return;
+    if (E->getCastKind() == CK_ArrayToPointerDecay) {
+      const bool SavedAllEscaped = AllEscaped;
+      AllEscaped = true;
+      Visit(E->getSubExpr());
+      AllEscaped = SavedAllEscaped;
+    } else
+      Visit(E->getSubExpr());
+  }
+  void VisitExpr(const Expr *E) {
+    if (!E)
+      return;
+    bool SavedAllEscaped = AllEscaped;
+    if (!E->isLValue())
+      AllEscaped = false;
+    for (const auto *Child : E->children())
+      if (Child)
+        Visit(Child);
+    AllEscaped = SavedAllEscaped;
+  }
+  void VisitStmt(const Stmt *S) {
+    if (!S)
+      return;
+    for (const auto *Child : S->children())
+      if (Child)
+        Visit(Child);
+  }
+
+  const RecordDecl *getGlobalizedRecord() {
+    if (!GlobalizedRD)
+      buildRecordForGlobalizedVars();
+    return GlobalizedRD;
+  }
+
+  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
+    assert(GlobalizedRD &&
+           "Record for globalized variables must be generated already.");
+    auto I = MappedDeclsFields.find(VD);
+    if (I == MappedDeclsFields.end())
+      return nullptr;
+    return I->getSecond();
+  }
+
+  ArrayRef<const ValueDecl *> getEscapedDecls() const {
+    return EscapedDecls.getArrayRef();
+  }
+};
 } // anonymous namespace
 
 /// Get the GPU warp size.
@@ -288,6 +543,7 @@
   EntryFunctionState EST;
   WorkerFunctionState WST(CGM, D.getLocStart());
   Work.clear();
+  WrapperFunctionsMap.clear();
 
   // Emit target region as a standalone region.
   class NVPTXPrePostActionTy : public PrePostActionTy {
@@ -344,6 +600,7 @@
   Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);
 
   CGF.EmitBlock(MasterBB);
+  // SEQUENTIAL (MASTER) REGION START
   // First action in sequential region:
   // Initialize the state of the OpenMP runtime library on the GPU.
   // TODO: Optimize runtime initialization and pass in correct value.
@@ -351,10 +608,65 @@
                          Bld.getInt16(/*RequiresOMPRuntime=*/1)};
   CGF.EmitRuntimeCall(
       createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);
+
+  // For data sharing, we need to initialize the stack.
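+  // The runtime keeps the globalized records on a device-side stack; the
+  // push/pop calls below allocate and free this function's record on it.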
+  CGF.EmitRuntimeCall(
+      createNVPTXRuntimeFunction(
+          OMPRTL_NVPTX__kmpc_data_sharing_init_stack));
+
+  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
+  if (I == FunctionGlobalizedDecls.end())
+    return;
+  const RecordDecl *GlobalizedVarsRecord = I->getSecond().first;
+  QualType RecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
+
+  // Recover pointer to this function's global record. The runtime will
+  // handle the specifics of the allocation of the memory.
+  // Use actual memory size of the record including the padding
+  // for alignment purposes.
+  unsigned Alignment =
+      CGM.getContext().getTypeAlignInChars(RecTy).getQuantity();
+  unsigned GlobalRecordSize =
+      CGM.getContext().getTypeSizeInChars(RecTy).getQuantity();
+  GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
+  // TODO: allow the usage of shared memory to be controlled by
+  // the user, for now, default to global.
+  llvm::Value *GlobalRecordSizeArg[] = {
+      llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
+      CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
+  llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
+      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_push_stack),
+      GlobalRecordSizeArg);
+  llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
+      GlobalRecValue, CGF.ConvertTypeForMem(RecTy)->getPointerTo());
+  FunctionToGlobalRecPtr.try_emplace(CGF.CurFn, GlobalRecValue);
+
+  // Emit the "global alloca" which is a GEP from the global declaration
+  // record using the pointer returned by the runtime.
+  LValue Base =
+      CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, RecTy);
+  auto &Res = I->getSecond().second;
+  for (auto &Rec : Res) {
+    const FieldDecl *FD = Rec.second.first;
+    LValue VarAddr = CGF.EmitLValueForField(Base, FD);
+    Rec.second.second = VarAddr.getAddress();
+  }
 }
 
 void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF,
                                                   EntryFunctionState &EST) {
+  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
+  if (I != FunctionGlobalizedDecls.end()) {
+    if (!CGF.HaveInsertPoint())
+      return;
+    auto I = FunctionToGlobalRecPtr.find(CGF.CurFn);
+    if (I != FunctionToGlobalRecPtr.end()) {
+      llvm::Value *Args[] = {I->getSecond()};
+      CGF.EmitRuntimeCall(
+          createNVPTXRuntimeFunction(
+              OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
+          Args);
+    }
+  }
+
   if (!EST.ExitBB)
     EST.ExitBB = CGF.createBasicBlock(".exit");
@@ -543,14 +855,13 @@
   // Execute this outlined function.
   CGF.EmitBlock(ExecuteFNBB);
 
-  // Insert call to work function.
-  // FIXME: Pass arguments to outlined function from master thread.
-  auto *Fn = cast<llvm::Function>(W);
-  Address ZeroAddr =
-      CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, /*Name=*/".zero.addr");
-  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C=*/0));
-  llvm::Value *FnArgs[] = {ZeroAddr.getPointer(), ZeroAddr.getPointer()};
-  emitCall(CGF, WST.Loc, Fn, FnArgs);
+  // Insert call to work function via shared wrapper. The shared
+  // wrapper takes two arguments:
+  //   - the parallelism level;
+  //   - the master thread ID;
+  emitOutlinedFunctionCall(CGF, WST.Loc, W,
+                           {Bld.getInt16(/*ParallelLevel=*/0),
+                            getMasterThreadID(CGF)});
 
   // Go to end of parallel region.
   CGF.EmitBranch(TerminateBB);
@@ -619,8 +930,7 @@
   case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
     /// Build void __kmpc_kernel_prepare_parallel(
     /// void *outlined_function, int16_t IsOMPRuntimeInitialized);
-    llvm::Type *TypeParams[] = {CGM.Int8PtrTy,
-                                CGM.Int16Ty};
+    llvm::Type *TypeParams[] = {CGM.Int8PtrTy, CGM.Int16Ty};
     llvm::FunctionType *FnTy =
         llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
@@ -758,6 +1068,56 @@
         FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait");
     break;
   }
+  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack: {
+    /// Build void __kmpc_data_sharing_init_stack();
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack");
+    break;
+  }
+  case OMPRTL_NVPTX__kmpc_data_sharing_push_stack: {
+    // Build void *__kmpc_data_sharing_push_stack(size_t size,
+    // int16_t UseSharedMemory);
+    llvm::Type *TypeParams[] = {CGM.SizeTy, CGM.Int16Ty};
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false);
+    RTLFn = CGM.CreateRuntimeFunction(
+        FnTy, /*Name=*/"__kmpc_data_sharing_push_stack");
+    break;
+  }
+  case OMPRTL_NVPTX__kmpc_data_sharing_pop_stack: {
+    // Build void __kmpc_data_sharing_pop_stack(void *a);
+    llvm::Type *TypeParams[] = {CGM.VoidPtrTy};
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
+    RTLFn = CGM.CreateRuntimeFunction(
+        FnTy, /*Name=*/"__kmpc_data_sharing_pop_stack");
+    break;
+  }
+  case OMPRTL_NVPTX__kmpc_begin_sharing_variables: {
+    /// Build void __kmpc_begin_sharing_variables(void ***args,
+    /// size_t n_args);
+    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo(), CGM.SizeTy};
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_begin_sharing_variables");
+    break;
+  }
+  case OMPRTL_NVPTX__kmpc_end_sharing_variables: {
+    /// Build void __kmpc_end_sharing_variables();
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_sharing_variables");
+    break;
+  }
+  case OMPRTL_NVPTX__kmpc_get_shared_variables: {
+    /// Build void __kmpc_get_shared_variables(void ***GlobalArgs);
+    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo()};
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_shared_variables");
+    break;
+  }
   }
   return RTLFn;
 }
@@ -847,8 +1207,16 @@
 llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
     const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
     OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
-  return CGOpenMPRuntime::emitParallelOutlinedFunction(D, ThreadIDVar,
-                                                       InnermostKind, CodeGen);
+  auto *OutlinedFun =
+      cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
+          D, ThreadIDVar, InnermostKind, CodeGen));
+  if (!isInSpmdExecutionMode()) {
+    llvm::Function *WrapperFun =
+        createParallelDataSharingWrapper(OutlinedFun, D);
+    WrapperFunctionsMap[OutlinedFun] = WrapperFun;
+  }
+
+  return OutlinedFun;
 }
 
 llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
@@ -900,16 +1268,58 @@
     CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
     ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
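+  // In generic mode the master does not call the outlined function
+  // directly: it publishes the captured variables through the
+  // data-sharing runtime and lets the workers invoke the wrapper.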
   llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
+  llvm::Function *WFn = WrapperFunctionsMap[Fn];
+
+  assert(WFn && "Wrapper function does not exist!");
 
-  auto &&L0ParallelGen = [this, Fn](CodeGenFunction &CGF, PrePostActionTy &) {
+  // Force inline this outlined function at its call site.
+  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
+
+  auto &&L0ParallelGen = [this, WFn, &CapturedVars](CodeGenFunction &CGF,
+                                                    PrePostActionTy &) {
     CGBuilderTy &Bld = CGF.Builder;
 
-    // TODO: Optimize runtime initialization.
-    llvm::Value *Args[] = {Bld.CreateBitOrPointerCast(Fn, CGM.Int8PtrTy),
-                           /*RequiresOMPRuntime=*/Bld.getInt16(1)};
-    CGF.EmitRuntimeCall(
-        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
-        Args);
+    llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
+
+    // Prepare for parallel region. Indicate the outlined function.
+    llvm::Value *Args[] = {ID, /*RequiresOMPRuntime=*/Bld.getInt16(1)};
+    CGF.EmitRuntimeCall(
+        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
+        Args);
+
+    // Create a private scope that will globalize the arguments
+    // passed from the outside of the target region.
+    CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
+
+    // There's something to share.
+    if (!CapturedVars.empty()) {
+      // Prepare for parallel region. Indicate the outlined function.
+      Address SharedArgs =
+          CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy,
+                                           "shared_arg_refs");
+      llvm::Value *SharedArgsPtr = SharedArgs.getPointer();
+
+      llvm::Value *DataSharingArgs[] = {
+          SharedArgsPtr,
+          llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
+      CGF.EmitRuntimeCall(
+          createNVPTXRuntimeFunction(
+              OMPRTL_NVPTX__kmpc_begin_sharing_variables),
+          DataSharingArgs);
+
+      // Store variable address in a list of references to pass to workers.
+      unsigned Idx = 0;
+      ASTContext &Ctx = CGF.getContext();
+      Address SharedArgListAddress = CGF.EmitLoadOfPointer(
+          SharedArgs,
+          Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
+              .castAs<PointerType>());
+      for (llvm::Value *V : CapturedVars) {
+        Address Dst = Bld.CreateConstInBoundsGEP(
+            SharedArgListAddress, Idx, CGF.getPointerSize());
+        llvm::Value *PtrV = Bld.CreateBitCast(V, CGF.VoidPtrTy);
+        CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
+                              Ctx.getPointerType(Ctx.VoidPtrTy));
+        Idx++;
+      }
+    }
 
     // Activate workers. This barrier is used by the master to signal
     // work for the workers.
@@ -923,8 +1333,12 @@
     // The master waits at this barrier until all workers are done.
     syncCTAThreads(CGF);
 
+    if (!CapturedVars.empty())
+      CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
+          OMPRTL_NVPTX__kmpc_end_sharing_variables));
+
     // Remember for post-processing in worker loop.
-    Work.emplace_back(Fn);
+    Work.emplace_back(WFn);
   };
 
   auto *RTLoc = emitUpdateLocation(CGF, Loc);
@@ -2343,3 +2757,149 @@
   }
   CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
 }
+
+/// Emit function which wraps the outlined parallel region
+/// and controls the arguments which are passed to this function.
+/// The wrapper ensures that the outlined function is called
+/// with the correct arguments when data is shared.
+llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper(
+    llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
+  ASTContext &Ctx = CGM.getContext();
+  const auto &CS = *D.getCapturedStmt(OMPD_parallel);
+
+  // Create a function that takes as argument the source thread.
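+  // The wrapper's signature must match what the worker loop expects from
+  // a work function: void(int16_t ParallelLevel, int32_t SourceThreadID).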
+  FunctionArgList WrapperArgs;
+  QualType Int16QTy =
+      Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
+  QualType Int32QTy =
+      Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
+  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getLocStart(),
+                                     /*Id=*/nullptr, Int16QTy,
+                                     ImplicitParamDecl::Other);
+  ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getLocStart(),
+                               /*Id=*/nullptr, Int32QTy,
+                               ImplicitParamDecl::Other);
+  WrapperArgs.emplace_back(&ParallelLevelArg);
+  WrapperArgs.emplace_back(&WrapperArg);
+
+  auto &CGFI =
+      CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
+
+  auto *Fn = llvm::Function::Create(
+      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
+      OutlinedParallelFn->getName() + "_wrapper", &CGM.getModule());
+  CGM.SetInternalFunctionAttributes(/*D=*/GlobalDecl(), Fn, CGFI);
+  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
+
+  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
+  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
+                    D.getLocStart(), D.getLocStart());
+
+  const auto *RD = CS.getCapturedRecordDecl();
+  auto CurField = RD->field_begin();
+
+  // Get the array of arguments.
+  SmallVector<llvm::Value *, 8> Args;
+
+  // TODO: support SIMD and pass actual values
+  Args.emplace_back(
+      llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
+  Args.emplace_back(
+      llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
+
+  CGBuilderTy &Bld = CGF.Builder;
+  auto CI = CS.capture_begin();
+
+  // Use global memory for data sharing.
+  // Handle passing of global args to workers.
+  Address GlobalArgs =
+      CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
+  llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
+  llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
+  CGF.EmitRuntimeCall(
+      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_get_shared_variables),
+      DataSharingArgs);
+
+  // Retrieve the shared variables from the list of references returned
+  // by the runtime. Pass the variables to the outlined function.
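+  // Each list entry is a void* pointing at the master's copy of a
+  // captured variable; cast it back to a typed pointer before loading.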
+  if (CS.capture_size() > 0) {
+    ASTContext &CGFContext = CGF.getContext();
+    Address SharedArgListAddress = CGF.EmitLoadOfPointer(
+        GlobalArgs,
+        CGFContext
+            .getPointerType(CGFContext.getPointerType(CGFContext.VoidPtrTy))
+            .castAs<PointerType>());
+    for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
+      QualType ElemTy = CurField->getType();
+      Address Src = Bld.CreateConstInBoundsGEP(
+          SharedArgListAddress, I, CGF.getPointerSize());
+      Address TypedAddress = Bld.CreateBitCast(
+          Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
+      llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
+                                              /*Volatile=*/false,
+                                              CGFContext.getPointerType(ElemTy),
+                                              CI->getLocation());
+      Args.emplace_back(Arg);
+    }
+  }
+
+  emitOutlinedFunctionCall(CGF, D.getLocStart(), OutlinedParallelFn, Args);
+  CGF.FinishFunction();
+  return Fn;
+}
+
+void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
+                                              const Decl *D) {
+  assert(D && "Expected function or captured|block decl.");
+  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
+         "Function is registered already.");
+  SmallVector<const ValueDecl *, 4> IgnoredDecls;
+  const Stmt *Body = nullptr;
+  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
+    Body = FD->getBody();
+  } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
+    Body = BD->getBody();
+  } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
+    Body = CD->getBody();
+    if (CGF.CapturedStmtInfo->getKind() == CR_OpenMP) {
+      if (const auto *CS = dyn_cast<CapturedStmt>(Body)) {
+        IgnoredDecls.reserve(CS->capture_size());
+        for (const auto &Capture : CS->captures())
+          if (Capture.capturesVariable())
+            IgnoredDecls.emplace_back(Capture.getCapturedVar());
+      }
+    }
+  }
+  if (!Body)
+    return;
+  CheckVarsEscapingDeclContext VarChecker(CGF, IgnoredDecls);
+  VarChecker.Visit(Body);
+  const RecordDecl *GlobalizedVarsRecord = VarChecker.getGlobalizedRecord();
+  if (!GlobalizedVarsRecord)
+    return;
+  auto &Res =
+      FunctionGlobalizedDecls
+          .try_emplace(CGF.CurFn, GlobalizedVarsRecord, DeclToAddrMapTy())
+          .first->getSecond()
+          .second;
+  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
+    const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
+    Res.insert(std::make_pair(VD, std::make_pair(FD, Address::invalid())));
+  }
+}
+
+Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF,
+                                                        const VarDecl *VD) {
+  auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
+  if (I == FunctionGlobalizedDecls.end())
+    return Address::invalid();
+  auto VDI = I->getSecond().second.find(VD);
+  if (VDI == I->getSecond().second.end())
+    return Address::invalid();
+  return VDI->second.second;
+}
+
+void CGOpenMPRuntimeNVPTX::functionFinished(CodeGenFunction &CGF) {
+  FunctionToGlobalRecPtr.erase(CGF.CurFn);
+  FunctionGlobalizedDecls.erase(CGF.CurFn);
+  CGOpenMPRuntime::functionFinished(CGF);
+}
Index: cfe/trunk/lib/CodeGen/CGStmtOpenMP.cpp
===================================================================
--- cfe/trunk/lib/CodeGen/CGStmtOpenMP.cpp
+++ cfe/trunk/lib/CodeGen/CGStmtOpenMP.cpp
@@ -585,6 +585,7 @@
                                     /*RegisterCastedArgsOnly=*/true,
                                     CapturedStmtInfo->getHelperName());
   CodeGenFunction WrapperCGF(CGM, /*suppressNewContext=*/true);
+  WrapperCGF.CapturedStmtInfo = CapturedStmtInfo;
   Args.clear();
   LocalAddrs.clear();
   VLASizes.clear();
Index: cfe/trunk/lib/CodeGen/CodeGenFunction.cpp
===================================================================
--- cfe/trunk/lib/CodeGen/CodeGenFunction.cpp
+++ cfe/trunk/lib/CodeGen/CodeGenFunction.cpp
@@ -1067,6 +1067,11 @@
   EmitStartEHSpec(CurCodeDecl);
   PrologueCleanupDepth = EHStack.stable_begin();
+
+  // Emit OpenMP specific initialization of the device functions.
+  if (getLangOpts().OpenMP && CurCodeDecl)
+    CGM.getOpenMPRuntime().emitFunctionProlog(*this, CurCodeDecl);
+
   EmitFunctionProlog(*CurFnInfo, CurFn, Args);
 
   if (D && isa<CXXMethodDecl>(D) && cast<CXXMethodDecl>(D)->isInstance()) {
Index: cfe/trunk/test/OpenMP/nvptx_data_sharing.cpp
===================================================================
--- cfe/trunk/test/OpenMP/nvptx_data_sharing.cpp
+++ cfe/trunk/test/OpenMP/nvptx_data_sharing.cpp
@@ -0,0 +1,91 @@
+// Test device global memory data sharing codegen.
+///==========================================================================///
+
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK1
+
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+void test_ds(){
+  #pragma omp target
+  {
+    int a = 10;
+    #pragma omp parallel
+    {
+      a = 1000;
+    }
+    int b = 100;
+    #pragma omp parallel
+    {
+      b = a + 10000;
+    }
+  }
+}
+
+/// ========= In the kernel function ========= ///
+
+// CK1: {{.*}}define void @__omp_offloading{{.*}}test_ds{{.*}}()
+// CK1: [[SHAREDARGS1:%.+]] = alloca i8**
+// CK1: [[SHAREDARGS2:%.+]] = alloca i8**
+// CK1: call void @__kmpc_kernel_init
+// CK1: call void @__kmpc_data_sharing_init_stack
+// CK1: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i64 8, i16 0)
+// CK1: [[GLOBALSTACK2:%.+]] = bitcast i8* [[GLOBALSTACK]] to %struct._globalized_locals_ty*
+// CK1: [[A:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[GLOBALSTACK2]], i32 0, i32 0
+// CK1: [[B:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[GLOBALSTACK2]], i32 0, i32 1
+// CK1: store i32 10, i32* [[A]]
+// CK1: call void @__kmpc_kernel_prepare_parallel({{.*}}, i16 1)
+// CK1: call void @__kmpc_begin_sharing_variables(i8*** [[SHAREDARGS1]], i64 1)
+// CK1: [[SHARGSTMP1:%.+]] = load i8**, i8*** [[SHAREDARGS1]]
+// CK1: [[SHARGSTMP2:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP1]], i64 0
+// CK1: [[SHAREDVAR:%.+]] = bitcast i32* [[A]] to i8*
+// CK1: store i8* [[SHAREDVAR]], i8** [[SHARGSTMP2]]
+// CK1: call void @llvm.nvvm.barrier0()
+// CK1: call void @llvm.nvvm.barrier0()
+// CK1: call void @__kmpc_end_sharing_variables()
+// CK1: store i32 100, i32* [[B]]
+// CK1: call void @__kmpc_kernel_prepare_parallel({{.*}}, i16 1)
+// CK1: call void @__kmpc_begin_sharing_variables(i8*** [[SHAREDARGS2]], i64 2)
+// CK1: [[SHARGSTMP3:%.+]] = load i8**, i8*** [[SHAREDARGS2]]
+// CK1: [[SHARGSTMP4:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP3]], i64 0
+// CK1: [[SHAREDVAR1:%.+]] = bitcast i32* [[B]] to i8*
+// CK1: store i8* [[SHAREDVAR1]], i8** [[SHARGSTMP4]]
+// CK1: [[SHARGSTMP12:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP3]], i64 1
+// CK1: [[SHAREDVAR2:%.+]] = bitcast i32* [[A]] to i8*
+// CK1: store i8* [[SHAREDVAR2]], i8** [[SHARGSTMP12]]
+// CK1: call void @llvm.nvvm.barrier0()
+// CK1: call void @llvm.nvvm.barrier0()
+// CK1: call void @__kmpc_end_sharing_variables()
+// CK1: call void @__kmpc_data_sharing_pop_stack(i8* [[GLOBALSTACK]])
+// CK1: call void @__kmpc_kernel_deinit(i16 1)
+
+/// ========= In the data sharing wrapper function ========= ///
+
+// CK1: {{.*}}define internal void @__omp_outlined{{.*}}wrapper({{.*}})
+// CK1: [[SHAREDARGS4:%.+]] = alloca i8**
+// CK1: call void @__kmpc_get_shared_variables(i8*** [[SHAREDARGS4]])
+// CK1: [[SHARGSTMP13:%.+]] = load i8**, i8*** [[SHAREDARGS4]]
+// CK1: [[SHARGSTMP14:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP13]], i64 0
+// CK1: [[SHARGSTMP15:%.+]] = bitcast i8** [[SHARGSTMP14]] to i32**
+// CK1: [[SHARGSTMP16:%.+]] = load i32*, i32** [[SHARGSTMP15]]
+// CK1: call void @__omp_outlined__{{.*}}({{.*}}, i32* [[SHARGSTMP16]])
+
+/// ========= In the data sharing wrapper function ========= ///
+
+// CK1: {{.*}}define internal void @__omp_outlined{{.*}}wrapper({{.*}})
+// CK1: [[SHAREDARGS3:%.+]] = alloca i8**
+// CK1: call void @__kmpc_get_shared_variables(i8*** [[SHAREDARGS3]])
+// CK1: [[SHARGSTMP5:%.+]] = load i8**, i8*** [[SHAREDARGS3]]
+// CK1: [[SHARGSTMP6:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP5]], i64 0
+// CK1: [[SHARGSTMP7:%.+]] = bitcast i8** [[SHARGSTMP6]] to i32**
+// CK1: [[SHARGSTMP8:%.+]] = load i32*, i32** [[SHARGSTMP7]]
+// CK1: [[SHARGSTMP9:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP5]], i64 1
+// CK1: [[SHARGSTMP10:%.+]] = bitcast i8** [[SHARGSTMP9]] to i32**
+// CK1: [[SHARGSTMP11:%.+]] = load i32*, i32** [[SHARGSTMP10]]
+// CK1: call void @__omp_outlined__{{.*}}({{.*}}, i32* [[SHARGSTMP8]], i32* [[SHARGSTMP11]])
+
+#endif
+
Index: cfe/trunk/test/OpenMP/nvptx_parallel_codegen.cpp
===================================================================
--- cfe/trunk/test/OpenMP/nvptx_parallel_codegen.cpp
+++ cfe/trunk/test/OpenMP/nvptx_parallel_codegen.cpp
@@ -64,254 +64,243 @@
 // CHECK-NOT: define {{.*}}void {{@__omp_offloading_.+template.+l17}}_worker()
-
-
-
-
-
-  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l26}}_worker()
-  // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
-  // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
-  // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
-  // CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]],
-  // CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
-  //
-  // CHECK: [[AWAIT_WORK]]
-  // CHECK: call void @llvm.nvvm.barrier0()
-  // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]
-  // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
-  // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
-  // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
-  // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null
-  // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
-  //
-  // CHECK: [[SEL_WORKERS]]
-  // CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]]
-  // CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0
-  // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
-  //
-  // CHECK: [[EXEC_PARALLEL]]
-  // CHECK: [[WF1:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
-  // CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i32*, i32*)* [[PARALLEL_FN1:@.+]] to i8*)
-  // CHECK: br i1 [[WM1]], label {{%?}}[[EXEC_PFN1:.+]], label {{%?}}[[CHECK_NEXT1:.+]]
-  //
-  // CHECK: [[EXEC_PFN1]]
-  // CHECK: call void [[PARALLEL_FN1]](
-  // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
-  //
-  // CHECK: [[CHECK_NEXT1]]
-  // CHECK: [[WF2:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
-  // CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i32*, i32*)* [[PARALLEL_FN2:@.+]] to i8*)
-  // CHECK: br i1 [[WM2]], label {{%?}}[[EXEC_PFN2:.+]], label {{%?}}[[CHECK_NEXT2:.+]]
-  //
-  // CHECK: [[EXEC_PFN2]]
-  // CHECK: call void [[PARALLEL_FN2]](
-  // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
-  //
-  // CHECK: [[CHECK_NEXT2]]
-  // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
-  //
-  // CHECK: [[TERM_PARALLEL]]
-  // CHECK: call void @__kmpc_kernel_end_parallel()
-  // CHECK: br label {{%?}}[[BAR_PARALLEL]]
-  //
-  // CHECK: [[BAR_PARALLEL]]
-  // CHECK: call void @llvm.nvvm.barrier0()
-  // CHECK: br label {{%?}}[[AWAIT_WORK]]
-  //
-  // CHECK: [[EXIT]]
-  // CHECK: ret void
-
-  // CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l26]](i[[SZ:32|64]]
-  // Create local storage for each capture.
-  // CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]],
-  // CHECK-DAG: store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]]
-  // Store captures in the context.
-  // CHECK-64-DAG:[[REF_A:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32*
-  //
-  // CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  // CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-  // CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-  // CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]]
-  // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]]
-  // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]]
-  //
-  // CHECK: [[WORKER]]
-  // CHECK: {{call|invoke}} void [[T6]]_worker()
-  // CHECK: br label {{%?}}[[EXIT:.+]]
-  //
-  // CHECK: [[CHECK_MASTER]]
-  // CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  // CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-  // CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-  // CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]],
-  // CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]]
-  //
-  // CHECK: [[MASTER]]
-  // CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-  // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-  // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
-  // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
-  // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN1]] to i8*),
-  // CHECK: call void @llvm.nvvm.barrier0()
-  // CHECK: call void @llvm.nvvm.barrier0()
-  // CHECK: call void @__kmpc_serialized_parallel(
-  // CHECK: {{call|invoke}} void [[PARALLEL_FN3:@.+]](
-  // CHECK: call void @__kmpc_end_serialized_parallel(
-  // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN2]] to i8*),
-  // CHECK: call void @llvm.nvvm.barrier0()
-  // CHECK: call void @llvm.nvvm.barrier0()
-  // CHECK-64-DAG: load i32, i32* [[REF_A]]
-  // CHECK-32-DAG: load i32, i32* [[LOCAL_A]]
-  // CHECK: br label {{%?}}[[TERMINATE:.+]]
-  //
-  // CHECK: [[TERMINATE]]
-  // CHECK: call void @__kmpc_kernel_deinit(
-  // CHECK: call void @llvm.nvvm.barrier0()
-  // CHECK: br label {{%?}}[[EXIT]]
-  //
-  // CHECK: [[EXIT]]
-  // CHECK: ret void
-
-  // CHECK-DAG: define internal void [[PARALLEL_FN1]](
-  // CHECK: [[A:%.+]] = alloca i[[SZ:32|64]],
-  // CHECK: store i[[SZ]] 42, i[[SZ]]* %a,
-  // CHECK: ret void
-
-  // CHECK-DAG: define internal void [[PARALLEL_FN3]](
-  // CHECK: [[A:%.+]] = alloca i[[SZ:32|64]],
-  // CHECK: store i[[SZ]] 43, i[[SZ]]* %a,
-  // CHECK: ret void
-
-  // CHECK-DAG: define internal void [[PARALLEL_FN2]](
-  // CHECK: [[A:%.+]] = alloca i[[SZ:32|64]],
-  // CHECK: store i[[SZ]] 44, i[[SZ]]* %a,
-  // CHECK: ret void
-
-
-
-
-
-
-
-  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l43}}_worker()
-  // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
-  // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
-  // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
-  // CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]],
-  // CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
-  //
-  // CHECK: [[AWAIT_WORK]]
-  // CHECK: call void @llvm.nvvm.barrier0()
-  // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]
-  // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
-  // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
-  // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
-  // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null
-  // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
-  //
-  // CHECK: [[SEL_WORKERS]]
-  // CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]]
-  // CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0
-  // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
-  //
-  // CHECK: [[EXEC_PARALLEL]]
-  // CHECK: [[WF:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
-  // CHECK: [[WM:%.+]] = icmp eq i8* [[WF]], bitcast (void (i32*, i32*)* [[PARALLEL_FN4:@.+]] to i8*)
-  // CHECK: br i1 [[WM]], label {{%?}}[[EXEC_PFN:.+]], label {{%?}}[[CHECK_NEXT:.+]]
-  //
-  // CHECK: [[EXEC_PFN]]
-  // CHECK: call void [[PARALLEL_FN4]](
-  // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
-  //
-  // CHECK: [[CHECK_NEXT]]
-  // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
-  //
-  // CHECK: [[TERM_PARALLEL]]
-  // CHECK: call void @__kmpc_kernel_end_parallel()
-  // CHECK: br label {{%?}}[[BAR_PARALLEL]]
-  //
-  // CHECK: [[BAR_PARALLEL]]
-  // CHECK: call void @llvm.nvvm.barrier0()
-  // CHECK: br label {{%?}}[[AWAIT_WORK]]
-  //
-  // CHECK: [[EXIT]]
-  // CHECK: ret void
-
-  // CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l43]](i[[SZ:32|64]]
-  // Create local storage for each capture.
-  // CHECK: [[LOCAL_N:%.+]] = alloca i[[SZ]],
-  // CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]],
-  // CHECK: [[LOCAL_AA:%.+]] = alloca i[[SZ]],
-  // CHECK: [[LOCAL_B:%.+]] = alloca [10 x i32]*
-  // CHECK-DAG: store i[[SZ]] [[ARG_N:%.+]], i[[SZ]]* [[LOCAL_N]]
-  // CHECK-DAG: store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]]
-  // CHECK-DAG: store i[[SZ]] [[ARG_AA:%.+]], i[[SZ]]* [[LOCAL_AA]]
-  // CHECK-DAG: store [10 x i32]* [[ARG_B:%.+]], [10 x i32]** [[LOCAL_B]]
-  // Store captures in the context.
-  // CHECK-64-DAG:[[REF_N:%.+]] = bitcast i[[SZ]]* [[LOCAL_N]] to i32*
-  // CHECK-64-DAG:[[REF_A:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32*
-  // CHECK-DAG: [[REF_AA:%.+]] = bitcast i[[SZ]]* [[LOCAL_AA]] to i16*
-  // CHECK-DAG: [[REF_B:%.+]] = load [10 x i32]*, [10 x i32]** [[LOCAL_B]],
-  //
-  // CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  // CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-  // CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-  // CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]]
-  // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]]
-  // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]]
-  //
-  // CHECK: [[WORKER]]
-  // CHECK: {{call|invoke}} void [[T6]]_worker()
-  // CHECK: br label {{%?}}[[EXIT:.+]]
-  //
-  // CHECK: [[CHECK_MASTER]]
-  // CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  // CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-  // CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-  // CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]],
-  // CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]]
-  //
-  // CHECK: [[MASTER]]
-  // CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-  // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-  // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
-  // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
-  // CHECK-64: [[N:%.+]] = load i32, i32* [[REF_N]],
-  // CHECK-32: [[N:%.+]] = load i32, i32* [[LOCAL_N]],
-  // CHECK: [[CMP:%.+]] = icmp sgt i32 [[N]], 1000
-  // CHECK: br i1 [[CMP]], label {{%?}}[[IF_THEN:.+]], label {{%?}}[[IF_ELSE:.+]]
-  //
-  // CHECK: [[IF_THEN]]
-  // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN4]] to i8*),
-  // CHECK: call void @llvm.nvvm.barrier0()
-  // CHECK: call void @llvm.nvvm.barrier0()
-  // CHECK: br label {{%?}}[[IF_END:.+]]
-  //
-  // CHECK: [[IF_ELSE]]
-  // CHECK: call void @__kmpc_serialized_parallel(
-  // CHECK: {{call|invoke}} void [[PARALLEL_FN4]](
-  // CHECK: call void @__kmpc_end_serialized_parallel(
-  // br label [[IF_END]]
-  //
-  // CHECK: [[IF_END]]
-  // CHECK-64-DAG: load i32, i32* [[REF_A]]
-  // CHECK-32-DAG: load i32, i32* [[LOCAL_A]]
-  // CHECK-DAG: load i16, i16* [[REF_AA]]
-  // CHECK-DAG: getelementptr inbounds [10 x i32], [10 x i32]* [[REF_B]], i[[SZ]] 0, i[[SZ]] 2
-  //
-  // CHECK: br label {{%?}}[[TERMINATE:.+]]
-  //
-  // CHECK: [[TERMINATE]]
-  // CHECK: call void @__kmpc_kernel_deinit(
-  // CHECK: call void @llvm.nvvm.barrier0()
-  // CHECK: br label {{%?}}[[EXIT]]
-  //
-  // CHECK: [[EXIT]]
-  // CHECK: ret void
-
-  // CHECK: define internal void [[PARALLEL_FN4]](
-  // CHECK: [[A:%.+]] = alloca i[[SZ:32|64]],
-  // CHECK: store i[[SZ]] 45, i[[SZ]]* %a,
-  // CHECK: ret void
+// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l26}}_worker()
+// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
+// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
+// CHECK: store i8* null, i8** [[OMP_WORK_FN]],
+// CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]],
+// CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
+//
+// CHECK: [[AWAIT_WORK]]
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]
+// CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
+// store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
+// CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
+// CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null
+// CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
+//
+// CHECK: [[SEL_WORKERS]]
+// CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]]
+// CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0
+// CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
+//
+// CHECK: [[EXEC_PARALLEL]]
+// CHECK: [[WF1:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
+// CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i16, i32)* [[PARALLEL_FN1:@.+]]_wrapper to i8*)
+// CHECK: br i1 [[WM1]], label {{%?}}[[EXEC_PFN1:.+]], label {{%?}}[[CHECK_NEXT1:.+]]
+//
+// CHECK: [[EXEC_PFN1]]
+// CHECK: call void [[PARALLEL_FN1]]_wrapper(
+// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
+//
+// CHECK: [[CHECK_NEXT1]]
+// CHECK: [[WF2:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
+// CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i16, i32)* [[PARALLEL_FN2:@.+]]_wrapper to i8*)
+// CHECK: br i1 [[WM2]], label {{%?}}[[EXEC_PFN2:.+]], label {{%?}}[[CHECK_NEXT2:.+]]
+//
+// CHECK: [[EXEC_PFN2]]
+// CHECK: call void [[PARALLEL_FN2]]_wrapper(
+// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
+//
+// CHECK: [[CHECK_NEXT2]]
+// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
+//
+// CHECK: [[TERM_PARALLEL]]
+// CHECK: call void @__kmpc_kernel_end_parallel()
+// CHECK: br label {{%?}}[[BAR_PARALLEL]]
+//
+// CHECK: [[BAR_PARALLEL]]
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: br label {{%?}}[[AWAIT_WORK]]
+//
+// CHECK: [[EXIT]]
+// CHECK: ret void
+
+// CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l26]](i[[SZ:32|64]]
+// Create local storage for each capture.
+// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]],
+// CHECK-DAG: store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]]
+// Store captures in the context.
+// CHECK-64-DAG:[[REF_A:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32*
+//
+// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]]
+// CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]]
+// CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]]
+//
+// CHECK: [[WORKER]]
+// CHECK: {{call|invoke}} void [[T6]]_worker()
+// CHECK: br label {{%?}}[[EXIT:.+]]
+//
+// CHECK: [[CHECK_MASTER]]
+// CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]],
+// CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]]
+//
+// CHECK: [[MASTER]]
+// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
+// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
+// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* [[PARALLEL_FN1]]_wrapper to i8*),
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: call void @__kmpc_serialized_parallel(
+// CHECK: {{call|invoke}} void [[PARALLEL_FN3:@.+]](
+// CHECK: call void @__kmpc_end_serialized_parallel(
+// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* [[PARALLEL_FN2]]_wrapper to i8*),
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK-64-DAG: load i32, i32* [[REF_A]]
+// CHECK-32-DAG: load i32, i32* [[LOCAL_A]]
+// CHECK: br label {{%?}}[[TERMINATE:.+]]
+//
+// CHECK: [[TERMINATE]]
+// CHECK: call void @__kmpc_kernel_deinit(
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: br label {{%?}}[[EXIT]]
+//
+// CHECK: [[EXIT]]
+// CHECK: ret void
+
+// CHECK-DAG: define internal void [[PARALLEL_FN1]](
+// CHECK: [[A:%.+]] = alloca i[[SZ:32|64]],
+// CHECK: store i[[SZ]] 42, i[[SZ]]* %a,
+// CHECK: ret void
+
+// CHECK-DAG: define internal void [[PARALLEL_FN3]](
+// CHECK: [[A:%.+]] = alloca i[[SZ:32|64]],
+// CHECK: store i[[SZ]] 43, i[[SZ]]* %a,
+// CHECK: ret void
+
+// CHECK-DAG: define internal void [[PARALLEL_FN2]](
+// CHECK: [[A:%.+]] = alloca i[[SZ:32|64]],
+// CHECK: store i[[SZ]] 44, i[[SZ]]* %a,
+// CHECK: ret void
+
+// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l43}}_worker()
+// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
+// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
+// CHECK: store i8* null, i8** [[OMP_WORK_FN]],
+// CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]],
+// CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
+//
+// CHECK: [[AWAIT_WORK]]
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]],
+// CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
+// store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
+// CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
+// CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null
+// CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
+//
+// CHECK: [[SEL_WORKERS]]
+// CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]]
+// CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0
+// CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
+//
+// CHECK: [[EXEC_PARALLEL]]
+// CHECK: [[WF:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
+// CHECK: [[WM:%.+]] = icmp eq i8* [[WF]], bitcast (void (i16, i32)* [[PARALLEL_FN4:@.+]]_wrapper to i8*)
+// CHECK: br i1 [[WM]], label {{%?}}[[EXEC_PFN:.+]], label {{%?}}[[CHECK_NEXT:.+]]
+//
+// CHECK: [[EXEC_PFN]]
+// CHECK: call void [[PARALLEL_FN4]]_wrapper(
+// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
+//
+// CHECK: [[CHECK_NEXT]]
+// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
+//
+// CHECK: [[TERM_PARALLEL]]
+// CHECK: call void @__kmpc_kernel_end_parallel()
+// CHECK: br label {{%?}}[[BAR_PARALLEL]]
+//
+// CHECK: [[BAR_PARALLEL]]
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: br label {{%?}}[[AWAIT_WORK]]
+//
+// CHECK: [[EXIT]]
+// CHECK: ret void
+
+// CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l43]](i[[SZ:32|64]]
+// Create local storage for each capture.
+// CHECK: [[LOCAL_N:%.+]] = alloca i[[SZ]],
+// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]],
+// CHECK: [[LOCAL_AA:%.+]] = alloca i[[SZ]],
+// CHECK: [[LOCAL_B:%.+]] = alloca [10 x i32]*
+// CHECK-DAG: store i[[SZ]] [[ARG_N:%.+]], i[[SZ]]* [[LOCAL_N]]
+// CHECK-DAG: store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]]
+// CHECK-DAG: store i[[SZ]] [[ARG_AA:%.+]], i[[SZ]]* [[LOCAL_AA]]
+// CHECK-DAG: store [10 x i32]* [[ARG_B:%.+]], [10 x i32]** [[LOCAL_B]]
+// Store captures in the context.
+// CHECK-64-DAG:[[REF_N:%.+]] = bitcast i[[SZ]]* [[LOCAL_N]] to i32*
+// CHECK-64-DAG:[[REF_A:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32*
+// CHECK-DAG: [[REF_AA:%.+]] = bitcast i[[SZ]]* [[LOCAL_AA]] to i16*
+// CHECK-DAG: [[REF_B:%.+]] = load [10 x i32]*, [10 x i32]** [[LOCAL_B]],
+//
+// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]]
+// CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]]
+// CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]]
+//
+// CHECK: [[WORKER]]
+// CHECK: {{call|invoke}} void [[T6]]_worker()
+// CHECK: br label {{%?}}[[EXIT:.+]]
+//
+// CHECK: [[CHECK_MASTER]]
+// CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]],
+// CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]]
+//
+// CHECK: [[MASTER]]
+// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
+// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
+// CHECK-64: [[N:%.+]] = load i32, i32* [[REF_N]],
+// CHECK-32: [[N:%.+]] = load i32, i32* [[LOCAL_N]],
+// CHECK: [[CMP:%.+]] = icmp sgt i32 [[N]], 1000
+// CHECK: br i1 [[CMP]], label {{%?}}[[IF_THEN:.+]], label {{%?}}[[IF_ELSE:.+]]
+//
+// CHECK: [[IF_THEN]]
+// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* [[PARALLEL_FN4]]_wrapper to i8*),
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: br label {{%?}}[[IF_END:.+]]
+//
+// CHECK: [[IF_ELSE]]
+// CHECK: call void @__kmpc_serialized_parallel(
+// CHECK: {{call|invoke}} void [[PARALLEL_FN4]](
+// CHECK: call void @__kmpc_end_serialized_parallel(
+// br label [[IF_END]]
+//
+// CHECK: [[IF_END]]
+// CHECK-64-DAG: load i32, i32* [[REF_A]]
+// CHECK-32-DAG: load i32, i32* [[LOCAL_A]]
+// CHECK-DAG: load i16, i16* [[REF_AA]]
+// CHECK-DAG: getelementptr inbounds [10 x i32], [10 x i32]* [[REF_B]], i[[SZ]] 0, i[[SZ]] 2
+//
+// CHECK: br label {{%?}}[[TERMINATE:.+]]
+//
+// CHECK: [[TERMINATE]]
+// CHECK: call void @__kmpc_kernel_deinit(
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: br label {{%?}}[[EXIT]]
+//
+// CHECK: [[EXIT]]
+// CHECK: ret void
+
+// CHECK: define internal void [[PARALLEL_FN4]](
+// CHECK: [[A:%.+]] = alloca i[[SZ:32|64]],
+// CHECK: store i[[SZ]] 45, i[[SZ]]* %a,
+// CHECK: ret void
 #endif
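For readers tracing the generated code, the wrapper emitted by createParallelDataSharingWrapper behaves roughly like the hand-written C++ analogue below. This is an illustrative sketch only, not part of the patch: the symbol names (__omp_outlined__ and its _wrapper suffix) and the argument layout follow the CHECK lines in nvptx_data_sharing.cpp above, and the null thread-ID arguments mirror the patch's TODO about SIMD support.

// Hypothetical C++ analogue of the generated wrapper for
//   #pragma omp parallel { a = 1000; }
extern "C" void __kmpc_get_shared_variables(void ***GlobalArgs);

// Compiler-generated outlined parallel body (declaration only).
extern "C" void __omp_outlined__(int *ThreadID, int *BoundID, int *a);

extern "C" void __omp_outlined___wrapper(short /*ParallelLevel*/,
                                         int /*SourceThreadID*/) {
  // Fetch the argument list the master published with
  // __kmpc_begin_sharing_variables().
  void **SharedArgs;
  __kmpc_get_shared_variables(&SharedArgs);
  // Entry 0 holds the address of the master's globalized 'a'.
  int *a = static_cast<int *>(SharedArgs[0]);
  // SIMD is not supported yet, so the thread-ID slots are null.
  __omp_outlined__(/*ThreadID=*/nullptr, /*BoundID=*/nullptr, a);
}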