Index: lib/CodeGen/CGOpenMPRuntime.h
===================================================================
--- lib/CodeGen/CGOpenMPRuntime.h
+++ lib/CodeGen/CGOpenMPRuntime.h
@@ -130,6 +130,30 @@
        bool IsOffloadEntry, const RegionCodeGenTy &CodeGen);
 
+  /// \brief Emits code for OpenMP 'if' clause using the specified \a ThenGen
+  /// and \a ElseGen functions. Here is the logic:
+  /// if (Cond) {
+  ///   ThenGen();
+  /// } else {
+  ///   ElseGen();
+  /// }
+  void emitOMPIfClause(CodeGenFunction &CGF, const Expr *Cond,
+                       const RegionCodeGenTy &ThenGen,
+                       const RegionCodeGenTy &ElseGen);
+
+  /// \brief Emits object of ident_t type with info for source location.
+  /// \param Flags Flags for OpenMP location.
+  ///
+  llvm::Value *emitUpdateLocation(CodeGenFunction &CGF, SourceLocation Loc,
+                                  unsigned Flags = 0);
+
+  /// \brief Returns pointer to ident_t type.
+  llvm::Type *getIdentTyPointerTy();
+
+  /// \brief Gets thread id value for the current thread.
+  ///
+  virtual llvm::Value *getThreadID(CodeGenFunction &CGF, SourceLocation Loc);
+
 private:
   /// \brief Default const ident_t object used for initialization of all other
   /// ident_t objects.
@@ -380,15 +404,6 @@
   /// \brief Build type kmp_routine_entry_t (if not built yet).
   void emitKmpRoutineEntryT(QualType KmpInt32Ty);
 
-  /// \brief Emits object of ident_t type with info for source location.
-  /// \param Flags Flags for OpenMP location.
-  ///
-  llvm::Value *emitUpdateLocation(CodeGenFunction &CGF, SourceLocation Loc,
-                                  unsigned Flags = 0);
-
-  /// \brief Returns pointer to ident_t type.
-  llvm::Type *getIdentTyPointerTy();
-
   /// \brief Returns pointer to kmpc_micro type.
   llvm::Type *getKmpc_MicroPointerTy();
 
@@ -424,10 +439,6 @@
   /// stored.
   virtual Address emitThreadIDAddress(CodeGenFunction &CGF, SourceLocation Loc);
 
-  /// \brief Gets thread id value for the current thread.
-  ///
-  llvm::Value *getThreadID(CodeGenFunction &CGF, SourceLocation Loc);
-
   /// \brief Gets (if variable with the given name already exist) or creates
   /// internal global variable with the specified Name. The created variable has
   /// linkage CommonLinkage by default and is initialized by null value.
Index: lib/CodeGen/CGOpenMPRuntime.cpp
===================================================================
--- lib/CodeGen/CGOpenMPRuntime.cpp
+++ lib/CodeGen/CGOpenMPRuntime.cpp
@@ -111,7 +111,7 @@
   const VarDecl *getThreadIDVariable() const override { return ThreadIDVar; }
 
   /// \brief Get the name of the capture helper.
-  StringRef getHelperName() const override { return ".omp_outlined."; }
+  StringRef getHelperName() const override { return "__omp_outlined__"; }
 
   static bool classof(const CGCapturedStmtInfo *Info) {
     return CGOpenMPRegionInfo::classof(Info) &&
@@ -1892,9 +1892,9 @@
 /// } else {
 ///   ElseGen();
 /// }
-static void emitOMPIfClause(CodeGenFunction &CGF, const Expr *Cond,
-                            const RegionCodeGenTy &ThenGen,
-                            const RegionCodeGenTy &ElseGen) {
+void CGOpenMPRuntime::emitOMPIfClause(CodeGenFunction &CGF, const Expr *Cond,
+                                      const RegionCodeGenTy &ThenGen,
+                                      const RegionCodeGenTy &ElseGen) {
   CodeGenFunction::LexicalScope ConditionScope(CGF, Cond->getSourceRange());
   // If the condition constant folds and can be elided, try to avoid emitting
Index: lib/CodeGen/CGOpenMPRuntimeNVPTX.h
===================================================================
--- lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -25,6 +25,9 @@
 class CGOpenMPRuntimeNVPTX : public CGOpenMPRuntime {
 private:
+  // Outlined parallel functions for the workers to execute.
+  llvm::SmallVector<llvm::Function *, 16> Work;
+
   struct EntryFunctionState {
     llvm::BasicBlock *ExitBB = nullptr;
   };
@@ -70,6 +73,10 @@
   void createOffloadEntry(llvm::Constant *ID, llvm::Constant *Addr,
                           uint64_t Size) override;
 
+  /// \brief Gets thread id value for the current thread.
+  ///
+  llvm::Value *getThreadID(CodeGenFunction &CGF, SourceLocation Loc) override;
+
   /// \brief Emit outlined function specialized for the Fork-Join
   /// programming model for applicable target directives on the NVPTX device.
   /// \param D Directive to emit.
@@ -100,6 +107,21 @@
       bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) override;
 
+  /// \brief Emits code for parallel or serial call of the \a OutlinedFn with
+  /// variables captured in a record whose address is stored in \a
+  /// CapturedVars.
+  /// This call is for the Generic Execution Mode.
+  /// \param OutlinedFn Outlined function to be run in parallel threads. Type of
+  /// this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*).
+  /// \param CapturedVars A pointer to the record with the references to
+  /// variables used in \a OutlinedFn function.
+  /// \param IfCond Condition in the associated 'if' clause, if it was
+  /// specified, nullptr otherwise.
+  void emitGenericParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
+                               llvm::Value *OutlinedFn,
+                               ArrayRef<llvm::Value *> CapturedVars,
+                               const Expr *IfCond);
+
 public:
   explicit CGOpenMPRuntimeNVPTX(CodeGenModule &CGM);
@@ -137,6 +159,20 @@
   void emitTeamsCall(CodeGenFunction &CGF, const OMPExecutableDirective &D,
                      SourceLocation Loc, llvm::Value *OutlinedFn,
                      ArrayRef<llvm::Value *> CapturedVars) override;
+
+  /// \brief Emits code for parallel or serial call of the \a OutlinedFn with
+  /// variables captured in a record whose address is stored in \a
+  /// CapturedVars.
+  /// \param OutlinedFn Outlined function to be run in parallel threads. Type of
+  /// this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*).
+  /// \param CapturedVars A pointer to the record with the references to
+  /// variables used in \a OutlinedFn function.
+  /// \param IfCond Condition in the associated 'if' clause, if it was
+  /// specified, nullptr otherwise.
+  void emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
+                        llvm::Value *OutlinedFn,
+                        ArrayRef<llvm::Value *> CapturedVars,
+                        const Expr *IfCond) override;
 };
 
 } // CodeGen namespace.
Index: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
===================================================================
--- lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -26,6 +26,19 @@
   OMPRTL_NVPTX__kmpc_kernel_init,
   /// \brief Call to void __kmpc_kernel_deinit();
   OMPRTL_NVPTX__kmpc_kernel_deinit,
+  /// \brief Call to void __kmpc_kernel_prepare_parallel(void
+  /// *outlined_function);
+  OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
+  /// \brief Call to bool __kmpc_kernel_parallel(void **outlined_function);
+  OMPRTL_NVPTX__kmpc_kernel_parallel,
+  /// \brief Call to void __kmpc_kernel_end_parallel();
+  OMPRTL_NVPTX__kmpc_kernel_end_parallel,
+  /// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
+  /// global_tid);
+  OMPRTL_NVPTX__kmpc_serialized_parallel,
+  /// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
+  /// global_tid);
+  OMPRTL_NVPTX__kmpc_end_serialized_parallel,
 };
 } // namespace
@@ -93,6 +106,46 @@
                        Bld.CreateNot(Mask), "master_tid");
 }
 
+/// Get the id of the current block on the GPU.
+static llvm::Value *getNVPTXBlockID(CodeGenFunction &CGF) {
+  CGBuilderTy &Bld = CGF.Builder;
+  return Bld.CreateCall(
+      llvm::Intrinsic::getDeclaration(
+          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x),
+      llvm::None, "nvptx_block_id");
+}
+
+/// Get number of OMP workers for parallel region after subtracting
+/// the master warp.
+static llvm::Value *getNumWorkers(CodeGenFunction &CGF) {
+  CGBuilderTy &Bld = CGF.Builder;
+  return Bld.CreateNUWSub(getNVPTXNumThreads(CGF), Bld.getInt32(32),
+                          "num_workers");
+}
+
+/// Get thread id in team.
+/// FIXME: Remove the expensive remainder operation.
+static llvm::Value *getTeamThreadId(CodeGenFunction &CGF) {
+  CGBuilderTy &Bld = CGF.Builder;
+  // N % M = N & (M-1) if M is a power of 2. The master thread id is expected
+  // to be a power of two in all cases.
+  auto *Mask = Bld.CreateNUWSub(getMasterThreadID(CGF), Bld.getInt32(1));
+  return Bld.CreateAnd(getNVPTXThreadID(CGF), Mask, "team_tid");
+}
+
+/// Get global thread id.
+static llvm::Value *getGlobalThreadId(CodeGenFunction &CGF) {
+  CGBuilderTy &Bld = CGF.Builder;
+  return Bld.CreateAdd(Bld.CreateMul(getNVPTXBlockID(CGF), getNumWorkers(CGF)),
+                       getTeamThreadId(CGF), "global_tid");
+}
+
+llvm::Value *CGOpenMPRuntimeNVPTX::getThreadID(CodeGenFunction &CGF,
+                                               SourceLocation Loc) {
+  assert(CGF.CurFn && "No function in current CodeGenFunction.");
+  return getGlobalThreadId(CGF);
+}
+
 CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
     CodeGenModule &CGM)
     : WorkerFn(nullptr), CGFI(nullptr) {
@@ -118,6 +171,7 @@
     const RegionCodeGenTy &CodeGen) {
   EntryFunctionState EST;
   WorkerFunctionState WST(CGM);
+  Work.clear();
 
   // Emit target region as a standalone region.
   class NVPTXPrePostActionTy : public PrePostActionTy {
@@ -246,7 +300,10 @@
   CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
   CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));
 
-  // TODO: Call into runtime to get parallel work.
+  llvm::Value *Args[] = {WorkFn.getPointer()};
+  llvm::Value *Ret = CGF.EmitRuntimeCall(
+      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args);
+  Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);
 
   // On termination condition (workid == 0), exit loop.
   llvm::Value *ShouldTerminate =
@@ -261,10 +318,44 @@
 
   // Signal start of parallel region.
   CGF.EmitBlock(ExecuteBB);
-
-  // TODO: Add parallel work.
+
+  // Process work items: outlined parallel functions.
+  for (auto *W : Work) {
+    // Try to match this outlined function.
+    auto ID = Bld.CreatePtrToInt(W, CGM.Int64Ty);
+    ID = Bld.CreateIntToPtr(ID, CGM.Int8PtrTy);
+    llvm::Value *WorkFnMatch =
+        Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");
+
+    llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
+    llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
+    Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);
+
+    // Execute this outlined function.
+    CGF.EmitBlock(ExecuteFNBB);
+
+    // Insert call to work function.
+    // FIXME: Pass arguments to outlined function from master thread.
+    auto Fn = cast<llvm::Function>(W);
+    Address ZeroAddr =
+        CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, /*Name=*/".zero.addr");
+    CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C=*/0));
+    llvm::SmallVector<llvm::Value *, 2> FnArgs;
+    FnArgs.push_back(ZeroAddr.getPointer());
+    FnArgs.push_back(ZeroAddr.getPointer());
+    CGF.EmitCallOrInvoke(Fn, FnArgs);
+
+    // Go to end of parallel region.
+    CGF.EmitBranch(TerminateBB);
+
+    CGF.EmitBlock(CheckNextBB);
+  }
 
   // Signal end of parallel region.
   CGF.EmitBlock(TerminateBB);
+  CGF.EmitRuntimeCall(
+      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel),
+      ArrayRef<llvm::Value *>());
   CGF.EmitBranch(BarrierBB);
 
   // All active and inactive workers wait at a barrier after parallel region.
@@ -300,6 +391,49 @@
     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
     break;
   }
+  case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
+    /// Build void __kmpc_kernel_prepare_parallel(
+    /// void *outlined_function);
+    llvm::Type *TypeParams[] = {CGM.Int8PtrTy};
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
+    break;
+  }
+  case OMPRTL_NVPTX__kmpc_kernel_parallel: {
+    /// Build bool __kmpc_kernel_parallel(void **outlined_function);
+    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy};
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(llvm::Type::getInt1Ty(CGM.getLLVMContext()),
+                                TypeParams, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel");
+    break;
+  }
+  case OMPRTL_NVPTX__kmpc_kernel_end_parallel: {
+    /// Build void __kmpc_kernel_end_parallel();
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, {}, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel");
+    break;
+  }
+  case OMPRTL_NVPTX__kmpc_serialized_parallel: {
+    // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
+    // global_tid);
+    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel");
+    break;
+  }
+  case OMPRTL_NVPTX__kmpc_end_serialized_parallel: {
+    // Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
+    // global_tid);
+    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel");
+    break;
+  }
   }
   return RTLFn;
 }
@@ -354,19 +488,8 @@
     const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
     OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
-  llvm::Function *OutlinedFun = nullptr;
-  if (isa<OMPParallelDirective>(D)) {
-    llvm::Value *OutlinedFunVal =
-        CGOpenMPRuntime::emitParallelOrTeamsOutlinedFunction(
-            D, ThreadIDVar, InnermostKind, CodeGen);
-    OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
-    OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
-    OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
-  } else
-    llvm_unreachable("parallel directive is not yet supported for nvptx "
-                     "backend.");
-
-  return OutlinedFun;
+  return CGOpenMPRuntime::emitParallelOrTeamsOutlinedFunction(
+      D, ThreadIDVar, InnermostKind, CodeGen);
 }
 
 void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
@@ -387,3 +510,70 @@
   OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
   CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs);
 }
+
+void CGOpenMPRuntimeNVPTX::emitParallelCall(
+    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
+    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
+  if (!CGF.HaveInsertPoint())
+    return;
+
+  emitGenericParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
+}
+
+void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
+    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
+    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
+  llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
+
+  auto *RTLoc = emitUpdateLocation(CGF, Loc);
+  auto &&L0ParallelGen = [this, Fn, &CapturedVars](CodeGenFunction &CGF,
+                                                   PrePostActionTy &) {
+    CGBuilderTy &Bld = CGF.Builder;
+
+    // Prepare for parallel region. Indicate the outlined function.
+    llvm::Value *Args[] = {Bld.CreateBitOrPointerCast(Fn, CGM.Int8PtrTy)};
+    CGF.EmitRuntimeCall(
+        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
+        Args);
+
+    // Activate workers.
+    syncCTAThreads(CGF);
+
+    // Barrier at end of parallel region.
+    syncCTAThreads(CGF);
+
+    // Remember for post-processing in worker loop.
+    Work.push_back(Fn);
+  };
+
+  auto &&SeqGen = [this, Fn, &CapturedVars, &RTLoc, &Loc](CodeGenFunction &CGF,
+                                                          PrePostActionTy &) {
+    auto ThreadID = getThreadID(CGF, Loc);
+
+    llvm::Value *Args[] = {RTLoc, ThreadID};
+    CGF.EmitRuntimeCall(
+        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
+        Args);
+
+    llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
+    OutlinedFnArgs.push_back(
+        llvm::Constant::getNullValue(CGM.Int32Ty->getPointerTo()));
+    OutlinedFnArgs.push_back(
+        llvm::Constant::getNullValue(CGM.Int32Ty->getPointerTo()));
+    OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
+    CGF.EmitCallOrInvoke(Fn, OutlinedFnArgs);
+
+    llvm::Value *EndArgs[] = {emitUpdateLocation(CGF, Loc), ThreadID};
+    CGF.EmitRuntimeCall(
+        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
+        EndArgs);
+  };
+
+  if (IfCond) {
+    emitOMPIfClause(CGF, IfCond, L0ParallelGen, SeqGen);
+  } else {
+    CodeGenFunction::RunCleanupsScope Scope(CGF);
+    RegionCodeGenTy ThenRCG(L0ParallelGen);
+    ThenRCG(CGF);
+  }
+}
Index: test/OpenMP/nvptx_parallel_codegen.cpp
===================================================================
--- /dev/null
+++ test/OpenMP/nvptx_parallel_codegen.cpp
@@ -0,0 +1,323 @@
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+template <typename tx>
+tx ftemplate(int n) {
+  tx a = 0;
+  short aa = 0;
+  tx b[10];
+
+  #pragma omp target if(0)
+  {
+    #pragma omp parallel
+    {
+      int a = 41;
+    }
+    a += 1;
+  }
+
+  #pragma omp target
+  {
+    #pragma omp parallel
+    {
+      int a = 42;
+    }
+    #pragma omp parallel if(0)
+    {
+      int a = 43;
+    }
+    #pragma omp parallel if(1)
+    {
+      int a = 44;
+    }
+    a += 1;
+  }
+
+  #pragma omp target if(n>40)
+  {
+    #pragma omp parallel if(n>1000)
+    {
+      int a = 45;
+    }
+    a += 1;
+    aa += 1;
+    b[2] += 1;
+  }
+
+  return a;
+}
+
+int bar(int n){
+  int a = 0;
+
+  a += ftemplate<int>(n);
+
+  return a;
+}
+
+// CHECK-NOT: define {{.*}}void {{@__omp_offloading_.+template.+l17}}_worker()
+
+
+
+
+
+// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l26}}_worker()
+// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
+// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
+// CHECK: store i8* null, i8** [[OMP_WORK_FN]],
+// CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]],
+// CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
+//
+// CHECK: [[AWAIT_WORK]]
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]])
+// CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
+// CHECK: store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
+// CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
+// CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null
+// CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
+//
+// CHECK: [[SEL_WORKERS]]
+// CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]]
+// CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0
+// CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
+//
+// CHECK: [[EXEC_PARALLEL]]
+// CHECK: [[WF1:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
+// CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i32*, i32*)* [[PARALLEL_FN1:@.+]] to i8*)
+// CHECK: br i1 [[WM1]], label {{%?}}[[EXEC_PFN1:.+]], label {{%?}}[[CHECK_NEXT1:.+]]
+//
+// CHECK: [[EXEC_PFN1]]
+// CHECK: call void [[PARALLEL_FN1]](
+// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
+//
+// CHECK: [[CHECK_NEXT1]]
+// CHECK: [[WF2:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
+// CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i32*, i32*)* [[PARALLEL_FN2:@.+]] to i8*)
+// CHECK: br i1 [[WM2]], label {{%?}}[[EXEC_PFN2:.+]], label {{%?}}[[CHECK_NEXT2:.+]]
+//
+// CHECK: [[EXEC_PFN2]]
+// CHECK: call void [[PARALLEL_FN2]](
+// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
+//
+// CHECK: [[CHECK_NEXT2]]
+// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
+//
+// CHECK: [[TERM_PARALLEL]]
+// CHECK: call void @__kmpc_kernel_end_parallel()
+// CHECK: br label {{%?}}[[BAR_PARALLEL]]
+//
+// CHECK: [[BAR_PARALLEL]]
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: br label {{%?}}[[AWAIT_WORK]]
+//
+// CHECK: [[EXIT]]
+// CHECK: ret void
+
+// CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l26]](i[[SZ:32|64]]
+// Create local storage for each capture.
+// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]],
+// CHECK-DAG: store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]]
+// Store captures in the context.
+// CHECK-64-DAG:[[REF_A:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32*
+//
+// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]]
+// CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]]
+// CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]]
+//
+// CHECK: [[WORKER]]
+// CHECK: {{call|invoke}} void [[T6]]_worker()
+// CHECK: br label {{%?}}[[EXIT]]
+//
+// CHECK: [[CHECK_MASTER]]
+// CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK: [[CMTMP1:%.+]] = sub i32 [[CMWS]], 1
+// CHECK: [[CMTMP2:%.+]] = sub i32 [[CMNTH]], 1
+// CHECK: [[MID:%.+]] = and i32 [[CMTMP2]],
+// CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]], [[MID]]
+// CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]]
+//
+// CHECK: [[MASTER]]
+// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
+// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
+// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN1]] to i8*))
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: call void @__kmpc_serialized_parallel(
+// CHECK: call void [[PARALLEL_FN3:@.+]](
+// CHECK: call void @__kmpc_end_serialized_parallel(
+// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN2]] to i8*))
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK-64-DAG: load i32, i32* [[REF_A]]
+// CHECK-32-DAG: load i32, i32* [[LOCAL_A]]
+// CHECK: br label {{%?}}[[TERMINATE:.+]]
+//
+// CHECK: [[TERMINATE]]
+// CHECK: call void @__kmpc_kernel_deinit()
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: br label {{%?}}[[EXIT]]
+//
+// CHECK: [[EXIT]]
+// CHECK: ret void
+
+// CHECK-DAG: define internal void [[PARALLEL_FN1]](
+// CHECK: [[A:%.+]] = alloca i[[SZ:32|64]],
+// CHECK: store i[[SZ]] 42, i[[SZ]]* %a,
+// CHECK: ret void
+
+// CHECK-DAG: define internal void [[PARALLEL_FN3]](
+// CHECK: [[A:%.+]] = alloca i[[SZ:32|64]],
+// CHECK: store i[[SZ]] 43, i[[SZ]]* %a,
+// CHECK: ret void
+
+// CHECK-DAG: define internal void [[PARALLEL_FN2]](
+// CHECK: [[A:%.+]] = alloca i[[SZ:32|64]],
+// CHECK: store i[[SZ]] 44, i[[SZ]]* %a,
+// CHECK: ret void
+
+
+
+
+
+
+
+// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l43}}_worker()
+// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
+// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
+// CHECK: store i8* null, i8** [[OMP_WORK_FN]],
+// CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]],
+// CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
+//
+// CHECK: [[AWAIT_WORK]]
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]])
+// CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
+// CHECK: store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
+// CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
+// CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null
+// CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
+//
+// CHECK: [[SEL_WORKERS]]
+// CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]]
+// CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0
+// CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
+//
+// CHECK: [[EXEC_PARALLEL]]
+// CHECK: [[WF:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
+// CHECK: [[WM:%.+]] = icmp eq i8* [[WF]], bitcast (void (i32*, i32*)* [[PARALLEL_FN4:@.+]] to i8*)
+// CHECK: br i1 [[WM]], label {{%?}}[[EXEC_PFN:.+]], label {{%?}}[[CHECK_NEXT:.+]]
+//
+// CHECK: [[EXEC_PFN]]
+// CHECK: call void [[PARALLEL_FN4]](
+// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
+//
+// CHECK: [[CHECK_NEXT]]
+// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
+//
+// CHECK: [[TERM_PARALLEL]]
+// CHECK: call void @__kmpc_kernel_end_parallel()
+// CHECK: br label {{%?}}[[BAR_PARALLEL]]
+//
+// CHECK: [[BAR_PARALLEL]]
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: br label {{%?}}[[AWAIT_WORK]]
+//
+// CHECK: [[EXIT]]
+// CHECK: ret void
+
+// CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l43]](i[[SZ:32|64]]
+// Create local storage for each capture.
+// CHECK: [[LOCAL_N:%.+]] = alloca i[[SZ]],
+// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]],
+// CHECK: [[LOCAL_AA:%.+]] = alloca i[[SZ]],
+// CHECK: [[LOCAL_B:%.+]] = alloca [10 x i32]*
+// CHECK-DAG: store i[[SZ]] [[ARG_N:%.+]], i[[SZ]]* [[LOCAL_N]]
+// CHECK-DAG: store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]]
+// CHECK-DAG: store i[[SZ]] [[ARG_AA:%.+]], i[[SZ]]* [[LOCAL_AA]]
+// CHECK-DAG: store [10 x i32]* [[ARG_B:%.+]], [10 x i32]** [[LOCAL_B]]
+// Store captures in the context.
+// CHECK-64-DAG:[[REF_N:%.+]] = bitcast i[[SZ]]* [[LOCAL_N]] to i32*
+// CHECK-64-DAG:[[REF_A:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32*
+// CHECK-DAG: [[REF_AA:%.+]] = bitcast i[[SZ]]* [[LOCAL_AA]] to i16*
+// CHECK-DAG: [[REF_B:%.+]] = load [10 x i32]*, [10 x i32]** [[LOCAL_B]],
+//
+// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]]
+// CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]]
+// CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]]
+//
+// CHECK: [[WORKER]]
+// CHECK: {{call|invoke}} void [[T6]]_worker()
+// CHECK: br label {{%?}}[[EXIT]]
+//
+// CHECK: [[CHECK_MASTER]]
+// CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK: [[CMTMP1:%.+]] = sub i32 [[CMWS]], 1
+// CHECK: [[CMTMP2:%.+]] = sub i32 [[CMNTH]], 1
+// CHECK: [[MID:%.+]] = and i32 [[CMTMP2]],
+// CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]], [[MID]]
+// CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]]
+//
+// CHECK: [[MASTER]]
+// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
+// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
+// CHECK-64: [[N:%.+]] = load i32, i32* [[REF_N]],
+// CHECK-32: [[N:%.+]] = load i32, i32* [[LOCAL_N]],
+// CHECK: [[CMP:%.+]] = icmp sgt i32 [[N]], 1000
+// CHECK: br i1 [[CMP]], label {{%?}}[[IF_THEN:.+]], label {{%?}}[[IF_ELSE:.+]]
+//
+// CHECK: [[IF_THEN]]
+// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN4]] to i8*))
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: br label {{%?}}[[IF_END:.+]]
+//
+// CHECK: [[IF_ELSE]]
+// CHECK: call void @__kmpc_serialized_parallel(
+// CHECK: call void [[PARALLEL_FN4]](
+// CHECK: call void @__kmpc_end_serialized_parallel(
+// CHECK: br label {{%?}}[[IF_END]]
+//
+// CHECK: [[IF_END]]
+// CHECK-64-DAG: load i32, i32* [[REF_A]]
+// CHECK-32-DAG: load i32, i32* [[LOCAL_A]]
+// CHECK-DAG: load i16, i16* [[REF_AA]]
+// CHECK-DAG: getelementptr inbounds [10 x i32], [10 x i32]* [[REF_B]], i[[SZ]] 0, i[[SZ]] 2
+//
+// CHECK: br label {{%?}}[[TERMINATE:.+]]
+//
+// CHECK: [[TERMINATE]]
+// CHECK: call void @__kmpc_kernel_deinit()
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: br label {{%?}}[[EXIT]]
+//
+// CHECK: [[EXIT]]
+// CHECK: ret void
+
+// CHECK: define internal void [[PARALLEL_FN4]](
+// CHECK: [[A:%.+]] = alloca i[[SZ:32|64]],
+// CHECK: store i[[SZ]] 45, i[[SZ]]* %a,
+// CHECK: ret void
+#endif
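For reviewers, the device-side contract this codegen targets can be summarized with a minimal sketch of the three kernel entry points. This is an illustrative model only, not the libomptarget NVPTX runtime: it assumes a single per-block slot (here called `WorkFnSlot`, a hypothetical name) that the master publishes before the first `llvm.nvvm.barrier0` and that workers read after it; the real runtime's bookkeeping and termination protocol are more involved.

```cpp
// Illustrative CUDA C++ model of the master/worker handshake generated by
// emitGenericParallelCall and emitWorkerLoop. WorkFnSlot is a hypothetical
// per-block slot standing in for the runtime's internal work descriptor.
__device__ void *WorkFnSlot = nullptr;

// Master side: publish the outlined function; the barrier emitted right
// after the call then releases the waiting workers.
extern "C" __device__ void __kmpc_kernel_prepare_parallel(void *OutlinedFn) {
  WorkFnSlot = OutlinedFn;
}

// Worker side: fetch the published function. A null work function (false
// return) tells the worker loop to exit, which matches the generated
// 'icmp eq i8* [[WORK]], null' termination check.
extern "C" __device__ bool __kmpc_kernel_parallel(void **OutlinedFn) {
  *OutlinedFn = WorkFnSlot;
  return *OutlinedFn != nullptr;
}

// Worker side: signal completion of the parallel region before rejoining
// the closing barrier; a no-op in this simplified model.
extern "C" __device__ void __kmpc_kernel_end_parallel() {}
```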