Index: lib/CodeGen/CGOpenMPRuntimeNVPTX.h =================================================================== --- lib/CodeGen/CGOpenMPRuntimeNVPTX.h +++ lib/CodeGen/CGOpenMPRuntimeNVPTX.h @@ -305,6 +305,17 @@ // target region and used by containing directives such as 'parallel' // to emit optimized code. ExecutionMode CurrentExecutionMode; + + /// Map between an outlined function and its wrapper. + llvm::DenseMap WrapperFunctionsMap; + + /// Emit function which wraps the outline parallel region + /// and controls the parameters which are passed to this function. + /// The wrapper ensures that the outlined function is called + /// with the correct arguments when data is shared. + llvm::Function * + createDataSharingWrapper(llvm::Function *OutlinedParallelFn, + const OMPExecutableDirective &D); }; } // CodeGen namespace. Index: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp =================================================================== --- lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -294,6 +294,7 @@ EntryFunctionState EST; WorkerFunctionState WST(CGM); Work.clear(); + WrapperFunctionsMap.clear(); // Emit target region as a standalone region. class NVPTXPrePostActionTy : public PrePostActionTy { @@ -467,7 +468,7 @@ } void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) { - auto &Ctx = CGM.getContext(); + ASTContext &Ctx = CGM.getContext(); CodeGenFunction CGF(CGM, /*suppressNewContext=*/true); CGF.disableDebugInfo(); @@ -510,7 +511,10 @@ CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0)); CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy)); - llvm::Value *Args[] = {WorkFn.getPointer()}; + // Set up shared arguments + Address SharedArgs = + CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrPtrTy, "shared_args"); + llvm::Value *Args[] = {WorkFn.getPointer(), SharedArgs.getPointer()}; llvm::Value *Ret = CGF.EmitRuntimeCall( createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args); Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus); @@ -529,6 +533,9 @@ // Signal start of parallel region. CGF.EmitBlock(ExecuteBB); + // Current context + ASTContext &Ctx = CGF.getContext(); + // Process work items: outlined parallel functions. for (auto *W : Work) { // Try to match this outlined function. @@ -544,14 +551,18 @@ // Execute this outlined function. CGF.EmitBlock(ExecuteFNBB); - // Insert call to work function. - // FIXME: Pass arguments to outlined function from master thread. - auto *Fn = cast(W); - Address ZeroAddr = - CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, /*Name=*/".zero.addr"); - CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C=*/0)); - llvm::Value *FnArgs[] = {ZeroAddr.getPointer(), ZeroAddr.getPointer()}; - emitCall(CGF, Fn, FnArgs); + // Insert call to work function via shared wrapper. The shared + // wrapper takes exactly three arguments: + // - the parallelism level; + // - the master thread ID; + // - the list of references to shared arguments. + // + // TODO: Assert that the function is a wrapper function.s + Address Capture = CGF.EmitLoadOfPointer(SharedArgs, + Ctx.getPointerType( + Ctx.getPointerType(Ctx.VoidPtrTy)).castAs()); + emitCall(CGF, W, {Bld.getInt16(/*ParallelLevel=*/0), + getMasterThreadID(CGF), Capture.getPointer()}); // Go to end of parallel region. CGF.EmitBranch(TerminateBB); @@ -617,16 +628,18 @@ } case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: { /// Build void __kmpc_kernel_prepare_parallel( - /// void *outlined_function); - llvm::Type *TypeParams[] = {CGM.Int8PtrTy}; + /// void *outlined_function, void ***args, kmp_int32 nArgs); + llvm::Type *TypeParams[] = {CGM.Int8PtrTy, + CGM.Int8PtrPtrTy->getPointerTo(0), CGM.Int32Ty}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel"); break; } case OMPRTL_NVPTX__kmpc_kernel_parallel: { - /// Build bool __kmpc_kernel_parallel(void **outlined_function); - llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy}; + /// Build bool __kmpc_kernel_parallel(void **outlined_function, void ***args); + llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy, + CGM.Int8PtrPtrTy->getPointerTo(0)}; llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy); llvm::FunctionType *FnTy = llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false); @@ -845,8 +858,17 @@ llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction( const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) { - return CGOpenMPRuntime::emitParallelOutlinedFunction(D, ThreadIDVar, - InnermostKind, CodeGen); + + auto *OutlinedFun = cast( + CGOpenMPRuntime::emitParallelOutlinedFunction( + D, ThreadIDVar, InnermostKind, CodeGen)); + if (!isInSpmdExecutionMode()) { + llvm::Function *WrapperFun = + createDataSharingWrapper(OutlinedFun, D); + WrapperFunctionsMap[OutlinedFun] = WrapperFun; + } + + return OutlinedFun; } llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction( @@ -898,15 +920,52 @@ CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn, ArrayRef CapturedVars, const Expr *IfCond) { llvm::Function *Fn = cast(OutlinedFn); + llvm::Function *WFn = WrapperFunctionsMap[Fn]; + assert(WFn && "Wrapper function does not exist!"); - auto &&L0ParallelGen = [this, Fn](CodeGenFunction &CGF, PrePostActionTy &) { + // Force inline this outlined function at its call site. + Fn->setLinkage(llvm::GlobalValue::InternalLinkage); + + auto &&L0ParallelGen = [this, WFn, &CapturedVars](CodeGenFunction &CGF, + PrePostActionTy &) { CGBuilderTy &Bld = CGF.Builder; - // Prepare for parallel region. Indicate the outlined function. - llvm::Value *Args[] = {Bld.CreateBitOrPointerCast(Fn, CGM.Int8PtrTy)}; - CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel), - Args); + llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy); + + if (!CapturedVars.empty()) { + // Prepare for parallel region. Indicate the outlined function. + Address SharedArgs = + CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, + "shared_args"); + llvm::Value *SharedArgsPtr = SharedArgs.getPointer(); + llvm::Value *Args[] = {ID, SharedArgsPtr, + Bld.getInt32(CapturedVars.size())}; + + CGF.EmitRuntimeCall( + createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel), + Args); + + unsigned Idx = 0; + ASTContext &Ctx = CGF.getContext(); + for (llvm::Value *V : CapturedVars) { + Address Dst = Bld.CreateConstInBoundsGEP( + CGF.EmitLoadOfPointer(SharedArgs, + Ctx.getPointerType( + Ctx.getPointerType(Ctx.VoidPtrTy)).castAs()), + Idx, CGF.getPointerSize()); + llvm::Value *PtrV = Bld.CreateBitCast(V, CGF.VoidPtrTy); + CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false, + Ctx.getPointerType(Ctx.VoidPtrTy)); + Idx++; + } + } else { + llvm::Value *Args[] = {ID, + llvm::ConstantPointerNull::get(CGF.VoidPtrPtrTy->getPointerTo(0)), + /*nArgs=*/Bld.getInt32(0)}; + CGF.EmitRuntimeCall( + createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel), + Args); + } // Activate workers. This barrier is used by the master to signal // work for the workers. @@ -921,7 +980,7 @@ syncCTAThreads(CGF); // Remember for post-processing in worker loop. - Work.push_back(Fn); + Work.emplace_back(WFn); }; auto *RTLoc = emitUpdateLocation(CGF, Loc); @@ -2317,3 +2376,111 @@ } CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs); } + +/// Emit the code that each thread requires to execute when it encounters +/// one of the three possible parallelism levels. This also emits the required +/// data sharing code for each level. +static void emitParallelismLevelCode( + CodeGenFunction &CGF) { + // Emit L0 and L1 level parallel code. + // TODO: Emit L1 code for nested parallelism. + llvm::BasicBlock *AfterBB = CGF.createBasicBlock(".after.parallel"); + + // Emit L0 code + llvm::BasicBlock *LBB = CGF.createBasicBlock(".level0.parallel"); + CGF.EmitBlock(LBB); + CGF.EmitBranch(AfterBB); + CGF.EmitBlock(AfterBB); +} + +/// Emit function which wraps the outline parallel region +/// and controls the arguments which are passed to this function. +/// The wrapper ensures that the outlined function is called +/// with the correct arguments when data is shared. +llvm::Function *CGOpenMPRuntimeNVPTX::createDataSharingWrapper( + llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) { + ASTContext &Ctx = CGM.getContext(); + const auto &CS = *cast(D.getAssociatedStmt()); + + // Create a function that takes as argument the source thread. + FunctionArgList WrapperArgs; + QualType Int16QTy = + Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false); + QualType Int32QTy = + Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false); + QualType Int32PtrQTy = Ctx.getPointerType(Int32QTy); + QualType VoidPtrPtrQTy = Ctx.getPointerType(Ctx.VoidPtrTy); + ImplicitParamDecl ParallelLevelArg(Ctx, Int16QTy, ImplicitParamDecl::Other); + ImplicitParamDecl WrapperArg(Ctx, Int32QTy, ImplicitParamDecl::Other); + ImplicitParamDecl SharedArgsList(Ctx, VoidPtrPtrQTy, + ImplicitParamDecl::Other); + WrapperArgs.emplace_back(&ParallelLevelArg); + WrapperArgs.emplace_back(&WrapperArg); + WrapperArgs.emplace_back(&SharedArgsList); + + auto &CGFI = + CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs); + + auto *Fn = llvm::Function::Create( + CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, + OutlinedParallelFn->getName() + "_wrapper", &CGM.getModule()); + CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, CGFI); + Fn->setLinkage(llvm::GlobalValue::InternalLinkage); + + CodeGenFunction CGF(CGM, /*suppressNewContext=*/true); + CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs); + + const auto *RD = CS.getCapturedRecordDecl(); + auto CurField = RD->field_begin(); + + // Emit code which performas the data sharing. + emitParallelismLevelCode(CGF); + + // Get the array of arguments. + SmallVector Args; + + // TODO: suppport SIMD and pass actual values + Args.emplace_back(llvm::ConstantPointerNull::get( + CGM.Int32Ty->getPointerTo())); + Args.emplace_back(llvm::ConstantPointerNull::get( + CGM.Int32Ty->getPointerTo())); + + CGBuilderTy &Bld = CGF.Builder; + auto CI = CS.capture_begin(); + + // Load the start of the array + auto SharedArgs = + CGF.EmitLoadOfPointer(CGF.GetAddrOfLocalVar(&SharedArgsList), + VoidPtrPtrQTy->castAs()); + + // For each captured variable + for (unsigned I = 0; I < CS.capture_size(); ++I, ++CI, ++CurField) { + // Name of captured variable + StringRef Name; + if (CI->capturesThis()) + Name = "this"; + else + Name = CI->getCapturedVar()->getName(); + + // We retrieve the CLANG type of the argument. We use it to create + // an alloca which will give us the LLVM type. + QualType ElemTy = CurField->getType(); + // If this is a capture by copy the element type has to be the pointer to + // the data. + if (CI->capturesVariableByCopy()) + ElemTy = Ctx.getPointerType(ElemTy); + + // Get shared address of the captured variable. + Address ArgAddress = Bld.CreateConstInBoundsGEP( + SharedArgs, I, CGF.getPointerSize()); + Address TypedArgAddress = Bld.CreateBitCast( + ArgAddress, CGF.ConvertTypeForMem(Ctx.getPointerType(ElemTy))); + llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedArgAddress, + /*Volatile=*/false, Int32PtrQTy, SourceLocation()); + Args.emplace_back(Arg); + } + + emitCall(CGF, OutlinedParallelFn, Args); + CGF.FinishFunction(); + return Fn; +} Index: test/OpenMP/nvptx_data_sharing.cpp =================================================================== --- /dev/null +++ test/OpenMP/nvptx_data_sharing.cpp @@ -0,0 +1,52 @@ +// Test device data sharing codegen. +///==========================================================================/// + +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK1 + +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +void test_ds(){ + #pragma omp target + { + int a = 10; + #pragma omp parallel + { + a = 1000; + } + } +} + +/// ========= In the worker function ========= /// + +// CK1: define internal void @__omp_offloading_{{.*}}test_ds{{.*}}worker(){{.*}}{ +// CK1: [[SHAREDARGS:%.+]] = alloca i8** +// CK1: call i1 @__kmpc_kernel_parallel(i8** %work_fn, i8*** [[SHAREDARGS]]) +// CK1: [[SHARGSTMP:%.+]] = load i8**, i8*** [[SHAREDARGS]] +// CK1: call void @__omp_outlined___wrapper{{.*}}({{.*}}, i8** %5) + +/// ========= In the kernel function ========= /// + +// CK1: {{.*}}define void @__omp_offloading{{.*}}test_ds{{.*}}() +// CK1: [[SHAREDARGS1:%.+]] = alloca i8** +// CK1: call void @__kmpc_kernel_prepare_parallel({{.*}}, i8*** [[SHAREDARGS1]], i32 1) +// CK1: [[SHARGSTMP1:%.+]] = load i8**, i8*** [[SHAREDARGS1]] +// CK1: [[SHARGSTMP2:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP1]] +// CK1: [[SHAREDVAR:%.+]] = bitcast i32* {{.*}} to i8* +// CK1: store i8* [[SHAREDVAR]], i8** [[SHARGSTMP2]] + +/// ========= In the data sharing wrapper function ========= /// + +// CK1: {{.*}}define internal void @__omp_outlined___wrapper({{.*}}i8**){{.*}}{ +// CK1: [[SHAREDARGS2:%.+]] = alloca i8** +// CK1: store i8** %2, i8*** [[SHAREDARGS2]] +// CK1: [[SHARGSTMP3:%.+]] = load i8**, i8*** [[SHAREDARGS2]] +// CK1: [[SHARGSTMP4:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP3]] +// CK1: [[SHARGSTMP5:%.+]] = bitcast i8** [[SHARGSTMP4]] to i32** +// CK1: [[SHARGSTMP6:%.+]] = load i32*, i32** [[SHARGSTMP5]] +// CK1: call void @__omp_outlined__({{.*}}, i32* [[SHARGSTMP6]]) + +#endif \ No newline at end of file Index: test/OpenMP/nvptx_parallel_codegen.cpp =================================================================== --- test/OpenMP/nvptx_parallel_codegen.cpp +++ test/OpenMP/nvptx_parallel_codegen.cpp @@ -78,7 +78,7 @@ // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]) + // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8 // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1 // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], @@ -92,20 +92,20 @@ // // CHECK: [[EXEC_PARALLEL]] // CHECK: [[WF1:%.+]] = load i8*, i8** [[OMP_WORK_FN]], - // CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i32*, i32*)* [[PARALLEL_FN1:@.+]] to i8*) + // CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i16, i32, i8**)* [[PARALLEL_FN1:@.+]]_wrapper to i8*) // CHECK: br i1 [[WM1]], label {{%?}}[[EXEC_PFN1:.+]], label {{%?}}[[CHECK_NEXT1:.+]] // // CHECK: [[EXEC_PFN1]] - // CHECK: call void [[PARALLEL_FN1]]( + // CHECK: call void [[PARALLEL_FN1]]_wrapper( // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] // // CHECK: [[CHECK_NEXT1]] // CHECK: [[WF2:%.+]] = load i8*, i8** [[OMP_WORK_FN]], - // CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i32*, i32*)* [[PARALLEL_FN2:@.+]] to i8*) + // CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i16, i32, i8**)* [[PARALLEL_FN2:@.+]]_wrapper to i8*) // CHECK: br i1 [[WM2]], label {{%?}}[[EXEC_PFN2:.+]], label {{%?}}[[CHECK_NEXT2:.+]] // // CHECK: [[EXEC_PFN2]] - // CHECK: call void [[PARALLEL_FN2]]( + // CHECK: call void [[PARALLEL_FN2]]_wrapper( // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] // // CHECK: [[CHECK_NEXT2]] @@ -152,13 +152,13 @@ // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]] // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] - // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN1]] to i8*)) + // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32, i8**)* [[PARALLEL_FN1]]_wrapper to i8*), // CHECK: call void @llvm.nvvm.barrier0() // CHECK: call void @llvm.nvvm.barrier0() // CHECK: call void @__kmpc_serialized_parallel( // CHECK: {{call|invoke}} void [[PARALLEL_FN3:@.+]]( // CHECK: call void @__kmpc_end_serialized_parallel( - // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN2]] to i8*)) + // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32, i8**)* [[PARALLEL_FN2]]_wrapper to i8*), // CHECK: call void @llvm.nvvm.barrier0() // CHECK: call void @llvm.nvvm.barrier0() // CHECK-64-DAG: load i32, i32* [[REF_A]] @@ -203,7 +203,7 @@ // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]) + // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8 // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1 // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], @@ -217,11 +217,11 @@ // // CHECK: [[EXEC_PARALLEL]] // CHECK: [[WF:%.+]] = load i8*, i8** [[OMP_WORK_FN]], - // CHECK: [[WM:%.+]] = icmp eq i8* [[WF]], bitcast (void (i32*, i32*)* [[PARALLEL_FN4:@.+]] to i8*) + // CHECK: [[WM:%.+]] = icmp eq i8* [[WF]], bitcast (void (i16, i32, i8**)* [[PARALLEL_FN4:@.+]]_wrapper to i8*) // CHECK: br i1 [[WM]], label {{%?}}[[EXEC_PFN:.+]], label {{%?}}[[CHECK_NEXT:.+]] // // CHECK: [[EXEC_PFN]] - // CHECK: call void [[PARALLEL_FN4]]( + // CHECK: call void [[PARALLEL_FN4]]_wrapper( // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] // // CHECK: [[CHECK_NEXT]] @@ -283,7 +283,7 @@ // CHECK: br i1 [[CMP]], label {{%?}}[[IF_THEN:.+]], label {{%?}}[[IF_ELSE:.+]] // // CHECK: [[IF_THEN]] - // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN4]] to i8*)) + // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32, i8**)* [[PARALLEL_FN4]]_wrapper to i8*), // CHECK: call void @llvm.nvvm.barrier0() // CHECK: call void @llvm.nvvm.barrier0() // CHECK: br label {{%?}}[[IF_END:.+]] Index: test/OpenMP/nvptx_target_teams_codegen.cpp =================================================================== --- test/OpenMP/nvptx_target_teams_codegen.cpp +++ test/OpenMP/nvptx_target_teams_codegen.cpp @@ -60,7 +60,7 @@ // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]) + // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i8*** %shared_args) // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8 // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1 // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], @@ -146,7 +146,7 @@ // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]) + // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i8*** %shared_args) // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8 // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1 // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],