Index: cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
===================================================================
--- cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -43,6 +43,8 @@
     void createWorkerFunction(CodeGenModule &CGM);
   };
 
+  bool isInSpmdExecutionMode() const;
+
   /// \brief Emit the worker function for the current target region.
   void emitWorkerFunction(WorkerFunctionState &WST);
 
@@ -58,6 +60,13 @@
   /// function.
   void emitGenericEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST);
 
+  /// \brief Helper for Spmd mode target directive's entry function.
+  void emitSpmdEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST,
+                           const OMPExecutableDirective &D);
+
+  /// \brief Signal termination of Spmd mode execution.
+  void emitSpmdEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST);
+
   /// \brief Returns specified OpenMP runtime function for the current OpenMP
   /// implementation. Specialized for the NVPTX device.
   /// \param Function OpenMP runtime function.
@@ -87,6 +96,22 @@
                          llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
                          const RegionCodeGenTy &CodeGen);
 
+  /// \brief Emit outlined function specialized for the Single Program
+  /// Multiple Data programming model for applicable target directives on the
+  /// NVPTX device.
+  /// \param D Directive to emit.
+  /// \param ParentName Name of the function that encloses the target region.
+  /// \param OutlinedFn Outlined function value to be defined by this call.
+  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
+  /// \param IsOffloadEntry True if the outlined function is an offload entry.
+  /// \param CodeGen Object containing the target statements.
+  /// An outlined function may not be an entry if, e.g., the if clause always
+  /// evaluates to false.
+  void emitSpmdKernel(const OMPExecutableDirective &D, StringRef ParentName,
+                      llvm::Function *&OutlinedFn,
+                      llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
+                      const RegionCodeGenTy &CodeGen);
+
   /// \brief Emit outlined function for 'target' directive on the NVPTX
   /// device.
   /// \param D Directive to emit.
@@ -118,6 +143,22 @@
                         ArrayRef<llvm::Value *> CapturedVars,
                         const Expr *IfCond);
 
+  /// \brief Emits code for parallel or serial call of the \a OutlinedFn with
+  /// variables captured in a record whose address is stored in \a
+  /// CapturedStruct.
+  /// This call is for a parallel directive within an SPMD target directive.
+  /// \param OutlinedFn Outlined function to be run in parallel threads. Type of
+  /// this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*).
+  /// \param CapturedVars A pointer to the record with the references to
+  /// variables used in \a OutlinedFn function.
+  /// \param IfCond Condition in the associated 'if' clause, if it was
+  /// specified, nullptr otherwise.
+  ///
+  void emitSpmdParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
+                            llvm::Value *OutlinedFn,
+                            ArrayRef<llvm::Value *> CapturedVars,
+                            const Expr *IfCond);
+
 protected:
   /// \brief Get the function name of an outlined region.
   //  The name can be customized depending on the target.
@@ -192,6 +233,25 @@
                         llvm::Value *OutlinedFn,
                         ArrayRef<llvm::Value *> CapturedVars,
                         const Expr *IfCond) override;
+
+public:
+  /// Target codegen is specialized based on two programming models: the
+  /// 'generic' fork-join model of OpenMP, and a more GPU-efficient 'spmd'
+  /// model for constructs like 'target parallel' that support it.
+  enum ExecutionMode {
+    /// Single Program Multiple Data.
+    Spmd,
+    /// Generic codegen to support fork-join model.
+    Generic,
+    Unknown,
+  };
+
+private:
+  // Track the execution mode when codegening directives within a target
+  // region. The appropriate mode (generic/spmd) is set on entry to the
+  // target region and used by containing directives such as 'parallel'
+  // to emit optimized code.
+  ExecutionMode CurrentExecutionMode;
 };
 
 } // CodeGen namespace.
Index: cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
===================================================================
--- cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -26,6 +26,11 @@
   OMPRTL_NVPTX__kmpc_kernel_init,
   /// \brief Call to void __kmpc_kernel_deinit();
   OMPRTL_NVPTX__kmpc_kernel_deinit,
+  /// \brief Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
+  /// short RequiresOMPRuntime, short RequiresDataSharing);
+  OMPRTL_NVPTX__kmpc_spmd_kernel_init,
+  /// \brief Call to void __kmpc_spmd_kernel_deinit();
+  OMPRTL_NVPTX__kmpc_spmd_kernel_deinit,
   /// \brief Call to void __kmpc_kernel_prepare_parallel(void
   /// *outlined_function);
   OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
@@ -76,6 +81,25 @@
     CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
   }
 };
+
+// A class to track the execution mode when codegening directives within
+// a target region. The appropriate mode (generic/spmd) is set on entry
+// to the target region and used by containing directives such as 'parallel'
+// to emit optimized code.
+class ExecutionModeRAII {
+private:
+  CGOpenMPRuntimeNVPTX::ExecutionMode SavedMode;
+  CGOpenMPRuntimeNVPTX::ExecutionMode &Mode;
+
+public:
+  ExecutionModeRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &Mode,
+                    CGOpenMPRuntimeNVPTX::ExecutionMode NewMode)
+      : Mode(Mode) {
+    SavedMode = Mode;
+    Mode = NewMode;
+  }
+  ~ExecutionModeRAII() { Mode = SavedMode; }
+};
 } // anonymous namespace
 
 /// Get the GPU warp size.
@@ -116,12 +140,17 @@
 static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); }
 
 /// Get the value of the thread_limit clause in the teams directive.
-/// The runtime encodes thread_limit in the launch parameter, always starting
-/// thread_limit+warpSize threads per team.
-static llvm::Value *getThreadLimit(CodeGenFunction &CGF) {
+/// For the 'generic' execution mode, the runtime encodes thread_limit in
+/// the launch parameters, always starting thread_limit+warpSize threads per
+/// CTA. The threads in the last warp are reserved for master execution.
+/// For the 'spmd' execution mode, all threads in a CTA are part of the team.
+static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
+                                   bool IsInSpmdExecutionMode = false) {
   CGBuilderTy &Bld = CGF.Builder;
-  return Bld.CreateSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
-                       "thread_limit");
+  return IsInSpmdExecutionMode
+             ? getNVPTXNumThreads(CGF)
+             : Bld.CreateSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
+                             "thread_limit");
 }
 
 /// Get the thread id of the OMP master thread.
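
For orientation, here is a minimal CUDA-style sketch of the two kernel skeletons this thread-limit distinction implies. The kernel and worker names are illustrative, and the device-side prototypes are assumptions modeled on the signatures built by createNVPTXRuntimeFunction in this patch; this is a sketch, not the literal generated code.

// Assumed device-side prototypes of the runtime entry points.
extern "C" __device__ void __kmpc_kernel_init(int thread_limit);
extern "C" __device__ void __kmpc_kernel_deinit();
extern "C" __device__ void __kmpc_spmd_kernel_init(int thread_limit,
                                                   short RequiresOMPRuntime,
                                                   short RequiresDataSharing);
extern "C" __device__ void __kmpc_spmd_kernel_deinit();
__device__ void worker(); // illustrative generic-mode worker loop

// Generic mode: thread_limit excludes the last warp, whose first thread is
// the master; the remaining warps wait in the worker loop for forked work.
__global__ void generic_kernel() {
  unsigned Master = (blockDim.x - 1) & ~(warpSize - 1); // first tid of last warp
  if (threadIdx.x < Master) {
    worker(); // serve parallel regions handed off by the master
  } else if (threadIdx.x == Master) {
    __kmpc_kernel_init(blockDim.x - warpSize); // master warp is reserved
    // ... sequential part of the target region runs on the master ...
    __kmpc_kernel_deinit();
  }
}

// Spmd mode: no warp is reserved, so getThreadLimit() returns the full
// blockDim.x and every thread executes the target region.
__global__ void spmd_kernel() {
  __kmpc_spmd_kernel_init(blockDim.x, /*RequiresOMPRuntime=*/1,
                          /*RequiresDataSharing=*/1);
  // ... all threads execute the 'target parallel' region ...
  __kmpc_spmd_kernel_deinit();
}
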
@@ -159,12 +188,33 @@
   CGM.SetInternalFunctionAttributes(/*D=*/nullptr, WorkerFn, *CGFI);
 }
 
+bool CGOpenMPRuntimeNVPTX::isInSpmdExecutionMode() const {
+  return CurrentExecutionMode == CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd;
+}
+
+static CGOpenMPRuntimeNVPTX::ExecutionMode
+getExecutionModeForDirective(CodeGenModule &CGM,
+                             const OMPExecutableDirective &D) {
+  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
+  switch (DirectiveKind) {
+  case OMPD_target:
+    return CGOpenMPRuntimeNVPTX::ExecutionMode::Generic;
+  case OMPD_target_parallel:
+    return CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd;
+  default:
+    llvm_unreachable("Unsupported directive on NVPTX device.");
+  }
+  llvm_unreachable("Unsupported directive on NVPTX device.");
+}
+
 void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D,
                                              StringRef ParentName,
                                              llvm::Function *&OutlinedFn,
                                              llvm::Constant *&OutlinedFnID,
                                              bool IsOffloadEntry,
                                              const RegionCodeGenTy &CodeGen) {
+  ExecutionModeRAII ModeRAII(CurrentExecutionMode,
+                             CGOpenMPRuntimeNVPTX::ExecutionMode::Generic);
   EntryFunctionState EST;
   WorkerFunctionState WST(CGM);
   Work.clear();
@@ -252,6 +302,94 @@
   EST.ExitBB = nullptr;
 }
 
+void CGOpenMPRuntimeNVPTX::emitSpmdKernel(const OMPExecutableDirective &D,
+                                          StringRef ParentName,
+                                          llvm::Function *&OutlinedFn,
+                                          llvm::Constant *&OutlinedFnID,
+                                          bool IsOffloadEntry,
+                                          const RegionCodeGenTy &CodeGen) {
+  ExecutionModeRAII ModeRAII(CurrentExecutionMode,
+                             CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd);
+  EntryFunctionState EST;
+
+  // Emit target region as a standalone region.
+  class NVPTXPrePostActionTy : public PrePostActionTy {
+    CGOpenMPRuntimeNVPTX &RT;
+    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
+    const OMPExecutableDirective &D;
+
+  public:
+    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
+                         CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
+                         const OMPExecutableDirective &D)
+        : RT(RT), EST(EST), D(D) {}
+    void Enter(CodeGenFunction &CGF) override {
+      RT.emitSpmdEntryHeader(CGF, EST, D);
+    }
+    void Exit(CodeGenFunction &CGF) override {
+      RT.emitSpmdEntryFooter(CGF, EST);
+    }
+  } Action(*this, EST, D);
+  CodeGen.setAction(Action);
+  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
+                                   IsOffloadEntry, CodeGen);
+  return;
+}
+
+void CGOpenMPRuntimeNVPTX::emitSpmdEntryHeader(
+    CodeGenFunction &CGF, EntryFunctionState &EST,
+    const OMPExecutableDirective &D) {
+  auto &Bld = CGF.Builder;
+
+  // Setup BBs in entry function.
+  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
+  EST.ExitBB = CGF.createBasicBlock(".exit");
+
+  // Initialize the OMP state in the runtime; called by all active threads.
+  // TODO: Set RequiresOMPRuntime and RequiresDataSharing parameters
+  // based on code analysis of the target region.
+  llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSpmdExecutionMode=*/true),
+                         /*RequiresOMPRuntime=*/Bld.getInt16(1),
+                         /*RequiresDataSharing=*/Bld.getInt16(1)};
+  CGF.EmitRuntimeCall(
+      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);
+  CGF.EmitBranch(ExecuteBB);
+
+  CGF.EmitBlock(ExecuteBB);
+}
+
+void CGOpenMPRuntimeNVPTX::emitSpmdEntryFooter(CodeGenFunction &CGF,
+                                               EntryFunctionState &EST) {
+  if (!EST.ExitBB)
+    EST.ExitBB = CGF.createBasicBlock(".exit");
+
+  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
+  CGF.EmitBranch(OMPDeInitBB);
+
+  CGF.EmitBlock(OMPDeInitBB);
+  // DeInitialize the OMP state in the runtime; called by all active threads.
+  CGF.EmitRuntimeCall(
+      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_deinit), None);
+  CGF.EmitBranch(EST.ExitBB);
+
+  CGF.EmitBlock(EST.ExitBB);
+  EST.ExitBB = nullptr;
+}
+
+// Create a unique global variable to indicate the execution mode of this target
+// region. The execution mode is either 'generic' or 'spmd', depending on the
+// target directive. This variable is picked up by the offload library to set up
+// the device appropriately before kernel launch. If the execution mode is
+// 'generic', the runtime reserves one warp for the master; otherwise, all
+// warps participate in parallel work.
+static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
+                                     CGOpenMPRuntimeNVPTX::ExecutionMode Mode) {
+  (void)new llvm::GlobalVariable(
+      CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
+      llvm::GlobalValue::WeakAnyLinkage,
+      llvm::ConstantInt::get(CGM.Int8Ty, Mode), Name + Twine("_exec_mode"));
+}
+
 void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
   auto &Ctx = CGM.getContext();
 
@@ -385,6 +523,22 @@
     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
     break;
   }
+  case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
+    // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
+    // short RequiresOMPRuntime, short RequiresDataSharing);
+    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init");
+    break;
+  }
+  case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit: {
+    // Build void __kmpc_spmd_kernel_deinit();
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit");
+    break;
+  }
   case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
     /// Build void __kmpc_kernel_prepare_parallel(
     ///     void *outlined_function);
@@ -463,12 +617,27 @@
 
   assert(!ParentName.empty() && "Invalid target region parent name!");
 
-  emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
-                    CodeGen);
+  CGOpenMPRuntimeNVPTX::ExecutionMode Mode =
+      getExecutionModeForDirective(CGM, D);
+  switch (Mode) {
+  case CGOpenMPRuntimeNVPTX::ExecutionMode::Generic:
+    emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                      CodeGen);
+    break;
+  case CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd:
+    emitSpmdKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                   CodeGen);
+    break;
+  case CGOpenMPRuntimeNVPTX::ExecutionMode::Unknown:
+    llvm_unreachable(
+        "Unknown programming model for OpenMP directive on NVPTX target.");
+  }
+
+  setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
 }
 
 CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
-    : CGOpenMPRuntime(CGM) {
+    : CGOpenMPRuntime(CGM), CurrentExecutionMode(ExecutionMode::Unknown) {
   if (!CGM.getLangOpts().OpenMPIsDevice)
     llvm_unreachable("OpenMP NVPTX can only handle device code.");
 }
@@ -523,7 +692,10 @@
   if (!CGF.HaveInsertPoint())
     return;
 
-  emitGenericParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
+  if (isInSpmdExecutionMode())
+    emitSpmdParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
+  else
+    emitGenericParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
 }
 
 void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
@@ -593,3 +765,20 @@
     ThenRCG(CGF);
   }
 }
+
+void CGOpenMPRuntimeNVPTX::emitSpmdParallelCall(
+    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
+    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
+  // Just call the outlined function to execute the parallel region.
+  //   OutlinedFn(&GTid, &zero, CapturedStruct);
+  //
+  // TODO: Do something with IfCond when support for the 'if' clause
+  // is added on Spmd target directives.
+  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
+  OutlinedFnArgs.push_back(
+      llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
+  OutlinedFnArgs.push_back(
+      llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
+  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
+  CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs);
+}
Index: cfe/trunk/test/OpenMP/nvptx_target_codegen.cpp
===================================================================
--- cfe/trunk/test/OpenMP/nvptx_target_codegen.cpp
+++ cfe/trunk/test/OpenMP/nvptx_target_codegen.cpp
@@ -8,6 +8,14 @@
 #ifndef HEADER
 #define HEADER
 
+// Check that the execution mode of all 6 target regions is set to Generic Mode.
+// CHECK-DAG: {{@__omp_offloading_.+l98}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l175}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l284}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l321}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l339}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l304}}_exec_mode = weak constant i8 1
+
 template<typename tx, typename ty>
 struct TT{
   tx X;
   ty Y;
 };
@@ -23,7 +31,7 @@
   double cn[5][n];
   TT<long long, char> d;
 
-  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l90}}_worker()
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l98}}_worker()
   // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
   // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
   // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
@@ -54,7 +62,7 @@
   // CHECK: [[EXIT]]
   // CHECK: ret void
 
-  // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+foo.+l90]]()
+  // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+foo.+l98]]()
   // CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
   // CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
   // CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
@@ -96,7 +104,7 @@
   {
   }
 
-  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l167}}_worker()
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l175}}_worker()
   // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
   // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
   // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
@@ -127,7 +135,7 @@
   // CHECK: [[EXIT]]
   // CHECK: ret void
 
-  // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+foo.+l167]](i[[SZ:32|64]] [[ARG1:%[a-zA-Z_]+]])
+  // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+foo.+l175]](i[[SZ:32|64]] [[ARG1:%[a-zA-Z_]+]])
   // CHECK: [[AA_ADDR:%.+]] = alloca i[[SZ]],
   // CHECK: store i[[SZ]] [[ARG1]], i[[SZ]]* [[AA_ADDR]],
   // CHECK: [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i16*
@@ -169,7 +177,7 @@
     aa += 1;
   }
 
-  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l276}}_worker()
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l284}}_worker()
   // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
   // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
   // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
@@ -200,7 +208,7 @@
   // CHECK: [[EXIT]]
   // CHECK: ret void
 
-  // CHECK: define {{.*}}void [[T3:@__omp_offloading_.+foo.+l276]](i[[SZ]]
+  // CHECK: define {{.*}}void [[T3:@__omp_offloading_.+foo.+l284]](i[[SZ]]
   // Create local storage for each capture.
   // CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]]
   // CHECK: [[LOCAL_B:%.+]] = alloca [10 x float]*
@@ -353,7 +361,7 @@
     return a;
   }
 
-  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+static.+313}}_worker()
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+static.+321}}_worker()
   // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
   // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
   // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
@@ -384,7 +392,7 @@
   // CHECK: [[EXIT]]
   // CHECK: ret void
 
-  // CHECK: define {{.*}}void [[T4:@__omp_offloading_.+static.+l313]](i[[SZ]]
+  // CHECK: define {{.*}}void [[T4:@__omp_offloading_.+static.+l321]](i[[SZ]]
   // Create local storage for each capture.
   // CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]]
   // CHECK: [[LOCAL_AA:%.+]] = alloca i[[SZ]]
@@ -439,7 +447,7 @@
 
 
 
-  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+S1.+l331}}_worker()
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+S1.+l339}}_worker()
   // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
   // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
   // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
@@ -470,7 +478,7 @@
   // CHECK: [[EXIT]]
   // CHECK: ret void
 
-  // CHECK: define {{.*}}void [[T5:@__omp_offloading_.+S1.+l331]](
+  // CHECK: define {{.*}}void [[T5:@__omp_offloading_.+S1.+l339]](
   // Create local storage for each capture.
   // CHECK: [[LOCAL_THIS:%.+]] = alloca [[S1:%struct.*]]*
   // CHECK: [[LOCAL_B:%.+]] = alloca i[[SZ]]
@@ -529,7 +537,7 @@
 
 
 
-  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l296}}_worker()
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l304}}_worker()
   // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
   // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
   // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
@@ -560,7 +568,7 @@
   // CHECK: [[EXIT]]
   // CHECK: ret void
 
-  // CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l296]](i[[SZ]]
+  // CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l304]](i[[SZ]]
   // Create local storage for each capture.
   // CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]]
   // CHECK: [[LOCAL_AA:%.+]] = alloca i[[SZ]]
Index: cfe/trunk/test/OpenMP/nvptx_target_parallel_codegen.cpp
===================================================================
--- cfe/trunk/test/OpenMP/nvptx_target_parallel_codegen.cpp
+++ cfe/trunk/test/OpenMP/nvptx_target_parallel_codegen.cpp
@@ -0,0 +1,136 @@
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+// Check that the execution mode of both target regions on the GPU is set to SPMD Mode.
+// CHECK-DAG: {{@__omp_offloading_.+l26}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l31}}_exec_mode = weak constant i8 0
+
+template<typename tx>
+tx ftemplate(int n) {
+  tx a = 0;
+  short aa = 0;
+  tx b[10];
+
+  #pragma omp target parallel if(target: 0)
+  {
+    a += 1;
+  }
+
+  #pragma omp target parallel map(tofrom: aa)
+  {
+    aa += 1;
+  }
+
+  #pragma omp target parallel map(tofrom:a, aa, b) if(target: n>40)
+  {
+    a += 1;
+    aa += 1;
+    b[2] += 1;
+  }
+
+  return a;
+}
+
+int bar(int n){
+  int a = 0;
+
+  a += ftemplate<int>(n);
+
+  return a;
+}
+
+  // CHECK-NOT: define {{.*}}void {{@__omp_offloading_.+template.+l17}}
+
+
+
+
+
+
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l26}}(
+  // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align
+  // CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align
+  // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
+  // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]],
+  // CHECK: br label {{%?}}[[EXEC:.+]]
+  //
+  // CHECK: [[EXEC]]
+  // CHECK: {{call|invoke}} void [[OP1:@.+]](i32* null, i32* null, i16* [[AA]])
+  // CHECK: br label {{%?}}[[DONE:.+]]
+  //
+  // CHECK: [[DONE]]
+  // CHECK: call void @__kmpc_spmd_kernel_deinit()
+  // CHECK: br label {{%?}}[[EXIT:.+]]
+  //
+  // CHECK: [[EXIT]]
+  // CHECK: ret void
+  // CHECK: }
+
+  // CHECK: define internal void [[OP1]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i16* {{[^%]*}}[[ARG:%.+]])
+  // CHECK: = alloca i32*, align
+  // CHECK: = alloca i32*, align
+  // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align
+  // CHECK: store i16* [[ARG]], i16** [[AA_ADDR]], align
+  // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
+  // CHECK: [[VAL:%.+]] = load i16, i16* [[AA]], align
+  // CHECK: store i16 {{%.+}}, i16* [[AA]], align
+  // CHECK: ret void
+  // CHECK: }
+
+
+
+
+
+
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l31}}(
+  // CHECK: [[A_ADDR:%.+]] = alloca i32*, align
+  // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align
+  // CHECK: [[B_ADDR:%.+]] = alloca [10 x i32]*, align
+  // CHECK: store i32* {{%.+}}, i32** [[A_ADDR]], align
+  // CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align
+  // CHECK: store [10 x i32]* {{%.+}}, [10 x i32]** [[B_ADDR]], align
+  // CHECK: [[A:%.+]] = load i32*, i32** [[A_ADDR]], align
+  // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
+  // CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align
+  // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]],
+  // CHECK: br label {{%?}}[[EXEC:.+]]
+  //
+  // CHECK: [[EXEC]]
+  // CHECK: {{call|invoke}} void [[OP2:@.+]](i32* null, i32* null, i32* [[A]], i16* [[AA]], [10 x i32]* [[B]])
+  // CHECK: br label {{%?}}[[DONE:.+]]
+  //
+  // CHECK: [[DONE]]
+  // CHECK: call void @__kmpc_spmd_kernel_deinit()
+  // CHECK: br label {{%?}}[[EXIT:.+]]
+  //
+  // CHECK: [[EXIT]]
+  // CHECK: ret void
+  // CHECK: }
+
+  // CHECK: define internal void [[OP2]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* {{[^%]*}}[[ARG1:%.+]], i16* {{[^%]*}}[[ARG2:%.+]], [10 x i32]* {{[^%]*}}[[ARG3:%.+]])
+  // CHECK: = alloca i32*, align
+  // CHECK: = alloca i32*, align
+  // CHECK: [[A_ADDR:%.+]] = alloca i32*, align
+  // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align
+  // CHECK: [[B_ADDR:%.+]] = alloca [10 x i32]*, align
+  // CHECK: store i32* [[ARG1]], i32** [[A_ADDR]], align
+  // CHECK: store i16* [[ARG2]], i16** [[AA_ADDR]], align
+  // CHECK: store [10 x i32]* [[ARG3]], [10 x i32]** [[B_ADDR]], align
+  // CHECK: [[A:%.+]] = load i32*, i32** [[A_ADDR]], align
+  // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
+  // CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align
+  // CHECK: store i32 {{%.+}}, i32* [[A]], align
+  // CHECK: store i16 {{%.+}}, i16* [[AA]], align
+  // CHECK: [[ELT:%.+]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B]],
+  // CHECK: store i32 {{%.+}}, i32* [[ELT]], align
+  // CHECK: ret void
+  // CHECK: }
+#endif
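
Tying the pieces together, the first SPMD region in the new test compiles to a kernel with the shape below. This is a hedged CUDA-style rendering of what the FileCheck patterns above verify; the real output is LLVM IR, and spmd_kernel_l26 and outlined_parallel_fn are illustrative stand-ins for the mangled __omp_offloading_... symbols.

// Assumed device-side prototypes (per createNVPTXRuntimeFunction above).
extern "C" __device__ void __kmpc_spmd_kernel_init(int thread_limit,
                                                   short RequiresOMPRuntime,
                                                   short RequiresDataSharing);
extern "C" __device__ void __kmpc_spmd_kernel_deinit();

// Outlined body of '#pragma omp target parallel map(tofrom: aa) { aa += 1; }'.
// Per emitSpmdParallelCall, the global/bound thread-id arguments are passed
// as null pointers rather than real thread ids.
__device__ void outlined_parallel_fn(int *global_tid, int *bound_tid,
                                     short *aa) {
  *aa += 1;
}

// The .execute / .omp.deinit structure emitted by emitSpmdEntryHeader/Footer:
// every thread initializes the runtime, calls the outlined region directly
// (no fork), then deinitializes and exits.
__global__ void spmd_kernel_l26(short *aa) {
  __kmpc_spmd_kernel_init(blockDim.x, /*RequiresOMPRuntime=*/1,
                          /*RequiresDataSharing=*/1);
  outlined_parallel_fn(/*global_tid=*/nullptr, /*bound_tid=*/nullptr, aa);
  __kmpc_spmd_kernel_deinit();
}
// The companion global '@..._exec_mode = weak constant i8 0' (0 == Spmd,
// 1 == Generic) is what the offload plugin reads to pick the launch mode.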