diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -231,6 +231,7 @@ LANGOPT(OpenMPCUDABlocksPerSM , 32, 0, "Number of blocks per SM for CUDA devices.") LANGOPT(OpenMPCUDAReductionBufNum , 32, 1024, "Number of the reduction records in the intermediate reduction buffer used for the teams reductions.") LANGOPT(OpenMPOptimisticCollapse , 1, 0, "Use at most 32 bits to represent the collapsed loop nest counter.") +LANGOPT(OpenMPCUDATargetParallel, 1, 0, "Support parallel execution of target region on Cuda-based devices.") LANGOPT(RenderScript , 1, 0, "RenderScript") LANGOPT(CUDAIsDevice , 1, 0, "compiling for CUDA device") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1687,6 +1687,12 @@ Flags<[CC1Option, NoArgumentUnused, HelpHidden]>; def fno_openmp_optimistic_collapse : Flag<["-"], "fno-openmp-optimistic-collapse">, Group, Flags<[NoArgumentUnused, HelpHidden]>; +def fopenmp_cuda_parallel_target_regions : Flag<["-"], "fopenmp-cuda-parallel-target-regions">, Group, + Flags<[CC1Option, NoArgumentUnused, HelpHidden]>, + HelpText<"Support parallel execution of target regions on Cuda-based devices.">; +def fno_openmp_cuda_parallel_target_regions : Flag<["-"], "fno-openmp-cuda-parallel-target-regions">, Group, + Flags<[NoArgumentUnused, HelpHidden]>, + HelpText<"Support only serial execution of target regions on Cuda-based devices.">; def static_openmp: Flag<["-"], "static-openmp">, HelpText<"Use the static host OpenMP runtime while linking.">; def fno_optimize_sibling_calls : Flag<["-"], "fno-optimize-sibling-calls">, Group; diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -85,6 +85,9 @@ /// Call to void* __kmpc_data_sharing_coalesced_push_stack(size_t size, /// int16_t UseSharedMemory); OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack, + /// Call to void* __kmpc_data_sharing_push_stack(size_t size, int16_t + /// UseSharedMemory); + OMPRTL_NVPTX__kmpc_data_sharing_push_stack, /// Call to void __kmpc_data_sharing_pop_stack(void *a); OMPRTL_NVPTX__kmpc_data_sharing_pop_stack, /// Call to void __kmpc_begin_sharing_variables(void ***args, @@ -1753,6 +1756,16 @@ FnTy, /*Name=*/"__kmpc_data_sharing_coalesced_push_stack"); break; } + case OMPRTL_NVPTX__kmpc_data_sharing_push_stack: { + // Build void *__kmpc_data_sharing_push_stack(size_t size, int16_t + // UseSharedMemory); + llvm::Type *TypeParams[] = {CGM.SizeTy, CGM.Int16Ty}; + auto *FnTy = + llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false); + RTLFn = CGM.CreateRuntimeFunction( + FnTy, /*Name=*/"__kmpc_data_sharing_push_stack"); + break; + } case OMPRTL_NVPTX__kmpc_data_sharing_pop_stack: { // Build void __kmpc_data_sharing_pop_stack(void *a); llvm::Type *TypeParams[] = {CGM.VoidPtrTy}; @@ -2210,7 +2223,7 @@ GlobalRecCastAddr = Phi; I->getSecond().GlobalRecordAddr = Phi; I->getSecond().IsInSPMDModeFlag = IsSPMD; - } else if (IsInTTDRegion) { + } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) { assert(GlobalizedRecords.back().Records.size() < 2 && "Expected less than 2 globalized records: one for target and one " "for teams."); @@ -2283,12 +2296,16 @@ } else { // TODO: allow the usage of shared memory to be controlled by // the user, for now, default to global. + bool UseSharedMemory = + IsInTTDRegion && GlobalRecordSize <= SharedMemorySize; llvm::Value *GlobalRecordSizeArg[] = { llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), - CGF.Builder.getInt16(/*UseSharedMemory=*/0)}; + CGF.Builder.getInt16(UseSharedMemory ? 1 : 0)}; llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall( createNVPTXRuntimeFunction( - OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack), + IsInTTDRegion + ? OMPRTL_NVPTX__kmpc_data_sharing_push_stack + : OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack), GlobalRecordSizeArg); GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( GlobalRecValue, GlobalRecPtrTy); @@ -2435,7 +2452,7 @@ OMPRTL_NVPTX__kmpc_data_sharing_pop_stack), CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr)); CGF.EmitBlock(ExitBB); - } else if (IsInTTDRegion) { + } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) { assert(GlobalizedRecords.back().RegionCounter > 0 && "region counter must be > 0."); --GlobalizedRecords.back().RegionCounter; @@ -5085,7 +5102,8 @@ } void CGOpenMPRuntimeNVPTX::clear() { - if (!GlobalizedRecords.empty()) { + if (!GlobalizedRecords.empty() && + !CGM.getLangOpts().OpenMPCUDATargetParallel) { ASTContext &C = CGM.getContext(); llvm::SmallVector GlobalRecs; llvm::SmallVector SharedRecs; diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5257,6 +5257,13 @@ options::OPT_fno_openmp_cuda_mode, /*Default=*/false)) CmdArgs.push_back("-fopenmp-cuda-mode"); + // When in OpenMP offloading mode with NVPTX target, forward + // cuda-parallel-target-regions flag + if (Args.hasFlag(options::OPT_fopenmp_cuda_parallel_target_regions, + options::OPT_fno_openmp_cuda_parallel_target_regions, + /*Default=*/true)) + CmdArgs.push_back("-fopenmp-cuda-parallel-target-regions"); + // When in OpenMP offloading mode with NVPTX target, check if full runtime // is required. if (Args.hasFlag(options::OPT_fopenmp_cuda_force_full_runtime, diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -3195,6 +3195,12 @@ Opts.OpenMPCUDAMode = Opts.OpenMPIsDevice && (T.isNVPTX() || T.isAMDGCN()) && Args.hasArg(options::OPT_fopenmp_cuda_mode); + // Set CUDA support for parallel execution of target regions for OpenMP target + // NVPTX/AMDGCN if specified in options. + Opts.OpenMPCUDATargetParallel = + Opts.OpenMPIsDevice && (T.isNVPTX() || T.isAMDGCN()) && + Args.hasArg(options::OPT_fopenmp_cuda_parallel_target_regions); + // Set CUDA mode for OpenMP target NVPTX/AMDGCN if specified in options Opts.OpenMPCUDAForceFullRuntime = Opts.OpenMPIsDevice && (T.isNVPTX() || T.isAMDGCN()) && diff --git a/clang/test/OpenMP/nvptx_data_sharing.cpp b/clang/test/OpenMP/nvptx_data_sharing.cpp --- a/clang/test/OpenMP/nvptx_data_sharing.cpp +++ b/clang/test/OpenMP/nvptx_data_sharing.cpp @@ -2,7 +2,8 @@ ///==========================================================================/// // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CK1 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CK1 --check-prefix SEQ +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CK1 --check-prefix PAR // expected-no-diagnostics @@ -26,11 +27,11 @@ } } } -// CK1: [[MEM_TY:%.+]] = type { [128 x i8] } -// CK1-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer -// CK1-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null -// CK1-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i64 8 -// CK1-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 +// SEQ: [[MEM_TY:%.+]] = type { [128 x i8] } +// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer +// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null +// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i64 8 +// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 /// ========= In the worker function ========= /// // CK1: {{.*}}define internal void @__omp_offloading{{.*}}test_ds{{.*}}_worker() @@ -44,11 +45,12 @@ // CK1: [[SHAREDARGS2:%.+]] = alloca i8** // CK1: call void @__kmpc_kernel_init // CK1: call void @__kmpc_data_sharing_init_stack -// CK1: [[SHARED_MEM_FLAG:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// CK1: [[SIZE:%.+]] = load i64, i64* [[KERNEL_SIZE]], -// CK1: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i64 [[SIZE]], i16 [[SHARED_MEM_FLAG]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) -// CK1: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// CK1: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i64 0 +// SEQ: [[SHARED_MEM_FLAG:%.+]] = load i16, i16* [[KERNEL_SHARED]], +// SEQ: [[SIZE:%.+]] = load i64, i64* [[KERNEL_SIZE]], +// SEQ: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i64 [[SIZE]], i16 [[SHARED_MEM_FLAG]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) +// SEQ: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], +// SEQ: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i64 0 +// PAR: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 8, i16 1) // CK1: [[GLOBALSTACK2:%.+]] = bitcast i8* [[GLOBALSTACK]] to %struct._globalized_locals_ty* // CK1: [[A:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[GLOBALSTACK2]], i32 0, i32 0 // CK1: [[B:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[GLOBALSTACK2]], i32 0, i32 1 @@ -75,8 +77,9 @@ // CK1: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) // CK1: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) // CK1: call void @__kmpc_end_sharing_variables() -// CK1: [[SHARED_MEM_FLAG:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// CK1: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[SHARED_MEM_FLAG]]) +// SEQ: [[SHARED_MEM_FLAG:%.+]] = load i16, i16* [[KERNEL_SHARED]], +// SEQ: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[SHARED_MEM_FLAG]]) +// PAR: call void @__kmpc_data_sharing_pop_stack(i8* [[GLOBALSTACK]]) // CK1: call void @__kmpc_kernel_deinit(i16 1) /// ========= In the data sharing wrapper function ========= /// diff --git a/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp b/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp --- a/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp +++ b/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp @@ -1,9 +1,12 @@ // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -21,19 +24,20 @@ return 0; } -// CHECK: [[MEM_TY:%.+]] = type { [128 x i8] } -// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer -// CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null -// CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 40 -// CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 -// CHECK-DAG: @__omp_offloading_{{.*}}_main_l17_exec_mode = weak constant i8 0 - -// CHECK: define weak void @__omp_offloading_{{.*}}_main_l17([10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}}, [10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, i{{64|32}} %{{.+}}, [10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}}) -// CHECK: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// CHECK: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]], -// CHECK: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) -// CHECK: [[PTR:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// CHECK: [[GEP:%.+]] = getelementptr inbounds i8, i8* [[PTR]], i{{64|32}} 0 +// SEQ: [[MEM_TY:%.+]] = type { [128 x i8] } +// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer +// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null +// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 40 +// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 +// CHECK-DAG: @__omp_offloading_{{.*}}_main_l20_exec_mode = weak constant i8 0 + +// CHECK: define weak void @__omp_offloading_{{.*}}_main_l20([10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}}, [10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, i{{64|32}} %{{.+}}, [10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}}) +// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], +// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]], +// SEQ: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) +// SEQ: [[PTR:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], +// SEQ: [[GEP:%.+]] = getelementptr inbounds i8, i8* [[PTR]], i{{64|32}} 0 +// PAR: [[GEP:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 40, i16 1) // CHECK: [[STACK:%.+]] = bitcast i8* [[GEP]] to %struct._globalized_locals_ty* // CHECK: getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 0 // CHECK-NOT: getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], @@ -43,8 +47,9 @@ // CHECK: call void @__kmpc_for_static_fini(%struct.ident_t* @ -// CHECK: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// CHECK: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[SHARED]]) +// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], +// SEQ: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[SHARED]]) +// PAR: call void @__kmpc_data_sharing_pop_stack(i8* [[GEP]]) // CHECK: define internal void [[PARALLEL]]( // CHECK-NOT: call i8* @__kmpc_data_sharing_push_stack( diff --git a/clang/test/OpenMP/nvptx_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_parallel_codegen.cpp --- a/clang/test/OpenMP/nvptx_parallel_codegen.cpp +++ b/clang/test/OpenMP/nvptx_parallel_codegen.cpp @@ -1,9 +1,12 @@ // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -72,15 +75,15 @@ return a; } -// CHECK: [[MEM_TY:%.+]] = type { [128 x i8] } -// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer -// CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null -// CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4 -// CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 +// SEQ: [[MEM_TY:%.+]] = type { [128 x i8] } +// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer +// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null +// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4 +// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 -// CHECK-NOT: define {{.*}}void {{@__omp_offloading_.+template.+l17}}_worker() +// CHECK-NOT: define {{.*}}void {{@__omp_offloading_.+template.+l20}}_worker() -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l26}}_worker() +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l29}}_worker() // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, // CHECK: store i8* null, i8** [[OMP_WORK_FN]], @@ -133,7 +136,7 @@ // CHECK: [[EXIT]] // CHECK: ret void -// CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l26]](i[[SZ:32|64]] +// CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l29]](i[[SZ:32|64]] // Create local storage for each capture. // CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]], // CHECK-DAG: store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]] @@ -199,7 +202,7 @@ // CHECK: store i[[SZ]] 44, i[[SZ]]* %a, // CHECK: ret void -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l43}}_worker() +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l46}}_worker() // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, // CHECK: store i8* null, i8** [[OMP_WORK_FN]], @@ -243,7 +246,7 @@ // CHECK: [[EXIT]] // CHECK: ret void -// CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l43]](i[[SZ:32|64]] +// CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l46]](i[[SZ:32|64]] // Create local storage for each capture. // CHECK: [[LOCAL_N:%.+]] = alloca i[[SZ]], // CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]], @@ -323,23 +326,25 @@ // CHECK: declare void @__kmpc_barrier(%struct.ident_t*, i32) #[[#CONVERGENT]] -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l55}}_worker() -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l55}}( +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l58}}_worker() +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l58}}( // CHECK-32: [[A_ADDR:%.+]] = alloca i32, // CHECK-64: [[A_ADDR:%.+]] = alloca i64, // CHECK-64: [[CONV:%.+]] = bitcast i64* [[A_ADDR]] to i32* -// CHECK: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// CHECK: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]], -// CHECK: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) -// CHECK: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// CHECK: [[STACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0 +// SEQ: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], +// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]], +// SEQ: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) +// SEQ: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], +// SEQ: [[STACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0 +// PAR: [[STACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 4, i16 1) // CHECK: [[BC:%.+]] = bitcast i8* [[STACK]] to %struct._globalized_locals_ty* // CHECK-32: [[A:%.+]] = load i32, i32* [[A_ADDR]], // CHECK-64: [[A:%.+]] = load i32, i32* [[CONV]], // CHECK: [[GLOBAL_A_ADDR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[BC]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 // CHECK: store i32 [[A]], i32* [[GLOBAL_A_ADDR]], -// CHECK: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// CHECK: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[IS_SHARED]]) +// SEQ: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], +// SEQ: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[IS_SHARED]]) +// PAR: call void @__kmpc_data_sharing_pop_stack(i8* [[STACK]]) // CHECK-LABEL: define internal void @{{.+}}(i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* nonnull align {{[0-9]+}} dereferenceable{{.*}}) // CHECK: [[CC:%.+]] = alloca i32, diff --git a/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp b/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp --- a/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp +++ b/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp @@ -1,6 +1,7 @@ // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -30,33 +31,35 @@ return a; } -// CHECK: [[MEM_TY:%.+]] = type { [128 x i8] } -// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer -// CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null -// CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4 -// CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 +// SEQ: [[MEM_TY:%.+]] = type { [128 x i8] } +// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer +// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null +// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4 +// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l12}}_worker() +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l13}}_worker() // CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) // CHECK: call i1 @__kmpc_kernel_parallel( // CHECK: call void @__omp_outlined___wrapper( -// CHECK: define weak void @__omp_offloading_{{.*}}l12( -// CHECK: call void @__omp_offloading_{{.*}}l12_worker() +// CHECK: define weak void @__omp_offloading_{{.*}}l13( +// CHECK: call void @__omp_offloading_{{.*}}l13_worker() // CHECK: call void @__kmpc_kernel_init( // CHECK: call void @__kmpc_data_sharing_init_stack() -// CHECK: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// CHECK: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]], -// CHECK: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i64 %7, i16 %6, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) -// CHECK: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// CHECK: [[STACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0 +// SEQ: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], +// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]], +// SEQ: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i64 %7, i16 %6, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) +// SEQ: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], +// SEQ: [[STACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0 +// PAR: [[STACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 4, i16 1) // CHECK: call void @__kmpc_kernel_prepare_parallel( // CHECK: call void @__kmpc_begin_sharing_variables({{.*}}, i64 2) // CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) // CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) // CHECK: call void @__kmpc_end_sharing_variables() -// CHECK: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// CHECK: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[IS_SHARED]]) +// SEQ: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], +// SEQ: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[IS_SHARED]]) +// PAR: call void @__kmpc_data_sharing_pop_stack(i8* [[STACK]]) // CHECK: call void @__kmpc_kernel_deinit(i16 1) // CHECK: define internal void @__omp_outlined__( diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp @@ -1,17 +1,20 @@ // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR // expected-no-diagnostics #ifndef HEADER #define HEADER -// CHECK: [[MEM_TY:%.+]] = type { [128 x i8] } -// CHECK-DAG: {{@__omp_offloading_.+}}_l20_exec_mode = weak constant i8 1 -// CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4 -// CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 +// SEQ: [[MEM_TY:%.+]] = type { [128 x i8] } +// SEQ-DAG: {{@__omp_offloading_.+}}_l23_exec_mode = weak constant i8 1 +// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4 +// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 template tx ftemplate(int n) { @@ -35,10 +38,10 @@ return a; } - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l20}}_worker() + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l23}}_worker() // CHECK: ret void - // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l20}}() + // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l23}}() // CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() // CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() @@ -48,7 +51,7 @@ // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]] // // CHECK: [[WORKER]] - // CHECK: {{call|invoke}} void {{@__omp_offloading_.+template.+l20}}_worker() + // CHECK: {{call|invoke}} void {{@__omp_offloading_.+template.+l23}}_worker() // CHECK: br label {{%?}}[[EXIT:.+]] // // CHECK: [[CHECK_MASTER]] @@ -75,11 +78,12 @@ // CHECK: ret void // CHECK: define internal void [[PARALLEL]](i32* noalias %{{.+}}, i32* noalias %{{.+}}) - // CHECK: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], - // CHECK: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]], - // CHECK: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* @{{.+}}, i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[SHARED]], i8** addrspacecast (i8* addrspace(3)* [[BUF:@.+]] to i8**)) - // CHECK: [[PTR:%.+]] = load i8*, i8* addrspace(3)* [[BUF]], - // CHECK: [[ADDR:%.+]] = getelementptr inbounds i8, i8* [[PTR]], i{{64|32}} 0 + // SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], + // SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]], + // SEQ: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* @{{.+}}, i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[SHARED]], i8** addrspacecast (i8* addrspace(3)* [[BUF:@.+]] to i8**)) + // SEQ: [[PTR:%.+]] = load i8*, i8* addrspace(3)* [[BUF]], + // SEQ: [[ADDR:%.+]] = getelementptr inbounds i8, i8* [[PTR]], i{{64|32}} 0 + // PAR: [[ADDR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 4, i16 1) // CHECK: [[RD:%.+]] = bitcast i8* [[ADDR]] to [[GLOB_TY:%.+]]* // CHECK: [[I_ADDR:%.+]] = getelementptr inbounds [[GLOB_TY]], [[GLOB_TY]]* [[RD]], i32 0, i32 0 // diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp @@ -1,21 +1,25 @@ // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix CHECK-DIV64 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-DIV32 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix CHECK-DIV64 --check-prefix SEQ +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix CHECK-DIV64 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-DIV32 --check-prefix SEQ +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-DIV32 --check-prefix PAR // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR // expected-no-diagnostics #ifndef HEADER #define HEADER // Check that the execution mode of all 5 target regions on the gpu is set to SPMD Mode. -// CHECK-DAG: {{@__omp_offloading_.+l34}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l40}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l45}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l50}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l58}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l65}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l44}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l49}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l54}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l62}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l69}}_exec_mode = weak constant i8 0 #define N 1000 #define M 10 @@ -76,31 +80,33 @@ return a; } -// CHECK-DAG: [[MEM_TY:%.+]] = type { [128 x i8] } -// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer -// CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null -// CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4 -// CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 +// SEQ-DAG: [[MEM_TY:%.+]] = type { [128 x i8] } +// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer +// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null +// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4 +// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l34( +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l38( // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0) // CHECK: call void [[PARALLEL:@.+]]( // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) // CHECK: define internal void [[PARALLEL]]( -// CHECK: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// CHECK: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]], -// CHECK: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) -// CHECK: [[TEAM_ALLOC:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// CHECK: [[ADDR:%.+]] = getelementptr inbounds i8, i8* [[TEAM_ALLOC]], i{{64|32}} 0 +// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], +// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]], +// SEQ: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) +// SEQ: [[TEAM_ALLOC:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], +// SEQ: [[ADDR:%.+]] = getelementptr inbounds i8, i8* [[TEAM_ALLOC]], i{{64|32}} 0 +// PAR: [[ADDR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 4, i16 1) // CHECK: [[BC:%.+]] = bitcast i8* [[ADDR]] to [[REC:%.+]]* // CHECK: getelementptr inbounds [[REC]], [[REC]]* [[BC]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91, // CHECK: {{call|invoke}} void [[OUTL1:@.+]]( // CHECK: call void @__kmpc_for_static_fini( -// CHECK: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// CHECK: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[SHARED]]) +// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], +// SEQ: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[SHARED]]) +// PAR: call void @__kmpc_data_sharing_pop_stack(i8* [[ADDR]]) // CHECK: ret void // CHECK: define internal void [[OUTL1]]( @@ -233,13 +239,13 @@ // CHECK: call void @__kmpc_for_static_fini( // CHECK: ret void -// CHECK: define weak void @__omp_offloading_{{.*}}_l58(i[[SZ:64|32]] %{{[^,]+}}, [10 x [10 x i32]]* nonnull align {{[0-9]+}} dereferenceable{{.*}}) +// CHECK: define weak void @__omp_offloading_{{.*}}_l62(i[[SZ:64|32]] %{{[^,]+}}, [10 x [10 x i32]]* nonnull align {{[0-9]+}} dereferenceable{{.*}}) // CHECK: call void [[OUTLINED:@__omp_outlined.*]](i32* %{{.+}}, i32* %{{.+}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, [10 x [10 x i32]]* %{{.*}}) // CHECK: define internal void [[OUTLINED]](i32* noalias %{{.*}}, i32* noalias %{{.*}} i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, [10 x [10 x i32]]* nonnull align {{[0-9]+}} dereferenceable{{.*}}) // CHECK-DIV64: div i64 // CHECK-DIV32-NO: div i64 -// CHECK: define weak void @__omp_offloading_{{.*}}_l65(i[[SZ:64|32]] %{{[^,]+}}, [1000 x i32]* nonnull align {{[0-9]+}} dereferenceable{{.*}}, i32* %{{[^)]+}}) +// CHECK: define weak void @__omp_offloading_{{.*}}_l69(i[[SZ:64|32]] %{{[^,]+}}, [1000 x i32]* nonnull align {{[0-9]+}} dereferenceable{{.*}}, i32* %{{[^)]+}}) // CHECK: call void [[OUTLINED:@__omp_outlined.*]](i32* %{{.+}}, i32* %{{.+}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, [1000 x i32]* %{{.*}}, i32* %{{.*}}) // CHECK: define internal void [[OUTLINED]](i32* noalias %{{.*}}, i32* noalias %{{.*}} i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, [1000 x i32]* nonnull align {{[0-9]+}} dereferenceable{{.*}}, i32* %{{.*}}) diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp @@ -1,18 +1,21 @@ // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR // expected-no-diagnostics #ifndef HEADER #define HEADER // Check that the execution mode of all 4 target regions on the gpu is set to SPMD Mode. -// CHECK-DAG: {{@__omp_offloading_.+l30}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l36}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l41}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l46}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l33}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l39}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l44}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l49}}_exec_mode = weak constant i8 0 #define N 1000 #define M 10 @@ -62,29 +65,31 @@ return a; } -// CHECK-DAG: [[MEM_TY:%.+]] = type { [128 x i8] } -// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer -// CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null -// CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4 -// CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 +// SEQ-DAG: [[MEM_TY:%.+]] = type { [128 x i8] } +// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer +// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null +// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4 +// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l30( +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l33( // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0) // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// CHECK: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]], -// CHECK: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) -// CHECK: [[TEAM_ALLOC:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// CHECK: [[PTR:%.+]] = getelementptr inbounds i8, i8* [[TEAM_ALLOC]], i{{64|32}} 0 +// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], +// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]], +// SEQ: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) +// SEQ: [[TEAM_ALLOC:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], +// SEQ: [[PTR:%.+]] = getelementptr inbounds i8, i8* [[TEAM_ALLOC]], i{{64|32}} 0 +// PAR: [[PTR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 4, i16 1) // CHECK: [[BC:%.+]] = bitcast i8* [[PTR]] to [[REC:%.+]]* // CHECK: getelementptr inbounds [[REC]], [[REC]]* [[BC]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91, // CHECK: {{call|invoke}} void [[OUTL1:@.+]]( // CHECK: call void @__kmpc_for_static_fini( -// CHECK: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// CHECK: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[SHARED]]) +// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], +// SEQ: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[SHARED]]) +// PAR: call void @__kmpc_data_sharing_pop_stack(i8* [[PTR]]) // CHECK: ret void // CHECK: define internal void [[OUTL1]]( diff --git a/clang/test/OpenMP/nvptx_teams_codegen.cpp b/clang/test/OpenMP/nvptx_teams_codegen.cpp --- a/clang/test/OpenMP/nvptx_teams_codegen.cpp +++ b/clang/test/OpenMP/nvptx_teams_codegen.cpp @@ -1,8 +1,10 @@ // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64 +// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64 --check-prefix SEQ +// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CK1 --check-prefix CK1-64 --check-prefix PAR // RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32 +// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32 --check-prefix SEQ +// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CK1 --check-prefix CK1-32 --check-prefix PAR // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -27,13 +29,13 @@ return tmain(argv); } -// CK1: [[MEM_TY:%.+]] = type { [128 x i8] } -// CK1-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer -// CK1-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null -// CK1-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} 4 -// CK1-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} {{8|4}} -// CK1-DAG: [[KERNEL_SHARED1:@.+]] = internal unnamed_addr constant i16 1 -// CK1-DAG: [[KERNEL_SHARED2:@.+]] = internal unnamed_addr constant i16 1 +// SEQ: [[MEM_TY:%.+]] = type { [128 x i8] } +// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer +// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null +// SEQ-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} 4 +// SEQ-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} {{8|4}} +// SEQ-DAG: [[KERNEL_SHARED1:@.+]] = internal unnamed_addr constant i16 1 +// SEQ-DAG: [[KERNEL_SHARED2:@.+]] = internal unnamed_addr constant i16 1 // only nvptx side: do not outline teams region and do not call fork_teams // CK1: define {{.*}}void @{{[^,]+}}(i{{[0-9]+}} [[ARGC:%.+]]) @@ -41,11 +43,12 @@ // CK1: store {{.+}} 0, {{.+}}, // CK1: store i{{[0-9]+}} [[ARGC]], i{{[0-9]+}}* [[ARGCADDR]], // CK1-64: [[CONV:%.+]] = bitcast i{{[0-9]+}}* [[ARGCADDR]] to i{{[0-9]+}}* -// CK1: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED1]], -// CK1: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE1]], -// CK1: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) -// CK1: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// CK1: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0 +// SEQ: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED1]], +// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE1]], +// SEQ: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) +// SEQ: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], +// SEQ: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0 +// PAR: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 4, i16 1) // CK1-64: [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[CONV]] // CK1-32: [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[ARGCADDR]] // CK1: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0 @@ -62,11 +65,12 @@ // CK1: define {{.*}}void @{{[^,]+}}(i{{.+}}** [[ARGC:%.+]]) // CK1: [[ARGCADDR:%.+]] = alloca i{{.+}}**, // CK1: store i{{.+}}** [[ARGC]], i{{.+}}*** [[ARGCADDR]] -// CK1: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED2]], -// CK1: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE2]], -// CK1: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) -// CK1: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// CK1: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0 +// SEQ: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED2]], +// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE2]], +// SEQ: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) +// SEQ: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], +// SEQ: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0 +// PAR: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} {{4|8}}, i16 1) // CK1: [[ARG:%.+]] = load i{{[0-9]+}}**, i{{[0-9]+}}*** [[ARGCADDR]] // CK1: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0 // CK1: store i{{[0-9]+}}** [[ARG]], i{{[0-9]+}}*** [[ARGCADDR]], @@ -83,9 +87,11 @@ // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64 +// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64 --check-prefix SEQ2 +// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CK2 --check-prefix CK2-64 --check-prefix PAR2 // RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-32 +// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-32 --check-prefix SEQ2 +// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CK2 --check-prefix CK2-32 --check-prefix PAR2 // expected-no-diagnostics #ifdef CK2 @@ -112,13 +118,13 @@ return tmain(argv); } -// CK2: [[MEM_TY:%.+]] = type { [128 x i8] } -// CK2-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer -// CK2-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null -// CK2-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} 4 -// CK2-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} {{8|4}} -// CK2-DAG: [[KERNEL_SHARED1:@.+]] = internal unnamed_addr constant i16 1 -// CK2-DAG: [[KERNEL_SHARED2:@.+]] = internal unnamed_addr constant i16 1 +// SEQ2: [[MEM_TY:%.+]] = type { [128 x i8] } +// SEQ2-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer +// SEQ2-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null +// SEQ2-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} 4 +// SEQ2-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} {{8|4}} +// SEQ2-DAG: [[KERNEL_SHARED1:@.+]] = internal unnamed_addr constant i16 1 +// SEQ2-DAG: [[KERNEL_SHARED2:@.+]] = internal unnamed_addr constant i16 1 // CK2: define {{.*}}void @{{[^,]+}}(i{{[0-9]+}} [[A_IN:%.+]], i{{[0-9]+}} [[B_IN:%.+]], i{{[0-9]+}} [[ARGC_IN:.+]]) // CK2: [[AADDR:%.+]] = alloca i{{[0-9]+}}, @@ -130,11 +136,12 @@ // CK2-64: [[ACONV:%.+]] = bitcast i64* [[AADDR]] to i32* // CK2-64: [[BCONV:%.+]] = bitcast i64* [[BADDR]] to i32* // CK2-64: [[CONV:%.+]] = bitcast i64* [[ARGCADDR]] to i32* -// CK2: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED1]], -// CK2: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE1]], -// CK2: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) -// CK2: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// CK2: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0 +// SEQ2: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED1]], +// SEQ2: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE1]], +// SEQ2: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) +// SEQ2: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], +// SEQ2: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0 +// PAR2: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 4, i16 1) // CK2-64: [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[CONV]] // CK2-32: [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[ARGCADDR]] // CK2: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0 @@ -155,11 +162,12 @@ // CK2: store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[AADDR]], // CK2: store i{{[0-9]+}} [[B_IN]], i{{[0-9]+}}* [[BADDR]], // CK2: store i{{[0-9]+}}** [[ARGC]], i{{[0-9]+}}*** [[ARGCADDR]], -// CK2: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED2]], -// CK2: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE2]], -// CK2: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) -// CK2: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// CK2: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0 +// SEQ2: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED2]], +// SEQ2: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE2]], +// SEQ2: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) +// SEQ2: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], +// SEQ2: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0 +// PAR2: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} {{4|8}}, i16 1) // CK2: [[ARG:%.+]] = load i{{[0-9]+}}**, i{{[0-9]+}}*** [[ARGCADDR]] // CK2: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0 // CK2: store i{{[0-9]+}}** [[ARG]], i{{[0-9]+}}*** [[ARGCADDR]], diff --git a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp --- a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp +++ b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp @@ -1,9 +1,12 @@ // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -fopenmp-cuda-teams-reduction-recs-num=2048 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -fopenmp-cuda-teams-reduction-recs-num=2048 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -fopenmp-cuda-teams-reduction-recs-num=2048 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -12,21 +15,21 @@ // CHECK-DAG: [[TEAM2_REDUCE_TY:%.+]] = type { [{{1024|2048}} x i8], [{{1024|2048}} x float] } // CHECK-DAG: [[TEAM3_REDUCE_TY:%.+]] = type { [{{1024|2048}} x i32], [{{1024|2048}} x i16] } // CHECK-DAG: [[TEAMS_REDUCE_UNION_TY:%.+]] = type { [[TEAM1_REDUCE_TY]] } -// CHECK-DAG: [[MAP_TY:%.+]] = type { [128 x i8] } +// SEQ-DAG: [[MAP_TY:%.+]] = type { [128 x i8] } -// CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null -// CHECK-DAG: [[KERNEL_SHARED1:@.+]] = internal unnamed_addr constant i16 1 -// CHECK-DAG: [[KERNEL_SHARED2:@.+]] = internal unnamed_addr constant i16 1 -// CHECK-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} {{16|8}} -// CHECK-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} 16 +// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null +// SEQ-DAG: [[KERNEL_SHARED1:@.+]] = internal unnamed_addr constant i16 1 +// SEQ-DAG: [[KERNEL_SHARED2:@.+]] = internal unnamed_addr constant i16 1 +// SEQ-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} {{16|8}} +// SEQ-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} 16 // Check for the data transfer medium in shared memory to transfer the reduction list to the first warp. // CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = common addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i32] // Check that the execution mode of 2 target regions is set to Non-SPMD and the 3rd is in SPMD. -// CHECK-DAG: {{@__omp_offloading_.+l41}}_exec_mode = weak constant i8 1 -// CHECK-DAG: {{@__omp_offloading_.+l47}}_exec_mode = weak constant i8 1 -// CHECK-DAG: {{@__omp_offloading_.+l54}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l44}}_exec_mode = weak constant i8 1 +// CHECK-DAG: {{@__omp_offloading_.+l50}}_exec_mode = weak constant i8 1 +// CHECK-DAG: {{@__omp_offloading_.+l57}}_exec_mode = weak constant i8 0 // CHECK-DAG: [[TEAMS_RED_BUFFER:@.+]] = internal global [[TEAMS_REDUCE_UNION_TY]] zeroinitializer @@ -70,9 +73,9 @@ return a; } - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l41}}_worker() + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l44}}_worker() - // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+template.+l41]]( + // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+template.+l44]]( // // CHECK: {{call|invoke}} void [[T1]]_worker() // @@ -337,9 +340,9 @@ // CHECK: call void [[REDUCTION_FUNC]](i8* [[RL_BC]], i8* [[LOCAL_RL_BC]]) // CHECK: ret void - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l47}}_worker() + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l50}}_worker() - // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+template.+l47]]( + // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+template.+l50]]( // // CHECK: {{call|invoke}} void [[T2]]_worker() @@ -704,13 +707,13 @@ // CHECK: call void [[REDUCTION_FUNC]](i8* [[RL_BC]], i8* [[LOCAL_RL_BC]]) // CHECK: ret void - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l54}}( + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l57}}( // // CHECK: call void @__kmpc_spmd_kernel_init( // CHECK: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) - // CHECK-NOT: call void @__kmpc_get_team_static_memory + // CHECK-NOT: call void @{{__kmpc_get_team_static_memory|__kmpc_data_sharing_push_stack}} // CHECK: store i32 0, // CHECK: store i32 0, // CHECK: store i32 0, i32* [[A_ADDR:%.+]], align