diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -237,6 +237,7 @@ LANGOPT(OpenMPCUDAReductionBufNum , 32, 1024, "Number of the reduction records in the intermediate reduction buffer used for the teams reductions.") LANGOPT(OpenMPOptimisticCollapse , 1, 0, "Use at most 32 bits to represent the collapsed loop nest counter.") LANGOPT(OpenMPCUDATargetParallel, 1, 0, "Support parallel execution of target region on Cuda-based devices.") +LANGOPT(OpenMPCUDAConstFirstprivate, 1, 0, "Place firstprivate const variables in the constant address space.") LANGOPT(RenderScript , 1, 0, "RenderScript") LANGOPT(CUDAIsDevice , 1, 0, "compiling for CUDA device") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2265,6 +2265,12 @@ def fno_openmp_cuda_parallel_target_regions : Flag<["-"], "fno-openmp-cuda-parallel-target-regions">, Group<f_Group>, Flags<[NoArgumentUnused, HelpHidden]>, HelpText<"Support only serial execution of target regions on Cuda-based devices.">; +def fopenmp_cuda_const_firstprivate : Flag<["-"], "fopenmp-cuda-const-firstprivate">, Group<f_Group>, + Flags<[CC1Option, NoArgumentUnused, HelpHidden]>, + HelpText<"Allow placement of firstprivate const variables in the constant address space.">; +def fno_openmp_cuda_const_firstprivate : Flag<["-"], "fno-openmp-cuda-const-firstprivate">, Group<f_Group>, + Flags<[NoArgumentUnused, HelpHidden]>, + HelpText<"Do not allow placement of firstprivate const variables in the constant address space.">; def static_openmp: Flag<["-"], "static-openmp">, HelpText<"Use the static host OpenMP runtime while linking.">; def fno_optimize_sibling_calls : Flag<["-"], "fno-optimize-sibling-calls">, Group<f_Group>; diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp 
b/clang/lib/CodeGen/CGOpenMPRuntime.cpp --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -9131,7 +9131,8 @@ CombinedInfo.Types.push_back(getMapModifiersForPrivateClauses(CI)); const VarDecl *VD = CI.getCapturedVar(); auto I = FirstPrivateDecls.find(VD); - if (I != FirstPrivateDecls.end() && + if (CGF.getLangOpts().OpenMPCUDAConstFirstprivate && + I != FirstPrivateDecls.end() && VD->getType().isConstant(CGF.getContext())) { llvm::Constant *Addr = CGF.CGM.getOpenMPRuntime().registerTargetFirstprivateCopy(CGF, VD); diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -775,6 +775,7 @@ return false; bool DeviceConstTarget = getLangOpts().OpenMPIsDevice && + getLangOpts().OpenMPCUDAConstFirstprivate && isOpenMPTargetExecutionDirective(D.getDirectiveKind()); bool FirstprivateIsLastprivate = false; llvm::DenseMap Lastprivates; diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5571,6 +5571,13 @@ options::OPT_fno_openmp_cuda_mode, /*Default=*/false)) CmdArgs.push_back("-fopenmp-cuda-mode"); + // When in OpenMP offloading mode with NVPTX target, forward + // cuda-const-firstprivate flag + if (Args.hasFlag(options::OPT_fopenmp_cuda_const_firstprivate, + options::OPT_fno_openmp_cuda_const_firstprivate, + /*Default=*/false)) + CmdArgs.push_back("-fopenmp-cuda-const-firstprivate"); + // When in OpenMP offloading mode with NVPTX target, forward // cuda-parallel-target-regions flag if (Args.hasFlag(options::OPT_fopenmp_cuda_parallel_target_regions, diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -3448,6 +3448,9 @@ if (Opts.OpenMPCUDAMode) 
GenerateArg(Args, OPT_fopenmp_cuda_mode, SA); + if (Opts.OpenMPCUDAConstFirstprivate) + GenerateArg(Args, OPT_fopenmp_cuda_const_firstprivate, SA); + if (Opts.OpenMPCUDATargetParallel) GenerateArg(Args, OPT_fopenmp_cuda_parallel_target_regions, SA); @@ -3843,6 +3846,9 @@ // Set CUDA mode for OpenMP target NVPTX/AMDGCN if specified in options Opts.OpenMPCUDAMode = Opts.OpenMPIsDevice && (T.isNVPTX() || T.isAMDGCN()) && Args.hasArg(options::OPT_fopenmp_cuda_mode); + Opts.OpenMPCUDAConstFirstprivate = + Opts.OpenMPIsDevice && (T.isNVPTX() || T.isAMDGCN()) && + Args.hasArg(options::OPT_fopenmp_cuda_const_firstprivate); // Set CUDA support for parallel execution of target regions for OpenMP target // NVPTX/AMDGCN if specified in options. diff --git a/clang/test/Driver/openmp-offload-gpu.c b/clang/test/Driver/openmp-offload-gpu.c --- a/clang/test/Driver/openmp-offload-gpu.c +++ b/clang/test/Driver/openmp-offload-gpu.c @@ -281,6 +281,26 @@ // RUN: | FileCheck -check-prefix=NO_CUDA_MODE %s // NO_CUDA_MODE-NOT: "-{{fno-|f}}openmp-cuda-mode" +// RUN: %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_60 %s -fopenmp-cuda-const-firstprivate 2>&1 \ +// RUN: | FileCheck -check-prefix=CUDA_CONST_FP %s +// RUN: %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_60 %s -fno-openmp-cuda-const-firstprivate -fopenmp-cuda-const-firstprivate 2>&1 \ +// RUN: | FileCheck -check-prefix=CUDA_CONST_FP %s +// RUN: %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target -march=gfx906 %s -fopenmp-cuda-const-firstprivate 2>&1 \ +// RUN: | FileCheck -check-prefix=CUDA_CONST_FP %s +// RUN: %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target -march=gfx906 %s -fno-openmp-cuda-const-firstprivate -fopenmp-cuda-const-firstprivate 2>&1 \ +// RUN: | FileCheck -check-prefix=CUDA_CONST_FP 
%s +// CUDA_CONST_FP: clang{{.*}}"-cc1"{{.*}}"-triple" "{{nvptx64-nvidia-cuda|amdgcn-amd-amdhsa}}" +// CUDA_CONST_FP-SAME: "-fopenmp-cuda-const-firstprivate" +// RUN: %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_60 %s -fno-openmp-cuda-const-firstprivate 2>&1 \ +// RUN: | FileCheck -check-prefix=NO_CUDA_CONST_FP %s +// RUN: %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_60 %s -fopenmp-cuda-const-firstprivate -fno-openmp-cuda-const-firstprivate 2>&1 \ +// RUN: | FileCheck -check-prefix=NO_CUDA_CONST_FP %s +// RUN: %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target -march=gfx906 %s -fno-openmp-cuda-const-firstprivate 2>&1 \ +// RUN: | FileCheck -check-prefix=NO_CUDA_CONST_FP %s +// RUN: %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target -march=gfx906 %s -fopenmp-cuda-const-firstprivate -fno-openmp-cuda-const-firstprivate 2>&1 \ +// RUN: | FileCheck -check-prefix=NO_CUDA_CONST_FP %s +// NO_CUDA_CONST_FP-NOT: "-{{fno-|f}}openmp-cuda-const-firstprivate" + // RUN: %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_60 %s -fopenmp-cuda-force-full-runtime 2>&1 \ // RUN: | FileCheck -check-prefix=FULL_RUNTIME %s // RUN: %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_60 %s -fno-openmp-cuda-force-full-runtime -fopenmp-cuda-force-full-runtime 2>&1 \ diff --git a/clang/test/OpenMP/nvptx_target_firstprivate_codegen.cpp b/clang/test/OpenMP/nvptx_target_firstprivate_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_firstprivate_codegen.cpp @@ -1,8 +1,10 @@ // Test target codegen - host bc file has to be created first. 
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -debug-info-kind=limited -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64 +// RUN: %clang_cc1 -debug-info-kind=limited -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64 --check-prefix TCHECK-NONCONST +// RUN: %clang_cc1 -debug-info-kind=limited -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-cuda-const-firstprivate -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64 --check-prefix TCHECK-CONST // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -debug-info-kind=limited -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32 +// RUN: %clang_cc1 -debug-info-kind=limited -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32 --check-prefix TCHECK-NONCONST +// RUN: %clang_cc1 -debug-info-kind=limited -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device 
-fopenmp-cuda-const-firstprivate -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32 --check-prefix TCHECK-CONST // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -17,7 +19,8 @@ // TCHECK-DAG: [[TTII:%.+]] = type { i32, i32 } // TCHECK-DAG: [[S1:%.+]] = type { double } -// TCHECK: @__omp_offloading_firstprivate__{{.+}}_e_l27 = internal addrspace(4) global [[TTII]] zeroinitializer +// TCHECK-NONCONST-NOT: @__omp_offloading_firstprivate__{{.+}}_e_l30 +// TCHECK-CONST: @__omp_offloading_firstprivate__{{.+}}_e_l30 = internal addrspace(4) global [[TTII]] zeroinitializer int foo(int n, double *ptr) { int a = 0; short aa = 0; @@ -36,7 +39,8 @@ // TCHECK: define {{.*}}void @__omp_offloading_{{.+}}([10 x float] addrspace(1)* noalias [[B_IN:%.+]], i{{[0-9]+}} [[A_IN:%.+]], [[TTII]]* noalias [[E_IN:%.+]]) // TCHECK-NOT: alloca [[TTII]], // TCHECK: [[A_ADDR:%.+]] = alloca i{{[0-9]+}}, - // TCHECK-NOT: alloca [[TTII]], + // TCHECK-NONCONST: alloca [[TTII]], + // TCHECK-CONST-NOT: alloca [[TTII]], // TCHECK-NOT: alloca i{{[0-9]+}}, // TCHECK-64: call void @llvm.dbg.declare(metadata [10 x float] addrspace(1)** %{{.+}}, metadata !{{[0-9]+}}, metadata !DIExpression()) // TCHECK: store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[A_ADDR]],