diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -237,6 +237,7 @@ LANGOPT(OpenMPCUDAReductionBufNum , 32, 1024, "Number of the reduction records in the intermediate reduction buffer used for the teams reductions.") LANGOPT(OpenMPOptimisticCollapse , 1, 0, "Use at most 32 bits to represent the collapsed loop nest counter.") LANGOPT(OpenMPCUDATargetParallel, 1, 0, "Support parallel execution of target region on Cuda-based devices.") +LANGOPT(OpenMPCUDAConstFirstprivate, 1, 0, "Place firstprivate const variables in the constant address space.") LANGOPT(RenderScript , 1, 0, "RenderScript") LANGOPT(CUDAIsDevice , 1, 0, "compiling for CUDA device") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2286,6 +2286,12 @@ def fno_openmp_cuda_parallel_target_regions : Flag<["-"], "fno-openmp-cuda-parallel-target-regions">, Group, Flags<[NoArgumentUnused, HelpHidden]>, HelpText<"Support only serial execution of target regions on Cuda-based devices.">; +def fopenmp_cuda_const_firstprivate : Flag<["-"], "fopenmp-cuda-const-firstprivate">, Group, + Flags<[CC1Option, NoArgumentUnused, HelpHidden]>, + HelpText<"Allow placement of firstprivate const variables in the constant address space.">; +def fno_openmp_cuda_const_firstprivate : Flag<["-"], "fno-openmp-cuda-const-firstprivate">, Group, + Flags<[NoArgumentUnused, HelpHidden]>, + HelpText<"Do not allow placement of firstprivate const variables in the constant address space.">; def static_openmp: Flag<["-"], "static-openmp">, HelpText<"Use the static host OpenMP runtime while linking.">; def fno_optimize_sibling_calls : Flag<["-"], "fno-optimize-sibling-calls">, Group; diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -9161,7 +9161,8 @@ CombinedInfo.Types.push_back(getMapModifiersForPrivateClauses(CI)); const VarDecl *VD = CI.getCapturedVar(); auto I = FirstPrivateDecls.find(VD); - if (I != FirstPrivateDecls.end() && + if (CGF.getLangOpts().OpenMPCUDAConstFirstprivate && + I != FirstPrivateDecls.end() && VD->getType().isConstant(CGF.getContext())) { llvm::Constant *Addr = CGF.CGM.getOpenMPRuntime().registerTargetFirstprivateCopy(CGF, VD); diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -786,6 +786,7 @@ return false; bool DeviceConstTarget = getLangOpts().OpenMPIsDevice && + getLangOpts().OpenMPCUDAConstFirstprivate && isOpenMPTargetExecutionDirective(D.getDirectiveKind()); bool FirstprivateIsLastprivate = false; llvm::DenseMap Lastprivates; diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5623,6 +5623,13 @@ options::OPT_fno_openmp_cuda_mode, /*Default=*/false)) CmdArgs.push_back("-fopenmp-cuda-mode"); + // When in OpenMP offloading mode with NVPTX target, forward + // cuda-const-firstprivate flag + if (Args.hasFlag(options::OPT_fopenmp_cuda_const_firstprivate, + options::OPT_fno_openmp_cuda_const_firstprivate, + /*Default=*/false)) + CmdArgs.push_back("-fopenmp-cuda-const-firstprivate"); + // When in OpenMP offloading mode with NVPTX target, forward // cuda-parallel-target-regions flag if (Args.hasFlag(options::OPT_fopenmp_cuda_parallel_target_regions, diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -3458,6 +3458,9 @@ if (Opts.OpenMPCUDAMode) GenerateArg(Args, OPT_fopenmp_cuda_mode, SA); + if (Opts.OpenMPCUDAConstFirstprivate) + GenerateArg(Args, OPT_fopenmp_cuda_const_firstprivate, SA); + if (Opts.OpenMPCUDATargetParallel) GenerateArg(Args, OPT_fopenmp_cuda_parallel_target_regions, SA); @@ -3871,6 +3874,9 @@ // Set CUDA mode for OpenMP target NVPTX/AMDGCN if specified in options Opts.OpenMPCUDAMode = Opts.OpenMPIsDevice && (T.isNVPTX() || T.isAMDGCN()) && Args.hasArg(options::OPT_fopenmp_cuda_mode); + Opts.OpenMPCUDAConstFirstprivate = + Opts.OpenMPIsDevice && (T.isNVPTX() || T.isAMDGCN()) && + Args.hasArg(options::OPT_fopenmp_cuda_const_firstprivate); // Set CUDA support for parallel execution of target regions for OpenMP target // NVPTX/AMDGCN if specified in options. diff --git a/clang/test/Driver/openmp-offload-gpu.c b/clang/test/Driver/openmp-offload-gpu.c --- a/clang/test/Driver/openmp-offload-gpu.c +++ b/clang/test/Driver/openmp-offload-gpu.c @@ -281,6 +281,26 @@ // RUN: | FileCheck -check-prefix=NO_CUDA_MODE %s // NO_CUDA_MODE-NOT: "-{{fno-|f}}openmp-cuda-mode" +// RUN: %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_60 %s -fopenmp-cuda-const-firstprivate 2>&1 \ +// RUN: | FileCheck -check-prefix=CUDA_CONST_FP %s +// RUN: %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_60 %s -fno-openmp-cuda-const-firstprivate -fopenmp-cuda-const-firstprivate 2>&1 \ +// RUN: | FileCheck -check-prefix=CUDA_CONST_FP %s +// RUN: %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target -march=gfx906 %s -fopenmp-cuda-const-firstprivate 2>&1 \ +// RUN: | FileCheck -check-prefix=CUDA_CONST_FP %s +// RUN: %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target -march=gfx906 %s -fno-openmp-cuda-const-firstprivate -fopenmp-cuda-const-firstprivate 2>&1 \ +// RUN: | FileCheck -check-prefix=CUDA_CONST_FP %s +// CUDA_CONST_FP: clang{{.*}}"-cc1"{{.*}}"-triple" "{{nvptx64-nvidia-cuda|amdgcn-amd-amdhsa}}" +// CUDA_CONST_FP-SAME: "-fopenmp-cuda-const-firstprivate" +// RUN: %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_60 %s -fno-openmp-cuda-const-firstprivate 2>&1 \ +// RUN: | FileCheck -check-prefix=NO_CUDA_CONST_FP %s +// RUN: %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_60 %s -fopenmp-cuda-const-firstprivate -fno-openmp-cuda-const-firstprivate 2>&1 \ +// RUN: | FileCheck -check-prefix=NO_CUDA_CONST_FP %s +// RUN: %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target -march=gfx906 %s -fno-openmp-cuda-const-firstprivate 2>&1 \ +// RUN: | FileCheck -check-prefix=NO_CUDA_CONST_FP %s +// RUN: %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target -march=gfx906 %s -fopenmp-cuda-const-firstprivate -fno-openmp-cuda-const-firstprivate 2>&1 \ +// RUN: | FileCheck -check-prefix=NO_CUDA_CONST_FP %s +// NO_CUDA_CONST_FP-NOT: "-{{fno-|f}}openmp-cuda-const-firstprivate" + // RUN: %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_60 %s -fopenmp-cuda-force-full-runtime 2>&1 \ // RUN: | FileCheck -check-prefix=FULL_RUNTIME %s // RUN: %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_60 %s -fno-openmp-cuda-force-full-runtime -fopenmp-cuda-force-full-runtime 2>&1 \ diff --git a/clang/test/OpenMP/nvptx_target_firstprivate_codegen.cpp b/clang/test/OpenMP/nvptx_target_firstprivate_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_firstprivate_codegen.cpp @@ -1,8 +1,10 @@ // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -debug-info-kind=limited -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64 +// RUN: %clang_cc1 -debug-info-kind=limited -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64 --check-prefix TCHECK-NONCONST +// RUN: %clang_cc1 -debug-info-kind=limited -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-cuda-const-firstprivate -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64 --check-prefix TCHECK-CONST // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -debug-info-kind=limited -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32 +// RUN: %clang_cc1 -debug-info-kind=limited -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32 --check-prefix TCHECK-NONCONST +// RUN: %clang_cc1 -debug-info-kind=limited -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-cuda-const-firstprivate -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32 --check-prefix TCHECK-CONST // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -17,7 +19,8 @@ // TCHECK-DAG: [[TTII:%.+]] = type { i32, i32 } // TCHECK-DAG: [[S1:%.+]] = type { double } -// TCHECK: @__omp_offloading_firstprivate__{{.+}}_e_l27 = internal addrspace(4) global [[TTII]] zeroinitializer +// TCHECK-NONCONST-NOT: @__omp_offloading_firstprivate__{{.+}}_e_l30 +// TCHECK-CONST: @__omp_offloading_firstprivate__{{.+}}_e_l30 = internal addrspace(4) global [[TTII]] zeroinitializer int foo(int n, double *ptr) { int a = 0; short aa = 0; @@ -36,7 +39,8 @@ // TCHECK: define {{.*}}void @__omp_offloading_{{.+}}([10 x float] addrspace(1)* noalias [[B_IN:%.+]], i{{[0-9]+}} [[A_IN:%.+]], [[TTII]]* noalias [[E_IN:%.+]]) // TCHECK-NOT: alloca [[TTII]], // TCHECK: [[A_ADDR:%.+]] = alloca i{{[0-9]+}}, - // TCHECK-NOT: alloca [[TTII]], + // TCHECK-NONCONST: alloca [[TTII]], + // TCHECK-CONST-NOT: alloca [[TTII]], // TCHECK-NOT: alloca i{{[0-9]+}}, // TCHECK-64: call void @llvm.dbg.declare(metadata [10 x float] addrspace(1)** %{{.+}}, metadata !{{[0-9]+}}, metadata !DIExpression()) // TCHECK: store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[A_ADDR]], diff --git a/clang/test/OpenMP/target_firstprivate_codegen.cpp b/clang/test/OpenMP/target_firstprivate_codegen.cpp --- a/clang/test/OpenMP/target_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/target_firstprivate_codegen.cpp @@ -52,7 +52,6 @@ // TCHECK-DAG: [[TTII:%.+]] = type { i32, i32 } // TCHECK-DAG: [[S1:%.+]] = type { double } -// CHECK-DAG: [[FP_E:@__omp_offloading_firstprivate_.+_e_l76]] = internal global [[TTII]] zeroinitializer // CHECK-DAG: [[SIZET:@.+]] = private unnamed_addr constant [2 x i{{32|64}}] [i[[SZ:32|64]] 4, i{{64|32}} {{8|4}}] // CHECK-DAG: [[MAPT:@.+]] = private unnamed_addr constant [2 x i64] [i64 288, i64 49] // CHECK-DAG: [[MAPT2:@.+]] = private unnamed_addr constant [9 x i64] [i64 288, i64 161, i64 800, i64 161, i64 161, i64 800, i64 800, i64 161, i64 161] @@ -336,8 +335,6 @@ } // CHECK: [[PTR_ADDR_REF:%.+]] = load double*, double** [[PTR_ADDR]], - // CHECK: [[E_BC:%.+]] = bitcast [[TTII]]* [[E:%.+]] to i8* - // CHECK: call void @llvm.memcpy.p0i8.p0i8.i{{64|32}}(i8* {{.*}} bitcast ([[TTII]]* [[FP_E]] to i8*), i8* {{.*}} [[E_BC]], i{{64|32}} 8, i1 false) // CHECK: [[BASE_PTR_GEP3_0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BASE_PTR_ARR3]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 // CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[BASE_PTR_GEP3_0]] to double** // CHECK: store double* [[PTR_ADDR_REF]], double** [[BCAST_TOPTR]], @@ -346,7 +343,7 @@ // CHECK: store double* [[PTR_ADDR_REF]], double** [[BCAST_TOPTR]], // CHECK: [[BASE_PTR_GEP3_1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BASE_PTR_ARR3]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 // CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[BASE_PTR_GEP3_1]] to [[TTII]]** - // CHECK: store [[TTII]]* [[FP_E]], [[TTII]]** [[BCAST_TOPTR]], + // CHECK: store [[TTII]]* [[FP_E:%.+]], [[TTII]]** [[BCAST_TOPTR]], // CHECK: [[PTR_GEP3_1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[PTR_ARR3]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 // CHECK: [[BCAST_TOPTR:%.+]] = bitcast i8** [[PTR_GEP3_1]] to [[TTII]]** // CHECK: store [[TTII]]* [[FP_E]], [[TTII]]** [[BCAST_TOPTR]], @@ -356,9 +353,8 @@ // CHECK: {{.+}} = call i32 @__tgt_target_mapper(%struct.ident_t* @{{.+}}, i64 -1, {{.+}}, i32 2, i8** [[BASE_PTR_GEP_ARG3]], i8** [[PTR_GEP_ARG3]], i[[SZ]]* getelementptr inbounds ([2 x i[[SZ]]], [2 x i[[SZ]]]* [[SIZET3]], i32 0, i32 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* [[MAPT3]], i32 0, i32 0), i8** null, i8** null) // TCHECK: define weak void @__omp_offloading_{{.+}}(double* [[PTR_IN:%.+]], [[TTII]]* nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) [[E:%.+]]) - // TCHECK-NOT: alloca [[TTII]], // TCHECK: [[PTR_ADDR:%.+]] = alloca double*, - // TCHECK-NOT: alloca [[TTII]], + // TCHECK: alloca [[TTII]], // TCHECK-NOT: alloca double*, // TCHECK: store double* [[PTR_IN]], double** [[PTR_ADDR]], // TCHECK-NOT: store double* %