Index: docs/OpenMPSupport.rst =================================================================== --- docs/OpenMPSupport.rst +++ docs/OpenMPSupport.rst @@ -108,6 +108,16 @@ between the threads and it is user responsibility to share the required data between the threads in the parallel regions. +Collapsed loop nest counter +--------------------------- + +When using the collapse clause on a loop nest the default behaviour is to +automatically extend the representation of the loop counter to 64 bits for +the cases where the sizes of the collapsed loops are not known at compile +time. To prevent this conservative choice and use at most 32 bits, +compile your program with the `-fopenmp-optimistic-collapse`. + + Features not supported or with limited support for Cuda devices --------------------------------------------------------------- Index: include/clang/Basic/LangOptions.def =================================================================== --- include/clang/Basic/LangOptions.def +++ include/clang/Basic/LangOptions.def @@ -207,6 +207,7 @@ LANGOPT(OpenMPHostCXXExceptions , 1, 0, "C++ exceptions handling in the host code.") LANGOPT(OpenMPCUDANumSMs , 32, 0, "Number of SMs for CUDA devices.") LANGOPT(OpenMPCUDABlocksPerSM , 32, 0, "Number of blocks per SM for CUDA devices.") +LANGOPT(OpenMPOptimisticCollapse , 1, 0, "Use at most 32 bits to represent the collapsed loop nest counter.") LANGOPT(RenderScript , 1, 0, "RenderScript") LANGOPT(CUDAIsDevice , 1, 0, "compiling for CUDA device") Index: include/clang/Driver/Options.td =================================================================== --- include/clang/Driver/Options.td +++ include/clang/Driver/Options.td @@ -1574,6 +1574,10 @@ Flags<[CC1Option, NoArgumentUnused, HelpHidden]>; def fopenmp_cuda_blocks_per_sm_EQ : Joined<["-"], "fopenmp-cuda-blocks-per-sm=">, Group, Flags<[CC1Option, NoArgumentUnused, HelpHidden]>; +def fopenmp_optimistic_collapse : Flag<["-"], "fopenmp-optimistic-collapse">, Group, + Flags<[CC1Option, NoArgumentUnused, HelpHidden]>; +def fno_openmp_optimistic_collapse : Flag<["-"], "fno-openmp-optimistic-collapse">, Group, + Flags<[NoArgumentUnused, HelpHidden]>; def fno_optimize_sibling_calls : Flag<["-"], "fno-optimize-sibling-calls">, Group; def foptimize_sibling_calls : Flag<["-"], "foptimize-sibling-calls">, Group; def fno_escaping_block_tail_calls : Flag<["-"], "fno-escaping-block-tail-calls">, Group, Flags<[CC1Option]>; Index: lib/Driver/ToolChains/Clang.cpp =================================================================== --- lib/Driver/ToolChains/Clang.cpp +++ lib/Driver/ToolChains/Clang.cpp @@ -4434,6 +4434,10 @@ Args.AddAllArgs(CmdArgs, options::OPT_fopenmp_version_EQ); Args.AddAllArgs(CmdArgs, options::OPT_fopenmp_cuda_number_of_sm_EQ); Args.AddAllArgs(CmdArgs, options::OPT_fopenmp_cuda_blocks_per_sm_EQ); + if (Args.hasFlag(options::OPT_fopenmp_optimistic_collapse, + options::OPT_fno_openmp_optimistic_collapse, + /*Default=*/false)) + CmdArgs.push_back("-fopenmp-optimistic-collapse"); // When in OpenMP offloading mode with NVPTX target, forward // cuda-mode flag Index: lib/Frontend/CompilerInvocation.cpp =================================================================== --- lib/Frontend/CompilerInvocation.cpp +++ lib/Frontend/CompilerInvocation.cpp @@ -2846,6 +2846,11 @@ Opts.OpenMPCUDABlocksPerSM, Diags); } + // Prevent auto-widening the representation of loop counters during an + // OpenMP collapse clause. + Opts.OpenMPOptimisticCollapse = + Args.hasArg(options::OPT_fopenmp_optimistic_collapse) ? 1 : 0; + // Get the OpenMP target triples if any. if (Arg *A = Args.getLastArg(options::OPT_fopenmp_targets_EQ)) { Index: lib/Sema/SemaOpenMP.cpp =================================================================== --- lib/Sema/SemaOpenMP.cpp +++ lib/Sema/SemaOpenMP.cpp @@ -4136,7 +4136,7 @@ if (!TestIsLessOp.hasValue()) TestIsLessOp = IsConstPos || (IsUnsigned && !Subtract); if (UB && (IsConstZero || - (TestIsLessOp.getValue() ? + (TestIsLessOp.getValue() ? (IsConstNeg || (IsUnsigned && Subtract)) : (IsConstPos || (IsUnsigned && !Subtract))))) { SemaRef.Diag(NewStep->getExprLoc(), @@ -4311,7 +4311,7 @@ Op == OO_Less || Op == OO_Greater, CE->getSourceRange(), CE->getOperatorLoc()); break; - case OO_ExclaimEqual: + case OO_ExclaimEqual: return setUB(getInitLCDecl(CE->getArg(0)) == LCDecl ? CE->getArg(1) : CE->getArg(0), /*LessOp=*/llvm::None, @@ -4569,7 +4569,7 @@ ExprResult CondExpr = SemaRef.BuildBinOp(S, DefaultLoc, - TestIsLessOp.getValue() ? + TestIsLessOp.getValue() ? (TestIsStrictOp ? BO_LT : BO_LE) : (TestIsStrictOp ? BO_GT : BO_GE), NewLB.get(), NewUB.get()); @@ -5270,13 +5270,14 @@ // Choose either the 32-bit or 64-bit version. ExprResult LastIteration = LastIteration64; - if (LastIteration32.isUsable() && - C.getTypeSize(LastIteration32.get()->getType()) == 32 && - (AllCountsNeedLessThan32Bits || NestedLoopCount == 1 || - fitsInto( - /*Bits=*/32, - LastIteration32.get()->getType()->hasSignedIntegerRepresentation(), - LastIteration64.get(), SemaRef))) + if (SemaRef.getLangOpts().OpenMPOptimisticCollapse || + (LastIteration32.isUsable() && + C.getTypeSize(LastIteration32.get()->getType()) == 32 && + (AllCountsNeedLessThan32Bits || NestedLoopCount == 1 || + fitsInto( + /*Bits=*/32, + LastIteration32.get()->getType()->hasSignedIntegerRepresentation(), + LastIteration64.get(), SemaRef)))) LastIteration = LastIteration32; QualType VType = LastIteration.get()->getType(); QualType RealVType = VType; @@ -13116,7 +13117,7 @@ continue; } assert(Count < OMPMapClause::NumberOfModifiers && - "Modifiers exceed the allowed number of map type modifiers"); + "Modifiers exceed the allowed number of map type modifiers"); Modifiers[Count] = MapTypeModifiers[I]; ModifiersLoc[Count] = MapTypeModifiersLoc[I]; ++Count; Index: test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp =================================================================== --- test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp +++ test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp @@ -1,6 +1,7 @@ // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix CHECK-DIV64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-DIV32 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 @@ -9,11 +10,12 @@ #define HEADER // Check that the execution mode of all 5 target regions on the gpu is set to SPMD Mode. -// CHECK-DAG: {{@__omp_offloading_.+l32}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l43}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l48}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l56}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l34}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l40}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l45}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l50}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l58}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l65}}_exec_mode = weak constant i8 0 #define N 1000 #define M 10 @@ -80,7 +82,7 @@ // CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4 // CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l32( +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l34( // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0) // CHECK: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} 4, i16 1, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) @@ -214,16 +216,19 @@ // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) // CHECK: ret void -// CHECK: define internal void [[OUTL4]]( +// CHECK-32: define internal void [[OUTL4]]( +// CHECK-64: define internal void [[OUTL4]]( // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 33, // CHECK: call void @__kmpc_for_static_fini( // CHECK: ret void -// CHECK: define weak void @__omp_offloading_{{.*}}_l56(i[[SZ:64|32]] %{{[^,]+}}, [10 x [10 x i32]]* dereferenceable{{.*}}) +// CHECK: define weak void @__omp_offloading_{{.*}}_l58(i[[SZ:64|32]] %{{[^,]+}}, [10 x [10 x i32]]* dereferenceable{{.*}}) // CHECK: call void [[OUTLINED:@__omp_outlined.*]](i32* %{{.+}}, i32* %{{.+}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, [10 x [10 x i32]]* %{{.*}}) // CHECK: define internal void [[OUTLINED]](i32* noalias %{{.*}}, i32* noalias %{{.*}} i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, [10 x [10 x i32]]* dereferenceable{{.*}}) +// CHECK-DIV64: div i64 +// CHECK-DIV32-NO: div i64 -// CHECK: define weak void @__omp_offloading_{{.*}}_l63(i[[SZ:64|32]] %{{[^,]+}}, [1000 x i32]* dereferenceable{{.*}}, i32* %{{[^)]+}}) +// CHECK: define weak void @__omp_offloading_{{.*}}_l65(i[[SZ:64|32]] %{{[^,]+}}, [1000 x i32]* dereferenceable{{.*}}, i32* %{{[^)]+}}) // CHECK: call void [[OUTLINED:@__omp_outlined.*]](i32* %{{.+}}, i32* %{{.+}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, [1000 x i32]* %{{.*}}, i32* %{{.*}}) // CHECK: define internal void [[OUTLINED]](i32* noalias %{{.*}}, i32* noalias %{{.*}} i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, [1000 x i32]* dereferenceable{{.*}}, i32* %{{.*}})