Index: lib/CodeGen/CGOpenMPRuntime.h
===================================================================
--- lib/CodeGen/CGOpenMPRuntime.h
+++ lib/CodeGen/CGOpenMPRuntime.h
@@ -1494,6 +1494,30 @@
                                       const VarDecl *NativeParam,
                                       const VarDecl *TargetParam) const;
 
+  /// Gets the default chunk size for the schedule clause.
+  /// \param CGF Current code generation function.
+  /// \param S Loop directive.
+  /// \param ScheduleKind OpenMP schedule type.
+  virtual llvm::Value *
+  getDefaultChunkValue(CodeGenFunction &CGF, const OMPLoopDirective &S,
+                       OpenMPScheduleClauseKind ScheduleKind) const;
+
+  /// Gets the default chunk size for the dist_schedule clause.
+  /// \param CGF Current code generation function.
+  /// \param S Loop directive.
+  /// \param ScheduleKind OpenMP dist_schedule type.
+  virtual llvm::Value *
+  getDefaultChunkValue(CodeGenFunction &CGF, const OMPLoopDirective &S,
+                       OpenMPDistScheduleClauseKind ScheduleKind) const;
+
+  /// Chooses a default value for the schedule clause.
+  virtual void
+  chooseDefaultSchedule(OpenMPScheduleClauseKind *ScheduleKind) const;
+
+  /// Chooses a default value for the dist_schedule clause.
+  virtual void
+  chooseDefaultSchedule(OpenMPDistScheduleClauseKind *ScheduleKind) const;
+
   /// Emits call of the outlined function with the provided arguments,
   /// translating these arguments to correct target-specific arguments.
   virtual void
Index: lib/CodeGen/CGOpenMPRuntime.cpp
===================================================================
--- lib/CodeGen/CGOpenMPRuntime.cpp
+++ lib/CodeGen/CGOpenMPRuntime.cpp
@@ -8947,6 +8947,26 @@
   return Address::invalid();
 }
 
+// The generic runtime has no preferred default: leave the schedule kind
+// unchanged and report no default chunk. Targets override these hooks.
+void CGOpenMPRuntime::chooseDefaultSchedule(
+    OpenMPScheduleClauseKind *ScheduleKind) const {}
+
+void CGOpenMPRuntime::chooseDefaultSchedule(
+    OpenMPDistScheduleClauseKind *ScheduleKind) const {}
+
+llvm::Value *CGOpenMPRuntime::getDefaultChunkValue(
+    CodeGenFunction &CGF, const OMPLoopDirective &S,
+    OpenMPScheduleClauseKind ScheduleKind) const {
+  return nullptr;
+}
+
+llvm::Value *CGOpenMPRuntime::getDefaultChunkValue(
+    CodeGenFunction &CGF, const OMPLoopDirective &S,
+    OpenMPDistScheduleClauseKind ScheduleKind) const {
+  return nullptr;
+}
+
 llvm::Value *CGOpenMPSIMDRuntime::emitParallelOutlinedFunction(
     const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
     OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
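The four hooks above are deliberately inert in the generic runtime; a device runtime overrides them to install its own defaults. A minimal standalone model of that pattern, using simplified stand-in names rather than the real Clang types:

#include <cstdint>
#include <iostream>

enum ScheduleKind { Unknown, Static };

struct GenericRuntime {
  // Generic default: leave the schedule alone and report no chunk.
  virtual void chooseDefaultSchedule(ScheduleKind *Kind) const {}
  virtual std::int64_t getDefaultChunkValue(ScheduleKind) const { return 0; }
  virtual ~GenericRuntime() = default;
};

struct SPMDDeviceRuntime : GenericRuntime {
  // Device default: schedule(static, 1), so consecutive threads execute
  // consecutive iterations and memory accesses coalesce.
  void chooseDefaultSchedule(ScheduleKind *Kind) const override {
    if (*Kind == Unknown)
      *Kind = Static;
  }
  std::int64_t getDefaultChunkValue(ScheduleKind Kind) const override {
    return Kind == Static ? 1 : 0;
  }
};

int main() {
  SPMDDeviceRuntime RT;
  ScheduleKind Kind = Unknown;
  RT.chooseDefaultSchedule(&Kind);
  std::cout << "kind=" << Kind
            << " chunk=" << RT.getDefaultChunkValue(Kind) << "\n";
}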
Index: lib/CodeGen/CGOpenMPRuntimeNVPTX.h
===================================================================
--- lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -340,6 +340,35 @@
   ///
   void functionFinished(CodeGenFunction &CGF) override;
 
+  /// For CUDA, to ensure coalescing, the default schedule is chunked.
+  /// Returns false in the default case to reflect that.
+  bool isStaticNonchunked(OpenMPDistScheduleClauseKind ScheduleKind,
+                          bool Chunked) const override;
+
+  /// Gets the default chunk size.
+  /// \param CGF Current code generation function.
+  /// \param S Loop directive.
+  /// \param ScheduleKind OpenMP schedule type.
+  llvm::Value *
+  getDefaultChunkValue(CodeGenFunction &CGF, const OMPLoopDirective &S,
+                       OpenMPScheduleClauseKind ScheduleKind) const override;
+
+  /// Chooses a default value for the schedule clause.
+  void chooseDefaultSchedule(
+      OpenMPScheduleClauseKind *ScheduleKind) const override;
+
+  /// Creates the runtime function call that initializes the default
+  /// distribute schedule.
+  llvm::Constant *createDistributeDefaultInitFunction(unsigned IVSize,
+                                                      bool IVSigned);
+
+  /// Emits a device-specific call to the distribute static init runtime
+  /// function.
+  void emitDistributeStaticInit(
+      CodeGenFunction &CGF, SourceLocation Loc,
+      OpenMPDistScheduleClauseKind SchedKind,
+      const CGOpenMPRuntime::StaticRTInput &Values) override;
+
 private:
   /// Track the execution mode when codegening directives within a target
   /// region. The appropriate mode (SPMD/NON-SPMD) is set on entry to the
Index: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
===================================================================
--- lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -4019,3 +4019,97 @@
   FunctionGlobalizedDecls.erase(CGF.CurFn);
   CGOpenMPRuntime::functionFinished(CGF);
 }
+
+bool CGOpenMPRuntimeNVPTX::isStaticNonchunked(
+    OpenMPDistScheduleClauseKind ScheduleKind, bool Chunked) const {
+  // For OMPC_DIST_SCHEDULE_unknown in SPMD mode the default becomes a
+  // chunked static schedule. Since the new default is chunked we need to
+  // return false.
+  if (ScheduleKind == OMPC_DIST_SCHEDULE_unknown &&
+      getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
+    return false;
+  return CGOpenMPRuntime::isStaticNonchunked(ScheduleKind, Chunked);
+}
+
+llvm::Value *CGOpenMPRuntimeNVPTX::getDefaultChunkValue(
+    CodeGenFunction &CGF, const OMPLoopDirective &S,
+    OpenMPScheduleClauseKind ScheduleKind) const {
+  // For NVPTX, the default schedule for parallel for uses a chunk size of 1
+  // so that consecutive threads access consecutive iterations (coalescing).
+  if (ScheduleKind == OMPC_SCHEDULE_static &&
+      getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
+    return CGF.Builder.getIntN(
+        CGM.getDataLayout().getTypeAllocSizeInBits(
+            CGF.ConvertType(S.getIterationVariable()->getType())),
+        1);
+  return CGOpenMPRuntime::getDefaultChunkValue(CGF, S, ScheduleKind);
+}
+
+void CGOpenMPRuntimeNVPTX::chooseDefaultSchedule(
+    OpenMPScheduleClauseKind *ScheduleKind) const {
+  if (*ScheduleKind == OMPC_SCHEDULE_unknown &&
+      getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
+    *ScheduleKind = OMPC_SCHEDULE_static;
+}
+
+llvm::Constant *CGOpenMPRuntimeNVPTX::createDistributeDefaultInitFunction(
+    unsigned IVSize, bool IVSigned) {
+  assert((IVSize == 32 || IVSize == 64) &&
+         "IV size is not compatible with the omp runtime");
+  StringRef Name =
+      IVSize == 32 ? (IVSigned ? "__kmpc_distribute_default_init_4"
+                               : "__kmpc_distribute_default_init_4u")
+                   : (IVSigned ? "__kmpc_distribute_default_init_8"
+                               : "__kmpc_distribute_default_init_8u");
+  llvm::Type *ITy = IVSize == 32 ? CGM.Int32Ty : CGM.Int64Ty;
+  auto *PtrTy = llvm::PointerType::getUnqual(ITy);
+  llvm::Type *TypeParams[] = {
+      getIdentTyPointerTy(),                     // loc
+      CGM.Int32Ty,                               // tid
+      CGM.Int32Ty,                               // schedtype
+      llvm::PointerType::getUnqual(CGM.Int32Ty), // p_lastiter
+      PtrTy,                                     // p_lower
+      PtrTy,                                     // p_upper
+      PtrTy,                                     // p_stride
+      ITy,                                       // incr
+      ITy                                        // chunk
+  };
+  auto *FnTy =
+      llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
+  return CGM.CreateRuntimeFunction(FnTy, Name);
+}
+
+void CGOpenMPRuntimeNVPTX::emitDistributeStaticInit(
+    CodeGenFunction &CGF, SourceLocation Loc,
+    OpenMPDistScheduleClauseKind SchedKind,
+    const CGOpenMPRuntime::StaticRTInput &Values) {
+  // When using the default schedule in SPMD mode, more efficient code can
+  // be emitted.
+  if (SchedKind == OMPC_DIST_SCHEDULE_unknown &&
+      getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
+    if (!CGF.HaveInsertPoint())
+      return;
+
+    // Smart default: split the loop into chunks whose size equals the
+    // number of threads in the team, so that only one iteration per
+    // thread can be allocated.
+    llvm::Value *UpdatedLocation =
+        CGOpenMPRuntime::emitUpdateLocation(CGF, Loc);
+    llvm::Constant *DistributeDefaultInitFunction =
+        createDistributeDefaultInitFunction(Values.IVSize, Values.IVSigned);
+
+    llvm::Value *Args[] = {
+        UpdatedLocation,                       // loc (not used)
+        getThreadID(CGF, Loc),                 // tid (not used)
+        CGF.Builder.getInt32(1),               // schedule type (not used)
+        Values.IL.getPointer(),                // &isLastIter (not used)
+        Values.LB.getPointer(),                // &LB
+        Values.UB.getPointer(),                // &UB
+        Values.ST.getPointer(),                // &Stride
+        CGF.Builder.getIntN(Values.IVSize, 1), // incr (not used)
+        CGF.Builder.getIntN(Values.IVSize, 1)  // chunk (not used)
+    };
+    CGF.EmitRuntimeCall(DistributeDefaultInitFunction, Args);
+    return;
+  }
+
+  CGOpenMPRuntime::emitDistributeStaticInit(CGF, Loc, SchedKind, Values);
+}
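For reference, this is the shape of the device runtime entry point that createDistributeDefaultInitFunction binds, reconstructed from the TypeParams array and the Args comments above. The ident_t and kmp_int32 spellings follow the usual OpenMP runtime conventions and are assumptions here, as is the extern "C" linkage:

#include <cstdint>

struct ident_t; // opaque source-location descriptor used by the runtime
using kmp_int32 = std::int32_t;

// 32-bit signed variant; the _4u/_8/_8u variants swap in unsigned and
// 64-bit bound types respectively.
extern "C" void __kmpc_distribute_default_init_4(
    ident_t *loc,          // not used by this entry point
    kmp_int32 tid,         // not used
    kmp_int32 schedtype,   // not used
    kmp_int32 *p_lastiter, // not used
    kmp_int32 *p_lower,    // in/out: lower bound of this team's chunk
    kmp_int32 *p_upper,    // in/out: upper bound of this team's chunk
    kmp_int32 *p_stride,   // out: stride between chunks
    kmp_int32 incr,        // not used
    kmp_int32 chunk);      // not used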
Index: lib/CodeGen/CGStmtOpenMP.cpp
===================================================================
--- lib/CodeGen/CGStmtOpenMP.cpp
+++ lib/CodeGen/CGStmtOpenMP.cpp
@@ -2300,7 +2300,9 @@
   // Detect the loop schedule kind and chunk.
   llvm::Value *Chunk = nullptr;
   OpenMPScheduleTy ScheduleKind;
-  if (const auto *C = S.getSingleClause<OMPScheduleClause>()) {
+  const auto *C = S.getSingleClause<OMPScheduleClause>();
+  if (C) {
+    // A schedule clause is present.
     ScheduleKind.Schedule = C->getScheduleKind();
     ScheduleKind.M1 = C->getFirstScheduleModifier();
     ScheduleKind.M2 = C->getSecondScheduleModifier();
@@ -2310,7 +2312,13 @@
           S.getIterationVariable()->getType(), S.getBeginLoc());
     }
+  } else {
+    // No schedule clause: let the target runtime choose sensible defaults.
+    CGM.getOpenMPRuntime().chooseDefaultSchedule(&ScheduleKind.Schedule);
+    Chunk = CGM.getOpenMPRuntime().getDefaultChunkValue(
+        *this, S, ScheduleKind.Schedule);
   }
+
   const unsigned IVSize = getContext().getTypeSize(IVExpr->getType());
   const bool IVSigned = IVExpr->getType()->hasSignedIntegerRepresentation();
   // OpenMP 4.5, 2.7.1 Loop Construct, Description.
@@ -3326,6 +3334,7 @@
           S.getBeginLoc());
     }
   }
+
   const unsigned IVSize = getContext().getTypeSize(IVExpr->getType());
   const bool IVSigned = IVExpr->getType()->hasSignedIntegerRepresentation();
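To make the CGStmtOpenMP change concrete, here is how it behaves on a kernel like the ones in the tests below; the function name and the array-section map are illustrative only:

void foo(int *aa, int n) {
  // No schedule clause: on NVPTX in SPMD mode this now compiles as if
  // schedule(static, 1) had been written.
  #pragma omp target teams distribute parallel for map(tofrom: aa[0:n])
  for (int i = 0; i < n; i++)
    aa[i] += 1;

  // An explicit clause is honored unchanged; only the absent-clause
  // default is affected by this patch.
  #pragma omp target teams distribute parallel for schedule(dynamic) map(tofrom: aa[0:n])
  for (int i = 0; i < n; i++)
    aa[i] += 1;
}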
Index: test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
===================================================================
--- test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
+++ test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
@@ -36,7 +36,7 @@
     l = i;
   }
 
-  #pragma omp target teams distribute parallel for map(tofrom: aa) num_teams(M) thread_limit(64)
+#pragma omp target teams distribute parallel for map(tofrom: aa) num_teams(M) thread_limit(64)
   for(int i = 0; i < n; i++) {
     aa[i] += 1;
   }
@@ -86,28 +86,28 @@
 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}(
 // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
 // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 92,
+// CHECK: call void @__kmpc_distribute_default_init_4({{.+}}, {{.+}}, {{.+}} 1,
 // CHECK: {{call|invoke}} void [[OUTL2:@.+]](
 // CHECK: call void @__kmpc_for_static_fini(
 // CHECK: call void @__kmpc_spmd_kernel_deinit()
 // CHECK: ret void
 
 // CHECK: define internal void [[OUTL2]](
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 34,
+// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 33,
 // CHECK: call void @__kmpc_for_static_fini(
 // CHECK: ret void
 
 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}(
 // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
 // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 92,
+// CHECK: call void @__kmpc_distribute_default_init_4({{.+}}, {{.+}}, {{.+}} 1,
 // CHECK: {{call|invoke}} void [[OUTL3:@.+]](
 // CHECK: call void @__kmpc_for_static_fini(
 // CHECK: call void @__kmpc_spmd_kernel_deinit()
 // CHECK: ret void
 
 // CHECK: define internal void [[OUTL3]](
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 34,
+// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 33,
 // CHECK: call void @__kmpc_for_static_fini(
 // CHECK: ret void
 
@@ -116,14 +116,14 @@
 // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
 // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
 // CHECK: store {{.+}} 99, {{.+}}* [[COMB_UB:%.+]], align
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 92, {{.+}}, {{.+}}, {{.+}}* [[COMB_UB]],
+// CHECK: call void @__kmpc_distribute_default_init_4({{.+}}, {{.+}}, {{.+}} 1, {{.+}}, {{.+}}, {{.+}}* [[COMB_UB]],
 // CHECK: {{call|invoke}} void [[OUTL4:@.+]](
 // CHECK: call void @__kmpc_for_static_fini(
 // CHECK: call void @__kmpc_spmd_kernel_deinit()
 // CHECK: ret void
 
 // CHECK: define internal void [[OUTL4]](
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 34,
+// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 33,
 // CHECK: call void @__kmpc_for_static_fini(
 // CHECK: ret void
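The constants in these CHECK lines encode the schedule switch: the third argument of __kmpc_for_static_init_4 is a sched_type value. The relevant values, as defined in the OpenMP runtime's kmp.h (copied here for reference; double-check against the runtime you build against):

enum sched_type {
  kmp_sch_static_chunked = 33,        // new inner default: static, chunk 1
  kmp_sch_static = 34,                // old inner default: static, unchunked
  kmp_distribute_static_chunked = 91,
  kmp_distribute_static = 92          // old distribute default; those calls
};                                    // now go to __kmpc_distribute_default_init_4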
Index: test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp
===================================================================
--- test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp
+++ test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp
@@ -24,7 +24,7 @@
 // CHECK: define weak void @__omp_offloading_{{.*}}_main_l16(i{{64|32}} %{{[^,].*}}, i32* dereferenceable{{[^,]*}}, i{{64|32}} %{{[^,)]*}})
 // CHECK: [[TID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @
 // CHECK: call void @__kmpc_spmd_kernel_init(
-// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void @__kmpc_distribute_default_init_4(
 // CHECK: call void [[PARALLEL:@.+]](i32* %{{.*}}, i32* %{{.+}}, i{{64|32}} %{{.+}}, i{{64|32}} %{{.*}}, i{{64|32}} %{{.*}}, i32* %{{.*}})
 // CHECK: br label %
Index: test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp
===================================================================
--- test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp
+++ test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp
@@ -34,7 +34,7 @@
     l = i;
   }
 
-  #pragma omp target teams distribute parallel for simd map(tofrom: aa) num_teams(M) thread_limit(64)
+  #pragma omp target teams distribute parallel for simd map(tofrom: aa) num_teams(M) thread_limit(64)
   for(int i = 0; i < n; i++) {
     aa[i] += 1;
   }
@@ -81,28 +81,28 @@
 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}(
 // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
 // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 92,
+// CHECK: call void @__kmpc_distribute_default_init_4({{.+}}, {{.+}}, {{.+}} 1,
 // CHECK: {{call|invoke}} void [[OUTL2:@.+]](
 // CHECK: call void @__kmpc_for_static_fini(
 // CHECK: call void @__kmpc_spmd_kernel_deinit()
 // CHECK: ret void
 
 // CHECK: define internal void [[OUTL2]](
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 34,
+// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 33,
 // CHECK: call void @__kmpc_for_static_fini(
 // CHECK: ret void
 
 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}(
 // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
 // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 92,
+// CHECK: call void @__kmpc_distribute_default_init_4({{.+}}, {{.+}}, {{.+}} 1,
 // CHECK: {{call|invoke}} void [[OUTL3:@.+]](
 // CHECK: call void @__kmpc_for_static_fini(
 // CHECK: call void @__kmpc_spmd_kernel_deinit()
 // CHECK: ret void
 
 // CHECK: define internal void [[OUTL3]](
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 34,
+// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 33,
 // CHECK: call void @__kmpc_for_static_fini(
 // CHECK: ret void
 
@@ -111,14 +111,14 @@
 // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
 // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
 // CHECK: store {{.+}} 99, {{.+}}* [[COMB_UB:%.+]], align
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 92, {{.+}}, {{.+}}, {{.+}}* [[COMB_UB]],
+// CHECK: call void @__kmpc_distribute_default_init_4({{.+}}, {{.+}}, {{.+}} 1, {{.+}}, {{.+}}, {{.+}}* [[COMB_UB]],
 // CHECK: {{call|invoke}} void [[OUTL4:@.+]](
 // CHECK: call void @__kmpc_for_static_fini(
 // CHECK: call void @__kmpc_spmd_kernel_deinit()
 // CHECK: ret void
 
 // CHECK: define internal void [[OUTL4]](
-// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 34,
+// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 33,
 // CHECK: call void @__kmpc_for_static_fini(
 // CHECK: ret void