OpenMPOpt: rework how a kernel is switched to SPMD mode.

Summary of the hunks below:
  * indicateOptimisticFixpoint() now also fixes ReachingKernelEntries,
    SPMDCompatibilityTracker and both reached-parallel-region states.
  * changeToSPMDMode() takes a ChangeStatus out-parameter. It looks up the
    kernel's "<name>_exec_mode" global before the IR is modified and returns
    true, leaving the out-parameter untouched, when the stored mode is not
    OMP_TGT_EXEC_MODE_GENERIC. Its caller no longer returns early for
    known-SPMD kernels and no longer tests mayContainParallelRegion() first.
  * The call-instruction callback records whether the callee's reached
    parallel region states are at a fixpoint; if they all are and no assumed
    information was used, the local states are fixed as well. A kernel entry
    whose fixed, valid states show no parallel region gets its
    SPMDCompatibilityTracker pessimized.
  * always_inline_device.ll expects the stored __kmpc_is_spmd_exec_mode()
    result to become the constant 0. In
    get_hardware_num_threads_in_block_fold.ll two kernels now pass mode 1 to
    __kmpc_target_init/__kmpc_target_deinit, while the CHECK lines expect
    exec-mode globals of 3 and init/deinit calls with mode 2.

diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -597,6 +597,10 @@
   /// See AbstractState::indicateOptimisticFixpoint(...)
   ChangeStatus indicateOptimisticFixpoint() override {
     IsAtFixpoint = true;
+    ReachingKernelEntries.indicateOptimisticFixpoint();
+    SPMDCompatibilityTracker.indicateOptimisticFixpoint();
+    ReachedKnownParallelRegions.indicateOptimisticFixpoint();
+    ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
     return ChangeStatus::UNCHANGED;
   }
 
@@ -3058,19 +3062,16 @@
     if (!KernelInitCB || !KernelDeinitCB)
       return ChangeStatus::UNCHANGED;
 
-    // Known SPMD-mode kernels need no manifest changes.
-    if (SPMDCompatibilityTracker.isKnown())
-      return ChangeStatus::UNCHANGED;
-
     // If we can we change the execution mode to SPMD-mode otherwise we build a
     // custom state machine.
-    if (!mayContainParallelRegion() || !changeToSPMDMode(A))
+    ChangeStatus Changed = ChangeStatus::UNCHANGED;
+    if (!changeToSPMDMode(A, Changed))
       return buildCustomStateMachine(A);
 
-    return ChangeStatus::CHANGED;
+    return Changed;
   }
 
-  bool changeToSPMDMode(Attributor &A) {
+  bool changeToSPMDMode(Attributor &A, ChangeStatus &Changed) {
     auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
 
     if (!SPMDCompatibilityTracker.isAssumed()) {
@@ -3102,6 +3103,24 @@
       return false;
     }
 
+    // Check if the kernel is already in SPMD mode, if so, return success.
+    Function *Kernel = getAnchorScope();
+    GlobalVariable *ExecMode = Kernel->getParent()->getGlobalVariable(
+        (Kernel->getName() + "_exec_mode").str());
+    assert(ExecMode && "Kernel without exec mode?");
+    assert(ExecMode->getInitializer() && "ExecMode doesn't have initializer!");
+
+    // Read the current exec mode from the flag's initializer.
+    assert(isa<ConstantInt>(ExecMode->getInitializer()) &&
+           "ExecMode is not an integer!");
+    const int8_t ExecModeVal =
+        cast<ConstantInt>(ExecMode->getInitializer())->getSExtValue();
+    if (ExecModeVal != OMP_TGT_EXEC_MODE_GENERIC)
+      return true;
+
+    // We will now unconditionally modify the IR, indicate a change.
+    Changed = ChangeStatus::CHANGED;
+
     auto CreateGuardedRegion = [&](Instruction *RegionStartI,
                                    Instruction *RegionEndI) {
       LoopInfo *LI = nullptr;
@@ -3312,17 +3331,6 @@
 
     // Adjust the global exec mode flag that tells the runtime what mode this
     // kernel is executed in.
-    Function *Kernel = getAnchorScope();
-    GlobalVariable *ExecMode = Kernel->getParent()->getGlobalVariable(
-        (Kernel->getName() + "_exec_mode").str());
-    assert(ExecMode && "Kernel without exec mode?");
-    assert(ExecMode->getInitializer() && "ExecMode doesn't have initializer!");
-
-    // Set the global exec mode flag to indicate SPMD-Generic mode.
-    assert(isa<ConstantInt>(ExecMode->getInitializer()) &&
-           "ExecMode is not an integer!");
-    const int8_t ExecModeVal =
-        cast<ConstantInt>(ExecMode->getInitializer())->getSExtValue();
     assert(ExecModeVal == OMP_TGT_EXEC_MODE_GENERIC &&
            "Initially non-SPMD kernel has SPMD exec mode!");
     ExecMode->setInitializer(
@@ -3699,6 +3707,7 @@
     }
 
     // Callback to check a call instruction.
+    bool AllParallelRegionStatesWereFixed = true;
     bool AllSPMDStatesWereFixed = true;
     auto CheckCallInst = [&](Instruction &I) {
       auto &CB = cast<CallBase>(I);
@@ -3706,6 +3715,10 @@
           *this, IRPosition::callsite_function(CB), DepClassTy::OPTIONAL);
       getState() ^= CBAA.getState();
       AllSPMDStatesWereFixed &= CBAA.SPMDCompatibilityTracker.isAtFixpoint();
+      AllParallelRegionStatesWereFixed &=
+          CBAA.ReachedKnownParallelRegions.isAtFixpoint();
+      AllParallelRegionStatesWereFixed &=
+          CBAA.ReachedUnknownParallelRegions.isAtFixpoint();
       return true;
     };
 
@@ -3717,6 +3730,23 @@
       return indicatePessimisticFixpoint();
     }
 
+    // If we haven't used any assumed information for the reached parallel
+    // region states we can fix it.
+    if (!UsedAssumedInformationInCheckCallInst &&
+        AllParallelRegionStatesWereFixed) {
+      ReachedKnownParallelRegions.indicateOptimisticFixpoint();
+      ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
+    }
+
+    // If we are sure there are no parallel regions in the kernel we do not
+    // want SPMD mode.
+    if (IsKernelEntry && ReachedUnknownParallelRegions.isAtFixpoint() &&
+        ReachedKnownParallelRegions.isAtFixpoint() &&
+        ReachedUnknownParallelRegions.isValidState() &&
+        ReachedKnownParallelRegions.isValidState() &&
+        !mayContainParallelRegion())
+      SPMDCompatibilityTracker.indicatePessimisticFixpoint();
+
     // If we haven't used any assumed information for the SPMD state we can fix
     // it.
     if (!UsedAssumedInformationInCheckRWInst &&
diff --git a/llvm/test/Transforms/OpenMP/always_inline_device.ll b/llvm/test/Transforms/OpenMP/always_inline_device.ll
--- a/llvm/test/Transforms/OpenMP/always_inline_device.ll
+++ b/llvm/test/Transforms/OpenMP/always_inline_device.ll
@@ -6,6 +6,7 @@
 @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8
 @__omp_offloading_fd02_c0934fc2_foo_l4_exec_mode = weak constant i8 1
 @llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_fd02_c0934fc2_foo_l4_exec_mode], section "llvm.metadata"
+@G = external global i8
 
 ; Function Attrs: convergent norecurse nounwind
 define weak void @__omp_offloading_fd02_c0934fc2_foo_l4() #0 {
@@ -16,6 +17,7 @@
 ; CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
 ; CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
 ; CHECK:       user_code.entry:
+; CHECK-NEXT:    store i8 0, i8* @G, align 1
 ; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
 ; CHECK-NEXT:    ret void
 ; CHECK:       worker.exit:
@@ -27,6 +29,12 @@
   br i1 %exec_user_code, label %user_code.entry, label %worker.exit
 
 user_code.entry:                                  ; preds = %entry
+  ; Ensure we see a 0 here as the kernel doesn't have parallel regions and we want
+  ; generic execution.
+  ; TODO: This is not perfect. We should rather go for SPMD mode and tell the runtime
+  ;       to only spawn a single thread. Further, we then should not guard any code.
+  %isSPMD = call i8 @__kmpc_is_spmd_exec_mode()
+  store i8 %isSPMD, i8* @G
   call void @bar() #2
   call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true)
   ret void
@@ -35,6 +43,8 @@
   ret void
 }
 
+declare i8 @__kmpc_is_spmd_exec_mode()
+
 declare i32 @__kmpc_target_init(%struct.ident_t*, i8, i1, i1)
 
 declare void @__kmpc_target_deinit(%struct.ident_t*, i8, i1)
diff --git a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
--- a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
+++ b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
@@ -8,9 +8,9 @@
 @G = external global i32
 
 ;.
-; CHECK: @[[KERNEL0_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
+; CHECK: @[[KERNEL0_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
 ; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i32
-; CHECK: @[[KERNEL1_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
+; CHECK: @[[KERNEL1_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
 ; CHECK: @[[KERNEL2_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
 ; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
 ; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8
@@ -25,11 +25,11 @@
 ; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false)
 ; CHECK-NEXT:    ret void
 ;
-  %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 2, i1 false, i1 false)
+  %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 1, i1 false, i1 false)
   call void @helper0()
   call void @helper1()
   call void @helper2()
-  call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false)
+  call void @__kmpc_target_deinit(%struct.ident_t* null, i8 1, i1 false)
   ret void
 }
 
@@ -43,9 +43,9 @@
 ; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false)
 ; CHECK-NEXT:    ret void
 ;
-  %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 2, i1 false, i1 false)
+  %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 1, i1 false, i1 false)
   call void @helper1()
-  call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false)
+  call void @__kmpc_target_deinit(%struct.ident_t* null, i8 1, i1 false)
   ret void
 }
 