diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -290,6 +290,10 @@
 void initializeAMDGPULateCodeGenPreparePass(PassRegistry &);
 extern char &AMDGPULateCodeGenPrepareID;
 
+FunctionPass *createAMDGPURewriteUndefForPHIPass();
+void initializeAMDGPURewriteUndefForPHIPass(PassRegistry &);
+extern char &AMDGPURewriteUndefForPHIPassID;
+
 void initializeSIAnnotateControlFlowPass(PassRegistry&);
 extern char &SIAnnotateControlFlowPassID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp
@@ -0,0 +1,181 @@
+//===- AMDGPURewriteUndefForPHI.cpp ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This file implements a pass that rewrites undef incoming operands for
+// certain PHIs in structurized CFG. The pass only works on IR that has gone
+// through the StructurizeCFG pass, and it has the additional limitation that
+// it can only run after SIAnnotateControlFlow.
+//
+// To achieve optimal code generation for AMDGPU, we assume that divergence
+// analysis reports a PHI in the join block of a divergent branch as uniform if
+// it has one unique uniform incoming value plus additional undefined/poisoned
+// incoming values. That is to say, the later compiler pipeline must ensure
+// that such a PHI always gets a uniform value and works correctly. Let's take
+// a look at two typical patterns in structurized CFG that need to be taken
+// care of. (In both patterns, block %if terminates with a divergent branch.)
+//
+// Pattern A: Block with undefined incoming value dominates defined predecessor
+//  %if
+//  | \
+//  | %then
+//  | /
+//  %endif: %phi = phi [%undef, %if], [%uniform, %then]
+//
+// Pattern B: Block with defined incoming value dominates undefined predecessor
+//  %if
+//  | \
+//  | %then
+//  | /
+//  %endif: %phi = phi [%uniform, %if], [%undef, %then]
+//
+// For pattern A, by reporting %phi as uniform, the later pipeline needs to
+// make sure it is handled correctly. The backend usually allocates a scalar
+// register for %phi, and if any thread in a wave takes the %then path, the
+// scalar register gets the %uniform value.
+//
+// For pattern B, this pass replaces the undef operand with the other, defined
+// value, so the scalar register allocated for such a PHI gets correct
+// liveness. Without this transformation, the scalar register may be
+// overwritten in the %then block.
+//
+// Limitation note:
+// If the join block of divergent threads is a loop header, the pass cannot
+// handle it correctly right now. In the case below, the undef in %phi should
+// also be rewritten. Currently we depend on SIAnnotateControlFlow to split the
+// %header block into a separate join block, after which the undef can be
+// rewritten correctly.
+//     %if
+//     | \
+//     | %then
+//     | /
+// ->  %header: %phi = phi [%uniform, %if], [%undef, %then], [%uniform2, %header]
+//    |         |
+//     \---
+
+#include "AMDGPU.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-rewrite-undef-for-phi"
+
+namespace {
+
+class AMDGPURewriteUndefForPHI : public FunctionPass {
+public:
+  static char ID;
+  AMDGPURewriteUndefForPHI() : FunctionPass(ID) {
+    initializeAMDGPURewriteUndefForPHIPass(*PassRegistry::getPassRegistry());
+  }
+  bool runOnFunction(Function &F) override;
+  StringRef getPassName() const override {
+    return "AMDGPU Rewrite Undef for PHI";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LegacyDivergenceAnalysis>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+
+    AU.addPreserved<LegacyDivergenceAnalysis>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    AU.setPreservesCFG();
+  }
+};
+
+} // end anonymous namespace
+char AMDGPURewriteUndefForPHI::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AMDGPURewriteUndefForPHI, DEBUG_TYPE,
+                      "Rewrite undef for PHI", false, false)
+INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(AMDGPURewriteUndefForPHI, DEBUG_TYPE,
+                    "Rewrite undef for PHI", false, false)
+
+bool rewritePHIs(Function &F, LegacyDivergenceAnalysis *DA, DominatorTree *DT) {
+  bool Changed = false;
+  SmallVector<PHINode *> ToBeDeleted;
+  for (auto &BB : F) {
+    for (auto &PHI : BB.phis()) {
+      if (DA->isDivergent(&PHI))
+        continue;
+
+      // The unique defined (non-undef/poison) incoming value of the PHI node.
+      Value *UniqueDefinedIncoming = nullptr;
+      // The divergent block with a defined incoming value that dominates all
+      // other blocks with the same incoming value.
+      BasicBlock *DominateBB = nullptr;
+      // Predecessors with an undefined incoming value (excluding loop
+      // backedges).
+      SmallVector<BasicBlock *> Undefs;
+
+      for (unsigned i = 0; i < PHI.getNumIncomingValues(); i++) {
+        Value *Incoming = PHI.getIncomingValue(i);
+        BasicBlock *IncomingBB = PHI.getIncomingBlock(i);
+
+        if (Incoming == &PHI)
+          continue;
+
+        if (isa<UndefValue>(Incoming)) {
+          // An undef from a loop backedge will not be replaced.
+          if (!DT->dominates(&BB, IncomingBB))
+            Undefs.push_back(IncomingBB);
+          continue;
+        }
+
+        if (!UniqueDefinedIncoming) {
+          UniqueDefinedIncoming = Incoming;
+          DominateBB = IncomingBB;
+        } else if (Incoming == UniqueDefinedIncoming) {
+          // Update DominateBB if necessary.
+          if (DT->dominates(IncomingBB, DominateBB))
+            DominateBB = IncomingBB;
+        } else {
+          UniqueDefinedIncoming = nullptr;
+          break;
+        }
+      }
+      // We only need to replace the undef for a PHI which is merging
+      // defined/undefined values from divergent threads.
+      // TODO: We should still be able to replace the undef value if the
+      // unique value is a Constant.
+      if (!UniqueDefinedIncoming || Undefs.empty() ||
+          !DA->isDivergent(DominateBB->getTerminator()))
+        continue;
+
+      // We only replace the undef when DominateBB truly dominates all the
+      // other predecessors with an undefined incoming value. Make sure
+      // DominateBB dominates BB so that UniqueDefinedIncoming is available in
+      // BB and afterwards.
+      if (DT->dominates(DominateBB, &BB) && all_of(Undefs, [&](BasicBlock *UD) {
+            return DT->dominates(DominateBB, UD);
+          })) {
+        PHI.replaceAllUsesWith(UniqueDefinedIncoming);
+        ToBeDeleted.push_back(&PHI);
+        Changed = true;
+      }
+    }
+  }
+
+  for (auto *PHI : ToBeDeleted)
+    PHI->eraseFromParent();
+
+  return Changed;
+}
+
+bool AMDGPURewriteUndefForPHI::runOnFunction(Function &F) {
+  LegacyDivergenceAnalysis *DA = &getAnalysis<LegacyDivergenceAnalysis>();
+  DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  return rewritePHIs(F, DA, DT);
+}
+
+FunctionPass *llvm::createAMDGPURewriteUndefForPHIPass() {
+  return new AMDGPURewriteUndefForPHI();
+}
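
A minimal IR sketch of the Pattern B rewrite described in the header comment above (the function, block, and value names here are illustrative placeholders; the @basic test added below exercises the same shape):

; Before the pass: %endif joins a divergent branch. %uniform is an SGPR
; argument, %tid lives in a VGPR, so the branch in %if is divergent, and the
; PHI has one unique defined incoming value plus an undef from %then.
define amdgpu_ps float @pattern_b(float inreg %uniform, i32 %tid) {
if:
  %cc = icmp slt i32 %tid, 0
  br i1 %cc, label %then, label %endif

then:
  br label %endif

endif:
  %phi = phi float [ %uniform, %if ], [ undef, %then ]
  ret float %phi
}

; After amdgpu-rewrite-undef-for-phi: %phi is replaced by %uniform and erased,
; so %endif simply contains "ret float %uniform".
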
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -380,6 +380,7 @@
   initializeAMDGPUReplaceLDSUseWithPointerPass(*PR);
   initializeAMDGPULowerModuleLDSPass(*PR);
   initializeAMDGPURewriteOutArgumentsPass(*PR);
+  initializeAMDGPURewriteUndefForPHIPass(*PR);
   initializeAMDGPUUnifyMetadataPass(*PR);
   initializeSIAnnotateControlFlowPass(*PR);
   initializeAMDGPUReleaseVGPRsPass(*PR);
@@ -1198,6 +1199,10 @@
   addPass(createAMDGPUAnnotateUniformValues());
   if (!LateCFGStructurize) {
     addPass(createSIAnnotateControlFlowPass());
+    // TODO: Move this right after structurizeCFG to avoid extra divergence
+    // analysis. This depends on stopping SIAnnotateControlFlow from making
+    // control flow modifications.
+    addPass(createAMDGPURewriteUndefForPHIPass());
   }
   addPass(createLCSSAPass());
 
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -92,6 +92,7 @@
   AMDGPUReplaceLDSUseWithPointer.cpp
   AMDGPUResourceUsageAnalysis.cpp
   AMDGPURewriteOutArguments.cpp
+  AMDGPURewriteUndefForPHI.cpp
   AMDGPUSetWavePriority.cpp
   AMDGPUSubtarget.cpp
   AMDGPUTargetMachine.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -83,6 +83,9 @@
 ; GCN-O0-NEXT:       Memory SSA
 ; GCN-O0-NEXT:       AMDGPU Annotate Uniform Values
 ; GCN-O0-NEXT:       SI annotate control flow
+; GCN-O0-NEXT:       Post-Dominator Tree Construction
+; GCN-O0-NEXT:       Legacy Divergence Analysis
+; GCN-O0-NEXT:       AMDGPU Rewrite Undef for PHI
 ; GCN-O0-NEXT:       LCSSA Verifier
 ; GCN-O0-NEXT:       Loop-Closed SSA Form Pass
 ; GCN-O0-NEXT:     DummyCGSCCPass
@@ -264,6 +267,9 @@
 ; GCN-O1-NEXT:       Memory SSA
 ; GCN-O1-NEXT:       AMDGPU Annotate Uniform Values
 ; GCN-O1-NEXT:       SI annotate control flow
+; GCN-O1-NEXT:       Post-Dominator Tree Construction
+; GCN-O1-NEXT:       Legacy Divergence Analysis
+; GCN-O1-NEXT:       AMDGPU Rewrite Undef for PHI
 ; GCN-O1-NEXT:       LCSSA Verifier
 ; GCN-O1-NEXT:       Loop-Closed SSA Form Pass
 ; GCN-O1-NEXT:     DummyCGSCCPass
@@ -548,6 +554,9 @@
 ; GCN-O1-OPTS-NEXT:       Memory SSA
 ; GCN-O1-OPTS-NEXT:       AMDGPU Annotate Uniform Values
 ; GCN-O1-OPTS-NEXT:       SI annotate control flow
+; GCN-O1-OPTS-NEXT:       Post-Dominator Tree Construction
+; GCN-O1-OPTS-NEXT:       Legacy Divergence Analysis
+; GCN-O1-OPTS-NEXT:       AMDGPU Rewrite Undef for PHI
 ; GCN-O1-OPTS-NEXT:       LCSSA Verifier
 ; GCN-O1-OPTS-NEXT:       Loop-Closed SSA Form Pass
 ; GCN-O1-OPTS-NEXT:     DummyCGSCCPass
@@ -840,6 +849,9 @@
 ; GCN-O2-NEXT:       Memory SSA
 ; GCN-O2-NEXT:       AMDGPU Annotate Uniform Values
 ; GCN-O2-NEXT:       SI annotate control flow
+; GCN-O2-NEXT:       Post-Dominator Tree Construction
+; GCN-O2-NEXT:       Legacy Divergence Analysis
+; GCN-O2-NEXT:       AMDGPU Rewrite Undef for PHI
 ; GCN-O2-NEXT:       LCSSA Verifier
 ; GCN-O2-NEXT:       Loop-Closed SSA Form Pass
 ; GCN-O2-NEXT:     Analysis if a function is memory bound
@@ -1147,6 +1159,9 @@
 ; GCN-O3-NEXT:       Memory SSA
 ; GCN-O3-NEXT:       AMDGPU Annotate Uniform Values
 ; GCN-O3-NEXT:       SI annotate control flow
+; GCN-O3-NEXT:       Post-Dominator Tree Construction
+; GCN-O3-NEXT:       Legacy Divergence Analysis
+; GCN-O3-NEXT:       AMDGPU Rewrite Undef for PHI
 ; GCN-O3-NEXT:       LCSSA Verifier
 ; GCN-O3-NEXT:       Loop-Closed SSA Form Pass
 ; GCN-O3-NEXT:     Analysis if a function is memory bound
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-undef-for-phi.ll b/llvm/test/CodeGen/AMDGPU/rewrite-undef-for-phi.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-undef-for-phi.ll
@@ -0,0 +1,103 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=amdgcn-- -S -amdgpu-rewrite-undef-for-phi %s | FileCheck -check-prefix=OPT %s
+
+define amdgpu_ps float @basic(float inreg %c, i32 %x) #0 {
+; OPT-LABEL: @basic(
+; OPT-NEXT:  entry:
+; OPT-NEXT:    [[CC:%.*]] = icmp slt i32 [[X:%.*]], 0
+; OPT-NEXT:    br i1 [[CC]], label [[IF:%.*]], label [[END:%.*]]
+; OPT:       if:
+; OPT-NEXT:    br label [[END]]
+; OPT:       end:
+; OPT-NEXT:    ret float [[C:%.*]]
+;
+entry:
+  %cc = icmp slt i32 %x, 0
+  br i1 %cc, label %if, label %end
+
+if:
+  br label %end
+
+end:
+  %c2 = phi float [ undef, %if ], [ %c, %entry ]
+  ret float %c2
+}
+
+define amdgpu_ps float @with_uniform_region_inside(float inreg %c, i32 inreg %d, i32 %x) #0 {
+; OPT-LABEL: @with_uniform_region_inside(
+; OPT-NEXT:  entry:
+; OPT-NEXT:    [[CC:%.*]] = icmp slt i32 [[X:%.*]], 0
+; OPT-NEXT:    br i1 [[CC]], label [[IF:%.*]], label [[END:%.*]]
+; OPT:       if:
+; OPT-NEXT:    [[CC2:%.*]] = icmp slt i32 [[D:%.*]], 0
+; OPT-NEXT:    br i1 [[CC2]], label [[BB2:%.*]], label [[BB3:%.*]]
+; OPT:       bb2:
+; OPT-NEXT:    br label [[END]]
+; OPT:       bb3:
+; OPT-NEXT:    [[CC3:%.*]] = icmp slt i32 [[D]], 2
+; OPT-NEXT:    br i1 [[CC3]], label [[BB4:%.*]], label [[END]]
+; OPT:       bb4:
+; OPT-NEXT:    br label [[END]]
+; OPT:       end:
+; OPT-NEXT:    ret float [[C:%.*]]
+;
+entry:
+  %cc = icmp slt i32 %x, 0
+  br i1 %cc, label %if, label %end
+
+if:
+  %cc2 = icmp slt i32 %d, 0
+  br i1 %cc2, label %bb2, label %bb3
+
+bb2:
+  br label %end
+
+bb3:
+  %cc3 = icmp slt i32 %d, 2
+  br i1 %cc3, label %bb4, label %end
+
+bb4:
+  br label %end
+
+end:
+  %c2 = phi float [ undef, %bb2 ], [ %c, %bb3 ], [ undef, %bb4 ], [ %c, %entry ]
+  ret float %c2
+}
+
+define amdgpu_ps float @exclude_backedge(float inreg %c, i32 %x) #0 {
+; OPT-LABEL: @exclude_backedge(
+; OPT-NEXT:  entry:
+; OPT-NEXT:    [[CC:%.*]] = icmp slt i32 [[X:%.*]], 0
+; OPT-NEXT:    br i1 [[CC]], label [[END:%.*]], label [[LOOP:%.*]]
+; OPT:       loop:
+; OPT-NEXT:    [[IND:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[LOOP]] ]
+; OPT-NEXT:    [[C2:%.*]] = phi float [ [[C:%.*]], [[ENTRY]] ], [ undef, [[LOOP]] ]
+; OPT-NEXT:    [[INC]] = add i32 [[IND]], 1
+; OPT-NEXT:    [[LOOP_CC:%.*]] = icmp slt i32 [[INC]], 5
+; OPT-NEXT:    br i1 [[LOOP_CC]], label [[LOOP]], label [[LOOP_END:%.*]]
+; OPT:       loop_end:
+; OPT-NEXT:    br label [[END]]
+; OPT:       end:
+; OPT-NEXT:    [[R:%.*]] = phi float [ [[C2]], [[LOOP_END]] ], [ [[C]], [[ENTRY]] ]
+; OPT-NEXT:    ret float [[R]]
+;
+entry:
+  %cc = icmp slt i32 %x, 0
+  br i1 %cc, label %end, label %loop
+
+loop:
+  %ind = phi i32 [ 0, %entry ], [ %inc, %loop ]
+  %c2 = phi float [ %c, %entry ], [ undef, %loop ]
+  %inc = add i32 %ind, 1
+  %loop_cc = icmp slt i32 %inc, 5
+  br i1 %loop_cc, label %loop, label %loop_end
+
+loop_end:
+  br label %end
+
+end:
+  %r = phi float [ %c2, %loop_end ], [ %c, %entry ]
+  ret float %r
+}
+
+attributes #0 = { nounwind noinline }
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-phi-with-undef.ll b/llvm/test/CodeGen/AMDGPU/uniform-phi-with-undef.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/uniform-phi-with-undef.ll
@@ -0,0 +1,52 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -o - %s | FileCheck --check-prefix=GCN %s
+;
+; This test shows a typical case where a PHI (%c2) in a join block is treated
+; as uniform because it has one unique uniform incoming value plus one
+; additional undef incoming value. This case might suffer from a correctness
+; issue if %c2 were assigned a scalar register that is meanwhile dead in %if.
+; The problem is solved by replacing the undef with %c (thus replacing %c2
+; with %c in this example).
+
+
+define amdgpu_ps float @uniform_phi_with_undef(float inreg %c, float %v, i32 %x, i32 %y) #0 {
+; GCN-LABEL: uniform_phi_with_undef:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    v_cmp_lt_i32_e64 s2, v2, v1
+; GCN-NEXT:    s_mov_b32 s1, exec_lo
+; GCN-NEXT:    s_and_b32 s2, s1, s2
+; GCN-NEXT:    s_mov_b32 exec_lo, s2
+; GCN-NEXT:    s_cbranch_execz .LBB0_2
+; GCN-NEXT:  ; %bb.1: ; %if
+; GCN-NEXT:    s_mov_b32 s2, 2.0
+; GCN-NEXT:    v_div_scale_f32 v1, s3, s2, s2, v0
+; GCN-NEXT:    v_rcp_f32_e64 v2, v1
+; GCN-NEXT:    s_mov_b32 s3, 1.0
+; GCN-NEXT:    v_fma_f32 v3, -v1, v2, s3
+; GCN-NEXT:    v_fmac_f32_e64 v2, v3, v2
+; GCN-NEXT:    v_div_scale_f32 v3, vcc_lo, v0, s2, v0
+; GCN-NEXT:    v_mul_f32_e64 v4, v3, v2
+; GCN-NEXT:    v_fma_f32 v5, -v1, v4, v3
+; GCN-NEXT:    v_fmac_f32_e64 v4, v5, v2
+; GCN-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; GCN-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
+; GCN-NEXT:    v_div_fixup_f32 v0, v1, s2, v0
+; GCN-NEXT:  .LBB0_2: ; %end
+; GCN-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GCN-NEXT:    v_add_f32_e64 v0, v0, s0
+; GCN-NEXT:    ; return to shader part epilog
entry:
+  %cc = icmp slt i32 %y, %x
+  br i1 %cc, label %if, label %end
+
+if:
+  %v.if = fdiv float %v, 2.0
+  br label %end
+
+end:
+  %v2 = phi float [ %v.if, %if ], [ %v, %entry ]
+  %c2 = phi float [ undef, %if ], [ %c, %entry ]
+  %r = fadd float %v2, %c2
+  ret float %r
+}
+
+attributes #0 = { nounwind optnone noinline }
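
The pipeline listings checked in llc-pipeline.ll above come from llc's pass-structure dump, so the placement of "AMDGPU Rewrite Undef for PHI" directly after "SI annotate control flow" can be reproduced with, for example, llc -mtriple=amdgcn-- -O1 -debug-pass=Structure on any AMDGPU input (this invocation is illustrative, not the test's exact RUN line). The pass can also be exercised in isolation with opt -S -amdgpu-rewrite-undef-for-phi, as the RUN line in rewrite-undef-for-phi.ll does.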