diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -331,6 +331,9 @@
 void initializeGCNPreRAOptimizationsPass(PassRegistry &);
 extern char &GCNPreRAOptimizationsID;
 
+FunctionPass *createAMDGPUSetWavePriorityPass();
+void initializeAMDGPUSetWavePriorityPass(PassRegistry &);
+
 namespace AMDGPU {
 enum TargetIndex {
   TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp
@@ -0,0 +1,202 @@
+//===- AMDGPUSetWavePriority.cpp - Set wave priority ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Pass to temporarily raise the wave priority beginning the start of
+/// the shader function until its last VMEM instructions to allow younger
+/// waves to issue their VMEM instructions as well.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Allocator.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-set-wave-priority"
+
+static cl::opt<bool>
+    DisableSetWavePriority("amdgpu-disable-set-wave-priority",
+                           cl::desc("Disable adjusting wave priority"),
+                           cl::init(true), cl::Hidden);
+
+namespace {
+
+struct MBBInfo {
+  MBBInfo() = default;
+  bool MayReachVMEMLoad = false;
+};
+
+// Map from a basic block to its lazily-allocated MBBInfo; entries live in a
+// bump allocator and are freed all at once when the set is destroyed.
+class MBBInfoSet {
+public:
+  MBBInfoSet() = default;
+
+  MBBInfo &operator[](const MachineBasicBlock *MBB) {
+    MBBInfo *&info = Infos[MBB];
+    if (!info)
+      info = new (Alloc) MBBInfo();
+    return *info;
+  }
+
+private:
+  BumpPtrAllocator Alloc;
+  DenseMap<const MachineBasicBlock *, MBBInfo *> Infos;
+};
+
+class AMDGPUSetWavePriority : public MachineFunctionPass {
+public:
+  static char ID;
+
+  AMDGPUSetWavePriority() : MachineFunctionPass(ID) {}
+
+  StringRef getPassName() const override { return "Set wave priority"; }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineLoopInfo>();
+    AU.addPreserved<MachineLoopInfo>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+private:
+  MachineInstr *BuildSetprioMI(MachineFunction &MF, unsigned priority) const;
+  bool CanLowerPriorityDirectlyInPredecessors(const MachineBasicBlock &MBB,
+                                              MBBInfoSet &MBBInfos) const;
+
+  const SIInstrInfo *TII;
+  const MachineLoopInfo *Loops;
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(AMDGPUSetWavePriority, DEBUG_TYPE, "Set wave priority",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(AMDGPUSetWavePriority, DEBUG_TYPE, "Set wave priority",
+                    false, false)
+
+char AMDGPUSetWavePriority::ID = 0;
+
+FunctionPass *llvm::createAMDGPUSetWavePriorityPass() {
+  return new AMDGPUSetWavePriority();
+}
+
+// Create (but do not insert) an S_SETPRIO with the given immediate priority.
+MachineInstr *AMDGPUSetWavePriority::BuildSetprioMI(MachineFunction &MF,
+                                                    unsigned priority) const {
+  return BuildMI(MF, DebugLoc(), TII->get(AMDGPU::S_SETPRIO)).addImm(priority);
+}
+
+// A priority-lowering S_SETPRIO can be placed directly in MBB's predecessors
+// only if every such predecessor is outside any loop and none of its other
+// successors may still reach a VMEM load.
+bool AMDGPUSetWavePriority::CanLowerPriorityDirectlyInPredecessors(
+    const MachineBasicBlock &MBB, MBBInfoSet &MBBInfos) const {
+  for (const MachineBasicBlock *Pred : MBB.predecessors()) {
+    if (!MBBInfos[Pred].MayReachVMEMLoad)
+      continue;
+    if (Loops->getLoopFor(Pred))
+      return false;
+    for (const MachineBasicBlock *Succ : Pred->successors()) {
+      if (MBBInfos[Succ].MayReachVMEMLoad)
+        return false;
+    }
+  }
+  return true;
+}
+
+static bool isVMEMLoad(const MachineInstr &MI) {
+  return SIInstrInfo::isVMEM(MI) && MI.mayLoad();
+}
+
+bool AMDGPUSetWavePriority::runOnMachineFunction(MachineFunction &MF) {
+  const unsigned HighPriority = 3;
+  const unsigned LowPriority = 0;
+
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  TII = ST.getInstrInfo();
+
+  Loops = &getAnalysis<MachineLoopInfo>();
+
+  if (DisableSetWavePriority)
+    return false;
+
+  // Seed the worklist with every block that contains a VMEM load.
+  MBBInfoSet MBBInfos;
+  SmallVector<const MachineBasicBlock *> Worklist;
+  for (MachineBasicBlock &MBB : MF) {
+    for (const MachineInstr &MI : MBB) {
+      if (isVMEMLoad(MI)) {
+        Worklist.push_back(&MBB);
+        break;
+      }
+    }
+  }
+
+  // Mark blocks from which control may reach VMEM loads.
+  while (!Worklist.empty()) {
+    const MachineBasicBlock *MBB = Worklist.pop_back_val();
+    MBBInfo &Info = MBBInfos[MBB];
+    if (!Info.MayReachVMEMLoad) {
+      Info.MayReachVMEMLoad = true;
+      Worklist.append(MBB->pred_begin(), MBB->pred_end());
+    }
+  }
+
+  MachineBasicBlock &Entry = MF.front();
+  if (!MBBInfos[&Entry].MayReachVMEMLoad)
+    return false;
+
+  // Raise the priority at the beginning of the shader, before the first VALU
+  // instruction or terminator.
+  MachineBasicBlock::iterator I = Entry.begin(), E = Entry.end();
+  while (I != E && !SIInstrInfo::isVALU(*I) && !I->isTerminator())
+    ++I;
+  Entry.insert(I, BuildSetprioMI(MF, HighPriority));
+
+  // Lower the priority on edges where control leaves blocks from which
+  // VMEM loads are reachable.
+  SmallSet<MachineBasicBlock *, 16> PriorityLoweringBlocks;
+  for (MachineBasicBlock &MBB : MF) {
+    if (MBBInfos[&MBB].MayReachVMEMLoad) {
+      if (MBB.isReturnBlock())
+        PriorityLoweringBlocks.insert(&MBB);
+      continue;
+    }
+
+    if (CanLowerPriorityDirectlyInPredecessors(MBB, MBBInfos)) {
+      for (MachineBasicBlock *Pred : MBB.predecessors()) {
+        if (MBBInfos[Pred].MayReachVMEMLoad)
+          PriorityLoweringBlocks.insert(Pred);
+      }
+      continue;
+    }
+
+    // Where lowering the priority in predecessors is not possible, the
+    // block receiving control either was not part of a loop in the first
+    // place or the loop simplification/canonicalization pass should have
+    // already tried to split the edge and insert a preheader, and if for
+    // whatever reason it failed to do so, then this leaves us with the
+    // only option of lowering the priority within the loop.
+    PriorityLoweringBlocks.insert(&MBB);
+  }
+
+  // Insert the lowering S_SETPRIO after the last VMEM load in each chosen
+  // block (or at the top of the block if it contains no VMEM load).
+  for (MachineBasicBlock *MBB : PriorityLoweringBlocks) {
+    MachineBasicBlock::iterator I = MBB->end(), B = MBB->begin();
+    while (I != B) {
+      if (isVMEMLoad(*--I)) {
+        ++I;
+        break;
+      }
+    }
+    MBB->insert(I, BuildSetprioMI(MF, LowPriority));
+  }
+
+  return true;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1360,8 +1360,10 @@
   addPass(&SIInsertHardClausesID);
 
   addPass(&SILateBranchLoweringPassID);
-  if (getOptLevel() > CodeGenOpt::None)
+  if (getOptLevel() > CodeGenOpt::None) {
+    addPass(createAMDGPUSetWavePriorityPass());
     addPass(&SIPreEmitPeepholeID);
+  }
   // The hazard recognizer that runs as part of the post-ra scheduler does not
   // guarantee to be able handle all hazards correctly. This is because if there
   // are multiple scheduling regions in a basic block, the regions are scheduled
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -89,6 +89,7 @@
   AMDGPUReplaceLDSUseWithPointer.cpp
   AMDGPUResourceUsageAnalysis.cpp
   AMDGPURewriteOutArguments.cpp
+  AMDGPUSetWavePriority.cpp
   AMDGPUSubtarget.cpp
   AMDGPUTargetMachine.cpp
   AMDGPUTargetObjectFile.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -383,6 +383,8 @@
 ; GCN-O1-NEXT:        SI Insert Hard Clauses
 ; GCN-O1-NEXT:        MachineDominator Tree Construction
 ; GCN-O1-NEXT:        SI Final Branch Preparation
+; GCN-O1-NEXT:        Machine Natural Loop Construction
+; GCN-O1-NEXT:        Set wave priority
 ; GCN-O1-NEXT:        SI peephole optimizations
 ; GCN-O1-NEXT:        Post RA hazard recognizer
 ; GCN-O1-NEXT:        Branch relaxation pass
@@ -669,6 +671,8 @@
 ; GCN-O1-OPTS-NEXT:        SI Insert Hard Clauses
 ; GCN-O1-OPTS-NEXT:        MachineDominator Tree Construction
 ; GCN-O1-OPTS-NEXT:        SI Final Branch Preparation
+; GCN-O1-OPTS-NEXT:        Machine Natural Loop Construction
+; GCN-O1-OPTS-NEXT:        Set wave priority
 ; GCN-O1-OPTS-NEXT:        SI peephole optimizations
 ; GCN-O1-OPTS-NEXT:        Post RA hazard recognizer
 ; GCN-O1-OPTS-NEXT:        Branch relaxation pass
@@ -957,6 +961,8 @@
 ; GCN-O2-NEXT:        SI Insert Hard Clauses
 ; GCN-O2-NEXT:        MachineDominator Tree Construction
 ; GCN-O2-NEXT:        SI Final Branch Preparation
+; GCN-O2-NEXT:        Machine Natural Loop Construction
+; GCN-O2-NEXT:        Set wave priority
 ; GCN-O2-NEXT:        SI peephole optimizations
 ; GCN-O2-NEXT:        Post RA hazard recognizer
 ; GCN-O2-NEXT:        Branch relaxation pass
@@ -1257,6 +1263,8 @@
 ; GCN-O3-NEXT:        SI Insert Hard Clauses
 ; GCN-O3-NEXT:        MachineDominator Tree Construction
 ; GCN-O3-NEXT:        SI Final Branch Preparation
+; GCN-O3-NEXT:        Machine Natural Loop Construction
+; GCN-O3-NEXT:        Set wave priority
 ; GCN-O3-NEXT:        SI peephole optimizations
 ; GCN-O3-NEXT:        Post RA hazard recognizer
 ; GCN-O3-NEXT:        Branch relaxation pass
diff --git a/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll b/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll
@@ -0,0 +1,153 @@
+; RUN: llc -mtriple=amdgcn -amdgpu-disable-set-wave-priority=false -o - %s | \
+; RUN:   FileCheck %s
+
+; CHECK-LABEL: no_setprio:
+; CHECK-NOT: s_setprio
+; CHECK: ; return to shader part epilog
define amdgpu_ps <2 x float> @no_setprio() {
  ret <2 x float> <float 0.0, float 0.0>
}
+
+; CHECK-LABEL: vmem_in_exit_block:
+; CHECK: s_setprio 3
+; CHECK: buffer_load_dwordx2
+; CHECK-NEXT: s_setprio 0
+; CHECK: ; return to shader part epilog
+define amdgpu_ps <2 x float> @vmem_in_exit_block(<4 x i32> inreg %p) {
+  %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
+  ret <2 x float> %v
+}
+
+; CHECK-LABEL: branch:
+; CHECK: s_setprio 3
+; CHECK: s_cbranch_scc0 [[A:.*]]
+; CHECK: {{.*}}:  ; %b
+; CHECK: buffer_load_dwordx2
+; CHECK-NEXT: s_setprio 0
+; CHECK: s_branch [[EXIT:.*]]
+; CHECK: [[A]]:  ; %a
+; CHECK-NEXT: s_setprio 0
+; CHECK: s_branch [[EXIT]]
+; CHECK-NEXT: [[EXIT]]:
+define amdgpu_ps <2 x float> @branch(<4 x i32> inreg %p, i32 inreg %i) {
+  %cond = icmp eq i32 %i, 0
+  br i1 %cond, label %a, label %b
+
+a:
+  ret <2 x float> <float 0.0, float 0.0>
+
+b:
+  %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
+  ret <2 x float> %v
+}
+
+; CHECK-LABEL: setprio_follows_setprio:
+; CHECK: s_setprio 3
+; CHECK: buffer_load_dwordx2
+; CHECK: s_cbranch_scc1 [[C:.*]]
+; CHECK: {{.*}}:  ; %a
+; CHECK: buffer_load_dwordx2
+; CHECK-NEXT: s_setprio 0
+; CHECK: s_cbranch_scc1 [[C]]
+; CHECK: {{.*}}:  ; %b
+; CHECK-NOT: s_setprio
+; CHECK: s_branch [[EXIT:.*]]
+; CHECK: [[C]]:  ; %c
+; CHECK-NEXT: s_setprio 0
+; CHECK: s_branch [[EXIT]]
+; CHECK: [[EXIT]]:
+define amdgpu_ps <2 x float> @setprio_follows_setprio(<4 x i32> inreg %p, i32 inreg %i) {
+entry:
+  %v1 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
+  %cond1 = icmp ne i32 %i, 0
+  br i1 %cond1, label %a, label %c
+
+a:
+  %v2 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 1, i32 0)
+  %cond2 = icmp ne i32 %i, 1
+  br i1 %cond2, label %b, label %c
+
+b:
+  ret <2 x float> %v2
+
+c:
+  %v3 = phi <2 x float> [%v1, %entry], [%v2, %a]
+  %v4 = fadd <2 x float> %v1, %v3
+  ret <2 x float> %v4
+}
+
+; CHECK-LABEL: loop:
+; CHECK: {{.*}}:  ; %entry
+; CHECK: s_setprio 3
+; CHECK-NOT: s_setprio
+; CHECK: [[LOOP:.*]]:  ; %loop
+; CHECK-NOT: s_setprio
+; CHECK: buffer_load_dwordx2
+; CHECK-NOT: s_setprio
+; CHECK: s_cbranch_scc1 [[LOOP]]
+; CHECK-NEXT: {{.*}}:  ; %exit
+; CHECK-NEXT: s_setprio 0
+define amdgpu_ps <2 x float> @loop(<4 x i32> inreg %p) {
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [0, %entry], [%i2, %loop]
+  %sum = phi <2 x float> [<float 0.0, float 0.0>, %entry], [%sum2, %loop]
+
+  %i2 = add i32 %i, 1
+
+  %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 %i, i32 0, i32 0, i32 0)
+  %sum2 = fadd <2 x float> %sum, %v
+
+  %cond = icmp ult i32 %i2, 5
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret <2 x float> %sum2
+}
+
+; CHECK-LABEL: edge_split:
+; CHECK: s_setprio 3
+; CHECK: buffer_load_dwordx2
+; CHECK-NOT: s_setprio
+; CHECK: s_cbranch_scc1 [[ANOTHER_LOAD:.*]]
+; CHECK: {{.*}}:  ; %loop.preheader
+; CHECK-NEXT: s_setprio 0
+; CHECK: [[LOOP:.*]]:  ; %loop
+; CHECK-NOT: s_setprio
+; CHECK: s_cbranch_scc1 [[LOOP]]
+; CHECK: {{.*}}:  ; %exit
+; CHECK-NOT: s_setprio
+; CHECK: s_branch [[RET:.*]]
+; CHECK: [[ANOTHER_LOAD]]:  ; %another_load
+; CHECK: buffer_load_dwordx2
+; CHECK-NEXT: s_setprio 0
+; CHECK: s_branch [[RET]]
+; CHECK: [[RET]]:
+define amdgpu_ps <2 x float> @edge_split(<4 x i32> inreg %p, i32 inreg %x) {
+entry:
+  %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
+  %cond = icmp ne i32 %x, 0
+  br i1 %cond, label %loop, label %another_load
+
+loop:
+  %i = phi i32 [0, %entry], [%i2, %loop]
+  %mul = phi <2 x float> [%v, %entry], [%mul2, %loop]
+
+  %i2 = add i32 %i, 1
+  %mul2 = fmul <2 x float> %mul, %v
+
+  %cond2 = icmp ult i32 %i2, 5
+  br i1 %cond2, label %loop, label %exit
+
+exit:
+  ret <2 x float> %mul2
+
+another_load:
+  %v2 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 1, i32 0)
+  %sum = fadd <2 x float> %v, %v2
+  ret <2 x float> %sum
+}
+
+declare <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32>, i32, i32, i32, i32) nounwind