diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -331,6 +331,9 @@
 void initializeGCNPreRAOptimizationsPass(PassRegistry &);
 extern char &GCNPreRAOptimizationsID;
 
+FunctionPass *createAMDGPUSetWavePriorityPass();
+void initializeAMDGPUSetWavePriorityPass(PassRegistry &);
+
 namespace AMDGPU {
 enum TargetIndex {
   TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp
@@ -0,0 +1,166 @@
+//===- AMDGPUSetWavePriority.cpp - Set wave priority ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Pass to temporarily raise the wave priority from the start of the shader
+/// function until its last VMEM instructions, so that younger waves can
+/// issue their VMEM instructions as well.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Allocator.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-set-wave-priority"
+
+namespace {
+
+struct MBBInfo {
+  MBBInfo() = default;
+  bool MayReachVMEMLoad = false;
+};
+
+using MBBInfoSet = DenseMap<const MachineBasicBlock *, MBBInfo>;
+
+class AMDGPUSetWavePriority : public MachineFunctionPass {
+public:
+  static char ID;
+
+  AMDGPUSetWavePriority() : MachineFunctionPass(ID) {}
+
+  StringRef getPassName() const override { return "Set wave priority"; }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+  MachineInstr *BuildSetprioMI(MachineFunction &MF, unsigned priority) const;
+
+  const SIInstrInfo *TII;
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(AMDGPUSetWavePriority, DEBUG_TYPE, "Set wave priority", false,
+                false)
+
+char AMDGPUSetWavePriority::ID = 0;
+
+FunctionPass *llvm::createAMDGPUSetWavePriorityPass() {
+  return new AMDGPUSetWavePriority();
+}
+
+MachineInstr *AMDGPUSetWavePriority::BuildSetprioMI(MachineFunction &MF,
+                                                    unsigned priority) const {
+  return BuildMI(MF, DebugLoc(), TII->get(AMDGPU::S_SETPRIO)).addImm(priority);
+}
+
+// Checks that for every predecessor Pred that can reach a VMEM load,
+// none of Pred's successors can reach a VMEM load.
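+// In other words, lowering the priority at the end of such a Pred is safe
+// only when every VMEM load Pred may reach is contained in Pred itself.
+// For example, in a diamond where only one arm contains a load, the
+// priority can be lowered at the end of that arm, but not at the end of
+// the entry block, since one of its successors still reaches the load.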
+static bool
+CanLowerPriorityDirectlyInPredecessors(const MachineBasicBlock &MBB,
+                                       MBBInfoSet &MBBInfos) {
+  for (const MachineBasicBlock *Pred : MBB.predecessors()) {
+    if (!MBBInfos[Pred].MayReachVMEMLoad)
+      continue;
+    for (const MachineBasicBlock *Succ : Pred->successors()) {
+      if (MBBInfos[Succ].MayReachVMEMLoad)
+        return false;
+    }
+  }
+  return true;
+}
+
+static bool isVMEMLoad(const MachineInstr &MI) {
+  return SIInstrInfo::isVMEM(MI) && MI.mayLoad();
+}
+
+bool AMDGPUSetWavePriority::runOnMachineFunction(MachineFunction &MF) {
+  const unsigned HighPriority = 3;
+  const unsigned LowPriority = 0;
+
+  Function &F = MF.getFunction();
+  if (skipFunction(F) || !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
+    return false;
+
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  TII = ST.getInstrInfo();
+
+  MBBInfoSet MBBInfos;
+  SmallVector<const MachineBasicBlock *> Worklist;
+  for (MachineBasicBlock &MBB : MF) {
+    if (any_of(MBB, isVMEMLoad))
+      Worklist.push_back(&MBB);
+  }
+
+  // Mark blocks from which control may reach VMEM loads.
+  while (!Worklist.empty()) {
+    const MachineBasicBlock *MBB = Worklist.pop_back_val();
+    MBBInfo &Info = MBBInfos[MBB];
+    if (!Info.MayReachVMEMLoad) {
+      Info.MayReachVMEMLoad = true;
+      Worklist.append(MBB->pred_begin(), MBB->pred_end());
+    }
+  }
+
+  MachineBasicBlock &Entry = MF.front();
+  if (!MBBInfos[&Entry].MayReachVMEMLoad)
+    return false;
+
+  // Raise the priority at the beginning of the shader.
+  MachineBasicBlock::iterator I = Entry.begin(), E = Entry.end();
+  while (I != E && !SIInstrInfo::isVALU(*I) && !I->isTerminator())
+    ++I;
+  Entry.insert(I, BuildSetprioMI(MF, HighPriority));
+
+  // Lower the priority on edges where control leaves blocks from which
+  // VMEM loads are reachable.
+  SmallSet<MachineBasicBlock *, 16> PriorityLoweringBlocks;
+  for (MachineBasicBlock &MBB : MF) {
+    if (MBBInfos[&MBB].MayReachVMEMLoad) {
+      if (MBB.succ_empty())
+        PriorityLoweringBlocks.insert(&MBB);
+      continue;
+    }
+
+    if (CanLowerPriorityDirectlyInPredecessors(MBB, MBBInfos)) {
+      for (MachineBasicBlock *Pred : MBB.predecessors()) {
+        if (MBBInfos[Pred].MayReachVMEMLoad)
+          PriorityLoweringBlocks.insert(Pred);
+      }
+      continue;
+    }
+
+    // Where lowering the priority in the predecessors is not possible, the
+    // block receiving control either was never part of a loop in the first
+    // place, or loop canonicalization should already have tried to split
+    // the edge and insert a preheader. If that failed for whatever reason,
+    // the only remaining option is to lower the priority within the loop.
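+    // In that case the lowering S_SETPRIO ends up at the start of this
+    // block (it contains no VMEM load itself); if the block is a loop
+    // header, the instruction executes on every iteration, which is
+    // wasteful but still correct.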
+    PriorityLoweringBlocks.insert(&MBB);
+  }
+
+  for (MachineBasicBlock *MBB : PriorityLoweringBlocks) {
+    MachineBasicBlock::iterator I = MBB->end(), B = MBB->begin();
+    while (I != B) {
+      if (isVMEMLoad(*--I)) {
+        ++I;
+        break;
+      }
+    }
+    MBB->insert(I, BuildSetprioMI(MF, LowPriority));
+  }
+
+  return true;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -277,6 +277,10 @@
                            cl::init(true), cl::Hidden,
                            cl::desc("Enable machine DCE inside regalloc"));
 
+static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
+                                           cl::desc("Adjust wave priority"),
+                                           cl::init(false), cl::Hidden);
+
 static cl::opt<bool> EnableScalarIRPasses(
   "amdgpu-scalar-ir-passes",
   cl::desc("Enable scalar IR passes"),
@@ -1360,6 +1364,8 @@
   addPass(&SIInsertHardClausesID);
   addPass(&SILateBranchLoweringPassID);
+  if (isPassEnabled(EnableSetWavePriority, CodeGenOpt::Less))
+    addPass(createAMDGPUSetWavePriorityPass());
   if (getOptLevel() > CodeGenOpt::None)
     addPass(&SIPreEmitPeepholeID);
   // The hazard recognizer that runs as part of the post-ra scheduler does not
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -89,6 +89,7 @@
   AMDGPUReplaceLDSUseWithPointer.cpp
   AMDGPUResourceUsageAnalysis.cpp
   AMDGPURewriteOutArguments.cpp
+  AMDGPUSetWavePriority.cpp
   AMDGPUSubtarget.cpp
   AMDGPUTargetMachine.cpp
   AMDGPUTargetObjectFile.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll b/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll
@@ -0,0 +1,153 @@
+; RUN: llc -mtriple=amdgcn -amdgpu-set-wave-priority=true -o - %s | \
+; RUN:   FileCheck %s
+
+; CHECK-LABEL: no_setprio:
+; CHECK-NOT: s_setprio
+; CHECK: ; return to shader part epilog
+define amdgpu_ps <2 x float> @no_setprio() {
+  ret <2 x float> <float 0.0, float 0.0>
+}
+
+; CHECK-LABEL: vmem_in_exit_block:
+; CHECK: s_setprio 3
+; CHECK: buffer_load_dwordx2
+; CHECK-NEXT: s_setprio 0
+; CHECK: ; return to shader part epilog
+define amdgpu_ps <2 x float> @vmem_in_exit_block(<4 x i32> inreg %p) {
+  %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
+  ret <2 x float> %v
+}
+
+; CHECK-LABEL: branch:
+; CHECK: s_setprio 3
+; CHECK: s_cbranch_scc0 [[A:.*]]
+; CHECK: {{.*}}: ; %b
+; CHECK: buffer_load_dwordx2
+; CHECK-NEXT: s_setprio 0
+; CHECK: s_branch [[EXIT:.*]]
+; CHECK: [[A]]: ; %a
+; CHECK-NEXT: s_setprio 0
+; CHECK: s_branch [[EXIT]]
+; CHECK-NEXT: [[EXIT]]:
+define amdgpu_ps <2 x float> @branch(<4 x i32> inreg %p, i32 inreg %i) {
+  %cond = icmp eq i32 %i, 0
+  br i1 %cond, label %a, label %b
+
+a:
+  ret <2 x float> <float 0.0, float 0.0>
+
+b:
+  %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
+  ret <2 x float> %v
+}
+
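+; The priority may be lowered more than once along a single path: below, the
+; path %a -> %c executes s_setprio 0 both after the load in %a and again at
+; the start of %c, which is redundant but harmless.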
+; CHECK-LABEL: setprio_follows_setprio:
+; CHECK: s_setprio 3
+; CHECK: buffer_load_dwordx2
+; CHECK: s_cbranch_scc1 [[C:.*]]
+; CHECK: {{.*}}: ; %a
+; CHECK: buffer_load_dwordx2
+; CHECK-NEXT: s_setprio 0
+; CHECK: s_cbranch_scc1 [[C]]
+; CHECK: {{.*}}: ; %b
+; CHECK-NOT: s_setprio
+; CHECK: s_branch [[EXIT:.*]]
+; CHECK: [[C]]: ; %c
+; CHECK-NEXT: s_setprio 0
+; CHECK: s_branch [[EXIT]]
+; CHECK: [[EXIT]]:
+define amdgpu_ps <2 x float> @setprio_follows_setprio(<4 x i32> inreg %p, i32 inreg %i) {
+entry:
+  %v1 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
+  %cond1 = icmp ne i32 %i, 0
+  br i1 %cond1, label %a, label %c
+
+a:
+  %v2 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 1, i32 0)
+  %cond2 = icmp ne i32 %i, 1
+  br i1 %cond2, label %b, label %c
+
+b:
+  ret <2 x float> %v2
+
+c:
+  %v3 = phi <2 x float> [%v1, %entry], [%v2, %a]
+  %v4 = fadd <2 x float> %v1, %v3
+  ret <2 x float> %v4
+}
+
+; CHECK-LABEL: loop:
+; CHECK: {{.*}}: ; %entry
+; CHECK: s_setprio 3
+; CHECK-NOT: s_setprio
+; CHECK: [[LOOP:.*]]: ; %loop
+; CHECK-NOT: s_setprio
+; CHECK: buffer_load_dwordx2
+; CHECK-NOT: s_setprio
+; CHECK: s_cbranch_scc1 [[LOOP]]
+; CHECK-NEXT: {{.*}}: ; %exit
+; CHECK-NEXT: s_setprio 0
+define amdgpu_ps <2 x float> @loop(<4 x i32> inreg %p) {
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [0, %entry], [%i2, %loop]
+  %sum = phi <2 x float> [<float 0.0, float 0.0>, %entry], [%sum2, %loop]
+
+  %i2 = add i32 %i, 1
+
+  %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 %i, i32 0, i32 0, i32 0)
+  %sum2 = fadd <2 x float> %sum, %v
+
+  %cond = icmp ult i32 %i2, 5
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret <2 x float> %sum2
+}
+
+; CHECK-LABEL: edge_split:
+; CHECK: s_setprio 3
+; CHECK: buffer_load_dwordx2
+; CHECK-NOT: s_setprio
+; CHECK: s_cbranch_scc1 [[ANOTHER_LOAD:.*]]
+; CHECK: {{.*}}: ; %loop.preheader
+; CHECK-NEXT: s_setprio 0
+; CHECK: [[LOOP:.*]]: ; %loop
+; CHECK-NOT: s_setprio
+; CHECK: s_cbranch_scc1 [[LOOP]]
+; CHECK: {{.*}}: ; %exit
+; CHECK-NOT: s_setprio
+; CHECK: s_branch [[RET:.*]]
+; CHECK: [[ANOTHER_LOAD]]: ; %another_load
+; CHECK: buffer_load_dwordx2
+; CHECK-NEXT: s_setprio 0
+; CHECK: s_branch [[RET]]
+; CHECK: [[RET]]:
+define amdgpu_ps <2 x float> @edge_split(<4 x i32> inreg %p, i32 inreg %x) {
+entry:
+  %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
+  %cond = icmp ne i32 %x, 0
+  br i1 %cond, label %loop, label %another_load
+
+loop:
+  %i = phi i32 [0, %entry], [%i2, %loop]
+  %mul = phi <2 x float> [%v, %entry], [%mul2, %loop]
+
+  %i2 = add i32 %i, 1
+  %mul2 = fmul <2 x float> %mul, %v
+
+  %cond2 = icmp ult i32 %i2, 5
+  br i1 %cond2, label %loop, label %exit
+
+exit:
+  ret <2 x float> %mul2
+
+another_load:
+  %v2 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 1, i32 0)
+  %sum = fadd <2 x float> %v, %v2
+  ret <2 x float> %sum
+}
+
+declare <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32>, i32, i32, i32, i32) nounwind