Index: llvm/trunk/include/llvm/CodeGen/MachineInstr.h
===================================================================
--- llvm/trunk/include/llvm/CodeGen/MachineInstr.h
+++ llvm/trunk/include/llvm/CodeGen/MachineInstr.h
@@ -917,6 +917,10 @@
     return findRegisterDefOperandIdx(Reg, true, false, TRI) != -1;
   }
 
+  /// Returns true if the MachineInstr has an implicit-use operand of exactly
+  /// the given register (not considering sub/super-registers).
+  bool hasRegisterImplicitUseOperand(unsigned Reg) const;
+
   /// Returns the operand index that is a use of the specific register or -1
   /// if it is not found. It further tightens the search criteria to a use
   /// that kills the register if isKill is true.
Index: llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
+++ llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -328,6 +328,13 @@
   [IntrNoMem]>;  // See int_amdgcn_v_interp_p1 for why this is
                  // IntrNoMem.
 
+// Pixel shaders only: whether the current pixel is live (i.e. not a helper
+// invocation for derivative computation).
+def int_amdgcn_ps_live : Intrinsic <
+  [llvm_i1_ty],
+  [],
+  [IntrNoMem]>;
+
 def int_amdgcn_mbcnt_lo :
   GCCBuiltin<"__builtin_amdgcn_mbcnt_lo">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
Index: llvm/trunk/lib/CodeGen/MachineInstr.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/MachineInstr.cpp
+++ llvm/trunk/lib/CodeGen/MachineInstr.cpp
@@ -1271,6 +1271,17 @@
   return Size;
 }
 
+/// Returns true if the MachineInstr has an implicit-use operand of exactly
+/// the given register (not considering sub/super-registers).
+bool MachineInstr::hasRegisterImplicitUseOperand(unsigned Reg) const {
+  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = getOperand(i);
+    if (MO.isReg() && MO.isUse() && MO.isImplicit() && MO.getReg() == Reg)
+      return true;
+  }
+  return false;
+}
+
 /// findRegisterUseOperandIdx() - Returns the MachineOperand that is a use of
 /// the specific register or -1 if it is not found. It further tightens
 /// the search criteria to a use that kills the register if isKill is true.
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -250,6 +250,7 @@
   case Intrinsic::amdgcn_buffer_atomic_or:
   case Intrinsic::amdgcn_buffer_atomic_xor:
   case Intrinsic::amdgcn_buffer_atomic_cmpswap:
+  case Intrinsic::amdgcn_ps_live:
     return true;
   }
Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1597,8 +1597,7 @@
   // Make sure we aren't losing exec uses in the td files. This mostly requires
   // being careful when using let Uses to try to add other use registers.
   if (!isGenericOpcode(Opcode) && !isSALU(Opcode) && !isSMRD(Opcode)) {
-    const MachineOperand *Exec = MI->findRegisterUseOperand(AMDGPU::EXEC);
-    if (!Exec || !Exec->isImplicit()) {
+    if (!MI->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
      ErrInfo = "VALU instruction does not implicitly read exec mask";
      return false;
    }
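
To illustrate the new helper in isolation (not part of the patch): a verifier-style scan that applies the same check as the SIInstrInfo.cpp hunk above to every instruction in a block. The function name allVALUReadExec is hypothetical and assumes the usual LLVM CodeGen/AMDGPU headers; MachineInstr::hasRegisterImplicitUseOperand and SIInstrInfo::isVALU are the in-tree calls.

// Hypothetical sketch, mirroring the verifier check above; not in-tree.
static bool allVALUReadExec(const MachineBasicBlock &MBB,
                            const SIInstrInfo &TII) {
  for (const MachineInstr &MI : MBB) {
    // Exact-register match only: sub/super-registers of EXEC do not count.
    if (TII.isVALU(MI.getOpcode()) &&
        !MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC))
      return false; // would trigger "does not implicitly read exec mask"
  }
  return true;
}
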
Index: llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
+++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
@@ -2003,6 +2003,14 @@
 
 } // End mayLoad = 1, mayStore = 1, hasSideEffects = 1
 
+let SALU = 1 in
+def SI_PS_LIVE : InstSI <
+  (outs SReg_64:$dst),
+  (ins),
+  "si_ps_live $dst",
+  [(set i1:$dst, (int_amdgcn_ps_live))]
+>;
+
 // Used as an isel pseudo to directly emit initialization with an
 // s_mov_b32 rather than a copy of another initialized
 // register. MachineCSE skips copies, and we don't want to have to
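
For context on how the intrinsic reaches the selector: a frontend emits a plain call to llvm.amdgcn.ps.live, which the pattern above selects into the SI_PS_LIVE pseudo. A minimal IRBuilder sketch follows; emitPsLive is an illustrative name, not an in-tree API, while the declaration/call helpers are standard LLVM IR construction calls.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

// Illustrative only: materialize a call to llvm.amdgcn.ps.live at the
// builder's current insertion point. The result is an i1 that is true
// for real pixels and false for helper invocations.
static llvm::Value *emitPsLive(llvm::Module &M, llvm::IRBuilder<> &Builder) {
  llvm::Function *PsLive =
      llvm::Intrinsic::getDeclaration(&M, llvm::Intrinsic::amdgcn_ps_live);
  return Builder.CreateCall(PsLive);
}
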
Index: llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -101,11 +101,12 @@
   DenseMap<const MachineInstr *, InstrInfo> Instructions;
   DenseMap<const MachineBasicBlock *, BlockInfo> Blocks;
   SmallVector<const MachineInstr *, 2> ExecExports;
+  SmallVector<MachineInstr *, 1> LiveMaskQueries;
 
-  char scanInstructions(const MachineFunction &MF, std::vector<WorkItem>& Worklist);
+  char scanInstructions(MachineFunction &MF, std::vector<WorkItem>& Worklist);
   void propagateInstruction(const MachineInstr &MI, std::vector<WorkItem>& Worklist);
   void propagateBlock(const MachineBasicBlock &MBB, std::vector<WorkItem>& Worklist);
-  char analyzeFunction(const MachineFunction &MF);
+  char analyzeFunction(MachineFunction &MF);
 
   void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
                unsigned SaveWQM, unsigned LiveMaskReg);
@@ -113,6 +114,8 @@
              unsigned SavedWQM);
   void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);
 
+  void lowerLiveMaskQueries(unsigned LiveMaskReg);
+
 public:
   static char ID;
@@ -148,15 +151,15 @@
 // Scan instructions to determine which ones require an Exact execmask and
 // which ones seed WQM requirements.
-char SIWholeQuadMode::scanInstructions(const MachineFunction &MF,
+char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                        std::vector<WorkItem> &Worklist) {
   char GlobalFlags = 0;
 
   for (auto BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) {
-    const MachineBasicBlock &MBB = *BI;
+    MachineBasicBlock &MBB = *BI;
 
     for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
-      const MachineInstr &MI = *II;
+      MachineInstr &MI = *II;
       unsigned Opcode = MI.getOpcode();
       char Flags;
@@ -167,8 +170,13 @@
         Flags = StateExact;
       } else {
         // Handle export instructions with the exec mask valid flag set
-        if (Opcode == AMDGPU::EXP && MI.getOperand(4).getImm() != 0)
-          ExecExports.push_back(&MI);
+        if (Opcode == AMDGPU::EXP) {
+          if (MI.getOperand(4).getImm() != 0)
+            ExecExports.push_back(&MI);
+        } else if (Opcode == AMDGPU::SI_PS_LIVE) {
+          LiveMaskQueries.push_back(&MI);
+        }
+
         continue;
       }
@@ -290,7 +298,7 @@
   }
 }
 
-char SIWholeQuadMode::analyzeFunction(const MachineFunction &MF) {
+char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
   std::vector<WorkItem> Worklist;
   char GlobalFlags = scanInstructions(MF, Worklist);
@@ -424,6 +432,16 @@
   }
 }
 
+void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
+  for (MachineInstr *MI : LiveMaskQueries) {
+    DebugLoc DL = MI->getDebugLoc();
+    unsigned Dest = MI->getOperand(0).getReg();
+    BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
+      .addReg(LiveMaskReg);
+    MI->eraseFromParent();
+  }
+}
+
 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
   if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS)
     return false;
@@ -431,30 +449,43 @@
   Instructions.clear();
   Blocks.clear();
   ExecExports.clear();
+  LiveMaskQueries.clear();
 
   TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
   TRI = static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
   MRI = &MF.getRegInfo();
 
   char GlobalFlags = analyzeFunction(MF);
-  if (!(GlobalFlags & StateWQM))
-    return false;
+  if (!(GlobalFlags & StateWQM)) {
+    lowerLiveMaskQueries(AMDGPU::EXEC);
+    return !LiveMaskQueries.empty();
+  }
 
+  // Store a copy of the original live mask when required
   MachineBasicBlock &Entry = MF.front();
   MachineInstr *EntryMI = Entry.getFirstNonPHI();
+  unsigned LiveMaskReg = 0;
+
+  if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
+    LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+    BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
+      .addReg(AMDGPU::EXEC);
+  }
 
   if (GlobalFlags == StateWQM) {
     // For a shader that needs only WQM, we can just set it once.
     BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
             AMDGPU::EXEC).addReg(AMDGPU::EXEC);
+
+    lowerLiveMaskQueries(LiveMaskReg);
+    // EntryMI may become invalid here
     return true;
   }
 
-  // Handle the general case
-  unsigned LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
-  BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
-    .addReg(AMDGPU::EXEC);
+  lowerLiveMaskQueries(LiveMaskReg);
+  EntryMI = nullptr;
 
+  // Handle the general case
   for (const auto &BII : Blocks)
     processBlock(const_cast<MachineBasicBlock &>(*BII.first), LiveMaskReg,
                  BII.first == &*MF.begin());
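
To make the exec/WQM interaction concrete: s_wqm_b64 enables every lane of a 2x2 quad whenever any lane of that quad is live, which is why the pass must copy EXEC into LiveMaskReg before entering WQM; afterwards EXEC no longer distinguishes real pixels from helper lanes. A standalone model of that semantics, in plain C++ and purely illustrative:

#include <cstdint>

// Illustrative model of s_wqm_b64: for each 2x2 quad (four consecutive
// lanes of the 64-bit exec mask), enable all four lanes if any is live.
static uint64_t wholeQuadMode(uint64_t Exec) {
  uint64_t WQM = 0;
  for (unsigned Quad = 0; Quad < 64; Quad += 4)
    if ((Exec >> Quad) & 0xF)       // any live lane in this quad?
      WQM |= UINT64_C(0xF) << Quad; // then light up the whole quad
  return WQM;
}

// Example: wholeQuadMode(0x1) == 0xF, i.e. one live pixel drags in its
// three helper lanes, while the saved LiveMaskReg copy still reads 0x1.
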
Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll
@@ -0,0 +1,59 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
+
+; CHECK-LABEL: {{^}}test1:
+; CHECK: v_cndmask_b32_e64 v0, 0, 1, exec
+;
+; Note: We could generate better code here if we recognized earlier that
+; there is no WQM use and therefore llvm.amdgcn.ps.live is constant. However,
+; the expectation is that the intrinsic will be used in non-trivial shaders,
+; so such an optimization doesn't seem worth the effort.
+define amdgpu_ps float @test1() {
+  %live = call i1 @llvm.amdgcn.ps.live()
+  %live.32 = zext i1 %live to i32
+  %r = bitcast i32 %live.32 to float
+  ret float %r
+}
+
+; CHECK-LABEL: {{^}}test2:
+; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK-DAG: s_wqm_b64 exec, exec
+; CHECK-DAG: v_cndmask_b32_e64 [[VAR:v[0-9]+]], 0, 1, [[LIVE]]
+; CHECK: image_sample v0, [[VAR]],
+define amdgpu_ps float @test2() {
+  %live = call i1 @llvm.amdgcn.ps.live()
+  %live.32 = zext i1 %live to i32
+
+  %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %live.32, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+
+  %r = extractelement <4 x float> %t, i32 0
+  ret float %r
+}
+
+; CHECK-LABEL: {{^}}test3:
+; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK-DAG: s_wqm_b64 exec, exec
+; CHECK-DAG: s_xor_b64 [[HELPER:s\[[0-9]+:[0-9]+\]]], [[LIVE]], -1
+; CHECK-DAG: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[HELPER]]
+; CHECK: ; %dead
+define amdgpu_ps float @test3(i32 %in) {
+entry:
+  %live = call i1 @llvm.amdgcn.ps.live()
+  br i1 %live, label %end, label %dead
+
+dead:
+  %tc.dead = mul i32 %in, 2
+  br label %end
+
+end:
+  %tc = phi i32 [ %in, %entry ], [ %tc.dead, %dead ]
+  %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %tc, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+
+  %r = extractelement <4 x float> %t, i32 0
+  ret float %r
+}
+
+declare i1 @llvm.amdgcn.ps.live() #0
+
+declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+
+attributes #0 = { nounwind readnone }