Index: include/llvm/CodeGen/MachineInstr.h
===================================================================
--- include/llvm/CodeGen/MachineInstr.h
+++ include/llvm/CodeGen/MachineInstr.h
@@ -919,23 +919,26 @@
 
   /// Returns the operand index that is a use of the specific register or -1
   /// if it is not found. It further tightens the search criteria to a use
-  /// that kills the register if isKill is true.
+  /// that kills the register if isKill is true and to a use that is implicit
+  /// if isImplicit is true.
   int findRegisterUseOperandIdx(unsigned Reg, bool isKill = false,
+                                bool isImplicit = false,
                                 const TargetRegisterInfo *TRI = nullptr) const;
 
   /// Wrapper for findRegisterUseOperandIdx, it returns
   /// a pointer to the MachineOperand rather than an index.
   MachineOperand *findRegisterUseOperand(unsigned Reg, bool isKill = false,
+                                         bool isImplicit = false,
                                          const TargetRegisterInfo *TRI = nullptr) {
-    int Idx = findRegisterUseOperandIdx(Reg, isKill, TRI);
+    int Idx = findRegisterUseOperandIdx(Reg, isKill, isImplicit, TRI);
    return (Idx == -1) ? nullptr : &getOperand(Idx);
   }
 
   const MachineOperand *findRegisterUseOperand(
-      unsigned Reg, bool isKill = false,
+      unsigned Reg, bool isKill = false, bool isImplicit = false,
       const TargetRegisterInfo *TRI = nullptr) const {
     return const_cast<MachineInstr *>(this)->
-      findRegisterUseOperand(Reg, isKill, TRI);
+      findRegisterUseOperand(Reg, isKill, isImplicit, TRI);
   }
 
   /// Returns the operand index that is a def of the specified register or
Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -328,6 +328,13 @@
   [IntrNoMem]>;  // See int_amdgcn_v_interp_p1 for why this is
                  // IntrNoMem.
 
+// Pixel shaders only: whether the current pixel is live (i.e. not a helper
+// invocation for derivative computation).
+def int_amdgcn_ps_live : Intrinsic <
+  [llvm_i1_ty],
+  [],
+  [IntrNoMem]>;
+
 def int_amdgcn_mbcnt_lo :
   GCCBuiltin<"__builtin_amdgcn_mbcnt_lo">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
Index: lib/CodeGen/MachineCSE.cpp
===================================================================
--- lib/CodeGen/MachineCSE.cpp
+++ lib/CodeGen/MachineCSE.cpp
@@ -605,7 +605,7 @@
       for (MachineBasicBlock::iterator II = CSMI, IE = MI; II != IE; ++II)
         for (auto ImplicitDef : ImplicitDefs)
           if (MachineOperand *MO = II->findRegisterUseOperand(
-                  ImplicitDef, /*isKill=*/true, TRI))
+                  ImplicitDef, /*isKill=*/true, /*isImplicit=*/false, TRI))
             MO->setIsKill(false);
     } else {
       // If the instructions aren't in the same BB, bail out and clear the
Index: lib/CodeGen/MachineInstr.cpp
===================================================================
--- lib/CodeGen/MachineInstr.cpp
+++ lib/CodeGen/MachineInstr.cpp
@@ -1273,8 +1273,10 @@
 
 /// findRegisterUseOperandIdx() - Returns the MachineOperand that is a use of
 /// the specific register or -1 if it is not found. It further tightens
-/// the search criteria to a use that kills the register if isKill is true.
+/// the search criteria to a use that kills the register if isKill is true
+/// and to an implicit use if isImplicit is true.
 int MachineInstr::findRegisterUseOperandIdx(unsigned Reg, bool isKill,
+                                            bool isImplicit,
                                             const TargetRegisterInfo *TRI) const {
   for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
     const MachineOperand &MO = getOperand(i);
@@ -1287,9 +1289,11 @@
         (TRI && TargetRegisterInfo::isPhysicalRegister(MOReg) &&
          TargetRegisterInfo::isPhysicalRegister(Reg) &&
-         TRI->isSubRegister(MOReg, Reg)))
-      if (!isKill || MO.isKill())
+         TRI->isSubRegister(MOReg, Reg))) {
+      if ((!isKill || MO.isKill()) &&
+          (!isImplicit || MO.isImplicit()))
         return i;
+    }
   }
   return -1;
 }
Index: lib/CodeGen/StackSlotColoring.cpp
===================================================================
--- lib/CodeGen/StackSlotColoring.cpp
+++ lib/CodeGen/StackSlotColoring.cpp
@@ -407,7 +407,7 @@
       ++NumDead;
       changed = true;
 
-      if (NextMI->findRegisterUseOperandIdx(LoadReg, true, nullptr) != -1) {
+      if (NextMI->findRegisterUseOperandIdx(LoadReg, true) != -1) {
         ++NumDead;
         toErase.push_back(I);
       }
Index: lib/CodeGen/TwoAddressInstructionPass.cpp
===================================================================
--- lib/CodeGen/TwoAddressInstructionPass.cpp
+++ lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -294,7 +294,7 @@
   if (!LIS) {
     // Update kill and LV information.
     KillMO->setIsKill(false);
-    KillMO = MI->findRegisterUseOperand(SavedReg, false, TRI);
+    KillMO = MI->findRegisterUseOperand(SavedReg, false, false, TRI);
     KillMO->setIsKill(true);
 
     if (LV)
@@ -708,7 +708,7 @@
     if (LIS)
       LIS->ReplaceMachineInstrInMaps(*mi, *NewMI);
 
-    if (NewMI->findRegisterUseOperand(RegB, false, TRI))
+    if (NewMI->findRegisterUseOperand(RegB, false, false, TRI))
       // FIXME: Temporary workaround. If the new instruction doesn't
       // uses RegB, convertToThreeAddress must have created more
       // then one instruction.
Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -250,6 +250,7 @@
   case Intrinsic::amdgcn_buffer_atomic_or:
   case Intrinsic::amdgcn_buffer_atomic_xor:
   case Intrinsic::amdgcn_buffer_atomic_cmpswap:
+  case Intrinsic::amdgcn_ps_live:
     return true;
   }
 
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1597,8 +1597,10 @@
   // Make sure we aren't losing exec uses in the td files. This mostly requires
   // being careful when using let Uses to try to add other use registers.
   if (!isGenericOpcode(Opcode) && !isSALU(Opcode) && !isSMRD(Opcode)) {
-    const MachineOperand *Exec = MI->findRegisterUseOperand(AMDGPU::EXEC);
-    if (!Exec || !Exec->isImplicit()) {
+    const MachineOperand *Exec =
+        MI->findRegisterUseOperand(AMDGPU::EXEC, /*isKill=*/false,
+                                   /*isImplicit*/true);
+    if (!Exec) {
       ErrInfo = "VALU instruction does not implicitly read exec mask";
       return false;
     }
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -2014,6 +2014,14 @@
 
 } // End mayLoad = 1, mayStore = 1, hasSideEffects = 1
 
+let SALU = 1 in
+def SI_PS_LIVE : InstSI <
+  (outs SReg_64:$dst),
+  (ins),
+  "si_ps_live $dst",
+  [(set i1:$dst, (int_amdgcn_ps_live))]
+>;
+
 // Used as an isel pseudo to directly emit initialization with an
 // s_mov_b32 rather than a copy of another initialized
 // register. MachineCSE skips copies, and we don't want to have to
Index: lib/Target/AMDGPU/SIWholeQuadMode.cpp
===================================================================
--- lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -101,11 +101,12 @@
   DenseMap<const MachineInstr *, InstrInfo> Instructions;
   DenseMap<const MachineBasicBlock *, BlockInfo> Blocks;
   SmallVector<const MachineInstr *, 2> ExecExports;
+  SmallVector<MachineInstr *, 1> LiveMaskQueries;
 
-  char scanInstructions(const MachineFunction &MF, std::vector<WorkItem>& Worklist);
+  char scanInstructions(MachineFunction &MF, std::vector<WorkItem>& Worklist);
   void propagateInstruction(const MachineInstr &MI, std::vector<WorkItem>& Worklist);
   void propagateBlock(const MachineBasicBlock &MBB, std::vector<WorkItem>& Worklist);
-  char analyzeFunction(const MachineFunction &MF);
+  char analyzeFunction(MachineFunction &MF);
 
   void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
                unsigned SaveWQM, unsigned LiveMaskReg);
@@ -113,6 +114,8 @@
               unsigned SavedWQM);
   void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);
 
+  void lowerLiveMaskQueries(unsigned LiveMaskReg);
+
 public:
   static char ID;
 
@@ -148,15 +151,15 @@
 
 // Scan instructions to determine which ones require an Exact execmask and
 // which ones seed WQM requirements.
-char SIWholeQuadMode::scanInstructions(const MachineFunction &MF,
+char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                        std::vector<WorkItem> &Worklist) {
   char GlobalFlags = 0;
 
   for (auto BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) {
-    const MachineBasicBlock &MBB = *BI;
+    MachineBasicBlock &MBB = *BI;
 
     for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
-      const MachineInstr &MI = *II;
+      MachineInstr &MI = *II;
       unsigned Opcode = MI.getOpcode();
       char Flags;
@@ -167,8 +170,13 @@
         Flags = StateExact;
       } else {
         // Handle export instructions with the exec mask valid flag set
-        if (Opcode == AMDGPU::EXP && MI.getOperand(4).getImm() != 0)
-          ExecExports.push_back(&MI);
+        if (Opcode == AMDGPU::EXP) {
+          if (MI.getOperand(4).getImm() != 0)
+            ExecExports.push_back(&MI);
+        } else if (Opcode == AMDGPU::SI_PS_LIVE) {
+          LiveMaskQueries.push_back(&MI);
+        }
+
         continue;
       }
@@ -290,7 +298,7 @@
   }
 }
 
-char SIWholeQuadMode::analyzeFunction(const MachineFunction &MF) {
+char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
   std::vector<WorkItem> Worklist;
   char GlobalFlags = scanInstructions(MF, Worklist);
 
@@ -424,6 +432,16 @@
   }
 }
 
+void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
+  for (MachineInstr *MI : LiveMaskQueries) {
+    DebugLoc DL = MI->getDebugLoc();
+    unsigned Dest = MI->getOperand(0).getReg();
+    BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
+        .addReg(LiveMaskReg);
+    MI->eraseFromParent();
+  }
+}
+
 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
   if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS)
     return false;
@@ -431,30 +449,43 @@
   Instructions.clear();
   Blocks.clear();
   ExecExports.clear();
+  LiveMaskQueries.clear();
 
   TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
   TRI = static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
   MRI = &MF.getRegInfo();
 
   char GlobalFlags = analyzeFunction(MF);
 
-  if (!(GlobalFlags & StateWQM))
-    return false;
+  if (!(GlobalFlags & StateWQM)) {
+    lowerLiveMaskQueries(AMDGPU::EXEC);
+    return !LiveMaskQueries.empty();
+  }
 
+  // Store a copy of the original live mask when required
   MachineBasicBlock &Entry = MF.front();
   MachineInstr *EntryMI = Entry.getFirstNonPHI();
+  unsigned LiveMaskReg = 0;
+
+  if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
+    LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+    BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
+      .addReg(AMDGPU::EXEC);
+  }
 
   if (GlobalFlags == StateWQM) {
     // For a shader that needs only WQM, we can just set it once.
     BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
             AMDGPU::EXEC).addReg(AMDGPU::EXEC);
+
+    lowerLiveMaskQueries(LiveMaskReg);
+    // EntryMI may become invalid here
     return true;
   }
 
-  // Handle the general case
-  unsigned LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
-  BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
-    .addReg(AMDGPU::EXEC);
+  lowerLiveMaskQueries(LiveMaskReg);
+  EntryMI = nullptr;
+
+  // Handle the general case
   for (const auto &BII : Blocks)
     processBlock(const_cast<MachineBasicBlock &>(*BII.first), LiveMaskReg,
                  BII.first == &*MF.begin());
Index: lib/Target/ARM/ARMBaseInstrInfo.cpp
===================================================================
--- lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -3440,7 +3440,7 @@
     // FIXME: This doesn't properly handle multiple uses.
     int Idx = -1;
     while (II != E && II->isInsideBundle()) {
-      Idx = II->findRegisterUseOperandIdx(Reg, false, TRI);
+      Idx = II->findRegisterUseOperandIdx(Reg, false, false, TRI);
       if (Idx != -1)
         break;
       if (II->getOpcode() != ARM::t2IT)
@@ -4473,7 +4473,7 @@
   case ARM::VMOVv2i32:
   case ARM::VMOVv2f32:
   case ARM::VMOVv1i64:
-    UseOp = MI->findRegisterUseOperandIdx(Reg, false, TRI);
+    UseOp = MI->findRegisterUseOperandIdx(Reg, false, false, TRI);
     break;
 
   // Explicitly reads the dependency.
Index: lib/Target/SystemZ/SystemZElimCompare.cpp
===================================================================
--- lib/Target/SystemZ/SystemZElimCompare.cpp
+++ lib/Target/SystemZ/SystemZElimCompare.cpp
@@ -427,7 +427,7 @@
     RegMask = MBBI->getOperand(2).getRegMask();
 
   // Clear out all current operands.
-  int CCUse = MBBI->findRegisterUseOperandIdx(SystemZ::CC, false, TRI);
+  int CCUse = MBBI->findRegisterUseOperandIdx(SystemZ::CC, false, false, TRI);
   assert(CCUse >= 0 && "BRC/BCR must use CC");
   Branch->RemoveOperand(CCUse);
 
   // Remove target (branch) or regmask (sibcall).
Index: test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll
@@ -0,0 +1,59 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
+
+; CHECK-LABEL: {{^}}test1:
+; CHECK: v_cndmask_b32_e64 v0, 0, 1, exec
+;
+; Note: We could generate better code here if we recognized earlier that
+; there is no WQM use and therefore llvm.amdgcn.ps.live is constant. However,
+; the expectation is that the intrinsic will be used in non-trivial shaders,
+; so such an optimization doesn't seem worth the effort.
+define amdgpu_ps float @test1() {
+  %live = call i1 @llvm.amdgcn.ps.live()
+  %live.32 = zext i1 %live to i32
+  %r = bitcast i32 %live.32 to float
+  ret float %r
+}
+
+; CHECK-LABEL: {{^}}test2:
+; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK-DAG: s_wqm_b64 exec, exec
+; CHECK-DAG: v_cndmask_b32_e64 [[VAR:v[0-9]+]], 0, 1, [[LIVE]]
+; CHECK: image_sample v0, [[VAR]],
+define amdgpu_ps float @test2() {
+  %live = call i1 @llvm.amdgcn.ps.live()
+  %live.32 = zext i1 %live to i32
+
+  %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %live.32, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+
+  %r = extractelement <4 x float> %t, i32 0
+  ret float %r
+}
+
+; CHECK-LABEL: {{^}}test3:
+; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK-DAG: s_wqm_b64 exec, exec
+; CHECK-DAG: s_xor_b64 [[HELPER:s\[[0-9]+:[0-9]+\]]], [[LIVE]], -1
+; CHECK-DAG: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[HELPER]]
+; CHECK: ; %dead
+define amdgpu_ps float @test3(i32 %in) {
+entry:
+  %live = call i1 @llvm.amdgcn.ps.live()
+  br i1 %live, label %end, label %dead
+
+dead:
+  %tc.dead = mul i32 %in, 2
+  br label %end
+
+end:
+  %tc = phi i32 [ %in, %entry ], [ %tc.dead, %dead ]
+  %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %tc, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+
+  %r = extractelement <4 x float> %t, i32 0
+  ret float %r
+}
+
+declare i1 @llvm.amdgcn.ps.live() #0
+
+declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+
+attributes #0 = { nounwind readnone }