Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -537,8 +537,9 @@
     insertPass(&MachineSchedulerID, &RegisterCoalescerID);
   }
 
+  insertPass(&MachineSchedulerID, &SIWholeQuadModeID);
+
   addPass(createSIShrinkInstructionsPass());
-  addPass(createSIWholeQuadModePass());
 }
 
 void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
Index: lib/Target/AMDGPU/SIWholeQuadMode.cpp
===================================================================
--- lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -12,10 +12,9 @@
 /// shaders.
 ///
 /// Whole quad mode is required for derivative computations, but it interferes
-/// with shader side effects (stores and atomics). This pass is run on the
-/// scheduled machine IR but before register coalescing, so that machine SSA is
-/// available for analysis. It ensures that WQM is enabled when necessary, but
-/// disabled around stores and atomics.
+/// with shader side effects (stores and atomics). This pass is run after
+/// machine instruction scheduling but before register allocation. It ensures
+/// that WQM is enabled when necessary, but disabled around stores and atomics.
 ///
 /// When necessary, this pass creates a function prolog
 ///
@@ -57,6 +56,9 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Format.h"
+
+#include <map>
 
 using namespace llvm;
 
@@ -75,18 +77,35 @@
 };
 
 struct BlockInfo {
-  char Needs = 0;
-  char InNeeds = 0;
-  char OutNeeds = 0;
+  struct NeedsFlags {
+    char Self = 0;
+    char In = 0;
+    char Out = 0;
+  };
+
+  NeedsFlags Needs;
+  NeedsFlags Propagated;
 };
 
 struct WorkItem {
+  struct Value {
+    SlotIndex Slot;
+    unsigned Reg = 0;
+    LaneBitmask LaneMask = 0;
+  };
+
   MachineBasicBlock *MBB = nullptr;
   MachineInstr *MI = nullptr;
+  Value V;
 
   WorkItem() {}
   WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
   WorkItem(MachineInstr *MI) : MI(MI) {}
+  WorkItem(SlotIndex Slot, unsigned Reg, unsigned LaneMask) {
+    V.Slot = Slot;
+    V.Reg = Reg;
+    V.LaneMask = LaneMask;
+  }
 };
 
 class SIWholeQuadMode : public MachineFunctionPass {
@@ -94,15 +113,25 @@
   const SIInstrInfo *TII;
   const SIRegisterInfo *TRI;
   MachineRegisterInfo *MRI;
+  LiveIntervals *LIS;
 
   DenseMap<const MachineInstr *, InstrInfo> Instructions;
   DenseMap<const MachineBasicBlock *, BlockInfo> Blocks;
+  std::map<std::pair<SlotIndex, unsigned>, LaneBitmask> WQMValues;
   SmallVector<MachineInstr *, 1> LiveMaskQueries;
 
-  void printInfo();
+  void printInfo(MachineFunction &MF) const;
 
+  void markValueWQM(SlotIndex Slot, unsigned Reg, LaneBitmask LaneMask,
+                    std::vector<WorkItem> &Worklist);
   void markUsesWQM(const MachineInstr &MI, std::vector<WorkItem> &Worklist);
+  void markInstruction(MachineInstr &MI, char Flag,
+                       std::vector<WorkItem> &Worklist);
   char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
+  void propagateValueSub(const WorkItem::Value &V, LaneBitmask LaneMask,
+                         const LiveRange &LR, std::vector<WorkItem> &Worklist);
+  void propagateValue(const WorkItem::Value &V,
+                      std::vector<WorkItem> &Worklist);
   void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
   void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
   char analyzeFunction(MachineFunction &MF);
@@ -128,6 +157,9 @@
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LiveIntervals>();
+    AU.addPreserved<SlotIndexes>();
+    AU.addPreserved<LiveIntervals>();
     AU.setPreservesCFG();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
@@ -137,8 +169,11 @@
 
 char SIWholeQuadMode::ID = 0;
 
-INITIALIZE_PASS(SIWholeQuadMode, DEBUG_TYPE,
-                "SI Whole Quad Mode", false, false)
+INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
+                      false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
+                    false)
 
 char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
 
@@ -153,39 +188,58 @@
     Str = "WQM";
 
   if (state & StateExact) {
     if (!Str.empty())
-      Str += "|";
+      Str += '|';
     Str += "Exact";
   }
 
   return Str;
 }
 
-void SIWholeQuadMode::printInfo() {
-  for (const auto &BII : Blocks) {
-    dbgs() << "\nBB#" << BII.first->getNumber() << ":\n";
-    dbgs() << "  InNeeds = " << stateString(BII.second.InNeeds)
-           << ", Needs = " << stateString(BII.second.Needs)
-           << ", OutNeeds = " << stateString(BII.second.OutNeeds) << "\n\n";
+void SIWholeQuadMode::printInfo(MachineFunction &MF) const {
+  for (MachineBasicBlock &MBB : MF) {
+    BlockInfo BI;
+    auto BII = Blocks.find(&MBB);
+    if (BII != Blocks.end())
+      BI = BII->second;
 
-    for (const MachineInstr &MI : *BII.first) {
+    dbgs() << "\nBB#" << MBB.getNumber()
+           << ": In = " << stateString(BI.Needs.In)
+           << ", Self = " << stateString(BI.Needs.Self)
+           << ", Out = " << stateString(BI.Needs.Out) << '\n';
+
+    for (MachineInstr &MI : MBB) {
+      InstrInfo II;
       auto III = Instructions.find(&MI);
-      if (III == Instructions.end())
-        continue;
+      if (III != Instructions.end())
+        II = III->second;
 
-      dbgs() << "  " << MI;
-      dbgs() << "    Needs = " << stateString(III->second.Needs)
-             << ", OutNeeds = " << stateString(III->second.OutNeeds) << "\n";
+      dbgs() << ' ' << left_justify(stateString(II.Needs), 5) << ' ' << MI;
     }
   }
 }
 
+void SIWholeQuadMode::markValueWQM(SlotIndex Slot, unsigned Reg,
+                                   LaneBitmask LaneMask,
+                                   std::vector<WorkItem> &Worklist) {
+  LaneBitmask &Mask = WQMValues[std::make_pair(Slot, Reg)];
+  LaneMask &= ~Mask;
+  if (LaneMask) {
+    Mask |= LaneMask;
+    Worklist.emplace_back(Slot, Reg, LaneMask);
+  }
+}
+
 /// Mark all instructions defining the uses in \p MI as WQM.
 void SIWholeQuadMode::markUsesWQM(const MachineInstr &MI,
                                   std::vector<WorkItem> &Worklist) {
+  SlotIndex Idx = LIS->getInstructionIndex(MI);
+
   for (const MachineOperand &Use : MI.uses()) {
     if (!Use.isReg() || !Use.isUse())
       continue;
 
+    unsigned Reg = Use.getReg();
+
     // At this point, physical registers appear as (shader) inputs or
     // non-monolithic shader outputs. Following those makes no sense (and would
     // in fact be incorrect when the same VGPR is used as both an output and an
@@ -193,27 +247,49 @@
     //
     // Note: VCC appears e.g. in 64-bit addition with carry - theoretically we
     // have to trace this, in practice it happens for 64-bit computations like
-    // pointers where both dwords are followed already anyway.
-    if (!TargetRegisterInfo::isVirtualRegister(Use.getReg()))
+    // pointers where both dwords are followed already anyway. Branch-relevant
+    // code still uses virtual registers at this point.
+    if (!TargetRegisterInfo::isVirtualRegister(Reg))
       continue;
 
-    for (MachineOperand &Def : MRI->def_operands(Use.getReg())) {
-      MachineInstr *DefMI = Def.getParent();
-      InstrInfo &DefII = Instructions[DefMI];
-
-      // Obviously skip if DefMI is already flagged as NeedWQM.
-      //
-      // The instruction might also be flagged as NeedExact. This happens when
-      // the result of an atomic is used in a WQM computation. In this case,
-      // the atomic must not run for helper pixels and the WQM result is
-      // undefined.
-      if (DefII.Needs != 0)
-        continue;
-
-      DefII.Needs = StateWQM;
-      Worklist.push_back(DefMI);
+    unsigned SubReg = Use.getSubReg();
+    LaneBitmask LaneMask = SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
+                                  : MRI->getMaxLaneMaskForVReg(Reg);
+
+    markValueWQM(Idx, Reg, LaneMask, Worklist);
+  }
+}
+
+void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
+                                      std::vector<WorkItem> &Worklist) {
+  InstrInfo &II = Instructions[&MI];
+  if (II.Needs == Flag)
+    return;
+
+  assert(!II.Needs);
+  assert(Flag == StateWQM || Flag == StateExact);
+
+  II.Needs = Flag;
+
+  MachineBasicBlock *MBB = MI.getParent();
+  BlockInfo &BI = Blocks[MBB];
+
+  if (!(BI.Needs.Self & Flag)) {
+    BI.Needs.Self |= Flag;
+    BI.Needs.In |= Flag;
+    Worklist.push_back(MBB);
+  }
+
+  if (MachineInstr *PrevMI = MI.getPrevNode()) {
+    InstrInfo &PrevII = Instructions[PrevMI];
+    if (Flag & ~PrevII.OutNeeds) {
+      PrevII.OutNeeds |= Flag;
+      Worklist.push_back(PrevMI);
     }
   }
+
+  if (Flag == StateWQM)
+    markUsesWQM(MI, Worklist);
 }
 
 // Scan instructions to determine which ones require an Exact execmask and
@@ -229,10 +305,10 @@
     for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
       MachineInstr &MI = *II;
       unsigned Opcode = MI.getOpcode();
-      char Flags = 0;
 
       if (TII->isDS(Opcode)) {
-        Flags = StateWQM;
+        markInstruction(MI, StateWQM, Worklist);
+        GlobalFlags |= StateWQM;
       } else if (TII->isWQM(Opcode)) {
         // Sampling instructions don't need to produce results for all pixels
         // in a quad, they just require all inputs of a quad to have been
@@ -240,13 +316,13 @@
         markUsesWQM(MI, Worklist);
         GlobalFlags |= StateWQM;
       } else if (MI.mayStore() && TII->usesVM_CNT(MI)) {
-        Flags = StateExact;
+        markInstruction(MI, StateExact, Worklist);
+        GlobalFlags |= StateExact;
       } else {
         if (Opcode == AMDGPU::SI_PS_LIVE) {
           LiveMaskQueries.push_back(&MI);
         } else if (WQMOutputs) {
-          // The function is in machine SSA form, which means that physical
-          // VGPRs correspond to shader inputs and outputs. Inputs are
+          // Physical VGPRs correspond to shader inputs and outputs. Inputs are
           // only used, outputs are only defined.
           for (const MachineOperand &MO : MI.defs()) {
             if (!MO.isReg())
@@ -256,24 +332,20 @@
 
             if (!TRI->isVirtualRegister(Reg) &&
                 TRI->hasVGPRs(TRI->getPhysRegClass(Reg))) {
-              Flags = StateWQM;
+              markInstruction(MI, StateWQM, Worklist);
+              GlobalFlags |= StateWQM;
               break;
             }
           }
         }
-
-        if (!Flags)
-          continue;
       }
-
-      Instructions[&MI].Needs = Flags;
-      Worklist.push_back(&MI);
-      GlobalFlags |= Flags;
     }
 
     if (WQMOutputs && MBB.succ_empty()) {
       // This is a prolog shader. Make sure we go back to exact mode at the end.
-      Blocks[&MBB].OutNeeds = StateExact;
+      assert(!Blocks[&MBB].Needs.Out);
+      Blocks[&MBB].Needs.Out = StateExact;
+      Blocks[&MBB].Needs.In |= StateExact;
       Worklist.push_back(&MBB);
       GlobalFlags |= StateExact;
     }
@@ -282,78 +354,118 @@
   return GlobalFlags;
 }
 
-void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
-                                           std::vector<WorkItem>& Worklist) {
-  MachineBasicBlock *MBB = MI.getParent();
-  InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
-  BlockInfo &BI = Blocks[MBB];
+/// Helper function of \ref propagateValue that handles individual subranges.
+void SIWholeQuadMode::propagateValueSub(const WorkItem::Value &V,
+                                        LaneBitmask LaneMask,
+                                        const LiveRange &LR,
+                                        std::vector<WorkItem> &Worklist) {
+  const VNInfo *Value = LR.Query(V.Slot).valueIn();
+  if (!Value)
+    return;
 
-  // Control flow-type instructions that are followed by WQM computations
-  // must themselves be in WQM.
-  if ((II.OutNeeds & StateWQM) && !(II.Needs & StateWQM) && MI.isTerminator()) {
-    Instructions[&MI].Needs = StateWQM;
-    II.Needs = StateWQM;
+  if (Value->isPHIDef()) {
+    MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
+
+    for (MachineBasicBlock *Pred : MBB->predecessors()) {
+      SlotIndex PredIndex = LIS->getMBBEndIdx(Pred).getPrevIndex();
+
+      markValueWQM(PredIndex, V.Reg, LaneMask, Worklist);
+    }
+  } else {
+    MachineInstr *DefMI = LIS->getInstructionFromIndex(Value->def);
+
+    // Obviously skip if DefMI is already flagged as NeedWQM.
+    //
+    // The instruction might also be flagged as NeedExact. This happens when
+    // the result of an atomic is used in a WQM computation. In this case,
+    // the atomic must not run for helper pixels and the WQM result is
+    // undefined.
+    if (!Instructions[DefMI].Needs)
+      markInstruction(*DefMI, StateWQM, Worklist);
   }
+}
 
-  // Propagate to block level
-  BI.Needs |= II.Needs;
-  if ((BI.InNeeds | II.Needs) != BI.InNeeds) {
-    BI.InNeeds |= II.Needs;
-    Worklist.push_back(MBB);
+void SIWholeQuadMode::propagateValue(const WorkItem::Value &V,
+                                     std::vector<WorkItem> &Worklist) {
+  const LiveInterval &LI = LIS->getInterval(V.Reg);
+  LaneBitmask LaneMask = V.LaneMask;
+
+  if (LI.hasSubRanges()) {
+    for (const LiveInterval::SubRange &S : LI.subranges()) {
+      LaneBitmask Common = LaneMask & S.LaneMask;
+
+      if (Common) {
+        LaneMask &= ~Common;
+        propagateValueSub(V, Common, S, Worklist);
+      }
+    }
+  } else {
+    propagateValueSub(V, LaneMask, LI, Worklist);
   }
+}
+
+void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
+                                           std::vector<WorkItem> &Worklist) {
+  const InstrInfo II = Instructions[&MI];
 
   // Propagate backwards within block
   if (MachineInstr *PrevMI = MI.getPrevNode()) {
-    char InNeeds = II.Needs | II.OutNeeds;
-    if (!PrevMI->isPHI()) {
-      InstrInfo &PrevII = Instructions[PrevMI];
-      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
-        PrevII.OutNeeds |= InNeeds;
-        Worklist.push_back(PrevMI);
-      }
+    InstrInfo &PrevII = Instructions[PrevMI];
+    if (II.OutNeeds & ~PrevII.OutNeeds) {
+      PrevII.OutNeeds |= II.OutNeeds;
+      Worklist.push_back(PrevMI);
     }
   }
 
-  // Propagate WQM flag to instruction inputs
-  assert(II.Needs != (StateWQM | StateExact));
-  if (II.Needs == StateWQM)
-    markUsesWQM(MI, Worklist);
+  if (MI.getOpcode() == AMDGPU::SI_KILL && II.OutNeeds & StateWQM)
+    markInstruction(MI, StateWQM, Worklist);
 }
 
 void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
-                                     std::vector<WorkItem>& Worklist) {
+                                     std::vector<WorkItem> &Worklist) {
   BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
 
   // Propagate through instructions
   if (!MBB.empty()) {
     MachineInstr *LastMI = &*MBB.rbegin();
     InstrInfo &LastII = Instructions[LastMI];
-    if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
-      LastII.OutNeeds |= BI.OutNeeds;
+    if (BI.Needs.Out & ~LastII.OutNeeds) {
+      LastII.OutNeeds |= BI.Needs.Out;
       Worklist.push_back(LastMI);
     }
   }
 
   // Predecessor blocks must provide for our WQM/Exact needs.
-  for (MachineBasicBlock *Pred : MBB.predecessors()) {
-    BlockInfo &PredBI = Blocks[Pred];
-    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
-      continue;
-
-    PredBI.OutNeeds |= BI.InNeeds;
-    PredBI.InNeeds |= BI.InNeeds;
-    Worklist.push_back(Pred);
+  if (BI.Needs.In & ~BI.Propagated.In) {
+    for (MachineBasicBlock *Pred : MBB.predecessors()) {
+      BlockInfo &PredBI = Blocks[Pred];
+      if (BI.Needs.In & ~PredBI.Needs.Out) {
+        PredBI.Needs.Out |= BI.Needs.In;
+        PredBI.Needs.In |= BI.Needs.In;
+        Worklist.push_back(Pred);
+      }
+    }
   }
 
-  // All successors must be prepared to accept the same set of WQM/Exact data.
-  for (MachineBasicBlock *Succ : MBB.successors()) {
-    BlockInfo &SuccBI = Blocks[Succ];
-    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
-      continue;
+  if (BI.Needs.Out & ~BI.Propagated.Out) {
+    // All successors must be prepared to accept the same set of WQM/Exact
+    // data.
+    for (MachineBasicBlock *Succ : MBB.successors()) {
+      BlockInfo &SuccBI = Blocks[Succ];
+      if (BI.Needs.Out & ~SuccBI.Needs.In) {
+        SuccBI.Needs.In |= BI.Needs.Out;
+        Worklist.push_back(Succ);
+      }
+    }
 
-    SuccBI.InNeeds |= BI.OutNeeds;
-    Worklist.push_back(Succ);
+    // Mark terminators as WQM if required
+    if (BI.Needs.Out & ~BI.Propagated.Out & StateWQM) {
+      for (MachineInstr &Terminator : MBB.terminators())
+        markInstruction(Terminator, StateWQM, Worklist);
+    }
   }
+
+  Blocks[&MBB].Propagated = BI.Needs;
 }
 
 char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
@@ -366,8 +478,10 @@
 
     if (WI.MI)
       propagateInstruction(*WI.MI, Worklist);
-    else
+    else if (WI.MBB)
       propagateBlock(*WI.MBB, Worklist);
+    else
+      propagateValue(WI.V, Worklist);
   }
 
   return GlobalFlags;
@@ -376,29 +490,37 @@
 void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator Before,
                               unsigned SaveWQM, unsigned LiveMaskReg) {
+  MachineInstr *MI;
+
   if (SaveWQM) {
-    BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
-            SaveWQM)
-        .addReg(LiveMaskReg);
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
+                 SaveWQM)
+             .addReg(LiveMaskReg);
   } else {
-    BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
-            AMDGPU::EXEC)
-        .addReg(AMDGPU::EXEC)
-        .addReg(LiveMaskReg);
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
+                 AMDGPU::EXEC)
+             .addReg(AMDGPU::EXEC)
+             .addReg(LiveMaskReg);
   }
+
+  LIS->InsertMachineInstrInMaps(*MI);
 }
 
 void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator Before,
                             unsigned SavedWQM) {
+  MachineInstr *MI;
+
   if (SavedWQM) {
-    BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
-        .addReg(SavedWQM);
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
+             .addReg(SavedWQM);
   } else {
-    BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
-            AMDGPU::EXEC)
-        .addReg(AMDGPU::EXEC);
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
+                 AMDGPU::EXEC)
+             .addReg(AMDGPU::EXEC);
   }
+
+  LIS->InsertMachineInstrInMaps(*MI);
 }
 
 void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
@@ -409,12 +531,12 @@
 
   const BlockInfo &BI = BII->second;
 
-  if (!(BI.InNeeds & StateWQM))
+  if (!(BI.Needs.In & StateWQM))
     return;
 
   // This is a non-entry block that is WQM throughout, so no need to do
   // anything.
-  if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)
+  if (!isEntry && !(BI.Needs.Self & StateExact) && BI.Needs.Out != StateExact)
     return;
 
   DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n");
 
@@ -477,17 +599,24 @@
       } else {
         assert(WQMFromExec == (SavedWQMReg == 0));
         toWQM(MBB, &MI, SavedWQMReg);
-        SavedWQMReg = 0;
+
+        if (SavedWQMReg) {
+          LIS->createAndComputeVirtRegInterval(SavedWQMReg);
+          SavedWQMReg = 0;
+        }
       }
 
       State = Needs;
     }
   }
 
-  if ((BI.OutNeeds & StateWQM) && State != StateWQM) {
+  if ((BI.Needs.Out & StateWQM) && State != StateWQM) {
     assert(WQMFromExec == (SavedWQMReg == 0));
     toWQM(MBB, MBB.end(), SavedWQMReg);
-  } else if (BI.OutNeeds == StateExact && State != StateExact) {
+
+    if (SavedWQMReg)
+      LIS->createAndComputeVirtRegInterval(SavedWQMReg);
+  } else if (BI.Needs.Out == StateExact && State != StateExact) {
     toExact(MBB, FirstNonWQM ? MachineBasicBlock::iterator(FirstNonWQM)
                              : MBB.getFirstTerminator(),
             0, LiveMaskReg);
@@ -498,8 +627,11 @@
   for (MachineInstr *MI : LiveMaskQueries) {
     const DebugLoc &DL = MI->getDebugLoc();
     unsigned Dest = MI->getOperand(0).getReg();
-    BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
-        .addReg(LiveMaskReg);
+    MachineInstr *NewMI =
+        BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
+            .addReg(LiveMaskReg);
+    LIS->ReplaceMachineInstrInMaps(*MI, *NewMI);
+    MI->eraseFromParent();
   }
 }
 
@@ -509,6 +641,7 @@
     return false;
 
   Instructions.clear();
+  WQMValues.clear();
   Blocks.clear();
   LiveMaskQueries.clear();
 
@@ -517,6 +650,7 @@
   TII = ST.getInstrInfo();
   TRI = &TII->getRegisterInfo();
   MRI = &MF.getRegInfo();
+  LIS = &getAnalysis<LiveIntervals>();
 
   char GlobalFlags = analyzeFunction(MF);
   if (!(GlobalFlags & StateWQM)) {
@@ -524,6 +658,8 @@
     return !LiveMaskQueries.empty();
   }
 
+  DEBUG(printInfo(MF));
+
   // Store a copy of the original live mask when required
   unsigned LiveMaskReg = 0;
   {
@@ -532,29 +668,32 @@
 
     if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
       LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
-      BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
-          .addReg(AMDGPU::EXEC);
+      MachineInstr *MI =
+          BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
+              .addReg(AMDGPU::EXEC);
+      LIS->InsertMachineInstrInMaps(*MI);
     }
 
     if (GlobalFlags == StateWQM) {
       // For a shader that needs only WQM, we can just set it once.
-      BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
-              AMDGPU::EXEC)
-          .addReg(AMDGPU::EXEC);
-
-      lowerLiveMaskQueries(LiveMaskReg);
-      // EntryMI may become invalid here
-      return true;
+      MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
+                                 TII->get(AMDGPU::S_WQM_B64), AMDGPU::EXEC)
+                             .addReg(AMDGPU::EXEC);
+      LIS->InsertMachineInstrInMaps(*MI);
     }
   }
 
-  DEBUG(printInfo());
-  lowerLiveMaskQueries(LiveMaskReg);
+  lowerLiveMaskQueries(LiveMaskReg);
 
-  // Handle the general case
-  for (auto BII : Blocks)
-    processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());
+  if (GlobalFlags != StateWQM) {
+    // Handle the general case
+    for (auto &BII : Blocks)
+      processBlock(const_cast<MachineBasicBlock &>(*BII.first), LiveMaskReg,
+                   BII.first == &*MF.begin());
+  }
+
+  if (LiveMaskReg)
+    LIS->createAndComputeVirtRegInterval(LiveMaskReg);
 
   return true;
 }
Index: test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll
@@ -1,7 +1,8 @@
 ; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
 
 ; CHECK-LABEL: {{^}}test1:
-; CHECK: v_cndmask_b32_e64 v0, 0, 1, exec
+; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK: v_cndmask_b32_e64 v0, 0, 1, [[LIVE]]
 ;
 ; Note: We could generate better code here if we recognized earlier that
 ; there is no WQM use and therefore llvm.amdgcn.ps.live is constant. However,
Index: test/CodeGen/AMDGPU/wqm.ll
===================================================================
--- test/CodeGen/AMDGPU/wqm.ll
+++ test/CodeGen/AMDGPU/wqm.ll
@@ -358,13 +358,77 @@
   ret float %s
 }
 
+; CHECK-LABEL: {{^}}test_subregs:
+; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK: s_wqm_b64 exec, exec
+; CHECK: v_interp_p1_f32
+; CHECK: v_interp_p2_f32
+; CHECK: s_and_b64 exec, exec, [[ORIG]]
+; CHECK: _store
+; CHECK: s_wqm_b64 exec, exec
+; CHECK: image_sample
+;
+; Early coalescing merges %c into a 64 bit VGPR pair, so correctness requires
+; tracking of subregisters.
+;
+define amdgpu_ps <4 x float> @test_subregs(float addrspace(1)* inreg %ptr, i32 inreg %prims, <2 x i32> %ij, i32 %idx) #1 {
+main_body:
+  %c = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %prims, <2 x i32> %ij)
+
+  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %idx
+  store float 1.0, float addrspace(1)* %gep
+
+  %c.i = bitcast float %c to i32
+  %c2.0 = insertelement <2 x i32> undef, i32 %c.i, i32 0
+  %c2.i = insertelement <2 x i32> %c2.0, i32 1, i32 1
+  %tex = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %c2.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  ret <4 x float> %tex
+}
+
+; Test tracking of vector condition codes.
+;
+; CHECK-LABEL: {{^}}test_vcc_tracking:
+; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK: s_wqm_b64 exec, exec
+; CHECK: v_cmp_gt_i32_e32 vcc,
+; CHECK: ; %else
+; CHECK: image_sample
+; CHECK: ; %if
+; CHECK: image_sample
+; CHECK: ; %end
+; CHECK: s_and_b64 exec, exec, [[ORIG]]
+; CHECK: _store
+define amdgpu_ps <4 x float> @test_vcc_tracking(float addrspace(1)* inreg %ptr, i32 %sel, i32 %idx) #1 {
+main_body:
+  %cc = icmp sgt i32 %sel, 0
+  br i1 %cc, label %if, label %else
+
+if:
+  %r.if = call <4 x float> @llvm.SI.image.sample.i32(i32 0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  br label %end
+
+else:
+  %r.else = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> , <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  br label %end
+
+end:
+  %r = phi <4 x float> [ %r.if, %if ], [ %r.else, %else ]
+
+  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %idx
+  store float 1.0, float addrspace(1)* %gep
+
+  ret <4 x float> %r
+}
+
 declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
 declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2
 
 declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
+declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
 declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
 
+declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #3
+
 declare void @llvm.AMDGPU.kill(float)
 declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)