Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -555,8 +555,9 @@
     insertPass(&MachineSchedulerID, &RegisterCoalescerID);
   }
 
+  insertPass(&MachineSchedulerID, &SIWholeQuadModeID);
+
   addPass(createSIShrinkInstructionsPass());
-  addPass(createSIWholeQuadModePass());
 }
 
 void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
Index: lib/Target/AMDGPU/SIWholeQuadMode.cpp
===================================================================
--- lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -12,10 +12,9 @@
 /// shaders.
 ///
 /// Whole quad mode is required for derivative computations, but it interferes
-/// with shader side effects (stores and atomics). This pass is run on the
-/// scheduled machine IR but before register coalescing, so that machine SSA is
-/// available for analysis. It ensures that WQM is enabled when necessary, but
-/// disabled around stores and atomics.
+/// with shader side effects (stores and atomics). This pass is run after
+/// machine instruction scheduling but before register allocation. It ensures
+/// that WQM is enabled when necessary, but disabled around stores and atomics.
 ///
 /// When necessary, this pass creates a function prolog
 ///
@@ -57,6 +56,9 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Format.h"
+
+#include <map>
 
 using namespace llvm;
 
@@ -71,9 +73,11 @@
 
 struct PrintState {
 public:
-  explicit PrintState(int State) : State(State) {}
+  explicit PrintState(int State, bool Align = false)
+      : State(State), Align(Align) {}
 
   int State;
+  bool Align;
 };
 
 static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
@@ -85,6 +89,13 @@
     OS << "Exact";
   }
 
+  if (PS.Align) {
+    if (!PS.State)
+      OS << "     ";
+    else if (!(PS.State & StateExact))
+      OS << "  ";
+  }
+
   return OS;
 }
 
@@ -94,18 +105,35 @@
 };
 
 struct BlockInfo {
-  char Needs = 0;
-  char InNeeds = 0;
-  char OutNeeds = 0;
+  struct NeedsFlags {
+    char Self = 0;
+    char In = 0;
+    char Out = 0;
+  };
+
+  NeedsFlags Needs;
+  NeedsFlags Propagated;
 };
 
 struct WorkItem {
+  struct Value {
+    SlotIndex Slot;
+    unsigned Reg = 0;
+    LaneBitmask LaneMask = 0;
+  };
+
   MachineBasicBlock *MBB = nullptr;
   MachineInstr *MI = nullptr;
+  Value V;
 
   WorkItem() {}
   WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
   WorkItem(MachineInstr *MI) : MI(MI) {}
+  WorkItem(SlotIndex Slot, unsigned Reg, unsigned LaneMask) {
+    V.Slot = Slot;
+    V.Reg = Reg;
+    V.LaneMask = LaneMask;
+  }
 };
 
 class SIWholeQuadMode : public MachineFunctionPass {
@@ -117,14 +145,21 @@
   DenseMap<const MachineInstr *, InstrInfo> Instructions;
   DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
+  std::map<std::pair<SlotIndex, unsigned>, LaneBitmask> WQMValues;
   SmallVector<MachineInstr *, 1> LiveMaskQueries;
 
-  void printInfo();
+  void printInfo(MachineFunction &MF) const;
 
+  void markValueWQM(SlotIndex Slot, unsigned Reg, LaneBitmask LaneMask,
+                    std::vector<WorkItem> &Worklist);
   void markInstruction(MachineInstr &MI, char Flag,
                        std::vector<WorkItem> &Worklist);
   void markUsesWQM(const MachineInstr &MI, std::vector<WorkItem> &Worklist);
   char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
+  void propagateValueSub(const WorkItem::Value &V, LaneBitmask LaneMask,
+                         const LiveRange &LR, std::vector<WorkItem> &Worklist);
+  void propagateValue(const WorkItem::Value &V,
+                      std::vector<WorkItem> &Worklist);
   void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
   void propagateBlock(MachineBasicBlock &MBB,
                       std::vector<WorkItem> &Worklist);
   char analyzeFunction(MachineFunction &MF);
@@ -151,6 +186,8 @@
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<LiveIntervals>();
+    AU.addPreserved<SlotIndexes>();
+    AU.addPreserved<LiveIntervals>();
     AU.setPreservesCFG();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
@@ -172,24 +209,40 @@
   return new SIWholeQuadMode;
 }
 
-void SIWholeQuadMode::printInfo() {
-  for (const auto &BII : Blocks) {
-    dbgs() << "\nBB#" << BII.first->getNumber() << ":\n"
-           << "  InNeeds = " << PrintState(BII.second.InNeeds)
-           << ", Needs = " << PrintState(BII.second.Needs)
-           << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
+void SIWholeQuadMode::printInfo(MachineFunction &MF) const {
+  for (MachineBasicBlock &MBB : MF) {
+    BlockInfo BI;
+    auto BII = Blocks.find(&MBB);
+    if (BII != Blocks.end())
+      BI = BII->second;
+
+    dbgs() << "\nBB#" << MBB.getNumber() << ":\n"
+           << "  In = " << PrintState(BI.Needs.In)
+           << ", Self = " << PrintState(BI.Needs.Self)
+           << ", Out = " << PrintState(BI.Needs.Out) << '\n';
 
-    for (const MachineInstr &MI : *BII.first) {
+    for (MachineInstr &MI : MBB) {
+      InstrInfo II;
       auto III = Instructions.find(&MI);
-      if (III == Instructions.end())
-        continue;
+      if (III != Instructions.end())
+        II = III->second;
 
-      dbgs() << "  " << MI << " Needs = " << PrintState(III->second.Needs)
-             << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
+      dbgs() << ' ' << PrintState(II.Needs, true) << ' ' << MI;
     }
   }
 }
 
+void SIWholeQuadMode::markValueWQM(SlotIndex Slot, unsigned Reg,
+                                   LaneBitmask LaneMask,
+                                   std::vector<WorkItem> &Worklist) {
+  LaneBitmask &Mask = WQMValues[std::make_pair(Slot, Reg)];
+  LaneMask &= ~Mask;
+  if (LaneMask) {
+    Mask |= LaneMask;
+    Worklist.emplace_back(Slot, Reg, LaneMask);
+  }
+}
+
 void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
                                       std::vector<WorkItem> &Worklist) {
   InstrInfo &II = Instructions[&MI];
@@ -204,12 +257,33 @@
     return;
 
   II.Needs = Flag;
-  Worklist.push_back(&MI);
+
+  MachineBasicBlock *MBB = MI.getParent();
+  BlockInfo &BI = Blocks[MBB];
+
+  if (!(BI.Needs.Self & Flag)) {
+    BI.Needs.Self |= Flag;
+    BI.Needs.In |= Flag;
+    Worklist.push_back(MBB);
+  }
+
+  if (MachineInstr *PrevMI = MI.getPrevNode()) {
+    InstrInfo &PrevII = Instructions[PrevMI];
+    if (Flag & ~PrevII.OutNeeds) {
+      PrevII.OutNeeds |= Flag;
+      Worklist.push_back(PrevMI);
+    }
+  }
+
+  if (Flag == StateWQM)
+    markUsesWQM(MI, Worklist);
 }
 
 /// Mark all instructions defining the uses in \p MI as WQM.
 void SIWholeQuadMode::markUsesWQM(const MachineInstr &MI,
                                   std::vector<WorkItem> &Worklist) {
+  SlotIndex Idx = LIS->getInstructionIndex(MI);
+
   for (const MachineOperand &Use : MI.uses()) {
     if (!Use.isReg() || !Use.isUse())
       continue;
@@ -229,8 +303,8 @@
     if (!Value)
       continue;
 
-    // Since we're in machine SSA, we do not need to track physical
-    // registers across basic blocks.
+    // We do not need to track physical registers across basic blocks
+    // before register allocation.
    if (Value->isPHIDef())
       continue;
 
@@ -241,8 +315,11 @@
       continue;
     }
 
-    for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
-      markInstruction(DefMI, StateWQM, Worklist);
+    unsigned SubReg = Use.getSubReg();
+    LaneBitmask LaneMask = SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
+                                  : MRI->getMaxLaneMaskForVReg(Reg);
+
+    markValueWQM(Idx, Reg, LaneMask, Worklist);
   }
 }
 
@@ -259,25 +336,24 @@
     for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
       MachineInstr &MI = *II;
       unsigned Opcode = MI.getOpcode();
-      char Flags = 0;
 
       if (TII->isDS(Opcode)) {
-        Flags = StateWQM;
+        markInstruction(MI, StateWQM, Worklist);
+        GlobalFlags |= StateWQM;
       } else if (TII->isWQM(Opcode)) {
         // Sampling instructions don't need to produce results for all pixels
         // in a quad, they just require all inputs of a quad to have been
         // computed for derivatives.
         markUsesWQM(MI, Worklist);
         GlobalFlags |= StateWQM;
-        continue;
       } else if (TII->isDisableWQM(MI)) {
-        Flags = StateExact;
+        markInstruction(MI, StateExact, Worklist);
+        GlobalFlags |= StateExact;
       } else {
         if (Opcode == AMDGPU::SI_PS_LIVE) {
           LiveMaskQueries.push_back(&MI);
         } else if (WQMOutputs) {
-          // The function is in machine SSA form, which means that physical
-          // VGPRs correspond to shader inputs and outputs. Inputs are
+          // Physical VGPRs correspond to shader inputs and outputs. Inputs are
           // only used, outputs are only defined.
           for (const MachineOperand &MO : MI.defs()) {
             if (!MO.isReg())
               continue;
@@ -287,98 +363,128 @@
 
             if (!TRI->isVirtualRegister(Reg) &&
                 TRI->hasVGPRs(TRI->getPhysRegClass(Reg))) {
-              Flags = StateWQM;
+              markInstruction(MI, StateWQM, Worklist);
+              GlobalFlags |= StateWQM;
               break;
             }
          }
        }
-
-        if (!Flags)
-          continue;
      }
-
-      markInstruction(MI, Flags, Worklist);
-      GlobalFlags |= Flags;
    }
  }
 
  return GlobalFlags;
 }
 
-void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
-                                           std::vector<WorkItem>& Worklist) {
-  MachineBasicBlock *MBB = MI.getParent();
-  InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
-  BlockInfo &BI = Blocks[MBB];
+/// Helper function of \ref propagateValue that handles individual subranges.
+void SIWholeQuadMode::propagateValueSub(const WorkItem::Value &V,
+                                        LaneBitmask LaneMask,
+                                        const LiveRange &LR,
+                                        std::vector<WorkItem> &Worklist) {
+  const VNInfo *Value = LR.Query(V.Slot).valueIn();
+  if (!Value)
+    return;
 
-  // Control flow-type instructions and stores to temporary memory that are
-  // followed by WQM computations must themselves be in WQM.
-  if ((II.OutNeeds & StateWQM) && !II.Needs &&
-      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
-    Instructions[&MI].Needs = StateWQM;
-    II.Needs = StateWQM;
-  }
+  if (Value->isPHIDef()) {
+    MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
 
-  // Propagate to block level
-  BI.Needs |= II.Needs;
-  if ((BI.InNeeds | II.Needs) != BI.InNeeds) {
-    BI.InNeeds |= II.Needs;
-    Worklist.push_back(MBB);
+    for (MachineBasicBlock *Pred : MBB->predecessors()) {
+      SlotIndex PredIndex = LIS->getMBBEndIdx(Pred).getPrevIndex();
+
+      markValueWQM(PredIndex, V.Reg, LaneMask, Worklist);
+    }
+  } else {
+    MachineInstr *DefMI = LIS->getInstructionFromIndex(Value->def);
+
+    markInstruction(*DefMI, StateWQM, Worklist);
   }
+}
 
-  // Propagate backwards within block
-  if (MachineInstr *PrevMI = MI.getPrevNode()) {
-    char InNeeds = II.Needs | II.OutNeeds;
-    if (!PrevMI->isPHI()) {
-      InstrInfo &PrevII = Instructions[PrevMI];
-      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
-        PrevII.OutNeeds |= InNeeds;
-        Worklist.push_back(PrevMI);
+void SIWholeQuadMode::propagateValue(const WorkItem::Value &V,
+                                     std::vector<WorkItem> &Worklist) {
+  const LiveInterval &LI = LIS->getInterval(V.Reg);
+  LaneBitmask LaneMask = V.LaneMask;
+
+  if (LI.hasSubRanges()) {
+    for (const LiveInterval::SubRange &S : LI.subranges()) {
+      LaneBitmask Common = LaneMask & S.LaneMask;
+
+      if (Common) {
+        LaneMask &= ~Common;
+        propagateValueSub(V, Common, S, Worklist);
       }
     }
+  } else {
+    propagateValueSub(V, LaneMask, LI, Worklist);
   }
+}
 
-  // Propagate WQM flag to instruction inputs
-  assert(II.Needs != (StateWQM | StateExact));
+// Backwards propagation of OutNeeds and related effects.
+void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
+                                           std::vector<WorkItem> &Worklist) {
+  const InstrInfo II = Instructions[&MI];
 
-  if (II.Needs == StateWQM)
-    markUsesWQM(MI, Worklist);
+  // Stores to temporary memory (i.e., not marked as Exact) may be relevant
+  // for WQM computations.
+  if (!II.Needs && (II.OutNeeds & StateWQM) &&
+      (TII->usesVM_CNT(MI) && MI.mayStore()))
+    markInstruction(MI, StateWQM, Worklist);
+
+  // Propagate backwards within block
+  if (MachineInstr *PrevMI = MI.getPrevNode()) {
+    InstrInfo &PrevII = Instructions[PrevMI];
+    if (II.OutNeeds & ~PrevII.OutNeeds) {
+      PrevII.OutNeeds |= II.OutNeeds;
+      Worklist.push_back(PrevMI);
    }
  }
 }
 
 void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
-                                     std::vector<WorkItem>& Worklist) {
+                                     std::vector<WorkItem> &Worklist) {
   BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
 
   // Propagate through instructions
   if (!MBB.empty()) {
     MachineInstr *LastMI = &*MBB.rbegin();
     InstrInfo &LastII = Instructions[LastMI];
-    if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
-      LastII.OutNeeds |= BI.OutNeeds;
+    if (BI.Needs.Out & ~LastII.OutNeeds) {
+      LastII.OutNeeds |= BI.Needs.Out;
       Worklist.push_back(LastMI);
     }
   }
 
   // Predecessor blocks must provide for our WQM/Exact needs.
-  for (MachineBasicBlock *Pred : MBB.predecessors()) {
-    BlockInfo &PredBI = Blocks[Pred];
-    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
-      continue;
-
-    PredBI.OutNeeds |= BI.InNeeds;
-    PredBI.InNeeds |= BI.InNeeds;
-    Worklist.push_back(Pred);
+  if (BI.Needs.In & ~BI.Propagated.In) {
+    for (MachineBasicBlock *Pred : MBB.predecessors()) {
+      BlockInfo &PredBI = Blocks[Pred];
+      if (BI.Needs.In & ~PredBI.Needs.Out) {
+        PredBI.Needs.Out |= BI.Needs.In;
+        PredBI.Needs.In |= BI.Needs.In;
+        Worklist.push_back(Pred);
+      }
+    }
   }
 
-  // All successors must be prepared to accept the same set of WQM/Exact data.
-  for (MachineBasicBlock *Succ : MBB.successors()) {
-    BlockInfo &SuccBI = Blocks[Succ];
-    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
-      continue;
+  if (BI.Needs.Out & ~BI.Propagated.Out) {
+    // All successors must be prepared to accept the same set of WQM/Exact
+    // data.
+    for (MachineBasicBlock *Succ : MBB.successors()) {
+      BlockInfo &SuccBI = Blocks[Succ];
+      if (BI.Needs.Out & ~SuccBI.Needs.In) {
+        SuccBI.Needs.In |= BI.Needs.Out;
+        Worklist.push_back(Succ);
+      }
+    }
 
-    SuccBI.InNeeds |= BI.OutNeeds;
-    Worklist.push_back(Succ);
+    // Mark terminators as WQM if required
+    if (BI.Needs.Out & ~BI.Propagated.Out & StateWQM) {
+      for (MachineInstr &Terminator : MBB.terminators())
+        markInstruction(Terminator, StateWQM, Worklist);
+    }
   }
+
+  Blocks[&MBB].Propagated = BI.Needs;
 }
 
 char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
@@ -391,8 +497,10 @@
 
     if (WI.MI)
       propagateInstruction(*WI.MI, Worklist);
-    else
+    else if (WI.MBB)
       propagateBlock(*WI.MBB, Worklist);
+    else
+      propagateValue(WI.V, Worklist);
   }
 
   return GlobalFlags;
@@ -401,29 +509,37 @@
 void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator Before,
                               unsigned SaveWQM, unsigned LiveMaskReg) {
+  MachineInstr *MI;
+
   if (SaveWQM) {
-    BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
-            SaveWQM)
-        .addReg(LiveMaskReg);
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
+                 SaveWQM)
+             .addReg(LiveMaskReg);
   } else {
-    BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
-            AMDGPU::EXEC)
-        .addReg(AMDGPU::EXEC)
-        .addReg(LiveMaskReg);
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
+                 AMDGPU::EXEC)
+             .addReg(AMDGPU::EXEC)
+             .addReg(LiveMaskReg);
   }
+
+  LIS->InsertMachineInstrInMaps(*MI);
 }
 
 void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator Before,
                             unsigned SavedWQM) {
+  MachineInstr *MI;
+
   if (SavedWQM) {
-    BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
-        .addReg(SavedWQM);
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
+             .addReg(SavedWQM);
   } else {
-    BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
-            AMDGPU::EXEC)
-        .addReg(AMDGPU::EXEC);
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
+                 AMDGPU::EXEC)
+             .addReg(AMDGPU::EXEC);
   }
+
+  LIS->InsertMachineInstrInMaps(*MI);
 }
 
 void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
@@ -434,12 +550,12 @@
 
   const BlockInfo &BI = BII->second;
 
-  if (!(BI.InNeeds & StateWQM))
+  if (!(BI.Needs.In & StateWQM))
     return;
 
   // This is a non-entry block that is WQM throughout, so no need to do
   // anything.
-  if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)
+  if (!isEntry && !(BI.Needs.Self & StateExact) && BI.Needs.Out != StateExact)
     return;
 
   DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n");
@@ -502,20 +618,27 @@
       } else {
         assert(WQMFromExec == (SavedWQMReg == 0));
         toWQM(MBB, &MI, SavedWQMReg);
-        SavedWQMReg = 0;
+
+        if (SavedWQMReg) {
+          LIS->createAndComputeVirtRegInterval(SavedWQMReg);
+          SavedWQMReg = 0;
+        }
       }
 
       State = Needs;
     }
 
-    if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
+    if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.Needs.Out == StateExact)
       MI.getOperand(3).setImm(1);
   }
 
-  if ((BI.OutNeeds & StateWQM) && State != StateWQM) {
+  if ((BI.Needs.Out & StateWQM) && State != StateWQM) {
     assert(WQMFromExec == (SavedWQMReg == 0));
     toWQM(MBB, MBB.end(), SavedWQMReg);
-  } else if (BI.OutNeeds == StateExact && State != StateExact) {
+
+    if (SavedWQMReg)
+      LIS->createAndComputeVirtRegInterval(SavedWQMReg);
+  } else if (BI.Needs.Out == StateExact && State != StateExact) {
     toExact(MBB, FirstNonWQM ? MachineBasicBlock::iterator(FirstNonWQM)
                              : MBB.getFirstTerminator(),
             0, LiveMaskReg);
@@ -526,8 +649,11 @@
   for (MachineInstr *MI : LiveMaskQueries) {
     const DebugLoc &DL = MI->getDebugLoc();
     unsigned Dest = MI->getOperand(0).getReg();
-    BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
-        .addReg(LiveMaskReg);
+    MachineInstr *NewMI =
+        BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
+            .addReg(LiveMaskReg);
+    LIS->ReplaceMachineInstrInMaps(*MI, *NewMI);
+    MI->eraseFromParent();
   }
 }
 
@@ -537,6 +663,7 @@
     return false;
 
   Instructions.clear();
+  WQMValues.clear();
   Blocks.clear();
   LiveMaskQueries.clear();
 
@@ -553,6 +680,8 @@
     return !LiveMaskQueries.empty();
   }
 
+  DEBUG(printInfo(MF));
+
   // Store a copy of the original live mask when required
   unsigned LiveMaskReg = 0;
   {
@@ -561,29 +690,31 @@
 
     if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
       LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
-      BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
-          .addReg(AMDGPU::EXEC);
+      MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
+                                 TII->get(AMDGPU::COPY), LiveMaskReg)
+                             .addReg(AMDGPU::EXEC);
+      LIS->InsertMachineInstrInMaps(*MI);
     }
 
     if (GlobalFlags == StateWQM) {
       // For a shader that needs only WQM, we can just set it once.
-      BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
-              AMDGPU::EXEC)
-          .addReg(AMDGPU::EXEC);
-
-      lowerLiveMaskQueries(LiveMaskReg);
-      // EntryMI may become invalid here
-      return true;
+      MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
+                                 TII->get(AMDGPU::S_WQM_B64), AMDGPU::EXEC)
+                             .addReg(AMDGPU::EXEC);
+      LIS->InsertMachineInstrInMaps(*MI);
     }
   }
 
-  DEBUG(printInfo());
-
   lowerLiveMaskQueries(LiveMaskReg);
 
-  // Handle the general case
-  for (auto BII : Blocks)
-    processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());
+  if (GlobalFlags != StateWQM) {
+    // Handle the general case
+    for (auto &BII : Blocks)
+      processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());
+  }
+
+  if (LiveMaskReg)
+    LIS->createAndComputeVirtRegInterval(LiveMaskReg);
 
   return true;
 }
Index: test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll
@@ -1,7 +1,8 @@
 ; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
 
 ; CHECK-LABEL: {{^}}test1:
-; CHECK: v_cndmask_b32_e64 v0, 0, 1, exec
+; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK: v_cndmask_b32_e64 v0, 0, 1, [[LIVE]]
 ;
 ; Note: We could generate better code here if we recognized earlier that
 ; there is no WQM use and therefore llvm.amdgcn.ps.live is constant. However,
@@ -16,8 +17,9 @@
 
 ; CHECK-LABEL: {{^}}test2:
 ; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK-DAG: s_mov_b64 [[COPY:s\[[0-9]+:[0-9]+\]]], [[LIVE]]
 ; CHECK-DAG: s_wqm_b64 exec, exec
-; CHECK-DAG: v_cndmask_b32_e64 [[VAR:v[0-9]+]], 0, 1, [[LIVE]]
+; CHECK-DAG: v_cndmask_b32_e64 [[VAR:v[0-9]+]], 0, 1, [[COPY]]
 ; CHECK: image_sample v0, [[VAR]],
 define amdgpu_ps float @test2() {
   %live = call i1 @llvm.amdgcn.ps.live()
@@ -31,8 +33,9 @@
 
 ; CHECK-LABEL: {{^}}test3:
 ; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK-DAG: s_mov_b64 [[COPY:s\[[0-9]+:[0-9]+\]]], [[LIVE]]
 ; CHECK-DAG: s_wqm_b64 exec, exec
-; CHECK-DAG: s_xor_b64 [[HELPER:s\[[0-9]+:[0-9]+\]]], [[LIVE]], -1
+; CHECK-DAG: s_xor_b64 [[HELPER:s\[[0-9]+:[0-9]+\]]], [[COPY]], -1
 ; CHECK_DAG: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[HELPER]]
 ; CHECK: ; %dead
 define amdgpu_ps float @test3(i32 %in) {
Index: test/CodeGen/AMDGPU/wqm.ll
===================================================================
--- test/CodeGen/AMDGPU/wqm.ll
+++ test/CodeGen/AMDGPU/wqm.ll
@@ -464,6 +464,67 @@
   ret <4 x float> %dtex
 }
 
+; CHECK-LABEL: {{^}}test_subregs:
+; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK: s_wqm_b64 exec, exec
+; CHECK: v_interp_p1_f32
+; CHECK: v_interp_p2_f32
+; CHECK: s_and_b64 exec, exec, [[ORIG]]
+; CHECK: _store
+; CHECK: s_wqm_b64 exec, exec
+; CHECK: image_sample
+;
+; Early coalescing merges %c into a 64-bit VGPR pair, so correctness requires
+; tracking of subregisters.
+;
+define amdgpu_ps <4 x float> @test_subregs(i32 inreg %prims, <2 x i32> %ij, i32 %idx) #1 {
+main_body:
+  %c = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %prims, <2 x i32> %ij)
+
+  call void @llvm.amdgcn.buffer.store.f32(float 1.0, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
+
+  %c.i = bitcast float %c to i32
+  %c2.0 = insertelement <2 x i32> undef, i32 %c.i, i32 0
+  %c2.i = insertelement <2 x i32> %c2.0, i32 1, i32 1
+  %tex = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %c2.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  ret <4 x float> %tex
+}
+
+; Test tracking of vector condition codes.
+;
+; CHECK-LABEL: {{^}}test_vcc_tracking:
+; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK: s_wqm_b64 exec, exec
+; CHECK: v_cmp_gt_i32_e32 vcc,
+; CHECK: ; %else
+; CHECK: image_sample
+; CHECK: ; %if
+; CHECK: image_sample
+; CHECK: ; %end
+; CHECK: s_and_b64 exec, exec, [[ORIG]]
+; CHECK: _store
+define amdgpu_ps <4 x float> @test_vcc_tracking(i32 %sel, i32 %idx) #1 {
+main_body:
+  %cc = icmp sgt i32 %sel, 0
+  br i1 %cc, label %if, label %else
+
+if:
+  %r.if = call <4 x float> @llvm.SI.image.sample.i32(i32 0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  br label %end
+
+else:
+  %r.else = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 0, i32 0>, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  br label %end
+
+end:
+  %r = phi <4 x float> [ %r.if, %if ], [ %r.else, %else ]
+
+  call void @llvm.amdgcn.buffer.store.f32(float 1.0, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
+
+  ret <4 x float> %r
+}
+
+
 declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
 declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1
 declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1
@@ -472,8 +533,10 @@
 declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #2
 
 declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
+declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
 declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
 
+declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #3
 declare void @llvm.AMDGPU.kill(float)
 declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)