Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -343,11 +343,6 @@ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const { - // If we are trying to copy to or from SCC, there is a bug somewhere else in - // the backend. While it may be theoretically possible to do this, it should - // never be necessary. - assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC); - static const int16_t Sub0_15[] = { AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, @@ -392,6 +387,13 @@ ArrayRef<int16_t> SubIndices; if (AMDGPU::SReg_32RegClass.contains(DestReg)) { + if (SrcReg == AMDGPU::SCC) { // SCC -> SGPR via S_CSELECT_B32 -1, 0. + BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg) + .addImm(-1) + .addImm(0); + return; + } + assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); @@ -418,6 +420,12 @@ .addReg(SrcReg, getKillRegState(KillSrc)); return; + } else if (DestReg == AMDGPU::SCC) { // SGPR -> SCC via S_CMP_LG_U32 Src, 0. + assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); + BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)) + .addReg(SrcReg, getKillRegState(KillSrc)) + .addImm(0); + return; } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) { assert(AMDGPU::SReg_128RegClass.contains(SrcReg)); Opcode = AMDGPU::S_MOV_B64; Index: llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -129,6 +129,14 @@ void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist); char analyzeFunction(MachineFunction &MF); + bool requiresCorrectState(const MachineInstr &MI) const; + + MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB, +
MachineBasicBlock::iterator Before); + MachineBasicBlock::iterator + prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First, + MachineBasicBlock::iterator Last, bool PreferLast, + bool SaveSCC); void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, unsigned SaveWQM, unsigned LiveMaskReg); void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, @@ -398,32 +406,151 @@ return GlobalFlags; } +/// Whether \p MI really requires the exec state computed during analysis. +/// +/// Scalar instructions must occasionally be marked WQM for correct propagation +/// (e.g. thread masks leading up to branches), but when it comes to actual +/// execution, they don't care about EXEC. +bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const { + if (MI.isTerminator()) + return true; + + // Skip instructions that are not affected by EXEC + if (TII->isScalarUnit(MI)) + return false; + + // Generic instructions such as COPY will either disappear by register + // coalescing or be lowered to SALU or VALU instructions.
+ if (MI.isTransient()) { + if (MI.getNumExplicitOperands() >= 1) { + const MachineOperand &Op = MI.getOperand(0); + if (Op.isReg()) { + if (TRI->isSGPRReg(*MRI, Op.getReg())) { + // SGPR instructions are not affected by EXEC + return false; + } + } + } + } + + return true; +} + +/// Preserve SCC across code that will be inserted at \p Before. +/// +/// Emits a COPY of SCC into a fresh SReg_32 virtual register followed by a +/// COPY back into SCC, both before \p Before, and returns the restoring COPY, +/// so that code inserted before the returned iterator sits between the two. +MachineBasicBlock::iterator +SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB, + MachineBasicBlock::iterator Before) { + unsigned SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + + MachineInstr *Save = + BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg) + .addReg(AMDGPU::SCC); + MachineInstr *Restore = + BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC) + .addReg(SaveReg); + + LIS->InsertMachineInstrInMaps(*Save); + LIS->InsertMachineInstrInMaps(*Restore); + LIS->createAndComputeVirtRegInterval(SaveReg); + + return Restore; +} + +// Return an iterator in the (inclusive) range [First, Last] at which +// instructions can be safely inserted, keeping in mind that some of the +// instructions we want to add necessarily clobber SCC. +// +// If \p SaveSCC is false, the preferred end of the range is returned as is. +// Otherwise the search starts at the preferred end (\p Last if \p PreferLast, +// else \p First) and steps over SCC live segments towards the other end. If +// that would leave the range, SCC is saved and restored with saveSCC() and +// the restoring COPY is returned as the insertion point. +MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion( + MachineBasicBlock &MBB, MachineBasicBlock::iterator First, + MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) { + if (!SaveSCC) + return PreferLast ? Last : First; + + LiveRange &LR = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI)); + auto MBBE = MBB.end(); + SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First) + : LIS->getMBBEndIdx(&MBB); + SlotIndex LastIdx = + Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB); + SlotIndex Idx = PreferLast ?
LastIdx : FirstIdx; + const LiveRange::Segment *S; + + for (;;) { + S = LR.getSegmentContaining(Idx); + if (!S) + break; + + if (PreferLast) { + SlotIndex Next = S->start.getBaseIndex(); + if (Next < FirstIdx) + break; + Idx = Next; + } else { + SlotIndex Next = S->end.getNextIndex().getBaseIndex(); + if (Next > LastIdx) + break; + Idx = Next; + } + } + + MachineBasicBlock::iterator MBBI; + + if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx)) + MBBI = MI; + else { + assert(Idx == LIS->getMBBEndIdx(&MBB)); + MBBI = MBB.end(); + } + + if (S) + MBBI = saveSCC(MBB, MBBI); + + return MBBI; +} + void SIWholeQuadMode::toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, unsigned SaveWQM, unsigned LiveMaskReg) { + MachineInstr *MI; + if (SaveWQM) { - BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64), - SaveWQM) - .addReg(LiveMaskReg); + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64), + SaveWQM) + .addReg(LiveMaskReg); } else { - BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64), - AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) - .addReg(LiveMaskReg); + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64), + AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(LiveMaskReg); } + + LIS->InsertMachineInstrInMaps(*MI); } void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, unsigned SavedWQM) { + MachineInstr *MI; + if (SavedWQM) { - BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC) - .addReg(SavedWQM); + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC) + .addReg(SavedWQM); } else { - BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64), - AMDGPU::EXEC) - .addReg(AMDGPU::EXEC); + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64), + AMDGPU::EXEC) + .addReg(AMDGPU::EXEC); } + + LIS->InsertMachineInstrInMaps(*MI); } void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, @@
-447,76 +574,79 @@ unsigned SavedWQMReg = 0; bool WQMFromExec = isEntry; char State = isEntry ? StateExact : StateWQM; - MachineInstr *FirstNonWQM = nullptr; auto II = MBB.getFirstNonPHI(), IE = MBB.end(); - while (II != IE) { - MachineInstr &MI = *II; - ++II; + if (isEntry) + ++II; // Skip the instruction that saves LiveMask - // Skip instructions that are not affected by EXEC - if (TII->isScalarUnit(MI) && !MI.isTerminator()) - continue; + // First marks the start of the current run of instructions that need no + // particular state; a state switch may go anywhere in [First, II]. + MachineBasicBlock::iterator First = IE; + for (;;) { + MachineBasicBlock::iterator Next = II; + char Needs = 0; + char OutNeeds = 0; - // Generic instructions such as COPY will either disappear by register - // coalescing or be lowered to SALU or VALU instructions. - if (TargetInstrInfo::isGenericOpcode(MI.getOpcode())) { - if (MI.getNumExplicitOperands() >= 1) { - const MachineOperand &Op = MI.getOperand(0); - if (Op.isReg()) { - if (TRI->isSGPRReg(*MRI, Op.getReg())) { - // SGPR instructions are not affected by EXEC - continue; - } + if (First == IE) + First = II; + + if (II != IE) { + MachineInstr &MI = *II; + + if (requiresCorrectState(MI)) { + auto III = Instructions.find(&MI); + if (III != Instructions.end()) { + Needs = III->second.Needs; + OutNeeds = III->second.OutNeeds; } } - } - char Needs = 0; - char OutNeeds = 0; - auto InstrInfoIt = Instructions.find(&MI); - if (InstrInfoIt != Instructions.end()) { - Needs = InstrInfoIt->second.Needs; - OutNeeds = InstrInfoIt->second.OutNeeds; - } - - // Keep track of the first consecutive non-WQM instruction, so that we - // switch away from WQM as soon as possible, potentially saving a small - // bit of bandwidth on loads.
- if (Needs == StateWQM) - FirstNonWQM = nullptr; - else if (!FirstNonWQM) - FirstNonWQM = &MI; - - // State switching - if (Needs && State != Needs) { - if (Needs == StateExact) { - assert(!SavedWQMReg); + if (MI.isTerminator() && !Needs && OutNeeds == StateExact) + Needs = StateExact; - if (!WQMFromExec && (OutNeeds & StateWQM)) - SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact) + MI.getOperand(3).setImm(1); - toExact(MBB, FirstNonWQM, SavedWQMReg, LiveMaskReg); - } else { - assert(WQMFromExec == (SavedWQMReg == 0)); - toWQM(MBB, &MI, SavedWQMReg); - SavedWQMReg = 0; + ++Next; + } else { + // End of basic block + if (BI.OutNeeds & StateWQM) + Needs = StateWQM; + else if (BI.OutNeeds == StateExact) + Needs = StateExact; + } + + if (Needs) { + if (Needs != State) { + MachineBasicBlock::iterator Before = + prepareInsertion(MBB, First, II, Needs == StateWQM, + Needs == StateExact || WQMFromExec); + + if (Needs == StateExact) { + if (!WQMFromExec && (OutNeeds & StateWQM)) + SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + + toExact(MBB, Before, SavedWQMReg, LiveMaskReg); + } else { + assert(WQMFromExec == (SavedWQMReg == 0)); + + toWQM(MBB, Before, SavedWQMReg); + + if (SavedWQMReg) { + LIS->createAndComputeVirtRegInterval(SavedWQMReg); + SavedWQMReg = 0; + } + } + + State = Needs; } - State = Needs; + First = IE; } - if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact) - MI.getOperand(3).setImm(1); - } - - if ((BI.OutNeeds & StateWQM) && State != StateWQM) { - assert(WQMFromExec == (SavedWQMReg == 0)); - toWQM(MBB, MBB.end(), SavedWQMReg); - } else if (BI.OutNeeds == StateExact && State != StateExact) { - toExact(MBB, FirstNonWQM ?
MachineBasicBlock::iterator(FirstNonWQM) - : MBB.getFirstTerminator(), - 0, LiveMaskReg); + if (II == IE) + break; + II = Next; } } @@ -524,8 +654,11 @@ for (MachineInstr *MI : LiveMaskQueries) { const DebugLoc &DL = MI->getDebugLoc(); unsigned Dest = MI->getOperand(0).getReg(); - BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest) - .addReg(LiveMaskReg); + MachineInstr *Copy = + BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest) + .addReg(LiveMaskReg); + + LIS->ReplaceMachineInstrInMaps(*MI, *Copy); MI->eraseFromParent(); } } @@ -559,8 +692,10 @@ if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) { LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); - BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg) - .addReg(AMDGPU::EXEC); + MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(), + TII->get(AMDGPU::COPY), LiveMaskReg) + .addReg(AMDGPU::EXEC); + LIS->InsertMachineInstrInMaps(*MI); } if (GlobalFlags == StateWQM) { @@ -583,5 +718,10 @@ for (auto BII : Blocks) processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin()); + // Physical registers like SCC aren't tracked by default anyway, so just + // removing the ranges we computed is the simplest option for maintaining + // the analysis results. + LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI)); + return true; } Index: llvm/trunk/test/CodeGen/AMDGPU/wqm.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/wqm.ll +++ llvm/trunk/test/CodeGen/AMDGPU/wqm.ll @@ -466,6 +466,42 @@ ret <4 x float> %dtex } +; Test awareness that s_wqm_b64 clobbers SCC.
+; +; CHECK-LABEL: {{^}}test_scc: +; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +; CHECK: s_wqm_b64 exec, exec +; CHECK: s_cmp_ +; CHECK-NEXT: s_cbranch_scc +; CHECK: ; %if +; CHECK: s_and_b64 exec, exec, [[ORIG]] +; CHECK: image_sample +; CHECK: ; %else +; CHECK: s_and_b64 exec, exec, [[ORIG]] +; CHECK: image_sample +; CHECK: ; %end +define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 { +main_body: + %cc = icmp sgt i32 %sel, 0 + br i1 %cc, label %if, label %else + +if: + %r.if = call <4 x float> @llvm.SI.image.sample.i32(i32 0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + br label %end + +else: + %r.else = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 0, i32 1>, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + br label %end + +end: + %r = phi <4 x float> [ %r.if, %if ], [ %r.else, %else ] + + call void @llvm.amdgcn.buffer.store.f32(float 1.0, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0) + + ret <4 x float> %r +} + + declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1 declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1 @@ -474,6 +510,7 @@ declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #2 declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3 +declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3 declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3 declare void @llvm.AMDGPU.kill(float)