Index: lib/Target/AMDGPU/SIWholeQuadMode.cpp
===================================================================
--- lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -124,6 +124,14 @@
   std::map, LaneBitmask> WQMValues;
   SmallVector<MachineInstr *, 1> LiveMaskQueries;
 
+  // Tracking of feasible instruction insertion points.
+  MachineInstr *EarlyInsertMI;
+  MachineInstr *LastInsertMI;
+  bool InsertSCCLive;
+  bool EarlyInsertSCCLive;
+  bool EarlyInsertSCCUsed;
+  bool PreferInsertLate;
+
   void printInfo(MachineFunction &MF);
 
   void markValueWQM(SlotIndex Slot, unsigned Reg, LaneBitmask LaneMask,
@@ -142,10 +150,13 @@
                          std::vector &Worklist);
   char analyzeFunction(MachineFunction &MF);
 
-  void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
-               unsigned SaveWQM, unsigned LiveMaskReg);
-  void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
-             unsigned SavedWQM);
+  void resetInsertCandidates(bool PreferLate);
+  void addInsertCandidate(MachineInstr &MI);
+  MachineBasicBlock::iterator prepareInsert(MachineBasicBlock &MBB,
+                                            bool SaveSCC);
+
+  void toExact(MachineBasicBlock &MBB, unsigned SaveWQM, unsigned LiveMaskReg);
+  void toWQM(MachineBasicBlock &MBB, unsigned SavedWQM);
   void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);
 
   void lowerLiveMaskQueries(unsigned LiveMaskReg);
@@ -488,18 +499,130 @@
   return GlobalFlags;
 }
 
-void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
-                              MachineBasicBlock::iterator Before,
-                              unsigned SaveWQM, unsigned LiveMaskReg) {
+void SIWholeQuadMode::resetInsertCandidates(bool PreferLate) {
+  EarlyInsertMI = nullptr;
+  LastInsertMI = nullptr;
+  PreferInsertLate = PreferLate;
+}
+
+void SIWholeQuadMode::addInsertCandidate(MachineInstr &MI) {
+  const bool DefSCC = MI.definesRegister(AMDGPU::SCC);
+  const bool UseSCC = MI.readsRegister(AMDGPU::SCC);
+  const bool KillsSCC = MI.killsRegister(AMDGPU::SCC);
+
+  if (!EarlyInsertMI) {
+    EarlyInsertMI = &MI;
+    EarlyInsertSCCLive = InsertSCCLive;
+    EarlyInsertSCCUsed = false;
+  }
+
+  if (EarlyInsertSCCLive) {
+    if (UseSCC) {
+      EarlyInsertSCCUsed = true;
+    } else {
+      if (EarlyInsertSCCUsed) {
+        EarlyInsertMI = &MI;
+        EarlyInsertSCCUsed = false;
+      }
+    }
+  }
+
+  if (KillsSCC) {
+    EarlyInsertSCCLive = false;
+    InsertSCCLive = false;
+  }
+
+  if (DefSCC) {
+    EarlyInsertSCCLive = false;
+    InsertSCCLive = true;
+  }
+
+  LastInsertMI = &MI;
+
+  if (!UseSCC && DefSCC) {
+    assert(!EarlyInsertSCCUsed && !EarlyInsertSCCLive);
+
+    if (PreferInsertLate)
+      EarlyInsertMI = &MI;
+  }
+}
+
+MachineBasicBlock::iterator
+SIWholeQuadMode::prepareInsert(MachineBasicBlock &MBB, bool SaveSCC) {
+  if (!EarlyInsertMI) {
+    assert(MBB.getFirstTerminator() == MBB.end());
+    return MBB.end();
+  }
+
+  if (!SaveSCC)
+    return PreferInsertLate ? LastInsertMI : EarlyInsertMI;
+
+  // Find the insertion point
+  MachineInstr *MI;
+  bool Clean;
+
+  if (!EarlyInsertSCCLive && !EarlyInsertSCCUsed) {
+    MI = EarlyInsertMI;
+    Clean = true;
+  } else {
+    MI = LastInsertMI;
+    Clean = true;
+
+    if (InsertSCCLive) {
+      for (MachineBasicBlock::iterator MII = LastInsertMI, MIE = MBB.end();
+           MII != MIE; ++MII) {
+        if (MII->readsRegister(AMDGPU::SCC)) {
+          Clean = false;
+          break;
+        }
+        if (MII->definesRegister(AMDGPU::SCC))
+          break;
+      }
+    }
+
+    const bool EarlyClean =
+        !EarlyInsertSCCUsed && (!EarlyInsertSCCLive || Clean);
+
+    if (EarlyClean && (!Clean || !PreferInsertLate)) {
+      MI = EarlyInsertMI;
+      Clean = true;
+    }
+  }
+
+  // Backup SCC if necessary -- this should really only happen when the
+  // scheduler makes some very odd decisions.
+  if (!Clean) {
+    unsigned SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    MachineInstr *Save =
+        BuildMI(MBB, MI, DebugLoc(), TII->get(AMDGPU::S_CSELECT_B32), SaveReg)
+            .addImm(1)
+            .addImm(0);
+    MachineInstr *Restore =
+        BuildMI(MBB, MI, DebugLoc(), TII->get(AMDGPU::S_CMP_GT_U32))
+            .addReg(SaveReg)
+            .addImm(0);
+
+    SI->insertMachineInstrInMaps(*Save);
+    SI->insertMachineInstrInMaps(*Restore);
+    LIS->createAndComputeVirtRegInterval(SaveReg);
+
+    MI = Restore;
+  }
+
+  return MI;
+}
+
+void SIWholeQuadMode::toExact(MachineBasicBlock &MBB, unsigned SaveWQM,
+                              unsigned LiveMaskReg) {
+  MachineBasicBlock::iterator II = prepareInsert(MBB, true);
   MachineInstr *MI;
 
   if (SaveWQM) {
-    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
+    MI = BuildMI(MBB, II, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
                  SaveWQM)
             .addReg(LiveMaskReg);
   } else {
-    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
-                 AMDGPU::EXEC)
+    MI = BuildMI(MBB, II, DebugLoc(), TII->get(AMDGPU::S_AND_B64), AMDGPU::EXEC)
             .addReg(AMDGPU::EXEC)
             .addReg(LiveMaskReg);
   }
@@ -507,17 +630,15 @@
   SI->insertMachineInstrInMaps(*MI);
 }
 
-void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
-                            MachineBasicBlock::iterator Before,
-                            unsigned SavedWQM) {
+void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB, unsigned SavedWQM) {
+  MachineBasicBlock::iterator II = prepareInsert(MBB, SavedWQM == 0);
   MachineInstr *MI;
 
   if (SavedWQM) {
-    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
+    MI = BuildMI(MBB, II, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
             .addReg(SavedWQM);
   } else {
-    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
-                 AMDGPU::EXEC)
+    MI = BuildMI(MBB, II, DebugLoc(), TII->get(AMDGPU::S_WQM_B64), AMDGPU::EXEC)
             .addReg(AMDGPU::EXEC);
   }
 
@@ -545,13 +666,20 @@
   unsigned SavedWQMReg = 0;
   bool WQMFromExec = isEntry;
   char State = isEntry ? StateExact : StateWQM;
-  MachineInstr *FirstNonWQM = nullptr;
+
+  InsertSCCLive = false;
+  resetInsertCandidates(State == StateExact);
 
   auto II = MBB.getFirstNonPHI(), IE = MBB.end();
+  if (isEntry)
+    ++II; // skip the instruction that saves LiveMask
+
   while (II != IE) {
     MachineInstr &MI = *II;
     ++II;
 
+    addInsertCandidate(MI);
+
     // Skip instructions that are not affected by EXEC
     if (MI.getDesc().TSFlags & (SIInstrFlags::SALU | SIInstrFlags::SMRD) &&
         !MI.isBranch() && !MI.isTerminator())
@@ -581,14 +709,6 @@
       OutNeeds = InstrInfoIt->second.OutNeeds;
     }
 
-    // Keep track of the first consecutive non-WQM instruction, so that we
-    // switch away from WQM as soon as possible, potentially saving a small
-    // bit of bandwidth on loads.
-    if (Needs == StateWQM)
-      FirstNonWQM = nullptr;
-    else if (!FirstNonWQM)
-      FirstNonWQM = &MI;
-
     // State switching
     if (Needs && State != Needs) {
       if (Needs == StateExact) {
@@ -597,10 +717,10 @@
         if (!WQMFromExec && (OutNeeds & StateWQM))
           SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
 
-        toExact(MBB, FirstNonWQM, SavedWQMReg, LiveMaskReg);
+        toExact(MBB, SavedWQMReg, LiveMaskReg);
       } else {
         assert(WQMFromExec == (SavedWQMReg == 0));
-        toWQM(MBB, &MI, SavedWQMReg);
+        toWQM(MBB, SavedWQMReg);
 
         if (SavedWQMReg) {
           LIS->createAndComputeVirtRegInterval(SavedWQMReg);
@@ -611,20 +731,22 @@
       State = Needs;
     }
 
+    // Cannot switch to exact before a WQM instruction and vice versa.
+    if (Needs)
+      resetInsertCandidates(State == StateExact);
+
     if (MI.getOpcode() == AMDGPU::SI_KILL)
      WQMFromExec = false;
   }
 
   if ((BI.Needs.Out & StateWQM) && State != StateWQM) {
     assert(WQMFromExec == (SavedWQMReg == 0));
-    toWQM(MBB, MBB.end(), SavedWQMReg);
+    toWQM(MBB, SavedWQMReg);
 
     if (SavedWQMReg)
       LIS->createAndComputeVirtRegInterval(SavedWQMReg);
   } else if (BI.Needs.Out == StateExact && State != StateExact) {
-    toExact(MBB, FirstNonWQM ? MachineBasicBlock::iterator(FirstNonWQM)
-                             : MBB.getFirstTerminator(),
-            0, LiveMaskReg);
+    toExact(MBB, 0, LiveMaskReg);
   }
 }
 
Index: test/CodeGen/AMDGPU/wqm.ll
===================================================================
--- test/CodeGen/AMDGPU/wqm.ll
+++ test/CodeGen/AMDGPU/wqm.ll
@@ -420,6 +420,42 @@
   ret <4 x float> %r
 }
 
+; Test awareness that s_wqm_b64 clobbers SCC.
+;
+; CHECK-LABEL: {{^}}test_scc:
+; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK: s_wqm_b64 exec, exec
+; CHECK: s_cmp_
+; CHECK: s_cbranch_scc
+; CHECK: ; %if
+; CHECK: s_and_b64 exec, exec, [[ORIG]]
+; CHECK: image_sample
+; CHECK: ; %else
+; CHECK: s_and_b64 exec, exec, [[ORIG]]
+; CHECK: image_sample
+; CHECK: ; %end
+define amdgpu_ps <4 x float> @test_scc(float addrspace(1)* inreg %ptr, i32 inreg %sel, i32 %idx) #1 {
+main_body:
+  %cc = icmp sgt i32 %sel, 0
+  br i1 %cc, label %if, label %else
+
+if:
+  %r.if = call <4 x float> @llvm.SI.image.sample.i32(i32 0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  br label %end
+
+else:
+  %r.else = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 0, i32 0>, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  br label %end
+
+end:
+  %r = phi <4 x float> [ %r.if, %if ], [ %r.else, %else ]
+
+  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %idx
+  store float 1.0, float addrspace(1)* %gep
+
+  ret <4 x float> %r
+}
+
 declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
 declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2
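
The patch replaces the simple FirstNonWQM heuristic with explicit tracking of feasible insertion points, so that EXEC-mask switches are never placed where they would clobber a live SCC. The following standalone sketch is a simplified model of that policy, not part of the patch and not LLVM API: Inst, InsertTracker, add(), pick() and the SCCReadAfterLast parameter are illustrative names, instructions are reduced to whether they define or read SCC, and kill flags as well as the forward scan over the rest of the block are folded into booleans.

#include <cstdio>
#include <vector>

struct Inst {
  bool DefsSCC;  // instruction writes SCC
  bool ReadsSCC; // instruction reads SCC
};

struct InsertTracker {
  int Early = -1, Last = -1; // candidate positions, -1 = none yet
  bool SCCLive = false;      // SCC live after the last candidate seen
  bool EarlySCCLive = false; // SCC live at the early candidate
  bool EarlySCCUsed = false; // that live value has been read since Early
  bool PreferLate = false;

  void reset(bool PreferLateIn) {
    Early = Last = -1;
    PreferLate = PreferLateIn;
  }

  // Counterpart of addInsertCandidate(): fold instruction Idx into the state.
  void add(int Idx, const Inst &I) {
    if (Early < 0) {
      Early = Idx;
      EarlySCCLive = SCCLive;
      EarlySCCUsed = false;
    }
    if (EarlySCCLive) {
      if (I.ReadsSCC) {
        EarlySCCUsed = true;
      } else if (EarlySCCUsed) {
        Early = Idx; // reads of the live SCC value are now behind us
        EarlySCCUsed = false;
      }
    }
    if (I.DefsSCC) {
      EarlySCCLive = false;
      SCCLive = true;
    }
    Last = Idx;
    if (I.DefsSCC && !I.ReadsSCC && PreferLate)
      Early = Idx; // SCC is dead immediately before a pure SCC def
  }

  // Counterpart of prepareInsert(): pick the position to insert before.
  // SCCReadAfterLast stands in for the forward scan over the rest of the
  // block; NeedSave is set when SCC would have to be saved and restored.
  int pick(bool SaveSCC, bool SCCReadAfterLast, bool &NeedSave) const {
    NeedSave = false;
    if (Early < 0)
      return -1; // no candidates; the caller inserts at the block end
    if (!SaveSCC)
      return PreferLate ? Last : Early;
    if (!EarlySCCLive && !EarlySCCUsed)
      return Early; // nothing live to preserve at the early point
    bool Clean = !(SCCLive && SCCReadAfterLast);
    int At = Last;
    const bool EarlyClean = !EarlySCCUsed && (!EarlySCCLive || Clean);
    if (EarlyClean && (!Clean || !PreferLate)) {
      At = Early;
      Clean = true;
    }
    NeedSave = !Clean;
    return At;
  }
};

int main() {
  // An s_cmp-like SCC def, an SCC reader, then an ordinary SALU instruction:
  // inserting before position 0 needs no SCC save even though SCC is live
  // and read in between.
  std::vector<Inst> Block = {{true, false}, {false, true}, {false, false}};
  InsertTracker T;
  T.reset(/*PreferLateIn=*/false);
  for (int Idx = 0; Idx < (int)Block.size(); ++Idx)
    T.add(Idx, Block[Idx]);

  bool NeedSave;
  int At = T.pick(/*SaveSCC=*/true, /*SCCReadAfterLast=*/false, NeedSave);
  std::printf("insert before #%d, SCC save needed: %s\n", At,
              NeedSave ? "yes" : "no");
  return 0;
}

In the pass itself, prepareInsert() inserts before the returned instruction. PreferInsertLate is true while the block is in exact mode (resetInsertCandidates(State == StateExact)), so the switch back to WQM is placed as late as possible, while the switch to exact keeps the early candidate and happens as soon as possible -- the same effect the removed FirstNonWQM tracking had, with the S_CSELECT_B32/S_CMP_GT_U32 pair used only as a fallback when no SCC-clean point exists.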