Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4624,10 +4624,10 @@
 }
 
 // Emit the actual waterfall loop, executing the wrapped instruction for each
-// unique value of \p Rsrc across all lanes. In the best case we execute 1
+// unique value of \p Rsrc (4 dwords) across all lanes. In the best case we execute 1
 // iteration, in the worst case we execute 64 (once per lane).
 static void
-emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
+emitLoadSRsrcFromVGPR128Loop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
                           MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
                           const DebugLoc &DL, MachineOperand &Rsrc) {
   MachineFunction &MF = *OrigBB.getParent();
@@ -4708,6 +4708,129 @@
   BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB);
 }
 
+
+// Emit the actual waterfall loop, executing the wrapped instruction for each
+// unique value of \p Rsrc (8 dwords) across all lanes.
+static void
+emitLoadSRsrcFromVGPR256Loop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
+                             MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
+                             const DebugLoc &DL, MachineOperand &Rsrc) {
+  MachineFunction &MF = *OrigBB.getParent();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+  unsigned SaveExecOpc =
+      ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
+  unsigned XorTermOpc =
+      ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
+  unsigned AndOpc =
+      ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+  const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+
+  MachineBasicBlock::iterator I = LoopBB.begin();
+
+  Register VRsrc = Rsrc.getReg();
+  unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef());
+
+  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
+  Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
+  Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
+  Register CondReg2 = MRI.createVirtualRegister(BoolXExecRC);
+  Register CondReg3 = MRI.createVirtualRegister(BoolXExecRC);
+  Register AndCond0 = MRI.createVirtualRegister(BoolXExecRC);
+  Register AndCond1 = MRI.createVirtualRegister(BoolXExecRC);
+  Register AndCond = MRI.createVirtualRegister(BoolXExecRC);
+  Register SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+  Register SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+  Register SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+  Register SRsrcSub3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+  Register SRsrcSub4 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+  Register SRsrcSub5 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+  Register SRsrcSub6 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+  Register SRsrcSub7 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+  Register SRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_256RegClass);
+
+  // Beginning of the loop, read the next Rsrc variant.
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub0)
+      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub0);
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub1)
+      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub1);
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub2)
+      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub2);
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub3)
+      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub3);
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub4)
+      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub4);
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub5)
+      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub5);
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub6)
+      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub6);
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub7)
+      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub7);
+
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc)
+      .addReg(SRsrcSub0)
+      .addImm(AMDGPU::sub0)
+      .addReg(SRsrcSub1)
+      .addImm(AMDGPU::sub1)
+      .addReg(SRsrcSub2)
+      .addImm(AMDGPU::sub2)
+      .addReg(SRsrcSub3)
+      .addImm(AMDGPU::sub3)
+      .addReg(SRsrcSub4)
+      .addImm(AMDGPU::sub4)
+      .addReg(SRsrcSub5)
+      .addImm(AMDGPU::sub5)
+      .addReg(SRsrcSub6)
+      .addImm(AMDGPU::sub6)
+      .addReg(SRsrcSub7)
+      .addImm(AMDGPU::sub7);
+
+  // Update Rsrc operand to use the SGPR Rsrc.
+  Rsrc.setReg(SRsrc);
+  Rsrc.setIsKill(true);
+
+  // Identify all lanes with identical Rsrc operands in their VGPRs.
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg0)
+      .addReg(SRsrc, 0, AMDGPU::sub0_sub1)
+      .addReg(VRsrc, 0, AMDGPU::sub0_sub1);
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg1)
+      .addReg(SRsrc, 0, AMDGPU::sub2_sub3)
+      .addReg(VRsrc, 0, AMDGPU::sub2_sub3);
+  BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndCond0)
+      .addReg(CondReg0)
+      .addReg(CondReg1);
+
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg2)
+      .addReg(SRsrc, 0, AMDGPU::sub4_sub5)
+      .addReg(VRsrc, 0, AMDGPU::sub4_sub5);
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg3)
+      .addReg(SRsrc, 0, AMDGPU::sub6_sub7)
+      .addReg(VRsrc, 0, AMDGPU::sub6_sub7);
+  BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndCond1)
+      .addReg(CondReg2)
+      .addReg(CondReg3);
+
+  BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndCond)
+      .addReg(AndCond0)
+      .addReg(AndCond1);
+
+  MRI.setSimpleHint(SaveExec, AndCond);
+
+  // Update EXEC to matching lanes, saving original to SaveExec.
+  BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec)
+      .addReg(AndCond, RegState::Kill);
+
+  // The original instruction is here; we insert the terminators after it.
+  I = LoopBB.end();
+
+  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
+  BuildMI(LoopBB, I, DL, TII.get(XorTermOpc), Exec)
+      .addReg(Exec)
+      .addReg(SaveExec);
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB);
+}
+
 // Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register
 // with SGPRs by iterating over all unique values across all lanes.
 static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
@@ -4771,7 +4894,14 @@
     }
   }
 
-  emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, Rsrc);
+  // Emit waterfall loop based on Rsrc size.
+  const TargetRegisterClass *OpRC = MRI.getRegClass(Rsrc.getReg());
+  if (OpRC == &AMDGPU::VReg_128RegClass)
+    emitLoadSRsrcFromVGPR128Loop(TII, MRI, MBB, *LoopBB, DL, Rsrc);
+  else if (OpRC == &AMDGPU::VReg_256RegClass)
+    emitLoadSRsrcFromVGPR256Loop(TII, MRI, MBB, *LoopBB, DL, Rsrc);
+  else // Not implemented yet.
+    llvm_unreachable("waterfall loop not implemented for this Rsrc size");
 
   // Restore the EXEC mask
   MachineBasicBlock::iterator First = RemainderBB->begin();
@@ -4960,16 +5090,13 @@
       (AMDGPU::isShader(MF.getFunction().getCallingConv()) &&
        (isMUBUF(MI) || isMTBUF(MI)))) {
     MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
-    if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
-      unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
-      SRsrc->setReg(SGPR);
-    }
+    if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
+      loadSRsrcFromVGPR(*this, MI, *SRsrc, MDT);
 
     MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
-    if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
-      unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
-      SSamp->setReg(SGPR);
-    }
+    if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
+      loadSRsrcFromVGPR(*this, MI, *SSamp, MDT);
+    return;
   }
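
Note, not part of the patch: the 128- and 256-dword loop bodies above differ only in how many 32-bit sub-registers are read back and reassembled, so the V_READFIRSTLANE_B32 / REG_SEQUENCE portion could in principle be shared. The sketch below illustrates that shape only; it assumes the same file context as SIInstrInfo.cpp (BuildMI, MachineInstrBuilder, SmallVector, and assert are already available there), and the helper name emitReadFirstLaneSRsrc and its parameter list are hypothetical, not something this patch defines.

// Illustrative sketch, assuming the includes already present in
// SIInstrInfo.cpp. Reads NumDWords dwords of a VGPR descriptor into SGPRs
// and reassembles them into one wide SGPR tuple of class SRsrcRC.
static Register emitReadFirstLaneSRsrc(const SIInstrInfo &TII,
                                       MachineRegisterInfo &MRI,
                                       MachineBasicBlock &LoopBB,
                                       MachineBasicBlock::iterator I,
                                       const DebugLoc &DL, Register VRsrc,
                                       unsigned VRsrcUndef, unsigned NumDWords,
                                       const TargetRegisterClass *SRsrcRC) {
  // Sub-register indices for up to 8 dwords; these are the same indices the
  // patch spells out explicitly.
  static const unsigned SubIdx[8] = {AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2,
                                     AMDGPU::sub3, AMDGPU::sub4, AMDGPU::sub5,
                                     AMDGPU::sub6, AMDGPU::sub7};
  assert(NumDWords <= 8 && "sketch only handles up to 8 dwords");

  // Read each dword of the VGPR resource descriptor into a scalar register.
  SmallVector<Register, 8> Parts;
  for (unsigned Idx = 0; Idx != NumDWords; ++Idx) {
    Register Part = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), Part)
        .addReg(VRsrc, VRsrcUndef, SubIdx[Idx]);
    Parts.push_back(Part);
  }

  // Reassemble the scalar dwords into one wide SGPR tuple (SGPR_128 or
  // SGPR_256, chosen by the caller).
  Register SRsrc = MRI.createVirtualRegister(SRsrcRC);
  MachineInstrBuilder MIB =
      BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc);
  for (unsigned Idx = 0; Idx != NumDWords; ++Idx)
    MIB.addReg(Parts[Idx]).addImm(SubIdx[Idx]);
  return SRsrc;
}

With a helper of this shape, emitLoadSRsrcFromVGPR128Loop and emitLoadSRsrcFromVGPR256Loop would differ only in the number of V_CMP_EQ_U64_e64 comparisons (two versus four 64-bit halves) combined with AndOpc, which could be folded into a loop in the same way.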