diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -130,7 +130,7 @@ MachineInstr &SCCDefInst, SetVectorType &Worklist, Register NewCond = Register()) const; - void addSCCDefsToVALUWorklist(MachineOperand &Op, + void addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst, SetVectorType &Worklist) const; const TargetRegisterClass * diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -6362,7 +6362,6 @@ continue; } - if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { // We cannot move this instruction to the VALU, so we should try to // legalize its operands instead. @@ -6372,43 +6371,90 @@ continue; } + // Handle converting generic instructions like COPY-to-SGPR into + // COPY-to-VGPR. + if (NewOpcode == Opcode) { + Register DstReg = Inst.getOperand(0).getReg(); + const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst); + + if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() && + NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) { + // Instead of creating a copy where src and dst are the same register + // class, we just replace all uses of dst with src. These kinds of + // copies interfere with the heuristics MachineSink uses to decide + // whether or not to split a critical edge. Since the pass assumes + // that copies will end up as machine instructions and not be + // eliminated. + addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist); + MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg()); + MRI.clearKillFlags(Inst.getOperand(1).getReg()); + Inst.getOperand(0).setReg(DstReg); + + // Make sure we don't leave around a dead VGPR->SGPR copy. Normally + // these are deleted later, but at -O0 it would leave a suspicious + // looking illegal copy of an undef register. + for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I) + Inst.removeOperand(I); + Inst.setDesc(get(AMDGPU::IMPLICIT_DEF)); + continue; + } + + Register NewDstReg = MRI.createVirtualRegister(NewDstRC); + MRI.replaceRegWith(DstReg, NewDstReg); + legalizeOperands(Inst, MDT); + addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); + continue; + } + // Use the new VALU Opcode. - const MCInstrDesc &NewDesc = get(NewOpcode); - Inst.setDesc(NewDesc); + auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode)) + .setMIFlags(Inst.getFlags()); + for (const MachineOperand &Op : Inst.explicit_operands()) + NewInstr->addOperand(Op); // Remove any references to SCC. Vector instructions can't read from it, and // We're just about to add the implicit use / defs of VCC, and we don't want // both. - for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) { - MachineOperand &Op = Inst.getOperand(i); - if (Op.isReg() && Op.getReg() == AMDGPU::SCC) { + for (MachineOperand &Op : Inst.implicit_operands()) { + if (Op.getReg() == AMDGPU::SCC) { // Only propagate through live-def of SCC. if (Op.isDef() && !Op.isDead()) addSCCDefUsersToVALUWorklist(Op, Inst, Worklist); if (Op.isUse()) - addSCCDefsToVALUWorklist(Op, Worklist); - Inst.removeOperand(i); + addSCCDefsToVALUWorklist(NewInstr, Worklist); } } + Inst.eraseFromParent(); + + Register NewDstReg; + if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) { + Register DstReg = NewInstr->getOperand(0).getReg(); + assert(DstReg.isVirtual()); + + // Update the destination register class. + const TargetRegisterClass *NewDstRC = + getDestEquivalentVGPRClass(*NewInstr); + assert(NewDstRC); + + NewDstReg = MRI.createVirtualRegister(NewDstRC); + MRI.replaceRegWith(DstReg, NewDstReg); + } + if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { // We are converting these to a BFE, so we need to add the missing // operands for the size and offset. unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16; - Inst.addOperand(MachineOperand::CreateImm(0)); - Inst.addOperand(MachineOperand::CreateImm(Size)); - + NewInstr.addImm(0); + NewInstr.addImm(Size); } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { // The VALU version adds the second operand to the result, so insert an // extra 0 operand. - Inst.addOperand(MachineOperand::CreateImm(0)); + NewInstr.addImm(0); } - Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent()); - fixImplicitOperands(Inst); - if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { - const MachineOperand &OffsetWidthOp = Inst.getOperand(2); + const MachineOperand &OffsetWidthOp = NewInstr->getOperand(2); // If we need to move this to VGPRs, we need to unpack the second operand // back into the 2 separate ones for bit offset and width. assert(OffsetWidthOp.isImm() && @@ -6417,56 +6463,20 @@ uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. - Inst.removeOperand(2); // Remove old immediate. - Inst.addOperand(MachineOperand::CreateImm(Offset)); - Inst.addOperand(MachineOperand::CreateImm(BitWidth)); + NewInstr->removeOperand(2); + NewInstr.addImm(Offset); + NewInstr.addImm(BitWidth); } - bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef(); - Register NewDstReg; - if (HasDst) { - Register DstReg = Inst.getOperand(0).getReg(); - if (DstReg.isPhysical()) - continue; - - // Update the destination register class. - const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst); - if (!NewDstRC) - continue; - - if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() && - NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) { - // Instead of creating a copy where src and dst are the same register - // class, we just replace all uses of dst with src. These kinds of - // copies interfere with the heuristics MachineSink uses to decide - // whether or not to split a critical edge. Since the pass assumes - // that copies will end up as machine instructions and not be - // eliminated. - addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist); - MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg()); - MRI.clearKillFlags(Inst.getOperand(1).getReg()); - Inst.getOperand(0).setReg(DstReg); - - // Make sure we don't leave around a dead VGPR->SGPR copy. Normally - // these are deleted later, but at -O0 it would leave a suspicious - // looking illegal copy of an undef register. - for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I) - Inst.removeOperand(I); - Inst.setDesc(get(AMDGPU::IMPLICIT_DEF)); - continue; - } - - NewDstReg = MRI.createVirtualRegister(NewDstRC); - MRI.replaceRegWith(DstReg, NewDstReg); - } + fixImplicitOperands(*NewInstr); // Legalize the operands - CreatedBBTmp = legalizeOperands(Inst, MDT); + CreatedBBTmp = legalizeOperands(*NewInstr, MDT); if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) CreatedBB = CreatedBBTmp; - if (HasDst) - addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); + if (NewDstReg) + addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); } return CreatedBB; } @@ -7229,11 +7239,8 @@ // SCC must be changed to an instruction that defines VCC. This function makes // sure that the instruction that defines SCC is added to the moveToVALU // worklist. -void SIInstrInfo::addSCCDefsToVALUWorklist(MachineOperand &Op, +void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst, SetVectorType &Worklist) const { - assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isUse()); - - MachineInstr *SCCUseInst = Op.getParent(); // Look for a preceding instruction that either defines VCC or SCC. If VCC // then there is nothing to do because the defining instruction has been // converted to a VALU already. If SCC then that instruction needs to be diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll @@ -12,7 +12,7 @@ ; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_add_f32_e32 v0, v0, v1 -; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec +; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; CHECK-NEXT: ; return to shader part epilog main_body: %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) @@ -33,7 +33,7 @@ ; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_add_f32_e32 v0, v0, v1 -; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec +; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; CHECK-NEXT: ; return to shader part epilog main_body: %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) @@ -58,7 +58,7 @@ ; CHECK-NEXT: v_add_f32_e32 v1, v1, v2 ; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen ; CHECK-NEXT: v_add_f32_e32 v0, v1, v1 -; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec +; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: ; return to shader part epilog main_body: @@ -124,7 +124,7 @@ ; CHECK-NEXT: s_mov_b64 exec, s[2:3] ; CHECK-NEXT: v_mov_b32_e32 v0, v1 ; CHECK-NEXT: v_add_f32_e32 v0, v0, v0 -; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec +; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; CHECK-NEXT: ; return to shader part epilog main_body: %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) @@ -156,7 +156,7 @@ ; CHECK-NEXT: s_mov_b64 exec, s[2:3] ; CHECK-NEXT: v_mov_b32_e32 v0, v1 ; CHECK-NEXT: v_add_f32_e32 v0, v0, v0 -; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec +; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; CHECK-NEXT: ; return to shader part epilog main_body: %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) @@ -191,7 +191,7 @@ ; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_add_f32_e32 v2, v0, v1 -; CHECK-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec killed $exec +; CHECK-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec ; CHECK-NEXT: .LBB6_4: ; %END ; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] ; CHECK-NEXT: v_mov_b32_e32 v0, v2 @@ -246,7 +246,7 @@ ; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_add_f32_e32 v2, v0, v1 -; CHECK-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec killed $exec +; CHECK-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec ; CHECK-NEXT: .LBB7_4: ; %END ; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] ; CHECK-NEXT: s_and_b64 exec, exec, s[14:15] diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -204,7 +204,7 @@ ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec +; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] ; GFX9-W64-NEXT: ; return to shader part epilog ; @@ -219,7 +219,7 @@ ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec +; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2 ; GFX10-W32-NEXT: ; return to shader part epilog main_body: @@ -243,7 +243,7 @@ ; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec +; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] ; GFX9-W64-NEXT: ; return to shader part epilog ; @@ -258,7 +258,7 @@ ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec +; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2 ; GFX10-W32-NEXT: ; return to shader part epilog main_body: @@ -496,7 +496,7 @@ ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0 -; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec +; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] ; GFX9-W64-NEXT: ; return to shader part epilog ; @@ -518,7 +518,7 @@ ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0 -; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec +; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2 ; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-W32-NEXT: ; return to shader part epilog @@ -962,7 +962,7 @@ ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0 -; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec +; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] ; GFX9-W64-NEXT: ; return to shader part epilog ; @@ -986,7 +986,7 @@ ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0 -; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec +; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2 ; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-W32-NEXT: ; return to shader part epilog @@ -1176,7 +1176,7 @@ ; GFX9-W64-NEXT: s_nop 0 ; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen ; GFX9-W64-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $scc killed $exec -; GFX9-W64-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec killed $exec +; GFX9-W64-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: v_add_u32_e32 v1, v2, v1 @@ -1193,7 +1193,7 @@ ; GFX10-W32-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen ; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen ; GFX10-W32-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $scc killed $exec -; GFX10-W32-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec killed $exec +; GFX10-W32-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: v_add_nc_u32_e32 v1, v1, v2 @@ -2500,7 +2500,7 @@ ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0 -; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec +; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] ; GFX9-W64-NEXT: ; return to shader part epilog ; @@ -2522,7 +2522,7 @@ ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0 -; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec +; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2 ; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-W32-NEXT: ; return to shader part epilog