Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h +++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h @@ -567,6 +567,12 @@ void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const; + void legalizeGenericOperand(MachineBasicBlock &InsertMBB, + MachineBasicBlock::iterator I, + const TargetRegisterClass *DstRC, + MachineOperand &Op, MachineRegisterInfo &MRI, + const DebugLoc &DL) const; + /// \brief Legalize all operands in this instruction. This function may /// create new instruction and insert them before \p MI. void legalizeOperands(MachineInstr &MI) const; Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2546,6 +2546,39 @@ } } +void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, + MachineBasicBlock::iterator I, + const TargetRegisterClass *DstRC, + MachineOperand &Op, + MachineRegisterInfo &MRI, + const DebugLoc &DL) const { + + unsigned OpReg = Op.getReg(); + unsigned OpSubReg = Op.getSubReg(); + + const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg( + RI.getRegClassForReg(MRI, OpReg), OpSubReg); + + // Check if operand is already the correct register class. + if (DstRC == OpRC) + return; + + unsigned DstReg = MRI.createVirtualRegister(DstRC); + MachineInstr *Copy = BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg) + .addOperand(Op); + + Op.setReg(DstReg); + Op.setSubReg(0); + + MachineInstr *Def = MRI.getVRegDef(OpReg); + if (!Def) + return; + + // Try to eliminate the copy if it is copying an immediate value. 
+ if (Def->isMoveImmediate()) + FoldImmediate(*Copy, *Def, OpReg, &MRI); +} + void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); @@ -2603,15 +2636,14 @@ MachineOperand &Op = MI.getOperand(I); if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) continue; - unsigned DstReg = MRI.createVirtualRegister(RC); // MI is a PHI instruction. MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB(); MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator(); - BuildMI(*InsertBB, Insert, MI.getDebugLoc(), get(AMDGPU::COPY), DstReg) - .addOperand(Op); - Op.setReg(DstReg); + // Avoid creating no-op copies with the same src and dst reg class. These + // confuse some of the machine passes. + legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc()); } } @@ -2635,12 +2667,7 @@ if (VRC == OpRC) continue; - unsigned DstReg = MRI.createVirtualRegister(VRC); - - BuildMI(*MBB, MI, MI.getDebugLoc(), get(AMDGPU::COPY), DstReg) - .addOperand(Op); - - Op.setReg(DstReg); + legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc()); Op.setIsKill(); } } @@ -2656,11 +2683,9 @@ const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0); if (DstRC != Src0RC) { - MachineBasicBlock &MBB = *MI.getParent(); - unsigned NewSrc0 = MRI.createVirtualRegister(DstRC); - BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::COPY), NewSrc0) - .addReg(Src0); - MI.getOperand(1).setReg(NewSrc0); + MachineBasicBlock *MBB = MI.getParent(); + MachineOperand &Op = MI.getOperand(1); + legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc()); } return; } @@ -3000,6 +3025,22 @@ continue; unsigned DstReg = Inst.getOperand(0).getReg(); + if (Inst.isCopy() && + TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) && + NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) { + // Instead of creating a copy 
where src and dst are the same register + // class, we just replace all uses of dst with src. These kinds of + // copies interfere with the heuristics MachineSink uses to decide + // whether or not to split a critical edge, since the pass assumes + // that copies will end up as machine instructions and not be + // eliminated. + addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist); + MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg()); + MRI.clearKillFlags(Inst.getOperand(1).getReg()); + Inst.getOperand(0).setReg(DstReg); + continue; + } + NewDstReg = MRI.createVirtualRegister(NewDstRC); MRI.replaceRegWith(DstReg, NewDstReg); } Index: llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h +++ llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h @@ -172,6 +172,8 @@ unsigned getSGPRPressureSet() const { return SGPRSetID; }; unsigned getVGPRPressureSet() const { return VGPRSetID; }; + const TargetRegisterClass *getRegClassForReg(const MachineRegisterInfo &MRI, + unsigned Reg) const; bool isVGPR(const MachineRegisterInfo &MRI, unsigned Reg) const; bool isSGPRPressureSet(unsigned SetID) const { Index: llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1059,17 +1059,6 @@ return AMDGPU::NoRegister; } -bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI, - unsigned Reg) const { - const TargetRegisterClass *RC; - if (TargetRegisterInfo::isVirtualRegister(Reg)) - RC = MRI.getRegClass(Reg); - else - RC = getPhysRegClass(Reg); - - return hasVGPRs(RC); -} - unsigned SIRegisterInfo::getTotalNumSGPRs(const SISubtarget &ST) const { if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) return 800; @@ -1363,3 +1352,17 @@ llvm_unreachable("unhandled register size"); } } + +const 
TargetRegisterClass* +SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI, + unsigned Reg) const { + if (TargetRegisterInfo::isVirtualRegister(Reg)) + return MRI.getRegClass(Reg); + + return getPhysRegClass(Reg); +} + +bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI, + unsigned Reg) const { + return hasVGPRs(getRegClassForReg(MRI, Reg)); +} Index: llvm/trunk/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll +++ llvm/trunk/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll @@ -108,9 +108,9 @@ ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]] -; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:12 ; 8-byte Folded Spill +; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:16 ; 8-byte Folded Spill ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]] -; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:16 ; 8-byte Folded Spill +; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:20 ; 8-byte Folded Spill ; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}} @@ -133,11 +133,11 @@ ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:12 ; 8-byte Folded Reload +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:16 ; 8-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]] -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:16 ; 8-byte Folded Reload +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:20 ; 8-byte Folded Reload ; 
VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]] Index: llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -207,7 +207,6 @@ ; GCN: buffer_load_ushort v{{[0-9]+}}, off ; GCN: buffer_load_ushort v{{[0-9]+}}, off -; GCN: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}} ; GCN: v_mov_b32_e32 [[BASE_FI:v[0-9]+]], 0{{$}} ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:6 @@ -418,7 +417,7 @@ } ; GCN-LABEL: {{^}}dynamic_insertelement_v8f64: -; GCN: SCRATCH_RSRC_DWORD +; GCN-DAG: SCRATCH_RSRC_DWORD ; FIXME: Should be able to eliminate this? @@ -426,8 +425,8 @@ ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:32{{$}} ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:48{{$}} -; GCN: v_mov_b32_e32 [[BASE_FI0:v[0-9]+]], 0{{$}} -; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, [[BASE_FI0]], s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} +; GCN-DAG: v_mov_b32_e32 [[BASE_FI0:v[0-9]+]], 0{{$}} +; GCN: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, [[BASE_FI0]], s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} ; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} Index: llvm/trunk/test/CodeGen/AMDGPU/scratch-buffer.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/scratch-buffer.ll +++ llvm/trunk/test/CodeGen/AMDGPU/scratch-buffer.ll @@ -48,7 +48,9 @@ ; GCN-LABEL: {{^}}legal_offset_fi_offset: ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}} -; GCN-DAG: v_add_i32_e32 [[OFFSET:v[0-9]+]], vcc, 0x8000 +; This constant isn't folded, because 
it has multiple uses. +; GCN-DAG: v_mov_b32_e32 [[K8000:v[0-9]+]], 0x8000 +; GCN-DAG: v_add_i32_e32 [[OFFSET:v[0-9]+]], vcc, [[K8000]] ; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}} define void @legal_offset_fi_offset(i32 addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %offsets, i32 %if_offset, i32 %else_offset) { Index: llvm/trunk/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll +++ llvm/trunk/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll @@ -3,7 +3,7 @@ ; register operands in the correct order when modifying the opcode of an ; instruction to V_ADD_I32_e32. -; CHECK: %{{[0-9]+}} = V_ADD_I32_e32 killed %{{[0-9]+}}, killed %{{[0-9]+}}, implicit-def %vcc, implicit %exec +; CHECK: %{{[0-9]+}} = V_ADD_I32_e32 %{{[0-9]+}}, %{{[0-9]+}}, implicit-def %vcc, implicit %exec define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { entry: