Index: llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -643,33 +643,33 @@
   if (FoldingImmLike && UseMI->isCopy()) {
     Register DestReg = UseMI->getOperand(0).getReg();
+    Register SrcReg = UseMI->getOperand(1).getReg();
+    assert(SrcReg.isVirtual());
 
-    // Don't fold into a copy to a physical register. Doing so would interfere
-    // with the register coalescer's logic which would avoid redundant
-    // initalizations.
-    if (DestReg.isPhysical())
-      return;
+    const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
 
-    const TargetRegisterClass *DestRC = MRI->getRegClass(DestReg);
+    // Don't fold into a copy to a physical register with the same class. Doing
+    // so would interfere with the register coalescer's logic which would avoid
+    // redundant initalizations.
+    if (DestReg.isPhysical() && SrcRC->contains(DestReg))
+      return;
 
-    Register SrcReg = UseMI->getOperand(1).getReg();
-    if (SrcReg.isVirtual()) { // XXX - This can be an assert?
-      const TargetRegisterClass * SrcRC = MRI->getRegClass(SrcReg);
-      if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
-        MachineRegisterInfo::use_iterator NextUse;
-        SmallVector<FoldCandidate, 4> CopyUses;
-        for (MachineRegisterInfo::use_iterator
-               Use = MRI->use_begin(DestReg), E = MRI->use_end();
-             Use != E; Use = NextUse) {
-          NextUse = std::next(Use);
-          FoldCandidate FC = FoldCandidate(Use->getParent(),
-            Use.getOperandNo(), &UseMI->getOperand(1));
-          CopyUses.push_back(FC);
-        }
-        for (auto & F : CopyUses) {
-          foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo,
-                      FoldList, CopiesToReplace);
-        }
+    const TargetRegisterClass *DestRC =
+        TRI->getRegClassForReg(*MRI, DestReg);
+    if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
+      MachineRegisterInfo::use_iterator NextUse;
+      SmallVector<FoldCandidate, 4> CopyUses;
+      for (MachineRegisterInfo::use_iterator
+             Use = MRI->use_begin(DestReg), E = MRI->use_end();
+           Use != E; Use = NextUse) {
+        NextUse = std::next(Use);
+        FoldCandidate FC = FoldCandidate(Use->getParent(),
+                                         Use.getOperandNo(), &UseMI->getOperand(1));
+        CopyUses.push_back(FC);
+      }
+      for (auto & F : CopyUses) {
+        foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo,
+                    FoldList, CopiesToReplace);
       }
     }
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll
@@ -115,10 +115,8 @@
 }
 
 ; ALL-LABEL: {{^}}func_kernarg_segment_ptr:
-; ALL: s_mov_b32 [[S_LO:s[0-9]+]], 0{{$}}
-; ALL: s_mov_b32 [[S_HI:s[0-9]+]], 0{{$}}
-; ALL: v_mov_b32_e32 v0, [[S_LO]]{{$}}
-; ALL: v_mov_b32_e32 v1, [[S_HI]]{{$}}
+; ALL: v_mov_b32_e32 v0, 0{{$}}
+; ALL: v_mov_b32_e32 v1, 0{{$}}
 define i8 addrspace(4)* @func_kernarg_segment_ptr() {
   %ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
   ret i8 addrspace(4)* %ptr
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll
@@ -139,9 +139,8 @@
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    s_mov_b32 s4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -149,9 +148,8 @@
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_mov_b32 s4, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -162,9 +160,8 @@
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX6-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT:    s_mov_b32 s4, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0
-; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %load = load i32, i32 addrspace(1)* %ptr
@@ -177,11 +174,9 @@
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    s_mov_b32 s5, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -189,11 +184,9 @@
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_mov_b32 s4, 0
-; GFX8-NEXT:    s_mov_b32 s5, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -204,11 +197,9 @@
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX6-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT:    s_mov_b32 s4, 0
-; GFX6-NEXT:    s_mov_b32 s5, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0
-; GFX6-NEXT:    v_mov_b32_e32 v2, s4
-; GFX6-NEXT:    v_mov_b32_e32 v3, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, 0
+; GFX6-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %load = load i32, i32 addrspace(1)* %ptr
Index: llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
+++ llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
@@ -59,3 +59,31 @@
     S_ENDPGM 0, implicit %1, implicit %2
 
 ...
+
+# GCN-LABEL: name: no_fold_imm_into_m0{{$}}
+# GCN: %0:sreg_32 = S_MOV_B32 -8
+# GCN-NEXT: $m0 = COPY %0
+
+---
+name: no_fold_imm_into_m0
+tracksRegLiveness: true
+body: |
+  bb.0:
+    %0:sreg_32 = S_MOV_B32 -8
+    $m0 = COPY %0
+    S_ENDPGM 0, implicit $m0
+
+...
+
+# GCN-LABEL: name: fold_sgpr_imm_to_vgpr_copy{{$}}
+# GCN: $vgpr0 = V_MOV_B32_e32 -8, implicit $exec
+---
+name: fold_sgpr_imm_to_vgpr_copy
+tracksRegLiveness: true
+body: |
+  bb.0:
+    %0:sreg_32 = S_MOV_B32 -8
+    $vgpr0 = COPY %0
+    S_ENDPGM 0, implicit $vgpr0
+
+...