Index: lib/Target/AMDGPU/SIFixSGPRCopies.cpp
===================================================================
--- lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -85,18 +85,6 @@
 private:
   static char ID;
 
-  std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
-  getCopyRegClasses(const MachineInstr &Copy,
-                    const SIRegisterInfo &TRI,
-                    const MachineRegisterInfo &MRI) const;
-
-  bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
-                        const TargetRegisterClass *DstRC,
-                        const SIRegisterInfo &TRI) const;
-
-  bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
-                        const TargetRegisterClass *DstRC,
-                        const SIRegisterInfo &TRI) const;
 
 public:
   SIFixSGPRCopies(TargetMachine &tm) : MachineFunctionPass(ID) { }
@@ -134,10 +122,10 @@
   return false;
 }
 
-std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
-SIFixSGPRCopies::getCopyRegClasses(const MachineInstr &Copy,
-                                   const SIRegisterInfo &TRI,
-                                   const MachineRegisterInfo &MRI) const {
+static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
+getCopyRegClasses(const MachineInstr &Copy,
+                  const SIRegisterInfo &TRI,
+                  const MachineRegisterInfo &MRI) {
   unsigned DstReg = Copy.getOperand(0).getReg();
   unsigned SrcReg = Copy.getOperand(1).getReg();
 
@@ -157,18 +145,95 @@
   return std::make_pair(SrcRC, DstRC);
 }
 
-bool SIFixSGPRCopies::isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
-                                       const TargetRegisterClass *DstRC,
-                                       const SIRegisterInfo &TRI) const {
+static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
+                             const TargetRegisterClass *DstRC,
+                             const SIRegisterInfo &TRI) {
   return TRI.isSGPRClass(DstRC) && TRI.hasVGPRs(SrcRC);
 }
 
-bool SIFixSGPRCopies::isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
-                                       const TargetRegisterClass *DstRC,
-                                       const SIRegisterInfo &TRI) const {
+static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
+                             const TargetRegisterClass *DstRC,
+                             const SIRegisterInfo &TRI) {
   return TRI.isSGPRClass(SrcRC) && TRI.hasVGPRs(DstRC);
 }
 
+// Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE.
+//
+// SGPRx = ...
+// SGPRy = REG_SEQUENCE SGPRx, sub0 ...
+// VGPRz = COPY SGPRy
+//
+// ==>
+//
+// VGPRx = COPY SGPRx
+// VGPRz = REG_SEQUENCE VGPRx, sub0
+//
+// This exposes immediate folding opportunities when materializing 64-bit
+// immediates.
+static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
+                                        const SIRegisterInfo *TRI,
+                                        const SIInstrInfo *TII,
+                                        MachineRegisterInfo &MRI) {
+  assert(MI.isRegSequence());
+
+  unsigned DstReg = MI.getOperand(0).getReg();
+  if (!TRI->isSGPRClass(MRI.getRegClass(DstReg)))
+    return false;
+
+  if (!MRI.hasOneUse(DstReg))
+    return false;
+
+  MachineInstr &CopyUse = *MRI.use_instr_begin(DstReg);
+  if (!CopyUse.isCopy())
+    return false;
+
+  const TargetRegisterClass *SrcRC, *DstRC;
+  std::tie(SrcRC, DstRC) = getCopyRegClasses(CopyUse, *TRI, MRI);
+
+  if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI))
+    return false;
+
+  // TODO: Could have multiple extracts?
+  unsigned SubReg = CopyUse.getOperand(1).getSubReg();
+  if (SubReg != AMDGPU::NoSubRegister)
+    return false;
+
+  MRI.setRegClass(DstReg, DstRC);
+
+  // SGPRx = ...
+  // SGPRy = REG_SEQUENCE SGPRx, sub0 ...
+  // VGPRz = COPY SGPRy
+
+  // =>
+  // VGPRx = COPY SGPRx
+  // VGPRz = REG_SEQUENCE VGPRx, sub0
+
+  MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg());
+
+  for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
+    unsigned SrcReg = MI.getOperand(I).getReg();
+    unsigned SrcSubReg = MI.getOperand(I).getSubReg();
+
+    const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+    assert(TRI->isSGPRClass(SrcRC) &&
+           "Expected SGPR REG_SEQUENCE to only have SGPR inputs");
+
+    SrcRC = TRI->getSubRegClass(SrcRC, SrcSubReg);
+    const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC);
+
+    unsigned TmpReg = MRI.createVirtualRegister(NewSrcRC);
+
+    BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), TmpReg)
+      .addOperand(MI.getOperand(I));
+
+    MI.getOperand(I).setReg(TmpReg);
+  }
+
+  CopyUse.eraseFromParent();
+  return true;
+}
+
+
 bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
   MachineRegisterInfo &MRI = MF.getRegInfo();
   const SIRegisterInfo *TRI =
@@ -273,8 +338,10 @@
       }
       case AMDGPU::REG_SEQUENCE: {
         if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) ||
-            !hasVGPROperands(MI, TRI))
+            !hasVGPROperands(MI, TRI)) {
+          foldVGPRCopyIntoRegSequence(MI, TRI, TII, MRI);
           continue;
+        }
 
         DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI);
Index: test/CodeGen/AMDGPU/imm.ll
===================================================================
--- test/CodeGen/AMDGPU/imm.ll
+++ test/CodeGen/AMDGPU/imm.ll
@@ -3,8 +3,7 @@
 
 ; Use a 64-bit value with lo bits that can be represented as an inline constant
 ; CHECK-LABEL: {{^}}i64_imm_inline_lo:
-; CHECK: s_mov_b32 [[LO:s[0-9]+]], 5
-; CHECK: v_mov_b32_e32 v[[LO_VGPR:[0-9]+]], [[LO]]
+; CHECK: v_mov_b32_e32 v[[LO_VGPR:[0-9]+]], 5
 ; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VGPR]]:
 define void @i64_imm_inline_lo(i64 addrspace(1) *%out) {
 entry:
@@ -14,8 +13,7 @@
 
 ; Use a 64-bit value with hi bits that can be represented as an inline constant
 ; CHECK-LABEL: {{^}}i64_imm_inline_hi:
-; CHECK: s_mov_b32 [[HI:s[0-9]+]], 5
-; CHECK: v_mov_b32_e32 v[[HI_VGPR:[0-9]+]], [[HI]]
+; CHECK: v_mov_b32_e32 v[[HI_VGPR:[0-9]+]], 5
 ; CHECK: buffer_store_dwordx2 v{{\[[0-9]+:}}[[HI_VGPR]]
 define void @i64_imm_inline_hi(i64 addrspace(1) *%out) {
 entry:
@@ -24,10 +22,8 @@
 }
 
 ; CHECK-LABEL: {{^}}store_imm_neg_0.0_i64:
-; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x80000000
-; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}}
-; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
-; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
+; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x80000000
 ; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
 define void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) {
   store i64 -9223372036854775808, i64 addrspace(1) *%out
@@ -523,10 +519,8 @@
 
 ; CHECK-LABEL: {{^}}store_literal_imm_neg_0.0_f64:
-; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x80000000
-; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}}
-; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
-; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
+; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x80000000
 ; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
 define void @store_literal_imm_neg_0.0_f64(double addrspace(1)* %out) {
   store double -0.0, double addrspace(1)* %out
@@ -606,10 +600,8 @@
 }
 
 ; CHECK-LABEL: {{^}}store_literal_imm_f64:
-; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x40b00000
-; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}}
-; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
-; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
+; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x40b00000
 ; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
 define void @store_literal_imm_f64(double addrspace(1)* %out) {
   store double 4096.0, double addrspace(1)* %out
Index: test/CodeGen/AMDGPU/merge-stores.ll
===================================================================
--- test/CodeGen/AMDGPU/merge-stores.ll
+++ test/CodeGen/AMDGPU/merge-stores.ll
@@ -68,10 +68,8 @@
 }
 
 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i32:
-; SI-DAG: s_movk_i32 [[SLO:s[0-9]+]], 0x1c8
-; SI-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b
-; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[SLO]]
-; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHI]]
+; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
+; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
 define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
@@ -92,10 +90,8 @@
 }
 
 ; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32:
-; SI-DAG: s_mov_b32 [[SLO:s[0-9]+]], 4.0
-; SI-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b{{$}}
-; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[SLO]]
-; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[SHI]]
+; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0
+; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b
 ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
 define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
Index: test/CodeGen/AMDGPU/shl.ll
===================================================================
--- test/CodeGen/AMDGPU/shl.ll
+++ test/CodeGen/AMDGPU/shl.ll
@@ -185,8 +185,7 @@
 ; Make sure load width gets reduced to i32 load.
 ; GCN-LABEL: {{^}}s_shl_32_i64:
 ; GCN-DAG: s_load_dword [[LO_A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb{{$}}
-; GCN-DAG: s_mov_b32 s[[SLO:[0-9]+]], 0{{$}}
-; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
+; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[LO_A]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
 define void @s_shl_32_i64(i64 addrspace(1)* %out, i64 %a) {
Index: test/CodeGen/AMDGPU/srl.ll
===================================================================
--- test/CodeGen/AMDGPU/srl.ll
+++ test/CodeGen/AMDGPU/srl.ll
@@ -190,8 +190,7 @@
 ; Make sure load width gets reduced to i32 load.
 ; GCN-LABEL: {{^}}s_lshr_32_i64:
 ; GCN-DAG: s_load_dword [[HI_A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc{{$}}
-; GCN-DAG: s_mov_b32 s[[SHI:[0-9]+]], 0{{$}}
-; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
+; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[HI_A]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
 define void @s_lshr_32_i64(i64 addrspace(1)* %out, i64 %a) {
Index: test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
===================================================================
--- test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
+++ test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
@@ -245,18 +245,16 @@
 ; GCN-LABEL: {{^}}test_s0_s1_k_f64:
 ; GCN-DAG: s_load_dwordx2 [[SGPR0:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
 ; GCN-DAG: s_load_dwordx2 s{{\[}}[[SGPR1_SUB0:[0-9]+]]:[[SGPR1_SUB1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}}
-; GCN-DAG: s_mov_b32 s[[SK0_SUB1:[0-9]+]], 0x40900000
-; GCN-DAG: s_mov_b32 s[[SZERO:[0-9]+]], 0{{$}}
-; GCN: v_mov_b32_e32 v[[VK0_SUB0:[0-9]+]], s[[SZERO]]
-; GCN: v_mov_b32_e32 v[[VK0_SUB1:[0-9]+]], s[[SK0_SUB1]]
+; GCN-DAG: v_mov_b32_e32 v[[VK0_SUB1:[0-9]+]], 0x40900000
+; GCN-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0{{$}}
 
-; GCN-DAG: s_mov_b32 s[[SK1_SUB1:[0-9]+]], 0x40b00000{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[VS1_SUB0:[0-9]+]], s[[SGPR1_SUB0]]
 ; GCN-DAG: v_mov_b32_e32 v[[VS1_SUB1:[0-9]+]], s[[SGPR1_SUB1]]
-; GCN-DAG: v_mov_b32_e32 v[[VK1_SUB0:[0-9]+]], s[[SZERO]]
-; GCN-DAG: v_mov_b32_e32 v[[VK1_SUB1:[0-9]+]], s[[SK1_SUB1]]
-; GCN-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, v{{\[}}[[VK0_SUB0]]:[[VK0_SUB1]]{{\]}}
-; GCN-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, v{{\[}}[[VK1_SUB0]]:[[VK1_SUB1]]{{\]}}
+; GCN: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, [[SGPR0]], v{{\[}}[[VZERO]]:[[VK0_SUB1]]{{\]}}
+
+; Same zero component is re-used for half of each immediate.
+; GCN: v_mov_b32_e32 v[[VK1_SUB1:[0-9]+]], 0x40b00000
+; GCN: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, [[SGPR0]], v{{\[}}[[VZERO]]:[[VK1_SUB1]]{{\]}}
 
 ; GCN: buffer_store_dwordx2 [[RESULT0]]
 ; GCN: buffer_store_dwordx2 [[RESULT1]]
Index: test/CodeGen/AMDGPU/zero_extend.ll
===================================================================
--- test/CodeGen/AMDGPU/zero_extend.ll
+++ test/CodeGen/AMDGPU/zero_extend.ll
@@ -7,8 +7,7 @@
 ; R600: MEM_RAT_CACHELESS STORE_RAW
 
 ; SI: {{^}}test:
-; SI: s_mov_b32 [[ZERO:s[0-9]]], 0{{$}}
-; SI: v_mov_b32_e32 v[[V_ZERO:[0-9]]], [[ZERO]]
+; SI: v_mov_b32_e32 v[[V_ZERO:[0-9]]], 0{{$}}
 ; SI: buffer_store_dwordx2 v[0:[[V_ZERO]]{{\]}}
 define void @test(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
 entry: