Index: lib/Target/AMDGPU/SIRegisterInfo.h
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.h
+++ lib/Target/AMDGPU/SIRegisterInfo.h
@@ -264,6 +264,13 @@
   ArrayRef<int16_t> getRegSplitParts(const TargetRegisterClass *RC,
                                      unsigned EltSize) const;
 
+  bool shouldCoalesce(MachineInstr *MI,
+                      const TargetRegisterClass *SrcRC,
+                      unsigned SubReg,
+                      const TargetRegisterClass *DstRC,
+                      unsigned DstSubReg,
+                      const TargetRegisterClass *NewRC) const override;
+
 private:
   void buildSpillLoadStore(MachineBasicBlock::iterator MI,
                            unsigned LoadStoreOp,
Index: lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1474,3 +1474,23 @@
                              unsigned Reg) const {
   return hasVGPRs(getRegClassForReg(MRI, Reg));
 }
+
+bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
+                                    const TargetRegisterClass *SrcRC,
+                                    unsigned SubReg,
+                                    const TargetRegisterClass *DstRC,
+                                    unsigned DstSubReg,
+                                    const TargetRegisterClass *NewRC) const {
+  unsigned SrcSize = SrcRC->getSize();
+  unsigned DstSize = DstRC->getSize();
+  unsigned NewSize = NewRC->getSize();
+
+  // Do not increase the size of registers beyond a dword; we would need to
+  // allocate adjacent registers and constrain regalloc more than needed.
+
+  // Always allow dword and sub-dword coalescing.
+  if (SrcSize <= 4 || DstSize <= 4)
+    return true;
+
+  return NewSize <= DstSize || NewSize <= SrcSize;
+}
Index: test/CodeGen/AMDGPU/half.ll
===================================================================
--- test/CodeGen/AMDGPU/half.ll
+++ test/CodeGen/AMDGPU/half.ll
@@ -399,15 +399,15 @@
 ; XVI-NOT: v_cvt_f32_f16
 
 ; GCN: buffer_load_dwordx2 v{{\[}}[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]]
-; VI: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]]
-; GCN: v_cvt_f32_f16_e32 [[Z32:v[0-9]+]], v[[IN_HI]]
-; GCN: v_cvt_f32_f16_e32 [[X32:v[0-9]+]], v[[IN_LO]]
-; SI: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]]
-; GCN: v_cvt_f32_f16_e32 [[Y32:v[0-9]+]], [[Y16]]
-
-; GCN: v_cvt_f64_f32_e32 [[Z:v\[[0-9]+:[0-9]+\]]], [[Z32]]
-; GCN: v_cvt_f64_f32_e32 v{{\[}}[[XLO:[0-9]+]]:{{[0-9]+}}], [[X32]]
-; GCN: v_cvt_f64_f32_e32 v[{{[0-9]+}}:[[YHI:[0-9]+]]{{\]}}, [[Y32]]
+; VI-DAG: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]]
+; GCN-DAG: v_cvt_f32_f16_e32 [[Z32:v[0-9]+]], v[[IN_HI]]
+; GCN-DAG: v_cvt_f32_f16_e32 [[X32:v[0-9]+]], v[[IN_LO]]
+; SI: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]]
+; GCN-DAG: v_cvt_f32_f16_e32 [[Y32:v[0-9]+]], [[Y16]]
+
+; GCN-DAG: v_cvt_f64_f32_e32 [[Z:v\[[0-9]+:[0-9]+\]]], [[Z32]]
+; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[XLO:[0-9]+]]:{{[0-9]+}}], [[X32]]
+; GCN-DAG: v_cvt_f64_f32_e32 v[{{[0-9]+}}:[[YHI:[0-9]+]]{{\]}}, [[Y32]]
 ; GCN-NOT: v_cvt_f64_f32_e32
 
 ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[XLO]]:[[YHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
Index: test/CodeGen/AMDGPU/limit-coalesce.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/limit-coalesce.mir
@@ -0,0 +1,54 @@
+# RUN: llc -march=amdgcn -run-pass simple-register-coalescing -o - %s | FileCheck %s
+
+# Check that the coalescer does not create a 96-bit register tuple and instead
+# uses a 64-bit register for addressing.
+
+# CHECK: - { id: 2, class: vreg_64 }
+# CHECK: - { id: 3, class: vreg_64 }
+# CHECK: - { id: 4, class: vreg_64 }
+# No more registers shall be defined
+# CHECK-NEXT: liveins:
+# CHECK: FLAT_STORE_DWORDX2 %4,
+
+---
+name: main
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+  - { id: 1, class: sreg_32_xm0, preferred-register: '%1' }
+  - { id: 2, class: vreg_64, preferred-register: '%2' }
+  - { id: 3, class: vreg_64 }
+  - { id: 4, class: vreg_64 }
+  - { id: 5, class: vreg_64 }
+liveins:
+  - { reg: '%sgpr6', virtual-reg: '%1' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 0
+  adjustsStack: false
+  hasCalls: false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+body: |
+  bb.0.entry:
+    liveins: %sgpr0, %vgpr0_vgpr1
+
+    %3 = IMPLICIT_DEF
+    undef %4.sub0 = COPY %sgpr0
+    %4.sub1 = COPY killed %3.sub0
+    undef %5.sub0 = COPY %4.sub1
+    %5.sub1 = COPY %4.sub0
+    FLAT_STORE_DWORDX2 killed %5, killed %vgpr0_vgpr1, 0, 0, 0, implicit %exec, implicit %flat_scr
+
+...
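
For reference, the check added in SIRegisterInfo.cpp reduces to a pure size comparison:
dword and sub-dword copies always coalesce, while wider copies coalesce only when the
combined register class is no wider than the wider of the two original classes, so the
coalescer never grows two 64-bit registers into a 96-bit (or larger) tuple. Below is a
minimal standalone sketch of that comparison, not part of the patch; the helper name
shouldCoalesceSizes and the sample values are illustrative only (getSize() reports sizes
in bytes, so 4 corresponds to one dword).

#include <cstdio>

// Mirrors the size comparison in SIRegisterInfo::shouldCoalesce above.
// All sizes are in bytes.
static bool shouldCoalesceSizes(unsigned SrcSize, unsigned DstSize,
                                unsigned NewSize) {
  // Dword and sub-dword coalescing is always allowed.
  if (SrcSize <= 4 || DstSize <= 4)
    return true;
  // Otherwise coalesce only if the result is no wider than either input.
  return NewSize <= DstSize || NewSize <= SrcSize;
}

int main() {
  // Two 64-bit registers that would have to merge into a 96-bit tuple are
  // rejected; this is the case limit-coalesce.mir guards against.
  printf("%d\n", shouldCoalesceSizes(8, 8, 12)); // prints 0
  // A dword copy into a 64-bit tuple is always accepted.
  printf("%d\n", shouldCoalesceSizes(4, 8, 8));  // prints 1
  return 0;
}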