diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -175,10 +175,12 @@ unsigned OffsetBits) const; std::pair<Register, unsigned> - selectDS1Addr1OffsetImpl(MachineOperand &Src) const; - + selectDS1Addr1OffsetImpl(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectDS1Addr1Offset(MachineOperand &Root) const; + + std::pair<Register, unsigned> + selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectDS64Bit4ByteAligned(MachineOperand &Root) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2374,18 +2374,16 @@ return std::make_pair(Root.getReg(), 0); int64_t ConstAddr = 0; - if (isBaseWithConstantOffset(Root, *MRI)) { - const MachineOperand &LHS = RootDef->getOperand(1); - const MachineOperand &RHS = RootDef->getOperand(2); - const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg()); - const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg()); - if (LHSDef && RHSDef) { - int64_t PossibleOffset = - RHSDef->getOperand(1).getCImm()->getSExtValue(); - if (isDSOffsetLegal(LHS.getReg(), PossibleOffset, 16)) { - // (add n0, c0) - return std::make_pair(LHS.getReg(), PossibleOffset); - } + + Register PtrBase; + int64_t Offset; + std::tie(PtrBase, Offset) = + getPtrBaseWithConstantOffset(Root.getReg(), *MRI); + + if (Offset) { + if (isDSOffsetLegal(PtrBase, Offset, 16)) { + // (add n0, c0) + return std::make_pair(PtrBase, Offset); } } else if (RootDef->getOpcode() == AMDGPU::G_SUB) { // TODO @@ -2401,7 +2399,6 @@ InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const { - Register Reg; unsigned Offset; 
std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root); @@ -2413,19 +2410,26 @@ InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const { + Register Reg; + unsigned Offset; + std::tie(Reg, Offset) = selectDS64Bit4ByteAlignedImpl(Root); + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); } + }}; +} + +std::pair<Register, unsigned> +AMDGPUInstructionSelector::selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) const { const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg()); - if (!RootDef) { - return {{ - [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(1); } - }}; - } + if (!RootDef) + return std::make_pair(Root.getReg(), 0); int64_t ConstAddr = 0; + Register PtrBase; int64_t Offset; - std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Root.getReg(), *MRI); @@ -2434,11 +2438,7 @@ int64_t DWordOffset1 = DWordOffset0 + 1; if (isDSOffsetLegal(PtrBase, DWordOffset1, 8)) { // (add n0, c0) - return {{ - [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(DWordOffset0); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(DWordOffset1); } - }}; + return std::make_pair(PtrBase, DWordOffset0); } } else if (RootDef->getOpcode() == AMDGPU::G_SUB) { // TODO @@ -2448,11 +2448,7 @@ } - return {{ - [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(1); } - }}; + return std::make_pair(Root.getReg(), 0); } /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll @@ -1141,10 +1141,9 @@ ; CI-NEXT: v_add_i32_e32 v2, vcc, 2, v0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_add_i32_e32 v0, vcc, 0, v0 -; CI-NEXT: v_add_i32_e32 v0, vcc, 8, v0 ; CI-NEXT: v_mov_b32_e32 v1, 9 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: ds_dec_rtn_u32 v3, v0, v1 +; CI-NEXT: ds_dec_rtn_u32 v3, v0, v1 offset:8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1160,10 +1159,9 @@ ; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 8, v0 ; VI-NEXT: v_mov_b32_e32 v1, 9 ; VI-NEXT: s_mov_b32 m0, -1 -; VI-NEXT: ds_dec_rtn_u32 v3, v0, v1 +; VI-NEXT: ds_dec_rtn_u32 v3, v0, v1 offset:8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1708,15 +1706,14 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { ; CI-LABEL: atomic_dec_shl_base_lds_0_i64: ; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_add_i32_e32 v4, vcc, 2, v0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_add_i32_e32 v0, vcc, 0, v0 -; CI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; CI-NEXT: v_add_i32_e32 v2, vcc, 0, v0 ; CI-NEXT: v_mov_b32_e32 v0, 9 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] +; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:16 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 @@ -1728,15 +1725,14 @@ ; ; VI-LABEL: atomic_dec_shl_base_lds_0_i64: ; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; VI-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0, v0 ; VI-NEXT: v_mov_b32_e32 v0, 9 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 -; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] +; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:16 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll @@ -506,10 +506,9 @@ ; CI-NEXT: v_add_i32_e32 v2, vcc, 2, v0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_add_i32_e32 v0, vcc, 0, v0 -; CI-NEXT: v_add_i32_e32 v0, vcc, 8, v0 ; CI-NEXT: v_mov_b32_e32 v1, 9 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: ds_inc_rtn_u32 v3, v0, v1 +; CI-NEXT: ds_inc_rtn_u32 v3, v0, v1 offset:8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -525,10 +524,9 @@ ; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 8, v0 ; VI-NEXT: v_mov_b32_e32 v1, 9 ; VI-NEXT: s_mov_b32 m0, -1 -; VI-NEXT: ds_inc_rtn_u32 v3, v0, v1 +; VI-NEXT: ds_inc_rtn_u32 v3, v0, v1 offset:8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -544,9 +542,8 @@ ; GFX9-NEXT: v_add_u32_e32 v2, 2, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_add_u32_e32 v0, 0, v0 -; GFX9-NEXT: v_add_u32_e32 v0, 8, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 9 -; GFX9-NEXT: ds_inc_rtn_u32 v3, v0, v1 +; GFX9-NEXT: ds_inc_rtn_u32 v3, v0, v1 offset:8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -1279,15 +1276,14 @@ define amdgpu_kernel void 
@atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { ; CI-LABEL: atomic_inc_shl_base_lds_0_i64: ; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_add_i32_e32 v4, vcc, 2, v0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_add_i32_e32 v0, vcc, 0, v0 -; CI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; CI-NEXT: v_add_i32_e32 v2, vcc, 0, v0 ; CI-NEXT: v_mov_b32_e32 v0, 9 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] +; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:16 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 @@ -1299,15 +1295,14 @@ ; ; VI-LABEL: atomic_inc_shl_base_lds_0_i64: ; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0, v0 ; VI-NEXT: v_mov_b32_e32 v0, 9 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 -; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] +; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:16 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -1319,14 +1314,13 @@ ; ; GFX9-LABEL: atomic_inc_shl_base_lds_0_i64: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_add_u32_e32 v4, 2, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_add_u32_e32 v0, 0, v0 -; GFX9-NEXT: v_add_u32_e32 v2, 16, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 9 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] +; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:16 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: 
v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3