Index: llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1844,6 +1844,26 @@ if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, true)) { Addr = LHS; ImmOffset = COffsetVal; + } else if (!LHS->isDivergent()) { + SDLoc SL(N); + // saddr + large_offset -> saddr + (voffset = large_offset & ~MaxOffset) + + // (large_offset & MaxOffset); + const unsigned NumBits = TII->getNumFlatOffsetBits(true); + + // Use signed division by a power of two to truncate towards 0. + int64_t D = 1LL << (NumBits - 1); + uint64_t RemainderOffset = (static_cast<int64_t>(COffsetVal) / D) * D; + ImmOffset = COffsetVal - RemainderOffset; + + if (isUInt<32>(RemainderOffset)) { + SDNode *VMov = CurDAG->getMachineNode( + AMDGPU::V_MOV_B32_e32, SL, MVT::i32, + CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32)); + VOffset = SDValue(VMov, 0); + SAddr = LHS; + Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16); + return true; + } } } Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -72,6 +72,8 @@ GEPInfo(const MachineInstr &GEP) : GEP(GEP), Imm(0) { } }; + bool isSGPR(Register Reg) const; + bool isInstrUniform(const MachineInstr &MI) const; bool isVCC(Register Reg, const MachineRegisterInfo &MRI) const; Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2298,6 +2298,10 @@ getAddrModeInfo(*PtrMI, MRI, AddrInfo); } +bool AMDGPUInstructionSelector::isSGPR(Register Reg) const { + return RBI.getRegBank(Reg, *MRI, TRI)->getID() == 
AMDGPU::SGPRRegBankID; +} + bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const { if (!MI.hasOneMemOperand()) return false; @@ -3489,28 +3493,63 @@ InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const { Register PtrBase; - int64_t ImmOffset; + int64_t ConstOffset; + int64_t ImmOffset = 0; // Match the immediate offset first, which canonically is moved as low as // possible. - std::tie(PtrBase, ImmOffset) = getPtrBaseWithConstantOffset(Root.getReg(), - *MRI); + std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Root.getReg(), + *MRI); + + auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI); + if (!PtrBaseDef) + return None; // TODO: Could split larger constant into VGPR offset. - if (ImmOffset != 0 && - !TII.isLegalFLATOffset(ImmOffset, AMDGPUAS::GLOBAL_ADDRESS, true)) { + if (ConstOffset != 0 && + TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, true)) { PtrBase = Root.getReg(); - ImmOffset = 0; + ImmOffset = ConstOffset; + } else if (ConstOffset != 0 && isSGPR(PtrBaseDef->Reg)) { + // Offset is too large. + // + // saddr + large_offset -> saddr + (voffset = large_offset & ~MaxOffset) + + // (large_offset & MaxOffset); + const unsigned NumBits = TII.getNumFlatOffsetBits(true); + + // Use signed division by a power of two to truncate towards 0. 
+ int64_t D = 1LL << (NumBits - 1); + uint64_t RemainderOffset = (static_cast<int64_t>(ConstOffset) / D) * D; + ImmOffset = ConstOffset - RemainderOffset; + + if (isUInt<32>(RemainderOffset)) { + MachineInstr *MI = Root.getParent(); + MachineBasicBlock *MBB = MI->getParent(); + Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), + HighBits) + .addImm(RemainderOffset); + + return {{[=](MachineInstrBuilder &MIB) { // saddr + MIB.addReg(PtrBase); + }, + [=](MachineInstrBuilder &MIB) { // voffset + MIB.addReg(HighBits); + }, + [=](MachineInstrBuilder &MIB) { // offset + MIB.addImm(ImmOffset); + }}}; + } } // Match the variable offset. - const MachineInstr *PtrBaseDef = getDefIgnoringCopies(PtrBase, *MRI); - if (PtrBaseDef->getOpcode() != AMDGPU::G_PTR_ADD) + if (PtrBaseDef->MI->getOpcode() != AMDGPU::G_PTR_ADD) return None; // Look through the SGPR->VGPR copy. Register PtrBaseSrc = - getSrcRegIgnoringCopies(PtrBaseDef->getOperand(1).getReg(), *MRI); + getSrcRegIgnoringCopies(PtrBaseDef->MI->getOperand(1).getReg(), *MRI); if (!PtrBaseSrc) return None; @@ -3519,7 +3558,7 @@ return None; Register SAddr = PtrBaseSrc; - Register PtrBaseOffset = PtrBaseDef->getOperand(2).getReg(); + Register PtrBaseOffset = PtrBaseDef->MI->getOperand(2).getReg(); // It's possible voffset is an SGPR here, but the copy to VGPR will be // inserted later. 
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir @@ -800,20 +800,10 @@ ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; GFX10: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX10: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 - ; GFX10: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 - ; GFX10: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 - ; GFX10: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 - ; GFX10: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY3]], [[COPY4]], implicit-def $scc - ; GFX10: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY5]], [[COPY6]], implicit-def $scc, implicit $scc - ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 - ; GFX10: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX10: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] - ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY7]], [[REG_SEQUENCE2]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) - ; GFX10: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_RTN]] + ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec + ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN:%[0-9]+]]:vgpr_32 = 
GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY]], 2047, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX10: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s32) = COPY $vgpr3 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir @@ -2,9 +2,7 @@ # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s # RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s -# TODO: Better to initialize 0 vgpr and use sgpr base --- - name: load_global_s32_from_sgpr legalized: true regBankSelected: true @@ -182,28 +180,8 @@ ; GFX10: $vcc_hi = IMPLICIT_DEF ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[COPY]] - ; GFX10: %zero:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX10: %zext:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, %zero, %subreg.sub1 - ; GFX10: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub0 - ; GFX10: [[COPY4:%[0-9]+]]:vgpr_32 = COPY %zext.sub0 - ; GFX10: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub1 - ; GFX10: [[COPY6:%[0-9]+]]:vgpr_32 = COPY %zext.sub1 - ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec - ; GFX10: %24:vgpr_32, dead %26:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %24, %subreg.sub1 - ; GFX10: 
[[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec - ; GFX10: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 - ; GFX10: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 - ; GFX10: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX10: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 - ; GFX10: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 - ; GFX10: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY7]], [[COPY8]], 0, implicit $exec - ; GFX10: %14:vgpr_32, dead %16:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY9]], [[COPY10]], killed [[V_ADD_CO_U32_e64_3]], 0, implicit $exec - ; GFX10: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_2]], %subreg.sub0, %14, %subreg.sub1 - ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) - ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:vgpr(s32) = COPY $vgpr0 %2:vgpr(p1) = COPY %0 @@ -239,28 +217,8 @@ ; GFX10: $vcc_hi = IMPLICIT_DEF ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[COPY]] - ; GFX10: %zero:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX10: %zext:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, %zero, %subreg.sub1 - ; GFX10: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub0 - ; GFX10: [[COPY4:%[0-9]+]]:vgpr_32 = COPY %zext.sub0 - ; GFX10: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub1 - ; GFX10: 
[[COPY6:%[0-9]+]]:vgpr_32 = COPY %zext.sub1 - ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec - ; GFX10: %24:vgpr_32, dead %26:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %24, %subreg.sub1 - ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec - ; GFX10: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 - ; GFX10: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 - ; GFX10: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX10: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 - ; GFX10: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 - ; GFX10: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY7]], [[COPY8]], 0, implicit $exec - ; GFX10: %14:vgpr_32, dead %16:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY9]], [[COPY10]], killed [[V_ADD_CO_U32_e64_3]], 0, implicit $exec - ; GFX10: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_2]], %subreg.sub0, %14, %subreg.sub1 - ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) - ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:vgpr(s32) = COPY $vgpr0 %2:vgpr(p1) = COPY %0 @@ -273,3 +231,449 @@ $vgpr0 = COPY %7 ... 
+--- +name: load_global_s32_from_sgpr_base_offset_4096 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + + ; GFX9-LABEL: name: load_global_s32_from_sgpr_base_offset_4096 + ; GFX9: liveins: $sgpr0_sgpr1 + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] + ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_4096 + ; GFX10: liveins: $sgpr0_sgpr1 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] + %0:sgpr(p1) = COPY $sgpr0_sgpr1 + %1:sgpr(s64) = G_CONSTANT i64 4096 + %2:sgpr(p1) = G_PTR_ADD %0, %1 + %3:vgpr(p1) = COPY %2 + %4:vgpr(s32) = G_LOAD %3 :: (load 4, align 4, addrspace 1) + $vgpr0 = COPY %4 + +... 
+ +--- +name: load_global_s32_from_sgpr_base_offset_4097 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + + ; GFX9-LABEL: name: load_global_s32_from_sgpr_base_offset_4097 + ; GFX9: liveins: $sgpr0_sgpr1 + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 1, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] + ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_4097 + ; GFX10: liveins: $sgpr0_sgpr1 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 1, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] + %0:sgpr(p1) = COPY $sgpr0_sgpr1 + %1:sgpr(s64) = G_CONSTANT i64 4097 + %2:sgpr(p1) = G_PTR_ADD %0, %1 + %3:vgpr(p1) = COPY %2 + %4:vgpr(s32) = G_LOAD %3 :: (load 4, align 4, addrspace 1) + $vgpr0 = COPY %4 + +... 
+ +--- +name: load_global_s32_from_sgpr_base_offset_neg4097 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + + ; GFX9-LABEL: name: load_global_s32_from_sgpr_base_offset_neg4097 + ; GFX9: liveins: $sgpr0_sgpr1 + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294963199 + ; GFX9: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 + ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX9: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 + ; GFX9: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX9: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc + ; GFX9: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc + ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; GFX9: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_neg4097 + ; GFX10: liveins: $sgpr0_sgpr1 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294963199 + ; GFX10: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 + ; GFX10: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 + ; 
GFX10: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc + ; GFX10: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc + ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; GFX10: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + %0:sgpr(p1) = COPY $sgpr0_sgpr1 + %1:sgpr(s64) = G_CONSTANT i64 -4097 + %2:sgpr(p1) = G_PTR_ADD %0, %1 + %3:vgpr(p1) = COPY %2 + %4:vgpr(s32) = G_LOAD %3 :: (load 4, align 4, addrspace 1) + $vgpr0 = COPY %4 + +... + +--- +name: load_global_s32_from_sgpr_base_offset_2049 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + + ; GFX9-LABEL: name: load_global_s32_from_sgpr_base_offset_2049 + ; GFX9: liveins: $sgpr0_sgpr1 + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2049 + ; GFX9: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 + ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX9: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 + ; GFX9: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX9: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc + ; GFX9: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc + ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; GFX9: 
[[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_2049 + ; GFX10: liveins: $sgpr0_sgpr1 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec + ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 1, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] + %0:sgpr(p1) = COPY $sgpr0_sgpr1 + %1:sgpr(s64) = G_CONSTANT i64 2049 + %2:sgpr(p1) = G_PTR_ADD %0, %1 + %3:vgpr(p1) = COPY %2 + %4:vgpr(s32) = G_LOAD %3 :: (load 4, align 4, addrspace 1) + $vgpr0 = COPY %4 + +... + +--- +name: load_global_s32_from_sgpr_base_offset_neg2049 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + + ; GFX9-LABEL: name: load_global_s32_from_sgpr_base_offset_neg2049 + ; GFX9: liveins: $sgpr0_sgpr1 + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294965247 + ; GFX9: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 + ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX9: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 + ; GFX9: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX9: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc + ; GFX9: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc + ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = 
REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; GFX9: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_neg2049 + ; GFX10: liveins: $sgpr0_sgpr1 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294965247 + ; GFX10: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 + ; GFX10: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 + ; GFX10: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc + ; GFX10: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc + ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; GFX10: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + %0:sgpr(p1) = COPY $sgpr0_sgpr1 + %1:sgpr(s64) = G_CONSTANT i64 -2049 + %2:sgpr(p1) = G_PTR_ADD %0, %1 + %3:vgpr(p1) = COPY %2 + %4:vgpr(s32) = G_LOAD %3 :: (load 4, align 4, addrspace 1) + $vgpr0 = COPY %4 + +... 
+--- +name: load_global_s32_from_sgpr_base_offset_4294967295 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + + ; GFX9-LABEL: name: load_global_s32_from_sgpr_base_offset_4294967295 + ; GFX9: liveins: $sgpr0_sgpr1 + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec + ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 4095, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] + ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_4294967295 + ; GFX10: liveins: $sgpr0_sgpr1 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec + ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 2047, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] + %0:sgpr(p1) = COPY $sgpr0_sgpr1 + %1:sgpr(s64) = G_CONSTANT i64 4294967295 + %2:sgpr(p1) = G_PTR_ADD %0, %1 + %3:vgpr(p1) = COPY %2 + %4:vgpr(s32) = G_LOAD %3 :: (load 4, align 4, addrspace 1) + $vgpr0 = COPY %4 + +... 
+--- +name: load_global_s32_from_sgpr_base_offset_4294967296 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + + ; GFX9-LABEL: name: load_global_s32_from_sgpr_base_offset_4294967296 + ; GFX9: liveins: $sgpr0_sgpr1 + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX9: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 + ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX9: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 + ; GFX9: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX9: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc + ; GFX9: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc + ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; GFX9: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_4294967296 + ; GFX10: liveins: $sgpr0_sgpr1 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX10: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 + ; GFX10: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 + ; GFX10: 
[[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc + ; GFX10: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc + ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; GFX10: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + %0:sgpr(p1) = COPY $sgpr0_sgpr1 + %1:sgpr(s64) = G_CONSTANT i64 4294967296 + %2:sgpr(p1) = G_PTR_ADD %0, %1 + %3:vgpr(p1) = COPY %2 + %4:vgpr(s32) = G_LOAD %3 :: (load 4, align 4, addrspace 1) + $vgpr0 = COPY %4 + +... + +--- +name: load_global_s32_from_sgpr_base_offset_4294971390 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + + ; GFX9-LABEL: name: load_global_s32_from_sgpr_base_offset_4294971390 + ; GFX9: liveins: $sgpr0_sgpr1 + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4094 + ; GFX9: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 + ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX9: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 + ; GFX9: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX9: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc + ; GFX9: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc + ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; GFX9: 
[[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_4294971390 + ; GFX10: liveins: $sgpr0_sgpr1 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4094 + ; GFX10: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 + ; GFX10: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 + ; GFX10: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc + ; GFX10: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc + ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; GFX10: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + %0:sgpr(p1) = COPY $sgpr0_sgpr1 + %1:sgpr(s64) = G_CONSTANT i64 4294971390 + %2:sgpr(p1) = G_PTR_ADD %0, %1 + %3:vgpr(p1) = COPY %2 + %4:vgpr(s32) = G_LOAD %3 :: (load 4, align 4, addrspace 1) + $vgpr0 = COPY %4 + +... 
+ +--- +name: load_global_s32_from_sgpr_base_offset_neg4294967295 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + + ; GFX9-LABEL: name: load_global_s32_from_sgpr_base_offset_neg4294967295 + ; GFX9: liveins: $sgpr0_sgpr1 + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; GFX9: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 + ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX9: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 + ; GFX9: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX9: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc + ; GFX9: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc + ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; GFX9: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_neg4294967295 + ; GFX10: liveins: $sgpr0_sgpr1 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; GFX10: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 + ; GFX10: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 + ; 
GFX10: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc + ; GFX10: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc + ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; GFX10: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + %0:sgpr(p1) = COPY $sgpr0_sgpr1 + %1:sgpr(s64) = G_CONSTANT i64 -4294967295 + %2:sgpr(p1) = G_PTR_ADD %0, %1 + %3:vgpr(p1) = COPY %2 + %4:vgpr(s32) = G_LOAD %3 :: (load 4, align 4, addrspace 1) + $vgpr0 = COPY %4 + +... +--- +name: load_global_s32_from_sgpr_base_offset_neg4294967296 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + + ; GFX9-LABEL: name: load_global_s32_from_sgpr_base_offset_neg4294967296 + ; GFX9: liveins: $sgpr0_sgpr1 + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX9: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 + ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX9: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 + ; GFX9: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX9: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc + ; GFX9: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc + ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; 
GFX9: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_neg4294967296 + ; GFX10: liveins: $sgpr0_sgpr1 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX10: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 + ; GFX10: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 + ; GFX10: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc + ; GFX10: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc + ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; GFX10: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + %0:sgpr(p1) = COPY $sgpr0_sgpr1 + %1:sgpr(s64) = G_CONSTANT i64 -4294967296 + %2:sgpr(p1) = G_PTR_ADD %0, %1 + %3:vgpr(p1) = COPY %2 + %4:vgpr(s32) = G_LOAD %3 :: (load 4, align 4, addrspace 1) + $vgpr0 = COPY %4 + +... 
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll @@ -72,16 +72,13 @@ ; GCN-LABEL: global_atomic_csub_sgpr_base_offset: ; GCN: ; %bb.0: ; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GCN-NEXT: s_load_dword s2, s[4:5], 0x8 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v1, 0x1000 ; GCN-NEXT: ; implicit-def: $vcc_hi ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_add_u32 s0, s0, 0x1000 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: global_atomic_csub v0, v[0:1], v2, off glc +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: global_atomic_csub v0, v1, v0, s[0:1] glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: s_endpgm @@ -95,15 +92,12 @@ ; GCN-LABEL: global_atomic_csub_sgpr_base_offset_nortn: ; GCN: ; %bb.0: ; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GCN-NEXT: s_load_dword s2, s[4:5], 0x8 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v1, 0x1000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_add_u32 s0, s0, 0x1000 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: global_atomic_csub v0, v[0:1], v2, off glc +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: global_atomic_csub v0, v1, v0, s[0:1] glc ; GCN-NEXT: s_endpgm %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 1024 %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* %gep, i32 %data) Index: llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll 
=================================================================== --- llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll +++ llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll @@ -40,11 +40,9 @@ ; GCN-NEXT: global_atomic_csub v0, v[0:1], v2, off offset:28 glc ; GCN-NEXT: BB0_2: ; %endif ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_co_u32_e64 v1, s0, 0x3d0800, s0 -; GCN-NEXT: v_add_co_ci_u32_e64 v2, s0, 0, s1, s0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dword v[1:2], v0, off offset:252 +; GCN-NEXT: v_mov_b32_e32 v1, 0x3d0800 +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: global_store_dword v1, v0, s[0:1] offset:252 ; GCN-NEXT: s_endpgm entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 999999 Index: llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll +++ llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll @@ -40,13 +40,9 @@ ; GCN-NEXT: global_load_dword v0, v[0:1], off ; GCN-NEXT: BB0_2: ; %endif ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: v_add_co_u32_e32 v1, vcc, 0x3d0000, v1 -; GCN-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dword v[1:2], v0, off offset:2300 +; GCN-NEXT: v_mov_b32_e32 v1, 0x3d0000 +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: global_store_dword v1, v0, s[0:1] offset:2300 ; GCN-NEXT: s_endpgm entry: %out.gep = getelementptr float, float addrspace(1)* %out, i32 999999 Index: llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll +++ llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll @@ -45,9 +45,8 @@ ; GCN: s_and_saveexec_b64 
; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} -; GFX9: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0xf000, -; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, 0, -; GFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, off offset:4095{{$}} +; GFX9: v_mov_b32_e32 [[VOFFSET:v[0-9]+]], 0xf000{{$}} +; GFX9: global_load_sbyte {{v[0-9]+}}, [[VOFFSET]], {{s\[[0-9]+:[0-9]+\]}} offset:4095{{$}} ; GCN: {{^}}BB1_2: ; GCN: s_or_b64 exec define amdgpu_kernel void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { @@ -103,7 +102,8 @@ ; GCN-LABEL: {{^}}test_sink_global_small_max_plus_1_mubuf_offset: ; GCN: s_and_saveexec_b64 ; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} -; GFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: v_mov_b32_e32 [[VOFFSET:v[0-9]+]], 0x1000{{$}} +; GFX9: global_load_sbyte {{v[0-9]+}}, [[VOFFSET]], {{s\[[0-9]+:[0-9]+\]$}} ; GCN: {{^}}BB3_2: ; GCN: s_or_b64 exec define amdgpu_kernel void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { Index: llvm/test/CodeGen/AMDGPU/global-saddr-load.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/global-saddr-load.ll +++ llvm/test/CodeGen/AMDGPU/global-saddr-load.ll @@ -4,6 +4,463 @@ ; Test using saddr addressing mode of global_*load_* flat instructions. 
+; -------------------------------------------------------------------------------- +; No vgpr offset, constants +; -------------------------------------------------------------------------------- + +; SGPR base with maximum gfx9 immediate offset +define amdgpu_ps float @global_load_saddr_i8_offset_4095(i8 addrspace(1)* inreg %sbase) { +; GFX9-LABEL: global_load_saddr_i8_offset_4095: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i8_offset_4095: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4095 + %load = load i8, i8 addrspace(1)* %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; SGPR base with maximum gfx9 immediate offset + 1 +define amdgpu_ps float @global_load_saddr_i8_offset_4096(i8 addrspace(1)* inreg %sbase) { +; GCN-LABEL: global_load_saddr_i8_offset_4096: +; GCN: ; %bb.0: +; GCN-NEXT: v_mov_b32_e32 v0, 0x1000 +; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4096 + %load = load i8, i8 addrspace(1)* %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; SGPR base with maximum gfx9 immediate offset + 2 +define amdgpu_ps float @global_load_saddr_i8_offset_4097(i8 addrspace(1)* inreg %sbase) { +; GCN-LABEL: global_load_saddr_i8_offset_4097: +; GCN: ; %bb.0: +; GCN-NEXT: v_mov_b32_e32 v0, 0x1000 +; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; 
return to shader part epilog + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4097 + %load = load i8, i8 addrspace(1)* %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; SGPR base with maximum negative gfx9 immediate offset +define amdgpu_ps float @global_load_saddr_i8_offset_neg4096(i8 addrspace(1)* inreg %sbase) { +; GFX9-LABEL: global_load_saddr_i8_offset_neg4096: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-4096 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i8_offset_neg4096: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], 0xfffff000, s2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4096 + %load = load i8, i8 addrspace(1)* %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; SGPR base with maximum negative gfx9 immediate offset -1 +define amdgpu_ps float @global_load_saddr_i8_offset_neg4097(i8 addrspace(1)* inreg %sbase) { +; GFX9-LABEL: global_load_saddr_i8_offset_neg4097: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i8_offset_neg4097: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], 0xfffff000, s2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 
+; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4097 + %load = load i8, i8 addrspace(1)* %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; SGPR base with maximum negative gfx9 immediate offset -2 +define amdgpu_ps float @global_load_saddr_i8_offset_neg4098(i8 addrspace(1)* inreg %sbase) { +; GFX9-LABEL: global_load_saddr_i8_offset_neg4098: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i8_offset_neg4098: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], 0xfffff000, s2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4098 + %load = load i8, i8 addrspace(1)* %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; SGPR base with maximum gfx10 immediate offset + 1 +define amdgpu_ps float @global_load_saddr_i8_offset_2048(i8 addrspace(1)* inreg %sbase) { +; GFX9-LABEL: global_load_saddr_i8_offset_2048: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2048 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i8_offset_2048: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; 
return to shader part epilog + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 2048 + %load = load i8, i8 addrspace(1)* %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; SGPR base with maximum gfx10 immediate offset + 2 +define amdgpu_ps float @global_load_saddr_i8_offset_2049(i8 addrspace(1)* inreg %sbase) { +; GFX9-LABEL: global_load_saddr_i8_offset_2049: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2049 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i8_offset_2049: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 2049 + %load = load i8, i8 addrspace(1)* %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; SGPR base with maximum gfx10 immediate offset + 3 +define amdgpu_ps float @global_load_saddr_i8_offset_2050(i8 addrspace(1)* inreg %sbase) { +; GFX9-LABEL: global_load_saddr_i8_offset_2050: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2050 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i8_offset_2050: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 2050 + %load = load i8, i8 addrspace(1)* %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; 
SGPR base with maximum negative gfx10 immediate offset +define amdgpu_ps float @global_load_saddr_i8_offset_neg2048(i8 addrspace(1)* inreg %sbase) { +; GCN-LABEL: global_load_saddr_i8_offset_neg2048: +; GCN: ; %bb.0: +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -2048 + %load = load i8, i8 addrspace(1)* %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; SGPR base with maximum negative gfx10 immediate offset - 1 +define amdgpu_ps float @global_load_saddr_i8_offset_neg2049(i8 addrspace(1)* inreg %sbase) { +; GFX9-LABEL: global_load_saddr_i8_offset_neg2049: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2049 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i8_offset_neg2049: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], 0xfffff800, s2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -2049 + %load = load i8, i8 addrspace(1)* %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; SGPR base with maximum negative gfx10 immediate offset - 2 +define amdgpu_ps float @global_load_saddr_i8_offset_neg2050(i8 addrspace(1)* inreg %sbase) { +; GFX9-LABEL: global_load_saddr_i8_offset_neg2050: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2050 +; GFX9-NEXT: s_waitcnt 
vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i8_offset_neg2050: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], 0xfffff800, s2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -2050 + %load = load i8, i8 addrspace(1)* %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +define amdgpu_ps float @global_load_saddr_i8_offset_4294967295(i8 addrspace(1)* inreg %sbase) { +; GFX9-LABEL: global_load_saddr_i8_offset_4294967295: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, 0xfffff000 +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i8_offset_4294967295: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v0, 0xfffff800 +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294967295 + %load = load i8, i8 addrspace(1)* %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +define amdgpu_ps float @global_load_saddr_i8_offset_4294967296(i8 addrspace(1)* inreg %sbase) { +; GFX9-LABEL: global_load_saddr_i8_offset_4294967296: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i8_offset_4294967296: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], 0, s2 +; GFX10-NEXT: 
v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1] +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294967296 + %load = load i8, i8 addrspace(1)* %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +define amdgpu_ps float @global_load_saddr_i8_offset_4294967297(i8 addrspace(1)* inreg %sbase) { +; GFX9-LABEL: global_load_saddr_i8_offset_4294967297: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i8_offset_4294967297: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], 0, s2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1] +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294967297 + %load = load i8, i8 addrspace(1)* %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +define amdgpu_ps float @global_load_saddr_i8_offset_4294971391(i8 addrspace(1)* inreg %sbase) { +; GFX9-LABEL: global_load_saddr_i8_offset_4294971391: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i8_offset_4294971391: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], 0x800, s2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, 
s[0:1] +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294971391 + %load = load i8, i8 addrspace(1)* %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +define amdgpu_ps float @global_load_saddr_i8_offset_4294971392(i8 addrspace(1)* inreg %sbase) { +; GFX9-LABEL: global_load_saddr_i8_offset_4294971392: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i8_offset_4294971392: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], 0x1000, s2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1] +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294971392 + %load = load i8, i8 addrspace(1)* %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967295(i8 addrspace(1)* inreg %sbase) { +; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967295: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-4095 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967295: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], 0x800, s2 +; 
GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2047 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4294967295 + %load = load i8, i8 addrspace(1)* %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967296(i8 addrspace(1)* inreg %sbase) { +; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967296: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967296: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], 0, s2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4294967296 + %load = load i8, i8 addrspace(1)* %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967297(i8 addrspace(1)* inreg %sbase) { +; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967297: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967297: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], 0, s2 +; GFX10-NEXT: 
v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4294967297 + %load = load i8, i8 addrspace(1)* %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + ; -------------------------------------------------------------------------------- ; Basic addressing patterns ; -------------------------------------------------------------------------------- Index: llvm/test/CodeGen/AMDGPU/global_atomics.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/global_atomics.ll +++ llvm/test/CodeGen/AMDGPU/global_atomics.ll @@ -25,9 +25,8 @@ ; SIVI: s_mov_b32 [[SREG:s[0-9]+]], 0x8ca0 ; SIVI: buffer_atomic_add v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], [[SREG]]{{$}} -; GFX9: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0x8000, -; GFX9-NEXT: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc -; GFX9: global_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off offset:3232{{$}} +; GFX9: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x8000{{$}} +; GFX9: global_atomic_add [[OFFSET]], v{{[0-9]+}}, s{{\[[0-9]:[0-9]+\]}} offset:3232{{$}} define amdgpu_kernel void @atomic_add_i32_soffset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 9000 Index: llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll +++ llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll @@ -991,9 +991,8 @@ ; CIVI: s_mov_b32 [[SREG:s[0-9]+]], 0x11940 ; CIVI: buffer_atomic_cmpswap_x2 v[{{[0-9]+}}:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], [[SREG]]{{$}} -; GFX9: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0x11000, -; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc -; GFX9: 
global_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:2368{{$}} +; GFX9: v_mov_b32_e32 [[VOFFSET:v[0-9]+]], 0x11000{{$}} +; GFX9: global_atomic_cmpswap_x2 [[VOFFSET]], v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:2368{{$}} define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(i64 addrspace(1)* %out, i64 %in, i64 %old) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 9000 Index: llvm/test/CodeGen/AMDGPU/offset-split-global.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/offset-split-global.ll +++ llvm/test/CodeGen/AMDGPU/offset-split-global.ll @@ -701,11 +701,10 @@ ; GFX10-LABEL: global_inst_salu_offset_12bit_max: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0x800 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x800, s0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, s1, s0 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -719,12 +718,9 @@ ; GFX9-LABEL: global_inst_salu_offset_13bit_max: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x1000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -732,11 +728,10 @@ ; GFX10-LABEL: global_inst_salu_offset_13bit_max: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: 
v_mov_b32_e32 v0, 0x1800 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x1800, s0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, s1, s0 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -850,11 +845,10 @@ ; GFX10-LABEL: global_inst_salu_offset_2x_11bit_max: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0x800 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x800, s0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, s1, s0 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -868,12 +862,9 @@ ; GFX9-LABEL: global_inst_salu_offset_2x_12bit_max: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x1000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -881,11 +872,10 @@ ; GFX10-LABEL: global_inst_salu_offset_2x_12bit_max: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0x1800 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x1800, s0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, s1, s0 -; GFX10-NEXT: 
global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -899,12 +889,9 @@ ; GFX9-LABEL: global_inst_salu_offset_2x_13bit_max: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x3000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -912,11 +899,10 @@ ; GFX10-LABEL: global_inst_salu_offset_2x_13bit_max: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0x3800 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x3800, s0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, s1, s0 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm