diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1795,65 +1795,78 @@ SIInstrFlags::FlatGlobal)) { Addr = LHS; ImmOffset = COffsetVal; - } else if (!LHS->isDivergent() && COffsetVal > 0) { - SDLoc SL(N); - // saddr + large_offset -> saddr + (voffset = large_offset & ~MaxOffset) + - // (large_offset & MaxOffset); - int64_t SplitImmOffset, RemainderOffset; - std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset( - COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal); - - if (isUInt<32>(RemainderOffset)) { - SDNode *VMov = CurDAG->getMachineNode( - AMDGPU::V_MOV_B32_e32, SL, MVT::i32, - CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32)); - VOffset = SDValue(VMov, 0); - SAddr = LHS; - Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16); - return true; + } else if (!LHS->isDivergent()) { + if (COffsetVal > 0) { + SDLoc SL(N); + // saddr + large_offset -> saddr + + // (voffset = large_offset & ~MaxOffset) + + // (large_offset & MaxOffset); + int64_t SplitImmOffset, RemainderOffset; + std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset( + COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal); + + if (isUInt<32>(RemainderOffset)) { + SDNode *VMov = CurDAG->getMachineNode( + AMDGPU::V_MOV_B32_e32, SL, MVT::i32, + CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32)); + VOffset = SDValue(VMov, 0); + SAddr = LHS; + Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16); + return true; + } } + + // We are adding a 64 bit SGPR and a constant. If constant bus limit + // is 1 we would need to perform 1 or 2 extra moves for each half of + // the constant and it is better to do a scalar add and then issue a + // single VALU instruction to materialize zero. 
Otherwise it is fewer + // instructions to perform VALU adds with immediates or inline literals. + unsigned NumLiterals = + !TII->isInlineConstant(APInt(32, COffsetVal & 0xffffffff)) + + !TII->isInlineConstant(APInt(32, COffsetVal >> 32)); + if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals) + return false; } } // Match the variable offset. - if (Addr.getOpcode() != ISD::ADD) { - if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF || - isa<ConstantSDNode>(Addr)) - return false; - - // It's cheaper to materialize a single 32-bit zero for vaddr than the two - // moves required to copy a 64-bit SGPR to VGPR. - SAddr = Addr; - SDNode *VMov = CurDAG->getMachineNode( - AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32, - CurDAG->getTargetConstant(0, SDLoc(), MVT::i32)); - VOffset = SDValue(VMov, 0); - Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16); - return true; - } + if (Addr.getOpcode() == ISD::ADD) { + LHS = Addr.getOperand(0); + RHS = Addr.getOperand(1); - LHS = Addr.getOperand(0); - RHS = Addr.getOperand(1); + if (!LHS->isDivergent()) { + // add (i64 sgpr), (zero_extend (i32 vgpr)) + if (SDValue ZextRHS = matchZExtFromI32(RHS)) { + SAddr = LHS; + VOffset = ZextRHS; + } + } - if (!LHS->isDivergent()) { - // add (i64 sgpr), (zero_extend (i32 vgpr)) - if (SDValue ZextRHS = matchZExtFromI32(RHS)) { - SAddr = LHS; - VOffset = ZextRHS; + if (!SAddr && !RHS->isDivergent()) { + // add (zero_extend (i32 vgpr)), (i64 sgpr) + if (SDValue ZextLHS = matchZExtFromI32(LHS)) { + SAddr = RHS; + VOffset = ZextLHS; + } } - } - if (!SAddr && !RHS->isDivergent()) { - // add (zero_extend (i32 vgpr)), (i64 sgpr) - if (SDValue ZextLHS = matchZExtFromI32(LHS)) { - SAddr = RHS; - VOffset = ZextLHS; + if (SAddr) { + Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16); + return true; } } - if (!SAddr) + if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF || + isa<ConstantSDNode>(Addr)) return false; + // It's cheaper to materialize a single 32-bit zero for vaddr 
than the two + // moves required to copy a 64-bit SGPR to VGPR. + SAddr = Addr; + SDNode *VMov = + CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32, + CurDAG->getTargetConstant(0, SDLoc(), MVT::i32)); + VOffset = SDValue(VMov, 0); Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16); return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3528,36 +3528,52 @@ SIInstrFlags::FlatGlobal)) { Addr = PtrBase; ImmOffset = ConstOffset; - } else if (ConstOffset > 0) { + } else { auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI); if (!PtrBaseDef) return None; if (isSGPR(PtrBaseDef->Reg)) { - // Offset is too large. - // - // saddr + large_offset -> saddr + (voffset = large_offset & ~MaxOffset) - // + (large_offset & MaxOffset); - int64_t SplitImmOffset, RemainderOffset; - std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset( - ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal); - - if (isUInt<32>(RemainderOffset)) { - MachineInstr *MI = Root.getParent(); - MachineBasicBlock *MBB = MI->getParent(); - Register HighBits - = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); - - BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), - HighBits) - .addImm(RemainderOffset); - - return {{ - [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr - [=](MachineInstrBuilder &MIB) { MIB.addReg(HighBits); }, // voffset - [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); }, - }}; + if (ConstOffset > 0) { + // Offset is too large. 
+ // + // saddr + large_offset -> saddr + + // (voffset = large_offset & ~MaxOffset) + + // (large_offset & MaxOffset); + int64_t SplitImmOffset, RemainderOffset; + std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset( + ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal); + + if (isUInt<32>(RemainderOffset)) { + MachineInstr *MI = Root.getParent(); + MachineBasicBlock *MBB = MI->getParent(); + Register HighBits = + MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), + HighBits) + .addImm(RemainderOffset); + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr + [=](MachineInstrBuilder &MIB) { + MIB.addReg(HighBits); + }, // voffset + [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); }, + }}; + } } + + // We are adding a 64 bit SGPR and a constant. If constant bus limit + // is 1 we would need to perform 1 or 2 extra moves for each half of + // the constant and it is better to do a scalar add and then issue a + // single VALU instruction to materialize zero. Otherwise it is fewer + // instructions to perform VALU adds with immediates or inline literals. + unsigned NumLiterals = + !TII.isInlineConstant(APInt(32, ConstOffset & 0xffffffff)) + + !TII.isInlineConstant(APInt(32, ConstOffset >> 32)); + if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals) + return None; } } } @@ -3567,57 +3583,50 @@ return None; // Match the variable offset. - if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD) { - // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and - // drop this. - if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF || - AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT) - return None; - - // It's cheaper to materialize a single 32-bit zero for vaddr than the two - // moves required to copy a 64-bit SGPR to VGPR. 
- const Register SAddr = AddrDef->Reg; - if (!isSGPR(SAddr)) - return None; - - MachineInstr *MI = Root.getParent(); - MachineBasicBlock *MBB = MI->getParent(); - Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); - - BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), - VOffset) - .addImm(0); - - return {{ - [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr - [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset - [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset - }}; + if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) { + // Look through the SGPR->VGPR copy. + Register SAddr = + getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI); + + if (SAddr && isSGPR(SAddr)) { + Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg(); + + // It's possible voffset is an SGPR here, but the copy to VGPR will be + // inserted later. + if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) { + return {{[=](MachineInstrBuilder &MIB) { // saddr + MIB.addReg(SAddr); + }, + [=](MachineInstrBuilder &MIB) { // voffset + MIB.addReg(VOffset); + }, + [=](MachineInstrBuilder &MIB) { // offset + MIB.addImm(ImmOffset); + }}}; + } + } } - // Look through the SGPR->VGPR copy. - Register SAddr = - getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI); - if (!SAddr || !isSGPR(SAddr)) + // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and + // drop this. + if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF || + AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg)) return None; - Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg(); + // It's cheaper to materialize a single 32-bit zero for vaddr than the two + // moves required to copy a 64-bit SGPR to VGPR. 
+ MachineInstr *MI = Root.getParent(); + MachineBasicBlock *MBB = MI->getParent(); + Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); - // It's possible voffset is an SGPR here, but the copy to VGPR will be - // inserted later. - Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset); - if (!VOffset) - return None; + BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset) + .addImm(0); - return {{[=](MachineInstrBuilder &MIB) { // saddr - MIB.addReg(SAddr); - }, - [=](MachineInstrBuilder &MIB) { // voffset - MIB.addReg(VOffset); - }, - [=](MachineInstrBuilder &MIB) { // offset - MIB.addImm(ImmOffset); - }}}; + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr + [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset + [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset + }}; } InstructionSelector::ComplexRendererFns diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir @@ -350,9 +350,9 @@ ; GFX9: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc ; GFX9: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 - ; GFX9: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] - ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load 4, addrspace 1) - ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[REG_SEQUENCE1]], 
[[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_neg4097 ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 @@ -553,9 +553,9 @@ ; GFX9: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc ; GFX9: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 - ; GFX9: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] - ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load 4, addrspace 1) - ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_4294971390 ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll --- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll @@ -101,11 +101,10 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4097(i8 addrspace(1)* inreg %sbase) { ; GFX9-LABEL: global_load_saddr_i8_offset_neg4097: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 +; GFX9-NEXT: s_add_u32 s0, s2, 0xffffefff +; GFX9-NEXT: s_addc_u32 s1, 
s3, -1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -127,11 +126,10 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4098(i8 addrspace(1)* inreg %sbase) { ; GFX9-LABEL: global_load_saddr_i8_offset_neg4098: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2 +; GFX9-NEXT: s_add_u32 s0, s2, 0xffffeffe +; GFX9-NEXT: s_addc_u32 s1, s3, -1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -348,10 +346,10 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294971391(i8 addrspace(1)* inreg %sbase) { ; GFX9-LABEL: global_load_saddr_i8_offset_4294971391: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 +; GFX9-NEXT: s_add_u32 s0, s2, 0xfff +; GFX9-NEXT: s_addc_u32 s1, s3, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -372,11 +370,10 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294971392(i8 addrspace(1)* inreg %sbase) { ; GFX9-LABEL: global_load_saddr_i8_offset_4294971392: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_add_u32 s0, s2, 0x1000 +; GFX9-NEXT: s_addc_u32 s1, s3, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, 
s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -2391,13 +2388,12 @@ ; GFX9-LABEL: global_addr_64bit_lsr_iv: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: BB128_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_add_u32 s4, s2, s0 ; GFX9-NEXT: s_addc_u32 s5, s3, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: global_load_dword v0, v[0:1], off glc +; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 4 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 @@ -2408,18 +2404,18 @@ ; ; GFX10-LABEL: global_addr_64bit_lsr_iv: ; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], 0 ; GFX10-NEXT: BB128_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_add_u32 s4, s2, s0 ; GFX10-NEXT: s_addc_u32 s5, s3, s1 -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: s_add_u32 s0, s0, 4 +; GFX10-NEXT: global_load_dword v1, v0, s[4:5] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_cmpk_eq_i32 s0, 0x400 -; GFX10-NEXT: global_load_dword v0, v[0:1], off glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_cbranch_scc0 BB128_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -2439,45 +2435,46 @@ br i1 %i9, label %bb2, label %bb3 } +; Make sure we only have a single zero vaddr initialization. 
+ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(float addrspace(1)* inreg %arg, float addrspace(1)* inreg %arg.1) { ; GFX9-LABEL: global_addr_64bit_lsr_iv_multiload: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: BB129_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_add_u32 s4, s2, s0 ; GFX9-NEXT: s_addc_u32 s5, s3, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: global_load_dword v2, v[0:1], off glc +; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[0:1], off glc +; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 4 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x400 -; GFX9-NEXT: ; kill: killed $vgpr0_vgpr1 +; GFX9-NEXT: ; kill: killed $sgpr4 killed $sgpr5 ; GFX9-NEXT: s_cbranch_scc0 BB129_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_addr_64bit_lsr_iv_multiload: ; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], 0 ; GFX10-NEXT: BB129_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_add_u32 s4, s2, s0 ; GFX10-NEXT: s_addc_u32 s5, s3, s1 -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: s_add_u32 s0, s0, 4 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: s_cmpk_eq_i32 s0, 0x400 -; GFX10-NEXT: ; kill: killed $vgpr0_vgpr1 -; GFX10-NEXT: global_load_dword v2, v[0:1], off glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[4:5] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v[0:1], off glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[4:5] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_cmpk_eq_i32 s0, 0x400 +; GFX10-NEXT: ; kill: killed 
$sgpr4 killed $sgpr5 ; GFX10-NEXT: s_cbranch_scc0 BB129_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll --- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll @@ -41,10 +41,9 @@ ; VI: flat_atomic_add -; GFX9: v_mov_b32_e32 [[HIGH_K:v[0-9]+]], 0xabcd -; GFX9: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0xd000, -; GFX9-NEXT: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, [[HIGH_K]], v{{[0-9]+}}, vcc -; GFX9: global_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off offset:3756{{$}} +; GFX9: s_add_u32 s[[LOW_K:[0-9]+]], s{{[0-9]+}}, 0xdeac +; GFX9: s_addc_u32 s[[HIGH_K:[0-9]+]], s{{[0-9]+}}, 0xabcd +; GFX9: global_atomic_add v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[LOW_K]]:[[HIGH_K]]]{{$}} define amdgpu_kernel void @atomic_add_i32_huge_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 47224239175595 @@ -69,7 +68,7 @@ ; GCN-LABEL: {{^}}atomic_add_i32_addr64_offset: ; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}} +; GFX9: global_atomic_add v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16{{$}} define amdgpu_kernel void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -83,7 +82,7 @@ ; VI: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}} +; GFX9: global_atomic_add [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} ; GFX9: global_store_dword v{{[0-9]+}}, [[RET]], s define amdgpu_kernel void 
@atomic_add_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: @@ -119,7 +118,7 @@ ; GCN-LABEL: {{^}}atomic_add_i32_addr64: ; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}} +; GFX9: global_atomic_add v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -132,7 +131,7 @@ ; VI: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}} +; GFX9: global_atomic_add [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_add_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -169,7 +168,7 @@ ; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}} +; GFX9: global_atomic_and v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16{{$}} define amdgpu_kernel void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -183,7 +182,7 @@ ; VI: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}} +; GFX9: global_atomic_and [[RET:v[0-9]]], v{{[0-9]+}}, 
v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -219,7 +218,7 @@ ; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}} +; GFX9: global_atomic_and v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -232,7 +231,7 @@ ; VI: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}} +; GFX9: global_atomic_and [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_and_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -269,7 +268,7 @@ ; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}} +; GFX9: global_atomic_sub v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16{{$}} define amdgpu_kernel void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -283,7 +282,7 @@ ; VI: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_sub [[RET:v[0-9]+]], 
v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}} +; GFX9: global_atomic_sub [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -319,7 +318,7 @@ ; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_sub v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off{{$}} +; GFX9: global_atomic_sub v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -332,7 +331,7 @@ ; VI: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}} +; GFX9: global_atomic_sub [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_sub_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -369,7 +368,7 @@ ; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}} +; GFX9: global_atomic_smax v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16{{$}} define amdgpu_kernel void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -383,7 +382,7 @@ ; VI: flat_atomic_smax [[RET:v[0-9]+]], 
v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}} +; GFX9: global_atomic_smax [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -419,7 +418,7 @@ ; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}} +; GFX9: global_atomic_smax v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -432,7 +431,7 @@ ; VI: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}} +; GFX9: global_atomic_smax [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_max_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -468,7 +467,7 @@ ; GCN-LABEL: {{^}}atomic_umax_i32_addr64_offset: ; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}} +; GFX9: global_atomic_umax v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16{{$}} define amdgpu_kernel void @atomic_umax_i32_addr64_offset(i32 
addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -482,7 +481,7 @@ ; VI: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}} +; GFX9: global_atomic_umax [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -517,7 +516,7 @@ ; GCN-LABEL: {{^}}atomic_umax_i32_addr64: ; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}} +; GFX9: global_atomic_umax v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -530,7 +529,7 @@ ; VI: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}} +; GFX9: global_atomic_umax [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_umax_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -566,7 +565,7 @@ ; GCN-LABEL: {{^}}atomic_min_i32_addr64_offset: ; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_smin 
v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 +; GFX9: global_atomic_smin v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 define amdgpu_kernel void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -580,7 +579,7 @@ ; VI: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}} +; GFX9: global_atomic_smin [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -615,7 +614,7 @@ ; GCN-LABEL: {{^}}atomic_min_i32_addr64: ; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}} +; GFX9: global_atomic_smin v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -628,7 +627,7 @@ ; VI: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}} +; GFX9: global_atomic_smin [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_min_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -664,7 +663,7 @@ ; GCN-LABEL: {{^}}atomic_umin_i32_addr64_offset: ; SI: 
buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}} +; GFX9: global_atomic_umin v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16{{$}} define amdgpu_kernel void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -678,7 +677,7 @@ ; VI: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}} +; GFX9: global_atomic_umin [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -712,7 +711,7 @@ ; GCN-LABEL: {{^}}atomic_umin_i32_addr64: ; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}} +; GFX9: global_atomic_umin v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -725,7 +724,7 @@ ; VI: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}} +; GFX9: global_atomic_umin [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_umin_i32_ret_addr64(i32 
addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -761,7 +760,7 @@ ; GCN-LABEL: {{^}}atomic_or_i32_addr64_offset: ; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 +; GFX9: global_atomic_or v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 define amdgpu_kernel void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -775,7 +774,7 @@ ; VI: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}} +; GFX9: global_atomic_or [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -810,7 +809,7 @@ ; GCN-LABEL: {{^}}atomic_or_i32_addr64: ; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}} +; GFX9: global_atomic_or v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -823,7 +822,7 @@ ; VI: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}} 
+; GFX9: global_atomic_or [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_or_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -870,7 +869,7 @@ ; GCN-LABEL: {{^}}atomic_xchg_i32_addr64_offset: ; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}} +; GFX9: global_atomic_swap v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16{{$}} define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -884,7 +883,7 @@ ; VI: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}} +; GFX9: global_atomic_swap [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -918,7 +917,7 @@ ; GCN-LABEL: {{^}}atomic_xchg_i32_addr64: ; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}} +; GFX9: global_atomic_swap v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -931,7 +930,7 @@ ; VI: 
flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}} +; GFX9: global_atomic_swap [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -969,7 +968,7 @@ ; SI: buffer_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GFX9: global_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], off offset:16{{$}} +; GFX9: global_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}] offset:16{{$}} define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -983,7 +982,7 @@ ; VI: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; SIVI: buffer_store_dword v[[RET]] -; GFX9: global_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:16 glc{{$}} +; GFX9: global_atomic_cmpswap v[[RET:[0-9]+]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -1020,7 +1019,7 @@ ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_addr64: ; SI: buffer_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}} -; GFX9: 
global_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX9: global_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -1033,7 +1032,7 @@ ; VI: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; SIVI: buffer_store_dword v[[RET]] -; GFX9: global_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off glc{{$}} +; GFX9: global_atomic_cmpswap v[[RET:[0-9]+]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -1070,7 +1069,7 @@ ; GCN-LABEL: {{^}}atomic_xor_i32_addr64_offset: ; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}} +; GFX9: global_atomic_xor v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16{{$}} define amdgpu_kernel void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -1084,7 +1083,7 @@ ; VI: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}} +; GFX9: global_atomic_xor [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr 
= getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -1118,7 +1117,7 @@ ; GCN-LABEL: {{^}}atomic_xor_i32_addr64: ; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}} +; GFX9: global_atomic_xor v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -1131,7 +1130,7 @@ ; VI: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}} +; GFX9: global_atomic_xor [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_xor_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -1202,7 +1201,7 @@ ; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], off offset:16 glc{{$}} +; GFX9: global_load_dword [[RET:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} define amdgpu_kernel void @atomic_load_i32_addr64_offset(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %in, i64 %index @@ -1217,7 +1216,7 @@ ; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], off glc{{$}} +; GFX9: global_load_dword [[RET:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_load_i32_addr64(i32 addrspace(1)* %in, i32 addrspace(1)* 
%out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %in, i64 %index @@ -1231,7 +1230,7 @@ ; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], off offset:16 glc{{$}} +; GFX9: global_load_dword [[RET:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} define amdgpu_kernel void @atomic_load_f32_addr64_offset(float addrspace(1)* %in, float addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr float, float addrspace(1)* %in, i64 %index @@ -1275,7 +1274,7 @@ ; GCN-LABEL: {{^}}atomic_store_i32_addr64_offset: ; SI: buffer_store_dword {{v[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+$}} -; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}, off offset:16{{$}} +; GFX9: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16{{$}} define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -1287,7 +1286,7 @@ ; GCN-LABEL: {{^}}atomic_store_f32_addr64_offset: ; SI: buffer_store_dword {{v[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+$}} -; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}, off offset:16{{$}} +; GFX9: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16{{$}} define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, float addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr float, float addrspace(1)* %out, i64 %index @@ -1299,7 +1298,7 @@ ; GCN-LABEL: {{^}}atomic_store_i32_addr64: ; SI: buffer_store_dword {{v[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_store_dword 
v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+$}} -; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}, off{{$}} +; GFX9: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, i32 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -1310,7 +1309,7 @@ ; GCN-LABEL: {{^}}atomic_store_f32_addr64: ; SI: buffer_store_dword {{v[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+$}} -; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}, off{{$}} +; GFX9: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_store_f32_addr64(float %in, float addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr float, float addrspace(1)* %out, i64 %index diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll @@ -29,7 +29,7 @@ ; GCN-LABEL: {{^}}atomic_add_i64_addr64_offset: ; CI: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}} -; GFX9: global_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}} +; GFX9: global_atomic_add_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @atomic_add_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -43,7 +43,7 @@ ; VI: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_add_x2 
[[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}} +; GFX9: global_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -77,7 +77,7 @@ ; GCN-LABEL: {{^}}atomic_add_i64_addr64: ; CI: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -; GFX9: global_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_atomic_add_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}{{$}} define amdgpu_kernel void @atomic_add_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -90,7 +90,7 @@ ; VI: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}} +; GFX9: global_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @atomic_add_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -125,7 +125,7 @@ ; GCN-LABEL: {{^}}atomic_and_i64_addr64_offset: ; CI: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -; GFX9: global_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}} +; GFX9: 
global_atomic_and_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @atomic_and_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -139,7 +139,7 @@ ; VI: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}} +; GFX9: global_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -173,7 +173,7 @@ ; GCN-LABEL: {{^}}atomic_and_i64_addr64: ; CI: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -; GFX9: global_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_atomic_and_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}{{$}} define amdgpu_kernel void @atomic_and_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -186,7 +186,7 @@ ; VI: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}} +; GFX9: global_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @atomic_and_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, 
i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -221,7 +221,7 @@ ; GCN-LABEL: {{^}}atomic_sub_i64_addr64_offset: ; CI: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -; GFX9: global_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}} +; GFX9: global_atomic_sub_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @atomic_sub_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -235,7 +235,7 @@ ; VI: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}} +; GFX9: global_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -269,7 +269,7 @@ ; GCN-LABEL: {{^}}atomic_sub_i64_addr64: ; CI: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -; GFX9: global_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_atomic_sub_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}{{$}} define amdgpu_kernel void @atomic_sub_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -282,7 +282,7 @@ ; VI: flat_atomic_sub_x2 
[[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}} +; GFX9: global_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @atomic_sub_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -317,7 +317,7 @@ ; GCN-LABEL: {{^}}atomic_max_i64_addr64_offset: ; CI: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -; GFX9: global_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}} +; GFX9: global_atomic_smax_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @atomic_max_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -331,7 +331,7 @@ ; VI: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}} +; GFX9: global_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -365,7 +365,7 @@ ; GCN-LABEL: {{^}}atomic_max_i64_addr64: ; CI: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 
addr64{{$}} ; VI: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -; GFX9: global_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_atomic_smax_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}{{$}} define amdgpu_kernel void @atomic_max_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -378,7 +378,7 @@ ; VI: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}} +; GFX9: global_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @atomic_max_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -413,7 +413,7 @@ ; GCN-LABEL: {{^}}atomic_umax_i64_addr64_offset: ; CI: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -; GFX9: global_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}} +; GFX9: global_atomic_umax_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @atomic_umax_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -427,7 +427,7 @@ ; VI: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}} +; GFX9: global_atomic_umax_x2 
[[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -461,7 +461,7 @@ ; GCN-LABEL: {{^}}atomic_umax_i64_addr64: ; CI: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -; GFX9: global_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_atomic_umax_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}{{$}} define amdgpu_kernel void @atomic_umax_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -474,7 +474,7 @@ ; VI: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}} +; GFX9: global_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @atomic_umax_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -509,7 +509,7 @@ ; GCN-LABEL: {{^}}atomic_min_i64_addr64_offset: ; CI: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -; GFX9: global_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}} +; GFX9: global_atomic_smin_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void 
@atomic_min_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -523,7 +523,7 @@ ; VI: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}} +; GFX9: global_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -557,7 +557,7 @@ ; GCN-LABEL: {{^}}atomic_min_i64_addr64: ; CI: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -; GFX9: global_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_atomic_smin_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}{{$}} define amdgpu_kernel void @atomic_min_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -570,7 +570,7 @@ ; VI: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}} +; GFX9: global_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @atomic_min_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -606,7 +606,7 @@ ; 
GCN-LABEL: {{^}}atomic_umin_i64_addr64_offset: ; CI: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -; GFX9: global_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}} +; GFX9: global_atomic_umin_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @atomic_umin_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -620,7 +620,7 @@ ; VI: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}} +; GFX9: global_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -654,7 +654,7 @@ ; GCN-LABEL: {{^}}atomic_umin_i64_addr64: ; CI: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -; GFX9: global_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_atomic_umin_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}{{$}} define amdgpu_kernel void @atomic_umin_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -667,7 +667,7 @@ ; VI: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: 
buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}} +; GFX9: global_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @atomic_umin_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -702,7 +702,7 @@ ; GCN-LABEL: {{^}}atomic_or_i64_addr64_offset: ; CI: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -; GFX9: global_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}} +; GFX9: global_atomic_or_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @atomic_or_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -716,7 +716,7 @@ ; VI: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}} +; GFX9: global_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -750,7 +750,7 @@ ; GCN-LABEL: {{^}}atomic_or_i64_addr64: ; CI: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -; GFX9: global_atomic_or_x2 
v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_atomic_or_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}{{$}} define amdgpu_kernel void @atomic_or_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -763,7 +763,7 @@ ; VI: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}} +; GFX9: global_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @atomic_or_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -810,7 +810,7 @@ ; GCN-LABEL: {{^}}atomic_xchg_i64_addr64_offset: ; CI: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}} -; GFX9: global_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}} +; GFX9: global_atomic_swap_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -824,7 +824,7 @@ ; VI: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}} +; GFX9: global_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel 
void @atomic_xchg_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -858,7 +858,7 @@ ; GCN-LABEL: {{^}}atomic_xchg_i64_addr64: ; CI: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -; GFX9: global_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_atomic_swap_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}{{$}} define amdgpu_kernel void @atomic_xchg_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -871,7 +871,7 @@ ; VI: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}} +; GFX9: global_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -906,7 +906,7 @@ ; GCN-LABEL: {{^}}atomic_xor_i64_addr64_offset: ; CI: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -; GFX9: global_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}} +; GFX9: global_atomic_xor_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @atomic_xor_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* 
%out, i64 %index @@ -920,7 +920,7 @@ ; VI: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}} +; GFX9: global_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -954,7 +954,7 @@ ; GCN-LABEL: {{^}}atomic_xor_i64_addr64: ; CI: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -; GFX9: global_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_atomic_xor_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}{{$}} define amdgpu_kernel void @atomic_xor_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -967,7 +967,7 @@ ; VI: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}} +; GFX9: global_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @atomic_xor_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -1017,7 +1017,7 @@ ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_addr64_offset: ; CI: buffer_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], 
s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GFX9: global_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], off offset:32{{$}} +; GFX9: global_atomic_cmpswap_x2 v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}] offset:32{{$}} define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index, i64 %old) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -1031,7 +1031,7 @@ ; VI: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; CIVI: buffer_store_dwordx2 v{{\[}}[[RET]]: -; GFX9: global_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:32 glc{{$}} +; GFX9: global_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] offset:32 glc{{$}} define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -1067,7 +1067,7 @@ ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_addr64: ; CI: buffer_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}} -; GFX9: global_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX9: global_atomic_cmpswap_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index, i64 %old) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -1080,7 +1080,7 @@ ; VI: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; CIVI: buffer_store_dwordx2 
v{{\[}}[[RET]]: -; GFX9: global_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off glc{{$}} +; GFX9: global_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -1142,7 +1142,7 @@ ; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], off offset:32 glc{{$}} +; GFX9: global_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:32 glc{{$}} define amdgpu_kernel void @atomic_load_i64_addr64_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %in, i64 %index @@ -1157,7 +1157,7 @@ ; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], off glc{{$}} +; GFX9: global_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_load_i64_addr64(i64 addrspace(1)* %in, i64 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %in, i64 %index @@ -1171,7 +1171,7 @@ ; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], off offset:32 glc{{$}} +; GFX9: global_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:32 glc{{$}} define amdgpu_kernel void @atomic_load_f64_addr64_offset(double addrspace(1)* %in, double addrspace(1)* %out, 
i64 %index) { entry: %ptr = getelementptr double, double addrspace(1)* %in, i64 %index @@ -1205,7 +1205,7 @@ ; GCN-LABEL: {{^}}atomic_store_i64_addr64_offset: ; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}]{{$}} -; GFX9: global_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], off offset:32{{$}} +; GFX9: global_store_dwordx2 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}] offset:32{{$}} define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, i64 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -1217,7 +1217,7 @@ ; GCN-LABEL: {{^}}atomic_store_i64_addr64: ; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}]{{$}} -; GFX9: global_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX9: global_store_dwordx2 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, i64 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -1228,7 +1228,7 @@ ; GCN-LABEL: {{^}}atomic_store_f64_addr64_offset: ; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}]{{$}} -; GFX9: global_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], off offset:32{{$}} +; GFX9: global_store_dwordx2 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}] offset:32{{$}} define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, double addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr double, double addrspace(1)* %out, i64 %index diff --git 
a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll --- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll @@ -764,12 +764,11 @@ ; GFX9-LABEL: global_inst_salu_offset_neg_13bit_max: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc +; GFX9-NEXT: s_add_u32 s0, s0, 0xffffe000 +; GFX9-NEXT: s_addc_u32 s1, s1, -1 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -899,12 +898,11 @@ ; GFX9-LABEL: global_inst_salu_offset_2x_neg_12bit_max: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc +; GFX9-NEXT: s_add_u32 s0, s0, 0xffffe000 +; GFX9-NEXT: s_addc_u32 s1, s1, -1 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -929,12 +927,11 @@ ; GFX9-LABEL: global_inst_salu_offset_2x_neg_13bit_max: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffc000, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc +; 
GFX9-NEXT: s_add_u32 s0, s0, 0xffffc000 +; GFX9-NEXT: s_addc_u32 s1, s1, -1 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -960,11 +957,11 @@ ; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_split0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 glc +; GFX9-NEXT: s_add_u32 s0, s0, 0x7ff +; GFX9-NEXT: s_addc_u32 s1, s1, 2 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -990,11 +987,11 @@ ; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_split1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2048 glc +; GFX9-NEXT: s_add_u32 s0, s0, 0x800 +; GFX9-NEXT: s_addc_u32 s1, s1, 2 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1020,11 +1017,11 @@ ; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_split0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 glc +; GFX9-NEXT: s_add_u32 s0, s0, 0xfff +; GFX9-NEXT: s_addc_u32 s1, s1, 2 +; GFX9-NEXT: 
global_load_ubyte v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1050,12 +1047,11 @@ ; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_split1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc +; GFX9-NEXT: s_add_u32 s0, s0, 0x1000 +; GFX9-NEXT: s_addc_u32 s1, s1, 2 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1081,12 +1077,11 @@ ; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_split0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 glc +; GFX9-NEXT: s_add_u32 s0, s0, 0x1fff +; GFX9-NEXT: s_addc_u32 s1, s1, 2 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1112,12 +1107,11 @@ ; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_split1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc +; GFX9-NEXT: s_add_u32 s0, s0, 0x2000 +; GFX9-NEXT: s_addc_u32 s1, s1, 2 
+; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1143,13 +1137,11 @@ ; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2049 glc +; GFX9-NEXT: s_add_u32 s0, s0, 0x7ff +; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1157,11 +1149,11 @@ ; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, s0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 glc dlc +; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff +; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1176,13 +1168,11 @@ ; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 -; GFX9-NEXT: v_addc_co_u32_e32 
v1, vcc, v1, v2, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048 glc +; GFX9-NEXT: s_add_u32 s0, s0, 0x800 +; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1190,11 +1180,11 @@ ; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, s0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc +; GFX10-NEXT: s_add_u32 s0, s0, 0x800 +; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1209,13 +1199,11 @@ ; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 glc +; GFX9-NEXT: s_add_u32 s0, s0, 0xfff +; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1223,11 +1211,11 @@ ; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 
v1, s1 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, s0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 glc dlc +; GFX10-NEXT: s_add_u32 s0, s0, 0xfff +; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1242,13 +1230,11 @@ ; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc +; GFX9-NEXT: s_add_u32 s0, s0, 0x1000 +; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1256,11 +1242,11 @@ ; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, s0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc +; GFX10-NEXT: s_add_u32 s0, s0, 0x1000 +; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1275,13 +1261,11 @@ ; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x24 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 glc +; GFX9-NEXT: s_add_u32 s0, s0, 0x1fff +; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1289,11 +1273,11 @@ ; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, s0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 glc dlc +; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff +; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1308,13 +1292,11 @@ ; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc +; GFX9-NEXT: s_add_u32 s0, s0, 0x2000 +; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: 
s_endpgm @@ -1322,11 +1304,11 @@ ; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, s0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc +; GFX10-NEXT: s_add_u32 s0, s0, 0x2000 +; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm