Index: llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1816,44 +1816,43 @@ } // Match the variable offset. - if (Addr.getOpcode() != ISD::ADD) { - if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF || - isa(Addr)) - return false; - - // It's cheaper to materialize a single 32-bit zero for vaddr than the two - // moves required to copy a 64-bit SGPR to VGPR. - SAddr = Addr; - SDNode *VMov = CurDAG->getMachineNode( - AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32, - CurDAG->getTargetConstant(0, SDLoc(), MVT::i32)); - VOffset = SDValue(VMov, 0); - Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16); - return true; - } + if (Addr.getOpcode() == ISD::ADD) { + LHS = Addr.getOperand(0); + RHS = Addr.getOperand(1); - LHS = Addr.getOperand(0); - RHS = Addr.getOperand(1); + if (!LHS->isDivergent()) { + // add (i64 sgpr), (zero_extend (i32 vgpr)) + if (SDValue ZextRHS = matchZExtFromI32(RHS)) { + SAddr = LHS; + VOffset = ZextRHS; + } + } - if (!LHS->isDivergent()) { - // add (i64 sgpr), (zero_extend (i32 vgpr)) - if (SDValue ZextRHS = matchZExtFromI32(RHS)) { - SAddr = LHS; - VOffset = ZextRHS; + if (!SAddr && !RHS->isDivergent()) { + // add (zero_extend (i32 vgpr)), (i64 sgpr) + if (SDValue ZextLHS = matchZExtFromI32(LHS)) { + SAddr = RHS; + VOffset = ZextLHS; + } } } - if (!SAddr && !RHS->isDivergent()) { - // add (zero_extend (i32 vgpr)), (i64 sgpr) - if (SDValue ZextLHS = matchZExtFromI32(LHS)) { - SAddr = RHS; - VOffset = ZextLHS; - } + if (SAddr) { + Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16); + return true; } - if (!SAddr) + if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF || + isa(Addr)) return false; + // It's cheaper to materialize a single 32-bit zero for vaddr than the two + // moves required to copy a 64-bit SGPR to VGPR. + SAddr = Addr; + SDNode *VMov = CurDAG->getMachineNode( + AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32, + CurDAG->getTargetConstant(0, SDLoc(), MVT::i32)); + VOffset = SDValue(VMov, 0); Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16); return true; } Index: llvm/test/CodeGen/AMDGPU/global-saddr-load.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/global-saddr-load.ll +++ llvm/test/CodeGen/AMDGPU/global-saddr-load.ll @@ -85,9 +85,10 @@ ; ; GFX10-LABEL: global_load_saddr_i8_offset_neg4096: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff000, s2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_add_u32 s0, s2, 0xfffff000 +; GFX10-NEXT: s_addc_u32 s1, s3, -1 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4096 @@ -101,19 +102,19 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4097(i8 addrspace(1)* inreg %sbase) { ; GFX9-LABEL: global_load_saddr_i8_offset_neg4097: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 +; GFX9-NEXT: s_add_u32 s0, s2, 0xffffefff +; GFX9-NEXT: s_addc_u32 s1, s3, -1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_load_saddr_i8_offset_neg4097: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff000, s2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_add_u32 s0, s2, 0xffffefff +; GFX10-NEXT: s_addc_u32 s1, s3, -1 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4097 @@ -127,19 +128,19 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4098(i8 addrspace(1)* inreg %sbase) { ; GFX9-LABEL: global_load_saddr_i8_offset_neg4098: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2 +; GFX9-NEXT: s_add_u32 s0, s2, 0xffffeffe +; GFX9-NEXT: s_addc_u32 s1, s3, -1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_load_saddr_i8_offset_neg4098: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff000, s2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_add_u32 s0, s2, 0xffffeffe +; GFX10-NEXT: s_addc_u32 s1, s3, -1 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4098 @@ -241,9 +242,10 @@ ; ; GFX10-LABEL: global_load_saddr_i8_offset_neg2049: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff800, s2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_add_u32 s0, s2, 0xfffff7ff +; GFX10-NEXT: s_addc_u32 s1, s3, -1 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -2049 @@ -264,9 +266,10 @@ ; ; GFX10-LABEL: global_load_saddr_i8_offset_neg2050: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff800, s2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_add_u32 s0, s2, 0xfffff7fe +; GFX10-NEXT: s_addc_u32 s1, s3, -1 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -2050 @@ -300,18 +303,19 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294967296(i8 addrspace(1)* inreg %sbase) { ; GFX9-LABEL: global_load_saddr_i8_offset_4294967296: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_add_u32 s0, s2, 0 +; GFX9-NEXT: s_addc_u32 s1, s3, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_load_saddr_i8_offset_4294967296: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1] -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_add_u32 s0, s2, 0 +; GFX10-NEXT: s_addc_u32 s1, s3, 1 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294967296 @@ -324,18 +328,19 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294967297(i8 addrspace(1)* inreg %sbase) { ; GFX9-LABEL: global_load_saddr_i8_offset_4294967297: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:1 +; GFX9-NEXT: s_add_u32 s0, s2, 1 +; GFX9-NEXT: s_addc_u32 s1, s3, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_load_saddr_i8_offset_4294967297: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1] -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:1 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_add_u32 s0, s2, 1 +; GFX10-NEXT: s_addc_u32 s1, s3, 1 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294967297 @@ -348,18 +353,19 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294971391(i8 addrspace(1)* inreg %sbase) { ; GFX9-LABEL: global_load_saddr_i8_offset_4294971391: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 +; GFX9-NEXT: s_add_u32 s0, s2, 0xfff +; GFX9-NEXT: s_addc_u32 s1, s3, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_load_saddr_i8_offset_4294971391: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0x800, s2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1] -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_add_u32 s0, s2, 0xfff +; GFX10-NEXT: s_addc_u32 s1, s3, 1 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294971391 @@ -372,19 +378,19 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294971392(i8 addrspace(1)* inreg %sbase) { ; GFX9-LABEL: global_load_saddr_i8_offset_4294971392: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_add_u32 s0, s2, 0x1000 +; GFX9-NEXT: s_addc_u32 s1, s3, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_load_saddr_i8_offset_4294971392: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0x1000, s2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1] -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_add_u32 s0, s2, 0x1000 +; GFX10-NEXT: s_addc_u32 s1, s3, 1 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294971392 @@ -397,19 +403,19 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967295(i8 addrspace(1)* inreg %sbase) { ; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967295: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-4095 +; GFX9-NEXT: s_add_u32 s0, s2, 1 +; GFX9-NEXT: s_addc_u32 s1, s3, -1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967295: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0x800, s2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2047 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_add_u32 s0, s2, 1 +; GFX10-NEXT: s_addc_u32 s1, s3, -1 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4294967295 @@ -422,18 +428,19 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967296(i8 addrspace(1)* inreg %sbase) { ; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967296: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_add_u32 s0, s2, 0 +; GFX9-NEXT: s_addc_u32 s1, s3, -1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967296: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_add_u32 s0, s2, 0 +; GFX10-NEXT: s_addc_u32 s1, s3, -1 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4294967296 @@ -446,18 +453,19 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967297(i8 addrspace(1)* inreg %sbase) { ; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967297: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 +; GFX9-NEXT: s_add_u32 s0, s2, -1 +; GFX9-NEXT: s_addc_u32 s1, s3, -2 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967297: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_add_u32 s0, s2, -1 +; GFX10-NEXT: s_addc_u32 s1, s3, -2 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4294967297 Index: llvm/test/CodeGen/AMDGPU/global_atomics.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/global_atomics.ll +++ llvm/test/CodeGen/AMDGPU/global_atomics.ll @@ -41,10 +41,9 @@ ; VI: flat_atomic_add -; GFX9: v_mov_b32_e32 [[HIGH_K:v[0-9]+]], 0xabcd -; GFX9: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0xd000, -; GFX9-NEXT: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, [[HIGH_K]], v{{[0-9]+}}, vcc -; GFX9: global_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off offset:3756{{$}} +; GFX9: s_add_u32 s[[LOW_K:[0-9]+]], s{{[0-9]+}}, 0xdeac +; GFX9: s_addc_u32 s[[HIGH_K:[0-9]+]], s{{[0-9]+}}, 0xabcd +; GFX9: global_atomic_add v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[LOW_K]]:[[HIGH_K]]]{{$}} define amdgpu_kernel void @atomic_add_i32_huge_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 47224239175595 @@ -69,7 +68,7 @@ ; GCN-LABEL: {{^}}atomic_add_i32_addr64_offset: ; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}} +; GFX9: global_atomic_add v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16{{$}} define amdgpu_kernel void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -83,7 +82,7 @@ ; VI: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}} +; GFX9: global_atomic_add [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} ; GFX9: global_store_dword v{{[0-9]+}}, [[RET]], s define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: @@ -119,7 +118,7 @@ ; GCN-LABEL: {{^}}atomic_add_i32_addr64: ; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}} +; GFX9: global_atomic_add v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -132,7 +131,7 @@ ; VI: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}} +; GFX9: global_atomic_add [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_add_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -169,7 +168,7 @@ ; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}} +; GFX9: global_atomic_and v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16{{$}} define amdgpu_kernel void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -183,7 +182,7 @@ ; VI: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}} +; GFX9: global_atomic_and [[RET:v[0-9]]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -219,7 +218,7 @@ ; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}} +; GFX9: global_atomic_and v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -232,7 +231,7 @@ ; VI: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}} +; GFX9: global_atomic_and [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_and_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -269,7 +268,7 @@ ; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}} +; GFX9: global_atomic_sub v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16{{$}} define amdgpu_kernel void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -283,7 +282,7 @@ ; VI: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}} +; GFX9: global_atomic_sub [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -319,7 +318,7 @@ ; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_sub v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off{{$}} +; GFX9: global_atomic_sub v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -332,7 +331,7 @@ ; VI: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}} +; GFX9: global_atomic_sub [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_sub_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -369,7 +368,7 @@ ; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}} +; GFX9: global_atomic_smax v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16{{$}} define amdgpu_kernel void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -383,7 +382,7 @@ ; VI: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}} +; GFX9: global_atomic_smax [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -419,7 +418,7 @@ ; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}} +; GFX9: global_atomic_smax v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -432,7 +431,7 @@ ; VI: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}} +; GFX9: global_atomic_smax [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_max_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -468,7 +467,7 @@ ; GCN-LABEL: {{^}}atomic_umax_i32_addr64_offset: ; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}} +; GFX9: global_atomic_umax v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16{{$}} define amdgpu_kernel void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -482,7 +481,7 @@ ; VI: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}} +; GFX9: global_atomic_umax [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -517,7 +516,7 @@ ; GCN-LABEL: {{^}}atomic_umax_i32_addr64: ; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}} +; GFX9: global_atomic_umax v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -530,7 +529,7 @@ ; VI: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}} +; GFX9: global_atomic_umax [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_umax_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -566,7 +565,7 @@ ; GCN-LABEL: {{^}}atomic_min_i32_addr64_offset: ; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 +; GFX9: global_atomic_smin v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 define amdgpu_kernel void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -580,7 +579,7 @@ ; VI: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}} +; GFX9: global_atomic_smin [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -615,7 +614,7 @@ ; GCN-LABEL: {{^}}atomic_min_i32_addr64: ; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}} +; GFX9: global_atomic_smin v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -628,7 +627,7 @@ ; VI: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}} +; GFX9: global_atomic_smin [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_min_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -664,7 +663,7 @@ ; GCN-LABEL: {{^}}atomic_umin_i32_addr64_offset: ; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}} +; GFX9: global_atomic_umin v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16{{$}} define amdgpu_kernel void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -678,7 +677,7 @@ ; VI: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}} +; GFX9: global_atomic_umin [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -712,7 +711,7 @@ ; GCN-LABEL: {{^}}atomic_umin_i32_addr64: ; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}} +; GFX9: global_atomic_umin v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -725,7 +724,7 @@ ; VI: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}} +; GFX9: global_atomic_umin [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_umin_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -761,7 +760,7 @@ ; GCN-LABEL: {{^}}atomic_or_i32_addr64_offset: ; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 +; GFX9: global_atomic_or v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 define amdgpu_kernel void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -775,7 +774,7 @@ ; VI: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}} +; GFX9: global_atomic_or [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -810,7 +809,7 @@ ; GCN-LABEL: {{^}}atomic_or_i32_addr64: ; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}} +; GFX9: global_atomic_or v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -823,7 +822,7 @@ ; VI: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}} +; GFX9: global_atomic_or [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_or_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -870,7 +869,7 @@ ; GCN-LABEL: {{^}}atomic_xchg_i32_addr64_offset: ; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}} +; GFX9: global_atomic_swap v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16{{$}} define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -884,7 +883,7 @@ ; VI: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}} +; GFX9: global_atomic_swap [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -918,7 +917,7 @@ ; GCN-LABEL: {{^}}atomic_xchg_i32_addr64: ; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}} +; GFX9: global_atomic_swap v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -931,7 +930,7 @@ ; VI: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}} +; GFX9: global_atomic_swap [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -969,7 +968,7 @@ ; SI: buffer_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GFX9: global_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], off offset:16{{$}} +; GFX9: global_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}] offset:16{{$}} define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -983,7 +982,7 @@ ; VI: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; SIVI: buffer_store_dword v[[RET]] -; GFX9: global_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:16 glc{{$}} +; GFX9: global_atomic_cmpswap v[[RET:[0-9]+]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -1020,7 +1019,7 @@ ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_addr64: ; SI: buffer_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}} -; GFX9: global_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX9: global_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -1033,7 +1032,7 @@ ; VI: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; SIVI: buffer_store_dword v[[RET]] -; GFX9: global_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off glc{{$}} +; GFX9: global_atomic_cmpswap v[[RET:[0-9]+]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -1070,7 +1069,7 @@ ; GCN-LABEL: {{^}}atomic_xor_i32_addr64_offset: ; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}} +; GFX9: global_atomic_xor v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16{{$}} define amdgpu_kernel void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -1084,7 +1083,7 @@ ; VI: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}} +; GFX9: global_atomic_xor [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -1118,7 +1117,7 @@ ; GCN-LABEL: {{^}}atomic_xor_i32_addr64: ; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -; GFX9: global_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}} +; GFX9: global_atomic_xor v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -1131,7 +1130,7 @@ ; VI: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}} +; GFX9: global_atomic_xor [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_xor_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -1202,7 +1201,7 @@ ; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], off offset:16 glc{{$}} +; GFX9: global_load_dword [[RET:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} define amdgpu_kernel void @atomic_load_i32_addr64_offset(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %in, i64 %index @@ -1217,7 +1216,7 @@ ; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], off glc{{$}} +; GFX9: global_load_dword [[RET:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_load_i32_addr64(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %in, i64 %index @@ -1231,7 +1230,7 @@ ; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], off offset:16 glc{{$}} +; GFX9: global_load_dword [[RET:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} define amdgpu_kernel void @atomic_load_f32_addr64_offset(float addrspace(1)* %in, float addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr float, float addrspace(1)* %in, i64 %index @@ -1275,7 +1274,7 @@ ; GCN-LABEL: {{^}}atomic_store_i32_addr64_offset: ; SI: buffer_store_dword {{v[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+$}} -; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}, off offset:16{{$}} +; GFX9: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16{{$}} define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -1287,7 +1286,7 @@ ; GCN-LABEL: {{^}}atomic_store_f32_addr64_offset: ; SI: buffer_store_dword {{v[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+$}} -; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}, off offset:16{{$}} +; GFX9: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16{{$}} define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, float addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr float, float addrspace(1)* %out, i64 %index @@ -1299,7 +1298,7 @@ ; GCN-LABEL: {{^}}atomic_store_i32_addr64: ; SI: buffer_store_dword {{v[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+$}} -; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}, off{{$}} +; GFX9: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, i32 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -1310,7 +1309,7 @@ ; GCN-LABEL: {{^}}atomic_store_f32_addr64: ; SI: buffer_store_dword {{v[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+$}} -; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}, off{{$}} +; GFX9: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_store_f32_addr64(float %in, float addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr float, float addrspace(1)* %out, i64 %index Index: llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll +++ llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll @@ -29,7 +29,7 @@ ; GCN-LABEL: {{^}}atomic_add_i64_addr64_offset: ; CI: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}} -; GFX9: global_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}} +; GFX9: global_atomic_add_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @atomic_add_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -43,7 +43,7 @@ ; VI: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}} +; GFX9: global_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -77,7 +77,7 @@ ; GCN-LABEL: {{^}}atomic_add_i64_addr64: ; CI: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -; GFX9: global_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_atomic_add_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}{{$}} define amdgpu_kernel void @atomic_add_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -90,7 +90,7 @@ ; VI: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}} +; GFX9: global_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @atomic_add_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -125,7 +125,7 @@ ; GCN-LABEL: {{^}}atomic_and_i64_addr64_offset: ; CI: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -; GFX9: global_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}} +; GFX9: global_atomic_and_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @atomic_and_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -139,7 +139,7 @@ ; VI: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}} +; GFX9: global_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -173,7 +173,7 @@ ; GCN-LABEL: {{^}}atomic_and_i64_addr64: ; CI: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -; GFX9: global_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_atomic_and_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}{{$}} define amdgpu_kernel void @atomic_and_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -186,7 +186,7 @@ ; VI: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}} +; GFX9: global_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @atomic_and_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -221,7 +221,7 @@ ; GCN-LABEL: {{^}}atomic_sub_i64_addr64_offset: ; CI: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -; GFX9: global_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}} +; GFX9: global_atomic_sub_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @atomic_sub_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -235,7 +235,7 @@ ; VI: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}} +; GFX9: global_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -269,7 +269,7 @@ ; GCN-LABEL: {{^}}atomic_sub_i64_addr64: ; CI: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -; GFX9: global_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_atomic_sub_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}{{$}} define amdgpu_kernel void @atomic_sub_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -282,7 +282,7 @@ ; VI: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}} +; GFX9: global_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @atomic_sub_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -317,7 +317,7 @@ ; GCN-LABEL: {{^}}atomic_max_i64_addr64_offset: ; CI: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -; GFX9: global_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}} +; GFX9: global_atomic_smax_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @atomic_max_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -331,7 +331,7 @@ ; VI: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}} +; GFX9: global_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -365,7 +365,7 @@ ; GCN-LABEL: {{^}}atomic_max_i64_addr64: ; CI: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -; GFX9: global_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_atomic_smax_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}{{$}} define amdgpu_kernel void @atomic_max_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -378,7 +378,7 @@ ; VI: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}} +; GFX9: global_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @atomic_max_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -413,7 +413,7 @@ ; GCN-LABEL: {{^}}atomic_umax_i64_addr64_offset: ; CI: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -; GFX9: global_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}} +; GFX9: global_atomic_umax_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @atomic_umax_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -427,7 +427,7 @@ ; VI: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}} +; GFX9: global_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -461,7 +461,7 @@ ; GCN-LABEL: {{^}}atomic_umax_i64_addr64: ; CI: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -; GFX9: global_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_atomic_umax_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}{{$}} define amdgpu_kernel void @atomic_umax_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -474,7 +474,7 @@ ; VI: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}} +; GFX9: global_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @atomic_umax_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -509,7 +509,7 @@ ; GCN-LABEL: {{^}}atomic_min_i64_addr64_offset: ; CI: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -; GFX9: global_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}} +; GFX9: global_atomic_smin_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @atomic_min_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -523,7 +523,7 @@ ; VI: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}} +; GFX9: global_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -557,7 +557,7 @@ ; GCN-LABEL: {{^}}atomic_min_i64_addr64: ; CI: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -; GFX9: global_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_atomic_smin_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}{{$}} define amdgpu_kernel void @atomic_min_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -570,7 +570,7 @@ ; VI: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}} +; GFX9: global_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @atomic_min_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -606,7 +606,7 @@ ; GCN-LABEL: {{^}}atomic_umin_i64_addr64_offset: ; CI: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -; GFX9: global_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}} +; GFX9: global_atomic_umin_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @atomic_umin_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -620,7 +620,7 @@ ; VI: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}} +; GFX9: global_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -654,7 +654,7 @@ ; GCN-LABEL: {{^}}atomic_umin_i64_addr64: ; CI: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -; GFX9: global_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_atomic_umin_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}{{$}} define amdgpu_kernel void @atomic_umin_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -667,7 +667,7 @@ ; VI: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}} +; GFX9: global_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @atomic_umin_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -702,7 +702,7 @@ ; GCN-LABEL: {{^}}atomic_or_i64_addr64_offset: ; CI: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -; GFX9: global_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}} +; GFX9: global_atomic_or_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @atomic_or_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -716,7 +716,7 @@ ; VI: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}} +; GFX9: global_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -750,7 +750,7 @@ ; GCN-LABEL: {{^}}atomic_or_i64_addr64: ; CI: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -; GFX9: global_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_atomic_or_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}{{$}} define amdgpu_kernel void @atomic_or_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -763,7 +763,7 @@ ; VI: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}} +; GFX9: global_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @atomic_or_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -810,7 +810,7 @@ ; GCN-LABEL: {{^}}atomic_xchg_i64_addr64_offset: ; CI: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}} -; GFX9: global_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}} +; GFX9: global_atomic_swap_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -824,7 +824,7 @@ ; VI: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}} +; GFX9: global_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -858,7 +858,7 @@ ; GCN-LABEL: {{^}}atomic_xchg_i64_addr64: ; CI: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -; GFX9: global_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_atomic_swap_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}{{$}} define amdgpu_kernel void @atomic_xchg_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -871,7 +871,7 @@ ; VI: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}} +; GFX9: global_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -906,7 +906,7 @@ ; GCN-LABEL: {{^}}atomic_xor_i64_addr64_offset: ; CI: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -; GFX9: global_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}} +; GFX9: global_atomic_xor_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @atomic_xor_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -920,7 +920,7 @@ ; VI: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}} +; GFX9: global_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -954,7 +954,7 @@ ; GCN-LABEL: {{^}}atomic_xor_i64_addr64: ; CI: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -; GFX9: global_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_atomic_xor_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}{{$}} define amdgpu_kernel void @atomic_xor_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -967,7 +967,7 @@ ; VI: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}} +; GFX9: global_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @atomic_xor_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -1017,7 +1017,7 @@ ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_addr64_offset: ; CI: buffer_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GFX9: global_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], off offset:32{{$}} +; GFX9: global_atomic_cmpswap_x2 v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}] offset:32{{$}} define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index, i64 %old) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -1031,7 +1031,7 @@ ; VI: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; CIVI: buffer_store_dwordx2 v{{\[}}[[RET]]: -; GFX9: global_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:32 glc{{$}} +; GFX9: global_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] offset:32 glc{{$}} define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -1067,7 +1067,7 @@ ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_addr64: ; CI: buffer_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}} -; GFX9: global_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX9: global_atomic_cmpswap_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index, i64 %old) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -1080,7 +1080,7 @@ ; VI: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; CIVI: buffer_store_dwordx2 v{{\[}}[[RET]]: -; GFX9: global_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off glc{{$}} +; GFX9: global_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -1142,7 +1142,7 @@ ; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], off offset:32 glc{{$}} +; GFX9: global_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:32 glc{{$}} define amdgpu_kernel void @atomic_load_i64_addr64_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %in, i64 %index @@ -1157,7 +1157,7 @@ ; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], off glc{{$}} +; GFX9: global_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_load_i64_addr64(i64 addrspace(1)* %in, i64 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %in, i64 %index @@ -1171,7 +1171,7 @@ ; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], off offset:32 glc{{$}} +; GFX9: global_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:32 glc{{$}} define amdgpu_kernel void @atomic_load_f64_addr64_offset(double addrspace(1)* %in, double addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr double, double addrspace(1)* %in, i64 %index @@ -1205,7 +1205,7 @@ ; GCN-LABEL: {{^}}atomic_store_i64_addr64_offset: ; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}]{{$}} -; GFX9: global_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], off offset:32{{$}} +; GFX9: global_store_dwordx2 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}] offset:32{{$}} define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, i64 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -1217,7 +1217,7 @@ ; GCN-LABEL: {{^}}atomic_store_i64_addr64: ; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}]{{$}} -; GFX9: global_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX9: global_store_dwordx2 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, i64 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -1228,7 +1228,7 @@ ; GCN-LABEL: {{^}}atomic_store_f64_addr64_offset: ; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}]{{$}} -; GFX9: global_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], off offset:32{{$}} +; GFX9: global_store_dwordx2 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}] offset:32{{$}} define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, double addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr double, double addrspace(1)* %out, i64 %index Index: llvm/test/CodeGen/AMDGPU/offset-split-global.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/offset-split-global.ll +++ llvm/test/CodeGen/AMDGPU/offset-split-global.ll @@ -747,10 +747,11 @@ ; GFX10-LABEL: global_inst_salu_offset_neg_12bit_max: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc +; GFX10-NEXT: s_add_u32 s0, s0, 0xfffff000 +; GFX10-NEXT: s_addc_u32 s1, s1, -1 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -764,12 +765,11 @@ ; GFX9-LABEL: global_inst_salu_offset_neg_13bit_max: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc +; GFX9-NEXT: s_add_u32 s0, s0, 0xffffe000 +; GFX9-NEXT: s_addc_u32 s1, s1, -1 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -777,10 +777,11 @@ ; GFX10-LABEL: global_inst_salu_offset_neg_13bit_max: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc +; GFX10-NEXT: s_add_u32 s0, s0, 0xffffe000 +; GFX10-NEXT: s_addc_u32 s1, s1, -1 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -882,10 +883,11 @@ ; GFX10-LABEL: global_inst_salu_offset_2x_neg_11bit_max: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc +; GFX10-NEXT: s_add_u32 s0, s0, 0xfffff000 +; GFX10-NEXT: s_addc_u32 s1, s1, -1 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -899,12 +901,11 @@ ; GFX9-LABEL: global_inst_salu_offset_2x_neg_12bit_max: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc +; GFX9-NEXT: s_add_u32 s0, s0, 0xffffe000 +; GFX9-NEXT: s_addc_u32 s1, s1, -1 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -912,10 +913,11 @@ ; GFX10-LABEL: global_inst_salu_offset_2x_neg_12bit_max: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc +; GFX10-NEXT: s_add_u32 s0, s0, 0xffffe000 +; GFX10-NEXT: s_addc_u32 s1, s1, -1 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -929,12 +931,11 @@ ; GFX9-LABEL: global_inst_salu_offset_2x_neg_13bit_max: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffc000, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc +; GFX9-NEXT: s_add_u32 s0, s0, 0xffffc000 +; GFX9-NEXT: s_addc_u32 s1, s1, -1 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -942,10 +943,11 @@ ; GFX10-LABEL: global_inst_salu_offset_2x_neg_13bit_max: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc +; GFX10-NEXT: s_add_u32 s0, s0, 0xffffc000 +; GFX10-NEXT: s_addc_u32 s1, s1, -1 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -960,11 +962,11 @@ ; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_split0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 glc +; GFX9-NEXT: s_add_u32 s0, s0, 0x7ff +; GFX9-NEXT: s_addc_u32 s1, s1, 2 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -972,10 +974,11 @@ ; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_split0: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v0, s0, 0, s0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 glc dlc +; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff +; GFX10-NEXT: s_addc_u32 s1, s1, 2 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -990,11 +993,11 @@ ; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_split1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2048 glc +; GFX9-NEXT: s_add_u32 s0, s0, 0x800 +; GFX9-NEXT: s_addc_u32 s1, s1, 2 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1002,10 +1005,11 @@ ; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_split1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v0, s0, 0x800, s0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc +; GFX10-NEXT: s_add_u32 s0, s0, 0x800 +; GFX10-NEXT: s_addc_u32 s1, s1, 2 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1020,11 +1024,11 @@ ; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_split0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 glc +; GFX9-NEXT: s_add_u32 s0, s0, 0xfff +; GFX9-NEXT: s_addc_u32 s1, s1, 2 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1032,10 +1036,11 @@ ; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_split0: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v0, s0, 0x800, s0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 glc dlc +; GFX10-NEXT: s_add_u32 s0, s0, 0xfff +; GFX10-NEXT: s_addc_u32 s1, s1, 2 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1050,12 +1055,11 @@ ; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_split1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc +; GFX9-NEXT: s_add_u32 s0, s0, 0x1000 +; GFX9-NEXT: s_addc_u32 s1, s1, 2 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1063,10 +1067,11 @@ ; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_split1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v0, s0, 0x1000, s0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc +; GFX10-NEXT: s_add_u32 s0, s0, 0x1000 +; GFX10-NEXT: s_addc_u32 s1, s1, 2 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1081,12 +1086,11 @@ ; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_split0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 glc +; GFX9-NEXT: s_add_u32 s0, s0, 0x1fff +; GFX9-NEXT: s_addc_u32 s1, s1, 2 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1094,10 +1098,11 @@ ; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_split0: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v0, s0, 0x1800, s0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 glc dlc +; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff +; GFX10-NEXT: s_addc_u32 s1, s1, 2 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1112,12 +1117,11 @@ ; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_split1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc +; GFX9-NEXT: s_add_u32 s0, s0, 0x2000 +; GFX9-NEXT: s_addc_u32 s1, s1, 2 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1125,10 +1129,11 @@ ; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_split1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v0, s0, 0x2000, s0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc +; GFX10-NEXT: s_add_u32 s0, s0, 0x2000 +; GFX10-NEXT: s_addc_u32 s1, s1, 2 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1143,13 +1148,11 @@ ; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2049 glc +; GFX9-NEXT: s_add_u32 s0, s0, 0x7ff +; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1157,11 +1160,11 @@ ; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, s0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 glc dlc +; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff +; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1176,13 +1179,11 @@ ; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048 glc +; GFX9-NEXT: s_add_u32 s0, s0, 0x800 +; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1190,11 +1191,11 @@ ; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, s0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc +; GFX10-NEXT: s_add_u32 s0, s0, 0x800 +; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1209,13 +1210,11 @@ ; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 glc +; GFX9-NEXT: s_add_u32 s0, s0, 0xfff +; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1223,11 +1222,11 @@ ; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, s0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 glc dlc +; GFX10-NEXT: s_add_u32 s0, s0, 0xfff +; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1242,13 +1241,11 @@ ; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc +; GFX9-NEXT: s_add_u32 s0, s0, 0x1000 +; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1256,11 +1253,11 @@ ; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, s0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc +; GFX10-NEXT: s_add_u32 s0, s0, 0x1000 +; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1275,13 +1272,11 @@ ; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 glc +; GFX9-NEXT: s_add_u32 s0, s0, 0x1fff +; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1289,11 +1284,11 @@ ; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, s0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 glc dlc +; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff +; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -1308,13 +1303,11 @@ ; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc +; GFX9-NEXT: s_add_u32 s0, s0, 0x2000 +; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -1322,11 +1315,11 @@ ; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, s0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc +; GFX10-NEXT: s_add_u32 s0, s0, 0x2000 +; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm