diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1814,32 +1814,41 @@ MachineFunction &MF = B.getMF(); const GCNSubtarget &ST = MF.getSubtarget(); const LLT S32 = LLT::scalar(32); + const LLT S64 = LLT::scalar(64); assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); if (ST.hasApertureRegs()) { - // FIXME: Use inline constants (src_{shared, private}_base) instead of - // getreg. - unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? - AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : - AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; - unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? - AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : - AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; - unsigned Encoding = - AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | - Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | - WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; - - Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - - B.buildInstr(AMDGPU::S_GETREG_B32) - .addDef(GetReg) - .addImm(Encoding); - MRI.setType(GetReg, S32); - - auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); - return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); + // Note: this feature (register) is broken. When used as a 32-bit operand, + // it returns a wrong value (all zeroes?). The real value is in the upper 32 + // bits. + // + // To work around the issue, directly emit a 64 bit mov from this register + // then extract the high bits. Note that this shouldn't even result in a + // shift being emitted and simply become a pair of registers (e.g.): + // s_mov_b64 s[6:7], src_shared_base + // v_mov_b32_e32 v1, s7 + const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS) + ? AMDGPU::SRC_SHARED_BASE + : AMDGPU::SRC_PRIVATE_BASE; + + // FIXME: SRC_SHARED_BASE is 32 bit, not 64, so verifier complains. + /* + *** Bad machine code: Illegal physical register for instruction *** + - function: is_private_vgpr + - basic block: %bb.1 (0x561e85f7c6d0) + - instruction: %21:sreg_64_xexec(s64) = S_MOV_B64 $src_private_base + - operand 1: $src_private_base + $src_private_base is not a SReg_64 register. + LLVM ERROR: Found 1 machine code errors. + */ + Register MovDst = MRI.createGenericVirtualRegister(S64); + MRI.setRegClass(MovDst, &AMDGPU::SReg_64_XEXECRegClass); + B.buildInstr(AMDGPU::S_MOV_B64) + .addDef(MovDst) + .addReg(Register(ApertureRegNo)); + Register Srl = B.buildLShr(S64, MovDst, B.buildConstant(S64, 32)).getReg(0); + return B.buildTrunc(S32, Srl).getReg(0); } // TODO: can we be smarter about machine pointer info? diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -420,9 +420,6 @@ OFFSET_MASK_ = (((1 << OFFSET_WIDTH_) - 1) << OFFSET_SHIFT_), OFFSET_MEM_VIOL = 8, - - OFFSET_SRC_SHARED_BASE = 16, - OFFSET_SRC_PRIVATE_BASE = 0 }; enum WidthMinusOne : unsigned { // WidthMinusOne, (5) [15:11] @@ -430,9 +427,6 @@ WIDTH_M1_SHIFT_ = 11, WIDTH_M1_WIDTH_ = 5, WIDTH_M1_MASK_ = (((1 << WIDTH_M1_WIDTH_) - 1) << WIDTH_M1_SHIFT_), - - WIDTH_M1_SRC_SHARED_BASE = 15, - WIDTH_M1_SRC_PRIVATE_BASE = 15 }; // Some values from WidthMinusOne mapped into Width domain. diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -5503,24 +5503,25 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL, SelectionDAG &DAG) const { - // FIXME: Use inline constants (src_{shared, private}_base) instead. if (Subtarget->hasApertureRegs()) { - unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? - AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : - AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; - unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? - AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : - AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; - unsigned Encoding = - AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | - Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | - WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; - - SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16); - SDValue ApertureReg = SDValue( - DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0); - SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32); - return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount); + const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS) + ? AMDGPU::SRC_SHARED_BASE + : AMDGPU::SRC_PRIVATE_BASE; + // Note: this feature (register) is broken. When used as a 32-bit operand, + // it returns a wrong value (all zeroes?). The real value is in the upper 32 + // bits. + // + // To work around the issue, directly emit a 64 bit mov from this register + // then extract the high bits. Note that this shouldn't even result in a + // shift being emitted and simply become a pair of registers (e.g.): + // s_mov_b64 s[6:7], src_shared_base + // v_mov_b32_e32 v1, s7 + SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, + DAG.getRegister(ApertureRegNo, MVT::i32)); + return DAG.getNode( + ISD::TRUNCATE, DL, MVT::i32, + DAG.getNode(ISD::SRL, DL, MVT::i64, + {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)})); } // For code object version 5, private_base and shared_base are passed through diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll @@ -10,11 +10,9 @@ ; MESA-LABEL: amdgpu_ps: ; MESA: ; %bb.0: ; MESA-NEXT: s_add_u32 flat_scratch_lo, s2, s4 -; MESA-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) ; MESA-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 -; MESA-NEXT: s_lshl_b32 s0, s0, 16 +; MESA-NEXT: v_mov_b32_e32 v1, src_private_base ; MESA-NEXT: v_mov_b32_e32 v0, 4 -; MESA-NEXT: v_mov_b32_e32 v1, s0 ; MESA-NEXT: v_mov_b32_e32 v2, 0 ; MESA-NEXT: flat_store_dword v[0:1], v2 ; MESA-NEXT: s_waitcnt vmcnt(0) @@ -25,15 +23,13 @@ ; PAL-NEXT: s_getpc_b64 s[2:3] ; PAL-NEXT: s_mov_b32 s2, s0 ; PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; PAL-NEXT: v_mov_b32_e32 v1, src_private_base ; PAL-NEXT: v_mov_b32_e32 v0, 4 ; PAL-NEXT: v_mov_b32_e32 v2, 0 ; PAL-NEXT: s_waitcnt lgkmcnt(0) ; PAL-NEXT: s_and_b32 s3, s3, 0xffff ; PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s0 -; PAL-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) ; PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 -; PAL-NEXT: s_lshl_b32 s0, s0, 16 -; PAL-NEXT: v_mov_b32_e32 v1, s0 ; PAL-NEXT: flat_store_dword v[0:1], v2 ; PAL-NEXT: s_waitcnt vmcnt(0) ; PAL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll @@ -84,78 +84,72 @@ ; ; GFX9V3-LABEL: addrspacecast: ; GFX9V3: ; %bb.0: -; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9V3-NEXT: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX9V3-NEXT: s_lshl_b32 s3, s2, 16 -; GFX9V3-NEXT: s_getreg_b32 s4, hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; GFX9V3-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9V3-NEXT: s_mov_b32 s1, src_private_base +; GFX9V3-NEXT: s_mov_b32 s3, src_shared_base ; GFX9V3-NEXT: v_mov_b32_e32 v2, 1 ; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V3-NEXT: s_mov_b32 s2, s0 -; GFX9V3-NEXT: s_cmp_lg_u32 s0, -1 +; GFX9V3-NEXT: s_mov_b32 s0, s4 +; GFX9V3-NEXT: s_cmp_lg_u32 s4, -1 +; GFX9V3-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 +; GFX9V3-NEXT: s_mov_b32 s2, s5 +; GFX9V3-NEXT: s_cmp_lg_u32 s5, -1 +; GFX9V3-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V3-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX9V3-NEXT: s_lshl_b32 s5, s4, 16 -; GFX9V3-NEXT: s_mov_b32 s4, s1 -; GFX9V3-NEXT: s_cmp_lg_u32 s1, -1 -; GFX9V3-NEXT: v_mov_b32_e32 v0, s2 -; GFX9V3-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 -; GFX9V3-NEXT: v_mov_b32_e32 v1, s3 +; GFX9V3-NEXT: v_mov_b32_e32 v1, s1 ; GFX9V3-NEXT: flat_store_dword v[0:1], v2 ; GFX9V3-NEXT: s_waitcnt vmcnt(0) -; GFX9V3-NEXT: v_mov_b32_e32 v0, s0 +; GFX9V3-NEXT: v_mov_b32_e32 v0, s2 ; GFX9V3-NEXT: v_mov_b32_e32 v2, 2 -; GFX9V3-NEXT: v_mov_b32_e32 v1, s1 +; GFX9V3-NEXT: v_mov_b32_e32 v1, s3 ; GFX9V3-NEXT: flat_store_dword v[0:1], v2 ; GFX9V3-NEXT: s_waitcnt vmcnt(0) ; GFX9V3-NEXT: s_endpgm ; ; GFX9V4-LABEL: addrspacecast: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9V4-NEXT: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX9V4-NEXT: s_lshl_b32 s3, s2, 16 -; GFX9V4-NEXT: s_getreg_b32 s4, hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; GFX9V4-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9V4-NEXT: s_mov_b32 s1, src_private_base +; GFX9V4-NEXT: s_mov_b32 s3, src_shared_base ; GFX9V4-NEXT: v_mov_b32_e32 v2, 1 ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V4-NEXT: s_mov_b32 s2, s0 -; GFX9V4-NEXT: s_cmp_lg_u32 s0, -1 +; GFX9V4-NEXT: s_mov_b32 s0, s4 +; GFX9V4-NEXT: s_cmp_lg_u32 s4, -1 +; GFX9V4-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 +; GFX9V4-NEXT: s_mov_b32 s2, s5 +; GFX9V4-NEXT: s_cmp_lg_u32 s5, -1 +; GFX9V4-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V4-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX9V4-NEXT: s_lshl_b32 s5, s4, 16 -; GFX9V4-NEXT: s_mov_b32 s4, s1 -; GFX9V4-NEXT: s_cmp_lg_u32 s1, -1 -; GFX9V4-NEXT: v_mov_b32_e32 v0, s2 -; GFX9V4-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 -; GFX9V4-NEXT: v_mov_b32_e32 v1, s3 +; GFX9V4-NEXT: v_mov_b32_e32 v1, s1 ; GFX9V4-NEXT: flat_store_dword v[0:1], v2 ; GFX9V4-NEXT: s_waitcnt vmcnt(0) -; GFX9V4-NEXT: v_mov_b32_e32 v0, s0 +; GFX9V4-NEXT: v_mov_b32_e32 v0, s2 ; GFX9V4-NEXT: v_mov_b32_e32 v2, 2 -; GFX9V4-NEXT: v_mov_b32_e32 v1, s1 +; GFX9V4-NEXT: v_mov_b32_e32 v1, s3 ; GFX9V4-NEXT: flat_store_dword v[0:1], v2 ; GFX9V4-NEXT: s_waitcnt vmcnt(0) ; GFX9V4-NEXT: s_endpgm ; ; GFX9V5-LABEL: addrspacecast: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9V5-NEXT: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX9V5-NEXT: s_lshl_b32 s3, s2, 16 -; GFX9V5-NEXT: s_getreg_b32 s4, hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; GFX9V5-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9V5-NEXT: s_mov_b32 s1, src_private_base +; GFX9V5-NEXT: s_mov_b32 s3, src_shared_base ; GFX9V5-NEXT: v_mov_b32_e32 v2, 1 ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V5-NEXT: s_mov_b32 s2, s0 -; GFX9V5-NEXT: s_cmp_lg_u32 s0, -1 +; GFX9V5-NEXT: s_mov_b32 s0, s4 +; GFX9V5-NEXT: s_cmp_lg_u32 s4, -1 +; GFX9V5-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 +; GFX9V5-NEXT: s_mov_b32 s2, s5 +; GFX9V5-NEXT: s_cmp_lg_u32 s5, -1 +; GFX9V5-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V5-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX9V5-NEXT: s_lshl_b32 s5, s4, 16 -; GFX9V5-NEXT: s_mov_b32 s4, s1 -; GFX9V5-NEXT: s_cmp_lg_u32 s1, -1 -; GFX9V5-NEXT: v_mov_b32_e32 v0, s2 -; GFX9V5-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 -; GFX9V5-NEXT: v_mov_b32_e32 v1, s3 +; GFX9V5-NEXT: v_mov_b32_e32 v1, s1 ; GFX9V5-NEXT: flat_store_dword v[0:1], v2 ; GFX9V5-NEXT: s_waitcnt vmcnt(0) -; GFX9V5-NEXT: v_mov_b32_e32 v0, s0 +; GFX9V5-NEXT: v_mov_b32_e32 v0, s2 ; GFX9V5-NEXT: v_mov_b32_e32 v2, 2 -; GFX9V5-NEXT: v_mov_b32_e32 v1, s1 +; GFX9V5-NEXT: v_mov_b32_e32 v1, s3 ; GFX9V5-NEXT: flat_store_dword v[0:1], v2 ; GFX9V5-NEXT: s_waitcnt vmcnt(0) ; GFX9V5-NEXT: s_endpgm @@ -210,9 +204,7 @@ ; GFX9V3: ; %bb.0: ; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V3-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX9V3-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9V3-NEXT: s_cmp_eq_u32 s1, s0 +; GFX9V3-NEXT: s_cmp_eq_u32 s1, src_shared_base ; GFX9V3-NEXT: s_cselect_b32 s0, 1, 0 ; GFX9V3-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V3-NEXT: global_store_dword v[0:1], v0, off @@ -223,9 +215,7 @@ ; GFX9V4: ; %bb.0: ; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V4-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX9V4-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9V4-NEXT: s_cmp_eq_u32 s1, s0 +; GFX9V4-NEXT: s_cmp_eq_u32 s1, src_shared_base ; GFX9V4-NEXT: s_cselect_b32 s0, 1, 0 ; GFX9V4-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V4-NEXT: global_store_dword v[0:1], v0, off @@ -236,9 +226,7 @@ ; GFX9V5: ; %bb.0: ; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V5-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX9V5-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9V5-NEXT: s_cmp_eq_u32 s1, s0 +; GFX9V5-NEXT: s_cmp_eq_u32 s1, src_shared_base ; GFX9V5-NEXT: s_cselect_b32 s0, 1, 0 ; GFX9V5-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V5-NEXT: global_store_dword v[0:1], v0, off @@ -294,9 +282,7 @@ ; GFX9V3: ; %bb.0: ; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V3-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX9V3-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9V3-NEXT: s_cmp_eq_u32 s1, s0 +; GFX9V3-NEXT: s_cmp_eq_u32 s1, src_private_base ; GFX9V3-NEXT: s_cselect_b32 s0, 1, 0 ; GFX9V3-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V3-NEXT: global_store_dword v[0:1], v0, off @@ -307,9 +293,7 @@ ; GFX9V4: ; %bb.0: ; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V4-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX9V4-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9V4-NEXT: s_cmp_eq_u32 s1, s0 +; GFX9V4-NEXT: s_cmp_eq_u32 s1, src_private_base ; GFX9V4-NEXT: s_cselect_b32 s0, 1, 0 ; GFX9V4-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V4-NEXT: global_store_dword v[0:1], v0, off @@ -320,9 +304,7 @@ ; GFX9V5: ; %bb.0: ; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V5-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX9V5-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9V5-NEXT: s_cmp_eq_u32 s1, s0 +; GFX9V5-NEXT: s_cmp_eq_u32 s1, src_private_base ; GFX9V5-NEXT: s_cselect_b32 s0, 1, 0 ; GFX9V5-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V5-NEXT: global_store_dword v[0:1], v0, off diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir @@ -228,15 +228,13 @@ ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[S_GETREG_B32_:%[0-9]+]]:sreg_32(s32) = S_GETREG_B32 30735 - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[S_GETREG_B32_]], [[C]](s32) + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $src_private_base ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p5) - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[SHL]](s32) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(p5) = G_CONSTANT i32 -1 - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(p0) = G_CONSTANT i64 0 - ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY]](p5), [[C1]] - ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C2]] + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[COPY1]](s32) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(p5) = G_CONSTANT i32 -1 + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0 + ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY]](p5), [[C]] + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C1]] ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](p0) ; SI-LABEL: name: test_addrspacecast_p5_to_p0 ; SI: liveins: $vgpr0 @@ -323,15 +321,13 @@ ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; GFX9-NEXT: [[S_GETREG_B32_:%[0-9]+]]:sreg_32(s32) = S_GETREG_B32 31759 - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[S_GETREG_B32_]], [[C]](s32) + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $src_shared_base ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p3) - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[SHL]](s32) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1 - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(p0) = G_CONSTANT i64 0 - ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY]](p3), [[C1]] - ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C2]] + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[COPY1]](s32) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1 + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0 + ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY]](p3), [[C]] + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C1]] ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](p0) ; SI-LABEL: name: test_addrspacecast_p3_to_p0 ; SI: liveins: $vgpr0 @@ -568,21 +564,18 @@ ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(p3), [[UV1:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[COPY]](<2 x p3>) - ; GFX9-NEXT: [[S_GETREG_B32_:%[0-9]+]]:sreg_32(s32) = S_GETREG_B32 31759 - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[S_GETREG_B32_]], [[C]](s32) + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $src_shared_base ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV]](p3) - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[SHL]](s32) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1 - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(p0) = G_CONSTANT i64 0 - ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV]](p3), [[C1]] - ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C2]] - ; GFX9-NEXT: [[S_GETREG_B32_1:%[0-9]+]]:sreg_32(s32) = S_GETREG_B32 31759 - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[S_GETREG_B32_1]], [[C]](s32) + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[COPY1]](s32) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1 + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0 + ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV]](p3), [[C]] + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C1]] + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $src_shared_base ; GFX9-NEXT: [[PTRTOINT1:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV1]](p3) - ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT1]](s32), [[SHL1]](s32) - ; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV1]](p3), [[C1]] - ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(p0) = G_SELECT [[ICMP1]](s1), [[MV1]], [[C2]] + ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT1]](s32), [[COPY2]](s32) + ; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV1]](p3), [[C]] + ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(p0) = G_SELECT [[ICMP1]](s1), [[MV1]], [[C1]] ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p0>) = G_BUILD_VECTOR [[SELECT]](p0), [[SELECT1]](p0) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p0>) ; SI-LABEL: name: test_addrspacecast_v2p3_to_v2p0 @@ -778,13 +771,11 @@ ; VI-NEXT: $vgpr0_vgpr1 = COPY [[COPY2]](p0) ; GFX9-LABEL: name: test_addrspacecast_p5_fi_to_p0 ; GFX9: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0 - ; GFX9-NEXT: [[S_GETREG_B32_:%[0-9]+]]:sreg_32(s32) = S_GETREG_B32 30735 - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[S_GETREG_B32_]], [[C]](s32) + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $src_private_base ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[FRAME_INDEX]](p5) - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[SHL]](s32) - ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY [[MV]](p0) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[COPY]](p0) + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[COPY]](s32) + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY [[MV]](p0) + ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[COPY1]](p0) ; SI-LABEL: name: test_addrspacecast_p5_fi_to_p0 ; SI: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0 ; SI-NEXT: [[ADDRSPACE_CAST:%[0-9]+]]:_(p0) = G_ADDRSPACE_CAST [[FRAME_INDEX]](p5) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll @@ -32,9 +32,7 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX9-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s0, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, src_private_base, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -46,10 +44,7 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, src_private_base, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: global_store_dword v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -61,10 +56,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s0, s0, 16 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s0, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, src_private_base, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -98,9 +90,7 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX9-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9-NEXT: s_cmp_lg_u32 s1, s0 +; GFX9-NEXT: s_cmp_lg_u32 s1, src_private_base ; GFX9-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX9-NEXT: ; %bb.1: ; %bb0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -113,9 +103,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_cmp_lg_u32 s1, s0 +; GFX10-NEXT: s_cmp_lg_u32 s1, src_private_base ; GFX10-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX10-NEXT: ; %bb.1: ; %bb0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -128,10 +116,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s0, s0, 16 -; GFX11-NEXT: s_cmp_lg_u32 s1, s0 +; GFX11-NEXT: s_cmp_lg_u32 s1, src_private_base ; GFX11-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX11-NEXT: ; %bb.1: ; %bb0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll @@ -32,9 +32,7 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX9-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s0, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, src_shared_base, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -46,10 +44,7 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s0, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, src_shared_base, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: global_store_dword v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -61,10 +56,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s0, s0, 16 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s0, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, src_shared_base, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -98,9 +90,7 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX9-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9-NEXT: s_cmp_lg_u32 s1, s0 +; GFX9-NEXT: s_cmp_lg_u32 s1, src_shared_base ; GFX9-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX9-NEXT: ; %bb.1: ; %bb0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -113,9 +103,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_cmp_lg_u32 s1, s0 +; GFX10-NEXT: s_cmp_lg_u32 s1, src_shared_base ; GFX10-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX10-NEXT: ; %bb.1: ; %bb0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -128,10 +116,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s0, s0, 16 -; GFX11-NEXT: s_cmp_lg_u32 s1, s0 +; GFX11-NEXT: s_cmp_lg_u32 s1, src_shared_base ; GFX11-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX11-NEXT: ; %bb.1: ; %bb0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-known-non-null.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-known-non-null.ll --- a/llvm/test/CodeGen/AMDGPU/addrspacecast-known-non-null.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-known-non-null.ll @@ -10,10 +10,8 @@ } ; CHECK-LABEL: {{^}}cast_alloca: -; CHECK: s_getreg_b32 [[GETREG:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; CHECK: s_lshl_b32 [[APERTURE:s[0-9]+]], [[GETREG]], 16 -; CHECK: v_lshrrev_b32_e64 v0, 6, s33 -; CHECK-NEXT: v_mov_b32_e32 v1, [[APERTURE]] +; CHECK: s_mov_b64 s[{{[0-9]+}}:[[HIREG:[0-9]+]]], src_private_base +; CHECK: v_mov_b32_e32 v1, s[[HIREG]] ; CHECK-NOT: v0 ; CHECK-NOT: v1 define void @cast_alloca() { @@ -26,10 +24,9 @@ @lds = internal unnamed_addr addrspace(3) global i8 undef, align 4 ; CHECK-LABEL: {{^}}cast_lds_gv: -; CHECK: s_getreg_b32 [[GETREG:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; CHECK: s_lshl_b32 [[APERTURE:s[0-9]+]], [[GETREG]], 16 +; CHECK: s_mov_b64 s[{{[0-9]+}}:[[HIREG:[0-9]+]]], src_shared_base ; CHECK: v_mov_b32_e32 v0, 0 -; CHECK: v_mov_b32_e32 v1, [[APERTURE]] +; CHECK: v_mov_b32_e32 v1, s[[HIREG]] ; CHECK-NOT: v0 ; CHECK-NOT: v1 define void @cast_lds_gv() { @@ -55,20 +52,18 @@ } ; CHECK-LABEL: {{^}}cast_constant_lds_other_gv: -; CHECK: s_getreg_b32 [[GETREG:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; CHECK: s_lshl_b32 [[APERTURE:s[0-9]+]], [[GETREG]], 16 +; CHECK: s_mov_b64 s[{{[0-9]+}}:[[HIREG:[0-9]+]]], src_shared_base ; CHECK: v_mov_b32_e32 v0, 0x7b -; CHECK: v_mov_b32_e32 v1, [[APERTURE]] +; CHECK: v_mov_b32_e32 v1, s[[HIREG]] define void @cast_constant_lds_other_gv() { call void @flat_user(i8* addrspacecast (i8 addrspace(3)* inttoptr (i32 123 to i8 addrspace(3)*) to i8*)) ret void } ; CHECK-LABEL: {{^}}cast_constant_private_other_gv: -; CHECK: s_getreg_b32 [[GETREG:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; CHECK: s_lshl_b32 [[APERTURE:s[0-9]+]], [[GETREG]], 16 +; CHECK: s_mov_b64 s[{{[0-9]+}}:[[HIREG:[0-9]+]]], src_private_base ; CHECK: v_mov_b32_e32 v0, 0x7b -; CHECK: v_mov_b32_e32 v1, [[APERTURE]] +; CHECK: v_mov_b32_e32 v1, s[[HIREG]] define void @cast_constant_private_other_gv() { call void @flat_user(i8* addrspacecast (i8 addrspace(5)* inttoptr (i32 123 to i8 addrspace(5)*) to i8*)) ret void diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll --- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll @@ -15,12 +15,9 @@ ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 ; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}} -; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16 -; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base ; GFX9: s_cmp_lg_u32 [[PTR]], -1 -; GFX9-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[SSRC_SHARED_BASE]], 0 +; GFX9-DAG: s_cselect_b32 s[[HI:[0-9]+]], src_shared_base, 0 ; GFX9-DAG: s_cselect_b32 s[[LO:[0-9]+]], [[PTR]], 0 ; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]] @@ -44,11 +41,8 @@ ; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, v0 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 -; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16 -; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_SHARED_BASE]] -; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base +; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base ; GFX9-DAG: v_cmp_ne_u32_e32 vcc, -1, v0 ; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, v0, vcc ; GFX9-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc @@ -75,14 +69,10 @@ ; CI-DAG: s_cselect_b32 s[[LO:[0-9]+]], [[PTR]], 0 ; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}} -; GFX9-DAG: s_getreg_b32 [[SSRC_PRIVATE:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX9-DAG: s_lshl_b32 [[SSRC_PRIVATE_BASE:s[0-9]+]], [[SSRC_PRIVATE]], 16 - -; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_private_base ; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 ; GFX9: s_cmp_lg_u32 [[PTR]], -1 -; GFX9: s_cselect_b32 s[[HI:[0-9]+]], [[SSRC_PRIVATE_BASE]], 0 +; GFX9: s_cselect_b32 s[[HI:[0-9]+]], src_private_base, 0 ; GFX9: s_cselect_b32 s[[LO:[0-9]+]], [[PTR]], 0 ; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]] @@ -213,11 +203,8 @@ ; HSA-LABEL: {{^}}cast_0_group_to_flat_addrspacecast: ; CI: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10 ; CI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]] -; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16 -; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SSRC_SHARED_BASE]] -; GFX9-XXX: v_mov_b32_e32 v[[HI:[0-9]+]], src_shared_base +; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], src_shared_base ; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}} @@ -263,11 +250,8 @@ ; HSA-LABEL: {{^}}cast_0_private_to_flat_addrspacecast: ; CI: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11 ; CI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]] -; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16 -; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SSRC_SHARED_BASE]] -; GFX9-XXX: v_mov_b32_e32 v[[HI:[0-9]+]], src_shared_base +; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], src_private_base ; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll @@ -134,17 +134,13 @@ ; GFX90A-LABEL: syncscope_workgroup_rtn: ; GFX90A: ; %bb.0: ; %atomicrmw.check.shared ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_getreg_b32 s4, hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX90A-NEXT: s_lshl_b32 s4, s4, 16 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, src_shared_base, v1 ; GFX90A-NEXT: ; implicit-def: $vgpr3 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB1_6 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private -; GFX90A-NEXT: s_getreg_b32 s6, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX90A-NEXT: s_lshl_b32 s6, s6, 16 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s6, v1 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, src_private_base, v1 ; GFX90A-NEXT: ; implicit-def: $vgpr3 ; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] @@ -206,9 +202,7 @@ ; GFX908-LABEL: syncscope_workgroup_nortn: ; GFX908: ; %bb.0: ; %atomicrmw.check.shared ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_getreg_b32 s4, hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX908-NEXT: s_lshl_b32 s4, s4, 16 -; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, src_shared_base, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB2_3 @@ -220,9 +214,7 @@ ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; GFX908-NEXT: .LBB2_3: ; %atomicrmw.check.private -; GFX908-NEXT: s_getreg_b32 s6, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX908-NEXT: s_lshl_b32 s6, s6, 16 -; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s6, v1 +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, src_private_base, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; GFX908-NEXT: s_cbranch_execz .LBB2_5 @@ -260,9 +252,7 @@ ; GFX90A-LABEL: syncscope_workgroup_nortn: ; GFX90A: ; %bb.0: ; %atomicrmw.check.shared ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_getreg_b32 s4, hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX90A-NEXT: s_lshl_b32 s4, s4, 16 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, src_shared_base, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_3 @@ -274,9 +264,7 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; GFX90A-NEXT: .LBB2_3: ; %atomicrmw.check.private -; GFX90A-NEXT: s_getreg_b32 s6, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX90A-NEXT: s_lshl_b32 s6, s6, 16 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s6, v1 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, src_private_base, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execz .LBB2_5 diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll @@ -39,9 +39,9 @@ ; GCN-LABEL: {{^}}use_queue_ptr_addrspacecast: ; CIVI: s_load_dword [[APERTURE_LOAD:s[0-9]+]], s[6:7], 0x10 -; GFX9: s_getreg_b32 [[APERTURE_LOAD:s[0-9]+]] ; CIVI: v_mov_b32_e32 v[[LO:[0-9]+]], 16 -; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE_LOAD]] +; CIVI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE_LOAD]] +; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], src_shared_base ; GFX9: {{flat|global}}_store_dword v{{\[[0-9]+}}:[[HI]]] ; CIVI: {{flat|global}}_store_dword v[[[LO]]:[[HI]]] define hidden void @use_queue_ptr_addrspacecast() #1 { diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll @@ -11,22 +11,18 @@ ; FLAT_SCR_OPT-NEXT: s_addc_u32 s1, s1, 0 ; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; FLAT_SCR_OPT-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) +; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v1, src_private_base ; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v0, 4 -; FLAT_SCR_OPT-NEXT: s_lshl_b32 s0, s0, 16 ; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v2, 0 -; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v1, s0 ; FLAT_SCR_OPT-NEXT: flat_store_dword v[0:1], v2 ; FLAT_SCR_OPT-NEXT: s_waitcnt_vscnt null, 0x0 ; FLAT_SCR_OPT-NEXT: s_endpgm ; ; FLAT_SCR_ARCH-LABEL: stack_object_addrspacecast_in_kernel_no_calls: ; FLAT_SCR_ARCH: ; %bb.0: -; FLAT_SCR_ARCH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) +; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v1, src_private_base ; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v0, 4 -; FLAT_SCR_ARCH-NEXT: s_lshl_b32 s0, s0, 16 ; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v2, 0 -; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v1, s0 ; FLAT_SCR_ARCH-NEXT: flat_store_dword v[0:1], v2 ; FLAT_SCR_ARCH-NEXT: s_waitcnt_vscnt null, 0x0 ; FLAT_SCR_ARCH-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll --- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll @@ -80,19 +80,15 @@ ; GFX9V3-LABEL: addrspacecast: ; GFX9V3: ; %bb.0: ; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9V3-NEXT: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX9V3-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9V3-NEXT: v_mov_b32_e32 v4, 1 ; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V3-NEXT: s_cmp_lg_u32 s0, -1 +; GFX9V3-NEXT: s_cselect_b32 s2, src_private_base, 0 ; GFX9V3-NEXT: s_cselect_b32 s0, s0, 0 -; GFX9V3-NEXT: v_mov_b32_e32 v0, s0 -; GFX9V3-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX9V3-NEXT: s_cselect_b32 s2, s2, 0 -; GFX9V3-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9V3-NEXT: s_cmp_lg_u32 s1, -1 +; GFX9V3-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V3-NEXT: v_mov_b32_e32 v1, s2 -; GFX9V3-NEXT: s_cselect_b32 s0, s0, 0 +; GFX9V3-NEXT: s_cselect_b32 s0, src_shared_base, 0 ; GFX9V3-NEXT: s_cselect_b32 s1, s1, 0 ; GFX9V3-NEXT: v_mov_b32_e32 v2, s1 ; GFX9V3-NEXT: v_mov_b32_e32 v3, s0 @@ -106,19 +102,15 @@ ; GFX9V4-LABEL: addrspacecast: ; GFX9V4: ; %bb.0: ; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9V4-NEXT: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX9V4-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9V4-NEXT: v_mov_b32_e32 v4, 1 ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V4-NEXT: s_cmp_lg_u32 s0, -1 +; GFX9V4-NEXT: s_cselect_b32 s2, src_private_base, 0 ; GFX9V4-NEXT: s_cselect_b32 s0, s0, 0 -; GFX9V4-NEXT: v_mov_b32_e32 v0, s0 -; GFX9V4-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX9V4-NEXT: s_cselect_b32 s2, s2, 0 -; GFX9V4-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9V4-NEXT: s_cmp_lg_u32 s1, -1 +; GFX9V4-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V4-NEXT: v_mov_b32_e32 v1, s2 -; GFX9V4-NEXT: s_cselect_b32 s0, s0, 0 +; GFX9V4-NEXT: s_cselect_b32 s0, src_shared_base, 0 ; GFX9V4-NEXT: s_cselect_b32 s1, s1, 0 ; GFX9V4-NEXT: v_mov_b32_e32 v2, s1 ; GFX9V4-NEXT: v_mov_b32_e32 v3, s0 @@ -132,19 +124,15 @@ ; GFX9V5-LABEL: addrspacecast: ; GFX9V5: ; %bb.0: ; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9V5-NEXT: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX9V5-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9V5-NEXT: v_mov_b32_e32 v4, 1 ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V5-NEXT: s_cmp_lg_u32 s0, -1 +; GFX9V5-NEXT: s_cselect_b32 s2, src_private_base, 0 ; GFX9V5-NEXT: s_cselect_b32 s0, s0, 0 -; GFX9V5-NEXT: v_mov_b32_e32 v0, s0 -; GFX9V5-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX9V5-NEXT: s_cselect_b32 s2, s2, 0 -; GFX9V5-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9V5-NEXT: s_cmp_lg_u32 s1, -1 +; GFX9V5-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V5-NEXT: v_mov_b32_e32 v1, s2 -; GFX9V5-NEXT: s_cselect_b32 s0, s0, 0 +; GFX9V5-NEXT: s_cselect_b32 s0, src_shared_base, 0 ; GFX9V5-NEXT: s_cselect_b32 s1, s1, 0 ; GFX9V5-NEXT: v_mov_b32_e32 v2, s1 ; GFX9V5-NEXT: v_mov_b32_e32 v3, s0 @@ -201,10 +189,8 @@ ; GFX9V3-LABEL: llvm_amdgcn_is_shared: ; GFX9V3: ; %bb.0: ; GFX9V3-NEXT: s_load_dword s0, s[4:5], 0x4 -; GFX9V3-NEXT: s_getreg_b32 s1, hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX9V3-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V3-NEXT: s_cmp_eq_u32 s0, s1 +; GFX9V3-NEXT: s_cmp_eq_u32 s0, src_shared_base ; GFX9V3-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9V3-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX9V3-NEXT: global_store_dword v[0:1], v0, off @@ -214,10 +200,8 @@ ; GFX9V4-LABEL: llvm_amdgcn_is_shared: ; GFX9V4: ; %bb.0: ; GFX9V4-NEXT: s_load_dword s0, s[4:5], 0x4 -; GFX9V4-NEXT: s_getreg_b32 s1, hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX9V4-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V4-NEXT: s_cmp_eq_u32 s0, s1 +; GFX9V4-NEXT: s_cmp_eq_u32 s0, src_shared_base ; GFX9V4-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX9V4-NEXT: global_store_dword v[0:1], v0, off @@ -227,10 +211,8 @@ ; GFX9V5-LABEL: llvm_amdgcn_is_shared: ; GFX9V5: ; %bb.0: ; GFX9V5-NEXT: s_load_dword s0, s[4:5], 0x4 -; GFX9V5-NEXT: s_getreg_b32 s1, hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX9V5-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V5-NEXT: s_cmp_eq_u32 s0, s1 +; GFX9V5-NEXT: s_cmp_eq_u32 s0, src_shared_base ; GFX9V5-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX9V5-NEXT: global_store_dword v[0:1], v0, off @@ -282,10 +264,8 @@ ; GFX9V3-LABEL: llvm_amdgcn_is_private: ; GFX9V3: ; %bb.0: ; GFX9V3-NEXT: s_load_dword s0, s[4:5], 0x4 -; GFX9V3-NEXT: s_getreg_b32 s1, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX9V3-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V3-NEXT: s_cmp_eq_u32 s0, s1 +; GFX9V3-NEXT: s_cmp_eq_u32 s0, src_private_base ; GFX9V3-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9V3-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX9V3-NEXT: global_store_dword v[0:1], v0, off @@ -295,10 +275,8 @@ ; GFX9V4-LABEL: llvm_amdgcn_is_private: ; GFX9V4: ; %bb.0: ; GFX9V4-NEXT: s_load_dword s0, s[4:5], 0x4 -; GFX9V4-NEXT: s_getreg_b32 s1, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX9V4-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V4-NEXT: s_cmp_eq_u32 s0, s1 +; GFX9V4-NEXT: s_cmp_eq_u32 s0, src_private_base ; GFX9V4-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX9V4-NEXT: global_store_dword v[0:1], v0, off @@ -308,10 +286,8 @@ ; GFX9V5-LABEL: llvm_amdgcn_is_private: ; GFX9V5: ; %bb.0: ; GFX9V5-NEXT: s_load_dword s0, s[4:5], 0x4 -; GFX9V5-NEXT: s_getreg_b32 s1, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX9V5-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V5-NEXT: s_cmp_eq_u32 s0, s1 +; GFX9V5-NEXT: s_cmp_eq_u32 s0, src_private_base ; GFX9V5-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX9V5-NEXT: global_store_dword v[0:1], v0, off diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll @@ -4,9 +4,8 @@ ; GCN-LABEL: {{^}}is_private_vgpr: ; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[[0-9]+}}:[[PTR_HI:[0-9]+]]] ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11 -; GFX9-DAG: s_getreg_b32 [[APERTURE:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX9: s_lshl_b32 [[APERTURE]], [[APERTURE]], 16 -; GCN: v_cmp_eq_u32_e32 vcc, [[APERTURE]], v[[PTR_HI]] +; CI: v_cmp_eq_u32_e32 vcc, [[APERTURE]], v[[PTR_HI]] +; GFX9: v_cmp_eq_u32_e32 vcc, src_private_base, v[[PTR_HI]] ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc define amdgpu_kernel void @is_private_vgpr(i8* addrspace(1)* %ptr.ptr) { %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -23,13 +22,12 @@ ; GCN-LABEL: {{^}}is_private_sgpr: ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}} -; GFX9-DAG: s_getreg_b32 [[APERTURE:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 0, 16) ; CI-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x1{{$}} ; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[4:5], 0x4{{$}} -; GFX9: s_lshl_b32 [[APERTURE]], [[APERTURE]], 16 -; GCN: s_cmp_eq_u32 [[PTR_HI]], [[APERTURE]] +; CI: s_cmp_eq_u32 [[PTR_HI]], [[APERTURE]] +; GFX9: s_cmp_eq_u32 [[PTR_HI]], src_private_base ; GCN: s_cbranch_vccnz define amdgpu_kernel void @is_private_sgpr(i8* %ptr) { %val = call i1 @llvm.amdgcn.is.private(i8* %ptr) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll @@ -4,10 +4,9 @@ ; GCN-LABEL: {{^}}is_local_vgpr: ; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[[0-9]+}}:[[PTR_HI:[0-9]+]]] ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10 -; GFX9-DAG: s_getreg_b32 [[APERTURE:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX9: s_lshl_b32 [[APERTURE]], [[APERTURE]], 16 -; GCN: v_cmp_eq_u32_e32 vcc, [[APERTURE]], v[[PTR_HI]] +; GFX9: v_cmp_eq_u32_e32 vcc, src_shared_base, v[[PTR_HI]] +; CI: v_cmp_eq_u32_e32 vcc, [[APERTURE]], v[[PTR_HI]] ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc define amdgpu_kernel void @is_local_vgpr(i8* addrspace(1)* %ptr.ptr) { %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -24,13 +23,12 @@ ; GCN-LABEL: {{^}}is_local_sgpr: ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}} -; GFX9-DAG: s_getreg_b32 [[APERTURE:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX9-DAG: s_lshl_b32 [[APERTURE]], [[APERTURE]], 16 ; CI-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x1{{$}} ; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[4:5], 0x4{{$}} -; GCN: s_cmp_eq_u32 [[PTR_HI]], [[APERTURE]] +; GFX9: s_cmp_eq_u32 [[PTR_HI]], src_shared_base +; CI: s_cmp_eq_u32 [[PTR_HI]], [[APERTURE]] ; GCN: s_cbranch_vccnz define amdgpu_kernel void @is_local_sgpr(i8* %ptr) { %val = call i1 @llvm.amdgcn.is.shared(i8* %ptr)