diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1814,32 +1814,21 @@
   MachineFunction &MF = B.getMF();
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const LLT S32 = LLT::scalar(32);
+  const LLT S64 = LLT::scalar(64);
 
   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
 
   if (ST.hasApertureRegs()) {
-    // FIXME: Use inline constants (src_{shared, private}_base) instead of
-    // getreg.
-    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
-      AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
-      AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
-    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
-      AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
-      AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
-    unsigned Encoding =
-      AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
-      Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
-      WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
-
-    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-
-    B.buildInstr(AMDGPU::S_GETREG_B32)
-      .addDef(GetReg)
-      .addImm(Encoding);
-    MRI.setType(GetReg, S32);
-
-    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
-    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
+    // Note: this register is somewhat broken. When used as a 32-bit operand,
+    // it only returns zeroes. The real value is in the upper 32 bits.
+    // Thus, we must emit an S_MOV_B64 and extract the high 32 bits.
+    const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
+                                       ? AMDGPU::SRC_SHARED_BASE64
+                                       : AMDGPU::SRC_PRIVATE_BASE64;
+    Register Tmp = MRI.createGenericVirtualRegister(S64);
+    MRI.setRegClass(Tmp, &AMDGPU::SReg_64_XEXECRegClass);
+    B.buildInstr(AMDGPU::S_MOV_B64).addDef(Tmp).addReg(Register(ApertureRegNo));
+    return B.buildUnmerge(S32, Tmp).getReg(1);
   }
 
   // TODO: can we be smarter about machine pointer info?
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -420,9 +420,6 @@
   OFFSET_MASK_ = (((1 << OFFSET_WIDTH_) - 1) << OFFSET_SHIFT_),
 
   OFFSET_MEM_VIOL = 8,
-
-  OFFSET_SRC_SHARED_BASE = 16,
-  OFFSET_SRC_PRIVATE_BASE = 0
 };
 
 enum WidthMinusOne : unsigned { // WidthMinusOne, (5) [15:11]
@@ -430,9 +427,6 @@
   WIDTH_M1_SHIFT_ = 11,
   WIDTH_M1_WIDTH_ = 5,
   WIDTH_M1_MASK_ = (((1 << WIDTH_M1_WIDTH_) - 1) << WIDTH_M1_SHIFT_),
-
-  WIDTH_M1_SRC_SHARED_BASE = 15,
-  WIDTH_M1_SRC_PRIVATE_BASE = 15
 };
 
 // Some values from WidthMinusOne mapped into Width domain.
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5503,24 +5503,25 @@
 
 SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
                                              SelectionDAG &DAG) const {
-  // FIXME: Use inline constants (src_{shared, private}_base) instead.
   if (Subtarget->hasApertureRegs()) {
-    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
-      AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
-      AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
-    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
-      AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
-      AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
-    unsigned Encoding =
-      AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
-      Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
-      WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
-
-    SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16);
-    SDValue ApertureReg = SDValue(
-        DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0);
-    SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32);
-    return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
+    const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
+                                       ? AMDGPU::SRC_SHARED_BASE64
+                                       : AMDGPU::SRC_PRIVATE_BASE64;
+    // Note: this register is broken. When used as a 32-bit operand, it only
+    // returns zeroes instead of the real value, which lives in the upper 32
+    // bits.
+    //
+    // To work around the issue, directly emit a 64-bit mov from this register,
+    // then extract the high bits. Note that this shouldn't even result in a
+    // shift being emitted; it simply becomes a pair of registers, e.g.:
+    //   s_mov_b64 s[6:7], src_shared_base
+    //   v_mov_b32_e32 v1, s7
+    SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
+                                     DAG.getRegister(ApertureRegNo, MVT::i64));
+    return DAG.getNode(
+        ISD::TRUNCATE, DL, MVT::i32,
+        DAG.getNode(ISD::SRL, DL, MVT::i64,
+                    {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
   }
 
   // For code object version 5, private_base and shared_base are passed through
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll
@@ -10,11 +10,10 @@
 ; MESA-LABEL: amdgpu_ps:
 ; MESA:       ; %bb.0:
 ; MESA-NEXT:    s_add_u32 flat_scratch_lo, s2, s4
-; MESA-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16)
+; MESA-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; MESA-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
-; MESA-NEXT:    s_lshl_b32 s0, s0, 16
 ; MESA-NEXT:    v_mov_b32_e32 v0, 4
-; MESA-NEXT:    v_mov_b32_e32 v1, s0
+; MESA-NEXT:    v_mov_b32_e32 v1, s1
 ; MESA-NEXT:    v_mov_b32_e32 v2, 0
 ; MESA-NEXT:    flat_store_dword v[0:1], v2
 ; MESA-NEXT:    s_waitcnt vmcnt(0)
@@ -30,10 +29,9 @@
 ; PAL-NEXT:    s_waitcnt lgkmcnt(0)
 ; PAL-NEXT:    s_and_b32 s3, s3, 0xffff
 ; PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s0
-; PAL-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16)
+; PAL-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
-; PAL-NEXT:    s_lshl_b32 s0, s0, 16
-; PAL-NEXT:    v_mov_b32_e32 v1, s0
+; PAL-NEXT:    v_mov_b32_e32 v1, s1
 ; PAL-NEXT:    flat_store_dword v[0:1], v2
 ; PAL-NEXT:    s_waitcnt vmcnt(0)
 ; PAL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
@@ -85,15 +85,13 @@
 ; GFX9V3-LABEL: addrspacecast:
 ; GFX9V3:       ; %bb.0:
 ; GFX9V3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX9V3-NEXT:    s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES, 0, 16)
-; GFX9V3-NEXT:    s_lshl_b32 s3, s2, 16
-; GFX9V3-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_SH_MEM_BASES, 16, 16)
+; GFX9V3-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX9V3-NEXT:    s_mov_b64 s[4:5], src_shared_base
 ; GFX9V3-NEXT:
v_mov_b32_e32 v2, 1 ; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V3-NEXT: s_mov_b32 s2, s0 ; GFX9V3-NEXT: s_cmp_lg_u32 s0, -1 ; GFX9V3-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX9V3-NEXT: s_lshl_b32 s5, s4, 16 ; GFX9V3-NEXT: s_mov_b32 s4, s1 ; GFX9V3-NEXT: s_cmp_lg_u32 s1, -1 ; GFX9V3-NEXT: v_mov_b32_e32 v0, s2 @@ -111,15 +109,13 @@ ; GFX9V4-LABEL: addrspacecast: ; GFX9V4: ; %bb.0: ; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9V4-NEXT: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX9V4-NEXT: s_lshl_b32 s3, s2, 16 -; GFX9V4-NEXT: s_getreg_b32 s4, hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; GFX9V4-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX9V4-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX9V4-NEXT: v_mov_b32_e32 v2, 1 ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V4-NEXT: s_mov_b32 s2, s0 ; GFX9V4-NEXT: s_cmp_lg_u32 s0, -1 ; GFX9V4-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX9V4-NEXT: s_lshl_b32 s5, s4, 16 ; GFX9V4-NEXT: s_mov_b32 s4, s1 ; GFX9V4-NEXT: s_cmp_lg_u32 s1, -1 ; GFX9V4-NEXT: v_mov_b32_e32 v0, s2 @@ -137,15 +133,13 @@ ; GFX9V5-LABEL: addrspacecast: ; GFX9V5: ; %bb.0: ; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9V5-NEXT: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX9V5-NEXT: s_lshl_b32 s3, s2, 16 -; GFX9V5-NEXT: s_getreg_b32 s4, hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; GFX9V5-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX9V5-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX9V5-NEXT: v_mov_b32_e32 v2, 1 ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V5-NEXT: s_mov_b32 s2, s0 ; GFX9V5-NEXT: s_cmp_lg_u32 s0, -1 ; GFX9V5-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX9V5-NEXT: s_lshl_b32 s5, s4, 16 ; GFX9V5-NEXT: s_mov_b32 s4, s1 ; GFX9V5-NEXT: s_cmp_lg_u32 s1, -1 ; GFX9V5-NEXT: v_mov_b32_e32 v0, s2 @@ -209,10 +203,9 @@ ; GFX9V3-LABEL: llvm_amdgcn_is_shared: ; GFX9V3: ; %bb.0: ; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V3-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V3-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX9V3-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9V3-NEXT: s_cmp_eq_u32 s1, s0 +; GFX9V3-NEXT: s_cmp_eq_u32 s1, s3 ; GFX9V3-NEXT: s_cselect_b32 s0, 1, 0 ; GFX9V3-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V3-NEXT: global_store_dword v[0:1], v0, off @@ -222,10 +215,9 @@ ; GFX9V4-LABEL: llvm_amdgcn_is_shared: ; GFX9V4: ; %bb.0: ; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V4-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V4-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX9V4-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9V4-NEXT: s_cmp_eq_u32 s1, s0 +; GFX9V4-NEXT: s_cmp_eq_u32 s1, s3 ; GFX9V4-NEXT: s_cselect_b32 s0, 1, 0 ; GFX9V4-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V4-NEXT: global_store_dword v[0:1], v0, off @@ -235,10 +227,9 @@ ; GFX9V5-LABEL: llvm_amdgcn_is_shared: ; GFX9V5: ; %bb.0: ; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V5-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V5-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX9V5-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9V5-NEXT: s_cmp_eq_u32 s1, s0 +; GFX9V5-NEXT: s_cmp_eq_u32 s1, s3 ; GFX9V5-NEXT: s_cselect_b32 s0, 1, 0 ; GFX9V5-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V5-NEXT: global_store_dword v[0:1], v0, off @@ -293,10 +284,9 @@ ; GFX9V3-LABEL: llvm_amdgcn_is_private: ; GFX9V3: ; %bb.0: ; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V3-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V3-NEXT: s_getreg_b32 
s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX9V3-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9V3-NEXT: s_cmp_eq_u32 s1, s0 +; GFX9V3-NEXT: s_cmp_eq_u32 s1, s3 ; GFX9V3-NEXT: s_cselect_b32 s0, 1, 0 ; GFX9V3-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V3-NEXT: global_store_dword v[0:1], v0, off @@ -306,10 +296,9 @@ ; GFX9V4-LABEL: llvm_amdgcn_is_private: ; GFX9V4: ; %bb.0: ; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V4-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V4-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX9V4-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9V4-NEXT: s_cmp_eq_u32 s1, s0 +; GFX9V4-NEXT: s_cmp_eq_u32 s1, s3 ; GFX9V4-NEXT: s_cselect_b32 s0, 1, 0 ; GFX9V4-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V4-NEXT: global_store_dword v[0:1], v0, off @@ -319,10 +308,9 @@ ; GFX9V5-LABEL: llvm_amdgcn_is_private: ; GFX9V5: ; %bb.0: ; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V5-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V5-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX9V5-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9V5-NEXT: s_cmp_eq_u32 s1, s0 +; GFX9V5-NEXT: s_cmp_eq_u32 s1, s3 ; GFX9V5-NEXT: s_cselect_b32 s0, 1, 0 ; GFX9V5-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V5-NEXT: global_store_dword v[0:1], v0, off diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir @@ -228,15 +228,14 @@ ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[S_GETREG_B32_:%[0-9]+]]:sreg_32(s32) = S_GETREG_B32 30735 - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[S_GETREG_B32_]], [[C]](s32) + ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec(s64) = S_MOV_B64 $src_private_base64 + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_]](s64) ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p5) - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[SHL]](s32) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(p5) = G_CONSTANT i32 -1 - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(p0) = G_CONSTANT i64 0 - ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY]](p5), [[C1]] - ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C2]] + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[UV1]](s32) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(p5) = G_CONSTANT i32 -1 + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0 + ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY]](p5), [[C]] + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C1]] ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](p0) ; SI-LABEL: name: test_addrspacecast_p5_to_p0 ; SI: liveins: $vgpr0 @@ -323,15 +322,14 @@ ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; GFX9-NEXT: [[S_GETREG_B32_:%[0-9]+]]:sreg_32(s32) = S_GETREG_B32 31759 - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[S_GETREG_B32_]], [[C]](s32) + ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec(s64) = S_MOV_B64 $src_shared_base64 + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_]](s64) ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = 
G_PTRTOINT [[COPY]](p3) - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[SHL]](s32) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1 - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(p0) = G_CONSTANT i64 0 - ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY]](p3), [[C1]] - ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C2]] + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[UV1]](s32) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1 + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0 + ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY]](p3), [[C]] + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C1]] ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](p0) ; SI-LABEL: name: test_addrspacecast_p3_to_p0 ; SI: liveins: $vgpr0 @@ -568,21 +566,20 @@ ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(p3), [[UV1:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[COPY]](<2 x p3>) - ; GFX9-NEXT: [[S_GETREG_B32_:%[0-9]+]]:sreg_32(s32) = S_GETREG_B32 31759 - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[S_GETREG_B32_]], [[C]](s32) + ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec(s64) = S_MOV_B64 $src_shared_base64 + ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_]](s64) ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV]](p3) - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[SHL]](s32) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1 - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(p0) = G_CONSTANT i64 0 - ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV]](p3), [[C1]] - ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C2]] - ; GFX9-NEXT: [[S_GETREG_B32_1:%[0-9]+]]:sreg_32(s32) = S_GETREG_B32 31759 - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[S_GETREG_B32_1]], [[C]](s32) + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[UV3]](s32) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1 + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0 + ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV]](p3), [[C]] + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C1]] + ; GFX9-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64_xexec(s64) = S_MOV_B64 $src_shared_base64 + ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_1]](s64) ; GFX9-NEXT: [[PTRTOINT1:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV1]](p3) - ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT1]](s32), [[SHL1]](s32) - ; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV1]](p3), [[C1]] - ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(p0) = G_SELECT [[ICMP1]](s1), [[MV1]], [[C2]] + ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT1]](s32), [[UV5]](s32) + ; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV1]](p3), [[C]] + ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(p0) = G_SELECT [[ICMP1]](s1), [[MV1]], [[C1]] ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p0>) = G_BUILD_VECTOR [[SELECT]](p0), [[SELECT1]](p0) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p0>) ; SI-LABEL: name: test_addrspacecast_v2p3_to_v2p0 @@ -778,11 +775,10 @@ ; VI-NEXT: $vgpr0_vgpr1 = COPY [[COPY2]](p0) ; GFX9-LABEL: name: test_addrspacecast_p5_fi_to_p0 ; GFX9: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0 - ; GFX9-NEXT: 
[[S_GETREG_B32_:%[0-9]+]]:sreg_32(s32) = S_GETREG_B32 30735 - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[S_GETREG_B32_]], [[C]](s32) + ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec(s64) = S_MOV_B64 $src_private_base64 + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_]](s64) ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[FRAME_INDEX]](p5) - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[SHL]](s32) + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[UV1]](s32) ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY [[MV]](p0) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[COPY]](p0) ; SI-LABEL: name: test_addrspacecast_p5_fi_to_p0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll @@ -32,9 +32,8 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX9-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s0, v1 +; GFX9-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s1, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -47,9 +46,8 @@ ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s0, v1 +; GFX10-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: global_store_dword v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -61,10 +59,9 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s0, s0, 16 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s0, v1 +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -97,10 +94,9 @@ ; GFX9-LABEL: is_private_sgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX9-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9-NEXT: s_cmp_lg_u32 s1, s0 +; GFX9-NEXT: s_cmp_lg_u32 s1, s3 ; GFX9-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX9-NEXT: ; %bb.1: ; %bb0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -112,10 +108,9 @@ ; GFX10-LABEL: is_private_sgpr: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_cmp_lg_u32 s1, s0 +; 
GFX10-NEXT: s_cmp_lg_u32 s1, s3 ; GFX10-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX10-NEXT: ; %bb.1: ; %bb0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -127,11 +122,9 @@ ; GFX11-LABEL: is_private_sgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s0, s0, 16 -; GFX11-NEXT: s_cmp_lg_u32 s1, s0 +; GFX11-NEXT: s_cmp_lg_u32 s1, s3 ; GFX11-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX11-NEXT: ; %bb.1: ; %bb0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll @@ -32,9 +32,8 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX9-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s0, v1 +; GFX9-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s1, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -47,9 +46,8 @@ ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s0, v1 +; GFX10-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: global_store_dword v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -61,10 +59,9 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s0, s0, 16 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s0, v1 +; GFX11-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -97,10 +94,9 @@ ; GFX9-LABEL: is_local_sgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX9-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9-NEXT: s_cmp_lg_u32 s1, s0 +; GFX9-NEXT: s_cmp_lg_u32 s1, s3 ; GFX9-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX9-NEXT: ; %bb.1: ; %bb0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -112,10 +108,9 @@ ; GFX10-LABEL: is_local_sgpr: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_cmp_lg_u32 s1, s0 +; GFX10-NEXT: s_cmp_lg_u32 s1, s3 ; GFX10-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX10-NEXT: ; %bb.1: 
; %bb0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -127,11 +122,9 @@ ; GFX11-LABEL: is_local_sgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s0, s0, 16 -; GFX11-NEXT: s_cmp_lg_u32 s1, s0 +; GFX11-NEXT: s_cmp_lg_u32 s1, s3 ; GFX11-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX11-NEXT: ; %bb.1: ; %bb0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-known-non-null.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-known-non-null.ll --- a/llvm/test/CodeGen/AMDGPU/addrspacecast-known-non-null.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-known-non-null.ll @@ -10,10 +10,8 @@ } ; CHECK-LABEL: {{^}}cast_alloca: -; CHECK: s_getreg_b32 [[GETREG:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; CHECK: s_lshl_b32 [[APERTURE:s[0-9]+]], [[GETREG]], 16 -; CHECK: v_lshrrev_b32_e64 v0, 6, s33 -; CHECK-NEXT: v_mov_b32_e32 v1, [[APERTURE]] +; CHECK: s_mov_b64 s[{{[0-9]+}}:[[HIREG:[0-9]+]]], src_private_base +; CHECK: v_mov_b32_e32 v1, s[[HIREG]] ; CHECK-NOT: v0 ; CHECK-NOT: v1 define void @cast_alloca() { @@ -26,10 +24,9 @@ @lds = internal unnamed_addr addrspace(3) global i8 undef, align 4 ; CHECK-LABEL: {{^}}cast_lds_gv: -; CHECK: s_getreg_b32 [[GETREG:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; CHECK: s_lshl_b32 [[APERTURE:s[0-9]+]], [[GETREG]], 16 +; CHECK: s_mov_b64 s[{{[0-9]+}}:[[HIREG:[0-9]+]]], src_shared_base ; CHECK: v_mov_b32_e32 v0, 0 -; CHECK: v_mov_b32_e32 v1, [[APERTURE]] +; CHECK: v_mov_b32_e32 v1, s[[HIREG]] ; CHECK-NOT: v0 ; CHECK-NOT: v1 define void @cast_lds_gv() { @@ -55,20 +52,18 @@ } ; CHECK-LABEL: {{^}}cast_constant_lds_other_gv: -; CHECK: s_getreg_b32 [[GETREG:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; CHECK: s_lshl_b32 [[APERTURE:s[0-9]+]], [[GETREG]], 16 +; CHECK: s_mov_b64 s[{{[0-9]+}}:[[HIREG:[0-9]+]]], src_shared_base ; CHECK: v_mov_b32_e32 v0, 0x7b -; CHECK: v_mov_b32_e32 v1, [[APERTURE]] +; CHECK: v_mov_b32_e32 v1, s[[HIREG]] define void @cast_constant_lds_other_gv() { call void @flat_user(i8* addrspacecast (i8 addrspace(3)* inttoptr (i32 123 to i8 addrspace(3)*) to i8*)) ret void } ; CHECK-LABEL: {{^}}cast_constant_private_other_gv: -; CHECK: s_getreg_b32 [[GETREG:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; CHECK: s_lshl_b32 [[APERTURE:s[0-9]+]], [[GETREG]], 16 +; CHECK: s_mov_b64 s[{{[0-9]+}}:[[HIREG:[0-9]+]]], src_private_base ; CHECK: v_mov_b32_e32 v0, 0x7b -; CHECK: v_mov_b32_e32 v1, [[APERTURE]] +; CHECK: v_mov_b32_e32 v1, s[[HIREG]] define void @cast_constant_private_other_gv() { call void @flat_user(i8* addrspacecast (i8 addrspace(5)* inttoptr (i32 123 to i8 addrspace(5)*) to i8*)) ret void diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll --- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll @@ -13,15 +13,14 @@ ; CI-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[APERTURE]], 0 ; CI-DAG: s_cselect_b32 s[[LO:[0-9]+]], [[PTR]], 0 +; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HIBASE:[0-9]+]]], src_shared_base + ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 ; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}} -; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16 -; GFX9-XXX: v_mov_b32_e32 
[[VAPERTURE:v[0-9]+]], src_shared_base ; GFX9: s_cmp_lg_u32 [[PTR]], -1 -; GFX9-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[SSRC_SHARED_BASE]], 0 -; GFX9-DAG: s_cselect_b32 s[[LO:[0-9]+]], [[PTR]], 0 +; GFX9-DAG: s_cselect_b32 s[[LO:[0-9]+]], s[[HIBASE]], 0 +; GFX9-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[PTR]], 0 ; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]] @@ -43,15 +42,14 @@ ; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc ; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, v0 +; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HIBASE:[0-9]+]]], src_shared_base + ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 -; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16 -; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_SHARED_BASE]] -; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base +; GFX9-DAG: v_mov_b32_e32 v[[VREG_HIBASE:[0-9]+]], s[[HIBASE]] ; GFX9-DAG: v_cmp_ne_u32_e32 vcc, -1, v0 ; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, v0, vcc -; GFX9-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc +; GFX9-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, v[[VREG_HIBASE]], vcc ; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]] define void @use_group_to_flat_addrspacecast_func(i32 addrspace(3)* %ptr) #0 { @@ -75,15 +73,12 @@ ; CI-DAG: s_cselect_b32 s[[LO:[0-9]+]], [[PTR]], 0 ; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}} -; GFX9-DAG: s_getreg_b32 [[SSRC_PRIVATE:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX9-DAG: s_lshl_b32 [[SSRC_PRIVATE_BASE:s[0-9]+]], [[SSRC_PRIVATE]], 16 - -; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_private_base +; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HIBASE:[0-9]+]]], src_private_base ; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 ; GFX9: s_cmp_lg_u32 [[PTR]], -1 -; GFX9: s_cselect_b32 s[[HI:[0-9]+]], [[SSRC_PRIVATE_BASE]], 0 -; GFX9: s_cselect_b32 s[[LO:[0-9]+]], [[PTR]], 0 +; GFX9: s_cselect_b32 s[[LO:[0-9]+]], s[[HIBASE]], 0 +; GFX9: s_cselect_b32 s[[HI:[0-9]+]], [[PTR]], 0 ; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]] @@ -213,11 +208,8 @@ ; HSA-LABEL: {{^}}cast_0_group_to_flat_addrspacecast: ; CI: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10 ; CI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]] -; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16 -; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SSRC_SHARED_BASE]] -; GFX9-XXX: v_mov_b32_e32 v[[HI:[0-9]+]], src_shared_base +; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HI:[0-9]+]]], src_shared_base ; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}} @@ -263,11 +255,8 @@ ; HSA-LABEL: {{^}}cast_0_private_to_flat_addrspacecast: ; CI: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11 ; CI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]] -; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16 -; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SSRC_SHARED_BASE]] -; GFX9-XXX: v_mov_b32_e32 v[[HI:[0-9]+]], src_shared_base +; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HI:[0-9]+]]], src_private_base ; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll +++ 
b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll @@ -134,17 +134,15 @@ ; GFX90A-LABEL: syncscope_workgroup_rtn: ; GFX90A: ; %bb.0: ; %atomicrmw.check.shared ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_getreg_b32 s4, hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX90A-NEXT: s_lshl_b32 s4, s4, 16 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ; implicit-def: $vgpr3 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB1_6 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private -; GFX90A-NEXT: s_getreg_b32 s6, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX90A-NEXT: s_lshl_b32 s6, s6, 16 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s6, v1 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 ; GFX90A-NEXT: ; implicit-def: $vgpr3 ; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] @@ -206,9 +204,8 @@ ; GFX908-LABEL: syncscope_workgroup_nortn: ; GFX908: ; %bb.0: ; %atomicrmw.check.shared ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_getreg_b32 s4, hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX908-NEXT: s_lshl_b32 s4, s4, 16 -; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB2_3 @@ -220,9 +217,8 @@ ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; GFX908-NEXT: .LBB2_3: ; %atomicrmw.check.private -; GFX908-NEXT: s_getreg_b32 s6, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX908-NEXT: s_lshl_b32 s6, s6, 16 -; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s6, v1 +; GFX908-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; GFX908-NEXT: s_cbranch_execz .LBB2_5 @@ -260,9 +256,8 @@ ; GFX90A-LABEL: syncscope_workgroup_nortn: ; GFX90A: ; %bb.0: ; %atomicrmw.check.shared ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_getreg_b32 s4, hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX90A-NEXT: s_lshl_b32 s4, s4, 16 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_3 @@ -274,9 +269,8 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; GFX90A-NEXT: .LBB2_3: ; %atomicrmw.check.private -; GFX90A-NEXT: s_getreg_b32 s6, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX90A-NEXT: s_lshl_b32 s6, s6, 16 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s6, v1 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execz .LBB2_5 diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll @@ -39,10 +39,13 @@ ; GCN-LABEL: {{^}}use_queue_ptr_addrspacecast: ; CIVI: s_load_dword 
[[APERTURE_LOAD:s[0-9]+]], s[6:7], 0x10 -; GFX9: s_getreg_b32 [[APERTURE_LOAD:s[0-9]+]] ; CIVI: v_mov_b32_e32 v[[LO:[0-9]+]], 16 -; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE_LOAD]] -; GFX9: {{flat|global}}_store_dword v{{\[[0-9]+}}:[[HI]]] +; CIVI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE_LOAD]] + +; GFX9: s_mov_b64 s[{{[0-9]+}}:[[HI:[0-9]+]]], src_shared_base +; GFX9-DAG: v_mov_b32_e32 v[[VGPR_HI:[0-9]+]], s[[HI]] +; GFX9: {{flat|global}}_store_dword v{{\[[0-9]+}}:[[VGPR_HI]]] + ; CIVI: {{flat|global}}_store_dword v[[[LO]]:[[HI]]] define hidden void @use_queue_ptr_addrspacecast() #1 { %asc = addrspacecast i32 addrspace(3)* inttoptr (i32 16 to i32 addrspace(3)*) to i32* diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll @@ -11,22 +11,20 @@ ; FLAT_SCR_OPT-NEXT: s_addc_u32 s1, s1, 0 ; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; FLAT_SCR_OPT-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) +; FLAT_SCR_OPT-NEXT: s_mov_b64 s[0:1], src_private_base ; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v0, 4 -; FLAT_SCR_OPT-NEXT: s_lshl_b32 s0, s0, 16 +; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v1, s1 ; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v2, 0 -; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v1, s0 ; FLAT_SCR_OPT-NEXT: flat_store_dword v[0:1], v2 ; FLAT_SCR_OPT-NEXT: s_waitcnt_vscnt null, 0x0 ; FLAT_SCR_OPT-NEXT: s_endpgm ; ; FLAT_SCR_ARCH-LABEL: stack_object_addrspacecast_in_kernel_no_calls: ; FLAT_SCR_ARCH: ; %bb.0: -; FLAT_SCR_ARCH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) +; FLAT_SCR_ARCH-NEXT: s_mov_b64 s[0:1], src_private_base ; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v0, 4 -; FLAT_SCR_ARCH-NEXT: s_lshl_b32 s0, s0, 16 +; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v1, s1 ; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v2, 0 -; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v1, s0 ; FLAT_SCR_ARCH-NEXT: flat_store_dword v[0:1], v2 ; FLAT_SCR_ARCH-NEXT: s_waitcnt_vscnt null, 0x0 ; FLAT_SCR_ARCH-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll --- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll @@ -80,19 +80,17 @@ ; GFX9V3-LABEL: addrspacecast: ; GFX9V3: ; %bb.0: ; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9V3-NEXT: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX9V3-NEXT: s_lshl_b32 s2, s2, 16 +; GFX9V3-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX9V3-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX9V3-NEXT: v_mov_b32_e32 v4, 1 ; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V3-NEXT: s_cmp_lg_u32 s0, -1 +; GFX9V3-NEXT: s_cselect_b32 s2, s3, 0 ; GFX9V3-NEXT: s_cselect_b32 s0, s0, 0 -; GFX9V3-NEXT: v_mov_b32_e32 v0, s0 -; GFX9V3-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX9V3-NEXT: s_cselect_b32 s2, s2, 0 -; GFX9V3-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9V3-NEXT: s_cmp_lg_u32 s1, -1 +; GFX9V3-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V3-NEXT: v_mov_b32_e32 v1, s2 -; GFX9V3-NEXT: s_cselect_b32 s0, s0, 0 +; GFX9V3-NEXT: s_cselect_b32 s0, s5, 0 ; GFX9V3-NEXT: s_cselect_b32 s1, s1, 0 ; GFX9V3-NEXT: v_mov_b32_e32 v2, s1 ; GFX9V3-NEXT: v_mov_b32_e32 v3, s0 @@ -106,19 +104,17 @@ ; GFX9V4-LABEL: addrspacecast: ; GFX9V4: ; %bb.0: ; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9V4-NEXT: s_getreg_b32 s2, 
hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX9V4-NEXT: s_lshl_b32 s2, s2, 16 +; GFX9V4-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX9V4-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX9V4-NEXT: v_mov_b32_e32 v4, 1 ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V4-NEXT: s_cmp_lg_u32 s0, -1 +; GFX9V4-NEXT: s_cselect_b32 s2, s3, 0 ; GFX9V4-NEXT: s_cselect_b32 s0, s0, 0 -; GFX9V4-NEXT: v_mov_b32_e32 v0, s0 -; GFX9V4-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX9V4-NEXT: s_cselect_b32 s2, s2, 0 -; GFX9V4-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9V4-NEXT: s_cmp_lg_u32 s1, -1 +; GFX9V4-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V4-NEXT: v_mov_b32_e32 v1, s2 -; GFX9V4-NEXT: s_cselect_b32 s0, s0, 0 +; GFX9V4-NEXT: s_cselect_b32 s0, s5, 0 ; GFX9V4-NEXT: s_cselect_b32 s1, s1, 0 ; GFX9V4-NEXT: v_mov_b32_e32 v2, s1 ; GFX9V4-NEXT: v_mov_b32_e32 v3, s0 @@ -132,19 +128,17 @@ ; GFX9V5-LABEL: addrspacecast: ; GFX9V5: ; %bb.0: ; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9V5-NEXT: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX9V5-NEXT: s_lshl_b32 s2, s2, 16 +; GFX9V5-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX9V5-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX9V5-NEXT: v_mov_b32_e32 v4, 1 ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V5-NEXT: s_cmp_lg_u32 s0, -1 +; GFX9V5-NEXT: s_cselect_b32 s2, s3, 0 ; GFX9V5-NEXT: s_cselect_b32 s0, s0, 0 -; GFX9V5-NEXT: v_mov_b32_e32 v0, s0 -; GFX9V5-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX9V5-NEXT: s_cselect_b32 s2, s2, 0 -; GFX9V5-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9V5-NEXT: s_cmp_lg_u32 s1, -1 +; GFX9V5-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V5-NEXT: v_mov_b32_e32 v1, s2 -; GFX9V5-NEXT: s_cselect_b32 s0, s0, 0 +; GFX9V5-NEXT: s_cselect_b32 s0, s5, 0 ; GFX9V5-NEXT: s_cselect_b32 s1, s1, 0 ; GFX9V5-NEXT: v_mov_b32_e32 v2, s1 ; GFX9V5-NEXT: v_mov_b32_e32 v3, s0 @@ -200,11 +194,10 @@ ; ; GFX9V3-LABEL: llvm_amdgcn_is_shared: ; GFX9V3: ; %bb.0: -; GFX9V3-NEXT: s_load_dword s0, s[4:5], 0x4 -; GFX9V3-NEXT: s_getreg_b32 s1, hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX9V3-NEXT: s_lshl_b32 s1, s1, 16 +; GFX9V3-NEXT: s_load_dword s2, s[4:5], 0x4 +; GFX9V3-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V3-NEXT: s_cmp_eq_u32 s0, s1 +; GFX9V3-NEXT: s_cmp_eq_u32 s2, s1 ; GFX9V3-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9V3-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX9V3-NEXT: global_store_dword v[0:1], v0, off @@ -213,11 +206,10 @@ ; ; GFX9V4-LABEL: llvm_amdgcn_is_shared: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dword s0, s[4:5], 0x4 -; GFX9V4-NEXT: s_getreg_b32 s1, hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX9V4-NEXT: s_lshl_b32 s1, s1, 16 +; GFX9V4-NEXT: s_load_dword s2, s[4:5], 0x4 +; GFX9V4-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V4-NEXT: s_cmp_eq_u32 s0, s1 +; GFX9V4-NEXT: s_cmp_eq_u32 s2, s1 ; GFX9V4-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX9V4-NEXT: global_store_dword v[0:1], v0, off @@ -226,11 +218,10 @@ ; ; GFX9V5-LABEL: llvm_amdgcn_is_shared: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dword s0, s[4:5], 0x4 -; GFX9V5-NEXT: s_getreg_b32 s1, hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX9V5-NEXT: s_lshl_b32 s1, s1, 16 +; GFX9V5-NEXT: s_load_dword s2, s[4:5], 0x4 +; GFX9V5-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V5-NEXT: s_cmp_eq_u32 s0, s1 +; GFX9V5-NEXT: s_cmp_eq_u32 s2, s1 ; GFX9V5-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX9V5-NEXT: 
global_store_dword v[0:1], v0, off @@ -281,11 +272,10 @@ ; ; GFX9V3-LABEL: llvm_amdgcn_is_private: ; GFX9V3: ; %bb.0: -; GFX9V3-NEXT: s_load_dword s0, s[4:5], 0x4 -; GFX9V3-NEXT: s_getreg_b32 s1, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX9V3-NEXT: s_lshl_b32 s1, s1, 16 +; GFX9V3-NEXT: s_load_dword s2, s[4:5], 0x4 +; GFX9V3-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V3-NEXT: s_cmp_eq_u32 s0, s1 +; GFX9V3-NEXT: s_cmp_eq_u32 s2, s1 ; GFX9V3-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9V3-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX9V3-NEXT: global_store_dword v[0:1], v0, off @@ -294,11 +284,10 @@ ; ; GFX9V4-LABEL: llvm_amdgcn_is_private: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dword s0, s[4:5], 0x4 -; GFX9V4-NEXT: s_getreg_b32 s1, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX9V4-NEXT: s_lshl_b32 s1, s1, 16 +; GFX9V4-NEXT: s_load_dword s2, s[4:5], 0x4 +; GFX9V4-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V4-NEXT: s_cmp_eq_u32 s0, s1 +; GFX9V4-NEXT: s_cmp_eq_u32 s2, s1 ; GFX9V4-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX9V4-NEXT: global_store_dword v[0:1], v0, off @@ -307,11 +296,10 @@ ; ; GFX9V5-LABEL: llvm_amdgcn_is_private: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dword s0, s[4:5], 0x4 -; GFX9V5-NEXT: s_getreg_b32 s1, hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX9V5-NEXT: s_lshl_b32 s1, s1, 16 +; GFX9V5-NEXT: s_load_dword s2, s[4:5], 0x4 +; GFX9V5-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V5-NEXT: s_cmp_eq_u32 s0, s1 +; GFX9V5-NEXT: s_cmp_eq_u32 s2, s1 ; GFX9V5-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX9V5-NEXT: global_store_dword v[0:1], v0, off diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll @@ -4,9 +4,11 @@ ; GCN-LABEL: {{^}}is_private_vgpr: ; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[[0-9]+}}:[[PTR_HI:[0-9]+]]] ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11 -; GFX9-DAG: s_getreg_b32 [[APERTURE:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 0, 16) -; GFX9: s_lshl_b32 [[APERTURE]], [[APERTURE]], 16 -; GCN: v_cmp_eq_u32_e32 vcc, [[APERTURE]], v[[PTR_HI]] +; CI: v_cmp_eq_u32_e32 vcc, [[APERTURE]], v[[PTR_HI]] + +; GFX9: s_mov_b64 s[{{[0-9]+}}:[[HI:[0-9]+]]], src_private_base +; GFX9: v_cmp_eq_u32_e32 vcc, s[[HI]], v[[PTR_HI]] + ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc define amdgpu_kernel void @is_private_vgpr(i8* addrspace(1)* %ptr.ptr) { %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -23,13 +25,15 @@ ; GCN-LABEL: {{^}}is_private_sgpr: ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}} -; GFX9-DAG: s_getreg_b32 [[APERTURE:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 0, 16) ; CI-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x1{{$}} ; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[4:5], 0x4{{$}} -; GFX9: s_lshl_b32 [[APERTURE]], [[APERTURE]], 16 -; GCN: s_cmp_eq_u32 [[PTR_HI]], [[APERTURE]] +; CI: s_cmp_eq_u32 [[PTR_HI]], [[APERTURE]] + +; GFX9: s_mov_b64 s[{{[0-9]+}}:[[HI:[0-9]+]]], src_private_base +; GFX9: s_cmp_eq_u32 [[PTR_HI]], s[[HI]] + ; GCN: s_cbranch_vccnz define amdgpu_kernel void @is_private_sgpr(i8* %ptr) { %val = call i1 @llvm.amdgcn.is.private(i8* %ptr) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll --- 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll @@ -4,10 +4,11 @@ ; GCN-LABEL: {{^}}is_local_vgpr: ; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[[0-9]+}}:[[PTR_HI:[0-9]+]]] ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10 -; GFX9-DAG: s_getreg_b32 [[APERTURE:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX9: s_lshl_b32 [[APERTURE]], [[APERTURE]], 16 -; GCN: v_cmp_eq_u32_e32 vcc, [[APERTURE]], v[[PTR_HI]] +; GFX9: s_mov_b64 s[{{[0-9]+}}:[[HI:[0-9]+]]], src_shared_base +; GFX9: v_cmp_eq_u32_e32 vcc, s[[HI]], v[[PTR_HI]] + +; CI: v_cmp_eq_u32_e32 vcc, [[APERTURE]], v[[PTR_HI]] ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc define amdgpu_kernel void @is_local_vgpr(i8* addrspace(1)* %ptr.ptr) { %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -24,13 +25,14 @@ ; GCN-LABEL: {{^}}is_local_sgpr: ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}} -; GFX9-DAG: s_getreg_b32 [[APERTURE:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16) -; GFX9-DAG: s_lshl_b32 [[APERTURE]], [[APERTURE]], 16 ; CI-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x1{{$}} ; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[4:5], 0x4{{$}} -; GCN: s_cmp_eq_u32 [[PTR_HI]], [[APERTURE]] +; GFX9: s_mov_b64 s[{{[0-9]+}}:[[HI:[0-9]+]]], src_shared_base +; GFX9: s_cmp_eq_u32 [[PTR_HI]], s[[HI]] + +; CI: s_cmp_eq_u32 [[PTR_HI]], [[APERTURE]] ; GCN: s_cbranch_vccnz define amdgpu_kernel void @is_local_sgpr(i8* %ptr) { %val = call i1 @llvm.amdgcn.is.shared(i8* %ptr)
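
[Not part of the patch] For local verification, a minimal standalone IR reproducer can be used; the file name, function name, and llc invocation below are illustrative assumptions rather than anything taken from the patch. The expected instruction sequence matches the updated CHECK lines above: on subtargets with aperture registers, the aperture high half now comes from an s_mov_b64 of src_shared_base/src_private_base instead of s_getreg_b32 + s_lshl_b32.

; aperture-example.ll (hypothetical reproducer)
; Assumed invocation: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - aperture-example.ll
declare i1 @llvm.amdgcn.is.shared(i8* nocapture)

define amdgpu_kernel void @sample_is_shared(i8* %ptr, i32 addrspace(1)* %out) {
  ; With this patch the pointer check is expected to select
  ;   s_mov_b64 s[N:N+1], src_shared_base
  ; followed by an s_cmp_eq_u32 against the pointer's high half.
  %is.shared = call i1 @llvm.amdgcn.is.shared(i8* %ptr)
  %zext = zext i1 %is.shared to i32
  store i32 %zext, i32 addrspace(1)* %out
  ret void
}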