Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -399,10 +399,6 @@ case AMDGPU::EXEC_HI: case AMDGPU::SCC: case AMDGPU::M0: - case AMDGPU::SRC_SHARED_BASE: - case AMDGPU::SRC_SHARED_LIMIT: - case AMDGPU::SRC_PRIVATE_BASE: - case AMDGPU::SRC_PRIVATE_LIMIT: continue; case AMDGPU::VCC: Index: lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp =================================================================== --- lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -540,10 +540,6 @@ case 124: return createRegOperand(M0); case 126: return createRegOperand(EXEC_LO); case 127: return createRegOperand(EXEC_HI); - case 235: return createRegOperand(SRC_SHARED_BASE); - case 236: return createRegOperand(SRC_SHARED_LIMIT); - case 237: return createRegOperand(SRC_PRIVATE_BASE); - case 238: return createRegOperand(SRC_PRIVATE_LIMIT); // TODO: SRC_POPS_EXITING_WAVE_ID // ToDo: no support for vccz register case 251: break; Index: lib/Target/AMDGPU/SIDefines.h =================================================================== --- lib/Target/AMDGPU/SIDefines.h +++ lib/Target/AMDGPU/SIDefines.h @@ -248,6 +248,7 @@ ID_LDS_ALLOC = 6, ID_IB_STS = 7, ID_SYMBOLIC_LAST_ = 8, + ID_MEM_BASES = 15, ID_SHIFT_ = 0, ID_WIDTH_ = 6, ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_) @@ -257,14 +258,20 @@ OFFSET_DEFAULT_ = 0, OFFSET_SHIFT_ = 6, OFFSET_WIDTH_ = 5, - OFFSET_MASK_ = (((1 << OFFSET_WIDTH_) - 1) << OFFSET_SHIFT_) + OFFSET_MASK_ = (((1 << OFFSET_WIDTH_) - 1) << OFFSET_SHIFT_), + + OFFSET_SRC_SHARED_BASE = 16, + OFFSET_SRC_PRIVATE_BASE = 0 }; enum WidthMinusOne { // WidthMinusOne, (5) [15:11] WIDTH_M1_DEFAULT_ = 31, WIDTH_M1_SHIFT_ = 11, WIDTH_M1_WIDTH_ = 5, - WIDTH_M1_MASK_ = (((1 << WIDTH_M1_WIDTH_) - 1) << WIDTH_M1_SHIFT_) + WIDTH_M1_MASK_ = (((1 << WIDTH_M1_WIDTH_) - 1) << WIDTH_M1_SHIFT_), + + WIDTH_M1_SRC_SHARED_BASE = 15, + WIDTH_M1_SRC_PRIVATE_BASE = 15 }; } // namespace Hwreg Index: lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- lib/Target/AMDGPU/SIISelLowering.h +++ lib/Target/AMDGPU/SIISelLowering.h @@ -58,7 +58,9 @@ /// \brief Custom lowering for ISD::FP_ROUND for MVT::f16. SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; - SDValue getSegmentAperture(unsigned AS, SelectionDAG &DAG) const; + SDValue getSegmentAperture(unsigned AS, const SDLoc &DL, + SelectionDAG &DAG) const; + SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const; SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -2351,16 +2351,28 @@ return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);; } -SDValue SITargetLowering::getSegmentAperture(unsigned AS, +SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL, SelectionDAG &DAG) const { - - if (Subtarget->hasApertureRegs()) { // Read from Aperture Registers directly. - unsigned RegNo = (AS == AMDGPUASI.LOCAL_ADDRESS) ? AMDGPU::SRC_SHARED_BASE : - AMDGPU::SRC_PRIVATE_BASE; - return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, RegNo, MVT::i32); + // FIXME: Use inline constants (src_{shared, private}_base) instead. + if (Subtarget->hasApertureRegs()) { + unsigned Offset = AS == AMDGPUASI.LOCAL_ADDRESS ? + AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : + AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; + unsigned WidthM1 = AS == AMDGPUASI.LOCAL_ADDRESS ? + AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : + AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; + unsigned Encoding = + AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | + Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | + WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; + + SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16); + SDValue ApertureReg = SDValue( + DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0); + SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32); + return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount); } - SDLoc SL; MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *Info = MF.getInfo(); unsigned UserSGPR = Info->getQueuePtrUserSGPR(); @@ -2373,8 +2385,8 @@ // private_segment_aperture_base_hi. uint32_t StructOffset = (AS == AMDGPUASI.LOCAL_ADDRESS) ? 0x40 : 0x44; - SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, QueuePtr, - DAG.getConstant(StructOffset, SL, MVT::i64)); + SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, QueuePtr, + DAG.getConstant(StructOffset, DL, MVT::i64)); // TODO: Use custom target PseudoSourceValue. // TODO: We should use the value from the IR intrinsic call, but it might not @@ -2383,7 +2395,7 @@ AMDGPUASI.CONSTANT_ADDRESS)); MachinePointerInfo PtrInfo(V, StructOffset); - return DAG.getLoad(MVT::i32, SL, QueuePtr.getValue(1), Ptr, PtrInfo, + return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo, MinAlign(64, StructOffset), MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); @@ -2428,7 +2440,7 @@ SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE); - SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), DAG); + SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG); SDValue CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture); Index: lib/Target/AMDGPU/SIRegisterInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.cpp +++ lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -146,12 +146,6 @@ reserveRegisterTuples(Reserved, AMDGPU::EXEC); reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR); - // Reserve the memory aperture registers. - reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE); - reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT); - reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE); - reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT); - // Reserve Trap Handler registers - support is not implemented in Codegen. reserveRegisterTuples(Reserved, AMDGPU::TBA); reserveRegisterTuples(Reserved, AMDGPU::TMA); Index: lib/Target/AMDGPU/SIRegisterInfo.td =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.td +++ lib/Target/AMDGPU/SIRegisterInfo.td @@ -44,11 +44,6 @@ def SCC : SIReg<"scc", 253>; def M0 : SIReg <"m0", 124>; -def SRC_SHARED_BASE : SIReg<"src_shared_base", 235>; -def SRC_SHARED_LIMIT : SIReg<"src_shared_limit", 236>; -def SRC_PRIVATE_BASE : SIReg<"src_private_base", 237>; -def SRC_PRIVATE_LIMIT : SIReg<"src_private_limit", 238>; - // Trap handler registers def TBA_LO : SIReg<"tba_lo", 108>; def TBA_HI : SIReg<"tba_hi", 109>; @@ -266,8 +261,7 @@ // See comments in SIInstructions.td for more info. def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, - TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT, - SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT)> { + TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI)> { let AllocationPriority = 7; } Index: test/CodeGen/AMDGPU/addrspacecast.ll =================================================================== --- test/CodeGen/AMDGPU/addrspacecast.ll +++ test/CodeGen/AMDGPU/addrspacecast.ll @@ -12,7 +12,9 @@ ; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]] ; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}} -; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base +; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(15, 16, 16) +; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16 +; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_SHARED_BASE]] ; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] @@ -45,7 +47,9 @@ ; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]] ; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}} -; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_private_base +; GFX9-DAG: s_getreg_b32 [[SSRC_PRIVATE:s[0-9]+]], hwreg(15, 0, 16) +; GFX9-DAG: s_lshl_b32 [[SSRC_PRIVATE_BASE:s[0-9]+]], [[SSRC_PRIVATE]], 16 +; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_PRIVATE_BASE]] ; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] @@ -153,7 +157,9 @@ ; HSA-LABEL: {{^}}cast_0_group_to_flat_addrspacecast: ; CI: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10 ; CI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]] -; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], src_shared_base +; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(15, 16, 16) +; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16 +; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SSRC_SHARED_BASE]] ; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}} Index: test/MC/Disassembler/AMDGPU/aperture-regs.ll =================================================================== --- test/MC/Disassembler/AMDGPU/aperture-regs.ll +++ /dev/null @@ -1,13 +0,0 @@ -# RUN: llvm-mc -arch=amdgcn -mcpu=gfx900 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX9 %s - -# GFX9: v_mov_b32_e32 v1, src_shared_base ; encoding: [0xeb,0x02,0x02,0x7e] -0xeb 0x02 0x02 0x7e - -# GFX9: v_mov_b32_e32 v1, src_shared_limit ; encoding: [0xec,0x02,0x02,0x7e] -0xec 0x02 0x02 0x7e - -# GFX9: v_mov_b32_e32 v1, src_private_base ; encoding: [0xed,0x02,0x02,0x7e] -0xed 0x02 0x02 0x7e - -# GFX9: v_mov_b32_e32 v1, src_private_limit ; encoding: [0xee,0x02,0x02,0x7e] -0xee 0x02 0x02 0x7e