Index: lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -24,6 +24,8 @@
 class AMDGPUAnnotateKernelFeatures : public ModulePass {
 private:
+  static bool hasAddrSpaceCast(const Function &F);
+
   void addAttrToCallers(Function *Intrin, StringRef AttrName);
   bool addAttrsForIntrinsics(Module &M, ArrayRef<StringRef[2]>);
 
@@ -48,12 +50,29 @@
 char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;
 
+INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
+                "Add AMDGPU function attributes", false, false)
+
+static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
+  unsigned SrcAS = ASC->getSrcAddressSpace();
 
-INITIALIZE_PASS_BEGIN(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
-                      "Add AMDGPU function attributes", false, false)
-INITIALIZE_PASS_END(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
-                    "Add AMDGPU function attributes", false, false)
+  // The queue ptr is only needed when casting to flat, not from it.
+  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
+}
 
+// Return true if an addrspacecast is used that requires the queue ptr.
+bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F) {
+  for (const BasicBlock &BB : F) {
+    for (const Instruction &I : BB) {
+      if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
+        if (castRequiresQueuePtr(ASC))
+          return true;
+      }
+    }
+  }
+
+  return false;
+}
 
 void AMDGPUAnnotateKernelFeatures::addAttrToCallers(Function *Intrin,
                                                     StringRef AttrName) {
@@ -117,9 +136,18 @@
   // always initialized.
 
   bool Changed = addAttrsForIntrinsics(M, IntrinsicToAttr);
-  if (TT.getOS() == Triple::AMDHSA)
+  if (TT.getOS() == Triple::AMDHSA) {
     Changed |= addAttrsForIntrinsics(M, HSAIntrinsicToAttr);
 
+    for (Function &F : M) {
+      if (F.hasFnAttribute("amdgpu-queue-ptr"))
+        continue;
+
+      if (hasAddrSpaceCast(F))
+        F.addFnAttr("amdgpu-queue-ptr");
+    }
+  }
+
   return Changed;
 }
Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -147,7 +147,6 @@
   bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const;
   bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
   bool SelectSMRDBufferSgpr(SDValue Addr, SDValue &Offset) const;
-  SDNode *SelectAddrSpaceCast(SDNode *N);
   bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
   bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
   bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
@@ -526,8 +525,6 @@
     Lowering.legalizeTargetIndependentNode(N, *CurDAG);
     break;
   }
-  case ISD::ADDRSPACECAST:
-    return SelectAddrSpaceCast(N);
   case ISD::AND:
   case ISD::SRL:
   case ISD::SRA:
@@ -1332,69 +1329,6 @@
          !isa<ConstantSDNode>(Offset);
 }
 
-// FIXME: This is incorrect and only enough to be able to compile.
-SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
-  AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(N);
-  SDLoc DL(N);
-
-  const MachineFunction &MF = CurDAG->getMachineFunction();
-  DiagnosticInfoUnsupported NotImplemented(
-      *MF.getFunction(), "addrspacecast not implemented", DL.getDebugLoc());
-  CurDAG->getContext()->diagnose(NotImplemented);
-
-  assert(Subtarget->hasFlatAddressSpace() &&
-         "addrspacecast only supported with flat address space!");
-
-  assert((ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
-          ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) &&
-         "Can only cast to / from flat address space!");
-
-  // The flat instructions read the address as the index of the VGPR holding the
-  // address, so casting should just be reinterpreting the base VGPR, so just
-  // insert trunc / bitcast / zext.
-
-  SDValue Src = ASC->getOperand(0);
-  EVT DestVT = ASC->getValueType(0);
-  EVT SrcVT = Src.getValueType();
-
-  unsigned SrcSize = SrcVT.getSizeInBits();
-  unsigned DestSize = DestVT.getSizeInBits();
-
-  if (SrcSize > DestSize) {
-    assert(SrcSize == 64 && DestSize == 32);
-    return CurDAG->getMachineNode(
-      TargetOpcode::EXTRACT_SUBREG,
-      DL,
-      DestVT,
-      Src,
-      CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32));
-  }
-
-  if (DestSize > SrcSize) {
-    assert(SrcSize == 32 && DestSize == 64);
-
-    // FIXME: This is probably wrong, we should never be defining
-    // a register class with both VGPRs and SGPRs
-    SDValue RC = CurDAG->getTargetConstant(AMDGPU::VS_64RegClassID, DL,
-                                           MVT::i32);
-
-    const SDValue Ops[] = {
-      RC,
-      Src,
-      CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
-      SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
-                                     CurDAG->getConstant(0, DL, MVT::i32)), 0),
-      CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
-    };
-
-    return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
-                                  DL, N->getValueType(0), Ops);
-  }
-
-  assert(SrcSize == 64 && DestSize == 64);
-  return CurDAG->getNode(ISD::BITCAST, DL, DestVT, Src).getNode();
-}
-
 SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, SDLoc DL, SDValue Val,
                                      uint32_t Offset, uint32_t Width) {
   // Transformation function, pack the offset and width of a BFE into
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -45,6 +45,9 @@
   SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
 
+  SDValue getSegmentAperture(unsigned AS, SelectionDAG &DAG) const;
+  SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;
+
   void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
 
   SDValue performUCharToFloatCombine(SDNode *N,
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -278,6 +278,11 @@
   setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
   setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);
 
+  if (Subtarget->hasFlatAddressSpace()) {
+    setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
+    setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
+  }
+
   setTargetDAGCombine(ISD::FADD);
   setTargetDAGCombine(ISD::FSUB);
   setTargetDAGCombine(ISD::FMINNUM);
@@ -1232,6 +1237,7 @@
   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
   case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
   case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
+  case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
   }
   return SDValue();
 }
@@ -1390,6 +1396,84 @@
   return Chain;
 }
 
+SDValue SITargetLowering::getSegmentAperture(unsigned AS,
+                                             SelectionDAG &DAG) const {
+  SDLoc SL;
+  MachineFunction &MF = DAG.getMachineFunction();
+  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+  SDValue QueuePtr = CreateLiveInRegister(
+    DAG, &AMDGPU::SReg_64RegClass, Info->getQueuePtrUserSGPR(), MVT::i64);
+
+  // Offset into amd_queue_t for group_segment_aperture_base_hi /
+  // private_segment_aperture_base_hi.
+  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
+
+  SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, QueuePtr,
+                            DAG.getConstant(StructOffset, SL, MVT::i64));
+
+  // TODO: Use custom target PseudoSourceValue.
+  // TODO: We should use the value from the IR intrinsic call, but it might not
+  // be available and how do we get it?
+  Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
+                                              AMDGPUAS::CONSTANT_ADDRESS));
+
+  MachinePointerInfo PtrInfo(V, StructOffset);
+  return DAG.getLoad(MVT::i32, SL, QueuePtr.getValue(1), Ptr,
+                     PtrInfo, false,
+                     false, true,
+                     MinAlign(64, StructOffset));
+}
+
+SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
+
+  SDValue Src = ASC->getOperand(0);
+
+  // FIXME: Really support non-0 null pointers.
+  SDValue SegmentNullPtr = DAG.getConstant(-1, SL, MVT::i32);
+  SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
+
+  // flat -> local/private
+  if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
+    if (ASC->getDestAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+        ASC->getDestAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
+      SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
+      SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
+
+      return DAG.getNode(ISD::SELECT, SL, MVT::i32,
+                         NonNull, Ptr, SegmentNullPtr);
+    }
+  }
+
+  // local/private -> flat
+  if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
+    if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+        ASC->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
+      SDValue NonNull
+        = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
+
+      SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), DAG);
+      SDValue CvtPtr
+        = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
+
+      return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
+                         DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
+                         FlatNullPtr);
+    }
+  }
+
+  // global <-> flat are no-ops and never emitted.
+
+  const MachineFunction &MF = DAG.getMachineFunction();
+  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
+    *MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
+  DAG.getContext()->diagnose(InvalidAddrSpaceCast);
+
+  return DAG.getUNDEF(ASC->getValueType(0));
+}
+
 SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                              SDValue Op,
                                              SelectionDAG &DAG) const {
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -270,6 +270,10 @@
     ScratchWaveOffsetReg = Reg;
   }
 
+  unsigned getQueuePtrUserSGPR() const {
+    return QueuePtrUserSGPR;
+  }
+
   bool hasSpilledSGPRs() const {
     return HasSpilledSGPRs;
   }
Index: test/CodeGen/AMDGPU/addrspacecast.ll
===================================================================
--- test/CodeGen/AMDGPU/addrspacecast.ll
+++ test/CodeGen/AMDGPU/addrspacecast.ll
@@ -1,18 +1,208 @@
-; RUN: not llc -O0 -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=HSA %s
 
-; ERROR: addrspacecast not implemented
+; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast:
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_dispatch_ptr = 0
+; HSA: enable_sgpr_queue_ptr = 1
 
-; XUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s
-; XUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s
-; XUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s
-; XUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s
+; HSA-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
+; HSA-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
+
+; HSA-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
+; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+
+; HSA-DAG: v_cmp_ne_i32_e64 vcc, -1, [[PTR]]
+; HSA-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]]
+; HSA-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
+; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
+
+; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
+define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 {
+  %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
+  store volatile i32 7, i32 addrspace(4)* %stof
+  ret void
+}
+
+; HSA-LABEL: {{^}}use_private_to_flat_addrspacecast:
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_dispatch_ptr = 0
+; HSA: enable_sgpr_queue_ptr = 1
+
+; HSA-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
+; HSA-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}}
+
+; HSA-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
+; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+
+; HSA-DAG: v_cmp_ne_i32_e64 vcc, -1, [[PTR]]
+; HSA-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]]
+; HSA-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
+; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
+
+; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
+define void @use_private_to_flat_addrspacecast(i32* %ptr) #0 {
+  %stof = addrspacecast i32* %ptr to i32 addrspace(4)*
+  store volatile i32 7, i32 addrspace(4)* %stof
+  ret void
+}
+
+; no-op
+; HSA-LABEL: {{^}}use_global_to_flat_addrspacecast:
+; HSA: enable_sgpr_queue_ptr = 0
+
+; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}
+; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
+; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
+; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
+; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
+define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 {
+  %stof = addrspacecast i32 addrspace(1)* %ptr to i32 addrspace(4)*
+  store volatile i32 7, i32 addrspace(4)* %stof
+  ret void
+}
+
+; no-op
+; HSA-LABEL: {{^}}use_constant_to_flat_addrspacecast:
+; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}
+; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
+; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
+; HSA: flat_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}
+define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #0 {
+  %stof = addrspacecast i32 addrspace(2)* %ptr to i32 addrspace(4)*
+  %ld = load volatile i32, i32 addrspace(4)* %stof
+  ret void
+}
+
+; HSA-LABEL: {{^}}use_flat_to_group_addrspacecast:
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_dispatch_ptr = 0
+; HSA: enable_sgpr_queue_ptr = 0
+
+; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}
+; HSA-DAG: v_cmp_ne_i64_e64 vcc, 0, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}
+; HSA-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
+; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
+; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
+; HSA: ds_write_b32 [[CASTPTR]], v[[K]]
+define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #0 {
+  %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(3)*
+  store volatile i32 0, i32 addrspace(3)* %ftos
+  ret void
+}
+
+; HSA-LABEL: {{^}}use_flat_to_private_addrspacecast:
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_dispatch_ptr = 0
+; HSA: enable_sgpr_queue_ptr = 0
+
+; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}
+; HSA-DAG: v_cmp_ne_i64_e64 vcc, 0, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}
+; HSA-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
+; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
+; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
+; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #0 {
+  %ftos = addrspacecast i32 addrspace(4)* %ptr to i32*
+  store volatile i32 0, i32* %ftos
+  ret void
+}
+
+; HSA-LABEL: {{^}}use_flat_to_global_addrspacecast:
+; HSA: enable_sgpr_queue_ptr = 0
+
+; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0
+; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
+; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
+; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0
+; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
+define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 {
+  %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)*
+  store volatile i32 0, i32 addrspace(1)* %ftos
+  ret void
+}
+
+; HSA-LABEL: {{^}}use_flat_to_constant_addrspacecast:
+; HSA: enable_sgpr_queue_ptr = 0
+
+; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0
+; HSA: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, 0x0
+define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #0 {
+  %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(2)*
+  load volatile i32, i32 addrspace(2)* %ftos
+  ret void
+}
+
+; HSA-LABEL: {{^}}cast_0_group_to_flat_addrspacecast:
+; HSA: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10
+; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
+; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
+; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
+define void @cast_0_group_to_flat_addrspacecast() #0 {
+  %cast = addrspacecast i32 addrspace(3)* null to i32 addrspace(4)*
+  store i32 7, i32 addrspace(4)* %cast
+  ret void
+}
+
+; HSA-LABEL: {{^}}cast_0_flat_to_group_addrspacecast:
+; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
+; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
+; HSA: ds_write_b32 [[PTR]], [[K]]
+define void @cast_0_flat_to_group_addrspacecast() #0 {
+  %cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(3)*
+  store i32 7, i32 addrspace(3)* %cast
+  ret void
+}
+
+; HSA-LABEL: {{^}}cast_neg1_group_to_flat_addrspacecast:
+; HSA: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
+; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
+define void @cast_neg1_group_to_flat_addrspacecast() #0 {
+  %cast = addrspacecast i32 addrspace(3)* inttoptr (i32 -1 to i32 addrspace(3)*) to i32 addrspace(4)*
+  store i32 7, i32 addrspace(4)* %cast
+  ret void
+}
+
+; HSA-LABEL: {{^}}cast_neg1_flat_to_group_addrspacecast:
+; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
+; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
+; HSA: ds_write_b32 [[PTR]], [[K]]
+define void @cast_neg1_flat_to_group_addrspacecast() #0 {
+  %cast = addrspacecast i32 addrspace(4)* inttoptr (i64 -1 to i32 addrspace(4)*) to i32 addrspace(3)*
+  store i32 7, i32 addrspace(3)* %cast
+  ret void
+}
+
+; HSA-LABEL: {{^}}cast_0_private_to_flat_addrspacecast:
+; HSA: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11
+; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
+; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
+; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
+define void @cast_0_private_to_flat_addrspacecast() #0 {
+  %cast = addrspacecast i32* null to i32 addrspace(4)*
+  store i32 7, i32 addrspace(4)* %cast
+  ret void
+}
+
+; HSA-LABEL: {{^}}cast_0_flat_to_private_addrspacecast:
+; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
+; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
+; HSA: buffer_store_dword [[K]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
+define void @cast_0_flat_to_private_addrspacecast() #0 {
+  %cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(0)*
+  store i32 7, i32* %cast
+  ret void
+}
 
 ; Disable optimizations in case there are optimizations added that
 ; specialize away generic pointer accesses.
 
-; CHECK-LABEL: {{^}}branch_use_flat_i32:
-; CHECK: flat_store_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
-; CHECK: s_endpgm
+; HSA-LABEL: {{^}}branch_use_flat_i32:
+; HSA: flat_store_dword {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}
+; HSA: s_endpgm
 define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 {
 entry:
   %cmp = icmp ne i32 %c, 0
@@ -34,20 +224,17 @@
   ret void
 }
 
-; TODO: This should not be zero when registers are used for small
-; scratch allocations again.
-
 ; Check for prologue initializing special SGPRs pointing to scratch.
-; CHECK-LABEL: {{^}}store_flat_scratch:
-; CHECK: s_movk_i32 flat_scratch_lo, 0
-; CHECK-NO-PROMOTE: s_movk_i32 flat_scratch_hi, 0x28{{$}}
-; CHECK-PROMOTE: s_movk_i32 flat_scratch_hi, 0x0{{$}}
-; CHECK: flat_store_dword
-; CHECK: s_barrier
-; CHECK: flat_load_dword
+; HSA-LABEL: {{^}}store_flat_scratch:
+; HSA: s_mov_b32 flat_scratch_lo, s9
+; HSA: s_add_u32 [[ADD:s[0-9]+]], s8, s11
+; HSA: s_lshr_b32 flat_scratch_hi, [[ADD]], 8
+; HSA: flat_store_dword
+; HSA: s_barrier
+; HSA: flat_load_dword
 define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
   %alloca = alloca i32, i32 9, align 4
-  %x = call i32 @llvm.amdgcn.workitem.id.x() #3
+  %x = call i32 @llvm.amdgcn.workitem.id.x() #2
   %pptr = getelementptr i32, i32* %alloca, i32 %x
   %fptr = addrspacecast i32* %pptr to i32 addrspace(4)*
   store i32 %x, i32 addrspace(4)* %fptr
@@ -59,8 +246,8 @@
 }
 
 declare void @llvm.amdgcn.s.barrier() #1
-declare i32 @llvm.amdgcn.workitem.id.x() #3
+declare i32 @llvm.amdgcn.workitem.id.x() #2
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind convergent }
-attributes #3 = { nounwind readnone }
+attributes #2 = { nounwind readnone }
Index: test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
===================================================================
--- test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
+++ test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
@@ -164,6 +164,63 @@
   ret void
 }
 
+; HSA: define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #11 {
+define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 {
+  %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
+  store volatile i32 0, i32 addrspace(4)* %stof
+  ret void
+}
+
+; HSA: define void @use_private_to_flat_addrspacecast(i32* %ptr) #11 {
+define void @use_private_to_flat_addrspacecast(i32* %ptr) #1 {
+  %stof = addrspacecast i32* %ptr to i32 addrspace(4)*
+  store volatile i32 0, i32 addrspace(4)* %stof
+  ret void
+}
+
+; HSA: define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+  %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(3)*
+  store volatile i32 0, i32 addrspace(3)* %ftos
+  ret void
+}
+
+; HSA: define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+  %ftos = addrspacecast i32 addrspace(4)* %ptr to i32*
+  store volatile i32 0, i32* %ftos
+  ret void
+}
+
+; No-op addrspacecast should not use queue ptr
+; HSA: define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
+define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
+  %stof = addrspacecast i32 addrspace(1)* %ptr to i32 addrspace(4)*
+  store volatile i32 0, i32 addrspace(4)* %stof
+  ret void
+}
+
+; HSA: define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
+define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
+  %stof = addrspacecast i32 addrspace(2)* %ptr to i32 addrspace(4)*
+  %ld = load volatile i32, i32 addrspace(4)* %stof
+  ret void
+}
+
+; HSA: define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+  %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)*
+  store volatile i32 0, i32 addrspace(1)* %ftos
+  ret void
+}
+
+; HSA: define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+  %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(2)*
+  %ld = load volatile i32, i32 addrspace(2)* %ftos
+  ret void
+}
+
 attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind }
Index: test/CodeGen/AMDGPU/invalid-addrspacecast.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/invalid-addrspacecast.ll
@@ -0,0 +1,8 @@
+; RUN: not llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+
+; ERROR: error: <unknown>:0:0: in function use_group_to_global_addrspacecast void (i32 addrspace(3)*): invalid addrspacecast
+define void @use_group_to_global_addrspacecast(i32 addrspace(3)* %ptr) {
+  %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(1)*
+  store volatile i32 0, i32 addrspace(1)* %stof
+  ret void
+}
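Note (not part of the patch): the conversions selected by lowerADDRSPACECAST amount to the pointer math sketched below, under the same assumptions the code makes (segment null is -1, flat null is 0, and the aperture high dword comes from amd_queue_t at offset 0x40 for group or 0x44 for private). The function and variable names here are illustrative only and do not appear in the patch.

// Standalone C++ sketch (illustrative only): the value-level behavior of the
// group/private <-> flat casts implemented by lowerADDRSPACECAST.
#include <cstdint>
#include <cstdio>

// Widen a 32-bit group/private (segment) pointer to a 64-bit flat pointer.
// 'ApertureHi' models the dword read from amd_queue_t at offset 0x40
// (group) or 0x44 (private); -1 is the segment null, 0 the flat null.
static uint64_t segmentToFlat(uint32_t SegPtr, uint32_t ApertureHi) {
  if (SegPtr == UINT32_MAX)
    return 0;
  return (uint64_t(ApertureHi) << 32) | SegPtr;
}

// Narrow a 64-bit flat pointer back to a 32-bit segment pointer by
// truncation, mapping the flat null back to the segment null.
static uint32_t flatToSegment(uint64_t FlatPtr) {
  if (FlatPtr == 0)
    return UINT32_MAX;
  return uint32_t(FlatPtr);
}

int main() {
  uint32_t LDSOffset = 0x100;      // example group (LDS) offset
  uint32_t ApertureHi = 0x1000;    // example aperture base, high 32 bits
  uint64_t Flat = segmentToFlat(LDSOffset, ApertureHi);
  printf("flat = 0x%llx, segment = 0x%x\n",
         (unsigned long long)Flat, flatToSegment(Flat));
  return 0;
}

On the little-endian GCN target, the BUILD_VECTOR of (Src, Aperture) followed by a BITCAST to i64 in the lowering produces exactly this layout: the segment offset in the low 32 bits and the aperture base in the high 32 bits.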