Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1812,6 +1812,25 @@
   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
 }
 
+/// Return true if the value is a known valid address, such that a null check is
+/// not necessary.
+static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
+                           const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
+  MachineInstr *Def = MRI.getVRegDef(Val);
+  switch (Def->getOpcode()) {
+  case AMDGPU::G_FRAME_INDEX:
+  case AMDGPU::G_GLOBAL_VALUE:
+  case AMDGPU::G_BLOCK_ADDR:
+    return true;
+  case AMDGPU::G_CONSTANT: {
+    const ConstantInt *CI = Def->getOperand(1).getCImm();
+    return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
+  }
+  default:
+    return false;
+  }
+}
+
 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
   MachineInstr &MI, MachineRegisterInfo &MRI,
   MachineIRBuilder &B) const {
@@ -1862,6 +1881,14 @@
   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
+
+    if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
+      // Extract the low 32 bits of the pointer.
+      B.buildExtract(Dst, Src, 0);
+      MI.eraseFromParent();
+      return true;
+    }
+
     unsigned NullVal = TM.getNullPointerValue(DestAS);
 
     auto SegmentNull = B.buildConstant(DstTy, NullVal);
@@ -1884,24 +1911,29 @@
     if (!ST.hasFlatAddressSpace())
       return false;
 
-    auto SegmentNull =
-      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
-    auto FlatNull =
-      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
-
     Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
     if (!ApertureReg.isValid())
       return false;
 
-    auto CmpRes =
-        B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
-
     // Coerce the type of the low half of the result so we can use merge_values.
     Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
 
     // TODO: Should we allow mismatched types but matching sizes in merges to
     // avoid the ptrtoint?
     auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
+
+    if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
+      B.buildCopy(Dst, BuildPtr);
+      MI.eraseFromParent();
+      return true;
+    }
+
+    auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
+    auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
+
+    auto CmpRes =
+        B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
+
     B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
 
     MI.eraseFromParent();
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5501,6 +5501,22 @@
                                  MachineMemOperand::MOInvariant);
 }
 
+/// Return true if the value is a known valid address, such that a null check is
+/// not necessary.
+static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
+                           const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
+  if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
+      isa<BasicBlockSDNode>(Val))
+    return true;
+
+  if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
+    return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
+
+  // TODO: Search through arithmetic, handle arguments and loads
+  // marked nonnull.
+  return false;
+}
+
 SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
                                              SelectionDAG &DAG) const {
   SDLoc SL(Op);
@@ -5508,44 +5524,51 @@
   SDValue Src = ASC->getOperand(0);
   SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
 
+  unsigned SrcAS = ASC->getSrcAddressSpace();
+
   const AMDGPUTargetMachine &TM =
     static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
 
   // flat -> local/private
-  if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
+  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
     unsigned DestAS = ASC->getDestAddressSpace();
 
     if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
         DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
+      SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
+
+      if (isKnownNonNull(Src, DAG, TM, SrcAS))
+        return Ptr;
+
       unsigned NullVal = TM.getNullPointerValue(DestAS);
       SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
       SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
-      SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
 
-      return DAG.getNode(ISD::SELECT, SL, MVT::i32,
-                         NonNull, Ptr, SegmentNullPtr);
+      return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
+                         SegmentNullPtr);
     }
   }
 
   // local/private -> flat
   if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
-    unsigned SrcAS = ASC->getSrcAddressSpace();
-
     if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
         SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
+
+      SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
+      SDValue CvtPtr =
+          DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
+      CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
+
+      if (isKnownNonNull(Src, DAG, TM, SrcAS))
+        return CvtPtr;
+
       unsigned NullVal = TM.getNullPointerValue(SrcAS);
       SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
 
      SDValue NonNull
        = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
 
-      SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
-      SDValue CvtPtr
-        = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
-
-      return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
-                         DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
+      return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
                          FlatNullPtr);
     }
   }
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir
@@ -174,28 +174,28 @@
     ; VI-LABEL: name: test_addrspacecast_p5_to_p0
    ; VI: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
    ; VI-NEXT: [[COPY1:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; VI-NEXT: [[C:%[0-9]+]]:_(p5) = G_CONSTANT i32 -1
-    ; VI-NEXT: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
    ; VI-NEXT: [[COPY2:%[0-9]+]]:_(p4) = COPY [[COPY]](p4)
-    ; VI-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 68
-    ; VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY2]], [[C2]](s64)
+    ; VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 68
+    ; VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY2]], [[C]](s64)
    ; VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), addrspace 4)
-    ; VI-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY1]](p5), [[C]]
    ; VI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY1]](p5)
    ; VI-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[LOAD]](s32)
-    ; VI-NEXT: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C1]]
+    ; VI-NEXT: [[C1:%[0-9]+]]:_(p5) = G_CONSTANT i32 -1
+    ; VI-NEXT: [[C2:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
+    ; VI-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY1]](p5), [[C1]]
+    ; VI-NEXT: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C2]]
    ; VI-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](p0)
    ; GFX9-LABEL: name: test_addrspacecast_p5_to_p0
    ; GFX9: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(p5) = G_CONSTANT i32 -1
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
    ; GFX9-NEXT: [[S_GETREG_B32_:%[0-9]+]]:sreg_32(s32) = S_GETREG_B32 30735
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[S_GETREG_B32_]], [[C2]](s32)
-    ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY]](p5), [[C]]
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[S_GETREG_B32_]], [[C]](s32)
    ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p5)
    ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[SHL]](s32)
-    ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C1]]
+    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(p5) = G_CONSTANT i32 -1
+    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
+    ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY]](p5), [[C1]]
+    ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C2]]
    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](p0)
    ; SI-LABEL: name: test_addrspacecast_p5_to_p0
    ; SI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
@@ -257,28 +257,28 @@
    ; VI-LABEL: name: test_addrspacecast_p3_to_p0
    ; VI: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
    ; VI-NEXT: [[COPY1:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; VI-NEXT: [[C:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1
-    ; VI-NEXT: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
    ; VI-NEXT: [[COPY2:%[0-9]+]]:_(p4) = COPY [[COPY]](p4)
-    ; VI-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
-    ; VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY2]], [[C2]](s64)
+    ; VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
+    ; VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY2]], [[C]](s64)
    ; VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), align 64, addrspace 4)
-    ; VI-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY1]](p3), [[C]]
    ; VI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY1]](p3)
    ; VI-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[LOAD]](s32)
-    ; VI-NEXT: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C1]]
+    ; VI-NEXT: [[C1:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1
+    ; VI-NEXT: [[C2:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
+    ; VI-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY1]](p3), [[C1]]
+    ; VI-NEXT: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C2]]
    ; VI-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](p0)
    ; GFX9-LABEL: name: test_addrspacecast_p3_to_p0
    ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
    ; GFX9-NEXT: [[S_GETREG_B32_:%[0-9]+]]:sreg_32(s32) = S_GETREG_B32 31759
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[S_GETREG_B32_]], [[C2]](s32)
-    ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY]](p3), [[C]]
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[S_GETREG_B32_]], [[C]](s32)
    ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p3)
    ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[SHL]](s32)
-    ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C1]]
+    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1
+    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
+    ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY]](p3), [[C1]]
+    ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C2]]
    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](p0)
    ; SI-LABEL: name: test_addrspacecast_p3_to_p0
    ; SI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
@@ -463,43 +463,43 @@
    ; VI: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
    ; VI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr0_vgpr1
    ; VI-NEXT: [[UV:%[0-9]+]]:_(p3), [[UV1:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[COPY1]](<2 x p3>)
-    ; VI-NEXT: [[C:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1
-    ; VI-NEXT: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
    ; VI-NEXT: [[COPY2:%[0-9]+]]:_(p4) = COPY [[COPY]](p4)
-    ; VI-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
-    ; VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY2]], [[C2]](s64)
+    ; VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
+    ; VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY2]], [[C]](s64)
    ; VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), align 64, addrspace 4)
-    ; VI-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV]](p3), [[C]]
    ; VI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV]](p3)
    ; VI-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[LOAD]](s32)
-    ; VI-NEXT: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C1]]
+    ; VI-NEXT: [[C1:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1
+    ; VI-NEXT: [[C2:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
+    ; VI-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV]](p3), [[C1]]
+    ; VI-NEXT: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C2]]
    ; VI-NEXT: [[COPY3:%[0-9]+]]:_(p4) = COPY [[COPY]](p4)
-    ; VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY3]], [[C2]](s64)
+    ; VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY3]], [[C]](s64)
    ; VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s32), align 64, addrspace 4)
-    ; VI-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV1]](p3), [[C]]
    ; VI-NEXT: [[PTRTOINT1:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV1]](p3)
    ; VI-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT1]](s32), [[LOAD1]](s32)
-    ; VI-NEXT: [[SELECT1:%[0-9]+]]:_(p0) = G_SELECT [[ICMP1]](s1), [[MV1]], [[C1]]
+    ; VI-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV1]](p3), [[C1]]
+    ; VI-NEXT: [[SELECT1:%[0-9]+]]:_(p0) = G_SELECT [[ICMP1]](s1), [[MV1]], [[C2]]
    ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p0>) = G_BUILD_VECTOR [[SELECT]](p0), [[SELECT1]](p0)
    ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p0>)
    ; GFX9-LABEL: name: test_addrspacecast_v2p3_to_v2p0
    ; GFX9: [[COPY:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr0_vgpr1
    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(p3), [[UV1:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[COPY]](<2 x p3>)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
    ; GFX9-NEXT: [[S_GETREG_B32_:%[0-9]+]]:sreg_32(s32) = S_GETREG_B32 31759
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[S_GETREG_B32_]], [[C2]](s32)
-    ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV]](p3), [[C]]
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[S_GETREG_B32_]], [[C]](s32)
    ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV]](p3)
    ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[SHL]](s32)
-    ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C1]]
+    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1
+    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
+    ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV]](p3), [[C1]]
+    ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C2]]
    ; GFX9-NEXT: [[S_GETREG_B32_1:%[0-9]+]]:sreg_32(s32) = S_GETREG_B32 31759
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[S_GETREG_B32_1]], [[C2]](s32)
-    ; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV1]](p3), [[C]]
+    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[S_GETREG_B32_1]], [[C]](s32)
    ; GFX9-NEXT: [[PTRTOINT1:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV1]](p3)
    ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT1]](s32), [[SHL1]](s32)
-    ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(p0) = G_SELECT [[ICMP1]](s1), [[MV1]], [[C1]]
+    ; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV1]](p3), [[C1]]
+    ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(p0) = G_SELECT [[ICMP1]](s1), [[MV1]], [[C2]]
    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p0>) = G_BUILD_VECTOR [[SELECT]](p0), [[SELECT1]](p0)
    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p0>)
    ; SI-LABEL: name: test_addrspacecast_v2p3_to_v2p0
@@ -639,3 +639,40 @@
     %1:_(p0) = G_ADDRSPACE_CAST %0
     $vgpr0_vgpr1 = COPY %1
 ...
+---
+name: test_addrspacecast_p5_fi_to_p0
+machineFunctionInfo:
+  argumentInfo:
+    queuePtr: { reg: '$sgpr4_sgpr5' }
+stack:
+  - { id: 0, size: 4, alignment: 4 }
+body: |
+  bb.0:
+    ; VI-LABEL: name: test_addrspacecast_p5_fi_to_p0
+    ; VI: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    ; VI-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0
+    ; VI-NEXT: [[COPY1:%[0-9]+]]:_(p4) = COPY [[COPY]](p4)
+    ; VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 68
+    ; VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY1]], [[C]](s64)
+    ; VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), addrspace 4)
+    ; VI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[FRAME_INDEX]](p5)
+    ; VI-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[LOAD]](s32)
+    ; VI-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY [[MV]](p0)
+    ; VI-NEXT: $vgpr0_vgpr1 = COPY [[COPY2]](p0)
+    ; GFX9-LABEL: name: test_addrspacecast_p5_fi_to_p0
+    ; GFX9: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0
+    ; GFX9-NEXT: [[S_GETREG_B32_:%[0-9]+]]:sreg_32(s32) = S_GETREG_B32 30735
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[S_GETREG_B32_]], [[C]](s32)
+    ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[FRAME_INDEX]](p5)
+    ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[SHL]](s32)
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY [[MV]](p0)
+    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[COPY]](p0)
+    ; SI-LABEL: name: test_addrspacecast_p5_fi_to_p0
+    ; SI: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0
+    ; SI-NEXT: [[ADDRSPACE_CAST:%[0-9]+]]:_(p0) = G_ADDRSPACE_CAST [[FRAME_INDEX]](p5)
+    ; SI-NEXT: $vgpr0_vgpr1 = COPY [[ADDRSPACE_CAST]](p0)
+    %0:_(p5) = G_FRAME_INDEX %stack.0
+    %1:_(p0) = G_ADDRSPACE_CAST %0
+    $vgpr0_vgpr1 = COPY %1
+...
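Note on the pointer construction the MIR checks above exercise: for a local/private -> flat cast, the lowering packs the 32-bit segment offset into the low half of the 64-bit flat pointer and the aperture base (read via S_GETREG_B32 and shifted left by 16) into the high half; the compare-and-select survives only when the source may be null. A minimal sketch of that arithmetic as scalar LLVM IR; the function names are illustrative and not part of the patch:

  ; Packing performed by the lowering: segment offset in the low 32 bits,
  ; aperture base in the high 32 bits (G_MERGE_VALUES / BUILD_VECTOR+BITCAST).
  define i64 @build_flat_ptr(i32 %offset, i32 %aperture) {
    %lo = zext i32 %offset to i64
    %hi = zext i32 %aperture to i64
    %hi.shifted = shl i64 %hi, 32
    %flat = or i64 %hi.shifted, %lo
    ret i64 %flat
  }

  ; The guarded variant additionally maps the segment null value (-1 for
  ; LDS/private) to the flat null pointer (0); the patch skips this select
  ; when the source is known non-null.
  define i64 @cast_segment_to_flat_guarded(i32 %offset, i32 %aperture) {
    %flat = call i64 @build_flat_ptr(i32 %offset, i32 %aperture)
    %nonnull = icmp ne i32 %offset, -1
    %res = select i1 %nonnull, i64 %flat, i64 0
    ret i64 %res
  }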
Index: llvm/test/CodeGen/AMDGPU/addrspacecast-known-non-null.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/addrspacecast-known-non-null.ll
@@ -0,0 +1,75 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck %s
+
+; Test that no null check is emitted for a lowered addrspacecast whose source is known non-null.
+
+
+define void @flat_user(i8* %ptr) {
+  store i8 0, i8* %ptr
+  ret void
+}
+
+; CHECK-LABEL: {{^}}cast_alloca:
+; CHECK: s_getreg_b32 [[GETREG:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 0, 16)
+; CHECK-NEXT: s_lshl_b32 [[APERTURE:s[0-9]+]], [[GETREG]], 16
+; CHECK-NEXT: v_lshrrev_b32_e64 v0, 6, s33
+; CHECK-NEXT: v_mov_b32_e32 v1, [[APERTURE]]
+; CHECK-NOT: v0
+; CHECK-NOT: v1
+define void @cast_alloca() {
+  %alloca = alloca i8, addrspace(5)
+  %cast = addrspacecast i8 addrspace(5)* %alloca to i8*
+  call void @flat_user(i8* %cast)
+  ret void
+}
+
+@lds = internal unnamed_addr addrspace(3) global i8 undef, align 4
+
+; CHECK-LABEL: {{^}}cast_lds_gv:
+; CHECK: s_getreg_b32 [[GETREG:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16)
+; CHECK-NEXT: s_lshl_b32 [[APERTURE:s[0-9]+]], [[GETREG]], 16
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: v_mov_b32_e32 v1, [[APERTURE]]
+; CHECK-NOT: v0
+; CHECK-NOT: v1
+define void @cast_lds_gv() {
+  %cast = addrspacecast i8 addrspace(3)* @lds to i8*
+  call void @flat_user(i8* %cast)
+  ret void
+}
+
+; CHECK-LABEL: {{^}}cast_constant_lds_neg1_gv:
+; CHECK: v_mov_b32_e32 v0, 0
+; CHECK: v_mov_b32_e32 v1, 0
+define void @cast_constant_lds_neg1_gv() {
+  call void @flat_user(i8* addrspacecast (i8 addrspace(3)* inttoptr (i32 -1 to i8 addrspace(3)*) to i8*))
+  ret void
+}
+
+; CHECK-LABEL: {{^}}cast_constant_private_neg1_gv:
+; CHECK: v_mov_b32_e32 v0, 0
+; CHECK: v_mov_b32_e32 v1, 0
+define void @cast_constant_private_neg1_gv() {
+  call void @flat_user(i8* addrspacecast (i8 addrspace(5)* inttoptr (i32 -1 to i8 addrspace(5)*) to i8*))
+  ret void
+}
+
+; CHECK-LABEL: {{^}}cast_constant_lds_other_gv:
+; CHECK: s_getreg_b32 [[GETREG:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16)
+; CHECK: s_lshl_b32 [[APERTURE:s[0-9]+]], [[GETREG]], 16
+; CHECK: v_mov_b32_e32 v0, 0x7b
+; CHECK: v_mov_b32_e32 v1, [[APERTURE]]
+define void @cast_constant_lds_other_gv() {
+  call void @flat_user(i8* addrspacecast (i8 addrspace(3)* inttoptr (i32 123 to i8 addrspace(3)*) to i8*))
+  ret void
+}
+
+; CHECK-LABEL: {{^}}cast_constant_private_other_gv:
+; CHECK: s_getreg_b32 [[GETREG:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 0, 16)
+; CHECK: s_lshl_b32 [[APERTURE:s[0-9]+]], [[GETREG]], 16
+; CHECK: v_mov_b32_e32 v0, 0x7b
+; CHECK: v_mov_b32_e32 v1, [[APERTURE]]
+define void @cast_constant_private_other_gv() {
+  call void @flat_user(i8* addrspacecast (i8 addrspace(5)* inttoptr (i32 123 to i8 addrspace(5)*) to i8*))
+  ret void
+}
\ No newline at end of file
Index: llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
+++ llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
@@ -11,26 +11,22 @@
 ; FLAT_SCR_OPT-NEXT:    s_addc_u32 s1, s1, 0
 ; FLAT_SCR_OPT-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; FLAT_SCR_OPT-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; FLAT_SCR_OPT-NEXT:    v_mov_b32_e32 v0, 4
 ; FLAT_SCR_OPT-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16)
-; FLAT_SCR_OPT-NEXT:    v_mov_b32_e32 v2, 0
+; FLAT_SCR_OPT-NEXT:    v_mov_b32_e32 v0, 4
 ; FLAT_SCR_OPT-NEXT:    s_lshl_b32 s0, s0, 16
-; FLAT_SCR_OPT-NEXT:    v_cmp_ne_u32_e32 vcc_lo, -1, v0
-; FLAT_SCR_OPT-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc_lo
-; FLAT_SCR_OPT-NEXT:    v_cndmask_b32_e64 v1, 0, s0, vcc_lo
+; FLAT_SCR_OPT-NEXT:    v_mov_b32_e32 v2, 0
+; FLAT_SCR_OPT-NEXT:    v_mov_b32_e32 v1, s0
 ; FLAT_SCR_OPT-NEXT:    flat_store_dword v[0:1], v2
 ; FLAT_SCR_OPT-NEXT:    s_waitcnt_vscnt null, 0x0
 ; FLAT_SCR_OPT-NEXT:    s_endpgm
 ;
 ; FLAT_SCR_ARCH-LABEL: stack_object_addrspacecast_in_kernel_no_calls:
 ; FLAT_SCR_ARCH:       ; %bb.0:
-; FLAT_SCR_ARCH-NEXT:    v_mov_b32_e32 v0, 4
 ; FLAT_SCR_ARCH-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16)
-; FLAT_SCR_ARCH-NEXT:    v_mov_b32_e32 v2, 0
+; FLAT_SCR_ARCH-NEXT:    v_mov_b32_e32 v0, 4
 ; FLAT_SCR_ARCH-NEXT:    s_lshl_b32 s0, s0, 16
-; FLAT_SCR_ARCH-NEXT:    v_cmp_ne_u32_e32 vcc_lo, -1, v0
-; FLAT_SCR_ARCH-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc_lo
-; FLAT_SCR_ARCH-NEXT:    v_cndmask_b32_e64 v1, 0, s0, vcc_lo
+; FLAT_SCR_ARCH-NEXT:    v_mov_b32_e32 v2, 0
+; FLAT_SCR_ARCH-NEXT:    v_mov_b32_e32 v1, s0
 ; FLAT_SCR_ARCH-NEXT:    flat_store_dword v[0:1], v2
 ; FLAT_SCR_ARCH-NEXT:    s_waitcnt_vscnt null, 0x0
 ; FLAT_SCR_ARCH-NEXT:    s_endpgm
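For reference, an LLVM IR sketch of the cast sources the new isKnownNonNull helpers recognize. The function and value names below are illustrative only and not part of the patch; @lds mirrors the global used in the test above:

  @lds = internal addrspace(3) global i8 undef, align 4

  ; Frame indexes (allocas), globals, and constants other than the segment
  ; null value (-1 for LDS/private) now lower without the compare-and-select;
  ; an arbitrary incoming pointer still takes the guarded path.
  define void @known_non_null_examples(i8 addrspace(3)* %unknown) {
    %alloca = alloca i8, addrspace(5)
    %cast.fi = addrspacecast i8 addrspace(5)* %alloca to i8*    ; known non-null
    %cast.gv = addrspacecast i8 addrspace(3)* @lds to i8*       ; known non-null
    %cast.arg = addrspacecast i8 addrspace(3)* %unknown to i8*  ; may be null
    ret void
  }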