Index: llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -903,17 +903,34 @@
   case Intrinsic::ptrmask: {
     unsigned OldAS = OldV->getType()->getPointerAddressSpace();
     unsigned NewAS = NewV->getType()->getPointerAddressSpace();
-    if (!getTLI()->isNoopAddrSpaceCast(OldAS, NewAS))
-      return nullptr;
-
-    Module *M = II->getParent()->getParent()->getParent();
     Value *MaskOp = II->getArgOperand(1);
     Type *MaskTy = MaskOp->getType();
-    Function *NewDecl = Intrinsic::getDeclaration(M, Intrinsic::ptrmask,
-                                                  {NewV->getType(), MaskTy});
-    CallInst *NewCall = CallInst::Create(NewDecl->getFunctionType(), NewDecl,
-                                         {NewV, MaskOp}, "", II);
-    return NewCall;
+
+    bool DoTruncate = false;
+    if (!getTLI()->isNoopAddrSpaceCast(OldAS, NewAS)) {
+      // All valid 64-bit to 32-bit casts work by chopping off the high
+      // bits. Any masking only clearing the low bits will also apply in the
+      // new address space.
+      if (DL.getPointerSizeInBits(OldAS) != 64 ||
+          DL.getPointerSizeInBits(NewAS) != 32)
+        return nullptr;
+
+      // TODO: Do we need to thread more context in here?
+      KnownBits Known = computeKnownBits(MaskOp, DL, 0, nullptr, II);
+      if (Known.countMinLeadingOnes() < 32)
+        return nullptr;
+
+      DoTruncate = true;
+    }
+
+    IRBuilder<> B(II);
+    if (DoTruncate) {
+      MaskTy = B.getInt32Ty();
+      MaskOp = B.CreateTrunc(MaskOp, MaskTy);
+    }
+
+    return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
+                             {NewV, MaskOp});
   }
   default:
     return nullptr;
Index: llvm/test/Transforms/InferAddressSpaces/AMDGPU/ptrmask.ll
===================================================================
--- llvm/test/Transforms/InferAddressSpaces/AMDGPU/ptrmask.ll
+++ llvm/test/Transforms/InferAddressSpaces/AMDGPU/ptrmask.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -infer-address-spaces -instsimplify %s | FileCheck %s
 
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
 define i8 @ptrmask_cast_local_to_flat(i8 addrspace(3)* %src.ptr, i64 %mask) {
 ; CHECK-LABEL: @ptrmask_cast_local_to_flat(
 ; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast i8 addrspace(3)* [[SRC_PTR:%.*]] to i8*
@@ -179,9 +181,192 @@
   ret i8 %add
 }
 
+; Do not fold this since it clears a single high bit.
+define i8 @ptrmask_cast_local_to_flat_const_mask_fffffffeffffffff(i8 addrspace(3)* %src.ptr) {
+; CHECK-LABEL: @ptrmask_cast_local_to_flat_const_mask_fffffffeffffffff(
+; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast i8 addrspace(3)* [[SRC_PTR:%.*]] to i8*
+; CHECK-NEXT:    [[MASKED:%.*]] = call i8* @llvm.ptrmask.p0i8.i64(i8* [[CAST]], i64 -4294967297)
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8* [[MASKED]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %cast = addrspacecast i8 addrspace(3)* %src.ptr to i8*
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %cast, i64 -4294967297)
+  %load = load i8, i8* %masked
+  ret i8 %load
+}
+
+; Do not fold this since it clears a single high bit.
+define i8 @ptrmask_cast_local_to_flat_const_mask_7fffffffffffffff(i8 addrspace(3)* %src.ptr) {
+; CHECK-LABEL: @ptrmask_cast_local_to_flat_const_mask_7fffffffffffffff(
+; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast i8 addrspace(3)* [[SRC_PTR:%.*]] to i8*
+; CHECK-NEXT:    [[MASKED:%.*]] = call i8* @llvm.ptrmask.p0i8.i64(i8* [[CAST]], i64 9223372036854775807)
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8* [[MASKED]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %cast = addrspacecast i8 addrspace(3)* %src.ptr to i8*
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %cast, i64 9223372036854775807)
+  %load = load i8, i8* %masked
+  ret i8 %load
+}
+
+define i8 @ptrmask_cast_local_to_flat_const_mask_ffffffff00000000(i8 addrspace(3)* %src.ptr) {
+; CHECK-LABEL: @ptrmask_cast_local_to_flat_const_mask_ffffffff00000000(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i32(i8 addrspace(3)* [[SRC_PTR:%.*]], i32 0)
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(3)* [[TMP1]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %cast = addrspacecast i8 addrspace(3)* %src.ptr to i8*
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %cast, i64 -4294967296)
+  %load = load i8, i8* %masked
+  ret i8 %load
+}
+
+define i8 @ptrmask_cast_local_to_flat_const_mask_ffffffff80000000(i8 addrspace(3)* %src.ptr) {
+; CHECK-LABEL: @ptrmask_cast_local_to_flat_const_mask_ffffffff80000000(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i32(i8 addrspace(3)* [[SRC_PTR:%.*]], i32 -2147483648)
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(3)* [[TMP1]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %cast = addrspacecast i8 addrspace(3)* %src.ptr to i8*
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %cast, i64 -2147483648)
+  %load = load i8, i8* %masked
+  ret i8 %load
+}
+
+; Test some align-down patterns. These only touch the low bits, which are preserved through the cast.
+define i8 @ptrmask_cast_local_to_flat_const_mask_ffffffffffff0000(i8 addrspace(3)* %src.ptr) {
+; CHECK-LABEL: @ptrmask_cast_local_to_flat_const_mask_ffffffffffff0000(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i32(i8 addrspace(3)* [[SRC_PTR:%.*]], i32 -65536)
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(3)* [[TMP1]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %cast = addrspacecast i8 addrspace(3)* %src.ptr to i8*
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %cast, i64 -65536)
+  %load = load i8, i8* %masked
+  ret i8 %load
+}
+
+define i8 @ptrmask_cast_local_to_flat_const_mask_ffffffffffffff00(i8 addrspace(3)* %src.ptr) {
+; CHECK-LABEL: @ptrmask_cast_local_to_flat_const_mask_ffffffffffffff00(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i32(i8 addrspace(3)* [[SRC_PTR:%.*]], i32 -256)
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(3)* [[TMP1]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %cast = addrspacecast i8 addrspace(3)* %src.ptr to i8*
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %cast, i64 -256)
+  %load = load i8, i8* %masked
+  ret i8 %load
+}
+
+define i8 @ptrmask_cast_local_to_flat_const_mask_ffffffffffffffe0(i8 addrspace(3)* %src.ptr) {
+; CHECK-LABEL: @ptrmask_cast_local_to_flat_const_mask_ffffffffffffffe0(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i32(i8 addrspace(3)* [[SRC_PTR:%.*]], i32 -32)
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(3)* [[TMP1]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %cast = addrspacecast i8 addrspace(3)* %src.ptr to i8*
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %cast, i64 -32)
+  %load = load i8, i8* %masked
+  ret i8 %load
+}
+
+define i8 @ptrmask_cast_local_to_flat_const_mask_fffffffffffffff0(i8 addrspace(3)* %src.ptr) {
+; CHECK-LABEL: @ptrmask_cast_local_to_flat_const_mask_fffffffffffffff0(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i32(i8 addrspace(3)* [[SRC_PTR:%.*]], i32 -16)
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(3)* [[TMP1]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %cast = addrspacecast i8 addrspace(3)* %src.ptr to i8*
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %cast, i64 -16)
+  %load = load i8, i8* %masked
+  ret i8 %load
+}
+
+define i8 @ptrmask_cast_local_to_flat_const_mask_fffffffffffffff8(i8 addrspace(3)* %src.ptr) {
+; CHECK-LABEL: @ptrmask_cast_local_to_flat_const_mask_fffffffffffffff8(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i32(i8 addrspace(3)* [[SRC_PTR:%.*]], i32 -8)
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(3)* [[TMP1]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %cast = addrspacecast i8 addrspace(3)* %src.ptr to i8*
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %cast, i64 -8)
+  %load = load i8, i8* %masked
+  ret i8 %load
+}
+
+define i8 @ptrmask_cast_local_to_flat_const_mask_fffffffffffffffc(i8 addrspace(3)* %src.ptr) {
+; CHECK-LABEL: @ptrmask_cast_local_to_flat_const_mask_fffffffffffffffc(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i32(i8 addrspace(3)* [[SRC_PTR:%.*]], i32 -4)
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(3)* [[TMP1]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %cast = addrspacecast i8 addrspace(3)* %src.ptr to i8*
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %cast, i64 -4)
+  %load = load i8, i8* %masked
+  ret i8 %load
+}
+
+define i8 @ptrmask_cast_local_to_flat_const_mask_fffffffffffffffe(i8 addrspace(3)* %src.ptr) {
+; CHECK-LABEL: @ptrmask_cast_local_to_flat_const_mask_fffffffffffffffe(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i32(i8 addrspace(3)* [[SRC_PTR:%.*]], i32 -2)
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(3)* [[TMP1]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %cast = addrspacecast i8 addrspace(3)* %src.ptr to i8*
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %cast, i64 -2)
+  %load = load i8, i8* %masked
+  ret i8 %load
+}
+
+define i8 @ptrmask_cast_local_to_flat_const_mask_ffffffffffffffff(i8 addrspace(3)* %src.ptr) {
+; CHECK-LABEL: @ptrmask_cast_local_to_flat_const_mask_ffffffffffffffff(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i32(i8 addrspace(3)* [[SRC_PTR:%.*]], i32 -1)
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(3)* [[TMP1]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %cast = addrspacecast i8 addrspace(3)* %src.ptr to i8*
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %cast, i64 -1)
+  %load = load i8, i8* %masked
+  ret i8 %load
+}
+
+; Make sure non-constant masks can also be handled.
+define i8 @ptrmask_cast_local_to_flat_load_range_mask(i8 addrspace(3)* %src.ptr, i64 addrspace(1)* %mask.ptr) {
+; CHECK-LABEL: @ptrmask_cast_local_to_flat_load_range_mask(
+; CHECK-NEXT:    [[LOAD_MASK:%.*]] = load i64, i64 addrspace(1)* [[MASK_PTR:%.*]], align 8, !range !0
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[LOAD_MASK]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i32(i8 addrspace(3)* [[SRC_PTR:%.*]], i32 [[TMP1]])
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(3)* [[TMP2]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %load.mask = load i64, i64 addrspace(1)* %mask.ptr, align 8, !range !0
+  %cast = addrspacecast i8 addrspace(3)* %src.ptr to i8*
+  %masked = call i8* @llvm.ptrmask.p0i8.i64(i8* %cast, i64 %load.mask)
+  %load = load i8, i8* %masked
+  ret i8 %load
+}
+
+; This should not be folded, as the mask is implicitly zero extended,
+; so it would clear the high bits.
+define i8 @ptrmask_cast_local_to_flat_const_mask_32bit_neg4(i8 addrspace(3)* %src.ptr) {
+; CHECK-LABEL: @ptrmask_cast_local_to_flat_const_mask_32bit_neg4(
+; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast i8 addrspace(3)* [[SRC_PTR:%.*]] to i8*
+; CHECK-NEXT:    [[MASKED:%.*]] = call i8* @llvm.ptrmask.p0i8.i32(i8* [[CAST]], i32 -4)
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8* [[MASKED]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %cast = addrspacecast i8 addrspace(3)* %src.ptr to i8*
+  %masked = call i8* @llvm.ptrmask.p0i8.i32(i8* %cast, i32 -4)
+  %load = load i8, i8* %masked
+  ret i8 %load
+}
+
 declare i8* @llvm.ptrmask.p0i8.i64(i8*, i64) #0
+declare i8* @llvm.ptrmask.p0i8.i32(i8*, i32) #0
 declare i8 addrspace(5)* @llvm.ptrmask.p5i8.i32(i8 addrspace(5)*, i32) #0
 declare i8 addrspace(3)* @llvm.ptrmask.p3i8.i32(i8 addrspace(3)*, i32) #0
 declare i8 addrspace(1)* @llvm.ptrmask.p1i8.i64(i8 addrspace(1)*, i64) #0
 
 attributes #0 = { nounwind readnone speculatable willreturn }
+
+!0 = !{i64 -64, i64 -1}
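Reviewer note, not part of the patch: the sketch below restates the reasoning behind the new countMinLeadingOnes() check in the AMDGPUTargetTransformInfo.cpp hunk above, as a small standalone C++ snippet against LLVM's KnownBits/APInt. The helper name truncIsSafe and the exact-known-bits modelling of a constant mask are illustrative assumptions, not code from the patch.

// Standalone sketch (hypothetical helper, for illustration only): why a
// 64-bit ptrmask mask with at least 32 known leading one bits can be
// truncated to i32 when the flat (64-bit) pointer is rewritten to a 32-bit
// address space.
#include <cassert>
#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;

static bool truncIsSafe(const APInt &Mask) {
  assert(Mask.getBitWidth() == 64 && "sketch assumes a 64-bit flat mask");
  // Model a constant mask as fully known bits.
  KnownBits Known(64);
  Known.One = Mask;
  Known.Zero = ~Mask;
  // The fold only fires when every bit the 64-bit to 32-bit cast discards is
  // known to be one, i.e. the mask can only clear low bits that survive the
  // cast, so applying the truncated mask in the 32-bit space is equivalent.
  return Known.countMinLeadingOnes() >= 32;
}

// truncIsSafe(APInt(64, 0xFFFFFFFFFFFFFFF0ULL)) -> true  (align down by 16)
// truncIsSafe(APInt(64, 0x7FFFFFFFFFFFFFFFULL)) -> false (clears bit 63)

The non-constant case in the tests (the !range-annotated load) works the same way: computeKnownBits proves the loaded mask lies in [-64, -2], so its top 32 bits are all ones even though its exact value is unknown.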