diff --git a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
--- a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
@@ -1,37 +1,52 @@
-// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -x hip %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -x hip %s -o - | FileCheck --check-prefixes=COMMON,CHECK %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -x hip %s -disable-O0-optnone -o - | opt -S -O2 | FileCheck %s --check-prefixes=COMMON,OPT
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -x hip %s -o - | FileCheck -check-prefix=HOST %s
 
 #include "Inputs/cuda.h"
 
 // Coerced struct from `struct S` with all generic pointers lowered into
 // global ones.
-// CHECK: %struct.S.coerce = type { i32 addrspace(1)*, float addrspace(1)* }
-// CHECK: %struct.T.coerce = type { [2 x float addrspace(1)*] }
+// COMMON: %struct.S.coerce = type { i32 addrspace(1)*, float addrspace(1)* }
+// COMMON: %struct.T.coerce = type { [2 x float addrspace(1)*] }
 
 // On the host-side compilation, generic pointers won't be coerced.
 // HOST-NOT: %struct.S.coerce
 // HOST-NOT: %struct.T.coerce
 
-// CHECK: define amdgpu_kernel void @_Z7kernel1Pi(i32 addrspace(1)* %x.coerce)
 // HOST: define void @_Z22__device_stub__kernel1Pi(i32* %x)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel1Pi(i32 addrspace(1)*{{.*}} %x.coerce)
+// CHECK: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// CHECK-NOT: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// OPT-NOT: alloca
+// OPT-NOT: inttoptr
 __global__ void kernel1(int *x) {
   x[0]++;
 }
 
-// CHECK: define amdgpu_kernel void @_Z7kernel2Ri(i32 addrspace(1)* nonnull align 4 dereferenceable(4) %x.coerce)
 // HOST: define void @_Z22__device_stub__kernel2Ri(i32* nonnull align 4 dereferenceable(4) %x)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel2Ri(i32 addrspace(1)*{{.*}} nonnull align 4 dereferenceable(4) %x.coerce)
+// CHECK: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// CHECK-NOT: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// OPT-NOT: alloca
+// OPT-NOT: ptrtoint
+// OPT-NOT: inttoptr
 __global__ void kernel2(int &x) {
   x++;
 }
 
-// CHECK: define amdgpu_kernel void @_Z7kernel3PU3AS2iPU3AS1i(i32 addrspace(2)* %x, i32 addrspace(1)* %y)
 // HOST: define void @_Z22__device_stub__kernel3PU3AS2iPU3AS1i(i32 addrspace(2)* %x, i32 addrspace(1)* %y)
+// CHECK-LABEL: define amdgpu_kernel void @_Z7kernel3PU3AS2iPU3AS1i(i32 addrspace(2)*{{.*}} %x, i32 addrspace(1)*{{.*}} %y)
+// CHECK-NOT: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
 __global__ void kernel3(__attribute__((address_space(2))) int *x,
                         __attribute__((address_space(1))) int *y) {
   y[0] = x[0];
 }
 
-// CHECK: define void @_Z4funcPi(i32* %x)
+// COMMON-LABEL: define void @_Z4funcPi(i32*{{.*}} %x)
+// CHECK-NOT: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// OPT-NOT: alloca
+// OPT-NOT: ptrtoint
+// OPT-NOT: inttoptr
 __device__ void func(int *x) {
   x[0]++;
 }
@@ -42,16 +57,22 @@
 };
 // `by-val` struct will be coerced into a similar struct with all generic
 // pointers lowered into global ones.
-// CHECK: define amdgpu_kernel void @_Z7kernel41S(%struct.S.coerce %s.coerce)
 // HOST: define void @_Z22__device_stub__kernel41S(i32* %s.coerce0, float* %s.coerce1)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel41S(%struct.S.coerce %s.coerce)
+// OPT-NOT: alloca
+// OPT-NOT: ptrtoint
+// OPT-NOT: inttoptr
 __global__ void kernel4(struct S s) {
   s.x[0]++;
   s.y[0] += 1.f;
 }
 // If a pointer to struct is passed, only the pointer itself is coerced into the global one.
-// CHECK: define amdgpu_kernel void @_Z7kernel5P1S(%struct.S addrspace(1)* %s.coerce)
 // HOST: define void @_Z22__device_stub__kernel5P1S(%struct.S* %s)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel5P1S(%struct.S addrspace(1)*{{.*}} %s.coerce)
+// OPT-NOT: alloca
+// OPT-NOT: ptrtoint
+// OPT-NOT: inttoptr
 __global__ void kernel5(struct S *s) {
   s->x[0]++;
   s->y[0] += 1.f;
 }
@@ -61,16 +82,22 @@
   float *x[2];
 };
 // `by-val` array is also coerced.
-// CHECK: define amdgpu_kernel void @_Z7kernel61T(%struct.T.coerce %t.coerce)
 // HOST: define void @_Z22__device_stub__kernel61T(float* %t.coerce0, float* %t.coerce1)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel61T(%struct.T.coerce %t.coerce)
+// OPT-NOT: alloca
+// OPT-NOT: ptrtoint
+// OPT-NOT: inttoptr
 __global__ void kernel6(struct T t) {
   t.x[0][0] += 1.f;
   t.x[1][0] += 2.f;
 }
 
 // Check that coerced pointers retain the noalias attribute when qualified with __restrict.
-// CHECK: define amdgpu_kernel void @_Z7kernel7Pi(i32 addrspace(1)* noalias %x.coerce)
 // HOST: define void @_Z22__device_stub__kernel7Pi(i32* noalias %x)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel7Pi(i32 addrspace(1)* noalias{{.*}} %x.coerce)
+// OPT-NOT: alloca
+// OPT-NOT: ptrtoint
+// OPT-NOT: inttoptr
 __global__ void kernel7(int *__restrict x) {
   x[0]++;
 }
diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
--- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -143,6 +143,7 @@
 /// InferAddressSpaces
 class InferAddressSpaces : public FunctionPass {
   const TargetTransformInfo *TTI = nullptr;
+  const DataLayout *DL = nullptr;
 
   /// Target specific address space which uses of should be replaced if
   /// possible.
@@ -219,10 +220,25 @@
 INITIALIZE_PASS(InferAddressSpaces, DEBUG_TYPE, "Infer address spaces", false,
                 false)
 
+static bool isNoopPtrIntCastPair(const Operator *I2P, const DataLayout *DL) {
+  assert(I2P->getOpcode() == Instruction::IntToPtr);
+  auto *P2I = dyn_cast<Operator>(I2P->getOperand(0));
+  if (!P2I || P2I->getOpcode() != Instruction::PtrToInt)
+    return false;
+  // The pair of `ptrtoint`/`inttoptr` is a no-op cast if both of them are
+  // no-op casts.
+  return CastInst::isNoopCast(Instruction::CastOps(I2P->getOpcode()),
+                              I2P->getOperand(0)->getType(), I2P->getType(),
+                              *DL) &&
+         CastInst::isNoopCast(Instruction::CastOps(P2I->getOpcode()),
+                              P2I->getOperand(0)->getType(), P2I->getType(),
+                              *DL);
+}
+
 // Returns true if V is an address expression.
 // TODO: Currently, we consider only phi, bitcast, addrspacecast, and
 // getelementptr operators.
-static bool isAddressExpression(const Value &V) {
+static bool isAddressExpression(const Value &V, const DataLayout *DL) {
   const Operator *Op = dyn_cast<Operator>(&V);
   if (!Op)
     return false;
@@ -241,6 +257,8 @@
     const IntrinsicInst *II = dyn_cast<IntrinsicInst>(&V);
     return II && II->getIntrinsicID() == Intrinsic::ptrmask;
   }
+  case Instruction::IntToPtr:
+    return isNoopPtrIntCastPair(Op, DL);
   default:
     return false;
   }
 }
@@ -249,7 +267,8 @@
 // Returns the pointer operands of V.
 //
 // Precondition: V is an address expression.
-static SmallVector<Value *, 2> getPointerOperands(const Value &V) {
+static SmallVector<Value *, 2> getPointerOperands(const Value &V,
+                                                  const DataLayout *DL) {
   const Operator &Op = cast<Operator>(V);
   switch (Op.getOpcode()) {
   case Instruction::PHI: {
@@ -269,6 +288,11 @@
            "unexpected intrinsic call");
     return {II.getArgOperand(0)};
   }
+  case Instruction::IntToPtr: {
+    assert(isNoopPtrIntCastPair(&Op, DL));
+    auto *P2I = cast<Operator>(Op.getOperand(0));
+    return {P2I->getOperand(0)};
+  }
   default:
     llvm_unreachable("Unexpected instruction type.");
   }
 }
@@ -337,13 +361,13 @@
   // expressions.
   if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
     // TODO: Look in non-address parts, like icmp operands.
-    if (isAddressExpression(*CE) && Visited.insert(CE).second)
+    if (isAddressExpression(*CE, DL) && Visited.insert(CE).second)
       PostorderStack.emplace_back(CE, false);
 
     return;
   }
 
-  if (isAddressExpression(*V) &&
+  if (isAddressExpression(*V, DL) &&
       V->getType()->getPointerAddressSpace() == FlatAddrSpace) {
     if (Visited.insert(V).second) {
      PostorderStack.emplace_back(V, false);
@@ -351,7 +375,7 @@
       Operator *Op = cast<Operator>(V);
       for (unsigned I = 0, E = Op->getNumOperands(); I != E; ++I) {
         if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Op->getOperand(I))) {
-          if (isAddressExpression(*CE) && Visited.insert(CE).second)
+          if (isAddressExpression(*CE, DL) && Visited.insert(CE).second)
             PostorderStack.emplace_back(CE, false);
         }
       }
@@ -407,6 +431,10 @@
     } else if (auto *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
       if (!ASC->getType()->isVectorTy())
         PushPtrOperand(ASC->getPointerOperand());
+    } else if (auto *I2P = dyn_cast<IntToPtrInst>(&I)) {
+      if (isNoopPtrIntCastPair(cast<Operator>(I2P), DL))
+        PushPtrOperand(
+            cast<PtrToIntInst>(I2P->getOperand(0))->getPointerOperand());
     }
   }
@@ -423,7 +451,7 @@
     }
     // Otherwise, adds its operands to the stack and explores them.
     PostorderStack.back().setInt(true);
-    for (Value *PtrOperand : getPointerOperands(*TopVal)) {
+    for (Value *PtrOperand : getPointerOperands(*TopVal, DL)) {
       appendsFlatAddressExpressionToPostorderStack(PtrOperand, PostorderStack,
                                                    Visited);
     }
@@ -536,6 +564,14 @@
     assert(I->getType()->isPointerTy());
     return SelectInst::Create(I->getOperand(0), NewPointerOperands[1],
                               NewPointerOperands[2], "", nullptr, I);
+  case Instruction::IntToPtr: {
+    assert(isNoopPtrIntCastPair(cast<Operator>(I), DL));
+    Value *Src = cast<Operator>(I->getOperand(0))->getOperand(0);
+    assert(Src->getType()->getPointerAddressSpace() == NewAddrSpace);
+    if (Src->getType() != NewPtrType)
+      return new BitCastInst(Src, NewPtrType);
+    return Src;
+  }
   default:
     llvm_unreachable("Unexpected opcode");
   }
@@ -629,7 +665,7 @@
     const ValueToValueMapTy &ValueWithNewAddrSpace,
     SmallVectorImpl<const Use *> *UndefUsesToFix) const {
   // All values in Postorder are flat address expressions.
-  assert(isAddressExpression(*V) &&
+  assert(isAddressExpression(*V, DL) &&
          V->getType()->getPointerAddressSpace() == FlatAddrSpace);
 
   if (Instruction *I = dyn_cast<Instruction>(V)) {
@@ -669,6 +705,7 @@
     return false;
 
   TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+  DL = &F.getParent()->getDataLayout();
 
   if (FlatAddrSpace == UninitializedAddressSpace) {
     FlatAddrSpace = TTI->getFlatAddressSpace();
@@ -773,7 +810,7 @@
     else
       NewAS = joinAddressSpaces(Src0AS, Src1AS);
   } else {
-    for (Value *PtrOperand : getPointerOperands(V)) {
+    for (Value *PtrOperand : getPointerOperands(V, DL)) {
       auto I = InferredAddrSpace.find(PtrOperand);
       unsigned OperandAS = I != InferredAddrSpace.end() ?
         I->second : PtrOperand->getType()->getPointerAddressSpace();
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -1703,8 +1703,10 @@
   NewTy = NewTy->getScalarType();
   if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
     if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
-      return cast<PointerType>(NewTy)->getPointerAddressSpace() ==
-        cast<PointerType>(OldTy)->getPointerAddressSpace();
+      unsigned OldAS = cast<PointerType>(OldTy)->getPointerAddressSpace();
+      unsigned NewAS = cast<PointerType>(NewTy)->getPointerAddressSpace();
+      return OldAS == NewAS ||
+             DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS);
     }
 
     // We can convert integers to integral pointers, but not to non-integral
@@ -1772,6 +1774,17 @@
     return IRB.CreatePtrToInt(V, NewTy);
   }
 
+  if (OldTy->isPtrOrPtrVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
+    unsigned OldAS = OldTy->getPointerAddressSpace();
+    unsigned NewAS = NewTy->getPointerAddressSpace();
+    // Generate a pair of ptrtoint/inttoptr casts.
+    if (OldAS != NewAS) {
+      assert(DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
+      return IRB.CreateIntToPtr(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
+                                NewTy);
+    }
+  }
+
   return IRB.CreateBitCast(V, NewTy);
 }
diff --git a/llvm/test/Transforms/InferAddressSpaces/noop-ptrint-pair.ll b/llvm/test/Transforms/InferAddressSpaces/noop-ptrint-pair.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/InferAddressSpaces/noop-ptrint-pair.ll
@@ -0,0 +1,17 @@
+; RUN: opt -S -o - -sroa -infer-address-spaces %s | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+; CHECK-LABEL: @noop_ptrint_pair(
+; CHECK-NEXT: store i32 0, i32 addrspace(1)* %{{.*}}
+; CHECK-NEXT: ret void
+define void @noop_ptrint_pair(i32 addrspace(1)* %x.coerce) {
+  %x = alloca i32*, align 8, addrspace(5)
+  %x1 = addrspacecast i32* addrspace(5)* %x to i32**
+  %x2 = bitcast i32** %x1 to i32 addrspace(1)**
+  store i32 addrspace(1)* %x.coerce, i32 addrspace(1)** %x2
+  %x3 = load i32*, i32** %x1
+  store i32 0, i32* %x3
+  ret void
+}
diff --git a/llvm/test/Transforms/SROA/address-spaces.ll b/llvm/test/Transforms/SROA/address-spaces.ll
--- a/llvm/test/Transforms/SROA/address-spaces.ll
+++ b/llvm/test/Transforms/SROA/address-spaces.ll
@@ -1,5 +1,5 @@
 ; RUN: opt < %s -sroa -S | FileCheck %s
-target datalayout = "e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
+target datalayout = "e-p:64:64:64-p1:16:16:16-p3:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
 
 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i1)
 declare void @llvm.memcpy.p1i8.p0i8.i32(i8 addrspace(1)* nocapture, i8* nocapture readonly, i32, i1)
@@ -71,7 +71,8 @@
 @g = common global i32 0, align 4
 @l = common addrspace(3) global i32 0, align 4
 
-; Make sure an illegal bitcast isn't introduced
+; If pointers from different address spaces have different sizes, make sure an
+; illegal bitcast isn't introduced
 define void @pr27557() {
 ; CHECK-LABEL: @pr27557(
 ; CHECK: %[[CAST:.*]] = bitcast i32** {{.*}} to i32 addrspace(3)**
@@ -84,6 +85,21 @@
   ret void
 }
 
+@l2 = common addrspace(2) global i32 0, align 4
+
+; If pointers from different address spaces have the same size, the pointer
+; should be promoted through a pair of `ptrtoint`/`inttoptr` casts.
+define i32* @pr27557.alt() {
+; CHECK-LABEL: @pr27557.alt(
+; CHECK: ret i32* inttoptr (i64 ptrtoint (i32 addrspace(2)* @l2 to i64) to i32*)
+  %1 = alloca %union.anon, align 8
+  %2 = bitcast %union.anon* %1 to i32 addrspace(2)**
+  store i32 addrspace(2)* @l2, i32 addrspace(2)** %2, align 8
+  %3 = bitcast %union.anon* %1 to i32**
+  %4 = load i32*, i32** %3, align 8
+  ret i32* %4
+}
+
 ; Make sure pre-splitting doesn't try to introduce an illegal bitcast
 define float @presplit(i64 addrspace(1)* %p) {
 entry:
diff --git a/llvm/test/Transforms/SROA/alloca-address-space.ll b/llvm/test/Transforms/SROA/alloca-address-space.ll
--- a/llvm/test/Transforms/SROA/alloca-address-space.ll
+++ b/llvm/test/Transforms/SROA/alloca-address-space.ll
@@ -1,5 +1,5 @@
 ; RUN: opt < %s -sroa -S | FileCheck %s
-target datalayout = "e-p:64:64:64-p1:16:16:16-p2:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64-A2"
+target datalayout = "e-p:64:64:64-p1:16:16:16-p2:32:32-p3:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64-A2"
 
 declare void @llvm.memcpy.p2i8.p2i8.i32(i8 addrspace(2)* nocapture, i8 addrspace(2)* nocapture readonly, i32, i1)
 declare void @llvm.memcpy.p1i8.p2i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(2)* nocapture readonly, i32, i1)
@@ -70,7 +70,8 @@
 @g = common global i32 0, align 4
 @l = common addrspace(3) global i32 0, align 4
 
-; Make sure an illegal bitcast isn't introduced
+; If pointers from different address spaces have different sizes, make sure an
+; illegal bitcast isn't introduced
 ; CHECK-LABEL: @pr27557(
 ; CHECK: %[[CAST:.*]] = bitcast i32* addrspace(2)* {{.*}} to i32 addrspace(3)* addrspace(2)*
 ; CHECK: store i32 addrspace(3)* @l, i32 addrspace(3)* addrspace(2)* %[[CAST]]
@@ -83,6 +84,21 @@
   ret void
 }
 
+@l4 = common addrspace(4) global i32 0, align 4
+
+; If pointers from different address spaces have the same size, the pointer
+; should be promoted through a pair of `ptrtoint`/`inttoptr` casts.
+define i32* @pr27557.alt() {
+; CHECK-LABEL: @pr27557.alt(
+; CHECK: ret i32* inttoptr (i64 ptrtoint (i32 addrspace(4)* @l4 to i64) to i32*)
+  %1 = alloca %union.anon, align 8, addrspace(2)
+  %2 = bitcast %union.anon addrspace(2)* %1 to i32 addrspace(4)* addrspace(2)*
+  store i32 addrspace(4)* @l4, i32 addrspace(4)* addrspace(2)* %2, align 8
+  %3 = bitcast %union.anon addrspace(2)* %1 to i32* addrspace(2)*
+  %4 = load i32*, i32* addrspace(2)* %3, align 8
+  ret i32* %4
+}
+
 ; Test load from and store to non-zero address space.
 define void @test_load_store_diff_addr_space([2 x float] addrspace(1)* %complex1, [2 x float] addrspace(1)* %complex2) {
 ; CHECK-LABEL: @test_load_store_diff_addr_space
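
For reference, a minimal sketch of the end-to-end effect the patch aims for (illustrative only; @example and the %arg/%int/%gen names are hypothetical, not taken from the patch or its tests). On amdgcn, generic (p0) and global (p1) pointers are both 64 bits wide, so the round-trip that SROA now emits is a pair of no-op casts, and isNoopPtrIntCastPair lets InferAddressSpaces look through it to the original addrspace(1) argument:

; Hypothetical input to -infer-address-spaces, in the shape SROA leaves
; behind after promoting the alloca of a coerced kernel argument:
define void @example(i32 addrspace(1)* %arg) {
  %int = ptrtoint i32 addrspace(1)* %arg to i64 ; no-op: p1 pointers are 64 bits
  %gen = inttoptr i64 %int to i32*              ; no-op: p0 pointers are 64 bits
  store i32 0, i32* %gen
  ret void
}

; Expected output: the cast pair is treated as an address expression rooted
; at %arg, so the flat store is rewritten into address space 1 and the
; ptrtoint/inttoptr pair becomes dead:
define void @example(i32 addrspace(1)* %arg) {
  store i32 0, i32 addrspace(1)* %arg
  ret void
}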
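
The size check matters in the other direction as well. When the two address spaces have different pointer widths, the pair is not a no-op: SROA's canConvertValue rejects the conversion (the DL.getPointerSize values differ), and isNoopPtrIntCastPair returns false, so InferAddressSpaces leaves any such casts untouched. A hypothetical example on the same target, where private (p5) pointers are only 32 bits wide:

define void @not_noop(i32 addrspace(5)* %p) {
  %int = ptrtoint i32 addrspace(5)* %p to i64 ; widens 32 to 64: not a no-op
  %gen = inttoptr i64 %int to i32*            ; pair must be kept as-is
  store i32 0, i32* %gen
  ret void
}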