diff --git a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
--- a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
@@ -1,37 +1,52 @@
-// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -x hip %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -x hip %s -o - | FileCheck --check-prefixes=COMMON,CHECK %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -x hip %s -disable-O0-optnone -o - | opt -S -O2 | FileCheck %s --check-prefixes=COMMON,OPT
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -x hip %s -o - | FileCheck -check-prefix=HOST %s
 
 #include "Inputs/cuda.h"
 
 // Coerced struct from `struct S` without all generic pointers lowered into
 // global ones.
-// CHECK: %struct.S.coerce = type { i32 addrspace(1)*, float addrspace(1)* }
-// CHECK: %struct.T.coerce = type { [2 x float addrspace(1)*] }
+// COMMON: %struct.S.coerce = type { i32 addrspace(1)*, float addrspace(1)* }
+// COMMON: %struct.T.coerce = type { [2 x float addrspace(1)*] }
 
 // On the host-side compilation, generic pointer won't be coerced.
 // HOST-NOT: %struct.S.coerce
 // HOST-NOT: %struct.T.coerce
 
-// CHECK: define amdgpu_kernel void @_Z7kernel1Pi(i32 addrspace(1)* %x.coerce)
 // HOST: define void @_Z22__device_stub__kernel1Pi(i32* %x)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel1Pi(i32 addrspace(1)*{{.*}} %x.coerce)
+// CHECK: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// CHECK-NOT: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// OPT-NOT: alloca
+// OPT-NOT: inttoptr
 __global__ void kernel1(int *x) {
   x[0]++;
 }
 
-// CHECK: define amdgpu_kernel void @_Z7kernel2Ri(i32 addrspace(1)* nonnull align 4 dereferenceable(4) %x.coerce)
 // HOST: define void @_Z22__device_stub__kernel2Ri(i32* nonnull align 4 dereferenceable(4) %x)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel2Ri(i32 addrspace(1)*{{.*}} nonnull align 4 dereferenceable(4) %x.coerce)
+// CHECK: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// CHECK-NOT: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// OPT-NOT: alloca
+// OPT-NOT: ptrtoint
+// OPT-NOT: inttoptr
 __global__ void kernel2(int &x) {
   x++;
 }
 
-// CHECK: define amdgpu_kernel void @_Z7kernel3PU3AS2iPU3AS1i(i32 addrspace(2)* %x, i32 addrspace(1)* %y)
 // HOST: define void @_Z22__device_stub__kernel3PU3AS2iPU3AS1i(i32 addrspace(2)* %x, i32 addrspace(1)* %y)
+// CHECK-LABEL: define amdgpu_kernel void @_Z7kernel3PU3AS2iPU3AS1i(i32 addrspace(2)*{{.*}} %x, i32 addrspace(1)*{{.*}} %y)
+// CHECK-NOT: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
__global__ void kernel3(__attribute__((address_space(2))) int *x,
                         __attribute__((address_space(1))) int *y) {
   y[0] = x[0];
 }
 
-// CHECK: define void @_Z4funcPi(i32* %x)
+// COMMON-LABEL: define void @_Z4funcPi(i32*{{.*}} %x)
+// CHECK-NOT: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// OPT-NOT: alloca
+// OPT-NOT: ptrtoint
+// OPT-NOT: inttoptr
 __device__ void func(int *x) {
   x[0]++;
 }
 
@@ -42,16 +57,22 @@
 };
 
 // `by-val` struct will be coerced into a similar struct with all generic
 // pointers lowered into global ones.
-// CHECK: define amdgpu_kernel void @_Z7kernel41S(%struct.S.coerce %s.coerce)
 // HOST: define void @_Z22__device_stub__kernel41S(i32* %s.coerce0, float* %s.coerce1)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel41S(%struct.S.coerce %s.coerce)
+// OPT-NOT: alloca
+// OPT-NOT: ptrtoint
+// OPT-NOT: inttoptr
 __global__ void kernel4(struct S s) {
   s.x[0]++;
   s.y[0] += 1.f;
 }
 
 // If a pointer to struct is passed, only the pointer itself is coerced into the global one.
-// CHECK: define amdgpu_kernel void @_Z7kernel5P1S(%struct.S addrspace(1)* %s.coerce)
 // HOST: define void @_Z22__device_stub__kernel5P1S(%struct.S* %s)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel5P1S(%struct.S addrspace(1)*{{.*}} %s.coerce)
+// OPT-NOT: alloca
+// OPT-NOT: ptrtoint
+// OPT-NOT: inttoptr
 __global__ void kernel5(struct S *s) {
   s->x[0]++;
   s->y[0] += 1.f;
@@ -61,16 +82,22 @@
   float *x[2];
 };
 
 // `by-val` array is also coerced.
-// CHECK: define amdgpu_kernel void @_Z7kernel61T(%struct.T.coerce %t.coerce)
 // HOST: define void @_Z22__device_stub__kernel61T(float* %t.coerce0, float* %t.coerce1)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel61T(%struct.T.coerce %t.coerce)
+// OPT-NOT: alloca
+// OPT-NOT: ptrtoint
+// OPT-NOT: inttoptr
 __global__ void kernel6(struct T t) {
   t.x[0][0] += 1.f;
   t.x[1][0] += 2.f;
 }
 
 // Check that coerced pointers retain the noalias attribute when qualified with __restrict.
-// CHECK: define amdgpu_kernel void @_Z7kernel7Pi(i32 addrspace(1)* noalias %x.coerce)
 // HOST: define void @_Z22__device_stub__kernel7Pi(i32* noalias %x)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel7Pi(i32 addrspace(1)* noalias{{.*}} %x.coerce)
+// OPT-NOT: alloca
+// OPT-NOT: ptrtoint
+// OPT-NOT: inttoptr
 __global__ void kernel7(int *__restrict x) {
   x[0]++;
 }
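
The second RUN line added above pushes the O0 device IR through `opt -O2`
(`-disable-O0-optnone` keeps the kernels optimizable) and matches the result
with the OPT prefix, so the OPT-NOT lines assert that no `alloca`,
`ptrtoint`, or `inttoptr` survives optimization once the pass below learns to
look through the no-op pointer/integer round trip. For reference, a sketch of
the O0 shape the CHECK lines match for kernel1 (value names are illustrative,
not taken from clang's actual output):

    define amdgpu_kernel void @_Z7kernel1Pi(i32 addrspace(1)* %x.coerce) {
    entry:
      ; exactly one cast of the coerced argument back to the generic address
      ; space (the CHECK line), and no second one (the CHECK-NOT line)
      %x = addrspacecast i32 addrspace(1)* %x.coerce to i32*
      %v = load i32, i32* %x, align 4
      %inc = add nsw i32 %v, 1
      store i32 %inc, i32* %x, align 4
      ret void
    }
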
diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
--- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -143,6 +143,7 @@
 /// InferAddressSpaces
 class InferAddressSpaces : public FunctionPass {
   const TargetTransformInfo *TTI = nullptr;
+  const DataLayout *DL = nullptr;
 
   /// Target specific address space which uses of should be replaced if
   /// possible.
@@ -219,10 +220,30 @@
 INITIALIZE_PASS(InferAddressSpaces, DEBUG_TYPE, "Infer address spaces", false,
                 false)
 
+static bool isNoopPtrIntCastPair(const Operator *I2P, const DataLayout *DL,
+                                 const TargetTransformInfo *TTI) {
+  assert(I2P->getOpcode() == Instruction::IntToPtr);
+  auto *P2I = dyn_cast<Operator>(I2P->getOperand(0));
+  if (!P2I || P2I->getOpcode() != Instruction::PtrToInt)
+    return false;
+  // The pair of `ptrtoint`/`inttoptr` is a no-op cast if both of them are
+  // no-op casts.
+  return CastInst::isNoopCast(Instruction::CastOps(I2P->getOpcode()),
+                              I2P->getOperand(0)->getType(), I2P->getType(),
+                              *DL) &&
+         CastInst::isNoopCast(Instruction::CastOps(P2I->getOpcode()),
+                              P2I->getOperand(0)->getType(), P2I->getType(),
+                              *DL) &&
+         TTI->isNoopAddrSpaceCast(
+             P2I->getOperand(0)->getType()->getPointerAddressSpace(),
+             I2P->getType()->getPointerAddressSpace());
+}
+
 // Returns true if V is an address expression.
 // TODO: Currently, we consider only phi, bitcast, addrspacecast, and
 // getelementptr operators.
-static bool isAddressExpression(const Value &V) {
+static bool isAddressExpression(const Value &V, const DataLayout *DL,
+                                const TargetTransformInfo *TTI) {
   const Operator *Op = dyn_cast<Operator>(&V);
   if (!Op)
     return false;
@@ -241,6 +262,8 @@
     const IntrinsicInst *II = dyn_cast<IntrinsicInst>(&V);
     return II && II->getIntrinsicID() == Intrinsic::ptrmask;
   }
+  case Instruction::IntToPtr:
+    return isNoopPtrIntCastPair(Op, DL, TTI);
   default:
     return false;
   }
@@ -249,7 +272,9 @@
 // Returns the pointer operands of V.
 //
 // Precondition: V is an address expression.
-static SmallVector<Value *, 2> getPointerOperands(const Value &V) {
+static SmallVector<Value *, 2>
+getPointerOperands(const Value &V, const DataLayout *DL,
+                   const TargetTransformInfo *TTI) {
   const Operator &Op = cast<Operator>(V);
   switch (Op.getOpcode()) {
   case Instruction::PHI: {
@@ -269,6 +294,11 @@
            "unexpected intrinsic call");
     return {II.getArgOperand(0)};
   }
+  case Instruction::IntToPtr: {
+    assert(isNoopPtrIntCastPair(&Op, DL, TTI));
+    auto *P2I = cast<Operator>(Op.getOperand(0));
+    return {P2I->getOperand(0)};
+  }
   default:
     llvm_unreachable("Unexpected instruction type.");
   }
@@ -337,13 +367,13 @@
   // expressions.
   if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
     // TODO: Look in non-address parts, like icmp operands.
-    if (isAddressExpression(*CE) && Visited.insert(CE).second)
+    if (isAddressExpression(*CE, DL, TTI) && Visited.insert(CE).second)
       PostorderStack.emplace_back(CE, false);
 
     return;
   }
 
-  if (isAddressExpression(*V) &&
+  if (isAddressExpression(*V, DL, TTI) &&
       V->getType()->getPointerAddressSpace() == FlatAddrSpace) {
     if (Visited.insert(V).second) {
       PostorderStack.emplace_back(V, false);
@@ -351,7 +381,7 @@
       Operator *Op = cast<Operator>(V);
       for (unsigned I = 0, E = Op->getNumOperands(); I != E; ++I) {
         if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Op->getOperand(I))) {
-          if (isAddressExpression(*CE) && Visited.insert(CE).second)
+          if (isAddressExpression(*CE, DL, TTI) && Visited.insert(CE).second)
             PostorderStack.emplace_back(CE, false);
         }
       }
@@ -407,6 +437,10 @@
     } else if (auto *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
       if (!ASC->getType()->isVectorTy())
         PushPtrOperand(ASC->getPointerOperand());
+    } else if (auto *I2P = dyn_cast<IntToPtrInst>(&I)) {
+      if (isNoopPtrIntCastPair(cast<Operator>(I2P), DL, TTI))
+        PushPtrOperand(
+            cast<PtrToIntInst>(I2P->getOperand(0))->getPointerOperand());
     }
   }
@@ -423,7 +457,7 @@
     }
     // Otherwise, adds its operands to the stack and explores them.
     PostorderStack.back().setInt(true);
-    for (Value *PtrOperand : getPointerOperands(*TopVal)) {
+    for (Value *PtrOperand : getPointerOperands(*TopVal, DL, TTI)) {
       appendsFlatAddressExpressionToPostorderStack(PtrOperand, PostorderStack,
                                                    Visited);
     }
@@ -536,6 +570,14 @@
     assert(I->getType()->isPointerTy());
     return SelectInst::Create(I->getOperand(0), NewPointerOperands[1],
                               NewPointerOperands[2], "", nullptr, I);
+  case Instruction::IntToPtr: {
+    assert(isNoopPtrIntCastPair(cast<Operator>(I), DL, TTI));
+    Value *Src = cast<Operator>(I->getOperand(0))->getOperand(0);
+    assert(Src->getType()->getPointerAddressSpace() == NewAddrSpace);
+    if (Src->getType() != NewPtrType)
+      return new BitCastInst(Src, NewPtrType);
+    return Src;
+  }
   default:
     llvm_unreachable("Unexpected opcode");
   }
 }
@@ -545,8 +587,9 @@
 // constant expression `CE` with its operands replaced as specified in
 // ValueWithNewAddrSpace.
 static Value *cloneConstantExprWithNewAddressSpace(
-    ConstantExpr *CE, unsigned NewAddrSpace,
-    const ValueToValueMapTy &ValueWithNewAddrSpace) {
+    ConstantExpr *CE, unsigned NewAddrSpace,
+    const ValueToValueMapTy &ValueWithNewAddrSpace, const DataLayout *DL,
+    const TargetTransformInfo *TTI) {
   Type *TargetType =
       CE->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
@@ -577,6 +620,13 @@
     }
   }
 
+  if (CE->getOpcode() == Instruction::IntToPtr) {
+    assert(isNoopPtrIntCastPair(cast<Operator>(CE), DL, TTI));
+    Constant *Src = cast<ConstantExpr>(CE->getOperand(0))->getOperand(0);
+    assert(Src->getType()->getPointerAddressSpace() == NewAddrSpace);
+    return ConstantExpr::getBitCast(Src, TargetType);
+  }
+
   // Computes the operands of the new constant expression.
   bool IsNew = false;
   SmallVector<Constant *, 4> NewOperands;
@@ -594,7 +644,7 @@
     }
     if (auto CExpr = dyn_cast<ConstantExpr>(Operand))
       if (Value *NewOperand = cloneConstantExprWithNewAddressSpace(
-              CExpr, NewAddrSpace, ValueWithNewAddrSpace)) {
+              CExpr, NewAddrSpace, ValueWithNewAddrSpace, DL, TTI)) {
         IsNew = true;
         NewOperands.push_back(cast<Constant>(NewOperand));
         continue;
       }
@@ -629,7 +679,7 @@
     const ValueToValueMapTy &ValueWithNewAddrSpace,
     SmallVectorImpl<const Use *> *UndefUsesToFix) const {
   // All values in Postorder are flat address expressions.
-  assert(isAddressExpression(*V) &&
+  assert(isAddressExpression(*V, DL, TTI) &&
          V->getType()->getPointerAddressSpace() == FlatAddrSpace);
 
   if (Instruction *I = dyn_cast<Instruction>(V)) {
@@ -645,7 +695,7 @@
   }
 
   return cloneConstantExprWithNewAddressSpace(
-      cast<ConstantExpr>(V), NewAddrSpace, ValueWithNewAddrSpace);
+      cast<ConstantExpr>(V), NewAddrSpace, ValueWithNewAddrSpace, DL, TTI);
 }
 
 // Defines the join operation on the address space lattice (see the file header
@@ -669,6 +719,7 @@
     return false;
 
   TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+  DL = &F.getParent()->getDataLayout();
 
   if (FlatAddrSpace == UninitializedAddressSpace) {
     FlatAddrSpace = TTI->getFlatAddressSpace();
@@ -773,7 +824,7 @@
     else
       NewAS = joinAddressSpaces(Src0AS, Src1AS);
   } else {
-    for (Value *PtrOperand : getPointerOperands(V)) {
+    for (Value *PtrOperand : getPointerOperands(V, DL, TTI)) {
       auto I = InferredAddrSpace.find(PtrOperand);
       unsigned OperandAS = I != InferredAddrSpace.end() ?
         I->second : PtrOperand->getType()->getPointerAddressSpace();
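
Taken together, the pass changes treat a `ptrtoint`/`inttoptr` round trip as
a transparent address expression whenever both casts are value-preserving and
the target reports the address-space cast itself as a no-op. A minimal
before/after sketch, assuming a 64-bit flat address space and a target such
as amdgcn where addrspace(1) to flat is a no-op cast (names illustrative):

    ; before -infer-address-spaces: the round trip through i64 hides the
    ; source address space from the rewrite
    %i = ptrtoint i32 addrspace(1)* %p to i64
    %q = inttoptr i64 %i to i32*
    store i32 0, i32* %q

    ; after: the no-op pair is looked through and the store addresses the
    ; global pointer directly
    store i32 0, i32 addrspace(1)* %p
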
diff --git a/llvm/test/Transforms/InferAddressSpaces/noop-ptrint-pair.ll b/llvm/test/Transforms/InferAddressSpaces/noop-ptrint-pair.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/InferAddressSpaces/noop-ptrint-pair.ll
@@ -0,0 +1,45 @@
+; RUN: opt -S -o - -infer-address-spaces %s | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+; CHECK-LABEL: @noop_ptrint_pair(
+; CHECK-NEXT: store i32 0, i32 addrspace(1)* %{{.*}}
+; CHECK-NEXT: ret void
+define void @noop_ptrint_pair(i32 addrspace(1)* %x.coerce) {
+  %1 = ptrtoint i32 addrspace(1)* %x.coerce to i64
+  %2 = inttoptr i64 %1 to i32*
+  store i32 0, i32* %2
+  ret void
+}
+
+; CHECK-LABEL: @non_noop_ptrint_pair(
+; CHECK-NEXT: ptrtoint i32 addrspace(3)* %{{.*}} to i64
+; CHECK-NEXT: inttoptr i64 %{{.*}} to i32*
+; CHECK-NEXT: store i32 0, i32* %{{.*}}
+; CHECK-NEXT: ret void
+define void @non_noop_ptrint_pair(i32 addrspace(3)* %x.coerce) {
+  %1 = ptrtoint i32 addrspace(3)* %x.coerce to i64
+  %2 = inttoptr i64 %1 to i32*
+  store i32 0, i32* %2
+  ret void
+}
+
+@g = addrspace(1) global i32 0, align 4
+@l = addrspace(3) global i32 0, align 4
+
+; CHECK-LABEL: @noop_ptrint_pair_ce(
+; CHECK-NEXT: store i32 0, i32 addrspace(1)* @g
+; CHECK-NEXT: ret void
+define void @noop_ptrint_pair_ce() {
+  store i32 0, i32* inttoptr (i64 ptrtoint (i32 addrspace(1)* @g to i64) to i32*)
+  ret void
+}
+
+; CHECK-LABEL: @non_noop_ptrint_pair_ce(
+; CHECK-NEXT: store i32 0, i32* inttoptr (i64 ptrtoint (i32 addrspace(3)* @l to i64) to i32*)
+; CHECK-NEXT: ret void
+define void @non_noop_ptrint_pair_ce() {
+  store i32 0, i32* inttoptr (i64 ptrtoint (i32 addrspace(3)* @l to i64) to i32*)
+  ret void
+}
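
Note the two negative cases above: with this data layout, addrspace(3)
pointers are only 32 bits wide (`p3:32:32`), so `ptrtoint ... to i64` widens
the value, `CastInst::isNoopCast` rejects the pair, and the round trip is
deliberately left untouched. A sketch of the rejected pattern, under the same
data-layout assumption (names illustrative):

    %i = ptrtoint i32 addrspace(3)* %l to i64   ; 32 -> 64 bits: not a no-op
    %q = inttoptr i64 %i to i32*                ; pair must not be folded
    store i32 0, i32* %q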