diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -1305,6 +1305,15 @@
     DstTy = Dst.getType()->getElementType();
   }
 
+  llvm::PointerType *SrcPtrTy = llvm::dyn_cast<llvm::PointerType>(SrcTy);
+  llvm::PointerType *DstPtrTy = llvm::dyn_cast<llvm::PointerType>(DstTy);
+  if (SrcPtrTy && DstPtrTy &&
+      SrcPtrTy->getAddressSpace() != DstPtrTy->getAddressSpace()) {
+    Src = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(Src, DstTy);
+    CGF.Builder.CreateStore(Src, Dst, DstIsVolatile);
+    return;
+  }
+
   // If the source and destination are integer or pointer types, just do an
   // extension or truncation to the desired type.
   if ((isa<llvm::IntegerType>(SrcTy) || isa<llvm::PointerType>(SrcTy)) &&
diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp
--- a/clang/lib/CodeGen/TargetInfo.cpp
+++ b/clang/lib/CodeGen/TargetInfo.cpp
@@ -7685,6 +7685,42 @@
   bool isHomogeneousAggregateSmallEnough(const Type *Base,
                                          uint64_t Members) const override;
 
+  // Coerce HIP pointer arguments from generic pointers to global ones.
+  llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS,
+                                       unsigned ToAS) const {
+    // Structure types.
+    if (auto STy = dyn_cast<llvm::StructType>(Ty)) {
+      SmallVector<llvm::Type *, 8> EltTys;
+      bool Changed = false;
+      for (auto T : STy->elements()) {
+        auto NT = coerceKernelArgumentType(T, FromAS, ToAS);
+        EltTys.push_back(NT);
+        Changed |= (NT != T);
+      }
+      // Skip if there is no change in element types.
+      if (!Changed)
+        return STy;
+      if (STy->hasName())
+        return llvm::StructType::create(
+            EltTys, (STy->getName() + ".coerce").str(), STy->isPacked());
+      return llvm::StructType::get(getVMContext(), EltTys, STy->isPacked());
+    }
+    // Array types.
+    if (auto ATy = dyn_cast<llvm::ArrayType>(Ty)) {
+      auto T = ATy->getElementType();
+      auto NT = coerceKernelArgumentType(T, FromAS, ToAS);
+      // Skip if there is no change in that element type.
+      if (NT == T)
+        return ATy;
+      return llvm::ArrayType::get(NT, ATy->getNumElements());
+    }
+    // Single value types.
+    if (Ty->isPointerTy() && Ty->getPointerAddressSpace() == FromAS)
+      return llvm::PointerType::get(
+          cast<llvm::PointerType>(Ty)->getElementType(), ToAS);
+    return Ty;
+  }
+
 public:
   explicit AMDGPUABIInfo(CodeGen::CodeGenTypes &CGT) : DefaultABIInfo(CGT) {}
 
@@ -7812,14 +7848,22 @@
 
   // TODO: Can we omit empty structs?
 
-  // Coerce single element structs to its element.
+  llvm::Type *LTy = nullptr;
   if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
-    return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));
+    LTy = CGT.ConvertType(QualType(SeltTy, 0));
+
+  if (getContext().getLangOpts().HIP) {
+    if (!LTy)
+      LTy = CGT.ConvertType(Ty);
+    LTy = coerceKernelArgumentType(
+        LTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
+        /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
+  }
 
   // If we set CanBeFlattened to true, CodeGen will expand the struct to its
   // individual elements, which confuses the Clover OpenCL backend; therefore we
   // have to set it to false here. Other args of getDirect() are just defaults.
-  return ABIArgInfo::getDirect(nullptr, 0, nullptr, false);
+  return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
 }
 
 ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty,
diff --git a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
@@ -0,0 +1,69 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -x hip %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -x hip %s -o - | FileCheck -check-prefix=HOST %s
+
+#include "Inputs/cuda.h"
+
+// Coerced struct from `struct S` with all generic pointers lowered into
+// global ones.
+// CHECK: %struct.S.coerce = type { i32 addrspace(1)*, float addrspace(1)* }
+// CHECK: %struct.T.coerce = type { [2 x float addrspace(1)*] }
+
+// On the host-side compilation, generic pointer won't be coerced.
+// HOST-NOT: %struct.S.coerce
+// HOST-NOT: %struct.T.coerce
+
+// CHECK: define amdgpu_kernel void @_Z7kernel1Pi(i32 addrspace(1)* %x.coerce)
+// HOST: define void @_Z7kernel1Pi.stub(i32* %x)
+__global__ void kernel1(int *x) {
+  x[0]++;
+}
+
+// CHECK: define amdgpu_kernel void @_Z7kernel2Ri(i32 addrspace(1)* dereferenceable(4) %x.coerce)
+// HOST: define void @_Z7kernel2Ri.stub(i32* dereferenceable(4) %x)
+__global__ void kernel2(int &x) {
+  x++;
+}
+
+// CHECK: define amdgpu_kernel void @_Z7kernel3PU3AS2iPU3AS1i(i32 addrspace(2)* %x, i32 addrspace(1)* %y)
+// HOST: define void @_Z7kernel3PU3AS2iPU3AS1i.stub(i32 addrspace(2)* %x, i32 addrspace(1)* %y)
+__global__ void kernel3(__attribute__((address_space(2))) int *x,
+                        __attribute__((address_space(1))) int *y) {
+  y[0] = x[0];
+}
+
+// CHECK: define void @_Z4funcPi(i32* %x)
+__device__ void func(int *x) {
+  x[0]++;
+}
+
+struct S {
+  int *x;
+  float *y;
+};
+// `by-val` struct will be coerced into a similar struct with all generic
+// pointers lowered into global ones.
+// CHECK: define amdgpu_kernel void @_Z7kernel41S(%struct.S.coerce %s.coerce)
+// HOST: define void @_Z7kernel41S.stub(i32* %s.coerce0, float* %s.coerce1)
+__global__ void kernel4(struct S s) {
+  s.x[0]++;
+  s.y[0] += 1.f;
+}
+
+// If a pointer to struct is passed, only the pointer itself is coerced into the global one.
+// CHECK: define amdgpu_kernel void @_Z7kernel5P1S(%struct.S addrspace(1)* %s.coerce)
+// HOST: define void @_Z7kernel5P1S.stub(%struct.S* %s)
+__global__ void kernel5(struct S *s) {
+  s->x[0]++;
+  s->y[0] += 1.f;
+}
+
+struct T {
+  float *x[2];
+};
+// `by-val` array is also coerced.
+// CHECK: define amdgpu_kernel void @_Z7kernel61T(%struct.T.coerce %t.coerce)
+// HOST: define void @_Z7kernel61T.stub(float* %t.coerce0, float* %t.coerce1)
+__global__ void kernel6(struct T t) {
+  t.x[0][0] += 1.f;
+  t.x[1][0] += 2.f;
+}