diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -1305,6 +1305,14 @@ DstTy = Dst.getType()->getElementType(); } + if (isa(SrcTy) && + isa(DstTy) && + SrcTy->getPointerAddressSpace() != DstTy->getPointerAddressSpace()) { + Src = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(Src, DstTy); + CGF.Builder.CreateStore(Src, Dst, DstIsVolatile); + return; + } + // If the source and destination are integer or pointer types, just do an // extension or truncation to the desired type. if ((isa(SrcTy) || isa(SrcTy)) && diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp --- a/clang/lib/CodeGen/TargetInfo.cpp +++ b/clang/lib/CodeGen/TargetInfo.cpp @@ -7685,6 +7685,53 @@ bool isHomogeneousAggregateSmallEnough(const Type *Base, uint64_t Members) const override; + // Coercion type builder for lower HIP pointer argument from generic pointer + // to global pointer. + class CoerceGenericPointerTypeBuilder { + llvm::LLVMContext &Context; + unsigned DefaultAS; + unsigned GlobalAS; + + public: + CoerceGenericPointerTypeBuilder(llvm::LLVMContext &VMCtx, unsigned DAS, + unsigned GAS) + : Context(VMCtx), DefaultAS(DAS), GlobalAS(GAS) {} + + llvm::Type *coerce(llvm::Type *Ty) { + // Structure types. + if (auto STy = dyn_cast(Ty)) { + SmallVector EltTys; + bool Changed = false; + for (auto T : STy->elements()) { + auto NT = coerce(T); + EltTys.push_back(NT); + Changed |= (NT != T); + } + // Skip if there is no change in element types. + if (!Changed) + return STy; + if (STy->hasName()) + return llvm::StructType::create( + EltTys, (STy->getName() + ".coerce").str(), STy->isPacked()); + return llvm::StructType::get(Context, EltTys, STy->isPacked()); + } + // Arrary types. + if (auto ATy = dyn_cast(Ty)) { + auto T = ATy->getElementType(); + auto NT = coerce(T); + // Skip if there is no change in that element type. + if (NT == T) + return ATy; + return llvm::ArrayType::get(NT, ATy->getNumElements()); + } + // Single value types. + if (Ty->isPointerTy() && Ty->getPointerAddressSpace() == DefaultAS) + return llvm::PointerType::get( + cast(Ty)->getElementType(), GlobalAS); + return Ty; + } + }; + public: explicit AMDGPUABIInfo(CodeGen::CodeGenTypes &CGT) : DefaultABIInfo(CGT) {} @@ -7812,14 +7859,23 @@ // TODO: Can we omit empty structs? - // Coerce single element structs to its element. + llvm::Type *LTy = nullptr; if (const Type *SeltTy = isSingleElementStruct(Ty, getContext())) - return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0))); + LTy = CGT.ConvertType(QualType(SeltTy, 0)); + + if (getContext().getLangOpts().HIP) { + if (!LTy) + LTy = CGT.ConvertType(Ty); + CoerceGenericPointerTypeBuilder Builder(getVMContext(), + getContext().getTargetAddressSpace(LangAS::Default), + getContext().getTargetAddressSpace(LangAS::cuda_device)); + LTy = Builder.coerce(LTy); + } // If we set CanBeFlattened to true, CodeGen will expand the struct to its // individual elements, which confuses the Clover OpenCL backend; therefore we // have to set it to false here. Other args of getDirect() are just defaults. - return ABIArgInfo::getDirect(nullptr, 0, nullptr, false); + return ABIArgInfo::getDirect(LTy, 0, nullptr, false); } ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty, diff --git a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu new file mode 100644 --- /dev/null +++ b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu @@ -0,0 +1,23 @@ +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device \ +// RUN: -emit-llvm -x hip %s -o - | FileCheck %s +#include "Inputs/cuda.h" +// CHECK: define amdgpu_kernel void @_Z7kernel1Pi(i32 addrspace(1)* %x.coerce) +__global__ void kernel1(int *x) { + x[0]++; +} + +// CHECK: define amdgpu_kernel void @_Z7kernel2Ri(i32 addrspace(1)* dereferenceable(4) %x.coerce) +__global__ void kernel2(int &x) { + x++; +} + +// CHECK: define amdgpu_kernel void @_Z7kernel3PU3AS2iPU3AS1i(i32 addrspace(2)* %x, i32 addrspace(1)* %y) +__global__ void kernel3(__attribute__((address_space(2))) int *x, + __attribute__((address_space(1))) int *y) { + y[0] = x[0]; +} + +// CHECK: define void @_Z4funcPi(i32* %x) +__device__ void func(int *x) { + x[0]++; +}