diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp
--- a/clang/lib/CodeGen/TargetInfo.cpp
+++ b/clang/lib/CodeGen/TargetInfo.cpp
@@ -8707,42 +8707,6 @@
   bool isHomogeneousAggregateSmallEnough(const Type *Base,
                                          uint64_t Members) const override;
 
-  // Coerce HIP pointer arguments from generic pointers to global ones.
-  llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS,
-                                       unsigned ToAS) const {
-    // Structure types.
-    if (auto STy = dyn_cast<llvm::StructType>(Ty)) {
-      SmallVector<llvm::Type *, 8> EltTys;
-      bool Changed = false;
-      for (auto T : STy->elements()) {
-        auto NT = coerceKernelArgumentType(T, FromAS, ToAS);
-        EltTys.push_back(NT);
-        Changed |= (NT != T);
-      }
-      // Skip if there is no change in element types.
-      if (!Changed)
-        return STy;
-      if (STy->hasName())
-        return llvm::StructType::create(
-            EltTys, (STy->getName() + ".coerce").str(), STy->isPacked());
-      return llvm::StructType::get(getVMContext(), EltTys, STy->isPacked());
-    }
-    // Array types.
-    if (auto ATy = dyn_cast<llvm::ArrayType>(Ty)) {
-      auto T = ATy->getElementType();
-      auto NT = coerceKernelArgumentType(T, FromAS, ToAS);
-      // Skip if there is no change in that element type.
-      if (NT == T)
-        return ATy;
-      return llvm::ArrayType::get(NT, ATy->getNumElements());
-    }
-    // Single value types.
-    if (Ty->isPointerTy() && Ty->getPointerAddressSpace() == FromAS)
-      return llvm::PointerType::get(
-          cast<llvm::PointerType>(Ty)->getElementType(), ToAS);
-    return Ty;
-  }
-
 public:
   explicit AMDGPUABIInfo(CodeGen::CodeGenTypes &CGT) : DefaultABIInfo(CGT) {}
 
@@ -8873,22 +8837,9 @@
   if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
     Ty = QualType(SeltTy, 0);
 
-  llvm::Type *OrigLTy = CGT.ConvertType(Ty);
-  llvm::Type *LTy = OrigLTy;
-  if (getContext().getLangOpts().HIP) {
-    LTy = coerceKernelArgumentType(
-        OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
-        /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
-  }
-
   // FIXME: Should also use this for OpenCL, but it requires addressing the
   // problem of kernels being called.
-  //
-  // FIXME: This doesn't apply the optimization of coercing pointers in structs
-  // to global address space when using byref. This would require implementing a
-  // new kind of coercion of the in-memory type when for indirect arguments.
-  if (!getContext().getLangOpts().OpenCL && LTy == OrigLTy &&
-      isAggregateTypeForABI(Ty)) {
+  if (!getContext().getLangOpts().OpenCL && isAggregateTypeForABI(Ty)) {
     return ABIArgInfo::getIndirectAliased(
         getContext().getTypeAlignInChars(Ty),
         getContext().getTargetAddressSpace(LangAS::opencl_constant),
@@ -8898,6 +8849,7 @@
   // If we set CanBeFlattened to true, CodeGen will expand the struct to its
   // individual elements, which confuses the Clover OpenCL backend; therefore we
   // have to set it to false here. Other args of getDirect() are just defaults.
+  llvm::Type *LTy = CGT.ConvertType(Ty);
   return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
 }
 
diff --git a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
--- a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
@@ -9,32 +9,26 @@
 
 // Coerced struct from `struct S` without all generic pointers lowered into
 // global ones.
-// COMMON: %struct.S.coerce = type { i32 addrspace(1)*, float addrspace(1)* }
-// COMMON: %struct.T.coerce = type { [2 x float addrspace(1)*] }
 
 // On the host-side compilation, generic pointer won't be coerced.
 // HOST-NOT: %struct.S.coerce
 // HOST-NOT: %struct.T.coerce
 
 // HOST: define void @_Z22__device_stub__kernel1Pi(i32* %x)
-// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel1Pi(i32 addrspace(1)*{{.*}} %x.coerce)
-// CHECK: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
-// CHECK-NOT: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
-// OPT: [[VAL:%.*]] = load i32, i32 addrspace(1)* %x.coerce, align 4
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel1Pi(i32*{{.*}} %x)
+// OPT: [[VAL:%.*]] = load i32, i32* %x, align 4
 // OPT: [[INC:%.*]] = add nsw i32 [[VAL]], 1
-// OPT: store i32 [[INC]], i32 addrspace(1)* %x.coerce, align 4
+// OPT: store i32 [[INC]], i32* %x, align 4
 // OPT: ret void
 __global__ void kernel1(int *x) {
   x[0]++;
 }
 
 // HOST: define void @_Z22__device_stub__kernel2Ri(i32* nonnull align 4 dereferenceable(4) %x)
-// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel2Ri(i32 addrspace(1)*{{.*}} nonnull align 4 dereferenceable(4) %x.coerce)
-// CHECK: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
-// CHECK-NOT: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
-// OPT: [[VAL:%.*]] = load i32, i32 addrspace(1)* %x.coerce, align 4
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel2Ri(i32*{{.*}} nonnull align 4 dereferenceable(4) %x)
+// OPT: [[VAL:%.*]] = load i32, i32* %x, align 4
 // OPT: [[INC:%.*]] = add nsw i32 [[VAL]], 1
-// OPT: store i32 [[INC]], i32 addrspace(1)* %x.coerce, align 4
+// OPT: store i32 [[INC]], i32* %x, align 4
 // OPT: ret void
 __global__ void kernel2(int &x) {
   x++;
@@ -61,15 +55,17 @@
 // `by-val` struct will be coerced into a similar struct with all generic
 // pointers lowerd into global ones.
 // HOST: define void @_Z22__device_stub__kernel41S(i32* %s.coerce0, float* %s.coerce1)
-// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel41S(%struct.S.coerce %s.coerce)
-// OPT: [[P0:%.*]] = extractvalue %struct.S.coerce %s.coerce, 0
-// OPT: [[P1:%.*]] = extractvalue %struct.S.coerce %s.coerce, 1
-// OPT: [[V0:%.*]] = load i32, i32 addrspace(1)* [[P0]], align 4
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel41S(%struct.S addrspace(4)*{{.*}} byref(%struct.S) align 8 %0)
+// OPT: [[R0:%.*]] = getelementptr inbounds %struct.S, %struct.S addrspace(4)* %0, i64 0, i32 0
+// OPT: [[P0:%.*]] = load i32*, i32* addrspace(4)* [[R0]], align 8
+// OPT: [[R1:%.*]] = getelementptr inbounds %struct.S, %struct.S addrspace(4)* %0, i64 0, i32 1
+// OPT: [[P1:%.*]] = load float*, float* addrspace(4)* [[R1]], align 8
+// OPT: [[V0:%.*]] = load i32, i32* [[P0]], align 4
 // OPT: [[INC:%.*]] = add nsw i32 [[V0]], 1
-// OPT: store i32 [[INC]], i32 addrspace(1)* [[P0]], align 4
-// OPT: [[V1:%.*]] = load float, float addrspace(1)* [[P1]], align 4
+// OPT: store i32 [[INC]], i32* [[P0]], align 4
+// OPT: [[V1:%.*]] = load float, float* [[P1]], align 4
 // OPT: [[ADD:%.*]] = fadd contract float [[V1]], 1.000000e+00
-// OPT: store float [[ADD]], float addrspace(1)* [[P1]], align 4
+// OPT: store float [[ADD]], float* [[P1]], align 4
 // OPT: ret void
 __global__ void kernel4(struct S s) {
   s.x[0]++;
@@ -78,7 +74,7 @@
 
 // If a pointer to struct is passed, only the pointer itself is coerced into the global one.
 // HOST: define void @_Z22__device_stub__kernel5P1S(%struct.S* %s)
-// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel5P1S(%struct.S addrspace(1)*{{.*}} %s.coerce)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel5P1S(%struct.S*{{.*}} %s)
 __global__ void kernel5(struct S *s) {
   s->x[0]++;
   s->y[0] += 1.f;
@@ -89,16 +85,17 @@
 };
 // `by-val` array is also coerced.
 // HOST: define void @_Z22__device_stub__kernel61T(float* %t.coerce0, float* %t.coerce1)
-// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel61T(%struct.T.coerce %t.coerce)
-// OPT: [[ARR:%.*]] = extractvalue %struct.T.coerce %t.coerce, 0
-// OPT: [[P0:%.*]] = extractvalue [2 x float addrspace(1)*] [[ARR]], 0
-// OPT: [[P1:%.*]] = extractvalue [2 x float addrspace(1)*] [[ARR]], 1
-// OPT: [[V0:%.*]] = load float, float addrspace(1)* [[P0]], align 4
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel61T(%struct.T addrspace(4)*{{.*}} byref(%struct.T) align 8 %0)
+// OPT: [[R0:%.*]] = getelementptr inbounds %struct.T, %struct.T addrspace(4)* %0, i64 0, i32 0, i64 0
+// OPT: [[P0:%.*]] = load float*, float* addrspace(4)* [[R0]], align 8
+// OPT: [[R1:%.*]] = getelementptr inbounds %struct.T, %struct.T addrspace(4)* %0, i64 0, i32 0, i64 1
+// OPT: [[P1:%.*]] = load float*, float* addrspace(4)* [[R1]], align 8
+// OPT: [[V0:%.*]] = load float, float* [[P0]], align 4
 // OPT: [[ADD0:%.*]] = fadd contract float [[V0]], 1.000000e+00
-// OPT: store float [[ADD0]], float addrspace(1)* [[P0]], align 4
-// OPT: [[V1:%.*]] = load float, float addrspace(1)* [[P1]], align 4
+// OPT: store float [[ADD0]], float* [[P0]], align 4
+// OPT: [[V1:%.*]] = load float, float* [[P1]], align 4
 // OPT: [[ADD1:%.*]] = fadd contract float [[V1]], 2.000000e+00
-// OPT: store float [[ADD1]], float addrspace(1)* [[P1]], align 4
+// OPT: store float [[ADD1]], float* [[P1]], align 4
 // OPT: ret void
 __global__ void kernel6(struct T t) {
   t.x[0][0] += 1.f;
@@ -107,7 +104,7 @@
 
 // Check that coerced pointers retain the noalias attribute when qualified with __restrict.
 // HOST: define void @_Z22__device_stub__kernel7Pi(i32* noalias %x)
-// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel7Pi(i32 addrspace(1)* noalias{{.*}} %x.coerce)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel7Pi(i32* noalias{{.*}} %x)
 __global__ void kernel7(int *__restrict x) {
   x[0]++;
 }
diff --git a/clang/test/CodeGenCUDA/kernel-args.cu b/clang/test/CodeGenCUDA/kernel-args.cu
--- a/clang/test/CodeGenCUDA/kernel-args.cu
+++ b/clang/test/CodeGenCUDA/kernel-args.cu
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device \
+// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -fcuda-is-device \
 // RUN:     -emit-llvm %s -o - | FileCheck -check-prefix=AMDGCN %s
 // RUN: %clang_cc1 -triple nvptx64-nvidia-cuda- -fcuda-is-device \
 // RUN:     -emit-llvm %s -o - | FileCheck -check-prefix=NVPTX %s
@@ -6,17 +6,18 @@
 
 struct A {
   int a[32];
+  float *p;
 };
 
-// AMDGCN: define amdgpu_kernel void @_Z6kernel1A(%struct.A addrspace(4)* byref(%struct.A) align 4 %{{.+}})
-// NVPTX: define void @_Z6kernel1A(%struct.A* byval(%struct.A) align 4 %x)
+// AMDGCN: define amdgpu_kernel void @_Z6kernel1A(%struct.A addrspace(4)* byref(%struct.A) align 8 %{{.+}})
+// NVPTX: define void @_Z6kernel1A(%struct.A* byval(%struct.A) align 8 %x)
 __global__ void kernel(A x) {
 }
 
 class Kernel {
 public:
-  // AMDGCN: define amdgpu_kernel void @_ZN6Kernel12memberKernelE1A(%struct.A addrspace(4)* byref(%struct.A) align 4 %{{.+}})
-  // NVPTX: define void @_ZN6Kernel12memberKernelE1A(%struct.A* byval(%struct.A) align 4 %x)
+  // AMDGCN: define amdgpu_kernel void @_ZN6Kernel12memberKernelE1A(%struct.A addrspace(4)* byref(%struct.A) align 8 %{{.+}})
+  // NVPTX: define void @_ZN6Kernel12memberKernelE1A(%struct.A* byval(%struct.A) align 8 %x)
   static __global__ void memberKernel(A x){}
   template<class T> static __global__ void templateMemberKernel(T x) {}
 };
@@ -29,11 +30,11 @@
 
 void test() {
   Kernel K;
-  // AMDGCN: define amdgpu_kernel void @_Z14templateKernelI1AEvT_(%struct.A addrspace(4)* byref(%struct.A) align 4 %{{.+}}
-  // NVPTX: define void @_Z14templateKernelI1AEvT_(%struct.A* byval(%struct.A) align 4 %x)
+  // AMDGCN: define amdgpu_kernel void @_Z14templateKernelI1AEvT_(%struct.A addrspace(4)* byref(%struct.A) align 8 %{{.+}}
+  // NVPTX: define void @_Z14templateKernelI1AEvT_(%struct.A* byval(%struct.A) align 8 %x)
   launch((void*)templateKernel<A>);
 
-  // AMDGCN: define amdgpu_kernel void @_ZN6Kernel20templateMemberKernelI1AEEvT_(%struct.A addrspace(4)* byref(%struct.A) align 4 %{{.+}}
-  // NVPTX: define void @_ZN6Kernel20templateMemberKernelI1AEEvT_(%struct.A* byval(%struct.A) align 4 %x)
+  // AMDGCN: define amdgpu_kernel void @_ZN6Kernel20templateMemberKernelI1AEEvT_(%struct.A addrspace(4)* byref(%struct.A) align 8 %{{.+}}
+  // NVPTX: define void @_ZN6Kernel20templateMemberKernelI1AEEvT_(%struct.A* byval(%struct.A) align 8 %x)
   launch((void*)Kernel::templateMemberKernel<A>);
 }