diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -1272,19 +1272,87 @@
 // store the elements rather than the aggregate to be more friendly to
 // fast-isel.
 // FIXME: Do we need to recurse here?
-static void BuildAggStore(CodeGenFunction &CGF, llvm::Value *Val,
-                          Address Dest, bool DestIsVolatile) {
+static void BuildAggStore(CodeGenFunction &CGF, llvm::Value *Val, Address Dest,
+                          bool DestIsVolatile, llvm::Type *DstTy = nullptr) {
+  auto &DL = CGF.CGM.getDataLayout();
+  llvm::Type *SrcTy = Val->getType();
   // Prefer scalar stores to first-class aggregate stores.
-  if (llvm::StructType *STy =
-        dyn_cast<llvm::StructType>(Val->getType())) {
-    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+  if (llvm::StructType *SrcSTy = dyn_cast<llvm::StructType>(SrcTy)) {
+    llvm::StructType *DstSTy = dyn_cast_or_null<llvm::StructType>(DstTy);
+    const llvm::StructLayout *SrcSL = nullptr;
+    const llvm::StructLayout *DstSL = nullptr;
+    if (DstSTy && SrcSTy->getNumElements() == DstSTy->getNumElements()) {
+      // Retrieve StructLayout objects if both src and dst are struct types.
+      SrcSL = DL.getStructLayout(SrcSTy);
+      DstSL = DL.getStructLayout(DstSTy);
+    }
+    for (unsigned i = 0, e = SrcSTy->getNumElements(); i != e; ++i) {
       Address EltPtr = CGF.Builder.CreateStructGEP(Dest, i);
       llvm::Value *Elt = CGF.Builder.CreateExtractValue(Val, i);
+      // Check whether the element starts at the same offset.
+      if (SrcSL && DstSL &&
+          SrcSL->getElementOffset(i) == DstSL->getElementOffset(i)) {
+        llvm::Type *SrcEltTy = SrcSTy->getElementType(i);
+        llvm::Type *DstEltTy = DstSTy->getElementType(i);
+        assert(Elt->getType() == SrcEltTy);
+        // Check whether the store size is the same as well.
+        if (DL.getTypeStoreSize(SrcEltTy) == DL.getTypeStoreSize(DstEltTy)) {
+          llvm::PointerType *SrcPtrTy = dyn_cast<llvm::PointerType>(SrcEltTy);
+          llvm::PointerType *DstPtrTy = dyn_cast<llvm::PointerType>(DstEltTy);
+          // Apply `addrspacecast` when necessary.
+          if (SrcPtrTy && DstPtrTy &&
+              SrcPtrTy->getAddressSpace() != DstPtrTy->getAddressSpace()) {
+            Elt =
+                CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(Elt, DstEltTy);
+          }
+          EltPtr = CGF.Builder.CreateElementBitCast(EltPtr, DstEltTy);
+          BuildAggStore(CGF, Elt, EltPtr, DestIsVolatile, DstEltTy);
+          continue;
+        }
+      }
+      // If there is any mismatch, i.e. different offsets or different store
+      // sizes, clear the StructLayout objects to skip further checking.
+      SrcSL = DstSL = nullptr;
       CGF.Builder.CreateStore(Elt, EltPtr, DestIsVolatile);
     }
-  } else {
-    CGF.Builder.CreateStore(Val, Dest, DestIsVolatile);
+    return;
+  }
+  // For array types, prefer scalar stores as well if they have matching
+  // layouts and a reasonable number of elements.
+  if (llvm::ArrayType *SrcATy = dyn_cast<llvm::ArrayType>(SrcTy)) {
+    llvm::ArrayType *DstATy = dyn_cast_or_null<llvm::ArrayType>(DstTy);
+    if (DstATy && DstATy->getNumElements() <= 16 &&
+        SrcATy->getNumElements() == DstATy->getNumElements() &&
+        CGF.CGM.getDataLayout().getTypeAllocSize(SrcATy->getElementType()) ==
+            CGF.CGM.getDataLayout().getTypeAllocSize(
+                DstATy->getElementType())) {
+      llvm::Type *SrcEltTy = SrcATy->getElementType();
+      llvm::Type *DstEltTy = DstATy->getElementType();
+      llvm::PointerType *DstPtrTy = nullptr;
+      if (isa<llvm::PointerType>(SrcEltTy) &&
+          isa<llvm::PointerType>(DstEltTy) &&
+          cast<llvm::PointerType>(SrcEltTy)->getAddressSpace() !=
+              cast<llvm::PointerType>(DstEltTy)->getAddressSpace()) {
+        // The layouts match but the element address spaces differ, so an
+        // `addrspacecast` is required for each element.
+        DstPtrTy = cast<llvm::PointerType>(DstEltTy);
+      }
+      for (uint64_t i = 0, e = SrcATy->getNumElements(); i < e; ++i) {
+        Address EltPtr = CGF.Builder.CreateConstArrayGEP(Dest, i);
+        llvm::Value *Elt = CGF.Builder.CreateExtractValue(Val, i);
+        if (DstPtrTy) {
+          // Insert `addrspacecast` if necessary.
+          Elt = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(Elt, DstPtrTy);
+        }
+        EltPtr = CGF.Builder.CreateElementBitCast(EltPtr, DstEltTy);
+        BuildAggStore(CGF, Elt, EltPtr, DestIsVolatile, DstEltTy);
+      }
+      return;
+    }
+    // Fall back to an aggregate store if scalarizing is not safe due to a
+    // layout mismatch.
   }
+  CGF.Builder.CreateStore(Val, Dest, DestIsVolatile);
 }
 
 /// CreateCoercedStore - Create a store to \arg DstPtr from \arg Src,
@@ -1298,6 +1366,7 @@
                                bool DstIsVolatile,
                                CodeGenFunction &CGF) {
   llvm::Type *SrcTy = Src->getType();
+  llvm::Type *OrigDstTy = Dst.getElementType();
   llvm::Type *DstTy = Dst.getElementType();
   if (SrcTy == DstTy) {
     CGF.Builder.CreateStore(Src, Dst, DstIsVolatile);
@@ -1334,7 +1403,7 @@
   // If store is legal, just bitcast the src pointer.
   if (SrcSize <= DstSize) {
     Dst = CGF.Builder.CreateElementBitCast(Dst, SrcTy);
-    BuildAggStore(CGF, Src, Dst, DstIsVolatile);
+    BuildAggStore(CGF, Src, Dst, DstIsVolatile, OrigDstTy);
   } else {
     // Otherwise do coercion through memory. This is stupid, but
     // simple.
diff --git a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
--- a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
@@ -1,37 +1,52 @@
-// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -x hip %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -x hip %s -o - | FileCheck --check-prefixes=COMMON,CHECK %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -x hip %s -disable-O0-optnone -o - | opt -S -O2 | FileCheck %s --check-prefixes=COMMON,OPT
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -x hip %s -o - | FileCheck -check-prefix=HOST %s
 
 #include "Inputs/cuda.h"
 
 // Coerced struct from `struct S` with all generic pointers lowered into
 // global ones.
-// CHECK: %struct.S.coerce = type { i32 addrspace(1)*, float addrspace(1)* }
-// CHECK: %struct.T.coerce = type { [2 x float addrspace(1)*] }
+// COMMON: %struct.S.coerce = type { i32 addrspace(1)*, float addrspace(1)* }
+// COMMON: %struct.T.coerce = type { [2 x float addrspace(1)*] }
 
 // On the host-side compilation, generic pointers won't be coerced.
 // HOST-NOT: %struct.S.coerce
 // HOST-NOT: %struct.T.coerce
 
-// CHECK: define amdgpu_kernel void @_Z7kernel1Pi(i32 addrspace(1)* %x.coerce)
 // HOST: define void @_Z22__device_stub__kernel1Pi(i32* %x)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel1Pi(i32 addrspace(1)*{{.*}} %x.coerce)
+// CHECK: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// CHECK-NOT: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// OPT-NOT: alloca
+// OPT-NOT: inttoptr
 __global__ void kernel1(int *x) {
   x[0]++;
 }
 
-// CHECK: define amdgpu_kernel void @_Z7kernel2Ri(i32 addrspace(1)* nonnull align 4 dereferenceable(4) %x.coerce)
 // HOST: define void @_Z22__device_stub__kernel2Ri(i32* nonnull align 4 dereferenceable(4) %x)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel2Ri(i32 addrspace(1)*{{.*}} nonnull align 4 dereferenceable(4) %x.coerce)
+// CHECK: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// CHECK-NOT: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// OPT-NOT: alloca
+// OPT-NOT: ptrtoint
+// OPT-NOT: inttoptr
 __global__ void kernel2(int &x) {
   x++;
 }
 
-// CHECK: define amdgpu_kernel void @_Z7kernel3PU3AS2iPU3AS1i(i32 addrspace(2)* %x, i32 addrspace(1)* %y)
 // HOST: define void @_Z22__device_stub__kernel3PU3AS2iPU3AS1i(i32 addrspace(2)* %x, i32 addrspace(1)* %y)
+// CHECK-LABEL: define amdgpu_kernel void @_Z7kernel3PU3AS2iPU3AS1i(i32 addrspace(2)*{{.*}} %x, i32 addrspace(1)*{{.*}} %y)
+// CHECK-NOT: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
 __global__ void kernel3(__attribute__((address_space(2))) int *x,
                         __attribute__((address_space(1))) int *y) {
   y[0] = x[0];
 }
 
-// CHECK: define void @_Z4funcPi(i32* %x)
+// COMMON-LABEL: define void @_Z4funcPi(i32*{{.*}} %x)
+// CHECK-NOT: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// OPT-NOT: alloca
+// OPT-NOT: ptrtoint
+// OPT-NOT: inttoptr
 __device__ void func(int *x) {
   x[0]++;
 }
@@ -42,16 +57,26 @@
 };
 // `by-val` struct will be coerced into a similar struct with all generic
 // pointers lowered into global ones.
-// CHECK: define amdgpu_kernel void @_Z7kernel41S(%struct.S.coerce %s.coerce)
 // HOST: define void @_Z22__device_stub__kernel41S(i32* %s.coerce0, float* %s.coerce1)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel41S(%struct.S.coerce %s.coerce)
+// CHECK-COUNT-2: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// CHECK-NOT: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// OPT-NOT: alloca
+// OPT-NOT: ptrtoint
+// OPT-NOT: inttoptr
 __global__ void kernel4(struct S s) {
   s.x[0]++;
   s.y[0] += 1.f;
 }
 
 // If a pointer to struct is passed, only the pointer itself is coerced into the global one.
-// CHECK: define amdgpu_kernel void @_Z7kernel5P1S(%struct.S addrspace(1)* %s.coerce)
 // HOST: define void @_Z22__device_stub__kernel5P1S(%struct.S* %s)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel5P1S(%struct.S addrspace(1)*{{.*}} %s.coerce)
+// CHECK: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// CHECK-NOT: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// OPT-NOT: alloca
+// OPT-NOT: ptrtoint
+// OPT-NOT: inttoptr
 __global__ void kernel5(struct S *s) {
   s->x[0]++;
   s->y[0] += 1.f;
 }
@@ -61,16 +86,26 @@
   float *x[2];
 };
 // `by-val` array is also coerced.
-// CHECK: define amdgpu_kernel void @_Z7kernel61T(%struct.T.coerce %t.coerce)
 // HOST: define void @_Z22__device_stub__kernel61T(float* %t.coerce0, float* %t.coerce1)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel61T(%struct.T.coerce %t.coerce)
+// CHECK-COUNT-2: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// CHECK-NOT: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// OPT-NOT: alloca
+// OPT-NOT: ptrtoint
+// OPT-NOT: inttoptr
 __global__ void kernel6(struct T t) {
   t.x[0][0] += 1.f;
   t.x[1][0] += 2.f;
 }
 
 // Check that coerced pointers retain the noalias attribute when qualified with __restrict.
-// CHECK: define amdgpu_kernel void @_Z7kernel7Pi(i32 addrspace(1)* noalias %x.coerce)
 // HOST: define void @_Z22__device_stub__kernel7Pi(i32* noalias %x)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel7Pi(i32 addrspace(1)* noalias{{.*}} %x.coerce)
+// CHECK: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// CHECK-NOT: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// OPT-NOT: alloca
+// OPT-NOT: ptrtoint
+// OPT-NOT: inttoptr
 __global__ void kernel7(int *__restrict x) {
   x[0]++;
 }
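
For readers who want to see the pattern this change targets outside of the lit test, below is a minimal HIP sketch. It is not part of the patch; the file name, kernel, and compile command are illustrative only and simply mirror kernel4 above. With this change, the coerced %struct.S.coerce argument is expected to be stored back field by field, with an `addrspacecast` from addrspace(1) to the generic address space for each pointer member, instead of a first-class aggregate store of a layout-mismatched value; the new RUN line that pipes the -disable-O0-optnone output through opt -O2 then checks that no alloca, ptrtoint, or inttoptr survives.

// coerce-example.hip -- illustrative only; mirrors kernel4 from the test above.
// Possible device-side compile:
//   clang -x hip --offload-arch=gfx906 -S -emit-llvm coerce-example.hip
#include <hip/hip_runtime.h>

struct S {
  int *x;   // lowered to i32 addrspace(1)* in %struct.S.coerce
  float *y; // lowered to float addrspace(1)* in %struct.S.coerce
};

// The by-value parameter is coerced to %struct.S.coerce; BuildAggStore now
// stores each member separately, inserting `addrspacecast` from addrspace(1)
// back to the generic address space, rather than emitting one aggregate store
// through a temporary.
__global__ void scale(S s, int n) {
  for (int i = 0; i < n; ++i)
    s.y[i] = static_cast<float>(s.x[i]);
}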