Index: lib/CodeGen/CGCUDANV.cpp =================================================================== --- lib/CodeGen/CGCUDANV.cpp +++ lib/CodeGen/CGCUDANV.cpp @@ -199,13 +199,21 @@ llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); CharUnits Offset = CharUnits::Zero(); for (const VarDecl *A : Args) { + auto *Arg = CGF.GetAddrOfLocalVar(A).getPointer(); CharUnits TyWidth, TyAlign; - std::tie(TyWidth, TyAlign) = - CGM.getContext().getTypeInfoInChars(A->getType()); + auto *Aux = CGM.getContext().getAuxTargetInfo(); + if (Aux && Aux->getTriple().getArch() == llvm::Triple::amdgcn) { + auto *ArgTy = Arg->getType()->getPointerElementType(); + auto &DL = CGM.getDataLayout(); + TyWidth = CharUnits::fromQuantity(DL.getTypeStoreSize(ArgTy)); + TyAlign = CharUnits::fromQuantity(DL.getABITypeAlignment(ArgTy)); + } else { + std::tie(TyWidth, TyAlign) = + CGM.getContext().getTypeInfoInChars(A->getType()); + } Offset = Offset.alignTo(TyAlign); llvm::Value *Args[] = { - CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(), - VoidPtrTy), + CGF.Builder.CreatePointerCast(Arg, VoidPtrTy), llvm::ConstantInt::get(SizeTy, TyWidth.getQuantity()), llvm::ConstantInt::get(SizeTy, Offset.getQuantity()), }; Index: test/CodeGenCUDA/kernel-args-alignment.cu =================================================================== --- test/CodeGenCUDA/kernel-args-alignment.cu +++ test/CodeGenCUDA/kernel-args-alignment.cu @@ -1,8 +1,15 @@ // RUN: %clang_cc1 --std=c++11 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | \ -// RUN: FileCheck -check-prefix HOST -check-prefix CHECK %s +// RUN: FileCheck -check-prefixes=HOST,HOST-NV,CHECK %s // RUN: %clang_cc1 --std=c++11 -fcuda-is-device -triple nvptx64-nvidia-cuda \ -// RUN: -emit-llvm -o - %s | FileCheck -check-prefix DEVICE -check-prefix CHECK %s +// RUN: -emit-llvm -o - %s | FileCheck -check-prefixes=DEVICE,DEVICE-NV,CHECK %s + +// RUN: %clang_cc1 --std=c++11 -triple x86_64-unknown-linux-gnu -x hip \ +// RUN: -aux-triple amdgcn-amd-amdhsa -emit-llvm -o - %s | \ +// RUN: FileCheck -check-prefixes=HOST,HOST-AMD,CHECK %s + +// RUN: %clang_cc1 --std=c++11 -fcuda-is-device -triple amdgcn-amd-amdhsa \ +// RUN: -x hip -emit-llvm -o - %s | FileCheck -check-prefixes=DEVICE,DEVICE-AMD,CHECK %s #include "Inputs/cuda.h" @@ -23,14 +30,25 @@ static_assert(alignof(S) == 8, "Unexpected alignment."); // HOST-LABEL: @_Z6kernelc1SPi -// Marshalled kernel args should be: +// For NVPTX backend, marshalled kernel args should be: // 1. offset 0, width 1 // 2. offset 8 (because alignof(S) == 8), width 16 // 3. offset 24, width 8 -// HOST: call i32 @cudaSetupArgument({{[^,]*}}, i64 1, i64 0) -// HOST: call i32 @cudaSetupArgument({{[^,]*}}, i64 16, i64 8) -// HOST: call i32 @cudaSetupArgument({{[^,]*}}, i64 8, i64 24) +// HOST-NV: call i32 @cudaSetupArgument({{[^,]*}}, i64 1, i64 0) +// HOST-NV: call i32 @cudaSetupArgument({{[^,]*}}, i64 16, i64 8) +// HOST-NV: call i32 @cudaSetupArgument({{[^,]*}}, i64 8, i64 24) +// AMDGPU backend assumes struct type kernel arguments are passed directly, +// not byval. It lays out kernel arguments by size and alignment in IR. +// Packed struct type in IR always has ABI alignment of 1. +// For AMDGPU backend, marshalled kernel args should be: +// 1. offset 0, width 1 +// 2. offset 1 (because ABI alignment of S is 1), width 16 +// 3. offset 24, width 8 +// HOST-AMD: call i32 @hipSetupArgument({{[^,]*}}, i64 1, i64 0) +// HOST-AMD: call i32 @hipSetupArgument({{[^,]*}}, i64 16, i64 1) +// HOST-AMD: call i32 @hipSetupArgument({{[^,]*}}, i64 8, i64 24) // DEVICE-LABEL: @_Z6kernelc1SPi -// DEVICE-SAME: i8{{[^,]*}}, %struct.S* byval align 8{{[^,]*}}, i32* +// DEVICE-NV-SAME: i8{{[^,]*}}, %struct.S* byval align 8{{[^,]*}}, i32* +// DEVICE-AMD-SAME: i8{{[^,]*}}, %struct.S{{[^,*]*}}, i32* __global__ void kernel(char a, S s, int *b) {}