Index: lib/CodeGen/CGCUDABuiltin.cpp =================================================================== --- lib/CodeGen/CGCUDABuiltin.cpp +++ lib/CodeGen/CGCUDABuiltin.cpp @@ -99,6 +99,12 @@ llvm::SmallVector ArgTypes; for (unsigned I = 1, NumArgs = Args.size(); I < NumArgs; ++I) ArgTypes.push_back(Args[I].RV.getScalarVal()->getType()); + + // Using llvm::StructType is correct only because printf doesn't accept + // aggregates. If we had to handle aggregates here, we'd have to manually + // compute the offsets within the alloca -- we wouldn't be able to assume + // that the alignment of the llvm type was the same as the alignment of the + // clang type. llvm::Type *AllocaTy = llvm::StructType::create(ArgTypes, "printf_args"); llvm::Value *Alloca = CreateTempAlloca(AllocaTy); Index: lib/CodeGen/CGCUDANV.cpp =================================================================== --- lib/CodeGen/CGCUDANV.cpp +++ lib/CodeGen/CGCUDANV.cpp @@ -118,37 +118,28 @@ void CGNVCUDARuntime::emitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args) { - // Build the argument value list and the argument stack struct type. - SmallVector ArgValues; - std::vector ArgTypes; - for (FunctionArgList::const_iterator I = Args.begin(), E = Args.end(); - I != E; ++I) { - llvm::Value *V = CGF.GetAddrOfLocalVar(*I).getPointer(); - ArgValues.push_back(V); - assert(isa(V->getType()) && "Arg type not PointerType"); - ArgTypes.push_back(cast(V->getType())->getElementType()); - } - llvm::StructType *ArgStackTy = llvm::StructType::get(Context, ArgTypes); - - llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); - - // Emit the calls to cudaSetupArgument + // Emit a call to cudaSetupArgument for each arg in Args. llvm::Constant *cudaSetupArgFn = getSetupArgumentFn(); - for (unsigned I = 0, E = Args.size(); I != E; ++I) { - llvm::Value *Args[3]; - llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next"); - Args[0] = CGF.Builder.CreatePointerCast(ArgValues[I], VoidPtrTy); - Args[1] = CGF.Builder.CreateIntCast( - llvm::ConstantExpr::getSizeOf(ArgTypes[I]), - SizeTy, false); - Args[2] = CGF.Builder.CreateIntCast( - llvm::ConstantExpr::getOffsetOf(ArgStackTy, I), - SizeTy, false); + llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); + CharUnits Offset = CharUnits::Zero(); + for (const VarDecl *A : Args) { + CharUnits TyWidth, TyAlign; + std::tie(TyWidth, TyAlign) = + CGM.getContext().getTypeInfoInChars(A->getType()); + Offset = Offset.alignTo(TyAlign); + llvm::Value *Args[] = { + CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(), + VoidPtrTy), + llvm::ConstantInt::get(SizeTy, TyWidth.getQuantity()), + llvm::ConstantInt::get(SizeTy, Offset.getQuantity()), + }; llvm::CallSite CS = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args); llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0); llvm::Value *CSZero = CGF.Builder.CreateICmpEQ(CS.getInstruction(), Zero); + llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next"); CGF.Builder.CreateCondBr(CSZero, NextBlock, EndBlock); CGF.EmitBlock(NextBlock); + Offset += TyWidth; } // Emit the call to cudaLaunch Index: test/CodeGenCUDA/kernel-args-alignment.cu =================================================================== --- /dev/null +++ test/CodeGenCUDA/kernel-args-alignment.cu @@ -0,0 +1,39 @@ +// REQUIRES: x86-registered-target +// REQUIRES: nvptx-registered-target + +// RUN: %clang_cc1 --std=c++11 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | \ +// RUN: FileCheck -check-prefix HOST -check-prefix CHECK %s + +// RUN: %clang_cc1 --std=c++11 -fcuda-is-device -triple nvptx64-nvidia-cuda \ +// RUN: -emit-llvm -o - %s | FileCheck -check-prefix DEVICE -check-prefix CHECK %s + +#include "Inputs/cuda.h" + +struct U { + short x; +} __attribute__((packed)); + +struct S { + int *ptr; + char a; + U u; +}; + +// Clang should generate a packed LLVM struct for S (denoted by the <>s), +// otherwise this test isn't interesting. +// CHECK: %struct.S = type <{ i32*, i8, %struct.U, [5 x i8] }> + +static_assert(alignof(S) == 8, "Unexpected alignment."); + +// HOST-LABEL: @_Z6kernelc1SPi +// Marshalled kernel args should be: +// 1. offset 0, width 1 +// 2. offset 8 (because alignof(S) == 8), width 16 +// 3. offset 24, width 8 +// HOST: call i32 @cudaSetupArgument({{[^,]*}}, i64 1, i64 0) +// HOST: call i32 @cudaSetupArgument({{[^,]*}}, i64 16, i64 8) +// HOST: call i32 @cudaSetupArgument({{[^,]*}}, i64 8, i64 24) + +// DEVICE-LABEL: @_Z6kernelc1SPi +// DEVICE-SAME: i8{{[^,]*}}, %struct.S* byval align 8{{[^,]*}}, i32* +__global__ void kernel(char a, S s, int *b) {}