Index: clang/lib/CodeGen/CGCUDANV.cpp
===================================================================
--- clang/lib/CodeGen/CGCUDANV.cpp
+++ clang/lib/CodeGen/CGCUDANV.cpp
@@ -116,7 +116,9 @@
   }
 
   void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
-  void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
+  void emitDeviceStubBodyNew(CodeGenFunction &CGF, llvm::Value *Kernel,
+                             const Address &KernelArgs,
+                             Expr const *const *ConfigArgs = nullptr);
   std::string getDeviceSideName(const NamedDecl *ND) override;
 
 public:
@@ -149,6 +151,13 @@
   llvm::Function *makeModuleCtorFunction() override;
   /// Creates module destructor function
   llvm::Function *makeModuleDtorFunction() override;
+
+  RValue EmitCUDAKernelCallExpr(CodeGenFunction &CGF,
+                                const CUDAKernelCallExpr *E,
+                                ReturnValueSlot ReturnValue) override;
+
+  Address createTempVarForKernelArgs(CodeGenFunction &CGF,
+                                     FunctionArgList &Args);
 };
 
 }
@@ -241,19 +250,17 @@
                                      FunctionArgList &Args) {
   EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
   if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
-                         CudaFeature::CUDA_USES_NEW_LAUNCH) ||
-      (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
-    emitDeviceStubBodyNew(CGF, Args);
-  else
+                         CudaFeature::CUDA_USES_NEW_LAUNCH))
+    emitDeviceStubBodyNew(CGF, CGF.CurFn,
+                          createTempVarForKernelArgs(CGF, Args));
+  else if (!CGF.getLangOpts().HIP || !CGF.getLangOpts().HIPUseNewLaunchAPI)
     emitDeviceStubBodyLegacy(CGF, Args);
 }
 
-// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
-// array and kernels are launched using cudaLaunchKernel().
-void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
-                                            FunctionArgList &Args) {
-  // Build the shadow stack entry at the very start of the function.
-
+// Create a temporary array to hold all kernel arguments for the kernel stub.
+// \p Args is the kernel argument list of the kernel stub.
+Address CGNVCUDARuntime::createTempVarForKernelArgs(CodeGenFunction &CGF,
+                                                    FunctionArgList &Args) {
   // Calculate amount of space we will need for all arguments. If we have no
   // args, allocate a single pointer so we still have a valid pointer to the
   // argument array that we can pass to runtime, even if it will be unused.
@@ -267,8 +274,19 @@
     CGF.Builder.CreateDefaultAlignedStore(
         VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i));
   }
+  return KernelArgs;
+}
 
-  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
+// CUDA 9.0+ uses a new way to launch kernels. Parameters are packed in a local
+// array and kernels are launched using cudaLaunchKernel().
+void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
+                                            llvm::Value *Kernel,
+                                            const Address &KernelArgs,
+                                            Expr const *const *ConfigArgs) {
+  // Build the shadow stack entry at the very start of the function.
+  llvm::BasicBlock *EndBlock = nullptr;
+  if (!CGF.getLangOpts().HIP)
+    EndBlock = CGF.createBasicBlock("setup.end");
 
   // Lookup cudaLaunchKernel/hipLaunchKernel function.
   // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
@@ -296,39 +314,49 @@
   // Create temporary dim3 grid_dim, block_dim.
   ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
   QualType Dim3Ty = GridDimParam->getType();
-  Address GridDim =
-      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
-  Address BlockDim =
-      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
-  Address ShmemSize =
-      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
-  Address Stream =
-      CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream");
-  llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
-      llvm::FunctionType::get(IntTy,
-                              {/*gridDim=*/GridDim.getType(),
-                               /*blockDim=*/BlockDim.getType(),
-                               /*ShmemSize=*/ShmemSize.getType(),
-                               /*Stream=*/Stream.getType()},
-                              /*isVarArg=*/false),
-      addUnderscoredPrefixToName("PopCallConfiguration"));
-
-  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
-                              {GridDim.getPointer(), BlockDim.getPointer(),
-                               ShmemSize.getPointer(), Stream.getPointer()});
+  RValue ConfigArgRVals[4];
+  if (!CGF.getLangOpts().HIP) {
+    Address GridDim =
+        CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
+    Address BlockDim =
+        CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
+    Address ShmemSize =
+        CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
+    Address Stream =
+        CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream");
+    llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
+        llvm::FunctionType::get(IntTy,
+                                {/*gridDim=*/GridDim.getType(),
+                                 /*blockDim=*/BlockDim.getType(),
+                                 /*ShmemSize=*/ShmemSize.getType(),
+                                 /*Stream=*/Stream.getType()},
+                                /*isVarArg=*/false),
+        addUnderscoredPrefixToName("PopCallConfiguration"));
+    CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
+                                {GridDim.getPointer(), BlockDim.getPointer(),
+                                 ShmemSize.getPointer(), Stream.getPointer()});
+    ConfigArgRVals[0] = RValue::getAggregate(GridDim);
+    ConfigArgRVals[1] = RValue::getAggregate(BlockDim);
+    ConfigArgRVals[2] = RValue::get(CGF.Builder.CreateLoad(ShmemSize));
+    ConfigArgRVals[3] = RValue::get(CGF.Builder.CreateLoad(Stream));
+  } else {
+    assert(ConfigArgs);
+    for (unsigned I = 0; I < 4; ++I)
+      ConfigArgRVals[I] = CGF.EmitAnyExprToTemp(ConfigArgs[I]);
+  }
 
   // Emit the call to cudaLaunch
-  llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy);
   CallArgList LaunchKernelArgs;
-  LaunchKernelArgs.add(RValue::get(Kernel),
-                       cudaLaunchKernelFD->getParamDecl(0)->getType());
-  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
-  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
+  LaunchKernelArgs.add(
+      RValue::get(CGF.Builder.CreatePointerCast(Kernel, VoidPtrTy)),
+      cudaLaunchKernelFD->getParamDecl(0)->getType());
+  LaunchKernelArgs.add(ConfigArgRVals[0], Dim3Ty);
+  LaunchKernelArgs.add(ConfigArgRVals[1], Dim3Ty);
   LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
                        cudaLaunchKernelFD->getParamDecl(3)->getType());
-  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
+  LaunchKernelArgs.add(ConfigArgRVals[2],
                        cudaLaunchKernelFD->getParamDecl(4)->getType());
-  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
+  LaunchKernelArgs.add(ConfigArgRVals[3],
                        cudaLaunchKernelFD->getParamDecl(5)->getType());
 
   QualType QT = cudaLaunchKernelFD->getType();
@@ -342,9 +370,12 @@
       CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
   CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
                LaunchKernelArgs);
-  CGF.EmitBranch(EndBlock);
-  CGF.EmitBlock(EndBlock);
+  if (!CGF.getLangOpts().HIP) {
+    assert(EndBlock);
+    CGF.EmitBranch(EndBlock);
+    CGF.EmitBlock(EndBlock);
+  }
 }
 
 void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
@@ -846,3 +877,50 @@
 CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
   return new CGNVCUDARuntime(CGM);
 }
+
+RValue CGNVCUDARuntime::EmitCUDAKernelCallExpr(CodeGenFunction &CGF,
+                                               const CUDAKernelCallExpr *E,
+                                               ReturnValueSlot ReturnValue) {
+  if (!CGF.CGM.getLangOpts().HIP || !CGF.getLangOpts().HIPUseNewLaunchAPI)
+    return CGCUDARuntime::EmitCUDAKernelCallExpr(CGF, E, ReturnValue);
+
+  CGCallee Callee = CGF.EmitCallee(E->getCallee());
+  auto FnType = E->getCallee()
+                    ->getType()
+                    ->getAs<PointerType>()
+                    ->getPointeeType()
+                    ->getAs<FunctionProtoType>();
+  CallArgList Args;
+  CGF.EmitCallArgs(Args, FnType, E->arguments());
+
+  Address KernelArgs = CGF.CreateTempAlloca(
+      VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
+      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
+  for (unsigned I = 0; I < Args.size(); ++I) {
+    auto RV = Args[I].getRValue(CGF);
+    llvm::Value *VoidArgPtr;
+    if (RV.isScalar()) {
+      llvm::Value *Arg = RV.getScalarVal();
+      auto Ty = Arg->getType();
+      Address ArgPtr = CGF.CreateTempAlloca(
+          Ty,
+          CharUnits::fromQuantity(
+              CGF.CGM.getDataLayout().getPrefTypeAlignment(Ty)),
+          "kernel_arg");
+      CGF.Builder.CreateDefaultAlignedStore(Arg, ArgPtr.getPointer());
+      VoidArgPtr =
+          CGF.Builder.CreatePointerCast(ArgPtr.getPointer(), VoidPtrTy);
+    } else {
+      Address ArgPtr = RV.getAggregateAddress();
+      VoidArgPtr =
+          CGF.Builder.CreatePointerCast(ArgPtr.getPointer(), VoidPtrTy);
+    }
+    CGF.Builder.CreateDefaultAlignedStore(
+        VoidArgPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), I));
+  }
+
+  emitDeviceStubBodyNew(CGF, Callee.getFunctionPointer(), KernelArgs,
+                        E->getConfig()->getArgs());
+
+  return RValue::get(nullptr);
+}
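For orientation, here is a rough source-level picture of what the new path emits for a HIP triple-chevron call under -fhip-new-launch-api: the launch configuration is evaluated at the call site and forwarded straight to hipLaunchKernel, with no __hipPushCallConfiguration/__hipPopCallConfiguration round trip. This is an illustrative sketch, not code from the patch; the helper name launch_g1_equivalent is invented, and only hipLaunchKernel's public signature is assumed.

#include <hip/hip_runtime.h>

__global__ void g1(int x);

// Hand-written equivalent of  g1<<<dim3(2), dim3(3), 0, nullptr>>>(42);
// under the scheme above: one void* slot per kernel argument, each slot
// holding the address of that argument's storage.
static hipError_t launch_g1_equivalent() {
  int x = 42;
  void *kernel_args[1] = {&x};
  return hipLaunchKernel(reinterpret_cast<const void *>(g1), dim3(2), dim3(3),
                         kernel_args, /*sharedMemBytes=*/0,
                         /*stream=*/nullptr);
}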
Index: clang/test/CodeGenCUDA/kernel-call.hip
===================================================================
--- /dev/null
+++ clang/test/CodeGenCUDA/kernel-call.hip
@@ -0,0 +1,69 @@
+// RUN: %clang_cc1 -fhip-new-launch-api -triple x86_64-unknown-linux-gnu \
+// RUN:   -std=c++11 -emit-llvm %s -o - | FileCheck %s
+
+#include "Inputs/cuda.h"
+
+struct A { int a[10]; };
+
+__global__ void g1(int x) {}
+__global__ void g2(A x) {}
+__global__ void g3(A &x) {}
+template <typename F> __global__ void g4(F f, int *x) { *x = f(); }
+void (*pg1)(int x) = g1;
+
+// CHECK-LABEL: define{{.*}}test1
+void test1() {
+  // CHECK: call void @_ZN4dim3C1Ejjj(%struct.dim3* {{.*}}, i32 2, i32 1, i32 1)
+  // CHECK: call void @_ZN4dim3C1Ejjj(%struct.dim3* {{.*}}, i32 3, i32 1, i32 1)
+  // CHECK: call i32 @hipLaunchKernel({{.*}}@_Z17__device_stub__g1i{{.*}}, i64 0, %struct.hipStream* null)
+  g1<<<2, 3>>>(0);
+
+  // CHECK: call void @_ZN4dim3C1Ejjj(%struct.dim3* {{.*}}, i32 4, i32 5, i32 6)
+  // CHECK: call void @_ZN4dim3C1Ejjj(%struct.dim3* {{.*}}, i32 7, i32 8, i32 9)
+  // CHECK: call i32 @hipLaunchKernel({{.*}}@_Z17__device_stub__g1i{{.*}}, i64 10, {{.*}}inttoptr (i64 11
+  g1<<<dim3(4, 5, 6), dim3(7, 8, 9), 10, (hipStream_t)11>>>(0);
+
+  // CHECK: %[[LD:.*]] = load void (i32)*, void (i32)** @pg1
+  // CHECK: %[[PTR:.*]] = bitcast void (i32)* %[[LD]] to i8*
+  // CHECK: call i32 @hipLaunchKernel({{.*}}%[[PTR]]{{.*}}, i64 0, %struct.hipStream* null)
+  pg1<<<1, 1>>>(0);
+}
+
+// CHECK-LABEL: define{{.*}}test2
+void test2() {
+  A a;
+  // CHECK: %agg.tmp = alloca %struct.A, align 4
+  // CHECK: %kernel_args = alloca i8*, i64 1, align 16
+  // CHECK: %[[CAST:.*]] = bitcast %struct.A* %agg.tmp to i8*
+  // CHECK: %[[GEP:.*]] = getelementptr i8*, i8** %kernel_args, i32 0
+  // CHECK: store i8* %[[CAST]], i8** %[[GEP]], align 8
+  // CHECK: call i32 @hipLaunchKernel({{.*}}@_Z17__device_stub__g21A{{.*}}, i64 0, %struct.hipStream* null)
+  g2<<<1, 1>>>(a);
+}
+
+// CHECK-LABEL: define{{.*}}test3
+void test3() {
+  A a;
+  // CHECK: %a = alloca %struct.A, align 4
+  // CHECK: %kernel_arg = alloca %struct.A*, align 8
+  // CHECK: %kernel_args = alloca i8*, i64 1, align 16
+  // CHECK: store %struct.A* %a, %struct.A** %kernel_arg, align 8
+  // CHECK: %[[CAST:.*]] = bitcast %struct.A** %kernel_arg to i8*
+  // CHECK: %[[GEP:.*]] = getelementptr i8*, i8** %kernel_args, i32 0
+  // CHECK: store i8* %[[CAST]], i8** %[[GEP]], align 8
+  // CHECK: call i32 @hipLaunchKernel({{.*}}@_Z17__device_stub__g3R1A{{.*}}, i64 0, %struct.hipStream* null)
+  g3<<<1, 1>>>(a);
+}
+
+// CHECK-LABEL: define{{.*}}test4
+void test4() {
+  int x = 123;
+  int y;
+  // CHECK: %agg.tmp = alloca %class.anon, align 4
+  // CHECK: %kernel_args = alloca i8*, i64 2, align 16
+  // CHECK: %[[CAST:.*]] = bitcast %class.anon* %agg.tmp to i8*
+  // CHECK: %[[GEP:.*]] = getelementptr i8*, i8** %kernel_args, i32 0
+  // CHECK: store i8* %[[CAST]], i8** %[[GEP]], align 8
+  // CHECK: call i32 @hipLaunchKernel({{.*}}@_Z17__device_stub__g4IZ5test4vEUlvE_EvT_Pi{{.*}}, i64 0, %struct.hipStream* null)
+  g4<<<1, 1>>>([=]() { return x; }, &y);
+}
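The test2/test3 checks above pin down the argument-slot convention: each kernel_args slot holds the address of the corresponding argument's storage, so an aggregate passed by value contributes the address of its temporary (%agg.tmp), while a reference parameter is itself a pointer and contributes the address of a temporary holding that pointer (%kernel_arg). A minimal hand-written analogue, assuming only the public hipLaunchKernel API (the helper name and temporaries are invented for illustration):

#include <hip/hip_runtime.h>

struct A { int a[10]; };
__global__ void g2(A x);
__global__ void g3(A &x);

static hipError_t launch_g2_g3(A &a) {
  // g2 takes A by value: the slot points at a temporary copy of the object.
  A byval_tmp = a;
  void *g2_args[1] = {&byval_tmp};
  hipError_t err = hipLaunchKernel(reinterpret_cast<const void *>(g2), dim3(1),
                                   dim3(1), g2_args, 0, nullptr);
  if (err != hipSuccess)
    return err;
  // g3 takes A by reference: the argument value is &a, so the slot points at
  // a temporary holding that pointer (an A**, matching %kernel_arg above).
  A *ref_tmp = &a;
  void *g3_args[1] = {&ref_tmp};
  return hipLaunchKernel(reinterpret_cast<const void *>(g3), dim3(1), dim3(1),
                         g3_args, 0, nullptr);
}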
Index: clang/test/CodeGenCUDA/kernel-call.cu
===================================================================
--- clang/test/CodeGenCUDA/kernel-call.cu
+++ clang/test/CodeGenCUDA/kernel-call.cu
@@ -1,19 +1,19 @@
 // RUN: %clang_cc1 -target-sdk-version=8.0 -emit-llvm %s -o - \
-// RUN:   | FileCheck %s --check-prefixes=CUDA-OLD,CHECK
+// RUN:   | FileCheck %s --check-prefixes=CUDA-OLD,CHECK,COMMON
 // RUN: %clang_cc1 -target-sdk-version=9.2 -emit-llvm %s -o - \
-// RUN:   | FileCheck %s --check-prefixes=CUDA-NEW,CHECK
+// RUN:   | FileCheck %s --check-prefixes=CUDA-NEW,CHECK,COMMON
 // RUN: %clang_cc1 -x hip -emit-llvm %s -o - \
-// RUN:   | FileCheck %s --check-prefixes=HIP-OLD,CHECK
+// RUN:   | FileCheck %s --check-prefixes=HIP-OLD,CHECK,COMMON
 // RUN: %clang_cc1 -fhip-new-launch-api -x hip -emit-llvm %s -o - \
-// RUN:   | FileCheck %s --check-prefixes=HIP-NEW,CHECK
+// RUN:   | FileCheck %s --check-prefixes=HIP-NEW
 
 #include "Inputs/cuda.h"
 
 // CHECK-LABEL: define{{.*}}g1
 // HIP-OLD: call{{.*}}hipSetupArgument
 // HIP-OLD: call{{.*}}hipLaunchByPtr
-// HIP-NEW: call{{.*}}__hipPopCallConfiguration
-// HIP-NEW: call{{.*}}hipLaunchKernel
+// HIP-NEW-NOT: call{{.*}}__hipPopCallConfiguration
+// HIP-NEW-NOT: call{{.*}}hipLaunchKernel
 // CUDA-OLD: call{{.*}}cudaSetupArgument
 // CUDA-OLD: call{{.*}}cudaLaunch
 // CUDA-NEW: call{{.*}}__cudaPopCallConfiguration
@@ -23,11 +23,12 @@
 // CHECK-LABEL: define{{.*}}main
 int main(void) {
   // HIP-OLD: call{{.*}}hipConfigureCall
-  // HIP-NEW: call{{.*}}__hipPushCallConfiguration
+  // HIP-NEW-NOT: call{{.*}}__hipPushCallConfiguration
+  // HIP-NEW: call{{.*}}hipLaunchKernel
   // CUDA-OLD: call{{.*}}cudaConfigureCall
   // CUDA-NEW: call{{.*}}__cudaPushCallConfiguration
-  // CHECK: icmp
-  // CHECK: br
-  // CHECK: call{{.*}}g1
+  // COMMON: icmp
+  // COMMON: br
+  // COMMON: call{{.*}}g1
   g1<<<1, 1>>>(42);
 }
Index: clang/test/lit.cfg.py
===================================================================
--- clang/test/lit.cfg.py
+++ clang/test/lit.cfg.py
@@ -25,7 +25,7 @@
 config.test_format = lit.formats.ShTest(not llvm_config.use_lit_shell)
 
 # suffixes: A list of file extensions to treat as test files.
-config.suffixes = ['.c', '.cpp', '.i', '.cppm', '.m', '.mm', '.cu',
+config.suffixes = ['.c', '.cpp', '.i', '.cppm', '.m', '.mm', '.cu', '.hip',
                    '.ll', '.cl', '.s', '.S', '.modulemap', '.test', '.rs', '.ifs']
 
 # excludes: A list of directories to exclude from the testsuite. The 'Inputs'
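For contrast, the CUDA-NEW checks above correspond to the push/pop protocol that emitDeviceStubBodyNew retains for CUDA 9.0+: the call site pushes the launch configuration and calls the device stub, and the stub pops the configuration and forwards everything to cudaLaunchKernel. A schematic of that flow follows; the __cudaPushCallConfiguration/__cudaPopCallConfiguration declarations are written out from their CUDA runtime prototypes, and the stub name g1_stub is invented, so treat the details as assumptions rather than the exact emitted code.

#include <cuda_runtime.h>

extern "C" unsigned __cudaPushCallConfiguration(dim3 gridDim, dim3 blockDim,
                                                size_t sharedMem = 0,
                                                cudaStream_t stream = nullptr);
extern "C" cudaError_t __cudaPopCallConfiguration(dim3 *gridDim, dim3 *blockDim,
                                                  size_t *sharedMem,
                                                  void *stream);

__global__ void g1(int x);

// The call site  g1<<<1, 1>>>(42);  lowers to roughly:
//   if (__cudaPushCallConfiguration(dim3(1), dim3(1)) == 0)
//     g1_stub(42);   // the icmp/br/call checked by the COMMON prefixes
// and the stub then pops the configuration and launches:
static void g1_stub(int x) {
  void *args[1] = {&x};
  dim3 grid, block;
  size_t shmem;
  cudaStream_t stream;
  __cudaPopCallConfiguration(&grid, &block, &shmem, &stream);
  cudaLaunchKernel(reinterpret_cast<const void *>(g1), grid, block, args,
                   shmem, stream);
}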