Index: clang/lib/CodeGen/CGCUDANV.cpp =================================================================== --- clang/lib/CodeGen/CGCUDANV.cpp +++ clang/lib/CodeGen/CGCUDANV.cpp @@ -42,12 +42,16 @@ llvm::LLVMContext &Context; /// Convenience reference to the current module llvm::Module &TheModule; - /// Keeps track of kernel launch stubs emitted in this module + /// Keeps track of kernel launch stubs and handles emitted in this module struct KernelInfo { - llvm::Function *Kernel; + llvm::Function *Kernel; // stub function to help launch kernel const Decl *D; }; llvm::SmallVector EmittedKernels; + // Map a device stub function to a symbol for identifying kernel in host code. + // For CUDA, the symbol for identifying the kernel is the same as the device + // stub function. For HIP, they are different. + llvm::DenseMap KernelHandles; struct VarInfo { llvm::GlobalVariable *Var; const VarDecl *D; @@ -270,6 +274,18 @@ void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) { EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl}); + llvm::GlobalValue *KernelHandle = CGF.CurFn; + if (CGF.getLangOpts().HIP) { + auto Linkage = CGF.CurFn->getLinkage(); + auto *Var = new llvm::GlobalVariable( + TheModule, VoidPtrTy, /*isConstant=*/true, Linkage, + /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrTy), + CGM.getMangledName(GlobalDecl(cast(CGF.CurFuncDecl), + KernelReferenceKind::Kernel))); + Var->setAlignment(CGM.getPointerAlign().getAsAlign()); + KernelHandle = Var; + } + KernelHandles[CGF.CurFn] = KernelHandle; if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), CudaFeature::CUDA_USES_NEW_LAUNCH) || (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI)) @@ -348,7 +364,8 @@ ShmemSize.getPointer(), Stream.getPointer()}); // Emit the call to cudaLaunch - llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy); + llvm::Value *Kernel = + CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], VoidPtrTy); CallArgList LaunchKernelArgs; LaunchKernelArgs.add(RValue::get(Kernel), cudaLaunchKernelFD->getParamDecl(0)->getType()); @@ -403,7 +420,8 @@ // Emit the call to cudaLaunch llvm::FunctionCallee cudaLaunchFn = getLaunchFn(); - llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy); + llvm::Value *Arg = + CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], CharPtrTy); CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg); CGF.EmitBranch(EndBlock); @@ -497,7 +515,7 @@ llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy); llvm::Value *Args[] = { &GpuBinaryHandlePtr, - Builder.CreateBitCast(I.Kernel, VoidPtrTy), + Builder.CreateBitCast(KernelHandles[I.Kernel], VoidPtrTy), KernelName, KernelName, llvm::ConstantInt::get(IntTy, -1), Index: clang/test/CodeGenCUDA/Inputs/cuda.h =================================================================== --- clang/test/CodeGenCUDA/Inputs/cuda.h +++ clang/test/CodeGenCUDA/Inputs/cuda.h @@ -2,6 +2,7 @@ #include +#if __HIP__ || __CUDA__ #define __constant__ __attribute__((constant)) #define __device__ __attribute__((device)) #define __global__ __attribute__((global)) @@ -11,13 +12,22 @@ #define __managed__ __attribute__((managed)) #endif #define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__))) +#else +#define __constant__ +#define __device__ +#define __global__ +#define __host__ +#define __shared__ +#define __managed__ +#define __launch_bounds__(...) +#endif struct dim3 { unsigned x, y, z; __host__ __device__ dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {} }; -#ifdef __HIP__ +#if __HIP__ || HIP_PLATFORM typedef struct hipStream *hipStream_t; typedef enum hipError {} hipError_t; int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0, Index: clang/test/CodeGenCUDA/cxx-call-kernel.cpp =================================================================== --- /dev/null +++ clang/test/CodeGenCUDA/cxx-call-kernel.cpp @@ -0,0 +1,19 @@ +// RUN: %clang_cc1 -x hip -emit-llvm-bc %s -o %t.hip.bc +// RUN: %clang_cc1 -mlink-bitcode-file %t.hip.bc -DHIP_PLATFORM -emit-llvm \ +// RUN: %s -o - | FileCheck %s + +#include "Inputs/cuda.h" + +// CHECK: @_Z2g1i = constant i8* null +#if __HIP__ +__global__ void g1(int x) {} +#else +extern void g1(int x); + +// CHECK: call i32 @hipLaunchKernel{{.*}}@_Z2g1i +void test() { + hipLaunchKernel((void*)g1, 1, 1, nullptr, 0, 0); +} + +// CHECK: __hipRegisterFunction{{.*}}@_Z2g1i +#endif Index: clang/test/CodeGenCUDA/kernel-dbg-info.cu =================================================================== --- clang/test/CodeGenCUDA/kernel-dbg-info.cu +++ clang/test/CodeGenCUDA/kernel-dbg-info.cu @@ -30,6 +30,9 @@ *a = 1; } +// Kernel symbol for launching kernel. +// CHECK: @[[SYM:ckernel]] = constant i8* null + // Device side kernel names // CHECK: @[[CKERN:[0-9]*]] = {{.*}} c"ckernel\00" @@ -40,7 +43,7 @@ // Make sure there is no !dbg between function attributes and '{' // CHECK: define{{.*}} void @[[CSTUB:__device_stub__ckernel]]{{.*}} #{{[0-9]+}} { // CHECK-NOT: call {{.*}}@hipLaunchByPtr{{.*}}!dbg -// CHECK: call {{.*}}@hipLaunchByPtr{{.*}}@[[CSTUB]] +// CHECK: call {{.*}}@hipLaunchByPtr{{.*}}@[[SYM]] // CHECK-NOT: ret {{.*}}!dbg // CHECK-LABEL: define {{.*}}@_Z8hostfuncPi{{.*}}!dbg Index: clang/test/CodeGenCUDA/kernel-stub-name.cu =================================================================== --- clang/test/CodeGenCUDA/kernel-stub-name.cu +++ clang/test/CodeGenCUDA/kernel-stub-name.cu @@ -6,6 +6,12 @@ #include "Inputs/cuda.h" +// Kernel handles + +// CHECK: @[[HCKERN:ckernel]] = constant i8* null +// CHECK: @[[HNSKERN:_ZN2ns8nskernelEv]] = constant i8* null +// CHECK: @[[HTKERN:_Z10kernelfuncIiEvv]] = linkonce_odr constant i8* null + extern "C" __global__ void ckernel() {} namespace ns { @@ -26,9 +32,9 @@ // Non-template kernel stub functions // CHECK: define{{.*}}@[[CSTUB:__device_stub__ckernel]] -// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[CSTUB]] +// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[HCKERN]] // CHECK: define{{.*}}@[[NSSTUB:_ZN2ns23__device_stub__nskernelEv]] -// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[NSSTUB]] +// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[HNSKERN]] // CHECK-LABEL: define{{.*}}@_Z8hostfuncv() // CHECK: call void @[[CSTUB]]() @@ -45,11 +51,11 @@ // Template kernel stub functions // CHECK: define{{.*}}@[[TSTUB]] -// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[TSTUB]] +// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[HTKERN]] // CHECK: declare{{.*}}@[[DSTUB]] // CHECK-LABEL: define{{.*}}@__hip_register_globals -// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[CSTUB]]{{.*}}@[[CKERN]] -// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[NSSTUB]]{{.*}}@[[NSKERN]] -// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[TSTUB]]{{.*}}@[[TKERN]] +// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[HCKERN]]{{.*}}@[[CKERN]] +// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[HNSKERN]]{{.*}}@[[NSKERN]] +// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[HTKERN]]{{.*}}@[[TKERN]] Index: clang/test/CodeGenCUDA/unnamed-types.cu =================================================================== --- clang/test/CodeGenCUDA/unnamed-types.cu +++ clang/test/CodeGenCUDA/unnamed-types.cu @@ -54,7 +54,7 @@ [] __device__ (float x) { return x + 5.f; }); } // HOST: @__hip_register_globals -// HOST: __hipRegisterFunction{{.*}}@_Z17__device_stub__k0IZZ2f1PfENKUlS0_E_clES0_EUlfE_EvS0_T_{{.*}}@0 -// HOST: __hipRegisterFunction{{.*}}@_Z17__device_stub__k1IZ2f1PfEUlfE_Z2f1S0_EUlffE_Z2f1S0_EUlfE0_EvS0_T_T0_T1_{{.*}}@1 +// HOST: __hipRegisterFunction{{.*}}@_Z2k0IZZ2f1PfENKUlS0_E_clES0_EUlfE_EvS0_T_{{.*}}@0 +// HOST: __hipRegisterFunction{{.*}}@_Z2k1IZ2f1PfEUlfE_Z2f1S0_EUlffE_Z2f1S0_EUlfE0_EvS0_T_T0_T1_{{.*}}@1 // MSVC: __hipRegisterFunction{{.*}}@"??$k0@V@?0???R1?0??f1@@YAXPEAM@Z@QEBA@0@Z@@@YAXPEAMV@?0???R0?0??f1@@YAX0@Z@QEBA@0@Z@@Z{{.*}}@0 // MSVC: __hipRegisterFunction{{.*}}@"??$k1@V@?0??f1@@YAXPEAM@Z@V@?0??2@YAX0@Z@V@?0??2@YAX0@Z@@@YAXPEAMV@?0??f1@@YAX0@Z@V@?0??1@YAX0@Z@V@?0??1@YAX0@Z@@Z{{.*}}@1