Index: clang/include/clang/Basic/Specifiers.h =================================================================== --- clang/include/clang/Basic/Specifiers.h +++ clang/include/clang/Basic/Specifiers.h @@ -263,24 +263,25 @@ /// CallingConv - Specifies the calling convention that a function uses. enum CallingConv { - CC_C, // __attribute__((cdecl)) - CC_X86StdCall, // __attribute__((stdcall)) - CC_X86FastCall, // __attribute__((fastcall)) - CC_X86ThisCall, // __attribute__((thiscall)) - CC_X86VectorCall, // __attribute__((vectorcall)) - CC_X86Pascal, // __attribute__((pascal)) - CC_Win64, // __attribute__((ms_abi)) - CC_X86_64SysV, // __attribute__((sysv_abi)) - CC_X86RegCall, // __attribute__((regcall)) - CC_AAPCS, // __attribute__((pcs("aapcs"))) - CC_AAPCS_VFP, // __attribute__((pcs("aapcs-vfp"))) - CC_IntelOclBicc, // __attribute__((intel_ocl_bicc)) - CC_SpirFunction, // default for OpenCL functions on SPIR target - CC_OpenCLKernel, // inferred for OpenCL kernels - CC_Swift, // __attribute__((swiftcall)) - CC_PreserveMost, // __attribute__((preserve_most)) - CC_PreserveAll, // __attribute__((preserve_all)) + CC_C, // __attribute__((cdecl)) + CC_X86StdCall, // __attribute__((stdcall)) + CC_X86FastCall, // __attribute__((fastcall)) + CC_X86ThisCall, // __attribute__((thiscall)) + CC_X86VectorCall, // __attribute__((vectorcall)) + CC_X86Pascal, // __attribute__((pascal)) + CC_Win64, // __attribute__((ms_abi)) + CC_X86_64SysV, // __attribute__((sysv_abi)) + CC_X86RegCall, // __attribute__((regcall)) + CC_AAPCS, // __attribute__((pcs("aapcs"))) + CC_AAPCS_VFP, // __attribute__((pcs("aapcs-vfp"))) + CC_IntelOclBicc, // __attribute__((intel_ocl_bicc)) + CC_SpirFunction, // default for OpenCL functions on SPIR target + CC_OpenCLKernel, // inferred for OpenCL kernels + CC_Swift, // __attribute__((swiftcall)) + CC_PreserveMost, // __attribute__((preserve_most)) + CC_PreserveAll, // __attribute__((preserve_all)) CC_AArch64VectorCall, // __attribute__((aarch64_vector_pcs)) + CC_DeviceStub, // inferred for HIP device stub }; /// Checks whether the given calling convention supports variadic @@ -296,6 +297,7 @@ case CC_SpirFunction: case CC_OpenCLKernel: case CC_Swift: + case CC_DeviceStub: return false; default: return true; Index: clang/lib/AST/ItaniumMangle.cpp =================================================================== --- clang/lib/AST/ItaniumMangle.cpp +++ clang/lib/AST/ItaniumMangle.cpp @@ -483,6 +483,7 @@ const AbiTagList *AdditionalAbiTags); void mangleSourceName(const IdentifierInfo *II); void mangleRegCallName(const IdentifierInfo *II); + void mangleDeviceStubName(const IdentifierInfo *II); void mangleSourceNameWithAbiTags( const NamedDecl *ND, const AbiTagList *AdditionalAbiTags = nullptr); void mangleLocalName(const Decl *D, @@ -1302,7 +1303,12 @@ bool IsRegCall = FD && FD->getType()->castAs()->getCallConv() == clang::CC_X86RegCall; - if (IsRegCall) + bool IsDeviceStub = + FD && FD->getType()->castAs()->getCallConv() == + clang::CC_DeviceStub; + if (IsDeviceStub) + mangleDeviceStubName(II); + else if (IsRegCall) mangleRegCallName(II); else mangleSourceName(II); @@ -1491,6 +1497,14 @@ << II->getName(); } +void CXXNameMangler::mangleDeviceStubName(const IdentifierInfo *II) { + // ::= __device_stub__ + // ::= [n] + // ::= + Out << II->getLength() + sizeof("__device_stub__") - 1 << "__device_stub__" + << II->getName(); +} + void CXXNameMangler::mangleSourceName(const IdentifierInfo *II) { // ::= // ::= [n] @@ -2734,6 +2748,7 @@ case CC_OpenCLKernel: case CC_PreserveMost: case CC_PreserveAll: + case CC_DeviceStub: // FIXME: we should be mangling all of the above. return ""; Index: clang/lib/AST/MicrosoftMangle.cpp =================================================================== --- clang/lib/AST/MicrosoftMangle.cpp +++ clang/lib/AST/MicrosoftMangle.cpp @@ -2393,6 +2393,7 @@ llvm_unreachable("Unsupported CC for mangling"); case CC_Win64: case CC_X86_64SysV: + case CC_DeviceStub: case CC_C: Out << 'A'; break; case CC_X86Pascal: Out << 'C'; break; case CC_X86ThisCall: Out << 'E'; break; Index: clang/lib/AST/Type.cpp =================================================================== --- clang/lib/AST/Type.cpp +++ clang/lib/AST/Type.cpp @@ -2947,6 +2947,8 @@ case CC_Swift: return "swiftcall"; case CC_PreserveMost: return "preserve_most"; case CC_PreserveAll: return "preserve_all"; + case CC_DeviceStub: + return "device_stub"; } llvm_unreachable("Invalid calling convention."); Index: clang/lib/AST/TypePrinter.cpp =================================================================== --- clang/lib/AST/TypePrinter.cpp +++ clang/lib/AST/TypePrinter.cpp @@ -893,6 +893,7 @@ break; case CC_SpirFunction: case CC_OpenCLKernel: + case CC_DeviceStub: // Do nothing. These CCs are not available as attributes. break; case CC_Swift: Index: clang/lib/Basic/Targets/X86.h =================================================================== --- clang/lib/Basic/Targets/X86.h +++ clang/lib/Basic/Targets/X86.h @@ -313,6 +313,7 @@ case CC_X86Pascal: case CC_IntelOclBicc: case CC_OpenCLKernel: + case CC_DeviceStub: return CCCR_OK; default: return CCCR_Warning; @@ -659,6 +660,7 @@ case CC_PreserveAll: case CC_X86RegCall: case CC_OpenCLKernel: + case CC_DeviceStub: return CCCR_OK; default: return CCCR_Warning; @@ -733,6 +735,7 @@ case CC_Swift: case CC_X86RegCall: case CC_OpenCLKernel: + case CC_DeviceStub: return CCCR_OK; default: return CCCR_Warning; Index: clang/lib/CodeGen/CGCUDANV.cpp =================================================================== --- clang/lib/CodeGen/CGCUDANV.cpp +++ clang/lib/CodeGen/CGCUDANV.cpp @@ -117,7 +117,6 @@ void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args); void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args); - std::string getDeviceSideName(const Decl *ND); public: CGNVCUDARuntime(CodeGenModule &CGM); @@ -132,8 +131,7 @@ llvm::Function *makeModuleCtorFunction() override; /// Creates module destructor function llvm::Function *makeModuleDtorFunction() override; - /// Construct and return the stub name of a kernel. - std::string getDeviceStubName(llvm::StringRef Name) const override; + std::string getDeviceSideName(const Decl *ND) override; }; } @@ -231,8 +229,8 @@ assert((CGF.CGM.getContext().getAuxTargetInfo() && (CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI() != CGF.CGM.getContext().getTargetInfo().getCXXABI())) || - getDeviceStubName(getDeviceSideName(CGF.CurFuncDecl)) == - CGF.CurFn->getName()); + CGF.getLangOpts().HIP || + getDeviceSideName(CGF.CurFuncDecl) == CGF.CurFn->getName()); EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl}); if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), @@ -797,12 +795,6 @@ return ModuleDtorFunc; } -std::string CGNVCUDARuntime::getDeviceStubName(llvm::StringRef Name) const { - if (!CGM.getLangOpts().HIP) - return Name; - return (Name + ".stub").str(); -} - CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) { return new CGNVCUDARuntime(CGM); } Index: clang/lib/CodeGen/CGCUDARuntime.h =================================================================== --- clang/lib/CodeGen/CGCUDARuntime.h +++ clang/lib/CodeGen/CGCUDARuntime.h @@ -25,6 +25,7 @@ namespace clang { class CUDAKernelCallExpr; +class Decl; class VarDecl; namespace CodeGen { @@ -66,8 +67,8 @@ /// Must be called after ModuleCtorFunction virtual llvm::Function *makeModuleDtorFunction() = 0; - /// Construct and return the stub name of a kernel. - virtual std::string getDeviceStubName(llvm::StringRef Name) const = 0; + // Returns device side kernel name. + virtual std::string getDeviceSideName(const Decl *ND) = 0; }; /// Creates an instance of a CUDA runtime class. Index: clang/lib/CodeGen/CGDebugInfo.cpp =================================================================== --- clang/lib/CodeGen/CGDebugInfo.cpp +++ clang/lib/CodeGen/CGDebugInfo.cpp @@ -1149,6 +1149,7 @@ static unsigned getDwarfCC(CallingConv CC) { switch (CC) { + case CC_DeviceStub: case CC_C: // Avoid emitting DW_AT_calling_convention if the C convention was used. return 0; Index: clang/lib/CodeGen/CodeGenModule.cpp =================================================================== --- clang/lib/CodeGen/CodeGenModule.cpp +++ clang/lib/CodeGen/CodeGenModule.cpp @@ -1004,6 +1004,10 @@ FD->getType()->castAs()->getCallConv() == CC_X86RegCall) { llvm::raw_svector_ostream Out(Buffer); Out << "__regcall3__" << II->getName(); + } else if (FD && FD->getType()->castAs()->getCallConv() == + CC_DeviceStub) { + llvm::raw_svector_ostream Out(Buffer); + Out << "__device_stub__" << II->getName(); } else { Out << II->getName(); } @@ -1089,13 +1093,40 @@ // Keep the first result in the case of a mangling collision. const auto *ND = cast(GD.getDecl()); - std::string MangledName = getMangledNameImpl(*this, GD, ND); - - // Adjust kernel stub mangling as we may need to be able to differentiate - // them from the kernel itself (e.g., for HIP). - if (auto *FD = dyn_cast(GD.getDecl())) - if (!getLangOpts().CUDAIsDevice && FD->hasAttr()) - MangledName = getCUDARuntime().getDeviceStubName(MangledName); + std::string MangledName; + + // ToDo: HIP needs unmangled name of device stub to be prefixed with + // __device_stub__. This is done by adding device stub calling convention + // to the stub function or its template temporarily then let the mangler + // mangle it differently. We cannot let the stub function have device stub + // calling convention in AST permanently because: 1. we still need to use + // the same FunctionDecl to mangle the kernel name which should not have + // the prefix; 2. the stub function does not really have a specific calling + // convention which is enforced by type checking. + // Need a better way to mangle the device stub function. + auto *FD = dyn_cast(GD.getDecl()); + if (getLangOpts().HIP && FD && + getContext().getTargetInfo().getCXXABI() != TargetCXXABI::Microsoft && + !getLangOpts().CUDAIsDevice && FD->hasAttr()) { + + // Make sure the non-prefixed stub name is the same as device side kernel + // name. + assert(getCUDARuntime().getDeviceSideName(FD) == + getMangledNameImpl(*this, GD, FD) && + "Mismatch between host side and device side kernel name"); + + if (auto *TD = cast(FD)->getPrimaryTemplate()) + FD = TD->getTemplatedDecl(); + auto OldQT = FD->getType(); + auto *OldFT = OldQT->getAs(); + auto *NewFT = getContext().adjustFunctionType( + OldFT, OldFT->getExtInfo().withCallingConv(CC_DeviceStub)); + const_cast(FD)->setType(QualType(NewFT, 0)); + MangledName = getMangledNameImpl(*this, GD, ND); + assert(MangledName.find("__device_stub__") != StringRef::npos); + const_cast(FD)->setType(OldQT); + } else + MangledName = getMangledNameImpl(*this, GD, ND); auto Result = Manglings.insert(std::make_pair(MangledName, GD)); return MangledDeclNames[CanonicalGD] = Result.first->first(); Index: clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu =================================================================== --- clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu +++ clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu @@ -13,19 +13,19 @@ // HOST-NOT: %struct.T.coerce // CHECK: define amdgpu_kernel void @_Z7kernel1Pi(i32 addrspace(1)* %x.coerce) -// HOST: define void @_Z7kernel1Pi.stub(i32* %x) +// HOST: define void @_Z22__device_stub__kernel1Pi(i32* %x) __global__ void kernel1(int *x) { x[0]++; } // CHECK: define amdgpu_kernel void @_Z7kernel2Ri(i32 addrspace(1)* dereferenceable(4) %x.coerce) -// HOST: define void @_Z7kernel2Ri.stub(i32* dereferenceable(4) %x) +// HOST: define void @_Z22__device_stub__kernel2Ri(i32* dereferenceable(4) %x) __global__ void kernel2(int &x) { x++; } // CHECK: define amdgpu_kernel void @_Z7kernel3PU3AS2iPU3AS1i(i32 addrspace(2)* %x, i32 addrspace(1)* %y) -// HOST: define void @_Z7kernel3PU3AS2iPU3AS1i.stub(i32 addrspace(2)* %x, i32 addrspace(1)* %y) +// HOST: define void @_Z22__device_stub__kernel3PU3AS2iPU3AS1i(i32 addrspace(2)* %x, i32 addrspace(1)* %y) __global__ void kernel3(__attribute__((address_space(2))) int *x, __attribute__((address_space(1))) int *y) { y[0] = x[0]; @@ -43,7 +43,7 @@ // `by-val` struct will be coerced into a similar struct with all generic // pointers lowerd into global ones. // CHECK: define amdgpu_kernel void @_Z7kernel41S(%struct.S.coerce %s.coerce) -// HOST: define void @_Z7kernel41S.stub(i32* %s.coerce0, float* %s.coerce1) +// HOST: define void @_Z22__device_stub__kernel41S(i32* %s.coerce0, float* %s.coerce1) __global__ void kernel4(struct S s) { s.x[0]++; s.y[0] += 1.f; @@ -51,7 +51,7 @@ // If a pointer to struct is passed, only the pointer itself is coerced into the global one. // CHECK: define amdgpu_kernel void @_Z7kernel5P1S(%struct.S addrspace(1)* %s.coerce) -// HOST: define void @_Z7kernel5P1S.stub(%struct.S* %s) +// HOST: define void @_Z22__device_stub__kernel5P1S(%struct.S* %s) __global__ void kernel5(struct S *s) { s->x[0]++; s->y[0] += 1.f; @@ -62,7 +62,7 @@ }; // `by-val` array is also coerced. // CHECK: define amdgpu_kernel void @_Z7kernel61T(%struct.T.coerce %t.coerce) -// HOST: define void @_Z7kernel61T.stub(float* %t.coerce0, float* %t.coerce1) +// HOST: define void @_Z22__device_stub__kernel61T(float* %t.coerce0, float* %t.coerce1) __global__ void kernel6(struct T t) { t.x[0][0] += 1.f; t.x[1][0] += 2.f; Index: clang/test/CodeGenCUDA/kernel-stub-name.cu =================================================================== --- clang/test/CodeGenCUDA/kernel-stub-name.cu +++ clang/test/CodeGenCUDA/kernel-stub-name.cu @@ -6,15 +6,50 @@ #include "Inputs/cuda.h" +extern "C" __global__ void ckernel() {} + +namespace ns { +__global__ void nskernel() {} +} // namespace ns + template __global__ void kernelfunc() {} +__global__ void kernel_decl(); + +// Device side kernel names + +// CHECK: @[[CKERN:[0-9]*]] = {{.*}} c"ckernel\00" +// CHECK: @[[NSKERN:[0-9]*]] = {{.*}} c"_ZN2ns8nskernelEv\00" +// CHECK: @[[TKERN:[0-9]*]] = {{.*}} c"_Z10kernelfuncIiEvv\00" + +// Non-template kernel stub functions + +// CHECK: define{{.*}}@[[CSTUB:__device_stub__ckernel]] +// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[CSTUB]] +// CHECK: define{{.*}}@[[NSSTUB:_ZN2ns23__device_stub__nskernelEv]] +// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[NSSTUB]] + // CHECK-LABEL: define{{.*}}@_Z8hostfuncv() -// CHECK: call void @[[STUB:_Z10kernelfuncIiEvv.stub]]() -void hostfunc(void) { kernelfunc<<<1, 1>>>(); } +// CHECK: call void @[[CSTUB]]() +// CHECK: call void @[[NSSTUB]]() +// CHECK: call void @[[TSTUB:_Z25__device_stub__kernelfuncIiEvv]]() +// CHECK: call void @[[DSTUB:_Z26__device_stub__kernel_declv]]() +void hostfunc(void) { + ckernel<<<1, 1>>>(); + ns::nskernel<<<1, 1>>>(); + kernelfunc<<<1, 1>>>(); + kernel_decl<<<1, 1>>>(); +} + +// Template kernel stub functions + +// CHECK: define{{.*}}@[[TSTUB]] +// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[TSTUB]] -// CHECK: define{{.*}}@[[STUB]] -// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[STUB]] +// CHECK: declare{{.*}}@[[DSTUB]] // CHECK-LABEL: define{{.*}}@__hip_register_globals -// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[STUB]] +// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[CSTUB]]{{.*}}@[[CKERN]] +// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[NSSTUB]]{{.*}}@[[NSKERN]] +// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[TSTUB]]{{.*}}@[[TKERN]] Index: clang/test/CodeGenCUDA/unnamed-types.cu =================================================================== --- clang/test/CodeGenCUDA/unnamed-types.cu +++ clang/test/CodeGenCUDA/unnamed-types.cu @@ -36,4 +36,4 @@ }(p); } // HOST: @__hip_register_globals -// HOST: __hipRegisterFunction{{.*}}@_Z2k0IZZ2f1PfENKUlS0_E_clES0_EUlfE_EvS0_T_{{.*}}@0 +// HOST: __hipRegisterFunction{{.*}}@_Z17__device_stub__k0IZZ2f1PfENKUlS0_E_clES0_EUlfE_EvS0_T_{{.*}}@0