Index: lib/CodeGen/CGCUDANV.cpp =================================================================== --- lib/CodeGen/CGCUDANV.cpp +++ lib/CodeGen/CGCUDANV.cpp @@ -472,6 +472,19 @@ CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args); } + // Create destructor and register it with atexit() the way NVCC does it. Doing + // it during regular destructor phase worked in CUDA before 9.2 but results in + // double-free in 9.2. + if (llvm::Function *CleanupFn = makeModuleDtorFunction()) { + // extern "C" int atexit(void (*f)(void)); + llvm::FunctionType *AtExitTy = + llvm::FunctionType::get(IntTy, CleanupFn->getType(), false); + llvm::Constant *AtExitFunc = + CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(), + /*Local=*/true); + CtorBuilder.CreateCall(AtExitFunc, CleanupFn); + } + CtorBuilder.CreateRetVoid(); return ModuleCtorFunc; } Index: lib/CodeGen/CodeGenModule.cpp =================================================================== --- lib/CodeGen/CodeGenModule.cpp +++ lib/CodeGen/CodeGenModule.cpp @@ -404,10 +404,9 @@ AddGlobalCtor(ObjCInitFunction); if (Context.getLangOpts().CUDA && !Context.getLangOpts().CUDAIsDevice && CUDARuntime) { - if (llvm::Function *CudaCtorFunction = CUDARuntime->makeModuleCtorFunction()) + if (llvm::Function *CudaCtorFunction = + CUDARuntime->makeModuleCtorFunction()) AddGlobalCtor(CudaCtorFunction); - if (llvm::Function *CudaDtorFunction = CUDARuntime->makeModuleDtorFunction()) - AddGlobalDtor(CudaDtorFunction); } if (OpenMPRuntime) { if (llvm::Function *OpenMPRegistrationFunction = Index: test/CodeGenCUDA/device-stub.cu =================================================================== --- test/CodeGenCUDA/device-stub.cu +++ test/CodeGenCUDA/device-stub.cu @@ -86,8 +86,6 @@ // HIPRDC-SAME: c"[[MODULE_ID:.+]]\00", section "__hip_module_id", align 32 // * Make sure our constructor was added to global ctor list. // ALL: @llvm.global_ctors = appending global {{.*}}@__[[PREFIX]]_module_ctor -// * In separate mode we also register a destructor. -// NORDC: @llvm.global_dtors = appending global {{.*}}@__[[PREFIX]]_module_dtor // * Alias to global symbol containing the NVModuleID. // RDC: @__fatbinwrap[[MODULE_ID]] = alias { i32, i32, i8*, i8* } // RDC-SAME: { i32, i32, i8*, i8* }* @__[[PREFIX]]_fatbin_wrapper @@ -127,6 +125,8 @@ // NORDC-NEXT: store{{.*}}__[[PREFIX]]_gpubin_handle // .. and then calls __[[PREFIX]]_register_globals // NORDC-NEXT: call void @__[[PREFIX]]_register_globals +// * In separate mode we also register a destructor. +// NORDC-NEXT: call i32 @atexit(void (i8*)* @__[[PREFIX]]_module_dtor) // With relocatable device code we call __[[PREFIX]]RegisterLinkedBinary%NVModuleID% // RDC: call{{.*}}__[[PREFIX]]RegisterLinkedBinary[[MODULE_ID]](