Changeset View
Changeset View
Standalone View
Standalone View
clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
Show First 20 Lines • Show All 2,849 Lines • ▼ Show 20 Lines | StringRef TransferMediumName = | ||||
"__openmp_nvptx_data_transfer_temporary_storage"; | "__openmp_nvptx_data_transfer_temporary_storage"; | ||||
llvm::GlobalVariable *TransferMedium = | llvm::GlobalVariable *TransferMedium = | ||||
M.getGlobalVariable(TransferMediumName); | M.getGlobalVariable(TransferMediumName); | ||||
unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size); | unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size); | ||||
if (!TransferMedium) { | if (!TransferMedium) { | ||||
auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize); | auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize); | ||||
unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared); | unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared); | ||||
TransferMedium = new llvm::GlobalVariable( | TransferMedium = new llvm::GlobalVariable( | ||||
M, Ty, /*isConstant=*/false, llvm::GlobalVariable::InternalLinkage, | M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage, | ||||
llvm::UndefValue::get(Ty), TransferMediumName, | llvm::UndefValue::get(Ty), TransferMediumName, | ||||
ABataev: "Internalization" is not the best option, it increases mem pressure. Common linkage is a better… | |||||
Not Done — jdoerfert: FWIW, if we do not depend on zero initialization, we should go with undef.
| |||||
Not Done ReplyInline ActionsSure. ABataev: Sure. | |||||
/*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal, | /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal, | ||||
SharedAddressSpace); | SharedAddressSpace); | ||||
CGM.addCompilerUsedGlobal(TransferMedium); | CGM.addCompilerUsedGlobal(TransferMedium); | ||||
} | } | ||||
auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); | auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); | ||||
// Get the CUDA thread id of the current OpenMP thread on the GPU. | // Get the CUDA thread id of the current OpenMP thread on the GPU. | ||||
llvm::Value *ThreadID = RT.getGPUThreadID(CGF); | llvm::Value *ThreadID = RT.getGPUThreadID(CGF); | ||||
▲ Show 20 Lines • Show All 1,918 Lines • ▼ Show 20 Lines | if (!SharedStaticRD->field_empty()) { | ||||
SharedStaticRD->addDecl(Field); | SharedStaticRD->addDecl(Field); | ||||
} | } | ||||
SharedStaticRD->completeDefinition(); | SharedStaticRD->completeDefinition(); | ||||
if (!SharedStaticRD->field_empty()) { | if (!SharedStaticRD->field_empty()) { | ||||
QualType StaticTy = C.getRecordType(SharedStaticRD); | QualType StaticTy = C.getRecordType(SharedStaticRD); | ||||
llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy); | llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy); | ||||
auto *GV = new llvm::GlobalVariable( | auto *GV = new llvm::GlobalVariable( | ||||
CGM.getModule(), LLVMStaticTy, | CGM.getModule(), LLVMStaticTy, | ||||
/*isConstant=*/false, llvm::GlobalValue::InternalLinkage, | /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage, | ||||
llvm::UndefValue::get(LLVMStaticTy), | llvm::UndefValue::get(LLVMStaticTy), | ||||
JonChesterfield: Perhaps weak_any + undef? We could use internal linkage for symbols that may vary in size and weak_any for those that don't. | |||||
Not Done ReplyInline ActionsYeah, it is a good idea, I think. ABataev: Yeah, it is a good idea, I think. | |||||
"_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr, | "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr, | ||||
llvm::GlobalValue::NotThreadLocal, | llvm::GlobalValue::NotThreadLocal, | ||||
C.getTargetAddressSpace(LangAS::cuda_shared)); | C.getTargetAddressSpace(LangAS::cuda_shared)); | ||||
auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( | auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( | ||||
GV, CGM.VoidPtrTy); | GV, CGM.VoidPtrTy); | ||||
for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) { | for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) { | ||||
Rec->Buffer->replaceAllUsesWith(Replacement); | Rec->Buffer->replaceAllUsesWith(Replacement); | ||||
Rec->Buffer->eraseFromParent(); | Rec->Buffer->eraseFromParent(); | ||||
▲ Show 20 Lines • Show All 64 Lines • Show Last 20 Lines |
"Internalization" is not the best option: it increases memory pressure. Common linkage is a better choice, because it allows identical objects defined in different translation units to be merged ("squashed"). Maybe make the choice architecture-dependent?
For NVPTX, zero initialization is not a problem; it is resolved when the PTX is generated.