diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -71,6 +71,8 @@
     "Number of OpenMP parallel regions replaced with ID in GPU state machines");
 STATISTIC(NumOpenMPParallelRegionsMerged,
           "Number of OpenMP parallel regions merged");
+STATISTIC(NumBytesMovedToSharedMemory,
+          "Amount of memory pushed to shared memory");
 
 #if !defined(NDEBUG)
 static constexpr auto TAG = "[" DEBUG_TYPE "]";
@@ -517,6 +519,7 @@
     // Recollect uses, in case Attributor deleted any.
     OMPInfoCache.recollectUses();
 
+    Changed |= replaceGlobalization();
     Changed |= deleteParallelRegions();
     if (HideMemoryTransferLatency)
       Changed |= hideMemTransfersLatency();
@@ -982,6 +985,100 @@
     return Changed;
   }
 
+  /// Replace globalization calls in the device with shared memory.
+  ///
+  /// Visits the uses of the __kmpc_alloc_shared runtime function in the SCC.
+  /// A call is rewritten only if the function it is reported for is a kernel
+  /// for which isSPMDMode() holds and its size operand is a constant integer.
+  /// Such a call is replaced by the address of a new internal i8-array global
+  /// in the shared address space, and its __kmpc_free_shared users are erased.
+  ///
+  /// \returns true if at least one allocation was replaced.
+  bool replaceGlobalization() {
+    auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
+    bool Changed = false;
+
+    auto ReplaceAllocCalls = [&](Use &U, Function &F) {
+      auto &FreeCall = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];
+      CallBase *CB = OpenMPOpt::getCallIfRegularCall(U, &RFI);
+      if (!CB)
+        return false;
+
+      // Push every globalized variable whose lifetime is the same as the kernel
+      // to global shared memory.
+      // TODO: Determine memory lifetime and parallel level differently to catch
+      // more cases
+      if (!isKernel(F) || !isSPMDMode(F))
+        return false;
+
+      // Only allocations with a constant size can become a statically sized
+      // buffer.
+      auto *AllocSize = dyn_cast<ConstantInt>(CB->getArgOperand(0));
+      if (!AllocSize)
+        return false;
+      const uint64_t NumBytes = AllocSize->getZExtValue();
+
+      LLVM_DEBUG(dbgs() << TAG << "Replace globalization call in "
+                        << CB->getCaller()->getName() << " with " << NumBytes
+                        << " bytes of shared memory\n");
+
+      // Remove every free call paired with this allocation: the replacement
+      // buffer is a global and must not be handed back to the runtime. The
+      // calls are collected first because erasing one changes the use list
+      // being walked. A missing declaration is skipped so that indirect calls,
+      // whose called function is null as well, are never matched.
+      SmallVector<CallBase *, 4> FreeCalls;
+      if (FreeCall.Declaration)
+        for (User *Usr : CB->users())
+          if (auto *FC = dyn_cast<CallBase>(Usr))
+            if (FC->getCalledFunction() == FreeCall.Declaration)
+              FreeCalls.push_back(FC);
+      for (CallBase *FC : FreeCalls)
+        FC->eraseFromParent();
+
+      // Create a new shared memory buffer of the same size as the allocation
+      // and replace all the uses of the original allocation with it.
+      Type *Int8Ty = Type::getInt8Ty(M.getContext());
+      Type *Int8ArrTy = ArrayType::get(Int8Ty, NumBytes);
+      auto *SharedMem = new GlobalVariable(
+          M, Int8ArrTy, /* IsConstant */ false, GlobalValue::InternalLinkage,
+          UndefValue::get(Int8ArrTy), CB->getName(), nullptr,
+          GlobalValue::NotThreadLocal,
+          static_cast<unsigned>(AddressSpace::Shared));
+
+      auto *NullInt = Constant::getNullValue(Type::getInt64Ty(M.getContext()));
+      auto *GEPExpr = ConstantExpr::getGetElementPtr(
+          Int8ArrTy, SharedMem, SmallVector<Constant *, 2>({NullInt, NullInt}));
+
+      auto *NewBuffer = new AddrSpaceCastInst(GEPExpr, Int8Ty->getPointerTo(),
+                                              CB->getName() + "_shared", CB);
+
+      SharedMem->setAlignment(MaybeAlign(8));
+      NewBuffer->setDebugLoc(CB->getDebugLoc());
+      CB->replaceAllUsesWith(NewBuffer);
+      CB->eraseFromParent();
+
+      auto Remark = [&](OptimizationRemark OR) {
+        return OR << "Replaced globalized variable with "
+                  << ore::NV("SharedMemory", NumBytes)
+                  << ((NumBytes != 1) ? " bytes " : " byte ")
+                  << "of shared memory";
+      };
+      emitRemark<OptimizationRemark>(NewBuffer, "OpenMPReplaceGlobalization",
+                                     Remark);
+
+      NumBytesMovedToSharedMemory += NumBytes;
+      Changed = true;
+
+      return true;
+    };
+    RFI.foreachUse(SCC, ReplaceAllocCalls);
+
+    if (Changed)
+      OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_free_shared);
+    return Changed;
+  }
+
   /// Try to delete parallel regions if possible.
   bool deleteParallelRegions() {
     const unsigned CallbackCalleeOperand = 2;
@@ -1484,10 +1581,41 @@
   /// Kernel (=GPU) optimizations and utility functions
   ///
   ///{{
+  /// Address space numbers used when creating device globals.
+  /// NOTE(review): these look like the NVPTX numbering (the accompanying test
+  /// expects addrspace(3) on nvptx64); confirm before use with other targets.
+  enum class AddressSpace : unsigned {
+    Generic = 0,
+    Global = 1,
+    Shared = 3,
+    Constant = 4,
+    Local = 5,
+  };
 
   /// Check if \p F is a kernel, hence entry point for target offloading.
   bool isKernel(Function &F) { return OMPInfoCache.Kernels.count(&F); }
 
+  /// Check if \p F is an SPMD kernel.
+  ///
+  /// The decision is read from the module-level value whose name is the kernel
+  /// name followed by "_exec_mode": the result is true only if \p F is a known
+  /// kernel and that value is a global variable whose initializer is one.
+  /// NOTE(review): confirm that one is the encoding the frontend uses for SPMD
+  /// mode rather than generic mode.
+  bool isSPMDMode(Function &F) {
+    if (!isKernel(F))
+      return false;
+
+    // The lookup yields null when no such value exists, and a declaration
+    // carries no initializer; both cases are treated as "not SPMD".
+    auto *GVal = M.getNamedValue(F.getName().str() + "_exec_mode");
+    auto *GVar = dyn_cast_or_null<GlobalVariable>(GVal);
+    if (!GVar || !GVar->hasInitializer())
+      return false;
+
+    return GVar->getInitializer()->isOneValue();
+  }
+
   /// Cache to remember the unique kernel for a function.
   DenseMap<Function *, Optional<Kernel>> UniqueKernelMap;
 
diff --git a/llvm/test/Transforms/OpenMP/replace_globalization.ll b/llvm/test/Transforms/OpenMP/replace_globalization.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/OpenMP/replace_globalization.ll
@@ -0,0 +1,72 @@
+; RUN: opt -S -attributor -openmpopt < %s | FileCheck %s
+; RUN: opt -S -passes='attributor,cgscc(openmpopt)' < %s | FileCheck %s
+; RUN: opt -openmpopt -pass-remarks=openmp-opt -disable-output < %s 2>&1 | FileCheck %s -check-prefix=CHECK-REMARKS
+; RUN: opt -passes=openmpopt -pass-remarks=openmp-opt -disable-output < %s 2>&1 | FileCheck %s -check-prefix=CHECK-REMARKS
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64"
+
+; @foo is a kernel whose exec mode flag is 0; its runtime allocation and free
+; calls are expected to be left untouched.
+@foo_exec_mode = weak constant i8 0
+
+; CHECK-REMARKS: remark: replace_globalization.c:5:7: Replaced globalized variable with 16 bytes of shared memory
+; CHECK: [[SHARED:@.+]] = internal addrspace(3) global [16 x i8] undef
+
+; CHECK: %{{.*}} = call i8* @__kmpc_alloc_shared({{.*}})
+; CHECK: call void @__kmpc_free_shared({{.*}})
+define void @foo() {
+entry:
+  %x = call i8* @__kmpc_alloc_shared(i64 4)
+  %x_on_stack = bitcast i8* %x to i32*
+  %0 = bitcast i32* %x_on_stack to i8*
+  call void @use(i8* %0)
+  call void @__kmpc_free_shared(i8* %x)
+  ret void
+}
+
+; @bar is a kernel whose exec mode flag is 1; its 16-byte allocation is expected
+; to become an addrspace(3) global and both runtime calls are expected to go.
+@bar_exec_mode = weak constant i8 1
+
+; CHECK: %{{.*}} = addrspacecast i8 addrspace(3)* getelementptr inbounds ([16 x i8], [16 x i8] addrspace(3)* [[SHARED]], i64 0, i64 0) to i8*, !dbg !{{[0-9]+}}
+; CHECK-NOT: call i8* @__kmpc_alloc_shared
+; CHECK-NOT: call void @__kmpc_free_shared
+define void @bar() {
+entry:
+  %x = call i8* @__kmpc_alloc_shared(i64 16), !dbg !9
+  %x_on_stack = bitcast i8* %x to [4 x i32]*
+  %0 = bitcast [4 x i32]* %x_on_stack to i8*
+  call void @use(i8* %0)
+  call void @__kmpc_free_shared(i8* %x)
+  ret void
+}
+
+define void @use(i8* %x) {
+entry:
+  %.addr = alloca i8*
+  store i8* %x, i8** %.addr
+  ret void
+}
+
+; CHECK: declare i8* @__kmpc_alloc_shared(i64)
+declare i8* @__kmpc_alloc_shared(i64)
+
+; CHECK: declare void @__kmpc_free_shared(i8*)
+declare void @__kmpc_free_shared(i8*)
+
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!nvvm.annotations = !{!5, !6}
+
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 12.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "replace_globalization.c", directory: "/tmp/replace_globalization.c")
+!2 = !{}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = !{void ()* @foo, !"kernel", i32 1}
+!6 = !{void ()* @bar, !"kernel", i32 1}
+!7 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
+!8 = !DISubroutineType(types: !2)
+!9 = !DILocation(line: 5, column: 7, scope: !7)