diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -56,6 +56,17 @@ " transfers"), cl::Hidden, cl::init(false)); +static cl::opt SharedMemoryLimit( + "openmp-shared-limit", cl::Optional, + cl::desc("Limits the total amount of shared memory used for optimizations"), + cl::init(-1)); + +static cl::opt + SharedMemoryThreshold("openmp-shared-threshold", cl::Optional, + cl::desc("Only place variables sizes larger than the " + "threshold in shared memory"), + cl::init(8)); + STATISTIC(NumOpenMPRuntimeCallsDeduplicated, "Number of OpenMP runtime calls deduplicated"); STATISTIC(NumOpenMPParallelRegionsDeleted, @@ -71,6 +82,8 @@ "Number of OpenMP parallel regions replaced with ID in GPU state machines"); STATISTIC(NumOpenMPParallelRegionsMerged, "Number of OpenMP parallel regions merged"); +STATISTIC(NumBytesMovedToSharedMemory, + "Amount of memory pushed to shared memory"); #if !defined(NDEBUG) static constexpr auto TAG = "[" DEBUG_TYPE "]"; @@ -517,6 +530,7 @@ // Recollect uses, in case Attributor deleted any. OMPInfoCache.recollectUses(); + Changed |= replaceGlobalization(); Changed |= deleteParallelRegions(); if (HideMemoryTransferLatency) Changed |= hideMemTransfersLatency(); @@ -982,6 +996,86 @@ return Changed; } + /// Replace globalization calls in the device with shared memory. Variables + /// will not be placed in shared memory if their size is below the threshold, + /// or if it would exceed the limit. 
+  ///
+  /// A call is only rewritten when both of its arguments are constant integers
+  /// and the second one is non-zero. Every `__kmpc_free_shared` call that uses
+  /// the allocation directly is erased, all remaining uses are redirected to an
+  /// internal byte array of the same size in address space 3, and an
+  /// optimization remark is emitted.
+  ///
+  /// \returns true if at least one allocation was replaced.
+  bool replaceGlobalization() {
+    auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
+    auto &FreeRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];
+    // Address space in which the replacement buffers are created.
+    const unsigned SharedAddrSpace = 3;
+    bool Changed = false;
+
+    auto ReplaceAllocCalls = [&](Use &U, Function &) {
+      CallBase *CB = OpenMPOpt::getCallIfRegularCall(U, &RFI);
+      if (!CB)
+        return false;
+
+      // Both the size and the "one per team" flag must be known constants, and
+      // the flag must be set.
+      auto *AllocSize = dyn_cast<ConstantInt>(CB->getArgOperand(0));
+      auto *IsOnePerTeam = dyn_cast<ConstantInt>(CB->getArgOperand(1));
+      if (!AllocSize || !IsOnePerTeam || !IsOnePerTeam->getZExtValue())
+        return false;
+
+      const uint64_t NumBytes = AllocSize->getZExtValue();
+
+      // Allocations that are not larger than the threshold stay as they are.
+      if (NumBytes <= SharedMemoryThreshold)
+        return false;
+
+      // Do not exceed the total budget.
+      // NOTE(review): the running total is read from a STATISTIC counter.
+      // Confirm that counter is maintained in every build configuration; if
+      // it is not, the limit is effectively checked per allocation only and a
+      // dedicated counter should be used instead.
+      if (NumBytesMovedToSharedMemory + NumBytes > SharedMemoryLimit)
+        return false;
+
+      LLVM_DEBUG(dbgs() << TAG << "Replace globalization call in "
+                        << CB->getCaller()->getName() << " with " << NumBytes
+                        << " bytes of shared memory\n");
+
+      // Remove every free call that takes this allocation, not just the first
+      // one: the replacement buffer is a global and must never reach the
+      // runtime's free routine. Collect first so the user list is not modified
+      // while it is being walked. The null check keeps indirect calls (whose
+      // called function is null) from matching a missing declaration.
+      auto *FreeFn = FreeRFI.Declaration;
+      SmallVector<CallBase *, 4> FreeCalls;
+      for (User *Usr : CB->users())
+        if (auto *FC = dyn_cast<CallBase>(Usr))
+          if (FreeFn && FC->getCalledFunction() == FreeFn)
+            FreeCalls.push_back(FC);
+      for (CallBase *FC : FreeCalls)
+        FC->eraseFromParent();
+
+      // Create a new shared memory buffer of the same size as the allocation
+      // and replace all the uses of the original allocation with it.
+      Type *Int8Ty = Type::getInt8Ty(M.getContext());
+      Type *Int8ArrTy = ArrayType::get(Int8Ty, NumBytes);
+      auto *SharedMem = new GlobalVariable(
+          M, Int8ArrTy, /* IsConstant */ false, GlobalValue::InternalLinkage,
+          UndefValue::get(Int8ArrTy), CB->getName(), nullptr,
+          GlobalValue::NotThreadLocal, SharedAddrSpace);
+
+      // Pointer to the first byte of the buffer, cast back to the generic
+      // i8* type the original call returned.
+      auto *NullInt = Constant::getNullValue(Type::getInt64Ty(M.getContext()));
+      auto *GEPExpr = ConstantExpr::getGetElementPtr(
+          Int8ArrTy, SharedMem, SmallVector<Constant *, 2>({NullInt, NullInt}));
+
+      auto *NewBuffer = new AddrSpaceCastInst(GEPExpr, Int8Ty->getPointerTo(),
+                                              CB->getName() + "_shared", CB);
+
+      NewBuffer->setDebugLoc(CB->getDebugLoc());
+      CB->replaceAllUsesWith(NewBuffer);
+      CB->eraseFromParent();
+
+      auto Remark = [&](OptimizationRemark OR) {
+        return OR << "Replaced globalized variable with "
+                  << ore::NV("SharedMemory", NumBytes)
+                  << ((NumBytes != 1) ? " bytes" : " byte")
+                  << " of shared memory";
+      };
+      emitRemark<OptimizationRemark>(NewBuffer, "OpenMPReplaceGlobalization",
+                                     Remark);
+
+      NumBytesMovedToSharedMemory += NumBytes;
+      Changed = true;
+
+      return false;
+    };
+    RFI.foreachUse(SCC, ReplaceAllocCalls);
+
+    // Refresh the cached uses of both runtime functions once, after the walk,
+    // so the use list driving foreachUse is never rebuilt while it is being
+    // iterated.
+    if (Changed) {
+      OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_alloc_shared);
+      OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_free_shared);
+    }
+
+    return Changed;
+  }
+
+  /// Try to delete parallel regions if possible.
bool deleteParallelRegions() { const unsigned CallbackCalleeOperand = 2;
diff --git a/llvm/test/Transforms/OpenMP/replace_globalization.ll b/llvm/test/Transforms/OpenMP/replace_globalization.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/OpenMP/replace_globalization.ll
@@ -0,0 +1,78 @@
+; RUN: opt -S -attributor -openmpopt -openmp-shared-limit=256 -openmp-shared-threshold=2 < %s | FileCheck %s
+; RUN: opt -S -passes='attributor,cgscc(openmpopt)' -openmp-shared-limit=256 -openmp-shared-threshold=2 < %s | FileCheck %s
+; RUN: opt -openmpopt -pass-remarks=openmp-opt -disable-output < %s 2>&1 | FileCheck %s -check-prefix=CHECK-REMARKS
+; RUN: opt -passes=openmpopt -pass-remarks=openmp-opt -disable-output < %s 2>&1 | FileCheck %s -check-prefix=CHECK-REMARKS
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64"
+
+; CHECK-REMARKS: remark: replace_globalization.c:5:7: Replaced globalized variable with 16 bytes of shared memory
+; CHECK: @{{.*}} = internal addrspace(3) global [{{[0-9]+}} x i8] undef
+
+; CHECK: {{.*}} = call i8* @__kmpc_alloc_shared({{.*}})
+; CHECK: call void @__kmpc_free_shared({{.*}})
+define void @foo() { ; requests 4 bytes with a second argument of 0; the checks above expect both runtime calls to remain
+entry:
+  %x = call i8* @__kmpc_alloc_shared(i64 4, i16 0)
+  %x_on_stack = bitcast i8* %x to i32*
+  %0 = bitcast i32* %x_on_stack to i8*
+  call void @use(i8* %0)
+  call void @__kmpc_free_shared(i8* %x)
+  ret void
+}
+
+; CHECK: {{.*}} = addrspacecast i8 addrspace(3)* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8] addrspace(3)* @x, i64 0, i64 0) to i8*, !dbg !{{[0-9]+}}
+define void @bar() { ; requests 16 bytes with a second argument of 1; the check above expects a cast of global @x that keeps a !dbg attachment
+entry:
+  %x = call i8* @__kmpc_alloc_shared(i64 16, i16 1), !dbg !8
+  %x_on_stack = bitcast i8* %x to [4 x i32]*
+  %0 = bitcast [4 x i32]* %x_on_stack to i8*
+  call void @use(i8* %0)
+  call void @__kmpc_free_shared(i8* %x)
+  ret void
+}
+
+; CHECK: {{.*}} = call i8* @__kmpc_alloc_shared({{.*}})
+; CHECK: {{.*}} = call i8* @__kmpc_alloc_shared({{.*}})
+; CHECK: call void @__kmpc_free_shared({{.*}})
+; CHECK: call void @__kmpc_free_shared({{.*}})
+define void @baz() { ; requests 2 and 256 bytes (the threshold and limit values passed on the first two run lines); all four runtime calls are expected to remain
+entry:
+  %x = call i8* @__kmpc_alloc_shared(i64 2, i16 1)
+  %y = call i8* @__kmpc_alloc_shared(i64 256, i16 1)
+  %x_on_stack = bitcast i8* %x to i16*
+  %y_on_stack = bitcast i8* %y to [64 x i32]*
+  %0 = bitcast i16* %x_on_stack to i8*
+  %1 = bitcast [64 x i32]* %y_on_stack to i8*
+  call void @use(i8* %0)
+  call void @use(i8* %1)
+  call void @__kmpc_free_shared(i8* %x)
+  call void @__kmpc_free_shared(i8* %y)
+  ret void
+}
+
+
+define void @use(i8* %x) { ; helper that stores its pointer argument into a local slot, giving each allocation a user
+entry:
+  %.addr = alloca i8*
+  store i8* %x, i8** %.addr
+  ret void
+}
+
+; CHECK: declare i8* @__kmpc_alloc_shared({{.*}})
+declare i8* @__kmpc_alloc_shared(i64, i16)
+
+; CHECK: declare void @__kmpc_free_shared({{.*}})
+declare void @__kmpc_free_shared(i8*)
+
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 12.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "replace_globalization.c", directory: "/tmp/replace_globalization.c")
+!2 = !{}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!6 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 1, type: !7, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
+!7 = !DISubroutineType(types: !2)
+!8 = !DILocation(line: 5, column: 7, scope: !6) ; line 5, column 7: the position named in the expected remark