diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -129,6 +129,11 @@ cl::desc("Maximal number of attributor iterations."), cl::init(256)); +static cl::opt<unsigned> + SharedMemoryLimit("openmp-opt-shared-limit", cl::Hidden, + cl::desc("Maximum amount of shared memory to use."), + cl::init(std::numeric_limits<unsigned>::max())); + STATISTIC(NumOpenMPRuntimeCallsDeduplicated, "Number of OpenMP runtime calls deduplicated"); STATISTIC(NumOpenMPParallelRegionsDeleted, @@ -3000,6 +3005,15 @@ auto *AllocSize = cast(CB->getArgOperand(0)); + // Skip the replacement if it would exceed the shared memory limit. + if (AllocSize->getZExtValue() + SharedMemoryUsed > SharedMemoryLimit) { + LLVM_DEBUG(dbgs() << TAG << "Cannot replace call " << *CB + << " with shared memory." + << " Shared memory usage is limited to " + << SharedMemoryLimit << " bytes\n"); + continue; + } + LLVM_DEBUG(dbgs() << TAG << "Replace globalization call " << *CB << " with " << AllocSize->getZExtValue() << " bytes of shared memory\n"); @@ -3034,7 +3048,9 @@ A.deleteAfterManifest(*CB); A.deleteAfterManifest(*FreeCalls.front()); + // Count against the limit here; the statistic keeps a running total. + SharedMemoryUsed += AllocSize->getZExtValue(); NumBytesMovedToSharedMemory += AllocSize->getZExtValue(); Changed = ChangeStatus::CHANGED; } @@ -3070,6 +3086,8 @@ SmallSetVector MallocCalls; /// Collection of potentially removed free calls in a function. SmallPtrSet PotentialRemovedFreeCalls; + /// The total number of bytes of shared memory used for HeapToShared so far. 
+ unsigned SharedMemoryUsed = 0; }; struct AAKernelInfo : public StateWrapper { diff --git a/llvm/test/Transforms/OpenMP/replace_globalization.ll b/llvm/test/Transforms/OpenMP/replace_globalization.ll --- a/llvm/test/Transforms/OpenMP/replace_globalization.ll +++ b/llvm/test/Transforms/OpenMP/replace_globalization.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals --include-generated-funcs ; RUN: opt -S -passes='openmp-opt' < %s | FileCheck %s ; RUN: opt -passes=openmp-opt -pass-remarks=openmp-opt -disable-output < %s 2>&1 | FileCheck %s -check-prefix=CHECK-REMARKS +; RUN: opt -passes=openmp-opt -pass-remarks=openmp-opt -pass-remarks-missed=openmp-opt -disable-output -openmp-opt-shared-limit=4 < %s 2>&1 | FileCheck %s -check-prefix=CHECK-LIMIT target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" target triple = "nvptx64" @@ -8,6 +9,8 @@ ; CHECK-REMARKS: remark: replace_globalization.c:5:7: Replaced globalized variable with 16 bytes of shared memory ; CHECK-REMARKS: remark: replace_globalization.c:5:14: Replaced globalized variable with 4 bytes of shared memory ; CHECK-REMARKS-NOT: 6 bytes +; CHECK-LIMIT: remark: replace_globalization.c:5:14: Replaced globalized variable with 4 bytes of shared memory +; CHECK-LIMIT: remark: replace_globalization.c:5:7: Found thread data sharing on the GPU. Expect degraded performance due to data globalization ; UTC_ARGS: --enable %struct.ident_t = type { i32, i32, i32, i32, i8* }