diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -26,6 +26,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsNVPTX.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/IPO.h"
@@ -34,6 +35,7 @@
 #include "llvm/Transforms/Utils/CallGraphUpdater.h"
 #include "llvm/Transforms/Utils/CodeExtractor.h"
 
+using namespace llvm::PatternMatch;
 using namespace llvm;
 using namespace omp;
 
@@ -75,6 +77,8 @@
     "Number of OpenMP parallel regions replaced with ID in GPU state machines");
STATISTIC(NumOpenMPParallelRegionsMerged,
          "Number of OpenMP parallel regions merged");
+STATISTIC(NumBytesMovedToSharedMemory,
+          "Amount of memory pushed to shared memory");
 
 #if !defined(NDEBUG)
 static constexpr auto TAG = "[" DEBUG_TYPE "]";
@@ -82,6 +86,16 @@
 
 namespace {
 
+enum class AddressSpace : unsigned {
+  Generic = 0,
+  Global = 1,
+  Shared = 3,
+  Constant = 4,
+  Local = 5,
+};
+
+struct AAHeapToShared;
+
 struct AAICVTracker;
 
 /// OpenMP specific information. For now, stores RFIs and ICVs also needed for
@@ -512,6 +526,9 @@
     if (IsModulePass) {
       Changed |= runAttributor();
 
+      // Recollect uses, in case Attributor deleted any.
+      OMPInfoCache.recollectUses();
+
       if (remarksEnabled())
         analysisGlobalization();
     } else {
@@ -1122,28 +1139,23 @@
   }
 
   void analysisGlobalization() {
-    RuntimeFunction GlobalizationRuntimeIDs[] = {OMPRTL___kmpc_alloc_shared,
-                                                 OMPRTL___kmpc_free_shared};
-
-    for (const auto GlobalizationCallID : GlobalizationRuntimeIDs) {
-      auto &RFI = OMPInfoCache.RFIs[GlobalizationCallID];
-
-      auto CheckGlobalization = [&](Use &U, Function &Decl) {
-        if (CallInst *CI = getCallIfRegularCall(U, &RFI)) {
-          auto Remark = [&](OptimizationRemarkAnalysis ORA) {
-            return ORA
-                   << "Found thread data sharing on the GPU. "
-                   << "Expect degraded performance due to data globalization.";
-          };
-          emitRemark<OptimizationRemarkAnalysis>(CI, "OpenMPGlobalization",
-                                                 Remark);
-        }
-
-        return false;
-      };
-
-      RFI.foreachUse(SCC, CheckGlobalization);
-    }
+    auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
+
+    auto CheckGlobalization = [&](Use &U, Function &Decl) {
+      if (CallInst *CI = getCallIfRegularCall(U, &RFI)) {
+        auto Remark = [&](OptimizationRemarkAnalysis ORA) {
+          return ORA
+                 << "Found thread data sharing on the GPU. "
+                 << "Expect degraded performance due to data globalization.";
+        };
+        emitRemark<OptimizationRemarkAnalysis>(CI, "OpenMPGlobalization",
+                                               Remark);
+      }
+
+      return false;
+    };
+
+    RFI.foreachUse(SCC, CheckGlobalization);
   }
 
   /// Maps the values stored in the offload arrays passed as arguments to
@@ -1604,6 +1616,12 @@
       GetterRFI.foreachUse(SCC, CreateAA);
     }
+    auto &GlobalizationRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
+    auto CreateAA = [&](Use &U, Function &F) {
+      A.getOrCreateAAFor<AAHeapToShared>(IRPosition::function(F));
+      return false;
+    };
+    GlobalizationRFI.foreachUse(SCC, CreateAA);
 
     for (auto &F : M) {
       if (!F.isDeclaration())
@@ -2321,7 +2339,7 @@
     // a constant zero.
     // TODO: Use AAValueSimplify to simplify and propogate constants.
     // TODO: Check more than a single use for thread ID's.
-    auto IsSingleThreadOnly = [&](BranchInst *Edge, BasicBlock *SuccessorBB) {
+    auto IsInitialThreadOnly = [&](BranchInst *Edge, BasicBlock *SuccessorBB) {
       if (!Edge || !Edge->isConditional())
         return false;
       if (Edge->getSuccessor(0) != SuccessorBB)
         return false;
@@ -2331,6 +2349,21 @@
       if (!Cmp || !Cmp->isTrueWhenEqual() || !Cmp->isEquality())
         return false;
 
+      // Temporarily match the pattern generated by clang for teams regions.
+      // TODO: Remove this once the new runtime is in place.
+      ConstantInt *One, *NegOne;
+      CmpInst::Predicate Pred;
+      auto &&m_ThreadID = m_Intrinsic<Intrinsic::nvvm_read_ptx_sreg_tid_x>();
+      auto &&m_WarpSize = m_Intrinsic<Intrinsic::nvvm_read_ptx_sreg_warpsize>();
+      auto &&m_BlockSize = m_Intrinsic<Intrinsic::nvvm_read_ptx_sreg_ntid_x>();
+      if (match(Cmp, m_Cmp(Pred, m_ThreadID,
+                           m_And(m_Sub(m_BlockSize, m_ConstantInt(One)),
+                                 m_Xor(m_Sub(m_WarpSize, m_ConstantInt(One)),
+                                       m_ConstantInt(NegOne))))))
+        if (One->isOne() && NegOne->isMinusOne() &&
+            Pred == CmpInst::Predicate::ICMP_EQ)
+          return true;
+
       ConstantInt *C = dyn_cast<ConstantInt>(Cmp->getOperand(1));
       if (!C || !C->isZero())
         return false;
@@ -2351,15 +2384,15 @@
       if (pred_begin(BB) == pred_end(BB))
         return SingleThreadedBBs.contains(BB);
 
-      bool IsSingleThreaded = true;
+      bool IsInitialThread = true;
       for (auto PredBB = pred_begin(BB), PredEndBB = pred_end(BB);
            PredBB != PredEndBB; ++PredBB) {
-        if (!IsSingleThreadOnly(dyn_cast<BranchInst>((*PredBB)->getTerminator()),
+        if (!IsInitialThreadOnly(dyn_cast<BranchInst>((*PredBB)->getTerminator()),
                                 BB))
-          IsSingleThreaded &= SingleThreadedBBs.contains(*PredBB);
+          IsInitialThread &= SingleThreadedBBs.contains(*PredBB);
       }
 
-      return IsSingleThreaded;
+      return IsInitialThread;
     };
 
     for (auto *BB : RPOT) {
@@ -2372,10 +2405,145 @@
                : ChangeStatus::CHANGED;
 }
 
+/// Try to replace memory allocation calls called by a single thread with a
+/// static buffer of shared memory.
+struct AAHeapToShared : public StateWrapper<BooleanState, AbstractAttribute> {
+  using Base = StateWrapper<BooleanState, AbstractAttribute>;
+  AAHeapToShared(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
+
+  /// Create an abstract attribute view for the position \p IRP.
+  static AAHeapToShared &createForPosition(const IRPosition &IRP,
+                                           Attributor &A);
+
+  /// See AbstractAttribute::getName().
+  const std::string getName() const override { return "AAHeapToShared"; }
+
+  /// See AbstractAttribute::getIdAddr().
+  const char *getIdAddr() const override { return &ID; }
+
+  /// This function should return true if the type of the \p AA is
+  /// AAHeapToShared.
+  static bool classof(const AbstractAttribute *AA) {
+    return (AA->getIdAddr() == &ID);
+  }
+
+  /// Unique ID (due to the unique address)
+  static const char ID;
+};
+
+struct AAHeapToSharedFunction : public AAHeapToShared {
+  AAHeapToSharedFunction(const IRPosition &IRP, Attributor &A)
+      : AAHeapToShared(IRP, A) {}
+
+  const std::string getAsStr() const override {
+    return "[AAHeapToShared] " + std::to_string(MallocCalls.size()) +
+           " malloc calls eligible.";
+  }
+
+  /// See AbstractAttribute::trackStatistics().
+  void trackStatistics() const override {}
+
+  void initialize(Attributor &A) override {
+    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
+    auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
+
+    for (User *U : RFI.Declaration->users())
+      if (CallBase *CB = dyn_cast<CallBase>(U))
+        MallocCalls.insert(CB);
+  }
+
+  ChangeStatus manifest(Attributor &A) override {
+    if (MallocCalls.empty())
+      return ChangeStatus::UNCHANGED;
+
+    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
+    auto &FreeCall = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];
+
+    Function *F = getAnchorScope();
+    auto *HS = A.lookupAAFor<AAHeapToStack>(IRPosition::function(*F), this,
+                                            DepClassTy::OPTIONAL);
+
+    ChangeStatus Changed = ChangeStatus::UNCHANGED;
+    for (CallBase *CB : MallocCalls) {
+      // Skip replacing this if HeapToStack has already claimed it.
+      if (HS && HS->isKnownHeapToStack(*CB))
+        continue;
+
+      // Find the unique free call to remove it.
+      SmallVector<CallBase *, 4> FreeCalls;
+      for (auto *U : CB->users()) {
+        CallBase *C = dyn_cast<CallBase>(U);
+        if (C && C->getCalledFunction() == FreeCall.Declaration)
+          FreeCalls.push_back(C);
+      }
+      if (FreeCalls.size() != 1)
+        continue;
+
+      ConstantInt *AllocSize = dyn_cast<ConstantInt>(CB->getArgOperand(0));
+
+      LLVM_DEBUG(dbgs() << TAG << "Replace globalization call in "
+                        << CB->getCaller()->getName() << " with "
+                        << AllocSize->getZExtValue()
+                        << " bytes of shared memory\n");
+
+      // Create a new shared memory buffer of the same size as the allocation
+      // and replace all the uses of the original allocation with it.
+      Module *M = CB->getModule();
+      Type *Int8Ty = Type::getInt8Ty(M->getContext());
+      Type *Int8ArrTy = ArrayType::get(Int8Ty, AllocSize->getZExtValue());
+      auto *SharedMem = new GlobalVariable(
+          *M, Int8ArrTy, /* IsConstant */ false, GlobalValue::InternalLinkage,
+          UndefValue::get(Int8ArrTy), CB->getName(), nullptr,
+          GlobalValue::NotThreadLocal,
+          static_cast<unsigned>(AddressSpace::Shared));
+      auto *NewBuffer =
+          ConstantExpr::getPointerCast(SharedMem, Int8Ty->getPointerTo());
+
+      SharedMem->setAlignment(MaybeAlign(32));
+
+      A.changeValueAfterManifest(*CB, *NewBuffer);
+      A.deleteAfterManifest(*CB);
+      A.deleteAfterManifest(*FreeCalls.front());
+
+      NumBytesMovedToSharedMemory += AllocSize->getZExtValue();
+      Changed = ChangeStatus::CHANGED;
+    }
+
+    return Changed;
+  }
+
+  ChangeStatus updateImpl(Attributor &A) override {
+    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
+    auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
+    Function *F = getAnchorScope();
+
+    auto NumMallocCalls = MallocCalls.size();
+
+    // Only consider malloc calls executed by a single thread with a constant.
+    for (User *U : RFI.Declaration->users()) {
+      const auto &ED = A.getAAFor<AAExecutionDomain>(
+          *this, IRPosition::function(*F), DepClassTy::REQUIRED);
+      if (CallBase *CB = dyn_cast<CallBase>(U))
+        if (!dyn_cast<ConstantInt>(CB->getArgOperand(0)) ||
+            !ED.isExecutedByInitialThreadOnly(*CB))
+          MallocCalls.erase(CB);
+    }
+
+    if (NumMallocCalls != MallocCalls.size())
+      return ChangeStatus::CHANGED;
+
+    return ChangeStatus::UNCHANGED;
+  }
+
+  /// Collection of all malloc calls in a function.
+  SmallPtrSet<CallBase *, 4> MallocCalls;
+};
+
 } // namespace
 
 const char AAICVTracker::ID = 0;
 const char AAExecutionDomain::ID = 0;
+const char AAHeapToShared::ID = 0;
 
 AAICVTracker &AAICVTracker::createForPosition(const IRPosition &IRP,
                                               Attributor &A) {
@@ -2424,6 +2592,27 @@
   return *AA;
 }
 
+AAHeapToShared &AAHeapToShared::createForPosition(const IRPosition &IRP,
+                                                  Attributor &A) {
+  AAHeapToSharedFunction *AA = nullptr;
+  switch (IRP.getPositionKind()) {
+  case IRPosition::IRP_INVALID:
+  case IRPosition::IRP_FLOAT:
+  case IRPosition::IRP_ARGUMENT:
+  case IRPosition::IRP_CALL_SITE_ARGUMENT:
+  case IRPosition::IRP_RETURNED:
+  case IRPosition::IRP_CALL_SITE_RETURNED:
+  case IRPosition::IRP_CALL_SITE:
+    llvm_unreachable(
+        "AAHeapToShared can only be created for function position!");
+  case IRPosition::IRP_FUNCTION:
+    AA = new (A.Allocator) AAHeapToSharedFunction(IRP, A);
+    break;
+  }
+
+  return *AA;
+}
+
 PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {
   if (!containsOpenMP(M, OMPInModule))
     return PreservedAnalyses::all();
diff --git a/llvm/test/Transforms/OpenMP/replace_globalization.ll b/llvm/test/Transforms/OpenMP/replace_globalization.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/OpenMP/replace_globalization.ll
@@ -0,0 +1,100 @@
+; RUN: opt -S -passes='openmp-opt' < %s | FileCheck %s
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64"
+
+; CHECK: [[SHARED_X:@.+]] = internal addrspace(3) global [16 x i8] undef
+; CHECK: [[SHARED_Y:@.+]] = internal addrspace(3) global [4 x i8] undef
+
+; CHECK: %{{.*}} = call i8* @__kmpc_alloc_shared({{.*}})
+; CHECK: call void @__kmpc_free_shared({{.*}})
+define dso_local void @foo() {
+entry:
+  %x = call i8* @__kmpc_alloc_shared(i64 4)
+  %x_on_stack = bitcast i8* %x to i32*
+  %0 = bitcast i32* %x_on_stack to i8*
+  call void @use(i8* %0)
+  call void @__kmpc_free_shared(i8* %x)
+  ret void
+}
+
+define void @bar() {
+  call void @baz()
+  call void @qux()
+  ret void
+}
+
+; CHECK: %{{.*}} = bitcast i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([16 x i8], [16 x i8] addrspace(3)* [[SHARED_X]], i32 0, i32 0) to i8*) to [4 x i32]*
+define internal void @baz() {
+entry:
+  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  %cmp = icmp eq i32 %tid, 0
+  br i1 %cmp, label %master, label %exit
+master:
+  %x = call i8* @__kmpc_alloc_shared(i64 16), !dbg !9
+  %x_on_stack = bitcast i8* %x to [4 x i32]*
+  %0 = bitcast [4 x i32]* %x_on_stack to i8*
+  call void @use(i8* %0)
+  call void @__kmpc_free_shared(i8* %x)
+  br label %exit
+exit:
+  ret void
+}
+
+; CHECK: %{{.*}} = bitcast i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* [[SHARED_Y]], i32 0, i32 0) to i8*) to [4 x i32]*
+define internal void @qux() {
+entry:
+  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  %ntid = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  %warpsize = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  %0 = sub nuw i32 %warpsize, 1
+  %1 = sub nuw i32 %ntid, 1
+  %2 = xor i32 %0, -1
+  %master_tid = and i32 %1, %2
+  %3 = icmp eq i32 %tid, %master_tid
+  br i1 %3, label %master, label %exit
+master:
+  %y = call i8* @__kmpc_alloc_shared(i64 4), !dbg !10
+  %y_on_stack = bitcast i8* %y to [4 x i32]*
+  %4 = bitcast [4 x i32]* %y_on_stack to i8*
+  call void @use(i8* %4)
+  call void @__kmpc_free_shared(i8* %y)
+  br label %exit
+exit:
+  ret void
+}
+
+
+define void @use(i8* %x) {
+entry:
+  %addr = alloca i8*
+  store i8* %x, i8** %addr
+  ret void
+}
+
+declare i8* @__kmpc_alloc_shared(i64)
+
+declare void @__kmpc_free_shared(i8*)
+
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+
+declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+
+declare i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!nvvm.annotations = !{!5, !6}
+
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 12.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "replace_globalization.c", directory: "/tmp/replace_globalization.c")
+!2 = !{}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = !{void ()* @foo, !"kernel", i32 1}
+!6 = !{void ()* @bar, !"kernel", i32 1}
+!7 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
+!8 = !DISubroutineType(types: !2)
+!9 = !DILocation(line: 5, column: 7, scope: !7)
+!10 = !DILocation(line: 5, column: 14, scope: !7)