diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -372,6 +372,8 @@ unsigned getAssumedAddrSpace(const Value *V) const; + bool isSingleThreaded() const; + std::pair getPredicatedAddrSpace(const Value *V) const; @@ -1581,6 +1583,7 @@ virtual bool canHaveNonUndefGlobalInitializerInAddressSpace(unsigned AS) const = 0; virtual unsigned getAssumedAddrSpace(const Value *V) const = 0; + virtual bool isSingleThreaded() const = 0; virtual std::pair getPredicatedAddrSpace(const Value *V) const = 0; virtual Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, @@ -1959,6 +1962,8 @@ return Impl.getAssumedAddrSpace(V); } + bool isSingleThreaded() const override { return Impl.isSingleThreaded(); } + std::pair getPredicatedAddrSpace(const Value *V) const override { return Impl.getPredicatedAddrSpace(V); diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -108,6 +108,8 @@ unsigned getAssumedAddrSpace(const Value *V) const { return -1; } + bool isSingleThreaded() const { return false; } + std::pair getPredicatedAddrSpace(const Value *V) const { return std::make_pair(nullptr, -1); diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -47,6 +47,7 @@ #include "llvm/Support/MachineValueType.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" #include #include #include @@ -287,6 +288,11 @@ return getTLI()->getTargetMachine().getAssumedAddrSpace(V); } + bool isSingleThreaded() const { + return 
getTLI()->getTargetMachine().Options.ThreadModel == + ThreadModel::Single; + } + std::pair getPredicatedAddrSpace(const Value *V) const { return getTLI()->getTargetMachine().getPredicatedAddrSpace(V); diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -210,8 +210,9 @@ const SmallSetVector &, SmallVectorImpl &, SmallVectorImpl &, SmallVectorImpl &, PredIteratorCache &, LoopInfo *, DominatorTree *, AssumptionCache *AC, - const TargetLibraryInfo *, Loop *, MemorySSAUpdater &, ICFLoopSafetyInfo *, - OptimizationRemarkEmitter *, bool AllowSpeculation); + const TargetLibraryInfo *, TargetTransformInfo *, Loop *, + MemorySSAUpdater &, ICFLoopSafetyInfo *, OptimizationRemarkEmitter *, + bool AllowSpeculation); /// Does a BFS from a given node to all of its children inside a given loop. /// The returned vector of nodes includes the starting point. 
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -273,6 +273,10 @@ return TTIImpl->getAssumedAddrSpace(V); } +bool TargetTransformInfo::isSingleThreaded() const { + return TTIImpl->isSingleThreaded(); +} + std::pair TargetTransformInfo::getPredicatedAddrSpace(const Value *V) const { return TTIImpl->getPredicatedAddrSpace(V); diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -76,6 +76,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/AssumeBundleBuilder.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -112,6 +113,10 @@ "licm-control-flow-hoisting", cl::Hidden, cl::init(false), cl::desc("Enable control flow (and PHI) hoisting in LICM")); +static cl::opt<bool> + SingleThread("licm-force-thread-model-single", cl::Hidden, cl::init(false), + cl::desc("Force thread model single in LICM pass")); + static cl::opt MaxNumUsesTraversed( "licm-max-num-uses-traversed", cl::Hidden, cl::init(8), cl::desc("Max num uses visited for identifying load " @@ -489,7 +494,8 @@ collectPromotionCandidates(MSSA, AA, L)) { LocalPromoted |= promoteLoopAccessesToScalars( PointerMustAliases, ExitBlocks, InsertPts, MSSAInsertPts, PIC, LI, - DT, AC, TLI, L, MSSAU, &SafetyInfo, ORE, LicmAllowSpeculation); + DT, AC, TLI, TTI, L, MSSAU, &SafetyInfo, ORE, + LicmAllowSpeculation); } Promoted |= LocalPromoted; } while (LocalPromoted); @@ -1911,17 +1917,21 @@ if (auto *A = dyn_cast(Object)) return A->hasByValAttr(); + if (auto *G = dyn_cast<GlobalVariable>(Object)) + return !G->isConstant(); + // TODO: Noalias has nothing to do with writability, this should check for // an allocator 
function. return isNoAliasCall(Object); } -bool isThreadLocalObject(const Value *Object, const Loop *L, - DominatorTree *DT) { - // The object must be function-local to start with, and then not captured - // before/in the loop. - return isIdentifiedFunctionLocal(Object) && - isNotCapturedBeforeOrInLoop(Object, L, DT); +bool isThreadLocalObject(const Value *Object, const Loop *L, DominatorTree *DT, + TargetTransformInfo *TTI) { + // No other thread can observe the object if the program is single-threaded, + // or if the object is function-local and not captured before/in the loop. + return SingleThread || (TTI && TTI->isSingleThreaded()) || + (isIdentifiedFunctionLocal(Object) && + isNotCapturedBeforeOrInLoop(Object, L, DT)); } } // namespace @@ -1937,9 +1947,9 @@ SmallVectorImpl &InsertPts, SmallVectorImpl &MSSAInsertPts, PredIteratorCache &PIC, LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC, - const TargetLibraryInfo *TLI, Loop *CurLoop, MemorySSAUpdater &MSSAU, - ICFLoopSafetyInfo *SafetyInfo, OptimizationRemarkEmitter *ORE, - bool AllowSpeculation) { + const TargetLibraryInfo *TLI, TargetTransformInfo *TTI, Loop *CurLoop, + MemorySSAUpdater &MSSAU, ICFLoopSafetyInfo *SafetyInfo, + OptimizationRemarkEmitter *ORE, bool AllowSpeculation) { // Verify inputs. assert(LI != nullptr && DT != nullptr && CurLoop != nullptr && SafetyInfo != nullptr && @@ -2150,7 +2160,8 @@ // violating the memory model. 
if (StoreSafety == StoreSafetyUnknown) { Value *Object = getUnderlyingObject(SomePtr); - if (isWritableObject(Object) && isThreadLocalObject(Object, CurLoop, DT)) + if (isWritableObject(Object) && + isThreadLocalObject(Object, CurLoop, DT, TTI)) StoreSafety = StoreSafe; } diff --git a/llvm/test/Transforms/LICM/promote-single-thread.ll b/llvm/test/Transforms/LICM/promote-single-thread.ll --- a/llvm/test/Transforms/LICM/promote-single-thread.ll +++ b/llvm/test/Transforms/LICM/promote-single-thread.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -licm < %s | FileCheck %s +; RUN: opt -S -licm < %s | FileCheck %s --check-prefixes=CHECK,MT +; RUN: opt -S -licm -licm-force-thread-model-single < %s | FileCheck %s --check-prefixes=CHECK,ST @g = external global i32 @c = external constant i32 @@ -10,22 +11,40 @@ ; mode only loads can be promoted, as a different thread might write to the ; global. define void @promote_global(i1 %c, i1 %c2) { -; CHECK-LABEL: @promote_global( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[G_PROMOTED:%.*]] = load i32, ptr @g, align 4 -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[V_INC2:%.*]] = phi i32 [ [[V_INC1:%.*]], [[LATCH:%.*]] ], [ [[G_PROMOTED]], [[ENTRY:%.*]] ] -; CHECK-NEXT: br i1 [[C:%.*]], label [[IF:%.*]], label [[LATCH]] -; CHECK: if: -; CHECK-NEXT: [[V_INC:%.*]] = add i32 [[V_INC2]], 1 -; CHECK-NEXT: store i32 [[V_INC]], ptr @g, align 4 -; CHECK-NEXT: br label [[LATCH]] -; CHECK: latch: -; CHECK-NEXT: [[V_INC1]] = phi i32 [ [[V_INC]], [[IF]] ], [ [[V_INC2]], [[LOOP]] ] -; CHECK-NEXT: br i1 [[C2:%.*]], label [[EXIT:%.*]], label [[LOOP]] -; CHECK: exit: -; CHECK-NEXT: ret void +; MT-LABEL: @promote_global( +; MT-NEXT: entry: +; MT-NEXT: [[G_PROMOTED:%.*]] = load i32, ptr @g, align 4 +; MT-NEXT: br label [[LOOP:%.*]] +; MT: loop: +; MT-NEXT: [[V_INC2:%.*]] = phi i32 [ [[V_INC1:%.*]], [[LATCH:%.*]] ], [ [[G_PROMOTED]], [[ENTRY:%.*]] ] +; MT-NEXT: br i1 
[[C:%.*]], label [[IF:%.*]], label [[LATCH]] +; MT: if: +; MT-NEXT: [[V_INC:%.*]] = add i32 [[V_INC2]], 1 +; MT-NEXT: store i32 [[V_INC]], ptr @g, align 4 +; MT-NEXT: br label [[LATCH]] +; MT: latch: +; MT-NEXT: [[V_INC1]] = phi i32 [ [[V_INC]], [[IF]] ], [ [[V_INC2]], [[LOOP]] ] +; MT-NEXT: br i1 [[C2:%.*]], label [[EXIT:%.*]], label [[LOOP]] +; MT: exit: +; MT-NEXT: ret void +; +; ST-LABEL: @promote_global( +; ST-NEXT: entry: +; ST-NEXT: [[G_PROMOTED:%.*]] = load i32, ptr @g, align 4 +; ST-NEXT: br label [[LOOP:%.*]] +; ST: loop: +; ST-NEXT: [[V_INC2:%.*]] = phi i32 [ [[V_INC1:%.*]], [[LATCH:%.*]] ], [ [[G_PROMOTED]], [[ENTRY:%.*]] ] +; ST-NEXT: br i1 [[C:%.*]], label [[IF:%.*]], label [[LATCH]] +; ST: if: +; ST-NEXT: [[V_INC:%.*]] = add i32 [[V_INC2]], 1 +; ST-NEXT: br label [[LATCH]] +; ST: latch: +; ST-NEXT: [[V_INC1]] = phi i32 [ [[V_INC]], [[IF]] ], [ [[V_INC2]], [[LOOP]] ] +; ST-NEXT: br i1 [[C2:%.*]], label [[EXIT:%.*]], label [[LOOP]] +; ST: exit: +; ST-NEXT: [[V_INC1_LCSSA:%.*]] = phi i32 [ [[V_INC1]], [[LATCH]] ] +; ST-NEXT: store i32 [[V_INC1_LCSSA]], ptr @g, align 4 +; ST-NEXT: ret void ; entry: br label %loop @@ -87,24 +106,44 @@ ; mode only loads can be promoted, as a different thread might write to the ; captured alloca. 
define void @promote_captured_alloca(i1 %c, i1 %c2) { -; CHECK-LABEL: @promote_captured_alloca( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 -; CHECK-NEXT: call void @capture(ptr [[A]]) -; CHECK-NEXT: [[A_PROMOTED:%.*]] = load i32, ptr [[A]], align 4 -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[V_INC2:%.*]] = phi i32 [ [[V_INC1:%.*]], [[LATCH:%.*]] ], [ [[A_PROMOTED]], [[ENTRY:%.*]] ] -; CHECK-NEXT: br i1 [[C:%.*]], label [[IF:%.*]], label [[LATCH]] -; CHECK: if: -; CHECK-NEXT: [[V_INC:%.*]] = add i32 [[V_INC2]], 1 -; CHECK-NEXT: store i32 [[V_INC]], ptr [[A]], align 4 -; CHECK-NEXT: br label [[LATCH]] -; CHECK: latch: -; CHECK-NEXT: [[V_INC1]] = phi i32 [ [[V_INC]], [[IF]] ], [ [[V_INC2]], [[LOOP]] ] -; CHECK-NEXT: br i1 [[C2:%.*]], label [[EXIT:%.*]], label [[LOOP]] -; CHECK: exit: -; CHECK-NEXT: ret void +; MT-LABEL: @promote_captured_alloca( +; MT-NEXT: entry: +; MT-NEXT: [[A:%.*]] = alloca i32, align 4 +; MT-NEXT: call void @capture(ptr [[A]]) +; MT-NEXT: [[A_PROMOTED:%.*]] = load i32, ptr [[A]], align 4 +; MT-NEXT: br label [[LOOP:%.*]] +; MT: loop: +; MT-NEXT: [[V_INC2:%.*]] = phi i32 [ [[V_INC1:%.*]], [[LATCH:%.*]] ], [ [[A_PROMOTED]], [[ENTRY:%.*]] ] +; MT-NEXT: br i1 [[C:%.*]], label [[IF:%.*]], label [[LATCH]] +; MT: if: +; MT-NEXT: [[V_INC:%.*]] = add i32 [[V_INC2]], 1 +; MT-NEXT: store i32 [[V_INC]], ptr [[A]], align 4 +; MT-NEXT: br label [[LATCH]] +; MT: latch: +; MT-NEXT: [[V_INC1]] = phi i32 [ [[V_INC]], [[IF]] ], [ [[V_INC2]], [[LOOP]] ] +; MT-NEXT: br i1 [[C2:%.*]], label [[EXIT:%.*]], label [[LOOP]] +; MT: exit: +; MT-NEXT: ret void +; +; ST-LABEL: @promote_captured_alloca( +; ST-NEXT: entry: +; ST-NEXT: [[A:%.*]] = alloca i32, align 4 +; ST-NEXT: call void @capture(ptr [[A]]) +; ST-NEXT: [[A_PROMOTED:%.*]] = load i32, ptr [[A]], align 4 +; ST-NEXT: br label [[LOOP:%.*]] +; ST: loop: +; ST-NEXT: [[V_INC2:%.*]] = phi i32 [ [[V_INC1:%.*]], [[LATCH:%.*]] ], [ [[A_PROMOTED]], [[ENTRY:%.*]] ] +; 
ST-NEXT: br i1 [[C:%.*]], label [[IF:%.*]], label [[LATCH]] +; ST: if: +; ST-NEXT: [[V_INC:%.*]] = add i32 [[V_INC2]], 1 +; ST-NEXT: br label [[LATCH]] +; ST: latch: +; ST-NEXT: [[V_INC1]] = phi i32 [ [[V_INC]], [[IF]] ], [ [[V_INC2]], [[LOOP]] ] +; ST-NEXT: br i1 [[C2:%.*]], label [[EXIT:%.*]], label [[LOOP]] +; ST: exit: +; ST-NEXT: [[V_INC1_LCSSA:%.*]] = phi i32 [ [[V_INC1]], [[LATCH]] ] +; ST-NEXT: store i32 [[V_INC1_LCSSA]], ptr [[A]], align 4 +; ST-NEXT: ret void ; entry: %a = alloca i32