Index: llvm/trunk/docs/LangRef.rst =================================================================== --- llvm/trunk/docs/LangRef.rst +++ llvm/trunk/docs/LangRef.rst @@ -4711,6 +4711,27 @@ !0 = !{!"llvm.loop.licm_versioning.disable"} +'``llvm.loop.distribute.enable``' Metadata +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Loop distribution allows splitting a loop into multiple loops. Currently, +this is only performed if the entire loop cannot be vectorized due to unsafe +memory dependencies. The transformation will attempt to isolate the unsafe +dependencies into their own loop. + +This metadata can be used to selectively enable or disable distribution of the +loop. The first operand is the string ``llvm.loop.distribute.enable`` and the +second operand is a bit. If the bit operand value is 1 distribution is +enabled. A value of 0 disables distribution: + +.. code-block:: llvm + + !0 = !{!"llvm.loop.distribute.enable", i1 0} + !1 = !{!"llvm.loop.distribute.enable", i1 1} + +This metadata should be used in conjunction with ``llvm.loop`` loop +identification metadata. + '``llvm.mem``' ^^^^^^^^^^^^^^^ Index: llvm/trunk/include/llvm/Transforms/Scalar.h =================================================================== --- llvm/trunk/include/llvm/Transforms/Scalar.h +++ llvm/trunk/include/llvm/Transforms/Scalar.h @@ -479,7 +479,10 @@ // // LoopDistribute - Distribute loops. // -FunctionPass *createLoopDistributePass(); +// ProcessAllLoopsByDefault instructs the pass to look for distribution +// opportunities in all loops unless -enable-loop-distribute or the +// llvm.loop.distribute.enable metadata overrides this default. 
+FunctionPass *createLoopDistributePass(bool ProcessAllLoopsByDefault); //===----------------------------------------------------------------------===// // Index: llvm/trunk/lib/Transforms/IPO/PassManagerBuilder.cpp =================================================================== --- llvm/trunk/lib/Transforms/IPO/PassManagerBuilder.cpp +++ llvm/trunk/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -95,10 +95,6 @@ "enable-loopinterchange", cl::init(false), cl::Hidden, cl::desc("Enable the new, experimental LoopInterchange Pass")); -static cl::opt<bool> EnableLoopDistribute( - "enable-loop-distribute", cl::init(false), cl::Hidden, - cl::desc("Enable the new, experimental LoopDistribution Pass")); - static cl::opt<bool> EnableNonLTOGlobalsModRef( "enable-non-lto-gmr", cl::init(true), cl::Hidden, cl::desc( @@ -480,9 +476,10 @@ MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1)); // Distribute loops to allow partial vectorization. I.e. isolate dependences - // into separate loop that would otherwise inhibit vectorization. - if (EnableLoopDistribute) - MPM.add(createLoopDistributePass()); + // into separate loop that would otherwise inhibit vectorization. This is + // currently only performed for loops marked with the metadata + // llvm.loop.distribute.enable=true or when -enable-loop-distribute is specified. 
+ MPM.add(createLoopDistributePass(/*ProcessAllLoopsByDefault=*/false)); MPM.add(createLoopVectorizePass(DisableUnrollLoops, LoopVectorize)); Index: llvm/trunk/lib/Transforms/Scalar/LoopDistribute.cpp =================================================================== --- llvm/trunk/lib/Transforms/Scalar/LoopDistribute.cpp +++ llvm/trunk/lib/Transforms/Scalar/LoopDistribute.cpp @@ -60,6 +60,19 @@ cl::desc("The maximum number of SCEV checks allowed for Loop " "Distribution")); +static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold( + "loop-distribute-scev-check-threshold-with-pragma", cl::init(128), + cl::Hidden, + cl::desc( + "The maximum number of SCEV checks allowed for Loop " + "Distribution for loop marked with #pragma loop distribute(enable)")); + +// Note that the initial value for this depends on whether the pass is invoked +// directly or from the optimization pipeline. +static cl::opt<bool> EnableLoopDistribute( + "enable-loop-distribute", cl::Hidden, + cl::desc("Enable the new, experimental LoopDistribution Pass")); + STATISTIC(NumLoopsDistributed, "Number of loops distributed"); namespace { @@ -576,7 +589,9 @@ public: LoopDistributeForLoop(Loop *L, LoopInfo *LI, const LoopAccessInfo &LAI, DominatorTree *DT, ScalarEvolution *SE) - : L(L), LI(LI), LAI(LAI), DT(DT), SE(SE) {} + : L(L), LI(LI), LAI(LAI), DT(DT), SE(SE) { + setForced(); + } /// \brief Try to distribute an inner-most loop. bool processLoop() { @@ -683,7 +698,9 @@ // Don't distribute the loop if we need too many SCEV run-time checks. const SCEVUnionPredicate &Pred = LAI.PSE.getUnionPredicate(); - if (Pred.getComplexity() > DistributeSCEVCheckThreshold) { + if (Pred.getComplexity() > (IsForced.getValueOr(false) + ? PragmaDistributeSCEVCheckThreshold + : DistributeSCEVCheckThreshold)) { DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n"); return false; } @@ -735,6 +752,13 @@ return true; } + /// \brief Return whether distribution is forced to be enabled/disabled for the loop. 
+ /// + /// If the optional has a value, it indicates whether distribution was forced + /// to be enabled (true) or disabled (false). If the optional has no value + /// distribution was not forced either way. + const Optional<bool> &isForced() const { return IsForced; } + private: /// \brief Filter out checks between pointers from the same partition. /// @@ -775,18 +799,47 @@ return Checks; } + /// \brief Check whether the loop metadata is forcing distribution to be + /// enabled/disabled. + void setForced() { + Optional<const MDOperand *> Value = + findStringMetadataForLoop(L, "llvm.loop.distribute.enable"); + if (!Value) + return; + + const MDOperand *Op = *Value; + assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata"); + IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue(); + } + // Analyses used. Loop *L; LoopInfo *LI; const LoopAccessInfo &LAI; DominatorTree *DT; ScalarEvolution *SE; + + /// \brief Indicates whether distribution is forced to be enabled/disabled for + /// the loop. + /// + /// If the optional has a value, it indicates whether distribution was forced + /// to be enabled (true) or disabled (false). If the optional has no value + /// distribution was not forced either way. + Optional<bool> IsForced; }; /// \brief The pass class. class LoopDistribute : public FunctionPass { public: - LoopDistribute() : FunctionPass(ID) { + /// \p ProcessAllLoopsByDefault specifies whether loop distribution should be + /// performed by default. Pass -enable-loop-distribute={0,1} overrides this + /// default. We use this to keep LoopDistribution off by default when invoked + /// from the optimization pipeline but on when invoked explicitly from opt. + LoopDistribute(bool ProcessAllLoopsByDefault = true) + : FunctionPass(ID), ProcessAllLoops(ProcessAllLoopsByDefault) { + // The default is set by the caller. 
+ if (EnableLoopDistribute.getNumOccurrences() > 0) + ProcessAllLoops = EnableLoopDistribute; initializeLoopDistributePass(*PassRegistry::getPassRegistry()); } @@ -812,7 +865,11 @@ for (Loop *L : Worklist) { const LoopAccessInfo &LAI = LAA->getInfo(L, ValueToValueMap()); LoopDistributeForLoop LDL(L, LI, LAI, DT, SE); - Changed |= LDL.processLoop(); + + // If distribution was forced for the specific loop to be + // enabled/disabled, follow that. Otherwise use the global flag. + if (LDL.isForced().getValueOr(ProcessAllLoops)) + Changed |= LDL.processLoop(); } // Process each loop nest in the function. @@ -829,6 +886,11 @@ } static char ID; + +private: + /// \brief Whether distribution should be on in this function. The per-loop + /// pragma can override this. + bool ProcessAllLoops; }; } // anonymous namespace @@ -843,5 +905,7 @@ INITIALIZE_PASS_END(LoopDistribute, LDIST_NAME, ldist_name, false, false) namespace llvm { -FunctionPass *createLoopDistributePass() { return new LoopDistribute(); } +FunctionPass *createLoopDistributePass(bool ProcessAllLoopsByDefault) { + return new LoopDistribute(ProcessAllLoopsByDefault); +} } Index: llvm/trunk/test/Transforms/LoopDistribute/metadata.ll =================================================================== --- llvm/trunk/test/Transforms/LoopDistribute/metadata.ll +++ llvm/trunk/test/Transforms/LoopDistribute/metadata.ll @@ -0,0 +1,149 @@ +; RUN: opt -basicaa -loop-distribute -enable-loop-distribute=0 -S < %s | FileCheck %s --check-prefix=CHECK --check-prefix=EXPLICIT --check-prefix=DEFAULT_OFF +; RUN: opt -basicaa -loop-distribute -enable-loop-distribute=1 -S < %s | FileCheck %s --check-prefix=CHECK --check-prefix=EXPLICIT --check-prefix=DEFAULT_ON + +; Same loop as in basic.ll. Check that distribution is enabled/disabled +; properly according to -enable-loop-distribute=0/1 and the +; llvm.loop.distribute.enable metadata. 
+ +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.10.0" + +; CHECK-LABEL: @explicit_on( +define void @explicit_on(i32* noalias %a, + i32* noalias %b, + i32* noalias %c, + i32* noalias %d, + i32* noalias %e) { +entry: + br label %for.body + +; EXPLICIT: for.body.ldist1: + +for.body: ; preds = %for.body, %entry + %ind = phi i64 [ 0, %entry ], [ %add, %for.body ] + + %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind + %loadA = load i32, i32* %arrayidxA, align 4 + + %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind + %loadB = load i32, i32* %arrayidxB, align 4 + + %mulA = mul i32 %loadB, %loadA + + %add = add nuw nsw i64 %ind, 1 + %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add + store i32 %mulA, i32* %arrayidxA_plus_4, align 4 + + %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind + %loadD = load i32, i32* %arrayidxD, align 4 + + %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind + %loadE = load i32, i32* %arrayidxE, align 4 + + %mulC = mul i32 %loadD, %loadE + + %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind + store i32 %mulC, i32* %arrayidxC, align 4 + + %exitcond = icmp eq i64 %add, 20 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0 + +for.end: ; preds = %for.body + ret void +} + +; CHECK-LABEL: @explicit_off( +define void @explicit_off(i32* noalias %a, + i32* noalias %b, + i32* noalias %c, + i32* noalias %d, + i32* noalias %e) { +entry: + br label %for.body + +; EXPLICIT-NOT: for.body.ldist1: + +for.body: ; preds = %for.body, %entry + %ind = phi i64 [ 0, %entry ], [ %add, %for.body ] + + %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind + %loadA = load i32, i32* %arrayidxA, align 4 + + %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind + %loadB = load i32, i32* %arrayidxB, align 4 + + %mulA = mul i32 %loadB, %loadA + + %add = add nuw nsw i64 %ind, 1 + %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 
%add + store i32 %mulA, i32* %arrayidxA_plus_4, align 4 + + %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind + %loadD = load i32, i32* %arrayidxD, align 4 + + %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind + %loadE = load i32, i32* %arrayidxE, align 4 + + %mulC = mul i32 %loadD, %loadE + + %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind + store i32 %mulC, i32* %arrayidxC, align 4 + + %exitcond = icmp eq i64 %add, 20 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !2 + +for.end: ; preds = %for.body + ret void +} + +; CHECK-LABEL: @default_distribute( +define void @default_distribute(i32* noalias %a, + i32* noalias %b, + i32* noalias %c, + i32* noalias %d, + i32* noalias %e) { +entry: + br label %for.body + +; Verify the two distributed loops. + +; DEFAULT_ON: for.body.ldist1: +; DEFAULT_OFF-NOT: for.body.ldist1: + +for.body: ; preds = %for.body, %entry + %ind = phi i64 [ 0, %entry ], [ %add, %for.body ] + + %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind + %loadA = load i32, i32* %arrayidxA, align 4 + + %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind + %loadB = load i32, i32* %arrayidxB, align 4 + + %mulA = mul i32 %loadB, %loadA + + %add = add nuw nsw i64 %ind, 1 + %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add + store i32 %mulA, i32* %arrayidxA_plus_4, align 4 + + %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind + %loadD = load i32, i32* %arrayidxD, align 4 + + %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind + %loadE = load i32, i32* %arrayidxE, align 4 + + %mulC = mul i32 %loadD, %loadE + + %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind + store i32 %mulC, i32* %arrayidxC, align 4 + + %exitcond = icmp eq i64 %add, 20 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +!0 = distinct !{!0, !1} +!1 = !{!"llvm.loop.distribute.enable", i1 true} +!2 = distinct !{!2, !3} +!3 = 
!{!"llvm.loop.distribute.enable", i1 false}