Index: docs/LangRef.rst =================================================================== --- docs/LangRef.rst +++ docs/LangRef.rst @@ -4699,6 +4699,25 @@ !0 = !{!"llvm.loop.licm_versioning.disable"} +'``llvm.loop.distribute.enable``' Metadata +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This metadata selectively enables or disables loop distribution for the loop. The +first operand is the string ``llvm.loop.distribute.enable`` and the second +operand is a bit. If the bit operand value is 1, distribution is enabled. A value +of 0 disables distribution: + +.. code-block:: llvm + + !0 = !{!"llvm.loop.distribute.enable", i1 0} + !1 = !{!"llvm.loop.distribute.enable", i1 1} + +This metadata should be used in conjunction with ``llvm.loop`` loop +identification metadata. The metadata is only an optimization hint; the optimizer +will only perform distribution if it believes it to be beneficial. Currently, this +is only done if the original loop cannot be vectorized due to memory +dependencies. + '``llvm.mem``' ^^^^^^^^^^^^^^^ Index: include/llvm/Transforms/Scalar.h =================================================================== --- include/llvm/Transforms/Scalar.h +++ include/llvm/Transforms/Scalar.h @@ -479,7 +479,10 @@ // // LoopDistribute - Distribute loops. // -FunctionPass *createLoopDistributePass(); +// ProcessAllLoopsByDefault instructs the pass to look for distribution +// opportunities in all loops unless -enable-loop-distribute or the +// llvm.loop.distribute.enable metadata overrides this default. 
+FunctionPass *createLoopDistributePass(bool ProcessAllLoopsByDefault); //===----------------------------------------------------------------------===// // Index: lib/Transforms/IPO/PassManagerBuilder.cpp =================================================================== --- lib/Transforms/IPO/PassManagerBuilder.cpp +++ lib/Transforms/IPO/PassManagerBuilder.cpp @@ -95,10 +95,6 @@ "enable-loopinterchange", cl::init(false), cl::Hidden, cl::desc("Enable the new, experimental LoopInterchange Pass")); -static cl::opt<bool> EnableLoopDistribute( - "enable-loop-distribute", cl::init(false), cl::Hidden, - cl::desc("Enable the new, experimental LoopDistribution Pass")); - static cl::opt<bool> EnableNonLTOGlobalsModRef( "enable-non-lto-gmr", cl::init(true), cl::Hidden, cl::desc( @@ -477,9 +473,10 @@ MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1)); // Distribute loops to allow partial vectorization. I.e. isolate dependences - // into separate loop that would otherwise inhibit vectorization. - if (EnableLoopDistribute) - MPM.add(createLoopDistributePass()); + // into separate loop that would otherwise inhibit vectorization. This is + // currently only performed for loops marked with the metadata + // llvm.loop.distribute.enable=true or when -enable-loop-distribute is specified. + MPM.add(createLoopDistributePass(/*ProcessAllLoopsByDefault=*/false)); MPM.add(createLoopVectorizePass(DisableUnrollLoops, LoopVectorize)); Index: lib/Transforms/Scalar/LoopDistribute.cpp =================================================================== --- lib/Transforms/Scalar/LoopDistribute.cpp +++ lib/Transforms/Scalar/LoopDistribute.cpp @@ -60,6 +60,12 @@ cl::desc("The maximum number of SCEV checks allowed for Loop " "Distribution")); +// Note that the initial value for this depends on whether the pass is invoked +// directly or from the optimization pipeline. 
+static cl::opt<bool> EnableLoopDistribute( + "enable-loop-distribute", cl::Hidden, + cl::desc("Enable the new, experimental LoopDistribution Pass")); + STATISTIC(NumLoopsDistributed, "Number of loops distributed"); namespace { @@ -574,7 +580,15 @@ /// \brief The pass class. class LoopDistribute : public FunctionPass { public: - LoopDistribute() : FunctionPass(ID) { + /// \p ProcessAllLoopsByDefault specifies whether loop distribution should be + /// performed by default. Pass -enable-loop-distribute={0,1} overrides this + /// default. We use this to keep LoopDistribution off by default when invoked + /// from the optimization pipeline but on when invoked explicitly from opt. + LoopDistribute(bool ProcessAllLoopsByDefault = true) + : FunctionPass(ID), ProcessAllLoops(ProcessAllLoopsByDefault) { + // The default is set by the caller. + if (EnableLoopDistribute.getNumOccurrences() > 0) + ProcessAllLoops = EnableLoopDistribute; initializeLoopDistributePass(*PassRegistry::getPassRegistry()); } @@ -598,7 +612,8 @@ // Now walk the identified inner loops. bool Changed = false; for (Loop *L : Worklist) - Changed |= processLoop(L); + if (shouldProcessLoop(L)) + Changed |= processLoop(L); // Process each loop nest in the function. return Changed; @@ -616,6 +631,19 @@ static char ID; private: + bool ProcessAllLoops; + + bool shouldProcessLoop(Loop *L) { + Optional<const MDOperand *> Value = + findStringMetadataForLoop(L, "llvm.loop.distribute.enable"); + if (!Value) + return ProcessAllLoops; + + const MDOperand *Op = *Value; + assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata"); + return mdconst::extract<ConstantInt>(*Op)->getZExtValue(); + } + /// \brief Filter out checks between pointers from the same partition. /// /// \p PtrToPartition contains the partition number for pointers. 
Partition @@ -833,5 +861,7 @@ INITIALIZE_PASS_END(LoopDistribute, LDIST_NAME, ldist_name, false, false) namespace llvm { -FunctionPass *createLoopDistributePass() { return new LoopDistribute(); } +FunctionPass *createLoopDistributePass(bool ProcessAllLoopsByDefault) { + return new LoopDistribute(ProcessAllLoopsByDefault); +} } Index: test/Transforms/LoopDistribute/metadata.ll =================================================================== --- /dev/null +++ test/Transforms/LoopDistribute/metadata.ll @@ -0,0 +1,149 @@ +; RUN: opt -basicaa -loop-distribute -enable-loop-distribute=0 -S < %s | FileCheck %s --check-prefix=CHECK --check-prefix=EXPLICIT --check-prefix=DEFAULT_OFF +; RUN: opt -basicaa -loop-distribute -enable-loop-distribute=1 -S < %s | FileCheck %s --check-prefix=CHECK --check-prefix=EXPLICIT --check-prefix=DEFAULT_ON + +; Same loop as in basic.ll. Check that distribution is enabled/disabled +; properly according to -enable-loop-distribute=0/1 and the +; llvm.loop.distribute.enable metadata. 
+ +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.10.0" + +; CHECK-LABEL: @explicit_on( +define void @explicit_on(i32* noalias %a, + i32* noalias %b, + i32* noalias %c, + i32* noalias %d, + i32* noalias %e) { +entry: + br label %for.body + +; EXPLICIT: for.body.ldist1: + +for.body: ; preds = %for.body, %entry + %ind = phi i64 [ 0, %entry ], [ %add, %for.body ] + + %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind + %loadA = load i32, i32* %arrayidxA, align 4 + + %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind + %loadB = load i32, i32* %arrayidxB, align 4 + + %mulA = mul i32 %loadB, %loadA + + %add = add nuw nsw i64 %ind, 1 + %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add + store i32 %mulA, i32* %arrayidxA_plus_4, align 4 + + %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind + %loadD = load i32, i32* %arrayidxD, align 4 + + %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind + %loadE = load i32, i32* %arrayidxE, align 4 + + %mulC = mul i32 %loadD, %loadE + + %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind + store i32 %mulC, i32* %arrayidxC, align 4 + + %exitcond = icmp eq i64 %add, 20 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0 + +for.end: ; preds = %for.body + ret void +} + +; CHECK-LABEL: @explicit_off( +define void @explicit_off(i32* noalias %a, + i32* noalias %b, + i32* noalias %c, + i32* noalias %d, + i32* noalias %e) { +entry: + br label %for.body + +; EXPLICIT-NOT: for.body.ldist1: + +for.body: ; preds = %for.body, %entry + %ind = phi i64 [ 0, %entry ], [ %add, %for.body ] + + %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind + %loadA = load i32, i32* %arrayidxA, align 4 + + %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind + %loadB = load i32, i32* %arrayidxB, align 4 + + %mulA = mul i32 %loadB, %loadA + + %add = add nuw nsw i64 %ind, 1 + %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 
%add + store i32 %mulA, i32* %arrayidxA_plus_4, align 4 + + %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind + %loadD = load i32, i32* %arrayidxD, align 4 + + %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind + %loadE = load i32, i32* %arrayidxE, align 4 + + %mulC = mul i32 %loadD, %loadE + + %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind + store i32 %mulC, i32* %arrayidxC, align 4 + + %exitcond = icmp eq i64 %add, 20 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !2 + +for.end: ; preds = %for.body + ret void +} + +; CHECK-LABEL: @default_distribute( +define void @default_distribute(i32* noalias %a, + i32* noalias %b, + i32* noalias %c, + i32* noalias %d, + i32* noalias %e) { +entry: + br label %for.body + +; Verify the two distributed loops. + +; DEFAULT_ON: for.body.ldist1: +; DEFAULT_OFF-NOT: for.body.ldist1: + +for.body: ; preds = %for.body, %entry + %ind = phi i64 [ 0, %entry ], [ %add, %for.body ] + + %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind + %loadA = load i32, i32* %arrayidxA, align 4 + + %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind + %loadB = load i32, i32* %arrayidxB, align 4 + + %mulA = mul i32 %loadB, %loadA + + %add = add nuw nsw i64 %ind, 1 + %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add + store i32 %mulA, i32* %arrayidxA_plus_4, align 4 + + %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind + %loadD = load i32, i32* %arrayidxD, align 4 + + %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind + %loadE = load i32, i32* %arrayidxE, align 4 + + %mulC = mul i32 %loadD, %loadE + + %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind + store i32 %mulC, i32* %arrayidxC, align 4 + + %exitcond = icmp eq i64 %add, 20 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +!0 = distinct !{!0, !1} +!1 = !{!"llvm.loop.distribute.enable", i1 true} +!2 = distinct !{!2, !3} +!3 = 
!{!"llvm.loop.distribute.enable", i1 false}