Index: docs/LangRef.rst =================================================================== --- docs/LangRef.rst +++ docs/LangRef.rst @@ -3142,6 +3142,21 @@ !0 = metadata !{ metadata !"llvm.loop.unroll.full" } +'``llvm.loop.minimum_dependence_distance``' Metadata +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This metadata specifies a minimum dependence distance for accesses marked +with ``llvm.mem.parallel_loop_access``. The first operand is the string +``llvm.loop.minimum_dependence_distance`` and the second operand is a +positive integer specifying the minimum distance. +For example: + +.. code-block:: llvm + + !0 = metadata !{metadata !"llvm.loop.minimum_dependence_distance", i32 4} + +specifies that parallel accesses have a minimum dependence distance of 4. + '``llvm.mem``' ^^^^^^^^^^^^^^^ @@ -3154,19 +3169,25 @@ The ``llvm.mem.parallel_loop_access`` metadata refers to a loop identifier, or metadata containing a list of loop identifiers for nested loops. The metadata is attached to memory accessing instructions and denotes that -no loop carried memory dependence exist between it and other instructions denoted -with the same loop identifier. +any loop carried memory dependence between it and other instructions denoted +with the same loop identifier have a minimum dependence distance of ``k``, +where ``k`` is specified by ``llvm.mem.minimum_dependence_distance`` metadata +on the loop. If ``k`` is not specified by such metadata, it is positive +infinity. Precisely, given two instructions ``m1`` and ``m2`` that both have the ``llvm.mem.parallel_loop_access`` metadata, with ``L1`` and ``L2`` being the set of loops associated with that metadata, respectively, then there is no loop -carried dependence between ``m1`` and ``m2`` for loops in both ``L1`` and -``L2``. +carried dependence with distance less than ``k1`` and ``k2`` between +``m1`` and ``m2`` for loops in both ``L1`` and ``L2``, where ``k1`` and ``k2`` +are the minimum distancesd specified by ``llvm.loop.minimum_dependence_distance`` +on ``L1`` and ``L2`` respectively (or the default of infinity if not specified). As a special case, if all memory accessing instructions in a loop have -``llvm.mem.parallel_loop_access`` metadata that refers to that loop, then the -loop has no loop carried memory dependences and is considered to be a parallel -loop. +``llvm.mem.parallel_loop_access`` metadata that refers to that loop and +``llvm.loop.minimum_dependence_distance`` is not specified for the loop, +then the loop has no loop carried memory dependences and is considered to +be a parallel loop. Note that if not all memory access instructions have such metadata referring to the loop, then the loop is considered not being trivially parallel. Additional Index: include/llvm/Analysis/LoopInfo.h =================================================================== --- include/llvm/Analysis/LoopInfo.h +++ include/llvm/Analysis/LoopInfo.h @@ -407,7 +407,8 @@ /// isSafeToClone - Return true if the loop body is safe to clone in practice. bool isSafeToClone() const; - /// Returns true if the loop is annotated parallel. + /// Returns true if the loop is annotated parallel *without* distance + /// restrictions. /// /// A parallel loop can be assumed to not contain any dependencies between /// iterations by the compiler. That is, any loop-carried dependency checking @@ -419,7 +420,20 @@ /// iterations is not guaranteed, thus, the end result might or might not /// implement actual concurrent execution of instructions across multiple /// iterations. - bool isAnnotatedParallel() const; + bool isAnnotatedParallel() const { + return getAnnotatedRestrictedParallel().second == 0; + }; + + /// Determine if the loop is annotated parallel, possibly with distance + /// restrictions. + /// + /// Let (p,k) denote the returned pair. + /// If p==false, no assumptions can be made. k is always 1 in this case. + /// If p==true && k==0, then the loop can be assumed to have no + /// dependencies between iterations, i.e. isAnnotatedParallel()==true. + /// If p==true && k>0, the loop can be assumed to have no dependencies + /// with distance less than k between iterations. + std::pair getAnnotatedRestrictedParallel() const; /// Return the llvm.loop loop id metadata node for this loop if it is present. /// Index: lib/Analysis/LoopInfo.cpp =================================================================== --- lib/Analysis/LoopInfo.cpp +++ lib/Analysis/LoopInfo.cpp @@ -285,11 +285,11 @@ } } -bool Loop::isAnnotatedParallel() const { +std::pair Loop::getAnnotatedRestrictedParallel() const { MDNode *desiredLoopIdMetadata = getLoopID(); if (!desiredLoopIdMetadata) - return false; + return std::make_pair(false, 1u); // The loop branch contains the parallel loop metadata. In order to ensure // that any parallel-loop-unaware optimization pass hasn't added loop-carried @@ -310,7 +310,7 @@ MDNode *loopIdMD = II->getMetadata("llvm.mem.parallel_loop_access"); if (!loopIdMD) - return false; + return std::make_pair(false, 1u); bool loopIdMDFound = false; for (unsigned i = 0, e = loopIdMD->getNumOperands(); i < e; ++i) { @@ -321,12 +321,26 @@ } if (!loopIdMDFound) - return false; + return std::make_pair(false, 1); } } - return true; -} + // Check for restriction to distance d. If present, it appears as a non-first + // operand of the loopId with the form + // 'metadata !"llvm.loop.minimum_dependence_distance", i32 d' + for (unsigned i = 1, ie = desiredLoopIdMetadata->getNumOperands(); i < ie; + ++i) + if (const MDNode *MD = + dyn_cast(desiredLoopIdMetadata->getOperand(i))) + if (MD->getNumOperands() == 2) + if (MDString *s = dyn_cast(MD->getOperand(0))) + if (s->getString() == "llvm.loop.minimum_dependence_distance") + return std::make_pair( + true, dyn_cast(MD->getOperand(1))->getZExtValue()); + + // Zero is convention for unrestricted. + return std::make_pair(true, 0); +} /// hasDedicatedExits - Return true if no exit block for the loop /// has a predecessor that is outside the loop. Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -581,8 +581,8 @@ AliasAnalysis *AA, Function *F) : NumLoads(0), NumStores(0), NumPredStores(0), TheLoop(L), SE(SE), DL(DL), DT(DT), TLI(TLI), AA(AA), TheFunction(F), Induction(nullptr), - WidestIndTy(nullptr), HasFunNoNaNAttr(false), MaxSafeDepDistBytes(-1U) { - } + WidestIndTy(nullptr), HasFunNoNaNAttr(false), MaxSafeDepDistBytes(-1U), + AnnotatedMinDepDist(1) {} /// This enum represents the kinds of reductions that we support. enum ReductionKind { @@ -759,6 +759,13 @@ unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; } + void adjustForAnnotatedMinDepDist(unsigned WidestTypeBytes) { + if (AnnotatedMinDepDist != 1) + MaxSafeDepDistBytes = AnnotatedMinDepDist < -1u / WidestTypeBytes + ? AnnotatedMinDepDist * WidestTypeBytes + : -1u; + } + bool hasStride(Value *V) { return StrideSet.count(V); } bool mustCheckStrides() { return !StrideSet.empty(); } SmallPtrSet::iterator strides_begin() { @@ -866,6 +873,7 @@ bool HasFunNoNaNAttr; unsigned MaxSafeDepDistBytes; + unsigned AnnotatedMinDepDist; ValueToValueMap Strides; SmallPtrSet StrideSet; @@ -986,7 +994,8 @@ enum HintKind { HK_WIDTH, HK_UNROLL, - HK_FORCE + HK_FORCE, + HK_MINIMUM_DEPEENDENCE_DISTANCE }; /// Hint - associates name and validation with the hint value. @@ -1006,9 +1015,16 @@ return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor; case HK_FORCE: return (Val <= 1); + case HK_MINIMUM_DEPEENDENCE_DISTANCE: + return Val >= 1; } return false; } + + /// True if hint should not be written out after vectorization. + bool squash() const { + return Value == 0 && Kind == HK_MINIMUM_DEPEENDENCE_DISTANCE; + } }; /// Vectorization width. @@ -1017,8 +1033,11 @@ Hint Interleave; /// Vectorization forced Hint Force; + /// Minimum dependence distance. Technically, is not a hint, but it's handled + /// syntactically the same way as a hint. + Hint MinimumDependenceDistance; /// Array to help iterating through all hints. - Hint *Hints[3]; // avoiding initialisation due to MSVC2012 + Hint *Hints[4]; // avoiding initialisation due to MSVC2012 /// Return the loop metadata prefix. static StringRef Prefix() { return "llvm.loop."; } @@ -1034,11 +1053,14 @@ : Width("vectorize.width", VectorizationFactor, HK_WIDTH), Interleave("interleave.count", DisableInterleaving, HK_UNROLL), Force("vectorize.enable", FK_Undefined, HK_FORCE), + MinimumDependenceDistance("minimum_dependence_distance", 0, + HK_MINIMUM_DEPEENDENCE_DISTANCE), TheLoop(L) { // FIXME: Move this up initialisation when MSVC requirement is 2013+ Hints[0] = &Width; Hints[1] = &Interleave; Hints[2] = &Force; + Hints[3] = &MinimumDependenceDistance; // Populate values with existing loop metadata. getHintsFromMetadata(); @@ -1054,12 +1076,17 @@ /// Mark the loop L as already vectorized by setting the width to 1. void setAlreadyVectorized() { Width.Value = Interleave.Value = 1; + // If distance was finite, conservatively assume it shrunk to 1 + if (MinimumDependenceDistance.Value > 0) + MinimumDependenceDistance.Value = 1; + // FIXME: Change all lines below for this when we can use MSVC 2013+ - //writeHintsToMetadata({ Width, Unroll }); + // writeHintsToMetadata({ Width, Unroll, MinimumDependenceDistance }); std::vector hints; - hints.reserve(2); + hints.reserve(3); hints.emplace_back(Width); hints.emplace_back(Interleave); + hints.emplace_back(MinimumDependenceDistance); writeHintsToMetadata(std::move(hints)); } @@ -1187,8 +1214,10 @@ // Now, add the missing hints. for (auto H : HintTypes) - Vals.push_back( - createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value)); + // Elide hint with value of zero. + if (!H.squash()) + Vals.push_back( + createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value)); // Replace current metadata node with new one. LLVMContext &Context = TheLoop->getHeader()->getContext(); @@ -4710,7 +4739,12 @@ PtrRtCheck.Pointers.clear(); PtrRtCheck.Need = false; - const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); + const auto Annotations = TheLoop->getAnnotatedRestrictedParallel(); + // Logic below handles fully parallel loops by treating them + // as having a very large dependence distance. + AnnotatedMinDepDist = Annotations.second ? Annotations.second : -1u; + + const bool NeedDepCheck = AnnotatedMinDepDist == 1; MemoryDepChecker DepChecker(SE, DL, TheLoop); // For each block. @@ -4733,7 +4767,7 @@ continue; LoadInst *Ld = dyn_cast(it); - if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) { + if (!Ld || (!Ld->isSimple() && NeedDepCheck)) { emitAnalysis(Report(Ld) << "read with atomic ordering or volatile read"); DEBUG(dbgs() << "LV: Found a non-simple load.\n"); @@ -4752,7 +4786,7 @@ emitAnalysis(Report(it) << "instruction cannot be vectorized"); return false; } - if (!St->isSimple() && !IsAnnotatedParallel) { + if (!St->isSimple() && NeedDepCheck) { emitAnalysis(Report(St) << "write with atomic ordering or volatile write"); DEBUG(dbgs() << "LV: Found a non-simple store.\n"); @@ -4814,10 +4848,10 @@ } } - if (IsAnnotatedParallel) { - DEBUG(dbgs() - << "LV: A loop annotated parallel, ignore memory dependency " - << "checks.\n"); + if (!NeedDepCheck) { + DEBUG(dbgs() << "LV: A loop annotated with min dependence distance " + << AnnotatedMinDepDist + << ", ignore memory dependency checks.\n"); return true; } @@ -5358,10 +5392,11 @@ unsigned WidestType = getWidestType(); unsigned WidestRegister = TTI.getRegisterBitWidth(true); unsigned MaxSafeDepDist = -1U; + Legal->adjustForAnnotatedMinDepDist(WidestType / 8); if (Legal->getMaxSafeDepDistBytes() != -1U) MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; - WidestRegister = ((WidestRegister < MaxSafeDepDist) ? - WidestRegister : MaxSafeDepDist); + WidestRegister = + ((WidestRegister < MaxSafeDepDist) ? WidestRegister : MaxSafeDepDist); unsigned MaxVectorSize = WidestRegister / WidestType; DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n"); DEBUG(dbgs() << "LV: The Widest register is: " Index: test/Transforms/LoopVectorize/X86/min-dep-dist.ll =================================================================== --- test/Transforms/LoopVectorize/X86/min-dep-dist.ll +++ test/Transforms/LoopVectorize/X86/min-dep-dist.ll @@ -0,0 +1,33 @@ +; RUN: opt < %s -loop-vectorize -mcpu=corei7-avx -S | FileCheck %s +; CHECK-NOT: memcheck +; CHECK-NOT: <8 x float +; CHECK-NOT: <4 x float +; CHECK: <2 x float +; CHECK-NOT: minimum_dependence_distance", i32 2} + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Simple routine that would be vectorized wider than 2 +; if llvm.loop.minimum_dependence_distance were not present. +define void @foo(float* %a, float* %b) { +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %i = getelementptr inbounds float* %a, i64 %indvars.iv + %t = load float* %i, align 4, !llvm.mem.parallel_loop_access !0 + %mul = fmul float %t, %t + %j = getelementptr inbounds float* %b, i64 %indvars.iv + store float %mul, float* %j, align 4, !llvm.mem.parallel_loop_access !0 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret void +} + +!0 = metadata !{metadata !0, metadata !1} +!1 = metadata !{metadata !"llvm.loop.minimum_dependence_distance", i32 2}