Index: llvm/docs/LangRef.rst =================================================================== --- llvm/docs/LangRef.rst +++ llvm/docs/LangRef.rst @@ -5489,6 +5489,22 @@ !0 = !{!"llvm.loop.vectorize.predicate.enable", i1 0} !1 = !{!"llvm.loop.vectorize.predicate.enable", i1 1} +'``llvm.loop.vectorize.ivdep.enable``' Metadata +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This metadata indicates to the vectorizer to ignore dependencies between +memory accesses which have not been determined to be either safe or unsafe +for vectorization. This differs from ``llvm.loop.parallel_access``, which +considers no dependencies to be present between memory accesses belonging +to the same access group. The first operand is the string +``llvm.loop.vectorize.ivdep.enable`` and the second operand is a bit. A +value of 1 implies that the functionality of this metadata is enabled for +the loop. + +.. code-block:: llvm + + !0 = !{!"llvm.loop.vectorize.ivdep.enable", i1 1} + '``llvm.loop.vectorize.width``' Metadata ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Index: llvm/include/llvm/Analysis/LoopAccessAnalysis.h =================================================================== --- llvm/include/llvm/Analysis/LoopAccessAnalysis.h +++ llvm/include/llvm/Analysis/LoopAccessAnalysis.h @@ -201,7 +201,7 @@ /// /// Only checks sets with elements in \p CheckDeps. bool areDepsSafe(DepCandidates &AccessSets, MemAccessInfoList &CheckDeps, - const ValueToValueMap &Strides); + const ValueToValueMap &Strides, bool UnknownDepHint); /// No memory dependence was encountered that would inhibit /// vectorization. @@ -516,7 +516,8 @@ class LoopAccessInfo { public: LoopAccessInfo(Loop *L, ScalarEvolution *SE, const TargetLibraryInfo *TLI, - AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI); + AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI, + bool UnknownDepHint = false); /// Return true we can analyze the memory accesses in the loop and there are /// no memory dependence cycles. 
@@ -608,7 +609,8 @@ private: /// Analyze the loop. void analyzeLoop(AliasAnalysis *AA, LoopInfo *LI, - const TargetLibraryInfo *TLI, DominatorTree *DT); + const TargetLibraryInfo *TLI, DominatorTree *DT, + bool UnknownDepHint); /// Check if the structure of the loop allows it to be analyzed by this /// pass. @@ -735,7 +737,7 @@ /// Query the result of the loop access information for the loop \p L. /// /// If there is no cached result available run the analysis. - const LoopAccessInfo &getInfo(Loop *L); + const LoopAccessInfo &getInfo(Loop *L, bool UnknownDepHint = false); void releaseMemory() override { // Invalidate the cache when the pass is freed. Index: llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h =================================================================== --- llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -44,7 +44,7 @@ /// careful NOT to add them if the user hasn't specifically asked so. class LoopVectorizeHints { enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE, HK_ISVECTORIZED, - HK_PREDICATE }; + HK_PREDICATE, HK_IVDEP }; /// Hint - associates name and validation with the hint value. struct Hint { @@ -73,6 +73,9 @@ /// Vector Predicate Hint Predicate; + /// Ignore Vector dependencies + Hint Ivdep; + /// Return the loop metadata prefix. 
static StringRef Prefix() { return "llvm.loop."; } @@ -102,6 +105,7 @@ unsigned getInterleave() const { return Interleave.Value; } unsigned getIsVectorized() const { return IsVectorized.Value; } unsigned getPredicate() const { return Predicate.Value; } + unsigned getIvdep() const { return Ivdep.Value; } enum ForceKind getForce() const { if ((ForceKind)Force.Value == FK_Undefined && hasDisableAllTransformsHint(TheLoop)) @@ -199,7 +203,7 @@ LoopVectorizationLegality( Loop *L, PredicatedScalarEvolution &PSE, DominatorTree *DT, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, AliasAnalysis *AA, - Function *F, std::function *GetLAA, + Function *F, std::function *GetLAA, LoopInfo *LI, OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R, LoopVectorizeHints *H, DemandedBits *DB, AssumptionCache *AC) @@ -405,7 +409,7 @@ DominatorTree *DT; // LoopAccess analysis. - std::function *GetLAA; + std::function *GetLAA; // And the loop-accesses info corresponding to this loop. This pointer is // null until canVectorizeMemory sets it up. 
Index: llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h =================================================================== --- llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h +++ llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h @@ -138,7 +138,7 @@ DemandedBits *DB; AliasAnalysis *AA; AssumptionCache *AC; - std::function *GetLAA; + std::function *GetLAA; OptimizationRemarkEmitter *ORE; ProfileSummaryInfo *PSI; @@ -149,7 +149,7 @@ TargetTransformInfo &TTI_, DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_, - std::function &GetLAA_, + std::function &GetLAA_, OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_); bool processLoop(Loop *L); Index: llvm/lib/Analysis/LoopAccessAnalysis.cpp =================================================================== --- llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -1633,10 +1633,12 @@ bool MemoryDepChecker::areDepsSafe(DepCandidates &AccessSets, MemAccessInfoList &CheckDeps, - const ValueToValueMap &Strides) { + const ValueToValueMap &Strides, + bool UnknownDepHint) { MaxSafeDepDistBytes = -1; SmallPtrSet Visited; + Status = VectorizationSafetyStatus::Safe; for (MemAccessInfo CurAccess : CheckDeps) { if (Visited.count(CurAccess)) continue; @@ -1678,7 +1680,13 @@ Dependence::DepType Type = isDependent(*A.first, A.second, *B.first, B.second, Strides); - mergeInStatus(Dependence::isSafeForVectorization(Type)); + // Update safety status depending on whether the Dependence type + // is safe. If Unknown Dependence type is to be considered safe, + // do not update safety status. + if (!UnknownDepHint || + !(Dependence::isSafeForVectorization(Type) == + VectorizationSafetyStatus::PossiblySafeWithRtChecks)) + mergeInStatus(Dependence::isSafeForVectorization(Type)); // Gather dependences unless we accumulated MaxDependences // dependences. 
In that case return as soon as we find the first @@ -1788,7 +1796,8 @@ void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI, const TargetLibraryInfo *TLI, - DominatorTree *DT) { + DominatorTree *DT, + bool UnknownDepHint) { typedef SmallPtrSet ValueSet; // Holds the Load and Store instructions. @@ -2022,7 +2031,8 @@ if (Accesses.isDependencyCheckNeeded()) { LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n"); CanVecMem = DepChecker->areDepsSafe( - DependentAccesses, Accesses.getDependenciesToCheck(), SymbolicStrides); + DependentAccesses, Accesses.getDependenciesToCheck(), SymbolicStrides, + UnknownDepHint); MaxSafeDepDistBytes = DepChecker->getMaxSafeDepDistBytes(); if (!CanVecMem && DepChecker->shouldRetryWithRuntimeCheck()) { @@ -2343,7 +2353,8 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE, const TargetLibraryInfo *TLI, AliasAnalysis *AA, - DominatorTree *DT, LoopInfo *LI) + DominatorTree *DT, LoopInfo *LI, + bool UnknownDepHint) : PSE(std::make_unique(*SE, *L)), PtrRtChecking(std::make_unique(SE)), DepChecker(std::make_unique(*PSE, L)), TheLoop(L), @@ -2351,7 +2362,7 @@ HasConvergentOp(false), HasDependenceInvolvingLoopInvariantAddress(false) { if (canAnalyzeLoop()) - analyzeLoop(AA, LI, TLI, DT); + analyzeLoop(AA, LI, TLI, DT, UnknownDepHint); } void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const { @@ -2397,11 +2408,13 @@ PSE->print(OS, Depth); } -const LoopAccessInfo &LoopAccessLegacyAnalysis::getInfo(Loop *L) { +const LoopAccessInfo &LoopAccessLegacyAnalysis::getInfo(Loop *L, + bool UnknownDepHint) { auto &LAI = LoopAccessInfoMap[L]; if (!LAI) - LAI = std::make_unique(L, SE, TLI, AA, DT, LI); + LAI = std::make_unique(L, SE, TLI, AA, DT, LI, + UnknownDepHint); return *LAI.get(); } Index: llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ 
llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -61,6 +61,8 @@ case HK_ISVECTORIZED: case HK_PREDICATE: return (Val == 0 || Val == 1); + case HK_IVDEP: + return (Val == 1); } return false; } @@ -72,7 +74,8 @@ Interleave("interleave.count", InterleaveOnlyWhenForced, HK_UNROLL), Force("vectorize.enable", FK_Undefined, HK_FORCE), IsVectorized("isvectorized", 0, HK_ISVECTORIZED), - Predicate("vectorize.predicate.enable", 0, HK_PREDICATE), TheLoop(L), + Predicate("vectorize.predicate.enable", 0, HK_PREDICATE), + Ivdep("vectorize.ivdep.enable", 0, HK_IVDEP), TheLoop(L), ORE(ORE) { // Populate values with existing loop metadata. getHintsFromMetadata(); @@ -224,7 +227,8 @@ return; unsigned Val = C->getZExtValue(); - Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized, &Predicate}; + Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized, &Predicate, + &Ivdep}; for (auto H : Hints) { if (Name == H->Name) { if (H->validate(Val)) @@ -825,7 +829,7 @@ } bool LoopVectorizationLegality::canVectorizeMemory() { - LAI = &(*GetLAA)(*TheLoop); + LAI = &(*GetLAA)(*TheLoop, Hints->getIvdep()); const OptimizationRemarkAnalysis *LAR = LAI->getReport(); if (LAR) { ORE->emit([&]() { Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1610,8 +1610,10 @@ auto *ORE = &getAnalysis().getORE(); auto *PSI = &getAnalysis().getPSI(); - std::function GetLAA = - [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; + std::function GetLAA = + [&](Loop &L, bool UnknownDepHint) -> const LoopAccessInfo & + { return LAA-> + getInfo(&L, UnknownDepHint); }; return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, GetLAA, *ORE, PSI); @@ -7800,7 +7802,7 @@ Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, DominatorTree &DT_, BlockFrequencyInfo &BFI_, 
TargetLibraryInfo *TLI_, DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_, - std::function &GetLAA_, + std::function &GetLAA_, OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { SE = &SE_; LI = &LI_; @@ -7879,8 +7881,8 @@ : nullptr; auto &LAM = AM.getResult(F).getManager(); - std::function GetLAA = - [&](Loop &L) -> const LoopAccessInfo & { + std::function GetLAA = + [&](Loop &L, bool UnknownDepHint) -> const LoopAccessInfo & { LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA}; return LAM.getResult(L, AR); }; Index: llvm/test/Transforms/LoopVectorize/X86/ivdep-alias.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/X86/ivdep-alias.ll @@ -0,0 +1,80 @@ +; RUN: opt < %s -O3 -S | FileCheck %s +; IR generated for a function containing the loop: +; #pragma clang loop ivdep(enable) +; for (int i=0; i, <4 x i32>* %1, align 4 +; CHECK: %3 = extractelement <4 x i64> %2, i32 0 + %4 = load i32, i32* %i, align 4 + %idxprom = sext i32 %4 to i64 + %arrayidx = getelementptr inbounds i32, i32* %3, i64 %idxprom + %5 = load i32, i32* %arrayidx, align 4 +; CHECK: %16 = insertelement <4 x i32> %15, i32 %12, i32 1 +; CHECK: %21 = extractelement <4 x i32> %19, i32 1 + %idxprom1 = sext i32 %5 to i64 + %arrayidx2 = getelementptr inbounds i32, i32* %2, i64 %idxprom1 + %6 = load i32, i32* %arrayidx2, align 4 + %inc = add nsw i32 %6, 1 + store i32 %inc, i32* %arrayidx2, align 4 + br label %for.inc +; CHECK: %24 = icmp eq i64 %index.next, %n.vec +; CHECK: br i1 %24, label %middle.block, label %vector.body, !llvm.loop !2 + +for.inc: ; preds = %for.body + %7 = load i32, i32* %i, align 4 + %inc3 = add nsw i32 %7, 1 + store i32 %inc3, i32* %i, align 4 + br label %for.cond, !llvm.loop !2 + +for.end: ; preds = %for.cond + %8 = load i32*, i32** %a.addr, align 8 + ret i32* %8 +} + +attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" 
"disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 10.0.0 (https://github.com/llvm/llvm-project 8a5bfbe6db2824642bf9a1d27a24c5b6132b244f)"} +; CHECK: !2 = distinct !{!2, !3} +; CHECK-NEXT: !3 = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-NEXT: !4 = distinct !{!4, !5, !3} +; CHECK-NEXT: !5 = !{!"llvm.loop.unroll.runtime.disable"} +!2 = distinct !{!2, !3, !4} +!3 = !{!"llvm.loop.vectorize.ivdep.enable", i1 true} +!4 = !{!"llvm.loop.vectorize.enable", i1 true} Index: llvm/test/Transforms/LoopVectorize/X86/ivdep-novec.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/X86/ivdep-novec.ll @@ -0,0 +1,74 @@ +; RUN: opt < %s -O3 -S | FileCheck %s +; IR generated for a function containing the loop: +; #pragma clang loop ivdep(enable) +; for (i = 1; i < n; i++) +; A[i] = A[i] + A[i-1]; +; where n is an integer function argument. +; The above dependency can be determined by the vectorizer to be unsafe for +; vectorization. +; The loop should not be vectorized even if ivdep is present. 
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: noinline nounwind uwtable +define dso_local i32* @calcDepArray(i32* %A, i32 %n) #0 { +entry: + %A.addr = alloca i32*, align 8 + %n.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %A, i32** %A.addr, align 8 + store i32 %n, i32* %n.addr, align 4 + store i32 1, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %1 = load i32, i32* %n.addr, align 4 + %cmp = icmp slt i32 %0, %1 + br i1 %cmp, label %for.body, label %for.end + +; CHECK: for.body: +for.body: ; preds = %for.cond + %2 = load i32*, i32** %A.addr, align 8 + %3 = load i32, i32* %i, align 4 + %idxprom = sext i32 %3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom + %4 = load i32, i32* %arrayidx, align 4 + %5 = load i32*, i32** %A.addr, align 8 + %6 = load i32, i32* %i, align 4 + %sub = sub nsw i32 %6, 1 + %idxprom1 = sext i32 %sub to i64 + %arrayidx2 = getelementptr inbounds i32, i32* %5, i64 %idxprom1 + %7 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %4, %7 + %8 = load i32*, i32** %A.addr, align 8 + %9 = load i32, i32* %i, align 4 + %idxprom3 = sext i32 %9 to i64 + %arrayidx4 = getelementptr inbounds i32, i32* %8, i64 %idxprom3 + store i32 %add, i32* %arrayidx4, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %10 = load i32, i32* %i, align 4 + %inc = add nsw i32 %10, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond, !llvm.loop !2 + +for.end: ; preds = %for.cond + %11 = load i32*, i32** %A.addr, align 8 + ret i32* %11 +} + +attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" 
"no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 10.0.0 (https://github.com/llvm/llvm-project 8a5bfbe6db2824642bf9a1d27a24c5b6132b244f)"} +; CHECK: !2 = distinct !{!2, !3, !4} +; CHECK-NEXT: !3 = !{!"llvm.loop.vectorize.ivdep.enable", i1 true} +; CHECK-NEXT: !4 = !{!"llvm.loop.vectorize.enable", i1 true} +!2 = distinct !{!2, !3, !4} +!3 = !{!"llvm.loop.vectorize.ivdep.enable", i1 true} +!4 = !{!"llvm.loop.vectorize.enable", i1 true} Index: llvm/test/Transforms/LoopVectorize/X86/ivdep-unkbounds.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/X86/ivdep-unkbounds.ll @@ -0,0 +1,70 @@ +; RUN: opt < %s -O3 -S | FileCheck %s +; IR generated for a function containing the loop: +; #pragma clang loop ivdep(enable) +; for (i = 0; i < 64; i++) +; A[i*i] *= 2; +; In the above example, the vectorizer cannot determine if +; array accesses are within array bounds and are safe for vectorization. +; The vectorizer regards it as an unknown dependency. +; Check that the loop has been vectorized when ivdep is present. 
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: noinline nounwind uwtable +define dso_local i32* @doubleArrayElements(i32* %A) #0 { +entry: + %A.addr = alloca i32*, align 8 + %i = alloca i32, align 4 + store i32* %A, i32** %A.addr, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond +; CHECK: br label %vector.body + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 64 + br i1 %cmp, label %for.body, label %for.end + +; CHECK: vector.body: +for.body: ; preds = %for.cond +; CHECK: %vec.ind = phi <4 x i64> [ , %entry ], [ %vec.ind.next, %vector.body ] +; CHECK: %0 = mul <4 x i64> %vec.ind, %vec.ind +; CHECK: %2 = extractelement <4 x i64> %1, i32 0 + %1 = load i32*, i32** %A.addr, align 8 + %2 = load i32, i32* %i, align 4 + %3 = load i32, i32* %i, align 4 + %mul = mul nsw i32 %2, %3 + %idxprom = sext i32 %mul to i64 + %arrayidx = getelementptr inbounds i32, i32* %1, i64 %idxprom +; CHECK: %15 = insertelement <4 x i32> %14, i32 %11, i32 1 +; CHECK: %21 = extractelement <4 x i32> %18, i32 2 + %4 = load i32, i32* %arrayidx, align 4 + %mul1 = mul nsw i32 %4, 2 + store i32 %mul1, i32* %arrayidx, align 4 +; CHECK: %vec.ind.next = add <4 x i64> %vec.ind, + br label %for.inc +; CHECK: br i1 %23, label %for.end, label %vector.body, !llvm.loop !2 + +for.inc: ; preds = %for.body + %5 = load i32, i32* %i, align 4 + %inc = add nsw i32 %5, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond, !llvm.loop !2 + +for.end: ; preds = %for.cond + %6 = load i32*, i32** %A.addr, align 8 + ret i32* %6 +} + +attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" 
"no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 10.0.0 (https://github.com/llvm/llvm-project 8a5bfbe6db2824642bf9a1d27a24c5b6132b244f)"} +; CHECK: !2 = distinct !{!2, !3} +; CHECK-NEXT: !3 = !{!"llvm.loop.isvectorized", i32 1} +!2 = distinct !{!2, !3, !4} +!3 = !{!"llvm.loop.vectorize.ivdep.enable", i1 true} +!4 = !{!"llvm.loop.vectorize.enable", i1 true} Index: llvm/test/Transforms/LoopVectorize/X86/ivdep-unkdep.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/X86/ivdep-unkdep.ll @@ -0,0 +1,88 @@ +; RUN: opt < %s -O3 -S | FileCheck %s +; IR generated for a function containing the loop: +; #pragma clang loop ivdep(enable) +; for (i = 0; i < m; i++){ +; a[i] = a[i + k] * c; +; where m, k, c are integer function arguments. +; The above is an unknown dependency, as the vectorizer cannot determine whether +; the accesses are independent and a[i + k] is within +; array bounds. It depends on the value of k, and the dependence is not determined to +; be safe or unsafe. +; Check that the loop has been vectorized when ivdep is present. 
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: noinline nounwind uwtable +define dso_local i32 @calcArray(i32* %a, i32 %m, i32 %k, i32 %c) #0 { +entry: + %a.addr = alloca i32*, align 8 + %m.addr = alloca i32, align 4 + %k.addr = alloca i32, align 4 + %c.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %a, i32** %a.addr, align 8 + store i32 %m, i32* %m.addr, align 4 + store i32 %k, i32* %k.addr, align 4 + store i32 %c, i32* %c.addr, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %1 = load i32, i32* %m.addr, align 4 + %cmp = icmp slt i32 %0, %1 + br i1 %cmp, label %for.body, label %for.end +; CHECK: vector.ph +; CHECK: %n.vec = and i64 %wide.trip.count, 4294967288 +; CHECK: %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0 +; CHECK: %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK: br i1 %4, label %middle.block.unr-lcssa, label %vector.ph.new + +; CHECK: vector.ph.new: +; CHECK: br label %vector.body + +for.body: ; preds = %for.cond + %2 = load i32*, i32** %a.addr, align 8 + %3 = load i32, i32* %i, align 4 + %4 = load i32, i32* %k.addr, align 4 + %add = add nsw i32 %3, %4 + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom + %5 = load i32, i32* %arrayidx, align 4 + %6 = load i32, i32* %c.addr, align 4 +; CHECK: %wide.load = load <4 x i32>, <4 x i32>* %7, align 4 +; CHECK: %10 = mul nsw <4 x i32> %wide.load, %broadcast.splat11 +; CHECK: store <4 x i32> %10, <4 x i32>* %13, align 4 + %mul = mul nsw i32 %5, %6 + %7 = load i32*, i32** %a.addr, align 8 + %8 = load i32, i32* %i, align 4 + %idxprom1 = sext i32 %8 to i64 + %arrayidx2 = getelementptr inbounds i32, i32* %7, i64 %idxprom1 + store i32 %mul, i32* %arrayidx2, 
align 4 + br label %for.inc +; CHECK: br i1 %niter.ncmp.1, label %middle.block.unr-lcssa, label %vector.body, !llvm.loop !2 + +for.inc: ; preds = %for.body + %9 = load i32, i32* %i, align 4 + %inc = add nsw i32 %9, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond, !llvm.loop !2 + +for.end: ; preds = %for.cond + ret i32 0 +} + +attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 10.0.0 (https://github.com/llvm/llvm-project 8a5bfbe6db2824642bf9a1d27a24c5b6132b244f)"} +; CHECK: !2 = distinct !{!2, !3} +; CHECK-NEXT: !3 = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-NEXT: !4 = distinct !{!4, !5, !3} +; CHECK-NEXT: !5 = !{!"llvm.loop.unroll.runtime.disable"} +!2 = distinct !{!2, !3, !4} +!3 = !{!"llvm.loop.vectorize.ivdep.enable", i1 true} +!4 = !{!"llvm.loop.vectorize.enable", i1 true}