Index: include/llvm/Transforms/Utils/LoopVersioning.h =================================================================== --- include/llvm/Transforms/Utils/LoopVersioning.h +++ include/llvm/Transforms/Utils/LoopVersioning.h @@ -80,8 +80,23 @@ /// \brief Annotate memory instructions in the versioned loop with no-alias /// metadata based on the memchecks issued. + /// + /// This is just a wrapper that calls prepareNoAliasMetadata and + /// annotateInstWithNoAlias on the instructions of the versioned loop. void annotateLoopWithNoAlias(); + /// \brief Set up the aliasing scopes based on the memchecks. This needs to + /// be called before the first call to annotateInstWithNoAlias. + void prepareNoAliasMetadata(); + + /// \brief Add the noalias annotations to \p VersionedInst. + /// + /// \p OrigInst is the instruction corresponding to \p VersionedInst in the + /// original loop. Initialize the aliasing scopes with + /// prepareNoAliasMetadata once before this can be called. + void annotateInstWithNoAlias(Instruction *VersionedInst, + const Instruction *OrigInst); + private: /// \brief Adds the necessary PHI nodes for the versioned loops based on the /// loop-defined values used outside of the loop. @@ -90,13 +105,11 @@ /// that are used outside the loop. void addPHINodes(const SmallVectorImpl<Instruction *> &DefsUsedOutside); - /// \brief Set up the aliasing scopes based on the memchecks. This needs to - /// be called before the first call to annotateInstWithNoAlias. - void prepareNoAliasMetadata(); - /// \brief Add the noalias annotations to \p I. Initialize the aliasing /// scopes with prepareNoAliasMetadata once before this can be called. - void annotateInstWithNoAlias(Instruction *I); + void annotateInstWithNoAlias(Instruction *I) { + annotateInstWithNoAlias(I, I); + } /// \brief The original loop. This becomes the "versioned" one. I.e., /// control flows here if pointers in the loop don't alias. 
Index: lib/Transforms/Utils/LoopVersioning.cpp =================================================================== --- lib/Transforms/Utils/LoopVersioning.cpp +++ lib/Transforms/Utils/LoopVersioning.cpp @@ -207,29 +207,33 @@ } } -void LoopVersioning::annotateInstWithNoAlias(Instruction *I) { +void LoopVersioning::annotateInstWithNoAlias(Instruction *VersionedInst, + const Instruction *OrigInst) { if (!AnnotateNoAlias) return; LLVMContext &Context = VersionedLoop->getHeader()->getContext(); - Value *Ptr = isa<LoadInst>(I) ? cast<LoadInst>(I)->getPointerOperand() - : cast<StoreInst>(I)->getPointerOperand(); + const Value *Ptr = isa<LoadInst>(OrigInst) + ? cast<LoadInst>(OrigInst)->getPointerOperand() + : cast<StoreInst>(OrigInst)->getPointerOperand(); // Find the group for the pointer and then add the scope metadata. auto Group = PtrToGroup.find(Ptr); if (Group != PtrToGroup.end()) { - I->setMetadata( + VersionedInst->setMetadata( LLVMContext::MD_alias_scope, - MDNode::concatenate(I->getMetadata(LLVMContext::MD_alias_scope), - MDNode::get(Context, GroupToScope[Group->second]))); + MDNode::concatenate( + VersionedInst->getMetadata(LLVMContext::MD_alias_scope), + MDNode::get(Context, GroupToScope[Group->second]))); // Add the no-alias metadata. 
auto NonAliasingScopeList = GroupToNonAliasingScopeList.find(Group->second); if (NonAliasingScopeList != GroupToNonAliasingScopeList.end()) - I->setMetadata( + VersionedInst->setMetadata( LLVMContext::MD_noalias, - MDNode::concatenate(I->getMetadata(LLVMContext::MD_noalias), - NonAliasingScopeList->second)); + MDNode::concatenate( + VersionedInst->getMetadata(LLVMContext::MD_noalias), + NonAliasingScopeList->second)); } } Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -98,6 +98,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopVersioning.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include <algorithm> @@ -445,6 +446,24 @@ /// Emit bypass checks to check any memory assumptions we may have made. void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); + /// Add additional metadata to \p To that was not present on \p Orig. + /// + /// Currently this is used to add the noalias annotations based on the + /// inserted memchecks. Use this for instructions that are *cloned* into the + /// vector loop. + void addNewMetadata(Instruction *To, const Instruction *Orig); + + /// Add metadata from one instruction to another. + /// + /// This includes both the original MDs from \p From and additional ones (\see + /// addNewMetadata). Use this for *newly created* instructions in the vector + /// loop. + void addMetadata(Instruction *To, const Instruction *From); + + /// \brief Similar to the previous function but it adds the metadata to a + /// vector of instructions. + void addMetadata(SmallVectorImpl<Value *> &To, const Instruction *From); + /// This is a helper class that holds the vectorizer state. It maps scalar instructions to vector instructions. 
When the code is 'unrolled' then /// then a single scalar value is mapped to multiple vector parts. The parts @@ -502,6 +521,13 @@ /// Target Transform Info. const TargetTransformInfo *TTI; + /// \brief LoopVersioning. It's only set up (non-null) if memchecks were + /// used. + /// + /// This is currently only used to add no-alias metadata based on the + /// memchecks. The actual versioning is performed manually. + std::unique_ptr<LoopVersioning> LVer; + /// The vectorization SIMD factor to use. Each vector will have this many /// vector elements. unsigned VF; @@ -642,12 +668,25 @@ } } -/// \brief Propagate known metadata from one instruction to a vector of others. -static void propagateMetadata(SmallVectorImpl<Value *> &To, - const Instruction *From) { +void InnerLoopVectorizer::addNewMetadata(Instruction *To, + const Instruction *Orig) { + // If the loop was versioned with memchecks, add the corresponding no-alias + // metadata. + if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig))) + LVer->annotateInstWithNoAlias(To, Orig); +} + +void InnerLoopVectorizer::addMetadata(Instruction *To, + const Instruction *From) { + propagateMetadata(To, From); + addNewMetadata(To, From); +} + +void InnerLoopVectorizer::addMetadata(SmallVectorImpl<Value *> &To, + const Instruction *From) { for (Value *V : To) if (Instruction *I = dyn_cast<Instruction>(V)) - propagateMetadata(I, From); + addMetadata(I, From); } /// \brief The group of interleaved loads/stores sharing the same stride and @@ -2305,7 +2344,7 @@ Group->isReverse() ? 
reverseVector(StridedVec) : StridedVec; } - propagateMetadata(NewLoadInstr, Instr); + addMetadata(NewLoadInstr, Instr); } return; } @@ -2344,7 +2383,7 @@ Instruction *NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment()); - propagateMetadata(NewStoreInstr, Instr); + addMetadata(NewStoreInstr, Instr); } } @@ -2479,7 +2518,7 @@ Mask[Part]); else NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment); - propagateMetadata(NewSI, SI); + addMetadata(NewSI, SI); } return; } @@ -2509,7 +2548,7 @@ "wide.masked.load"); else NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load"); - propagateMetadata(NewLI, LI); + addMetadata(NewLI, LI); Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI; } } @@ -2592,6 +2631,7 @@ Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width)); Cloned->setOperand(op, Op); } + addNewMetadata(Cloned, Instr); // Place the cloned scalar in the new loop. Builder.Insert(Cloned); @@ -2812,6 +2852,12 @@ BranchInst::Create(Bypass, NewBB, MemRuntimeCheck)); LoopBypassBlocks.push_back(BB); AddedSafetyChecks = true; + + // We currently don't use LoopVersioning for the actual loop cloning but we + // still use it to add the noalias metadata. + LVer = llvm::make_unique(*Legal->getLAI(), OrigLoop, LI, DT, + PSE.getSE()); + LVer->prepareNoAliasMetadata(); } @@ -3791,7 +3837,7 @@ Entry[Part] = V; } - propagateMetadata(Entry, &*it); + addMetadata(Entry, &*it); break; } case Instruction::Select: { @@ -3821,7 +3867,7 @@ Op1[Part]); } - propagateMetadata(Entry, &*it); + addMetadata(Entry, &*it); break; } @@ -3844,7 +3890,7 @@ Entry[Part] = C; } - propagateMetadata(Entry, &*it); + addMetadata(Entry, &*it); break; } @@ -3881,7 +3927,7 @@ CI->getType(), II.getStepValue()->getSExtValue()); for (unsigned Part = 0; Part < UF; ++Part) Entry[Part] = getStepVector(Broadcasted, VF * Part, Step); - propagateMetadata(Entry, &*it); + addMetadata(Entry, &*it); break; } /// Vectorize casts. 
@@ -3891,7 +3937,7 @@ VectorParts &A = getVectorValue(it->getOperand(0)); for (unsigned Part = 0; Part < UF; ++Part) Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy); - propagateMetadata(Entry, &*it); + addMetadata(Entry, &*it); break; } @@ -3967,7 +4013,7 @@ Entry[Part] = Builder.CreateCall(VectorF, Args); } - propagateMetadata(Entry, &*it); + addMetadata(Entry, &*it); break; } Index: test/Transforms/LoopVectorize/noalias-md-licm.ll =================================================================== --- /dev/null +++ test/Transforms/LoopVectorize/noalias-md-licm.ll @@ -0,0 +1,59 @@ +; RUN: opt -basicaa -scoped-noalias -loop-vectorize -licm -force-vector-width=2 \ +; RUN: -force-vector-interleave=1 -S < %s | FileCheck %s + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + +; In order to vectorize the inner loop, it needs to be versioned with +; memchecks between {A} x {B, C} first: +; +; for (i = 0; i < n; i++) +; for (j = 0; j < m; j++) +; A[j] += B[i] + C[j]; +; +; Since in the versioned vector loop A and B can no longer alias, B[i] can be +; LICM'ed from the inner loop. 
+ + +define void @f(i32* %a, i32* %b, i32* %c) { +entry: + br label %outer + +outer: + %i.2 = phi i64 [ 0, %entry ], [ %i, %inner.end ] + %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %i.2 + br label %inner.ph + +inner.ph: +; CHECK: vector.ph: +; CHECK: load i32, i32* %arrayidxB, +; CHECK: br label %vector.body + br label %inner + +inner: + %j.2 = phi i64 [ 0, %inner.ph ], [ %j, %inner ] + + %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %j.2 + %loadA = load i32, i32* %arrayidxA, align 4 + + %loadB = load i32, i32* %arrayidxB, align 4 + + %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %j.2 + %loadC = load i32, i32* %arrayidxC, align 4 + + %add = add nuw i32 %loadA, %loadB + %add2 = add nuw i32 %add, %loadC + + store i32 %add2, i32* %arrayidxA, align 4 + + %j = add nuw nsw i64 %j.2, 1 + %cond1 = icmp eq i64 %j, 20 + br i1 %cond1, label %inner.end, label %inner + +inner.end: + %i = add nuw nsw i64 %i.2, 1 + %cond2 = icmp eq i64 %i, 30 + br i1 %cond2, label %outer.end, label %outer + +outer.end: + ret void +} Index: test/Transforms/LoopVectorize/noalias-md.ll =================================================================== --- /dev/null +++ test/Transforms/LoopVectorize/noalias-md.ll @@ -0,0 +1,78 @@ +; RUN: opt -basicaa -loop-vectorize -force-vector-width=2 \ +; RUN: -force-vector-interleave=1 -S < %s \ +; RUN: | FileCheck %s -check-prefix=BOTH -check-prefix=LV +; RUN: opt -basicaa -scoped-noalias -loop-vectorize -dse -force-vector-width=2 \ +; RUN: -force-vector-interleave=1 -S < %s \ +; RUN: | FileCheck %s -check-prefix=BOTH -check-prefix=DSE + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + +; This loop needs to be versioned with memchecks between {A, B} x {C} before +; it can be vectorized. +; +; for (i = 0; i < n; i++) { +; C[i] = A[i] + 1; +; C[i] += B[i]; +; } +; +; Check that the corresponding noalias metadata is added to the vector loop +; but not to the scalar loop. 
+; +; Since in the versioned vector loop C and B can no longer alias, the first +; store to C[i] can be DSE'd. + + +define void @f(i32* %a, i32* %b, i32* %c) { +entry: + br label %for.body + +; BOTH: vector.memcheck: +; BOTH: vector.body: +for.body: ; preds = %for.body, %entry + %ind = phi i64 [ 0, %entry ], [ %inc, %for.body ] + + %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind +; Scope 1 +; LV: = load {{.*}} !alias.scope !0 + %loadA = load i32, i32* %arrayidxA, align 4 + + %add = add nuw i32 %loadA, 2 + + %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind +; Noalias with scope 1 and 6 +; LV: store {{.*}} !alias.scope !3, !noalias !5 +; DSE-NOT: store + store i32 %add, i32* %arrayidxC, align 4 + + %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind +; Scope 6 +; LV: = load {{.*}} !alias.scope !7 + %loadB = load i32, i32* %arrayidxB, align 4 + + %add2 = add nuw i32 %add, %loadB + +; Noalias with scope 1 and 6 +; LV: store {{.*}} !alias.scope !3, !noalias !5 +; DSE: store + store i32 %add2, i32* %arrayidxC, align 4 + + %inc = add nuw nsw i64 %ind, 1 + %exitcond = icmp eq i64 %inc, 20 + br i1 %exitcond, label %for.end, label %for.body + +; BOTH: for.body: +; BOTH-NOT: !alias.scope +; BOTH-NOT: !noalias + +for.end: ; preds = %for.body + ret void +} + +; LV: !0 = !{!1} +; LV: !1 = distinct !{!1, !2} +; LV: !2 = distinct !{!2, !"LVerDomain"} +; LV: !3 = !{!4} +; LV: !4 = distinct !{!4, !2} +; LV: !5 = !{!1, !6} +; LV: !6 = distinct !{!6, !2} +; LV: !7 = !{!6}