Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -282,9 +282,12 @@ class LoopVectorizationLegality; class LoopVectorizationCostModel; class LoopVectorizationRequirements; +class VPBlendRecipe; class VPInterleaveRecipe; class VPReplicateRecipe; class VPWidenIntOrFpInductionRecipe; +class VPWidenRecipe; +class VPWidenMemoryInstructionRecipe; } // end anonymous namespace @@ -452,6 +455,10 @@ /// mask for the block BB. VectorParts createBlockInMask(BasicBlock *BB); + /// A helper function that computes the predicate of the edge between SRC + /// and DST. + VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst); + /// Vectorize a single PHINode in a block. This method handles the induction /// variable canonicalization. It supports both VF = 1 for unrolled loops and /// arbitrary length vectors. @@ -504,6 +511,13 @@ /// Try to vectorize the interleaved access group that \p Instr belongs to. void vectorizeInterleaveGroup(Instruction *Instr); + /// Vectorize Load and Store instructions, + virtual void vectorizeMemoryInstruction(Instruction *Instr); + + /// \brief Set the debug location in the builder using the debug location in + /// the instruction. + void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr); + protected: friend class LoopVectorizationPlanner; @@ -556,17 +570,10 @@ /// represented as. void truncateToMinimalBitwidths(); - /// A helper function that computes the predicate of the edge between SRC - /// and DST. - VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst); - /// Insert the new loop to the loop hierarchy and pass manager /// and update the analysis passes. void updateAnalysis(); - /// Vectorize Load and Store instructions, - virtual void vectorizeMemoryInstruction(Instruction *Instr); - /// Create a broadcast instruction. This method generates a broadcast /// instruction (shuffle) for loop invariant values and for the induction /// value. If this is the induction variable then we extend it to N, N+1, ... @@ -647,10 +654,6 @@ /// vector of instructions. void addMetadata(ArrayRef To, Instruction *From); - /// \brief Set the debug location in the builder using the debug location in - /// the instruction. - void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr); - /// The original loop. Loop *OrigLoop; @@ -2295,6 +2298,11 @@ /// to \p Range.End. VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range); + // Check if \I is a memory instruction to be widened for \p Range.Start and + // potentially masked. + VPWidenMemoryInstructionRecipe *tryToWidenMemory(Instruction *I, + VFRange &Range); + /// Check if an induction recipe should be constructed for \I within the given /// VF \p Range. If so build and return it. If not, return null. \p Range.End /// may be decreased to ensure same decision from \p Range.Start to @@ -2302,6 +2310,11 @@ VPWidenIntOrFpInductionRecipe *tryToOptimizeInduction(Instruction *I, VFRange &Range); + /// Handle non-loop phi nodes. Currently all such phi nodes are turned into + /// a sequence of select instructions as the vectorizer currently performs + /// full if-conversion. + VPBlendRecipe *tryToBlend(Instruction *I); + /// Check if \p I can be widened within the given VF \p Range. If \p I can be /// widened for \p Range.Start, check if the last recipe of \p VPBB can be /// extended to include \p I or else build a new VPWidenRecipe for it and @@ -4497,77 +4510,6 @@ } while (Changed); } -InnerLoopVectorizer::VectorParts -InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { - assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); - - // Look for cached value. - std::pair Edge(Src, Dst); - EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); - if (ECEntryIt != EdgeMaskCache.end()) - return ECEntryIt->second; - - VectorParts SrcMask = createBlockInMask(Src); - - // The terminator has to be a branch inst! - BranchInst *BI = dyn_cast(Src->getTerminator()); - assert(BI && "Unexpected terminator found"); - - if (!BI->isConditional()) - return EdgeMaskCache[Edge] = SrcMask; - - VectorParts EdgeMask(UF); - for (unsigned Part = 0; Part < UF; ++Part) { - auto *EdgeMaskPart = getOrCreateVectorValue(BI->getCondition(), Part); - if (BI->getSuccessor(0) != Dst) - EdgeMaskPart = Builder.CreateNot(EdgeMaskPart); - - if (SrcMask[Part]) // Otherwise block in-mask is all-one, no need to AND. - EdgeMaskPart = Builder.CreateAnd(EdgeMaskPart, SrcMask[Part]); - - EdgeMask[Part] = EdgeMaskPart; - } - - return EdgeMaskCache[Edge] = EdgeMask; -} - -InnerLoopVectorizer::VectorParts -InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { - assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); - - // Look for cached value. - BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); - if (BCEntryIt != BlockMaskCache.end()) - return BCEntryIt->second; - - // All-one mask is modelled as no-mask following the convention for masked - // load/store/gather/scatter. Initialize BlockMask to no-mask. - VectorParts BlockMask(UF); - for (unsigned Part = 0; Part < UF; ++Part) - BlockMask[Part] = nullptr; - - // Loop incoming mask is all-one. - if (OrigLoop->getHeader() == BB) - return BlockMaskCache[BB] = BlockMask; - - // This is the block mask. We OR all incoming edges. - for (auto *Predecessor : predecessors(BB)) { - VectorParts EdgeMask = createEdgeMask(Predecessor, BB); - if (!EdgeMask[0]) // Mask of predecessor is all-one so mask of block is too. - return BlockMaskCache[BB] = EdgeMask; - - if (!BlockMask[0]) { // BlockMask has its initialized nullptr value. - BlockMask = EdgeMask; - continue; - } - - for (unsigned Part = 0; Part < UF; ++Part) - BlockMask[Part] = Builder.CreateOr(BlockMask[Part], EdgeMask[Part]); - } - - return BlockMaskCache[BB] = BlockMask; -} - void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF) { PHINode *P = cast(PN); @@ -4588,43 +4530,6 @@ } setDebugLocFromInst(Builder, P); - // Check for PHI nodes that are lowered to vector selects. - if (P->getParent() != OrigLoop->getHeader()) { - // We know that all PHIs in non-header blocks are converted into - // selects, so we don't have to worry about the insertion order and we - // can just use the builder. - // At this point we generate the predication tree. There may be - // duplications since this is a simple recursive scan, but future - // optimizations will clean it up. - - unsigned NumIncoming = P->getNumIncomingValues(); - - // Generate a sequence of selects of the form: - // SELECT(Mask3, In3, - // SELECT(Mask2, In2, - // ( ...))) - VectorParts Entry(UF); - for (unsigned In = 0; In < NumIncoming; In++) { - VectorParts Cond = - createEdgeMask(P->getIncomingBlock(In), P->getParent()); - - for (unsigned Part = 0; Part < UF; ++Part) { - Value *In0 = getOrCreateVectorValue(P->getIncomingValue(In), Part); - assert((Cond[Part] || NumIncoming == 1) && - "Multiple predecessors with one predecessor having a full mask"); - if (In == 0) - Entry[Part] = In0; // Initialize with the first incoming value. - else - // Select between the current value and the previous incoming edge - // based on the incoming mask. - Entry[Part] = Builder.CreateSelect(Cond[Part], In0, Entry[Part], - "predphi"); - } - } - for (unsigned Part = 0; Part < UF; ++Part) - VectorLoopValueMap.setVectorValue(P, Part, Entry[Part]); - return; - } // This PHINode must be an induction variable. // Make sure that we know about it. @@ -4848,10 +4753,6 @@ break; } - case Instruction::Store: - case Instruction::Load: - vectorizeMemoryInstruction(&I); - break; case Instruction::ZExt: case Instruction::SExt: case Instruction::FPToUI: @@ -4956,7 +4857,7 @@ } default: - // All other instructions are scalarized. + // This instruction is not vectorized by simple widening. DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); llvm_unreachable("Unhandled instruction!"); } // end of switch. @@ -7827,6 +7728,82 @@ } }; +/// A recipe for vectorizing a phi-node as a sequence of mask-based select +/// instructions. +class VPBlendRecipe : public VPRecipeBase { +private: + PHINode *Phi; + +public: + VPBlendRecipe(PHINode *Phi) : VPRecipeBase(VPBlendSC), Phi(Phi) {} + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *V) { + return V->getVPRecipeID() == VPRecipeBase::VPBlendSC; + } + + /// Generate the phi/select nodes. + void execute(VPTransformState &State) override { + State.ILV->setDebugLocFromInst(State.Builder, Phi); + // We know that all PHIs in non-header blocks are converted into + // selects, so we don't have to worry about the insertion order and we + // can just use the builder. + // At this point we generate the predication tree. There may be + // duplications since this is a simple recursive scan, but future + // optimizations will clean it up. + + unsigned NumIncoming = Phi->getNumIncomingValues(); + + // Generate a sequence of selects of the form: + // SELECT(Mask3, In3, + // SELECT(Mask2, In2, + // ( ...))) + InnerLoopVectorizer::VectorParts Entry(State.UF); + for (unsigned In = 0; In < NumIncoming; In++) { + InnerLoopVectorizer::VectorParts Cond = + State.ILV->createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent()); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *In0 = + State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part); + assert((Cond[Part] || NumIncoming == 1) && + "Multiple predecessors with one predecessor having a full mask"); + if (In == 0) + Entry[Part] = In0; // Initialize with the first incoming value. + else + // Select between the current value and the previous incoming edge + // based on the incoming mask. + Entry[Part] = State.Builder.CreateSelect(Cond[Part], In0, Entry[Part], + "predphi"); + } + } + for (unsigned Part = 0; Part < State.UF; ++Part) + State.ValueMap.setVectorValue(Phi, Part, Entry[Part]); + } + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent) const override { + O << " +\n" << Indent << "\"BLEND "; + Phi->printAsOperand(O, false); + O << " ="; + if (Phi->getNumIncomingValues() == 1) { + // Not a User of any mask: not really blending, this is a + // single-predecessor phi. + O << " "; + Phi->getIncomingValue(0)->printAsOperand(O, false); + } else { + for (unsigned I = 0, E = Phi->getNumIncomingValues(); I < E; ++I) { + O << " "; + Phi->getIncomingValue(I)->printAsOperand(O, false); + O << "/"; + Phi->getIncomingBlock(I)->printAsOperand(O, false); + } + } + O << "\\l\""; + + } +}; + /// VPInterleaveRecipe is a recipe for transforming an interleave group of load /// or stores into one wide load/store and shuffles. class VPInterleaveRecipe : public VPRecipeBase { @@ -7970,6 +7947,31 @@ } }; +/// A Recipe for widening load/store operations. +class VPWidenMemoryInstructionRecipe : public VPRecipeBase { +private: + Instruction &Instr; + +public: + VPWidenMemoryInstructionRecipe(Instruction &Instr) + : VPRecipeBase(VPWidenMemoryInstructionSC), Instr(Instr) {} + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *V) { + return V->getVPRecipeID() == VPRecipeBase::VPWidenMemoryInstructionSC; + } + + /// Generate the wide load/store. + void execute(VPTransformState &State) override { + State.ILV->vectorizeMemoryInstruction(&Instr); + } + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent) const override { + O << " +\n" << Indent << "\"WIDEN " << VPlanIngredient(&Instr); + O << "\\l\""; + } +}; } // end anonymous namespace bool LoopVectorizationPlanner::getDecisionAndClampRange( @@ -7999,6 +8001,77 @@ } } +InnerLoopVectorizer::VectorParts +InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { + assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); + + // Look for cached value. + std::pair Edge(Src, Dst); + EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); + if (ECEntryIt != EdgeMaskCache.end()) + return ECEntryIt->second; + + VectorParts SrcMask = createBlockInMask(Src); + + // The terminator has to be a branch inst! + BranchInst *BI = dyn_cast(Src->getTerminator()); + assert(BI && "Unexpected terminator found"); + + if (!BI->isConditional()) + return EdgeMaskCache[Edge] = SrcMask; + + VectorParts EdgeMask(UF); + for (unsigned Part = 0; Part < UF; ++Part) { + auto *EdgeMaskPart = getOrCreateVectorValue(BI->getCondition(), Part); + if (BI->getSuccessor(0) != Dst) + EdgeMaskPart = Builder.CreateNot(EdgeMaskPart); + + if (SrcMask[Part]) // Otherwise block in-mask is all-one, no need to AND. + EdgeMaskPart = Builder.CreateAnd(EdgeMaskPart, SrcMask[Part]); + + EdgeMask[Part] = EdgeMaskPart; + } + + return EdgeMaskCache[Edge] = EdgeMask; +} + +InnerLoopVectorizer::VectorParts +InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { + assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); + + // Look for cached value. + BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); + if (BCEntryIt != BlockMaskCache.end()) + return BCEntryIt->second; + + // All-one mask is modelled as no-mask following the convention for masked + // load/store/gather/scatter. Initialize BlockMask to no-mask. + VectorParts BlockMask(UF); + for (unsigned Part = 0; Part < UF; ++Part) + BlockMask[Part] = nullptr; + + // Loop incoming mask is all-one. + if (OrigLoop->getHeader() == BB) + return BlockMaskCache[BB] = BlockMask; + + // This is the block mask. We OR all incoming edges. + for (auto *Predecessor : predecessors(BB)) { + VectorParts EdgeMask = createEdgeMask(Predecessor, BB); + if (!EdgeMask[0]) // Mask of predecessor is all-one so mask of block is too. + return BlockMaskCache[BB] = EdgeMask; + + if (!BlockMask[0]) { // BlockMask has its initialized nullptr value. + BlockMask = EdgeMask; + continue; + } + + for (unsigned Part = 0; Part < UF; ++Part) + BlockMask[Part] = Builder.CreateOr(BlockMask[Part], EdgeMask[Part]); + } + + return BlockMaskCache[BB] = BlockMask; +} + VPInterleaveRecipe * LoopVectorizationPlanner::tryToInterleaveMemory(Instruction *I, VFRange &Range) { @@ -8026,6 +8099,32 @@ return new VPInterleaveRecipe(IG); } +VPWidenMemoryInstructionRecipe * +LoopVectorizationPlanner::tryToWidenMemory(Instruction *I, VFRange &Range) { + if (!isa(I) && !isa(I)) + return nullptr; + + auto willWiden = [&](unsigned VF) -> bool { + if (VF == 1) + return false; + if (CM.isScalarAfterVectorization(I, VF) || + CM.isProfitableToScalarize(I, VF)) + return false; + LoopVectorizationCostModel::InstWidening Decision = + CM.getWideningDecision(I, VF); + assert(Decision != LoopVectorizationCostModel::CM_Unknown && + "CM decision should be taken at this point."); + assert(Decision != LoopVectorizationCostModel::CM_Interleave && + "Interleave memory opportunity should be caught earlier."); + return Decision != LoopVectorizationCostModel::CM_Scalarize; + }; + + if (!getDecisionAndClampRange(willWiden, Range)) + return nullptr; + + return new VPWidenMemoryInstructionRecipe(*I); +} + VPWidenIntOrFpInductionRecipe * LoopVectorizationPlanner::tryToOptimizeInduction(Instruction *I, VFRange &Range) { @@ -8060,6 +8159,14 @@ return nullptr; } +VPBlendRecipe *LoopVectorizationPlanner::tryToBlend(Instruction *I) { + PHINode *Phi = dyn_cast(I); + if (!Phi || Phi->getParent() == OrigLoop->getHeader()) + return nullptr; + + return new VPBlendRecipe(Phi); +} + bool LoopVectorizationPlanner::tryToWiden(Instruction *I, VPBasicBlock *VPBB, VFRange &Range) { if (Legal->isScalarWithPredication(I)) @@ -8313,11 +8420,21 @@ continue; } + // Check if Instr is a memory operation that should be widened. + if ((Recipe = tryToWidenMemory(Instr, Range))) { + VPBB->appendRecipe(Recipe); + continue; + } + // Check if Instr should form some PHI recipe. if ((Recipe = tryToOptimizeInduction(Instr, Range))) { VPBB->appendRecipe(Recipe); continue; } + if ((Recipe = tryToBlend(Instr))) { + VPBB->appendRecipe(Recipe); + continue; + } if (PHINode *Phi = dyn_cast(Instr)) { VPBB->appendRecipe(new VPWidenPHIRecipe(Phi)); continue; Index: llvm/trunk/lib/Transforms/Vectorize/VPlan.h =================================================================== --- llvm/trunk/lib/Transforms/Vectorize/VPlan.h +++ llvm/trunk/lib/Transforms/Vectorize/VPlan.h @@ -452,11 +452,13 @@ /// SubclassID field of the VPRecipeBase objects. They are used for concrete /// type identification. using VPRecipeTy = enum { + VPBlendSC, VPBranchOnMaskSC, VPInterleaveSC, VPPredInstPHISC, VPReplicateSC, VPWidenIntOrFpInductionSC, + VPWidenMemoryInstructionSC, VPWidenPHISC, VPWidenSC, }; Index: llvm/trunk/test/Transforms/LoopVectorize/X86/x86-predication.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/X86/x86-predication.ll +++ llvm/trunk/test/Transforms/LoopVectorize/X86/x86-predication.ll @@ -1,4 +1,5 @@ ; RUN: opt < %s -mattr=avx -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -simplifycfg -S | FileCheck %s +; RUN: opt -mcpu=skylake-avx512 -S -force-vector-width=8 -force-vector-interleave=1 -loop-vectorize < %s | FileCheck %s --check-prefix=SINK-GATHER target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" @@ -58,3 +59,40 @@ %tmp8 = phi i32 [ %tmp7, %for.inc ] ret i32 %tmp8 } + +; This test ensures that a load, which would have been widened otherwise is +; instead scalarized if Cost-Model so decided as part of its +; sink-scalar-operands optimization for predicated instructions. +; +; SINK-GATHER: vector.body: +; SINK-GATHER: pred.udiv.if: +; SINK-GATHER: %[[T0:.+]] = load i32, i32* %{{.*}}, align 4 +; SINK-GATHER: %{{.*}} = udiv i32 %[[T0]], %{{.*}} +; SINK-GATHER: pred.udiv.continue: +define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ] + %r = phi i32 [ 0, %entry ], [ %tmp6, %for.inc ] + %i7 = mul i64 %i, 777 + br i1 %c, label %if.then, label %for.inc + +if.then: + %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i7 + %tmp2 = load i32, i32* %tmp0, align 4 + %tmp4 = udiv i32 %tmp2, %x + br label %for.inc + +for.inc: + %tmp5 = phi i32 [ %x, %for.body ], [ %tmp4, %if.then] + %tmp6 = add i32 %r, %tmp5 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + %tmp7 = phi i32 [ %tmp6, %for.inc ] + ret i32 %tmp7 +}