Index: llvm/trunk/include/llvm/Analysis/VectorUtils.h
===================================================================
--- llvm/trunk/include/llvm/Analysis/VectorUtils.h
+++ llvm/trunk/include/llvm/Analysis/VectorUtils.h
@@ -345,20 +345,29 @@
                         const LoopAccessInfo *LAI)
       : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {}
 
-  ~InterleavedAccessInfo() {
+  ~InterleavedAccessInfo() { reset(); }
+
+  /// Analyze the interleaved accesses and collect them in interleave
+  /// groups. Substitute symbolic strides using \p Strides.
+  /// Consider also predicated loads/stores in the analysis if
+  /// \p EnableMaskedInterleavedGroup is true.
+  void analyzeInterleaving(bool EnableMaskedInterleavedGroup);
+
+  /// Invalidate groups, e.g., in case all blocks in the loop will be
+  /// predicated, contrary to the original assumption. Although we currently
+  /// prevent group formation for predicated accesses, we may be able to relax
+  /// this limitation in the future once we handle more complicated blocks.
+  void reset() {
     SmallPtrSet<InterleaveGroup *, 4> DelSet;
     // Avoid releasing a pointer twice.
     for (auto &I : InterleaveGroupMap)
       DelSet.insert(I.second);
     for (auto *Ptr : DelSet)
       delete Ptr;
+    InterleaveGroupMap.clear();
+    RequiresScalarEpilogue = false;
   }
 
-  /// Analyze the interleaved accesses and collect them in interleave
-  /// groups. Substitute symbolic strides using \p Strides.
-  /// Consider also predicated loads/stores in the analysis if
-  /// \p EnableMaskedInterleavedGroup is true.
-  void analyzeInterleaving(bool EnableMaskedInterleavedGroup);
 
   /// Check if \p Instr belongs to any interleave group.
   bool isInterleaved(Instruction *Instr) const {
Index: llvm/trunk/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
===================================================================
--- llvm/trunk/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ llvm/trunk/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -241,6 +241,10 @@
   /// If false, good old LV code.
   bool canVectorize(bool UseVPlanNativePath);
 
+  /// Return true if we can vectorize this loop while folding its tail by
+  /// masking.
+  bool canFoldTailByMasking();
+
   /// Returns the primary induction variable.
   PHINode *getPrimaryInduction() { return PrimaryInduction; }
 
Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1134,4 +1134,59 @@
   return Result;
 }
 
+bool LoopVectorizationLegality::canFoldTailByMasking() {
+
+  LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");
+
+  if (!PrimaryInduction) {
+    ORE->emit(createMissedAnalysis("NoPrimaryInduction")
+              << "Missing a primary induction variable in the loop, which is "
+              << "needed in order to fold tail by masking as required.");
+    LLVM_DEBUG(dbgs() << "LV: No primary induction, cannot fold tail by "
+                      << "masking.\n");
+    return false;
+  }
+
+  // TODO: handle reductions when tail is folded by masking.
+  if (!Reductions.empty()) {
+    ORE->emit(createMissedAnalysis("ReductionFoldingTailByMasking")
+              << "Cannot fold tail by masking in the presence of reductions.");
+    LLVM_DEBUG(dbgs() << "LV: Loop has reductions, cannot fold tail by "
+                      << "masking.\n");
+    return false;
+  }
+
+  // TODO: handle outside users when tail is folded by masking.
+  for (auto *AE : AllowedExit) {
+    // Check that all users of allowed exit values are inside the loop.
+    for (User *U : AE->users()) {
+      Instruction *UI = cast<Instruction>(U);
+      if (TheLoop->contains(UI))
+        continue;
+      ORE->emit(createMissedAnalysis("LiveOutFoldingTailByMasking")
+                << "Cannot fold tail by masking in the presence of live outs.");
+      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking, loop has an "
+                        << "outside user for : " << *UI << '\n');
+      return false;
+    }
+  }
+
+  // The list of pointers that we can safely read and write to remains empty.
+  SmallPtrSet<Value *, 8> SafePointers;
+
+  // Check and mark all blocks for predication, including those that ordinarily
+  // do not need predication such as the header block.
+  for (BasicBlock *BB : TheLoop->blocks()) {
+    if (!blockCanBePredicated(BB, SafePointers)) {
+      ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
+                << "control flow cannot be substituted for a select");
+      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking as required.\n");
+      return false;
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");
+  return true;
+}
+
 } // namespace llvm
Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1105,7 +1105,7 @@
   // through scalar predication or masked load/store or masked gather/scatter.
   // Superset of instructions that return true for isScalarWithPredication.
   bool isPredicatedInst(Instruction *I) {
-    if (!Legal->blockNeedsPredication(I->getParent()))
+    if (!blockNeedsPredication(I->getParent()))
       return false;
     // Loads and stores that need some form of masked operation are predicated
     // instructions.
@@ -1139,6 +1139,13 @@
     return InterleaveInfo.requiresScalarEpilogue();
   }
 
+  /// Returns true if all loop blocks should be masked to fold tail loop.
+  bool foldTailByMasking() const { return FoldTailByMasking; }
+
+  bool blockNeedsPredication(BasicBlock *BB) {
+    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
+  }
+
 private:
   unsigned NumPredStores = 0;
 
@@ -1222,6 +1229,9 @@
   /// vectorization as a predicated block.
   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
 
+  /// All blocks of loop are to be masked to fold tail of scalar iterations.
+  bool FoldTailByMasking = false;
+
   /// A map holding scalar costs for different vectorization factors. The
   /// presence of a cost for an instruction in the mapping indicates that the
   /// instruction will be scalarized when vectorizing with the associated
@@ -2339,6 +2349,7 @@
   if (TripCount)
     return TripCount;
 
+  assert(L && "Create Trip Count for null loop.");
   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
   // Find the loop boundaries.
   ScalarEvolution *SE = PSE.getSE();
@@ -2388,12 +2399,26 @@
   Value *TC = getOrCreateTripCount(L);
   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
 
+  Type *Ty = TC->getType();
+  Constant *Step = ConstantInt::get(Ty, VF * UF);
+
+  // If the tail is to be folded by masking, round the number of iterations N
+  // up to a multiple of Step instead of rounding down. This is done by first
+  // adding Step-1 and then rounding down. Note that it's ok if this addition
+  // overflows: the vector induction variable will eventually wrap to zero given
+  // that it starts at zero and its Step is a power of two; the loop will then
+  // exit, with the last early-exit vector comparison also producing all-true.
+  if (Cost->foldTailByMasking()) {
+    assert(isPowerOf2_32(VF * UF) &&
+           "VF*UF must be a power of 2 when folding tail by masking");
+    TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
+  }
+
   // Now we need to generate the expression for the part of the loop that the
   // vectorized body will execute. This is equal to N - (N % Step) if scalar
   // iterations are not required for correctness, or N - Step, otherwise. Step
   // is equal to the vectorization factor (number of SIMD elements) times the
   // unroll factor (number of SIMD instructions).
-  Constant *Step = ConstantInt::get(TC->getType(), VF * UF);
   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
 
   // If there is a non-reversed interleaved group that may speculatively access
@@ -2456,8 +2481,13 @@
   // of zero. In this case we will also jump to the scalar loop.
   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
-  Value *CheckMinIters = Builder.CreateICmp(
-      P, Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check");
+
+  // If tail is to be folded, vector loop takes care of all iterations.
+  Value *CheckMinIters = Builder.getFalse();
+  if (!Cost->foldTailByMasking())
+    CheckMinIters = Builder.CreateICmp(
+        P, Count, ConstantInt::get(Count->getType(), VF * UF),
+        "min.iters.check");
 
   BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
   // Update dominator tree immediately if the generated block is a
@@ -2486,6 +2516,7 @@
   if (C->isZero())
     return;
 
+  assert(!Cost->foldTailByMasking() && "Cannot check stride when folding tail");
   // Create a new block containing the stride check.
   BB->setName("vector.scevcheck");
   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
@@ -2518,6 +2549,7 @@
   if (!MemRuntimeCheck)
     return;
 
+  assert(!Cost->foldTailByMasking() && "Cannot check memory when folding tail");
   // Create a new block containing the memory check.
   BB->setName("vector.memcheck");
   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
@@ -2786,9 +2818,12 @@
   // Add a check in the middle block to see if we have completed
   // all of the iterations in the first vector loop.
   // If (N - N%VF) == N, then we *don't* need to run the remainder.
-  Value *CmpN =
-      CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
-                      CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
+  // If tail is to be folded, we know we don't need to run the remainder.
+  Value *CmpN = Builder.getTrue();
+  if (!Cost->foldTailByMasking())
+    CmpN =
+        CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
+                        CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
 
   ReplaceInstWithInst(MiddleBlock->getTerminator(),
                       BranchInst::Create(ExitBlock, ScalarPH, CmpN));
@@ -4262,7 +4297,7 @@
 }
 
 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
-  if (!Legal->blockNeedsPredication(I->getParent()))
+  if (!blockNeedsPredication(I->getParent()))
     return false;
   switch(I->getOpcode()) {
   default:
@@ -4564,36 +4599,36 @@
     return None;
   }
 
-  // If we don't know the precise trip count, don't try to vectorize.
+  unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
+
+  if (TC > 0 && TC % MaxVF == 0) {
+    LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
+    return MaxVF;
+  }
+
+  // If we don't know the precise trip count, or if the trip count that we
+  // found modulo the vectorization factor is not zero, try to fold the tail
+  // by masking.
+  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
+  // FIXME: return None if loop requiresScalarEpilog(), or look for a
+  // smaller MaxVF that does not require a scalar epilog.
+  if (Legal->canFoldTailByMasking()) {
+    FoldTailByMasking = true;
+    return MaxVF;
+  }
+
   if (TC == 0) {
     ORE->emit(
         createMissedAnalysis("UnknownLoopCountComplexCFG")
         << "unable to calculate the loop count due to complex control flow");
-    LLVM_DEBUG(
-        dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
-    return None;
-  }
-
-  unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
-
-  if (TC % MaxVF != 0) {
-    // If the trip count that we found modulo the vectorization factor is not
-    // zero then we require a tail.
-    // FIXME: look for a smaller MaxVF that does divide TC rather than give up.
-    // FIXME: return None if loop requiresScalarEpilog(), or look for a
-    // smaller MaxVF that does not require a scalar epilog.
-
-    ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
-              << "cannot optimize for size and vectorize at the "
-                 "same time. Enable vectorization of this loop "
-                 "with '#pragma clang loop vectorize(enable)' "
-                 "when compiling with -Os/-Oz");
-    LLVM_DEBUG(
-        dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
     return None;
   }
 
-  return MaxVF;
+  ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
+            << "cannot optimize for size and vectorize at the same time. "
+               "Enable vectorization of this loop with '#pragma clang loop "
+               "vectorize(enable)' when compiling with -Os/-Oz");
+  return None;
 }
 
 unsigned
@@ -4831,6 +4866,9 @@
   // fit without causing spills. All of this is rounded down if necessary to be
   // a power of two. We want power of two interleave count to simplify any
   // addressing operations or alignment considerations.
+  // We also want power of two interleave counts to ensure that the induction
+  // variable of the vector loop wraps to zero, when tail is folded by masking;
+  // this currently happens when OptForSize, in which case IC is set to 1 above.
   unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
                               R.MaxLocalUsers);
 
@@ -5117,7 +5155,7 @@
   // determine if it would be better to not if-convert the blocks they are in.
   // If so, we also record the instructions to scalarize.
   for (BasicBlock *BB : TheLoop->blocks()) {
-    if (!Legal->blockNeedsPredication(BB))
+    if (!blockNeedsPredication(BB))
       continue;
     for (Instruction &I : *BB)
       if (isScalarWithPredication(&I)) {
@@ -5282,7 +5320,7 @@
   // unconditionally executed. For the scalar case, we may not always execute
   // the predicated block. Thus, scale the block's cost by the probability of
   // executing it.
-  if (VF == 1 && Legal->blockNeedsPredication(BB))
+  if (VF == 1 && blockNeedsPredication(BB))
     BlockCost.first /= getReciprocalPredBlockProb();
 
   Cost.first += BlockCost.first;
@@ -5973,6 +6011,10 @@
   if (!MaybeMaxVF.hasValue()) // Cases considered too costly to vectorize.
     return NoVectorization;
 
+  // Invalidate interleave groups if all blocks of loop will be predicated.
+  if (CM.blockNeedsPredication(OrigLoop->getHeader()))
+    CM.InterleaveInfo.reset();
+
   if (UserVF) {
     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
@@ -6029,6 +6071,7 @@
                          DT, ILV.Builder, ILV.VectorLoopValueMap, &ILV, CallbackILV};
   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
+  State.TripCount = ILV.getOrCreateTripCount(nullptr);
 
   //===------------------------------------------------===//
   //
@@ -6209,9 +6252,17 @@
   // load/store/gather/scatter. Initialize BlockMask to no-mask.
   VPValue *BlockMask = nullptr;
 
-  // Loop incoming mask is all-one.
-  if (OrigLoop->getHeader() == BB)
+  if (OrigLoop->getHeader() == BB) {
+    if (!CM.blockNeedsPredication(BB))
+      return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
+
+    // Introduce the early-exit compare IV <= BTC to form header block mask.
+    // This is used instead of IV < TC because TC may wrap, unlike BTC.
+    VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
+    VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
+    BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
     return BlockMaskCache[BB] = BlockMask;
+  }
 
   // This is the block mask. We OR all incoming edges.
   for (auto *Predecessor : predecessors(BB)) {
@@ -6577,6 +6628,11 @@
       NeedDef.insert(Branch->getCondition());
   }
 
+  // If the tail is to be folded by masking, the primary induction variable
+  // needs to be represented in VPlan for it to model early-exit masking.
+  if (CM.foldTailByMasking())
+    NeedDef.insert(Legal->getPrimaryInduction());
+
   // Collect instructions from the original loop that will become trivially dead
   // in the vectorized loop. We don't need to vectorize these instructions. For
   // example, original induction update instructions can become dead because we
Index: llvm/trunk/lib/Transforms/Vectorize/VPlan.h
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/VPlan.h
+++ llvm/trunk/lib/Transforms/Vectorize/VPlan.h
@@ -317,6 +317,9 @@
   /// Values they correspond to.
   VPValue2ValueTy VPValue2Value;
 
+  /// Hold the trip count of the scalar loop.
+  Value *TripCount = nullptr;
+
   /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
   InnerLoopVectorizer *ILV;
 
@@ -607,7 +610,7 @@
 public:
   /// VPlan opcodes, extending LLVM IR with idiomatic instructions.
-  enum { Not = Instruction::OtherOpsEnd + 1 };
+  enum { Not = Instruction::OtherOpsEnd + 1, ICmpULE };
 
 private:
   typedef unsigned char OpcodeTy;
@@ -1115,6 +1118,10 @@
   // (operators '==' and '<').
   SmallPtrSet<VPValue *, 16> VPExternalDefs;
 
+  /// Represents the backedge taken count of the original loop, for folding
+  /// the tail.
+  VPValue *BackedgeTakenCount = nullptr;
+
   /// Holds a mapping between Values and their corresponding VPValue inside
   /// VPlan.
   Value2VPValueTy Value2VPValue;
 
@@ -1132,7 +1139,10 @@
     if (Entry)
       VPBlockBase::deleteCFG(Entry);
     for (auto &MapEntry : Value2VPValue)
-      delete MapEntry.second;
+      if (MapEntry.second != BackedgeTakenCount)
+        delete MapEntry.second;
+    if (BackedgeTakenCount)
+      delete BackedgeTakenCount; // Delete once, if in Value2VPValue or not.
     for (VPValue *Def : VPExternalDefs)
       delete Def;
     for (VPValue *CBV : VPCBVs)
       delete CBV;
@@ -1147,6 +1157,13 @@
   VPBlockBase *setEntry(VPBlockBase *Block) { return Entry = Block; }
 
+  /// The backedge taken count of the original loop.
+  VPValue *getOrCreateBackedgeTakenCount() {
+    if (!BackedgeTakenCount)
+      BackedgeTakenCount = new VPValue();
+    return BackedgeTakenCount;
+  }
+
   void addVF(unsigned VF) { VFs.insert(VF); }
 
   bool hasVF(unsigned VF) { return VFs.count(VF); }
Index: llvm/trunk/lib/Transforms/Vectorize/VPlan.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/VPlan.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/VPlan.cpp
@@ -303,6 +303,13 @@
     State.set(this, V, Part);
     break;
   }
+  case VPInstruction::ICmpULE: {
+    Value *IV = State.get(getOperand(0), Part);
+    Value *TC = State.get(getOperand(1), Part);
+    Value *V = Builder.CreateICmpULE(IV, TC);
+    State.set(this, V, Part);
+    break;
+  }
   default:
     llvm_unreachable("Unsupported opcode for instruction");
   }
@@ -328,6 +335,9 @@
   case VPInstruction::Not:
     O << "not";
     break;
+  case VPInstruction::ICmpULE:
+    O << "icmp ule";
+    break;
   default:
     O << Instruction::getOpcodeName(getOpcode());
   }
@@ -342,6 +352,15 @@
 /// LoopVectorBody basic-block was created for this. Introduce additional
 /// basic-blocks as needed, and fill them all.
 void VPlan::execute(VPTransformState *State) {
+  // -1. Check if the backedge taken count is needed, and if so build it.
+  if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) {
+    Value *TC = State->TripCount;
+    IRBuilder<> Builder(State->CFG.PrevBB->getTerminator());
+    auto *TCMO = Builder.CreateSub(TC, ConstantInt::get(TC->getType(), 1),
+                                   "trip.count.minus.1");
+    Value2VPValue[TCMO] = BackedgeTakenCount;
+  }
+
   // 0. Set the reverse mapping from VPValues to Values for code generation.
   for (auto &Entry : Value2VPValue)
     State->VPValue2Value[Entry.second] = Entry.first;
@@ -469,8 +488,11 @@
   OS << "graph [labelloc=t, fontsize=30; label=\"Vectorization Plan";
   if (!Plan.getName().empty())
     OS << "\\n" << DOT::EscapeString(Plan.getName());
-  if (!Plan.Value2VPValue.empty()) {
+  if (!Plan.Value2VPValue.empty() || Plan.BackedgeTakenCount) {
     OS << ", where:";
+    if (Plan.BackedgeTakenCount)
+      OS << "\\n"
+         << *Plan.getOrCreateBackedgeTakenCount() << " := BackedgeTakenCount";
     for (auto Entry : Plan.Value2VPValue) {
       OS << "\\n" << *Entry.second;
       OS << DOT::EscapeString(" := ");
Index: llvm/trunk/test/Transforms/LoopVectorize/X86/optsize.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/optsize.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/optsize.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; This test verifies that the loop vectorizer will NOT vectorize loops that
 ; will produce a tail loop with the optimize for size or the minimize size
 ; attributes. This is a target-dependent version of the test.
@@ -9,7 +10,47 @@
 
 define i32 @foo_optsize() #0 {
 ; CHECK-LABEL: @foo_optsize(
-; CHECK-NOT: x i8>
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <64 x i32> undef, i32 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT]], <64 x i32> undef, <64 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <64 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ule <64 x i32> [[INDUCTION]], <i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202>
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* [[TMP4]], i32 1, <64 x i1> [[TMP2]], <64 x i8> undef)
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq <64 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <64 x i1> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
+; CHECK-NEXT:    call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> [[TMP7]], <64 x i8>* [[TMP8]], i32 1, <64 x i1> [[TMP2]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 64
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP10]], 0
+; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
+; CHECK-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !2
+; CHECK:       for.end:
+; CHECK-NEXT:    ret i32 0
+;
 entry:
   br label %for.body
 
@@ -33,7 +74,47 @@
 
 define i32 @foo_minsize() #1 {
 ; CHECK-LABEL: @foo_minsize(
-; CHECK-NOT: x i8>
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <64 x i32> undef, i32 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT]], <64 x i32> undef, <64 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <64 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ule <64 x i32> [[INDUCTION]], <i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202>
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* [[TMP4]], i32 1, <64 x i1> [[TMP2]], <64 x i8> undef)
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq <64 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <64 x i1> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
+; CHECK-NEXT:    call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> [[TMP7]], <64 x i8>* [[TMP8]], i32 1, <64 x i1> [[TMP2]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 64
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP10]], 0
+; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
+; CHECK-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !5
+; CHECK:       for.end:
+; CHECK-NEXT:    ret i32 0
+;
 entry:
   br label %for.body
 
Index: llvm/trunk/test/Transforms/LoopVectorize/X86/small-size.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/small-size.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/small-size.ll
@@ -68,11 +68,68 @@
   ret void
 }
 
-; Can't vectorize in 'optsize' mode because we need a tail.
-;CHECK-LABEL: @example2(
-;CHECK-NOT: store <4 x i32>
-;CHECK: ret void
+; Can vectorize in 'optsize' mode by masking the needed tail.
 define void @example2(i32 %n, i32 %x) optsize {
+; CHECK-LABEL: @example2(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[TMP1]], label [[DOTLR_PH5_PREHEADER:%.*]], label [[DOTPREHEADER:%.*]]
+; CHECK:       .lr.ph5.preheader:
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i32 [[TMP2]] to i64
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add nuw nsw i64 [[TMP3]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = and i32 [[TMP2]], 3
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = zext i32 [[TMP4]] to i64
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> undef, i64 [[TMP3]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP7:%.*]] = or i64 [[INDEX]], 3
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ule <4 x i64> [[INDUCTION]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK:       pred.store.if:
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[INDEX]]
+; CHECK-NEXT:    store i32 [[X:%.*]], i32* [[TMP10]], align 16
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; CHECK:       pred.store.continue:
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i1> [[TMP8]], i32 1
+; CHECK-NEXT:    br i1 [[TMP11]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
+; CHECK:       pred.store.if3:
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP5]]
+; CHECK-NEXT:    store i32 [[X]], i32* [[TMP12]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE4]]
+; CHECK:       pred.store.continue4:
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i1> [[TMP8]], i32 2
+; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
+; CHECK:       pred.store.if5:
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP6]]
+; CHECK-NEXT:    store i32 [[X]], i32* [[TMP14]], align 8
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE6]]
+; CHECK:       pred.store.continue6:
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i1> [[TMP8]], i32 3
+; CHECK-NEXT:    br i1 [[TMP15]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8]]
+; CHECK:       pred.store.if7:
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP7]]
+; CHECK-NEXT:    store i32 [[X]], i32* [[TMP16]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE8]]
+; CHECK:       pred.store.continue8:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[DOT_PREHEADER_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
+; CHECK:       ._crit_edge:
+; CHECK-NEXT:    ret void
+;
   %1 = icmp sgt i32 %n, 0
   br i1 %1, label %.lr.ph5, label %.preheader
 
@@ -113,7 +170,8 @@
   ret void
 }
 
-; N is unknown, we need a tail. Can't vectorize.
+; N is unknown, we need a tail. Can't vectorize because loop has no primary
+; induction.
 ;CHECK-LABEL: @example3(
 ;CHECK-NOT: <4 x i32>
 ;CHECK: ret void
@@ -181,12 +239,12 @@
 ; CHECK-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[TMP7:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[TMP6:%.*]]
-; CHECK:         br i1 undef, label [[TMP7]], label [[TMP6]], !llvm.loop !5
+; CHECK:         br i1 undef, label [[TMP7]], label [[TMP6]], !llvm.loop !7
 ; CHECK:       ret void
 ;
   br label %1
@@ -209,11 +267,102 @@
   ret void
 }
 
-; We CAN'T vectorize this example because it would entail a tail.
+; We CAN vectorize this example by folding the tail it entails.
 define void @example23c(i16* noalias nocapture %src, i32* noalias nocapture %dst) optsize {
 ; CHECK-LABEL: @example23c(
-; CHECK-NOT: <4 x
-; CHECK: ret void
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE22:%.*]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <4 x i64> [[INDUCTION]], <i64 257, i64 257, i64 257, i64 257>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; CHECK:       pred.load.if:
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[SRC:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i16, i16* [[NEXT_GEP]], align 2
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; CHECK:       pred.load.continue:
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i16 [ undef, [[VECTOR_BODY]] ], [ [[TMP3]], [[PRED_LOAD_IF]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
+; CHECK-NEXT:    br i1 [[TMP5]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]]
+; CHECK:       pred.load.if11:
+; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT:    [[NEXT_GEP4:%.*]] = getelementptr i16, i16* [[SRC]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i16, i16* [[NEXT_GEP4]], align 2
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE12]]
+; CHECK:       pred.load.continue12:
+; CHECK-NEXT:    [[TMP8:%.*]] = phi i16 [ undef, [[PRED_LOAD_CONTINUE]] ], [ [[TMP7]], [[PRED_LOAD_IF11]] ]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
+; CHECK-NEXT:    br i1 [[TMP9]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]]
+; CHECK:       pred.load.if13:
+; CHECK-NEXT:    [[TMP10:%.*]] = or i64 [[INDEX]], 2
+; CHECK-NEXT:    [[NEXT_GEP5:%.*]] = getelementptr i16, i16* [[SRC]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load i16, i16* [[NEXT_GEP5]], align 2
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE14]]
+; CHECK:       pred.load.continue14:
+; CHECK-NEXT:    [[TMP12:%.*]] = phi i16 [ undef, [[PRED_LOAD_CONTINUE12]] ], [ [[TMP11]], [[PRED_LOAD_IF13]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
+; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]]
+; CHECK:       pred.load.if15:
+; CHECK-NEXT:    [[TMP14:%.*]] = or i64 [[INDEX]], 3
+; CHECK-NEXT:    [[NEXT_GEP6:%.*]] = getelementptr i16, i16* [[SRC]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i16, i16* [[NEXT_GEP6]], align 2
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE16]]
+; CHECK:       pred.load.continue16:
+; CHECK-NEXT:    [[TMP16:%.*]] = phi i16 [ undef, [[PRED_LOAD_CONTINUE14]] ], [ [[TMP15]], [[PRED_LOAD_IF15]] ]
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
+; CHECK-NEXT:    br i1 [[TMP17]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK:       pred.store.if:
+; CHECK-NEXT:    [[TMP18:%.*]] = zext i16 [[TMP4]] to i32
+; CHECK-NEXT:    [[TMP19:%.*]] = shl nuw nsw i32 [[TMP18]], 7
+; CHECK-NEXT:    [[NEXT_GEP7:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    store i32 [[TMP19]], i32* [[NEXT_GEP7]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; CHECK:       pred.store.continue:
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
+; CHECK-NEXT:    br i1 [[TMP20]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]]
+; CHECK:       pred.store.if17:
+; CHECK-NEXT:    [[TMP21:%.*]] = zext i16 [[TMP8]] to i32
+; CHECK-NEXT:    [[TMP22:%.*]] = shl nuw nsw i32 [[TMP21]], 7
+; CHECK-NEXT:    [[TMP23:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT:    [[NEXT_GEP8:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP23]]
+; CHECK-NEXT:    store i32 [[TMP22]], i32* [[NEXT_GEP8]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE18]]
+; CHECK:       pred.store.continue18:
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
+; CHECK-NEXT:    br i1 [[TMP24]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
+; CHECK:       pred.store.if19:
+; CHECK-NEXT:    [[TMP25:%.*]] = zext i16 [[TMP12]] to i32
+; CHECK-NEXT:    [[TMP26:%.*]] = shl nuw nsw i32 [[TMP25]], 7
+; CHECK-NEXT:    [[TMP27:%.*]] = or i64 [[INDEX]], 2
+; CHECK-NEXT:    [[NEXT_GEP9:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP27]]
+; CHECK-NEXT:    store i32 [[TMP26]], i32* [[NEXT_GEP9]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE20]]
+; CHECK:       pred.store.continue20:
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
+; CHECK-NEXT:    br i1 [[TMP28]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22]]
+; CHECK:       pred.store.if21:
+; CHECK-NEXT:    [[TMP29:%.*]] = zext i16 [[TMP16]] to i32
+; CHECK-NEXT:    [[TMP30:%.*]] = shl nuw nsw i32 [[TMP29]], 7
+; CHECK-NEXT:    [[TMP31:%.*]] = or i64 [[INDEX]], 3
+; CHECK-NEXT:    [[NEXT_GEP10:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP31]]
+; CHECK-NEXT:    store i32 [[TMP30]], i32* [[NEXT_GEP10]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE22]]
+; CHECK:       pred.store.continue22:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
+; CHECK-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[TMP34:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    br label [[TMP33:%.*]]
+; CHECK:         br i1 undef, label [[TMP34]], label [[TMP33]], !llvm.loop !9
+; CHECK:       ret void
+;
   br label %1
 
;
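Editorial note on the arithmetic above: the following is a minimal standalone C++ model (an illustrative sketch only, not LLVM code; roundUpToStep and all constants are invented for the example) of how the round-up in getOrCreateVectorTripCount() and the VPInstruction::ICmpULE header mask cooperate so that each scalar iteration runs exactly once. The numbers mirror the @foo_optsize test: trip count 203, VF * UF = 64.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Model of the "n.rnd.up" computation: round TC up to a multiple of Step,
// where Step = VF * UF is a power of two. The wrapping add is intentional;
// see the comment in getOrCreateVectorTripCount() above.
static uint32_t roundUpToStep(uint32_t TC, uint32_t Step) {
  assert((Step & (Step - 1)) == 0 && "Step must be a power of 2");
  uint32_t RoundedUp = TC + (Step - 1); // may wrap, by design
  return RoundedUp - RoundedUp % Step;  // n.vec = n.rnd.up - n.mod.vf
}

int main() {
  const uint32_t TC = 203;     // scalar trip count, as in @foo_optsize
  const uint32_t Step = 64;    // VF * UF
  const uint32_t BTC = TC - 1; // backedge-taken count: 202
  const uint32_t VectorTC = roundUpToStep(TC, Step); // 256

  unsigned Executed = 0;
  for (uint32_t IV = 0; IV != VectorTC; IV += Step) // vector loop
    for (uint32_t Lane = 0; Lane < Step; ++Lane)
      // Header mask: lane active iff IV + Lane <= BTC. ULE against BTC is
      // used rather than ULT against TC because TC may wrap, unlike BTC.
      if (IV + Lane <= BTC)
        ++Executed; // one masked scalar iteration
  printf("executed %u of %u iterations\n", Executed, TC); // 203 of 203
  return Executed == TC ? 0 : 1;
}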
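The comment in getOrCreateVectorTripCount() also claims that an overflowing n.rnd.up is benign. A scaled-down model (again a hedged sketch with invented names, not LLVM code) makes this easy to check: with 8-bit arithmetic, TC = 254 and Step = 8, n.rnd.up wraps to 5 and the rounded vector trip count becomes 0, yet the do-while-shaped vector loop keeps running until its induction variable wraps back to zero, and the IV <= BTC mask keeps exactly TC lanes active.

#include <cstdint>
#include <cstdio>

int main() {
  const uint8_t Step = 8;     // VF * UF, a power of two
  const uint8_t TC = 254;     // trip count close to the 8-bit maximum
  const uint8_t BTC = TC - 1; // backedge-taken count: 253

  // n.rnd.up = TC + (Step - 1) wraps: 254 + 7 == 5 (mod 256); rounding down
  // to a multiple of Step then yields a vector trip count of 0.
  uint8_t VectorTC = uint8_t(TC + (Step - 1));
  VectorTC = uint8_t(VectorTC - VectorTC % Step); // 0 here

  unsigned Executed = 0;
  uint8_t IV = 0;
  do { // the vector loop compares after the body, like the skeleton's latch
    for (unsigned Lane = 0; Lane < Step; ++Lane)
      if (uint8_t(IV + Lane) <= BTC) // early-exit mask IV <= BTC
        ++Executed;
    IV = uint8_t(IV + Step); // wraps back to 0 after 256 / 8 latch trips
  } while (IV != VectorTC);
  printf("executed %u of %u iterations\n", Executed, unsigned(TC)); // 254
  return Executed == TC ? 0 : 1;
}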