diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -240,6 +240,10 @@ Optional BestVF = None; unsigned BestUF = 0; + /// Candidates for sinking scalar operands after VPlan code-generation, which + /// cannot be handled by the VPlan-based sinkScalarOperands yet. + SetVector SinkCandidates; + public: LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -523,7 +523,7 @@ /// inclusive. Uses the VPValue operands from \p Operands instead of \p /// Instr's operands. void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands, - const VPIteration &Instance, bool IfPredicateInstr, + const VPIteration &Instance, VPTransformState &State); /// Widen an integer or floating-point induction variable \p IV. If \p Trunc @@ -612,9 +612,9 @@ /// block. void fixLCSSAPHIs(VPTransformState &State); - /// Iteratively sink the scalarized operands of a predicated instruction into - /// the block that was created for it. - void sinkScalarOperands(Instruction *PredInst); + /// Iteratively sink the scalarized operands of a sink candidate instruction + /// into the block that was created for it. + void sinkScalarOperands(Instruction *SinkCandidate); void sinkScalarOperands(VPlan &Plan); /// Shrinks vector element sizes to the smallest bitwidth they can be legally @@ -839,9 +839,6 @@ /// The induction variable of the old basic block. PHINode *OldInduction = nullptr; - /// Store instructions that were predicated. - SmallVector PredicatedInstructions; - /// Trip count of the original loop. Value *TripCount = nullptr; @@ -2976,7 +2973,6 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &User, const VPIteration &Instance, - bool IfPredicateInstr, VPTransformState &State) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); @@ -3018,10 +3014,6 @@ // If we just cloned a new assumption, add it the assumption cache. if (auto *II = dyn_cast(Cloned)) AC->registerAssumption(II); - - // End if-block. - if (IfPredicateInstr) - PredicatedInstructions.push_back(Cloned); } PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, @@ -4030,7 +4022,17 @@ IVEndValues[Entry.first], LoopMiddleBlock); fixLCSSAPHIs(State); - for (Instruction *PI : PredicatedInstructions) + SetVector SinkCands; + for (VPValue *VPCand : State.SinkCandidates) { + for (unsigned Part = 0; Part < State.UF; ++Part) + for (unsigned Lane = 0; Lane < State.VF.getKnownMinValue(); ++Lane) + if (State.hasScalarValue(VPCand, {Part, Lane})) { + if (Instruction *I = + dyn_cast(State.get(VPCand, {Part, Lane}))) + SinkCands.insert(I); + } + } + for (Instruction *PI : SinkCands) sinkScalarOperands(&*PI); // Remove redundant induction instructions. @@ -4502,13 +4504,24 @@ } } -void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { - // The basic block and loop containing the predicated instruction. - auto *PredBB = PredInst->getParent(); +void InnerLoopVectorizer::sinkScalarOperands(Instruction *SinkCandidate) { + BasicBlock *PredBB = nullptr; + // Find the predicated block containing the uses of SinkCandidate. If there + // are users in different blocks, bail out. + for (Use &U : SinkCandidate->uses()) { + auto *I = cast(U.getUser()); + assert(!isa(I)); + BasicBlock *BB = I->getParent(); + if (!PredBB) + PredBB = BB; + if (PredBB != BB) + return; + } auto *VectorLoop = LI->getLoopFor(PredBB); - // Initialize a worklist with the operands of the predicated instruction. - SetVector Worklist(PredInst->op_begin(), PredInst->op_end()); + // Initialize a worklist. + SetVector Worklist; + Worklist.insert(SinkCandidate); // Holds instructions that we need to analyze again. An instruction may be // reanalyzed if we don't yet know if we can sink it or not. @@ -7891,6 +7904,7 @@ State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); State.TripCount = ILV.getOrCreateTripCount(nullptr); State.CanonicalIV = ILV.Induction; + State.SinkCandidates = SinkCandidates; ILV.printDebugTracesAtStart(); @@ -9026,7 +9040,7 @@ } } - VPlanTransforms::sinkScalarOperands(*Plan); + VPlanTransforms::sinkScalarOperands(*Plan, SinkCandidates); std::string PlanName; raw_string_ostream RSO(PlanName); @@ -9281,7 +9295,7 @@ if (State.Instance) { // Generate a single instance. assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, - *State.Instance, IsPredicated, State); + *State.Instance, State); // Insert scalar instance packing it into a vector. if (AlsoPack && State.VF.isVector()) { // If we're constructing lane 0, initialize to start from poison. @@ -9305,8 +9319,7 @@ for (unsigned Part = 0; Part < State.UF; ++Part) for (unsigned Lane = 0; Lane < EndLane; ++Lane) State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, - VPIteration(Part, Lane), IsPredicated, - State); + VPIteration(Part, Lane), State); } void VPBranchOnMaskRecipe::execute(VPTransformState &State) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -338,6 +338,10 @@ /// Pointer to the VPlan code is generated for. VPlan *Plan; + + /// Candidates for sinking scalar operands after VPlan code-generation, which + /// cannot be handled by the VPlan-based sinkScalarOperands yet. + SetVector SinkCandidates; }; /// VPBlockBase is the building block of the Hierarchical Control-Flow Graph. diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -14,6 +14,7 @@ #define LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H #include "VPlan.h" +#include "llvm/ADT/SetVector.h" #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" namespace llvm { @@ -29,7 +30,8 @@ LoopVectorizationLegality::InductionList &Inductions, SmallPtrSetImpl &DeadInstructions, ScalarEvolution &SE); - static bool sinkScalarOperands(VPlan &Plan); + static bool sinkScalarOperands(VPlan &Plan, + SetVector &SinkCandidates); }; } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -100,7 +100,8 @@ } } -bool VPlanTransforms::sinkScalarOperands(VPlan &Plan) { +bool VPlanTransforms::sinkScalarOperands(VPlan &Plan, + SetVector &SinkCandidates) { ReversePostOrderTraversal> RPOT(VPBlockRecursiveTraversalWrapper(Plan.getEntry())); @@ -118,19 +119,26 @@ while (!WorkList.empty()) { auto *C = WorkList.pop_back_val(); - auto *Current = dyn_cast_or_null(C->Def); - if (!Current || Current->getParent() == RepR->getParent() || + auto *Current = dyn_cast_or_null(C->Def); + if (!Current || Current->getNumDefinedValues() == 0 || + Current->getParent() == RepR->getParent() || Current->mayHaveSideEffects() || Current->mayReadOrWriteMemory()) continue; - if (any_of(Current->users(), [VPBB](VPUser *U) { + // If we encounter a recipe we cannot sink directly, queue it for + // post-processing after code-generation, where instructions for a + // sub-set of lanes can be sunk. + if (!isa(Current) || + any_of(Current->getVPValue(0)->users(), [VPBB](VPUser *U) { auto *UI = dyn_cast(U); return UI && UI->getParent() != VPBB; - })) + })) { + for (VPValue *D : Current->definedValues()) + SinkCandidates.insert(D); continue; + } Current->moveBefore(*RepR->getParent(), RepR->getParent()->begin()); - Current->setIsPredicated(); WorkList.insert(Current->op_begin(), Current->op_end()); Changed = true; } diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll --- a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll @@ -64,9 +64,10 @@ ; instead scalarized if Cost-Model so decided as part of its ; sink-scalar-operands optimization for predicated instructions. ; +; SINK-GATHER-LABEL: @scalarize_and_sink_gather ; SINK-GATHER: vector.body: -; SINK-GATHER: pred.udiv.if: ; SINK-GATHER: %[[T0:.+]] = load i32, i32* %{{.*}}, align 4 +; SINK-GATHER: pred.udiv.if: ; SINK-GATHER: %{{.*}} = udiv i32 %[[T0]], %{{.*}} ; SINK-GATHER: pred.udiv.continue: define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) { diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll --- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll +++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll @@ -34,7 +34,7 @@ ; UNROLL: pred.store.continue3: ; UNROLL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 ; UNROLL-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 -; UNROLL-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] +; UNROLL-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; UNROLL: middle.block: ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i64 128, 128 ; UNROLL-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]] @@ -51,7 +51,7 @@ ; UNROLL: for.inc: ; UNROLL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; UNROLL-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 128 -; UNROLL-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP2:!llvm.loop !.*]] +; UNROLL-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; UNROLL: for.end: ; UNROLL-NEXT: ret i32 0 ; @@ -84,7 +84,7 @@ ; UNROLL-NOSIMPLIFY: pred.store.continue3: ; UNROLL-NOSIMPLIFY-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 ; UNROLL-NOSIMPLIFY-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 -; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] +; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; UNROLL-NOSIMPLIFY: middle.block: ; UNROLL-NOSIMPLIFY-NEXT: [[CMP_N:%.*]] = icmp eq i64 128, 128 ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -104,7 +104,7 @@ ; UNROLL-NOSIMPLIFY: for.inc: ; UNROLL-NOSIMPLIFY-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; UNROLL-NOSIMPLIFY-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 128 -; UNROLL-NOSIMPLIFY-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP2:!llvm.loop !.*]] +; UNROLL-NOSIMPLIFY-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; UNROLL-NOSIMPLIFY: for.end: ; UNROLL-NOSIMPLIFY-NEXT: ret i32 0 ; @@ -139,7 +139,7 @@ ; VEC: pred.store.continue2: ; VEC-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 ; VEC-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 -; VEC-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] +; VEC-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VEC: middle.block: ; VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 128, 128 ; VEC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]] @@ -156,7 +156,7 @@ ; VEC: for.inc: ; VEC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; VEC-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 128 -; VEC-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP2:!llvm.loop !.*]] +; VEC-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; VEC: for.end: ; VEC-NEXT: ret i32 0 ; @@ -229,7 +229,7 @@ ; UNROLL-NEXT: [[PREDPHI5]] = select i1 undef, i32 [[VEC_PHI2]], i32 [[TMP9]] ; UNROLL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 ; UNROLL-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP3:!llvm.loop !.*]] +; UNROLL-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; UNROLL: middle.block: ; UNROLL-NEXT: [[BIN_RDX:%.*]] = add i32 [[PREDPHI5]], [[PREDPHI]] ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] @@ -302,7 +302,7 @@ ; UNROLL-NOSIMPLIFY-NEXT: [[PREDPHI5]] = select i1 undef, i32 [[VEC_PHI2]], i32 [[TMP8]] ; UNROLL-NOSIMPLIFY-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 ; UNROLL-NOSIMPLIFY-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP3:!llvm.loop !.*]] +; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; UNROLL-NOSIMPLIFY: middle.block: ; UNROLL-NOSIMPLIFY-NEXT: [[BIN_RDX:%.*]] = add i32 [[PREDPHI5]], [[PREDPHI]] ; UNROLL-NOSIMPLIFY-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] @@ -326,7 +326,7 @@ ; UNROLL-NOSIMPLIFY-NEXT: [[INDVARS_IV_NEXT4]] = add nsw i64 [[INDVARS_IV3]], 1 ; UNROLL-NOSIMPLIFY-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV3]] to i32 ; UNROLL-NOSIMPLIFY-NEXT: [[CMP13:%.*]] = icmp slt i32 [[TMP1]], 0 -; UNROLL-NOSIMPLIFY-NEXT: br i1 [[CMP13]], label [[FOR_BODY14]], label [[FOR_INC26_LOOPEXIT]], [[LOOP4:!llvm.loop !.*]] +; UNROLL-NOSIMPLIFY-NEXT: br i1 [[CMP13]], label [[FOR_BODY14]], label [[FOR_INC26_LOOPEXIT]], !llvm.loop [[LOOP4:![0-9]+]] ; UNROLL-NOSIMPLIFY: for.inc26.loopexit: ; UNROLL-NOSIMPLIFY-NEXT: [[INEWCHUNKS_2_LCSSA:%.*]] = phi i32 [ [[INEWCHUNKS_2]], [[FOR_INC23]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ] ; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_INC26]] @@ -367,7 +367,7 @@ ; VEC-NEXT: [[PREDPHI]] = select <2 x i1> undef, <2 x i32> [[VEC_PHI]], <2 x i32> [[TMP8]] ; VEC-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 ; VEC-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VEC-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]] +; VEC-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; VEC: middle.block: ; VEC-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PREDPHI]]) ; VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] @@ -436,28 +436,28 @@ ; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL: vector.body: ; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ] +; UNROLL-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0 +; UNROLL-NEXT: [[INDUCTION2:%.*]] = add i64 [[INDEX]], 1 ; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 undef, [[INDEX]] ; UNROLL-NEXT: [[INDUCTION3:%.*]] = add i64 [[OFFSET_IDX]], 0 ; UNROLL-NEXT: [[INDUCTION4:%.*]] = add i64 [[OFFSET_IDX]], -1 +; UNROLL-NEXT: [[TMP0:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION]] +; UNROLL-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION2]] +; UNROLL-NEXT: [[TMP2:%.*]] = load i8, i8* [[TMP0]], align 1 +; UNROLL-NEXT: [[TMP3:%.*]] = load i8, i8* [[TMP1]], align 1 ; UNROLL-NEXT: br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE6]] ; UNROLL: pred.store.if: -; UNROLL-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0 -; UNROLL-NEXT: [[TMP0:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION]] -; UNROLL-NEXT: [[TMP1:%.*]] = load i8, i8* [[TMP0]], align 1 -; UNROLL-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i32 -; UNROLL-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8 -; UNROLL-NEXT: store i8 [[TMP3]], i8* [[TMP0]], align 1 -; UNROLL-NEXT: [[INDUCTION2:%.*]] = add i64 [[INDEX]], 1 -; UNROLL-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION2]] -; UNROLL-NEXT: [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1 -; UNROLL-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32 +; UNROLL-NEXT: [[TMP4:%.*]] = zext i8 [[TMP2]] to i32 +; UNROLL-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 +; UNROLL-NEXT: store i8 [[TMP5]], i8* [[TMP0]], align 1 +; UNROLL-NEXT: [[TMP6:%.*]] = zext i8 [[TMP3]] to i32 ; UNROLL-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i8 -; UNROLL-NEXT: store i8 [[TMP7]], i8* [[TMP4]], align 1 +; UNROLL-NEXT: store i8 [[TMP7]], i8* [[TMP1]], align 1 ; UNROLL-NEXT: br label [[PRED_STORE_CONTINUE6]] ; UNROLL: pred.store.continue6: ; UNROLL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 ; UNROLL-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef -; UNROLL-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]] +; UNROLL-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; UNROLL: middle.block: ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i64 undef, undef ; UNROLL-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]] @@ -476,7 +476,7 @@ ; UNROLL-NEXT: [[TMP6]] = add nuw nsw i64 [[TMP0]], 1 ; UNROLL-NEXT: [[TMP7]] = add i64 [[TMP1]], -1 ; UNROLL-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 0 -; UNROLL-NEXT: br i1 [[TMP8]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP5:!llvm.loop !.*]] +; UNROLL-NEXT: br i1 [[TMP8]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; UNROLL: for.end: ; UNROLL-NEXT: ret void ; @@ -487,32 +487,32 @@ ; UNROLL-NOSIMPLIFY-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NOSIMPLIFY: vector.body: ; UNROLL-NOSIMPLIFY-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ] +; UNROLL-NOSIMPLIFY-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0 +; UNROLL-NOSIMPLIFY-NEXT: [[INDUCTION2:%.*]] = add i64 [[INDEX]], 1 ; UNROLL-NOSIMPLIFY-NEXT: [[OFFSET_IDX:%.*]] = sub i64 undef, [[INDEX]] ; UNROLL-NOSIMPLIFY-NEXT: [[INDUCTION3:%.*]] = add i64 [[OFFSET_IDX]], 0 ; UNROLL-NOSIMPLIFY-NEXT: [[INDUCTION4:%.*]] = add i64 [[OFFSET_IDX]], -1 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP0:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION2]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = load i8, i8* [[TMP0]], align 1 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = load i8, i8* [[TMP1]], align 1 ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; UNROLL-NOSIMPLIFY: pred.store.if: -; UNROLL-NOSIMPLIFY-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP0:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION]] -; UNROLL-NOSIMPLIFY-NEXT: [[TMP1:%.*]] = load i8, i8* [[TMP0]], align 1 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i32 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8 -; UNROLL-NOSIMPLIFY-NEXT: store i8 [[TMP3]], i8* [[TMP0]], align 1 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = zext i8 [[TMP2]] to i32 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 +; UNROLL-NOSIMPLIFY-NEXT: store i8 [[TMP5]], i8* [[TMP0]], align 1 ; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE]] ; UNROLL-NOSIMPLIFY: pred.store.continue: ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[C]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]] ; UNROLL-NOSIMPLIFY: pred.store.if5: -; UNROLL-NOSIMPLIFY-NEXT: [[INDUCTION2:%.*]] = add i64 [[INDEX]], 1 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION2]] -; UNROLL-NOSIMPLIFY-NEXT: [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP6:%.*]] = zext i8 [[TMP3]] to i32 ; UNROLL-NOSIMPLIFY-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i8 -; UNROLL-NOSIMPLIFY-NEXT: store i8 [[TMP7]], i8* [[TMP4]], align 1 +; UNROLL-NOSIMPLIFY-NEXT: store i8 [[TMP7]], i8* [[TMP1]], align 1 ; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE6]] ; UNROLL-NOSIMPLIFY: pred.store.continue6: ; UNROLL-NOSIMPLIFY-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 ; UNROLL-NOSIMPLIFY-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef -; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]] +; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; UNROLL-NOSIMPLIFY: middle.block: ; UNROLL-NOSIMPLIFY-NEXT: [[CMP_N:%.*]] = icmp eq i64 undef, undef ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -535,7 +535,7 @@ ; UNROLL-NOSIMPLIFY-NEXT: [[TMP6]] = add nuw nsw i64 [[TMP0]], 1 ; UNROLL-NOSIMPLIFY-NEXT: [[TMP7]] = add i64 [[TMP1]], -1 ; UNROLL-NOSIMPLIFY-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 0 -; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP8]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP6:!llvm.loop !.*]] +; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP8]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; UNROLL-NOSIMPLIFY: for.end: ; UNROLL-NOSIMPLIFY-NEXT: ret void ; @@ -575,7 +575,7 @@ ; VEC: pred.store.continue3: ; VEC-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 ; VEC-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef -; VEC-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]] +; VEC-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; VEC: middle.block: ; VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 undef, undef ; VEC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]] @@ -594,7 +594,7 @@ ; VEC-NEXT: [[TMP6]] = add nuw nsw i64 [[TMP0]], 1 ; VEC-NEXT: [[TMP7]] = add i64 [[TMP1]], -1 ; VEC-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 0 -; VEC-NEXT: br i1 [[TMP8]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP6:!llvm.loop !.*]] +; VEC-NEXT: br i1 [[TMP8]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; VEC: for.end: ; VEC-NEXT: ret void ; @@ -637,16 +637,16 @@ ; UNROLL-NEXT: [[INDUCTION4:%.*]] = add i64 [[OFFSET_IDX]], -1 ; UNROLL-NEXT: [[TMP0:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION]] ; UNROLL-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION2]] +; UNROLL-NEXT: [[TMP2:%.*]] = load i8, i8* [[TMP0]], align 1 +; UNROLL-NEXT: [[TMP3:%.*]] = load i8, i8* [[TMP1]], align 1 ; UNROLL-NEXT: store i8 0, i8* [[TMP0]], align 1 ; UNROLL-NEXT: store i8 0, i8* [[TMP1]], align 1 ; UNROLL-NEXT: br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE6]] ; UNROLL: pred.store.if: -; UNROLL-NEXT: [[TMP2:%.*]] = load i8, i8* [[TMP0]], align 1 -; UNROLL-NEXT: [[TMP3:%.*]] = zext i8 [[TMP2]] to i32 -; UNROLL-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 -; UNROLL-NEXT: store i8 [[TMP4]], i8* [[TMP0]], align 1 -; UNROLL-NEXT: [[TMP5:%.*]] = load i8, i8* [[TMP1]], align 1 -; UNROLL-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32 +; UNROLL-NEXT: [[TMP4:%.*]] = zext i8 [[TMP2]] to i32 +; UNROLL-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 +; UNROLL-NEXT: store i8 [[TMP5]], i8* [[TMP0]], align 1 +; UNROLL-NEXT: [[TMP6:%.*]] = zext i8 [[TMP3]] to i32 ; UNROLL-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i8 ; UNROLL-NEXT: store i8 [[TMP7]], i8* [[TMP1]], align 1 ; UNROLL-NEXT: br label [[PRED_STORE_CONTINUE6]] @@ -691,20 +691,20 @@ ; UNROLL-NOSIMPLIFY-NEXT: [[INDUCTION4:%.*]] = add i64 [[OFFSET_IDX]], -1 ; UNROLL-NOSIMPLIFY-NEXT: [[TMP0:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION]] ; UNROLL-NOSIMPLIFY-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION2]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = load i8, i8* [[TMP0]], align 1 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = load i8, i8* [[TMP1]], align 1 ; UNROLL-NOSIMPLIFY-NEXT: store i8 0, i8* [[TMP0]], align 1 ; UNROLL-NOSIMPLIFY-NEXT: store i8 0, i8* [[TMP1]], align 1 ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; UNROLL-NOSIMPLIFY: pred.store.if: -; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = load i8, i8* [[TMP0]], align 1 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = zext i8 [[TMP2]] to i32 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 -; UNROLL-NOSIMPLIFY-NEXT: store i8 [[TMP4]], i8* [[TMP0]], align 1 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = zext i8 [[TMP2]] to i32 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 +; UNROLL-NOSIMPLIFY-NEXT: store i8 [[TMP5]], i8* [[TMP0]], align 1 ; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE]] ; UNROLL-NOSIMPLIFY: pred.store.continue: ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[C]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]] ; UNROLL-NOSIMPLIFY: pred.store.if5: -; UNROLL-NOSIMPLIFY-NEXT: [[TMP5:%.*]] = load i8, i8* [[TMP1]], align 1 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP6:%.*]] = zext i8 [[TMP3]] to i32 ; UNROLL-NOSIMPLIFY-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i8 ; UNROLL-NOSIMPLIFY-NEXT: store i8 [[TMP7]], i8* [[TMP1]], align 1 ; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE6]]