diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9298,6 +9298,7 @@ } VPlanTransforms::sinkScalarOperands(*Plan); + VPlanTransforms::mergeReplicateRegions(*Plan); std::string PlanName; raw_string_ostream RSO(PlanName); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -30,6 +30,8 @@ SmallPtrSetImpl &DeadInstructions, ScalarEvolution &SE); static bool sinkScalarOperands(VPlan &Plan); + + static bool mergeReplicateRegions(VPlan &Plan); }; } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -148,3 +148,138 @@ } return Changed; } + +/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return +/// the mask. +VPValue *getPredicatedMask(VPRegionBlock *R) { + auto *EntryBB = dyn_cast(R->getEntry()); + if (!EntryBB || EntryBB->size() != 1 || + !isa(EntryBB->begin())) + return nullptr; + + return cast(&*EntryBB->begin())->getOperand(0); +} + +/// If \p R is a triangle region, return the 'then' block of the triangle. +static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) { + auto *EntryBB = cast(R->getEntry()); + if (EntryBB->getNumSuccessors() != 2) + return nullptr; + + auto *Succ0 = dyn_cast(EntryBB->getSuccessors()[0]); + auto *Succ1 = dyn_cast(EntryBB->getSuccessors()[1]); + if (!Succ0 || !Succ1) + return nullptr; + + if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1) + return nullptr; + if (Succ0->getSingleSuccessor() == Succ1) + return Succ0; + if (Succ1->getSingleSuccessor() == Succ0) + return Succ1; + return nullptr; +} + +bool VPlanTransforms::mergeReplicateRegions(VPlan &Plan) { + SetVector DeletedRegions; + bool Changed = false; + + // Collect region blocks to process up-front, to avoid iterator invalidation + // issues while merging regions. + SmallVector CandidateRegions( + VPBlockUtils::blocksOnly(depth_first( + VPBlockRecursiveTraversalWrapper(Plan.getEntry())))); + + // Check if Base is a predicated triangle, followed by an empty block, + // followed by another predicate triangle. If that's the case, move the + // recipes from the first to the second triangle. + for (VPRegionBlock *Region1 : CandidateRegions) { + if (DeletedRegions.contains(Region1)) + continue; + auto *MiddleBasicBlock = + dyn_cast_or_null(Region1->getSingleSuccessor()); + if (!MiddleBasicBlock || !MiddleBasicBlock->empty()) + continue; + + auto *Region2 = + dyn_cast_or_null(MiddleBasicBlock->getSingleSuccessor()); + if (!Region2) + continue; + + VPValue *Mask1 = getPredicatedMask(Region1); + VPValue *Mask2 = getPredicatedMask(Region2); + if (!Mask1 || Mask1 != Mask2) + continue; + VPBasicBlock *Then1 = getPredicatedThenBlock(Region1); + VPBasicBlock *Then2 = getPredicatedThenBlock(Region2); + if (!Then1 || !Then2) + continue; + + assert(Mask1 && Mask2 && "both region must have conditions"); + + // Note: No fusion-preventing memory dependencies are expected in either + // region. Such dependencies should be rejected during earlier dependence + // checks, which guarantee accesses can be re-ordered for vectorization. + // + // If a recipe is used by a first-order recurrence phi, we cannot move it at + // the moment: a recipe R feeding a first order recurrence phi must allow + // for a *vector* shuffle to be inserted immediately after it, and therefore + // if R is *scalarized and predicated* it must appear last in its basic + // block. In addition, other recipes may need to "sink after" R, so best if + // R not be moved at all. + auto IsImmovableRecipe = [](VPRecipeBase &R) { + assert(R.getNumDefinedValues() <= 1 && + "no multi-defs are expected in predicated blocks"); + for (VPUser *U : R.getVPValue()->users()) { + auto *UI = dyn_cast(U); + if (!UI) + continue; + auto *PhiR = dyn_cast(UI); + if (PhiR && !PhiR->getRecurrenceDescriptor()) + return true; + } + return false; + }; + if (any_of(*Then1, IsImmovableRecipe)) + continue; + + // Move recipes to the successor region. + for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1))) + ToMove.moveBefore(*Then2, Then2->getFirstNonPhi()); + + auto *Merge1 = cast(Then1->getSingleSuccessor()); + auto *Merge2 = cast(Then2->getSingleSuccessor()); + + // Move VPPredInstPHIRecipes from the merge block to the successor region's + // merge block. Update all users inside the successor region to use the + // original values. + for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) { + VPValue *PredInst1 = + cast(&Phi1ToMove)->getOperand(0); + for (VPUser *U : Phi1ToMove.getVPValue()->users()) { + auto *UI = dyn_cast(U); + if (!UI || UI->getParent() != Then2) + continue; + for (unsigned I = 0, E = U->getNumOperands(); I != E; ++I) { + if (Phi1ToMove.getVPValue() != U->getOperand(I)) + continue; + U->setOperand(I, PredInst1); + } + } + + Phi1ToMove.moveBefore(*Merge2, Merge2->begin()); + } + + // Finally, remove the first region. + for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) { + VPBlockUtils::disconnectBlocks(Pred, Region1); + VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock); + } + VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock); + DeletedRegions.insert(Region1); + } + + for (VPRegionBlock *ToDelete : DeletedRegions) + delete ToDelete; + return Changed; +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll --- a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll @@ -89,38 +89,28 @@ ; FORCE: vector.body: ; FORCE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE4:%.*]] ] ; FORCE-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE4]] ] -; FORCE-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; FORCE-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1 ; FORCE-NEXT: [[TMP2:%.*]] = icmp ule <2 x i32> [[VEC_IND]], ; FORCE-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 -; FORCE-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; FORCE: pred.store.if: -; FORCE-NEXT: store i32 [[TMP0]], i32* @b, align 1 -; FORCE-NEXT: br label [[PRED_STORE_CONTINUE]] -; FORCE: pred.store.continue: -; FORCE-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 -; FORCE-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]] -; FORCE: pred.store.if1: -; FORCE-NEXT: store i32 [[TMP1]], i32* @b, align 1 -; FORCE-NEXT: br label [[PRED_STORE_CONTINUE2]] -; FORCE: pred.store.continue2: -; FORCE-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 -; FORCE-NEXT: br i1 [[TMP5]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; FORCE-NEXT: br i1 [[TMP3]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; FORCE: pred.load.if: +; FORCE-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; FORCE-NEXT: store i32 [[TMP0]], i32* @b, align 1 ; FORCE-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x i32], [3 x i32]* @a, i32 0, i32 [[TMP0]] ; FORCE-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 1 ; FORCE-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0 ; FORCE-NEXT: br label [[PRED_LOAD_CONTINUE]] ; FORCE: pred.load.continue: -; FORCE-NEXT: [[TMP9:%.*]] = phi <2 x i32> [ poison, [[PRED_STORE_CONTINUE2]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ] +; FORCE-NEXT: [[TMP9:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ] ; FORCE-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 ; FORCE-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4]] -; FORCE: pred.load.if3: +; FORCE: pred.load.if1: +; FORCE-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1 +; FORCE-NEXT: store i32 [[TMP1]], i32* @b, align 1 ; FORCE-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i32], [3 x i32]* @a, i32 0, i32 [[TMP1]] ; FORCE-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 1 ; FORCE-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP12]], i32 1 ; FORCE-NEXT: br label [[PRED_LOAD_CONTINUE4]] -; FORCE: pred.load.continue4: +; FORCE: pred.load.continue2: ; FORCE-NEXT: [[TMP14:%.*]] = phi <2 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF3]] ] ; FORCE-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2 ; FORCE-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], diff --git a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll --- a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll @@ -40,13 +40,13 @@ ; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 16 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[TMP10:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[TMP9:%.*]] ; CHECK: 9: -; CHECK-NEXT: br i1 undef, label [[TMP10]], label [[TMP9]], [[LOOP2:!llvm.loop !.*]] +; CHECK-NEXT: br i1 undef, label [[TMP10]], label [[TMP9]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: 10: ; CHECK-NEXT: ret void ; @@ -123,7 +123,7 @@ ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[DOT_PREHEADER_CRIT_EDGE:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -146,122 +146,74 @@ ; CHECK-NEXT: [[BROADCAST_SPLAT21:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT20]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY9:%.*]] ; CHECK: vector.body9: -; CHECK-NEXT: [[INDEX14:%.*]] = phi i64 [ 0, [[VECTOR_PH10]] ], [ [[INDEX_NEXT15:%.*]], [[PRED_STORE_CONTINUE51:%.*]] ] +; CHECK-NEXT: [[INDEX14:%.*]] = phi i64 [ 0, [[VECTOR_PH10]] ], [ [[INDEX_NEXT15:%.*]], [[PRED_STORE_CONTINUE37:%.*]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[I_0_LCSSA]], [[INDEX14]] -; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 1 -; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 2 -; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 3 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX14]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT29:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT28]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[VEC_IV:%.*]] = or <4 x i64> [[BROADCAST_SPLAT29]], -; CHECK-NEXT: [[TMP23:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT21]] -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP23]], i32 0 -; CHECK-NEXT: br i1 [[TMP24]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] -; CHECK: pred.load.if: -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP25]], align 4 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] -; CHECK: pred.load.continue: -; CHECK-NEXT: [[TMP27:%.*]] = phi i32 [ poison, [[VECTOR_BODY9]] ], [ [[TMP26]], [[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i1> [[TMP23]], i32 1 -; CHECK-NEXT: br i1 [[TMP28]], label [[PRED_LOAD_IF30:%.*]], label [[PRED_LOAD_CONTINUE31:%.*]] -; CHECK: pred.load.if30: -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP20]] -; CHECK-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE31]] -; CHECK: pred.load.continue31: -; CHECK-NEXT: [[TMP31:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP30]], [[PRED_LOAD_IF30]] ] -; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i1> [[TMP23]], i32 2 -; CHECK-NEXT: br i1 [[TMP32]], label [[PRED_LOAD_IF32:%.*]], label [[PRED_LOAD_CONTINUE33:%.*]] -; CHECK: pred.load.if32: -; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP21]] -; CHECK-NEXT: [[TMP34:%.*]] = load i32, i32* [[TMP33]], align 4 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE33]] -; CHECK: pred.load.continue33: -; CHECK-NEXT: [[TMP35:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE31]] ], [ [[TMP34]], [[PRED_LOAD_IF32]] ] -; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x i1> [[TMP23]], i32 3 -; CHECK-NEXT: br i1 [[TMP36]], label [[PRED_LOAD_IF34:%.*]], label [[PRED_LOAD_CONTINUE35:%.*]] -; CHECK: pred.load.if34: -; CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP22]] -; CHECK-NEXT: [[TMP38:%.*]] = load i32, i32* [[TMP37]], align 4 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE35]] -; CHECK: pred.load.continue35: -; CHECK-NEXT: [[TMP39:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE33]] ], [ [[TMP38]], [[PRED_LOAD_IF34]] ] -; CHECK-NEXT: [[TMP40:%.*]] = extractelement <4 x i1> [[TMP23]], i32 0 -; CHECK-NEXT: br i1 [[TMP40]], label [[PRED_LOAD_IF36:%.*]], label [[PRED_LOAD_CONTINUE37:%.*]] -; CHECK: pred.load.if36: -; CHECK-NEXT: [[TMP41:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP42:%.*]] = load i32, i32* [[TMP41]], align 4 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE37]] -; CHECK: pred.load.continue37: -; CHECK-NEXT: [[TMP43:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE35]] ], [ [[TMP42]], [[PRED_LOAD_IF36]] ] -; CHECK-NEXT: [[TMP44:%.*]] = extractelement <4 x i1> [[TMP23]], i32 1 -; CHECK-NEXT: br i1 [[TMP44]], label [[PRED_LOAD_IF38:%.*]], label [[PRED_LOAD_CONTINUE39:%.*]] -; CHECK: pred.load.if38: -; CHECK-NEXT: [[TMP45:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[TMP20]] -; CHECK-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE39]] -; CHECK: pred.load.continue39: -; CHECK-NEXT: [[TMP47:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE37]] ], [ [[TMP46]], [[PRED_LOAD_IF38]] ] -; CHECK-NEXT: [[TMP48:%.*]] = extractelement <4 x i1> [[TMP23]], i32 2 -; CHECK-NEXT: br i1 [[TMP48]], label [[PRED_LOAD_IF40:%.*]], label [[PRED_LOAD_CONTINUE41:%.*]] -; CHECK: pred.load.if40: -; CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[TMP21]] -; CHECK-NEXT: [[TMP50:%.*]] = load i32, i32* [[TMP49]], align 4 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE41]] -; CHECK: pred.load.continue41: -; CHECK-NEXT: [[TMP51:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE39]] ], [ [[TMP50]], [[PRED_LOAD_IF40]] ] -; CHECK-NEXT: [[TMP52:%.*]] = extractelement <4 x i1> [[TMP23]], i32 3 -; CHECK-NEXT: br i1 [[TMP52]], label [[PRED_LOAD_IF42:%.*]], label [[PRED_LOAD_CONTINUE43:%.*]] -; CHECK: pred.load.if42: -; CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[TMP22]] -; CHECK-NEXT: [[TMP54:%.*]] = load i32, i32* [[TMP53]], align 4 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE43]] -; CHECK: pred.load.continue43: -; CHECK-NEXT: [[TMP55:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE41]] ], [ [[TMP54]], [[PRED_LOAD_IF42]] ] -; CHECK-NEXT: [[TMP56:%.*]] = extractelement <4 x i1> [[TMP23]], i32 0 -; CHECK-NEXT: br i1 [[TMP56]], label [[PRED_STORE_IF44:%.*]], label [[PRED_STORE_CONTINUE45:%.*]] -; CHECK: pred.store.if44: -; CHECK-NEXT: [[TMP57:%.*]] = and i32 [[TMP43]], [[TMP27]] -; CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[OFFSET_IDX]] -; CHECK-NEXT: store i32 [[TMP57]], i32* [[TMP58]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE45]] -; CHECK: pred.store.continue45: -; CHECK-NEXT: [[TMP59:%.*]] = extractelement <4 x i1> [[TMP23]], i32 1 -; CHECK-NEXT: br i1 [[TMP59]], label [[PRED_STORE_IF46:%.*]], label [[PRED_STORE_CONTINUE47:%.*]] -; CHECK: pred.store.if46: -; CHECK-NEXT: [[TMP60:%.*]] = and i32 [[TMP47]], [[TMP31]] -; CHECK-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[TMP20]] -; CHECK-NEXT: store i32 [[TMP60]], i32* [[TMP61]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE47]] -; CHECK: pred.store.continue47: -; CHECK-NEXT: [[TMP62:%.*]] = extractelement <4 x i1> [[TMP23]], i32 2 -; CHECK-NEXT: br i1 [[TMP62]], label [[PRED_STORE_IF48:%.*]], label [[PRED_STORE_CONTINUE49:%.*]] -; CHECK: pred.store.if48: -; CHECK-NEXT: [[TMP63:%.*]] = and i32 [[TMP51]], [[TMP35]] -; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[TMP21]] -; CHECK-NEXT: store i32 [[TMP63]], i32* [[TMP64]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE49]] -; CHECK: pred.store.continue49: -; CHECK-NEXT: [[TMP65:%.*]] = extractelement <4 x i1> [[TMP23]], i32 3 -; CHECK-NEXT: br i1 [[TMP65]], label [[PRED_STORE_IF50:%.*]], label [[PRED_STORE_CONTINUE51]] -; CHECK: pred.store.if50: -; CHECK-NEXT: [[TMP66:%.*]] = and i32 [[TMP55]], [[TMP39]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[TMP22]] -; CHECK-NEXT: store i32 [[TMP66]], i32* [[TMP67]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE51]] -; CHECK: pred.store.continue51: +; CHECK-NEXT: [[TMP20:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT21]] +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP20]], i32 0 +; CHECK-NEXT: br i1 [[TMP21]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31:%.*]] +; CHECK: pred.store.if30: +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 +; CHECK-NEXT: [[TMP26:%.*]] = and i32 [[TMP25]], [[TMP23]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[OFFSET_IDX]] +; CHECK-NEXT: store i32 [[TMP26]], i32* [[TMP27]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE31]] +; CHECK: pred.store.continue31: +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i1> [[TMP20]], i32 1 +; CHECK-NEXT: br i1 [[TMP28]], label [[PRED_STORE_IF32:%.*]], label [[PRED_STORE_CONTINUE33:%.*]] +; CHECK: pred.store.if32: +; CHECK-NEXT: [[TMP29:%.*]] = add i64 [[OFFSET_IDX]], 1 +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP29]] +; CHECK-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4 +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[TMP29]] +; CHECK-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 +; CHECK-NEXT: [[TMP34:%.*]] = and i32 [[TMP33]], [[TMP31]] +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[TMP29]] +; CHECK-NEXT: store i32 [[TMP34]], i32* [[TMP35]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE33]] +; CHECK: pred.store.continue33: +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x i1> [[TMP20]], i32 2 +; CHECK-NEXT: br i1 [[TMP36]], label [[PRED_STORE_IF34:%.*]], label [[PRED_STORE_CONTINUE35:%.*]] +; CHECK: pred.store.if34: +; CHECK-NEXT: [[TMP37:%.*]] = add i64 [[OFFSET_IDX]], 2 +; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP37]] +; CHECK-NEXT: [[TMP39:%.*]] = load i32, i32* [[TMP38]], align 4 +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[TMP37]] +; CHECK-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 +; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], [[TMP39]] +; CHECK-NEXT: [[TMP43:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[TMP37]] +; CHECK-NEXT: store i32 [[TMP42]], i32* [[TMP43]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE35]] +; CHECK: pred.store.continue35: +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <4 x i1> [[TMP20]], i32 3 +; CHECK-NEXT: br i1 [[TMP44]], label [[PRED_STORE_IF36:%.*]], label [[PRED_STORE_CONTINUE37]] +; CHECK: pred.store.if36: +; CHECK-NEXT: [[TMP45:%.*]] = add i64 [[OFFSET_IDX]], 3 +; CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP45]] +; CHECK-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4 +; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[TMP45]] +; CHECK-NEXT: [[TMP49:%.*]] = load i32, i32* [[TMP48]], align 4 +; CHECK-NEXT: [[TMP50:%.*]] = and i32 [[TMP49]], [[TMP47]] +; CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[TMP45]] +; CHECK-NEXT: store i32 [[TMP50]], i32* [[TMP51]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE37]] +; CHECK: pred.store.continue37: ; CHECK-NEXT: [[INDEX_NEXT15]] = add i64 [[INDEX14]], 4 -; CHECK-NEXT: [[TMP68:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC13]] -; CHECK-NEXT: br i1 [[TMP68]], label [[MIDDLE_BLOCK7:%.*]], label [[VECTOR_BODY9]], [[LOOP5:!llvm.loop !.*]] +; CHECK-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC13]] +; CHECK-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK7:%.*]], label [[VECTOR_BODY9]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block7: ; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH8]] ; CHECK: scalar.ph8: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] ; CHECK: .lr.ph5: -; CHECK-NEXT: br i1 undef, label [[DOT_PREHEADER_CRIT_EDGE]], label [[DOTLR_PH5]], [[LOOP6:!llvm.loop !.*]] +; CHECK-NEXT: br i1 undef, label [[DOT_PREHEADER_CRIT_EDGE]], label [[DOTLR_PH5]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: .lr.ph: -; CHECK-NEXT: br i1 undef, label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], [[LOOP7:!llvm.loop !.*]] +; CHECK-NEXT: br i1 undef, label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: ._crit_edge.loopexit: ; CHECK-NEXT: br label [[DOT_CRIT_EDGE]] ; CHECK: ._crit_edge: @@ -324,86 +276,62 @@ ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE27:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE21:%.*]] ] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT14]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[VEC_IV:%.*]] = or <4 x i64> [[BROADCAST_SPLAT15]], ; CHECK-NEXT: [[TMP4:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] -; CHECK: pred.load.if: +; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK: pred.store.if: +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[P:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i32, i32* [[Q:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[NEXT_GEP10]], align 16 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] -; CHECK: pred.load.continue: -; CHECK-NEXT: [[TMP7:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP4]], i32 1 -; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]] -; CHECK: pred.load.if16: +; CHECK-NEXT: store i32 [[TMP6]], i32* [[NEXT_GEP]], align 16 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] +; CHECK: pred.store.continue: +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP4]], i32 1 +; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17:%.*]] +; CHECK: pred.store.if16: +; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i32, i32* [[P]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP9:%.*]] = or i64 [[INDEX]], 1 ; CHECK-NEXT: [[NEXT_GEP11:%.*]] = getelementptr i32, i32* [[Q]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[NEXT_GEP11]], align 16 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE17]] -; CHECK: pred.load.continue17: -; CHECK-NEXT: [[TMP11:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP10]], [[PRED_LOAD_IF16]] ] -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2 -; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_LOAD_IF18:%.*]], label [[PRED_LOAD_CONTINUE19:%.*]] -; CHECK: pred.load.if18: +; CHECK-NEXT: store i32 [[TMP10]], i32* [[NEXT_GEP7]], align 16 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE17]] +; CHECK: pred.store.continue17: +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2 +; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]] +; CHECK: pred.store.if18: +; CHECK-NEXT: [[TMP12:%.*]] = or i64 [[INDEX]], 2 +; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i32, i32* [[P]], i64 [[TMP12]] ; CHECK-NEXT: [[TMP13:%.*]] = or i64 [[INDEX]], 2 ; CHECK-NEXT: [[NEXT_GEP12:%.*]] = getelementptr i32, i32* [[Q]], i64 [[TMP13]] ; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[NEXT_GEP12]], align 16 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE19]] -; CHECK: pred.load.continue19: -; CHECK-NEXT: [[TMP15:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE17]] ], [ [[TMP14]], [[PRED_LOAD_IF18]] ] -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP4]], i32 3 -; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF20:%.*]], label [[PRED_LOAD_CONTINUE21:%.*]] -; CHECK: pred.load.if20: +; CHECK-NEXT: store i32 [[TMP14]], i32* [[NEXT_GEP8]], align 16 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE19]] +; CHECK: pred.store.continue19: +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[TMP4]], i32 3 +; CHECK-NEXT: br i1 [[TMP15]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21]] +; CHECK: pred.store.if20: +; CHECK-NEXT: [[TMP16:%.*]] = or i64 [[INDEX]], 3 +; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i32, i32* [[P]], i64 [[TMP16]] ; CHECK-NEXT: [[TMP17:%.*]] = or i64 [[INDEX]], 3 ; CHECK-NEXT: [[NEXT_GEP13:%.*]] = getelementptr i32, i32* [[Q]], i64 [[TMP17]] ; CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[NEXT_GEP13]], align 16 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE21]] -; CHECK: pred.load.continue21: -; CHECK-NEXT: [[TMP19:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE19]] ], [ [[TMP18]], [[PRED_LOAD_IF20]] ] -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0 -; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; CHECK: pred.store.if: -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[P:%.*]], i64 [[INDEX]] -; CHECK-NEXT: store i32 [[TMP7]], i32* [[NEXT_GEP]], align 16 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] -; CHECK: pred.store.continue: -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP4]], i32 1 -; CHECK-NEXT: br i1 [[TMP21]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]] -; CHECK: pred.store.if22: -; CHECK-NEXT: [[TMP22:%.*]] = or i64 [[INDEX]], 1 -; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i32, i32* [[P]], i64 [[TMP22]] -; CHECK-NEXT: store i32 [[TMP11]], i32* [[NEXT_GEP7]], align 16 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE23]] -; CHECK: pred.store.continue23: -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2 -; CHECK-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]] -; CHECK: pred.store.if24: -; CHECK-NEXT: [[TMP24:%.*]] = or i64 [[INDEX]], 2 -; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i32, i32* [[P]], i64 [[TMP24]] -; CHECK-NEXT: store i32 [[TMP15]], i32* [[NEXT_GEP8]], align 16 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE25]] -; CHECK: pred.store.continue25: -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i1> [[TMP4]], i32 3 -; CHECK-NEXT: br i1 [[TMP25]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27]] -; CHECK: pred.store.if26: -; CHECK-NEXT: [[TMP26:%.*]] = or i64 [[INDEX]], 3 -; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i32, i32* [[P]], i64 [[TMP26]] -; CHECK-NEXT: store i32 [[TMP19]], i32* [[NEXT_GEP9]], align 16 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE27]] -; CHECK: pred.store.continue27: +; CHECK-NEXT: store i32 [[TMP18]], i32* [[NEXT_GEP9]], align 16 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE21]] +; CHECK: pred.store.continue21: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]] +; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] ; CHECK: .lr.ph: -; CHECK-NEXT: br i1 undef, label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], [[LOOP9:!llvm.loop !.*]] +; CHECK-NEXT: br i1 undef, label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: ._crit_edge.loopexit: ; CHECK-NEXT: br label [[DOT_CRIT_EDGE]] ; CHECK: ._crit_edge: @@ -487,13 +415,13 @@ ; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[TMP7:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[TMP6:%.*]] ; CHECK: 6: -; CHECK-NEXT: br i1 undef, label [[TMP7]], label [[TMP6]], [[LOOP11:!llvm.loop !.*]] +; CHECK-NEXT: br i1 undef, label [[TMP7]], label [[TMP6]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: 7: ; CHECK-NEXT: ret void ; @@ -524,95 +452,71 @@ ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE22:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE16:%.*]] ] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = or <4 x i64> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i64> [[INDUCTION]], ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 -; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] -; CHECK: pred.load.if: -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[SRC:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP3:%.*]] = load i16, i16* [[NEXT_GEP]], align 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] -; CHECK: pred.load.continue: -; CHECK-NEXT: [[TMP4:%.*]] = phi i16 [ poison, [[VECTOR_BODY]] ], [ [[TMP3]], [[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1 -; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] -; CHECK: pred.load.if11: -; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[INDEX]], 1 -; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i16, i16* [[SRC]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = load i16, i16* [[NEXT_GEP4]], align 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE12]] -; CHECK: pred.load.continue12: -; CHECK-NEXT: [[TMP8:%.*]] = phi i16 [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP7]], [[PRED_LOAD_IF11]] ] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 -; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] -; CHECK: pred.load.if13: -; CHECK-NEXT: [[TMP10:%.*]] = or i64 [[INDEX]], 2 -; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i16, i16* [[SRC]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP11:%.*]] = load i16, i16* [[NEXT_GEP5]], align 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE14]] -; CHECK: pred.load.continue14: -; CHECK-NEXT: [[TMP12:%.*]] = phi i16 [ poison, [[PRED_LOAD_CONTINUE12]] ], [ [[TMP11]], [[PRED_LOAD_IF13]] ] -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3 -; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]] -; CHECK: pred.load.if15: -; CHECK-NEXT: [[TMP14:%.*]] = or i64 [[INDEX]], 3 -; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i16, i16* [[SRC]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP15:%.*]] = load i16, i16* [[NEXT_GEP6]], align 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE16]] -; CHECK: pred.load.continue16: -; CHECK-NEXT: [[TMP16:%.*]] = phi i16 [ poison, [[PRED_LOAD_CONTINUE14]] ], [ [[TMP15]], [[PRED_LOAD_IF15]] ] -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 -; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: ; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP18:%.*]] = zext i16 [[TMP4]] to i32 -; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw i32 [[TMP18]], 7 -; CHECK-NEXT: store i32 [[TMP19]], i32* [[NEXT_GEP7]], align 4 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[SRC:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = load i16, i16* [[NEXT_GEP]], align 2 +; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP3]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i32 [[TMP4]], 7 +; CHECK-NEXT: store i32 [[TMP5]], i32* [[NEXT_GEP7]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] ; CHECK: pred.store.continue: -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1 -; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]] -; CHECK: pred.store.if17: -; CHECK-NEXT: [[TMP21:%.*]] = or i64 [[INDEX]], 1 -; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP21]] -; CHECK-NEXT: [[TMP22:%.*]] = zext i16 [[TMP8]] to i32 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1 +; CHECK-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]] +; CHECK: pred.store.if11: +; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i16, i16* [[SRC]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = load i16, i16* [[NEXT_GEP4]], align 2 +; CHECK-NEXT: [[TMP10:%.*]] = zext i16 [[TMP9]] to i32 +; CHECK-NEXT: [[TMP11:%.*]] = shl nuw nsw i32 [[TMP10]], 7 +; CHECK-NEXT: store i32 [[TMP11]], i32* [[NEXT_GEP8]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE12]] +; CHECK: pred.store.continue12: +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 +; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]] +; CHECK: pred.store.if13: +; CHECK-NEXT: [[TMP13:%.*]] = or i64 [[INDEX]], 2 +; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = or i64 [[INDEX]], 2 +; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i16, i16* [[SRC]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = load i16, i16* [[NEXT_GEP5]], align 2 +; CHECK-NEXT: [[TMP16:%.*]] = zext i16 [[TMP15]] to i32 +; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw i32 [[TMP16]], 7 +; CHECK-NEXT: store i32 [[TMP17]], i32* [[NEXT_GEP9]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE14]] +; CHECK: pred.store.continue14: +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3 +; CHECK-NEXT: br i1 [[TMP18]], label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16]] +; CHECK: pred.store.if15: +; CHECK-NEXT: [[TMP19:%.*]] = or i64 [[INDEX]], 3 +; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP19]] +; CHECK-NEXT: [[TMP20:%.*]] = or i64 [[INDEX]], 3 +; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i16, i16* [[SRC]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP21:%.*]] = load i16, i16* [[NEXT_GEP6]], align 2 +; CHECK-NEXT: [[TMP22:%.*]] = zext i16 [[TMP21]] to i32 ; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 7 -; CHECK-NEXT: store i32 [[TMP23]], i32* [[NEXT_GEP8]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE18]] -; CHECK: pred.store.continue18: -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 -; CHECK-NEXT: br i1 [[TMP24]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]] -; CHECK: pred.store.if19: -; CHECK-NEXT: [[TMP25:%.*]] = or i64 [[INDEX]], 2 -; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP25]] -; CHECK-NEXT: [[TMP26:%.*]] = zext i16 [[TMP12]] to i32 -; CHECK-NEXT: [[TMP27:%.*]] = shl nuw nsw i32 [[TMP26]], 7 -; CHECK-NEXT: store i32 [[TMP27]], i32* [[NEXT_GEP9]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE20]] -; CHECK: pred.store.continue20: -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3 -; CHECK-NEXT: br i1 [[TMP28]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22]] -; CHECK: pred.store.if21: -; CHECK-NEXT: [[TMP29:%.*]] = or i64 [[INDEX]], 3 -; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP29]] -; CHECK-NEXT: [[TMP30:%.*]] = zext i16 [[TMP16]] to i32 -; CHECK-NEXT: [[TMP31:%.*]] = shl nuw nsw i32 [[TMP30]], 7 -; CHECK-NEXT: store i32 [[TMP31]], i32* [[NEXT_GEP10]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE22]] -; CHECK: pred.store.continue22: +; CHECK-NEXT: store i32 [[TMP23]], i32* [[NEXT_GEP10]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE16]] +; CHECK: pred.store.continue16: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 -; CHECK-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP12:!llvm.loop !.*]] +; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 +; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[TMP34:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br i1 true, label [[TMP26:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: br label [[TMP33:%.*]] -; CHECK: 33: -; CHECK-NEXT: br i1 undef, label [[TMP34]], label [[TMP33]], [[LOOP13:!llvm.loop !.*]] -; CHECK: 34: +; CHECK-NEXT: br label [[TMP25:%.*]] +; CHECK: 25: +; CHECK-NEXT: br i1 undef, label [[TMP26]], label [[TMP25]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK: 26: ; CHECK-NEXT: ret void ; br label %1 diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-pr39099.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-pr39099.ll --- a/llvm/test/Transforms/LoopVectorize/X86/x86-pr39099.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-pr39099.ll @@ -28,7 +28,7 @@ ;CHECK-NEXT: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}} ;CHECK-NEXT: %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], ;CHECK-NEXT: %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0 -;CHECK-NEXT: br i1 %[[M]], label %pred.load.if, label %pred.load.continue +;CHECK-NEXT: br i1 %[[M]], label %pred.store.if, label %pred.store.continue ;CHECK-NOT: %{{.+}} = load <16 x i8>, <16 x i8>* %{{.*}}, align 1 define dso_local void @masked_strided(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr { diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll --- a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll @@ -64,14 +64,18 @@ ; instead scalarized if Cost-Model so decided as part of its ; sink-scalar-operands optimization for predicated instructions. ; -; SINK-GATHER: vector.body: -; SINK-GATHER: pred.load.if: -; SINK-GATHER: %[[T0:.+]] = load i32, i32* %{{.*}}, align 4 -; SINK-GATHER: pred.load.continue: -; SINK-GATHER: %[[T1:.+]] = phi i32 [ poison, %vector.body ], [ %[[T0]], %pred.load.if ] -; SINK-GATHER: pred.udiv.if: -; SINK-GATHER: %{{.*}} = udiv i32 %[[T1]], %{{.*}} -; SINK-GATHER: pred.udiv.continue: +; SINK-GATHER-LABEL: @scalarize_and_sink_gather +; SINK-GATHER: vector.body: +; SINK-GATHER-LABEL: pred.udiv.if: ; preds = %vector.body +; SINK-GATHER-NEXT: [[EXT:%.+]] = extractelement <8 x i64> {{.*}}, i32 0 +; SINK-GATHER-NEXT: [[GEP:%.+]] = getelementptr inbounds i32, i32* %a, i64 [[EXT]] +; SINK-GATHER-NEXT: [[LV:%.+]] = load i32, i32* [[GEP]], align 4 +; SINK-GATHER-NEXT: [[UDIV:%.+]] = udiv i32 [[LV]], %x +; SINK-GATHER-NEXT: [[INS:%.+]] = insertelement <8 x i32> poison, i32 [[UDIV]], i32 0 +; SINK-GATHER-NEXT: br label %pred.udiv.continue +; SINK-GATHER: pred.udiv.continue: +; SINK-GATHER-NEXT: phi i32 [ poison, %vector.body ], [ [[LV]], %pred.udiv.if ] +; SINK-GATHER-NEXT: phi <8 x i32> [ poison, %vector.body ], [ [[INS]], %pred.udiv.if ] define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) { entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll @@ -90,40 +90,26 @@ ; CHECK-EMPTY: ; CHECK-NEXT: loop.0: ; CHECK-NEXT: WIDEN ir<%recur.next> = sext ir<%y> -; CHECK-NEXT: Successor(s): pred.srem -; CHECK-EMPTY: -; CHECK-NEXT: pred.srem: { -; CHECK-NEXT: pred.srem.entry: -; CHECK-NEXT: BRANCH-ON-MASK vp<%3> -; CHECK-NEXT: Successor(s): pred.srem.if, pred.srem.continue -; CHECK-NEXT: CondBit: vp<%3> (loop) -; CHECK-EMPTY: -; CHECK-NEXT: pred.srem.if: -; CHECK-NEXT: REPLICATE ir<%rem> = srem ir<%recur>, ir<%x> -; CHECK-NEXT: Successor(s): pred.srem.continue -; CHECK-EMPTY: -; CHECK-NEXT: pred.srem.continue: -; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%6> = ir<%rem> -; CHECK-NEXT: No successors -; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): loop.0.split ; CHECK-EMPTY: ; CHECK-NEXT: loop.0.split: -; CHECK-NEXT: Successor(s): pred.store +; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: ; CHECK-NEXT: pred.store: { -; CHECK-NEXT: pred.store.entry: -; CHECK-NEXT: BRANCH-ON-MASK vp<%3> -; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue -; CHECK-NEXT: CondBit: vp<%3> (loop) +; CHECK-NEXT: pred.store.entry: +; CHECK-NEXT: BRANCH-ON-MASK vp<%3> +; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue +; CHECK-NEXT: CondBit: vp<%3> (loop) ; CHECK-EMPTY: -; CHECK-NEXT: pred.store.if: -; CHECK-NEXT: REPLICATE ir<%add> = add vp<%6>, ir<%recur.next> +; CHECK-NEXT: pred.store.if: +; CHECK-NEXT: REPLICATE ir<%rem> = srem ir<%recur>, ir<%x> +; CHECK-NEXT: REPLICATE ir<%add> = add ir<%rem>, ir<%recur.next> ; CHECK-NEXT: REPLICATE ir<%gep> = getelementptr ir<%ptr>, ir<%iv> ; CHECK-NEXT: REPLICATE store ir<%add>, ir<%gep> ; CHECK-NEXT: Successor(s): pred.store.continue ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.continue: +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%9> = ir<%rem> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): loop.1 @@ -241,45 +227,30 @@ ; CHECK-EMPTY: ; CHECK-NEXT: loop.1: ; CHECK-NEXT: WIDEN ir<%conv> = sext vp<%6> -; CHECK-NEXT: Successor(s): pred.srem -; CHECK-EMPTY: -; CHECK-NEXT: pred.srem: { -; CHECK-NEXT: pred.srem.entry: -; CHECK-NEXT: BRANCH-ON-MASK vp<%3> -; CHECK-NEXT: Successor(s): pred.srem.if, pred.srem.continue -; CHECK-NEXT: CondBit: vp<%3> (loop) -; CHECK-EMPTY: -; CHECK-NEXT: pred.srem.if: -; CHECK-NEXT: REPLICATE ir<%rem> = srem ir<%0>, ir<%x> (S->V) -; CHECK-NEXT: Successor(s): pred.srem.continue -; CHECK-EMPTY: -; CHECK-NEXT: pred.srem.continue: -; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%9> = ir<%rem> -; CHECK-NEXT: No successors -; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): loop.1.split -; CHECK-EMPTY: -; CHECK-NEXT: loop.1.split: + +; CHECK: loop.1.split: ; CHECK-NEXT: Successor(s): pred.load -; CHECK-EMPTY: -; CHECK-NEXT: pred.load: { + +; CHECK: pred.load: { ; CHECK-NEXT: pred.load.entry: ; CHECK-NEXT: BRANCH-ON-MASK vp<%3> ; CHECK-NEXT: Successor(s): pred.load.if, pred.load.continue ; CHECK-NEXT: CondBit: vp<%3> (loop) -; CHECK-EMPTY: -; CHECK-NEXT: pred.load.if: + +; CHECK: pred.load.if: +; CHECK-NEXT: REPLICATE ir<%rem> = srem ir<%0>, ir<%x> (S->V) ; CHECK-NEXT: REPLICATE ir<%lv.2> = load ir<%gep> (S->V) ; CHECK-NEXT: Successor(s): pred.load.continue -; CHECK-EMPTY: -; CHECK-NEXT: pred.load.continue: + +; CHECK: pred.load.continue: +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%10> = ir<%rem> ; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%11> = ir<%lv.2> ; CHECK-NEXT: No successors ; CHECK-NEXT: } -; CHECK-NEXT: Successor(s): loop.2 -; CHECK-EMPTY: -; CHECK-NEXT: loop.2: -; CHECK-NEXT: WIDEN ir<%add.1> = add ir<%conv>, vp<%9> + +; CHECK: loop.2: +; CHECK-NEXT: WIDEN ir<%add.1> = add ir<%conv>, vp<%10> ; CHECK-NEXT: WIDEN ir<%conv.lv.2> = sext vp<%11> ; CHECK-NEXT: WIDEN ir<%add> = add ir<%add.1>, ir<%conv.lv.2> ; CHECK-NEXT: No successors @@ -338,21 +309,6 @@ ; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%6> = ir<%rem> ; CHECK-NEXT: No successors ; CHECK-NEXT: } - -; CHECK: pred.sdiv: { -; CHECK-NEXT: pred.sdiv.entry: -; CHECK-NEXT: BRANCH-ON-MASK vp<%3> -; CHECK-NEXT: Successor(s): pred.sdiv.if, pred.sdiv.continue -; CHECK-NEXT: CondBit: vp<%3> (loop) -; CHECK-EMPTY: -; CHECK-NEXT: pred.sdiv.if: -; CHECK-NEXT: REPLICATE ir<%rem.div> = sdiv ir<20>, vp<%6> -; CHECK-NEXT: Successor(s): pred.sdiv.continue -; CHECK-EMPTY: -; CHECK-NEXT: pred.sdiv.continue: -; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%8> = ir<%rem.div> -; CHECK-NEXT: No successors -; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): loop.1.split ; CHECK-EMPTY: ; CHECK-NEXT: loop.1.split: @@ -365,15 +321,18 @@ ; CHECK-NEXT: CondBit: vp<%3> (loop) ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.if: +; CHECK-NEXT: REPLICATE ir<%rem.div> = sdiv ir<20>, vp<%6> ; CHECK-NEXT: REPLICATE ir<%gep> = getelementptr ir<%ptr>, ir<%iv> -; CHECK-NEXT: REPLICATE store vp<%8>, ir<%gep> +; CHECK-NEXT: REPLICATE store ir<%rem.div>, ir<%gep> ; CHECK-NEXT: Successor(s): pred.store.continue ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.continue: +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%10> = ir<%rem.div> ; CHECK-NEXT: No successors ; CHECK-NEXT: } - -; CHECK: loop.2: +; CHECK-NEXT: Successor(s): loop.2 +; CHECK-EMPTY: +; CHECK-NEXT: loop.2: ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll b/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll --- a/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll +++ b/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll @@ -5,9 +5,7 @@ ; Test predication of non-void instructions, specifically (i) that these ; instructions permit vectorization and (ii) the creation of an insertelement -; and a Phi node. We check the full 2-element sequence for the first -; instruction; For the rest we'll just make sure they get predicated based -; on the code generated for the first element. +; and a Phi node. We check the full 2-element sequence for all predicate instructions. define void @test(i32* nocapture %asd, i32* nocapture %aud, i32* nocapture %asr, i32* nocapture %aur) { entry: @@ -25,53 +23,50 @@ ; CHECK: %[[SDA1:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0 ; CHECK: %[[SD0:[a-zA-Z0-9]+]] = sdiv i32 %[[SDA0]], %[[SDA1]] ; CHECK: %[[SD1:[a-zA-Z0-9]+]] = insertelement <2 x i32> poison, i32 %[[SD0]], i32 0 -; CHECK: br label %[[ESD]] -; CHECK: [[ESD]]: -; CHECK: %[[SDR:[a-zA-Z0-9]+]] = phi <2 x i32> [ poison, %vector.body ], [ %[[SD1]], %[[CSD]] ] -; CHECK: %[[SDEEH:[a-zA-Z0-9]+]] = extractelement <2 x i1> %{{.*}}, i32 1 -; CHECK: br i1 %[[SDEEH]], label %[[CSDH:[a-zA-Z0-9.]+]], label %[[ESDH:[a-zA-Z0-9.]+]] -; CHECK: [[CSDH]]: -; CHECK: %[[SDA0H:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 1 -; CHECK: %[[SDA1H:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 1 -; CHECK: %[[SD0H:[a-zA-Z0-9]+]] = sdiv i32 %[[SDA0H]], %[[SDA1H]] -; CHECK: %[[SD1H:[a-zA-Z0-9]+]] = insertelement <2 x i32> %[[SDR]], i32 %[[SD0H]], i32 1 -; CHECK: br label %[[ESDH]] -; CHECK: [[ESDH]]: -; CHECK: %{{.*}} = phi <2 x i32> [ %[[SDR]], %[[ESD]] ], [ %[[SD1H]], %[[CSDH]] ] - -; CHECK: %[[UDEE:[a-zA-Z0-9]+]] = extractelement <2 x i1> %{{.*}}, i32 0 -; CHECK: br i1 %[[UDEE]], label %[[CUD:[a-zA-Z0-9.]+]], label %[[EUD:[a-zA-Z0-9.]+]] -; CHECK: [[CUD]]: ; CHECK: %[[UDA0:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0 ; CHECK: %[[UDA1:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0 ; CHECK: %[[UD0:[a-zA-Z0-9]+]] = udiv i32 %[[UDA0]], %[[UDA1]] ; CHECK: %[[UD1:[a-zA-Z0-9]+]] = insertelement <2 x i32> poison, i32 %[[UD0]], i32 0 -; CHECK: br label %[[EUD]] -; CHECK: [[EUD]]: -; CHECK: %{{.*}} = phi <2 x i32> [ poison, %{{.*}} ], [ %[[UD1]], %[[CUD]] ] - -; CHECK: %[[SREE:[a-zA-Z0-9]+]] = extractelement <2 x i1> %{{.*}}, i32 0 -; CHECK: br i1 %[[SREE]], label %[[CSR:[a-zA-Z0-9.]+]], label %[[ESR:[a-zA-Z0-9.]+]] -; CHECK: [[CSR]]: ; CHECK: %[[SRA0:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0 ; CHECK: %[[SRA1:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0 ; CHECK: %[[SR0:[a-zA-Z0-9]+]] = srem i32 %[[SRA0]], %[[SRA1]] ; CHECK: %[[SR1:[a-zA-Z0-9]+]] = insertelement <2 x i32> poison, i32 %[[SR0]], i32 0 -; CHECK: br label %[[ESR]] -; CHECK: [[ESR]]: -; CHECK: %{{.*}} = phi <2 x i32> [ poison, %{{.*}} ], [ %[[SR1]], %[[CSR]] ] - -; CHECK: %[[UREE:[a-zA-Z0-9]+]] = extractelement <2 x i1> %{{.*}}, i32 0 -; CHECK: br i1 %[[UREE]], label %[[CUR:[a-zA-Z0-9.]+]], label %[[EUR:[a-zA-Z0-9.]+]] -; CHECK: [[CUR]]: ; CHECK: %[[URA0:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0 ; CHECK: %[[URA1:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0 ; CHECK: %[[UR0:[a-zA-Z0-9]+]] = urem i32 %[[URA0]], %[[URA1]] ; CHECK: %[[UR1:[a-zA-Z0-9]+]] = insertelement <2 x i32> poison, i32 %[[UR0]], i32 0 -; CHECK: br label %[[EUR]] -; CHECK: [[EUR]]: -; CHECK: %{{.*}} = phi <2 x i32> [ poison, %{{.*}} ], [ %[[UR1]], %[[CUR]] ] - +; CHECK: br label %[[ESD]] +; CHECK: [[ESD]]: +; CHECK: [[SDR:%[a-zA-Z0-9]+]] = phi <2 x i32> [ poison, %vector.body ], [ %[[SD1]], %[[CSD]] ] +; CHECK: [[UDR:%.+]] = phi <2 x i32> [ poison, %{{.*}} ], [ %[[UD1]], %[[CSD]] ] +; CHECK: [[SRR:%.+]] = phi <2 x i32> [ poison, %{{.*}} ], [ %[[SR1]], %[[CSD]] ] +; CHECK: [[URR:%.+]] = phi <2 x i32> [ poison, %{{.*}} ], [ %[[UR1]], %[[CSD]] ] +; CHECK: %[[SDEEH:[a-zA-Z0-9]+]] = extractelement <2 x i1> %{{.*}}, i32 1 +; CHECK: br i1 %[[SDEEH]], label %[[CSDH:[a-zA-Z0-9.]+]], label %[[ESDH:[a-zA-Z0-9.]+]] +; CHECK: [[CSDH]]: +; CHECK: %[[SD1_A0H:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 1 +; CHECK: %[[SD1_A1H:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 1 +; CHECK: %[[SD1_0H:[a-zA-Z0-9]+]] = sdiv i32 %[[SD1_A0H]], %[[SD1_A1H]] +; CHECK: %[[SD1_1H:[a-zA-Z0-9]+]] = insertelement <2 x i32> [[SDR]], i32 %[[SD1_0H]], i32 1 +; CHECK: %[[UD1_A0:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 1 +; CHECK: %[[UD1_A1:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 1 +; CHECK: %[[UD1_0:[a-zA-Z0-9]+]] = udiv i32 %[[UD1_A0]], %[[UD1_A1]] +; CHECK: %[[UD1_1:[a-zA-Z0-9]+]] = insertelement <2 x i32> [[UDR]], i32 %[[UD1_0]], i32 1 +; CHECK: %[[SR1_A0:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 1 +; CHECK: %[[SR1_A1:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 1 +; CHECK: %[[SR1_0:[a-zA-Z0-9]+]] = srem i32 %[[SR1_A0]], %[[SR1_A1]] +; CHECK: %[[SR1_1:[a-zA-Z0-9]+]] = insertelement <2 x i32> [[SRR]], i32 %[[SR1_0]], i32 1 +; CHECK: %[[UR1_A0:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 1 +; CHECK: %[[UR1_A1:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 1 +; CHECK: %[[UR1_0:[a-zA-Z0-9]+]] = urem i32 %[[UR1_A0]], %[[UR1_A1]] +; CHECK: %[[UR1_1:[a-zA-Z0-9]+]] = insertelement <2 x i32> [[URR]], i32 %[[UR1_0]], i32 1 +; CHECK: br label %[[ESDH]] +; CHECK: [[ESDH]]: +; CHECK: [[SDR1:%[a-zA-Z0-9]+]] = phi <2 x i32> [ [[SDR]], %[[ESD]] ], [ %[[SD1_1H]], %[[CSDH]] ] +; CHECK: [[UDR1:%.+]] = phi <2 x i32> [ [[UDR]], %{{.*}} ], [ %[[UD1_1]], %[[CSDH]] ] +; CHECK: [[SRR1:%.+]] = phi <2 x i32> [ [[SRR]], %{{.*}} ], [ %[[SR1_1]], %[[CSDH]] ] +; CHECK: [[URR1:%.+]] = phi <2 x i32> [ [[URR]], %{{.*}} ], [ %[[UR1_1]], %[[CSDH]] ] +; for.body: ; preds = %if.end, %entry %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end ] %isd = getelementptr inbounds i32, i32* %asd, i64 %indvars.iv diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll @@ -96,100 +96,80 @@ ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE14:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE14]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP52:%.*]], [[PRED_LOAD_CONTINUE14]] ] -; CHECK-NEXT: [[VEC_IND15:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT16:%.*]], [[PRED_LOAD_CONTINUE14]] ] -; CHECK-NEXT: [[TMP0:%.*]] = or i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0 -; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-NEXT: [[VEC_IND7:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT8:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0 +; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP6]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK: pred.load.continue: -; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1 -; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] +; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1 +; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] ; CHECK: pred.load.if1: -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP13]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP16]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]] ; CHECK: pred.load.continue2: -; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], [[PRED_LOAD_IF1]] ] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2 -; CHECK-NEXT: br i1 [[TMP14]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP19:%.*]] = phi <4 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2 +; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] ; CHECK: pred.load.if3: -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP16]], i32 2 +; CHECK-NEXT: [[TMP21:%.*]] = or i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP23]], i32 2 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP25]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP26]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]] ; CHECK: pred.load.continue4: -; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP13]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP17]], [[PRED_LOAD_IF3]] ] -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3 -; CHECK-NEXT: br i1 [[TMP19]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] +; CHECK-NEXT: [[TMP28:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP24]], [[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP29:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP27]], [[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3 +; CHECK-NEXT: br i1 [[TMP30]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]] ; CHECK: pred.load.if5: -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[TMP20]], align 4 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP21]], i32 3 +; CHECK-NEXT: [[TMP31:%.*]] = or i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[TMP33]], i32 3 +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP31]] +; CHECK-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP29]], i32 [[TMP36]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]] ; CHECK: pred.load.continue6: -; CHECK-NEXT: [[TMP23:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP22]], [[PRED_LOAD_IF5]] ] -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0 -; CHECK-NEXT: br i1 [[TMP24]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] -; CHECK: pred.load.if7: -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP25]], align 4 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> poison, i32 [[TMP26]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE8]] -; CHECK: pred.load.continue8: -; CHECK-NEXT: [[TMP28:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE6]] ], [ [[TMP27]], [[PRED_LOAD_IF7]] ] -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1 -; CHECK-NEXT: br i1 [[TMP29]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] -; CHECK: pred.load.if9: -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4 -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[TMP31]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE10]] -; CHECK: pred.load.continue10: -; CHECK-NEXT: [[TMP33:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP32]], [[PRED_LOAD_IF9]] ] -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2 -; CHECK-NEXT: br i1 [[TMP34]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] -; CHECK: pred.load.if11: -; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP33]], i32 [[TMP36]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE12]] -; CHECK: pred.load.continue12: -; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP33]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP37]], [[PRED_LOAD_IF11]] ] -; CHECK-NEXT: [[TMP39:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3 -; CHECK-NEXT: br i1 [[TMP39]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14]] -; CHECK: pred.load.if13: -; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 -; CHECK-NEXT: [[TMP42:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP41]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE14]] -; CHECK: pred.load.continue14: -; CHECK-NEXT: [[TMP43:%.*]] = phi <4 x i32> [ [[TMP38]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP42]], [[PRED_LOAD_IF13]] ] -; CHECK-NEXT: [[TMP44:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_IND15]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP44]]) -; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP45]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP47:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP23]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP48:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP47]]) -; CHECK-NEXT: [[TMP49:%.*]] = add i32 [[TMP48]], [[TMP46]] -; CHECK-NEXT: [[TMP50:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP43]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP51:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP50]]) -; CHECK-NEXT: [[TMP52]] = add i32 [[TMP51]], [[TMP49]] +; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP34]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP39:%.*]] = phi <4 x i32> [ [[TMP29]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP37]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP40:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[VEC_IND7]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP40]]) +; CHECK-NEXT: [[TMP42:%.*]] = add i32 [[TMP41]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP43:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP38]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP43]]) +; CHECK-NEXT: [[TMP45:%.*]] = add i32 [[TMP44]], [[TMP42]] +; CHECK-NEXT: [[TMP46:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP39]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP47:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP46]]) +; CHECK-NEXT: [[TMP48]] = add i32 [[TMP47]], [[TMP45]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[VEC_IND_NEXT16]] = add <4 x i32> [[VEC_IND15]], -; CHECK-NEXT: [[TMP53:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 -; CHECK-NEXT: br i1 [[TMP53]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[VEC_IND_NEXT8]] = add <4 x i32> [[VEC_IND7]], +; CHECK-NEXT: [[TMP49:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 +; CHECK-NEXT: br i1 [[TMP49]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -197,7 +177,7 @@ ; CHECK: .lr.ph: ; CHECK-NEXT: br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: ._crit_edge: -; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP52]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP48]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]] ; entry: @@ -321,100 +301,80 @@ ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE14:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE14]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 1, [[VECTOR_PH]] ], [ [[TMP52:%.*]], [[PRED_LOAD_CONTINUE14]] ] -; CHECK-NEXT: [[VEC_IND15:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT16:%.*]], [[PRED_LOAD_CONTINUE14]] ] -; CHECK-NEXT: [[TMP0:%.*]] = or i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0 -; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 1, [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-NEXT: [[VEC_IND7:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT8:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0 +; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP6]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK: pred.load.continue: -; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1 -; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] +; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1 +; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] ; CHECK: pred.load.if1: -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP13]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP16]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]] ; CHECK: pred.load.continue2: -; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], [[PRED_LOAD_IF1]] ] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2 -; CHECK-NEXT: br i1 [[TMP14]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP19:%.*]] = phi <4 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2 +; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] ; CHECK: pred.load.if3: -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP16]], i32 2 +; CHECK-NEXT: [[TMP21:%.*]] = or i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP23]], i32 2 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP25]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP26]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]] ; CHECK: pred.load.continue4: -; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP13]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP17]], [[PRED_LOAD_IF3]] ] -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3 -; CHECK-NEXT: br i1 [[TMP19]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] +; CHECK-NEXT: [[TMP28:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP24]], [[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP29:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP27]], [[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3 +; CHECK-NEXT: br i1 [[TMP30]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]] ; CHECK: pred.load.if5: -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[TMP20]], align 4 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP21]], i32 3 +; CHECK-NEXT: [[TMP31:%.*]] = or i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[TMP33]], i32 3 +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP31]] +; CHECK-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP29]], i32 [[TMP36]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]] ; CHECK: pred.load.continue6: -; CHECK-NEXT: [[TMP23:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP22]], [[PRED_LOAD_IF5]] ] -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0 -; CHECK-NEXT: br i1 [[TMP24]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] -; CHECK: pred.load.if7: -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP25]], align 4 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> poison, i32 [[TMP26]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE8]] -; CHECK: pred.load.continue8: -; CHECK-NEXT: [[TMP28:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE6]] ], [ [[TMP27]], [[PRED_LOAD_IF7]] ] -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1 -; CHECK-NEXT: br i1 [[TMP29]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] -; CHECK: pred.load.if9: -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4 -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[TMP31]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE10]] -; CHECK: pred.load.continue10: -; CHECK-NEXT: [[TMP33:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP32]], [[PRED_LOAD_IF9]] ] -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2 -; CHECK-NEXT: br i1 [[TMP34]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] -; CHECK: pred.load.if11: -; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP33]], i32 [[TMP36]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE12]] -; CHECK: pred.load.continue12: -; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP33]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP37]], [[PRED_LOAD_IF11]] ] -; CHECK-NEXT: [[TMP39:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3 -; CHECK-NEXT: br i1 [[TMP39]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14]] -; CHECK: pred.load.if13: -; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 -; CHECK-NEXT: [[TMP42:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP41]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE14]] -; CHECK: pred.load.continue14: -; CHECK-NEXT: [[TMP43:%.*]] = phi <4 x i32> [ [[TMP38]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP42]], [[PRED_LOAD_IF13]] ] -; CHECK-NEXT: [[TMP44:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_IND15]], <4 x i32> -; CHECK-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP44]]) -; CHECK-NEXT: [[TMP46:%.*]] = mul i32 [[TMP45]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP47:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP23]], <4 x i32> -; CHECK-NEXT: [[TMP48:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP47]]) -; CHECK-NEXT: [[TMP49:%.*]] = mul i32 [[TMP48]], [[TMP46]] -; CHECK-NEXT: [[TMP50:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP43]], <4 x i32> -; CHECK-NEXT: [[TMP51:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP50]]) -; CHECK-NEXT: [[TMP52]] = mul i32 [[TMP51]], [[TMP49]] +; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP34]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP39:%.*]] = phi <4 x i32> [ [[TMP29]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP37]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP40:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[VEC_IND7]], <4 x i32> +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP40]]) +; CHECK-NEXT: [[TMP42:%.*]] = mul i32 [[TMP41]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP43:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP38]], <4 x i32> +; CHECK-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP43]]) +; CHECK-NEXT: [[TMP45:%.*]] = mul i32 [[TMP44]], [[TMP42]] +; CHECK-NEXT: [[TMP46:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP39]], <4 x i32> +; CHECK-NEXT: [[TMP47:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP46]]) +; CHECK-NEXT: [[TMP48]] = mul i32 [[TMP47]], [[TMP45]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[VEC_IND_NEXT16]] = add <4 x i32> [[VEC_IND15]], -; CHECK-NEXT: [[TMP53:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 -; CHECK-NEXT: br i1 [[TMP53]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[VEC_IND_NEXT8]] = add <4 x i32> [[VEC_IND7]], +; CHECK-NEXT: [[TMP49:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 +; CHECK-NEXT: br i1 [[TMP49]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -422,7 +382,7 @@ ; CHECK: .lr.ph: ; CHECK-NEXT: br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: ._crit_edge: -; CHECK-NEXT: [[PROD_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP52]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[PROD_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP48]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[PROD_0_LCSSA]] ; entry: @@ -456,98 +416,78 @@ ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE14:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE14]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP50:%.*]], [[PRED_LOAD_CONTINUE14]] ] -; CHECK-NEXT: [[VEC_IND15:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT16:%.*]], [[PRED_LOAD_CONTINUE14]] ] -; CHECK-NEXT: [[TMP0:%.*]] = or i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0 -; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP46:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-NEXT: [[VEC_IND7:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT8:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0 +; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP6]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK: pred.load.continue: -; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1 -; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] +; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1 +; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] ; CHECK: pred.load.if1: -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP13]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP16]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]] ; CHECK: pred.load.continue2: -; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], [[PRED_LOAD_IF1]] ] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2 -; CHECK-NEXT: br i1 [[TMP14]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP19:%.*]] = phi <4 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2 +; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] ; CHECK: pred.load.if3: -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP16]], i32 2 +; CHECK-NEXT: [[TMP21:%.*]] = or i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP23]], i32 2 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP25]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP26]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]] ; CHECK: pred.load.continue4: -; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP13]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP17]], [[PRED_LOAD_IF3]] ] -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3 -; CHECK-NEXT: br i1 [[TMP19]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] +; CHECK-NEXT: [[TMP28:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP24]], [[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP29:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP27]], [[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3 +; CHECK-NEXT: br i1 [[TMP30]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]] ; CHECK: pred.load.if5: -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[TMP20]], align 4 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP21]], i32 3 +; CHECK-NEXT: [[TMP31:%.*]] = or i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[TMP33]], i32 3 +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP31]] +; CHECK-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP29]], i32 [[TMP36]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]] ; CHECK: pred.load.continue6: -; CHECK-NEXT: [[TMP23:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP22]], [[PRED_LOAD_IF5]] ] -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0 -; CHECK-NEXT: br i1 [[TMP24]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] -; CHECK: pred.load.if7: -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP25]], align 4 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> poison, i32 [[TMP26]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE8]] -; CHECK: pred.load.continue8: -; CHECK-NEXT: [[TMP28:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE6]] ], [ [[TMP27]], [[PRED_LOAD_IF7]] ] -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1 -; CHECK-NEXT: br i1 [[TMP29]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] -; CHECK: pred.load.if9: -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4 -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[TMP31]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE10]] -; CHECK: pred.load.continue10: -; CHECK-NEXT: [[TMP33:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP32]], [[PRED_LOAD_IF9]] ] -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2 -; CHECK-NEXT: br i1 [[TMP34]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] -; CHECK: pred.load.if11: -; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP33]], i32 [[TMP36]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE12]] -; CHECK: pred.load.continue12: -; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP33]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP37]], [[PRED_LOAD_IF11]] ] -; CHECK-NEXT: [[TMP39:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3 -; CHECK-NEXT: br i1 [[TMP39]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14]] -; CHECK: pred.load.if13: -; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 -; CHECK-NEXT: [[TMP42:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP41]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE14]] -; CHECK: pred.load.continue14: -; CHECK-NEXT: [[TMP43:%.*]] = phi <4 x i32> [ [[TMP38]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP42]], [[PRED_LOAD_IF13]] ] -; CHECK-NEXT: [[TMP44:%.*]] = mul nsw <4 x i32> [[TMP43]], [[TMP23]] -; CHECK-NEXT: [[TMP45:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_IND15]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP46:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP45]]) -; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP46]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP48:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP44]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP49:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP48]]) -; CHECK-NEXT: [[TMP50]] = add i32 [[TMP49]], [[TMP47]] +; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP34]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP39:%.*]] = phi <4 x i32> [ [[TMP29]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP37]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP40:%.*]] = mul nsw <4 x i32> [[TMP39]], [[TMP38]] +; CHECK-NEXT: [[TMP41:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[VEC_IND7]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP41]]) +; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP42]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP44:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP40]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP44]]) +; CHECK-NEXT: [[TMP46]] = add i32 [[TMP45]], [[TMP43]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[VEC_IND_NEXT16]] = add <4 x i32> [[VEC_IND15]], -; CHECK-NEXT: [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 -; CHECK-NEXT: br i1 [[TMP51]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[VEC_IND_NEXT8]] = add <4 x i32> [[VEC_IND7]], +; CHECK-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 +; CHECK-NEXT: br i1 [[TMP47]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -555,7 +495,7 @@ ; CHECK: .lr.ph: ; CHECK-NEXT: br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: ._crit_edge: -; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP50]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP46]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]] ; entry: @@ -589,95 +529,75 @@ ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE14:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE14]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 19, [[VECTOR_PH]] ], [ [[TMP49:%.*]], [[PRED_LOAD_CONTINUE14]] ] -; CHECK-NEXT: [[TMP0:%.*]] = or i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0 -; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 19, [[VECTOR_PH]] ], [ [[TMP45:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0 +; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP6]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK: pred.load.continue: -; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1 -; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] +; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1 +; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] ; CHECK: pred.load.if1: -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP13]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP16]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]] ; CHECK: pred.load.continue2: -; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], [[PRED_LOAD_IF1]] ] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2 -; CHECK-NEXT: br i1 [[TMP14]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP19:%.*]] = phi <4 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2 +; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] ; CHECK: pred.load.if3: -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP16]], i32 2 +; CHECK-NEXT: [[TMP21:%.*]] = or i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP23]], i32 2 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP25]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP26]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]] ; CHECK: pred.load.continue4: -; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP13]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP17]], [[PRED_LOAD_IF3]] ] -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3 -; CHECK-NEXT: br i1 [[TMP19]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] +; CHECK-NEXT: [[TMP28:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP24]], [[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP29:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP27]], [[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3 +; CHECK-NEXT: br i1 [[TMP30]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]] ; CHECK: pred.load.if5: -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[TMP20]], align 4 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP21]], i32 3 +; CHECK-NEXT: [[TMP31:%.*]] = or i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[TMP33]], i32 3 +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP31]] +; CHECK-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP29]], i32 [[TMP36]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]] ; CHECK: pred.load.continue6: -; CHECK-NEXT: [[TMP23:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP22]], [[PRED_LOAD_IF5]] ] -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0 -; CHECK-NEXT: br i1 [[TMP24]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] -; CHECK: pred.load.if7: -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP25]], align 4 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> poison, i32 [[TMP26]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE8]] -; CHECK: pred.load.continue8: -; CHECK-NEXT: [[TMP28:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE6]] ], [ [[TMP27]], [[PRED_LOAD_IF7]] ] -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1 -; CHECK-NEXT: br i1 [[TMP29]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] -; CHECK: pred.load.if9: -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4 -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[TMP31]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE10]] -; CHECK: pred.load.continue10: -; CHECK-NEXT: [[TMP33:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP32]], [[PRED_LOAD_IF9]] ] -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2 -; CHECK-NEXT: br i1 [[TMP34]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] -; CHECK: pred.load.if11: -; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP33]], i32 [[TMP36]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE12]] -; CHECK: pred.load.continue12: -; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP33]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP37]], [[PRED_LOAD_IF11]] ] -; CHECK-NEXT: [[TMP39:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3 -; CHECK-NEXT: br i1 [[TMP39]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14]] -; CHECK: pred.load.if13: -; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 -; CHECK-NEXT: [[TMP42:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP41]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE14]] -; CHECK: pred.load.continue14: -; CHECK-NEXT: [[TMP43:%.*]] = phi <4 x i32> [ [[TMP38]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP42]], [[PRED_LOAD_IF13]] ] -; CHECK-NEXT: [[TMP44:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP23]], <4 x i32> -; CHECK-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP44]]) -; CHECK-NEXT: [[TMP46:%.*]] = mul i32 [[TMP45]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP47:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP43]], <4 x i32> -; CHECK-NEXT: [[TMP48:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP47]]) -; CHECK-NEXT: [[TMP49]] = mul i32 [[TMP48]], [[TMP46]] +; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP34]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP39:%.*]] = phi <4 x i32> [ [[TMP29]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP37]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP40:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP38]], <4 x i32> +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP40]]) +; CHECK-NEXT: [[TMP42:%.*]] = mul i32 [[TMP41]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP43:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP39]], <4 x i32> +; CHECK-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP43]]) +; CHECK-NEXT: [[TMP45]] = mul i32 [[TMP44]], [[TMP42]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 -; CHECK-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 +; CHECK-NEXT: br i1 [[TMP46]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -685,7 +605,7 @@ ; CHECK: .lr.ph: ; CHECK-NEXT: br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: ._crit_edge: -; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP49]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP45]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]] ; entry: @@ -717,95 +637,75 @@ ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE14:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE14]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ -1, [[VECTOR_PH]] ], [ [[TMP49:%.*]], [[PRED_LOAD_CONTINUE14]] ] -; CHECK-NEXT: [[TMP0:%.*]] = or i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0 -; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ -1, [[VECTOR_PH]] ], [ [[TMP45:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0 +; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP6]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK: pred.load.continue: -; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1 -; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] +; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1 +; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] ; CHECK: pred.load.if1: -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP13]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP16]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]] ; CHECK: pred.load.continue2: -; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], [[PRED_LOAD_IF1]] ] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2 -; CHECK-NEXT: br i1 [[TMP14]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP19:%.*]] = phi <4 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2 +; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] ; CHECK: pred.load.if3: -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP16]], i32 2 +; CHECK-NEXT: [[TMP21:%.*]] = or i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP23]], i32 2 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP25]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP26]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]] ; CHECK: pred.load.continue4: -; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP13]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP17]], [[PRED_LOAD_IF3]] ] -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3 -; CHECK-NEXT: br i1 [[TMP19]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] +; CHECK-NEXT: [[TMP28:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP24]], [[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP29:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP27]], [[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3 +; CHECK-NEXT: br i1 [[TMP30]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]] ; CHECK: pred.load.if5: -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[TMP20]], align 4 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP21]], i32 3 +; CHECK-NEXT: [[TMP31:%.*]] = or i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[TMP33]], i32 3 +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP31]] +; CHECK-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP29]], i32 [[TMP36]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]] ; CHECK: pred.load.continue6: -; CHECK-NEXT: [[TMP23:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP22]], [[PRED_LOAD_IF5]] ] -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0 -; CHECK-NEXT: br i1 [[TMP24]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] -; CHECK: pred.load.if7: -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP25]], align 4 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> poison, i32 [[TMP26]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE8]] -; CHECK: pred.load.continue8: -; CHECK-NEXT: [[TMP28:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE6]] ], [ [[TMP27]], [[PRED_LOAD_IF7]] ] -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1 -; CHECK-NEXT: br i1 [[TMP29]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] -; CHECK: pred.load.if9: -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4 -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[TMP31]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE10]] -; CHECK: pred.load.continue10: -; CHECK-NEXT: [[TMP33:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP32]], [[PRED_LOAD_IF9]] ] -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2 -; CHECK-NEXT: br i1 [[TMP34]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] -; CHECK: pred.load.if11: -; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP33]], i32 [[TMP36]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE12]] -; CHECK: pred.load.continue12: -; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP33]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP37]], [[PRED_LOAD_IF11]] ] -; CHECK-NEXT: [[TMP39:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3 -; CHECK-NEXT: br i1 [[TMP39]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14]] -; CHECK: pred.load.if13: -; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 -; CHECK-NEXT: [[TMP42:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP41]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE14]] -; CHECK: pred.load.continue14: -; CHECK-NEXT: [[TMP43:%.*]] = phi <4 x i32> [ [[TMP38]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP42]], [[PRED_LOAD_IF13]] ] -; CHECK-NEXT: [[TMP44:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP23]], <4 x i32> -; CHECK-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP44]]) -; CHECK-NEXT: [[TMP46:%.*]] = and i32 [[TMP45]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP47:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP43]], <4 x i32> -; CHECK-NEXT: [[TMP48:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP47]]) -; CHECK-NEXT: [[TMP49]] = and i32 [[TMP48]], [[TMP46]] +; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP34]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP39:%.*]] = phi <4 x i32> [ [[TMP29]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP37]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP40:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP38]], <4 x i32> +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP40]]) +; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP43:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP39]], <4 x i32> +; CHECK-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP43]]) +; CHECK-NEXT: [[TMP45]] = and i32 [[TMP44]], [[TMP42]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 -; CHECK-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 +; CHECK-NEXT: br i1 [[TMP46]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -813,7 +713,7 @@ ; CHECK: for.body: ; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP49]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP45]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]] ; entry: @@ -845,93 +745,73 @@ ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE14:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE14]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP47:%.*]], [[PRED_LOAD_CONTINUE14]] ] -; CHECK-NEXT: [[TMP0:%.*]] = or i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0 -; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP43:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0 +; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP6]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK: pred.load.continue: -; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1 -; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] +; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1 +; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] ; CHECK: pred.load.if1: -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP13]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP16]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]] ; CHECK: pred.load.continue2: -; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], [[PRED_LOAD_IF1]] ] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2 -; CHECK-NEXT: br i1 [[TMP14]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP19:%.*]] = phi <4 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2 +; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] ; CHECK: pred.load.if3: -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP16]], i32 2 +; CHECK-NEXT: [[TMP21:%.*]] = or i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP23]], i32 2 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP25]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP26]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]] ; CHECK: pred.load.continue4: -; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP13]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP17]], [[PRED_LOAD_IF3]] ] -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3 -; CHECK-NEXT: br i1 [[TMP19]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] +; CHECK-NEXT: [[TMP28:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP24]], [[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP29:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP27]], [[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3 +; CHECK-NEXT: br i1 [[TMP30]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]] ; CHECK: pred.load.if5: -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[TMP20]], align 4 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP21]], i32 3 +; CHECK-NEXT: [[TMP31:%.*]] = or i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[TMP33]], i32 3 +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP31]] +; CHECK-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP29]], i32 [[TMP36]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]] ; CHECK: pred.load.continue6: -; CHECK-NEXT: [[TMP23:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP22]], [[PRED_LOAD_IF5]] ] -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0 -; CHECK-NEXT: br i1 [[TMP24]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] -; CHECK: pred.load.if7: -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP25]], align 4 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> poison, i32 [[TMP26]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE8]] -; CHECK: pred.load.continue8: -; CHECK-NEXT: [[TMP28:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE6]] ], [ [[TMP27]], [[PRED_LOAD_IF7]] ] -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1 -; CHECK-NEXT: br i1 [[TMP29]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] -; CHECK: pred.load.if9: -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4 -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[TMP31]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE10]] -; CHECK: pred.load.continue10: -; CHECK-NEXT: [[TMP33:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP32]], [[PRED_LOAD_IF9]] ] -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2 -; CHECK-NEXT: br i1 [[TMP34]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] -; CHECK: pred.load.if11: -; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP33]], i32 [[TMP36]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE12]] -; CHECK: pred.load.continue12: -; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP33]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP37]], [[PRED_LOAD_IF11]] ] -; CHECK-NEXT: [[TMP39:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3 -; CHECK-NEXT: br i1 [[TMP39]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14]] -; CHECK: pred.load.if13: -; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 -; CHECK-NEXT: [[TMP42:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP41]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE14]] -; CHECK: pred.load.continue14: -; CHECK-NEXT: [[TMP43:%.*]] = phi <4 x i32> [ [[TMP38]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP42]], [[PRED_LOAD_IF13]] ] -; CHECK-NEXT: [[TMP44:%.*]] = add nsw <4 x i32> [[TMP43]], [[TMP23]] -; CHECK-NEXT: [[TMP45:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP44]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP46:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP45]]) -; CHECK-NEXT: [[TMP47]] = or i32 [[TMP46]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP34]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP39:%.*]] = phi <4 x i32> [ [[TMP29]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP37]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[TMP39]], [[TMP38]] +; CHECK-NEXT: [[TMP41:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP40]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP41]]) +; CHECK-NEXT: [[TMP43]] = or i32 [[TMP42]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 -; CHECK-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 +; CHECK-NEXT: br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -939,7 +819,7 @@ ; CHECK: for.body: ; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP47]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP43]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]] ; entry: @@ -971,93 +851,73 @@ ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE14:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE14]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP47:%.*]], [[PRED_LOAD_CONTINUE14]] ] -; CHECK-NEXT: [[TMP0:%.*]] = or i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0 -; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP43:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0 +; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP6]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK: pred.load.continue: -; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1 -; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] +; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1 +; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] ; CHECK: pred.load.if1: -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP13]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP16]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]] ; CHECK: pred.load.continue2: -; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], [[PRED_LOAD_IF1]] ] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2 -; CHECK-NEXT: br i1 [[TMP14]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP19:%.*]] = phi <4 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2 +; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] ; CHECK: pred.load.if3: -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP16]], i32 2 +; CHECK-NEXT: [[TMP21:%.*]] = or i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP23]], i32 2 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP25]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP26]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]] ; CHECK: pred.load.continue4: -; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP13]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP17]], [[PRED_LOAD_IF3]] ] -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3 -; CHECK-NEXT: br i1 [[TMP19]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] +; CHECK-NEXT: [[TMP28:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP24]], [[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP29:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP27]], [[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3 +; CHECK-NEXT: br i1 [[TMP30]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]] ; CHECK: pred.load.if5: -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[TMP20]], align 4 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP21]], i32 3 +; CHECK-NEXT: [[TMP31:%.*]] = or i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[TMP33]], i32 3 +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP31]] +; CHECK-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP29]], i32 [[TMP36]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]] ; CHECK: pred.load.continue6: -; CHECK-NEXT: [[TMP23:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP22]], [[PRED_LOAD_IF5]] ] -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0 -; CHECK-NEXT: br i1 [[TMP24]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] -; CHECK: pred.load.if7: -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP25]], align 4 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> poison, i32 [[TMP26]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE8]] -; CHECK: pred.load.continue8: -; CHECK-NEXT: [[TMP28:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE6]] ], [ [[TMP27]], [[PRED_LOAD_IF7]] ] -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1 -; CHECK-NEXT: br i1 [[TMP29]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] -; CHECK: pred.load.if9: -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4 -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[TMP31]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE10]] -; CHECK: pred.load.continue10: -; CHECK-NEXT: [[TMP33:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP32]], [[PRED_LOAD_IF9]] ] -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2 -; CHECK-NEXT: br i1 [[TMP34]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] -; CHECK: pred.load.if11: -; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP33]], i32 [[TMP36]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE12]] -; CHECK: pred.load.continue12: -; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP33]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP37]], [[PRED_LOAD_IF11]] ] -; CHECK-NEXT: [[TMP39:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3 -; CHECK-NEXT: br i1 [[TMP39]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14]] -; CHECK: pred.load.if13: -; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 -; CHECK-NEXT: [[TMP42:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP41]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE14]] -; CHECK: pred.load.continue14: -; CHECK-NEXT: [[TMP43:%.*]] = phi <4 x i32> [ [[TMP38]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP42]], [[PRED_LOAD_IF13]] ] -; CHECK-NEXT: [[TMP44:%.*]] = add nsw <4 x i32> [[TMP43]], [[TMP23]] -; CHECK-NEXT: [[TMP45:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP44]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP46:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP45]]) -; CHECK-NEXT: [[TMP47]] = xor i32 [[TMP46]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP34]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP39:%.*]] = phi <4 x i32> [ [[TMP29]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP37]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[TMP39]], [[TMP38]] +; CHECK-NEXT: [[TMP41:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP40]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP41]]) +; CHECK-NEXT: [[TMP43]] = xor i32 [[TMP42]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 -; CHECK-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 +; CHECK-NEXT: br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -1065,7 +925,7 @@ ; CHECK: for.body: ; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP47]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP43]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]] ; entry: @@ -1097,95 +957,75 @@ ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE14:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE14]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP49:%.*]], [[PRED_LOAD_CONTINUE14]] ] -; CHECK-NEXT: [[TMP0:%.*]] = or i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0 -; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP45:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0 +; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP6:%.*]] = load float, float* [[TMP5]], align 4 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> poison, float [[TMP6]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK: pred.load.continue: -; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1 -; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] +; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1 +; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] ; CHECK: pred.load.if1: -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[TMP10]], align 4 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = load float, float* [[TMP12]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP13]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP16]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]] ; CHECK: pred.load.continue2: -; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x float> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], [[PRED_LOAD_IF1]] ] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2 -; CHECK-NEXT: br i1 [[TMP14]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x float> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP19:%.*]] = phi <4 x float> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2 +; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] ; CHECK: pred.load.if3: -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 4 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP13]], float [[TMP16]], i32 2 +; CHECK-NEXT: [[TMP21:%.*]] = or i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = load float, float* [[TMP22]], align 4 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP18]], float [[TMP23]], i32 2 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP26:%.*]] = load float, float* [[TMP25]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP26]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]] ; CHECK: pred.load.continue4: -; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x float> [ [[TMP13]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP17]], [[PRED_LOAD_IF3]] ] -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3 -; CHECK-NEXT: br i1 [[TMP19]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] +; CHECK-NEXT: [[TMP28:%.*]] = phi <4 x float> [ [[TMP18]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP24]], [[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP29:%.*]] = phi <4 x float> [ [[TMP19]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP27]], [[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3 +; CHECK-NEXT: br i1 [[TMP30]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]] ; CHECK: pred.load.if5: -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP21:%.*]] = load float, float* [[TMP20]], align 4 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP18]], float [[TMP21]], i32 3 +; CHECK-NEXT: [[TMP31:%.*]] = or i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = load float, float* [[TMP32]], align 4 +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[TMP28]], float [[TMP33]], i32 3 +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP31]] +; CHECK-NEXT: [[TMP36:%.*]] = load float, float* [[TMP35]], align 4 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x float> [[TMP29]], float [[TMP36]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]] ; CHECK: pred.load.continue6: -; CHECK-NEXT: [[TMP23:%.*]] = phi <4 x float> [ [[TMP18]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP22]], [[PRED_LOAD_IF5]] ] -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0 -; CHECK-NEXT: br i1 [[TMP24]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] -; CHECK: pred.load.if7: -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP26:%.*]] = load float, float* [[TMP25]], align 4 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x float> poison, float [[TMP26]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE8]] -; CHECK: pred.load.continue8: -; CHECK-NEXT: [[TMP28:%.*]] = phi <4 x float> [ poison, [[PRED_LOAD_CONTINUE6]] ], [ [[TMP27]], [[PRED_LOAD_IF7]] ] -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1 -; CHECK-NEXT: br i1 [[TMP29]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] -; CHECK: pred.load.if9: -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP31:%.*]] = load float, float* [[TMP30]], align 4 -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x float> [[TMP28]], float [[TMP31]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE10]] -; CHECK: pred.load.continue10: -; CHECK-NEXT: [[TMP33:%.*]] = phi <4 x float> [ [[TMP28]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP32]], [[PRED_LOAD_IF9]] ] -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2 -; CHECK-NEXT: br i1 [[TMP34]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] -; CHECK: pred.load.if11: -; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP36:%.*]] = load float, float* [[TMP35]], align 4 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x float> [[TMP33]], float [[TMP36]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE12]] -; CHECK: pred.load.continue12: -; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x float> [ [[TMP33]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP37]], [[PRED_LOAD_IF11]] ] -; CHECK-NEXT: [[TMP39:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3 -; CHECK-NEXT: br i1 [[TMP39]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14]] -; CHECK: pred.load.if13: -; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP41:%.*]] = load float, float* [[TMP40]], align 4 -; CHECK-NEXT: [[TMP42:%.*]] = insertelement <4 x float> [[TMP38]], float [[TMP41]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE14]] -; CHECK: pred.load.continue14: -; CHECK-NEXT: [[TMP43:%.*]] = phi <4 x float> [ [[TMP38]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP42]], [[PRED_LOAD_IF13]] ] -; CHECK-NEXT: [[TMP44:%.*]] = select <4 x i1> [[TMP3]], <4 x float> [[TMP23]], <4 x float> zeroinitializer -; CHECK-NEXT: [[TMP45:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP44]]) -; CHECK-NEXT: [[TMP46:%.*]] = fadd float [[TMP45]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP47:%.*]] = select <4 x i1> [[TMP3]], <4 x float> [[TMP43]], <4 x float> zeroinitializer -; CHECK-NEXT: [[TMP48:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP47]]) -; CHECK-NEXT: [[TMP49]] = fadd float [[TMP48]], [[TMP46]] +; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x float> [ [[TMP28]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP34]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP39:%.*]] = phi <4 x float> [ [[TMP29]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP37]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP40:%.*]] = select <4 x i1> [[TMP0]], <4 x float> [[TMP38]], <4 x float> zeroinitializer +; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP40]]) +; CHECK-NEXT: [[TMP42:%.*]] = fadd float [[TMP41]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP43:%.*]] = select <4 x i1> [[TMP0]], <4 x float> [[TMP39]], <4 x float> zeroinitializer +; CHECK-NEXT: [[TMP44:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP43]]) +; CHECK-NEXT: [[TMP45]] = fadd float [[TMP44]], [[TMP42]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 -; CHECK-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 +; CHECK-NEXT: br i1 [[TMP46]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -1193,7 +1033,7 @@ ; CHECK: for.body: ; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi float [ undef, [[FOR_BODY]] ], [ [[TMP49]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi float [ undef, [[FOR_BODY]] ], [ [[TMP45]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[RESULT_0_LCSSA]] ; entry: @@ -1225,95 +1065,75 @@ ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE14:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE14]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP49:%.*]], [[PRED_LOAD_CONTINUE14]] ] -; CHECK-NEXT: [[TMP0:%.*]] = or i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0 -; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP45:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0 +; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP6:%.*]] = load float, float* [[TMP5]], align 4 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> poison, float [[TMP6]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK: pred.load.continue: -; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1 -; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] +; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1 +; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] ; CHECK: pred.load.if1: -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[TMP10]], align 4 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = load float, float* [[TMP12]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP13]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP16]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]] ; CHECK: pred.load.continue2: -; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x float> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], [[PRED_LOAD_IF1]] ] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2 -; CHECK-NEXT: br i1 [[TMP14]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x float> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP19:%.*]] = phi <4 x float> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2 +; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] ; CHECK: pred.load.if3: -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP16:%.*]] = load float, float* [[TMP15]], align 4 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP13]], float [[TMP16]], i32 2 +; CHECK-NEXT: [[TMP21:%.*]] = or i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = load float, float* [[TMP22]], align 4 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP18]], float [[TMP23]], i32 2 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP26:%.*]] = load float, float* [[TMP25]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP26]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]] ; CHECK: pred.load.continue4: -; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x float> [ [[TMP13]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP17]], [[PRED_LOAD_IF3]] ] -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3 -; CHECK-NEXT: br i1 [[TMP19]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] +; CHECK-NEXT: [[TMP28:%.*]] = phi <4 x float> [ [[TMP18]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP24]], [[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP29:%.*]] = phi <4 x float> [ [[TMP19]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP27]], [[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3 +; CHECK-NEXT: br i1 [[TMP30]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]] ; CHECK: pred.load.if5: -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP21:%.*]] = load float, float* [[TMP20]], align 4 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP18]], float [[TMP21]], i32 3 +; CHECK-NEXT: [[TMP31:%.*]] = or i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = load float, float* [[TMP32]], align 4 +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[TMP28]], float [[TMP33]], i32 3 +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP31]] +; CHECK-NEXT: [[TMP36:%.*]] = load float, float* [[TMP35]], align 4 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x float> [[TMP29]], float [[TMP36]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]] ; CHECK: pred.load.continue6: -; CHECK-NEXT: [[TMP23:%.*]] = phi <4 x float> [ [[TMP18]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP22]], [[PRED_LOAD_IF5]] ] -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0 -; CHECK-NEXT: br i1 [[TMP24]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] -; CHECK: pred.load.if7: -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP26:%.*]] = load float, float* [[TMP25]], align 4 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x float> poison, float [[TMP26]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE8]] -; CHECK: pred.load.continue8: -; CHECK-NEXT: [[TMP28:%.*]] = phi <4 x float> [ poison, [[PRED_LOAD_CONTINUE6]] ], [ [[TMP27]], [[PRED_LOAD_IF7]] ] -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1 -; CHECK-NEXT: br i1 [[TMP29]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] -; CHECK: pred.load.if9: -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP31:%.*]] = load float, float* [[TMP30]], align 4 -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x float> [[TMP28]], float [[TMP31]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE10]] -; CHECK: pred.load.continue10: -; CHECK-NEXT: [[TMP33:%.*]] = phi <4 x float> [ [[TMP28]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP32]], [[PRED_LOAD_IF9]] ] -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2 -; CHECK-NEXT: br i1 [[TMP34]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] -; CHECK: pred.load.if11: -; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP36:%.*]] = load float, float* [[TMP35]], align 4 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x float> [[TMP33]], float [[TMP36]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE12]] -; CHECK: pred.load.continue12: -; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x float> [ [[TMP33]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP37]], [[PRED_LOAD_IF11]] ] -; CHECK-NEXT: [[TMP39:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3 -; CHECK-NEXT: br i1 [[TMP39]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14]] -; CHECK: pred.load.if13: -; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP41:%.*]] = load float, float* [[TMP40]], align 4 -; CHECK-NEXT: [[TMP42:%.*]] = insertelement <4 x float> [[TMP38]], float [[TMP41]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE14]] -; CHECK: pred.load.continue14: -; CHECK-NEXT: [[TMP43:%.*]] = phi <4 x float> [ [[TMP38]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP42]], [[PRED_LOAD_IF13]] ] -; CHECK-NEXT: [[TMP44:%.*]] = select <4 x i1> [[TMP3]], <4 x float> [[TMP23]], <4 x float> -; CHECK-NEXT: [[TMP45:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP44]]) -; CHECK-NEXT: [[TMP46:%.*]] = fmul float [[TMP45]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP47:%.*]] = select <4 x i1> [[TMP3]], <4 x float> [[TMP43]], <4 x float> -; CHECK-NEXT: [[TMP48:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP47]]) -; CHECK-NEXT: [[TMP49]] = fmul float [[TMP48]], [[TMP46]] +; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x float> [ [[TMP28]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP34]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP39:%.*]] = phi <4 x float> [ [[TMP29]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP37]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP40:%.*]] = select <4 x i1> [[TMP0]], <4 x float> [[TMP38]], <4 x float> +; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP40]]) +; CHECK-NEXT: [[TMP42:%.*]] = fmul float [[TMP41]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP43:%.*]] = select <4 x i1> [[TMP0]], <4 x float> [[TMP39]], <4 x float> +; CHECK-NEXT: [[TMP44:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP43]]) +; CHECK-NEXT: [[TMP45]] = fmul float [[TMP44]], [[TMP42]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 -; CHECK-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 +; CHECK-NEXT: br i1 [[TMP46]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -1321,7 +1141,7 @@ ; CHECK: for.body: ; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi float [ undef, [[FOR_BODY]] ], [ [[TMP49]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi float [ undef, [[FOR_BODY]] ], [ [[TMP45]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[RESULT_0_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll b/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll --- a/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll @@ -32,7 +32,7 @@ define i32 @reduction_sum(i32* noalias nocapture %A, i32* noalias nocapture %B) { ; CHECK-LABEL: @reduction_sum( -; CHECK: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[TMP47:%.*]], %pred.load.continue14 ] +; CHECK: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[TMP47:%.*]], %pred.load.continue6 ] ; CHECK: [[TMP44:%.*]] = add <4 x i32> [[VEC_PHI]], [[VEC_IND:%.*]] ; CHECK: [[TMP45:%.*]] = add <4 x i32> [[TMP44]], [[TMP23:%.*]] ; CHECK: [[TMP46:%.*]] = add <4 x i32> [[TMP45]], [[TMP43:%.*]] @@ -65,7 +65,7 @@ define i32 @reduction_prod(i32* noalias nocapture %A, i32* noalias nocapture %B) { ; CHECK-LABEL: @reduction_prod( ; CHECK: vector.body: -; CHECK: [[VEC_PHI:%.*]] = phi <4 x i32> [ , %vector.ph ], [ [[TMP46:%.*]], %pred.load.continue14 ] +; CHECK: [[VEC_PHI:%.*]] = phi <4 x i32> [ , %vector.ph ], [ [[TMP46:%.*]], %pred.load.continue6 ] ; CHECK: [[TMP44:%.*]] = mul <4 x i32> [[VEC_PHI]], [[TMP23:%.*]] ; CHECK: [[TMP45:%.*]] = mul <4 x i32> [[TMP44]], [[TMP43:%.*]] ; CHECK: [[TMP46]] = select <4 x i1> [[TMP3:%.*]], <4 x i32> [[TMP45]], <4 x i32> [[VEC_PHI]] @@ -96,7 +96,7 @@ define i32 @reduction_and(i32* nocapture %A, i32* nocapture %B) { ; CHECK-LABEL: @reduction_and( ; CHECK: vector.body: -; CHECK: [[VEC_PHI:%.*]] = phi <4 x i32> [ , %vector.ph ], [ [[TMP46:%.*]], %pred.load.continue14 ] +; CHECK: [[VEC_PHI:%.*]] = phi <4 x i32> [ , %vector.ph ], [ [[TMP46:%.*]], %pred.load.continue6 ] ; CHECK: [[TMP44:%.*]] = and <4 x i32> [[VEC_PHI]], [[TMP23:%.*]] ; CHECK: [[TMP45:%.*]] = and <4 x i32> [[TMP44]], [[TMP43:%.*]] ; CHECK: [[TMP46]] = select <4 x i1> [[TMP3:%.*]], <4 x i32> [[TMP45]], <4 x i32> [[VEC_PHI]] @@ -127,7 +127,7 @@ define i32 @reduction_or(i32* nocapture %A, i32* nocapture %B) { ; CHECK-LABEL: @reduction_or( ; CHECK: vector.body: -; CHECK: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[TMP46:%.*]], %pred.load.continue14 ] +; CHECK: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[TMP46:%.*]], %pred.load.continue6 ] ; CHECK: [[TMP45:%.*]] = select <4 x i1> [[TMP3:%.*]], <4 x i32> [[TMP44:%.*]], <4 x i32> zeroinitializer ; CHECK: [[TMP46]] = or <4 x i32> [[VEC_PHI]], [[TMP45]] ; CHECK: middle.block: @@ -157,7 +157,7 @@ define i32 @reduction_xor(i32* nocapture %A, i32* nocapture %B) { ; CHECK-LABEL: @reduction_xor( ; CHECK: vector.body: -; CHECK: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[TMP46:%.*]], %pred.load.continue14 ] +; CHECK: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[TMP46:%.*]], %pred.load.continue6 ] ; CHECK: [[TMP45:%.*]] = select <4 x i1> [[TMP3:%.*]], <4 x i32> [[TMP44:%.*]], <4 x i32> zeroinitializer ; CHECK: [[TMP46]] = xor <4 x i32> [[VEC_PHI]], [[TMP45]] ; CHECK: middle.block: @@ -187,7 +187,7 @@ define float @reduction_fadd(float* nocapture %A, float* nocapture %B) { ; CHECK-LABEL: @reduction_fadd( ; CHECK: vector.body: -; CHECK: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, %vector.ph ], [ [[TMP46:%.*]], %pred.load.continue14 ] +; CHECK: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, %vector.ph ], [ [[TMP46:%.*]], %pred.load.continue6 ] ; CHECK: [[TMP44:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[TMP23:%.*]] ; CHECK: [[TMP45:%.*]] = fadd fast <4 x float> [[TMP44]], [[TMP43:%.*]] ; CHECK: [[TMP46]] = select <4 x i1> [[TMP3:%.*]], <4 x float> [[TMP45]], <4 x float> [[VEC_PHI]] @@ -218,7 +218,7 @@ define float @reduction_fmul(float* nocapture %A, float* nocapture %B) { ; CHECK-LABEL: @reduction_fmul( ; CHECK: vector.body: -; CHECK: [[VEC_PHI:%.*]] = phi <4 x float> [ , %vector.ph ], [ [[TMP46:%.*]], %pred.load.continue14 ] +; CHECK: [[VEC_PHI:%.*]] = phi <4 x float> [ , %vector.ph ], [ [[TMP46:%.*]], %pred.load.continue6 ] ; CHECK: [[TMP44:%.*]] = fmul fast <4 x float> [[VEC_PHI]], [[TMP23:%.*]] ; CHECK: [[TMP45:%.*]] = fmul fast <4 x float> [[TMP44]], [[TMP43:%.*]] ; CHECK: [[TMP46]] = select <4 x i1> [[TMP3:%.*]], <4 x float> [[TMP45]], <4 x float> [[VEC_PHI]] diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll --- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll @@ -14,23 +14,7 @@ ; CHECK-NEXT: loop: ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next ; CHECK-NEXT: EMIT vp<%2> = icmp ule ir<%iv> vp<%0> -; CHECK-NEXT: Successor(s): pred.load - -; CHECK: pred.load: { -; CHECK-NEXT: pred.load.entry: -; CHECK-NEXT: BRANCH-ON-MASK vp<%2> -; CHECK-NEXT: Successor(s): pred.load.if, pred.load.continue -; CHECK-NEXT: CondBit: vp<%2> (loop) - -; CHECK: pred.load.if: -; CHECK-NEXT: REPLICATE ir<%gep.b> = getelementptr ir<@b>, ir<0>, ir<%iv> -; CHECK-NEXT: REPLICATE ir<%lv.b> = load ir<%gep.b> -; CHECK-NEXT: Successor(s): pred.load.continue - -; CHECK: pred.load.continue: -; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%5> = ir<%lv.b> -; CHECK-NEXT: No successors -; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): loop.0 ; CHECK: loop.0: ; CHECK-NEXT: Successor(s): pred.store @@ -42,13 +26,16 @@ ; CHECK-NEXT: CondBit: vp<%2> (loop) ; CHECK: pred.store.if: -; CHECK-NEXT: REPLICATE ir<%add> = add vp<%5>, ir<10> +; CHECK-NEXT: REPLICATE ir<%gep.b> = getelementptr ir<@b>, ir<0>, ir<%iv> +; CHECK-NEXT: REPLICATE ir<%lv.b> = load ir<%gep.b> +; CHECK-NEXT: REPLICATE ir<%add> = add ir<%lv.b>, ir<10> ; CHECK-NEXT: REPLICATE ir<%mul> = mul ir<2>, ir<%add> ; CHECK-NEXT: REPLICATE ir<%gep.a> = getelementptr ir<@a>, ir<0>, ir<%iv> ; CHECK-NEXT: REPLICATE store ir<%mul>, ir<%gep.a> ; CHECK-NEXT: Successor(s): pred.store.continue ; CHECK: pred.store.continue: +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%9> = ir<%lv.b> ; CHECK-NEXT: No successors ; CHECK-NEXT: } @@ -615,61 +602,12 @@ ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next ; CHECK-NEXT: EMIT vp<%2> = icmp ule ir<%iv> vp<%0> ; CHECK-NEXT: REPLICATE ir<%gep.a> = getelementptr ir<@a>, ir<0>, ir<%iv> -; CHECK-NEXT: Successor(s): pred.load -; CHECK-EMPTY: -; CHECK-NEXT: pred.load: { -; CHECK-NEXT: pred.load.entry: -; CHECK-NEXT: BRANCH-ON-MASK vp<%2> -; CHECK-NEXT: Successor(s): pred.load.if, pred.load.continue -; CHECK-NEXT: CondBit: vp<%2> (loop) -; CHECK-EMPTY: -; CHECK-NEXT: pred.load.if: -; CHECK-NEXT: REPLICATE ir<%lv.a> = load ir<%gep.a> -; CHECK-NEXT: Successor(s): pred.load.continue -; CHECK-EMPTY: -; CHECK-NEXT: pred.load.continue: -; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%5> = ir<%lv.a> -; CHECK-NEXT: No successors -; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): loop.0 ; CHECK-EMPTY: ; CHECK-NEXT: loop.0: -; CHECK-NEXT: Successor(s): pred.load -; CHECK-EMPTY: -; CHECK-NEXT: pred.load: { -; CHECK-NEXT: pred.load.entry: -; CHECK-NEXT: BRANCH-ON-MASK vp<%2> -; CHECK-NEXT: Successor(s): pred.load.if, pred.load.continue -; CHECK-NEXT: CondBit: vp<%2> (loop) -; CHECK-EMPTY: -; CHECK-NEXT: pred.load.if: -; CHECK-NEXT: REPLICATE ir<%gep.b> = getelementptr ir<@b>, ir<0>, ir<%iv> -; CHECK-NEXT: REPLICATE ir<%lv.b> = load ir<%gep.b> -; CHECK-NEXT: Successor(s): pred.load.continue -; CHECK-EMPTY: -; CHECK-NEXT: pred.load.continue: -; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%8> = ir<%lv.b> -; CHECK-NEXT: No successors -; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): loop.1 ; CHECK-EMPTY: ; CHECK-NEXT: loop.1: -; CHECK-NEXT: Successor(s): pred.store -; CHECK-EMPTY: -; CHECK-NEXT: pred.store: { -; CHECK-NEXT: pred.store.entry: -; CHECK-NEXT: BRANCH-ON-MASK vp<%2> -; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue -; CHECK-NEXT: CondBit: vp<%2> (loop) -; CHECK-EMPTY: -; CHECK-NEXT: pred.store.if: -; CHECK-NEXT: REPLICATE ir<%gep.c> = getelementptr ir<@c>, ir<0>, ir<%iv> -; CHECK-NEXT: REPLICATE store vp<%5>, ir<%gep.c> -; CHECK-NEXT: Successor(s): pred.store.continue -; CHECK-EMPTY: -; CHECK-NEXT: pred.store.continue: -; CHECK-NEXT: No successors -; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): loop.2 ; CHECK-EMPTY: ; CHECK-NEXT: loop.2: @@ -682,10 +620,17 @@ ; CHECK-NEXT: CondBit: vp<%2> (loop) ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.if: -; CHECK-NEXT: REPLICATE store vp<%8>, ir<%gep.a> +; CHECK-NEXT: REPLICATE ir<%lv.a> = load ir<%gep.a> +; CHECK-NEXT: REPLICATE ir<%gep.b> = getelementptr ir<@b>, ir<0>, ir<%iv> +; CHECK-NEXT: REPLICATE ir<%lv.b> = load ir<%gep.b> +; CHECK-NEXT: REPLICATE ir<%gep.c> = getelementptr ir<@c>, ir<0>, ir<%iv> +; CHECK-NEXT: REPLICATE store ir<%lv.a>, ir<%gep.c> +; CHECK-NEXT: REPLICATE store ir<%lv.b>, ir<%gep.a> ; CHECK-NEXT: Successor(s): pred.store.continue ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.continue: +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%10> = ir<%lv.a> +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%11> = ir<%lv.b> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): loop.3 @@ -695,7 +640,7 @@ ; CHECK-NEXT: Successor(s): then.0 ; CHECK-EMPTY: ; CHECK-NEXT: then.0: -; CHECK-NEXT: WIDEN ir<%mul> = mul vp<%5>, vp<%8> +; CHECK-NEXT: WIDEN ir<%mul> = mul vp<%10>, vp<%11> ; CHECK-NEXT: EMIT vp<%14> = select vp<%2> ir<%c.0> ir ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: @@ -764,41 +709,9 @@ ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next ; CHECK-NEXT: EMIT vp<%2> = icmp ule ir<%iv> vp<%0> ; CHECK-NEXT: REPLICATE ir<%gep.a> = getelementptr ir<@a>, ir<0>, ir<%iv> -; CHECK-NEXT: Successor(s): pred.load -; CHECK-EMPTY: -; CHECK-NEXT: pred.load: { -; CHECK-NEXT: pred.load.entry: -; CHECK-NEXT: BRANCH-ON-MASK vp<%2> -; CHECK-NEXT: Successor(s): pred.load.if, pred.load.continue -; CHECK-NEXT: CondBit: vp<%2> (loop) -; CHECK-EMPTY: -; CHECK-NEXT: pred.load.if: -; CHECK-NEXT: REPLICATE ir<%lv.a> = load ir<%gep.a> -; CHECK-NEXT: Successor(s): pred.load.continue -; CHECK-EMPTY: -; CHECK-NEXT: pred.load.continue: -; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%5> = ir<%lv.a> -; CHECK-NEXT: No successors -; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): loop.0 ; CHECK-EMPTY: ; CHECK-NEXT: loop.0: -; CHECK-NEXT: Successor(s): pred.sdiv -; CHECK-EMPTY: -; CHECK-NEXT: pred.sdiv: { -; CHECK-NEXT: pred.sdiv.entry: -; CHECK-NEXT: BRANCH-ON-MASK vp<%2> -; CHECK-NEXT: Successor(s): pred.sdiv.if, pred.sdiv.continue -; CHECK-NEXT: CondBit: vp<%2> (loop) -; CHECK-EMPTY: -; CHECK-NEXT: pred.sdiv.if: -; CHECK-NEXT: REPLICATE ir<%div> = sdiv vp<%5>, vp<%5> -; CHECK-NEXT: Successor(s): pred.sdiv.continue -; CHECK-EMPTY: -; CHECK-NEXT: pred.sdiv.continue: -; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%7> = ir<%div> -; CHECK-NEXT: No successors -; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): loop.1 ; CHECK-EMPTY: ; CHECK-NEXT: loop.1: @@ -811,10 +724,14 @@ ; CHECK-NEXT: CondBit: vp<%2> (loop) ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.if: -; CHECK-NEXT: REPLICATE store vp<%7>, ir<%gep.a> +; CHECK-NEXT: REPLICATE ir<%lv.a> = load ir<%gep.a> +; CHECK-NEXT: REPLICATE ir<%div> = sdiv ir<%lv.a>, ir<%lv.a> +; CHECK-NEXT: REPLICATE store ir<%div>, ir<%gep.a> ; CHECK-NEXT: Successor(s): pred.store.continue ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.continue: +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%7> = ir<%lv.a> +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%8> = ir<%div> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): loop.2 @@ -871,22 +788,6 @@ ; CHECK-NEXT: Successor(s): loop.0 ; CHECK-EMPTY: ; CHECK-NEXT: loop.0: -; CHECK-NEXT: Successor(s): pred.sdiv -; CHECK-EMPTY: -; CHECK-NEXT: pred.sdiv: { -; CHECK-NEXT: pred.sdiv.entry: -; CHECK-NEXT: BRANCH-ON-MASK vp<%3> -; CHECK-NEXT: Successor(s): pred.sdiv.if, pred.sdiv.continue -; CHECK-NEXT: CondBit: vp<%3> (loop) -; CHECK-EMPTY: -; CHECK-NEXT: pred.sdiv.if: -; CHECK-NEXT: REPLICATE ir<%div> = sdiv ir<%for>, vp<%6> -; CHECK-NEXT: Successor(s): pred.sdiv.continue -; CHECK-EMPTY: -; CHECK-NEXT: pred.sdiv.continue: -; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%8> = ir<%div> -; CHECK-NEXT: No successors -; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): loop.1 ; CHECK-EMPTY: ; CHECK-NEXT: loop.1: @@ -899,12 +800,21 @@ ; CHECK-NEXT: CondBit: vp<%3> (loop) ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.if: -; CHECK-NEXT: REPLICATE store vp<%8>, ir<%gep.a> +; CHECK-NEXT: REPLICATE ir<%div> = sdiv ir<%for>, vp<%6> +; CHECK-NEXT: REPLICATE store ir<%div>, ir<%gep.a> ; CHECK-NEXT: Successor(s): pred.store.continue ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.continue: +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%9> = ir<%div> ; CHECK-NEXT: No successors ; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): loop.2 +; CHECK-EMPTY: +; CHECK-NEXT: loop.2: +; CHECK-NEXT: CLONE ir<%large> = icmp ir<%iv>, ir<8> +; CHECK-NEXT: CLONE ir<%exitcond> = icmp ir<%iv>, ir<%k> +; CHECK-NEXT: No successors +; CHECK-NEXT: } ; entry: br label %loop