diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -514,7 +514,7 @@
   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
   /// arbitrary length vectors.
   void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc,
-                           Value *StartV, VPValue *Def,
+                           VPValue *StartV, VPValue *Def,
                            VPTransformState &State);
 
   /// A helper function to scalarize a single Instruction in the innermost loop.
@@ -4517,16 +4517,9 @@
 
 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
   for (PHINode *OrigPhi : OrigPHIsToFix) {
-    PHINode *NewPhi =
-        cast<PHINode>(State.get(State.Plan->getVPValue(OrigPhi), 0));
-    unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
-
-    SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
-        predecessors(OrigPhi->getParent()));
-    SmallVector<BasicBlock *, 2> VectorBBPredecessors(
-        predecessors(NewPhi->getParent()));
-    assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
-           "Scalar and Vector BB should have the same number of predecessors");
+    VPWidenPHIRecipe *VPPhi =
+        cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
+    PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
 
     // The insertion point in Builder may be invalidated by the time we get
     // here. Force the Builder insertion point to something valid so that we do
@@ -4536,17 +4529,10 @@
 
     // The predecessor order is preserved and we can rely on mapping between
    // scalar and vector block predecessors.
-    for (unsigned i = 0; i < NumIncomingValues; ++i) {
-      BasicBlock *NewPredBB = VectorBBPredecessors[i];
-
-      // When looking up the new scalar/vector values to fix up, use incoming
-      // values from original phi.
-      Value *ScIncV =
-          OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
-
-      // Scalar incoming value may need a broadcast
-      Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
-      NewPhi->addIncoming(NewIncV, NewPredBB);
+    for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
+      VPValue *Inc = VPPhi->getIncomingValue(i);
+      VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
+      NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
     }
   }
 }
@@ -4624,7 +4610,7 @@
 
 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
                                               RecurrenceDescriptor *RdxDesc,
-                                              Value *StartV, VPValue *Def,
+                                              VPValue *StartVPV, VPValue *Def,
                                               VPTransformState &State) {
   assert(!State.VF.isScalable() && "scalable vectors not yet supported.");
   PHINode *P = cast<PHINode>(PN);
@@ -4646,6 +4632,7 @@
   assert(PN->getParent() == OrigLoop->getHeader() &&
          "Non-header phis should have been handled elsewhere");
 
+  Value *StartV = StartVPV ? StartVPV->getLiveInIRValue() : nullptr;
   // In order to support recurrences we need to be able to vectorize Phi nodes.
   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
@@ -9049,10 +9036,8 @@
 }
 
 void VPWidenPHIRecipe::execute(VPTransformState &State) {
-  Value *StartV =
-      getStartValue() ? getStartValue()->getLiveInIRValue() : nullptr;
   State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), RdxDesc,
-                                 StartV, this, State);
+                                 getStartValue(), this, State);
 }
 
 void VPBlendRecipe::execute(VPTransformState &State) {
@@ -9316,8 +9301,11 @@
   if (hasVectorValue(Def, Part))
     return Data.PerPartOutput[Def][Part];
 
-  if (!hasScalarValue(Def, {Part, 0}))
-    return Callback.getOrCreateVectorValues(VPValue2Value[Def], Part);
+  if (!hasScalarValue(Def, {Part, 0})) {
+    Value *V =
+        Def->getLiveInIRValue() ? Def->getLiveInIRValue() : VPValue2Value[Def];
+    return Callback.getOrCreateVectorValues(V, Part);
+  }
 
   Value *ScalarValue = get(Def, {Part, 0});
   // If we aren't vectorizing, we can just copy the scalar map values over
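Note: the fixNonInductionPHIs() change above no longer pairs scalar and vector predecessors by position; it asks the recipe for explicit (incoming value, incoming block) pairs and translates each pair through State.get() and State.CFG.VPBB2IRBB. The following is a minimal, self-contained model of that translation step, using std::map and std::string as stand-ins for the VPlan and IR types (illustrative only, not LLVM API):

  // Illustrative model only: std::string/std::map stand in for VPValue,
  // VPBasicBlock, Value and BasicBlock; none of this is LLVM API.
  #include <cassert>
  #include <map>
  #include <string>
  #include <utility>
  #include <vector>

  struct WidenedPhiModel {
    // Pairs of (incoming VPlan value, incoming VPlan block), kept in sync.
    std::vector<std::pair<std::string, std::string>> Incomings;
  };

  // Maps playing the role of State.get(...) and State.CFG.VPBB2IRBB.
  using ValueMap = std::map<std::string, std::string>;
  using BlockMap = std::map<std::string, std::string>;

  // Rebuild the IR-level phi from the recipe's explicit pairs instead of
  // walking the predecessors of the generated block.
  std::vector<std::pair<std::string, std::string>>
  fixPhi(const WidenedPhiModel &Phi, const ValueMap &VPV2IR,
         const BlockMap &VPBB2IRBB) {
    std::vector<std::pair<std::string, std::string>> IRIncomings;
    for (const auto &[Inc, VPBB] : Phi.Incomings)
      IRIncomings.emplace_back(VPV2IR.at(Inc), VPBB2IRBB.at(VPBB));
    return IRIncomings;
  }

  int main() {
    WidenedPhiModel Phi{{{"%start", "vp.preheader"}, {"%next", "vp.latch"}}};
    ValueMap Values{{"%start", "zeroinitializer"}, {"%next", "%vec.next"}};
    BlockMap Blocks{{"vp.preheader", "vector.body"}, {"vp.latch", "inner.latch"}};
    auto IR = fixPhi(Phi, Values, Blocks);
    assert(IR[0].second == "vector.body" && IR[1].second == "inner.latch");
    return 0;
  }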
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1040,12 +1040,17 @@
 /// A recipe for handling all phi nodes except for integer and FP inductions.
 /// For reduction PHIs, RdxDesc must point to the corresponding recurrence
 /// descriptor and the start value is the first operand of the recipe.
+/// In the VPlan native path, all incoming VPValues & VPBasicBlock pairs are
+/// managed in the recipe directly.
 class VPWidenPHIRecipe : public VPRecipeBase, public VPValue {
   PHINode *Phi;
 
   /// Descriptor for a reduction PHI.
   RecurrenceDescriptor *RdxDesc = nullptr;
 
+  /// List of incoming blocks. Only used in the VPlan native path.
+  SmallVector<VPBasicBlock *, 2> IncomingBlocks;
+
 public:
   /// Create a new VPWidenPHIRecipe for the reduction \p Phi described by \p
   /// RdxDesc.
@@ -1065,6 +1070,9 @@
   static inline bool classof(const VPDef *D) {
     return D->getVPDefID() == VPRecipeBase::VPWidenPHISC;
   }
+  static inline bool classof(const VPValue *V) {
+    return V->getVPValueID() == VPValue::VPVWidenPHISC;
+  }
 
   /// Generate the phi/select nodes.
   void execute(VPTransformState &State) override;
@@ -1077,6 +1085,18 @@
   VPValue *getStartValue() {
     return getNumOperands() == 0 ? nullptr : getOperand(0);
   }
+
+  /// Adds a pair (\p IncomingV, \p IncomingBlock) to the phi.
+  void addIncoming(VPValue *IncomingV, VPBasicBlock *IncomingBlock) {
+    addOperand(IncomingV);
+    IncomingBlocks.push_back(IncomingBlock);
+  }
+
+  /// Returns the \p I th incoming VPValue.
+  VPValue *getIncomingValue(unsigned I) { return getOperand(I); }
+
+  /// Returns the \p I th incoming VPBasicBlock.
+  VPBasicBlock *getIncomingBlock(unsigned I) { return IncomingBlocks[I]; }
 };
 
 /// A recipe for vectorizing a phi-node as a sequence of mask-based select
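The new VPWidenPHIRecipe members above store incoming values in the recipe's operand list and incoming blocks in a parallel SmallVector, with addIncoming() keeping the two in lockstep so index I always names a matched pair. Below is a self-contained sketch of that storage scheme, with std::vector standing in for the LLVM containers; the names mirror the patch but nothing here is the actual implementation:

  #include <cassert>
  #include <cstddef>
  #include <vector>

  struct VPValueModel {};  // stand-in for VPValue
  struct VPBlockModel {};  // stand-in for VPBasicBlock

  // Parallel-array phi recipe: operand i pairs with incoming block i.
  class WidenPHIRecipeModel {
    std::vector<VPValueModel *> Operands;       // models the VPUser operand list
    std::vector<VPBlockModel *> IncomingBlocks; // models the new member

  public:
    void addIncoming(VPValueModel *V, VPBlockModel *BB) {
      Operands.push_back(V);        // models addOperand(IncomingV)
      IncomingBlocks.push_back(BB); // models IncomingBlocks.push_back(...)
    }
    VPValueModel *getIncomingValue(size_t I) const { return Operands[I]; }
    VPBlockModel *getIncomingBlock(size_t I) const { return IncomingBlocks[I]; }
    size_t getNumIncoming() const { return Operands.size(); }
  };

  int main() {
    VPValueModel Start, Next;
    VPBlockModel Preheader, Latch;
    WidenPHIRecipeModel Phi;
    Phi.addIncoming(&Start, &Preheader);
    Phi.addIncoming(&Next, &Latch);
    assert(Phi.getNumIncoming() == 2);
    assert(Phi.getIncomingValue(1) == &Next && Phi.getIncomingBlock(1) == &Latch);
    return 0;
  }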
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
@@ -94,13 +94,15 @@
 
   for (auto *Phi : PhisToFix) {
     assert(IRDef2VPValue.count(Phi) && "Missing VPInstruction for PHINode.");
     VPValue *VPVal = IRDef2VPValue[Phi];
-    assert(isa<VPInstruction>(VPVal) && "Expected VPInstruction for phi node.");
-    auto *VPPhi = cast<VPInstruction>(VPVal);
+    assert(isa<VPWidenPHIRecipe>(VPVal) &&
+           "Expected WidenPHIRecipe for phi node.");
+    auto *VPPhi = cast<VPWidenPHIRecipe>(VPVal);
     assert(VPPhi->getNumOperands() == 0 &&
            "Expected VPInstruction with no operands.");
 
-    for (Value *Op : Phi->operands())
-      VPPhi->addOperand(getOrCreateVPOperand(Op));
+    for (unsigned I = 0; I != Phi->getNumOperands(); ++I)
+      VPPhi->addIncoming(getOrCreateVPOperand(Phi->getIncomingValue(I)),
+                         BB2VPBB[Phi->getIncomingBlock(I)]);
   }
 }
@@ -210,13 +212,13 @@
       continue;
     }
 
-    VPInstruction *NewVPInst;
+    VPValue *NewVPV;
     if (auto *Phi = dyn_cast<PHINode>(Inst)) {
       // Phi node's operands may have not been visited at this point. We create
       // an empty VPInstruction that we will fix once the whole plain CFG has
       // been built.
-      NewVPInst = cast<VPInstruction>(VPIRBuilder.createNaryOp(
-          Inst->getOpcode(), {} /*No operands*/, Inst));
+      NewVPV = new VPWidenPHIRecipe(Phi);
+      VPBB->appendRecipe(cast<VPWidenPHIRecipe>(NewVPV));
       PhisToFix.push_back(Phi);
     } else {
       // Translate LLVM-IR operands into VPValue operands and set them in the
@@ -227,11 +229,11 @@
 
       // Build VPInstruction for any arbitraty Instruction without specific
      // representation in VPlan.
-      NewVPInst = cast<VPInstruction>(
+      NewVPV = cast<VPInstruction>(
           VPIRBuilder.createNaryOp(Inst->getOpcode(), VPOperands, Inst));
     }
 
-    IRDef2VPValue[Inst] = NewVPInst;
+    IRDef2VPValue[Inst] = NewVPV;
   }
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -33,44 +33,65 @@
     // Introduce each ingredient into VPlan.
     for (auto I = VPBB->begin(), E = VPBB->end(); I != E;) {
       VPRecipeBase *Ingredient = &*I++;
+      VPValue *VPV = Ingredient->getVPValue();
       // Can only handle VPInstructions.
-      VPInstruction *VPInst = cast<VPInstruction>(Ingredient);
-      Instruction *Inst = cast<Instruction>(VPInst->getUnderlyingValue());
+      Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());
       if (DeadInstructions.count(Inst)) {
         VPValue DummyValue;
-        VPInst->replaceAllUsesWith(&DummyValue);
+        VPV->replaceAllUsesWith(&DummyValue);
         Ingredient->eraseFromParent();
         continue;
       }
 
       VPRecipeBase *NewRecipe = nullptr;
-      // Create VPWidenMemoryInstructionRecipe for loads and stores.
-      if (LoadInst *Load = dyn_cast<LoadInst>(Inst))
-        NewRecipe = new VPWidenMemoryInstructionRecipe(
-            *Load, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)),
-            nullptr /*Mask*/);
-      else if (StoreInst *Store = dyn_cast<StoreInst>(Inst))
-        NewRecipe = new VPWidenMemoryInstructionRecipe(
-            *Store, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)),
-            Plan->getOrAddVPValue(Store->getValueOperand()), nullptr /*Mask*/);
-      else if (PHINode *Phi = dyn_cast<PHINode>(Inst)) {
+      if (auto *VPPhi = dyn_cast<VPWidenPHIRecipe>(Ingredient)) {
+        auto *Phi = cast<PHINode>(VPPhi->getUnderlyingValue());
         InductionDescriptor II = Inductions.lookup(Phi);
         if (II.getKind() == InductionDescriptor::IK_IntInduction ||
             II.getKind() == InductionDescriptor::IK_FpInduction) {
           VPValue *Start = Plan->getOrAddVPValue(II.getStartValue());
           NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, nullptr);
-        } else
+        } else {
           NewRecipe = new VPWidenPHIRecipe(Phi);
-      } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
-        NewRecipe = new VPWidenGEPRecipe(
-            GEP, Plan->mapToVPValues(GEP->operands()), OrigLoop);
-      } else
-        NewRecipe =
-            new VPWidenRecipe(*Inst, Plan->mapToVPValues(Inst->operands()));
+          for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
+            VPValue *Inc = VPPhi->getIncomingValue(i);
+            VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
+            cast<VPWidenPHIRecipe>(NewRecipe)->addIncoming(Inc, VPBB);
+          }
+        }
+      } else {
+        assert(isa<VPInstruction>(Ingredient) &&
+               "only VPInstructions expected here");
+        assert(!isa<PHINode>(Inst) && "phis should be handled above");
+        // Create VPWidenMemoryInstructionRecipe for loads and stores.
+        if (LoadInst *Load = dyn_cast<LoadInst>(Inst))
+          NewRecipe = new VPWidenMemoryInstructionRecipe(
+              *Load, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)),
+              nullptr /*Mask*/);
+        else if (StoreInst *Store = dyn_cast<StoreInst>(Inst))
+          NewRecipe = new VPWidenMemoryInstructionRecipe(
+              *Store, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)),
+              Plan->getOrAddVPValue(Store->getValueOperand()),
+              nullptr /*Mask*/);
+        else if (PHINode *Phi = dyn_cast<PHINode>(Inst)) {
+          InductionDescriptor II = Inductions.lookup(Phi);
+          if (II.getKind() == InductionDescriptor::IK_IntInduction ||
+              II.getKind() == InductionDescriptor::IK_FpInduction) {
+            VPValue *Start = Plan->getOrAddVPValue(II.getStartValue());
+            NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, nullptr);
+          } else
+            NewRecipe = new VPWidenPHIRecipe(Phi);
+        } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
+          NewRecipe = new VPWidenGEPRecipe(
+              GEP, Plan->mapToVPValues(GEP->operands()), OrigLoop);
+        } else
+          NewRecipe =
+              new VPWidenRecipe(*Inst, Plan->mapToVPValues(Inst->operands()));
+      }
 
       NewRecipe->insertBefore(Ingredient);
       if (NewRecipe->getNumDefinedValues() == 1)
-        VPInst->replaceAllUsesWith(NewRecipe->getVPValue());
+        VPV->replaceAllUsesWith(NewRecipe->getVPValue());
       else
         assert(NewRecipe->getNumDefinedValues() == 0 &&
                "Only recpies with zero or one defined values expected");
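The test updates below cover the behavioural point of the change: a widened phi's incoming values must stay attached to their incoming blocks even when the scalar IR's use-list order (and therefore the order produced by predecessors()) differs from the order of the generated vector blocks, which is the PR45958 scenario. As a rough, self-contained illustration of why positional matching is fragile while explicit pairs are not (plain C++ with hypothetical names, no LLVM types):

  #include <cassert>
  #include <string>
  #include <utility>
  #include <vector>

  using Incoming = std::pair<std::string /*value*/, std::string /*block*/>;

  // Fragile scheme: assume the i-th predecessor of the new block corresponds
  // to the i-th predecessor of the old block. Breaks if either is permuted.
  std::vector<Incoming>
  rebuildByPosition(const std::vector<Incoming> &OldPhi,
                    const std::vector<std::string> &NewPreds) {
    std::vector<Incoming> Out;
    for (size_t I = 0; I < OldPhi.size(); ++I)
      Out.push_back({OldPhi[I].first, NewPreds[I]});
    return Out;
  }

  // Robust scheme: carry the (value, block) pair explicitly and only translate
  // the block name, so a permuted predecessor order cannot mismatch them.
  std::vector<Incoming> rebuildFromPairs(const std::vector<Incoming> &OldPhi) {
    std::vector<Incoming> Out;
    for (const Incoming &Inc : OldPhi)
      Out.push_back({Inc.first, "vec." + Inc.second});
    return Out;
  }

  int main() {
    std::vector<Incoming> Phi = {{"0", "preheader"}, {"%next", "latch"}};
    // Predecessor order of the new block happens to be reversed.
    std::vector<std::string> NewPreds = {"vec.latch", "vec.preheader"};
    auto Bad = rebuildByPosition(Phi, NewPreds);
    auto Good = rebuildFromPairs(Phi);
    assert(Bad[0].second == "vec.latch");      // "0" wrongly tied to the latch
    assert(Good[0].second == "vec.preheader"); // pair preserved
    return 0;
  }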
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll b/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll
@@ -32,7 +32,7 @@
 ; CHECK: br label %[[InnerLoop:.+]]
 
 ; CHECK: [[InnerLoop]]:
-; CHECK: %[[InnerPhi:.*]] = phi <4 x i64> [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ], [ zeroinitializer, %vector.body ]
+; CHECK: %[[InnerPhi:.*]] = phi <4 x i64> [ zeroinitializer, %vector.body ], [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ]
 ; CHECK: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, <4 x i64> %[[InnerPhi]], <4 x i64> %[[VecInd]]
 ; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[StoreVal]], <4 x i32*> %[[AAddr2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
 ; CHECK: %[[InnerPhiNext]] = add nuw nsw <4 x i64> %[[InnerPhi]],
@@ -97,7 +97,7 @@
 ; CHECK: br label %[[InnerLoop:.+]]
 
 ; CHECK: [[InnerLoop]]:
-; CHECK: %[[InnerPhi:.*]] = phi <2 x i64> [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ], [ zeroinitializer, %vector.body ]
+; CHECK: %[[InnerPhi:.*]] = phi <2 x i64> [ zeroinitializer, %vector.body ], [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ]
 ; CHECK: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i64]], [8 x [8 x i64]]* @arrY, i64 0, <2 x i64> %[[InnerPhi]], <2 x i64> %[[VecInd]]
 ; CHECK: call void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> %[[StoreVal]], <2 x i64*> %[[AAddr2]], i32 4, <2 x i1> <i1 true, i1 true>)
 ; CHECK: %[[InnerPhiNext]] = add nuw nsw <2 x i64> %[[InnerPhi]],
diff --git a/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll b/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll
@@ -33,7 +33,7 @@
 ; CHECK: br label %[[InnerLoop:.+]]
 
 ; CHECK: [[InnerLoop]]:
-; CHECK: %[[InnerPhi:.*]] = phi <4 x i64> [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ], [ zeroinitializer, %vector.body ]
+; CHECK: %[[InnerPhi:.*]] = phi <4 x i64> [ zeroinitializer, %vector.body ], [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ]
 ; CHECK: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, <4 x i64> %[[InnerPhi]], <4 x i64> %[[VecInd]]
 ; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[StoreVal]], <4 x i32*> %[[AAddr2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
 ; CHECK: %[[InnerPhiNext]] = add nuw nsw <4 x i64> %[[InnerPhi]],
@@ -62,7 +62,7 @@
 ; AVX: br label %[[InnerLoop:.+]]
 
 ; AVX: [[InnerLoop]]:
-; AVX: %[[InnerPhi:.*]] = phi <8 x i64> [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ], [ zeroinitializer, %vector.body ]
+; AVX: %[[InnerPhi:.*]] = phi <8 x i64> [ zeroinitializer, %vector.body ], [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ]
 ; AVX: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, <8 x i64> %[[InnerPhi]], <8 x i64> %[[VecInd]]
 ; AVX: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %[[StoreVal]], <8 x i32*> %[[AAddr2]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
 ; AVX: %[[InnerPhiNext]] = add nuw nsw <8 x i64> %[[InnerPhi]],
diff --git a/llvm/test/Transforms/LoopVectorize/outer-loop-vec-phi-predecessor-order.ll b/llvm/test/Transforms/LoopVectorize/outer-loop-vec-phi-predecessor-order.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/outer-loop-vec-phi-predecessor-order.ll
@@ -0,0 +1,124 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -enable-vplan-native-path -loop-vectorize -S %s | FileCheck %s
+
+; Make sure phi nodes are generated correctly, even if the use list order of
+; the predecessors in the scalar code does not match the order in the generated
+; vector blocks.
+
+; Test from PR45958.
+
+define void @test([2000 x i32]* %src, i64 %n) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[LOOP_1_LATCH5:%.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_1_LATCH5]] ]
+; CHECK-NEXT: br label [[LOOP_2_HEADER1:%.*]]
+; CHECK: loop.2.header1:
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP5:%.*]], [[LOOP_2_LATCH4:%.*]] ]
+; CHECK-NEXT: br label [[LOOP_32:%.*]]
+; CHECK: loop.32:
+; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ zeroinitializer, [[LOOP_2_HEADER1]] ], [ [[TMP2:%.*]], [[LOOP_32]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [2000 x i32], [2000 x i32]* [[SRC:%.*]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI3]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP0]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_GATHER]], <i32 10, i32 10, i32 10, i32 10>
+; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> [[TMP1]], <4 x i32*> [[TMP0]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT: [[TMP2]] = add nuw nsw <4 x i64> [[VEC_PHI3]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i64> [[TMP2]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0
+; CHECK-NEXT: br i1 [[TMP4]], label [[LOOP_2_LATCH4]], label [[LOOP_32]]
+; CHECK: loop.2.latch4:
+; CHECK-NEXT: [[TMP5]] = add nuw nsw <4 x i64> [[VEC_PHI]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i64> [[TMP5]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0
+; CHECK-NEXT: br i1 [[TMP7]], label [[LOOP_1_LATCH5]], label [[LOOP_2_HEADER1]]
+; CHECK: loop.1.latch5:
+; CHECK-NEXT: [[TMP8:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[TMP8]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP9]], i32 0
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP_1_HEADER:%.*]]
+; CHECK: loop.1.header:
+; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP_1_LATCH:%.*]] ]
+; CHECK-NEXT: br label [[LOOP_2_HEADER:%.*]]
+; CHECK: loop.2.header:
+; CHECK-NEXT: [[IV_2:%.*]] = phi i64 [ 0, [[LOOP_1_HEADER]] ], [ [[IV_2_NEXT:%.*]], [[LOOP_2_LATCH:%.*]] ]
+; CHECK-NEXT: br label [[LOOP_3:%.*]]
+; CHECK: loop.3:
+; CHECK-NEXT: [[IV_3:%.*]] = phi i64 [ 0, [[LOOP_2_HEADER]] ], [ [[IV_3_NEXT:%.*]], [[LOOP_3]] ]
+; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds [2000 x i32], [2000 x i32]* [[SRC]], i64 [[IV_1]], i64 [[IV_3]]
+; CHECK-NEXT: [[L1:%.*]] = load i32, i32* [[GEP_SRC]], align 4
+; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[L1]], 10
+; CHECK-NEXT: store i32 [[MUL]], i32* [[GEP_SRC]], align 4
+; CHECK-NEXT: [[IV_3_NEXT]] = add nuw nsw i64 [[IV_3]], 1
+; CHECK-NEXT: [[EC_3:%.*]] = icmp eq i64 [[IV_3_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC_3]], label [[LOOP_2_LATCH]], label [[LOOP_3]]
+; CHECK: loop.2.latch:
+; CHECK-NEXT: [[IV_2_NEXT]] = add nuw nsw i64 [[IV_2]], 1
+; CHECK-NEXT: [[EC_2:%.*]] = icmp eq i64 [[IV_2_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC_2]], label [[LOOP_1_LATCH]], label [[LOOP_2_HEADER]]
+; CHECK: loop.1.latch:
+; CHECK-NEXT: [[IV_1_NEXT]] = add nuw nsw i64 [[IV_1]], 1
+; CHECK-NEXT: [[EC_1:%.*]] = icmp eq i64 [[IV_1_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC_1]], label [[EXIT]], label [[LOOP_1_HEADER]], [[LOOP2:!llvm.loop !.*]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+  br label %loop.1.header
+
+loop.1.header:
+  %iv.1 = phi i64 [ 0, %entry ], [ %iv.1.next, %loop.1.latch ]
+  br label %loop.2.header
+
+loop.2.header:
+  %iv.2 = phi i64 [ 0, %loop.1.header ], [ %iv.2.next, %loop.2.latch ]
+  br label %loop.3
+
+loop.3:
+  %iv.3 = phi i64 [ 0, %loop.2.header ], [ %iv.3.next, %loop.3 ]
+  %gep.src = getelementptr inbounds [2000 x i32], [2000 x i32]* %src, i64 %iv.1, i64 %iv.3
+  %l1 = load i32, i32* %gep.src, align 4
+  %mul = mul nsw i32 %l1, 10
+  store i32 %mul, i32* %gep.src, align 4
+  %iv.3.next = add nuw nsw i64 %iv.3, 1
+  %ec.3 = icmp eq i64 %iv.3.next, %n
+  br i1 %ec.3, label %loop.2.latch, label %loop.3
+
+loop.2.latch:
+  %iv.2.next = add nuw nsw i64 %iv.2, 1
+  %ec.2 = icmp eq i64 %iv.2.next, %n
+  br i1 %ec.2, label %loop.1.latch, label %loop.2.header
+
+loop.1.latch:
+  %iv.1.next = add nuw nsw i64 %iv.1, 1
+  %ec.1 = icmp eq i64 %iv.1.next, %n
+  br i1 %ec.1, label %exit, label %loop.1.header, !llvm.loop !0
+
+exit:                                             ; preds = %loop.1.latch
+  ret void
+
+; uselistorder directives
+  uselistorder label %loop.3, { 1, 0 }
+  uselistorder label %loop.2.header, { 1, 0 }
+}
+
+!0 = distinct !{!0, !1, !2}
+!1 = !{!"llvm.loop.vectorize.width", i32 4}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll b/llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll
--- a/llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll
+++ b/llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll
@@ -29,7 +29,7 @@
 ; CHECK: br label %[[InnerLoop:.+]]
 
 ; CHECK: [[InnerLoop]]:
-; CHECK: %[[InnerPhi:.*]] = phi <4 x i64> [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ], [ zeroinitializer, %vector.body ]
+; CHECK: %[[InnerPhi:.*]] = phi <4 x i64> [ zeroinitializer, %vector.body ], [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ]
 ; CHECK: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, <4 x i64> %[[InnerPhi]], <4 x i64> %[[VecInd]]
 ; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[StoreVal]], <4 x i32*> %[[AAddr2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
 ; CHECK: %[[InnerPhiNext]] = add nuw nsw <4 x i64> %[[InnerPhi]],
diff --git a/llvm/test/Transforms/LoopVectorize/outer_loop_test2.ll b/llvm/test/Transforms/LoopVectorize/outer_loop_test2.ll
--- a/llvm/test/Transforms/LoopVectorize/outer_loop_test2.ll
+++ b/llvm/test/Transforms/LoopVectorize/outer_loop_test2.ll
@@ -35,8 +35,8 @@
 ; CHECK: br label %[[InnerForBody:.*]]
 
 ; CHECK: [[InnerForBody]]:
-; CHECK: %[[InnerInd:.*]] = phi <4 x i64> [ %[[InnerIndNext:.*]], %[[InnerForBody]] ], [ zeroinitializer, %[[InnerForPh]] ]
-; CHECK: %[[AccumPhi:.*]] = phi <4 x i32> [ %[[AccumPhiNext:.*]], %[[InnerForBody]] ], [ %[[WideAVal]], %[[InnerForPh]] ]
+; CHECK: %[[InnerInd:.*]] = phi <4 x i64> [ zeroinitializer, %[[InnerForPh]] ], [ %[[InnerIndNext:.*]], %[[InnerForBody]] ]
+; CHECK: %[[AccumPhi:.*]] = phi <4 x i32> [ %[[WideAVal]], %[[InnerForPh]] ], [ %[[AccumPhiNext:.*]], %[[InnerForBody]] ]
 ; CHECK: %[[BAddr:.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, <4 x i64> %[[InnerInd]]
 ; CHECK: %[[WideBVal:.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %[[BAddr]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
 ; CHECK: %[[Add1:.*]] = add nsw <4 x i32> %[[WideBVal]], %[[VecIndTr]]
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll b/llvm/test/Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll
--- a/llvm/test/Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll
@@ -28,8 +28,8 @@
 ; CHECK-NEXT: br label %[[FOR2_HEADER:.*]]
 
 ; CHECK: [[FOR2_HEADER]]:
-; CHECK-NEXT: %[[FOR2_INDEX:.*]] = phi <4 x i32> [ %[[FOR2_INDEX_NEXT:.*]], %[[FOR2_HEADER]] ], [ zeroinitializer, %vector.body ]
-; CHECK-NEXT: %[[REDUCTION:.*]] = phi <4 x double> [ %[[REDUCTION_NEXT:.*]], %[[FOR2_HEADER]] ], [ %[[MASKED_GATHER1]], %vector.body ]
+; CHECK-NEXT: %[[FOR2_INDEX:.*]] = phi <4 x i32> [ zeroinitializer, %vector.body ], [ %[[FOR2_INDEX_NEXT:.*]], %[[FOR2_HEADER]] ]
+; CHECK-NEXT: %[[REDUCTION:.*]] = phi <4 x double> [ %[[MASKED_GATHER1]], %vector.body ], [ %[[REDUCTION_NEXT:.*]], %[[FOR2_HEADER]] ]
 ; CHECK-NEXT: %[[REDUCTION_NEXT]] = fadd <4 x double> %[[MASKED_GATHER2]], %[[REDUCTION]]
 ; CHECK-NEXT: %[[FOR2_INDEX_NEXT]] = add nuw nsw <4 x i32> %[[FOR2_INDEX]],
 ; CHECK-NEXT: %[[VEC_PTR:.*]] = icmp eq <4 x i32> %[[FOR2_INDEX_NEXT]],