[LoopVectorize] Take induction steps for resume values from the VPlan.

InnerLoopVectorizer::createInductionResumeValue() and
createInductionResumeValues() now receive the VPlan and the VPTransformState.
The induction step is looked up through
vputils::getOrCreateVPValueForSCEVExpr() instead of being expanded with
CreateStepValue(). LoopVectorizationPlanner::executePlan() returns the
VPTransformState so the caller can pass it on when creating resume values for
the epilogue plan. Adds the test file pr58811-scev-expansion.ll.

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -311,9 +311,9 @@
   /// TODO: \p IsEpilogueVectorization is needed to avoid issues due to epilogue
   /// vectorization re-using plans for both the main and epilogue vector loops.
   /// It should be removed once the re-use issue has been fixed.
-  void executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan,
-                   InnerLoopVectorizer &LB, DominatorTree *DT,
-                   bool IsEpilogueVectorization);
+  VPTransformState executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan,
+                               InnerLoopVectorizer &LB, DominatorTree *DT,
+                               bool IsEpilogueVectorization);
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   void printPlans(raw_ostream &O);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -564,7 +564,7 @@
   /// bypass block and the end value on the edge from bypass to this loop.
   PHINode *createInductionResumeValue(
       PHINode *OrigPhi, const InductionDescriptor &ID,
-      ArrayRef<BasicBlock *> BypassBlocks,
+      ArrayRef<BasicBlock *> BypassBlocks, VPlan &Plan, VPTransformState &State,
       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
 
 protected:
@@ -644,6 +644,7 @@
   /// block, the \p AdditionalBypass pair provides information about the bypass
   /// block and the end value on the edge from bypass to this loop.
   void createInductionResumeValues(
+      VPlan &Plan, VPTransformState &State,
       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
 
   /// Complete the loop skeleton by adding debug MDs, creating appropriate
@@ -3158,7 +3159,7 @@
 
 PHINode *InnerLoopVectorizer::createInductionResumeValue(
     PHINode *OrigPhi, const InductionDescriptor &II,
-    ArrayRef<BasicBlock *> BypassBlocks,
+    ArrayRef<BasicBlock *> BypassBlocks, VPlan &Plan, VPTransformState &State,
     std::pair<BasicBlock *, Value *> AdditionalBypass) {
   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
   assert(VectorTripCount && "Expected valid arguments");
@@ -3176,8 +3177,11 @@
     if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
       B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
 
-    Value *Step =
-        CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
+    VPValue *StepSCEV = vputils::getOrCreateVPValueForSCEVExpr(
+        Plan, II.getStep(), *PSE.getSE());
+    Value *Step = StepSCEV->getDefiningRecipe() ? State.get(StepSCEV, 0)
+                                                : StepSCEV->getLiveInIRValue();
+
     EndValue =
         emitTransformedIndex(B, VectorTripCount, II.getStartValue(), Step, II);
     EndValue->setName("ind.end");
@@ -3185,8 +3189,6 @@
     // Compute the end value for the additional bypass (if applicable).
     if (AdditionalBypass.first) {
       B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
-      Value *Step =
-          CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
       EndValueFromAdditionalBypass = emitTransformedIndex(
           B, AdditionalBypass.second, II.getStartValue(), Step, II);
       EndValueFromAdditionalBypass->setName("ind.end");
@@ -3216,6 +3218,7 @@
 }
 
 void InnerLoopVectorizer::createInductionResumeValues(
+    VPlan &Plan, VPTransformState &State,
     std::pair<BasicBlock *, Value *> AdditionalBypass) {
   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
@@ -3231,7 +3234,7 @@
     PHINode *OrigPhi = InductionEntry.first;
     const InductionDescriptor &II = InductionEntry.second;
     PHINode *BCResumeVal = createInductionResumeValue(
-        OrigPhi, II, LoopBypassBlocks, AdditionalBypass);
+        OrigPhi, II, LoopBypassBlocks, Plan, State, AdditionalBypass);
     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
   }
 }
@@ -3329,7 +3332,7 @@
   emitMemRuntimeChecks(LoopScalarPreHeader);
 
   // Emit phis for the new starting index of the scalar loop.
-  createInductionResumeValues();
+  createInductionResumeValues(Plan, State);
 
   return {completeLoopSkeleton(), nullptr};
 }
@@ -7679,11 +7682,9 @@
   }
 }
 
-void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
-                                           VPlan &BestVPlan,
-                                           InnerLoopVectorizer &ILV,
-                                           DominatorTree *DT,
-                                           bool IsEpilogueVectorization) {
+VPTransformState LoopVectorizationPlanner::executePlan(
+    ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
+    InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization) {
   assert(BestVPlan.hasVF(BestVF) &&
          "Trying to execute plan with unsupported VF");
   assert(BestVPlan.hasUF(BestUF) &&
@@ -7773,6 +7774,8 @@
   ILV.fixVectorizedLoop(State, BestVPlan);
 
   ILV.printDebugTracesAtEnd();
+
+  return State;
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -8014,7 +8017,8 @@
   // check, then the resume value for the induction variable comes from
   // the trip count of the main vector loop, hence passing the AdditionalBypass
   // argument.
-  createInductionResumeValues({VecEpilogueIterationCountCheck,
+  createInductionResumeValues(Plan, State,
+                              {VecEpilogueIterationCountCheck,
                                EPI.VectorTripCount} /* AdditionalBypass */);
 
   return {completeLoopSkeleton(), EPResumeVal};
@@ -10427,8 +10431,8 @@
                                            EPI, &LVL, &CM, BFI, PSI, Checks);
 
         VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
-        LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
-                        DT, true);
+        auto State = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
+                                     BestMainPlan, MainILV, DT, true);
         ++LoopsVectorized;
 
         // Second pass vectorizes the epilogue and adjusts the control flow
@@ -10472,7 +10476,8 @@
             }
 
             ResumeV = MainILV.createInductionResumeValue(
-                IndPhi, *ID, {EPI.MainLoopIterationCountCheck});
+                IndPhi, *ID, {EPI.MainLoopIterationCountCheck}, BestEpiPlan,
+                State);
           }
           assert(ResumeV && "Must have a resume value");
           VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(ResumeV);
diff --git a/llvm/test/Transforms/LoopVectorize/pr58811-scev-expansion.ll b/llvm/test/Transforms/LoopVectorize/pr58811-scev-expansion.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/pr58811-scev-expansion.ll
@@ -0,0 +1,167 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s
+
+define void @test1_pr58811() {
+; CHECK-LABEL: @test1_pr58811(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP_1_PREHEADER:%.*]]
+; CHECK:       loop.1.preheader:
+; CHECK-NEXT:    [[IV_1_PH:%.*]] = phi i32 [ [[SUB93_2:%.*]], [[UNREACHABLE_BB:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i32 0, [[IV_1_PH]]
+; CHECK-NEXT:    br label [[LOOP_1:%.*]]
+; CHECK:       loop.1:
+; CHECK-NEXT:    [[INDUCTION_IV:%.*]] = phi i32 [ [[INDUCTION_IV_NEXT:%.*]], [[LOOP_1]] ], [ [[TMP0]], [[LOOP_1_PREHEADER]] ]
+; CHECK-NEXT:    [[IV_1:%.*]] = phi i32 [ [[IV_1_NEXT:%.*]], [[LOOP_1]] ], [ [[IV_1_PH]], [[LOOP_1_PREHEADER]] ]
+; CHECK-NEXT:    [[IV_2:%.*]] = phi i32 [ [[IV_2_NEXT:%.*]], [[LOOP_1]] ], [ 0, [[LOOP_1_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw nsw i32 [[IV_2]], -1
+; CHECK-NEXT:    [[IV_2_NEXT]] = add i32 [[IV_2]], 1
+; CHECK-NEXT:    [[IV_1_NEXT]] = add i32 [[IV_2]], [[IV_1]]
+; CHECK-NEXT:    [[INDUCTION_IV_NEXT]] = add i32 [[INDUCTION_IV]], [[TMP1]]
+; CHECK-NEXT:    br i1 false, label [[LOOP_1]], label [[LOOP_2_PREHEADER:%.*]]
+; CHECK:       loop.2.preheader:
+; CHECK-NEXT:    [[INDUCTION_IV_LCSSA3:%.*]] = phi i32 [ [[INDUCTION_IV]], [[LOOP_1]] ]
+; CHECK-NEXT:    [[INDUCTION_IV_LCSSA2:%.*]] = phi i32 [ [[INDUCTION_IV]], [[LOOP_1]] ]
+; CHECK-NEXT:    [[INDUCTION_IV_LCSSA:%.*]] = phi i32 [ [[INDUCTION_IV]], [[LOOP_1]] ]
+; CHECK-NEXT:    [[IV_1_LCSSA:%.*]] = phi i32 [ [[IV_1]], [[LOOP_1]] ]
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[IND_END:%.*]] = mul i32 196, [[INDUCTION_IV_LCSSA]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i32 [[INDEX]], [[INDUCTION_IV_LCSSA2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i32 0, [[INDUCTION_IV_LCSSA2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[OFFSET_IDX]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i32 1, [[INDUCTION_IV_LCSSA2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[OFFSET_IDX]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 2, [[INDUCTION_IV_LCSSA2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[OFFSET_IDX]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 3, [[INDUCTION_IV_LCSSA2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add i32 [[OFFSET_IDX]], [[TMP8]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 196
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 199, 196
+; CHECK-NEXT:    [[IND_ESCAPE:%.*]] = mul i32 195, [[INDUCTION_IV_LCSSA3]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_3_PREHEADER:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 196, [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_2_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_2_PREHEADER]] ]
+; CHECK-NEXT:    br label [[LOOP_2:%.*]]
+; CHECK:       loop.2:
+; CHECK-NEXT:    [[IV_3:%.*]] = phi i16 [ [[IV_3_NEXT:%.*]], [[LOOP_2]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[IV_4:%.*]] = phi i32 [ [[IV_4_NEXT:%.*]], [[LOOP_2]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[IV_4_NEXT]] = sub i32 [[IV_4]], [[IV_1_LCSSA]]
+; CHECK-NEXT:    [[IV_3_NEXT]] = add i16 [[IV_3]], 1
+; CHECK-NEXT:    [[CMP88_1:%.*]] = icmp ult i16 [[IV_3]], 198
+; CHECK-NEXT:    br i1 [[CMP88_1]], label [[LOOP_2]], label [[LOOP_3_PREHEADER]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK:       loop.3.preheader:
+; CHECK-NEXT:    [[IV_4_LCSSA:%.*]] = phi i32 [ [[IV_4]], [[LOOP_2]] ], [ [[IND_ESCAPE]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[LOOP_3:%.*]]
+; CHECK:       loop.3:
+; CHECK-NEXT:    [[IV_5:%.*]] = phi i32 [ [[SUB93_2]], [[LOOP_3]] ], [ 0, [[LOOP_3_PREHEADER]] ]
+; CHECK-NEXT:    [[SUB93_2]] = sub i32 [[IV_5]], [[IV_4_LCSSA]]
+; CHECK-NEXT:    br label [[LOOP_3]]
+; CHECK:       unreachable.bb:
+; CHECK-NEXT:    br label [[LOOP_1_PREHEADER]]
+;
+entry:
+  br label %loop.1.preheader
+
+loop.1.preheader:
+  %iv.1.ph = phi i32 [ %sub93.2, %unreachable.bb ], [ 0, %entry ]
+  br label %loop.1
+
+loop.1:
+  %iv.1 = phi i32 [ %iv.1.next, %loop.1 ], [ %iv.1.ph, %loop.1.preheader ]
+  %iv.2 = phi i32 [ %iv.2.next, %loop.1 ], [ 0, %loop.1.preheader ]
+  %iv.2.next = add i32 %iv.2, 1
+  %iv.1.next = add i32 %iv.2, %iv.1
+  br i1 false, label %loop.1, label %loop.2.preheader
+
+loop.2.preheader:
+  %iv.1.lcssa = phi i32 [ %iv.1, %loop.1 ]
+  br label %loop.2
+
+loop.2:
+  %iv.3 = phi i16 [ %iv.3.next, %loop.2 ], [ 0, %loop.2.preheader ]
+  %iv.4 = phi i32 [ %iv.4.next, %loop.2 ], [ 0, %loop.2.preheader ]
+  %iv.4.next = sub i32 %iv.4, %iv.1.lcssa
+  %iv.3.next = add i16 %iv.3, 1
+  %cmp88.1 = icmp ult i16 %iv.3, 198
+  br i1 %cmp88.1, label %loop.2, label %loop.3.preheader
+
+loop.3.preheader:
+  %iv.4.lcssa = phi i32 [ %iv.4, %loop.2 ]
+  br label %loop.3
+
+loop.3:
+  %iv.5 = phi i32 [ %sub93.2, %loop.3 ], [ 0, %loop.3.preheader ]
+  %sub93.2 = sub i32 %iv.5, %iv.4.lcssa
+  br label %loop.3
+
+unreachable.bb:                                   ; No predecessors!
+  br label %loop.1.preheader
+}
+
+
+define void @test2_pr58811() {
+entry:
+  br label %invoke.cont27
+
+invoke.cont27:                                    ; preds = %invoke.cont99.2, %entry
+  %uint32_tVar_174.0752 = phi i32 [ 0, %entry ], [ %sub93.2, %invoke.cont99.2 ]
+  br label %invoke.cont99
+
+invoke.cont99:                                    ; preds = %invoke.cont99, %invoke.cont27
+  %uint32_tVar_174.2746 = phi i32 [ %uint32_tVar_174.0752, %invoke.cont27 ], [ %add101, %invoke.cont99 ]
+  %uint32_tVar_177.2745 = phi i32 [ 0, %invoke.cont27 ], [ %sub93, %invoke.cont99 ]
+  %sub93 = add i32 %uint32_tVar_177.2745, 1
+  %add101 = add i32 %uint32_tVar_177.2745, %uint32_tVar_174.2746
+  br i1 false, label %invoke.cont99, label %invoke.cont99.1
+
+invoke.cont99.1:                                  ; preds = %invoke.cont99.1, %invoke.cont99
+  %int16_tIndArraySafeVar_186.0747.1 = phi i16 [ %inc.1, %invoke.cont99.1 ], [ 0, %invoke.cont99 ]
+  %uint32_tVar_177.2745.1 = phi i32 [ %sub93.1, %invoke.cont99.1 ], [ 0, %invoke.cont99 ]
+  %sub93.1 = sub i32 %uint32_tVar_177.2745.1, %uint32_tVar_174.2746
+  %inc.1 = add i16 %int16_tIndArraySafeVar_186.0747.1, 1
+  %cmp88.1 = icmp ult i16 %int16_tIndArraySafeVar_186.0747.1, 198
+  br i1 %cmp88.1, label %invoke.cont99.1, label %invoke.cont99.2
+
+invoke.cont99.2:                                  ; preds = %invoke.cont99.2, %invoke.cont99.1
+  %uint32_tVar_177.2745.2 = phi i32 [ %sub93.2, %invoke.cont99.2 ], [ 0, %invoke.cont99.1 ]
+  %sub93.2 = sub i32 %uint32_tVar_177.2745.2, %uint32_tVar_177.2745.1
+  br i1 false, label %invoke.cont99.2, label %invoke.cont27
+}
+
+define void @test3_pr58811() {
+entry:
+  br label %invoke.cont27
+
+invoke.cont27:                                    ; preds = %invoke.cont120.2, %entry
+  %uint32_tVar_174.0752 = phi i32 [ 0, %entry ], [ %sub93.2, %invoke.cont120.2 ]
+  %rem85 = urem i32 1, %uint32_tVar_174.0752
+  br label %invoke.cont99
+
+invoke.cont99:                                    ; preds = %invoke.cont99, %invoke.cont27
+  %uint32_tVar_174.2746 = phi i32 [ 1, %invoke.cont27 ], [ 0, %invoke.cont99 ]
+  %add101 = add i32 %rem85, %uint32_tVar_174.2746
+  br i1 false, label %invoke.cont99, label %invoke.cont99.1
+
+invoke.cont99.1:                                  ; preds = %invoke.cont99.1, %invoke.cont99
+  %int16_tIndArraySafeVar_186.0747.1 = phi i16 [ %inc.1, %invoke.cont99.1 ], [ 0, %invoke.cont99 ]
+  %uint32_tVar_177.2745.1 = phi i32 [ %sub93.1, %invoke.cont99.1 ], [ 0, %invoke.cont99 ]
+  %sub93.1 = sub i32 %uint32_tVar_177.2745.1, %add101
+  %inc.1 = add i16 %int16_tIndArraySafeVar_186.0747.1, 1
+  %cmp88.1 = icmp ult i16 %int16_tIndArraySafeVar_186.0747.1, 198
+  br i1 %cmp88.1, label %invoke.cont99.1, label %invoke.cont99.2
+
+invoke.cont99.2:                                  ; preds = %invoke.cont99.2, %invoke.cont99.1
+  %uint32_tVar_177.2745.2 = phi i32 [ %sub93.2, %invoke.cont99.2 ], [ 0, %invoke.cont99.1 ]
+  %sub93.2 = sub i32 %uint32_tVar_177.2745.2, %uint32_tVar_177.2745.1
+  br label %invoke.cont99.2
+
+invoke.cont120.2:                                 ; No predecessors!
+  br label %invoke.cont27
+}